blob: 0fe802b8e55135c8c94a889e32794349f9258ed6 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
/*
 * Adds the sum of array[0 .. n-1] into *total.
 *
 * Each thread accumulates a private partial sum over the elements
 * idx, idx + stride, idx + 2*stride, ... where stride is the total number of
 * threads in the grid (x dimension). The per-thread partials are then combined
 * inside the block through shared memory, and thread 0 of every block adds the
 * block's result to *total with a single atomicAdd.
 *
 * Parameters:
 *   array - device pointer to at least n ints (read only).
 *   total - device pointer to the accumulator. The kernel only ever adds to
 *           it, so the caller must initialise it (e.g. to 0) before launch.
 *   n     - number of elements to sum; n == 0 adds 0 to *total.
 *
 * Launch requirements:
 *   - Only the x dimension of the grid and block is used for indexing, so a
 *     1D launch is expected.
 *   - Any blockDim.x from 1 to 1024 is supported, including values that are
 *     not powers of two.
 *
 * The sum is accumulated in int, so it is subject to int overflow for large
 * inputs.
 */
__global__ void sum_array(const int * array, int * total, unsigned int n) {
    // One slot per thread. 1024 is the largest block size current CUDA devices
    // accept, so indexing by threadIdx.x can never run past the end.
    __shared__ int partial_res[1024];

    // 64-bit indexing: blockIdx.x * blockDim.x and the running index can
    // exceed 32 bits for large grids or n close to UINT_MAX, which would wrap
    // around and re-read elements (or never terminate) with 32-bit arithmetic.
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    int partial_sum = 0;
    for (size_t input_idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         input_idx < n;
         input_idx += stride) {
        partial_sum += array[input_idx];
    }

    partial_res[threadIdx.x] = partial_sum;
    __syncthreads();

    // Tree reduction over the first `active` slots. The upper half is folded
    // onto the lower half; when `active` is odd the middle slot (index
    // active / 2) is left in place and carried into the next round, so no
    // element is dropped for block sizes that are not powers of two.
    // `active` depends only on blockDim.x, so every thread in the block runs
    // the same number of iterations and reaches every __syncthreads().
    unsigned int active = blockDim.x;
    while (active > 1) {
        const unsigned int half = (active + 1) / 2;  // ceil(active / 2)
        if (threadIdx.x < active / 2) {
            partial_res[threadIdx.x] += partial_res[threadIdx.x + half];
        }
        __syncthreads();
        active = half;
    }

    // A single atomic per block publishes the block's sum.
    if (threadIdx.x == 0) {
        atomicAdd(total, partial_res[0]);
    }
}
|