/**
 * Sums the first `n` elements of `array` and atomically adds the result
 * into `*total`.
 *
 * Each thread accumulates a private partial sum over the elements it visits
 * (starting at its global index and advancing by the total number of launched
 * threads), the block combines those partials in shared memory, and thread 0
 * of every block issues a single atomicAdd on `*total`.
 *
 * Launch notes:
 *  - Only the x dimension of threadIdx / blockIdx / blockDim / gridDim is
 *    used; the kernel is intended for a 1-D grid of 1-D blocks.
 *  - Any blockDim.x is accepted (it does not need to be a power of two and
 *    may exceed the 256-slot scratch buffer).
 *  - Uses 256 * sizeof(int) bytes of static shared memory, no dynamic shared
 *    memory.
 *  - The kernel only ever adds to `*total`; the caller must initialise it
 *    (e.g. to 0) before the launch.
 *  - Accumulation is done in `int`; overflow is not detected.
 *
 * @param array  input elements (at least `n` readable ints)
 * @param total  accumulator that receives this launch's sum via atomicAdd
 * @param n      number of elements of `array` to sum
 */
__global__ void sum_array(const int * array, int * total, unsigned int n) {
    // Capacity of the per-block scratch buffer used for the reduction.
    const unsigned int kScratchSlots = 256;
    __shared__ int partial_res[kScratchSlots];

    const unsigned int tid = threadIdx.x;

    // 64-bit indices so that neither blockIdx.x * blockDim.x nor the
    // "+= stride" step can wrap around when n is close to UINT_MAX.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first  = (size_t)blockIdx.x * blockDim.x + tid;

    int partial_sum = 0;
    for (size_t input_idx = first; input_idx < n; input_idx += stride) {
        partial_sum += array[input_idx];
    }

    // The first kScratchSlots threads seed the scratch buffer ...
    if (tid < kScratchSlots) {
        partial_res[tid] = partial_sum;
    }
    __syncthreads();

    // ... and, when the block is wider than the buffer, the remaining threads
    // are folded in one buffer-sized group per round. Within a round every
    // slot has at most one writer, and rounds are separated by a barrier.
    // The loop bound depends only on blockDim.x, so all threads reach every
    // __syncthreads().
    for (unsigned int base = kScratchSlots; base < blockDim.x; base += kScratchSlots) {
        if (tid >= base && tid - base < kScratchSlots) {
            partial_res[tid - base] += partial_sum;
        }
        __syncthreads();
    }

    // Tree reduction over the occupied slots. Using ceil(active / 2) as the
    // pairing offset keeps the middle element when `active` is odd, so the
    // result is correct for any block size, not just powers of two.
    unsigned int active = (blockDim.x < kScratchSlots) ? blockDim.x : kScratchSlots;
    while (active > 1) {
        const unsigned int upper = (active + 1) / 2;
        if (tid < active / 2) {
            partial_res[tid] += partial_res[tid + upper];
        }
        __syncthreads();
        active = upper;
    }

    // One global atomic per block publishes the block's sum.
    if (tid == 0) {
        atomicAdd(total, partial_res[0]);
    }
}