You can install pycuda like this:
pip install pycuda
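Note that pycuda also needs an installed CUDA toolkit and a CUDA-capable GPU. A quick sanity check after installation might look like the sketch below (it just queries the first device; the exact output depends on your hardware):

```python
import pycuda.autoinit          # creates a CUDA context on the first available GPU
import pycuda.driver as cuda

dev = cuda.Device(0)
print(dev.name(), dev.total_memory() // 1024**2, "MiB")
```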
The code to calculate the sum:
```python
import math

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

if __name__ == "__main__":
    n = 400000000      # limited by the amount of GPU memory
    block_size = 1024  # maximum number of threads per block
    # Each block of block_size threads reduces 2 * block_size elements
    # (because of the algorithm implemented), hence the division by 2.
    grid_size = math.ceil(n / block_size / 2)

    a = np.random.rand(n)               # input
    a = a.astype(np.float64)
    b = np.zeros(1, dtype=np.float64)   # output

    mod = SourceModule("""
    __global__ void sum_array(double* a, uint n)
    {
        uint tid = threadIdx.x;
        uint offset = 2 * blockIdx.x * blockDim.x;

        for (uint s = 1; s <= blockDim.x; s <<= 1)
        {
            if (tid % s == 0)
            {
                uint idx = 2 * tid + offset;
                if (idx + s < n)
                {
                    atomicAdd(a + idx, a[idx + s]);
                }
            }
            __syncthreads();
        }

        if ((offset != 0) && (tid == 0))
        {
            atomicAdd(a, a[offset]);  // add each block's partial sum to a[0]
        }
    }
    """)
    sum_array = mod.get_function("sum_array")

    a_gpu = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod(a_gpu, a)

    sum_array(a_gpu, np.uint32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

    cuda.memcpy_dtoh(b, a_gpu)
    print("GPU result:", b[0])
    print("CPU result:", a.sum())
```
The idea behind the algorithm is illustrated nicely in NVIDIA's official slides on parallel reduction with CUDA: https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
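As a rough illustration, here is the same interleaved pairwise scheme written as plain NumPy on the CPU. The helper `tree_reduce` is hypothetical and only mirrors what a single block of the kernel does to its chunk of data, not the kernel itself:

```python
import numpy as np

def tree_reduce(chunk):
    """Interleaved pairwise reduction: at each pass, element i absorbs
    element i + s, and the stride s doubles until one value remains."""
    chunk = chunk.copy()
    s = 1
    while s < len(chunk):
        for i in range(0, len(chunk) - s, 2 * s):
            chunk[i] += chunk[i + s]
        s <<= 1
    return chunk[0]

data = np.random.rand(2048)           # one block's worth of data (2 * block_size)
print(tree_reduce(data), data.sum())  # both print (almost) the same value
```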
If the array has N elements, the reduction needs only O(log N) parallel steps: the total number of additions is still O(N), but they are spread across many threads, and each thread only waits for a logarithmic number of strided passes.
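A quick back-of-the-envelope check of that claim, using the same block_size as above (purely illustrative numbers):

```python
import math

n = 400_000_000
block_size = 1024

# Each block reduces 2 * block_size elements in log2(2 * block_size) strided passes,
# which matches the kernel's loop over s = 1, 2, 4, ..., blockDim.x.
steps_per_block = int(math.log2(2 * block_size))   # 11 passes per block
# Afterwards, one atomicAdd per block folds the partial sums into a[0].
blocks = math.ceil(n / (2 * block_size))

print(steps_per_block, "passes per block,", blocks, "blocks")
```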