You can install PyCUDA with pip:

pip install pycuda
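
If the build succeeds, a quick sanity check (assuming a working CUDA driver and at least one visible device) is:

python -c "import pycuda.driver as cuda; cuda.init(); print(cuda.Device(0).name())"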

The code to calculate the sum of the array on the GPU:

import math

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

if __name__ == "__main__":
    n = 400000000  # limited by the GPU memory size (4e8 doubles take about 3.2 GB)
    block_size = 1024  # the maximum number of threads per block
    grid_size = math.ceil(n / block_size / 2)  # each block of block_size threads reduces 2 * block_size elements (see the kernel below)

    a = np.random.rand(n)  # input
    a = a.astype(np.float64)
    b = np.zeros(1, dtype=np.float64)  # output

    mod = SourceModule("""
    __global__ void sum_array(double* a, uint n) {
        uint tid = threadIdx.x;
        uint offset = 2 * blockIdx.x * blockDim.x;  // start of this block's chunk of 2 * blockDim.x elements

        // interleaved reduction within the block: the stride s doubles every step
        for (uint s = 1; s <= blockDim.x; s <<= 1) {
            if (tid % s == 0) {
                uint idx = 2 * tid + offset;
                if (idx + s < n) {
                    atomicAdd(a + idx, a[idx + s]);  // atomic because a[0] may be updated by other blocks concurrently; atomicAdd on double needs compute capability 6.0+
                }
            }
            __syncthreads();
        }
        if ((offset != 0) && (tid == 0)) {
            atomicAdd(a, a[offset]);  // add this block's partial sum to a[0] (block 0 has already reduced into a[0])
        }
    }
    """)
    sum_array = mod.get_function("sum_array")

    a_gpu = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod(a_gpu, a)
    sum_array(a_gpu, np.uint32(n), block=(block_size, 1, 1), grid=(grid_size, 1))
    cuda.memcpy_dtoh(b, a_gpu)

    print("GPU result:", b[0])
    print("CPU result:", a.sum())

The idea behind the algorithm is illustrated well by the figures in NVIDIA's official slides on parallel reduction with CUDA: https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf

[Figure: CUDA sum of array]
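
For intuition, here is a minimal CPU sketch (plain Python, not part of the GPU script above) of the same interleaved pairwise reduction on a tiny array; each pass of the while loop plays the role of one synchronized step of the kernel:

import numpy as np

a = np.arange(8, dtype=np.float64)       # 8 elements -> log2(8) = 3 passes
s = 1
while s < len(a):
    for i in range(0, len(a), 2 * s):    # each "thread" folds its right neighbour into a[i]
        a[i] += a[i + s]
    s <<= 1                              # the stride doubles, the number of active "threads" halves
print(a[0])                              # 28.0, the sum of 0..7

The GPU kernel does the same thing inside each block, except that the additions of one pass really happen in parallel and __syncthreads() separates the passes.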

If the length of the array is N, the number of parallel reduction steps is O(log N), assuming enough threads run concurrently; the total amount of work is still O(N).
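
As a quick sanity check of the numbers used above (a sketch that just recomputes the constants from the script, nothing is measured):

import math

n, block_size = 400_000_000, 1024
elems_per_block = 2 * block_size                     # each block reduces 2048 elements
grid_size = math.ceil(n / elems_per_block)           # 195313 blocks
steps_per_block = int(math.log2(elems_per_block))    # 11 synchronized reduction steps per block
print(grid_size, steps_per_block)

Note that the GPU and CPU results printed by the script can differ in the last decimal places, because floating-point addition is not associative and the two summations use different orderings.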