# pycuda reference
# accessing a gpu

- Google Colab: https://colab.research.google.com/
- Kaggle Kernels: https://www.kaggle.com/kernels
# executing a kernel

Kernels are written in CUDA C inside a Python string and compiled with
`pycuda.compiler.SourceModule`; `get_function` then retrieves a launchable
handle. The kernel below doubles every element of a 4x4 array.

```python
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void doubleval(float *a) {
    int i = threadIdx.x + threadIdx.y*4;
    a[i] = 2 * a[i];
}
""")
func = mod.get_function("doubleval")
```
# transferring data

```python
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

# 4x4 array of random numbers
a = np.random.randn(4, 4)
# 'a' holds double-precision numbers, but older NVIDIA devices
# support only single precision, so convert to float32
a = a.astype(np.float32)
# or combine the two steps above
a = np.random.randn(4, 4).astype(np.float32)
# allocate memory on the device
d_a = drv.mem_alloc(a.nbytes)
# transfer the data to the device
drv.memcpy_htod(d_a, a)
# transfer the data back to the host
drv.memcpy_dtoh(a, d_a)
```
# shortcuts for explicit memory copies

The pycuda.driver.In, pycuda.driver.Out, and pycuda.driver.InOut argument
handlers can stand in for the explicit memory transfers above. For example,
if overwriting 'a' in place is acceptable, 'd_a' need not be created at all:

```python
func(drv.InOut(a), block=(4, 4, 1))
```
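drv.In and drv.Out work the same way for kernels with separate input and
output buffers: drv.In copies host data to the device before the launch, and
drv.Out copies the device result back afterwards. A minimal sketch, assuming
a hypothetical variant of the kernel named `doubleval_out` (not part of the
original reference):

```python
mod2 = SourceModule("""
__global__ void doubleval_out(float *src, float *dst) {
    int i = threadIdx.x + threadIdx.y*4;
    dst[i] = 2 * src[i];   // read from src, write the doubled value to dst
}
""")
func2 = mod2.get_function("doubleval_out")

out = np.empty_like(a)
# 'a' is copied in before the launch; 'out' is copied back after it
func2(drv.In(a), drv.Out(out), block=(4, 4, 1))
```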
# abstracting away the complications

Using a pycuda.gpuarray.GPUArray, the same round trip takes far less code:

```python
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

d_a = gpuarray.to_gpu(np.random.randn(4, 4).astype(np.float32))
# arithmetic on a GPUArray runs on the device; .get() copies back
h_a = (2 * d_a).get()
print(h_a)
print(d_a)
```
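A GPUArray also interoperates with hand-written kernels: its `gpudata`
attribute holds the underlying device pointer, so it can be passed to the
`doubleval` kernel compiled earlier. A small sketch under that assumption:

```python
# launch the raw kernel on the GPUArray's device buffer
func(d_a.gpudata, block=(4, 4, 1))
print(d_a.get())  # every element doubled in place
```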