CUDA and Python:


Here I would like to share my experience with CUDA programming in Python.

3. Adding two 2D arrays:

from numba import cuda
import numpy as np
@cuda.jit
def add(x, y, out):
    # One thread per element: cuda.grid(2) returns this thread's global (x, y) index
    col, row = cuda.grid(2)
    out[row, col] = x[row, col] + y[row, col]
x=np.ones((32,32))
d_x=cuda.to_device(x)
y=np.arange((32*32)).reshape((32,32))
d_y=cuda.to_device(y)
d_out = cuda.device_array_like(d_x)
threads_per_block = (4, 4)
blocks_per_grid = (8, 8)  # 8 * 4 = 32 threads per dimension, one thread per array element
add[blocks_per_grid, threads_per_block](d_x, d_y, d_out)
cuda.synchronize()
out = d_out.copy_to_host()
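A quick host-side check, not part of the original snippet, to confirm the kernel matches NumPy's element-wise addition:

np.testing.assert_array_almost_equal(out, x + y)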

4. hypot:
from math import hypot
import numpy as np
from numba import cuda

@cuda.jit
def hypot_stride(a, b, c):
    # Grid-stride loop: each thread starts at its global index and advances
    # by the total number of threads in the grid until the array is covered
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, a.shape[0], stride):
        c[i] = hypot(a[i], b[i])

n = 1000000
a = np.random.uniform(-12, 12, n).astype(np.float32)
b = np.random.uniform(-12, 12, n).astype(np.float32)
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.device_array_like(d_b)

blocks = 128
threads_per_block = 64  # 128 * 64 = 8192 threads handle the 1,000,000 elements via the stride loop

hypot_stride[blocks, threads_per_block](d_a, d_b, d_c)
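On the host, np.hypot gives the reference result to compare against (this check is my addition, not part of the original cell):

np.testing.assert_allclose(d_c.copy_to_host(), np.hypot(a, b), rtol=1e-5)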


5. Histogram:
def cpu_histogram(x, xmin, xmax, histogram_out):

    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins

    for element in x:
        bin_number = np.int32((element - xmin) / bin_width)
        if 0 <= bin_number < nbins:
            histogram_out[bin_number] += 1


@cuda.jit
def cuda_histogram(x, xmin, xmax, histogram_out):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins
    for i in range(start, x.shape[0], stride):
        bin_number = np.int32((x[i] - xmin) / bin_width)
        if 0 <= bin_number < nbins:
            # Several threads may hit the same bin, so the increment must be atomic
            cuda.atomic.add(histogram_out, bin_number, 1)
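The kernels above are never launched in the original notes, so here is a minimal driver I would use to compare the two versions; the data, bin count, and launch configuration are my own choices:

x = np.random.normal(0, 1, 100_000).astype(np.float32)
xmin, xmax = np.float32(-4.0), np.float32(4.0)

histogram_cpu = np.zeros(10, dtype=np.int32)
cpu_histogram(x, xmin, xmax, histogram_cpu)

d_x = cuda.to_device(x)
d_histogram_out = cuda.to_device(np.zeros(10, dtype=np.int32))
cuda_histogram[64, 64](d_x, xmin, xmax, d_histogram_out)

# The two histograms should agree (up to possible rounding right at bin edges)
print(histogram_cpu)
print(d_histogram_out.copy_to_host())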
 

6. 2D Threading:
import numpy as np
from numba import cuda
@cuda.jit
def add_2D_coordinates_stride(A):

    grid_col, grid_row = cuda.grid(2)
    # By passing `2`, we get the grid size in both the x and y dimensions
    stride_col, stride_row = cuda.gridsize(2)
 
    for data_row in range(grid_row, A.shape[0], stride_row):
        for data_col in range(grid_col, A.shape[1], stride_col):
            A[data_row][data_col] = grid_row + grid_col
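A launch I would use to exercise the stride loops; the array shape and launch configuration below are illustrative choices, not from the original:

d_A = cuda.to_device(np.zeros((64, 64), dtype=np.int32))
# Deliberately fewer threads (16 per dimension) than elements, so each thread loops several times
add_2D_coordinates_stride[(2, 2), (8, 8)](d_A)
print(d_A.copy_to_host()[:4, :4])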


@cuda.jit
def add_matrix_stride(A, B, C):
    grid_col, grid_row = cuda.grid(2)
    stride_col, stride_row = cuda.gridsize(2)
    for element_row in range(grid_row, A.shape[0], stride_row):
        for element_col in range(grid_col, B.shape[1], stride_col):
            C[element_row, element_col] = A[element_row, element_col] + B[element_row, element_col]
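As with the other kernels, a small launch to try it out (the sizes and configuration here are my own):

A = np.arange(64 * 64).reshape(64, 64).astype(np.float32)
B = A * 2
d_C = cuda.device_array_like(A)
add_matrix_stride[(4, 4), (16, 16)](cuda.to_device(A), cuda.to_device(B), d_C)
np.testing.assert_array_equal(d_C.copy_to_host(), A + B)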
@cuda.jit
def mm_stride(A, B, C):
    grid_col, grid_row = cuda.grid(2)
    stride_col, stride_row = cuda.gridsize(2)

    # Each thread computes the C elements assigned to it by the 2D stride loops
    for data_row in range(grid_row, A.shape[0], stride_row):
        for data_column in range(grid_col, B.shape[1], stride_col):
            sum = 0
            for i in range(A.shape[1]):  # B.shape[0] would also be okay here
                sum += A[data_row][i] * B[i][data_column]
            C[data_row][data_column] = sum
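To verify mm_stride against NumPy's matmul (the shapes and launch configuration are chosen here just for illustration):

a = np.arange(16 * 8).reshape(16, 8).astype(np.float32)
b = np.arange(8 * 16).reshape(8, 16).astype(np.float32)
d_c = cuda.device_array((16, 16), dtype=np.float32)
mm_stride[(2, 2), (8, 8)](cuda.to_device(a), cuda.to_device(b), d_c)
np.testing.assert_array_equal(d_c.copy_to_host(), a @ b)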


7. 2D Threading with shared memory for matrix multiplication:
import numpy as np
from numba import cuda, types

# `block_size` is defined below; it only needs to exist by the time the kernel is
# first launched, because cuda.shared.array requires a compile-time constant shape.
@cuda.jit
def mm_shared(a, b, c):
    column, row = cuda.grid(2)
    sum = 0
    stride_column, stride_row = cuda.gridsize(2)

    a_cache = cuda.shared.array(block_size, types.int32)
    b_cache = cuda.shared.array(block_size, types.int32)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # With grid_size = (M/N, M/N) and block_size = (N, N), each stride loop below runs
    # exactly once, so every thread loads one element of a and one element of b.
    for i in range(row, a.shape[0], stride_row):
        a_cache[ty, tx] = a[i, tx]
    cuda.syncthreads()
    for j in range(column, b.shape[1], stride_column):
        b_cache[ty, tx] = b[ty, j]
    cuda.syncthreads()

    # The dot product for this thread's output element reads only from shared memory
    for i in range(b_cache.shape[0]):
        sum += a_cache[ty][i] * b_cache[i][tx]

    c[row][column] = sum
M = 128
N = 32

# Input matrices of M x N and N x M dimensions
a = np.arange(M*N).reshape(M,N).astype(np.int32)
b = np.arange(M*N).reshape(N,M).astype(np.int32)
c = np.zeros((M, M)).astype(np.int32)

d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)


block_size = (N, N)
grid_size = (M // N, M // N)
mm_shared[grid_size, block_size](d_a, d_b, d_c)
from numpy import testing
solution = a@b
output = d_c.copy_to_host()

testing.assert_array_equal(output, solution)