3.Power 2 the array
from numba import cuda , jitimport numpy as np
@cuda.jit
def add(x,y,out):
col,row=cuda.grid(2)
out[row,col]=x[row,col]+y[row,col]
x=np.ones((32,32))
d_x=cuda.to_device(x)
y=np.arange((32*32)).reshape((32,32))
d_y=cuda.to_device(y)
#z=np.zeros((300,10)).astype(np.float32)
d_out = cuda.device_array_like(d_x)
thread=(4,4)
block=(8,8)
add[block,thread](d_x,d_y,d_out)
cuda.synchronize()
d_out.copy_to_host()
4.hypot:
from math import hypot
from numba import cuda
@cuda.jit
def hypot_stride(a, b, c):
start=cuda.grid(1)
stride=cuda.gridsize(1)
for i in range(start,a.shape[0],stride):
c[i] = hypot(a[i], b[i])
# You do not need to modify the contents in this cell
n = 1000000
a = np.random.uniform(-12, 12, n).astype(np.float32)
b = np.random.uniform(-12, 12, n).astype(np.float32)
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.device_array_like(d_b)
blocks = 128
threads_per_block = 64
hypot_stride[blocks, threads_per_block](d_a, d_b, d_c)
from math import hypot
from numba import cuda
@cuda.jit
def hypot_stride(a, b, c):
start=cuda.grid(1)
stride=cuda.gridsize(1)
for i in range(start,a.shape[0],stride):
c[i] = hypot(a[i], b[i])
# You do not need to modify the contents in this cell
n = 1000000
a = np.random.uniform(-12, 12, n).astype(np.float32)
b = np.random.uniform(-12, 12, n).astype(np.float32)
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.device_array_like(d_b)
blocks = 128
threads_per_block = 64
hypot_stride[blocks, threads_per_block](d_a, d_b, d_c)
5.histogram
def cpu_histogram(x, xmin, xmax, histogram_out):
nbins = histogram_out.shape[0]
bin_width = (xmax - xmin) / nbins
for element in x:
bin_number = np.int32((element - xmin)/bin_width)
if bin_number >= 0 and bin_number < histogram_out.shape[0]:
histogram_out[bin_number] += 1
@cuda.jit
def cuda_histogram(x, xmin, xmax, histogram_out):
start = cuda.grid(1)
stride = cuda.gridsize(1)
nbins = histogram_out.shape[0]
bin_width = (xmax - xmin) / nbins
for i in range(start,x.shape[0],stride):
bin_number= np.int32((x[i]-xmin)/bin_width)
if bin_number >= 0 and bin_number < histogram_out.shape[0]:
cuda.atomic.add(histogram_out, bin_number, 1)
def cpu_histogram(x, xmin, xmax, histogram_out):
nbins = histogram_out.shape[0]
bin_width = (xmax - xmin) / nbins
for element in x:
bin_number = np.int32((element - xmin)/bin_width)
if bin_number >= 0 and bin_number < histogram_out.shape[0]:
histogram_out[bin_number] += 1
@cuda.jit
def cuda_histogram(x, xmin, xmax, histogram_out):
start = cuda.grid(1)
stride = cuda.gridsize(1)
nbins = histogram_out.shape[0]
bin_width = (xmax - xmin) / nbins
for i in range(start,x.shape[0],stride):
bin_number= np.int32((x[i]-xmin)/bin_width)
if bin_number >= 0 and bin_number < histogram_out.shape[0]:
cuda.atomic.add(histogram_out, bin_number, 1)
5. 2D Threading:
import numpy as np
from numba import cuda
@cuda.jit
def add_2D_coordinates_stride(A):
grid_col, grid_row = cuda.grid(2)
# By passing `2`, we get the grid size in both the x an y dimensions
stride_col, stride_row = cuda.gridsize(2)
for data_row in range(grid_row, A.shape[0], stride_row):
for data_col in range(grid_col, A.shape[1], stride_col):
A[data_row][data_col] = grid_row + grid_col
from numba import cuda
@cuda.jit
def add_2D_coordinates_stride(A):
grid_col, grid_row = cuda.grid(2)
# By passing `2`, we get the grid size in both the x an y dimensions
stride_col, stride_row = cuda.gridsize(2)
for data_row in range(grid_row, A.shape[0], stride_row):
for data_col in range(grid_col, A.shape[1], stride_col):
A[data_row][data_col] = grid_row + grid_col
def add_matrix_stride(A, B, C):
grid_col,grid_row = cuda.grid(2)
strid_col,strid_row =cuda.gridsize(2)
for element_row in range(grid_row,A.shape[0],strid_row):
for element_col in range(grid_col,B.shape[1],strid_col):
C[element_row,element_col] = A[element_row,element_col] + B[element_row,element_col]
import numpy as np
from numba import cuda
@cuda.jit
def mm_stride(A, B, C):
grid_col, grid_row = cuda.grid(2)
stride_col, stride_row = cuda.gridsize(2)
for data_row in range(grid_row,A.shape[0],stride_row):data_row
for data_column in range(grid_col,B.shape[1],stride_col): data_column
sum = 0
for i in range(A.shape[1]): # B.shape[0] would also be okay here
sum += A[data_row][i] * B[i][data_column]
C[data_row][data_column] = sum
from numba import cuda
@cuda.jit
def mm_stride(A, B, C):
grid_col, grid_row = cuda.grid(2)
stride_col, stride_row = cuda.gridsize(2)
for data_row in range(grid_row,A.shape[0],stride_row):data_row
for data_column in range(grid_col,B.shape[1],stride_col): data_column
sum = 0
for i in range(A.shape[1]): # B.shape[0] would also be okay here
sum += A[data_row][i] * B[i][data_column]
C[data_row][data_column] = sum
5. 2D Threading mit Speicher teilen für Matrix multipikation:
def mm_shared(a, b, c):
column, row = cuda.grid(2)
sum = 0
stride_column,stride_row=cuda.gridsize(2)
a_cache = cuda.shared.array(block_size, types.int32)
b_cache = cuda.shared.array(block_size, types.int32)
tx=cuda.threadIdx.x
ty=cuda.threadIdx.y
for i in range(row,a.shape[0],stride_row):
a_cache[ty,tx]=a[i,tx]
cuda.syncthreads()
for j in range(column,b.shape[1],stride_column):
b_cache[ty,tx]=b[ty,j]
cuda.syncthreads()
for i in range(b_cache.shape[0]):
sum += a_cache[ty][i] * b_cache[i][tx]
c[row][column] = sum
column, row = cuda.grid(2)
sum = 0
stride_column,stride_row=cuda.gridsize(2)
a_cache = cuda.shared.array(block_size, types.int32)
b_cache = cuda.shared.array(block_size, types.int32)
tx=cuda.threadIdx.x
ty=cuda.threadIdx.y
for i in range(row,a.shape[0],stride_row):
a_cache[ty,tx]=a[i,tx]
cuda.syncthreads()
for j in range(column,b.shape[1],stride_column):
b_cache[ty,tx]=b[ty,j]
cuda.syncthreads()
for i in range(b_cache.shape[0]):
sum += a_cache[ty][i] * b_cache[i][tx]
c[row][column] = sum
M = 128
N = 32
# Input vectors of MxN and NxM dimensions
a = np.arange(M*N).reshape(M,N).astype(np.int32)
b = np.arange(M*N).reshape(N,M).astype(np.int32)
c = np.zeros((M, M)).astype(np.int32)
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)
block_size = (N,N)
grid_size = (int(M/N),int(M/N))
N = 32
# Input vectors of MxN and NxM dimensions
a = np.arange(M*N).reshape(M,N).astype(np.int32)
b = np.arange(M*N).reshape(N,M).astype(np.int32)
c = np.zeros((M, M)).astype(np.int32)
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)
block_size = (N,N)
grid_size = (int(M/N),int(M/N))
mm_shared[grid_size, block_size](d_a, d_b, d_c)
from numpy import testing
solution = a@b
output = d_c.copy_to_host()
testing.assert_array_equal(output, solution)
solution = a@b
output = d_c.copy_to_host()
testing.assert_array_equal(output, solution)