D. h. erstellen Sie einen Kernel für einen bestimmten Parametersatz (Schleife über i, j, k):
from cupyx.scipy.ndimage import convolve as convolve_gpu
# Sweep a 3-D wavelet over every (theta, phi, xi) combination, convolve it
# with `fib_sum_log` on the GPU and record per-combination statistics.
# NOTE(review): the indentation was reconstructed from a paste that had lost
# it; confirm the loop nesting against the original script.

# Per-xi scale factor, computed with NumPy before the inputs are converted.
energy_const = (1/np.sqrt(xi_list**2+2*a**2))

# Convert every input to a CuPy array once, up front, so the loops below do
# not trigger host<->device transfers.
x = cp.asarray(x)
y = cp.asarray(y)
z = cp.asarray(z)
xi_list = cp.asarray(xi_list)
theta_list = cp.asarray(theta_list)
phi_list = cp.asarray(phi_list)
a = cp.asarray(a)
fib_sum_log = cp.asarray(fib_sum_log)
energy_const = cp.asarray(energy_const)

# Pre-calculate wavelet energies (if needed)
wavelet_ener = cp.zeros(xi_list.size)
for k in range(xi_list.size):
    xi = xi_list[k]
    # Use arbitrary theta/phi for the initial calculation.
    wavelet = S.skern3(x, y, z, a, theta_list[0], phi_list[0], xi)
    wavelet_ener[k] = cp.sum(cp.abs(wavelet)**2)

# Pre-allocate output arrays on the GPU, indexed [theta, phi, xi].
trans_ener = cp.zeros((theta_list.size, phi_list.size, xi_list.size))
var_image = cp.zeros((theta_list.size, phi_list.size, xi_list.size))
max_image = cp.zeros((theta_list.size, phi_list.size, xi_list.size))

# CuPy launches kernels asynchronously. Drain pending work first so the first
# row's timing does not include the setup above.
cp.cuda.Stream.null.synchronize()

for i in range(theta_list.size):
    start = time.time()
    theta = theta_list[i]
    for j in range(phi_list.size):
        phi = phi_list[j]
        for k in range(xi_list.size):
            xi = xi_list[k]
            # The wavelet depends on xi as well, so it is rebuilt for every
            # (theta, phi, xi) combination.
            wavelet = S.skern3(x, y, z, a, theta, phi, xi)

            # No explicit freeing here: rebinding the names each iteration
            # drops the reference to the previous iteration's arrays.
            G_Ixy = convolve_gpu(fib_sum_log, wavelet)

            print('Post Convolution')
            print('mempool used bytes: ' + str(mempool.used_bytes()))
            print('mempool total bytes: ' + str(mempool.total_bytes()))
            print('pinned mempool free blocks: ' + str(pinned_mempool.n_free_blocks()))
            print('=======================================================================\n')

            # NOTE(review): images_wt is never read; the statistics below use
            # the unweighted G_Ixy. Confirm whether they were meant to use
            # images_wt instead, or drop this line to save GPU memory.
            images_wt = cp.multiply(energy_const[k], G_Ixy)

            trans_ener[i, j, k] = cp.sum(cp.abs(G_Ixy)**2)
            var_image[i, j, k] = cp.sqrt(cp.var(G_Ixy))
            max_image[i, j, k] = cp.max(G_Ixy)

    # Report time. Wait for the queued GPU work of this row to finish first;
    # otherwise the wall-clock delta only measures kernel launch overhead.
    cp.cuda.Stream.null.synchronize()
    end = time.time()
    print('average time row ' + '{:.1f}'.format(100*i/num_tpoints) + ' [%]: ' + str(end-start) + ' s')
</code>
Ich habe versucht, alles innerhalb der for-Schleifen in CuPy-Arrays umzuwandeln, um Übertragungsaufwand zu vermeiden. Und es funktioniert ... aber ich muss iterieren und bin daher nicht sicher, was ich tun sollte. n_free_blocks()