Looping in Python with CUDA

How to solve looping in Python with CUDA

I am trying to solve a large number of coupled differential equations in a reasonable amount of time. Solving them with plain NumPy quickly becomes very slow, because the number of equation sets I want to solve is on the order of 10^7 for a large number of iterations.

This is essentially a large batch of parallel matrix operations, which seems like a good task for a GPU. Using the vectorize and CUDA jit decorators on the equations helps make the code considerably faster than plain NumPy. The problem I have is that, to make this work, I copy the data off the GPU in every step of the loop. As I understand it, that is a slow process, so if possible I would like to run the whole loop on the GPU and only copy the data back once the calculation is finished, but I am not sure how to set up such a loop to run on the GPU.

Here is a simplified version of what I have so far:



import numpy as np
import matplotlib.pyplot as plt

from numba import jit
from numba import vectorize
from numba import cuda


## size of matrices
N=4

## Time for integration
T=0.1
## Length of Time steps
dt=0.0001
## Number of time steps
n = int(T/dt)


##mockup initial values, real arrays are much larger
F_0 = np.array([[ 0.0,             -1.74998928e-03,           0.0,                     0.0],
                [ 1.74998928e-03,   0.00000000e+00+0.00005j,  -4.42925874e-19+1j,      1.74998928e-03-1j],
                [ 4.42925874e-19,   0.0,                       1.74998928e-03,         0.0],
                [ 0.0,              0.0,                       0.0,                    0.0]],dtype = np.complex64)


G_0 = np.array([[ 0.00000000e+00,           3.06247186e-06,        0.0,                      0.0],
                [ 3.06247186e-06,           1.0,                   0.0,                      0.0],
                [ 3.06247186e-06+0.0005j,   1.00000000e+00,        3.06247186e-06-0.0004j,   0.0],
                [ 0.0,                      3.06247186e-06-0.04j,  0.0,                      0.0]],dtype = np.complex64)


delta_i = np.complex64(1)



### Time derivatives of functions we want to integrate
@cuda.jit(device=True)
def dFdt(F,G,delta):
    first_factor = -2j*F
    second_factor = -1j*delta*(2*G-1)
    return first_factor+second_factor

@cuda.jit(device=True)
def dGdt(F,delta):
    prefactor = -1j
    secondfactor =(delta.conjugate())*F -delta*F.conjugate()
    return prefactor*secondfactor

#Standard RK4 for the specified equation set, all inputs are NxN complex matrices
@vectorize(['c8(c8,c8,c8,c8,c8)'],target='cuda')
def RK4_step_F(G,F,delta,dt,index):

    G1 = dGdt(F,delta)
    F1 = dFdt(F,G,delta)
    G2 = dGdt(F + dt/2*F1,delta)
    F2 = dFdt(F + dt/2*F1,G + dt/2*G1,delta)
    G3 = dGdt(F + dt/2*F2,delta)
    F3 = dFdt(F + dt/2*F2,G + dt/2*G2,delta)
    F4 = dFdt(F + dt*F3,G + dt*G3,delta)
    return (F+(dt/6)*(F1 +(2*F2) +(2*F3) + F4))


@vectorize(['c8(c8,c8,c8,c8,c8)'],target='cuda')
def RK4_step_G(G,F,delta,dt,index):

    G1 = dGdt(F,delta)
    F1 = dFdt(F,G,delta)
    G2 = dGdt(F + dt/2*F1,delta)
    F2 = dFdt(F + dt/2*F1,G + dt/2*G1,delta)
    G3 = dGdt(F + dt/2*F2,delta)
    F3 = dFdt(F + dt/2*F2,G + dt/2*G2,delta)
    G4 = dGdt(F + dt*F3,delta)
    return (G + (dt/6)*(G1+ (2*G2) + (2*G3) + G4))


### Runs the integration. F_0 and G_0 are NxN matrices of initial values for the functions F and G,
### delta_i is the initial value of the parameter delta, dt is the step size and n is the number of steps
def RK4_method(F_0,G_0):
    delta = np.zeros(n+1,dtype=np.complex64)
    delta[0]= delta_i
    for i in range(n):


        RK4_step_G( G_0,F_0,delta[i],dt,0,out = out_G)
        RK4_step_F( G_0,F_0,delta[i],dt,0,out = out_F)
        F_0 = out_F.copy_to_host()
        G_0 = out_G.copy_to_host()

        delta[i+1] = np.sum(F_0)
        print("working")

    return delta

###copying initial matrices
G_cuda = cuda.to_device(G_0)
F_cuda = cuda.to_device(F_0)

##defining matrices
out_F = cuda.device_array(shape=(N,N),dtype=np.complex64)
out_G = cuda.device_array(shape=(N,N),dtype=np.complex64)


##start calculation
delta_time = RK4_method(F_cuda,G_cuda)

time = np.linspace(0,T,n + 1)

plt.plot(time[1:],delta_time[1:])
plt.show()



I would like to know how I can set up a loop like this without having to copy the data back and forth.

My experience is almost entirely in Python, so CUDA is still a bit new to me.

Edit: included more complete code

Solution

This should be a fairly straightforward set of changes to do what you want. Unfortunately, numba vectorize seems to have a characteristic that complicates things, so we have to add some extra plumbing to reach the goal. Related to this: if you converted everything to @cuda.jit kernels you could probably reduce the overall code complexity, but since your implementation is focused on @vectorize I will try to stay as close to that as possible, although in the end I will need to use one numba CUDA kernel.
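As a rough, minimal sketch of what that characteristic looks like in practice (the ufunc scale and all names below are invented for illustration; they are not from the code in question): the same cuda-target ufunc is called once with a bare scalar and once with a pre-staged, full-size device array. The scalar form is convenient, but it leaves numba to stage a temporary array for the scalar on every call, which is the per-iteration host->device traffic we want to avoid; the second form keeps everything resident on the GPU.

import numpy as np
from numba import vectorize, cuda

# hypothetical ufunc, for illustration only
@vectorize(['c8(c8,c8)'], target='cuda')
def scale(x, s):
    return x*s

N = 4
d_x   = cuda.to_device(np.ones((N,N), dtype=np.complex64))
d_out = cuda.device_array(shape=(N,N), dtype=np.complex64)

# scalar argument: numba broadcasts it, staging a temporary device array per call
scale(d_x, np.complex64(2.0), out=d_out)

# "scalar array": build the constant once as an NxN device array and reuse it,
# so repeated calls launch kernels without new host->device copies for it
d_s = cuda.to_device(np.full((N,N), np.complex64(2.0), dtype=np.complex64))
scale(d_x, d_s, out=d_out)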

The goal is to be able to run the for-loop as a set of back-to-back kernel calls, with the minimum amount of data transfer (host->device or device->host) during the loop processing.

Before getting to the changes needed to achieve that, it may be useful/instructive to run your code as-is under a profiler, to get an idea of the "unnecessary" copying that happens on every loop iteration and that we are trying to avoid. To keep things simple we will reduce the number of loop iterations to 2, and since we are interested in profiling the code rather than in the results, we will also remove the plotting:

$ cat t39.py
import numpy as np
# import matplotlib.pyplot as plt

from numba import jit
from numba import vectorize
from numba import cuda


## size of matrices
N=4

## Time for integration
T=0.1
## Length of Time steps
dt=0.0001
## Number of time steps
n = int(T/dt)


##mockup initial values, real arrays are much larger
F_0 = np.array([[ 0.0,             -1.74998928e-03,           0.0,                     0.0],
                [ 1.74998928e-03,   0.00000000e+00+0.00005j,  -4.42925874e-19+1j,      1.74998928e-03-1j],
                [ 4.42925874e-19,   0.0,                       1.74998928e-03,         0.0],
                [ 0.0,              0.0,                       0.0,                    0.0]],dtype = np.complex64)


G_0 = np.array([[ 0.00000000e+00,           3.06247186e-06,        0.0,                      0.0],
                [ 3.06247186e-06,           1.0,                   0.0,                      0.0],
                [ 3.06247186e-06+0.0005j,   1.00000000e+00,        3.06247186e-06-0.0004j,   0.0],
                [ 0.0,                      3.06247186e-06-0.04j,  0.0,                      0.0]],dtype = np.complex64)


delta_i = np.complex64(1)



### Time derivatives of functions we want to integrate
@cuda.jit(device=True)
def dFdt(F,G,delta):
    first_factor = -2j*F
    second_factor = -1j*delta*(2*G-1)
    return first_factor+second_factor

@cuda.jit(device=True)
def dGdt(F,delta):
    prefactor = -1j
    secondfactor =(delta.conjugate())*F -delta*F.conjugate()
    return prefactor*secondfactor

#Standard RK4 for the specified equation set, all inputs are NxN complex matrices
@vectorize(['c8(c8,c8,c8,c8,c8)'],target='cuda')
def RK4_step_F(G,F,delta,dt,index):

    G1 = dGdt(F,delta)
    F1 = dFdt(F,G,delta)
    G2 = dGdt(F + dt/2*F1,delta)
    F2 = dFdt(F + dt/2*F1,G + dt/2*G1,delta)
    G3 = dGdt(F + dt/2*F2,delta)
    F3 = dFdt(F + dt/2*F2,G + dt/2*G2,delta)
    F4 = dFdt(F + dt*F3,G + dt*G3,delta)
    return (F+(dt/6)*(F1 +(2*F2) +(2*F3) + F4))


@vectorize(['c8(c8,c8,c8,c8,c8)'],target='cuda')
def RK4_step_G(G,F,delta,dt,index):

    G1 = dGdt(F,delta)
    F1 = dFdt(F,G,delta)
    G2 = dGdt(F + dt/2*F1,delta)
    F2 = dFdt(F + dt/2*F1,G + dt/2*G1,delta)
    G3 = dGdt(F + dt/2*F2,delta)
    F3 = dFdt(F + dt/2*F2,G + dt/2*G2,delta)
    G4 = dGdt(F + dt*F3,delta)
    return (G + (dt/6)*(G1+ (2*G2) + (2*G3) + G4))


### Runs the integration. F_0 and G_0 are NxN matrices of initial values for the functions F and G,
### delta_i is the initial value of the parameter delta, dt is the step size and n is the number of steps
def RK4_method(F_0,G_0):
    delta = np.zeros(n+1,dtype=np.complex64)
    delta[0]= delta_i
    for i in range(2):


        RK4_step_G( G_0,F_0,delta[i],dt,0,out = out_G)
        RK4_step_F( G_0,F_0,delta[i],dt,0,out = out_F)
        F_0 = out_F.copy_to_host()
        G_0 = out_G.copy_to_host()

        delta[i+1] = np.sum(F_0)
#        print("working")

    return delta

###copying initial matrices
G_cuda = cuda.to_device(G_0)
F_cuda = cuda.to_device(F_0)

##defining matrices
out_F = cuda.device_array(shape=(N,N),dtype=np.complex64)
out_G = cuda.device_array(shape=(N,N),dtype=np.complex64)


##start calculation
delta_time = RK4_method(F_cuda,G_cuda)

#time = np.linspace(0,T,n + 1)

#plt.plot(time[1:],delta_time[1:])
#plt.show()
$ nvprof --print-gpu-trace python t39.py
==30045== NVPROF is profiling process 30045,command: python t39.py
==30045== Profiling application: python t39.py
==30045== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
319.56ms  1.4080us                    -               -         -         -         -      128B  86.698MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
319.98ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
321.38ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
321.78ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
322.17ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
491.96ms  7.3290us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_G$248(Array<complex64,int=1,C,mutable,aligned>,Array<complex64,aligned>) [84]
493.21ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
493.61ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
494.00ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
653.33ms  6.2400us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_F$249(Array<complex64,aligned>) [130]
653.64ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]
653.73ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]
654.53ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
654.93ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
655.32ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
655.72ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
656.11ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
656.92ms  5.8250us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_G$248(Array<complex64,aligned>) [169]
657.82ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
658.22ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
658.62ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
659.01ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
659.40ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
660.03ms  5.3450us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_F$249(Array<complex64,aligned>) [213]
660.24ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]
660.33ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$

Looking at the nvprof output above, we see that for each of the 2 loop iterations there are 2 kernel calls (one per vectorize function), but also 10 or more copy operations. Six of those are due to the scalar/temporary-array issue mentioned above.

We will use the following techniques to improve the per-iteration copy situation. We won't be able to eliminate it entirely, but we can reduce it to a very low level that does not grow with the problem size. We will do the following:

  1. Move all the input arrays (e.g. F and G) to the device and leave them there.
  2. Create a ping-pong arrangement in the loop, so that on alternating passes through the loop the input arrays become the output arrays and vice versa. The ping-pong means we can avoid copying the output back to the input at the end of each pass, which is effectively what you are doing now (see the sketch after this list).
  3. Because of the numba vectorize issue mentioned earlier, we will manually convert the scalar inputs (e.g. dt) into device arrays of the same size (NxN) as the other input arrays. This is the only method I found that prevents numba from creating its own temporary arrays and copying them to the device on every loop iteration.
  4. You perform a host-based operation (np.sum) on every pass through the loop. We need to keep the data on the device and use a device-based operation for that. Fortunately, numba provides it through its special reduce functionality.
  5. The result of the reduce can stay on the device, but it is still only a single value. That puts us back in the scalar boat again. We need to convert it back into an NxN array to avoid the vectorize issue already discussed. I considered several numba "native" methods such as guvectorize, but eventually gave up and used a numba CUDA kernel (@cuda.jit) to perform a distribute function (take a single value and broadcast it across an entire array).
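As a rough, self-contained sketch of the ping-pong idea in item 2 (the ufunc step and the buffer names below are invented for illustration, not taken from the code we are working with):

import numpy as np
from numba import vectorize, cuda

# hypothetical one-argument ufunc standing in for the RK4 step functions
@vectorize(['c8(c8)'], target='cuda')
def step(x):
    return x + 1j

N = 4
in_buf  = cuda.to_device(np.zeros((N,N), dtype=np.complex64))
out_buf = cuda.device_array(shape=(N,N), dtype=np.complex64)

for i in range(10):
    step(in_buf, out=out_buf)           # kernel reads in_buf and writes out_buf, all on the device
    in_buf, out_buf = out_buf, in_buf   # swap the references; no device->host->device copy

result = in_buf.copy_to_host()          # one copy back, after the loop finishes

In the full code below the same swap is written out with explicit temporaries (tmp_G/tmp_F), which accomplishes the same thing.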

With those significant changes in place, we can end up with code that performs only a single device-to-device copy of a single complex64 quantity per loop iteration (produced by the reduce operation in step 4), plus a single device-to-host copy of a single complex64 quantity per loop iteration to transfer each iteration's result back to the host array.

For a small test case (e.g. N = 4) this makes little difference; the whole problem is trivial there anyway. For the larger test case demonstrated below, these refactoring changes give roughly a 5x improvement in overall performance. The improvement will vary with how many loop iterations are ultimately run, and it depends heavily on the problem size.

Here is a complete example. You can profile it using a technique similar to the one shown above (I suggest reducing the loop count to something manageable, such as 3) to see the difference in per-loop-iteration behavior:

$ cat t40.py
import numpy as np
# import matplotlib.pyplot as plt
import time
from numba import jit
from numba import vectorize
from numba import cuda


## size of matrices
N=4

## Time for integration
T =0.01
## Length of Time steps
dt=0.0001
## Number of time steps
n = int(T/dt)


##mockup initial values, real arrays are much larger
#create a larger test case
N=512
F_0 = np.ones((N,N),dtype=np.complex64)
G_0 = np.ones((N,N),dtype=np.complex64)


delta_i = np.complex64(1)



### Time derivatives of functions we want to integrate
@cuda.jit(device=True)
def dFdt(F,G,delta):
    first_factor = -2j*F
    second_factor = -1j*delta*(2*G-1)
    return first_factor+second_factor

@cuda.jit(device=True)
def dGdt(F,delta):
    prefactor = -1j
    secondfactor =(delta.conjugate())*F -delta*F.conjugate()
    return prefactor*secondfactor

#Standard RK4 for the specified equation set, all inputs are NxN complex matrices
@vectorize(['c8(c8,c8,c8,c8,c8)'],target='cuda')
def RK4_step_F(G,F,delta,dt,index):

    G1 = dGdt(F,delta)
    F1 = dFdt(F,G,delta)
    G2 = dGdt(F + dt/2*F1,delta)
    F2 = dFdt(F + dt/2*F1,G + dt/2*G1,delta)
    G3 = dGdt(F + dt/2*F2,delta)
    F3 = dFdt(F + dt/2*F2,G + dt/2*G2,delta)
    F4 = dFdt(F + dt*F3,G + dt*G3,delta)
    return (F+(dt/6)*(F1 +(2*F2) +(2*F3) + F4))


@vectorize(['c8(c8,c8,c8,c8,c8)'],target='cuda')
def RK4_step_G(G,F,delta,dt,index):

    G1 = dGdt(F,delta)
    F1 = dFdt(F,G,delta)
    G2 = dGdt(F + dt/2*F1,delta)
    F2 = dFdt(F + dt/2*F1,G + dt/2*G1,delta)
    G3 = dGdt(F + dt/2*F2,delta)
    F3 = dFdt(F + dt/2*F2,G + dt/2*G2,delta)
    G4 = dGdt(F + dt*F3,delta)
    return (G + (dt/6)*(G1+ (2*G2) + (2*G3) + G4))

#code modifications begin here, the above functions are unmodified

@cuda.reduce
def sum_reduce(a,b):
    return a+b

@cuda.jit
def distribute(A,B,s):
    i = cuda.grid(1)
    if i < s:
        B[i] = A[0]


### Runs the integration. F_arr and G_arr are NxN matrices of initial values for the functions F and G,
### delta_i is the initial value of the parameter delta, dt is the step size and n is the number of steps
def RK4_method_new(F_arr,G_arr):
    delta_h = np.zeros(shape=(n+1),dtype=np.complex64)
    delta_h[0] = delta_i
    ##defining device matrices
    in_F = cuda.to_device(F_arr)
    in_G = cuda.to_device(G_arr)
    out_F = cuda.device_array(shape=(N,N),dtype=np.complex64)
    out_G = cuda.device_array(shape=(N,N),dtype=np.complex64)
    #setup "scalar arrays"
    dt_arr    = np.full((N,N),np.complex64(dt),dtype=np.complex64)
    dev_dt    = cuda.to_device(dt_arr)
    index_arr = np.full((N,N),np.complex64(0),dtype=np.complex64)
    dev_index = cuda.to_device(index_arr)
    delta_arr = np.full((N,N),delta_i,dtype=np.complex64)
    dev_delta = cuda.to_device(delta_arr)
    b = 256
    g = (N*N+b-1)//b
    for i in range(n):

        RK4_step_G( in_G,in_F,dev_delta,dev_dt,dev_index,out = out_G)
        RK4_step_F( in_G,in_F,dev_delta,dev_dt,dev_index,out = out_F)
        # ping-pong
        tmp_G = in_G
        tmp_F = in_F
        in_G = out_G
        in_F = out_F
        out_G = tmp_G
        out_F = tmp_F

        sum_reduce(in_F.reshape(N*N),res = dev_delta.reshape(N*N)) # causes 8 byte D->D copy
        distribute[g,b](dev_delta.reshape(N*N),dev_delta.reshape(N*N),N*N)
        delta_h[i+1] = dev_delta[0,0] # causes 8 byte D->H copy
#        print("working")
    return delta_h

def RK4_method(F_0,G_0):
    delta = np.zeros(n+1,dtype=np.complex64)
    delta[0]= delta_i
    out_F = cuda.device_array(shape=(N,N),dtype=np.complex64)
    out_G = cuda.device_array(shape=(N,N),dtype=np.complex64)
    for i in range(n):


        RK4_step_G( G_0,F_0,delta[i],dt,0,out = out_G)
        RK4_step_F( G_0,F_0,delta[i],dt,0,out = out_F)
        F_0 = out_F.copy_to_host()
        G_0 = out_G.copy_to_host()

        delta[i+1] = np.sum(F_0)
    #    print("working")

    return delta

##start calculation
# "warm-up runs,to get JIT compilation effects removed from timing comparison
delta_time     = RK4_method(F_0,G_0)
delta_time_new = RK4_method_new(F_0,G_0)
s = time.time()
delta_time     = RK4_method(F_0,G_0)
e = time.time()
print("old time: ",e-s)
s = time.time()
delta_time_new = RK4_method_new(F_0,G_0)
e = time.time()
print("new time: ",e-s)
for i in range(3):
    print("delta time: ",delta_time[i]," delta_time_new: ",delta_time_new[i])
#time = np.linspace(0,T,n + 1)
#plt.plot(time[1:],delta_time[1:])
#plt.show()
$ python t40.py
old time:  2.48877215385437
new time:  0.5504293441772461
delta time:  (1+0j)  delta_time_new:  (1+0j)
delta time:  (262143.98-78.64321j)  delta_time_new:  (262143.98-78.64322j)
delta time:  (1361288.4+3141394700j)  delta_time_new:  (1361289.1+3141394700j)
$
