测量const内存访问延迟会得出奇怪的结果

如何解决测量const内存访问延迟会得出奇怪的结果

我有3个内核，它们将两个数字相加。第一个在寄存器中添加两个数字。第二个从const存储器中获取一个数字，而从寄存器中获取另一个。第三个从const内存中获取两个数字。

根据文章“通过微基准测试剖析NVIDIA Volta GPU架构”，访问L1常量缓存的延迟约为24，而IADD指令的延迟对于Pascal是6个周期。因此，对于第一个内核，我希望其持续时间大于6，第二个内核的持续时间大于24，第三个内核的持续时间大于24。但是，当我使用clock（）测量时间时，分别得到 13 ， 12 和 214 。

我的显卡是GeForce GTX 1050 Mobile。 CUDA 10.1

编译命令：nvcc -arch = sm_61 main.cu -o main

下面我给出程序代码和SASS代码的片段。

程序代码：

#include <iostream>
#define RES_SIZE 10

__global__ void measureReg(int *res){
    int a = res[0],b = res[1],c;
    __shared__ int shdata[1];
    for(int i=0;i<150;++i) __syncthreads(); // Covers latencies of accesses to global memory
    unsigned int t1,t2;
    t1 = clock();
    asm volatile("add.s32 %0,%1,%2;": "=r"(c) : "r"(a),"r"(b) : "memory");
    shdata[0] = c; //Prevents execution of clock() before add has finished
    t2 = clock();
    res[0] = t2 - t1;
}

__global__ void measureConst1(int *res,int n1){
    int a,b;
    a = res[0];
    __shared__ int shdata[1];
    for(int i=0;i<150;++i) __syncthreads();
    unsigned int t1,%2;": "=r"(b) : "r"(a),"r"(n1) : "memory");
    shdata[0] = b;
    t2 = clock();
    res[0] = t2 - t1;
}

__global__ void measureConst2(int *res,int n1,int n2){
    int a;
    __shared__ int shdata[1];
    unsigned int t1,%2;": "=r"(a) : "r"(n1),"r"(n2) : "memory");
    shdata[0] = a;
    t2 = clock();
    res[0] = t2 - t1;
}


int main(int argc,char** argv){
    int hostRes[RES_SIZE],*devRes;
    std::fill(hostRes,hostRes + RES_SIZE,1);
    cudaMalloc(&devRes,RES_SIZE*sizeof(int));
    cudaMemcpy(devRes,hostRes,RES_SIZE*sizeof(int),cudaMemcpyHostToDevice);

    measureReg<<<1,1>>>(devRes);
    cudaMemcpy(hostRes,devRes,cudaMemcpyDeviceToHost);
    std::cout<<"IADD with registers: "<<hostRes[0]<<std::endl;

    measureConst1<<<1,1>>>(devRes,10);
    cudaMemcpy(hostRes,cudaMemcpyDeviceToHost);
    std::cout<<"IADD with register and const mem: "<<hostRes[0]<<std::endl;

    measureConst2<<<1,10,20);
    cudaMemcpy(hostRes,cudaMemcpyDeviceToHost);
    std::cout<<"IADD with const mem: "<<hostRes[0]<<std::endl;

    cudaFree(devRes);
    return 0;
}

SASS代码片段：

/* measureReg */
CS2R R4,SR_CLOCKLO ;
IADD R0,R0,R5 ;
STS [RZ],R0 ;
CS2R R5,SR_CLOCKLO ;

/* measureConst1 */
CS2R R4,c[0x0][0x148] ;
STS [RZ],SR_CLOCKLO ;

/* measureConst2 */
CS2R R2,SR_CLOCKLO ;
MOV R0,c[0x0][0x148] ;
IADD R0,c[0x0][0x14c] ;
STS [RZ],R0 ;
CS2R R0,SR_CLOCKLO ;

解决方法

这是坏的：

int hostRes[RES_SIZE],*devRes;
std::fill(hostRes,hostRes + RES_SIZE,1);
cudaMemcpy(devRes,hostRes,RES_SIZE*sizeof(int),cudaMemcpyHostToDevice);

cudaMalloc(&devRes,RES_SIZE*sizeof(int));

编译器会发出警告，表示已在设置devRes的值之前使用了它。您不应忽略这些警告。正确的顺序是：

int hostRes[RES_SIZE],1);
cudaMalloc(&devRes,RES_SIZE*sizeof(int));
cudaMemcpy(devRes,cudaMemcpyHostToDevice);

进行了此更改，然后使用CUDA 10.2为sm_61编译了代码，并专注于您的measureConst2函数（尽管循环展开行为对于所有用户都是相同的），我观察到SASS看起来像这样：

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_61
                Function : _Z13measureConst2Piii
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                 /* 0x001fbc00fde007f6 */
        /*0008*/                   MOV R1,c[0x0][0x20] ;        /* 0x4c98078000870001 */
        /*0010*/                   NOP ;                         /* 0x50b0000000070f00 */
        /*0018*/                   NOP ;                         /* 0x50b0000000070f00 */
                                                                 /* 0x001fd400ffe007ed */
        /*0028*/                   NOP ;                         /* 0x50b0000000070f00 */
        /*0030*/                   BAR.SYNC 0x0 ;                /* 0xf0a81b8000070000 */
        /*0038*/                   MEMBAR.CTA ;                  /* 0xef98000000070000 */
                                                                 /* 0x001fb800fde007ef */
        /*0048*/                   NOP ;                         /* 0x50b0000000070f00 */
        /*0050*/                   NOP ;                         /* 0x50b0000000070f00 */
        /*0058*/                   NOP ;                         /* 0x50b0000000070f00 */
                                                                 /* 0x001fbc00fea007ff */
        /*0068*/                   BAR.SYNC 0x0 ;                /* 0xf0a81b8000070000 */
        /*0070*/                   MEMBAR.CTA ;                  /* 0xef98000000070000 */
        The above sequence repeats 149 times due to compiler unrolling and optimization...
        /*1f18*/                   NOP ;                         /* 0x50b0000000070f00 */
                                                                 /* 0x001ffc00fdc007ef */
        /*1f28*/                   NOP ;                         /* 0x50b0000000070f00 */
        /*1f30*/                   NOP ;                         /* 0x50b0000000070f00 */
        /*1f38*/                   BAR.SYNC 0x0 ;                /* 0xf0a81b8000070000 */
                                                                 /* 0x001fd800fcc007f5 */
        /*1f48*/                   MEMBAR.CTA ;                  /* 0xef98000000070000 */
        /*1f50*/                   CS2R R2,SR_CLOCKLO ;         /* 0x50c8000005070002 */
        /*1f58*/                   MOV R0,c[0x0][0x148] ;       /* 0x4c98078005270000 */
                                                                 /* 0x003f98001e4007f4 */
        /*1f68*/                   IADD R0,R0,c[0x0][0x14c] ;  /* 0x4c10000005370000 */
        /*1f70*/                   STS [RZ],R0 ;                /* 0xef5c00000007ff00 */
        /*1f78*/                   CS2R R0,SR_CLOCKLO ;         /* 0x50c8000005070000 */
                                                                 /* 0x001fc800fe2007f1 */
        /*1f88*/                   IADD R0,-R2,R0 ;            /* 0x5c12000000070200 */
        /*1f90*/                   MOV R2,c[0x0][0x140] ;       /* 0x4c98078005070002 */
        /*1f98*/                   MOV R3,c[0x0][0x144] ;       /* 0x4c98078005170003 */
                                                                 /* 0x001ffc00fde007f1 */
        /*1fa8*/                   STG.E [R2],R0 ;              /* 0xeedc200000070200 */
        /*1fb0*/                   NOP ;                         /* 0x50b0000000070f00 */
        /*1fb8*/                   EXIT ;                        /* 0xe30000000007000f */
                                                                 /* 0x001f8000fc0007ff */
        /*1fc8*/                   BRA 0x1fc0 ;                  /* 0xe2400fffff07000f */
        /*1fd0*/                   NOP;                          /* 0x50b0000000070f00 */

我们注意到编译器具有：

编译器已经展开了150次迭代的循环（不确定要完成的工作）
您的n1和n1（内核参数）的加载仅发生一次。

这些负载发生在这里：

        /*1f58*/                   MOV R0,c[0x0][0x14c] ;  /* 0x4c10000005370000

这些正在通过__constant__内存系统加载内核参数（这是预期的行为）。 __constant__存储系统是not the same作为L1高速缓存或“ L1常量高速缓存”。即使我们忽略了这一点，也只有在请求的项目已经在缓存中的情况下谈论与缓存访问相关的延迟才有意义。在上面的SASS代码中，没有理由假设这是真的。您仅访问项目一次，因此见证了与用全局存储器中的项目填充__constant__高速缓存相关的延迟（这最终是所有数据从主机到设备（甚至是内核参数）的移动方式。 / p>

您现在可能会问：“为什么measureConst1上看不到这么长的延迟？在那种情况下，您的内核设计有些不同，并且我们看到加载行为也有所不同。SASS看起来像这样：

            Function : _Z13measureConst1Pii
    .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                             /* 0x001fc800fe2007f6 */
    /*0008*/                   MOV R1,c[0x0][0x20] ;        /* 0x4c98078000870001 */
    /*0010*/                   MOV R2,c[0x0][0x140] ;       /* 0x4c98078005070002 */
    /*0018*/                   MOV R3,c[0x0][0x144] ;       /* 0x4c98078005170003 */
                                                             /* 0x001fbc00fde000b1 */
    /*0028*/                   LDG.E R0,[R2] ;              /* 0xeed4200000070200 */
    /*0030*/                   NOP ;                         /* 0x50b0000000070f00 */
    /*0038*/                   NOP ;                         /* 0x50b0000000070f00 */
                                                             /* 0x001fd400ffe007e9 */
    /*0048*/                   NOP ;                         /* 0x50b0000000070f00 */
    /*0050*/                   BAR.SYNC 0x0 ;                /* 0xf0a81b8000070000 */
    /*0058*/                   MEMBAR.CTA ;                  /* 0xef98000000070000 */
     (repeating ...)
    /*1f68*/                   MEMBAR.CTA ;                  /* 0xef98000000070000 */
    /*1f70*/                   CS2R R4,SR_CLOCKLO ;         /* 0x50c8000005070004 */
    /*1f78*/                   IADD R0,c[0x0][0x148] ;  /* 0x4c10000005270000 */
                                                             /* 0x003fc800fcc000f2 */
    /*1f88*/                   STS [RZ],R0 ;                /* 0xef5c00000007ff00 */
    /*1f90*/                   CS2R R5,SR_CLOCKLO ;         /* 0x50c8000005070005 */
    /*1f98*/                   IADD R0,-R4,R5 ;            /* 0x5c12000000570400 */
                                                             /* 0x001fa000fde007f1 */
    /*1fa8*/                   STG.E [R2],R0 ;              /* 0xeedc200000070200 */
    /*1fb0*/                   NOP ;                         /* 0x50b0000000070f00 */
    /*1fb8*/                   NOP ;                         /* 0x50b0000000070f00 */
                                                             /* 0x001f8000ffe007ff */
    /*1fc8*/                   EXIT ;                        /* 0xe30000000007000f */
    /*1fd0*/                   BRA 0x1fd0 ;                  /* 0xe2400fffff87000f */
    /*1fd8*/                   NOP;                          /* 0x50b0000000070f00 */

在这里，我们看到在内核的最开始（由于您特定的内核设计），SASS正在从__constant__装载与您正在装载的项目紧邻的项目在您的时间范围内。因此，合理的假设是，在具有某些缓存行负载粒度的情况下，您的计时区域现在正在测量缓存延迟而不是全局负载延迟之类的东西。

似乎我已经找到问题的答案了。

根据此question，有立即常量（IMC）和索引常量（INC）缓存。 INC缓存处理由LDC指令创建的访问，而IMC处理其他访问。

我相信，提到的文章“通过微基准分析剖析NVIDIA Volta GPU架构”中指出的L1常量缓存的延迟是这两个缓存的平均延迟。而且，IMC缓存的延迟不超过注册文件访问的延迟。

为了检验这些假设，我编写了一个小型基准测试，其中包含6个内核。这些内核中的每一个相应地读取位于“ const”内存中的整数数组，并测量读取的延迟。

这些内核之间存在差异。第一个内核（“ imc”）通过IMC缓存读取数据，而无需任何预取。第二个（“ imc_with_imc_prefetch”）也通过IMC读取数据，但首先将数据预取到IMC。第三个（“ imc_with_inc_prefetch”）将数据预取到INC，然后从IMC读取。还有另外三个内核：“ inc”，“ inc_with_imc_prefetch”，“ inc_with_inc_prefetch”。

根据获得的结果，我得出了结论：

IMC延迟为12，INC延迟为40。平均延迟为26，非常接近上述文章中的L1常量缓存的延迟。
IMC和INC的缓存行大小为64个字节。
有L1.5 const高速缓存，其行大小为256字节，平均延迟为78。如果通过IMC访问，则其延迟为60。如果通过INC访问，则其延迟为96。

基准测试的完整代码：

#include <iostream>
#define SMPL_COUNT 128
#define CONST_SIZE 10000 // Count of elements in constant array

__constant__ int carr[CONST_SIZE];

__global__ void imc(int* arr,int* t){
    unsigned int t1,t2;

    __shared__ int temp[SMPL_COUNT];
    __shared__ int times[SMPL_COUNT];

    // Reading from immediate constant cache
    int sum = 0;
    for(int i=0;i<SMPL_COUNT;++i) {
        t1 = clock();
        sum += carr[i];
        temp[i] = sum;
        t2 = clock();
        times[i] = t2-t1;
        __syncthreads();
    }

    for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];

    arr[0] = t2-t1;
    arr[1] = sum;
}

__global__ void imc_with_imc_prefetch(int* arr,t2;

    __shared__ int temp[SMPL_COUNT];
    __shared__ int times[SMPL_COUNT];
    
    const int stride = 32/4; // Make assumption that cache line is more or equal to 32 bytes
    // Prefetch data to immediate constant cache
    for(int i=0; i<(SMPL_COUNT+stride-1)/stride; ++i) arr[i] = carr[i*stride];

    // Reading from immediate constant cache
    int sum = 0;    
    for(int i=0;i<SMPL_COUNT;++i) {
        t1 = clock();
        sum += carr[i];
        temp[i] = sum;
        t2 = clock();
        times[i] = t2-t1;
        __syncthreads();
    }

    for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];

    arr[0] = t2-t1;
    arr[1] = sum;
}

__global__ void imc_with_inc_prefetch(int* arr,t2;
    
    __shared__ int temp[SMPL_COUNT];
    __shared__ int times[SMPL_COUNT];

    // Prefetch data to index constant cache
    int index = carr[CONST_SIZE-1];
    for(int i=0;i<SMPL_COUNT;++i)
        index = *((int*)(((char*)carr)+index)); //Subtle way to avoid calling of shift instruction
    arr[0] = index;

    __syncthreads();

    //Reading from immediate constant cache
    index = 0;    
    for(int i=0;i<SMPL_COUNT;++i){
        t1 = clock();
        index += carr[i];
        temp[i]=index;
        t2 = clock();
        times[i] = t2-t1;
        __syncthreads();
    }

    for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
    

    arr[0] = t2-t1;
    arr[1] = index;
}

__global__ void inc(int* arr,t2;

    __shared__ int temp[SMPL_COUNT];
    __shared__ int times[SMPL_COUNT];

    int index = carr[CONST_SIZE-1];
    arr[SMPL_COUNT - 1] = index;

    __syncthreads();

    //Reading from index constant cache
    
    for(int i=0;i<SMPL_COUNT;++i){
        t1 = clock();
        index = *((int*)(((char*)carr)+index));
        temp[i] = index;
        t2 = clock();
        times[i] = t2-t1;
        __syncthreads();
    }

    for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
    
    arr[0] = t2-t1;
    arr[1] = index;
}

__global__ void inc_with_imc_prefetch(int* arr,t2;

    __shared__ int temp[SMPL_COUNT];
    __shared__ int times[SMPL_COUNT];

    //Prefetch data to index constant cache
    const int stride = 32/4; // Make assumption that cache line is more or equal to 32 bytes
    // Prefetch data to immediate constant cache
    for(int i=0; i<(SMPL_COUNT+stride-1)/stride; ++i)
        arr[i] = carr[i*stride];

    int index = carr[CONST_SIZE-1];
    arr[SMPL_COUNT - 1] = index;

    __syncthreads();

    //Reading from index constant cache
    for(int i=0;i<SMPL_COUNT;++i){
        t1 = clock();
        index = *((int*)(((char*)carr)+index));
        temp[i] = index;
        t2 = clock();
        times[i] = t2-t1;
        __syncthreads();
    }

    for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
    
    arr[0] = t2-t1;
    arr[1] = index;
}

__global__ void inc_with_inc_prefetch(int*arr,t2;

    __shared__ int temp[SMPL_COUNT];
    __shared__ int times[SMPL_COUNT];

    int index = carr[CONST_SIZE-1];
    for(int i=0;i<SMPL_COUNT;++i){
        index = carr[index/4];
    }

    arr[0] = index;

    index = carr[CONST_SIZE-1];
    arr[SMPL_COUNT - 1] = index;
    
    __syncthreads();

    
    for(int i=0;i<SMPL_COUNT;++i){
        t1 = clock();
        index = *((int*)(((char*)carr)+index));
        temp[i] = index;
        t2 = clock();
        times[i] = t2-t1;
        __syncthreads();
    }

    for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
    
    arr[0] = t2-t1;
    arr[1] = index+2;
}

int main(int argc,char** argv){
    int hostArr[SMPL_COUNT],*devArr,*devTimes;
    int imc_times[SMPL_COUNT],imc_imc_times[SMPL_COUNT],imc_inc_times[SMPL_COUNT];
    int inc_times[SMPL_COUNT],inc_imc_times[SMPL_COUNT],inc_inc_times[SMPL_COUNT];
    cudaMalloc(&devArr,SMPL_COUNT*sizeof(int));
    cudaMalloc(&devTimes,SMPL_COUNT*sizeof(int));

    cudaMemset (carr,CONST_SIZE*sizeof(int));
    cudaMemset (devArr,SMPL_COUNT*sizeof(int));
    cudaMemset (devTimes,SMPL_COUNT*sizeof(int));

    for(int i=0;i<SMPL_COUNT;++i) hostArr[i]=4*(i+1);
    cudaMemcpyToSymbol(carr,hostArr,SMPL_COUNT*sizeof(int));

    imc<<<1,1>>>(devArr,devTimes);
    cudaMemcpy(imc_times,devTimes,SMPL_COUNT*sizeof(int),cudaMemcpyDeviceToHost);

    imc_with_imc_prefetch<<<1,devTimes);
    cudaMemcpy(imc_imc_times,cudaMemcpyDeviceToHost);

    imc_with_inc_prefetch<<<1,devTimes);
    cudaMemcpy(imc_inc_times,cudaMemcpyDeviceToHost);

    inc<<<1,devTimes);
    cudaMemcpy(inc_times,cudaMemcpyDeviceToHost);

    inc_with_imc_prefetch<<<1,devTimes);
    cudaMemcpy(inc_imc_times,cudaMemcpyDeviceToHost);

    inc_with_inc_prefetch<<<1,devTimes);
    cudaMemcpy(inc_inc_times,cudaMemcpyDeviceToHost);
    
    cudaFree(devArr);
    cudaFree(devTimes);
    std::cout<<"IMC\tIMC(IMC)\tIMC(INC)\tINC\tINC(IMC)\tINC(INC)\n";
    for(int i=0;i<SMPL_COUNT;++i){
        std::cout<<imc_times[i]<<"\t"<<imc_imc_times[i]<<"\t"<<imc_inc_times[i]<<"\t";
        std::cout<<inc_times[i]<<"\t"<<inc_imc_times[i]<<"\t"<<inc_inc_times[i]<<"\n";
    }
    return 0;
}

测量const内存访问延迟会得出奇怪的结果

如何解决测量const内存访问延迟会得出奇怪的结果

解决方法

相关推荐