Strange cudaStream performance



I am trying to develop an example of a Sobel filter using cudaStream. Here is the program:

// File-scope declarations assumed by this snippet (not shown in the post):
texture<unsigned char, 2> Image;     // texture reference sampled by SobelKernel
cudaArray *Array_PatchsMaxDevice;    // device array backing the texture
cudaStream_t *Stream;                // stream handles, allocated below

void SobelStream(void)
{

    cv::Mat imageGrayL2 = cv::imread("/home/xavier/Bureau/Image1.png",0);


    u_int8_t *u8_PtImageHost;
    u_int8_t *u8_PtImageDevice;

    u_int8_t *u8_ptDataOutHost;
    u_int8_t *u8_ptDataOutDevice;

    u_int8_t u8_Used[NB_STREAM];

    u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
    checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));

    u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
    checkCudaErrors(cudaMalloc((void**)&u8_PtImageDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));


    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
    checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT ));
    checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice));


    dim3 threads(BLOC_X,BLOC_Y);
    dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));

    ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT);


    int blockh = HEIGHT/NB_STREAM;


    Stream = (cudaStream_t *) malloc(NB_STREAM * sizeof(cudaStream_t));

    for (int i = 0; i < NB_STREAM; i++)
    {
        checkCudaErrors(cudaStreamCreate(&(Stream[i])));
    }


    cudaEvent_t Start;
    cudaEvent_t Stop;
    cudaEventCreate(&Start);
    cudaEventCreate(&Stop);

    cudaEventRecord(Start, 0);


    //////////////////////////////////////////////////////////
    // H->D: enqueue one strip copy per stream (the i == 0 branch also launches a kernel)
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight  = blockh;
            checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
                                                      0,
                                                      0,
                                                      imageGrayL2.data,//u8_PtImageDevice,
                                                      WIDTH,
                                                      WIDTH,
                                                      blockh,
                                                      cudaMemcpyHostToDevice  ,
                                                      Stream[i]));

            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHeight/BLOC_Y));
            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
            checkCudaErrors(cudaGetLastError());

            u8_Used[i] = 1;

        }else{


            int ioffsetImage =  WIDTH*(HEIGHT/NB_STREAM  );
            int hoffset = HEIGHT/NB_STREAM *i;
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight  = min(HEIGHT - (blockh*i),blockh);

            //printf("hoffset: %d hoffsetkernel %d localHeight %d rest %d ioffsetImage %d \n",hoffset,hoffsetkernel,localHeight,HEIGHT - (blockh +1 +blockh*(i-1)),ioffsetImage*i/WIDTH);

            checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
                                                      0,
                                                      hoffset,
                                                      &imageGrayL2.data[ioffsetImage*i],//&u8_PtImageDevice[ioffset*i],
                                                      WIDTH,
                                                      WIDTH,
                                                      localHeight,
                                                      cudaMemcpyHostToDevice,
                                                      Stream[i]));


            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }



    ///////////////////////////////////////////
    // Kernels: launch the Sobel kernel on each strip, in the same stream as that strip's copy
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight  = blockh;


            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(1,1);
            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
            checkCudaErrors(cudaGetLastError());

            u8_Used[i] = 1;

        }else{


            int ioffsetImage =  WIDTH*(HEIGHT/NB_STREAM  );
            int hoffset = HEIGHT/NB_STREAM *i;
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight  = min(HEIGHT - (blockh*i),blockh);


            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(1,1);

            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,hoffsetkernel,WIDTH,localHeight);
            checkCudaErrors(cudaGetLastError());

            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }


    ///////////////////////////////////////////////////////
    // D->H: copy each filtered strip back to the host asynchronously
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight  = blockh;
            checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHeight-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
            u8_Used[i] = 1;

        }else{

            int ioffsetImage =  WIDTH*(HEIGHT/NB_STREAM  );
            int hoffset = HEIGHT/NB_STREAM *i;
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight  = min(HEIGHT - (blockh*i),blockh);

            checkCudaErrors(cudaMemcpyAsync(&u8_ptDataOutHost[hoffsetkernel*WIDTH],&u8_ptDataOutDevice[hoffsetkernel*WIDTH],WIDTH*localHeight*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));

            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }


    for(int i=0;i<NB_STREAM;i++)
    {
        cudaStreamSynchronize(Stream[i]);
    }

    cudaEventRecord(Stop, 0);

    cudaEventSynchronize(Start);
    cudaEventSynchronize(Stop);


    float dt_ms;
    cudaEventElapsedTime(&dt_ms, Start, Stop);

    printf("dt_ms %f \n",dt_ms);

}

I got really strange performance when executing this program, so I decided to profile it.

I don't understand why, but it looks as if the streams are all waiting for each other. Can someone help me with that?

First of all, in the future, please provide complete code. I'm also working off of your cross-posting to fill in some details such as kernel sizes.

You have two issues to address:

First, any time you wish to use cudaMemcpyAsync, you will most likely want to be working with pinned host allocations. If you use allocations created with, e.g., malloc, you will not get the expected behavior from cudaMemcpyAsync as far as asynchronous concurrent execution is concerned. This necessity is covered in the programming guide:

If host memory is involved in the copy, it must be page-locked.

So the first change to make to your code is to convert this:

u8_PtImageHost   = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));

to this:

checkCudaErrors(cudaHostAlloc(&u8_PtImageHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
checkCudaErrors(cudaHostAlloc(&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
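
Note that memory allocated with cudaHostAlloc must later be released with cudaFreeHost rather than free.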

With that change alone, your execution duration drops from about 21 ms to 7 ms according to my testing. The reason for this is that without the change, we get no overlap whatsoever: every copy and every kernel executes serially in the profiler timeline.

With the change, the copy activities can overlap with each other (H->D and D->H) and with kernel execution.
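
Note that H->D and D->H transfers can only overlap with each other on devices that have two copy engines; you can verify this with a quick check:

    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, 0));
    // asyncEngineCount == 2: H->D and D->H copies can run concurrently;
    // asyncEngineCount == 1: copies overlap with kernels but not with each other.
    printf("asyncEngineCount = %d\n", prop.asyncEngineCount);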

The second issue standing between you and concurrent kernel execution is that your kernels are simply too large (too many blocks/threads):

#define WIDTH   6400
#define HEIGHT  4800
#define NB_STREAM 10

#define BLOC_X 32
#define BLOC_Y 32

    dim3 threads(BLOC_X,BLOC_Y);
    dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));
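
With these values, a full-image launch consists of ceil(6400/32) × ceil(4800/32) = 200 × 150 = 30,000 blocks of 32 × 32 = 1,024 threads each; even a per-strip launch (480 rows) is 200 × 15 = 3,000 blocks.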

I would suggest that if these are the sizes of kernels you need to run, there's probably not much benefit in striving for kernel overlap: each kernel launches enough blocks to "fill" the GPU, so you have already exposed enough parallelism to keep the GPU busy. But if you are determined to witness kernel concurrency, you could make your kernels use a smaller number of blocks while causing each kernel to spend more time executing. We could do this by launching a single block per kernel and having the threads of that block loop over the entire strip, as in the sketch below.
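
Here is a minimal sketch of that idea. The kernel name and body are hypothetical placeholders (the strided loop is the point; the filtering is reduced to a copy, and the input is a raw pointer rather than the bound texture your SobelKernel reads from):

    // Hypothetical single-block variant: one 32x32 block strides over a whole
    // strip, so each thread processes many pixels and the kernel runs long
    // enough to overlap with copies and with kernels in other streams.
    __global__ void SobelKernelOneBlock(const u_int8_t *in, u_int8_t *out,
                                        int width, int yStart, int height)
    {
        for (int y = yStart + threadIdx.y; y < yStart + height; y += blockDim.y)
            for (int x = threadIdx.x; x < width; x += blockDim.x)
                out[y * width + x] = in[y * width + x]; // placeholder for the Sobel stencil
    }

    // Launched with a single block per stream:
    //   SobelKernelOneBlock<<<1, dim3(BLOC_X, BLOC_Y), 0, Stream[i]>>>(...);

With only one block per kernel, each kernel occupies a single SM, so several of the ten streams' kernels can be resident on the device at the same time and will show up overlapped in the profiler.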