如何从C ++传递字符串矩阵到Cuda内核

更新时间：2023-02-12 17:09:33

您展示的两段代码都不完整，而您遗漏的部分可能很重要。如果您展示完整的代码，他人会更容易为您提供帮助。此外，每当您在 CUDA 代码上遇到困难时，最好使用正确的 CUDA 错误检查，它往往会指出哪里出了问题（我怀疑这对您的第二次尝试会有帮助）。此外，使用 cuda-memcheck 运行代码通常也很有启发。

在您的第一次尝试中，您遇到了 CUDA 与嵌套指针的经典问题（a 是指向指针数组的指针）。几乎任何时候只要有指针埋在其他数据结构中，这个问题也会出现。要将这样的数据结构从主机复制到设备，需要进行包含多个步骤的“深拷贝”操作。要了解更多信息，请搜索“CUDA 2D array”（我认为规范的答案是 talonmies 在这里给出的那个），或者看看我在这里和这里的回答。

另外请注意，在 CUDA 6 中，如果您能够使用统一内存，“深拷贝”对程序员来说在概念上会容易得多。

您的第二次尝试似乎是朝着“展平”您的二维（或指针到指针）char 数组的方向前进。这是解决深拷贝“问题”的典型方案，代码复杂度更低，性能可能也更高。下面是一个完整的可运行示例，融合了您第一次和第二次尝试的思路，在我这里可以正常工作：

 $ cat t389.cu
 #include <stdio.h>

 // Debug kernel: prints every string stored in the flattened buffer `a`.
 //
 //   a           - flattened character buffer holding all strings back to back.
 //   indexes     - offset table: for string i, indexes[2*i] is the position of
 //                 its first character in `a` and indexes[2*i+1] is one past
 //                 its last character.
 //   num_strings - number of (begin, end) pairs read from `indexes`.
 //
 // No thread index is consulted, so every thread that runs this kernel prints
 // all strings.
 __global__ void func(char* a, int *indexes, int num_strings){

  for(int i=0;i<num_strings;i++){
    printf("string[%d]: ", i);
    // Strings carry no terminator in `a`; emit them one character at a time.
    for (int j=indexes[2*i]; j < indexes[2*i+1]; j++)
      printf("%c", a[j]);
    printf("\n");
  }
 }

 // Evaluates a CUDA runtime call; on failure prints the error string to stderr
 // and makes main() return 1. Expands to `return`, so use it only inside main().
 #define CUDA_CHECK(call)                                               \
   do {                                                                 \
     cudaError_t err_ = (call);                                         \
     if (err_ != cudaSuccess) {                                         \
       fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
               cudaGetErrorString(err_));                               \
       return 1;                                                        \
     }                                                                  \
   } while (0)

 // Flattens three host strings into one contiguous buffer, copies the buffer
 // and its [begin, end) offset table to the device, and launches func to print
 // them. Returns 0 on success, 1 on any allocation, size, or CUDA error.
 int main(){

  const int num_str = 3;
  const int max_text_length = 12;  // capacity reserved per string in the flat buffer

  // String literals are read-only, so point at them through const char*.
  // No per-string malloc is needed: the literals are only read.
  const char *tmp[num_str] = {"some text", "rand txt", "text"};

  // stridx[2*i] is the offset of string i's first character in the flat
  // buffer, stridx[2*i+1] is one past its last character.
  int stridx[2*num_str];
  int total = 0;  // number of characters placed in the flat buffer so far
  for(int i=0;i<num_str;i++)
  {
    int len = 0;
    while (tmp[i][len] != '\0')
      len++;
    if (len > max_text_length) {
      fprintf(stderr, "string %d is longer than %d characters\n", i, max_text_length);
      return 1;
    }
    stridx[2*i] = total;
    total += len;
    stridx[2*i+1] = total;
  }

  const size_t flat_bytes = num_str*max_text_length*sizeof(char);
  char *a = (char *)malloc(flat_bytes);
  if (a == NULL) {
    fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)flat_bytes);
    return 1;
  }

  // flatten: string i occupies a[stridx[2*i]] .. a[stridx[2*i+1]-1], no terminators
  for(int i=0;i<num_str;i++)
  {
    for (int j=stridx[2*i]; j<stridx[2*i+1]; j++)
      a[j] = tmp[i][j - stridx[2*i]];
  }

  char *d_a = NULL;
  int *d_stridx = NULL;
  CUDA_CHECK(cudaMalloc((void**)&d_a, flat_bytes));
  // Only the first `total` bytes of `a` were written; copy just those.
  CUDA_CHECK(cudaMemcpy(d_a, a, total*sizeof(char), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMalloc((void**)&d_stridx, sizeof(stridx)));
  CUDA_CHECK(cudaMemcpy(d_stridx, stridx, sizeof(stridx), cudaMemcpyHostToDevice));

  func<<<1,1>>>(d_a, d_stridx, num_str);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran; also flushes device printf

  CUDA_CHECK(cudaFree(d_stridx));
  CUDA_CHECK(cudaFree(d_a));
  free(a);
  return 0;
 }
 $ nvcc -arch=sm_20 -o t389 t389.cu
 $ cuda-memcheck ./t389
 ========= CUDA-MEMCHECK
 string[0]: some text
 string[1]: rand txt
 string[2]: text
 ========= ERROR SUMMARY: 0 errors
 $

Problem:

I've a matrix in C++ filled with strings and I want to pass it to cuda kernel function. I know that CUDA can't handle strings so after some research I've tried out with some solutions listed below.

Attempts:

define an array of pointers in C++ containing, for each cell, a pointer to chars (for simplicity, tmp[i] is filled with the strings contained in the previously mentioned matrix)

C++ section

 // Host-side table of three string pointers.
 char *tmp[3];
 int text_length, array_length;

 text_length = 4;
 array_length = 3;

 tmp[0] = (char*) malloc(text_length*sizeof(char));
 tmp[1] = (char*) malloc(text_length*sizeof(char));
 tmp[2] = (char*) malloc(text_length*sizeof(char));

 // NOTE(review): these assignments replace the malloc'ed pointers with
 // pointers to string literals; the buffers allocated above are never
 // written to or freed.
 tmp[0] = "some";
 tmp[1] = "rand";
 tmp[2] = "text";

 // a[] is declared in host code; each element receives a device pointer
 // from cudaMalloc below.
 char *a[3];
 for(int i=0;i<array_length;i++)
 {
   cudaMalloc((void**) &a[i],text_length*sizeof(char));
   // NOTE(review): both pointer arguments are the addresses of the pointer
   // variables themselves (&a[i], &tmp[i]), not the buffers they point to.
   cudaMemcpy(&a[i],&tmp[i],text_length*sizeof(char),cudaMemcpyHostToDevice);
 }

 // NOTE(review): `a` passed here is the host array declared above; no copy of
 // the pointer table to the device appears in this snippet.
 func<<<blocksPerGrid, threadsPerBlock>>>(a);

CUDA section

 // Kernel from the first attempt: prints a[0], a[1] and a[2] with "%s".
 // The element count (3) is hard-coded; no thread index is consulted, so each
 // launched thread runs the whole loop.
 __global__ void func(char* a[]){

 for(int i=0;i<3;i++)
   printf("value[%d] = %s \n",i, a[i]);
 }

Output

 value[0] = (null)
 value[1] = (null)
 value[2] = (null)

flatten the matrix of strings into a single char buffer, pass it to the CUDA kernel, and try to retrieve the strings there (again, the C++ code is simplified)

C++ section

 // All words concatenated into one string, plus a table of inclusive
 // (start, end) positions for each word.
 char *a;
 int index[6];

 a = "somerandtext";
 index[0] = 0; // first word start
 index[1] = 3; // first word end
 index[2] = 4; // second word start
 index[3] = 7; // second word end
 index[4] = 8; // third word start
 index[5] = 1; // NOTE(review): third word end; the pattern above suggests 11 — confirm

 // NOTE(review): as written, `a` and `index` are the host-side variables
 // declared above; no device allocation or copy appears in this snippet.
 func<<<blocksPerGrid, threadsPerBlock>>>(a,index);

CUDA section

 // Kernel from the second attempt: prints the characters of `a` from position
 // index[0] through index[1], both inclusive. Only the first word is printed;
 // the remaining entries of `index` are not read.
 __global__ void func(char* a,int index[]){

 int first_word_start = index[0];
 int first_word_end = index[1];

 // print first word (end position is inclusive, hence <=)
 for(int i=first_word_start;i<=first_word_end;i++)
   printf("%c",a[i]);
 }

Output

 no output produced

I've tried a lot of other solutions, but none of them works for me... The problem can also be rephrased as: how can I pass 'n' strings to a CUDA kernel and print (and compare) all of them there (keep in mind that I can't pass 'n' variables)?

Neither of the codes you've shown is complete, and the things you've left out may be important. You'll make it easier for others to help you if you show complete codes. Also, anytime you're struggling with CUDA codes, it's good practice to use proper cuda error checking which often times will point you at what is not working (I suspect this might have helped with your second attempt). Also, running your code with cuda-memcheck is often times instructive.

In your first attempt, you've run into a classic problem with CUDA and nested pointers (a is a pointer to an array of pointers). This problem occurs also pretty much any time there is a pointer buried in some other data structure. To copy such a data structure from host to device requires a "deep copy" operation, which has multiple steps. To understand more about this, search on "CUDA 2D array" (I consider the canonical answer to be the one given by talonmies here) or take a look at my answers here and here.

Also note that with CUDA 6, "deep copies" can be a lot easier conceptually for the programmer if you are able to use unified memory.

Your second attempt appears to be headed down a path of "flattening" your 2D or pointer-to-pointer array of char. That's a typical solution to the "problem" of deep-copying, resulting in less code complexity and probably also higher performance. Here's a fully worked example, blending ideas from your first and second attempt, which seems to work for me:

$ cat t389.cu
#include <stdio.h>

 // Debug kernel: prints every string stored in the flattened buffer `a`.
 //
 //   a           - flattened character buffer holding all strings back to back.
 //   indexes     - offset table: for string i, indexes[2*i] is the position of
 //                 its first character in `a` and indexes[2*i+1] is one past
 //                 its last character.
 //   num_strings - number of (begin, end) pairs read from `indexes`.
 //
 // No thread index is consulted, so every thread that runs this kernel prints
 // all strings.
 __global__ void func(char* a, int *indexes, int num_strings){
   int s = 0;
   while (s < num_strings) {
     printf("string[%d]: ", s);

     // Strings carry no terminator in `a`; walk the [begin, end) range and
     // emit one character at a time.
     const int begin = indexes[2*s];
     const int end   = indexes[2*s + 1];
     for (int pos = begin; pos < end; ++pos) {
       printf("%c", a[pos]);
     }

     printf("\n");
     ++s;
   }
 }

// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and makes main() return 1. Expands to `return`, so use it only inside main().
#define CUDA_CHECK(call)                                               \
  do {                                                                 \
    cudaError_t err_ = (call);                                         \
    if (err_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
              cudaGetErrorString(err_));                               \
      return 1;                                                        \
    }                                                                  \
  } while (0)

// Flattens three host strings into one contiguous buffer, copies the buffer
// and its [begin, end) offset table to the device, and launches func to print
// them. Returns 0 on success, 1 on any allocation, size, or CUDA error.
int main(){

 const int num_str = 3;
 const int max_text_length = 12;  // capacity reserved per string in the flat buffer

 // String literals are read-only, so point at them through const char*.
 // No per-string malloc is needed: the literals are only read.
 const char *tmp[num_str] = {"some text", "rand txt", "text"};

 // stridx[2*i] is the offset of string i's first character in the flat
 // buffer, stridx[2*i+1] is one past its last character.
 int stridx[2*num_str];
 int total = 0;  // number of characters placed in the flat buffer so far
 for(int i=0;i<num_str;i++)
 {
   int len = 0;
   while (tmp[i][len] != '\0')
     len++;
   if (len > max_text_length) {
     fprintf(stderr, "string %d is longer than %d characters\n", i, max_text_length);
     return 1;
   }
   stridx[2*i] = total;
   total += len;
   stridx[2*i+1] = total;
 }

 const size_t flat_bytes = num_str*max_text_length*sizeof(char);
 char *a = (char *)malloc(flat_bytes);
 if (a == NULL) {
   fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)flat_bytes);
   return 1;
 }

 // flatten: string i occupies a[stridx[2*i]] .. a[stridx[2*i+1]-1], no terminators
 for(int i=0;i<num_str;i++)
 {
   for (int j=stridx[2*i]; j<stridx[2*i+1]; j++)
     a[j] = tmp[i][j - stridx[2*i]];
 }

 char *d_a = NULL;
 int *d_stridx = NULL;
 CUDA_CHECK(cudaMalloc((void**)&d_a, flat_bytes));
 // Only the first `total` bytes of `a` were written; copy just those.
 CUDA_CHECK(cudaMemcpy(d_a, a, total*sizeof(char), cudaMemcpyHostToDevice));
 CUDA_CHECK(cudaMalloc((void**)&d_stridx, sizeof(stridx)));
 CUDA_CHECK(cudaMemcpy(d_stridx, stridx, sizeof(stridx), cudaMemcpyHostToDevice));

 func<<<1,1>>>(d_a, d_stridx, num_str);
 CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
 CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran; also flushes device printf

 CUDA_CHECK(cudaFree(d_stridx));
 CUDA_CHECK(cudaFree(d_a));
 free(a);
 return 0;
}
$ nvcc -arch=sm_20 -o t389 t389.cu
$ cuda-memcheck ./t389
========= CUDA-MEMCHECK
string[0]: some text
string[1]: rand txt
string[2]: text
========= ERROR SUMMARY: 0 errors
$

上一篇 : ：启动VS2010时出现问题下一篇 : 如何自定义Grails字段插件

如何从C ++传递字符串矩阵到Cuda内核

Problem:

Attempts:

相关阅读

技术问答最新文章