CUDA / CUBLAS matrix-vector multiplication



I previously posted a question regarding matrix-vector multiplication in CUDA and about writing my own kernel. After doing this, I decided to implement my problem using CUBLAS, as suggested by some users on SO (thanks @Robert Crovella), in the hopes of achieving higher performance (my project is performance-driven).

Just to clarify: I want to multiply an NxN matrix with an Nx1 vector.
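For context, cublasSgemv computes y = alpha * op(A) * x + beta * y, where op(A) is either A (CUBLAS_OP_N) or its transpose (CUBLAS_OP_T); with alpha = 1 and beta = 0 this reduces to the plain matrix-vector product y = A * x.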

I've been looking at the code pasted below for a couple of days now and I can't figure out why the multiplication is giving me an incorrect result. I fear that I am causing problems by using <vector> arrays (this is part of a much larger system that uses these data types). I don't mean to use this thread as a debugging tool, but I think this will also be helpful to other users trying to achieve this, as I have not come across a particularly comprehensive source on the internet for my particular problem (and for the cublas v2 API). Thanks in advance!

#include <cuda.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cublas_v2.h>
#include <time.h>

//#include "timenow.cu"

// error check macros
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
    do { \
        cublasStatus_t __err = fn; \
        if (__err != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                (int)(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// random data filler
void fillvector(float *data, int N){
    for(int i=0; i<N; i++){
        data[i] = float(rand() % 10);
    }
}

//printer
void printer(bool printOut, float *data, int N){
    if(printOut == true){
    for(int i=0; i<N; i++){
        printf("%2.1f ", data[i]);
    }
    printf("\n");
    }
}

/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////

int main(){

bool printOut = true;

int N;
std::cout << "Enter N: " ;
std::cin >> N;

std::vector<float> x0;
x0.resize(N);

std::vector<float> p;
p.resize(N*N);

// matrix A
std::vector<float> A[N];
for(int i=0;i<N;i++){
        A[i].resize(N);
    fillvector(A[i].data(), N);
    printer(printOut, A[i].data(), N);
}
printf("\n");
fillvector(x0.data(), N);
printer(printOut, x0.data(), N);

printf("\nStarting CUDA computation...");
///double startTime = timenow();

// device pointers
float *d_A, *d_p, *d_b, *d_x0, *d_v, *d_temp;

cudaMalloc((void**)&d_A, N*N*sizeof(float));
cudaMalloc((void**)&d_temp, N*sizeof(float));
cudaMalloc((void**)&d_x0, N*sizeof(float));
cudaCheckErrors("cuda malloc fail");

// might need to flatten A...
cublasSetVector(N, sizeof(float), &x0, 1, d_x0, 1);
//daMemcpy(d_x0, &x0, N*sizeof(float), cudaMemcpyHostToDevice);
cublasSetMatrix(N, N, sizeof(float), &A, N, d_A, N);
cudaCheckErrors("cuda memcpy of A or x0 fail");

float *temp;
temp = (float *)malloc(N*sizeof(temp));

cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));

float alpha = 1.0f;
float beta = 0.0f;
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_N, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));

cublasGetVector(N, sizeof(float), &temp, 1, d_temp, 1);
//cudaMemcpy(temp, d_temp, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("returning to host failed");

printf("\n");
printer(printOut, temp, N);

/*alpha = -1.0;
cublasSaxpy(handle, N, &alpha, d_temp, 1, d_v, 1);
cublasGetVector(N, sizeof(float) * N, d_v, 1, &v, 1);
printf("\n");
for(int i=0; i<N; i++){
    printf("%2.1f ",v[i]);
}*/

printf("\nFinished CUDA computations...");
//double endTime = timenow();

//double timeDiff = endTime - startTime;
//printf("\nRuntime: %2.3f seconds \n", timeDiff);

cudaFree(d_temp);
cudaFree(d_A);
cudaFree(d_p);
cudaFree(d_x0);

return 0;
} 

  • We don't reference the first element of a vector this way:

    cublasSetVector(N, sizeof(float), &x0, 1, d_x0, 1);

Instead you should do this:

cublasSetVector(N, sizeof(float), &(x0[0]), 1, d_x0, 1);

And likewise for your SetMatrix call referencing A:

cublasSetMatrix(N, N, sizeof(float), &(A[0]), N, d_A, N);

  • Your GetVector call has 2 errors:

    cublasGetVector(N, sizeof(float), &temp, 1, d_temp, 1);
    

You have your temp and d_temp parameters reversed (you are copying from device to host) and you should not take the address of temp: it is already a pointer. So do this:

cublasGetVector(N, sizeof(float), d_temp, 1, temp, 1);

  • You're not doing proper error checking on all cublas calls, such as your get/set matrix/vector calls. Use the same method you are using on other cublas calls for these also.
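For example, the cublasCheckErrors macro already used for the other cublas calls wraps these directly (these exact lines appear in the fixed code below):

cublasCheckErrors(cublasSetVector(N, sizeof(float), &(x0[0]), 1, d_x0, 1));
cublasCheckErrors(cublasSetMatrix(N, N, sizeof(float), &(A[0]), N, d_A, N));
cublasCheckErrors(cublasGetVector(N, sizeof(float), d_temp, 1, temp, 1));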

  • You are creating A as an array of vectors. This won't work with cublasSetMatrix. Instead we need to create A as a flat vector, of sufficient size (N*N) to store the entire matrix.
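A minimal sketch of that layout (the identity fill here is only for illustration; the fixed code below uses fillvector instead):

std::vector<float> A(N*N);          // flat storage for the whole NxN matrix
// element (row i, column j) lives at A[i*N + j] in row-major order
for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
        A[i*N + j] = float(i == j); // e.g. fill with the identity matrix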

  • Finally, cublas expects the matrices it uses to be stored in column-major order. If you pass C-style arrays in row-major order, you should use the transpose for that matrix in cublasSgemv:

    cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_T, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
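Passing the row-major buffer with leading dimension N makes cublas see the transpose of your matrix, and CUBLAS_OP_T undoes that, so the result is still y = A * x in your original row-major view. A minimal host-side check of the device result (a sketch; y_ref is a hypothetical name, not part of the code below):

std::vector<float> y_ref(N, 0.0f);   // reference result computed on the host
for (int i = 0; i < N; i++)          // row index into row-major A
    for (int j = 0; j < N; j++)
        y_ref[i] += A[i*N + j] * x0[j];
// compare y_ref element-by-element against the vector copied back from d_temp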
    

The following code has these various problems fixed:

$ cat t235.cu
#include <cuda.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cublas_v2.h>
#include <time.h>

//#include "timenow.cu"

// error check macros
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
    do { \
        cublasStatus_t __err = fn; \
        if (__err != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                (int)(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// random data filler
void fillvector(float *data, int N){
    for(int i=0; i<N; i++){
        data[i] = float(rand() % 10);
    }
}

//printer
void printer(bool printOut, float *data, int N){
    if(printOut == true){
    for(int i=0; i<N; i++){
        printf("%2.1f ", data[i]);
    }
    printf("\n");
    }
}

/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////

int main(){

bool printOut = true;

int N;
std::cout << "Enter N: " ;
std::cin >> N;

std::vector<float> x0;
x0.resize(N);

std::vector<float> p;
p.resize(N*N);

// matrix A
std::vector<float> A(N*N);
fillvector(A.data(), N*N);
for (int i=0; i< N; i++){
  printer(printOut, &(A[(i*N)]), N);
  printf("\n");}
fillvector(x0.data(), N);
printer(printOut, x0.data(), N);

printf("\nStarting CUDA computation...");
///double startTime = timenow();

// device pointers
float *d_A, *d_x0, *d_temp;

cudaMalloc((void**)&d_A, N*N*sizeof(float));
cudaMalloc((void**)&d_temp, N*sizeof(float));
cudaMalloc((void**)&d_x0, N*sizeof(float));
cudaCheckErrors("cuda malloc fail");

// might need to flatten A...
cublasCheckErrors(cublasSetVector(N, sizeof(float), &(x0[0]), 1, d_x0, 1));
//daMemcpy(d_x0, &x0, N*sizeof(float), cudaMemcpyHostToDevice);
cublasCheckErrors(cublasSetMatrix(N, N, sizeof(float), &(A[0]), N, d_A, N));
//cudaCheckErrors("cuda memcpy of A or x0 fail");

float *temp;
temp = (float *)malloc(N*sizeof(float)); // N floats (sizeof(temp) would be the pointer size)

cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));

float alpha = 1.0f;
float beta = 0.0f;
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_T, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));

cublasCheckErrors(cublasGetVector(N, sizeof(float), d_temp, 1, temp, 1));
//cudaMemcpy(temp, d_temp, N*sizeof(float), cudaMemcpyDeviceToHost);
//cudaCheckErrors("returning to host failed");

printf("\n");
printer(printOut, temp, N);

/*alpha = -1.0;
cublasSaxpy(handle, N, &alpha, d_temp, 1, d_v, 1);
cublasGetVector(N, sizeof(float) * N, d_v, 1, &v, 1);
printf("\n");
for(int i=0; i<N; i++){
    printf("%2.1f ",v[i]);
}*/

printf("\nFinished CUDA computations...\n");
//double endTime = timenow();

//double timeDiff = endTime - startTime;
//printf("\nRuntime: %2.3f seconds \n", timeDiff);

cudaFree(d_temp);
cudaFree(d_A);
//cudaFree(d_p);
cudaFree(d_x0);

return 0;
}
$ nvcc -arch=sm_20 -O3 -o t235 t235.cu -lcublas
$ ./t235
Enter N: 5
3.0 6.0 7.0 5.0 3.0

5.0 6.0 2.0 9.0 1.0

2.0 7.0 0.0 9.0 3.0

6.0 0.0 6.0 2.0 6.0

1.0 8.0 7.0 9.0 2.0

0.0 2.0 3.0 7.0 5.0

Starting CUDA computation...
83.0 86.0 92.0 62.0 110.0

Finished CUDA computations...
$