CUDA Gauss-Jordan matrix inversion

Updated: 2022-10-24 13:58:48

I didn't find any question similar to mine. I'm trying to write the Gauss-Jordan matrix inversion algorithm. The idea of the algorithm is simple: augment the matrix with the identity, eliminate the entries below the diagonal, then divide each row by its diagonal element. :)

I want to invert only a lower triangular matrix, and I get an almost correct answer. Where did I go wrong? Can anyone help me? I would appreciate it.

  • d_A — lower triangular matrix (n×n)
  • dI — identity matrix (n×n)
  • n — size of the matrix in one direction (n % 16 == 0)

  • dim3 threadsPerBlock(n/16, n/16);

  • dim3 numBlocks(16, 16);
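
A note on this launch geometry (my own sanity check, not from the original question): because blockDim is (n/16, n/16) and gridDim is (16, 16), the grid spans exactly n × n threads whenever n % 16 == 0, so each matrix element gets one thread. A minimal host-side sketch of that arithmetic:

#include <cassert>

// Sanity check for the launch geometry above (illustrative only):
// gridDim (16,16) times blockDim (n/16, n/16) covers exactly n x n
// threads, one per matrix element, provided n % 16 == 0.
void checkLaunchGeometry(int n)
{
    assert(n % 16 == 0);                    // the configuration relies on this
    int coveredX = 16 * (n / 16);           // gridDim.x * blockDim.x
    int coveredY = 16 * (n / 16);           // gridDim.y * blockDim.y
    assert(coveredX == n && coveredY == n); // no partial tiles to guard against
}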

I know it is a simple implementation, but first I need it to work correctly :) Can anyone help me or give me a hint? I would appreciate it. Thanks a lot!

Here is the whole code:

  #include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

using namespace std;

 __global__ void gaussjordan(float *A,  float *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i)
            if(y>=i){
                P=A[x*n+i]/A[i*n+i];
                I[x*n+y]-= I[i*n+y]*P;
                A[x*n+y]-= A[i*n+y]*P;
            }
            __syncthreads();
 }


 __global__ void dev(float *d_A,  float *dI, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x<h && y<h)
        if(d_A[x*h+x]!=0){
            dI[x*h+y]  /= d_A[x*h+x];
            d_A[x*h+y] /= d_A[x*h+x];
        }
    __syncthreads();

}

void savetofile(float *A, string s, int n, int h)
{
    std::ofstream plik;
    plik.open(s);

    for(int j=0;j<h;j++){
        for(int i=0;i<h;i++){
            plik<<A[j*n+i]<<"\t";}
            plik<<endl;}
    plik.close();
}

int main()
{
    int n = 16;
// creating input
    float *iL = new float [n*n];
    float *L = new float [n*n];
    for(int i=0;i<n;i++)
        for(int j=0;j<n;j++)
            if(i==j || i>j) L[i*n+j] = (i*n+j+1)*(i*n+j+1)*0.007 + (i*n+j+1)*0.01 -(i*n+j+1)*(i*n+j+1)*(i*n+j+1)*0.0005;
            else L[i*n+j] = 0;

    savetofile(L,"L.txt",n,n);

    cout<<"inv\n";
    float *d_A, *d_L,*I, *dI;
    float time;
    cudaError_t err;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(float);

    dim3 threadsPerBlock(n/16,n/16);
    dim3 numBlocks(16,16);
// memory allocation    
    err= cudaMalloc( (void**)  &d_A, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err= cudaMalloc( (void**)   &dI, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    I = new float[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=1.0;
                else I[i*n+j]=0.0;}}
 //copy data from CPU to GPU
    err =cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err =cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice);  if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
    }
    dev<<<numBlocks,    threadsPerBlock>>>(d_A, dI, n); 

    err = cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 
    err = cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
    savetofile(iL,"inv.txt",n,n);
    savetofile(L,"I.txt",n,n);
    cudaFree(d_A);
    cudaFree(dI);
    delete []I;
    delete []L;
    delete []iL;
    system("Pause");
 return 0;
}

Here is my output:

60.6061 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
-34.1233    -2.13675    -0  -0  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-48.5115    1.91603 -0.0799201  -0  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-49.4891    1.8697  0.0748167   -0.0196634  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-49.8332    1.84732 0.0725876   0.018747    -0.00767828 -0  -0  -0  0   0   0   0   0   0   0   0   
-50.0073    1.83403 0.071321    0.0182352   0.00739595  -0.00376795 -0  -0  0   0   0   0   0   0   0   0   
-50.112 1.82521 0.0705011   0.0179073   0.0072164   0.00365346  -0.00212282 -0  0   0   0   0   0   0   0   0   
-50.1818    1.81893 0.0699261   0.0176789   0.00709196  0.00357445  0.00206784  -0.00131234 0   0   0   0   0   0   0   0   
-50.2316    1.81423 0.0695003   0.0175105   0.00700059  0.00351662  0.0020277   0.00128271  -0.00086736 -0  -0  -0  -0  -0  -0  -0  
-50.2689    1.81057 0.0691722   0.0173813   0.00693062  0.00347244  0.00199711  0.00126017  0.000850006 -0.000602925    -0  -0  -0  -0  -0  -0  
-50.2979    1.80765 0.0689115   0.0172789   0.0068753   0.00343758  0.00197301  0.00124245  0.000836382 0.000592093 -0.000435975    -0  -0  -0  -0  -0  
-50.321 1.80527 0.0686993   0.0171957   0.00683047  0.00340937  0.00195354  0.00122815  0.000825401 0.000583374 0.000428868 -0.00032541 -0  -0  -0  -0  
-50.34  1.80328 0.0685233   0.0171269   0.0067934   0.00338607  0.00193748  0.00121637  0.000816362 0.000576204 0.000423029 0.000320554 -0.000249293    -0  -0  -0  
-50.3557    1.80159 0.0683749   0.0170689   0.00676223  0.0033665   0.001924    0.00120649  0.000808792 0.000570204 0.000418147 0.000316498 0.000245864 -0.000195186    -0  -0  
-50.369 1.80015 0.0682481   0.0170195   0.00673566  0.00334983  0.00191253  0.00119809  0.000802358 0.000565109 0.000414005 0.000313058 0.000242958 0.000192695 -0.000155673    -0  
-50.3805    1.7989  0.0681385   0.0169768   0.00671274  0.00333547  0.00190265  0.00119086  0.000796824 0.000560729 0.000410446 0.000310105 0.000240465 0.000190559 0.00015382  -0.000126146

and it should be:

60,6060606060606    4,44089209850063e-16    4,85722573273506e-17    -3,12250225675825e-17   0   1,73472347597681e-18    -1,08420217248550e-18   -7,58941520739853e-19   4,33680868994202e-19    -5,42101086242752e-19   0   -6,93889390390723e-18   0   -1,38777878078145e-17   0   1,18720137887163e-17
-34,1232841232841   -2,13675213675214   0   8,67361737988404e-18    3,03576608295941e-18    8,67361737988404e-19    -1,73472347597681e-18   1,35525271560688e-18    -8,67361737988404e-19   1,00288700954909e-18    0   0   6,93889390390723e-18    6,93889390390723e-18    -1,38777878078145e-17   3,02221355580334e-18
-17,9130271437964   1,91603268526345    -0,0799200799200800 1,30104260698261e-18    1,95156391047391e-18    -9,75781955236954e-19   1,95156391047391e-18    2,16840434497101e-19    -3,52365706057789e-19   -1,62630325872826e-19   1,38777878078145e-17    -3,46944695195361e-18   0   0   0   -2,72405795836983e-18
-2,86140643299924   0,0760191125748172  0,0748166415934231  -0,0196633632216454 -2,41234983378025e-18   7,99599102208060e-19    3,25260651745651e-19    -4,74338450462408e-19   2,67662411332359e-19    2,91379333855479e-19    -2,16840434497101e-18   -4,33680868994202e-19   1,30104260698261e-18    0   0   6,86096687275983e-20
-1,33482739506506   0,0346053236774996  0,00125734163772674 0,0187469132242915  -0,00767825058738617    5,35324822664718e-19    -2,23616698075135e-19   5,08219768352580e-20    5,92923063078010e-20    1,74488787134386e-19    -4,33680868994202e-19   4,33680868994202e-19    -2,16840434497101e-19   2,16840434497101e-19    0   -1,19008129089229e-19
-0,793561224702690  0,0203250367373064  0,000727127971238783    0,000177630032830862    0,00739591005669882 -0,00376795430225022    4,98055372985529e-19    -3,84552958053452e-19   3,20178454062126e-19    -1,35525271560688e-19   6,50521303491303e-19    -1,08420217248550e-19   1,08420217248550e-19    -2,16840434497101e-19   0   -7,15742840429884e-20
-0,532255026297144  0,0135340901236068  0,000479383336751935    0,000115847127348313    4,51920594555328e-05    0,00365346070706817 -0,00212282675610843    1,37219337455197e-19    -5,14996031930615e-19   3,30342849429177e-19    0   -2,71050543121376e-19   1,08420217248550e-19    0   0   5,08219768352580e-20
-0,384130052448431  0,00972113086608457 0,000342250536212794    8,21235560483452e-05    3,18129608485860e-05    1,56232096436654e-05    0,00206784220009096 -0,00131233595800525    6,39509875176997e-20    -3,37542629480839e-19   -1,08420217248550e-19   2,16840434497101e-19    0   0   0   -8,47032947254300e-22
-0,291692030052418  0,00735419164507677 0,000257375648850429    6,15185225200113e-05    2,37495210052671e-05    1,16038017329438e-05    6,53368676878396e-06    0,00128271813402154 -0,000867362869930264   1,77876918923403e-19    1,62630325872826e-19    -1,89735380184963e-19   1,62630325872826e-19    0   0   -9,07384044746169e-20
-0,229596895430646  0,00578230937666655 0,000201707743336976    4,79768824589291e-05    1,84020572663637e-05    8,96002707181433e-06    5,05525466995835e-06    3,12009781742606e-06    0,000850011219708818    -0,000602925394011745   0   2,71050543121376e-20    -8,13151629364128e-20   5,42101086242752e-20    -5,42101086242752e-20   7,73976355553617e-20
-0,185720949479909  0,00466765632076680 0,000162419592307734    3,85318721641536e-05    1,47407053519860e-05    7,17308297585328e-06    4,02178178072207e-06    2,48428717850195e-06    1,64547815065802e-06    0,000592092919336558    -0,000435974905284452   0   0   8,13151629364128e-20    -1,08420217248550e-19   2,64697796016969e-20
-0,153867987373140  0,00385473267086607 0,000133863548213241    3,17506489004575e-05    1,20962229586152e-05    5,86799087221288e-06    3,28276799988068e-06    2,02338706451671e-06    1,33735029942045e-06    9,34275734555363e-07    0,000428867197061432    -0,000325409609345764   0   2,71050543121376e-20    0   -1,09055491958991e-20
-0,129703518509601  0,00324211947468978 0,000112403568308126    2,65969300905272e-05    1,01402805713936e-05    4,89779294849866e-06    2,73496124917826e-06    1,68586638861081e-06    1,11012300345236e-06    7,73556738632873e-07    5,60933254708493e-07    0,000320553621268105    -0,000249293253625970   5,42101086242752e-20    0   -1,01114558078482e-20
-0,110691345431593  0,00276839969825208 9,59884298624889e-05    2,25961759289096e-05    8,63052307521336e-06    4,15554692230644e-06    2,31688356971108e-06    1,42511604039733e-06    9,39229137057347e-07    6,51934526276135e-07    4,72019315851685e-07    3,53897320062806e-07    0,000245863313382516    -0,000195185934120844   0   -1,24407964127975e-20
-0,0958269169656213 0,00239699666599593 8,28626202960276e-05    1,95227026042985e-05    7,41637441475814e-06    3,57424367962823e-06    1,99334817579930e-06    1,21993241781196e-06    8,05577604288488e-07    5,57554928001086e-07    4,03155267486669e-07    3,01723475812485e-07    2,31838854154289e-07    0,000192695260333710    -0,000155673036807333   -2,34522247271034e-20
-0,0838002301027703 0,00209415237243389 7,23249901251223e-05    1,70229067498473e-05    6,46008752692950e-06    3,11455737751181e-06    1,73159030599080e-06    1,06073213436631e-06    6,96842172109705e-07    4,82764206408816e-07    3,49217230232344e-07    2,60145440758586e-07    2,00286821017368e-07    1,56906945950947e-07    0,000153820426928509    -0,000126146355001072

It seems the problem was in your gaussjordan kernel.

When you are doing Gauss-Jordan elimination on the original (L) matrix, it is acceptable to work only on the row elements to the right of the pivot point.

But when you are applying the same row operations to the identity matrix to create the inverse (I), it's necessary to apply the equivalent row operations to every member of the row, not just those to the right of the pivot point.
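
To make that concrete, here is the same elimination step written out serially on the host (a minimal illustrative sketch, not code from the question). Note the different column ranges applied to I and to A:

// One forward-elimination step for pivot i, written serially (illustration).
// I gets the row operation on every column; A only on columns >= i.
void eliminateStepHost(float *A, float *I, int n, int i)
{
    for (int x = i + 1; x < n; x++) {    // rows below the pivot
        float P = A[x*n + i] / A[i*n + i];
        for (int y = 0; y < n; y++)      // every member of row x in I
            I[x*n + y] -= I[i*n + y] * P;
        for (int y = i; y < n; y++)      // only members at/right of the pivot in A
            A[x*n + y] -= A[i*n + y] * P;
    }
}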

So if you modify your gaussjordan kernel like this:

 __global__ void gaussjordan(float *A, float *I, int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i){ // this limits operation to rows below the pivot point
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] -= I[i*n+y]*P;  // apply for every row member
            if(y>=i){ // limits to row members to the right of the pivot
                A[x*n+y] -= A[i*n+y]*P;  // apply only to members right of the pivot
            }
        }
 }

I believe you'll have better results. With the above changes, I was able to duplicate your expected results to within the accuracy of float vs. double.
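
If you want to double-check the result, one option (my suggestion, not part of the original answer) is to multiply L by the computed inverse on the host and compare the product against the identity within a float-level tolerance:

#include <cmath>
#include <cstdio>

// Host-side check (illustrative): L * iL should be close to the identity.
// The tolerance is loose because the whole computation is done in float.
bool checkInverse(const float *L, const float *iL, int n, float tol = 1e-3f)
{
    bool ok = true;
    for (int r = 0; r < n; r++)
        for (int c = 0; c < n; c++) {
            float sum = 0.0f;
            for (int k = 0; k < n; k++)
                sum += L[r*n + k] * iL[k*n + c];
            float expected = (r == c) ? 1.0f : 0.0f;
            if (fabsf(sum - expected) > tol) {
                printf("mismatch at (%d,%d): %g\n", r, c, sum);
                ok = false;
            }
        }
    return ok;
}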