且构网

分享程序员开发的那些事...
且构网 - 分享程序员编程开发的那些事

CUDA,3D计算对象之间的距离矩阵

更新时间:2022-10-28 09:57:08

除非你使用非常大的分子,有可能不会有足够的工作,让GPU的忙,所以计算会更快的CPU。

如果你的意思是计算的欧氏距离,你的计算是不正确的。您需要3D版的勾股定理。

我会用一个SoA用于存储的坐标。

您要生成具有尽可能多的合并(coalesced)读写的内存访问模式。要做到这一点,应安排每个 warp 中 32 个线程所产生的地址或索引尽量彼此靠近(说法略有简化)。

threadIdx 指定线程在块内的索引,blockIdx 指定块在网格内的索引。同一 warp 内所有线程的 blockIdx 总是相同的;只有 threadIdx 在块内的线程之间变化。为了形象化 threadIdx 的 3 个维度如何分配给线程,可以把它们想象成嵌套循环,其中 x 是内层循环,z 是外层循环。因此,x 值相邻的线程最有可能处于同一 warp 内;如果 x 维度能被 32 整除,则只有 x / 32 值相同的线程才处于同一 warp 内。

我在下面为你的算法提供了一个完整的示例。在该示例中,索引 i 由 threadIdx.x 导出;为了检查各 warp 是否会产生合并的读写,我会在代码中对 i 依次代入几个连续的值(如 0、1、2),并检查由此生成的索引是否也是连续的。

由 j 索引生成的地址不那么重要,因为 j 源自 threadIdx.y,所以不太可能在同一 warp 内变化(如果 threadIdx.x 的维度能被 32 整除,则永远不会变化)。

#include "cuda_runtime.h"
#include <iostream>

using namespace std;

const int N(20);

#define check(ans) { _check((ans), __FILE__, __LINE__); }
inline void _check(cudaError_t code, char *file, int line)
{
  if (code != cudaSuccess) {
    fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

int div_up(int a, int b) {
  return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

__global__ void calc_distances(double* distances,
  double* atoms_x, double* atoms_y, double* atoms_z);

int main(int argc, char **argv)
{
  double* atoms_x_h;
  check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));

  double* atoms_y_h;
  check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));

  double* atoms_z_h;
  check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));

  for (int i(0); i < N; ++i) {
    atoms_x_h[i] = i;
    atoms_y_h[i] = i;
    atoms_z_h[i] = i;
  }

  double* atoms_x_d;
  check(cudaMalloc(&atoms_x_d, N * sizeof(double)));

  double* atoms_y_d;
  check(cudaMalloc(&atoms_y_d, N * sizeof(double)));

  double* atoms_z_d;
  check(cudaMalloc(&atoms_z_d, N * sizeof(double)));

  check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
  check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
  check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));

  double* distances_d;
  check(cudaMalloc(&distances_d, N * N * sizeof(double)));

  const int threads_per_block(256);
  dim3 n_blocks(div_up(N, threads_per_block));

  calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);

  check(cudaPeekAtLastError());
  check(cudaDeviceSynchronize());

  double* distances_h;
  check(cudaMallocHost(&distances_h, N * N * sizeof(double)));

  check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));

  for (int i(0); i < N; ++i) {
    for (int j(0); j < N; ++j) {
      cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
    }
  }

  check(cudaFree(distances_d));
  check(cudaFreeHost(distances_h));
  check(cudaFree(atoms_x_d));
  check(cudaFreeHost(atoms_x_h));
  check(cudaFree(atoms_y_d));
  check(cudaFreeHost(atoms_y_h));
  check(cudaFree(atoms_z_d));
  check(cudaFreeHost(atoms_z_h));

  return 0;
}

__global__ void calc_distances(double* distances,
  double* atoms_x, double* atoms_y, double* atoms_z)
{
  int i(threadIdx.x + blockIdx.x * blockDim.x);
  int j(threadIdx.y + blockIdx.y * blockDim.y);

  if (i >= N || j >= N) {
    return;
  }

  distances[i + N * j] =
    (atoms_x[i] - atoms_x[j]) * (atoms_x[i] - atoms_x[j]) +
    (atoms_y[i] - atoms_y[j]) * (atoms_y[i] - atoms_y[j]) +
    (atoms_z[i] - atoms_z[j]) * (atoms_z[i] - atoms_z[j]);
}
 

I have a "string"(molecule) of connected N objects(atoms) in 3D (each atom has a coordinates). And I need to calculate a distance between each pair of atoms in a molecule (see pseudo code below ). How could it be done with CUDA? Should I pass to a kernel function 2 3D Arrays? Or 3 arrays with coordinates: X[N], Y[N], Z[N]? Thanks.

struct atom { double x,y,z; }

int main()
{

//N number of atoms in a molecule
double DistanceMatrix[N][N];
double d;
atom Atoms[N];

for (int i = 0; i < N; i ++)
    for (int j = 0; j < N; j++)
      DistanceMatrix[i][j] = (atoms[i].x - atoms[j].x)*(atoms[i].x - atoms[j].x) +
                     (atoms[i].y - atoms[j].y)*(atoms[i].y - atoms[j].y) + (atoms[i].z - atoms[j].z)*(atoms[i].z - atoms[j].z);

 }

Unless you're working with very large molecules, there probably won't be enough work to keep the GPU busy, so calculations will be faster with the CPU.

If you meant to calculate the Euclidean distance, your calculation is not correct. You need the 3D version of the Pythagorean theorem.

I would use a SoA for storing the coordinates.

You want to generate a memory access pattern with as many coalesced reads and writes as possible. To do that, arrange for addresses or indexes generated by the 32 threads in each warp to be as close to each other as possible (a bit simplified).

threadIdx designates thread indexes within a block and blockIdx designates block indexes within the grid. blockIdx is always the same for all threads in a warp. Only threadIdx varies within the threads in a block. To visualize how the 3 dimensions of threadIdx are assigned to threads, think of them as nested loops where x is the inner loop and z is the outer loop. So, threads with adjacent x values are the most likely to be within the same warp and, if x is divisible by 32, only threads sharing the same x / 32 value are within the same warp.

I have included a complete example for your algorithm below. In the example, the i index is derived from threadIdx.x so, to check that warps would generate coalesced reads and writes, I would go over the code while inserting a few consecutive values such as 0, 1 and 2 for i and checking that the generated indexes would also be consecutive.

Addresses generated from the j index are less important as j is derived from threadIdx.y and so is less likely to vary within a warp (and will never vary if threadIdx.x is divisible by 32).

#include  "cuda_runtime.h"
#include <iostream>

using namespace std;

// Number of atoms in the molecule; the distance matrix is N x N doubles.
const int N(20);

// Wrap every CUDA API call: check(cudaMalloc(...)).
// do/while(0) makes the macro expand to a single statement, so it is safe
// inside an unbraced if/else (the plain { } form was not).
#define check(ans) do { _check((ans), __FILE__, __LINE__); } while (0)

// Report a CUDA error with its source location and terminate.
// `file`/`line` come from __FILE__/__LINE__ at the call site; __FILE__ is a
// string literal, so the parameter must be const char* (a non-const char*
// conversion is ill-formed in C++11).
inline void _check(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess) {
    fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

// Ceiling division: smallest k such that k * b >= a (for non-negative a,
// positive b). Used to round a data size up to a whole number of blocks.
int div_up(int a, int b) {
  const int quotient = a / b;
  const int remainder = a % b;
  return remainder ? quotient + 1 : quotient;
}

// Forward declaration. Fills the N x N matrix `distances` (indexed as
// distances[i + N * j]) with squared pairwise distances computed from the
// SoA coordinate arrays; definition follows main().
__global__ void calc_distances(double* distances,
  double* atoms_x, double* atoms_y, double* atoms_z);

int main(int argc, char **argv)
{
  // Pinned (page-locked) host buffers in SoA layout: one array per
  // coordinate component, so consecutive threads read consecutive addresses.
  double* atoms_x_h;
  check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));

  double* atoms_y_h;
  check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));

  double* atoms_z_h;
  check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));

  // Test data: atom k sits at (k, k, k).
  for (int i(0); i < N; ++i) {
    atoms_x_h[i] = i;
    atoms_y_h[i] = i;
    atoms_z_h[i] = i;
  }

  // Device-side copies of the coordinate arrays.
  double* atoms_x_d;
  check(cudaMalloc(&atoms_x_d, N * sizeof(double)));

  double* atoms_y_d;
  check(cudaMalloc(&atoms_y_d, N * sizeof(double)));

  double* atoms_z_d;
  check(cudaMalloc(&atoms_z_d, N * sizeof(double)));

  check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
  check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
  check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));

  // N x N output matrix, stored as distances[i + N * j].
  double* distances_d;
  check(cudaMalloc(&distances_d, N * N * sizeof(double)));

  // calc_distances derives i from the x thread dimension and j from the y
  // dimension, so the launch configuration must be 2D. (The previous 1D
  // launch left j == 0 for every thread, so only one row of the matrix was
  // ever written.) 32 x 8 = 256 threads per block, warp-aligned along x so
  // that reads/writes indexed by i coalesce.
  const dim3 threads_per_block(32, 8);
  const dim3 n_blocks(div_up(N, threads_per_block.x),
                      div_up(N, threads_per_block.y));

  calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);

  // Peek catches launch-configuration errors; the synchronize surfaces any
  // asynchronous execution errors from the kernel itself.
  check(cudaPeekAtLastError());
  check(cudaDeviceSynchronize());

  double* distances_h;
  check(cudaMallocHost(&distances_h, N * N * sizeof(double)));

  check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));

  for (int i(0); i < N; ++i) {
    for (int j(0); j < N; ++j) {
      cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
    }
  }

  check(cudaFree(distances_d));
  check(cudaFreeHost(distances_h));
  check(cudaFree(atoms_x_d));
  check(cudaFreeHost(atoms_x_h));
  check(cudaFree(atoms_y_d));
  check(cudaFreeHost(atoms_y_h));
  check(cudaFree(atoms_z_d));
  check(cudaFreeHost(atoms_z_h));

  return 0;
}

// Writes distances[i + N * j] = squared Euclidean distance between atoms i
// and j. i comes from the x thread dimension (fast-varying within a warp,
// so accesses indexed by i coalesce); j comes from the y dimension.
// Expects a 2D launch covering at least N threads in both x and y.
__global__ void calc_distances(double* distances,
  double* atoms_x, double* atoms_y, double* atoms_z)
{
  const int i(threadIdx.x + blockIdx.x * blockDim.x);
  const int j(threadIdx.y + blockIdx.y * blockDim.y);

  // Guard the grid tail: the launch rounds up to whole blocks, so some
  // threads fall outside the N x N matrix.
  if (i < N && j < N) {
    const double dx(atoms_x[i] - atoms_x[j]);
    const double dy(atoms_y[i] - atoms_y[j]);
    const double dz(atoms_z[i] - atoms_z[j]);
    distances[i + N * j] = dx * dx + dy * dy + dz * dz;
  }
}