CUDA和libc++abi.dylib中对象的共享内存错误

Question

我有以下问题（请记住，我对使用 CUDA 编程还很陌生）。

我有一个名为

vec3f

的类，它类似于

float3

数据类型，但具有重载运算符和其他向量函数。这些函数以

__device__ __host__

为前缀。然后，在我的内核中，我在

block_x

和

block_y

索引上做一个嵌套的 for 循环，并做类似的事情，

// Set up the dynamically sized shared-memory block. The array is declared
// extern and unsized, so its byte size is not fixed in this fragment.
// NOTE(review): presumably the launch must supply at least
// 2 * blockDim.x * blockDim.y * sizeof(vec3f) bytes -- confirm at the call site.
extern __shared__ vec3f share[];
// Positions occupy the first blockDim.x*blockDim.y entries ...
vec3f *sh_pos = share;
// ... and velocities start immediately after them.
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
// Each thread stages one particle at slot blockDim.x * threadIdx.x + threadIdx.y.
// NOTE(review): that slot formula looks collision-free and in range only for
// square blocks (blockDim.x == blockDim.y) -- confirm the block shape.
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
// Barrier so the staged values are complete before any thread proceeds.
__syncthreads();

在上面的代码中，

oldParticles

是一个指向传递给内核的名为

particles

的类的指针。

oldParticles

实际上是一个

thrust::device_vector<particle>

的底层指针（我不确定这是否与它有关）。一切都可以编译，但是当我运行时出现错误

libc++abi.dylib: terminate called throwing an exception
Abort trap: 6

感谢您的回复。我认为错误与我没有为传递给我的内核的参数分配空间有关。在我的主机代码中执行以下操作修复了此错误，

// Raw pointers for the "old" and "new" particle buffers handed to the kernel.
particle* particle_ptrs[2];
// Take the underlying pointers of the two Thrust device vectors.
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
// NOTE(review): cudaMalloc stores a newly allocated address into each slot, so
// the pointers obtained from the device vectors above are overwritten here and
// no longer refer to the vectors' storage.
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );

然后是内核调用，

// Launch with sharedMemSize bytes of dynamic shared memory; the arguments are
// the two raw particle buffers and the time step.
force_kernel<<< grid,block,sharedMemSize  >>>(particle_ptrs[0],particle_ptrs[1],time_step);

我现在遇到的问题似乎是我无法将数据从设备复制回主机。我想这和我对 Thrust 不熟悉有关。

我正在做一系列的复制如下，

// Host-side vector of particles (assume it has already been filled in).
thrust::host_vector<particle> h_particles;
// Device-side vectors; both are default-constructed, i.e. empty.
thrust::device_vector<particle> d_old_particles, d_new_particles;
// Host -> device copy through Thrust's vector assignment.
d_old_particles = h_particles;
// Launch the kernel as shown above, passing the raw pointers obtained from
// the Thrust vectors.
// Open question: the kernel writes through particle_ptrs[1], so shouldn't
// d_new_particles be modified as well?
// Device -> host copy of the results.
h_particles = d_new_particles;

所以我想我的问题是，我可以在内核中修改 Thrust

device_vector

（在本例中为

particle_ptrs[0]

），然后将修改保存到内核中的另一个 Thrust

device_vector

（在本例中为

particle_ptrs[1]

），然后在退出内核之后，再将其复制到一个

host_vector

?

我仍然无法让它工作。我做了一个更短的例子，我遇到了同样的问题，

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
// Block edge length; used for both the x and y extent of `block` in main.
const int BLOCK_SIZE = 8;
// Number of particles created on the host.
const int max_particles = 64;
// Time step passed to the kernel.
const float dt = 0.01;

using namespace std;
//particle class
// Plain particle record: velocity, position and density, all public.
class particle {
public:
  // Default constructor: zero velocity, zero position, zero density.
  particle() : 
    _velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
  };
  // Builds a particle from a position and a velocity; density starts at zero.
  // NOTE(review): the initializer list names _position before _velocity, but
  // members are initialized in declaration order (_velocity first).
  particle(const vec3f& pos, const vec3f& vel) :
    _position(pos), _velocity(vel), _density(0.0) {
  };

  vec3f _velocity;  // velocity vector
  vec3f _position;  // position vector
  float _density;   // scalar density
};

// Forward declaration of the kernel; the definition follows main.
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);

// File-scope Thrust vectors. All three are default-constructed, i.e. empty.
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
// Raw pointers passed to the kernel: [0] for the old buffer, [1] for the new one.
particle* particle_ptrs[2];

// Fills the host vector, launches kernel_func once and prints each particle's
// position after copying new_parts back to the host.
int main() {
  // Load the host vector with max_particles identical particles:
  // position (0.5,0.5,0.5), velocity (10,10,10).
  for (int i =0; i<max_particles; i++) {
    h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
  }

  // NOTE(review): nothing has sized old_parts/new_parts before this point, so
  // element 0 is taken from empty vectors.
  particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
  particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
  // NOTE(review): cudaMalloc overwrites both slots with freshly allocated
  // addresses that the Thrust vectors know nothing about; the return codes
  // are not checked.
  cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
  cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
  // Copy host particles to the old device vector. This assignment happens
  // after particle_ptrs[0] was captured and then overwritten above.
  old_parts = h_parts;
  // Kernel block and grid dimensions.
  dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
  dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
  // NOTE(review): the launch configuration is written <<<block,grid>>>; the
  // first launch parameter is the grid dimension, so the two are swapped.
  // No error check follows the launch.
  kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
  // Copy new device particles back to host particles.
  // NOTE(review): new_parts is never resized or assigned through the container
  // in main, so this copies what the container holds, not the cudaMalloc'd
  // buffer the kernel was given.
  h_parts = new_parts;
  for (int i =0; i<max_particles; i++) {
    particle temp1 = h_parts[i];
    cout << temp1._position << endl;
  }  
  // Release the Thrust device vectors' storage explicitly.
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();
  return 0;
}

//kernel function
// Kernel: each thread scales one particle's velocity, position and density by
// 10.0 * dt and writes the result to the same slot of new_parts.
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  // Global 2D thread coordinates.
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  // Flatten (x, y) using the grid's total width in threads as the row length.
  unsigned int arr_pos = y*blockDim.x*gridDim.x + x;

  // NOTE(review): there is no bounds check on arr_pos, and 10.0 is a double
  // literal, so the scalar factor is a double.
  new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
  new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
  new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}

所以

host_vector

对于所有 64 个粒子都有一个

(0.5,0.5,0.5)

的初始位置。然后内核尝试将其乘以 10 以给出

(5,5,5)

作为所有粒子的位置。但是当我

cout

数据时我没有看到这个。它仍然只是

(0.5,0.5,0.5)

。我分配内存的方式有问题吗？下面这一行有没有问题：

  //copy new device particles back to host particles
  h_parts = new_parts;

可能是什么问题？谢谢。

Answer 1

您发布的代码存在各种问题

您的
```
block
```
和
```
grid
```
变量在您的内核调用中颠倒了。
```
grid
```
应当是第一个启动参数。
您应该对内核和运行时 API 调用进行 cuda 错误检查。
您使用
```
cudaMalloc
```
对由空设备向量经 raw_pointer_cast 得到的指针分配存储，这种做法是不明智的。vector 容器并不知道您“在后台”做了这件事。相反，您可以在实例化时直接为设备向量分配存储空间，例如：
```
// Size both device vectors at construction so the containers own max_particles elements each.
thrust::device_vector<particle> old_parts(max_particles), new_parts(max_particles);
```
你说你期望 5、5、5，但你的内核乘以 10，然后乘以
```
dt
```
，即 0.01，所以我相信正确的输出是 0.05、0.05、0.05
您的网格计算 (int(sqrt...))，对于任意
```
max_particles
```
要么不能保证产生足够的块（如果将浮点数转换为 int 时截断或向下舍入），要么会产生额外的块（如果它向上舍入）。向下舍入的情况很糟糕，因为会有粒子得不到处理。我们应该使用
```
ceil
```
函数或其他网格计算方法来处理它。向上舍入的情况（这正是
```
ceil
```
所做的）没问题，但我们需要处理网格可能启动多余块/线程的情况。我们通过内核中的线程边界检查来做到这一点。网格计算也存在其他问题。我们要取
```
max_particles
```
的平方根，然后将它除以特定方向上的块尺寸，得到该方向上的网格尺寸。

这是我根据这些变化修改的一些代码，它似乎产生了正确的输出（0.05、0.05、0.05）。请注意，我必须进行一些其他更改，因为我手边没有您的“vec3f.h”头文件，所以我改用了

float3

。

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector_functions.h>

// Edge length of the square thread block (threads along x and along y).
const int BLOCK_SIZE = 8;
// Total number of particles in the example.
const int max_particles = 64;
// Simulation time step handed to the kernel.
const float dt = 0.01f;

using namespace std;
//particle class
// A single particle: velocity, position and density, all public.
//
// Both constructors are marked __host__ __device__. The type is stored in
// thrust::device_vector, which may construct elements in device code, and a
// host-only constructor cannot be called there.
class particle {
public:
  // Default constructor: zero velocity, zero position, zero density.
  __host__ __device__ particle() :
    _velocity(make_float3(0.0f, 0.0f, 0.0f)),
    _position(make_float3(0.0f, 0.0f, 0.0f)),
    _density(0.0f)
  {
  }

  // Builds a particle from a position and a velocity; density starts at zero.
  // The initializer list follows member declaration order (_velocity, then
  // _position), which is the order in which members are actually initialized.
  __host__ __device__ particle(const float3& pos, const float3& vel) :
    _velocity(vel), _position(pos), _density(0.0f)
  {
  }

  float3 _velocity;  // velocity vector
  float3 _position;  // position vector
  float _density;    // scalar density
};

// Forward declaration of the kernel so main can launch it; the definition follows main.
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);


// Builds max_particles identical particles on the host, copies them to the
// device, runs kernel_func once, then copies the results back and prints each
// particle's position.
//
// Returns 0 on success, 1 if the kernel launch or its execution reports a
// CUDA error.
int main() {
  // Host particles: position (0.5,0.5,0.5), velocity (10,10,10).
  thrust::host_vector<particle> h_parts(
      max_particles,
      particle(make_float3(0.5f, 0.5f, 0.5f), make_float3(10.0f, 10.0f, 10.0f)));

  // Copy host particles to the old device vector, and size the output vector
  // up front so that Thrust owns the storage the kernel writes to.
  thrust::device_vector<particle> old_parts = h_parts;
  thrust::device_vector<particle> new_parts(max_particles);

  // Raw device pointers for the kernel; taken only after both vectors have
  // their final size, so they refer to the vectors' real storage.
  particle* particle_ptrs[2];
  particle_ptrs[0] = thrust::raw_pointer_cast(old_parts.data());
  particle_ptrs[1] = thrust::raw_pointer_cast(new_parts.data());

  // Kernel block and grid dimensions: take the square root of the particle
  // count, divide by the block extent in each direction and round up. Extra
  // threads are filtered out by the bounds check inside the kernel.
  dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
  const float side = sqrt(float(max_particles));
  dim3 grid((int)ceil(side / float(block.x)), (int)ceil(side / float(block.y)), 1);
  cout << "grid x: " << grid.x << "  grid y: "  << grid.y << endl;

  kernel_func<<<grid, block>>>(particle_ptrs[0], particle_ptrs[1], dt);
  // A launch does not return an error itself: configuration errors show up in
  // cudaGetLastError(), execution errors at the next synchronizing call.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    err = cudaDeviceSynchronize();
  }
  if (err != cudaSuccess) {
    cerr << "kernel_func failed: " << cudaGetErrorString(err) << endl;
    return 1;
  }

  // Copy new device particles back to host particles.
  h_parts = new_parts;
  for (int i = 0; i < max_particles; i++) {
    const particle& p = h_parts[i];
    cout << p._position.x << "," << p._position.y << "," << p._position.z << endl;
  }

  // Release the device storage explicitly before returning.
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();

  return 0;
}

//kernel function
// Kernel: scales every field of one particle by 10 * dt.
//
// Expects a 2D launch. Each thread maps its global (x, y) coordinate to a flat
// index, using the grid's total width in threads as the row length. Threads
// whose index is not below max_particles do nothing, so the grid may be
// rounded up.
//
// old_parts: input particles (read only here).
// new_parts: output particles, written at the same index.
// dt:        time step.
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  // Flat array position for the 2D grid.
  const unsigned int arr_pos = y * (blockDim.x * gridDim.x) + x;
  if (arr_pos >= max_particles) {
    return;  // surplus thread from a rounded-up grid
  }

  // Float literal: a bare 10.0 is a double and would promote every multiply
  // below to double precision.
  const float scale = 10.0f * dt;

  const particle& src = old_parts[arr_pos];
  particle& dst = new_parts[arr_pos];

  dst._velocity.x = src._velocity.x * scale;
  dst._velocity.y = src._velocity.y * scale;
  dst._velocity.z = src._velocity.z * scale;
  dst._position.x = src._position.x * scale;
  dst._position.y = src._position.y * scale;
  dst._position.z = src._position.z * scale;
  dst._density = src._density * scale;
}

CUDA和libc++abi.dylib中对象的共享内存错误

问题描述投票：0回答：1

1个回答

最新问题

CUDA和libc++abi.dylib中对象的共享内存错误

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1