1. Problem symptoms
The program does three-frequency, four-step phase-shift demodulation and phase unwrapping. Using CUDA streams, the work is split into 16 segments that are copied and computed asynchronously on 16 streams.
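The demodulation kernels themselves are not shown in this post. For context only, a per-pixel four-step phase-shift computation typically looks like the sketch below; the kernel name and the assumption of standard 0°/90°/180°/270° shifts are mine, not taken from the original code.

```cpp
// Sketch only: wrapped phase for one fringe frequency from four phase-shifted
// intensity images I1..I4 (assumed to be captured at 0, 90, 180, 270 degrees).
__global__ void wrappedPhaseKernel(const unsigned char *I1, const unsigned char *I2,
                                   const unsigned char *I3, const unsigned char *I4,
                                   float *phase, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        // standard four-step relation: phi = atan2(I4 - I2, I1 - I3)
        float num = (float)I4[idx] - (float)I2[idx];
        float den = (float)I1[idx] - (float)I3[idx];
        phase[idx] = atan2f(num, den); // wrapped phase in (-pi, pi]
    }
}
```

The wrapped phases from the three fringe frequencies are then combined for phase unwrapping, which is presumably what the `wavelenth` array in the code below feeds into.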
The problem only appears after many runs, and when two computations are performed back to back it looks as if only one of the images produces a result:

For the horizontal-fringe phase demodulation, only a little more than one segment is valid.
For the vertical-fringe phase demodulation, only 3 segments contain valid results.
2. Problem code
```cpp
void cu_calPhase12Src(
    const uchar *src1_1, const uchar *src1_2, const uchar *src1_3, const uchar *src1_4,
    const uchar *src2_1, const uchar *src2_2, const uchar *src2_3, const uchar *src2_4,
    const uchar *src3_1, const uchar *src3_2, const uchar *src3_3, const uchar *src3_4,
    float *dst, int nWidth, int nHeight,
    float *wavelenth, int *times)
{
    // checks
    int nStreams(16);    // number of streams to create
    int nLeng = nWidth * nHeight;
    // dim3 block(32, 32);
    dim3 block(1024, 1); // may try 256 here later
    int nSize = (nLeng + nStreams - 1) / nStreams;
    dim3 grid((nSize + block.x - 1) / block.x, 1);
    // allocate device memory
    float *d_waveLength;
    int *d_times;
    cudaMalloc(&d_waveLength, 8 * sizeof(float));
    cudaMalloc(&d_times, sizeof(int));
    // keep these two copies synchronous
    cudaMemcpy(d_waveLength, wavelenth, 8 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_times, times, sizeof(int), cudaMemcpyHostToDevice);
    unsigned char *d_src1_1, *d_src1_2, *d_src1_3, *d_src1_4;
    unsigned char *d_src2_1, *d_src2_2, *d_src2_3, *d_src2_4;
    unsigned char *d_src3_1, *d_src3_2, *d_src3_3, *d_src3_4;
    cudaMalloc(&d_src1_1, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src1_2, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src1_3, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src1_4, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src2_1, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src2_2, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src2_3, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src2_4, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src3_1, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src3_2, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src3_3, nLeng * sizeof(unsigned char));
    cudaMalloc(&d_src3_4, nLeng * sizeof(unsigned char));
    // device memory for the result
    float *d_dst;
    cudaMalloc(&d_dst, nLeng * sizeof(float));
    // the size of each segment may differ, so it also needs to be passed in
    int *d_size;
    cudaMalloc(&d_size, nStreams * sizeof(int));
    // create the streams
    cudaStream_t stream[nStreams];
    for (size_t i = 0; i < nStreams; i++)
    {
        cudaStreamCreate(&stream[i]);
    }
    // issue copies and kernel launches on each stream
    for (int i = 0; i < nStreams; ++i)
    {
        // this CPU-side code may interfere with asynchronous execution; hoist it out if necessary
        // int size = nSize < (nSize * i) ? nSize : (nLeng - nSize * i);
        int ni = i * nSize;
        int size = min(nLeng - nSize * i, nSize);
        // copy data from host memory to device memory
        cudaMemcpyAsync(d_src1_1 + ni, src1_1 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src1_2 + ni, src1_2 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src1_3 + ni, src1_3 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src1_4 + ni, src1_4 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src2_1 + ni, src2_1 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src2_2 + ni, src2_2 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src2_3 + ni, src2_3 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src2_4 + ni, src2_4 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src3_1 + ni, src3_1 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src3_2 + ni, src3_2 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src3_3 + ni, src3_3 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_src3_4 + ni, src3_4 + ni, size, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(d_size + i, src3_4 + i, sizeof(int), cudaMemcpyHostToDevice, stream[i]);
        // launch the kernel on the device data
        if (times[0] == 3)
        {
            cuda_calPhase12Src_times3_kernel<<<grid, block, 0, stream[i]>>>(
                d_src1_1 + ni, d_src1_2 + ni, d_src1_3 + ni, d_src1_4 + ni,
                d_src2_1 + ni, d_src2_2 + ni, d_src2_3 + ni, d_src2_4 + ni,
                d_src3_1 + ni, d_src3_2 + ni, d_src3_3 + ni, d_src3_4 + ni,
                d_dst + ni, d_size + i,
                d_waveLength);
        }
        else
        {
            cuda_calPhase12Src_times2_kernel<<<grid, block, 0, stream[i]>>>(
                d_src1_1 + ni, d_src1_2 + ni, d_src1_3 + ni, d_src1_4 + ni,
                d_src2_1 + ni, d_src2_2 + ni, d_src2_3 + ni, d_src2_4 + ni,
                d_src3_1 + ni, d_src3_2 + ni, d_src3_3 + ni, d_src3_4 + ni,
                d_dst + ni, d_size + i,
                d_waveLength);
        }
        // copy the result back from device memory to host memory
        cudaMemcpyAsync(dst + ni, d_dst + ni, size * sizeof(float), cudaMemcpyDeviceToHost, stream[i]);
    }
    // wait for all streams to finish
    for (size_t i = 0; i < nStreams; i++)
    {
        cudaStreamSynchronize(stream[i]);
        cudaStreamDestroy(stream[i]);
    }
    cudaFree(d_src1_1);
    cudaFree(d_src1_2);
    cudaFree(d_src1_3);
    cudaFree(d_src1_4);
    cudaFree(d_src2_1);
    cudaFree(d_src2_2);
    cudaFree(d_src2_3);
    cudaFree(d_src2_4);
    cudaFree(d_src3_1);
    cudaFree(d_src3_2);
    cudaFree(d_src3_3);
    cudaFree(d_src3_4);
    cudaFree(d_waveLength);
    cudaFree(d_times);
    cudaFree(d_dst);
    cudaFree(d_size);
    return;
}
```
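A side note on the listing: none of the CUDA API return codes are checked (the `// checks` comment is empty), which makes problems like this one harder to localize. A conventional error-checking wrapper, sketched here on my own assumptions rather than taken from the original code, would look something like:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch of a typical CUDA error-checking macro (not part of the original code).
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s at %s:%d: %s\n",               \
                    cudaGetErrorName(err_), __FILE__, __LINE__,           \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Example usage inside cu_calPhase12Src:
//     CUDA_CHECK(cudaMalloc(&d_dst, nLeng * sizeof(float)));
//     CUDA_CHECK(cudaMemcpyAsync(dst + ni, d_dst + ni, size * sizeof(float),
//                                cudaMemcpyDeviceToHost, stream[i]));
```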
3. Follow-up
Still under investigation at this point.

Profiling with nvvp showed the following: the execution time of the same kernel differs wildly between streams, and the streams whose kernels take essentially no time produce all-black output. The cause was not yet known; all that was clear was that something was going wrong at the kernel-execution level.
![](https://img.haomeiwen.com/i19536936/b858c0c63ccc02cf.png)
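Kernel launch and execution failures do not surface at the launch site; they only show up through `cudaGetLastError` or a later synchronizing call. When bisecting a per-stream problem like this one, a debug-only check right after each launch (again a sketch of mine, not code from the post) can at least narrow down which segment misbehaves:

```cpp
// Debug sketch: placed immediately after each kernel launch in the stream loop
// (needs <cstdio>).
cudaError_t launchErr = cudaGetLastError(); // launch-configuration errors
if (launchErr != cudaSuccess)
    fprintf(stderr, "stream %d: launch failed: %s\n", i, cudaGetErrorString(launchErr));

#ifdef DEBUG_STREAMS // serializes the streams, so debug builds only
cudaError_t execErr = cudaStreamSynchronize(stream[i]);
if (execErr != cudaSuccess)
    fprintf(stderr, "stream %d: kernel failed: %s\n", i, cudaGetErrorString(execErr));
#endif
```

In this case, as it turned out, the kernels were apparently not failing at all but running on a bogus work size, so such checks stay quiet; they do, however, quickly rule out a whole class of causes.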
In the end the cause turned out to be the way the per-segment length was written to the device: `d_size` was being filled from the image buffer `src3_4` instead of from the actual segment size, so each kernel presumably saw a garbage element count, which would explain both the near-zero kernel times and the black segments.
```cpp
// wrong
cudaMemcpyAsync(d_size + i, src3_4 + i, sizeof(int), cudaMemcpyHostToDevice, stream[i]);
// correct
cudaMemcpyAsync(d_size + i, &size, sizeof(int), cudaMemcpyHostToDevice, stream[i]);
```
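As a further hardening step, the per-segment lengths can be computed once into storage that outlives the loop (and can be pinned if the copy should be truly asynchronous), instead of copying from a loop-local variable each iteration. A sketch, with helper and variable names of my own invention:

```cpp
#include <algorithm>
#include <vector>

// Sketch: lengths of nStreams segments covering nLeng elements in chunks of nSize.
static std::vector<int> segmentSizes(int nLeng, int nStreams)
{
    int nSize = (nLeng + nStreams - 1) / nStreams;
    std::vector<int> sizes(nStreams);
    for (int i = 0; i < nStreams; ++i)
        sizes[i] = std::min(nSize, nLeng - i * nSize); // last segment may be shorter
    return sizes;
}

// Inside cu_calPhase12Src this allows a single synchronous upload before the stream loop:
//     std::vector<int> segSizes = segmentSizes(nLeng, nStreams);
//     cudaMemcpy(d_size, segSizes.data(), nStreams * sizeof(int), cudaMemcpyHostToDevice);
```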