说明和参考
- 这里面用到了 CUDA 求和(cv::cuda::sum),它比普通的 CUDA 内核函数慢不少,建议了解如何用 CUDA 编写归约(reduction)函数;
- 还用到了 CUDA 排序(thrust::sort),其内部同样基于归约类的并行原语实现;
- 求解svd使用的是
cuSOLVER
库的cusolverDnXgesvd()
方法,根据示例的方法需要申请一些句柄,这个会非常耗时,建议不要轻易申请和释放。
cuda 计算SVD:cuda计算SVD接口文档翻译与示例代码记录;
Thrust:https://nvidia.github.io/thrust/,https://docs.nvidia.com/cuda/thrust/index.html,https://thrust.github.io/
cuda排序:cuda实现排序(纯享版,gpt4提供的代码)
代码
// NOTE(review): "GLOABAL" is a typo of "GLOBAL"; kept as-is because other
// translation units may already reference this name. The three-level alias
// chain ultimately expands C_G__ to the CUDA __global__ kernel qualifier.
#define CUDA_GLOABAL __global__
#define CUDA_G__ CUDA_GLOABAL
#define C_G__ CUDA_G__
/**
 * Builds the weighted, mean-centered design matrix WA.
 *
 * Layout of d_dst_WA: dense column-major m x 3 with lda == length, i.e.
 * column 0 = W[i]*(x[i]-x_mean), column 1 = W[i]*(y[i]-y_mean),
 * column 2 = W[i]*(z[i]-z_mean). The caller must provide a contiguous
 * buffer of 3*length doubles. Launch with a 1-D grid covering >= length
 * threads; out-of-range threads exit immediately.
 */
C_G__ void cuda_xyz2WA_kernel(
    const double *d_src_x, const double *d_src_y,
    const double *d_src_z, const double *d_src_W,
    double x_mean, double y_mean, double z_mean,
    double *d_dst_WA, int length)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x; // flat 1-D global index
    if (idx >= length)
        return;
    const double w = d_src_W[idx]; // per-point weight, read once
    d_dst_WA[idx] = (d_src_x[idx] - x_mean) * w;
    d_dst_WA[idx + length] = (d_src_y[idx] - y_mean) * w;
    d_dst_WA[idx + 2 * length] = (d_src_z[idx] - z_mean) * w;
}
/**
 * Computes the signed residual of each point against the fitted plane:
 * R[i] = dot(normal, p[i] - mean).
 *
 * d_vt is the 3x3 VT matrix from the SVD in column-major order; the plane
 * normal (right singular vector of the smallest singular value) is its
 * third row, i.e. elements 2, 5 and 8. Launch with a 1-D grid covering
 * >= length threads.
 */
C_G__ void cuda_calR_kernel(
    const double *d_src_x, const double *d_src_y,
    const double *d_src_z, const double *d_vt,
    double x_mean, double y_mean, double z_mean,
    double *d_dst_R, int length)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x; // flat 1-D global index
    if (idx >= length)
        return;
    d_dst_R[idx] = d_vt[2] * (d_src_x[idx] - x_mean)
                 + d_vt[5] * (d_src_y[idx] - y_mean)
                 + d_vt[8] * (d_src_z[idx] - z_mean);
}
/**
 * Updates the BiSquare (Tukey) weights and the weighted squared residuals.
 *
 * For each residual r: u = r / (K*s); W = (1-u^2)^2 if |u| < 1, else 0;
 * WR = W * r^2. s is the robust scale estimate (MAD/0.6745), K the BiSquare
 * tuning constant.
 *
 * NOTE(review): if s == 0 (all residuals identical) u becomes inf/NaN and
 * every weight collapses to 0 — the caller should guard; TODO confirm.
 */
C_G__ void cuda_calNewWeightAndWR_kernel(
    const double *d_src_R,
    double *d_dst_W, double *d_dst_WR,
    int length, double s, double K)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x; // flat 1-D global index
    if (x < length)
    {
        const double u_i = d_src_R[x] / (K * s);
        // fix: fabs() for doubles — a bare abs() can resolve to the integer
        // overload in some toolchains and silently truncate |u_i| to 0.
        if (fabs(u_i) < 1.0)
        {
            const double t = 1.0 - u_i * u_i;
            d_dst_W[x] = t * t;
        }
        else
        {
            d_dst_W[x] = 0.0; // fix: double literal (was float 0.f)
        }
        d_dst_WR[x] = d_dst_W[x] * (d_src_R[x] * d_src_R[x]);
    }
}
/**
 * Robust plane fit a*x + b*y + c*z + d = 0 on the GPU via iteratively
 * re-weighted least squares with the BiSquare (Tukey) weight function.
 * Each iteration: weighted centroid -> weighted centered matrix WA ->
 * SVD (cusolverDnXgesvd) -> residuals -> robust scale (MAD) -> new weights,
 * until the total weighted loss stops changing or maxIter is reached.
 *
 * @param vPts     input 3-D points (at least 3 required)
 * @param ptMeans  out: weighted centroid of the points
 * @param ptStddev out: only cleared in the degenerate branch; never computed
 *                 on the success path — NOTE(review): TODO confirm intent.
 * @param a,b,c,d  out: plane coefficients
 * @return 0 on success, 1 if fewer than 3 points were supplied
 */
int BiSquarePlaneFitting_gpu(const std::vector<Point3f> &vPts,
                             Point3d &ptMeans, Point3d &ptStddev,
                             float &a, float &b, float &c, float &d)
{
    using data_type = double;
    size_t sz = vPts.size();
    if (sz < 3)
    {
        // degenerate input: a plane needs at least 3 points
        a = 0.f;
        b = 0.f;
        c = 0.f;
        d = 0.f;
        ptMeans = Point3d{};
        ptStddev = Point3d{};
        return 1;
    }
    double xSum{}, ySum{}, zSum{};
    double xMean{}, yMean{}, zMean{};
    // cuSOLVER / cuBLAS handles and the stream they operate on
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;
    /* step 1: create cusolver handle, bind a stream */
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
    CUBLAS_CHECK(cublasCreate(&cublasH));
    // fix: use a BLOCKING stream (was cudaStreamNonBlocking). The raw kernel
    // launches and the cv::cuda primitives below run on the legacy default
    // stream; a non-blocking stream does NOT synchronize with stream 0, so
    // cusolver could read gWA (or the kernels could read d_VT) while the
    // other stream is still writing. Legacy default-stream semantics order a
    // blocking stream against stream 0 automatically.
    CUDA_CHECK(cudaStreamCreate(&stream));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));
    // initial weights: all ones
    cv::Mat_<data_type> mWeight = cv::Mat_<data_type>::ones(sz, 1);
    cv::cuda::GpuMat gW; // per-point weights on the device
    gW.upload(mWeight);
    cv::Mat_<data_type> mX(sz, 1), mY(sz, 1), mZ(sz, 1);
    for (size_t i = 0; i < sz; i++)
    {
        mX(i, 0) = vPts[i].x;
        mY(i, 0) = vPts[i].y;
        mZ(i, 0) = vPts[i].z;
    }
    // raw X/Y/Z coordinates (and their weighted copies) on the device
    cv::cuda::GpuMat gX, gY, gZ;
    cv::cuda::GpuMat gWX, gWY, gWZ;
    gX.upload(mX);
    gY.upload(mY);
    gZ.upload(mZ);
    // iteration termination parameters
    size_t maxIter = sz * (sz - 1) / 2;
    maxIter = min(maxIter, (size_t)50);
    data_type sigma = 1e-8; ///< relative tolerance on the total weighted loss
    // fix: allocate gWA as a single row so GpuMat performs a plain cudaMalloc
    // and the buffer is dense. A 3 x sz GpuMat is pitch-allocated (rows are
    // padded), which breaks both the kernel's "row r starts at r*length"
    // addressing and cusolver's lda == m assumption. Logical layout here is
    // column-major m x 3 with lda = sz.
    cv::cuda::GpuMat gWA(1, 3 * (int)sz, CV_64FC1);
    cv::cuda::GpuMat gR(sz, 1, CV_64FC1);     // signed plane residuals
    cv::cuda::GpuMat gRsort(sz, 1, CV_64FC1); // |R - center|, sorted for the MAD
    cv::cuda::GpuMat gWR(sz, 1, CV_64FC1);    // weighted squared residuals
    double K = 4.685;   ///< BiSquare tuning constant
    double lastSumWR{}; ///< total loss of the previous iteration
    double minDelta{};
    const int threadSizeX = 1024;
    dim3 gridSize((sz + threadSizeX - 1) / threadSizeX, 1);
    dim3 blockSize(threadSizeX, 1);
    data_type *d_S = nullptr;
    data_type *d_VT = nullptr;
    int *d_info = nullptr;
    data_type *d_work = nullptr;
    data_type *h_work = nullptr;
    size_t workspaceInBytesOnDevice = 0;
    size_t workspaceInBytesOnHost = 0;
    int info = 0;
    signed char jobu = 'N';  // U is not needed
    signed char jobvt = 'A'; // full VT is required (we read its third row)
    const int64_t m = sz;
    const int64_t n = 3;
    const int64_t lda = m;
    const int64_t ldvt = n;
    /* device allocations */
    int64_t S_sz = min(m, n);
    int64_t vt_sz = ldvt * n;
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_S), sizeof(data_type) * S_sz));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_VT), sizeof(data_type) * vt_sz));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));
    std::vector<data_type> VT(vt_sz, 0);
    std::vector<data_type> S(min(m, n), 0);
    // 0-based index/count of the middle element(s) of the sorted residuals
    int median_length = 1;
    int median_start = (int)sz / 2; // fix: odd count -> middle element is sz/2
    if (sz % 2 == 0)
    {
        median_length = 2;
        median_start = (int)sz / 2 - 1; // fix: even count -> average sz/2-1 and sz/2
    }
    std::vector<data_type> median_s(median_length, 0);
    /* query the SVD workspace sizes */
    CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
        cusolverH,
        nullptr,
        jobu,
        jobvt,
        m,
        n,
        traits<data_type>::cuda_data_type,
        gWA.ptr<data_type>(),
        lda,
        traits<data_type>::cuda_data_type,
        d_S,
        traits<data_type>::cuda_data_type,
        nullptr, // d_U is not computed (jobu == 'N')
        lda,
        traits<data_type>::cuda_data_type,
        d_VT,
        ldvt,
        traits<data_type>::cuda_data_type,
        &workspaceInBytesOnDevice,
        &workspaceInBytesOnHost));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));
    h_work = (data_type *)malloc(workspaceInBytesOnHost);
    if (!h_work)
    {
        // fix: release GPU resources before throwing so nothing leaks
        cudaFree(d_S);
        cudaFree(d_VT);
        cudaFree(d_info);
        cudaFree(d_work);
        cusolverDnDestroy(cusolverH);
        cublasDestroy(cublasH);
        cudaStreamDestroy(stream);
        throw std::bad_alloc();
    }
    int length = (int)sz;
    for (size_t iter = 0; iter < maxIter; iter++)
    {
        // weighted means of the coordinates under the current weights
        xSum = 0.0;
        ySum = 0.0;
        zSum = 0.0;
        double w_iSum{0.0}; ///< sum of the weights
        cv::cuda::multiply(gX, gW, gWX);
        cv::cuda::multiply(gY, gW, gWY);
        cv::cuda::multiply(gZ, gW, gWZ);
        xSum = cv::cuda::sum(gWX)[0];
        ySum = cv::cuda::sum(gWY)[0];
        zSum = cv::cuda::sum(gWZ)[0];
        w_iSum = cv::cuda::sum(gW)[0];
        xMean = xSum / w_iSum;
        yMean = ySum / w_iSum;
        zMean = zSum / w_iSum;
        // build the weighted, centered design matrix WA (column-major m x 3)
        cuda_xyz2WA_kernel<<<gridSize, blockSize>>>(
            gX.ptr<data_type>(), gY.ptr<data_type>(), gZ.ptr<data_type>(),
            gW.ptr<data_type>(), xMean, yMean, zMean,
            gWA.ptr<data_type>(), length);
        CUDA_CHECK(cudaGetLastError()); // fix: surface launch-config errors
#ifdef _DEBUG
        cv::Mat mXg2h_debug;
        cv::Mat mYg2h_debug;
        cv::Mat mZg2h_debug;
        cv::Mat mWg2h_debug;
        cv::Mat mWAg2h_debug;
        gX.download(mXg2h_debug);
        gY.download(mYg2h_debug);
        gZ.download(mZg2h_debug);
        gW.download(mWg2h_debug);
        gWA.download(mWAg2h_debug);
        cv::imwrite("mXg2h_debug.tiff", mXg2h_debug);
        cv::imwrite("mYg2h_debug.tiff", mYg2h_debug);
        cv::imwrite("mZg2h_debug.tiff", mZg2h_debug);
        cv::imwrite("mWg2h_debug.tiff", mWg2h_debug);
        cv::imwrite("mWAg2h_debug.tiff", mWAg2h_debug);
#endif
        /* step 4: SVD of WA — smallest right singular vector is the normal */
        CUSOLVER_CHECK(cusolverDnXgesvd(
            cusolverH,
            nullptr,
            jobu,
            jobvt,
            m,
            n,
            traits<data_type>::cuda_data_type,
            gWA.ptr<data_type>(),
            lda,
            traits<data_type>::cuda_data_type,
            d_S,
            traits<data_type>::cuda_data_type,
            nullptr, // d_U not computed
            lda,
            traits<data_type>::cuda_data_type,
            d_VT,
            ldvt,
            traits<data_type>::cuda_data_type,
            d_work,
            workspaceInBytesOnDevice,
            h_work,
            workspaceInBytesOnHost,
            d_info));
#ifdef _DEBUG
        CUDA_CHECK(cudaMemcpyAsync(VT.data(), d_VT, sizeof(data_type) * vt_sz,
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
        // dump the whole VT matrix
        std::printf("VT = (matlab base-1)\n");
        print_matrix(n, n, VT.data(), ldvt);
        std::printf("=====\n");
#endif
        // signed residuals of all points against the current plane
        cuda_calR_kernel<<<gridSize, blockSize>>>(
            gX.ptr<data_type>(), gY.ptr<data_type>(), gZ.ptr<data_type>(),
            d_VT, xMean, yMean, zMean,
            gR.ptr<data_type>(), length);
        CUDA_CHECK(cudaGetLastError());
        // residual center: weighted mean of R (used in place of a true median)
        w_iSum = cv::cuda::sum(gW)[0];
        cv::cuda::multiply(gR, gW, gWR);
        double wr_iSum = cv::cuda::sum(gWR)[0];
        double median = wr_iSum / w_iSum;
        cv::cuda::absdiff(gR, median, gRsort);
        thrust::sort(thrust::device, gRsort.ptr<data_type>(), gRsort.ptr<data_type>() + sz);
        CUDA_CHECK(cudaMemcpyAsync(median_s.data(), gRsort.ptr<data_type>() + median_start,
                                   sizeof(data_type) * median_length,
                                   cudaMemcpyDeviceToHost, stream));
        // fix: the async copy must complete before median_s is read on host
        CUDA_CHECK(cudaStreamSynchronize(stream));
        double mad = sz % 2 == 0 ? (median_s[0] + median_s[1]) / 2 : median_s[0];
        double s = mad / 0.6745; ///< 0.6745 makes the MAD consistent with a normal sigma
        // recompute the BiSquare weights and weighted squared residuals
        cuda_calNewWeightAndWR_kernel<<<gridSize, blockSize>>>(
            gR.ptr<data_type>(), gW.ptr<data_type>(), gWR.ptr<data_type>(),
            length, s, K);
        CUDA_CHECK(cudaGetLastError());
        // convergence test on the total weighted loss
        double sumWR = cv::cuda::sum(gWR)[0];
        if (iter == 0)
        {
            minDelta = abs(sumWR * sigma);
        }
        if (abs(sumWR - lastSumWR) < minDelta)
        {
            break;
        }
        lastSumWR = sumWR;
#ifdef _DEBUG
        printf("iter %zu sumWR = %lf \n", iter, sumWR); // fix: %zu for size_t
#endif
    }
    CUDA_CHECK(cudaMemcpyAsync(VT.data(), d_VT, sizeof(data_type) * vt_sz,
                               cudaMemcpyDeviceToHost, stream));
    // fix: synchronize before reading VT on the host — the copy is async
    CUDA_CHECK(cudaStreamSynchronize(stream));
    // plane normal = third row of the 3x3 column-major VT: elements 2, 5, 8
    a = VT[2];
    b = VT[5];
    c = VT[8];
    d = -a * xMean - b * yMean - c * zMean;
    ptMeans = MENTO::Point3d(xMean, yMean, zMean);
    /* free resources */
    CUDA_CHECK(cudaFree(d_S));
    CUDA_CHECK(cudaFree(d_VT));
    CUDA_CHECK(cudaFree(d_info));
    CUDA_CHECK(cudaFree(d_work));
    free(h_work);
    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
    CUBLAS_CHECK(cublasDestroy(cublasH));
    CUDA_CHECK(cudaStreamDestroy(stream));
    // fix: removed cudaDeviceReset() — a library function must never tear
    // down the whole CUDA context of the calling process.
    return 0;
}
nvvp显示的耗时的图
迭代本身的耗时并不大,主要开销在于句柄的申请和销毁。
耗时
网友评论