美文网首页
全部最小二乘法拟合平面BiSquare优化cuda版本

全部最小二乘法拟合平面BiSquare优化cuda版本

作者: 寽虎非虫003 | 来源:发表于2023-08-16 14:37 被阅读0次

说明和参考

  • 这里面用到了cuda求和,这个比一般的cuda内核函数要慢不少,可以了解下cuda写归约函数;
  • 用到了cuda排序,也是归约的;
  • 求解svd使用的是cuSOLVER库的cusolverDnXgesvd()方法,根据示例的方法需要申请一些句柄,这个会非常耗时,建议不要轻易申请和释放。

cuda 计算SVD:cuda计算SVD接口文档翻译与示例代码记录
Thrust:https://nvidia.github.io/thrust/ 、 https://docs.nvidia.com/cuda/thrust/index.html 、 https://thrust.github.io/
cuda排序:cuda实现排序(纯享版,gpt4提供的代码)

代码

// NOTE(review): "GLOABAL" is a typo for "GLOBAL". The name is kept unchanged
// because the alias chain below and the kernel definitions later in this file
// reference it; all three macros expand to CUDA's __global__ qualifier.
#define CUDA_GLOABAL __global__
#define CUDA_G__ CUDA_GLOABAL
#define C_G__ CUDA_G__

// Builds the weighted, mean-centered design matrix WA for the SVD plane fit.
// WA is written as a dense column-major (length x 3) matrix in one contiguous
// buffer: column 0 = W*(x - x_mean), column 1 = W*(y - y_mean),
// column 2 = W*(z - z_mean).
// Expects a 1-D launch covering at least `length` threads; extra threads exit.
C_G__ void cuda_xyz2WA_kernel(
    const double *d_src_x, const double *d_src_y,
    const double *d_src_z, const double *d_src_W,
    double x_mean, double y_mean, double z_mean,
    double *d_dst_WA, int length)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x; // flat global index

    if (idx >= length)
        return; // tail guard: grid rarely divides the data evenly

    const double w = d_src_W[idx]; // per-point weight, loaded once

    d_dst_WA[idx]              = w * (d_src_x[idx] - x_mean);
    d_dst_WA[idx + length]     = w * (d_src_y[idx] - y_mean);
    d_dst_WA[idx + 2 * length] = w * (d_src_z[idx] - z_mean);
}

// Computes the signed residual of every point along the fitted plane normal:
//   R[i] = a*(x[i]-xm) + b*(y[i]-ym) + c*(z[i]-zm)
// d_vt is the 3x3 V^T matrix from gesvd, stored column-major with ldvt = 3,
// so elements 2, 5 and 8 form its last row — which for gesvd's descending
// singular values is the right singular vector of the smallest one, i.e. the
// plane normal (a, b, c).
// Expects a 1-D launch covering at least `length` threads; extra threads exit.
C_G__ void cuda_calR_kernel(
    const double *d_src_x, const double *d_src_y,
    const double *d_src_z, const double *d_vt,
    double x_mean, double y_mean, double z_mean,
    double *d_dst_R, int length)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x; // flat global index

    if (idx >= length)
        return; // tail guard

    const double nx = d_vt[2]; // normal.x = V^T(2,0) in column-major layout
    const double ny = d_vt[5]; // normal.y = V^T(2,1)
    const double nz = d_vt[8]; // normal.z = V^T(2,2)

    d_dst_R[idx] = nx * (d_src_x[idx] - x_mean) +
                   ny * (d_src_y[idx] - y_mean) +
                   nz * (d_src_z[idx] - z_mean);
}

// Updates the per-point BiSquare (Tukey) weights from the residuals and
// computes the weighted squared residuals used as the total loss:
//   u_i = R_i / (K * s)
//   W_i = (1 - u_i^2)^2  when |u_i| < 1, else 0   (outliers are zeroed out)
//   WR_i = W_i * R_i^2
// Parameters:
//   d_src_R  - residuals, `length` elements
//   d_dst_W  - output weights, `length` elements
//   d_dst_WR - output weighted squared residuals, `length` elements
//   s        - robust scale estimate (caller computes MAD / 0.6745; must be > 0)
//   K        - BiSquare tuning constant (caller uses 4.685)
// Expects a 1-D launch covering at least `length` threads.
C_G__ void cuda_calNewWeightAndWR_kernel(
    const double *d_src_R,
    double *d_dst_W, double *d_dst_WR,
    int length, double s, double K)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x; // flat global index

    if (x < length)
    {
        const double r = d_src_R[x];     // residual, loaded once
        const double u_i = r / (K * s);
        // fabs: unambiguous double overload (plain `abs` can resolve to the
        // integer version and silently truncate).
        if (fabs(u_i) < 1.0)
        {
            const double t = 1.0 - u_i * u_i;
            d_dst_W[x] = t * t;
        }
        else
        {
            d_dst_W[x] = 0.0;            // double literal (was the float 0.f)
        }
        d_dst_WR[x] = d_dst_W[x] * (r * r);
    }
}


/**
 * Robust (BiSquare / Tukey) total-least-squares plane fit on the GPU.
 *
 * Iteratively re-weighted SVD: each iteration builds the weighted, centered
 * design matrix WA, factorizes it with cusolverDnXgesvd(), takes the last row
 * of V^T as the plane normal, derives new BiSquare weights from the residuals,
 * and stops when the weighted loss stabilizes (or after maxIter iterations).
 *
 * @param vPts     input 3-D points (at least 3 required)
 * @param ptMeans  [out] weighted centroid of the points
 * @param ptStddev [out] only reset on the early-exit path; the success path
 *                 leaves it untouched — NOTE(review): confirm callers do not
 *                 rely on it being filled in.
 * @param a,b,c,d  [out] coefficients of the plane a*x + b*y + c*z + d = 0
 * @return 0 on success, 1 when fewer than 3 points were supplied
 *
 * NOTE(review): per the profiling notes, handle creation/destruction dominates
 * the runtime — consider caching cusolverH/cublasH/stream across calls.
 */
int BiSquarePlaneFitting_gpu(const std::vector<Point3f> &vPts,
                             Point3d &ptMeans, Point3d &ptStddev,
                             float &a, float &b, float &c, float &d)
{
    using data_type = double;

    size_t sz = vPts.size();

    // A plane needs at least 3 points.
    if (sz < 3)
    {
        a = 0.f;
        b = 0.f;
        c = 0.f;
        d = 0.f;
        ptMeans = Point3d{};
        ptStddev = Point3d{};
        return 1;
    }

    double xSum{}, ySum{}, zSum{};
    double xMean{}, yMean{}, zMean{};

    // cuSOLVER / cuBLAS handles and the stream they are bound to.
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    /* step 1: create handles, bind them to a dedicated stream */
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
    CUBLAS_CHECK(cublasCreate(&cublasH));

    // This stream is NON-BLOCKING: it does NOT implicitly synchronize with the
    // legacy default stream (where the kernels below and the cv::cuda/thrust
    // calls run), so every hand-off between the two must be made explicit.
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    // Initial weights: all ones.
    cv::Mat_<data_type> mWeight = cv::Mat_<data_type>::ones(sz, 1);

    // Device-side weights.
    cv::cuda::GpuMat gW;
    gW.upload(mWeight);

    cv::Mat_<data_type> mX(sz, 1), mY(sz, 1), mZ(sz, 1);
    for (size_t i = 0; i < sz; i++)
    {
        mX(i, 0) = vPts[i].x;
        mY(i, 0) = vPts[i].y;
        mZ(i, 0) = vPts[i].z;
    }

    // Device-side copies of the raw X, Y, Z coordinates.
    cv::cuda::GpuMat gX, gY, gZ;
    cv::cuda::GpuMat gWX, gWY, gWZ;
    gX.upload(mX);
    gY.upload(mY);
    gZ.upload(mZ);

    // Iteration limits.
    size_t maxIter = sz * (sz - 1) / 2;
    maxIter = min(maxIter, (size_t)50);
    data_type sigma = 1e-8; ///< relative tolerance on the total loss

    // gWA must be CONTIGUOUS: the kernel and cusolverDnXgesvd index it as a
    // dense column-major (sz x 3) matrix with lda == sz. A GpuMat with
    // rows > 1 && cols > 1 is pitch-allocated by OpenCV and may have padded
    // rows, which would silently corrupt the SVD input.
    cv::cuda::GpuMat gWA;
    cv::cuda::createContinuous(3, sz, CV_64FC1, gWA);
    // Single-column GpuMats below are indexed linearly by the kernels.
    cv::cuda::GpuMat gR(sz, 1, CV_64FC1);
    cv::cuda::GpuMat gRsort(sz, 1, CV_64FC1);
    cv::cuda::GpuMat gWR(sz, 1, CV_64FC1);

    double K = 4.685;   ///< BiSquare tuning constant
    double lastSumWR{}; ///< total loss of the previous iteration
    double minDelta{};  ///< absolute convergence threshold, fixed on iteration 0

    const int threadSizeX = 1024;
    dim3 gridSize((sz + threadSizeX - 1) / threadSizeX, 1);
    dim3 blockSize(threadSizeX, 1);

    data_type *d_S = nullptr;
    data_type *d_VT = nullptr;
    int *d_info = nullptr;
    data_type *d_work = nullptr;
    data_type *h_work = nullptr;

    size_t workspaceInBytesOnDevice = 0;
    size_t workspaceInBytesOnHost = 0;
    int info = 0;

    signed char jobu = 'N';  // U is not needed
    signed char jobvt = 'A'; // full V^T is required (we read its last row)

    const int64_t m = sz;
    const int64_t n = 3;
    const int64_t lda = m;
    const int64_t ldvt = n;

    /* allocate device buffers for the SVD outputs */
    int64_t S_sz = min(m, n);
    int64_t vt_sz = ldvt * n;
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_S), sizeof(data_type) * S_sz));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_VT), sizeof(data_type) * vt_sz));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));

    std::vector<data_type> VT(vt_sz, 0);
    std::vector<data_type> S(min(m, n), 0);

    // Median element indices of a 0-based ascending array of size sz:
    //   odd  sz -> the single middle element at index sz/2
    //   even sz -> the mean of the elements at indices sz/2 - 1 and sz/2
    // (Fixed: the original used sz/2 - 1 for odd and sz/2..sz/2+1 for even,
    // which is off by one in both cases.)
    int median_length = 1;
    int median_start = static_cast<int>(sz / 2);
    if (sz % 2 == 0)
    {
        median_length = 2;
        median_start = static_cast<int>(sz / 2) - 1;
    }
    std::vector<data_type> median_s(median_length, 0);

    /* query the SVD workspace sizes */
    CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
        cusolverH,
        nullptr,
        jobu,
        jobvt,
        m,
        n,
        traits<data_type>::cuda_data_type,
        gWA.ptr<data_type>(),
        lda,
        traits<data_type>::cuda_data_type,
        d_S,
        traits<data_type>::cuda_data_type,
        nullptr, // d_U: not computed (jobu == 'N')
        lda,
        traits<data_type>::cuda_data_type,
        d_VT,
        ldvt,
        traits<data_type>::cuda_data_type,
        &workspaceInBytesOnDevice,
        &workspaceInBytesOnHost));

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));
    h_work = (data_type *)malloc(workspaceInBytesOnHost);
    if (!h_work)
    {
        throw std::bad_alloc();
    }

    int length = sz;

    for (size_t iter = 0; iter < maxIter; iter++)
    {
        // Weighted means of the coordinates under the current weights.
        cv::cuda::multiply(gX, gW, gWX);
        cv::cuda::multiply(gY, gW, gWY);
        cv::cuda::multiply(gZ, gW, gWZ);

        xSum = cv::cuda::sum(gWX)[0];
        ySum = cv::cuda::sum(gWY)[0];
        zSum = cv::cuda::sum(gWZ)[0];
        double w_iSum = cv::cuda::sum(gW)[0]; ///< total weight

        xMean = xSum / w_iSum;
        yMean = ySum / w_iSum;
        zMean = zSum / w_iSum;

        // Build the weighted, centered design matrix WA (column-major sz x 3).
        cuda_xyz2WA_kernel<<<gridSize, blockSize>>>(
            gX.ptr<data_type>(), gY.ptr<data_type>(), gZ.ptr<data_type>(),
            gW.ptr<data_type>(), xMean, yMean, zMean,
            gWA.ptr<data_type>(), length);
        CUDA_CHECK(cudaGetLastError()); // catch bad launch configuration

        // The kernel ran on the legacy default stream; gesvd runs on the
        // non-blocking `stream`, which would NOT otherwise wait for it.
        CUDA_CHECK(cudaDeviceSynchronize());

#ifdef _DEBUG
        cv::Mat mXg2h_debug;
        cv::Mat mYg2h_debug;
        cv::Mat mZg2h_debug;
        cv::Mat mWg2h_debug;
        cv::Mat mWAg2h_debug;
        gX.download(mXg2h_debug);
        gY.download(mYg2h_debug);
        gZ.download(mZg2h_debug);
        gW.download(mWg2h_debug);
        gWA.download(mWAg2h_debug);

        cv::imwrite("mXg2h_debug.tiff", mXg2h_debug);
        cv::imwrite("mYg2h_debug.tiff", mYg2h_debug);
        cv::imwrite("mZg2h_debug.tiff", mZg2h_debug);
        cv::imwrite("mWg2h_debug.tiff", mWg2h_debug);
        cv::imwrite("mWAg2h_debug.tiff", mWAg2h_debug);
#endif

        /* step 4: SVD of WA; the last row of V^T is the plane normal */
        CUSOLVER_CHECK(cusolverDnXgesvd(
            cusolverH,
            nullptr,
            jobu,
            jobvt,
            m,
            n,
            traits<data_type>::cuda_data_type,
            gWA.ptr<data_type>(), // A (overwritten by gesvd)
            lda,
            traits<data_type>::cuda_data_type,
            d_S,
            traits<data_type>::cuda_data_type,
            nullptr, // d_U: not computed (jobu == 'N')
            lda,
            traits<data_type>::cuda_data_type,
            d_VT,
            ldvt,
            traits<data_type>::cuda_data_type,
            d_work,
            workspaceInBytesOnDevice,
            h_work,
            workspaceInBytesOnHost,
            d_info));

        // Check convergence info of gesvd, and make d_VT visible to the
        // default-stream kernel below (it would otherwise race with gesvd).
        CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
        if (info != 0)
        {
            // SVD failed / did not converge: keep the last good solution.
            break;
        }

#ifdef _DEBUG
        CUDA_CHECK(cudaMemcpyAsync(VT.data(), d_VT, sizeof(data_type) * vt_sz,
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        std::printf("VT = (matlab base-1)\n");
        print_matrix(n, n, VT.data(), ldvt);
        std::printf("=====\n");
#endif

        // Signed point-to-plane residuals.
        cuda_calR_kernel<<<gridSize, blockSize>>>(
            gX.ptr<data_type>(), gY.ptr<data_type>(), gZ.ptr<data_type>(),
            d_VT, xMean, yMean, zMean,
            gR.ptr<data_type>(), length);
        CUDA_CHECK(cudaGetLastError());

        // Robust scale: MAD of the residuals around their weighted mean.
        w_iSum = cv::cuda::sum(gW)[0];
        cv::cuda::multiply(gR, gW, gWR);
        double wr_iSum = cv::cuda::sum(gWR)[0];
        double median = wr_iSum / w_iSum; // weighted mean used as the center
        cv::cuda::absdiff(gR, median, gRsort);
        // Thrust algorithms with the thrust::device policy synchronize before
        // returning, so gRsort is fully sorted when the copy below is issued.
        thrust::sort(thrust::device, gRsort.ptr<data_type>(), gRsort.ptr<data_type>() + sz);

        CUDA_CHECK(cudaMemcpyAsync(median_s.data(), gRsort.ptr<data_type>() + median_start,
                                   sizeof(data_type) * median_length,
                                   cudaMemcpyDeviceToHost, stream));
        // The copy is asynchronous: wait before reading median_s on the host
        // (the original read it without synchronizing — a host/device race).
        CUDA_CHECK(cudaStreamSynchronize(stream));

        double mad = sz % 2 == 0 ? (median_s[0] + median_s[1]) / 2 : median_s[0];
        double s = mad / 0.6745; ///< MAD -> consistent sigma estimate for normal data
        if (!(s > 0))
        {
            // All residuals identical (e.g. a perfect fit): the weight update
            // would divide by zero, so stop with the current solution.
            break;
        }

        // BiSquare weight update + weighted squared residuals (the loss terms).
        cuda_calNewWeightAndWR_kernel<<<gridSize, blockSize>>>(
            gR.ptr<data_type>(), gW.ptr<data_type>(), gWR.ptr<data_type>(),
            length, s, K);
        CUDA_CHECK(cudaGetLastError());

        // Total loss; cv::cuda::sum blocks until the default-stream work drains.
        double sumWR = cv::cuda::sum(gWR)[0];
        if (iter == 0)
        {
            minDelta = fabs(sumWR * sigma); // fabs: avoid the integer abs()
        }

        if (fabs(sumWR - lastSumWR) < minDelta)
        {
            break; // converged
        }
        lastSumWR = sumWR;
#ifdef _DEBUG
        printf("iter %zu sumWR = %lf \n", iter, sumWR); // %zu: size_t format
#endif
    }

    // d_VT was written on `stream`; copy it down and WAIT before reading it on
    // the host (the original computed a,b,c from VT without synchronizing).
    CUDA_CHECK(cudaMemcpyAsync(VT.data(), d_VT, sizeof(data_type) * vt_sz,
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Plane normal = last row of the 3x3 column-major V^T.
    a = VT[2];
    b = VT[5];
    c = VT[8];
    d = -a * xMean - b * yMean - c * zMean;
    ptMeans = MENTO::Point3d(xMean, yMean, zMean);

    /* free resources */
    CUDA_CHECK(cudaFree(d_S));
    CUDA_CHECK(cudaFree(d_VT));
    CUDA_CHECK(cudaFree(d_info));
    CUDA_CHECK(cudaFree(d_work));
    free(h_work);

    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    // The original called cudaDeviceReset() here. Removed: it would tear down
    // the primary context while the GpuMats above are still alive (their
    // destructors would then free dangling device pointers), and it would
    // invalidate every other CUDA handle in this process.

    return 0;
}

nvvp显示的耗时的图

正经迭代的耗时并不大,最大的开销在于申请和销毁句柄。


耗时

相关文章

网友评论

      本文标题:全部最小二乘法拟合平面BiSquare优化cuda版本

      本文链接:https://www.haomeiwen.com/subject/slgkmdtx.html