美文网首页
全部最小二乘法拟合平面BiSquare优化cuda版本

全部最小二乘法拟合平面BiSquare优化cuda版本

作者: 寽虎非虫003 | 来源:发表于2023-08-16 14:37 被阅读0次

    说明和参考

    • 这里面用到了cuda求和,这个比一般的cuda内核函数要慢不少,可以了解下cuda写归约函数;
    • 用到了cuda排序,也是归约的;
    • 求解svd使用的是cuSOLVER库的cusolverDnXgesvd()方法,根据示例的方法需要申请一些句柄,这个会非常耗时,建议不要轻易申请和释放。

    cuda 计算SVD:cuda计算SVD接口文档翻译与示例代码记录
    Thrust:https://nvidia.github.io/thrust/ 、 https://docs.nvidia.com/cuda/thrust/index.html 、 https://thrust.github.io/
    cuda排序:cuda实现排序(纯享版,gpt4提供的代码)

    代码

    // Alias chain for the CUDA kernel qualifier; the kernels below use C_G__.
    // NOTE(review): "GLOABAL" is a typo for "GLOBAL" — kept byte-identical here
    // since other code may reference the misspelled name.
    #define CUDA_GLOABAL __global__
    #define CUDA_G__ CUDA_GLOABAL
    #define C_G__ CUDA_G__
    
    // One thread per point: centre each coordinate on the (weighted) mean,
    // scale it by the point's weight, and store the result into d_dst_WA,
    // laid out as a dense column-major length x 3 matrix (column stride =
    // length: x-column first, then y, then z).
    // Expects a 1-D launch covering at least `length` threads.
    __global__ void cuda_xyz2WA_kernel(
        const double *d_src_x, const double *d_src_y,
        const double *d_src_z, const double *d_src_W,
        double x_mean, double y_mean, double z_mean,
        double *d_dst_WA, int length)
    {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= length)
            return; // tail guard: grid may overshoot the data

        const double w = d_src_W[idx];
        d_dst_WA[idx]              = w * (d_src_x[idx] - x_mean);
        d_dst_WA[idx + length]     = w * (d_src_y[idx] - y_mean);
        d_dst_WA[idx + 2 * length] = w * (d_src_z[idx] - z_mean);
    }
    
    // Per-point signed residual against the fitted plane:
    //   R[i] = dot(n, p_i - mean)
    // where n is read from VT entries 2, 5, 8 — with ldvt = 3 (column-major)
    // these form the last row of VT, presumably the right-singular vector of
    // the smallest singular value, i.e. the plane normal.
    // Expects a 1-D launch covering at least `length` threads.
    __global__ void cuda_calR_kernel(
        const double *d_src_x, const double *d_src_y,
        const double *d_src_z, const double *d_vt,
        double x_mean, double y_mean, double z_mean,
        double *d_dst_R, int length)
    {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= length)
            return; // tail guard

        const double nx = d_vt[2];
        const double ny = d_vt[5];
        const double nz = d_vt[8];

        d_dst_R[idx] = nx * (d_src_x[idx] - x_mean) +
                       ny * (d_src_y[idx] - y_mean) +
                       nz * (d_src_z[idx] - z_mean);
    }
    
    // Tukey bisquare weight update. For each point:
    //   u = R / (K * s);  W = (1 - u^2)^2 if |u| < 1, else 0
    // and the per-point loss term WR = W * R^2.
    // Fixes vs. original: use fabs() instead of abs() — unqualified abs on a
    // double in device code can bind to the C integer overload and silently
    // truncate; write the double literal 0.0 rather than the float 0.f.
    // NOTE(review): no guard for s == 0 (all residuals identical) — u becomes
    // inf/nan; confirm whether the caller can hit that case.
    __global__ void cuda_calNewWeightAndWR_kernel(
        const double *d_src_R,
        double *d_dst_W, double *d_dst_WR,
        int length, double s, double K)
    {
        int x = threadIdx.x + blockIdx.x * blockDim.x;

        if (x < length)
        {
            const double r = d_src_R[x];
            const double u_i = r / (K * s);
            if (fabs(u_i) < 1.0)
            {
                const double t = 1.0 - u_i * u_i;
                d_dst_W[x] = t * t;
            }
            else
            {
                d_dst_W[x] = 0.0; // outlier: zero weight
            }
            d_dst_WR[x] = d_dst_W[x] * (r * r);
        }
    }
    
    
    /**
     * Robust total-least-squares plane fit via iteratively reweighted Tukey
     * bisquare weights, computed on the GPU (OpenCV CUDA + Thrust + cuSOLVER).
     *
     * @param vPts     input 3-D points (at least 3 required)
     * @param ptMeans  out: weighted centroid from the final iteration
     * @param ptStddev out: NOTE(review): never written on the success path —
     *                 only the early-exit path zero-initialises it; confirm
     *                 the intended contract with callers.
     * @param a,b,c,d  out: coefficients of a*x + b*y + c*z + d = 0; (a,b,c)
     *                 is taken from the last row of VT of the SVD of the
     *                 weighted, centred data matrix.
     * @return 0 on success, 1 when fewer than 3 points were supplied.
     *
     * Fixes vs. original:
     *  - median indices were off by one for both odd and even counts;
     *  - host reads of async D2H copies (median_s, VT) now wait on the stream;
     *  - raw kernels run on the legacy default stream while cuSOLVER runs on a
     *    cudaStreamNonBlocking stream (no implicit ordering between the two),
     *    so explicit synchronisation was added around the SVD;
     *  - std::fabs instead of unqualified abs on doubles (int-overload trap);
     *  - %zu for size_t in the debug printf;
     *  - cudaGetLastError() after each kernel launch.
     *
     * Performance note: handle creation/destruction and cudaDeviceReset()
     * dominate the runtime; for repeated calls cache the handles.
     */
    int BiSquarePlaneFitting_gpu(const std::vector<Point3f> &vPts,
                                 Point3d &ptMeans, Point3d &ptStddev,
                                 float &a, float &b, float &c, float &d)
    {
        using data_type = double;
        size_t sz = vPts.size();

        // A plane is under-determined with fewer than 3 points.
        if (sz < 3)
        {
            a = 0.f;
            b = 0.f;
            c = 0.f;
            d = 0.f;
            ptMeans = Point3d{};
            ptStddev = Point3d{};
            return 1;
        }

        double xSum{}, ySum{}, zSum{};
        double xMean{}, yMean{}, zMean{};

        // cuSOLVER / cuBLAS handles and the stream they are bound to.
        cusolverDnHandle_t cusolverH = NULL;
        cublasHandle_t cublasH = NULL;
        cudaStream_t stream = NULL;

        /* step 1: create handles, bind a stream */
        CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
        CUBLAS_CHECK(cublasCreate(&cublasH));

        // cudaStreamNonBlocking: this stream does NOT implicitly synchronise
        // with the legacy default stream that the raw kernels and the OpenCV
        // CUDA ops below run on — hence the explicit syncs in the loop.
        CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
        CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
        CUBLAS_CHECK(cublasSetStream(cublasH, stream));

        // Initial weights: all ones.
        cv::Mat_<data_type> mWeight = cv::Mat_<data_type>::ones(sz, 1);

        // Device-side weights.
        cv::cuda::GpuMat gW;
        gW.upload(mWeight);

        cv::Mat_<data_type> mX(sz, 1), mY(sz, 1), mZ(sz, 1);
        for (size_t i = 0; i < sz; i++)
        {
            mX(i, 0) = vPts[i].x;
            mY(i, 0) = vPts[i].y;
            mZ(i, 0) = vPts[i].z;
        }

        // Device copies of the raw coordinates and their weighted products.
        cv::cuda::GpuMat gX, gY, gZ;
        cv::cuda::GpuMat gWX, gWY, gWZ;
        gX.upload(mX);
        gY.upload(mY);
        gZ.upload(mZ);

        // Iteration cap and convergence tolerance.
        size_t maxIter = sz * (sz - 1) / 2;
        maxIter = min(maxIter, (size_t)50);
        data_type sigma = 1e-8; ///< relative tolerance on the total loss

        // gWA is allocated 3 x sz but written by cuda_xyz2WA_kernel as a dense
        // column-major sz x 3 matrix (column stride = sz). This stays within
        // the GpuMat allocation but deliberately ignores the GpuMat row pitch.
        cv::cuda::GpuMat gWA(3, sz, CV_64FC1);
        cv::cuda::GpuMat gR(sz, 1, CV_64FC1);     // signed residuals
        cv::cuda::GpuMat gRsort(sz, 1, CV_64FC1); // |R - centre|, sorted for the MAD
        cv::cuda::GpuMat gWR(sz, 1, CV_64FC1);    // per-point loss terms

        double K = 4.685;   ///< standard Tukey-bisquare tuning constant
        double lastSumWR{}; ///< total loss of the previous iteration
        double minDelta{};  ///< absolute convergence threshold

        const int threadSizeX = 1024;
        dim3 gridSize((sz + threadSizeX - 1) / threadSizeX, 1);
        dim3 blockSize(threadSizeX, 1);

        data_type *d_S = nullptr;
        data_type *d_VT = nullptr;
        int *d_info = nullptr;
        data_type *d_work = nullptr;
        data_type *h_work = nullptr;

        size_t workspaceInBytesOnDevice = 0;
        size_t workspaceInBytesOnHost = 0;
        int info = 0;

        signed char jobu = 'N';  // U is not needed
        signed char jobvt = 'A'; // full VT is required (we read its last row)

        const int64_t m = sz;
        const int64_t n = 3;
        const int64_t lda = m;
        const int64_t ldvt = n;

        /* allocate device buffers for S, VT and the info flag */
        int64_t S_sz = min(m, n);
        int64_t vt_sz = ldvt * n;
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_S), sizeof(data_type) * S_sz));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_VT), sizeof(data_type) * vt_sz));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));

        std::vector<data_type> VT(vt_sz, 0);
        std::vector<data_type> S(min(m, n), 0);

        // 0-based indices of the middle element(s) of the sorted residuals:
        // odd count  -> single element sz/2;
        // even count -> elements sz/2 - 1 and sz/2 (median = their average).
        // Bug fix: the original used sz/2 - 1 for odd and sz/2, sz/2 + 1 for
        // even — both off by one.
        int median_length = 1;
        int median_start = sz / 2;
        if (sz % 2 == 0)
        {
            median_length = 2;
            median_start = sz / 2 - 1;
        }
        std::vector<data_type> median_s(median_length, 0);

        /* query SVD workspace sizes */
        CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
            cusolverH,
            nullptr,
            jobu,
            jobvt,
            m,
            n,
            traits<data_type>::cuda_data_type,
            gWA.ptr<data_type>(),
            lda,
            traits<data_type>::cuda_data_type,
            d_S,
            traits<data_type>::cuda_data_type,
            nullptr, // d_U
            lda,
            traits<data_type>::cuda_data_type,
            d_VT,
            ldvt,
            traits<data_type>::cuda_data_type,
            &workspaceInBytesOnDevice,
            &workspaceInBytesOnHost));

        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));
        h_work = (data_type *)malloc(workspaceInBytesOnHost);
        if (!h_work)
        {
            throw std::bad_alloc();
        }

        int length = sz;

        for (size_t iter = 0; iter < maxIter; iter++)
        {
            // Weighted means of the coordinates under the current weights.
            xSum = 0.0;
            ySum = 0.0;
            zSum = 0.0;
            double w_iSum{0.0}; ///< sum of the weights

            cv::cuda::multiply(gX, gW, gWX);
            cv::cuda::multiply(gY, gW, gWY);
            cv::cuda::multiply(gZ, gW, gWZ);

            xSum = cv::cuda::sum(gWX)[0];
            ySum = cv::cuda::sum(gWY)[0];
            zSum = cv::cuda::sum(gWZ)[0];
            w_iSum = cv::cuda::sum(gW)[0];

            xMean = xSum / w_iSum;
            yMean = ySum / w_iSum;
            zMean = zSum / w_iSum;

            // Build WA = diag(W) * (P - mean) as a column-major sz x 3 matrix.
            cuda_xyz2WA_kernel<<<gridSize, blockSize>>>(
                gX.ptr<data_type>(), gY.ptr<data_type>(), gZ.ptr<data_type>(),
                gW.ptr<data_type>(), xMean, yMean, zMean,
                gWA.ptr<data_type>(), length);
            CUDA_CHECK(cudaGetLastError());
            // Bug fix: the kernel ran on the legacy default stream while the
            // SVD runs on the non-blocking `stream`; order them explicitly.
            CUDA_CHECK(cudaDeviceSynchronize());

    #ifdef _DEBUG
            cv::Mat mXg2h_debug;
            cv::Mat mYg2h_debug;
            cv::Mat mZg2h_debug;
            cv::Mat mWg2h_debug;
            cv::Mat mWAg2h_debug;
            gX.download(mXg2h_debug);
            gY.download(mYg2h_debug);
            gZ.download(mZg2h_debug);
            gW.download(mWg2h_debug);
            gWA.download(mWAg2h_debug);

            cv::imwrite("mXg2h_debug.tiff", mXg2h_debug);
            cv::imwrite("mYg2h_debug.tiff", mYg2h_debug);
            cv::imwrite("mZg2h_debug.tiff", mZg2h_debug);
            cv::imwrite("mWg2h_debug.tiff", mWg2h_debug);
            cv::imwrite("mWAg2h_debug.tiff", mWAg2h_debug);
    #endif

            /* step 4: SVD of the weighted, centred data matrix */
            CUSOLVER_CHECK(cusolverDnXgesvd(
                cusolverH,
                nullptr,
                jobu,
                jobvt,
                m,
                n,
                traits<data_type>::cuda_data_type,
                gWA.ptr<data_type>(), // d_A
                lda,
                traits<data_type>::cuda_data_type,
                d_S,
                traits<data_type>::cuda_data_type,
                nullptr, // d_U
                lda,
                traits<data_type>::cuda_data_type,
                d_VT,
                ldvt,
                traits<data_type>::cuda_data_type,
                d_work,
                workspaceInBytesOnDevice,
                h_work,
                workspaceInBytesOnHost,
                d_info));

            // Bug fix: d_VT is produced on `stream`; the residual kernel below
            // launches on the default stream, so wait for the SVD to finish.
            CUDA_CHECK(cudaStreamSynchronize(stream));

    #ifdef _DEBUG
            CUDA_CHECK(cudaMemcpyAsync(VT.data(), d_VT, sizeof(data_type) * vt_sz,
                                       cudaMemcpyDeviceToHost, stream));
            CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));

            CUDA_CHECK(cudaStreamSynchronize(stream));

            // Dump the full VT for inspection.
            std::printf("VT = (matlab base-1)\n");
            print_matrix(n, n, VT.data(), ldvt);
            std::printf("=====\n");
    #endif

            // Signed residuals against the new plane.
            cuda_calR_kernel<<<gridSize, blockSize>>>(
                gX.ptr<data_type>(), gY.ptr<data_type>(), gZ.ptr<data_type>(),
                d_VT, xMean, yMean, zMean,
                gR.ptr<data_type>(), length);
            CUDA_CHECK(cudaGetLastError());

            // Robust scale: MAD of the residuals around their weighted mean
            // (the weighted mean is used as the centre, not a true median).
            w_iSum = cv::cuda::sum(gW)[0];
            cv::cuda::multiply(gR, gW, gWR);
            double wr_iSum = cv::cuda::sum(gWR)[0];
            double median = wr_iSum / w_iSum;
            cv::cuda::absdiff(gR, median, gRsort);
            thrust::sort(thrust::device, gRsort.ptr<data_type>(), gRsort.ptr<data_type>() + sz);

            CUDA_CHECK(cudaMemcpyAsync(median_s.data(), gRsort.ptr<data_type>() + median_start,
                                       sizeof(data_type) * median_length,
                                       cudaMemcpyDeviceToHost, stream));
            // Bug fix: the async copy must complete before median_s is read.
            CUDA_CHECK(cudaStreamSynchronize(stream));

            double mad = sz % 2 == 0 ? (median_s[0] + median_s[1]) / 2 : median_s[0];
            double s = mad / 0.6745; ///< 0.6745 makes the MAD a consistent sigma estimate
            // Re-weight with the bisquare function and form the loss terms W*R^2.
            cuda_calNewWeightAndWR_kernel<<<gridSize, blockSize>>>(
                gR.ptr<data_type>(), gW.ptr<data_type>(), gWR.ptr<data_type>(),
                length, s, K);
            CUDA_CHECK(cudaGetLastError());

            // Total loss; stop when it no longer changes significantly.
            double sumWR = cv::cuda::sum(gWR)[0];
            if (iter == 0)
            {
                // std::fabs: unqualified abs() may bind to the int overload.
                minDelta = std::fabs(sumWR * sigma);
            }

            if (std::fabs(sumWR - lastSumWR) < minDelta)
            {
                break;
            }
            lastSumWR = sumWR;
    #ifdef _DEBUG
            printf("iter %zu sumWR = %lf \n", iter, sumWR); // %zu: iter is size_t
    #endif
        }

        CUDA_CHECK(cudaMemcpyAsync(VT.data(), d_VT, sizeof(data_type) * vt_sz,
                                   cudaMemcpyDeviceToHost, stream));
        // Bug fix: synchronise before reading VT on the host.
        CUDA_CHECK(cudaStreamSynchronize(stream));

        // Plane normal = last row of VT (column-major, ldvt = 3): entries 2, 5, 8.
        a = VT[2];
        b = VT[5];
        c = VT[8];
        d = -a * xMean - b * yMean - c * zMean;
        ptMeans = MENTO::Point3d(xMean, yMean, zMean);
        // NOTE(review): ptStddev is never assigned here — confirm whether the
        // caller expects it to be filled in.

        /* free resources */
        CUDA_CHECK(cudaFree(d_S));
        CUDA_CHECK(cudaFree(d_VT));
        CUDA_CHECK(cudaFree(d_info));
        CUDA_CHECK(cudaFree(d_work));
        free(h_work);

        CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
        CUBLAS_CHECK(cublasDestroy(cublasH));

        CUDA_CHECK(cudaStreamDestroy(stream));

        // NOTE(review): cudaDeviceReset() tears down the entire CUDA context —
        // very expensive and hostile to any other CUDA state in the process.
        // Kept for behavioural compatibility; consider removing.
        CUDA_CHECK(cudaDeviceReset());

        return 0;
    }
    

    nvvp显示的耗时的图

    正经迭代的耗时并不大,真正的大头在申请和销毁句柄。


    耗时

    相关文章

      网友评论

          本文标题:全部最小二乘法拟合平面BiSquare优化cuda版本

          本文链接:https://www.haomeiwen.com/subject/slgkmdtx.html