美文网首页CUDA
教你一步步写一个cuda path tracer:牛刀小试!

教你一步步写一个cuda path tracer:牛刀小试!

作者: RobotBerry | 来源:发表于2017-04-19 11:31 被阅读141次

    前面几篇都是关于cuda的基础知识介绍,甚是乏味。是时候用cuda渲染一张图片来直观地感受一下了!

    Kevin Beason大神曾经写过一个99行的c++版本的path tracer。麻雀虽小,五脏俱全,这个path tracer包含了很多global illumination的特性:

    • Monte Carlo samling
    • OpenMP
    • Specular, Diffuse, and glass BRDFs
    • Antialising via super-sampling with importance-sampled tent distribution
    • Cosine importance sampling of the hemisphere for diffuse reflection
    • Russian roulette for path termination
    • Russian roulette and splitting for selecting reflection and/or refraction for glass BRDF

    为了方便起见,我去掉了不必要的特性,只保留了diffuse材质。源代码请从官网上获取。修改后的代码如下:

    smallPT.cpp

    #include <cmath>
    #include <cstdio>
    #include <chrono>
    
    #define BOUNCE 4
    #define WIDTH 800
    #define HEIGHT 600
    #define SPP 1024
    
    struct Vec
    {
        double x, y, z;
        Vec(double x_ = 0, double y_ = 0, double z_ = 0) { x = x_; y = y_; z = z_; }
        Vec operator+(const Vec &b) const { return Vec(x + b.x, y + b.y, z + b.z); }
        Vec operator-(const Vec &b) const { return Vec(x - b.x, y - b.y, z - b.z); }
        Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }
        Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }
        Vec& norm() { return *this = *this * (1 / sqrt(x*x + y*y + z*z)); }
        double dot(const Vec &b) const { return x*b.x + y*b.y + z*b.z; }
        Vec operator%(Vec&b) { return Vec(y*b.z - z*b.y, z*b.x - x*b.z, x*b.y - y*b.x); }
    };
    
    struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
    
    enum Refl_t { DIFF, SPEC, REFR }; // diffuse, specular, refraction
    
    struct Sphere
    {
        double rad;
        Vec p, e, c; // position, emission, color
        Refl_t refl;
        Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) :
            rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
        double intersect(const Ray &r) const
        {
            Vec op = p - r.o;
            double t, eps = 1e-4, b = op.dot(r.d), det = b*b - op.dot(op) + rad*rad;
            if (det < 0) return 0; else det = sqrt(det);
            return (t = b - det) > eps ? t : ((t = b + det) > eps ? t : 0);
        }
    };
    
    Sphere spheres[] =
    {
        Sphere(1e5, Vec(-1e5f - 50.0f, 40.0f, 80.0f), Vec(),Vec(.75,.25,.25),DIFF), // Left 
        Sphere(1e5, Vec(1e5f + 50.0f, 40.0f, 80.0f),Vec(),Vec(.25,.25,.75),DIFF), // Rght 
        Sphere(1e5, Vec(0.0f, 40.0f, -1e5f),     Vec(),Vec(.75,.75,.75),DIFF), // Back 
        Sphere(1e5, Vec(0.0f, 40.0f, 1e5f + 600.0f), Vec(),Vec(1, 1, 1), DIFF), // Frnt 
        Sphere(1e5, Vec(0.0f, -1e5f, 80.0f ),    Vec(),Vec(.75,.75,.75),DIFF), // Botm 
        Sphere(1e5, Vec(0.0f, 1e5f + 80.0f, 80.0f),Vec(),Vec(.75,.75,.75),DIFF), // Top 
        Sphere(16,Vec(-25.0f, 16.0f, 47.0f),       Vec(),Vec(1,1,1), DIFF), // Mirr 
        Sphere(20,Vec(25.0f, 20.0f, 78.0f),       Vec(),Vec(1,1,1), DIFF), // Glas 
        Sphere(600, Vec(0.0f, 678.8f, 80.0f),Vec(1.6f, 1.6f, 1.6f),  Vec(), DIFF) // Lite 
    };
    
    float rand(unsigned int *seed0, unsigned int *seed1)
    {
        *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
        *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
    
        unsigned int ires = ((*seed0) << 16) + (*seed1);
    
        union
        {
            float f;
            unsigned int ui;
        } res;
    
        res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR
    
        return (res.f - 2.f) / 2.f;
    }
    
    inline double clamp(double x) { return x < 0 ? 0 : x>1 ? 1 : x; }
    inline int toInt(double x) { return int(pow(clamp(x), 1 / 2.2) * 255 + .5); }
    inline bool intersect(const Ray &r, double &t, int &id) {
        double n = sizeof(spheres) / sizeof(Sphere), d, inf = t = 1e20;
        for (int i = int(n); i--;) if ((d = spheres[i].intersect(r)) && d < t) { t = d; id = i; }
        return t < inf;
    }
    
    Vec radiance(const Ray &r, int depth, unsigned int *s0, unsigned int *s1)
    {
        double t;                               // distance to intersection 
        int id = 0;                               // id of intersected object 
        if (depth > BOUNCE || !intersect(r, t, id)) return Vec(); // if miss, return black 
        const Sphere &obj = spheres[id];        // the hit object 
        Vec x = r.o + r.d*t, n = (x - obj.p).norm(), nl = n.dot(r.d) < 0 ? n : n*-1, f = obj.c;
    
        if (obj.refl == DIFF)
        {
            double r1 = 2 * M_PI*rand(s0, s1), r2 = rand(s0, s1), r2s = sqrt(r2);
            Vec w = nl, u = ((fabs(w.x) > .1 ? Vec(0, 1) : Vec(1)) % w).norm(), v = w%u;
            Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1 - r2)).norm();
            return obj.e + f.mult(radiance(Ray(x, d), depth + 1, s0, s1)) * n.dot(d) * 2;
        }
        return Vec();
    }
    
    int main()
    {
        Ray cam(Vec(0, 52, 300), Vec(0, -0.05, -1.0).norm()); // cam pos, dir 
        Vec cx = Vec(WIDTH*.5135 / HEIGHT), cy = (cx%cam.d).norm()*.5135, r, *c = new Vec[WIDTH*HEIGHT];
    
        std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
    #pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP 
        for (int y = 0; y < HEIGHT; y++) // Loop over image rows 
        {
            fprintf(stderr, "\rRendering (%d spp) %5.2f%%", SPP, 100.*y / (HEIGHT - 1));
            for (int x = 0; x < WIDTH; x++)   // Loop cols
            {
                unsigned int s0 = x;
                unsigned int s1 = y;
                int i = (HEIGHT - y - 1)*WIDTH + x;
                r = Vec();
                for (int s = 0; s < SPP; s++)
                {
                    Vec d = cx*((0.25 + x) / WIDTH - .5) + cy*((0.25 + y) / HEIGHT - .5) + cam.d;
                    r = r + radiance(Ray(cam.o + d * 40, d.norm()), 0, &s0, &s1);
                }
                r = r * (1.0 / SPP);
                c[i] = Vec(clamp(r.x), clamp(r.y), clamp(r.z));
            }
        }
        std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
        std::chrono::duration<double> elapsedTime = end - begin;
        printf("\nTime: %.6lfs\n", elapsedTime.count());
    
        FILE *f = fopen("image.ppm", "w");         // Write image to PPM file. 
        fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
        for (int i = 0; i < WIDTH*HEIGHT; i++)
            fprintf(f, "%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
    }
    

    Note

    1.这段代码使用OpenMP进行多线程加速。

    OpenMP(Open Multi-Processing)是一套支持跨平台共享内存方式的多线程并发的编程API

    在vs2015中需要手动启用OpenMP功能:

    vs2015中打开OpenMP选项

    2.生成的ppm图片文件可以用ps等软件打开

    对应的cuda代码如下:

    smallPT.cu

    // cuda headers
    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    
    // c++ headers
    #include <cstdio>
    #include <algorithm>
    #include <climits>
    #include <chrono>
    
    #include "helper_math.h"
    
    #define WIDTH 800
    #define HEIGHT 600
    #define SPP 1024
    #define BOUNCE 4
    #define SPHERE_EPSILON 0.0001f
    #define BOX_EPSILON 0.001f
    #define RAY_EPSILON 0.05f;
    
    struct Ray 
    {
        __device__ Ray(float3 pos, float3 dir) :
            pos(pos), dir(dir) {}
    
        float3 pos;
        float3 dir;
    };
    
    enum class Material { Diffuse, Specular, Refractive };
    
    struct Sphere 
    {
        __device__ float intersect(const Ray &ray) const
        {
            float t;
            float3 dis = pos - ray.pos;
            float proj = dot(dis, ray.dir);
            float delta = proj * proj - dot(dis, dis) + radius * radius;
    
            if (delta < 0) return 0;
    
            delta = sqrtf(delta);
            return (t = proj - delta) > SPHERE_EPSILON ? t : ((t = proj + delta) > SPHERE_EPSILON ? t : 0);
        }
    
        float radius;
        float3 pos, emissionColor, mainColor;
        Material material;
    };
    
    __constant__ Sphere spheres[] =
    {
        { 1e5f, { -1e5f - 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.25f, 0.25f }, Material::Diffuse }, // Left 
        { 1e5f, { 1e5f + 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.25f, 0.25f, 0.75f }, Material::Diffuse }, // Right 
        { 1e5f, { 0.0f, 40.0f, -1e5f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Back 
        { 1e5f, { 0.0f, 40.0f, 1e5f + 600.0f }, { 0.0f, 0.0f, 0.0f }, { 1.00f, 1.00f, 1.00f }, Material::Diffuse }, // Front 
        { 1e5f, { 0.0f, -1e5f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Bottom 
        { 1e5f, { 0.0f, 1e5f + 80.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Top 
        { 16.0f, { -25.0f, 16.0f, 47.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 1
        { 20.0f, { 25.0f, 20.0f, 78.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 2
        { 600.0f, { 0.0f, 678.8f, 80.0f }, { 1.6f, 1.6f, 1.6f }, { 0.0f, 0.0f, 0.0f }, Material::Diffuse }  // Light
    };
    
    __device__ inline bool intersectScene(const Ray &ray, float &t, int &id) 
    {
        t = FLT_MAX, id = -1;
        int sphereNum = sizeof(spheres) / sizeof(Sphere);
        for (int i = 0; i < sphereNum; i++)
        {
            float ct = spheres[i].intersect(ray);
            if (ct != 0 && ct < t)
            {  
                t = ct;
                id = i;
            }
        }
            
        return id != -1;
    }
    
    __device__ static float rand(uint *seed0, uint *seed1) 
    {
        *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
        *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
    
        uint ires = ((*seed0) << 16) + (*seed1);
    
        union 
        {
            float f;
            uint ui;
        } res;
    
        res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR
    
        return (res.f - 2.f) / 2.f;
    }
    
    inline int gammaCorrect(float c)
    {
        return int(pow(clamp(c, 0.0f, 1.0f), 1 / 2.2) * 255 + .5);
    }
    
    __device__ float3 pathTrace(Ray &ray, uint *s1, uint *s2) 
    { 
        float3 accumColor = make_float3(0.0f, 0.0f, 0.0f);
        float3 mask = make_float3(1.0f, 1.0f, 1.0f);
    
        for (int i = 0; i < BOUNCE; i++)
        {
            float t;
            int id;
    
            if (!intersectScene(ray, t, id))
                return make_float3(0.0f, 0.0f, 0.0f);
    
            const Sphere &obj = spheres[id];
            float3 p = ray.pos + ray.dir * t; 
            float3 n = normalize(p - obj.pos);
            float3 nl = dot(n, ray.dir) < 0 ? n : n * -1;
    
            accumColor += mask * obj.emissionColor;
    
            float r1 = rand(s1, s2) * M_PI * 2;
            float r2 = rand(s1, s2);
            float r2s = sqrtf(r2);
    
            float3 w = nl;
            float3 u = normalize(cross((std::fabs(w.x) > std::fabs(w.y) ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
            float3 v = cross(w, u);
    
            float3 d = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));
    
            ray.pos = p + nl * RAY_EPSILON;
            ray.dir = d;
    
            mask *= obj.mainColor * dot(d, nl) * 2;
        }
    
        return accumColor;
    }
    
    __global__ void pathTracekernel(float3 *h_output) 
    {
        uint x = blockIdx.x * blockDim.x + threadIdx.x;
        uint y = blockIdx.y * blockDim.y + threadIdx.y;
    
        uint i = (HEIGHT - y - 1)*WIDTH + x;
    
        uint s1 = x;
        uint s2 = y;
    
        Ray camRay(make_float3(0.0f, 52.0f, 300.0f), normalize(make_float3(0.0f, -0.05f, -1.0f)));
        float3 cx = make_float3(WIDTH * 0.5135 / HEIGHT, 0.0f, 0.0f);
        float3 cy = normalize(cross(cx, camRay.dir)) * 0.5135;
        float3 pixel = make_float3(0.0f);
    
        for (int s = 0; s < SPP; s++) 
        {  
            float3 d = camRay.dir + cx*((0.25 + x) / WIDTH - 0.5) + cy*((0.25 + y) / HEIGHT - 0.5);
            Ray ray(camRay.pos + d * 40, normalize(d));
    
            pixel += pathTrace(ray, &s1, &s2)*(1. / SPP);
        }
    
        h_output[i] = clamp(pixel, 0.0f, 1.0f);
    }
    
    int main() {
    
        float3 *h_output = new float3[WIDTH * HEIGHT];
        float3 *d_output;
    
        cudaMalloc(&d_output, WIDTH * HEIGHT * sizeof(float3));
    
        dim3 block(8, 8, 1);
        dim3 grid(WIDTH / block.x, HEIGHT / block.y, 1);
    
        printf("CUDA initialized.\nStart rendering...\n");
    
        std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
    
        pathTracekernel << <grid, block >> > (d_output);
    
        cudaMemcpy(h_output, d_output, WIDTH * HEIGHT * sizeof(float3), cudaMemcpyDeviceToHost);
    
        std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
        std::chrono::duration<double> elapsedTime = end - begin;
        printf("Time: %.6lfs\n", elapsedTime.count());
    
        cudaFree(d_output);
    
        printf("Done!\n");
    
        FILE *f = fopen("smallptcuda.ppm", "w");
        fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
        for (int i = 0; i < WIDTH * HEIGHT; i++)
            fprintf(f, "%d %d %d ", gammaCorrect(h_output[i].x),
                gammaCorrect(h_output[i].y),
                gammaCorrect(h_output[i].z));
    
        printf("Saved image to 'smallptcuda.ppm'.\n");
    
        delete[] h_output;
    
        return 0;
    }
    

    Note

    1. smallPT.cu中包含了一个helper_math.h头文件,该文件实现了float3等cuda基本数据类型的常见操作,例如+ - * dot cross等运算。该头文件位于C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\common\inc目录下。

    2. 由于在这段代码里,核函数的单次运行时间会超出系统的默认锁定时间,所以会导致系统认为程序出错而结束进程。详见微软的官网说明。所以我们需要在注册表中修改系统对核函数的监控项。在HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers下添加如下Tdr项:

    TdrDelay

    TdrDelay Specifies the number of seconds that the GPU can delay the preempt request from the GPU scheduler. This is effectively the timeout threshold. The default value is 2 seconds.

    TdrDelay可以被理解为核函数单次运行的最长时间。

    说明

    目前我不会详细解释代码的原理,只是想让大家直观的认识一下path tracing。具体的原理说明会放在之后的教程里。

    Profile

    参数

    图片尺寸(Width*Height):800*600
    反射次数(Bounce):4
    像素采样数(Spp, Samples per pixel):1024

    smallPT.cpp

    渲染时间
    without OpenMP:595.3s

    不用OpenMP加速

    with OpenMP:107.3s

    OpenMP加速
    结果图片 cpu渲染图

    smallPT.cu

    渲染时间:6.3s


    结果图片 gpu渲染图

    反射次数(Bounce):16
    像素采样数(Spp, Samples per pixel):2048

    渲染时间:26.9s

    结果图片

    spp:2048, bounce:8

    由于反射次数增加,indirect radiance会增加,场景的亮度也随之增加;像素采样率的提高会使得渲染图片更加平滑。

    分析

    可以看出,cpu(smallPT.cpp)和gpu(smallPT.cu)的结果图片基本一致(亮度有些差别,可能和float和double的精度有关),但是时间差距很大。在gpu上运行的速度要比在cpu上快107.3/6.3=17倍,比不使用OpenMP多线程加速更要快上595.3/6.3=94倍。所以说,把path tracing这种计算密集型的算法搬到gpu上跑是十分有必要的。

    let's step to path tracing now!

    相关文章

      网友评论

      本文标题:教你一步步写一个cuda path tracer:牛刀小试!

      本文链接:https://www.haomeiwen.com/subject/jqfqzttx.html