教你一步步写一个cuda path tracer：牛刀小试！

作者: RobotBerry | 来源:发表于2017-04-19 11:31 被阅读141次

教你一步步写一个cuda path tracer：牛刀小试！
教你一步步写一个cuda path tracer：cuda he
教你一步步写一个cuda path tracer：cuda与类
教你一步步写一个cuda path tracer：cuda简介和
教你一步步写一个cuda path tracer：前言
Ray Tracing in One Week Note 1
Matlab设置CUDA
otel 源码阅读（三）
[教程]仅20行Javascript代码！手把手教你写页面模板引
CUDA_cublas_LIBRARY not found

前面几篇都是关于cuda的基础知识介绍，甚是乏味。是时候用cuda渲染一张图片来直观地感受一下了！

Kevin Beason大神曾经写过一个99行的c++版本的path tracer。麻雀虽小，五脏俱全，这个path tracer包含了很多global illumination的特性：

Monte Carlo samling

OpenMP
Specular, Diffuse, and glass BRDFs
Antialising via super-sampling with importance-sampled tent distribution
Cosine importance sampling of the hemisphere for diffuse reflection
Russian roulette for path termination
Russian roulette and splitting for selecting reflection and/or refraction for glass BRDF

为了方便起见，我去掉了不必要的特性，只保留了diffuse材质。源代码请从官网上获取。修改后的代码如下：

smallPT.cpp

#include <cmath>
#include <cstdio>
#include <chrono>

#define BOUNCE 4
#define WIDTH 800
#define HEIGHT 600
#define SPP 1024

struct Vec
{
    double x, y, z;
    Vec(double x_ = 0, double y_ = 0, double z_ = 0) { x = x_; y = y_; z = z_; }
    Vec operator+(const Vec &b) const { return Vec(x + b.x, y + b.y, z + b.z); }
    Vec operator-(const Vec &b) const { return Vec(x - b.x, y - b.y, z - b.z); }
    Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }
    Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }
    Vec& norm() { return *this = *this * (1 / sqrt(x*x + y*y + z*z)); }
    double dot(const Vec &b) const { return x*b.x + y*b.y + z*b.z; }
    Vec operator%(Vec&b) { return Vec(y*b.z - z*b.y, z*b.x - x*b.z, x*b.y - y*b.x); }
};

struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };

enum Refl_t { DIFF, SPEC, REFR }; // diffuse, specular, refraction

struct Sphere
{
    double rad;
    Vec p, e, c; // position, emission, color
    Refl_t refl;
    Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) :
        rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
    double intersect(const Ray &r) const
    {
        Vec op = p - r.o;
        double t, eps = 1e-4, b = op.dot(r.d), det = b*b - op.dot(op) + rad*rad;
        if (det < 0) return 0; else det = sqrt(det);
        return (t = b - det) > eps ? t : ((t = b + det) > eps ? t : 0);
    }
};

Sphere spheres[] =
{
    Sphere(1e5, Vec(-1e5f - 50.0f, 40.0f, 80.0f), Vec(),Vec(.75,.25,.25),DIFF), // Left 
    Sphere(1e5, Vec(1e5f + 50.0f, 40.0f, 80.0f),Vec(),Vec(.25,.25,.75),DIFF), // Rght 
    Sphere(1e5, Vec(0.0f, 40.0f, -1e5f),     Vec(),Vec(.75,.75,.75),DIFF), // Back 
    Sphere(1e5, Vec(0.0f, 40.0f, 1e5f + 600.0f), Vec(),Vec(1, 1, 1), DIFF), // Frnt 
    Sphere(1e5, Vec(0.0f, -1e5f, 80.0f ),    Vec(),Vec(.75,.75,.75),DIFF), // Botm 
    Sphere(1e5, Vec(0.0f, 1e5f + 80.0f, 80.0f),Vec(),Vec(.75,.75,.75),DIFF), // Top 
    Sphere(16,Vec(-25.0f, 16.0f, 47.0f),       Vec(),Vec(1,1,1), DIFF), // Mirr 
    Sphere(20,Vec(25.0f, 20.0f, 78.0f),       Vec(),Vec(1,1,1), DIFF), // Glas 
    Sphere(600, Vec(0.0f, 678.8f, 80.0f),Vec(1.6f, 1.6f, 1.6f),  Vec(), DIFF) // Lite 
};

float rand(unsigned int *seed0, unsigned int *seed1)
{
    *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
    *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);

    unsigned int ires = ((*seed0) << 16) + (*seed1);

    union
    {
        float f;
        unsigned int ui;
    } res;

    res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR

    return (res.f - 2.f) / 2.f;
}

inline double clamp(double x) { return x < 0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x) { return int(pow(clamp(x), 1 / 2.2) * 255 + .5); }
inline bool intersect(const Ray &r, double &t, int &id) {
    double n = sizeof(spheres) / sizeof(Sphere), d, inf = t = 1e20;
    for (int i = int(n); i--;) if ((d = spheres[i].intersect(r)) && d < t) { t = d; id = i; }
    return t < inf;
}

Vec radiance(const Ray &r, int depth, unsigned int *s0, unsigned int *s1)
{
    double t;                               // distance to intersection 
    int id = 0;                               // id of intersected object 
    if (depth > BOUNCE || !intersect(r, t, id)) return Vec(); // if miss, return black 
    const Sphere &obj = spheres[id];        // the hit object 
    Vec x = r.o + r.d*t, n = (x - obj.p).norm(), nl = n.dot(r.d) < 0 ? n : n*-1, f = obj.c;

    if (obj.refl == DIFF)
    {
        double r1 = 2 * M_PI*rand(s0, s1), r2 = rand(s0, s1), r2s = sqrt(r2);
        Vec w = nl, u = ((fabs(w.x) > .1 ? Vec(0, 1) : Vec(1)) % w).norm(), v = w%u;
        Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1 - r2)).norm();
        return obj.e + f.mult(radiance(Ray(x, d), depth + 1, s0, s1)) * n.dot(d) * 2;
    }
    return Vec();
}

int main()
{
    Ray cam(Vec(0, 52, 300), Vec(0, -0.05, -1.0).norm()); // cam pos, dir 
    Vec cx = Vec(WIDTH*.5135 / HEIGHT), cy = (cx%cam.d).norm()*.5135, r, *c = new Vec[WIDTH*HEIGHT];

    std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP 
    for (int y = 0; y < HEIGHT; y++) // Loop over image rows 
    {
        fprintf(stderr, "\rRendering (%d spp) %5.2f%%", SPP, 100.*y / (HEIGHT - 1));
        for (int x = 0; x < WIDTH; x++)   // Loop cols
        {
            unsigned int s0 = x;
            unsigned int s1 = y;
            int i = (HEIGHT - y - 1)*WIDTH + x;
            r = Vec();
            for (int s = 0; s < SPP; s++)
            {
                Vec d = cx*((0.25 + x) / WIDTH - .5) + cy*((0.25 + y) / HEIGHT - .5) + cam.d;
                r = r + radiance(Ray(cam.o + d * 40, d.norm()), 0, &s0, &s1);
            }
            r = r * (1.0 / SPP);
            c[i] = Vec(clamp(r.x), clamp(r.y), clamp(r.z));
        }
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsedTime = end - begin;
    printf("\nTime: %.6lfs\n", elapsedTime.count());

    FILE *f = fopen("image.ppm", "w");         // Write image to PPM file. 
    fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
    for (int i = 0; i < WIDTH*HEIGHT; i++)
        fprintf(f, "%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}

Note

1.这段代码使用OpenMP进行多线程加速。

OpenMP（Open Multi-Processing）是一套支持跨平台共享内存方式的多线程并发的编程API

在vs2015中需要手动启用OpenMP功能：

vs2015中打开OpenMP选项

2.生成的ppm图片文件可以用ps等软件打开

对应的cuda代码如下：

smallPT.cu

// cuda headers
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// c++ headers
#include <cstdio>
#include <algorithm>
#include <climits>
#include <chrono>

#include "helper_math.h"

#define WIDTH 800
#define HEIGHT 600
#define SPP 1024
#define BOUNCE 4
#define SPHERE_EPSILON 0.0001f
#define BOX_EPSILON 0.001f
#define RAY_EPSILON 0.05f;

struct Ray 
{
    __device__ Ray(float3 pos, float3 dir) :
        pos(pos), dir(dir) {}

    float3 pos;
    float3 dir;
};

enum class Material { Diffuse, Specular, Refractive };

struct Sphere 
{
    __device__ float intersect(const Ray &ray) const
    {
        float t;
        float3 dis = pos - ray.pos;
        float proj = dot(dis, ray.dir);
        float delta = proj * proj - dot(dis, dis) + radius * radius;

        if (delta < 0) return 0;

        delta = sqrtf(delta);
        return (t = proj - delta) > SPHERE_EPSILON ? t : ((t = proj + delta) > SPHERE_EPSILON ? t : 0);
    }

    float radius;
    float3 pos, emissionColor, mainColor;
    Material material;
};

__constant__ Sphere spheres[] =
{
    { 1e5f, { -1e5f - 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.25f, 0.25f }, Material::Diffuse }, // Left 
    { 1e5f, { 1e5f + 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.25f, 0.25f, 0.75f }, Material::Diffuse }, // Right 
    { 1e5f, { 0.0f, 40.0f, -1e5f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Back 
    { 1e5f, { 0.0f, 40.0f, 1e5f + 600.0f }, { 0.0f, 0.0f, 0.0f }, { 1.00f, 1.00f, 1.00f }, Material::Diffuse }, // Front 
    { 1e5f, { 0.0f, -1e5f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Bottom 
    { 1e5f, { 0.0f, 1e5f + 80.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Top 
    { 16.0f, { -25.0f, 16.0f, 47.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 1
    { 20.0f, { 25.0f, 20.0f, 78.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 2
    { 600.0f, { 0.0f, 678.8f, 80.0f }, { 1.6f, 1.6f, 1.6f }, { 0.0f, 0.0f, 0.0f }, Material::Diffuse }  // Light
};

__device__ inline bool intersectScene(const Ray &ray, float &t, int &id) 
{
    t = FLT_MAX, id = -1;
    int sphereNum = sizeof(spheres) / sizeof(Sphere);
    for (int i = 0; i < sphereNum; i++)
    {
        float ct = spheres[i].intersect(ray);
        if (ct != 0 && ct < t)
        {  
            t = ct;
            id = i;
        }
    }
        
    return id != -1;
}

__device__ static float rand(uint *seed0, uint *seed1) 
{
    *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
    *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);

    uint ires = ((*seed0) << 16) + (*seed1);

    union 
    {
        float f;
        uint ui;
    } res;

    res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR

    return (res.f - 2.f) / 2.f;
}

inline int gammaCorrect(float c)
{
    return int(pow(clamp(c, 0.0f, 1.0f), 1 / 2.2) * 255 + .5);
}

__device__ float3 pathTrace(Ray &ray, uint *s1, uint *s2) 
{ 
    float3 accumColor = make_float3(0.0f, 0.0f, 0.0f);
    float3 mask = make_float3(1.0f, 1.0f, 1.0f);

    for (int i = 0; i < BOUNCE; i++)
    {
        float t;
        int id;

        if (!intersectScene(ray, t, id))
            return make_float3(0.0f, 0.0f, 0.0f);

        const Sphere &obj = spheres[id];
        float3 p = ray.pos + ray.dir * t; 
        float3 n = normalize(p - obj.pos);
        float3 nl = dot(n, ray.dir) < 0 ? n : n * -1;

        accumColor += mask * obj.emissionColor;

        float r1 = rand(s1, s2) * M_PI * 2;
        float r2 = rand(s1, s2);
        float r2s = sqrtf(r2);

        float3 w = nl;
        float3 u = normalize(cross((std::fabs(w.x) > std::fabs(w.y) ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
        float3 v = cross(w, u);

        float3 d = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));

        ray.pos = p + nl * RAY_EPSILON;
        ray.dir = d;

        mask *= obj.mainColor * dot(d, nl) * 2;
    }

    return accumColor;
}

__global__ void pathTracekernel(float3 *h_output) 
{
    uint x = blockIdx.x * blockDim.x + threadIdx.x;
    uint y = blockIdx.y * blockDim.y + threadIdx.y;

    uint i = (HEIGHT - y - 1)*WIDTH + x;

    uint s1 = x;
    uint s2 = y;

    Ray camRay(make_float3(0.0f, 52.0f, 300.0f), normalize(make_float3(0.0f, -0.05f, -1.0f)));
    float3 cx = make_float3(WIDTH * 0.5135 / HEIGHT, 0.0f, 0.0f);
    float3 cy = normalize(cross(cx, camRay.dir)) * 0.5135;
    float3 pixel = make_float3(0.0f);

    for (int s = 0; s < SPP; s++) 
    {  
        float3 d = camRay.dir + cx*((0.25 + x) / WIDTH - 0.5) + cy*((0.25 + y) / HEIGHT - 0.5);
        Ray ray(camRay.pos + d * 40, normalize(d));

        pixel += pathTrace(ray, &s1, &s2)*(1. / SPP);
    }

    h_output[i] = clamp(pixel, 0.0f, 1.0f);
}

int main() {

    float3 *h_output = new float3[WIDTH * HEIGHT];
    float3 *d_output;

    cudaMalloc(&d_output, WIDTH * HEIGHT * sizeof(float3));

    dim3 block(8, 8, 1);
    dim3 grid(WIDTH / block.x, HEIGHT / block.y, 1);

    printf("CUDA initialized.\nStart rendering...\n");

    std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();

    pathTracekernel << <grid, block >> > (d_output);

    cudaMemcpy(h_output, d_output, WIDTH * HEIGHT * sizeof(float3), cudaMemcpyDeviceToHost);

    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsedTime = end - begin;
    printf("Time: %.6lfs\n", elapsedTime.count());

    cudaFree(d_output);

    printf("Done!\n");

    FILE *f = fopen("smallptcuda.ppm", "w");
    fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
    for (int i = 0; i < WIDTH * HEIGHT; i++)
        fprintf(f, "%d %d %d ", gammaCorrect(h_output[i].x),
            gammaCorrect(h_output[i].y),
            gammaCorrect(h_output[i].z));

    printf("Saved image to 'smallptcuda.ppm'.\n");

    delete[] h_output;

    return 0;
}

Note

smallPT.cu中包含了一个helper_math.h头文件，该文件实现了float3等cuda基本数据类型的常见操作，例如+ - * dot cross等运算。该头文件位于C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\common\inc目录下。
由于在这段代码里，核函数的单次运行时间会超出系统的默认锁定时间，所以会导致系统认为程序出错而结束进程。详见微软的官网说明。所以我们需要在注册表中修改系统对核函数的监控项。在HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers下添加如下Tdr项：

TdrDelay

TdrDelay Specifies the number of seconds that the GPU can delay the preempt request from the GPU scheduler. This is effectively the timeout threshold. The default value is 2 seconds.

TdrDelay可以被理解为核函数单次运行的最长时间。

说明

目前我不会详细解释代码的原理，只是想让大家直观的认识一下path tracing。具体的原理说明会放在之后的教程里。

Profile

参数

图片尺寸（Width*Height）：800*600
反射次数（Bounce）：4
像素采样数（Spp, Samples per pixel）：1024

smallPT.cpp

渲染时间
without OpenMP：595.3s

不用OpenMP加速

with OpenMP：107.3s

OpenMP加速
结果图片

cpu渲染图

smallPT.cu

渲染时间：6.3s

结果图片

gpu渲染图

反射次数（Bounce）：16
像素采样数（Spp, Samples per pixel）：2048

渲染时间：26.9s

结果图片

spp:2048, bounce:8

由于反射次数增加，indirect radiance会增加，场景的亮度也随之增加；像素采样率的提高会使得渲染图片更加平滑。

分析

可以看出，cpu（smallPT.cpp）和gpu（smallPT.cu）的结果图片基本一致（亮度有些差别，可能和float和double的精度有关），但是时间差距很大。在gpu上运行的速度要比在cpu上快107.3/6.3=17倍，比不使用OpenMP多线程加速更要快上595.3/6.3=94倍。所以说，把path tracing这种计算密集型的算法搬到gpu上跑是十分有必要的。

let's step to path tracing now!

教你一步步写一个cuda path tracer：牛刀小试！
前面几篇都是关于cuda的基础知识介绍，甚是乏味。是时候用cuda渲染一张图片来直观地感受一下了！ Kevin B...
教你一步步写一个cuda path tracer：cuda he
我们学任何一门语言和框架，都要从hello world学起。cuda的hello world可以是：首先要注意，...
教你一步步写一个cuda path tracer：cuda与类
cuda里能不能使用类？答案当然是可以的。下面给出一个实例： arrayadder.cuh arrayadder....
教你一步步写一个cuda path tracer：cuda简介和
cuda定义 cuda是nvidia推出的一种通用并行计算架构，该架构能够充分利用gpu强大的并行计算能力，解决一...
教你一步步写一个cuda path tracer：前言
简介这是一套关于如何使用cuda实现path tracing算法的系列教程。背景我是一名计算机学院的研究生，...
Ray Tracing in One Week Note 1
Write a Path Tracer C++ fast, portable, most renders use...
Matlab设置CUDA
setenv('CUDA_PATH', '/home/data/package/cuda-8.0/');seten...
otel 源码阅读（三）
Tracer Tracer的逻辑很简单，Tracer接口就一个Start方法，用来生成新的span。与之相对于的t...
[教程]仅20行Javascript代码！手把手教你写页面模板引
【HELLO小程序导读】：AbsurdJS 作者写的一篇教程，一步步教你怎样用 Javascript 实现一个纯客...
CUDA_cublas_LIBRARY not found
这个问题出现在编译的时候。原因：/path/to/cuda-xx/lib64/路径下面找不到 libcubla...