美文网首页
cuda实现排序(纯享版,gpt4提供的代码)

cuda实现排序(纯享版,gpt4提供的代码)

作者: 寽虎非虫003 | 来源:发表于2023-08-14 16:03 被阅读0次

    第一版能成功跑对的排序

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda.h>
    
    #define N 512
    
    __global__ void gpu_odd_even_sort(int *d, int n, int phase) {
        int index = threadIdx.x + blockDim.x * blockIdx.x;
        if (phase % 2 == 0) { // even phase
            if (index < n / 2) {
                int i = 2 * index;
                if (d[i] > d[i + 1]) {
                    // swap elements
                    int temp = d[i];
                    d[i] = d[i + 1];
                    d[i + 1] = temp;
                }
            }
        } else { // odd phase
            if (index < (n - 1) / 2) {
                int i = 2 * index + 1;
                if (d[i] > d[i + 1]) {
                    // swap elements
                    int temp = d[i];
                    d[i] = d[i + 1];
                    d[i + 1] = temp;
                }
            }
        }
    }
    
    
    int main() {
        int h[N];
        int *d;
    
        // Initialize host array
        for (int i = 0; i < N; i++) {
            // h[i] = rand() % N;
            h[i] = N-i;
        }
    
        // 打印结果
        printf("======初始数据========\n");
        for (size_t i = 0; i < N; i++)
        {
            printf("%d\t",h[i]);
        }
        printf("\n");
    
        // Allocate device memory
        cudaMalloc((void**)&d, sizeof(int) * N);
    
        // Copy host array to device array
        cudaMemcpy(d, h, sizeof(int) * N, cudaMemcpyHostToDevice);
    
        // Launch kernel
        for (int i = 0; i < N; i++) {
            gpu_odd_even_sort<<<N / 256, 256>>>(d, N, i);
            cudaDeviceSynchronize();
        }
    
        // Copy back to host
        cudaMemcpy(h, d, sizeof(int) * N, cudaMemcpyDeviceToHost);
    
        // Cleanup
        cudaFree(d);
    
        // 打印结果
        printf("======结果========\n");
        for (size_t i = 0; i < N; i++)
        {
            printf("%d\t",h[i]);
        }
        printf("\n");
    
        return 0;
    }
    

    结果


    跑的结果

    缺点还是很明显的,要连续从cpu调用N次cuda。

    第二版能跑对的,Bitonic排序

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda.h>
    
    #define N 512
    
    __global__ void bitonic_sort_step(int *dev_values, int j, int k) {
        unsigned int i, ixj; /* Sorting partners: i and ixj */
        i = threadIdx.x + blockDim.x * blockIdx.x;
        ixj = i^j;
    
        /* The threads with the lowest ids sort the array. */
        if ((ixj)>i) {
            if ((i&k)==0) {
                /* Sort ascending */
                if (dev_values[i]>dev_values[ixj]) {
                    /* exchange(i,ixj); */
                    int temp = dev_values[i];
                    dev_values[i] = dev_values[ixj];
                    dev_values[ixj] = temp;
                }
            }
            if ((i&k)!=0) {
                /* Sort descending */
                if (dev_values[i]<dev_values[ixj]) {
                    /* exchange(i,ixj); */
                    int temp = dev_values[i];
                    dev_values[i] = dev_values[ixj];
                    dev_values[ixj] = temp;
                }
            }
        }
    }
    
    /**
     * Inplace bitonic sort using CUDA.
     */
    void bitonic_sort(int *values) {
        int *dev_values;
        size_t size = N * sizeof(int);
    
        cudaMalloc((void**) &dev_values, size);
        cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);
    
        dim3 blocks(N/2,1);    /* Number of blocks   */
        dim3 threads(256,1);  /* Number of threads  */
    
        int j, k;
        static int iter_times{0};
        /* Major step */
        for (k = 2; k <= N; k <<= 1) {
            /* Minor step */
            for (j=k>>1; j>0; j=j>>1) {
                bitonic_sort_step<<<blocks, threads>>>(dev_values, j, k);
                printf("iter time is %d,k= %d, j=%d\n",++iter_times,k,j);
            }
        }
        cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
        cudaFree(dev_values);
    }
    
    int main() {
        int h[N];
        for (int i = 0; i < N; i++) {
            h[i] = N-i;
        }
    
        bitonic_sort(h);
    
        for (int i = 0; i < N; i++) {
            printf("%d ", h[i]);
        }
        printf("\n");
    
        return 0;
    }
    

    第三版能跑对的,thrust

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    
    #define N 512
    
    int main() {
        // Generate random numbers on the host
        thrust::host_vector<int> h(N);
        for (int i = 0; i < N; i++) {
            h[i] = rand() % N;
        }
    
        // Copy data to device
        thrust::device_vector<int> d = h;
    
        // Sort data on the device (uses Thrust's CUDA implementation of quicksort)
        thrust::sort(d.begin(), d.end());
    
        // Copy data back to host
        thrust::copy(d.begin(), d.end(), h.begin());
    
        // Print sorted data
        for (int i = 0; i < N; i++) {
            printf("%d ", h[i]);
        }
        printf("\n");
    
        return 0;
    }
    

    相关文章

      网友评论

          本文标题:cuda实现排序(纯享版,gpt4提供的代码)

          本文链接:https://www.haomeiwen.com/subject/likcmdtx.html