Version 1 (runs correctly): odd-even sort
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define N 512
__global__ void gpu_odd_even_sort(int *d, int n, int phase) {
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    if (phase % 2 == 0) { // even phase
        if (index < n / 2) {
            int i = 2 * index;
            if (d[i] > d[i + 1]) {
                // swap elements
                int temp = d[i];
                d[i] = d[i + 1];
                d[i + 1] = temp;
            }
        }
    } else { // odd phase
        if (index < (n - 1) / 2) {
            int i = 2 * index + 1;
            if (d[i] > d[i + 1]) {
                // swap elements
                int temp = d[i];
                d[i] = d[i + 1];
                d[i + 1] = temp;
            }
        }
    }
}
int main() {
    int h[N];
    int *d;
    // Initialize host array (reverse order so the result is easy to verify)
    for (int i = 0; i < N; i++) {
        // h[i] = rand() % N;
        h[i] = N - i;
    }
    // Print the initial data
    printf("====== initial data ======\n");
    for (size_t i = 0; i < N; i++) {
        printf("%d\t", h[i]);
    }
    printf("\n");
    // Allocate device memory
    cudaMalloc((void**)&d, sizeof(int) * N);
    // Copy host array to device array
    cudaMemcpy(d, h, sizeof(int) * N, cudaMemcpyHostToDevice);
    // Launch one kernel per phase: N phases of odd-even transposition sort
    for (int i = 0; i < N; i++) {
        gpu_odd_even_sort<<<N / 256, 256>>>(d, N, i);
        cudaDeviceSynchronize();
    }
    // Copy back to host
    cudaMemcpy(h, d, sizeof(int) * N, cudaMemcpyDeviceToHost);
    // Cleanup
    cudaFree(d);
    // Print the result
    printf("====== sorted result ======\n");
    for (size_t i = 0; i < N; i++) {
        printf("%d\t", h[i]);
    }
    printf("\n");
    return 0;
}
Output of the run: the array is printed in sorted order.
The drawback is still obvious: the CPU has to launch the CUDA kernel N consecutive times, once per phase.
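One way to remove those N host-side launches, at least when the whole array fits in a single thread block, is to run every phase inside one kernel and use __syncthreads() as the barrier between phases. Below is a minimal sketch under that assumption; the kernel name odd_even_sort_single_block and the single-block launch shape are illustrative, not part of the code above.

__global__ void odd_even_sort_single_block(int *d, int n) {
    // Assumes one block with n/2 threads, so __syncthreads() can replace
    // the barrier that the separate kernel launches provided above.
    int index = threadIdx.x;
    for (int phase = 0; phase < n; phase++) {
        int i = (phase % 2 == 0) ? 2 * index : 2 * index + 1;
        if (i + 1 < n && d[i] > d[i + 1]) {
            int temp = d[i];
            d[i] = d[i + 1];
            d[i + 1] = temp;
        }
        __syncthreads();  // all threads finish this phase before the next
    }
}

// Usage: a single launch instead of N, e.g.
// odd_even_sort_single_block<<<1, N / 2>>>(d, N);

For larger arrays this trick no longer applies directly, because blocks cannot synchronize with each other within one launch (short of cooperative groups); the bitonic version below attacks the same problem by cutting the number of launches from N down to O(log^2 N).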
Version 2 (runs correctly): bitonic sort
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define N 512
__global__ void bitonic_sort_step(int *dev_values, int j, int k) {
    unsigned int i, ixj; /* Sorting partners: i and ixj */
    i = threadIdx.x + blockDim.x * blockIdx.x;
    ixj = i ^ j;
    /* The threads with the lowest ids sort the array. */
    if ((ixj) > i) {
        if ((i & k) == 0) {
            /* Sort ascending */
            if (dev_values[i] > dev_values[ixj]) {
                /* exchange(i,ixj); */
                int temp = dev_values[i];
                dev_values[i] = dev_values[ixj];
                dev_values[ixj] = temp;
            }
        }
        if ((i & k) != 0) {
            /* Sort descending */
            if (dev_values[i] < dev_values[ixj]) {
                /* exchange(i,ixj); */
                int temp = dev_values[i];
                dev_values[i] = dev_values[ixj];
                dev_values[ixj] = temp;
            }
        }
    }
}
/**
* Inplace bitonic sort using CUDA.
*/
void bitonic_sort(int *values) {
    int *dev_values;
    size_t size = N * sizeof(int);
    cudaMalloc((void**) &dev_values, size);
    cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);
    dim3 threads(256, 1);    /* Threads per block */
    dim3 blocks(N / 256, 1); /* Launch exactly N threads, one per element
                                (the kernel has no bounds check) */
    int j, k;
    static int iter_times = 0;
    /* Major step */
    for (k = 2; k <= N; k <<= 1) {
        /* Minor step */
        for (j = k >> 1; j > 0; j = j >> 1) {
            bitonic_sort_step<<<blocks, threads>>>(dev_values, j, k);
            printf("iter time is %d, k=%d, j=%d\n", ++iter_times, k, j);
        }
    }
    cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
    cudaFree(dev_values);
}
int main() {
    int h[N];
    for (int i = 0; i < N; i++) {
        h[i] = N - i;
    }
    bitonic_sort(h);
    for (int i = 0; i < N; i++) {
        printf("%d ", h[i]);
    }
    printf("\n");
    return 0;
}
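With this version the host launches the kernel once per (k, j) pair, i.e. log2(N) * (log2(N) + 1) / 2 = 9 * 10 / 2 = 45 launches for N = 512, instead of 512. One restriction is that the bitonic network assumes the element count is a power of two. A small sketch of the usual workaround, padding with INT_MAX sentinels (which sort to the back) and reusing bitonic_sort unchanged; the helper name sort_any_length is hypothetical and assumes it sits in the same file as the code above:

#include <limits.h>
#include <string.h>

/* Sketch: sort an array of arbitrary length n <= N by padding it up to N
 * with INT_MAX and running the power-of-two network on the padded copy. */
void sort_any_length(int *values, int n) {
    int padded[N];
    memcpy(padded, values, n * sizeof(int));
    for (int i = n; i < N; i++) {
        padded[i] = INT_MAX;                 /* sentinels end up at the back */
    }
    bitonic_sort(padded);                    /* sorts all N slots ascending */
    memcpy(values, padded, n * sizeof(int)); /* the first n entries are the answer */
}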
Version 3 (runs correctly): Thrust
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <cstdio>
#include <cstdlib>
#define N 512
int main() {
    // Generate random numbers on the host
    thrust::host_vector<int> h(N);
    for (int i = 0; i < N; i++) {
        h[i] = rand() % N;
    }
    // Copy data to device
    thrust::device_vector<int> d = h;
    // Sort data on the device (for primitive types such as int, Thrust dispatches to a parallel radix sort)
    thrust::sort(d.begin(), d.end());
    // Copy data back to host
    thrust::copy(d.begin(), d.end(), h.begin());
    // Print sorted data
    for (int i = 0; i < N; i++) {
        printf("%d ", h[i]);
    }
    printf("\n");
    return 0;
}
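thrust::sort also accepts a comparator when the default ascending order is not what you want. A minimal sketch of sorting in descending order with thrust::greater<int>(); this is standard Thrust usage, not something the version above needs:

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <cstdio>

int main() {
    thrust::device_vector<int> d(8);
    thrust::sequence(d.begin(), d.end());  // fill with 0, 1, ..., 7
    // Pass a comparator to get descending order instead of the default ascending
    thrust::sort(d.begin(), d.end(), thrust::greater<int>());
    for (int i = 0; i < 8; i++) {
        printf("%d ", (int)d[i]);          // prints 7 6 5 4 3 2 1 0
    }
    printf("\n");
    return 0;
}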