【摘要】 CUDA
CUDA的原子操作针对的是Global Memory或者是Shared Memory。
- Shared Memory是可被同一个block的所有thread访问(读写)的。
- Global Memory相当于显存,是可被所有thread访问(读写)的。
#include<time.h> //for time()
#include<stdlib.h> //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
//number of int elements in the input buffer
#define N 10000000
//threads per block used for the launch; also the length of the kernel's shared array
#define BLOCK_SIZE 256
//number of blocks: N / BLOCK_SIZE rounded up
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)
__managed__ int source[N]; //input data, filled on the host and read by the kernel
__managed__ int final_result[1] = {0}; //scalar output; starts at 0 because the kernel accumulates into it
/*
 * Adds up `count` ints from `input` and atomically adds the total to *output.
 *
 * Stages:
 *   1. every thread accumulates a private partial sum with a grid-stride loop;
 *   2. the block folds the per-thread partials pairwise in shared memory;
 *   3. thread 0 of each block adds the block total to *output via atomicAdd().
 *
 * Preconditions:
 *   - blockDim.x must equal BLOCK_SIZE and BLOCK_SIZE must be a power of two:
 *     the shared array has BLOCK_SIZE slots indexed by threadIdx.x and the
 *     active range is halved each round.
 *   - *output must already hold the starting value (e.g. 0); the kernel only
 *     adds to it.
 *
 * NOTE(review): partial sums are plain int, so large inputs exceed the int
 * range; callers comparing against a host reference rely on wrap-around.
 */
__global__ void _sum_gpu(int *input, int count, int *output)
{
    __shared__ int sum_per_block[BLOCK_SIZE];

    //stage 1: per-thread partial sum
    int temp = 0;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count;
         idx += gridDim.x * blockDim.x)
    {
        temp += input[idx];
    }

    sum_per_block[threadIdx.x] = temp; //the per-thread partial sum is temp!
    //every slot must be written before any thread reads a neighbour's slot
    __syncthreads();

    //stage 2: shared memory summation
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        int double_kill = 0;
        if (threadIdx.x < length)
        {
            double_kill = sum_per_block[threadIdx.x] + sum_per_block[threadIdx.x + length];
        }
        //separates this round's reads from this round's writes
        __syncthreads();

        if (threadIdx.x < length)
        {
            sum_per_block[threadIdx.x] = double_kill;
        }
        //makes this round's writes visible before the next round reads them
        __syncthreads();
    } //the per-block partial sum is sum_per_block[0]

    //stage 3: skip blocks whose first element is already past the input
    if (blockDim.x * blockIdx.x < count)
    {
        if (threadIdx.x == 0) atomicAdd(output, sum_per_block[0]);
    }
}
/*
 * Host reference: returns the sum of the first `count` ints at `ptr`.
 *
 * The total is accumulated in unsigned arithmetic so that sums beyond the int
 * range wrap around (mod 2^32 for 32-bit int) instead of triggering
 * signed-overflow undefined behaviour. The wrapped value is then converted
 * back to int (two's-complement on mainstream compilers).
 * Returns 0 when count <= 0.
 */
int _sum_cpu(int *ptr, int count)
{
    unsigned int total = 0u;
    for (int i = 0; i < count; i++)
    {
        total += (unsigned int)ptr[i];
    }
    return (int)total;
}
/*
 * Reseeds the C random generator from the current time and stores one rand()
 * value in each of the first `count` slots of `ptr`.
 */
void _init(int *ptr, int count)
{
    //time-based seed, so every run gets different data
    srand((uint32_t)time(NULL));

    int filled = 0;
    while (filled < count)
    {
        ptr[filled] = rand();
        ++filled;
    }
}
/*
 * Returns the current gettimeofday() reading as seconds (with a fractional
 * microsecond part) in a double.
 */
double get_time()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fractional = (double)now.tv_usec * 0.000001;
    return fractional + now.tv_sec;
}
/*
 * Test driver: fills `source` with random data, sums it on the GPU with
 * _sum_gpu and on the CPU with _sum_cpu, compares both totals and prints the
 * wall-clock time of each.
 *
 * NOTE(review): CHECK is not defined in this listing; it is presumably a
 * macro that reports a non-success cudaError_t - confirm against the project
 * header that provides it.
 */
int main()
{
    fprintf(stderr, "filling the buffer with %d elements...\n", N);
    _init(source, N);

    //Now we are going to kick start your kernel.
    CHECK(cudaDeviceSynchronize()); //steady! ready! go!

    fprintf(stderr, "Running on GPU...\n");

    double t0 = get_time();
    _sum_gpu<<<BLOCKS, BLOCK_SIZE>>>(source, N, final_result);
    CHECK(cudaGetLastError());      //checking for launch failures
    CHECK(cudaDeviceSynchronize()); //checking for run-time failures
    double t1 = get_time();

    int A = final_result[0];
    //the sum is printed as unsigned; cast so the argument matches %u
    fprintf(stderr, "GPU sum: %u\n", (unsigned int)A);

    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");

    double t2 = get_time();
    int B = _sum_cpu(source, N);
    double t3 = get_time();
    fprintf(stderr, "CPU sum: %u\n", (unsigned int)B);

    //******The last judgement**********
    if (A == B)
    {
        fprintf(stderr, "Test Passed!\n");
    }
    else
    {
        fprintf(stderr, "Test failed!\n");
    }

    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}
#include<time.h> //for time()
#include<stdlib.h> //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
//number of int elements in the input buffer
#define N 10000000
//threads per block used for the launch; also the length of the kernel's shared arrays
#define BLOCK_SIZE 256
//number of blocks: N / BLOCK_SIZE rounded up
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)
__managed__ int source[N]; //input data, filled on the host and read by the kernel
//__managed__ int final_result[2] = {INT_MIN,INT_MAX}; //array variant, currently disabled
__managed__ int final_result_max = INT_MIN; //GPU maximum; starts at INT_MIN so any element can raise it
__managed__ int final_result_min = INT_MAX; //GPU minimum; starts at INT_MAX so any element can lower it
/*
 * Finds the largest and smallest of `count` ints in `input` and merges them
 * into *max_output / *min_output with atomicMax() / atomicMin().
 *
 * Stages:
 *   1. every thread scans its elements with a grid-stride loop, tracking a
 *      private max and min;
 *   2. the block folds the per-thread results pairwise in shared memory;
 *   3. thread 0 of each block merges the block result into the outputs.
 *
 * The private accumulators start at INT_MIN / INT_MAX (the identities of max
 * and min) so that a thread that sees no element, or data that is entirely
 * negative or entirely positive, cannot inject a spurious 0.
 *
 * Preconditions:
 *   - blockDim.x must equal BLOCK_SIZE and BLOCK_SIZE must be a power of two.
 *   - *max_output / *min_output must be initialised by the caller (e.g. to
 *     INT_MIN / INT_MAX); the kernel only merges into them.
 */
__global__ void _sum_min_or_max(int *input, int count, int *max_output, int *min_output)
{
    __shared__ int max_per_block[BLOCK_SIZE];
    __shared__ int min_per_block[BLOCK_SIZE];

    //stage 1: per-thread max/min
    int max_temp = INT_MIN;
    int min_temp = INT_MAX;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count;
         idx += gridDim.x * blockDim.x)
    {
        const int value = input[idx];
        max_temp = (value > max_temp) ? value : max_temp;
        min_temp = (value < min_temp) ? value : min_temp;
    }

    max_per_block[threadIdx.x] = max_temp; //the per-thread partial max
    min_per_block[threadIdx.x] = min_temp; //the per-thread partial min
    //every slot must be written before any thread reads a neighbour's slot
    __syncthreads();

    //stage 2: shared memory max/min reduction
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        int max_double_kill = INT_MIN;
        int min_double_kill = INT_MAX;
        if (threadIdx.x < length)
        {
            const int max_a = max_per_block[threadIdx.x];
            const int max_b = max_per_block[threadIdx.x + length];
            const int min_a = min_per_block[threadIdx.x];
            const int min_b = min_per_block[threadIdx.x + length];
            max_double_kill = (max_a > max_b) ? max_a : max_b;
            min_double_kill = (min_a < min_b) ? min_a : min_b;
        }
        //separates this round's reads from this round's writes
        __syncthreads();

        if (threadIdx.x < length)
        {
            max_per_block[threadIdx.x] = max_double_kill;
            min_per_block[threadIdx.x] = min_double_kill;
        }
        //makes this round's writes visible before the next round reads them
        __syncthreads();
    } //the per-block results are max_per_block[0] and min_per_block[0]

    //stage 3: skip blocks whose first element is already past the input
    if (blockDim.x * blockIdx.x < count)
    {
        if (threadIdx.x == 0)
        {
            atomicMax(max_output, max_per_block[0]);
            atomicMin(min_output, min_per_block[0]);
        }
    }
}
/*
 * Host reference: scans the first `count` ints at `ptr` and stores the largest
 * in *max1 and the smallest in *min1. With count <= 0 the outputs are
 * INT_MIN and INT_MAX respectively. Always returns 0.
 */
int _max_min_cpu(int *ptr, int count, int *max1, int *min1)
{
    int largest = INT_MIN;
    int smallest = INT_MAX;

    for (int pos = 0; pos < count; ++pos)
    {
        const int value = ptr[pos];
        if (value > largest) largest = value;
        if (value < smallest) smallest = value;
    }

    *max1 = largest;
    *min1 = smallest;
    return 0;
}
/*
 * Fills the first `count` slots of `ptr` with rand() values, negating the
 * value stored at every even index so the buffer holds both signs.
 *
 * The generator is deliberately NOT reseeded here (the srand() call is
 * disabled), so the data depends only on the generator's current state and
 * is reproducible from run to run.
 */
void _init(int *ptr, int count)
{
    //srand((uint32_t)time(NULL)); //enable to get different data on every run

    for (int i = 0; i < count; i++)
    {
        ptr[i] = rand();
        if (i % 2 == 0) ptr[i] = 0 - ptr[i];
    }
}
/*
 * Returns the current gettimeofday() reading as seconds (with a fractional
 * microsecond part) in a double.
 */
double get_time()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fractional = (double)now.tv_usec * 0.000001;
    return fractional + now.tv_sec;
}
/*
 * Test driver: fills `source`, finds its max and min on the GPU with
 * _sum_min_or_max and on the CPU with _max_min_cpu, compares the results and
 * prints the wall-clock time of each.
 *
 * NOTE(review): CHECK is not defined in this listing; it is presumably a
 * macro that reports a non-success cudaError_t - confirm against the project
 * header that provides it.
 */
int main()
{
    fprintf(stderr, "filling the buffer with %d elements...\n", N);
    _init(source, N);

    //Now we are going to kick start your kernel.
    CHECK(cudaDeviceSynchronize()); //steady! ready! go!

    fprintf(stderr, "Running on GPU...\n");

    double t0 = get_time();
    _sum_min_or_max<<<BLOCKS, BLOCK_SIZE>>>(source, N, &final_result_max, &final_result_min);
    CHECK(cudaGetLastError());      //checking for launch failures
    CHECK(cudaDeviceSynchronize()); //checking for run-time failures
    double t1 = get_time();

    fprintf(stderr, " GPU max: %d\n", final_result_max);
    fprintf(stderr, " GPU min: %d\n", final_result_min);

    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");

    int cpu_max = 0;
    int cpu_min = 0;
    double t2 = get_time();
    _max_min_cpu(source, N, &cpu_max, &cpu_min);
    //stop the clock before printing so console I/O is not billed to the CPU scan
    double t3 = get_time();

    printf(" CPU max = %d\n", cpu_max);
    printf(" CPU min = %d\n", cpu_min);

    //******The last judgement**********
    if (final_result_max == cpu_max && final_result_min == cpu_min)
    {
        fprintf(stderr, "Test Passed!\n");
    }
    else
    {
        fprintf(stderr, "Test failed!\n");
    }

    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}
上面的代码中,张小白偷了个懒,使用了两个managed变量记录结果。张小白看了看后面的作业,还有一道“找到1000万数据中前10个最大值”的题目,感觉还是用数组会更合适点,也许可以无缝地升级以解决后面这道题,于是张小白又做了以下改动:
#include<time.h> //for time()
#include<stdlib.h> //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
//number of int elements in the input buffer
#define N 10000000
//threads per block used for the launch; also the length of the kernel's shared arrays
#define BLOCK_SIZE 256
//number of blocks: N / BLOCK_SIZE rounded up
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)
__managed__ int source[N]; //input data, filled on the host and read by the kernel
__managed__ int final_result[2] = {INT_MIN,INT_MAX}; //GPU results: [0] = max (starts at INT_MIN), [1] = min (starts at INT_MAX)
//__managed__ int final_result_max = INT_MIN; //previous two-scalar variant, disabled
//__managed__ int final_result_min = INT_MAX; //previous two-scalar variant, disabled
//__global__ void _sum_min_or_max(int *input, int count, int *max_output, int *min_output)
/*
 * Finds the largest and smallest of `count` ints in `input` and merges them
 * into output[0] (max, via atomicMax) and output[1] (min, via atomicMin).
 *
 * Stages:
 *   1. every thread scans its elements with a grid-stride loop, tracking a
 *      private max and min;
 *   2. the block folds the per-thread results pairwise in shared memory;
 *   3. thread 0 of each block merges the block result into `output`.
 *
 * The private accumulators start at INT_MIN / INT_MAX (the identities of max
 * and min) so that a thread that sees no element, or data that is entirely
 * negative or entirely positive, cannot inject a spurious 0.
 *
 * Preconditions:
 *   - blockDim.x must equal BLOCK_SIZE and BLOCK_SIZE must be a power of two.
 *   - `output` must point to at least two ints initialised by the caller
 *     (e.g. {INT_MIN, INT_MAX}); the kernel only merges into them.
 */
__global__ void _sum_min_or_max(int *input, int count,int *output)
{
    __shared__ int max_per_block[BLOCK_SIZE];
    __shared__ int min_per_block[BLOCK_SIZE];

    //stage 1: per-thread max/min
    int max_temp = INT_MIN;
    int min_temp = INT_MAX;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count;
         idx += gridDim.x * blockDim.x)
    {
        const int value = input[idx];
        max_temp = (value > max_temp) ? value : max_temp;
        min_temp = (value < min_temp) ? value : min_temp;
    }

    max_per_block[threadIdx.x] = max_temp; //the per-thread partial max
    min_per_block[threadIdx.x] = min_temp; //the per-thread partial min
    //every slot must be written before any thread reads a neighbour's slot
    __syncthreads();

    //stage 2: shared memory max/min reduction
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        int max_double_kill = INT_MIN;
        int min_double_kill = INT_MAX;
        if (threadIdx.x < length)
        {
            const int max_a = max_per_block[threadIdx.x];
            const int max_b = max_per_block[threadIdx.x + length];
            const int min_a = min_per_block[threadIdx.x];
            const int min_b = min_per_block[threadIdx.x + length];
            max_double_kill = (max_a > max_b) ? max_a : max_b;
            min_double_kill = (min_a < min_b) ? min_a : min_b;
        }
        //separates this round's reads from this round's writes
        __syncthreads();

        if (threadIdx.x < length)
        {
            max_per_block[threadIdx.x] = max_double_kill;
            min_per_block[threadIdx.x] = min_double_kill;
        }
        //makes this round's writes visible before the next round reads them
        __syncthreads();
    } //the per-block results are max_per_block[0] and min_per_block[0]

    //stage 3: skip blocks whose first element is already past the input
    if (blockDim.x * blockIdx.x < count)
    {
        if (threadIdx.x == 0)
        {
            atomicMax(&output[0], max_per_block[0]);
            atomicMin(&output[1], min_per_block[0]);
        }
    }
}
/*
 * Host reference: scans the first `count` ints at `ptr` and stores the largest
 * in *max1 and the smallest in *min1. With count <= 0 the outputs are
 * INT_MIN and INT_MAX respectively. Always returns 0.
 */
int _max_min_cpu(int *ptr, int count, int *max1, int *min1)
{
    int largest = INT_MIN;
    int smallest = INT_MAX;

    for (int pos = 0; pos < count; ++pos)
    {
        const int value = ptr[pos];
        if (value > largest) largest = value;
        if (value < smallest) smallest = value;
    }

    *max1 = largest;
    *min1 = smallest;
    return 0;
}
/*
 * Reseeds the C random generator from the current time, then fills the first
 * `count` slots of `ptr` with rand() values, negating the value stored at
 * every even index so the buffer holds both signs.
 */
void _init(int *ptr, int count)
{
    //time-based seed, so every run gets different data
    srand((uint32_t)time(NULL));

    for (int slot = 0; slot < count; ++slot)
    {
        const int value = rand();
        const bool even_slot = (slot % 2 == 0);
        ptr[slot] = even_slot ? (0 - value) : value;
    }
}
/*
 * Returns the current gettimeofday() reading as seconds (with a fractional
 * microsecond part) in a double.
 */
double get_time()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fractional = (double)now.tv_usec * 0.000001;
    return fractional + now.tv_sec;
}
/*
 * Test driver: fills `source`, finds its max and min on the GPU with
 * _sum_min_or_max (results in final_result[0] / final_result[1]) and on the
 * CPU with _max_min_cpu, compares the results and prints the wall-clock time
 * of each.
 *
 * NOTE(review): CHECK is not defined in this listing; it is presumably a
 * macro that reports a non-success cudaError_t - confirm against the project
 * header that provides it.
 */
int main()
{
    fprintf(stderr, "filling the buffer with %d elements...\n", N);
    _init(source, N);

    //Now we are going to kick start your kernel.
    CHECK(cudaDeviceSynchronize()); //steady! ready! go!

    fprintf(stderr, "Running on GPU...\n");

    double t0 = get_time();
    _sum_min_or_max<<<BLOCKS, BLOCK_SIZE>>>(source, N,final_result);
    CHECK(cudaGetLastError());      //checking for launch failures
    CHECK(cudaDeviceSynchronize()); //checking for run-time failures
    double t1 = get_time();

    fprintf(stderr, " GPU max: %d\n", final_result[0]);
    fprintf(stderr, " GPU min: %d\n", final_result[1]);

    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");

    int cpu_max = 0;
    int cpu_min = 0;
    double t2 = get_time();
    _max_min_cpu(source, N, &cpu_max, &cpu_min);
    //stop the clock before printing so console I/O is not billed to the CPU scan
    double t3 = get_time();

    printf(" CPU max = %d\n", cpu_max);
    printf(" CPU min = %d\n", cpu_min);

    //******The last judgement**********
    if (final_result[0] == cpu_max && final_result[1] == cpu_min)
    {
        fprintf(stderr, "Test Passed!\n");
    }
    else
    {
        fprintf(stderr, "Test failed!\n");
    }

    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}
(在Quadro P1000上运行)
这个问题,就留给大家思索了!听说阅读 樊哲勇老师的小红书《CUDA 编程:基础与实践》可以找到解决之路哦~~
