#include #include #include #include double pi_seq(int64_t num_steps); double pi_openmp(int64_t num_steps); double pi_cuda(int64_t num_steps); double pi_cuda2(int64_t num_steps); __global__ void pi_kernel(int64_t num_steps, int64_t iterationsperthread, double step, double *sumresults); __global__ void pi_kernel2(int64_t num_steps, int64_t iterationsperthread, double step, double *sumresults); double ltime(); int main (int argc, char *argv[]) { int64_t steps = 1000000000L; double starttime, endtime, seqtime, partime, pi; if (argc > 1) steps = atoi(argv[1]); // run the same code twice to warm up the GPU (or processor) starttime = ltime(); pi = pi_seq(steps); endtime = ltime(); seqtime = endtime - starttime; printf("Sequential pi=%.10lf, time = %.6lf\n", pi, seqtime); starttime = ltime(); pi = pi_seq(steps); endtime = ltime(); seqtime = endtime - starttime; printf("Sequential pi=%.10lf, time = %.6lf\n", pi, seqtime); starttime = ltime(); pi = pi_openmp(steps); endtime = ltime(); partime = endtime - starttime; printf("OpenMP pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime)); starttime = ltime(); pi = pi_openmp(steps); endtime = ltime(); partime = endtime - starttime; printf("OpenMP pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime)); starttime = ltime(); pi = pi_cuda(steps); endtime = ltime(); partime = endtime - starttime; printf("CUDA pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime)); starttime = ltime(); pi = pi_cuda(steps); endtime = ltime(); partime = endtime - starttime; printf("CUDA pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime)); starttime = ltime(); pi = pi_cuda2(steps); endtime = ltime(); partime = endtime - starttime; printf("CUDA2 pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime)); starttime = ltime(); pi = pi_cuda2(steps); endtime = ltime(); partime = endtime - starttime; printf("CUDA2 pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime)); } double pi_seq(int64_t num_steps) { int64_t i; double step, x, pi, sum = 0.0; step = 1.0/(double) num_steps; for (i=0;i< num_steps; i++){ x = (i+0.5)*step; sum = sum + 4.0/(1.0+x*x); } pi = step * sum; return pi; } double pi_openmp(int64_t num_steps) { int64_t i; double step, x, pi, sum = 0.0; step = 1.0/(double) num_steps; #pragma omp parallel for private(x) reduction(+:sum) for (i=0; i>>(num_steps, iterationsperthread, step, d_sumresults); cudaMemcpy(h_sumresults, d_sumresults, sumresultssize, cudaMemcpyDeviceToHost); for (i = 0; i < blocks; i++) sum += h_sumresults[i]; pi = step * sum; cudaFree(d_sumresults); free(h_sumresults); return pi; } double pi_cuda2(int64_t num_steps) { int64_t blocks = BLOCKS; int64_t iterationsperthread = num_steps / (THREADSPERBLOCK*BLOCKS) + 1; int sumresultssize = blocks * sizeof(double); double *h_sumresults = (double*) malloc(sumresultssize); double *d_sumresults; double sum = 0.0; double step = 1.0/(double) num_steps; double pi; int i; cudaMalloc((void **)&d_sumresults, sumresultssize); pi_kernel2<<>>(num_steps, iterationsperthread, step, d_sumresults); cudaMemcpy(h_sumresults, d_sumresults, sumresultssize, cudaMemcpyDeviceToHost); for (i = 0; i < blocks; i++) sum += h_sumresults[i]; pi = step * sum; cudaFree(d_sumresults); free(h_sumresults); return pi; } // simple variant where TID 0 computes the sum for each block __global__ void pi_kernel(int64_t num_steps, int64_t iterationsperthread, double step, double *sumresults) { int tindex = threadIdx.x + blockIdx.x * blockDim.x; int64_t start = (int64_t)tindex * iterationsperthread; int64_t end = start + iterationsperthread; __shared__ double sums[THREADSPERBLOCK]; double sum = 0.0, x; int64_t i; int j; if (end > num_steps) end = num_steps; for (i = start; i < end; i++) { x = (i+0.5)*step; sum = sum + 4.0/(1.0+x*x); } sums[threadIdx.x] = sum; __syncthreads(); if (threadIdx.x == 0) { sum = 0.0; for (j = 0; j < THREADSPERBLOCK; j++) sum += sums[j]; sumresults[blockIdx.x] = sum; } } // improved variant where sum is computed in two stages __global__ void pi_kernel2(int64_t num_steps, int64_t iterationsperthread, double step, double *sumresults) { int tindex = threadIdx.x + blockIdx.x * blockDim.x; int64_t start = (int64_t)tindex * iterationsperthread; int64_t end = start + iterationsperthread; __shared__ double sums[THREADSPERBLOCK]; __shared__ double bsums[TBLOCK]; double sum = 0.0, x; int64_t i; int j, k; if (end > num_steps) end = num_steps; for (i = start; i < end; i++) { x = (i+0.5)*step; sum = sum + 4.0/(1.0+x*x); } sums[threadIdx.x] = sum; if (threadIdx.x < TBLOCK) bsums[threadIdx.x] = 0.0; __syncthreads(); if (threadIdx.x < TBLOCK) { sum = 0.0; k = threadIdx.x * TBLOCK; for (j = 0; j < TBLOCK; j++) sum += sums[k+j]; bsums[threadIdx.x] = sum; } __syncthreads(); if (threadIdx.x == 0) { sum = 0.0; for (j = 0; j < TBLOCK; j++) sum += bsums[j]; sumresults[blockIdx.x] = sum; } } double ltime() { return omp_get_wtime(); }