/*
 * Approximates pi by midpoint-rule integration of 4 / (1 + x^2) over [0, 1]
 * in three ways (sequential, OpenMP, CUDA) and prints the result, the wall
 * time and the speedup of the parallel versions over the sequential one.
 *
 * Build note: the OpenMP pragma and omp_get_wtime() need OpenMP enabled in
 * the host compiler (e.g. nvcc -Xcompiler -fopenmp).
 */
#include <errno.h>
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Threads per block; also the size of the kernel's shared array.
 * NOTE(review): the original definition was not visible in this file,
 * 256 is an assumed value -- confirm.
 */
#ifndef THREADSPERBLOCK
#define THREADSPERBLOCK 256
#endif

/*
 * Upper bound on the number of blocks launched by pi_cuda().
 * NOTE(review): assumed value, tune for the target GPU.
 */
#ifndef MAXBLOCKS
#define MAXBLOCKS 1024
#endif

/* Aborts with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

double pi_seq(int64_t num_steps);
double pi_openmp(int64_t num_steps);
double pi_cuda(int64_t num_steps);
__global__ void pi_kernel(int64_t num_steps, int64_t iterationsperthread,
                          double step, double *sumresults);
double ltime();

/*
 * Entry point. An optional first argument overrides the number of
 * integration steps; it must be a positive decimal integer.
 */
int main(int argc, char *argv[])
{
    int64_t steps = 1000000000L;
    double starttime, endtime, seqtime, partime, pi;

    if (argc > 1) {
        /* strtoll keeps the full 64-bit range (atoi would truncate to int)
           and lets us reject garbage, zero and negative step counts. */
        char *end = NULL;
        errno = 0;
        long long parsed = strtoll(argv[1], &end, 10);
        if (errno != 0 || end == argv[1] || *end != '\0' || parsed <= 0) {
            fprintf(stderr, "usage: %s [num_steps > 0]\n", argv[0]);
            return EXIT_FAILURE;
        }
        steps = (int64_t)parsed;
    }

    starttime = ltime();
    pi = pi_seq(steps);
    endtime = ltime();
    seqtime = endtime - starttime;
    printf("Sequential pi=%.10lf, time = %.6lf\n", pi, seqtime);

    starttime = ltime();
    pi = pi_openmp(steps);
    endtime = ltime();
    partime = endtime - starttime;
    printf("OpenMP pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime));

    starttime = ltime();
    pi = pi_cuda(steps);
    endtime = ltime();
    partime = endtime - starttime;
    printf("CUDA pi=%.10lf, time = %.6lf, speedup=%.2lf\n", pi, partime, (seqtime / partime));

    return 0;
}

/* Sequential midpoint-rule approximation of pi using num_steps intervals. */
double pi_seq(int64_t num_steps)
{
    int64_t i;
    double step, x, pi, sum = 0.0;

    step = 1.0 / (double) num_steps;
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;          /* midpoint of interval i */
        sum = sum + 4.0 / (1.0 + x * x);
    }
    pi = step * sum;
    return pi;
}

/*
 * Same computation as pi_seq(), with the loop split across OpenMP threads.
 * Each thread accumulates a private partial sum that the reduction clause
 * combines at the end of the loop.
 */
double pi_openmp(int64_t num_steps)
{
    int64_t i;
    double step, x, pi, sum = 0.0;

    step = 1.0 / (double) num_steps;
    #pragma omp parallel for private(x) reduction(+:sum)
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x * x);
    }
    pi = step * sum;
    return pi;
}

/*
 * CUDA version: launches pi_kernel so that every thread sums a contiguous
 * section of the intervals, each block writes one partial sum, and the host
 * adds the per-block partial sums.
 *
 * Returns 0.0 when num_steps is not positive (nothing to integrate).
 */
double pi_cuda(int64_t num_steps)
{
    if (num_steps <= 0)
        return 0.0;

    const double step = 1.0 / (double) num_steps;

    /* Use at most MAXBLOCKS blocks, fewer when there is not enough work.
       (n - 1) / d + 1 is ceil(n / d) without overflow for n >= 1. */
    const int64_t blocksneeded = (num_steps - 1) / THREADSPERBLOCK + 1;
    const int numblocks = (blocksneeded < MAXBLOCKS) ? (int) blocksneeded : MAXBLOCKS;
    const int64_t totalthreads = (int64_t) numblocks * THREADSPERBLOCK;
    const int64_t iterationsperthread = (num_steps - 1) / totalthreads + 1;
    const size_t resultbytes = (size_t) numblocks * sizeof(double);

    /* one partial sum per block, on host and on device */
    double *h_sumresults = (double *) malloc(resultbytes);
    if (h_sumresults == NULL) {
        fprintf(stderr, "pi_cuda: out of host memory\n");
        exit(EXIT_FAILURE);
    }
    double *d_sumresults = NULL;
    CUDA_CHECK(cudaMalloc((void **) &d_sumresults, resultbytes));

    pi_kernel<<<numblocks, THREADSPERBLOCK>>>(num_steps, iterationsperthread, step, d_sumresults);
    CUDA_CHECK(cudaGetLastError());   /* catches invalid launch configurations */

    /* copy results back to host; this blocking copy also waits for the
       kernel and reports errors raised while it was running */
    CUDA_CHECK(cudaMemcpy(h_sumresults, d_sumresults, resultbytes, cudaMemcpyDeviceToHost));

    /* sum the results of each block */
    double sum = 0.0;
    for (int b = 0; b < numblocks; b++)
        sum += h_sumresults[b];
    const double pi = step * sum;

    /* free dynamically allocated memory */
    CUDA_CHECK(cudaFree(d_sumresults));
    free(h_sumresults);

    return pi;
}

/*
 * Each thread sums 4 / (1 + x^2) over its own contiguous section of
 * iterationsperthread intervals; thread 0 of every block then adds the
 * block's per-thread sums and stores the total in sumresults[blockIdx.x].
 *
 * Launch requirements: 1D grid and 1D blocks, blockDim.x <= THREADSPERBLOCK
 * (size of the static shared array), and sumresults must hold gridDim.x
 * doubles in device memory.
 */
__global__ void pi_kernel(int64_t num_steps, int64_t iterationsperthread,
                          double step, double *sumresults)
{
    /* global index of each thread; threadIdx.x is the index within a block.
       Computed in 64 bits so the multiplication below cannot overflow int. */
    const int64_t tindex = threadIdx.x + (int64_t) blockIdx.x * blockDim.x;

    /* section [first, last) of intervals this thread sums; threads past the
       end of the range get an empty section and contribute 0 */
    const int64_t first = tindex * iterationsperthread;
    int64_t last = first + iterationsperthread;
    if (last > num_steps)
        last = num_steps;

    /* shared array of sums of each thread in a block */
    __shared__ double sums[THREADSPERBLOCK];

    double sum = 0.0;
    for (int64_t i = first; i < last; i++) {
        const double x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }

    /* store result to shared array */
    sums[threadIdx.x] = sum;
    __syncthreads();   /* all partial sums must be visible before thread 0 reads them */

    if (threadIdx.x == 0) {
        /* thread 0 in a block sums the total result of the block
           and stores it to the global array of subresults */
        double blocksum = 0.0;
        for (unsigned int t = 0; t < blockDim.x; t++)
            blocksum += sums[t];
        sumresults[blockIdx.x] = blocksum;
    }
}

/* Wall-clock time in seconds, as reported by omp_get_wtime(). */
double ltime()
{
    return omp_get_wtime();
}