/* matmul.mpi.c SJ */ /* matrix multiplication in a couple of ways */ #include #include #include #include #include #ifndef N #define N 100 #endif #ifndef PP #define PP 4 #endif #define NEW(type) ( (type*)malloc(sizeof(type)) ) typedef double matrix[N][N]; typedef double pmatrix[N/PP][N]; double ltime(); void matmul(matrix *A, matrix *B, matrix *C); void matmul_t(matrix *A, matrix *B, matrix *C); int matmul_t_omp(matrix *A, matrix *B, matrix *C); void matmul_t_b(matrix *A, matrix *B, matrix *C); void transpose(matrix *A, matrix *T); int main(int argc, char **argv) { int PID, P; int i, j; double starttime, endtime, seqtime = 0, partime; matrix *A = NULL, *B = NULL, *C = NULL, *T = NULL; /* space allocation */ A = NEW(matrix); B = NEW(matrix); C = NEW(matrix); /* create random matrices */ for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { (*A)[i][j] = (double)(i + j + 1); (*B)[i][j] = (double)(i + j); } } if (N <= 1000) { starttime = ltime(); /* sequential */ matmul(A, B, C); endtime = ltime(); seqtime = endtime-starttime; printf("Simple sequential %.3f s, %6.2lf MFLOPS, C(6, 9) = %lf \n", seqtime, ((double)N*N*N*2)/(1000000*seqtime), (*C)[6][9]); } if (N <= 2000) { /* transpose */ starttime = ltime(); T = NEW(matrix); /* sequential */ transpose(A, T); endtime = ltime(); seqtime = endtime-starttime; printf("Transpose only %.3f s, %6.2lf MFLOPS, T(6, 9) = %lf \n", seqtime, ((double)N*N)/(1000000*seqtime), (*T)[6][9]); /* sequential with transpose */ starttime = ltime(); matmul_t (A, B, C); endtime = ltime(); seqtime = endtime-starttime; printf("Sequential with transpose %.3f s, %6.2lf MFLOPS, C(6, 9) = %lf \n", seqtime, ((double)N*N*N*2)/(1000000*seqtime), (*C)[6][9]); /* OpenMP with transpose */ starttime = ltime(); int Nthreads = matmul_t_omp (A, B, C); endtime = ltime(); seqtime = endtime-starttime; printf("OpenMP (Threads=%d, procs=%d) with transpose %.3f s, %6.2lf MFLOPS, C(6, 9) = %lf \n", Nthreads, omp_get_num_procs(), seqtime, ((double)N*N*N*2)/(1000000*seqtime), (*C)[6][9]); /* sequential with transpose and blocking */ starttime = ltime(); matmul_t (A, B, C); endtime = ltime(); seqtime = endtime-starttime; printf("Sequential with transpose and blocking %.3f s, %6.2lf MFLOPS, C(6, 9) = %lf \n", seqtime, ((double)N*N*N*2)/(1000000*seqtime), (*C)[6][9]); } free(A); free(B); free(C); free(T); exit(0); } /* basic algorithm according to the defition */ void matmul(matrix *A, matrix *B, matrix *C) { int i, j, k; double c; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { c = 0.0; for (k = 0; k < N; k++) { c += (*A)[i][k] * (*B)[k][j]; } (*C)[i][j] = c; } } } /* transpose matrix A to T */ void transpose(matrix *A, matrix *T) { int i, j; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { (*T)[j][i] = (*A)[i][j]; } } } /* first transpose B, then compute matmul */ void matmul_t(matrix *A, matrix *B, matrix *C) { int i, j, k; double c; matrix *Bt; Bt = NEW(matrix); transpose(B, Bt); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { c = 0.0; for (k = 0; k < N; k++) { c += (*A)[i][k] * (*Bt)[j][k]; } (*C)[i][j] = c; } } free(Bt); } int matmul_t_omp(matrix *A, matrix *B, matrix *C) { int i, j, p; matrix *Bt; Bt = NEW(matrix); transpose(B, Bt); #pragma omp parallel for private(i, j) for (i = 0; i < N; i++) { p = omp_get_num_threads(); for (j = 0; j < N; j++) { int k; double c = 0.0; for (k = 0; k < N; k++) { c += (*A)[i][k] * (*Bt)[j][k]; } (*C)[i][j] = c; } } free(Bt); return p; } /* use A and Bt in stripes */ /* this helps cache usage in many architectures */ void matmul_t_b(matrix *A, matrix *B, matrix *C) { int i, ii, j, jj, k; double c; matrix *Bt; int block = 20; Bt = NEW(matrix); transpose(B, Bt); for (i = 0; i < N; i+=block) { for (j = 0; j < N; j+=block) { for (ii = i; ii < i+block && ii < N; ii++) { for (jj = j; jj < j+block && jj < N; jj++) { c = 0.0; for (k = 0; k < N; k++) { c += (*A)[ii][k] * (*Bt)[jj][k]; } (*C)[ii][jj] = c; } } } } free(Bt); } double ltime() { struct timeval tv; struct timezone tz; gettimeofday(&tv, &tz); return tv.tv_sec + (double)tv.tv_usec/1000000; }