#include #include #include #include #include "mpi.h" /* do not include cblas.h but but in hard what I need */ /*#include "cblas.h"*/ enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114}; extern void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY); extern double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY); extern double cblas_dnrm2(const int N, const double *X, const int incX); extern void cblas_dscal(const int N, const double alpha, double *X, const int incX); extern void cblas_dgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY); int main(int argc, char **argv){ int i, j, index, m, mloc, my_rank, n, pool_size, seed; double *A, *Asave, alpha, beta, orthlevel, *R, represlevel; double time; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &pool_size); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); m = 100; n = 10; for( i = 1; i < argc; i++ ) { if( strcmp( argv[i], "-m" ) == 0 ) { m = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-n" ) == 0 ) { n = atoi(argv[i+1]); i++; } } seed = my_rank*m*n; srand(seed); mloc = m / pool_size; if ( my_rank == 0 ) printf("Running MGS on %d processors with a matrix %dx%d (on each processor: %dx%d))\n", pool_size, m, n, mloc, n); A = (double *)malloc(mloc*n*sizeof(double)) ; if (A==NULL){ printf("error of memory allocation\n"); exit(0); } Asave = (double *)malloc(mloc*n*sizeof(double)) ; if (Asave==NULL){ printf("error of memory allocation\n"); exit(0); } R = (double *)malloc(n*n*sizeof(double)) ; if (R==NULL){ printf("error of memory allocation\n"); exit(0); } index = 0; for (i = 0; i < mloc; i++) { for (j = 0; j < n; j++) { A[index] = ((double) rand()) / ((double) RAND_MAX) - 0.5 ; Asave[index] = A[index] ; index++; } } for (i = 0; i < n*n; i++) { R[i] = 0e0; } /* Start MGS */ time =- MPI_Wtime(); for ( j = 0; j < n; j++) { for ( i = 0; i < j; i++) { alpha = cblas_ddot(mloc,&(A[i*mloc]),1,&(A[j*mloc]),1) ; MPI_Allreduce( &alpha, &(R[j*n+i]), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); cblas_daxpy(mloc,-R[j*n+i],&(A[i*mloc]),1,&(A[j*mloc]),1); } alpha = cblas_dnrm2(mloc,&(A[j*mloc]),1); alpha = alpha*alpha ; MPI_Allreduce( &alpha, &(R[j*n+j]), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); R[j*n+j] = sqrt(R[j*n+j]); cblas_dscal(mloc,1/R[j*n+j],&(A[j*mloc]),1); } time += MPI_Wtime(); /* And that's it .... */ if (my_rank==0) printf("MGS time (m=%6d,n=%6d,nb_proc=%4d) %20.3f \n",m,n,pool_size,time); /* Verify that the orthogonality ( Q^T Q = I ) is close the machine precision level */ orthlevel = 0.e0; for ( j = 0; j < n; j++) { for ( i = 0; i < j; i++) { alpha = cblas_ddot(mloc,&(A[i*mloc]),1,&(A[j*mloc]),1) ; MPI_Allreduce( &alpha, &beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); orthlevel = orthlevel + 2*beta*beta ; } alpha = cblas_dnrm2(mloc,&(A[j*mloc]),1); alpha = alpha*alpha; MPI_Allreduce( &alpha, &beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); orthlevel = orthlevel + (beta-1)*(beta-1) ; } orthlevel = sqrt(orthlevel) ; if (my_rank==0) printf(" || I - Q^T Q ||_{fro} = %e \n",orthlevel); /* Verify that the representativity of QR factorization ( A = QR ) is close the machine precision level */ represlevel = 0.e0; for ( j = 0; j < n; j++) { cblas_dgemv(CblasColMajor, CblasNoTrans, mloc, (j+1), 1.0e0, A, mloc, &(R[j*n]), 1, (-1.0e0), &(Asave[j*mloc]), 1); alpha = cblas_dnrm2(mloc,&(Asave[j*mloc]),1); alpha = alpha*alpha; MPI_Allreduce( &alpha, &beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); represlevel = represlevel + beta ; } represlevel = sqrt(represlevel) ; if (my_rank==0) printf(" || A - Q R ||_{fro} = %e \n",orthlevel); free(R); free(Asave); free(A); MPI_Finalize(); exit(0); }