int main (int argc, char *argv[]) { double *A,*A2,*L,*U, temp2; int i,j,k; int temp=0; int offset = 0; double t1,t2; if( argc > 1 ) N = atoi(argv[1]); if( argc > 2 ) //Block = atoi(argv[2]); M = atoi(argv[2]); A = (double *)malloc (N*N*sizeof(double)); A2 = (double *)malloc (N*N*sizeof(double)); L = (double *)malloc (N*N*sizeof(double)); U = (double *)malloc (N*N*sizeof(double)); if( A==NULL || A2==NULL || L==NULL || U==NULL) { printf("Can't allocate memory\n"); exit(1); } /* INITIALIZATION */ //InitMatrix(A,N); InitMatrix3(A,N); for(i=0; i<N*N; i++) { A2[i] = A[i]; // Copy of A for verification of correctness L[i] = 0; U[i] = 0; } /* /\* LU DECOMPOSITION *\/ */ /* for (k=0;k<N-1;k++){ */ /* for (i=k+1;i<N;i++){ */ /* A[i*N+k] = A[i*N+k]/A[k*N+k]; */ /* /\* for (i=k+1;i<N;i++) *\/ */ /* for (j=k+1;j<N;j++) */ /* A[i*N+j] = A[i*N+j] - A[i*N+k]*A[k*N+j]; */ /* } */ /* } */ int *sizedim; int *start; int R; //Remain int itr = 0; sizedim = (int*)malloc(M*sizeof(int)); start = (int*)malloc(M*sizeof(int)); R = N; t1 = GetTickCount(); #pragma omp parallel { //printf("The number of thread: %d\n", omp_get_num_threads()); #pragma omp master { while (N-offset>M){ // printf(" Iteration: %d\n", itr++); for (i=0;i<M;i++){ if (i<R%M){ sizedim[i]=R/M+1; start[i]=(R/M+1)*i; } else{ sizedim[i]=R/M; start[i]=(R/M+1)*(R%M)+(R/M)*(i-R%M); } //printf("%i,%i \n",sizedim[i],start[i]); } //Print_Matrix(sizedim,1,M); stage1(A, offset, sizedim, start, N, M); //Print_Matrix(A,N,N); stage2(A, offset, sizedim, start, N, M); //Print_Matrix(A,N,N); stage3(A, offset, sizedim, start, N, M); offset+=sizedim[0]; R=R-sizedim[0]; //Print_Matrix(A,N,N); } //while } //master } //omp parallel ProcessDiagonalBlock(&A[offset*N+offset], N-offset, N); t2 = GetTickCount(); printf("Time for LU-decomposition in secs: %f \n", (t2-t1)/1000000); //Print_Matrix(A,N,N); /* while (N-offset>Block){ */ /* stepLU(A,Block,offset,N); */ /* offset+=Block; */ /* } */ /* ProcessDiagonalBlock(&A[offset*N+offset], N-offset, N); 
*/ //Print_Matrix(A,N,N); #ifdef CHECK /* PROOF OF CORRECTNESS */ for (i=0;i<N;i++) for (j=0;j<N;j++) if (i>j) L[i*N+j] = A[i*N+j]; else U[i*N+j] = A[i*N+j]; for (i=0;i<N;i++) L[i*N+i] = 1; //printf("L=\n"); //Print_Matrix(L,N,N); //printf("U=\n"); //Print_Matrix(U,N,N); for (i=0;i<N;i++) for (j=0;j<N;j++){ temp2=0; for (k=0;k<N;k++) temp2+=L[i*N+k]*U[k*N+j]; if ((A2[i*N+j]-temp2)/A2[i*N+j] >0.1 || (A2[i*N+j]-temp2)/A2[i*N+j] <-0.1) { temp++; printf("Error at: [%d, %d\n]",i,j); } } printf("Errors = %d \n", temp); #endif return 0; }
/*
 * Driver for a blocked, in-place LU decomposition of an N x N row-major
 * matrix of doubles.
 *
 * Command line (both required):
 *   argv[1]  matrix dimension, stored in the file-level N
 *   argv[2]  number of blocks per dimension, stored in the file-level M
 *
 * The factorisation overwrites A. When compiled with -DCHECK, the strictly
 * lower part of A (with an implied unit diagonal) is taken as L, the upper
 * part including the diagonal as U, and L*U is compared element-wise against
 * a pristine copy of the input with a 10% relative tolerance; only the number
 * of mismatches is reported.
 *
 * Returns 0 on completion; exits with status 1 on bad usage, invalid sizes
 * or when memory cannot be allocated.
 */
int main (int argc, char *argv[])
{
    double *A, *A2, *L, *U;
    int *sizedim;   /* sizedim[b]: extent of piece b of the remaining R rows */
    int *start;     /* start[b]: first index of piece b, relative to offset */
    int offset = 0; /* rows/columns already factorised */
    int R;          /* rows/columns still to be factorised (N - offset) */
    double t1, t2;

    if (argc < 3) {
        printf("Usage: ./lu <Matrix size> <number of blocks per dimension>\n");
        exit(1);
    }
    N = atoi(argv[1]);
    M = atoi(argv[2]);

    /* M == 0 would divide by zero in the partitioning below; non-positive
     * sizes are meaningless for both the matrix and the block count. */
    if (N <= 0 || M <= 0) {
        printf("Matrix size and number of blocks must be positive integers\n");
        exit(1);
    }

    /* Do the size arithmetic in size_t so that N*N cannot overflow int. */
    const size_t elems = (size_t)N * (size_t)N;

    A       = malloc(elems * sizeof *A);
    A2      = malloc(elems * sizeof *A2);
    L       = malloc(elems * sizeof *L);
    U       = malloc(elems * sizeof *U);
    sizedim = malloc((size_t)M * sizeof *sizedim);
    start   = malloc((size_t)M * sizeof *start);
    if (A == NULL || A2 == NULL || L == NULL || U == NULL ||
        sizedim == NULL || start == NULL) {
        printf("Can't allocate memory\n");
        exit(1);
    }

    /* INITIALIZATION */
    InitMatrix3(A, N);
    for (size_t e = 0; e < elems; e++) {
        A2[e] = A[e];   /* copy of A for verification of correctness */
        L[e] = 0;
        U[e] = 0;
    }

    R = N;
    t1 = GetTickCount();

    /*
     * Only the master thread drives the outer loop.
     * NOTE(review): the remaining threads of the team are presumably used by
     * work created inside stage1..stage3 (defined elsewhere) - confirm.
     */
#pragma omp parallel
    {
#pragma omp master
        {
            while (N - offset > M) {
                /* Split the remaining R rows into M contiguous pieces: the
                 * first R%M pieces get one extra row so that sizes differ by
                 * at most one. */
                const int base  = R / M;
                const int extra = R % M;
                for (int b = 0; b < M; b++) {
                    if (b < extra) {
                        sizedim[b] = base + 1;
                        start[b]   = (base + 1) * b;
                    } else {
                        sizedim[b] = base;
                        start[b]   = (base + 1) * extra + base * (b - extra);
                    }
                }

                /* NOTE(review): the three stages are opaque from here; they
                 * are called in this order on the sub-matrix starting at
                 * (offset, offset). */
                stage1(A, offset, sizedim, start, N, M);
                stage2(A, offset, sizedim, start, N, M);
                stage3(A, offset, sizedim, start, N, M);

                /* The first piece is finished; continue on what is left. */
                offset += sizedim[0];
                R      -= sizedim[0];
            } /* end of while */
        } /* end of master */
    } /* end of parallel region */

    /* Handle the trailing (N-offset) x (N-offset) block that is too small to
     * be split into M pieces again. */
    ProcessDiagonalBlock(&A[(size_t)offset * (size_t)N + (size_t)offset],
                         N - offset, N);

    t2 = GetTickCount();
    /* NOTE(review): the division implies GetTickCount() reports microseconds
     * - confirm against its definition. */
    printf("Time for LU-decomposition in secs: %f \n", (t2-t1)/1000000);

#ifdef CHECK
    /* PROOF OF CORRECTNESS */
    {
        int errors = 0;

        /* Unpack the in-place result into separate L and U matrices. */
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                const size_t idx = (size_t)i * (size_t)N + (size_t)j;
                if (i > j)
                    L[idx] = A[idx];
                else
                    U[idx] = A[idx];
            }
        }
        for (int i = 0; i < N; i++)
            L[(size_t)i * (size_t)N + (size_t)i] = 1;

        /* Recompute L*U and compare with the original input. */
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                const size_t idx = (size_t)i * (size_t)N + (size_t)j;
                double sum = 0;
                for (int k = 0; k < N; k++)
                    sum += L[(size_t)i * (size_t)N + (size_t)k]
                         * U[(size_t)k * (size_t)N + (size_t)j];

                const double rel = (A2[idx] - sum) / A2[idx];
                if (rel > 0.1 || rel < -0.1)
                    errors++;
            }
        }
        printf("Errors = %d \n", errors);
    }
#endif

    free(start);
    free(sizedim);
    free(U);
    free(L);
    free(A2);
    free(A);

    /* main is declared int: a bare `return;` is a constraint violation. */
    return 0;
}