/*
 * Computes out = a * b for two n x n matrices stored block by block,
 * using blocks of side m.
 *
 * Storage layout, as implied by the offsets used below (identical for a, b
 * and out):
 *   - block row i starts at offset i * m * n;
 *   - inside a block row the blocks follow one another, each taking
 *     (rows of that block row) * (columns of the block) doubles;
 *   - when n is not a multiple of m there is one extra, narrower block
 *     row / block column of width r = n - (n / m) * m.
 *
 * Parameters:
 *   a, b - input matrices in the layout above (not modified here);
 *   out  - result matrix, fully overwritten;
 *   n    - matrix side;
 *   m    - block side, must be positive.
 *
 * Returns 0 on success, -1 for invalid sizes (m <= 0 or n < 0), or the
 * non-zero code reported by zeroMatrix when clearing `out` fails.
 *
 * NOTE(review): the status codes of simpleMatrixMultiply / addToMatrix are
 * still not inspected (the original stored them in a variable it never read);
 * their failure convention is not visible from here - confirm before
 * propagating them.
 */
int blockMatrixMultiply(double* a, double* b, double* out, int n, int m){
    // n / m is undefined for m == 0, and negative sizes make no sense here.
    if (m <= 0 || n < 0){
        return -1;
    }

    const int s = n / m;                               // number of full blocks per side
    const int r = n - s * m;                           // side of the trailing partial block (0 if none)
    const int blocks_per_side = (r > 0) ? s + 1 : s;
    const int string_size = m * n;                     // doubles in one full block row

    int res = zeroMatrix(out, n * n);
    if (res != 0){
        printf("zeroMatrix failed!\n\t--blockmatrixmultiply\n");
        // Accumulating into a buffer that was not cleared would give a wrong
        // product, so stop here instead of carrying on.
        return res;
    }

    // Scratch space for one block product; m * m covers the largest block.
    double* const temp_block = new double[m * m];

    for (int i = 0; i < blocks_per_side; i++){
        const int rows = (i < s) ? m : r;              // height of block row i
        for (int j = 0; j < blocks_per_side; j++){
            const int cols = (j < s) ? m : r;          // width of block column j
            double* const out_block = out + i * string_size + j * rows * m;
            for (int k = 0; k < blocks_per_side; k++){
                const int inner = (k < s) ? m : r;     // shared dimension of a_{ik} and b_{kj}
                simpleMatrixMultiply(a + i * string_size + k * rows * m,
                                     b + k * string_size + j * inner * m,
                                     temp_block, rows, inner, cols);
                addToMatrix(out_block, temp_block, rows * cols);
            }
        }
    }

    delete[] temp_block;
    return 0;
}
/*
 * out = a * b_{ij}, where `a` already points at the left operand block and
 * b_{ij} is block (i, j) of the block-stored matrix `b`.
 *
 * The left operand is passed to simpleMatrixMultiply with dimensions r and r,
 * and the block of `b` is located with a per-block stride of m * r.
 *
 * Parameters (per the original notes):
 *   k - n / m, the number of full blocks per side;
 *   m - block size;
 *   n - matrix side;
 *   r - size of the trailing partial block (the original note reads
 *       "r = m%n"; presumably n % m is meant - confirm).
 */
void blocksMultiplyLast(double *a, double *b, int i, int j, double *out, int k, int m, int n, int r){
    /* Last dimension argument: m for a full block column, r for the trailing one. */
    int last_dim;
    if (j < k){
        last_dim = m;
    }
    else{
        last_dim = r;
    }

    double *b_block = b + i*m*n + j*m*r;
    simpleMatrixMultiply(a, b_block, out, r, r, last_dim);
}
/*
 * out = a_{pq} * b_{ij}: multiplies block (p, q) of the block-stored matrix
 * `a` by block (i, j) of the block-stored matrix `b`.
 *
 * A block index below k selects a full block of side m; any other index
 * selects the trailing block of side r.
 *
 * Parameters (per the original notes):
 *   k - n / m, the number of full blocks per side;
 *   m - block size;
 *   n - matrix side;
 *   r - size of the trailing partial block (the original note reads
 *       "r = m%n"; presumably n % m is meant - confirm).
 */
void blocksMultiply(double *a, int p, int q, double *b, int i, int j, double *out, int k, int m, int n, int r){
    const int dim_p = (p < k) ? m : r;   /* first dimension passed on; also the block stride in row p of a */
    const int dim_q = (q < k) ? m : r;   /* second dimension passed on */
    const int dim_j = (j < k) ? m : r;   /* third dimension passed on */
    const int dim_i = (i < k) ? m : r;   /* block stride factor in row i of b */

    double *a_block = a + p*m*n + q*m*dim_p;
    double *b_block = b + i*m*n + j*m*dim_i;

    simpleMatrixMultiply(a_block, b_block, out, dim_p, dim_q, dim_j);
}
int main(int argc, const char* argv[]) { srand((unsigned int)time(NULL)); if (argc >= 2) { int m_size = atoi(argv[1]); int debug = 0, retVal = 0, test = 0; double startTime = 0, endTime = 0, time; //Get needed info for the multiplication if (argc == 3) { if (strncmp(argv[2], "-d", 2) == 0) debug = 1; else if (strncmp(argv[2], "-t", 2) == 0) test = 1; } else if (argc == 4) { if (strncmp(argv[2], "-d", 2) == 0 || strncmp(argv[3], "-d", 2) == 0) debug = 1; if (strncmp(argv[2], "-t", 2) == 0 || strncmp(argv[3], "-t", 2) == 0) test = 1; } double * __attribute__((aligned(16))) a = generateRandomMatrixOfSize(m_size); double * __attribute__((aligned(16))) b = generateRandomMatrixOfSize(m_size); double * __attribute__((aligned(16))) c = matrixOfZerosWithSize(m_size); //Multiply if (test) { double * __attribute__((aligned(16))) test_c = matrixOfZerosWithSize(m_size); simpleMatrixMultiply(a, b, test_c, m_size); matrixMultiply(a, b, c, m_size); FILE *correct = fopen("correct.txt", "w"); saveMatrixToFile(test_c, m_size, correct); fclose(correct); FILE *mine = fopen("mine.txt", "w"); saveMatrixToFile(c, m_size, mine); fclose(mine); } else {
int gaussInvert(double *a, double *b, int matrix_side, int block_side, int total_pr, int current_pr, int* blocks_order_reversed, int* blocks_order, double* buf_1, double* buf_2, double* buf_string, double* buf_string_2){ MPI_Status status; int first_row, first_row_proc_id, last_row_c; int current_row, current_row_proc_id; int start_nonzero_a, nonzero_a_size; int total_block_rows, total_full_block_rows, block_size, block_string_size; int max_block_rows_pp, max_rows_pp, short_block_string_size, last_block_row_proc_id, last_block_row_in_current_pr; int small_block_row_width, small_block_size, current_pr_full_rows, last_block_row_width, matrix_size_current_pr; int buf_size; int i, j, k, j1, min_j, min_k_global; int res; double temp=-1.; mainBlockInfo in, out; initParameters(matrix_side, block_side, total_pr, current_pr, &total_block_rows, &total_full_block_rows, &block_size, &block_string_size, &max_block_rows_pp, &max_rows_pp, &short_block_string_size, &last_block_row_proc_id, &last_block_row_in_current_pr, &small_block_row_width, &small_block_size, ¤t_pr_full_rows, &last_block_row_width, &matrix_size_current_pr); buf_size = 2 * block_string_size; in.rank = current_pr; in.minnorm = 0.; in.label = 0; in.min_k = 0; for (i=0; i<buf_size; i++){ buf_string[i] = 0.; buf_string_2[i] = 0.; } for (i=0; i<total_full_block_rows; i++){ start_nonzero_a = block_size*(i); nonzero_a_size = block_string_size - start_nonzero_a; buf_size = block_string_size + nonzero_a_size; first_row = (i+total_pr-1-current_pr)/total_pr; first_row_proc_id = i%total_pr; min_j = 0; min_k_global = 0; in.minnorm = 0.; in.rank = current_pr; in.label = 0; in.min_k = i; temp = 0.; for (j=first_row; j<current_pr_full_rows; j++){ for (k=i; k<total_full_block_rows; k++){ res = simpleInvert(a + j*block_string_size + k*block_size, buf_1, buf_2, block_side); if (!res) { temp = matrixNorm(buf_1, block_side); if (in.label){ if (temp<in.minnorm){ in.minnorm = temp; min_j=j; in.min_k = k; } } else{ in.label = 1; 
in.minnorm = temp; min_j=j; in.min_k = k; } } } } #ifdef WO_PIVOT_SEARCH_ATALL if (current_pr==first_row_proc_id){ min_j=first_row; in.minnorm = 1.; in.min_k=i; in.label=1; } #endif //rewrite MPI_Allreduce(&in, &out, 1, MPI_mainBlockInfo, MPI_searchMainBlock, MPI_COMM_WORLD); if (out.label==0){ if (current_pr==0){ printf("Main block not found!\n\t -- Step %d\n", i); } fflush(stdout); return -1; } #ifdef DEBUG_MODE if (current_pr == first_row_proc_id){ printf("**\nOUT RANK %d\n", out.rank); printf("IN RANK %d\n**\n", in.rank); fflush(stdout); } #endif min_k_global = out.min_k; #ifdef W_FULL_PIVOT_SEARCH for (j=0; j<max_block_rows_pp; j++){ swapMatrix(a + j*block_string_size + i*block_size, a + j*block_string_size + min_k_global*block_size, block_size); } #endif temp = blocks_order[i]; blocks_order[i]=blocks_order[min_k_global]; blocks_order[min_k_global]=temp; //for debug purposes: #ifdef WO_PIVOT_SEARCH_ATALL out.rank = first_row_proc_id; //don't forget about it #endif /***************Multiply string by the main block*****************/ if (current_pr==out.rank){ simpleInvert(a + min_j*block_string_size + i*block_size, buf_1, buf_2, block_side); for (j=i+1; j<total_full_block_rows; j++){ simpleMatrixMultiply(buf_1, a + min_j*block_string_size + j*block_size, buf_2, block_side, block_side, block_side); copyMatrix(buf_2, a + min_j*block_string_size + j*block_size, block_size); } for (j=0; j<total_full_block_rows; j++){ simpleMatrixMultiply(buf_1, b + min_j*block_string_size + j*block_size, buf_2, block_side, block_side, block_side); copyMatrix(buf_2, b + min_j*block_string_size + j*block_size, block_size); } if(small_block_row_width){ simpleMatrixMultiply(buf_1, b + min_j*block_string_size + total_full_block_rows*block_size, buf_2, block_side, block_side, small_block_row_width); copyMatrix(buf_2, b + min_j*block_string_size + total_full_block_rows*block_size, small_block_size); simpleMatrixMultiply(buf_1, a + min_j*block_string_size + total_full_block_rows*block_size, 
buf_2, block_side, block_side, small_block_row_width); copyMatrix(buf_2, a + min_j*block_string_size + total_full_block_rows*block_size, small_block_size); } for (j=0; j<block_string_size; j++){ //buf_string[j]=a[min_j*block_string_size+j]; buf_string[j]=b[min_j*block_string_size+j]; } //for (j=0; j<block_string_size; j++){ for (j=start_nonzero_a, j1=block_string_size; j<block_string_size; j++, j1++){ //buf_string[j+block_string_size]=b[min_j*block_string_size+j]; buf_string[j1]=a[min_j*block_string_size+j]; } } //MPI_Bcast(buf_string, buf_size, MPI_DOUBLE, out.rank, MPI_COMM_WORLD); MPI_Bcast(buf_string, buf_size, MPI_DOUBLE, out.rank, MPI_COMM_WORLD); if (out.rank!=first_row_proc_id){ if (current_pr==first_row_proc_id){ for (j=0; j<block_string_size; j++){ //buf_string_2[j]=a[first_row*block_string_size+j]; buf_string_2[j]=b[first_row*block_string_size+j]; //a[first_row*block_string_size+j]=buf_string[j]; b[first_row*block_string_size+j]=buf_string[j]; } //for (j=0; j<block_string_size; j++){ for (j=start_nonzero_a, j1=block_string_size; j<block_string_size; j++, j1++){ //buf_string_2[j+block_string_size]=b[first_row*block_string_size+j]; buf_string_2[j1]=a[first_row*block_string_size+j]; //b[first_row*block_string_size+j]=buf_string[j+block_string_size]; a[first_row*block_string_size+j]=buf_string[j1]; } //MPI_Send(buf_string_2, buf_size, MPI_DOUBLE, out.rank, 42, MPI_COMM_WORLD); MPI_Send(buf_string_2, buf_size, MPI_DOUBLE, out.rank, 42, MPI_COMM_WORLD); } if(current_pr==out.rank){ //MPI_Recv(buf_string_2, buf_size, MPI_DOUBLE, first_row_proc_id, 42, MPI_COMM_WORLD, &status); MPI_Recv(buf_string_2, buf_size, MPI_DOUBLE, first_row_proc_id, 42, MPI_COMM_WORLD, &status); for (j=0; j<block_string_size; j++){ //a[min_j*block_string_size+j]=buf_string_2[j]; b[min_j*block_string_size+j]=buf_string_2[j]; } //for (j=0; j<block_string_size; j++){ for (j=start_nonzero_a, j1=block_string_size; j<block_string_size; j++, j1++){ 
//b[min_j*block_string_size+j]=buf_string_2[j+block_string_size]; a[min_j*block_string_size+j]=buf_string_2[j1]; } } } else{ if(current_pr==out.rank){ if (min_j!=first_row){ for(j=0; j<block_string_size; j++){ temp=a[min_j*block_string_size+j]; a[min_j*block_string_size+j]=a[first_row*block_string_size+j]; a[first_row*block_string_size+j]=temp; } for(j=0; j<block_string_size; j++){ temp=b[min_j*block_string_size+j]; b[min_j*block_string_size+j]=b[first_row*block_string_size+j]; b[first_row*block_string_size+j]=temp; } } } } #ifdef DEBUG_MODE if (current_pr == out.rank){ printf("SUCCESS SEARCHING MAIN BLOCK STEP %d\n", i); fflush(stdout); } #endif if (current_pr == first_row_proc_id){ first_row++; } for (j=first_row; j<max_block_rows_pp; j++){ //for (k = i+1; k<total_full_block_rows; k++){ for (k = i+1, j1=1; k<total_full_block_rows; k++, j1++){ //simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + k*block_size, buf_2, block_side, block_side, block_side); simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + block_string_size + j1*block_size, buf_2, block_side, block_side, block_side); subtractFromMatrix(a + j*block_string_size + k*block_size, buf_2, block_size); //subtractFromMatrix(a + j*block_string_size + start_nonzero_a + j1*block_size, buf_2, block_size); } for (k = 0; k<total_full_block_rows; k++){ //simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + block_string_size + k*block_size, buf_2, block_side, block_side, block_side); simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + k*block_size, buf_2, block_side, block_side, block_side); subtractFromMatrix(b + j*block_string_size + k*block_size, buf_2, block_size); } if(small_block_row_width){ //simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + total_full_block_rows*block_size, buf_2, block_side, block_side, small_block_row_width); simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string 
+ buf_size - small_block_size, buf_2, block_side, block_side, small_block_row_width); subtractFromMatrix(a + j*block_string_size + total_full_block_rows*block_size, buf_2, small_block_size); //simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + total_full_block_rows*block_size + block_string_size, buf_2, block_side, block_side, small_block_row_width); simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + total_full_block_rows*block_size, buf_2, block_side, block_side, small_block_row_width); subtractFromMatrix(b + j*block_string_size + total_full_block_rows*block_size, buf_2, small_block_size); } } } if(small_block_row_width){ if (current_pr==last_block_row_proc_id){ simpleInvert(a + current_pr_full_rows*block_string_size + total_full_block_rows*block_size, buf_1, buf_2, small_block_row_width); for (k=0; k<total_full_block_rows; k++){ simpleMatrixMultiply(buf_1, b + current_pr_full_rows*block_string_size + k*block_size, buf_2, small_block_row_width, small_block_row_width, block_side); copyMatrix(buf_2, b + current_pr_full_rows*block_string_size + k*block_size, small_block_size); } simpleMatrixMultiply(buf_1, b + current_pr_full_rows*block_string_size + total_full_block_rows*block_size, buf_2, small_block_row_width, small_block_row_width, small_block_row_width); copyMatrix(buf_2, b + current_pr_full_rows*block_string_size + total_full_block_rows*block_size, small_block_row_width*small_block_row_width); } } #ifdef DEBUG_MODE if (current_pr==first_row_proc_id){ printf("SUCCESS IN DIRECT FLOW!!!\n"); fflush(stdout); } #endif for(j=0; j<total_full_block_rows; j++){ blocks_order_reversed[blocks_order[j]]=j; } #ifdef W_REVERSE_FLOW if(small_block_row_width){ if (current_pr==last_block_row_proc_id){ for (j=0; j<block_string_size; j++){ buf_string[j]=b[(last_block_row_in_current_pr-1)*block_string_size + j]; } } MPI_Bcast(buf_string, block_string_size, MPI_DOUBLE, last_block_row_proc_id, MPI_COMM_WORLD); last_row_c = 
(current_pr==last_block_row_proc_id) ? (last_block_row_in_current_pr-1) : (last_block_row_in_current_pr); for (j=last_row_c-1;j>=0;j--){ for (k=0; k<total_full_block_rows;k++){ simpleMatrixMultiply(a+j*block_string_size+total_full_block_rows*block_size, buf_string + k*block_size, buf_2,block_side,small_block_row_width,block_side); subtractFromMatrix(b+j*block_string_size+k*block_size, buf_2, block_size); } simpleMatrixMultiply(a+j*block_string_size+total_full_block_rows*block_size, buf_string + total_full_block_rows*block_size, buf_2,block_side,small_block_row_width,small_block_row_width); subtractFromMatrix(b+j*block_string_size+total_full_block_rows*block_size, buf_2, small_block_size); } } for (i=total_full_block_rows-1; i>0; i--){//i-ю строчку вычитаем из всех //current_row = (i+total_pr-1-current_pr)/total_pr;//first row in cur_pr not upper than the subtracted string current_row_proc_id = i%total_pr; current_row = i/total_pr; if(current_pr==current_row_proc_id){ for (j=0; j<block_string_size; j++){ buf_string[j]=b[current_row*block_string_size + j]; } } if(current_pr<current_row_proc_id){ current_row++; } MPI_Bcast(buf_string, block_string_size, MPI_DOUBLE, current_row_proc_id, MPI_COMM_WORLD); for(j=current_row-1; j>=0; j--){//из j-й строчки правой матрицы вычитается for (k=0; k<total_full_block_rows;k++){ simpleMatrixMultiply(a+j*block_string_size+i*block_size, buf_string + k*block_size, buf_2, block_side, block_side, block_side); subtractFromMatrix(b+j*block_string_size+k*block_size, buf_2, block_size); } if (small_block_row_width){ simpleMatrixMultiply(a+j*block_string_size+i*block_size, buf_string + total_full_block_rows*block_size, buf_2,block_side,block_side,small_block_row_width); subtractFromMatrix(b+j*block_string_size+total_full_block_rows*block_size, buf_2, small_block_size); } } } #endif #ifdef DEBUG_MODE if (current_pr==0){ printf("SUCCESS IN REVERSE FLOW!!!\n"); fflush(stdout); printf("Exit from gauss\n"); fflush(stdout); } #endif return 0; }