/*
 * Computes out = a * b for two n x n matrices kept in block storage.
 *
 * Layout, as implied by the offsets used below: each side is cut into
 * s = n/m tiles of side m plus, when r = n - s*m is non-zero, one trailing
 * tile of side r. Block row I starts at offset I*m*n; inside it, tile J
 * starts at J * (row count of block row I) * m and occupies rows*cols
 * consecutive doubles.
 *
 * a, b : input matrices in that layout
 * out  : result in the same layout; zeroed via zeroMatrix, then accumulated
 * n    : matrix side, must be >= 0
 * m    : tile side, must be > 0
 *
 * Returns 0 on success, -1 for invalid sizes, or the non-zero code reported
 * by zeroMatrix.
 *
 * NOTE(review): the results of simpleMatrixMultiply / addToMatrix are still
 * ignored, as before; their error convention is not visible here.
 * simpleMatrixMultiply(x, y, z, p, q, t) is assumed to form the p x t
 * product of a p x q and a q x t tile - confirm against its definition.
 */
int blockMatrixMultiply(double* a, double* b, double* out, int n, int m){
  // m == 0 would make n/m divide by zero; negative sizes are meaningless.
  if (m <= 0 || n < 0){
    printf("Bad matrix or block size!\n\t--blockmatrixmultiply\n");
    return -1;
  }

  const int s = n/m;              // number of full tiles per side
  const int r = n - s*m;          // side of the trailing tile (0 if none)
  const int string_size = m*n;    // doubles in one full block row
  const int blocks_per_side = (r > 0) ? s + 1 : s;

  int res = zeroMatrix(out, n*n);
  if (res != 0){
    // Accumulating into a buffer that was not zeroed would yield garbage,
    // so report the failure instead of carrying on and returning success.
    printf("zeroMatrix failed!\n\t--blockmatrixmultiply\n");
    return res;
  }

  // Scratch tile; m*m is the largest product any pair of tiles can produce.
  double* const temp_block = new double[m*m];

  // One loop nest covers full and trailing tiles alike: a tile index below s
  // has side m, the index s (present only when r > 0) has side r. The call
  // order is the same as in the hand-unrolled version this replaces.
  for (int bi = 0; bi < blocks_per_side; bi++){
    const int rows = (bi < s) ? m : r;
    double* const a_row = a + bi*string_size;
    double* const out_row = out + bi*string_size;
    for (int bj = 0; bj < blocks_per_side; bj++){
      const int cols = (bj < s) ? m : r;
      double* const out_block = out_row + bj*rows*m;
      for (int bk = 0; bk < blocks_per_side; bk++){
        const int inner = (bk < s) ? m : r;
        simpleMatrixMultiply(a_row + bk*rows*m, b + bk*string_size + bj*inner*m, temp_block, rows, inner, cols);
        addToMatrix(out_block, temp_block, rows*cols);
      }
    }
  }

  delete[] temp_block;
  return 0;
}
/*
 * out = a * b_{ij}: multiplies the block pointed to by a (passed with
 * dimensions r, r) by block (i, j) of the block-stored matrix b.
 *
 *   a   - pointer straight at the left-hand block
 *   b   - base of the block-stored right-hand matrix
 *   i,j - block row / block column of the right-hand block
 *   out - receives the product
 *   k   - number of full blocks per side (n/m according to the original notes)
 *   m   - block size
 *   n   - matrix side, used for the block-row stride m*n
 *   r   - size of the trailing block (the original note says m%n;
 *         presumably n%m was meant - TODO confirm)
 */
void blocksMultiplyLast(double *a, double *b, int i, int j, double *out, int k, int m, int n, int r){
  /* The right-hand block is m wide in the first k block columns, r wide otherwise. */
  int b_cols = r;
  if (j < k){
    b_cols = m;
  }
  /* Blocks of block row i are addressed with a stride of m*r here. */
  double *b_block = b + i*m*n + j*m*r;
  simpleMatrixMultiply(a, b_block, out, r, r, b_cols);
}
/*
 * out = a_{pq} * b_{ij}: multiplies block (p, q) of the block-stored matrix a
 * by block (i, j) of the block-stored matrix b.
 *
 *   k - number of full blocks per side (n/m according to the original notes)
 *   m - block size
 *   n - matrix side, used for the block-row stride m*n
 *   r - size of the trailing block (the original note says m%n;
 *       presumably n%m was meant - TODO confirm)
 *
 * A block index below k selects a side of m, any other index a side of r.
 */
void blocksMultiply(double *a, int p, int q, double *b, int i, int j, double *out, int k, int m, int n, int r){
  const int a_rows = (p < k) ? m : r;
  const int a_cols = (q < k) ? m : r;
  const int b_cols = (j < k) ? m : r;
  /* b_rows only enters the address computation; the inner dimension handed
     to simpleMatrixMultiply is a_cols. */
  const int b_rows = (i < k) ? m : r;

  double *a_block = a + p*m*n + q*m*a_rows;
  double *b_block = b + i*m*n + j*m*b_rows;

  simpleMatrixMultiply(a_block, b_block, out, a_rows, a_cols, b_cols);
}
/* Example no. 4 */
/* 0 */
/*
 * Command-line driver for a square matrix multiplication.
 * NOTE(review): this snippet is truncated inside the else branch below;
 * only the visible part is documented.
 *
 * Usage as parsed here: argv[1] is the matrix size; "-d" and/or "-t" may
 * follow in argv[2] / argv[3] and set the debug / test flags.
 */
int main(int argc, const char* argv[])
{
  /* Seed rand() from the wall clock. */
  srand((unsigned int)time(NULL));
  
  if (argc >= 2) {
    /* atoi gives no error indication; a non-numeric argument yields 0. */
    int m_size = atoi(argv[1]);
    int debug = 0, retVal = 0, test = 0;
    /* NOTE(review): the local `time` declared here hides the time() function
       inside this block. */
    double startTime = 0, endTime = 0, time;

    //Get needed info for the multiplication
    if (argc == 3) {
      /* Single flag: only the first two characters are compared. */
      if (strncmp(argv[2], "-d", 2) == 0) debug = 1;
      else if (strncmp(argv[2], "-t", 2) == 0) test = 1;
    } 
    else if (argc == 4) {
      /* Two flags, accepted in either order. */
      if (strncmp(argv[2], "-d", 2) == 0 || strncmp(argv[3], "-d", 2) == 0) debug = 1;
      if (strncmp(argv[2], "-t", 2) == 0 || strncmp(argv[3], "-t", 2) == 0) test = 1;
    }

    /* Operands a and b and the result c come from helpers defined elsewhere. */
    double * __attribute__((aligned(16))) a = generateRandomMatrixOfSize(m_size);
    double * __attribute__((aligned(16))) b = generateRandomMatrixOfSize(m_size);
    double * __attribute__((aligned(16))) c = matrixOfZerosWithSize(m_size);

    //Multiply
    if (test) {
      /* Test mode: compute the product with simpleMatrixMultiply (into test_c)
         and with matrixMultiply (into c), then dump both for comparison. */
      double * __attribute__((aligned(16))) test_c = matrixOfZerosWithSize(m_size);      
      simpleMatrixMultiply(a, b, test_c, m_size);
      matrixMultiply(a, b, c, m_size);
      
      /* NOTE(review): the fopen results are not checked before use. */
      FILE *correct = fopen("correct.txt", "w");
      saveMatrixToFile(test_c, m_size, correct); 
      fclose(correct);
      FILE *mine = fopen("mine.txt", "w");
      saveMatrixToFile(c, m_size, mine); 
      fclose(mine);
    } else {
/*
 * gaussInvert: block Gaussian elimination over MPI, applied to the pair (a, b).
 *
 * Every row operation performed on a is also performed on b.
 * NOTE(review): the usual use would be b = identity on entry and b = inverse
 * of a on exit - confirm against the caller; this function never initialises b.
 *
 * Data distribution, as used by the index arithmetic below: global block row g
 * is handled by process g % total_pr and addressed there as local block row
 * g / total_pr. One local block row occupies block_string_size doubles, and
 * block column k starts at offset k*block_size inside it.
 *
 * Parameters
 *   a, b                   local block rows of the two matrices, modified in place
 *   matrix_side, block_side, total_pr, current_pr
 *                          forwarded to initParameters, which fills in all the
 *                          derived sizes; current_pr is this process's rank
 *   blocks_order           entries i and min_k_global are swapped at every step
 *   blocks_order_reversed  filled before returning so that
 *                          blocks_order_reversed[blocks_order[j]] == j
 *   buf_1, buf_2           scratch blocks for simpleInvert / simpleMatrixMultiply
 *   buf_string, buf_string_2
 *                          row buffers; the first 2*block_string_size entries
 *                          of each are zeroed and used
 *
 * Returns 0 on completion, or -1 when the reduced search result of some step
 * has label == 0 ("Main block not found").
 */
int gaussInvert(double *a, double *b, int matrix_side, int block_side, 
  int total_pr, int current_pr, 
  int* blocks_order_reversed, int* blocks_order,
  double* buf_1, double* buf_2, double* buf_string, double* buf_string_2){
  MPI_Status status;

  int first_row, first_row_proc_id, last_row_c;
  int current_row, current_row_proc_id;
  int start_nonzero_a, nonzero_a_size;

	int total_block_rows, total_full_block_rows, block_size, block_string_size;
	int max_block_rows_pp, max_rows_pp, short_block_string_size, last_block_row_proc_id, last_block_row_in_current_pr;
	int small_block_row_width, small_block_size, current_pr_full_rows, last_block_row_width, matrix_size_current_pr;
  int buf_size;
  int i, j, k, j1, min_j, min_k_global;
  int res;
  double temp=-1.;
  mainBlockInfo in, out;

  // All derived sizes are computed by initParameters (defined elsewhere);
  // their meaning is inferred from the names and from how they are used below.
	initParameters(matrix_side, block_side, total_pr, current_pr, 
	&total_block_rows, &total_full_block_rows, 
	&block_size, &block_string_size, 
	&max_block_rows_pp, &max_rows_pp, &short_block_string_size,
	&last_block_row_proc_id, &last_block_row_in_current_pr,
	&small_block_row_width, &small_block_size,
	&current_pr_full_rows, &last_block_row_width,
	&matrix_size_current_pr);

  // Largest message: one block row of b followed by one block row of a.
  buf_size = 2 * block_string_size;

 	in.rank = current_pr;
	in.minnorm = 0.;
  in.label = 0;
  in.min_k = 0;

 	for (i=0; i<buf_size; i++){
  	buf_string[i] = 0.;
		buf_string_2[i] = 0.;
 	}
  // ---- Direct flow: one step per full block column i ----
 	for (i=0; i<total_full_block_rows; i++){
    // Only the blocks of a from column i onwards are shipped, so the message
    // shrinks as i grows.
    start_nonzero_a = block_size*(i);
    nonzero_a_size = block_string_size - start_nonzero_a;
    buf_size = block_string_size + nonzero_a_size;

    // Local index of the first block row on this process whose global index
    // is >= i, i.e. ceil((i - current_pr) / total_pr).
		first_row = (i+total_pr-1-current_pr)/total_pr;
    // Process that holds global block row i.
		first_row_proc_id = i%total_pr;
    min_j = 0;
    min_k_global = 0;

   	in.minnorm = 0.;
   	in.rank = current_pr;
    in.label = 0;
    in.min_k = i;
		temp = 0.;
	
    // Local search for the main block: every block in local rows >= first_row
    // and columns >= i is passed to simpleInvert; among those for which it
    // returns 0, the one with the smallest matrixNorm(buf_1, ...) is kept.
    // buf_1 presumably receives the inverse - confirm in simpleInvert.
    // in.label records whether any candidate was found on this process.
	  for (j=first_row; j<current_pr_full_rows; j++){
   		for (k=i; k<total_full_block_rows; k++){
   	    res = simpleInvert(a + j*block_string_size + k*block_size, buf_1, buf_2, block_side);
        if (!res) {
   			  temp = matrixNorm(buf_1, block_side);
          if (in.label){
            if (temp<in.minnorm){
     		      in.minnorm = temp;
              min_j=j;
              in.min_k = k;
            }
          }
          else{
            in.label = 1;
            in.minnorm = temp;
            min_j=j;
            in.min_k = k;
          }
        }
      }
    }

#ifdef WO_PIVOT_SEARCH_ATALL
    // Build variant without pivoting: the owner of row i overrides the search
    // result and nominates block (i, i).
		if (current_pr==first_row_proc_id){
			min_j=first_row;
			in.minnorm = 1.;
			in.min_k=i;
			in.label=1;
		}
#endif
    //rewrite
    // Combine the per-process candidates; datatype and reduction operator are
    // user-defined elsewhere (MPI_mainBlockInfo / MPI_searchMainBlock).
    MPI_Allreduce(&in, &out, 1, MPI_mainBlockInfo, MPI_searchMainBlock, MPI_COMM_WORLD);

    // No candidate anywhere: every process returns -1; rank 0 prints the message.
		if (out.label==0){
			if (current_pr==0){
			  printf("Main block not found!\n\t -- Step %d\n", i);
			}
			fflush(stdout);
			return -1;
		}

#ifdef DEBUG_MODE
		if (current_pr == first_row_proc_id){
      printf("**\nOUT RANK %d\n", out.rank);
      printf("IN RANK %d\n**\n", in.rank);
     	fflush(stdout);
    }
#endif
   	min_k_global = out.min_k;
#ifdef W_FULL_PIVOT_SEARCH
    // Bring the chosen block column to position i in every local block row.
  	for (j=0; j<max_block_rows_pp; j++){
			swapMatrix(a + j*block_string_size + i*block_size, a + j*block_string_size + min_k_global*block_size, block_size);
 		}
#endif
    // Record the column exchange.
    // NOTE(review): the int entries are swapped through the double `temp`.
    temp = blocks_order[i];
    blocks_order[i]=blocks_order[min_k_global];
    blocks_order[min_k_global]=temp;

		//for debug purposes:
#ifdef WO_PIVOT_SEARCH_ATALL
		out.rank = first_row_proc_id;
		//don't forget about it
#endif
      /***************Multiply string by the main block*****************/
    // On the process that owns the main block: invert the block in column i
    // of local row min_j and multiply the rest of that row (a to the right of
    // column i, and all of b) by buf_1 from the left.
    // NOTE(review): column i is used here, not in.min_k. That matches the
    // search above only if the column swap under W_FULL_PIVOT_SEARCH ran or
    // min_k == i - confirm the intended build flags.
    if (current_pr==out.rank){
      simpleInvert(a + min_j*block_string_size + i*block_size, buf_1, buf_2, block_side);
      for (j=i+1; j<total_full_block_rows; j++){
        simpleMatrixMultiply(buf_1, a + min_j*block_string_size + j*block_size, buf_2, block_side, block_side, block_side);
        copyMatrix(buf_2, a + min_j*block_string_size + j*block_size, block_size);
      }
      for (j=0; j<total_full_block_rows; j++){
        simpleMatrixMultiply(buf_1, b + min_j*block_string_size + j*block_size, buf_2, block_side, block_side, block_side);
        copyMatrix(buf_2, b + min_j*block_string_size + j*block_size, block_size);
      }
      // Trailing narrow block column (width small_block_row_width), if any.
      if(small_block_row_width){
        simpleMatrixMultiply(buf_1, b + min_j*block_string_size + total_full_block_rows*block_size, buf_2, block_side, block_side, small_block_row_width);
        copyMatrix(buf_2, b + min_j*block_string_size + total_full_block_rows*block_size, small_block_size);

        simpleMatrixMultiply(buf_1, a + min_j*block_string_size + total_full_block_rows*block_size, buf_2, block_side, block_side, small_block_row_width);
        copyMatrix(buf_2, a + min_j*block_string_size + total_full_block_rows*block_size, small_block_size);
      }
      // Pack the message: [0, block_string_size) holds the whole row of b ...
			for (j=0; j<block_string_size; j++){
		    //buf_string[j]=a[min_j*block_string_size+j];
        buf_string[j]=b[min_j*block_string_size+j];
			}
      // ... and [block_string_size, buf_size) holds the row of a from column i on.
			//for (j=0; j<block_string_size; j++){
      for (j=start_nonzero_a, j1=block_string_size; j<block_string_size; j++, j1++){
				//buf_string[j+block_string_size]=b[min_j*block_string_size+j];
        buf_string[j1]=a[min_j*block_string_size+j];
			}
    }

    // Every process receives the scaled main row from its owner.
		//MPI_Bcast(buf_string, buf_size, MPI_DOUBLE, out.rank, MPI_COMM_WORLD);
    MPI_Bcast(buf_string, buf_size, MPI_DOUBLE, out.rank, MPI_COMM_WORLD);

    // Move the main row into global row i.
    // Case 1: the main row and row i live on different processes. The owner
    // of row i saves its old row into buf_string_2, overwrites it with the
    // broadcast row and sends the old row to out.rank, which stores it at min_j.
		if (out.rank!=first_row_proc_id){
			if (current_pr==first_row_proc_id){
				for (j=0; j<block_string_size; j++){
					//buf_string_2[j]=a[first_row*block_string_size+j];
          buf_string_2[j]=b[first_row*block_string_size+j];
          //a[first_row*block_string_size+j]=buf_string[j];
          b[first_row*block_string_size+j]=buf_string[j];
				}
				//for (j=0; j<block_string_size; j++){
        for (j=start_nonzero_a, j1=block_string_size; j<block_string_size; j++, j1++){
					//buf_string_2[j+block_string_size]=b[first_row*block_string_size+j];
          buf_string_2[j1]=a[first_row*block_string_size+j];
          //b[first_row*block_string_size+j]=buf_string[j+block_string_size];
          a[first_row*block_string_size+j]=buf_string[j1];
				}
				//MPI_Send(buf_string_2, buf_size, MPI_DOUBLE, out.rank, 42, MPI_COMM_WORLD);
        MPI_Send(buf_string_2, buf_size, MPI_DOUBLE, out.rank, 42, MPI_COMM_WORLD);
			}
			if(current_pr==out.rank){
				//MPI_Recv(buf_string_2, buf_size, MPI_DOUBLE, first_row_proc_id, 42, MPI_COMM_WORLD, &status);
        MPI_Recv(buf_string_2, buf_size, MPI_DOUBLE, first_row_proc_id, 42, MPI_COMM_WORLD, &status);
				for (j=0; j<block_string_size; j++){
					//a[min_j*block_string_size+j]=buf_string_2[j];
          b[min_j*block_string_size+j]=buf_string_2[j];
				}
				//for (j=0; j<block_string_size; j++){
        for (j=start_nonzero_a, j1=block_string_size; j<block_string_size; j++, j1++){
					//b[min_j*block_string_size+j]=buf_string_2[j+block_string_size];
          a[min_j*block_string_size+j]=buf_string_2[j1];
				}
			}
		}
    // Case 2: both rows are on the same process - swap them locally
    // (element by element, in a and in b) unless they are the same row.
		else{
			if(current_pr==out.rank){
				if (min_j!=first_row){
					for(j=0; j<block_string_size; j++){
						temp=a[min_j*block_string_size+j];
						a[min_j*block_string_size+j]=a[first_row*block_string_size+j];
						a[first_row*block_string_size+j]=temp;
					}
					for(j=0; j<block_string_size; j++){
						temp=b[min_j*block_string_size+j];
						b[min_j*block_string_size+j]=b[first_row*block_string_size+j];
						b[first_row*block_string_size+j]=temp;
					}
    		}
			}
		}

#ifdef DEBUG_MODE
		if (current_pr == out.rank){
			printf("SUCCESS SEARCHING MAIN BLOCK STEP %d\n", i);
			fflush(stdout);
		}
#endif

    // On the owner of row i, skip the main row itself during elimination.
    if (current_pr == first_row_proc_id){
    	first_row++;
  	}

    // Elimination below the main row. For every remaining local row j:
    //   a[j][k] -= a[j][i] * main_a[k]   for k > i
    //   b[j][k] -= a[j][i] * main_b[k]   for all k
    // where main_b sits at the front of buf_string and main_a (starting at
    // column i) sits after offset block_string_size.
    // NOTE(review): this loop runs to max_block_rows_pp while the search above
    // stops at current_pr_full_rows - confirm that the difference is intended.
  	for (j=first_row; j<max_block_rows_pp; j++){
      //for (k = i+1; k<total_full_block_rows; k++){
      for (k = i+1, j1=1; k<total_full_block_rows; k++, j1++){
		    //simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + k*block_size, buf_2, block_side, block_side, block_side);
        simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + block_string_size + j1*block_size, buf_2, block_side, block_side, block_side);
			  subtractFromMatrix(a + j*block_string_size + k*block_size, buf_2, block_size);
        //subtractFromMatrix(a + j*block_string_size + start_nonzero_a + j1*block_size, buf_2, block_size);
    	}
      for (k = 0; k<total_full_block_rows; k++){
				//simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + block_string_size + k*block_size, buf_2, block_side, block_side, block_side);
        simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + k*block_size, buf_2, block_side, block_side, block_side);
				subtractFromMatrix(b + j*block_string_size + k*block_size, buf_2, block_size);
      }
      // Trailing narrow block column: the matching piece of main_a is the
      // last small_block_size entries of the message.
      if(small_block_row_width){
        //simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + total_full_block_rows*block_size, buf_2, block_side, block_side, small_block_row_width);
        simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + buf_size - small_block_size, buf_2, block_side, block_side, small_block_row_width);
        subtractFromMatrix(a + j*block_string_size + total_full_block_rows*block_size, buf_2, small_block_size);

        //simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + total_full_block_rows*block_size + block_string_size, buf_2, block_side, block_side, small_block_row_width);
        simpleMatrixMultiply(a + j*block_string_size + i*block_size, buf_string + total_full_block_rows*block_size, buf_2, block_side, block_side, small_block_row_width);
        subtractFromMatrix(b + j*block_string_size + total_full_block_rows*block_size, buf_2, small_block_size);
    	}
   	}
	}

  // Last, narrower block row (only when the matrix side is not a multiple of
  // the block side): its owner inverts the trailing diagonal block and
  // multiplies the corresponding row of b by that inverse.
  // NOTE(review): the simpleInvert result is not checked here.
  if(small_block_row_width){
    if (current_pr==last_block_row_proc_id){
      simpleInvert(a + current_pr_full_rows*block_string_size + total_full_block_rows*block_size, buf_1, buf_2, small_block_row_width);
      for (k=0; k<total_full_block_rows; k++){
        simpleMatrixMultiply(buf_1, b + current_pr_full_rows*block_string_size + k*block_size, buf_2, 
          small_block_row_width, small_block_row_width, block_side);
        copyMatrix(buf_2, b + current_pr_full_rows*block_string_size + k*block_size, small_block_size);
    	}
      simpleMatrixMultiply(buf_1, b + current_pr_full_rows*block_string_size + total_full_block_rows*block_size, buf_2, 
      	small_block_row_width, small_block_row_width, small_block_row_width);
      copyMatrix(buf_2, b + current_pr_full_rows*block_string_size + total_full_block_rows*block_size, small_block_row_width*small_block_row_width);
    }
  }
#ifdef DEBUG_MODE
  // NOTE(review): first_row_proc_id keeps the value from the last loop
  // iteration; it is never assigned if the loop above did not run.
  if (current_pr==first_row_proc_id){
  	printf("SUCCESS IN DIRECT FLOW!!!\n");
  	fflush(stdout);
	}
#endif

  // Build the inverse of the permutation recorded in blocks_order.
  for(j=0; j<total_full_block_rows; j++){
    blocks_order_reversed[blocks_order[j]]=j;
  }

#ifdef W_REVERSE_FLOW
  // ---- Reverse flow: update b in the rows above, last row first ----
  // First the narrow last block row: its owner broadcasts that row of b, and
  // every process subtracts a[j][last] * row from its other local rows of b.
	if(small_block_row_width){
    if (current_pr==last_block_row_proc_id){
      for (j=0; j<block_string_size; j++){
        buf_string[j]=b[(last_block_row_in_current_pr-1)*block_string_size + j];
      }
    }
    MPI_Bcast(buf_string, block_string_size, MPI_DOUBLE, last_block_row_proc_id, MPI_COMM_WORLD);
    // On the owner, the last local row is the broadcast row itself and is skipped.
    last_row_c = (current_pr==last_block_row_proc_id) ? (last_block_row_in_current_pr-1) : (last_block_row_in_current_pr);
    for (j=last_row_c-1;j>=0;j--){
      for (k=0; k<total_full_block_rows;k++){
        simpleMatrixMultiply(a+j*block_string_size+total_full_block_rows*block_size, buf_string + k*block_size, buf_2,block_side,small_block_row_width,block_side);
        subtractFromMatrix(b+j*block_string_size+k*block_size, buf_2, block_size);
      } 
      simpleMatrixMultiply(a+j*block_string_size+total_full_block_rows*block_size, buf_string + total_full_block_rows*block_size, buf_2,block_side,small_block_row_width,small_block_row_width);
      subtractFromMatrix(b+j*block_string_size+total_full_block_rows*block_size, buf_2, small_block_size);
    }
  }	
  for (i=total_full_block_rows-1; i>0; i--){// subtract the i-th row from all the others
    //current_row = (i+total_pr-1-current_pr)/total_pr;//first row in cur_pr not upper than the subtracted string
    // Owner and local index of global block row i.
    current_row_proc_id = i%total_pr;    
    current_row = i/total_pr;
    if(current_pr==current_row_proc_id){
    	for (j=0; j<block_string_size; j++){
       	buf_string[j]=b[current_row*block_string_size + j];
  		}
  	}
        
    // Processes with a lower rank than the owner hold one more local row
    // whose global index is below i.
    if(current_pr<current_row_proc_id){
      current_row++;
    }
        
    MPI_Bcast(buf_string, block_string_size, MPI_DOUBLE, current_row_proc_id, MPI_COMM_WORLD);

    for(j=current_row-1; j>=0; j--){// the product is subtracted from the j-th row of the right-hand matrix
      for (k=0; k<total_full_block_rows;k++){
        simpleMatrixMultiply(a+j*block_string_size+i*block_size, buf_string + k*block_size, buf_2, block_side, block_side, block_side);
      	subtractFromMatrix(b+j*block_string_size+k*block_size, buf_2, block_size);
      }
      if (small_block_row_width){
        simpleMatrixMultiply(a+j*block_string_size+i*block_size, buf_string + total_full_block_rows*block_size, buf_2,block_side,block_side,small_block_row_width);
        subtractFromMatrix(b+j*block_string_size+total_full_block_rows*block_size, buf_2, small_block_size);
    	}
    }
  }
#endif
#ifdef DEBUG_MODE
  if (current_pr==0){
    printf("SUCCESS IN REVERSE FLOW!!!\n");
    fflush(stdout);
    printf("Exit from gauss\n");
    fflush(stdout);
  }
#endif
	return 0;
}