int create_diag( PLA_Obj diag )
{
  int 
    i, size;
  double
    d_i;
  PLA_Obj 
    diag_1 = NULL, diag_B = NULL;

  PLA_Obj_view_all( diag, &diag_B );
  i = 1;

  while ( TRUE ){
    PLA_Obj_global_length( diag_B, &size );
    if ( size == 0 ) break;

    PLA_Obj_horz_split_2( diag_B, 1, &diag_1,
			              &diag_B );
    d_i = ( double ) i;

    PLA_Obj_set( diag_1, MPI_DOUBLE, &d_i );

    i++;
  }

  PLA_Obj_free( &diag_1 );
  PLA_Obj_free( &diag_B );

  return PLA_SUCCESS;
}
int PLA_Symmetrize( int uplo, PLA_Obj A )
{
  int 
    size;
  PLA_Obj
    A_BR = NULL, A_12 = NULL, A_21 = NULL;

  if ( uplo != PLA_LOWER_TRIANGULAR )
    PLA_Abort( "only uplo == PLA_LOWER_TRIANGULAR implemented",
		__LINE__, __FILE__ );

  PLA_Obj_view_all( A, &A_BR );
  while ( TRUE ){
    PLA_Obj_global_length( A_BR, &size );
    if ( size == 0 ) break;
    
    PLA_Obj_split_4( A_BR, 1, 1, PLA_DUMMY, &A_12,
                                  &A_21,      &A_BR );

    PLA_Obj_set_orientation( A_12, PLA_PROJ_ONTO_ROW );
    
    PLA_Copy( A_21, A_12 );
  }

  PLA_Obj_free( &A_BR );
  PLA_Obj_free( &A_12 );
  PLA_Obj_free( &A_21 );
  
  return PLA_SUCCESS;
}
int PLA_Obj_set_to_identity( PLA_Obj A )
{
  int length, width;
  PLA_Obj A11 = NULL,     ABR = NULL;

  PLA_Obj_global_length( A, &length );
  PLA_Obj_global_width( A, &width );

  if ( length != width ){
    printf("PLA_Obj_set_to_identity() error:  object not square\n");
    exit ( 0 );
  }

  PLA_Obj_set_to_zero( A );
  PLA_Obj_view_all( A, &ABR );
  
  while ( TRUE ){
    PLA_Obj_global_length( ABR, &length );
    if ( 0 == length ) break;
    
    PLA_Obj_split_4( ABR, 1, 1, &A11,      PLA_DUMMY,
                                 PLA_DUMMY, &ABR );

    PLA_Obj_set_to_one( A11 );
  }

  PLA_Obj_free( &A11 );
  PLA_Obj_free( &ABR );

  return PLA_SUCCESS;
}
int PLA_Matrix_infinity_norm_exit( PLA_Obj A, PLA_Obj alpha )
{
  int value = PLA_SUCCESS,
      size_malloced;
  double 
    PLA_Local_abs_max(), PLA_Local_abs_diff(), diff;
  char 
    routine_name[ 35 ];

  PLA_Routine_stack_push( "PLA_Matrix_inf_norm_exit" );

  if ( PLA_CHECK_AGAINST_SEQUENTIAL ){
    PLA_Obj
      A_tmp = NULL, alpha_tmp = NULL, alpha_cpy = NULL;

    PLA_Mscalar_create_conf_to( A, PLA_ALL_ROWS, PLA_ALL_COLS, 
				&A_tmp );
    PLA_Copy( A, A_tmp );

    PLA_Mscalar_create_conf_to( alpha, PLA_ALL_ROWS, PLA_ALL_COLS, 
				 &alpha_tmp );

    PLA_Copy( alpha, alpha_tmp );

    PLA_Mscalar_create_conf_to( alpha, PLA_ALL_ROWS, PLA_ALL_COLS, 
				 &alpha_cpy );

    PLA_Local_matrix_infinity_norm( A_tmp, alpha_cpy );

    diff = PLA_Local_abs_diff( alpha_tmp, alpha_cpy );

    if ( diff > 0.000001 ){
      PLA_Warning( "PLA_Matrix_infinity_norm: large absolute error encountered" );
      value--;
    }      

    PLA_Obj_free( &alpha_cpy );
    PLA_Obj_free( &alpha_tmp );
    PLA_Obj_free( &A_tmp );
  }

  size_malloced = PLA_Total_size_malloced( );

  if ( size_malloced != old_size_malloced )
    PLA_Warning( "PLA_Matrix_infinity_norm: memory discrepency" );
  
  PLA_Routine_stack_pop( routine_name );

  PLA_Routine_stack_pop( routine_name );

  return value;
}
PLA_Copy_sym_tridiag_to_msc( int uplo, PLA_Obj A, PLA_Obj tridiag )
{
    int
    size;
    PLA_Obj
    A_BR = NULL, A_11 = NULL, A_21 = NULL, A_21_1 = NULL,
    temp = NULL,
    diag_1 = NULL, diag_B = NULL,
    subdiag_1 = NULL, subdiag_B = NULL;

    if ( uplo != PLA_LOWER_TRIANGULAR )
        PLA_Abort( "uplo != PLA_LOWER_TRIANGULAR not yet implemented",
                   __LINE__, __FILE__ );

    PLA_Mscalar_create_conf_to( tridiag, PLA_ALL_ROWS, PLA_ALL_COLS,
                                &temp );
    PLA_Obj_set_to_zero( temp );
    PLA_Obj_view_all( A, &A_BR );
    PLA_Obj_vert_split_2( temp, 1, &diag_B, &subdiag_B );

    while ( TRUE ) {
        PLA_Obj_global_length( A_BR, &size );
        if ( size == 0 ) break;

        PLA_Obj_split_4( A_BR, 1, 1, &A_11, PLA_DUMMY,
                         &A_21, &A_BR );

        PLA_Obj_horz_split_2( diag_B, 1, &diag_1,
                              &diag_B );

        PLA_Obj_horz_split_2( subdiag_B, 1, &subdiag_1,
                              &subdiag_B );

        PLA_Local_copy( A_11, diag_1 );
        if ( size > 1 ) {
            PLA_Obj_horz_split_2( A_21, 1,   &A_21_1,
                                  PLA_DUMMY );
            PLA_Local_copy( A_21_1, subdiag_1 );
        }
    }

    PLA_Reduce( temp, MPI_SUM, tridiag );

    PLA_Obj_free( &A_BR );
    PLA_Obj_free( &A_11 );
    PLA_Obj_free( &A_21 );
    PLA_Obj_free( &A_21_1 );
    PLA_Obj_free( &temp );
    PLA_Obj_free( &diag_1 );
    PLA_Obj_free( &diag_B );
    PLA_Obj_free( &subdiag_1 );
    PLA_Obj_free( &subdiag_B );

    return PLA_SUCCESS;
}
int PLA_Asum( PLA_Obj x, PLA_Obj alpha )
{
  int 
    value = PLA_SUCCESS;

  PLA_Obj
    alpha_local = NULL;

  if ( PLA_ERROR_CHECKING )    
    value = PLA_Asum_enter( x, alpha );

  PLA_Mscalar_create_conf_to 
    ( alpha, PLA_ALL_ROWS, PLA_ALL_COLS, &alpha_local );

  PLA_Local_asum( x, alpha_local );

  PLA_Reduce( alpha_local, MPI_SUM, alpha );

  PLA_Obj_free( &alpha_local );

  if ( PLA_ERROR_CHECKING )   
    value = PLA_Asum_exit( x, alpha );

  return value;
}
int PLA_Obj_set_diagonal( PLA_Obj A, PLA_Obj x )
/*
   PLA_Obj_set_diag

   Purpose:  Create diagonal matrix A with diagonal values equal
   to the values in vector x.

   Input:   x --   PLA_MVECTOR of width 1
                   specifies the values to be placed on the diagonal of A

   OUTPUT:  A --   PLA_MATRIX
*/
{
  int 
    size;
  PLA_Obj 
    A_11 = NULL,  A_BR = NULL,
    x_1  = NULL,  x_B  = NULL;

  PLA_Obj_view_all( A, &A_BR );
  PLA_Obj_view_all( x, &x_B );

  while ( TRUE ){
    PLA_Obj_global_length( x_B, &size );
    if ( size == 0 ) break;
    
    PLA_Obj_split_4( A_BR, 1, 1,   &A_11,      PLA_DUMMY,
		                    PLA_DUMMY, &A_BR );

    PLA_Obj_horz_split_2( x_B, 1,  &x_1,
			            &x_B );

    PLA_Local_copy( x_1, A_11 );
  }

  PLA_Obj_free( &A_11 );
  PLA_Obj_free( &A_BR );
  PLA_Obj_free( &x_1 );
  PLA_Obj_free( &x_B );

  return PLA_SUCCESS;
}
int PLA_Ger(  PLA_Obj alpha, PLA_Obj x, PLA_Obj y, PLA_Obj A )
{
  int 
    value = PLA_SUCCESS,
    owner_row, owner_col;
  PLA_Obj
    alpha_cpy = NULL, 
    x_dup = NULL, y_dup = NULL;
  
  if ( PLA_ERROR_CHECKING )    /* Perform parameter and error checking */
    value = PLA_Ger_enter( alpha, x, y, A );

    /* If necessary, duplicate alpha and beta to all nodes */
    PLA_Obj_owner_row( alpha, &owner_row );
    PLA_Obj_owner_col( alpha, &owner_col );
    if ( owner_row != PLA_ALL_ROWS || owner_col != PLA_ALL_COLS ){
      PLA_Mscalar_create_conf_to( alpha, PLA_ALL_ROWS, PLA_ALL_COLS,
				   &alpha_cpy );
      PLA_Copy( alpha, alpha_cpy );
    }
    
    PLA_Pmvector_create_conf_to( A, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS, 1,
				  &y_dup );
    PLA_Pmvector_create_conf_to( A, PLA_PROJ_ONTO_COL, PLA_ALL_COLS, 1,
				  &x_dup );
      
    PLA_Copy( x, x_dup );
    PLA_Copy( y, y_dup );

    PLA_Local_ger( ( alpha_cpy == NULL ? alpha: alpha_cpy ), 
		    x_dup, y_dup, A );
	
    PLA_Obj_free( &alpha_cpy );
    PLA_Obj_free( &x_dup );
    PLA_Obj_free( &y_dup );
    
  if ( PLA_ERROR_CHECKING )    /* Perform exit error checking */
    value = PLA_Ger_exit( alpha, x, y, A );

  return value;
}
int PLA_Herk_perform_local_part( int uplo, 
                                  PLA_Obj alpha, PLA_Obj Xdpmv, 
                                                  PLA_Obj Xdpmv_conj_trans,
                                  PLA_Obj beta,  PLA_Obj A )
{
  int 
    length, local_size, length_1,
    count, me;

  PLA_Obj 
    A11           = NULL,     
    A21           = NULL,       A22           = NULL,
    Xdpmv_1       = NULL,       Xdpmv_2       = NULL,
    Xdpmv_conj_trans_1 = NULL,       Xdpmv_conj_trans_2 = NULL;


  PLA_Obj_local_width( A, &local_size );
  if ( 0 == local_size ) return;
  PLA_Obj_local_length( A, &local_size );
  if ( 0 == local_size ) return;

  PLA_Obj_global_length( A, &length );

  if ( length >= 256 ) {
    length_1 = (length/2);
    length_1 = length_1 - length_1 % 64 + 64; 
    if ( length_1 >= length ) length = length/2;
    PLA_Obj_split_4 ( A, length_1, length_1,      &A11, PLA_DUMMY,
                                                  &A21, &A22 );
    PLA_Obj_horz_split_2( Xdpmv, length_1,        &Xdpmv_1,
                                                  &Xdpmv_2 );
    PLA_Obj_vert_split_2( Xdpmv_conj_trans, length_1,  
                                  &Xdpmv_conj_trans_1, &Xdpmv_conj_trans_2 );

    PLA_Local_gemm( PLA_NO_TRANS, PLA_NO_TRANS, 
		    alpha, Xdpmv_2, Xdpmv_conj_trans_1, beta, A21 );

    PLA_Herk_perform_local_part( PLA_LOWER_TRIANGULAR, 
                                 alpha, Xdpmv_1, Xdpmv_conj_trans_1,
                                 beta, A11 );
    PLA_Herk_perform_local_part( PLA_LOWER_TRIANGULAR, 
                                 alpha, Xdpmv_2, Xdpmv_conj_trans_2,
                                 beta, A22 );
  } 
  else {    
    PLA_Herk_perform_local_part_by_panels( 
                                 uplo, alpha, Xdpmv, Xdpmv_conj_trans,
                                       beta,  A );
  }
  PLA_Obj_free( &A11 );
  PLA_Obj_free( &A21 );                PLA_Obj_free( &A22 );
  PLA_Obj_free( &Xdpmv_1 );            PLA_Obj_free( &Xdpmv_2 );
  PLA_Obj_free( &Xdpmv_conj_trans_1 );      PLA_Obj_free( &Xdpmv_conj_trans_2 );

  return( PLA_SUCCESS );
}
int PLA_Local_add(PLA_Obj x, PLA_Obj y)
{
  PLA_Obj
    one = NULL;

  PLA_Create_constants_conf_to( x, NULL, NULL, &one );

  PLA_Local_axpy( one, x, y );

  PLA_Obj_free( &one );

  return PLA_SUCCESS;
}
int PLA_Local_sym_tridiag_eig( PLA_Obj tridiag, 
				PLA_Obj Z )
{
  int 
    info, length, i_one = 1, local_length_Z, ldz;
  void 
    *buffer_diag, *buffer_subdiag, *buffer_Z, *work;
  MPI_Datatype
    datatype;
  PLA_Obj
    diag = NULL, subdiag = NULL;

  PLA_Obj_vert_split_2( tridiag, 1, &diag, &subdiag );

  PLA_Obj_datatype( diag, &datatype );
  PLA_Obj_global_length( diag, &length );
  PLA_Obj_local_buffer( diag,    &buffer_diag );
  PLA_Obj_local_buffer( subdiag, &buffer_subdiag );

  if ( Z == NULL )
    PLA_dsteqr( "N", &length, 
		   (double *) buffer_diag, (double *) buffer_subdiag,
		   NULL, &i_one, work, &info );
  else{
    PLA_Obj_local_length( Z, &local_length_Z );
    PLA_Obj_local_buffer( Z, &buffer_Z );
    PLA_Obj_local_ldim( Z, &ldz );
    work = ( double * ) PLA_malloc( 2 * length * sizeof( double ) );
    PLA_dsteqr_x( "V", &local_length_Z, &length, 
		   (double *) buffer_diag, (double *) buffer_subdiag,
		   (double *) buffer_Z, &ldz, work, &info );
    PLA_free( work );
  }

  PLA_Obj_free( &diag );
  PLA_Obj_free( &subdiag );

  return PLA_SUCCESS;
}
int PLA_Obj_set_to_zero_below_first_subdiagonal( PLA_Obj A )
{
  PLA_Obj
    A21 = NULL;

  PLA_Obj_split_4( A, 1, -1, PLA_DUMMY, PLA_DUMMY,
		              &A21,       PLA_DUMMY );

  PLA_Obj_set_to_zero_below_diagonal( A21 );

  PLA_Obj_free( &A21 );
  
  return PLA_SUCCESS;
}
int PLA_Obj_set_to_zero_below_diagonal( PLA_Obj A )
{
  PLA_Obj
    A_BR = NULL, a_21 = NULL;
  int
    size;

  PLA_Obj_view_all( A, &A_BR );

  while( TRUE ){
    PLA_Obj_global_length( A_BR, &size );
    if ( size == 0 ) break;
    
    PLA_Obj_split_4( A_BR, 1, 1, PLA_DUMMY, PLA_DUMMY,
		                  &a_21,      &A_BR );
    
    PLA_Obj_set_to_zero( a_21 );
  }

  PLA_Obj_free( &A_BR );
  PLA_Obj_free( &a_21 );
  
  return PLA_SUCCESS;
}
Exemple #14
0
static int pmatdestroy(void*MM){
  plapackM* ctx=(plapackM*)MM;
  int info;
  DSDPFunctionBegin;
  PPDSDPPrint("PDSDP: Compute M: %5.5e \n",ctx->thessian);
  PPDSDPPrint("PDSDP: Solve M %4.5e seconds\n ",ctx->tsolve);
  info=PLA_Obj_free(&ctx->AMat); DSDPCHKERR(info);
  info=PLA_Obj_free(&ctx->vVec); DSDPCHKERR(info);
  info=PLA_Obj_free(&ctx->wVec); DSDPCHKERR(info);
  info=PLA_Obj_free(&ctx->one); DSDPCHKERR(info);
  info=PLA_Obj_free(&ctx->zero); DSDPCHKERR(info);
  info=PLA_Obj_free(&ctx->dxerror); DSDPCHKERR(info);
  info=PLA_Temp_free(&ctx->templ); DSDPCHKERR(info);
  info=PLA_Finalize(); DSDPCHKERR(info);
  DSDPFREE(&ctx,&info);DSDPCHKERR(info);
  DSDPFunctionReturn(0);
}
void PLA_Trmm_right_upper_trans( int diag, PLA_Obj alpha, PLA_Obj A, PLA_Obj B, int nb_alg )
{
  PLA_Obj     ATL=NULL, ATR=NULL,     A00=NULL, A01=NULL, A02=NULL,    BL=NULL,           B0=NULL,
              ABL=NULL, ABR=NULL,     A10=NULL, A11=NULL, A12=NULL,    BR=NULL,           B1=NULL,
                                      A20=NULL, A21=NULL, A22=NULL,                  B2=NULL,
              A11_dup=NULL, B1_mv=NULL, A01_dpmv=NULL, B1_dpmv=NULL,
              ONE=NULL;

  int         b, size_row;

  PLA_Create_constants_conf_to( A, NULL, NULL, &ONE );

  PLA_Part_2x2( A,  &ATL, /**/ &ATR,
                  /* ************** */
                    &ABL, /**/ &ABR,   0, 0,     /* submatrix */ PLA_TL );

  PLA_Part_1x2( B,  &BL,  /**/ &BR,   
                0, /* width  submatrix */ PLA_LEFT );

  while ( TRUE ){
    PLA_Obj_global_length( ABR, &size_row );
    b = min( size_row, nb_alg );
    if ( 0 == b ) break;

    PLA_Repart_2x2_to_3x3( ATL, /**/ ATR,         &A00, /**/ &A01, &A02,
                        /* ************* */    /* ********************* */
                                /**/              &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,         &A20, /**/ &A21, &A22,   
                           b, b, /* A11 from */ PLA_BR );

    PLA_Repart_1x2_to_1x3( BL,  /**/ BR,          &B0,  /**/ &B1,  &B2,    
                           b, /* width  B1 from */ PLA_RIGHT );

    /* ********************************************************************* */

    PLA_Pmvector_create_conf_to( B0, PLA_PROJ_ONTO_COL, PLA_ALL_COLS, b, &B1_dpmv );

    PLA_Pmvector_create_conf_to( B0, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS, b, &A01_dpmv );

    PLA_Copy( B1, B1_dpmv );

    PLA_Copy( A01, A01_dpmv );

    PLA_Local_gemm( PLA_NO_TRANSPOSE, PLA_NO_TRANSPOSE, alpha, B1_dpmv, A01_dpmv, ONE, B0 );

    PLA_Mscalar_create_conf_to( A11, PLA_ALL_ROWS, PLA_ALL_COLS, &A11_dup );

    PLA_Copy( A11, A11_dup );

    PLA_Local_trmm( PLA_RIGHT, PLA_UPPER_TRIANGULAR, PLA_TRANSPOSE, diag, alpha, A11_dup, B1_dpmv );
    
    PLA_Copy( B1_dpmv, B1 );

    /* ********************************************************************* */

    PLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,         A00, A01, /**/ A02,
                                    /**/               A10, A11, /**/ A12,
                            /* ************** */   /* ****************** */
                              &ABL, /**/ &ABR,         A20, A21, /**/ A22, 
                              /* A11 added to */ PLA_TL );

    PLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR,           B0,  B1,  /**/ B2,  
                              /* B1  added to */ PLA_LEFT );
  }

  PLA_Obj_free( &ATL );
  PLA_Obj_free( &ATR );
  PLA_Obj_free( &A00 );
  PLA_Obj_free( &A01 );
  PLA_Obj_free( &A01_dpmv );
  PLA_Obj_free( &A02 );
  PLA_Obj_free( &BL );
  PLA_Obj_free( &B0 );
  PLA_Obj_free( &ABL );
  PLA_Obj_free( &ABR );
  PLA_Obj_free( &A10 );
  PLA_Obj_free( &A11 );
  PLA_Obj_free( &A11_dup );
  PLA_Obj_free( &A12 );
  PLA_Obj_free( &BR );
  PLA_Obj_free( &B1 );
  PLA_Obj_free( &B1_mv );
  PLA_Obj_free( &B1_dpmv );
  PLA_Obj_free( &A20 );
  PLA_Obj_free( &A21 );
  PLA_Obj_free( &A22 );
  PLA_Obj_free( &B2 );
  PLA_Obj_free( &ONE );
}
int PLA_Tri_red_exit( int uplo, PLA_Obj A, PLA_Obj s, PLA_Obj Q )

{
  int value = PLA_SUCCESS,
      size_malloced;
  double 
    max_A, diff,
    PLA_Local_abs_max(), PLA_Local_abs_diff();
  char 
    routine_name[ 35 ];


  PLA_Routine_stack_push( "PLA_Tri_red_exit" );

  if ( PLA_CHECK_AGAINST_SEQUENTIAL ){
    PLA_Obj 
      A_temp = NULL, Q_temp = NULL, B = NULL, C = NULL,
      minus_one = NULL, zero = NULL, one = NULL;
    MPI_Datatype
      datatype;
    double tol;

    PLA_Obj_datatype( A, &datatype );
    if ( datatype == MPI_DOUBLE || datatype == MPI_DOUBLE_COMPLEX )
      tol = 0.000001;
    else
      tol = 0.0001;

    PLA_Create_constants_conf_to( A, &minus_one, &zero, &one );

    PLA_Obj_set_to_zero_below_first_subdiagonal( A );

    PLA_Mscalar_create_conf_to( A, PLA_ALL_ROWS, PLA_ALL_COLS, &A_temp );
    PLA_Mscalar_create_conf_to( Q, PLA_ALL_ROWS, PLA_ALL_COLS, &Q_temp );
    PLA_Copy( Q, Q_temp );
    PLA_Copy( A, A_temp );

    max_A = PLA_Local_abs_max( A_cpy );

    /* Form Q A Q^T */
    PLA_Mscalar_create_conf_to( A_temp, PLA_ALL_ROWS, PLA_ALL_COLS, &B );
    PLA_Mscalar_create_conf_to( A_temp, PLA_ALL_ROWS, PLA_ALL_COLS, &C );
    /* B = Q A */
    PLA_Local_symm( PLA_SIDE_RIGHT, PLA_LOWER_TRIANGULAR, 
                     one, A_temp, Q_temp, zero, B );
    /* C = A - B Q^T*/
    PLA_Local_copy( A_cpy, C );
    PLA_Local_gemm( PLA_NO_TRANS, PLA_TRANS, minus_one, B, Q_temp, one, C );
    PLA_Set_triang_to_zero( PLA_LOWER_TRIANGULAR, PLA_NONUNIT_DIAG, C );

    diff = PLA_Local_abs_max( C );

    if ( diff > tol * max_A ){
      PLA_Warning( "PLA_Tri_red: large relative error encountered" );
      value--;
    }      

    PLA_Obj_free( &A_cpy );
    PLA_Obj_free( &A_temp );
    PLA_Obj_free( &Q_temp );
    PLA_Obj_free( &B );
    PLA_Obj_free( &C );
    PLA_Obj_free( &minus_one );
    PLA_Obj_free( &zero );
    PLA_Obj_free( &one );
  }

  size_malloced = PLA_Total_size_malloced( );

  if ( size_malloced != old_size_malloced )
    PLA_Warning( "PLA_Tri_red: memory discrepency" );
  
  PLA_Routine_stack_pop( routine_name );

  PLA_Routine_stack_pop( routine_name );

  return value;
}
int PLA_API_Poll_request ( )

/*--------------------------------------------------------------------------

Purpose :  Poll to see if messages have come in

----------------------------------------------------------------------------*/
{
  int 
    msgtype,
    m, n, lda,
    obj_index, source,
    align_row, align_col,
    typesize,
    local_ldim,
    temp;

  PLA_Obj
    obj_source = NULL;

  void
    *local_buf;

  MPI_Datatype 
    datatype;

  if ( PLA_API_Accept_recv_request_buffer( ) ){
    /* New recv put_buffer has come in */
    while ( !PLA_API_End_of_recv_request_buffer ( ) ){
      /* Process next message */
      PLA_API_read_data_from_request_buffer( sizeof( int ), 
					     ( char * ) &msgtype );
      switch ( msgtype ){
      case PLA_API_TYPE_PUT:
	PLA_Abort( "API illegal type", __LINE__, __FILE__ );

      case PLA_API_TYPE_SYNC:
	request_sync_counter ++;
	break;

      case PLA_API_TYPE_GET:
	PLA_API_read_data_from_request_buffer( sizeof( int ), ( char * ) &m );
	PLA_API_read_data_from_request_buffer( sizeof( int ), ( char * ) &n );
	PLA_API_read_data_from_request_buffer( sizeof( int ), 
					       ( char * ) &obj_index );
	PLA_API_read_data_from_request_buffer( sizeof( int ), 
					       ( char * ) &align_row );
	PLA_API_read_data_from_request_buffer( sizeof( int ), 
					       ( char * ) &align_col );
	PLA_API_read_data_from_request_buffer( sizeof( void * ),
						  ( char * ) &local_buf );
	PLA_API_read_data_from_request_buffer( sizeof( int ), 
					       ( char * )  &lda );
	PLA_API_read_data_from_request_buffer( sizeof( int ), 
					       ( char * )  &source );

	/* Take view into appropriate object */
	PLA_Obj_view( open_objects_list[ obj_index ],
		      m, n, align_row, align_col, &obj_source );

	/* Create room to send current message */
	PLA_Obj_datatype( obj_source, &datatype );
	MPI_Type_size( datatype, &typesize );
	PLA_API_put_buffer_make_room( 5 * sizeof( int ) + 
				      sizeof( void * ) +
				      m * n * typesize, source );
			   
	/* Enter put destination information in the buffer */
	temp = PLA_API_TYPE_GET;
	PLA_API_add_data_to_put_buffer( sizeof( int ), ( char * ) &temp,
				        source );
	/*	PLA_API_add_data_to_put_buffer( sizeof( int ), ( char * ) &datatype,
		source ); */
	PLA_API_add_data_to_put_buffer( sizeof( int ), ( char * ) &obj_index, 
				        source );
	PLA_API_add_data_to_put_buffer( sizeof( int ), ( char * ) &m, 
				        source );
	PLA_API_add_data_to_put_buffer( sizeof( int ), ( char * ) &n,
				        source );
	PLA_API_add_data_to_put_buffer( sizeof( void * ), 
				        ( char * ) &local_buf, source );
	PLA_API_add_data_to_put_buffer( sizeof( int ),  ( char * ) &lda,
				        source );

	/* Add the contents of the object to the buffer */
	PLA_API_add_contents_from_obj_to_put_buffer( obj_source, source );

	break;

      default:
	PLA_Abort( "API illegal type", __LINE__, __FILE__ );
      }
    }
  }

  PLA_Obj_free( &obj_source );

  return PLA_SUCCESS;
}
int PLA_Chol_simple( int nb_alg, PLA_Obj A )
/*
  Purpose : Parallel Cholesky Factorization.  This particular version 
            assumes only the lower triangular portion of A is stored.
	    It only uses PLAPACK parallel BLAS calls, to illustrate how
            a simple implementation reflects the algorithm perfectly.

  IN     nb_alg      integer, algorithmic block size to be used
  IN/OUT A           matrix to be factored

  Algorith used:

  ******************************************************************
      
      Partition  A = / A_TL ||   *  \
                     | =====  ===== |
                     \ A_BL || A_BR /
     	     where A_TL is 0 x 0 
      while A_BR is not 0 x 0 
         Determine block size b
	 Partition  A = / A_TL ||   *  \    / A_00 ||  *   |  *   \ 
                        | =====  ===== | =  | ====    ====   ==== |
                        \ A_BL || A_BR /    | A_10 || A_11 |  *   |
                                            | ----    ----   ---- |
                                            \ A_20 || A_21 | A_22 /
     	        where A_00 = A_TL and A_11 is b x b 
         Update A_11 <- L_11 = Chol. Fact.( A_11 )
         Update A_21 <- L_21 = A_21 inv( L_11' )
	 Update A_22 <- A_22 - L_21 * L_21'
         Continue with
	            A = / A_TL ||   *  \    / A_00 |  *   ||  *   \ 
                        | =====  ===== | =  | ----   ----    ---- |
                        \ A_BL || A_BR /    | A_10 | A_11 ||  *   |
                                            | ====   ====    ==== |
                                            \ A_20 | A_21 || A_22 /
      endwhile
              
  ******************************************************************

  NOTE:  For details on how to implement parallel Cholesky factorization
         see
	 
	 R. van de Geijn, Using PLAPACK, The MIT Press, 1997.

	 G. Morrow and R. van de Geijn, "Zen and the Art of High-Performance
                 Parallel Computing," http://www.cs.utexas.edu/users/plapack
*/
{
  int       
    value = 0,
    size_top, size_left,
    owner_top, owner_left,
    size;

  PLA_Obj  
    ABR             = NULL,     
    A11             = NULL,     A21             = NULL,     
    one             = NULL,     minus_one       = NULL;

  /* Create usual duplicated scalar constants */
  PLA_Create_constants_conf_to( A, &minus_one, NULL, &one );

  /* View ABR = A */
  PLA_Obj_view_all( A, &ABR );

  while ( TRUE ) {
    /* Determine size of current panel.  Notice that we limit the size so that 
       A_11 resides on a single processor */
    PLA_Obj_split_size( ABR, PLA_SIDE_TOP,  &size_top,  &owner_top );
    PLA_Obj_split_size( ABR, PLA_SIDE_LEFT, &size_left, &owner_left );
    if ( 0 == ( size = min( min( size_top, size_left), nb_alg ) ) ) break;

    /* Partition A_BR = / A_11   *   \
                        \ A_21  A_22 / where A_11 is b x b.
       Notice that A_22 becomes A_BR in the next iteration, so we
       already view this as ABR. */
    PLA_Obj_split_4( ABR, size, size,   &A11, PLA_DUMMY,
                                         &A21, &ABR );

    /* Update A_11 <- L_11 = Chol. Fact.( A_11 ) */
    PLA_Local_chol( PLA_LOWER_TRIANGULAR, A11 );

    /* Update A_21 <- L_21 = A_21 inv( L_11' ) */
    PLA_Trsm( PLA_SIDE_RIGHT, PLA_LOWER_TRIANGULAR,
	       PLA_TRANSPOSE,  PLA_NONUNIT_DIAG,
	       one, A11, A21 );

    /* Update A_22 <- A_22 - L_21 * L_21' */
    PLA_Syrk( PLA_LOWER_TRIANGULAR, minus_one, A21, one, ABR );
  }

  /* free temporary objects and views */
  PLA_Obj_free( &ABR );        
  PLA_Obj_free( &A11 );        PLA_Obj_free( &A21 ); 
  PLA_Obj_free( &one );        PLA_Obj_free( &minus_one );
  
  return value;
}
Exemple #19
0
int main(int argc, char *argv[])
{
    MPI_Comm
    comm = MPI_COMM_NULL;
    MPI_Datatype
    datatype;
    PLA_Template
    templ = NULL;
    PLA_Obj
    A      = NULL, A_orig = NULL, residual = NULL,
    minus_one   = NULL, one  = NULL, diff = NULL;
    int
    n,
    nb_distr, nb_alg,
    error, parameters, sequential,
    me, nprocs, nprows, npcols,
    itype;
    double
    time,
    flops;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    if (me==0) {
        printf("enter mesh size:\n");
        scanf("%d%d", &nprows, &npcols );
        printf("mesh size  = %d x %d \n", nprows, npcols );
        printf("enter distr. block size:\n");
        scanf("%d", &nb_distr );
        printf("nb_distr = %d\n", nb_distr );
        printf("enter alg. block size:\n");
        scanf("%d", &nb_alg );
        printf("nb_alg = %d\n", nb_alg );
        printf("turn on error checking? (0 = NO, 1 = YES):\n");
        scanf("%d", &error );
        printf("error checking = %d\n", error );
        printf("turn on parameter checking? (0 = NO, 1 = YES):\n");
        scanf("%d", &parameters );
        printf("parameter checking = %d\n", parameters );
        printf("turn on sequential checking? (0 = NO, 1 = YES):\n");
        scanf("%d", &sequential );
        printf("sequential checking = %d\n", sequential );
    }
    MPI_Bcast(&nprows, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&npcols, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&nb_distr, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&nb_alg,   1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&error, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&parameters, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&sequential, 1, MPI_INT, 0, MPI_COMM_WORLD);

    pla_Environ_set_nb_alg( PLA_OP_ALL_ALG, nb_alg );

    PLA_Set_error_checking( error, parameters, sequential, FALSE );

    /*  PLA_Comm_1D_to_2D_ratio(MPI_COMM_WORLD, 1.0, &comm); */
    PLA_Comm_1D_to_2D(MPI_COMM_WORLD, nprows, npcols, &comm);

    PLA_Init(comm);

    PLA_Temp_create( nb_distr, 0, &templ );

    while ( TRUE ) {
        if (me==0) {
            printf("enter datatype:\n");
            printf("-1 = quit\n");
            printf(" 0 = float\n");
            printf(" 1 = double\n");
            printf(" 2 = complex\n");
            printf(" 3 = double complex\n");
            scanf("%d", &itype );
            printf("itype = %d\n", itype );
        }
        MPI_Bcast(&itype, 1, MPI_INT, 0, MPI_COMM_WORLD);

        if ( itype == -1 ) break;
        switch( itype ) {
        case 0:
            datatype = MPI_FLOAT;
            break;
        case 1:
            datatype = MPI_DOUBLE;
            break;
        case 2:
            datatype = MPI_COMPLEX;
            break;
        case 3:
            datatype = MPI_DOUBLE_COMPLEX;
            break;
        default:
            PLA_Abort( "unknown datatype", __LINE__, __FILE__ );
        }

        if (me==0) {
            printf("enter n:\n");
            scanf("%d", &n );
            printf("n = %d\n", n );
        }

        MPI_Bcast(&n,     1, MPI_INT, 0, MPI_COMM_WORLD);

        PLA_Matrix_create( datatype,
                           n,
                           n,
                           templ,
                           PLA_ALIGN_FIRST,
                           PLA_ALIGN_FIRST,
                           &A );

        PLA_Matrix_create_conf_to( A, &A_orig );

        PLA_Matrix_create_conf_to( A, &residual );

        create_problem( A );

        {
            double d_n;

            d_n = (double) n;
            PLA_Shift( A, MPI_DOUBLE, &d_n );
        }

        PLA_Create_constants_conf_to( A, &minus_one, NULL, &one );

        PLA_Local_copy( A, A_orig );

        /* Use invert routine that uses factors */

        MPI_Barrier( MPI_COMM_WORLD );
        time = MPI_Wtime ();

        PLA_Triangular_invert( PLA_LOWER_TRIANGULAR, A );

        MPI_Barrier( MPI_COMM_WORLD );
        time = MPI_Wtime () - time;

        flops = 2.0/3.0 * n * n * n;

        if ( me == 0 )
            printf("%d time = %f, MFLOPS/node = %10.4lf \n", n, time,
                   flops / time * 1.0e-6 / nprocs );

        PLA_Obj_set_to_identity( residual );

        PLA_Set_triang_to_zero( PLA_LOWER_TRIANGULAR, PLA_NONUNIT_DIAG,
                                A_orig );
        PLA_Set_triang_to_zero( PLA_LOWER_TRIANGULAR, PLA_NONUNIT_DIAG,
                                A );

        PLA_Gemm( PLA_NO_TRANS, PLA_NO_TRANS, minus_one, A_orig, A, one,
                  residual );

        PLA_Mscalar_create( datatype, PLA_ALL_ROWS, PLA_ALL_COLS,
                            1, 1, templ, &diff );

        PLA_Matrix_one_norm( residual, diff );
    }

    PLA_Obj_free( &A );
    PLA_Obj_free( &A_orig );
    PLA_Obj_free( &residual );
    PLA_Obj_free( &minus_one );
    PLA_Obj_free( &one );
    PLA_Obj_free( &diff );

    PLA_Temp_free(&templ);
    PLA_Finalize( );
    MPI_Finalize( );

}
int PLA_Symm_exit( int side, int uplo,
	             PLA_Obj alpha, PLA_Obj A, PLA_Obj B, 
                     PLA_Obj beta,  PLA_Obj C )

{
  int value = PLA_SUCCESS,
      size_malloced;
  double 
    max_A, max_B, max_C, max_all, diff,
    PLA_Local_abs_max(), PLA_Local_abs_diff();
  PLA_Obj
    one = NULL;
  char 
    routine_name[ 35 ];

  PLA_Routine_stack_push( "PLA_Gemm_exit" );

  if ( PLA_CHECK_AGAINST_SEQUENTIAL ){
    PLA_Obj C_temp = NULL;

    PLA_Mscalar_create_conf_to( C, PLA_ALL_ROWS, PLA_ALL_COLS, &C_temp );
    PLA_Copy( C, C_temp );

    max_A = PLA_Local_abs_max( A_cpy );
    max_B = PLA_Local_abs_max( B_cpy );

    PLA_Local_scal( beta_cpy, C_cpy );

    max_C = PLA_Local_abs_max( C_cpy ); 
    max_all = max( max_A, max( max_B, max_C ) ); 

    PLA_Create_constants_conf_to( A, NULL, NULL, &one );
    PLA_Local_symm( side, uplo, alpha_cpy, A_cpy, B_cpy,
		                     one,  C_cpy );

    diff = PLA_Local_abs_diff( C_temp, C_cpy );

    if ( diff > 0.000001 * max_all ){
      PLA_Warning( "PLA_Symm: large relative error encountered" );
      value--;
    }      

    PLA_Obj_free( &alpha_cpy );
    PLA_Obj_free( &A_cpy );
    PLA_Obj_free( &B_cpy );
    PLA_Obj_free( &beta_cpy );
    PLA_Obj_free( &C_cpy );

    PLA_Obj_free( &C_temp );
    PLA_Obj_free( &one );
  }

  size_malloced = PLA_Total_size_malloced( );

  if ( size_malloced != old_size_malloced )
    PLA_Warning( "PLA_Symm: memory discrepency" );
  
  PLA_Routine_stack_pop( routine_name );

  PLA_Routine_stack_pop( routine_name );

  return value;
}
int PLA_Matrix_one_norm( PLA_Obj A, PLA_Obj alpha )
{
  int 
    size, proj_onto_A,
    value = PLA_SUCCESS;

  PLA_Obj
    A_R = NULL,      a_1 = NULL,
    col_asums = NULL, local_col_asums = NULL, local_col_asums_R = NULL,
    alpha_1 = NULL, local_max = NULL;

  
  if ( PLA_ERROR_CHECKING )    /* Perform parameter and error checking */
    value = PLA_Matrix_one_norm_enter( A, alpha );

  /* Create a vector to hold the one norm of the columns of the matrix */

  PLA_Obj_get_orientation( A, &proj_onto_A );
  PLA_Obj_set_orientation( A, PLA_PROJ_ONTO_ROW );
  PLA_Mvector_create_conf_to( A, 1, &col_asums );
  PLA_Obj_set_orientation( A, proj_onto_A );

  /* Create a vector projected onto rows to hold the one norm of the
     columns of the local matrices */

  PLA_Pmvector_create_conf_to( A, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS, 1,
				&local_col_asums );

  PLA_Obj_view_all( A, &A_R );
  PLA_Obj_view_all( local_col_asums, &local_col_asums_R );

  while ( TRUE ){
    PLA_Obj_global_width( A_R, &size );
    if ( size == 0 ) break;

    PLA_Obj_vert_split_2( A_R, 1,               &a_1,     &A_R );
    PLA_Obj_vert_split_2( local_col_asums_R, 1,  &alpha_1, &local_col_asums_R );
    
    PLA_Local_asum( a_1, alpha_1 );
  }
  
  /* compute global 1-norms of the columns */
  PLA_Reduce( local_col_asums, MPI_SUM, col_asums );
  
  /* Create a multiscalar in which to hold the local asum */

  PLA_Mscalar_create_conf_to( alpha, PLA_ALL_ROWS, PLA_ALL_COLS, 
			       &local_max );

  PLA_Local_absolute_max( col_asums, local_max );

  PLA_Reduce( local_max, MPI_MAX, alpha );

  PLA_Obj_free ( &A_R );
  PLA_Obj_free ( &a_1 );
  PLA_Obj_free ( &col_asums );
  PLA_Obj_free ( &local_col_asums );
  PLA_Obj_free ( &local_col_asums_R );
  PLA_Obj_free ( &local_max );
  PLA_Obj_free ( &alpha_1 );

  if ( PLA_ERROR_CHECKING )    /* Perform parameter and error checking */
    value = PLA_Matrix_one_norm_exit( A, alpha );

}
int PLA_API_Poll_put ( )

/*--------------------------------------------------------------------------

Purpose :  Poll to see if messages have come in

----------------------------------------------------------------------------*/
{
  int 
    msgtype,
    m, n, lda,
    obj_index,
    align_row, align_col,
    local_ldim;

  PLA_Obj
    obj_dest = NULL;

  void
    *local_buf;

  MPI_Datatype
    datatype;

  if ( PLA_API_Accept_recv_put_buffer( ) ){
    /* New recv put_buffer has come in */
    while ( !PLA_API_End_of_recv_put_buffer ( ) ){
      /* Process next message */
      PLA_API_read_data_from_put_buffer( sizeof( int ), ( char * ) &msgtype );

      switch ( msgtype ){
      case PLA_API_TYPE_PUT:
	PLA_API_read_data_from_put_buffer( sizeof( int ), ( char * ) &m );
	PLA_API_read_data_from_put_buffer( sizeof( int ), ( char * ) &n );
	PLA_API_read_data_from_put_buffer( sizeof( int ), 
					   ( char * ) &obj_index );
	PLA_API_read_data_from_put_buffer( sizeof( int ), 
					   ( char * ) &align_row );
	PLA_API_read_data_from_put_buffer( sizeof( int ), 
					   ( char * ) &align_col );

	/* Take view into appropriate object */
	PLA_Obj_view( open_objects_list[ obj_index ],
		      m, n, align_row, align_col, &obj_dest );

	/* Add the contents of the message to the local contents
           of the view */
	PLA_API_read_contents_from_buffer_to_obj( m, n, obj_dest );

	break;
      case PLA_API_TYPE_SYNC:
	put_sync_counter ++;

	break;
      case PLA_API_TYPE_GET:
	/* Enter put destination information in the buffer */
	/*	PLA_API_read_data_from_put_buffer( sizeof( MPI_Datatype ), 
		( char *) &datatype ); */
	PLA_API_read_data_from_put_buffer( sizeof( int ), 
					    ( char * ) &obj_index );
	PLA_API_read_data_from_put_buffer( sizeof( int ), ( char * ) &m );
	PLA_API_read_data_from_put_buffer( sizeof( int ), ( char * ) &n );
	PLA_API_read_data_from_put_buffer( sizeof( void * ),
					    ( char * ) &local_buf );
	PLA_API_read_data_from_put_buffer( sizeof( int ), ( char * ) &lda );

	PLA_Obj_datatype( open_objects_list[ obj_index ], &datatype );

	/* Add the contents of the object to the buffer */
	PLA_API_add_matrix_from_recv_buffer_to_local( datatype,
               m, n, local_buf, lda );

	break;
      default:
	PLA_Abort( "API illegal type", __LINE__, __FILE__ );
      }
    }
  }

  PLA_Obj_free( &obj_dest );

  return PLA_SUCCESS;
}
void PLA_Trmm_left_lower_notrans( int diag, PLA_Obj alpha, PLA_Obj A, PLA_Obj B, int nb_alg )
{
  PLA_Obj     ATL=NULL, ATR=NULL,     A00=NULL, A01=NULL, A02=NULL,    BT=NULL,           B0=NULL,
              ABL=NULL, ABR=NULL,     A10=NULL, A11=NULL, A12=NULL,    BB=NULL,           B1=NULL,
                                      A20=NULL, A21=NULL, A22=NULL,                  B2=NULL,
              A11_dup=NULL, B1_mv=NULL, A21_dpmv=NULL, B1_dpmv=NULL,
              ONE=NULL;

  int         b, size_row;

  PLA_Create_constants_conf_to( A, NULL, NULL, &ONE );

  PLA_Part_2x2( A,  &ATL, /**/ &ATR,
                  /* ************** */
                    &ABL, /**/ &ABR,   0, 0,     /* submatrix */ PLA_BR );

  PLA_Part_2x1( B,  &BT, 
                   /***/
                    &BB,               0, /* length submatrix */ PLA_BOTTOM );

  while ( TRUE ){
    PLA_Obj_global_length( ATL, &size_row );
    b = min( size_row, nb_alg );
    if ( 0 == b ) break;

    PLA_Repart_2x2_to_3x3( ATL, /**/ ATR,         &A00, &A01, /**/ &A02,
                                /**/              &A10, &A11, /**/ &A12,
                        /* ************* */    /* ********************* */
                           ABL, /**/ ABR,         &A20, &A21, /**/ &A22,   
                           b, b, /* A11 from */ PLA_TL );

    PLA_Repart_2x1_to_3x1( BT,                   &B0,
                                                 &B1,
                          /**/                   /**/
                           BB,                   &B2,    
                           b, /* length B1 from */ PLA_TOP );

    /* ********************************************************************* */

    PLA_Obj_set_orientation( B1, PLA_PROJ_ONTO_ROW );

    PLA_Pmvector_create_conf_to( B2, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS, b, &B1_dpmv );

    PLA_Pmvector_create_conf_to( B2, PLA_PROJ_ONTO_COL, PLA_ALL_COLS, b, &A21_dpmv );

    PLA_Copy( B1, B1_dpmv );

    PLA_Copy( A21, A21_dpmv );

    PLA_Local_gemm( PLA_NO_TRANSPOSE, PLA_NO_TRANSPOSE, alpha, A21_dpmv, B1_dpmv, ONE, B2 );

    PLA_Mscalar_create_conf_to( A11, PLA_ALL_ROWS, PLA_ALL_COLS, &A11_dup );

    PLA_Copy( A11, A11_dup );

    PLA_Local_trmm( PLA_LEFT, PLA_LOWER_TRIANGULAR, PLA_NO_TRANSPOSE, diag, alpha, A11_dup, B1_dpmv );
    
    PLA_Copy( B1_dpmv, B1 );

    /* ********************************************************************* */

    PLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,         A00, /**/ A01, A02,
                            /* ************** */   /* ****************** */
                                    /**/               A10, /**/ A11, A12,
                              &ABL, /**/ &ABR,         A20, /**/ A21, A22, 
			      /* A11 added to */ PLA_BR );

    PLA_Cont_with_3x1_to_2x1( &BT,                     B0,
                             /***/                    /**/
                                                       B1,
                              &BB,                     B2,                 
			      /* B1  added to */ PLA_BOTTOM );
  }

  PLA_Obj_free( &ATL );
  PLA_Obj_free( &ATR );
  PLA_Obj_free( &A00 );
  PLA_Obj_free( &A01 );
  PLA_Obj_free( &A02 );
  PLA_Obj_free( &BT );
  PLA_Obj_free( &B0 );
  PLA_Obj_free( &ABL );
  PLA_Obj_free( &ABR );
  PLA_Obj_free( &A10 );
  PLA_Obj_free( &A11 );
  PLA_Obj_free( &A11_dup );
  PLA_Obj_free( &A12 );
  PLA_Obj_free( &BB );
  PLA_Obj_free( &B1 );
  PLA_Obj_free( &B1_mv );
  PLA_Obj_free( &B1_dpmv );
  PLA_Obj_free( &A20 );
  PLA_Obj_free( &A21 );
  PLA_Obj_free( &A21_dpmv );
  PLA_Obj_free( &A22 );
  PLA_Obj_free( &B2 );
  PLA_Obj_free( &ONE );
}
int PLA_Triangular_invert_lower( PLA_Obj A )
{
  int 
    nb_alg, size, value = PLA_SUCCESS;

  PLA_Obj
    A_BR = NULL, A_11 = NULL, A_21 = NULL, A_21_dup = NULL, A_11_msc = NULL,
    Ainv = NULL, Ainv_BL = NULL, Ainv_10_11 = NULL, Ainv_10_11_dup = NULL,
    minus_one = NULL, zero = NULL, one = NULL;

  PLA_Template
    templ;

  PLA_Obj_template( A, &templ );
  PLA_Environ_nb_alg( PLA_OP_PAN_PAN, templ, &nb_alg );

  PLA_Create_constants_conf_to( A, &minus_one, &zero, &one );
  PLA_Obj_vert_split_2( A, 0, &Ainv_BL, &A_BR );

  
  while( TRUE ){
    PLA_Obj_global_length( A_BR, &size );
    if ( 0 == ( size = min( size, nb_alg ) ) ) break;

    PLA_Obj_split_4( A_BR, size, size, &A_11, PLA_DUMMY,
                                        &A_21, &A_BR );

    PLA_Obj_view_shift( Ainv_BL,       0,
                                 0,             size,
                                        0 ); 

    PLA_Obj_horz_split_2( Ainv_BL, size, &Ainv_10_11,
                                          &Ainv_BL );

    PLA_Mscalar_create_conf_to( A_11, PLA_ALL_ROWS, PLA_ALL_COLS,
				 &A_11_msc );
    PLA_Copy( A_11, A_11_msc );

    PLA_Obj_set_orientation( Ainv_10_11, PLA_PROJ_ONTO_ROW );
    PLA_Pmvector_create_conf_to( Ainv_BL, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS,
				  size, &Ainv_10_11_dup );
    PLA_Pmvector_create_conf_to( Ainv_BL, PLA_PROJ_ONTO_COL, PLA_ALL_COLS,
				  size, &A_21_dup );

    /*    PLA_Trsm( PLA_SIDE_LEFT,    PLA_LOWER_TRIANGULAR,
	       PLA_NO_TRANSPOSE, PLA_NONUNIT_DIAG,
	       one, A_11, Ainv_10_11 );  */

    PLA_Obj_set_to_identity( A_11 );
    
    PLA_Copy( Ainv_10_11, Ainv_10_11_dup );

    PLA_Local_trsm( PLA_SIDE_LEFT,    PLA_LOWER_TRIANGULAR,
	       PLA_NO_TRANSPOSE, PLA_NONUNIT_DIAG,
	       one, A_11_msc, Ainv_10_11_dup ); 

    PLA_Copy( A_21,       A_21_dup );

    PLA_Obj_set_to_zero( A_21 );
 
    PLA_Local_gemm( PLA_NO_TRANS, PLA_NO_TRANS, 
	       minus_one, A_21_dup, Ainv_10_11_dup,
	       one, Ainv_BL );

    PLA_Copy( Ainv_10_11_dup, Ainv_10_11 );
  }

  PLA_Obj_free( &A_BR );
  PLA_Obj_free( &A_11 );
  PLA_Obj_free( &A_21 );
  PLA_Obj_free( &Ainv );
  PLA_Obj_free( &Ainv_BL );
  PLA_Obj_free( &Ainv_10_11 );
  PLA_Obj_free( &minus_one );
  PLA_Obj_free( &zero );
  PLA_Obj_free( &one );

  return PLA_SUCCESS;
}
int PLA_LU_linpack_right( int nb_alg, PLA_Obj A, PLA_Obj pivots )
/*
   Blocked right-looking LU factorization;

   Input: 

   nb_alg        --  Algorithmic blocking size to be used
  
   A             --  m x n PLA_Matrix
                     contains original matrix to be factored

   pivots        --  PLA_Mvector of width 1 and datatype MPI_INT
                     vector of length min( m, n ) in which to store
                     the pivot information

   Output:

   A             --  m x n PLA_Matrix
                     contains factors L and U s.t. P A_orig = L U

   pivots        --  PLA_Mvector of width 1 and datatype MPI_INT
                     integer vector that describes permutation matrix P
		     
   returns value PLA_SUCCESS if factorization completes successfully
*/
{
  int       
    value = PLA_SUCCESS,
    size, length, width;

  PLA_Obj  
    A_BR            = NULL,     A_1            = NULL,
    A_B             = NULL,
    A_11            = NULL,     A_12            = NULL,     
    A_21            = NULL,     A_21_dpmv       = NULL,
    A_1_mv          = NULL,     A_1_dpmv        = NULL,
    A_11_msc        = NULL,     A_12_dpmv       = NULL,
    A_12_mv         = NULL,
    pivots_B        = NULL,     pivots_1       = NULL,
    pivots_1_msc    = NULL,
    one             = NULL,     minus_one      = NULL;

  /* Create usual duplicated scalar constants */
  PLA_Create_constants_conf_to( A, &minus_one, NULL, &one );

  /* Partition A = / A_TL  A_TR \
                   \ A_BL  A_BR / where A_TL is 0 x 0 

     A_BR will track the active part of matrix A       */
  PLA_Obj_view_all( A, &A_BR );

  /* Partition A = / A_T \
                   \ A_B /   where A_T is 0 x n

     A_B will track the part of A that needs to be pivoted */

  PLA_Obj_view_all( A, &A_B );
 
  /* Partition pivots = / pivots_T \
                        \ pivots_B /   where pivots_T is 0 x 1 
    
     pivots_B will track the active part of the pivot vector */

  PLA_Obj_view_all( pivots, &pivots_B );

  /* Do until done */
  while ( TRUE ) {          
    /* Determine size of current panel and check if done */
    PLA_Obj_global_length( A_BR, &length );
    PLA_Obj_global_width ( A_BR,  &width );
    size = min( length, width );
    if ( 0 == ( size = min( size, nb_alg ) ) ) break;

    /* Partition off the current column panel to be factored */
    PLA_Obj_vert_split_2( A_BR, size,       &A_1, PLA_DUMMY );

    /* Create a multivector to hold current column panel, allowing
       all nodes to participate in the panel factorization */
    PLA_Mvector_create_conf_to( A_1, size, &A_1_mv );

    /* Copy the current column panel to the multivector */
    PLA_Copy( A_1, A_1_mv );

    /* Partition A_BR = / A_11 A_12 \
                        \ A_21 A_22 /  where A_11 is size x size */	       
    PLA_Obj_split_4( A_BR, size, size,      &A_11, &A_12,
                                            &A_21, PLA_DUMMY );

    /* Create a duplicated multiscalar to hold a copy of A_11 after
       the panel has been factored (side-effect of the panel factorization
       routine */
    PLA_Mscalar_create_conf_to( A_11, PLA_ALL_ROWS, PLA_ALL_COLS, 
                                 &A_11_msc );

    /* Partition off the subvector of pivots that will hold the pivot
       information for the column panel to be factored */
    PLA_Obj_horz_split_2( pivots_B, size,   &pivots_1, 
                                             &pivots_B );

    /* Create a duplicated multiscalar in which to store the pivot
       information for the column panel factorization */
    PLA_Mscalar_create_conf_to( pivots_1, PLA_ALL_ROWS, PLA_ALL_COLS, 
				 &pivots_1_msc );

    /* Factor the current column panel, distributed as a multivector.
       the updated A_11 matrix will be returned duplicated to all
       nodes in A_11_msc */ 	       
    value = PLA_LU_right_mv( A_1_mv, A_11_msc, pivots_1_msc  );
    if ( value != PLA_SUCCESS )  break;

    /* Copy the pivot information for the current column panel into pivots */
    PLA_Copy( pivots_1_msc, pivots_1 );

    /* Pivot the rows of the matrix consistent with the pivot information
       for the current column panel */
    PLA_Apply_pivots_to_rows( A_BR, pivots_1_msc );

    PLA_Obj_split_4( A_BR, size, size,      PLA_DUMMY, PLA_DUMMY,
                                            PLA_DUMMY, &A_BR );

    /* Copy A_12 so it is duplicated to all rows, in A_12_dpmv */
    PLA_Obj_set_orientation( A_12, PLA_PROJ_ONTO_ROW );
    PLA_Pmvector_create_conf_to( A_12, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS,
				  size, &A_12_dpmv );

    /* Duplicate the current panel to all columns, in A_1_dpmv */
    PLA_Pmvector_create_conf_to( A_1, PLA_PROJ_ONTO_COL, PLA_ALL_COLS,
				  size, &A_1_dpmv );
    PLA_Copy( A_1_mv,   A_1_dpmv );

    /* Place the current column panel back into the matrix */
    PLA_Copy( A_1_dpmv, A_1 );  

    /* View view the part of A_1_dpmv that holds the copy of the updated
       matrix A_21 */
    PLA_Obj_horz_split_2( A_1_dpmv, size, PLA_DUMMY,
			                   &A_21_dpmv );
    
    /* Update A_12 <- inv(L_11) A_12, while distributed in A_12_dpmv */

    PLA_Mvector_create_conf_to( A_12_dpmv, size, &A_12_mv );
    PLA_Copy( A_12, A_12_mv );
    PLA_Local_trsm( PLA_SIDE_RIGHT, PLA_LOWER_TRIANGULAR, 
		     PLA_TRANSPOSE, PLA_UNIT_DIAG,
		     one, A_11_msc, A_12_mv );
    PLA_Copy( A_12_mv, A_12_dpmv ); 

    /* Place the current row panel A_12 back into the matrix */
    PLA_Copy( A_12_dpmv, A_12 );

    /* Update A_22 (viewed by A_BR) <- A_22 - A_21 * A_12 */
    PLA_Local_gemm( PLA_NO_TRANSPOSE, PLA_NO_TRANSPOSE, 
		     minus_one, A_21_dpmv, A_12_dpmv, one, A_BR ); 

    /* Update the view of the part of the matrix that still needs to
       be pivoted in subsequent steps */
    PLA_Obj_horz_split_2( A_B, size,       PLA_DUMMY,
			                    &A_B );
  }
  PLA_Obj_free( &A_BR );
  PLA_Obj_free( &A_1 );
  PLA_Obj_free( &A_B );
  PLA_Obj_free( &A_11 );
  PLA_Obj_free( &A_12 );
  PLA_Obj_free( &A_21 );
  PLA_Obj_free( &A_21_dpmv );
  PLA_Obj_free( &A_1_mv );
  PLA_Obj_free( &A_1_dpmv );
  PLA_Obj_free( &A_12_dpmv );
  PLA_Obj_free( &A_12_mv );
  PLA_Obj_free( &A_11_msc );
  PLA_Obj_free( &pivots_B );
  PLA_Obj_free( &pivots_1 );
  PLA_Obj_free( &pivots_1_msc );
  PLA_Obj_free( &one );
  PLA_Obj_free( &minus_one );

  return value;
}
int main(int argc, char *argv[])
{
  MPI_Comm 
    comm = MPI_COMM_NULL;
  MPI_Datatype
    datatype;
  PLA_Template 
    templ = NULL;
  PLA_Obj  
    A_orig  = NULL, A      = NULL, Q = NULL, diag = NULL,  B = NULL,
    minus_one = NULL, zero   = NULL, one  = NULL;
  int      
    n,
    nb_distr, nb_alg,
    error, parameters, sequential,
    me, nprocs, nprows, npcols,
    itype;
  double 
    time,
    flops,
    d_abs_max,
    PLA_Local_abs_max();

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  if (me==0) {
    printf("enter mesh size:\n");
    scanf("%d%d", &nprows, &npcols );
    printf("mesh size  = %d x %d \n", nprows, npcols );
    printf("enter distr. block size:\n");
    scanf("%d", &nb_distr );
    printf("nb_distr = %d\n", nb_distr );
    printf("enter alg. block size:\n");
    scanf("%d", &nb_alg );
    printf("nb_alg = %d\n", nb_alg );
    printf("turn on error checking? (0 = NO, 1 = YES):\n");
    scanf("%d", &error );
    printf("error checking = %d\n", error );
    printf("turn on parameter checking? (0 = NO, 1 = YES):\n");
    scanf("%d", &parameters );
    printf("parameter checking = %d\n", parameters );
    printf("turn on sequential checking? (0 = NO, 1 = YES):\n");
    scanf("%d", &sequential );
    printf("sequential checking = %d\n", sequential );
  }
  MPI_Bcast(&nprows, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&npcols, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&nb_distr, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&nb_alg,   1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&error, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&parameters, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sequential, 1, MPI_INT, 0, MPI_COMM_WORLD);

  pla_Environ_set_nb_alg( PLA_OP_ALL_ALG, nb_alg );

  PLA_Set_error_checking( error, parameters, sequential, FALSE );

/*  PLA_Comm_1D_to_2D_ratio(MPI_COMM_WORLD, 1.0, &comm); */
  PLA_Comm_1D_to_2D(MPI_COMM_WORLD, nprows, npcols, &comm); 

  PLA_Init(comm);
    
  PLA_Temp_create( nb_distr, 0, &templ );
    
  while ( TRUE ){
    if (me==0) {
      printf("enter datatype:\n");
      printf("-1 = quit\n");
      printf(" 0 = float\n");
      printf(" 1 = double\n");
      printf(" 2 = complex\n");
      printf(" 3 = double complex\n");
      scanf("%d", &itype );
      printf("itype = %d\n", itype );
    }
    MPI_Bcast(&itype, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if ( itype == -1 ) break;
    switch( itype ){
    case 0:
      datatype = MPI_FLOAT;
      break;
    case 1:
      datatype = MPI_DOUBLE;
      break;
    case 2:
      datatype = MPI_COMPLEX;
      break;
    case 3:
      datatype = MPI_DOUBLE_COMPLEX;
      break;
    default:
      PLA_Abort( "unknown datatype", __LINE__, __FILE__ );
    }

    if (me==0) {
      printf("enter n:\n");
      scanf("%d", &n );
      printf("n = %d\n", n );
    }

    MPI_Bcast(&n,     1, MPI_INT, 0, MPI_COMM_WORLD);

    PLA_Matrix_create( datatype, 
		        n, 
		        n,
			templ, 
                        PLA_ALIGN_FIRST, 
                        PLA_ALIGN_FIRST, 
                        &A_orig ); 

    PLA_Matrix_create_conf_to( A_orig, &Q );

    PLA_Matrix_create_conf_to( A_orig, &A );

    PLA_Mvector_create( datatype,
		        n, 
		        1,
			templ, 
                        PLA_ALIGN_FIRST, 
                        &diag ); 

    PLA_Create_constants_conf_to( A, &minus_one, &zero, &one );

    create_diag( diag );

    PLA_Create_sym_eigenproblem( PLA_LOWER_TRIANGULAR, 3, diag, A_orig, Q );

    PLA_Copy( A_orig, A );

    MPI_Barrier( MPI_COMM_WORLD );
    time = MPI_Wtime ();

    PLA_Spectral_decomp( PLA_LOWER_TRIANGULAR, A, Q, diag );

    MPI_Barrier( MPI_COMM_WORLD );
    time = MPI_Wtime () - time;

    /******* Check answer *******/

    /* Make A_orig symmetric */
      PLA_Symmetrize( PLA_LOWER_TRIANGULAR, A_orig );

    PLA_Matrix_create_conf_to( A_orig, &B );

    PLA_Obj_set_to_zero( A );
    PLA_Obj_set_diagonal( A, diag );

    /* A_orig = A_orig - Q diag Q^T */
    PLA_Gemm( PLA_NO_TRANSPOSE, PLA_NO_TRANSPOSE,
	       one, Q, A, zero, B );
    PLA_Gemm( PLA_NO_TRANSPOSE, PLA_TRANSPOSE,
	       minus_one, B, Q, one, A_orig );

    /* Extract absolute value of entry with largest absolute value in A_orig */
    d_abs_max = PLA_Local_abs_max( A_orig );

    if ( d_abs_max > 0.000000001 )
      printf( "large error detected: %le\n", d_abs_max );
      
    flops = 4.0/3.0 * n * n * n;

    if ( me == 0 ) 
      printf("%d time = %f, MFLOPS/node = %10.4lf \n", n, time,
	     flops / time * 1.0e-6 / nprocs );

    PLA_Obj_free( &A_orig );
    PLA_Obj_free( &A );
    PLA_Obj_free( &Q );
    PLA_Obj_free( &diag );
    PLA_Obj_free( &B );
    PLA_Obj_free( &minus_one );
    PLA_Obj_free( &zero );
    PLA_Obj_free( &one );
  }

  PLA_Temp_free(&templ);
  PLA_Finalize( );
  MPI_Finalize( );
}
int PLA_Form_Q_x( int trans, int uplo, PLA_Obj A, PLA_Obj s, PLA_Obj Q )

/*
  Purpose: Compute Q = H_1 * H_2 * ... * H_n-1 where H_i is
  the ith Householder transform stored in the ith column of A of
  uplo == PLA_LOWER_TRIANGULAR or the ith row of A if 
  uplo == PLA_UPPER_TRIANGULAR and 
  ith entry in s.
  If trans = PLA_NO_TRANS, Q is computed.  Otherwise, Q^T is computed.

  Input:  trans   --   Indicates whether Q or Q^T is to be computed.
          uplo    --   Indicates whether Householder vectors are
                       stored above or below diagonal.
          A       --   General mxn matrix A 
                       (PLA_MATRIX)
          s       --   vector for storing scalar in Householder transforms
                       (MVECTOR of length=min(m,n), width=1)

  Output: Q      

  Return value: PLA_SUCCESS iff Q is computed successfully.
*/
{
  int       size, length, width, nb_alg1, nb_alg2, nb_alg, proj_old;
  PLA_Template  templ;
  PLA_Obj   ATL         = NULL,     ABR            = NULL,
            QBR         = NULL,
            A1B         = NULL,     AB1_mv         = NULL,     
            W_mv        = NULL,     Y_mv        = NULL,     
            sL          = NULL,
            s_cur       = NULL,     s_dup   = NULL;

  if ( uplo == PLA_LOWER_TRIANGULAR )
    return PLA_Form_Q( trans, A, s, Q );

  /* Determine algorithmic blocking size */
  PLA_Obj_template( A, &templ );
  PLA_Environ_nb_alg( PLA_OP_PAN_PAN, templ, &nb_alg1 ); 
  PLA_Environ_nb_alg( PLA_OP_PAN_MAT, templ, &nb_alg2 ); 
  nb_alg = ( nb_alg1 > nb_alg2 ? nb_alg1 : nb_alg2 );

  /* Q = I */
  PLA_Obj_set_to_identity( Q );

  /* Apply the Householder vectors like 
                 ( H_1 ( ... ( H_n-1 ( H_n I ) ) ... ) ) */

  /* Initially partition A = / ATL | ATR \  where ATL is square and
                             \  *  | ABR /  ABR has width 0  */

  PLA_Obj_global_length( A, &length );
  PLA_Obj_global_width ( A, &width );
  width = min( length, width );
  PLA_Obj_split_4( A, width, width,   &ATL,      PLA_DUMMY,
                                      PLA_DUMMY, &ABR );

  PLA_Obj_split_4( Q, width, width,   PLA_DUMMY, PLA_DUMMY,
                                      PLA_DUMMY, &QBR );

  PLA_Obj_horz_split_2( s, width,     &sL,       
                                      PLA_DUMMY );

  while ( TRUE ) {
    /* Check if done */

    PLA_Obj_global_length( ATL, &length ); 
    PLA_Obj_global_width( ATL, &width ); 
    if ( ( size = min( min( length, width), nb_alg ) ) == 0 ) break;

    /* Grow ABR by the panel from which to compute the next 
       WY transform.  Grow QBR similarly */

    PLA_Obj_view_shift( ABR,    -size, 
                     -size,               0,
                                  0 );

    PLA_Obj_view_shift( QBR,      -size,
                          -size,           0,
                                    0 );

    /* Partition off A1B from which to compute the next WY transform */
    PLA_Obj_horz_split_2( ABR, size, &A1B, 
                                     PLA_DUMMY );
    PLA_Obj_set_orientation( A1B, PLA_PROJ_ONTO_ROW );

    /* Partition off the scaling factors and duplicate to all nodes */

    PLA_Obj_horz_split_2( sL,  -size,     &sL, 
                                          &s_cur );

    PLA_Mscalar_create_conf_to( s_cur, PLA_ALL_ROWS, PLA_ALL_COLS, 
                                &s_dup );

    /* Redistribute A1B as a multivector and compute W and Y */

    PLA_Obj_get_orientation( QBR, &proj_old );
    PLA_Obj_set_orientation( QBR, PLA_PROJ_ONTO_ROW );
    PLA_Mvector_create_conf_to( QBR, size, &AB1_mv );
    PLA_Mvector_create_conf_to( QBR, size, &W_mv );
    PLA_Mvector_create_conf_to( QBR, size, &Y_mv );
    PLA_Obj_set_orientation( QBR, proj_old );

    PLA_Copy( A1B, AB1_mv );
    PLA_Copy( s_cur, s_dup );

    PLA_Compute_WY( AB1_mv, s_dup, W_mv, Y_mv );

    if ( trans == PLA_NO_TRANSPOSE ) {
      /* Update QBR <- ( I + W Y^T ) QBR */
      PLA_Apply_W_Y_transform ( PLA_SIDE_LEFT, PLA_NO_TRANSPOSE,
                                W_mv, Y_mv, QBR ); 
    }
    else {
      /* Update QBR <- QBR ( I + W^T Y ) QBR */
      PLA_Apply_W_Y_transform ( PLA_SIDE_RIGHT, PLA_TRANSPOSE,
                                W_mv, Y_mv, QBR ); 
    }

    /* Update view of ATL */

    PLA_Obj_split_4( ATL, -size, -size,  &ATL,      PLA_DUMMY,
                                         PLA_DUMMY, PLA_DUMMY );
  }

  PLA_Obj_free( &ATL );           PLA_Obj_free( &ABR );            
  PLA_Obj_free( &QBR ); 
  PLA_Obj_free( &A1B );           PLA_Obj_free( &AB1_mv );
  PLA_Obj_free( &W_mv );          PLA_Obj_free( &Y_mv );          
  PLA_Obj_free( &s_cur );         PLA_Obj_free( &s_dup );
  PLA_Obj_free( &sL );

  return PLA_SUCCESS;
}
void create_problem( PLA_Obj A, PLA_Obj x, PLA_Obj b )
{
  PLA_Obj  
    zero = NULL,    
    one = NULL,
    A_cur = NULL,
    A11 = NULL;

  int size, me, nprocs, i, j, fill_blocksize, this_fill_blocksize, type_size;
  double d_one = 1.0, time;

  void *locA;

  void *local_buf;
  int  local_m, local_n, local_ldim, local_stride, global_length;

  MPI_Datatype
    datatype;
  
  PLA_Obj_global_length( A, &size );

  PLA_Obj_datatype ( A, &datatype );
  MPI_Type_size ( datatype, &type_size);

  MPI_Comm_rank( MPI_COMM_WORLD, &me );
  MPI_Comm_size( MPI_COMM_WORLD, &nprocs );

  PLA_Create_constants_conf_to( A, NULL, &zero, &one );

  srand48( me * 1793 );

  PLA_Obj_local_length( A, &local_m );
  PLA_Obj_local_width(  A, &local_n );
  PLA_Obj_local_buffer( A, (void **) &local_buf );
  PLA_Obj_local_ldim(   A, &local_ldim );



#define FILL_METHOD FILL_LOCAL_RANDOM
#if FILL_METHOD

  /************************************************************************
    Fill the matrices.

    NOTE:  There are two versions of the fill routine in this file.  The
    version directly below this comment simply fills the local portions of 
    the matrix and vector with random numbers.  To use this version, the 
    line directly above this comment should read "#define FILL_METHOD FILL_LOCAL_RANDOM".

    The other version of the fill algorithm uses the PLAPACK Application 
    Interface, which allows each processor to create a portion of the matrix
    or vector (regardless of the true location on the machine of that portion)
    and then submit the piece through a call to the PLAPACK API.  To use 
    this version of the algorithm, the line directly above this comment should 
    read "#define FILL_METHOD FILL_THROUGH_API".

    The API version of the algorithm has a parameter called "fill_blocksize"
    that determines the width of the column blocks to be submitted to the
    matrix.  The number of independent messages generated by the API is
    inversely proportional to the fill_blocksize, and significant network
    contention (or even deadlock on some systems) may occur if the
    fill_blocksize is taken too small.
    **********************************************************************/

  for (j=0; j<local_n; j++ )
    for (i=0; i<local_m; i++ )
      if ( datatype == MPI_DOUBLE )
	{
	  ( (double *) local_buf )[ j*local_ldim + i ] = drand48() * 2.0 -1.0;
	}
      else if ( datatype == MPI_FLOAT )
	{
	  ( (float *) local_buf)[ j*local_ldim + i ] = (float) (drand48() * 2.0 -1.0);
	}
      else if ( datatype == MPI_DOUBLE_COMPLEX )
	{
	  ((PLA_DOUBLE_COMPLEX *)local_buf)[ j*local_ldim + i ].real = drand48() * 2.0 -1.0;
	  ((PLA_DOUBLE_COMPLEX *)local_buf)[ j*local_ldim + i ].imaginary = drand48() * 2.0 -1.0;
	}
      else if ( datatype == MPI_COMPLEX )
	{
	  ((PLA_COMPLEX *)local_buf)[ j*local_ldim + i ].real = (float) drand48() * 2.0 -1.0;
	  ((PLA_COMPLEX *)local_buf)[ j*local_ldim + i ].imaginary = (float) drand48() * 2.0 -1.0;
	}
	
  
  PLA_Obj_local_length( x, &local_m );
  PLA_Obj_local_buffer( x, (void **) &local_buf );
  PLA_Obj_local_stride( x, &local_stride );
  
  for (i=0; i<local_m; i++ )
      if ( datatype == MPI_DOUBLE )
	{
	  ((double*) local_buf)[ i*local_stride ] = drand48() * 2.0 -1.0;
	}
      else if ( datatype == MPI_FLOAT )
	{
	  ((float *)local_buf)[ i*local_stride ] = (float) (drand48() * 2.0 -1.0);
	}
      else if ( datatype == MPI_DOUBLE_COMPLEX )
	{
	  ((PLA_DOUBLE_COMPLEX *)local_buf)[ i*local_stride ].real = drand48() * 2.0 -1.0;
	  ((PLA_DOUBLE_COMPLEX *)local_buf)[ i*local_stride ].imaginary = drand48() * 2.0 -1.0;
	}
      else if ( datatype == MPI_COMPLEX )
	{
	  ((PLA_COMPLEX *)local_buf)[ i*local_stride ].real = (float) drand48() * 2.0 -1.0;
	  ((PLA_COMPLEX *)local_buf)[ i*local_stride ].imaginary = (float) drand48() * 2.0 -1.0;
	}

#else
  /***********************************************************************************
    Alternate version of the problem creation using the PLAPACK Application interface.

    To use, edit the line marked "PROBLEM CREATION METHOD" above to read
    "#define FILL_METHOD FILL_THROUGH_API".
    **********************************************************************************/

  if ( 0 == me ) 
    printf("Using PLAPACK application interface to create problem.\n");

  MPI_Barrier ( MPI_COMM_WORLD);
  time = MPI_Wtime ();

  PLA_API_begin();
  PLA_Obj_API_open(A);
  PLA_Obj_API_open(x);

  fill_blocksize = 10;

  locA = (void *) malloc( type_size * size * fill_blocksize  );
  
  for (j=me*fill_blocksize;j< size; j+=nprocs*fill_blocksize) {
    this_fill_blocksize = min( fill_blocksize, size - j);
    for (i=0; i < size*this_fill_blocksize; i++)  {   /* This loop determines the values to put into matrix */
      if ( MPI_DOUBLE == datatype )
	((double *)locA)[i]=drand48() * 2.0 - 1.0;      
      else if ( MPI_FLOAT == datatype )
	((float *)locA)[i]=drand48() * 2.0 - 1.0;      
      else if ( MPI_DOUBLE_COMPLEX == datatype ) {
	((double *)locA)[2*i]=drand48() * 2.0 - 1.0;      
	((double *)locA)[2*i+1]=drand48() * 2.0 - 1.0;      
      }
      else if ( MPI_COMPLEX == datatype ) {
	((float *)locA)[2*i]=drand48() * 2.0 - 1.0;      
	((float *)locA)[2*i+1]=drand48() * 2.0 - 1.0;      
      }
      else
	printf("Unhandled datatype in create_problem().\n");
    }
    PLA_API_axpy_matrix_to_global(size, 
				  this_fill_blocksize, 
				  &d_one, 
				  locA, 
				  size, 
				  A, 
				  0, j );
  }
  
  if (0==me) {                                  /* processor zero alone fills the vector */
    for (i=0; i<size; i++)
      if ( MPI_DOUBLE == datatype )
	((double *)locA)[i]=drand48() * 2.0 - 1.0;      
      else if ( MPI_FLOAT == datatype )
	((float *)locA)[i]=drand48() * 2.0 - 1.0;      
      else if ( MPI_DOUBLE_COMPLEX == datatype ) {
	((double *)locA)[2*i]=drand48() * 2.0 - 1.0;      
	((double *)locA)[2*i+1]=drand48() * 2.0 - 1.0;      
      }
      else if ( MPI_COMPLEX == datatype ) {
	((float *)locA)[2*i]=drand48() * 2.0 - 1.0;      
	((float *)locA)[2*i+1]=drand48() * 2.0 - 1.0;      
      }
      else
	printf("Unhandled datatype in create_problem().\n");

    PLA_API_axpy_vector_to_global( size, &d_one, locA, 1, x, 0);
  }
  
  free( locA );
  
  PLA_Obj_API_close(A);
  PLA_Obj_API_close(x);
  PLA_API_end(); 
  
  MPI_Barrier ( MPI_COMM_WORLD);
  time = MPI_Wtime () - time;

  if ( 0 == me ) {
    printf("time for problem creation: %e seconds\n", time);
  }

#endif

  /* Make A positive definite by adding a large value to the diagonal */

  PLA_Obj_view_all( A, &A_cur );
  PLA_Obj_global_length ( A, &global_length);
  while ( TRUE ){
    PLA_Obj_global_length( A_cur, &size );
    if ( 0 == size ) break;
    PLA_Obj_split_4( A_cur, 1, 1, &A11, PLA_DUMMY,
		    PLA_DUMMY, &A_cur );
    PLA_Obj_local_length( A11, &local_m );
    PLA_Obj_local_width ( A11, &local_n );
    if ( local_m == 1 && local_n == 1 ) {
      PLA_Obj_local_buffer( A11, (void **) &local_buf );

      if ( datatype == MPI_DOUBLE )
	*(double *)local_buf += (double) global_length;
      else if ( MPI_FLOAT == datatype )
	*(float *)local_buf += (float) global_length;
      else if ( datatype == MPI_DOUBLE_COMPLEX){
	((double *)local_buf)[0] += (double) global_length;
	((double *)local_buf)[1] = 0.0;
      }
      else if ( datatype == MPI_COMPLEX){
	((float *)local_buf)[0] += (float) global_length;
	((float *)local_buf)[1] = 0.0;
      }
      else
	printf("Unhandled datatype in create_problem().\n");
    }
  }
      
  if ( datatype == MPI_DOUBLE || datatype == MPI_FLOAT )
    PLA_Symv( PLA_LOWER_TRIANGULAR, one, A, x, zero, b ); 
  else
    PLA_Hemv( PLA_LOWER_TRIANGULAR, one, A, x, zero, b ); 

  PLA_Obj_free( &zero );         PLA_Obj_free( &one );
  PLA_Obj_free( &A_cur);         PLA_Obj_free ( &A11);
}
int PLA_Syrk_panpan( int nb_alg, int uplo, int transa,
		      PLA_Obj alpha, PLA_Obj A, 
                      PLA_Obj beta,  PLA_Obj C )
{
  PLA_Obj 
    A_L = NULL, A_1 = NULL, A_1_dup = NULL, A_1_trans_dup = NULL,
    one = NULL;
  int 
    size;

  PLA_Local_scal( beta, C );   
  PLA_Create_constants_conf_to( C, NULL, NULL, &one );

  if ( transa == PLA_NO_TRANS ) 
    PLA_Obj_global_width(  A, &size );
  else                          
    PLA_Obj_global_length( A, &size );
  nb_alg = min( size, nb_alg );

  PLA_Obj_view_all( A, &A_L );

  PLA_Pmvector_create_conf_to( 
          C, PLA_PROJ_ONTO_COL, PLA_ALL_COLS, nb_alg, &A_1_dup );
  PLA_Pmvector_create_conf_to( 
          C, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS, nb_alg, &A_1_trans_dup );


  while ( TRUE ){
    if ( transa == PLA_NO_TRANS ) 
      PLA_Obj_global_width(  A_L, &size );
   else                          
      PLA_Obj_global_length( A_L, &size );
    if ( ( size = min( size, nb_alg ) ) == 0 ) break;

    if ( transa == PLA_NO_TRANS ) 
      PLA_Obj_vert_split_2( A_L, size, &A_1, &A_L );
    else {
      PLA_Obj_horz_split_2( A_L, size, &A_1, 
                                        &A_L );
      PLA_Obj_set_orientation( A_1, PLA_PROJ_ONTO_ROW );
    }

    if ( size != nb_alg ) {
      PLA_Obj_vert_split_2( A_1_dup,       size, &A_1_dup, PLA_DUMMY );
      PLA_Obj_horz_split_2( A_1_trans_dup, size, &A_1_trans_dup, 
                                                   PLA_DUMMY );
    }

    PLA_Copy( A_1,     A_1_dup );       
    PLA_Copy( A_1_dup, A_1_trans_dup );

    if ( transa == PLA_CONJ_TRANS )   PLA_Conjugate( A_1_dup );

    PLA_Syrk_perform_local_part( uplo, alpha, A_1_dup, A_1_trans_dup, one,   C );
  }

  PLA_Obj_free( &A_L);           PLA_Obj_free( &A_1 );   
  PLA_Obj_free( &A_1_dup );      PLA_Obj_free( &A_1_trans_dup );
  PLA_Obj_free( &one );

  return PLA_SUCCESS;
}
int PLA_Trmm_left_upper( int transa, int diag,
			  PLA_Obj alpha, PLA_Obj A, PLA_Obj B )
{
  PLA_Obj 
    A_TL = NULL, A_1 = NULL, A_1_dup = NULL, A_11_dup = NULL, 
    B_T = NULL,  B_1 = NULL, B_1_dup = NULL,
    A_BR = NULL, B_B = NULL,
    one = NULL,  zero = NULL;

  int 
    nb_alg, size;

  PLA_Template
    templ;

  PLA_Local_scal( alpha, B );   
  PLA_Create_constants_conf_to( A, NULL, &zero, &one );
  
  PLA_Obj_template( A, &templ );

  if ( transa == PLA_NO_TRANS ){
    PLA_Obj_global_length( B, &size );
    PLA_Environ_nb_alg( PLA_OP_PAN_MAT, templ, &nb_alg );
    nb_alg = min( size, nb_alg );

    PLA_Obj_view_all( A, &A_BR );        PLA_Obj_view_all( B, &B_B );

    PLA_Pmvector_create_conf_to( 
  	  B, PLA_PROJ_ONTO_COL, PLA_ALL_COLS, nb_alg, &A_1_dup );
    PLA_Pmvector_create_conf_to( 
          B, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS, nb_alg, &B_1_dup );

    while ( TRUE ){
      PLA_Obj_global_length( A_BR, &size );
      if ( ( size = min( size, nb_alg ) ) == 0 ) break;

      PLA_Obj_horz_split_2( A_BR, size, &A_1, 
                                         PLA_DUMMY );
      PLA_Obj_set_orientation( A_1, PLA_PROJ_ONTO_ROW );

      if ( size != nb_alg ){
	PLA_Obj_vert_split_2( A_1_dup, size, &A_1_dup, PLA_DUMMY );
	PLA_Obj_horz_split_2( B_1_dup, size, &B_1_dup, 
                                             PLA_DUMMY );
      }

      PLA_Copy( A_1, A_1_dup );       

      PLA_Obj_horz_split_2( A_1_dup, size,  &A_11_dup,
                                             PLA_DUMMY );

      PLA_Set_triang_to_zero( PLA_LOWER_TRIANGULAR, diag, A_11_dup );

      PLA_Local_gemm( PLA_TRANS, PLA_NO_TRANS, one,  A_1_dup, B_B,
                                                     zero, B_1_dup );

      PLA_Obj_horz_split_2( B_B, size,        &B_1,
	                                      &B_B );

      PLA_Obj_set_orientation( B_1, PLA_PROJ_ONTO_ROW );

      PLA_Reduce( B_1_dup, MPI_SUM, B_1 );

      PLA_Obj_split_4( A_BR, size, size,   PLA_DUMMY, PLA_DUMMY,
                                           PLA_DUMMY, &A_BR );

      PLA_Obj_horz_split_2( A_1_dup, size, PLA_DUMMY, 
                                           &A_1_dup );
    }
  }
  else if ( transa == PLA_TRANS ){ /* transa == PLA_TRANS */
    PLA_Obj_global_length( B, &size );
    PLA_Environ_nb_alg( PLA_OP_PAN_MAT, templ, &nb_alg );
    nb_alg = min( size, nb_alg );

    PLA_Obj_view_all( A, &A_TL );        PLA_Obj_view_all( B, &B_T );

    PLA_Pmvector_create_conf_to( 
  	  B, PLA_PROJ_ONTO_COL, PLA_ALL_COLS, nb_alg, &A_1_dup );
    PLA_Pmvector_create_conf_to( 
          B, PLA_PROJ_ONTO_ROW, PLA_ALL_ROWS, nb_alg, &B_1_dup );

    while ( TRUE ){
      PLA_Obj_global_length( A_TL, &size );
      if ( ( size = min( size, nb_alg ) ) == 0 ) break;

      PLA_Obj_vert_split_2( A_TL, -size, PLA_DUMMY, &A_1 );

      PLA_Copy( A_1, A_1_dup );       

      PLA_Obj_horz_split_2( A_1_dup, -size, PLA_DUMMY,
                                             &A_11_dup );

      PLA_Set_triang_to_zero( PLA_UPPER_TRIANGULAR, diag, A_11_dup );

      PLA_Local_gemm( transa, PLA_NO_TRANS, one,  A_1_dup, B_T,
                                              zero, B_1_dup );

      PLA_Obj_horz_split_2( B_T, -size,        &B_T,
			                        &B_1 );

      PLA_Obj_set_orientation( B_1, PLA_PROJ_ONTO_ROW );

      PLA_Reduce( B_1_dup, MPI_SUM, B_1 );

      PLA_Obj_split_4( A_TL, -size, -size, &A_TL,      PLA_DUMMY,
                                            PLA_DUMMY, PLA_DUMMY );

      PLA_Obj_horz_split_2( A_1_dup, -size, &A_1_dup,
                                             PLA_DUMMY );
    }
  }
  else 
    PLA_Abort("Trmm: transpose case not yet implemented", __LINE__, __FILE__ );

  PLA_Obj_free( &A_TL );
  PLA_Obj_free( &A_BR );
  PLA_Obj_free( &A_1 );
  PLA_Obj_free( &A_1_dup );
  PLA_Obj_free( &A_11_dup );
  PLA_Obj_free( &B_T );
  PLA_Obj_free( &B_B );
  PLA_Obj_free( &B_1 );
  PLA_Obj_free( &B_1_dup );
  PLA_Obj_free( &one );
  PLA_Obj_free( &zero );

  return PLA_SUCCESS;
}