Exemple #1
0
FLA_Error REF_Syrk_lt( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C )
{

  FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, alpha, A, beta, C );
  
  return 0;
}
FLA_Error FLA_Syrk_ln_omp1t_var5( FLA_Obj A, FLA_Obj C )
{
  FLA_Obj AL,    AR,       A0,  A1,  A2;
  FLA_Obj MyC;

  int b;
  
  FLA_Part_1x2( A,    &AL,  &AR,      0, FLA_LEFT );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){

    b = FLA_Task_compute_blocksize( 0, A, AL, FLA_LEFT );

    FLA_Repart_1x2_to_1x3( AL,  /**/ AR,        &A0, /**/ &A1, &A2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    #pragma intel omp task captureprivate(A1) private(MyC)
    {
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &MyC );
    FLA_Obj_set_to_zero( MyC );
    
    /* MyC := A1 * A1' */
    FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A1, FLA_ZERO, MyC );

    /* C := MyC */
    FLA_Axpy_sync_pipeline( FLA_ONE, MyC, C );

    FLA_Obj_free( &MyC );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_1x3_to_1x2( &AL,  /**/ &AR,        A0, A1, /**/ A2,
                              FLA_LEFT );
  }
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Syrk_ln_omp2l_var1( FLA_Obj A, FLA_Obj C )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CTL,   CTR,      C00, C01, C02, 
          CBL,   CBR,      C10, C11, C12,
                           C20, C21, C22;

  int b;

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x2( C,    &CTL, &CTR,
                      &CBL, &CBR,     0, 0, FLA_TL );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = FLA_Task_compute_blocksize( 0, A, AT, FLA_TOP );

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x2_to_3x3( CTL, /**/ CTR,       &C00, /**/ &C01, &C02,
                        /* ************* */   /* ******************** */
                                                &C10, /**/ &C11, &C12,
                           CBL, /**/ CBR,       &C20, /**/ &C21, &C22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    #pragma intel omp task captureprivate(A1, A0, C10)
    {
    /* C10 = C10 + A1 * A0' */
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A1, A0, FLA_ONE, C10 );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR,       C00, C01, /**/ C02,
                                                     C10, C11, /**/ C12,
                            /* ************** */  /* ****************** */
                              &CBL, /**/ &CBR,       C20, C21, /**/ C22,
                              FLA_TL );

  }

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x2( C,    &CTL, &CTR,
                      &CBL, &CBR,     0, 0, FLA_TL );

  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = FLA_Task_compute_blocksize( 0, A, AT, FLA_TOP );

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x2_to_3x3( CTL, /**/ CTR,       &C00, /**/ &C01, &C02,
                        /* ************* */   /* ******************** */
                                                &C10, /**/ &C11, &C12,
                           CBL, /**/ CBR,       &C20, /**/ &C21, &C22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    #pragma intel omp task captureprivate(C11, A1)
    {
    /* C11 = C11 + A1 * A1' */
    FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A1, FLA_ONE, C11 );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR,       C00, C01, /**/ C02,
                                                     C10, C11, /**/ C12,
                            /* ************** */  /* ****************** */
                              &CBL, /**/ &CBR,       C20, C21, /**/ C22,
                              FLA_TL );

  }
  }
  return FLA_SUCCESS;
}
FLA_Error FLA_Syrk_ln_blk_var2_ht( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, FLA_Syrk_t* cntl )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CTL,   CTR,      C00, C01, C02, 
          CBL,   CBR,      C10, C11, C12,
                           C20, C21, C22;

  int b;

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x2( C,    &CTL, &CTR,
                      &CBL, &CBR,     0, 0, FLA_TL );

  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = 1;

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x2_to_3x3( CTL, /**/ CTR,       &C00, /**/ &C01, &C02,
                        /* ************* */   /* ******************** */
                                                &C10, /**/ &C11, &C12,
                           CBL, /**/ CBR,       &C20, /**/ &C21, &C22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    /* C21 = C21 + A2 * A1' */
    FLA_Gemm_nt_blk_var1_ht( alpha, A2, A1, beta, C21,
                          NULL );

    /* C11 = C11 + A1 * A1' */
    FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, 
                       alpha, *FLASH_OBJ_PTR_AT( A1 ), beta, *FLASH_OBJ_PTR_AT( C11 ),
                       NULL );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR,       C00, C01, /**/ C02,
                                                     C10, C11, /**/ C12,
                            /* ************** */  /* ****************** */
                              &CBL, /**/ &CBR,       C20, C21, /**/ C22,
                              FLA_TL );

  }

  return FLA_SUCCESS;
}
Exemple #5
0
FLA_Error FLA_Syrk_ut_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl )
{
  return FLA_Syrk_external( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, alpha, A, beta, C );
}
Exemple #6
0
FLA_Error FLA_Syrk_task( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl )
{
  return FLA_Syrk_external( uplo, trans, alpha, A, beta, C );
}