struct databel_fvi * load_databel_fvi( char *path ) { FILE *f; databel_fvi *fvi; size_t data_size; f = fgls_fopen( path, "r" ); fvi = (databel_fvi*) fgls_malloc( sizeof(databel_fvi) ); // Header fread( &fvi->fvi_header, sizeof(databel_fvi_header), 1, f ); // Labels data_size = (fvi->fvi_header.numVariables +fvi->fvi_header.numObservations ) * fvi->fvi_header.namelength * sizeof(char); fvi->fvi_data = (char *) fgls_malloc ( data_size ); // Load labels fread( fvi->fvi_data, 1, data_size, f ); fclose( f ); return fvi; }
// List of aiocb structs - Constructor void aiocb_list_init( aiocb_list *b, int num_chunks, FILE *fp, double *buff ) { int i; b->n = num_chunks; b->issued = 0; b->aiocb_l = ( const struct aiocb ** ) fgls_malloc ( b->n * sizeof(struct aiocb *) ); for ( i = 0; i < b->n; i++ ) { struct aiocb *tmp_cb; tmp_cb = (struct aiocb *) fgls_malloc ( sizeof(struct aiocb) ); // zero-out the structure and fill it up with "constant" data bzero( (char *)tmp_cb, sizeof(struct aiocb) ); tmp_cb->aio_fildes = fileno( fp ); tmp_cb->aio_buf = &buff[i*(size_t)MAX_AIO_SIZE/sizeof(double)]; // assign to the const list b->aiocb_l[i] = tmp_cb; } // Will wait one by one. After all, I need them all to be done // ( and usually only one call actually needed, block size is not that big ) b->aux_l = ( const struct aiocb ** ) fgls_malloc ( sizeof(struct aiocb *) ); }
/* * Builds Phi as an SPD matrix, after the eigenvalues were fixed * during the REML estimation * * Z = eigVecs * W = eigVals * Phi = Z W Z^T */ void build_SPD_Phi( int n, double *eigVecs, double *eigVals, double *Phi ) { double ONE = 1.0, ZERO = 0.0, *tmp; int i, j; tmp = fgls_malloc( n * n * sizeof(double) ); // tmp = Z * W for ( j = 0; j < n; j++ ) for ( i = 0; i < n; i++ ) tmp[ j*n + i ] = eigVecs[ j*n + i ] * eigVals[j]; // Phi = tmp * Z^T dgemm_( NO_TRANS, TRANS, &n, &n, &n, &ONE, tmp, &n, eigVecs, &n, &ZERO, Phi, &n ); free( tmp ); }
// // Double buffering handler // // Constructor void double_buffering_init( double_buffering *b, size_t buff_size, FILE *fp, FGLS_config_t *c ) { int i; // Single buffer size b->buff_size = buff_size; // Number of chunks into which an aio operation is split b->num_chunks = size_t_ceil( buff_size, MAX_AIO_SIZE ); // Allocate double buffers for ( i = 0; i < NUM_BUFFERS_PER_THREAD; i++ ) b->buffs[i] = ( double * ) fgls_malloc ( buff_size ); // Create the necessary amount of aiocb lists, // one for io, one for comp, // and initialize them for ( i = 0; i < NUM_BUFFERS_PER_THREAD; i++ ) aiocb_list_init( &b->aiocb_l[i], b->num_chunks, fp, b->buffs[i] ); b->comp_l = &b->aiocb_l[0]; b->io_l = &b->aiocb_l[1]; // Config b->cf = c; }
// // GWAS config + data // void initialize_config( FGLS_config_t *cf, char *cov_base, char *phi_base, char *snp_base, char *pheno_base, char *out_base//, // char *var, int num_threads, int xtile, int ytile, int xb, int yb, int write_output ) { load_databel_info( cf ); // Problem dimensions cf->n = cf->XL_fvi->fvi_header.numObservations; cf->p = cf->XL_fvi->fvi_header.numVariables + 1; // Intercept included cf->m = cf->XR_fvi->fvi_header.numVariables; cf->t = cf->Y_fvi->fvi_header.numVariables; // Assuming wXR = 1 cf->wXL = cf->p - 1; cf->wXR = 1; // Algorithm parameters /*cf->x_b = x_b;*/ /*cf->y_b = y_b;*/ /*cf->num_threads = num_threads;*/ get_main_memory_size( &cf->totalMem, &cf->availMem ); estimate_block_sizes( cf, cf->var, cf->x_b == -1 || cf->y_b == -1 ); // if any is -1, estimate them /* if ( !(cf->x_b == -1 || cf->y_b == -1) ) { cf->x_b = xb; cf->y_b = yb; } cf->x_tile = xtile; cf->y_tile = ytile; */ /*cf->x_b = 10;*/ /*cf->y_b = 10;*/ // In/Out Files /* sprintf( cf->Phi_data_path, "%s.fvd", phi_base ); sprintf( cf->Phi_info_path, "%s.fvi", phi_base ); sprintf( cf->XL_data_path, "%s.fvd", cov_base ); sprintf( cf->XL_info_path, "%s.fvi", cov_base ); sprintf( cf->XR_data_path, "%s.fvd", snp_base ); sprintf( cf->XR_info_path, "%s.fvi", snp_base ); sprintf( cf->Y_data_path, "%s.fvd", pheno_base ); sprintf( cf->Y_info_path, "%s.fvi", pheno_base ); if ( write_output ) { sprintf( cf->B_data_path, "%s.out", out_base ); sprintf( cf->B_info_path, "%s.iout", out_base ); } else { sprintf( cf->B_data_path, "/dev/null" ); sprintf( cf->B_info_path, "/dev/null" ); } */ // Temporary files snprintf( cf->ZtXL_path, STR_BUFFER_SIZE, "%s.tmp", cov_base ); snprintf( cf->ZtXR_path, STR_BUFFER_SIZE, "%s.tmp", snp_base ); snprintf( cf->ZtY_path, STR_BUFFER_SIZE, "%s.tmp", pheno_base ); // Allocate memory and load in-core data // NOTE: only data which is input to GWAS // // Intermediate data as ZtXL will be handled on the fly // by the corresponding routines: // * ZtXL, only available after REML_eigen (if so) // * Z and W will be allocated and filled up during // the first eigen-decomposition of Phi, (if so) // ests needed in both cases chol and eigen cf->ests = (double *) fgls_malloc ( (3 + cf->wXL) * cf->t * sizeof(double) ); cf->h2 = cf->ests; cf->sigma2 = &cf->ests[ cf->t ]; cf->res_sigma2 = &cf->ests[ 2 * cf->t ]; cf->beta_ests = &cf->ests[ 3 * cf->t ]; cf->Phi = (double *) fgls_malloc ( cf->n * cf->n * sizeof(double) ); load_file ( cf->Phi_data_path, "rb", cf->Phi, cf->n * cf->n ); // Sanity check (Phi) checkNoNans(cf->n * cf->n, cf->Phi, "[ERROR] NaNs not allowed in Phi\n"); cf->XL = (double *) fgls_malloc ( cf->n * cf->wXL * sizeof(double) ); load_file ( cf->XL_data_path, "rb", cf->XL, cf->n * cf->wXL ); // Sanity check (XL) average( cf->XL, cf->n, cf->wXL, cf->threshold, "Covariate", &cf->XL_fvi->fvi_data[cf->n*NAMELENGTH], NAMELENGTH, 1, cf->num_threads ); cf->ZtXL = NULL; cf->Z = NULL; cf->W = NULL; // Open files for out-of-core data // Again, only for data input/output to GWAS. Intermediate, on the fly cf->XR = fgls_fopen( cf->XR_data_path, "rb" ); cf->ZtXR = NULL; cf->Y = fgls_fopen( cf->Y_data_path, "rb" ); cf->ZtY = NULL; cf->B = fgls_fopen( cf->B_data_path, "wb" ); return; }
/* * Cholesky-based solution of the * sequence of Feasible Generalized Least-Squares problem * in the context of GWAS: */ int fgls_chol( FGLS_config_t cf ) { int n = cf.n, m = cf.m, p = cf.p, t = cf.t, x_b = cf.x_b, /*y_b = cf.y_b,*/ wXL = cf.wXL, wXR = cf.wXR; /* In-core operands */ double *Phi; double *M; double *ests; double *h2; double *res_sigma; double alpha; double beta; /* Out-of-core operands */ double *Bij; // Auxiliary variables /* Reusable data thanks to constant XL */ double *XL; double *XL_orig; // XL and a copy (XL is overwritten at every iteration of j) double *B_t; // Top part of b ( in inv(S) b ) double *V_tl; // Top-Left part of V /* BLAS / LAPACK constants */ double ZERO = 0.0; double ONE = 1.0; int iONE = 1; /* LAPACK error value */ int info; /* iterators and auxiliar vars */ int ib, i, j, k, l; // size_t int nn = cf.n * cf.n; // size_t size_t size_one_b_record = p + (p*(p+1))/2; // Threading int id; double *tmpBs, *tmpVs; // Buffer with one B and one V per thread double *oneB, *oneV; // Each thread pointer to its B and V if ( cf.y_b != 1 ) { fprintf(stderr, "\n[Warning] y_b not used (set to 1)\n"); cf.y_b = 1; } /* Memory allocation */ // In-core build_SPD_Phi( cf.n, cf.Z, cf.W, cf.Phi ); Phi = cf.Phi; M = ( double * ) fgls_malloc ( (size_t)cf.n * cf.n * sizeof(double) ); ests = cf.ests; h2 = ests; res_sigma = &ests[2*cf.t]; XL_orig = cf.XL; XL = ( double * ) fgls_malloc ( cf.wXL * cf.n * sizeof(double) ); B_t = ( double * ) fgls_malloc ( cf.wXL * sizeof(double) ); V_tl = ( double * ) fgls_malloc ( cf.wXL * cf.wXL * sizeof(double) ); // Temporary storage prior to copying in db_B tmpBs = ( double * ) fgls_malloc ( cf.p * cf.num_threads * sizeof(double) ); tmpVs = ( double * ) fgls_malloc ( cf.p * cf.p * cf.num_threads * sizeof(double) ); /* Files and pointers for out-of-core */ double *XR_comp, *Y_comp, *B_comp; /* Asynchronous IO data structures */ double_buffering db_XR, db_Y, db_B; double_buffering_init( &db_XR, (size_t)cf.n * cf.wXR * cf.x_b * sizeof(double), cf.XR, &cf ); // _fp double_buffering_init( &db_Y, (size_t)cf.n * cf.y_b * sizeof(double), cf.Y, &cf ); double_buffering_init( &db_B, (size_t)size_one_b_record * cf.x_b * cf.y_b * sizeof(double), cf.B, &cf ); #if VAMPIR VT_USER_START("READ_X"); #endif /* Read first block of XR's */ double_buffering_read_XR( &db_XR, IO_BUFF, 0, (size_t)MIN( cf.x_b, cf.m ) - 1 ); double_buffering_swap( &db_XR ); #if VAMPIR VT_USER_END("READ_X"); #endif #if VAMPIR VT_USER_START("READ_Y"); #endif /* Read first Y */ double_buffering_read_Y( &db_Y, IO_BUFF, 0, 0 ); double_buffering_swap( &db_Y ); #if VAMPIR VT_USER_END("READ_Y"); #endif int iter = 0; for ( j = 0; j < t; j++ ) { /* Set the number of threads for the multi-threaded BLAS */ set_multi_threaded_BLAS( cf.num_threads ); #if VAMPIR VT_USER_START("READ_Y"); #endif /* Read next Y */ size_t next_j = (j+1) >= t ? 0 : j+1; double_buffering_read_Y( &db_Y, IO_BUFF, next_j, next_j ); #if VAMPIR VT_USER_END("READ_Y"); #endif #if VAMPIR VT_USER_START("COMP_J"); #endif /* M := sigma * ( h^2 Phi - (1 - h^2) I ) */ memcpy( M, Phi, (size_t)n * n * sizeof(double) ); alpha = res_sigma[j] * h2[j]; beta = res_sigma[j] * (1 - h2[j]); dscal_(&nn, &alpha, M, &iONE); for ( i = 0; i < n; i++ ) M[i*n + i] = M[i*n + i] + beta; /* L * L' = M */ dpotrf_(LOWER, &n, M, &n, &info); if (info != 0) { char err[STR_BUFFER_SIZE]; snprintf(err, STR_BUFFER_SIZE, "dpotrf(M) failed (info: %d)", info); error_msg(err, 1); } /* XL := inv(L) * XL */ memcpy( XL, XL_orig, wXL * n * sizeof(double) ); dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &wXL, &ONE, M, &n, XL, &n); #if VAMPIR VT_USER_START("WAIT_Y"); #endif // Wait until current Y is available for computation double_buffering_wait( &db_Y, COMP_BUFF ); #if VAMPIR VT_USER_END("WAIT_Y"); #endif /* y := inv(L) * y */ Y_comp = double_buffering_get_comp_buffer( &db_Y ); // Sanity check average( Y_comp, n, 1, cf.threshold, "TRAIT", &cf.Y_fvi->fvi_data[n*NAMELENGTH], NAMELENGTH, 0 ); dtrsv_(LOWER, NO_TRANS, NON_UNIT, &n, M, &n, Y_comp, &iONE); /* B_t := XL' * y */ dgemv_(TRANS, &n, &wXL, &ONE, XL, &n, Y_comp, &iONE, &ZERO, B_t, &iONE); /* V_tl := XL' * XL */ dsyrk_(LOWER, TRANS, &wXL, &n, &ONE, XL, &n, &ZERO, V_tl, &wXL); #if VAMPIR VT_USER_END("COMP_J"); #endif /* Solve for x_b X's at once */ for (ib = 0; ib < m; ib += x_b) { #if VAMPIR VT_USER_START("READ_X"); #endif /* Read next block of XR's */ size_t next_x_from = ((size_t)ib + x_b) >= m ? 0 : (size_t)ib + x_b; size_t next_x_to = ((size_t)ib + x_b) >= m ? MIN( (size_t)x_b, (size_t)m ) - 1 : next_x_from + MIN( (size_t)x_b, (size_t)m - next_x_from ) - 1; double_buffering_read_XR( &db_XR, IO_BUFF, next_x_from, next_x_to ); #if VAMPIR VT_USER_END("READ_X"); #endif #if VAMPIR VT_USER_START("WAIT_X"); #endif /* Wait until current block of XR's is available for computation */ double_buffering_wait( &db_XR, COMP_BUFF ); #if VAMPIR VT_USER_END("WAIT_X"); #endif /* Set the number of threads for the multi-threaded BLAS */ set_multi_threaded_BLAS( cf.num_threads ); #if VAMPIR VT_USER_START("COMP_IB"); #endif /* XR := inv(L) XR */ XR_comp = double_buffering_get_comp_buffer( &db_XR ); // Auxiliar variables int x_inc = MIN(x_b, m - ib); int rhss = wXR * x_inc; // Sanity check average( XR_comp, n, x_inc, cf.threshold, "SNP", &cf.XR_fvi->fvi_data[(n+ib)*NAMELENGTH], NAMELENGTH, 1 ); // Computation dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &rhss, &ONE, M, &n, XR_comp, &n); #if VAMPIR VT_USER_END("COMP_IB"); #endif #if CHOL_MIX_PARALLELISM /* Set the number of threads for the multi-threaded BLAS to 1. * The innermost loop is parallelized using OPENMP */ set_single_threaded_BLAS(); #endif #if VAMPIR VT_USER_START("COMP_I"); #endif B_comp = double_buffering_get_comp_buffer( &db_B ); #if CHOL_MIX_PARALLELISM #pragma omp parallel for private(Bij, oneB, oneV, i, k, info, id) schedule(static) num_threads(cf.num_threads) #endif for (i = 0; i < x_inc; i++) { id = omp_get_thread_num(); oneB = &tmpBs[ id * p ]; oneV = &tmpVs[ id * p * p ]; Bij = &B_comp[i * size_one_b_record]; // Building B // Copy B_T memcpy(oneB, B_t, wXL * sizeof(double)); // B_B := XR' * y dgemv_("T", &n, &wXR, &ONE, &XR_comp[i * wXR * n], &n, Y_comp, &iONE, &ZERO, &oneB[wXL], &iONE); // Building V // Copy V_TL for( k = 0; k < wXL; k++ ) dcopy_(&wXL, &V_tl[k*wXL], &iONE, &oneV[k*p], &iONE); // V_TL // V_BL := XR' * XL dgemm_("T", "N", &wXR, &wXL, &n, &ONE, &XR_comp[i * wXR * n], &n, XL, &n, &ZERO, &oneV[wXL], &p); // V_BL // V_BR := XR' * XR dsyrk_("L", "T", &wXR, &n, &ONE, &XR_comp[i * wXR * n], &n, &ZERO, &oneV[wXL * p + wXL], &p); // V_BR // B := inv(V) * B dpotrf_(LOWER, &p, oneV, &p, &info); if (info != 0) { for ( k = 0; k < size_one_b_record; k++ ) Bij[k] = 0.0/0.0; //nan("char-sequence"); continue; } dtrsv_(LOWER, NO_TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE); dtrsv_(LOWER, TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE); /* V := res_sigma * inv( X' inv(M) X) */ dpotri_(LOWER, &p, oneV, &p, &info); if (info != 0) { char err[STR_BUFFER_SIZE]; snprintf(err, STR_BUFFER_SIZE, "dpotri failed (info: %d)", info); error_msg(err, 1); } // Copy output for ( k = 0; k < p; k++ ) Bij[k] = (float) oneB[k]; for ( k = 0; k < p; k++ ) Bij[p+k] = (float)sqrt(oneV[k*p+k]); int idx = 0; for ( k = 0; k < p-1; k++ ) // Cols of V for ( l = k+1; l < p; l++ ) // Rows of V { Bij[p+p+idx] = (float)oneV[k*p+l]; idx++; } #if 0 printf("Chi square: %.6f\n", ( (oneB[p-1] / Bij[p+p-1]) * (oneB[p-1] / Bij[p+p-1]) ) ); #endif } #if VAMPIR VT_USER_END("COMP_I"); #endif #if VAMPIR VT_USER_START("WAIT_BV"); #endif /* Wait until the previous blocks of B's and V's are written */ if ( iter > 0) double_buffering_wait( &db_B, IO_BUFF ); #if VAMPIR VT_USER_END("WAIT_BV"); #endif /* Write current blocks of B's and V's */ #if VAMPIR VT_USER_START("WRITE_BV"); #endif double_buffering_write_B( &db_B, COMP_BUFF, ib, ib+x_inc - 1, j, j ); #if VAMPIR VT_USER_END("WRITE_BV"); #endif /* Swap buffers */ double_buffering_swap( &db_XR ); double_buffering_swap( &db_B ); iter++; } /* Swap buffers */ double_buffering_swap( &db_Y ); } #if VAMPIR VT_USER_START("WAIT_ALL"); #endif /* Wait for the remaining IO operations issued */ double_buffering_wait( &db_XR, COMP_BUFF ); double_buffering_wait( &db_Y, COMP_BUFF ); double_buffering_wait( &db_B, IO_BUFF ); #if VAMPIR VT_USER_END("WAIT_ALL"); #endif /* Clean-up */ free( M ); free( XL ); free( B_t ); free( V_tl ); free( tmpBs ); free( tmpVs ); double_buffering_destroy( &db_XR ); double_buffering_destroy( &db_Y ); double_buffering_destroy( &db_B ); return 0; }