Exemple #1
0
struct databel_fvi * load_databel_fvi( char *path )
{
	FILE *f;
	databel_fvi *fvi;
	size_t data_size;

	f = fgls_fopen( path, "r" );
	
	fvi = (databel_fvi*) fgls_malloc( sizeof(databel_fvi) );
	// Header
	fread( &fvi->fvi_header, sizeof(databel_fvi_header), 1, f );
	// Labels
	data_size = (fvi->fvi_header.numVariables +fvi->fvi_header.numObservations ) * 
		         fvi->fvi_header.namelength * sizeof(char);
	fvi->fvi_data = (char *) fgls_malloc ( data_size );
	// Load labels
	fread( fvi->fvi_data, 1, data_size, f );

	fclose( f );

	return fvi;
}
Exemple #2
0
// List of aiocb structs - Constructor
void aiocb_list_init( aiocb_list *b, int num_chunks, FILE *fp, double *buff )
{
	int i;

	b->n = num_chunks;
	b->issued = 0;

	b->aiocb_l = ( const struct aiocb ** ) fgls_malloc ( b->n * sizeof(struct aiocb *) );
	for ( i = 0; i < b->n; i++ )
	{
		struct aiocb *tmp_cb;
		tmp_cb = (struct aiocb *) fgls_malloc ( sizeof(struct aiocb) );
		// zero-out the structure and fill it up with "constant" data
		bzero( (char *)tmp_cb, sizeof(struct aiocb) );
		tmp_cb->aio_fildes = fileno( fp );
		tmp_cb->aio_buf = &buff[i*(size_t)MAX_AIO_SIZE/sizeof(double)];
		// assign to the const list
		b->aiocb_l[i] = tmp_cb;
	}
	// Will wait one by one. After all, I need them all to be done 
	// ( and usually only one call actually needed, block size is not that big )
	b->aux_l = ( const struct aiocb ** ) fgls_malloc ( sizeof(struct aiocb *) );
}
Exemple #3
0
/*
 * Builds Phi as an SPD matrix, after the eigenvalues were fixed 
 * during the REML estimation
 *
 * Z = eigVecs
 * W = eigVals
 * Phi = Z W Z^T
 */
void build_SPD_Phi( int n, double *eigVecs, double *eigVals, double *Phi )
{
	double ONE = 1.0,
		   ZERO = 0.0,
		   *tmp;
	int i, j;

	tmp = fgls_malloc( n * n * sizeof(double) );

	// tmp = Z * W
	for ( j = 0; j < n; j++ )
		for ( i = 0; i < n; i++ )
			tmp[ j*n + i ] = eigVecs[ j*n + i ] * eigVals[j];
	// Phi = tmp * Z^T
	dgemm_( NO_TRANS, TRANS,
			&n, &n, &n,
			&ONE, tmp, &n, eigVecs, &n,
			&ZERO, Phi, &n );

	free( tmp );
}
Exemple #4
0
//
// Double buffering handler
//
// Constructor
void double_buffering_init( double_buffering *b, size_t buff_size, FILE *fp, FGLS_config_t *c )
{
	int i;
	
	// Single buffer size
	b->buff_size = buff_size;
	// Number of chunks into which an aio operation is split
	b->num_chunks = size_t_ceil( buff_size, MAX_AIO_SIZE );

	// Allocate double buffers
	for ( i = 0; i < NUM_BUFFERS_PER_THREAD; i++ )
		b->buffs[i] = ( double * ) fgls_malloc ( buff_size );

	// Create the necessary amount of aiocb lists,
	// one for io, one for comp,
	// and initialize them
	for ( i = 0; i < NUM_BUFFERS_PER_THREAD; i++ )
		aiocb_list_init( &b->aiocb_l[i], b->num_chunks, fp, b->buffs[i] );
	b->comp_l = &b->aiocb_l[0];
	b->io_l   = &b->aiocb_l[1];
	// Config
	b->cf = c;
}
Exemple #5
0
// 
// GWAS config + data
//
void initialize_config(
        FGLS_config_t *cf, 
		char *cov_base, char *phi_base, char *snp_base, char *pheno_base, char *out_base//,
//		char *var, int num_threads, int xtile, int ytile, int xb, int yb, int write_output
)
{
	load_databel_info( cf );
    // Problem dimensions
	cf->n = cf->XL_fvi->fvi_header.numObservations;
	cf->p = cf->XL_fvi->fvi_header.numVariables + 1; // Intercept included
	cf->m = cf->XR_fvi->fvi_header.numVariables;
	cf->t = cf->Y_fvi->fvi_header.numVariables;
	// Assuming wXR = 1
	cf->wXL = cf->p - 1;
	cf->wXR = 1;
    // Algorithm parameters
	/*cf->x_b = x_b;*/
	/*cf->y_b = y_b;*/
	/*cf->num_threads = num_threads;*/
	get_main_memory_size( &cf->totalMem, &cf->availMem );
	estimate_block_sizes( cf, cf->var, cf->x_b == -1 || cf->y_b == -1 ); // if any is -1, estimate them
/*	if ( !(cf->x_b == -1 || cf->y_b == -1) )
	{
		cf->x_b = xb;
		cf->y_b = yb;
	}
	cf->x_tile = xtile;
	cf->y_tile = ytile;
*/
	/*cf->x_b = 10;*/
	/*cf->y_b = 10;*/
    // In/Out Files
  /*  sprintf( cf->Phi_data_path, "%s.fvd",  phi_base );
    sprintf( cf->Phi_info_path, "%s.fvi",  phi_base );
    sprintf( cf->XL_data_path,  "%s.fvd",  cov_base );
    sprintf( cf->XL_info_path,  "%s.fvi",  cov_base );
    sprintf( cf->XR_data_path,  "%s.fvd",  snp_base );
    sprintf( cf->XR_info_path,  "%s.fvi",  snp_base );
    sprintf( cf->Y_data_path,   "%s.fvd",  pheno_base );
    sprintf( cf->Y_info_path,   "%s.fvi",  pheno_base );
	if ( write_output )
	{
		sprintf( cf->B_data_path, "%s.out",  out_base );
		sprintf( cf->B_info_path, "%s.iout", out_base );
	}
	else
	{
		sprintf( cf->B_data_path,      "/dev/null" );
		sprintf( cf->B_info_path,      "/dev/null" );
	}
*/	
    // Temporary files
    snprintf( cf->ZtXL_path, STR_BUFFER_SIZE, "%s.tmp", cov_base );
    snprintf( cf->ZtXR_path, STR_BUFFER_SIZE, "%s.tmp", snp_base );
    snprintf( cf->ZtY_path,  STR_BUFFER_SIZE, "%s.tmp", pheno_base );

	// Allocate memory and load in-core data
	// NOTE: only data which is input to GWAS
	//    
	// Intermediate data as ZtXL will be handled on the fly
	// by the corresponding routines:
	//   * ZtXL, only available after REML_eigen (if so)
	//   * Z and W will be allocated and filled up during
	//     the first eigen-decomposition of Phi, (if so)
	
	// ests needed in both cases chol and eigen
	cf->ests = (double *) fgls_malloc ( (3 + cf->wXL) * cf->t * sizeof(double) );
	cf->h2         =  cf->ests;
	cf->sigma2     = &cf->ests[ cf->t ];
	cf->res_sigma2 = &cf->ests[ 2 * cf->t ];
	cf->beta_ests  = &cf->ests[ 3 * cf->t ];

	cf->Phi = (double *) fgls_malloc ( cf->n * cf->n * sizeof(double) );
	load_file ( cf->Phi_data_path,  "rb", cf->Phi,  cf->n * cf->n );
	// Sanity check (Phi)
	checkNoNans(cf->n * cf->n, cf->Phi, "[ERROR] NaNs not allowed in Phi\n");
	cf->XL   = (double *) fgls_malloc ( cf->n * cf->wXL * sizeof(double) );
	load_file ( cf->XL_data_path,   "rb", cf->XL,   cf->n * cf->wXL );
	// Sanity check (XL)
	average( cf->XL, cf->n, cf->wXL, cf->threshold, "Covariate", 
			&cf->XL_fvi->fvi_data[cf->n*NAMELENGTH], NAMELENGTH, 1, cf->num_threads );
	cf->ZtXL = NULL;
	cf->Z    = NULL;
	cf->W    = NULL;

	// Open files for out-of-core data
	// Again, only for data input/output to GWAS. Intermediate, on the fly
	cf->XR = fgls_fopen( cf->XR_data_path, "rb" );
	cf->ZtXR = NULL;
	cf->Y  = fgls_fopen( cf->Y_data_path, "rb" );
	cf->ZtY = NULL;
	cf->B = fgls_fopen( cf->B_data_path, "wb" );

    return;
}
Exemple #6
0
/*
 * Cholesky-based solution of the 
 *  sequence of Feasible Generalized Least-Squares problem
 *  in the context of GWAS:
 */
int fgls_chol( FGLS_config_t cf )
{
	int n = cf.n,
		   m = cf.m,
		   p = cf.p,
		   t = cf.t,
		   x_b = cf.x_b,
		   /*y_b = cf.y_b,*/
		   wXL = cf.wXL,
		   wXR = cf.wXR;
    /* In-core operands */
    double *Phi;
    double *M;
	double *ests;
    double *h2;
	double *res_sigma;
    double alpha;
    double beta;

    /* Out-of-core operands */
    double *Bij; // Auxiliary variables

    /* Reusable data thanks to constant XL */
    double *XL;
    double *XL_orig; // XL and a copy (XL is overwritten at every iteration of j)
    double *B_t;  // Top part of b ( in inv(S) b )
    double *V_tl; // Top-Left part of V

    /* BLAS / LAPACK constants */
    double ZERO = 0.0;
    double ONE = 1.0;
    int iONE = 1;
    /* LAPACK error value */
    int info;

    /* iterators and auxiliar vars */
    int ib, i, j, k, l; // size_t
    int nn = cf.n * cf.n; // size_t
	size_t size_one_b_record = p + (p*(p+1))/2;

	// Threading
	int id;
	double *tmpBs, *tmpVs; // Buffer with one B and one V per thread
	double *oneB, *oneV;   // Each thread pointer to its B and V

    if ( cf.y_b != 1 )
	{
        fprintf(stderr, "\n[Warning] y_b not used (set to 1)\n");
		cf.y_b = 1;
	}

    /* Memory allocation */
    // In-core
	build_SPD_Phi( cf.n, cf.Z, cf.W, cf.Phi );
	Phi   = cf.Phi;
    M     = ( double * ) fgls_malloc ( (size_t)cf.n * cf.n * sizeof(double) );
    ests  = cf.ests;

	h2 = ests;
	res_sigma = &ests[2*cf.t];

    XL_orig = cf.XL;
    XL      = ( double * ) fgls_malloc ( cf.wXL * cf.n * sizeof(double) );
    B_t  = ( double * ) fgls_malloc ( cf.wXL * sizeof(double) );
    V_tl = ( double * ) fgls_malloc ( cf.wXL * cf.wXL * sizeof(double) );

	// Temporary storage prior to copying in db_B
    tmpBs = ( double * ) fgls_malloc ( cf.p * cf.num_threads * sizeof(double) );
    tmpVs = ( double * ) fgls_malloc ( cf.p * cf.p * cf.num_threads * sizeof(double) );

    /* Files and pointers for out-of-core */
    double *XR_comp, *Y_comp, *B_comp;

    /* Asynchronous IO data structures */
	double_buffering db_XR, db_Y, db_B;
	double_buffering_init( &db_XR, (size_t)cf.n * cf.wXR * cf.x_b * sizeof(double),
			                cf.XR, &cf ); // _fp
	double_buffering_init( &db_Y, (size_t)cf.n * cf.y_b * sizeof(double),
			                cf.Y,  &cf );
	double_buffering_init( &db_B, (size_t)size_one_b_record * cf.x_b * cf.y_b * sizeof(double),
			                cf.B,  &cf );

#if VAMPIR
    VT_USER_START("READ_X");
#endif
    /* Read first block of XR's */
	double_buffering_read_XR( &db_XR, IO_BUFF, 0, (size_t)MIN( cf.x_b, cf.m ) - 1 );
	double_buffering_swap( &db_XR );
#if VAMPIR
    VT_USER_END("READ_X");
#endif
#if VAMPIR
    VT_USER_START("READ_Y");
#endif
    /* Read first Y */
	double_buffering_read_Y( &db_Y, IO_BUFF, 0, 0 );
	double_buffering_swap( &db_Y );
#if VAMPIR
    VT_USER_END("READ_Y");
#endif

    int iter = 0;
    for ( j = 0; j < t; j++ )
    {
        /* Set the number of threads for the multi-threaded BLAS */
		set_multi_threaded_BLAS( cf.num_threads );

#if VAMPIR
        VT_USER_START("READ_Y");
#endif
        /* Read next Y */
		size_t next_j = (j+1) >= t ? 0 : j+1;
		double_buffering_read_Y( &db_Y, IO_BUFF, next_j, next_j );
#if VAMPIR
        VT_USER_END("READ_Y");
#endif

#if VAMPIR
        VT_USER_START("COMP_J");
#endif
        /* M := sigma * ( h^2 Phi - (1 - h^2) I ) */
        memcpy( M, Phi, (size_t)n * n * sizeof(double) );
		alpha = res_sigma[j] * h2[j];
        beta  = res_sigma[j] * (1 - h2[j]);
        dscal_(&nn, &alpha, M, &iONE);
        for ( i = 0; i < n; i++ )
            M[i*n + i] = M[i*n + i] + beta;

        /* L * L' = M */
        dpotrf_(LOWER, &n, M, &n, &info);
        if (info != 0)
        {
            char err[STR_BUFFER_SIZE];
            snprintf(err, STR_BUFFER_SIZE, "dpotrf(M) failed (info: %d)", info);
            error_msg(err, 1);
        }

        /* XL := inv(L) * XL */
        memcpy( XL, XL_orig, wXL * n * sizeof(double) );
        dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &wXL, &ONE, M, &n, XL, &n);

#if VAMPIR
        VT_USER_START("WAIT_Y");
#endif
        // Wait until current Y is available for computation
		double_buffering_wait( &db_Y, COMP_BUFF );
#if VAMPIR
        VT_USER_END("WAIT_Y");
#endif

        /* y := inv(L) * y */
		Y_comp = double_buffering_get_comp_buffer( &db_Y );
		// Sanity check
		average( Y_comp, n, 1, cf.threshold, "TRAIT", 
				&cf.Y_fvi->fvi_data[n*NAMELENGTH], NAMELENGTH, 0 );
        dtrsv_(LOWER, NO_TRANS, NON_UNIT, &n, M, &n, Y_comp, &iONE);

        /* B_t := XL' * y */
        dgemv_(TRANS, &n, &wXL, &ONE, XL, &n, Y_comp, &iONE, &ZERO, B_t, &iONE);

        /* V_tl := XL' * XL */
        dsyrk_(LOWER, TRANS, &wXL, &n, &ONE, XL, &n, &ZERO, V_tl, &wXL);
#if VAMPIR
        VT_USER_END("COMP_J");
#endif
		/* Solve for x_b X's at once */
        for (ib = 0; ib < m; ib += x_b) 
        {
#if VAMPIR
            VT_USER_START("READ_X");
#endif
            /* Read next block of XR's */
			size_t next_x_from = ((size_t)ib + x_b) >= m ?  0 : (size_t)ib + x_b;
			size_t next_x_to   = ((size_t)ib + x_b) >= m ? MIN( (size_t)x_b, (size_t)m ) - 1 : 
				                                           next_x_from + MIN( (size_t)x_b, (size_t)m - next_x_from ) - 1;
			double_buffering_read_XR( &db_XR, IO_BUFF, next_x_from, next_x_to );
#if VAMPIR
            VT_USER_END("READ_X");
#endif

#if VAMPIR
            VT_USER_START("WAIT_X");
#endif
            /* Wait until current block of XR's is available for computation */
			double_buffering_wait( &db_XR, COMP_BUFF );
#if VAMPIR
            VT_USER_END("WAIT_X");
#endif

            /* Set the number of threads for the multi-threaded BLAS */
			set_multi_threaded_BLAS( cf.num_threads );

#if VAMPIR
            VT_USER_START("COMP_IB");
#endif
            /* XR := inv(L) XR */
			XR_comp = double_buffering_get_comp_buffer( &db_XR );
			// Auxiliar variables
            int x_inc = MIN(x_b, m - ib);
            int rhss  = wXR * x_inc;
			// Sanity check
			average( XR_comp, n, x_inc, cf.threshold, "SNP", 
					&cf.XR_fvi->fvi_data[(n+ib)*NAMELENGTH], NAMELENGTH, 1 );
			// Computation
            dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &rhss, &ONE, M, &n, XR_comp, &n);

#if VAMPIR
            VT_USER_END("COMP_IB");
#endif

#if CHOL_MIX_PARALLELISM
            /* Set the number of threads for the multi-threaded BLAS to 1.
             * The innermost loop is parallelized using OPENMP */
			set_single_threaded_BLAS();
#endif
#if VAMPIR
            VT_USER_START("COMP_I");
#endif
            B_comp = double_buffering_get_comp_buffer( &db_B );
#if CHOL_MIX_PARALLELISM
            #pragma omp parallel for private(Bij, oneB, oneV, i, k, info, id) schedule(static) num_threads(cf.num_threads)
#endif
            for (i = 0; i < x_inc; i++)
            {
				id = omp_get_thread_num();
				oneB = &tmpBs[ id * p ];
				oneV = &tmpVs[ id * p * p ];
                Bij = &B_comp[i * size_one_b_record];

                // Building B
                // Copy B_T
                memcpy(oneB, B_t, wXL * sizeof(double));
                // B_B := XR' * y
                dgemv_("T", 
                        &n, &wXR, 
                        &ONE, &XR_comp[i * wXR * n], &n, Y_comp, &iONE, 
                        &ZERO, &oneB[wXL], &iONE);

                // Building V
                // Copy V_TL
                for( k = 0; k < wXL; k++ )
                    dcopy_(&wXL, &V_tl[k*wXL], &iONE, &oneV[k*p], &iONE); // V_TL
                // V_BL := XR' * XL
                dgemm_("T", "N",
                        &wXR, &wXL, &n,
                        &ONE, &XR_comp[i * wXR * n], &n, XL, &n,
                        &ZERO, &oneV[wXL], &p); // V_BL
                // V_BR := XR' * XR
                dsyrk_("L", "T", 
                        &wXR, &n, 
                        &ONE, &XR_comp[i * wXR * n], &n, 
                        &ZERO, &oneV[wXL * p + wXL], &p); // V_BR

                // B := inv(V) * B
                dpotrf_(LOWER, &p, oneV, &p, &info);
                if (info != 0)
                {
					for ( k = 0; k < size_one_b_record; k++ )
						Bij[k] = 0.0/0.0; //nan("char-sequence");
					continue;
                }
                dtrsv_(LOWER, NO_TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE);
                dtrsv_(LOWER,    TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE);

                /* V := res_sigma * inv( X' inv(M) X) */
                dpotri_(LOWER, &p, oneV, &p, &info);
                if (info != 0)
                {
                    char err[STR_BUFFER_SIZE];
                    snprintf(err, STR_BUFFER_SIZE, "dpotri failed (info: %d)", info);
                    error_msg(err, 1);
                }

				// Copy output
				for ( k = 0; k < p; k++ )
					Bij[k] = (float) oneB[k];
                for ( k = 0; k < p; k++ )
                    Bij[p+k] = (float)sqrt(oneV[k*p+k]);
				int idx = 0;
				for ( k = 0; k < p-1; k++ ) // Cols of V
					for ( l = k+1; l < p; l++ ) // Rows of V
					{
						Bij[p+p+idx] = (float)oneV[k*p+l];
						idx++;
					}
#if 0
			  printf("Chi square: %.6f\n", ( (oneB[p-1] / Bij[p+p-1]) * (oneB[p-1] / Bij[p+p-1]) ) );
#endif
            }
#if VAMPIR
            VT_USER_END("COMP_I");
#endif

#if VAMPIR
            VT_USER_START("WAIT_BV");
#endif
            /* Wait until the previous blocks of B's and V's are written */
            if ( iter > 0)
                double_buffering_wait( &db_B, IO_BUFF );
#if VAMPIR
            VT_USER_END("WAIT_BV");
#endif
            /* Write current blocks of B's and V's */
#if VAMPIR
            VT_USER_START("WRITE_BV");
#endif
			double_buffering_write_B( &db_B, COMP_BUFF, ib, ib+x_inc - 1, j, j );
#if VAMPIR
            VT_USER_END("WRITE_BV");
#endif

            /* Swap buffers */
			double_buffering_swap( &db_XR );
			double_buffering_swap( &db_B  );
            iter++;
        }
        /* Swap buffers */
		double_buffering_swap( &db_Y );
    }

#if VAMPIR
    VT_USER_START("WAIT_ALL");
#endif
    /* Wait for the remaining IO operations issued */
	double_buffering_wait( &db_XR, COMP_BUFF );
	double_buffering_wait( &db_Y,  COMP_BUFF );
	double_buffering_wait( &db_B,  IO_BUFF );
#if VAMPIR
    VT_USER_END("WAIT_ALL");
#endif

    /* Clean-up */
    free( M );

    free( XL );
    free( B_t  );
    free( V_tl );
    free( tmpBs );
    free( tmpVs );

	double_buffering_destroy( &db_XR );
	double_buffering_destroy( &db_Y  );
	double_buffering_destroy( &db_B  );

    return 0;
}