Пример #1
0
static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
{
    // PLs not present, use GTs instead. 
    int fake_PL = args->no_PLs ? args->no_PLs : 99;    // with 1, discordance is the number of non-matching GTs
    int nsm_gt, i;
    if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 ) 
        error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
    nsm_gt /= bcf_hdr_nsamples(hdr);
    int npl = line->n_allele*(line->n_allele+1)/2;
    hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr);
    for (i=0; i<bcf_hdr_nsamples(hdr); i++)
    {
        int *gt_ptr = args->tmp_arr + i*nsm_gt;
        int a = bcf_gt_allele(gt_ptr[0]);
        int b = bcf_gt_allele(gt_ptr[1]);

        int j, *pl_ptr = args->pl_arr + i*npl;
        if ( a<0 || b<0 ) // missing genotype
        {
            for (j=0; j<npl; j++) pl_ptr[j] = -1;
        }
        else
        {
            for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
            int idx = bcf_alleles2gt(a,b);
            pl_ptr[idx] = 0;
        }
    }
    return npl;
}
Пример #2
0
int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
    int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);

    if ( ngt<=0 ) return 1;                 // GT not present
    if ( ngt!=args->nsmpl*2 ) return 2;     // not diploid
    ngt /= args->nsmpl;
    
    int i,j, idx = 0;
    for (i=1; i<args->nsmpl; i++)
    {
        int32_t *a = args->tmp_arr + i*ngt;
        if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
        int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);

        for (j=0; j<i; j++)
        {
            int32_t *b = args->tmp_arr + j*ngt;
            if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
            int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);

            ntot[idx]++;
            if ( agt!=bgt ) ndif[idx]++;
            idx++;
        }
    }
    return 0;
}
Пример #3
0
int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
{
    if ( !args->nitmp )
    {
        args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
        if ( args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;
    }

    int i, nalt = 0, nref = 0;
    for (i=0; i<args->nsmpl; i++)
    {
        int32_t *gt = &args->itmp[i*args->nitmp];

        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;

        if ( bcf_gt_allele(gt[0]) ) nalt++;
        else nref++;

        if ( bcf_gt_allele(gt[1]) ) nalt++;
        else nref++;
    }
    if ( !nalt && !nref ) return -1;

    *alt_freq = (double)nalt / (nalt + nref);
    return 0;
}
Пример #4
0
int process_region_precise(args_t *args, char *seq, regitr_t *itr)
{
    int k = 1;
    uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end;
    while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++;
    
    int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL);
    assert(ret);

    memset(args->counts,0,args->ncounts*sizeof(int));

    // Select 'nsites' sites spaced so that they evenly cover the whole region 
    // to get a representative sample. We index-jump as we should be checking
    // a few sites only.
    int i, rid = -1, pos, prev_pos = -1, ismpl;
    for (i=0; i<args->nsites; i++)
    {
        rid = -1;
        pos = ((i+1.0)/(args->nsites+1))*(end - start) + start;
        if ( i>0 && pos <= prev_pos ) continue;     // the vcf is too sparse
        if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k;   // sequence not present
        if ( !bcf_sr_next_line(args->sr) ) return k;        // no sites found
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rid==-1 ) rid = rec->rid;
        if ( rid!=rec->rid || rec->pos > end ) break;
        prev_pos = rec->pos;

        int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts);
        ngts /= args->nsample;
        for (ismpl=0; ismpl<args->nsample; ismpl++)
        {
            int32_t *gts = args->gts + ngts*ismpl;
            int igt, ploidy = 0;
            for (igt=0; igt<ngts; igt++)
            {
                if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break;
                else ploidy++;
            }
            args->counts[ismpl*(args->max_ploidy+1) + ploidy]++;
            if ( args->verbose )
                fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy);
        }
    }

    for (ismpl=0; ismpl<args->nsample; ismpl++)
    {
        float sum = 0, *probs = args->sex2prob + ismpl*args->nsex;
        int *counts = args->counts + ismpl*(args->max_ploidy+1);
        for (i=0; i<args->max_ploidy+1; i++) sum += counts[i];
        if ( !sum ) continue;
        for (i=0; i<args->nsex; i++)
        {
            int ploidy = args->sex2ploidy[i];
            probs[i] *= counts[ploidy]/sum;
        }
    }

    return k;
}
static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec)
{
    int i, nGTs = bcf_get_genotypes(hdr, rec, &args->GTa, &args->mGTa);
    for (i=0; i<bcf_hdr_nsamples(hdr); i++)
    {
        if ( !args->swap_phase[i] ) continue;
        int *gt = &args->GTa[i*2];
        if ( gt[0]==bcf_gt_missing || gt[1]==bcf_int32_vector_end ) continue;
        SWAP(int, gt[0], gt[1]);
        gt[1] |= 1;
    }
    bcf_update_genotypes(hdr,rec,args->GTa,nGTs);
}
Пример #6
0
static void set_observed_prob_trio(bcf1_t *rec)
{
    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 ) return;
    if ( ngt!=6 ) return;   // chrX

    int32_t a,b,c,d,e,f;
    a = args.gt_arr[2*args.imother];
    b = args.gt_arr[2*args.imother+1];
    c = args.gt_arr[2*args.ifather];
    d = args.gt_arr[2*args.ifather+1];
    e = args.gt_arr[2*args.ichild];
    f = args.gt_arr[2*args.ichild+1];
    if ( bcf_gt_is_missing(a) || bcf_gt_is_missing(b) ) return;
    if ( bcf_gt_is_missing(c) || bcf_gt_is_missing(d) ) return;
    if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) return;
    if ( !bcf_gt_is_phased(a) && !bcf_gt_is_phased(b) ) return; // only the second allele should be set when phased
    if ( !bcf_gt_is_phased(c) && !bcf_gt_is_phased(d) ) return;
    if ( !bcf_gt_is_phased(e) && !bcf_gt_is_phased(f) ) return;
    a = bcf_gt_allele(a);
    b = bcf_gt_allele(b);
    c = bcf_gt_allele(c);
    d = bcf_gt_allele(d);
    e = bcf_gt_allele(e);
    f = bcf_gt_allele(f);

    int mother = (1<<a) | (1<<b);
    int father = (1<<c) | (1<<d);
    int child  = (1<<e) | (1<<f);
    if ( !(mother&child) || !(father&child) )  return;      // Mendelian-inconsistent site, skip

    if ( a!=b ) args.nhet_mother++;
    if ( c!=d ) args.nhet_father++;

    int m = args.msites;
    args.nsites++;
    hts_expand(uint32_t,args.nsites,args.msites,args.sites);
    if ( m!=args.msites ) args.eprob = (double*) realloc(args.eprob, sizeof(double)*args.msites*args.nstates);

    args.sites[args.nsites-1] = rec->pos;
    double *prob = args.eprob + args.nstates*(args.nsites-1);
    prob[TRIO_AC] = prob_shared(0,e,a) * prob_shared(0,f,c);
    prob[TRIO_AD] = prob_shared(0,e,a) * prob_shared(0,f,d);
    prob[TRIO_BC] = prob_shared(0,e,b) * prob_shared(0,f,c);
    prob[TRIO_BD] = prob_shared(0,e,b) * prob_shared(0,f,d);
    prob[TRIO_CA] = prob_shared(0,e,c) * prob_shared(0,f,a);
    prob[TRIO_DA] = prob_shared(0,e,d) * prob_shared(0,f,a);
    prob[TRIO_CB] = prob_shared(0,e,c) * prob_shared(0,f,b);
    prob[TRIO_DB] = prob_shared(0,e,d) * prob_shared(0,f,b);
}
Пример #7
0
static void set_observed_prob_unrelated(bcf1_t *rec)
{
    float af = 0.5;  // alternate allele frequency

    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 ) return;
    if ( ngt!=4 ) return;   // chrX

    int32_t a,b,c,d;
    a = args.gt_arr[2*args.isample];
    b = args.gt_arr[2*args.isample+1];
    c = args.gt_arr[2*args.jsample];
    d = args.gt_arr[2*args.jsample+1];
    if ( bcf_gt_is_missing(a) || bcf_gt_is_missing(b) ) return;
    if ( bcf_gt_is_missing(c) || bcf_gt_is_missing(d) ) return;
    if ( !bcf_gt_is_phased(a) && !bcf_gt_is_phased(b) ) return; // only the second allele should be set when phased
    if ( !bcf_gt_is_phased(c) && !bcf_gt_is_phased(d) ) return;
    a = bcf_gt_allele(a);
    b = bcf_gt_allele(b);
    c = bcf_gt_allele(c);
    d = bcf_gt_allele(d);

    int m = args.msites;
    args.nsites++;
    hts_expand(uint32_t,args.nsites,args.msites,args.sites);
    if ( m!=args.msites ) args.eprob = (double*) realloc(args.eprob, sizeof(double)*args.msites*args.nstates);

    args.sites[args.nsites-1] = rec->pos;
    double *prob = args.eprob + args.nstates*(args.nsites-1);
    prob[UNRL_xxxx] = prob_not_shared(af,a,c) * prob_not_shared(af,a,d) * prob_not_shared(af,b,c) * prob_not_shared(af,b,d);
    prob[UNRL_0x0x] = prob_shared(af,a,c) * prob_not_shared(af,b,d);
    prob[UNRL_0xx0] = prob_shared(af,a,d) * prob_not_shared(af,b,c);
    prob[UNRL_x00x] = prob_shared(af,b,c) * prob_not_shared(af,a,d);
    prob[UNRL_x0x0] = prob_shared(af,b,d) * prob_not_shared(af,a,c);
    prob[UNRL_0101] = prob_shared(af,a,c) * prob_shared(af,b,d);
    prob[UNRL_0110] = prob_shared(af,a,d) * prob_shared(af,b,c);

#if 0
    static int x = 0;
    if ( !x++)
    {
        printf("p(0==0) .. %f\n", prob_shared(af,0,0));
        printf("p(0!=0) .. %f\n", prob_not_shared(af,0,0));
        printf("p(0==1) .. %f\n", prob_shared(af,0,1));
        printf("p(0!=1) .. %f\n", prob_not_shared(af,0,1));
    }
    printf("%d|%d %d|%d  x:%f 11:%f 12:%f 21:%f 22:%f 11,22:%f 12,21:%f  %d\n", a,b,c,d,
            prob[UNRL_xxxx], prob[UNRL_0x0x], prob[UNRL_0xx0], prob[UNRL_x00x], prob[UNRL_x0x0], prob[UNRL_0101], prob[UNRL_0110], rec->pos+1);
#endif
}
Пример #8
0
bcf1_t *process(bcf1_t *rec)
{
    int ngts = bcf_get_genotypes(in_hdr, rec, &gts, &mgts);

    int i, changed = 0;
    for (i=0; i<ngts; i++)
    {
        if ( gts[i]==bcf_gt_missing )
        {
            gts[i] = new_gt;
            changed++;
        }
    }
    nchanged += changed;
    if ( changed ) bcf_update_genotypes(out_hdr, rec, gts, ngts);
    return rec;
}
Пример #9
0
int calc_dosage_GT(bcf1_t *rec)
{
    int i, j, nret = bcf_get_genotypes(in_hdr,rec,(void**)&buf,&nbuf);
    if ( nret<0 ) return -1;

    nret /= rec->n_sample;
    int32_t *ptr = (int32_t*) buf;
    for (i=0; i<rec->n_sample; i++)
    { 
        float dsg = 0;
        for (j=0; j<nret; j++)
        { 
            if ( ptr[j]==bcf_int32_missing || ptr[j]==bcf_int32_vector_end || ptr[j]==bcf_gt_missing ) break;
            if ( bcf_gt_allele(ptr[j]) ) dsg += 1;
        }
        printf("\t%.1f", j>0 ? dsg : -1);
        ptr += nret;
    }
    return 0;
}
Пример #10
0
void union_data::readGenotypesVCF(string fvcf,string region) {
	int n_includedG = 0;
	int n_excludedG_mult = 0;
	int n_excludedG_void = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	vector < int > mappingS;
	genotype_id.clear();
	genotype_chr.clear();
	genotype_start.clear();
	genotype_end.clear();
	genotype_val.clear();
	genotype_count=0;
	genotype_id_to_idx.clear();

	//Opening files
	bcf_srs_t * sr =  bcf_sr_init();

    //vrb.bullet("target region [" + regionGenotype.get() + "]");
    //if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
	bcf_sr_set_regions(sr, region.c_str(), 0);
	if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
		switch (sr->errnum) {
		case not_bgzf: vrb.error("File not compressed with bgzip!");
		case idx_load_failed: vrb.error("Impossible to load index file!");
		case file_type_error: vrb.error("File format not detected by htslib!");
		default : vrb.error("Unknown error!");
		}
	}

	//Sample processing
	int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
	for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
		mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
		if (mappingS.back() >= 0) n_includedS++;
	}


	//Read genotype data
	int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
	float * ds_arr = NULL;
	bcf1_t * line;
    unsigned int linecount = 0;
	while(bcf_sr_next_line (sr)) {
        linecount ++;
        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		line =  bcf_sr_get_line(sr, 0);
		if (line->n_allele == 2) {
			ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
			nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
			if (nds == n_samples || ngt == 2*n_samples) {
				bcf_unpack(line, BCF_UN_STR);
				string sid = string(line->d.id);
				if (filter_genotype.check(sid)) {
					genotype_id.push_back(sid);
					genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
					string genotype_ref = string(line->d.allele[0]);
					genotype_start.push_back(line->pos + 1);
					nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
					if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]);
					else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);
					genotype_val.push_back(vector < float > (sample_count, 0.0));

					for(int i = 0 ; i < n_samples ; i ++) {
						if (mappingS[i] >= 0) {
							if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i];
							else {
								if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
								else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
							}
						}
					}
                    pair < string, int > temp (sid,n_includedG);
                    genotype_id_to_idx.insert(temp);
					n_includedG++;
				} else n_excludedG_user ++;
			} else n_excludedG_void ++;
		} else n_excludedG_mult ++;
	}

	//Finalize
	free(gt_arr);
	free(ds_arr);
	bcf_sr_destroy(sr);
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
	//if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
    //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
}
Пример #11
0
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
{
    int *map = (int*) calloc(line->n_allele, sizeof(int));

    // create map of indexes from old to new ALT numbering and modify ALT
    kstring_t str = {0,0,0};
    kputs(line->d.allele[0], &str);

    int nrm = 0, i,j;  // i: ori alleles, j: new alleles
    for (i=1, j=1; i<line->n_allele; i++) 
    {
        if ( rm_mask & 1<<i )
        {
            // remove this allele
            line->d.allele[i] = NULL;
            nrm++;
            continue;
        }
        kputc(',', &str);
        kputs(line->d.allele[i], &str);
        map[i] = j;
        j++;
    }
    if ( !nrm ) { free(map); free(str.s); return; }

    int nR_ori = line->n_allele;
    int nR_new = line->n_allele-nrm;
    assert(nR_new > 0); // should not be able to remove reference allele
    int nA_ori = nR_ori-1;
    int nA_new = nR_new-1;

    int nG_ori = nR_ori*(nR_ori + 1)/2;
    int nG_new = nR_new*(nR_new + 1)/2;

    bcf_update_alleles_str(header, line, str.s);

    // remove from Number=G, Number=R and Number=A INFO fields.
    uint8_t *dat = NULL;
    int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
    for (i=0; i<line->n_info; i++)
    {
        bcf_info_t *info = &line->d.info[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
        
        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
        if ( type==BCF_HT_FLAG ) continue;
        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 ) 
        { 
            fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, 
                bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); 
            exit(1);
        }
        if ( type==BCF_HT_STR ) 
        { 
            str.l = 0;
            char *ss = (char*) dat, *se = (char*) dat;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<nexp; j++)
                {
                    if ( !*se ) break;
                    while ( *se && *se!=',' ) se++;
                    if ( rm_mask & 1<<(j+inc) ) 
                    { 
                        if ( *se ) se++;
                        ss = se; 
                        continue; 
                    }
                    if ( str.l ) kputc(',',&str);
                    kputsn(ss,se-ss,&str);
                    if ( *se ) se++;
                    ss = se;
                }
                assert( j==nexp );
            }
            else    // Number=G, assuming diploid genotype
            {
                int k = 0, n = 0;
                for (j=0; j<nR_ori; j++)
                {
                    for (k=0; k<=j; k++)
                    {
                        if ( !*se ) break;
                        while ( *se && *se!=',' ) se++;
                        n++;
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) 
                        { 
                            if ( *se ) se++;
                            ss = se; 
                            continue; 
                        }
                        if ( str.l ) kputc(',',&str);
                        kputsn(ss,se-ss,&str);
                        if ( *se ) se++;
                        ss = se;
                    }
                    if ( !*se ) break;
                }
                assert( n=nG_ori );
            }

            nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue; 
        }
        
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
        {
            int inc = 0, ntop;
            if ( vlen==BCF_VL_A )
            {
                assert( nret==nA_ori );
                ntop = nA_ori;
                ndat = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nret==nR_ori );
                ntop = nR_ori;
                ndat = nR_new;
            }
            int k = 0;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<ntop; j++) /* j:ori, k:new */ \
                { \
                    if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
                    if ( rm_mask & 1<<(j+inc) ) continue; \
                    if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
                    k++; \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
            }
            #undef BRANCH
        }
        else    // Number=G
        {
            assert( nret==nG_ori );
            int k, l_ori = -1, l_new = 0;
            ndat = nG_new;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<nR_ori; j++) \
                { \
                    for (k=0; k<=j; k++) \
                    { \
                        l_ori++; \
                        if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
                        if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
                        l_new++; \
                    } \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
            }
            #undef BRANCH
        }

        nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }

    // Update GT fields, the allele indexes might have changed
    for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
    if ( i<line->n_allele )
    {
        mdat = mdat_bytes / 4;  // sizeof(int32_t)
        nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
        mdat_bytes = mdat * 4;
        if ( nret>0 )
        {
            nret /= line->n_sample;
            int32_t *ptr = (int32_t*) dat;
            for (i=0; i<line->n_sample; i++)
            {
                for (j=0; j<nret; j++)
                {
                    if ( ptr[j]==bcf_gt_missing ) continue;
                    if ( ptr[j]==bcf_int32_vector_end ) break;
                    int al = bcf_gt_allele(ptr[j]);
                    assert( al<nR_ori && map[al]>=0 );
                    ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
                }
                ptr += nret;
            }
            bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
        }
    }

    // Remove from Number=G, Number=R and Number=A FORMAT fields. 
    // Assuming haploid or diploid GTs
    for (i=0; i<line->n_fmt; i++)
    {
        bcf_fmt_t *fmt = &line->d.fmt[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
        if ( type==BCF_HT_FLAG ) continue;

        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 ) 
        { 
            fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, 
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); 
            exit(1);
        }

        if ( type==BCF_HT_STR ) 
        {
            int size = nret/line->n_sample;     // number of bytes per sample
            str.l = 0;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    for (k_src=0; k_src<nexp; k_src++)
                    {
                        if ( ptr>=se || !*ptr) break;
                        while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                        if ( rm_mask & 1<<(k_src+inc) )
                        {
                            ss = ++ptr;
                            continue;
                        }
                        if ( k_dst ) kputc(',',&str);
                        kputsn(ss,ptr-ss,&str);
                        ss = ++ptr;
                        k_dst++;
                    }
                    assert( k_src==nexp );
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            else    // Number=G, diploid or haploid
            {
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    int nexp = 0; // diploid or haploid?
                    while ( ptr<se )
                    {
                        if ( !*ptr ) break;
                        if ( *ptr==',' ) nexp++;
                        ptr++;
                    }
                    if ( ptr!=ss ) nexp++;
                    assert( nexp==nG_ori || nexp==nR_ori );
                    ptr = ss;
                    if ( nexp==nG_ori ) // diploid
                    {
                        int ia, ib;
                        for (ia=0; ia<nR_ori; ia++)
                        {
                            for (ib=0; ib<=ia; ib++)
                            {
                                if ( ptr>=se || !*ptr ) break;
                                while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
                                {
                                    ss = ++ptr;
                                    continue;
                                }
                                if ( k_dst ) kputc(',',&str);
                                kputsn(ss,ptr-ss,&str);
                                ss = ++ptr;
                                k_dst++;
                            }
                            if ( ptr>=se || !*ptr ) break;
                        }
                    }
                    else    // haploid
                    {
                        for (k_src=0; k_src<nR_ori; k_src++)
                        {
                            if ( ptr>=se || !*ptr ) break;
                            while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                            if ( rm_mask & 1<<k_src )
                            {
                                ss = ++ptr;
                                continue;
                            }
                            if ( k_dst ) kputc(',',&str);
                            kputsn(ss,ptr-ss,&str);
                            ss = ++ptr;
                            k_dst++;
                        }
                        assert( k_src==nR_ori );
                        l = str.l - l;
                        for (; l<size; l++) kputc(0, &str);
                    }
                }
            }
            nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        int nori = nret / line->n_sample;
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
        {
            int ntop, inc = 0;
            if ( vlen==BCF_VL_A )
            {
                assert( nori==nA_ori );     // todo: will fail if all values are missing
                ntop = nA_ori;
                ndat = nA_new*line->n_sample;
                inc  = 1;
            }
            else
            {
                assert( nori==nR_ori );     // todo: will fail if all values are missing
                ntop = nR_ori;
                ndat = nR_new*line->n_sample;
            }

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \
                    int size = sizeof(type_t); \
                    int k_src, k_dst = 0; \
                    for (k_src=0; k_src<ntop; k_src++) \
                    { \
                        if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
                        if ( rm_mask & 1<<(k_src+inc) ) continue; \
                        if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                        k_dst++; \
                    } \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        else    // Number=G, diploid or mixture of haploid+diploid
        {
            assert( nori==nG_ori );
            ndat = nG_new*line->n_sample;

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
                    int size = sizeof(type_t); \
                    int ia, ib, k_dst = 0, k_src; \
                    int nset = 0;   /* haploid or diploid? */ \
                    for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
                    if ( nset==nR_ori ) /* haploid */ \
                    { \
                        for (k_src=0; k_src<nR_ori; k_src++) \
                        { \
                            if ( rm_mask & 1<<k_src ) continue; \
                            if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                            k_dst++; \
                        } \
                        memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                    } \
                    else /* diploid */ \
                    { \
                        k_src = -1; \
                        for (ia=0; ia<nR_ori; ia++) \
                        { \
                            for (ib=0; ib<=ia; ib++) \
                            { \
                                k_src++; \
                                if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
                                if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                                k_dst++; \
                            } \
                        } \
                    } \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }
    free(dat);
    free(str.s);
    free(map);
}
Пример #12
0
bcf1_t *process(bcf1_t *rec)
{
    bcf1_t *dflt = args.mode&MODE_LIST_GOOD ? rec : NULL;
    args.nrec++;

    if ( rec->n_allele > 63 ) return dflt;      // we use 64bit bitmask below

    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 ) return dflt;
    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
    ngt /= bcf_hdr_nsamples(args.hdr);

    int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori);

    int i, has_bad = 0, needs_update = 0;
    for (i=0; i<args.ntrios; i++)
    {
        int32_t a,b,c,d,e,f;
        trio_t *trio = &args.trios[i];

        a = args.gt_arr[ngt*trio->imother];
        b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
        c = args.gt_arr[ngt*trio->ifather];
        d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
        e = args.gt_arr[ngt*trio->ichild];
        f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;

        // skip sites with missing data in child
        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;

        uint64_t mother = 0, father = 0,child1,child2;

        int is_ok = 0;
        if ( !itr_set )
        {
            if ( f==bcf_int32_vector_end ) { warn_ploidy(rec); continue; }

            // All M,F,C genotypes are diploid. Missing data are considered consistent.
            child1 = 1<<bcf_gt_allele(e);
            child2 = 1<<bcf_gt_allele(f);
            mother  = bcf_gt_is_missing(a) ? child1|child2 : 1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? child1|child2 : 1<<bcf_gt_allele(b);
            father  = bcf_gt_is_missing(c) ? child1|child2 : 1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? child1|child2 : 1<<bcf_gt_allele(d);

            if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
        }
        else
        {
            child1  = 1<<bcf_gt_allele(e);
            child2  = bcf_gt_is_missing(f) || f==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(f);
            mother |= bcf_gt_is_missing(a) ? 0 : 1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(b);
            father |= bcf_gt_is_missing(c) ? 0 : 1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(d);

            regitr_copy(args.itr, args.itr_ori);
            while ( !is_ok && regitr_overlap(args.itr) )
            {
                rule_t *rule = &regitr_payload(args.itr,rule_t);
                if ( child1 && child2 )
                {
                    if ( !rule->mal || !rule->fal ) continue;   // wrong rule (haploid), but this is a diploid GT
                    if ( !mother ) mother = child1|child2;
                    if ( !father ) father = child1|child2;
                    if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1; 
                    continue;
                }
                if ( rule->mal )
                {
                    if ( mother && !(child1&mother) ) continue;
                }
                if ( rule->fal )
                {
                    if ( father && !(child1&father) ) continue;
                }
                is_ok = 1;
            }
        }
        if ( is_ok )
        {
            trio->nok++;
        }
        else
        {
            trio->nbad++;
            has_bad = 1;
            if ( args.mode&MODE_DELETE )
            {
                args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
                if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true 
                args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
                if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
                args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
                if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1]  = bcf_gt_missing;
                needs_update = 1;
            }
        }
    }

    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
        error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);

    if ( args.mode&MODE_DELETE ) return rec;
    if ( args.mode&MODE_LIST_GOOD ) return has_bad ? NULL : rec;
    if ( args.mode&MODE_LIST_BAD ) return has_bad ? rec : NULL;

    return NULL;
}
Пример #13
0
/*
 * GT field (genotype) comparison function.
 */
bcf1_t *process(bcf1_t *rec)
{
    uint64_t i;
    bcf_unpack(rec, BCF_UN_FMT); // unpack the Format fields, including the GT field
    int gte_smp = 0; // number GT array entries per sample (should be 2, one entry per allele)
    args.ngt_arr = 0;        /*! hold the number of current GT array entries */
    if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 )
    {
        error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1);
    }

    gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples

    // initialize with missing genotype
    int a1 = 0;
    int a2 = 0;

    // initialize with first selected sample genotype that is not missing
    int gt = -1;
    while ( (a1 == 0) || (a2 == 0) )
    {
       gt++;
       if (gt == args.nsmp) break;
       if (args.selected_smps[gt] == 0) continue;
       a1 = (args.gt_arr + gte_smp * gt)[0];
       if ( gte_smp == 2 ) a2 = (args.gt_arr + gte_smp * gt)[1];
       else if ( gte_smp == 1 ) a2 = bcf_int32_vector_end;
       else error("GTsubset does not support ploidy higher than 2.\n");
    }
//    fprintf(stderr, "a1: %i  a2: %i\n", a1, a2);

    // check all genotypes if they match (for included samples) or disagree (for samples not included)
    gt = 0;
    for ( i = 0; i < args.nsmp; i++ )
    {
        int *gt_ptr = args.gt_arr + gte_smp * i;

        int b1 = gt_ptr[0];
        int b2;
        if ( gte_smp == 2 ) // two entries available per sample, padded with missing values for haploid genotypes
        {
            b2 = gt_ptr[1];
        }
        else if (gte_smp == 1 ) // use vector end value for second entry, if only one is available
        {
            b2 = bcf_int32_vector_end;
        }
        else
        {
            error("GTsubset does not support ploidy higher than 2.\n");
        }

 //      fprintf(stderr, "b1: %i  b2: %i\n", b1, b2);
        /* missing genotypes are counted as always passing, as they neither
         * mismatch the initial selected genotype for a selected sample, nor
         * do they match the initial selected genotype for an excluded sample's
         * genotype */
        if ( (b1 == 0) || (b2 == 0) )
        {
            gt++;
//            fprintf(stderr, "missing => pass\n");
            continue;
        }
        else if ( args.selected_smps[i] == 1 )
        {
            if ( (b1 == a1) && (b2 == a2) )
            {
                gt++;
//                fprintf(stderr, "match => pass\n");
                continue;
            }
            else
            {
//                fprintf(stderr, "no match => fail\n");
                break;
            }
        }
        else if ( args.selected_smps[i] == 0 )
        {
            if ( (b1 != a1 ) || (b2 != a2) )
            {
                gt++;
 //               fprintf(stderr, "no match => pass\n");
                continue;
            }
            else
            {
//                fprintf(stderr, "match => fail\n");
                break;
            }
        }
    }
    if ( gt == args.nsmp )
    {
        return rec;
    }
    else
    {
        return NULL;
    }
}
Пример #14
0
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
{
    args->nitmp = 0;

    // Set allele frequency
    int ret;
    if ( args->af_tag )
    {
        // Use an INFO tag provided by the user
        ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
        if ( ret==1 )
            *alt_freq = args->AFs[0];
        if ( ret==-2 )
            error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
    }
    else if ( args->af_fname ) 
    {
        // Read AF from a file
        ret = read_AF(args->files->targets, line, alt_freq);
    }
    else
    {
        // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
        ret = -1;
        if ( !args->estimate_AF )
        {
            int AC = -1, AN = 0;
            ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
            if ( ret==1 )
            {
                AN = args->itmp[0];
                ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
                if ( ret>0 )
                    AC = args->itmp[0];
            }
            if ( AN<=0 || AC<0 ) 
                ret = -1;
            else 
                *alt_freq = (double) AC/AN;
        }
        if ( ret==-1 )
            ret = estimate_AF(args, line, alt_freq);    // reads GTs into args->itmp
    }

    if ( ret<0 ) return ret;
    if ( *alt_freq==0.0 )
    {
        if ( args->dflt_AF==0 ) return -1;       // we skip sites with AF=0
        *alt_freq = args->dflt_AF;
    }

    // Set P(D|G)
    if ( args->fake_PLs )
    {
        if ( !args->nitmp )
        {
            args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
            if ( args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
            args->nitmp /= args->nsmpl;
        }

        int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;

        int a = bcf_gt_allele(gt[0]);
        int b = bcf_gt_allele(gt[1]);
        if ( a!=b )
        {
            pdg[0] = pdg[2] = args->unseen_PL;
            pdg[1] = 1 - 2*args->unseen_PL;
        }
        else if ( a==0 )
        {
            pdg[0] = 1 - 2*args->unseen_PL;
            pdg[1] = pdg[2] = args->unseen_PL;
        }
        else
        {
            pdg[0] = pdg[1] = args->unseen_PL;
            pdg[2] = 1 - 2*args->unseen_PL;
        }
    }
    else
    {
        args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
        if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;

        int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
        pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
        pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
        pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;

        double sum = pdg[0] + pdg[1] + pdg[2];
        if ( !sum ) return -1;
        pdg[0] /= sum;
        pdg[1] /= sum;
        pdg[2] /= sum;
    }

    return 0;
}
void genrich_data::readReferenceGenotypes(string fvcf) {
	vector < int > mappingS;

	//Opening files
	vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf));
	bcf_srs_t * sr =  bcf_sr_init();
	if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
		switch (sr->errnum) {
		case not_bgzf: vrb.error("File not compressed with bgzip!");
		case idx_load_failed: vrb.error("Impossible to load index file!");
		case file_type_error: vrb.error("File format not detected by htslib!");
		default : vrb.error("Unknown error!");
		}
	}

	//Sample processing
	int included_sample = 0;
	int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
	for (int i = 0 ; i < n_samples ; i ++) {
		mappingS.push_back(findSample(string(sr->readers[0].header->samples[i])));
		if (mappingS.back() >= 0) included_sample ++;
	}
	vrb.bullet("#samples = " + stb.str(included_sample));

	//Variant processing
	unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0;
	int ngt, ngt_arr = 0, *gt_arr = NULL;
	bcf1_t * line;
	while(bcf_sr_next_line (sr)) {
		line =  bcf_sr_get_line(sr, 0);
		if (line->n_allele == 2) {
			bcf_unpack(line, BCF_UN_STR);
			string sid = string(line->d.id);
			string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
			int chr_idx = findCHR(chr);
			if (chr_idx >= 0) {
				unsigned int pos = line->pos + 1;
				ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
				if (ngt == 2*n_samples) {
					double freq = 0.0, tot = 0.0;
					for(int i = 0 ; i < n_samples ; i ++) {
						assert(gt_arr[2*i+0] != bcf_gt_missing && gt_arr[2*i+1] != bcf_gt_missing);
						if (mappingS[i] >= 0) {
							freq += bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
							tot += 2.0;
						}
					}
					double maf = freq / tot;
					if (maf > 0.5) maf = 1.0 - maf;
					if (maf >= threshold_maf) {
						int dist_tss = getDistance(chr_idx, pos);
						if (dist_tss < 1e6) {
							string tmp_id = chr + "_" + stb.str(pos);
							genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size()));
							genotype_chr.push_back(chr_idx);
							genotype_pos.push_back(pos);
							genotype_maf.push_back(maf);
							genotype_dist.push_back(dist_tss);
							genotype_haps.push_back(vector < bool > (2 * included_sample, false));
							for(int i = 0 ; i < n_samples ; i ++) {
								if (mappingS[i] >= 0) {
									genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(gt_arr[2 * i + 0]);
									genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]);
								}
							}
						} else n_excludedV_toofar++;
					} else n_excludedV_rare ++;
				} else n_excludedV_void ++;
			} else n_excludedV_uchr ++;
		} else n_excludedV_mult ++;

		if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line));

		n_line ++;
 	}
	genotype_qtl = vector < bool > (genotype_pos.size(), false);
	genotype_gwas = vector < bool > (genotype_pos.size(), false);
	genotype_bin = vector < int > (genotype_pos.size(), -1);

	//Finalize
	bcf_sr_destroy(sr);
	vrb.bullet(stb.str(genotype_pos.size()) + " variants included");
	if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded");
	if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss");
	if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants");
	if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants");
}
Пример #16
0
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) 
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample ) 
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 ) 
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n", args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            m_gt2ipl = n_gt2ipl;
            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 ) 
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0;  // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++) 
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( a<0 || b<0 ) continue; // missing genotypes
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
            int igt_qry = gt2ipl[igt_tgt];  // corresponding genotype in query file
            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
            args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl); 
            args->sites[i]++; 
        }
        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");

            int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]); 
            for (igt=0; igt<npl; igt++)   
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
                else fprintf(fp, "\t%d", pl_ptr[igt]); 
            fprintf(fp, "\n"); 
        }
    }
    free(gt2ipl);
    free(gt_arr);
    free(args->pl_arr);
    free(args->tmp_arr);

    // Scale LKs and certainties
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double min = args->lks[0];
    for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i];
    for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0;
    double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0;
    for (i=1; i<nsamples; i++) 
    {
        double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0;
        if ( max_avg<val ) max_avg = val;
    }

    // Sorted output
    double **p = (double**) malloc(sizeof(double*)*nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(int*), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;
        double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0;
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i);
    }

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
Пример #17
0
static void phased_flush(args_t *args)
{
    if ( !args->nbuf ) return;

    bcf_hdr_t *ahdr = args->files->readers[0].header;
    bcf_hdr_t *bhdr = args->files->readers[1].header;

    int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
    static int gt_absent_warned = 0;

    for (i=0; i<args->nbuf; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf1_t *brec = args->buf[i+1];

        int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
        if ( nGTs < 0 ) 
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid
        nGTs = bcf_get_genotypes(bhdr, brec, &args->GTb, &args->mGTb);
        if ( nGTs < 0 )
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid

        for (j=0; j<nsmpl; j++)
        {
            int *gta = &args->GTa[j*2];
            int *gtb = &args->GTb[j*2];
            if ( gta[1]==bcf_int32_vector_end || gtb[1]==bcf_int32_vector_end ) continue;
            if ( bcf_gt_is_missing(gta[0]) || bcf_gt_is_missing(gta[1]) || bcf_gt_is_missing(gtb[0]) || bcf_gt_is_missing(gtb[1]) ) continue;
            if ( !bcf_gt_is_phased(gta[1]) || !bcf_gt_is_phased(gtb[1]) ) continue;
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gta[1]) || bcf_gt_allele(gtb[0])==bcf_gt_allele(gtb[1]) ) continue;
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[0]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[1]) )
            {
                if ( args->swap_phase[j] ) args->nmism[j]++; else args->nmatch[j]++;
            }
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[1]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[0]) )
            {
                if ( args->swap_phase[j] ) args->nmatch[j]++; else args->nmism[j]++;
            }
        }
    }
    for (i=0; i<args->nbuf/2; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
        if ( args->nswap )
            phase_update(args, args->out_hdr, arec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, arec);

        if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = arec->pos;
    }
    args->nswap = 0;
    for (j=0; j<nsmpl; j++)
    {
        if ( args->nmatch[j] >= args->nmism[j] )
            args->swap_phase[j] = 0;
        else
        {
            args->swap_phase[j] = 1;
            args->nswap++;
        }
        if ( args->nmatch[j] && args->nmism[j] )
        {
            // Entropy-inspired quality. The factor 0.7 shifts and scales to (0,1)
            double f = (double)args->nmatch[j]/(args->nmatch[j]+args->nmism[j]);
            args->phase_qual[j] = 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7;
        }
        else
            args->phase_qual[j] = 99;
        args->nmatch[j] = 0;
        args->nmism[j]  = 0;
    }
    int PQ_printed = 0;
    for (; i<args->nbuf; i+=2)
    {
        bcf1_t *brec = args->buf[i+1];
        bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
        if ( !PQ_printed )
        {
            bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
            PQ_printed = 1;
            for (j=0; j<nsmpl; j++)
                if ( args->phase_qual[j] < args->min_PQ ) 
                {
                    args->phase_set[j] = brec->pos+1;
                    args->phase_set_changed = 1;
                }
                else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing;
        }
        if ( args->nswap )
            phase_update(args, args->out_hdr, brec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, brec);

        if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = brec->pos;
    }
    args->nbuf = 0;
}
Пример #18
0
int main(int argc, char **argv)
{
    if (argc < 3) {
        fprintf(stderr,"%s <bcf file> <num vars>\n", argv[0]);
        return 1;
    }

    char *fname = argv[1];
    uint32_t num_vars = atoi(argv[2]);

    htsFile *fp    = hts_open(fname,"rb");
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    bcf1_t *line    = bcf_init1();
    int32_t *gt_p = NULL;

    uint32_t num_inds = bcf_hdr_nsamples(hdr);
    
    int32_t i, j, k, ntmp = 0, int_i = 0, two_bit_i = 0, sum, t_sum = 0;

    uint32_t num_ind_ints = 1 + ((num_inds - 1) / 16);

    pri_queue q = priq_new(0);
    priority p;

    uint32_t *packed_ints = (uint32_t *) calloc(num_ind_ints,
                                                sizeof(uint32_t));

    FILE *gt_of = fopen("gt.tmp.packed","wb");
    FILE *md_of = fopen("md.tmp.packed","w");

    uint32_t *md_index = (uint32_t *) malloc(num_vars * sizeof(uint32_t));
    uint32_t md_i = 0;

    unsigned long t_bcf_read = 0, 
                  t_bcf_dup = 0,
                  t_bcf_unpack = 0,
                  t_bcf_get_genotypes = 0,
                  t_bcf_hdr_nsamples = 0,
                  t_q = 0,
                  t_write = 0,
                  t_get_md = 0,
                  t_md_write = 0,
                  t_pack = 0;

    for (i = 0; i < num_vars; ++i) {
        sum = 0;
        int_i = 0;
        two_bit_i = 0;

        int r = bcf_read(fp, hdr, line);
        
        // Copy
        bcf1_t *t_line = bcf_dup(line);

        // Unpack
        bcf_unpack(t_line, BCF_UN_ALL);

        // Get metadata
        size_t len = strlen(bcf_hdr_id2name(hdr, t_line->rid)) +
                     10 + // max length of pos
                     strlen(t_line->d.id) +
                     strlen(t_line->d.allele[0]) +
                     strlen(t_line->d.allele[1]) +
                     4; //tabs
        char *md = (char *) malloc(len * sizeof(char));

        sprintf(md, "%s\t%d\t%s\t%s\t%s",
                     bcf_hdr_id2name(hdr, t_line->rid),
                     t_line->pos + 1,
                     t_line->d.id,
                     t_line->d.allele[0],
                     t_line->d.allele[1]); 

        // Write metadata
        md_i += strlen(md);
        md_index[i] = md_i;
        fprintf(md_of, "%s", md);

        // Get gentotypes
        uint32_t num_gts_per_sample = bcf_get_genotypes(hdr,
                                                        t_line,
                                                        &gt_p,
                                                        &ntmp);
        num_gts_per_sample /= num_inds;
        int32_t *gt_i = gt_p;
        
        // Pack genotypes
        for (j = 0; j < num_inds; ++j) {
            uint32_t gt = 0;
            for (k = 0; k < num_gts_per_sample; ++k) {
                gt += bcf_gt_allele(gt_i[k]);
            }

            packed_ints[int_i] += gt << (30 - 2*two_bit_i);
            two_bit_i += 1;
            if (two_bit_i == 16) {
                two_bit_i = 0;
                int_i += 1;
            }

            sum += gt;
            gt_i += num_gts_per_sample;
        }

        // Get a priority for the variant based on the sum and number of 
        // leading zeros
        p.sum = sum;
        uint32_t prefix_len = 0;
        j = 0;
        while ((j < num_ind_ints) && (packed_ints[j] == 0)){
            prefix_len += 32;
            j += 1;
        }
        if (j < num_ind_ints)
            prefix_len += nlz1(packed_ints[j]);
        
        // Push it into the q
        p.len = prefix_len;
        int *j = (int *) malloc (sizeof(int));
        j[0] = i;
        priq_push(q, j, p);

        // Write to file
        fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,gt_of);

        memset(packed_ints, 0, num_ind_ints*sizeof(uint32_t));

        t_sum += sum;

        bcf_destroy(t_line);
        free(md);
    }
    fclose(gt_of);
    fclose(md_of);


    md_of = fopen("md.tmp.packed","r");
    FILE *md_out = fopen("md.bim","w");
    gt_of = fopen("gt.tmp.packed","rb");
    FILE *s_gt_of = fopen("s.gt.tmp.packed","wb");

    // Get variants in order and rewrite a variant-major sorted matrix
    while ( priq_top(q, &p) != NULL ) {
        int *d = priq_pop(q, &p);

        uint32_t start = 0;
        if (*d != 0)
            start = md_index[*d - 1];

        uint32_t len = md_index[*d] - start;

        fseek(md_of, start*sizeof(char), SEEK_SET);
        char buf[len+1];
        fread(buf, sizeof(char), len, md_of);
        buf[len] = '\0';

        fseek(gt_of, (*d)*num_ind_ints*sizeof(uint32_t), SEEK_SET);
        fread(packed_ints, sizeof(uint32_t), num_ind_ints, gt_of);
        fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,s_gt_of);

        fprintf(md_out, "%s\n", buf);
    }

    fclose(md_out);
    fclose(md_of);
    fclose(gt_of);
    fclose(s_gt_of);


    /*
     * In a packed-int variant-major matrix there will be a num_vars
     * number of rows, and a num_inds number of values packed into
     * num_inds_ints number of intergers.  For examples, 16 rows of 16 values
     * will be 16 ints, where each int encodes 16 values.
     *
     */
    
    uint32_t num_var_ints = 1 + ((num_vars - 1) / 16);

    uint32_t *I_data = (uint32_t *)calloc(num_var_ints*16,sizeof(uint32_t));
    uint32_t **I = (uint32_t **)malloc(16*sizeof(uint32_t*));
    for (i = 0; i < 16; ++i)
        I[i] = I_data + i*num_var_ints;
    uint32_t I_i = 0, I_int_i = 0;

    uint32_t v;

    s_gt_of = fopen("s.gt.tmp.packed","rb");
    FILE *rs_gt_of = fopen("r.s.gt.tmp.packed","wb");

    // Write these to values to that this is a well-formed uncompressed 
    // packed int binary file (ubin) file
    fwrite(&num_vars, sizeof(uint32_t), 1, rs_gt_of);
    fwrite(&num_inds, sizeof(uint32_t), 1, rs_gt_of);
     
    /* 
     * we need to loop over the columns in the v-major file.
     * There are num_vars rows, and num_ind_ints 16-ind packed columns
     *
     * In this loop :
     *  i: cols in var-major form, rows in ind-major form
     *  j: rows in var-major form, cols in ind-major form
     */
    uint32_t num_inds_to_write = num_inds;
    for (i = 0; i < num_ind_ints; ++i) { // loop over each int col
        for (j = 0; j < num_vars; ++j) { // loop over head row in that col
            // skip to the value at the row/col
            fseek(s_gt_of, 
                  j*num_ind_ints*sizeof(uint32_t) + //row
                  i*sizeof(uint32_t), //col
                  SEEK_SET);

            fread(&v, sizeof(uint32_t), 1, s_gt_of);

            // one int corresponds to a col of 16 two-bit values
            // two_bit_i will move across the cols
            for (two_bit_i = 0; two_bit_i < 16; ++two_bit_i) {
                I[two_bit_i][I_i] += ((v >> (30 - 2*two_bit_i)) & 3) << 
                                     (30 - 2*I_int_i);
            }
            I_int_i += 1;

            if (I_int_i == 16) {
                I_i += 1;
                I_int_i = 0;
            }
        }

        // When we are at the end of the file, and the number of lines 
        // is not a factor of 16, only write out the lines that contain values
        if (num_inds_to_write >= 16) {
            fwrite(I_data,
                   sizeof(uint32_t),
                   num_var_ints*16,
                   rs_gt_of);
            num_inds_to_write -= 16;
        } else {
            fwrite(I_data,
                   sizeof(uint32_t),
                   num_var_ints*num_inds_to_write,
                   rs_gt_of);
        }
        memset(I_data, 0, num_var_ints*16*sizeof(uint32_t));
        I_int_i = 0;
        I_i = 0;
    }

    fclose(s_gt_of);
    fclose(rs_gt_of);

    free(md_index);
    free(packed_ints);
}