Пример #1
0
/**
 *  load_genmap() - load the genetic map for the chromosome of @line
 *  @args:  program state; genmap_fname and hdr are read, genmap/ngenmap/igenmap are set
 *  @line:  record whose sequence name replaces "{CHROM}" in genmap_fname
 *
 *  Returns 0 on success or when no genetic map file name is set, -1 when the
 *  map file cannot be opened. Malformed content is reported through error(),
 *  which is assumed not to return.
 */
static int load_genmap(args_t *args, bcf1_t *line)
{
    if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }

    // The expanded file name lives in its own buffer: the line buffer below is
    // overwritten by every hts_getline() call, and the name is still needed
    // for error messages.
    kstring_t fname_buf = {0,0,0};
    const char *fname = args->genmap_fname;
    const char *tag = strstr(args->genmap_fname,"{CHROM}");
    if ( tag )
    {
        kputsn(args->genmap_fname, tag - args->genmap_fname, &fname_buf);
        kputs(bcf_seqname(args->hdr,line), &fname_buf);
        kputs(tag+7,&fname_buf);    // 7 == strlen("{CHROM}")
        fname = fname_buf.s;
    }

    htsFile *fp = hts_open(fname, "rb");
    if ( !fp )
    {
        args->ngenmap = 0;
        free(fname_buf.s);
        return -1;
    }

    kstring_t str = {0,0,0};
    if ( hts_getline(fp, KS_SEP_LINE, &str) < 0 || !str.s )
        error("Could not read the header of %s\n", fname);
    if ( strcmp(str.s,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
        error("Unexpected header in %s, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", fname, str.s);

    args->ngenmap = args->igenmap = 0;
    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
    {
        args->ngenmap++;
        hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
        genmap_t *gm = &args->genmap[args->ngenmap-1];

        // first column: position
        char *tmp, *end;
        gm->pos = strtol(str.s, &tmp, 10);
        if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // skip the separator and the second column; never step over the NUL
        if ( !*tmp ) error("Could not parse %s: %s\n", fname, str.s);
        tmp++;
        while ( *tmp && !isspace((unsigned char)*tmp) ) tmp++;
        if ( !*tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // third column: the genetic map in cM
        gm->rate = strtod(tmp+1, &end);
        if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
    }
    if ( !args->ngenmap ) error("Genetic map empty?\n");

    // scale to 1: every value is divided by the last value read
    const double last_rate = args->genmap[args->ngenmap-1].rate;
    int i;
    for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= last_rate;

    if ( hts_close(fp) ) error("Close failed\n");
    free(str.s);
    free(fname_buf.s);
    return 0;
}
Пример #2
0
/**
 *  annots_reader_next() - read the next line from args->file and set
 *  args->dclass (first column) and args->vals (remaining columns).
 *
 *  On the first successful read the number of value columns (args->mvals)
 *  is taken from the number of tabs in the line and args->vals is allocated.
 *
 *  Returns 1 on successful read or 0 if no further record could be read.
 */
int annots_reader_next(args_t *args)
{
    args->str.l = 0;
    if ( hts_getline(args->file,'\n',&args->str) <= 0 ) return 0;

    char *line = args->str.s;
    char *cursor;

    if ( args->mvals == 0 )
    {
        // one value column per tab character
        for (cursor = line; *cursor; cursor++)
            if ( *cursor == '\t' ) args->mvals++;
        args->vals = (double*) malloc(args->mvals*sizeof(double));
    }

    // class: the leading integer of the line
    args->dclass = atoi(line);

    // NOTE(review): column_next() is assumed to return a pointer to the next
    // delimiter, or to the terminating NUL when there is none - confirm.
    cursor = column_next(line, '\t');

    // values
    int ifield = 0;
    while ( ifield < args->mvals )
    {
        if ( *cursor == '\0' ) error("Could not parse %d-th data field: is the line truncated?\nThe line was: [%s]\n",ifield+2,line);
        cursor++;                               // step over the delimiter
        args->vals[ifield] = atof(cursor);
        cursor = column_next(cursor,'\t');
        ifield++;
    }
    return 1;
}
Пример #3
0
/**
 * Parse multiple files from command line unlabeled arguments or -L denoted file list.  If both are defined, the files are merged.
 *
 * Lines of the file list that are empty or start with '#' are ignored.
 * Exits with status 1 if the file list cannot be opened.
 *
 * @files     - file names are stored in this vector (previous content is discarded)
 * @arg_files - vector of input files
 * @file_list - name of a file holding one file name per line; "" for none
 */
void Program::parse_files(std::vector<std::string>& files, const std::vector<std::string>& arg_files, std::string file_list)
{
    files = arg_files;

    if (file_list.empty())
    {
        return;
    }

    htsFile *file = hts_open(file_list.c_str(), "r");
    if (file==NULL)
    {
        std::cerr << "cannot open " << file_list << "\n";
        exit(1);
    }

    kstring_t *s = &file->line;
    while (hts_getline(file, '\n', s) >= 0)
    {
        // a blank line would otherwise be recorded as an empty file name
        if (s->l==0 || s->s[0]=='#')
        {
            continue;
        }
        files.push_back(std::string(s->s));
    }
    hts_close(file);
}
Пример #4
0
/**
 * Reads next record, hides the random access of different regions from the user.
 *
 * Without random access the next line of the file is read. With random access
 * the current interval iterator is drained first; once it yields nothing more,
 * the next interval is initialized, until no interval is left.
 *
 * @s - receives the record text
 * @return true if a record was read, false otherwise
 */
bool TBXOrderedReader::read(kstring_t *s)
{
    if (!random_access_enabled)
    {
        return hts_getline(hts, '\n', s) >= 0;
    }

    for (;;)
    {
        if (itr && tbx_itr_next(hts, tbx, itr, s)>=0)
        {
            return true;
        }

        if (!initialize_next_interval())
        {
            return false;
        }
    }
}
Пример #5
0
/*
 * Scan the phenotype BED file fbed and record, for every phenotype that passes
 * the filter, its id, chromosome, start (+1), end, optional group and strand.
 * Sample columns are not read here.
 *
 * A tabix index for fbed must be loadable; it supplies the meta character that
 * marks header lines. The first line of the file must be such a header line.
 */
void cis_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header; hts_getline() returns a negative value on EOF/error, so test with <= 0
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	//Scan file
	vector < string > tokens;
	while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
		if (!str.l || str.s[0] == tbx->conf.meta_char) continue;
		stb.split(string(str.s), tokens);
		//Columns 0..5 are accessed below (chr, start, end, id, group, strand)
		if (tokens.size() < 6) vrb.error("Incorrect number of columns!");
		if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
			if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
			phenotype_neg.push_back(tokens[5] == "-");
			if (phenotype_neg.back()) n_negativeStrd ++;
			n_includedP++;
		} else n_excludedP ++;
	}

	//Finalize & verbose
	free(str.s);
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
Пример #6
0
/**
 *  reheader_vcf() - copy the uncompressed VCF args->fname to stdout with a
 *  modified header.
 *
 *  The header is taken from args->header_fname when set, otherwise from the
 *  input file itself. When args->samples_fname is set, the sample names read
 *  from it are passed to set_samples() together with the header. All
 *  remaining lines are copied unchanged.
 */
static void reheader_vcf(args_t *args)
{
    kstring_t hdr = {0,0,0};
    htsFile *fp = hts_open(args->fname, "r");
    if ( !fp ) error("Failed to open: %s\n", args->fname);

    // Collect the header; on exit fp->line holds the first non-header line, if any
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
    {
        kputc('\n',&fp->line);  // hts_getline eats the newline character
        if ( fp->line.s[0]!='#' ) break;
        kputsn(fp->line.s,fp->line.l,&hdr);
    }

    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }

    // kstring lengths are size_t: compare against write()'s ssize_t explicitly
    // and print them with %zu
    int out = STDOUT_FILENO;
    if ( write(out, hdr.s, hdr.l)!=(ssize_t)hdr.l ) error("Failed to write %zu bytes\n", hdr.l);
    free(hdr.s);
    if ( fp->line.l )
    {
        if ( write(out, fp->line.s, fp->line.l)!=(ssize_t)fp->line.l ) error("Failed to write %zu bytes\n", fp->line.l);
    }
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )   // uncompressed file implies small size, we don't worry about speed
    {
        kputc('\n',&fp->line);
        if ( write(out, fp->line.s, fp->line.l)!=(ssize_t)fp->line.l ) error("Failed to write %zu bytes\n", fp->line.l);
    }
    if ( hts_close(fp) ) error("Close failed: %s\n", args->fname);
}
Пример #7
0
/*
 * Collect the leading header lines of the file into `header`, each followed by
 * '\n'. A header line is a non-empty line starting with the index's meta
 * character; reading stops at the first line that is not one.
 *
 * If an iterator is active it is reset to the first chromosome afterwards.
 */
void Tabix::getHeader(string& header) {
    header.clear();
    kstring_t str = {0,0,0};
    while ( hts_getline(fn, KS_SEP_LINE, &str) >= 0 ) {
        if ( !str.l || str.s[0]!=tbx->conf.meta_char ) {
            break;
        }
        header += str.s;
        header += "\n";
    }
    free(str.s);    // the line buffer is owned by this function
    // set back to start
    if (iter != NULL) {
        current_chrom = chroms.begin();
        tbx_itr_destroy(iter);
        iter = tbx_itr_querys(tbx, current_chrom->c_str());
    }
}
Пример #8
0
/**
 *  read_header_file() - read the text file @fname into @hdr
 *  @fname: file to read
 *  @hdr:   receives the text; previous content is discarded
 *
 *  Every line read is stored followed by '\n'. Reading stops at the end of
 *  the file or at the first empty line. Trailing whitespace is then removed
 *  and a single '\n' appended, so the result always ends with exactly one
 *  newline.
 */
static void read_header_file(char *fname, kstring_t *hdr)
{
    kstring_t tmp = {0,0,0};
    hdr->l = 0;

    htsFile *fp = hts_open(fname, "r");
    if ( !fp ) error("Could not read: %s\n", fname);
    while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 )
    {
        kputsn(tmp.s,tmp.l,hdr);
        kputc('\n',hdr);
    }
    if ( hts_close(fp) ) error("Close failed: %s\n", fname);
    free(tmp.s);

    // remove trailing newlines; the cast keeps isspace() defined for bytes >= 0x80
    while ( hdr->l>0 && isspace((unsigned char)hdr->s[hdr->l-1]) ) hdr->l--;
    kputc('\n',hdr);
}
Пример #9
0
/**
 * Parse intervals. Processes the interval list first followed by the interval string. Duplicates are dropped.
 *
 * @intervals       - intervals stored in this vector
 * @interval_list   - file containing intervals
 * @interval_string - comma delimited intervals in a string
 *
 * todo: merge overlapping sites?
 */
void Program::parse_intervals(std::vector<GenomeInterval>& intervals, std::string interval_list, std::string interval_string)
{
    intervals.clear();
    std::map<std::string, uint32_t> m;

    if (interval_list!="")
    {
        htsFile *file = hts_open(interval_list.c_str(), "r");
        if (file)
        {
            kstring_t *s = &file->line;
            while (hts_getline(file, '\n', s)>=0)
            {
                std::string ss = std::string(s->s);
                if (m.find(ss)==m.end())
                {
                    m[ss] = 1;
                    GenomeInterval interval(ss);
                    intervals.push_back(interval);
                }
            }
            hts_close(file);
        }
    }

    std::vector<std::string> v;
    if (interval_string!="")
        split(v, ",", interval_string);

    for (uint32_t i=0; i<v.size(); ++i)
    {
        if (m.find(v[i])==m.end())
        {
            m[v[i]] = 1;
            GenomeInterval interval(v[i]);
            intervals.push_back(interval);
        }
    }
}
Пример #10
0
/**
 * Parse samples. Processes the sample list. Duplicates are dropped.
 *
 * @samples      - samples stored in this vector
 * @sample_map   - samples stored in this map
 * @sample_list  - file containing sample names
 */
void Program::read_sample_list(std::vector<std::string>& samples, std::string sample_list)
{
    samples.clear();
    std::map<std::string, int32_t> map;

    if (sample_list!="")
    {
        htsFile *file = hts_open(sample_list.c_str(), "r");
        if (file)
        {
            kstring_t *s = &file->line;
            while (hts_getline(file, '\n', s)>=0)
            {
                std::string ss = std::string(s->s);
                if (map.find(ss)==map.end())
                {
                    map[ss] = 1;
                    samples.push_back(ss);
                }
            }
            hts_close(file);
        }
    }
}
Пример #11
0
/*
 * Scan the genotype BED file fbed and register every variant that is not yet
 * known (by id) and passes the genotype filter: id, chromosome, start (+1),
 * end and an id -> index entry. Genotype values are not read here.
 *
 * When regionGenotype.chr is not "NA" only that region is scanned through the
 * tabix index, otherwise the whole file is read line by line.
 */
void union_data::scanGenotypesBED(string fbed) {
	int n_includedG = 0;
	int n_excludedG_user = 0;

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Jump to interesting region, if any
	const bool use_region = (regionGenotype.chr != "NA");
	hts_itr_t *itr = NULL;
	if (use_region) {
		itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
		vrb.bullet("target region [" + regionGenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
	}

	//Read genotype data; both access modes share the same record handling
	vector < string > tokens;
	unsigned int linecount = 0;
	while ((use_region ? tbx_itr_next(fp, tbx, itr, &str) : hts_getline(fp, KS_SEP_LINE, &str)) >= 0) {
		linecount ++;
		if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		//A sequential read also returns blank and meta lines: skip them
		if (!use_region && (!str.l || str.s[0] == tbx->conf.meta_char)) continue;
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (genotype_id_to_idx.count(tokens[3])) continue;
		if (filter_genotype.check(tokens[3])) {
			genotype_id.push_back(tokens[3]);
			genotype_chr.push_back(tokens[0]);
			genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			genotype_end.push_back(atoi(tokens[2].c_str()));
			pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
			genotype_id_to_idx.insert(temp);
			n_includedG++;
		} else n_excludedG_user ++;
	}
	if (itr) tbx_itr_destroy(itr);

	//Finalize & verbose
	free(str.s);
	tbx_destroy(tbx);
	genotype_count += n_includedG;
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	vrb.bullet(stb.str(n_includedG) + " new variants included");
	if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	if (n_includedG  == 0) vrb.leave("Cannot find variants in target region!");
}
Пример #12
0
/*
 * Read the genotypes of `region` from the tabix-indexed BED file fbed.
 *
 * All genotype containers are reset first. The header line maps every sample
 * column (column 6 onwards) to a sample index through findSample(); samples
 * rejected by the sample filter, or for which findSample() returns a negative
 * value, are mapped to -1 and their values ignored. For each variant passing
 * the genotype filter a row of sample_count values is stored; "NA" is stored
 * as bcf_float_missing.
 */
void union_data::readGenotypesBED(string fbed,string region) {
	int n_includedG = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	int n_excludedS = 0;
	int n_missingS = 0;
	vector < int > mappingS;
	genotype_id.clear();
	genotype_chr.clear();
	genotype_start.clear();
	genotype_end.clear();
	genotype_val.clear();
	genotype_count=0;
	genotype_id_to_idx.clear();

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t i0 = 6 ; i0 < tokens.size() ; i0 ++) {
		string sid = tokens[i0];
		if (filter_sample.check(sid)) {
			mappingS.push_back(findSample(sid));
			if (mappingS.back() >= 0) n_includedS ++;
			else n_missingS ++;
		} else {
			mappingS.push_back(-1);
			n_excludedS ++;
		}
	}
	//vrb.bullet(stb.str(n_includedS) + " samples included");
	//if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
	//if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
	//if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");

	unsigned int linecount = 0;

	//Jump to interesting region
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionGenotype.get() + "]");
	//if (!itr) vrb.error("Cannot jump to region!");
	while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (filter_genotype.check(tokens[3])) {
			genotype_id.push_back(tokens[3]);
			genotype_chr.push_back(tokens[0]);
			genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			genotype_end.push_back(atoi(tokens[2].c_str()));
			genotype_val.push_back(vector < float > (sample_count, 0.0));
			//Columns beyond those named in the header have no sample mapping and are ignored
			for (size_t t = 6 ; t < tokens.size() && t - 6 < mappingS.size() ; t ++) {
				const int target = mappingS[t-6];
				if (target < 0) continue;
				if (tokens[t] == "NA") genotype_val.back()[target] = bcf_float_missing;
				else genotype_val.back()[target] = stof(tokens[t]);
			}
			pair < string, int > temp (tokens[3],n_includedG);
			genotype_id_to_idx.insert(temp);
			n_includedG++;
		} else n_excludedG_user ++;
	}
	tbx_itr_destroy(itr);

	//Finalize & verbose
	free(str.s);
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
}
Пример #13
0
Файл: vcf.c Проект: goshng/cocoa
/**
 *  vcf_hdr_read() - read the header of the file @fp
 *
 *  For binary input the call is forwarded to bcf_hdr_read(). For text input
 *  the leading '#' lines are collected up to and including the first line
 *  that starts with a single '#'. When fp->fn_aux is set, a "##contig" line
 *  is generated from each line of that file (first two fields: name, length)
 *  and inserted before that final line. Finally, sequence names present in a
 *  tabix index of fp->fn but missing from the header are added as contigs
 *  with length -1.
 *
 *  Returns the header, or 0 when a non-header line is met before the final
 *  header line, when no header line is found at all, or when fp->fn_aux
 *  cannot be opened.
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
	if (fp->is_bin) return bcf_hdr_read((BGZF*)fp->fp);

	kstring_t txt, *s = &fp->line;
	bcf_hdr_t *h;
	h = bcf_hdr_init();
	txt.l = txt.m = 0; txt.s = 0;
	while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
		if (s->l == 0) continue;
		if (s->s[0] != '#') {
			if (hts_verbose >= 2)
				fprintf(stderr, "[E::%s] no sample line\n", __func__);
			free(txt.s);
			bcf_hdr_destroy(h);
			return 0;
		}
		if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
			int dret;
			gzFile f;
			kstream_t *ks;
			kstring_t tmp;
			tmp.l = tmp.m = 0; tmp.s = 0;
			f = gzopen(fp->fn_aux, "r");
			if (f == 0) { // do not hand a NULL stream to the reader
				if (hts_verbose >= 2)
					fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fp->fn_aux);
				free(txt.s);
				bcf_hdr_destroy(h);
				return 0;
			}
			ks = ks_init(f);
			while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
				int c;
				kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
				ks_getuntil(ks, 0, &tmp, &dret);
				kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
				kputsn(">\n", 2, &txt);
				if (dret != '\n')
					while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
			}
			free(tmp.s);
			ks_destroy(ks);
			gzclose(f);
		}
		kputsn(s->s, s->l, &txt);
		if (s->s[1] != '#') break; // the single-'#' line ends the header
		kputc('\n', &txt);
	}
	if (txt.s == 0) { // nothing was read: there is no text to parse
		if (hts_verbose >= 2)
			fprintf(stderr, "[E::%s] no header\n", __func__);
		bcf_hdr_destroy(h);
		return 0;
	}
	h->l_text = txt.l + 1; // including NULL
	h->text = txt.s;
	bcf_hdr_parse(h);

	// check tabix index, are all contigs listed in the header? add the missing ones
	tbx_t *idx = tbx_index_load(fp->fn);
	if ( idx )
	{
		int i, n, need_sync = 0;
		const char **names = tbx_seqnames(idx, &n);
		for (i=0; i<n; i++)
		{
			bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
			if ( hrec ) continue;
			hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
			hrec->key = strdup("contig");
			bcf_hrec_add_key(hrec, "ID", strlen("ID"));
			bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
			bcf_hrec_add_key(hrec, "length", strlen("length"));
			bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
			bcf_hdr_add_hrec(h, hrec);
			need_sync = 1;
		}
		free(names);
		tbx_destroy(idx);
		if ( need_sync )
		{
			bcf_hdr_sync(h);
			bcf_hdr_fmt_text(h);
		}
	}
	return h;
}
Пример #14
0
/**
 *  query_regions() - print the records of @fname overlapping the given regions
 *  @args:   options: targets_fname, print_header, header_only
 *  @fname:  indexed input file
 *  @regs:   array of @nregs region strings; the strings and the array are
 *           freed before returning
 *  @nregs:  number of regions
 *
 *  BCF input is queried through its index and written to stdout with the hts
 *  writer; VCF, SAM and unrecognised formats are treated as tabix-indexed
 *  text and printed line by line. When args->targets_fname is set, records
 *  not overlapping the targets are dropped. Regions for which no iterator
 *  can be created are skipped.
 *
 *  Always returns 0; failures are reported through error().
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header && bcf_hdr_write(out,hdr)!=0 )
            error("Failed to write the header to stdout\n");
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                if ( !itr ) continue;
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    // a failed write must not go unnoticed, the output would be silently truncated
                    if ( bcf_write(out,hdr,rec)!=0 ) error("Failed to write to stdout\n");
                }
                bcf_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            // header lines are the leading lines starting with the index's meta character
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq = 0;
            const char **seq = NULL;
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
Пример #15
0
/**
 *  bcf_sr_regions_init() - create a regions structure
 *  @regions:  regions string, or the name of a regions file when @is_file is set
 *  @is_file:  non-zero if @regions is a file name
 *  @ichr:     column index of the sequence name, as passed to the line parser
 *  @ifrom:    column index of the start position
 *  @ito:      column index of the end position; when negative, a line that
 *             fails to parse with abs(@ito) is retried with @ifrom as the end column
 *
 *  A file with a tabix index is kept open and only its sequence names are
 *  loaded. A file without an index is read completely; for names ending in
 *  ".bed" or ".bed.gz" (case-insensitive) start positions are incremented by one.
 *
 *  Returns the new structure, or NULL when the file cannot be opened or
 *  parsed, or when it defines no sequence.
 */
bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
{
    bcf_sr_regions_t *reg;
    if ( !is_file ) return _regions_init_string(regions);

    reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
    if ( !reg ) return NULL;
    reg->start = reg->end = -1;
    reg->prev_start = reg->prev_seq = -1;

    reg->file = hts_open(regions, "rb");
    if ( !reg->file )
    {
        fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
        free(reg);
        return NULL;
    }

    reg->tbx = tbx_index_load(regions);
    if ( !reg->tbx )
    {
        // Compare the suffix only when the name is long enough to have one,
        // otherwise the pointer would start before the string
        size_t len = strlen(regions);
        int is_bed = 0;
        if ( len>=4 && !strcasecmp(".bed",regions+len-4) ) is_bed = 1;
        else if ( len>=7 && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;

        if ( reg->file->format.format==vcf ) ito = 1;

        // read the whole file, tabix index is not present
        while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
        {
            char *chr, *chr_end;
            int from, to, ret;
            ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
            if ( ret < 0 )
            {
                if ( ito<0 )
                    ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
                if ( ret<0 )
                {
                    fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
                    hts_close(reg->file); reg->file = NULL;
                    // NOTE(review): regions added so far are not released here - confirm
                    // whether a destroy routine should be used instead
                    free(reg->line.s);
                    free(reg);
                    return NULL;
                }
            }
            if ( !ret ) continue;
            if ( is_bed ) from++;
            *chr_end = 0;       // terminate the sequence name in place ...
            _regions_add(reg, chr, from, to);
            *chr_end = '\t';    // ... and put a tab back afterwards
        }
        hts_close(reg->file); reg->file = NULL;
        if ( !reg->nseqs ) { free(reg->line.s); free(reg); return NULL; }
        return reg;
    }

    reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
    if ( !reg->seq_hash )
        reg->seq_hash = khash_str2int_init();
    int i;
    for (i=0; i<reg->nseqs; i++)
    {
        khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
    }
    reg->fname  = strdup(regions);
    reg->is_bin = 1;
    return reg;
}
Пример #16
0
/**
 *  beds_database_add() - register the tabix-indexed BED database @fname
 *  @opts:     annotation options; a new entry is appended to opts->files and
 *             the INFO definitions used are added to opts->hdr_out
 *  @fname:    database file
 *  @columns:  comma separated list of tags to annotate, each optionally
 *             prefixed with '+' or '-' and/or "INFO/"; NULL selects every
 *             "##INFO=" line found in the database header
 *
 *  The leading '#' lines of the file are read: "##INFO=" lines define the
 *  tags, and a line starting with "#chr" (case-insensitive) names the
 *  tab-separated columns, from which the column index of each tag is taken
 *  (columns 3 and later).
 *
 *  Returns 0. Problems are reported through error(), assumed not to return.
 */
int beds_database_add(struct beds_options *opts, const char *fname, char *columns)
{
    if ( opts->n_files == opts->m_files ) {
        opts->m_files = opts->m_files == 0 ? 2 : opts->m_files +2;
        opts->files = (struct beds_anno_file*)realloc(opts->files, opts->m_files*sizeof(struct beds_anno_file));
    }
    struct beds_anno_file *file = &opts->files[opts->n_files];
    memset(file, 0, sizeof(struct beds_anno_file));
    file->id = opts->n_files;
    file->fname = strdup(fname);
    file->fp = hts_open(fname, "r");
    if (file->fp == NULL)
        error("Failed to open %s : %s", fname, strerror(errno));
    file->idx = tbx_index_load(fname);
    if ( file->idx == NULL)
        error("Failed to load index of %s.", fname);
    opts->n_files++;

    file->last_id = -1;
    file->last_start = -1;
    file->last_end = -1;
    kstring_t string = KSTRING_INIT;
    int no_columns = 0;
    int i;
    if ( columns == NULL && file->no_such_chrom == 0) {
        warnings("No columns string specified for %s. Will annotate all tags in this data.", fname);
        file->no_such_chrom = 1;
        no_columns = 1;
    } else {
        int *splits = NULL;
        kputs(columns, &string);
        int nfields;
        splits = ksplit(&string, ',', &nfields);
        file->m_cols = nfields;
        file->cols = (struct anno_col*)malloc(sizeof(struct anno_col) * file->m_cols);

        for ( i = 0; i < nfields; ++i ) {
            char *ss = string.s + splits[i];
            struct anno_col *col = &file->cols[file->n_cols];
            col->icol = i;
            col->replace = REPLACE_MISSING;
            if (*ss == '+') {
                col->replace = REPLACE_MISSING;
                ss++;
            } else if ( *ss == '-' ) {
                col->replace = REPLACE_EXISTING;
                ss++;
            }
            // an empty field does not define a column
            if (ss[0] == '\0')
                continue;
            if ( strncmp(ss, "INFO/", 5) == 0)
                ss += 5;
            col->hdr_key = strdup(ss);
            col->icol = -1;     // resolved from the "#chr" line below
            file->n_cols++;
        }
        free(splits);           // the offsets array belongs to the caller of ksplit()
        string.l = 0;
    }

    while (1) {
        string.l =0;
        if ( hts_getline(file->fp, KS_SEP_LINE, &string) < 0 )
            break;
        // only accept header line in the beginning for file
        if ( string.s[0] != '#' )
            break;
        if ( strncmp(string.s, "##INFO=", 7) == 0) {
            // The tag ID is expected at offset 11, i.e. after "##INFO=<ID=";
            // a shorter line cannot hold one
            if ( string.l <= 11 )
                continue;
            char *ss = string.s + 11;
            char *se = ss;
            while (*se && *se != ',') se++;     // stop at the end of the line as well
            size_t key_len = se - ss;
            struct anno_col *col = NULL;
            // if no column string specified, init all header lines
            if ( no_columns ) {
                if ( file->n_cols == file->m_cols ) {
                    file->m_cols = file->m_cols == 0 ? 2 : file->m_cols + 2;
                    file->cols = (struct anno_col *) realloc(file->cols, file->m_cols*sizeof(struct anno_col));
                }
                col = &file->cols[file->n_cols++];
                col->icol = -1;
                col->hdr_key = strndup(ss, key_len);
            } else {
                // the whole key must match, not just a common prefix
                for ( i = 0; i < file->n_cols; ++i ) {
                    const char *key = file->cols[i].hdr_key;
                    if ( strlen(key) == key_len && strncmp(key, ss, key_len) == 0)
                        break;
                }
                // if header line is not set in the column string, skip
                if ( i == file->n_cols )
                    continue;
                col = &file->cols[i];
            }

            // specify setter functions here
            col->setter.bed = beds_setter_info_string;

            bcf_hdr_append(opts->hdr_out, string.s);
            bcf_hdr_sync(opts->hdr_out);
            int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID,col->hdr_key);
            assert ( bcf_hdr_idinfo_exists(opts->hdr_out, BCF_HL_INFO, hdr_id) );
        }
        string.l = 0;
        // set column number for each col
        if ( strncasecmp(string.s, "#chr", 4) == 0) {
            int nfields;
            int *splits = ksplit(&string, '\t', &nfields);

            if (nfields < 4) {
                fprintf(stderr, "[error] Bad header of bed database : %s. n_fields : %d, %s", fname, nfields, string.s);
                fprintf(stderr, "[notice] this error usually happened because the header line is seperated by spaces but not tab!");
                exit(1);
            }
            int k;
            for ( k = 3; k < nfields; ++k ) {
                char *ss = string.s + splits[k];
                for (i = 0; i < file->n_cols; ++i ) {
                    struct anno_col *col = &file->cols[i];
                    if ( strcmp(col->hdr_key, ss) == 0)
                        break;
                }
                // if name line specify more names than column string or header, skip
                if ( i == file->n_cols )
                    continue;

                struct anno_col *col = &file->cols[i];
                col->icol = k;
            }
            free(splits);
        }
    }
    for ( i = 0; i < file->n_cols; ++i ) {
        struct anno_col *col = &file->cols[i];
        if ( col->hdr_key && col->icol == -1 )
            error("No column %s found in bed database : %s", col->hdr_key, fname);

        int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID, col->hdr_key);
        assert(hdr_id>-1);
        col->number = bcf_hdr_id2length(opts->hdr_out, BCF_HL_INFO, hdr_id);
        if ( col->number == BCF_VL_A || col->number == BCF_VL_R || col->number == BCF_VL_G)
            error("Only support fixed INFO number for bed database. %s", col->hdr_key);
        col->ifile = file->id;
    }
    if ( string.m )
        free(string.s);
    if ( opts->beds_is_inited == 0 )
        opts->beds_is_inited = 1;
    return 0;
}
Пример #17
0
/**
 * Load phenotype records from a tabix-indexed BED file into the phenotype_* vectors.
 *
 * The first line must start with the index's meta character; its columns 7..N are sample
 * names, each resolved with findSample() (a negative result means the column is ignored).
 * A record is kept when filter_phenotype accepts column 4 (grp_mode == GRP_NONE) or
 * column 5 (any other grp_mode); the value "NA" is stored as bcf_float_missing.
 * When regionPhenotype.chr != "NA" only that region is read through the tabix iterator,
 * otherwise the whole file is streamed and lines starting with the meta character are skipped.
 *
 * @param fbed  path of the BED file; its tabix index is loaded from the same path
 */
void cis_data::readPhenotypes(string fbed) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;
	vector < int > mappingS;

	//Open BED file
	vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}

	//Handles one tokenized record; shared by the region and the whole-file readers
	auto storeRecord = [&]() {
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		//A record with more value columns than the header has samples would index past mappingS
		if (tokens.size() - 6 > mappingS.size()) vrb.error("Incorrect number of columns!");
		if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			//BED start is shifted by one, BED end is stored as given
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
			if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
			phenotype_neg.push_back(tokens[5] == "-");
			if (phenotype_neg.back()) n_negativeStrd ++;
			phenotype_val.push_back(vector < float > (sample_count, 0.0));
			for (size_t t = 6 ; t < tokens.size() ; t ++) {
				const int idx = mappingS[t-6];
				if (idx >= 0) {
					if (tokens[t] == "NA") phenotype_val.back()[idx] = bcf_float_missing;
					else phenotype_val.back()[idx] = stof(tokens[t]);
				}
			}
			n_includedP++;
		} else n_excludedP ++;
	};

	//Read phenotypes
	unsigned int linecount = 0;
	if (regionPhenotype.chr != "NA"){
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			storeRecord();
		}
		tbx_itr_destroy(itr);
	}else{
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			//Skip empty lines and lines starting with the meta character
			if (str.l && str.s[0] != tbx->conf.meta_char) {
				stb.split(string(str.s), tokens);
				storeRecord();
			}
		}
	}

	//Finalize & verbose
	free(str.s);	//line buffer grown by hts_getline/tbx_itr_next is owned by this function
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
Пример #18
0
/**
 * Scan a tabix-indexed BED file and register the coordinates of phenotypes not seen before.
 *
 * No values are read here: for each record whose ID (column 4) is not yet in
 * phenotype_id_to_idx and passes filter_phenotype, the ID, chromosome, start and end are
 * appended and the ID is mapped to the next free index. Records with an already known ID
 * are skipped without being counted. When regionPhenotype.chr != "NA" only that region is
 * read through the tabix iterator, otherwise the whole file is streamed.
 *
 * @param fbed  path of the BED file; its tabix index is loaded from the same path
 */
void union_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header (same <= 0 test as the readPhenotypes functions: negative is EOF/error, 0 is an empty line)
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	//Handles one tokenized record; shared by the region and the whole-file readers
	vector < string > tokens;
	auto scanRecord = [&]() {
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (phenotype_id_to_idx.count(tokens[3])) return;	//already registered by an earlier scan
		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	};

	//Scan file
	unsigned int linecount = 0;
	if (regionPhenotype.chr != "NA"){
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			scanRecord();
		}
		tbx_itr_destroy(itr);
	}else{
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			//Skip empty lines and lines starting with the meta character
			if (str.l && str.s[0] != tbx->conf.meta_char) {
				stb.split(string(str.s), tokens);
				scanRecord();
			}
		}
	}

	//Finalize & verbose
	free(str.s);	//line buffer grown by hts_getline/tbx_itr_next is owned by this function
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " new phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
Пример #19
0
/**
 * Reload the phenotypes of one region from a tabix-indexed BED file.
 *
 * All phenotype_* containers are cleared first. The header's columns 7..N are resolved
 * with findSample() (a negative result means the column is ignored). Each record of
 * `region` accepted by filter_phenotype (column 4) is stored with its values, "NA" being
 * stored as bcf_float_missing, and its ID is mapped to its index in phenotype_id_to_idx.
 *
 * @param fbed    path of the BED file; its tabix index is loaded from the same path
 * @param region  region string handed to tbx_itr_querys(); the iterator is deliberately
 *                not checked (the check is commented out), so a failed query simply
 *                leaves the containers empty
 */
void union_data::readPhenotypes(string fbed, string region) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	vector < int > mappingS;
	phenotype_id.clear();
	phenotype_chr.clear();
	phenotype_start.clear();
	phenotype_end.clear();
	phenotype_val.clear();
	phenotype_count=0;
	phenotype_id_to_idx.clear();
	//Open BED file
	//vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}
	unsigned int linecount = 0;

	//Read phenotypes
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionPhenotype.get() + "]");
	//if (!itr) vrb.error("Cannot jump to region!");
	//Read data
	while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		//A record with more value columns than the header has samples would index past mappingS
		if (tokens.size() - 6 > mappingS.size()) vrb.error("Incorrect number of columns!");
		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			phenotype_val.push_back(vector < float > (sample_count, 0.0));
			for (size_t t = 6 ; t < tokens.size() ; t ++) {
				const int idx = mappingS[t-6];
				if (idx >= 0) {
					if (tokens[t] == "NA") phenotype_val.back()[idx] = bcf_float_missing;
					else phenotype_val.back()[idx] = stof(tokens[t]);
				}
			}
			pair < string, int > temp (tokens[3],n_includedP);
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	}
	tbx_itr_destroy(itr);
	//Finalize & verbose
	free(str.s);	//line buffer grown by hts_getline/tbx_itr_next is owned by this function
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	//vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	//if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	//if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
Пример #20
0
/*
 *  consensus() - stream the reference FASTA line by line and apply VCF records to it.
 *
 *  A '>' line first applies the cached records that still belong to the current
 *  sequence/region, flushes the FASTA buffer and starts a new region. Any other line is
 *  appended to the FASTA buffer, after which records are pulled with next_vcf_line() and
 *  applied for as long as they fall inside the buffered sequence.
 */
static void consensus(args_t *args)
{
    kstring_t fa_line = {0,0,0};
    htsFile *ref = hts_open(args->ref_fname, "rb");
    if ( ref==NULL ) error("Error reading %s\n", args->ref_fname);

    for (;;)
    {
        // a non-positive return (EOF, error or an empty line) ends the stream
        if ( hts_getline(ref, KS_SEP_LINE, &fa_line) <= 0 ) break;

        if ( fa_line.s[0]=='>' )
        {
            // header of a new sequence: drain the cached variants first
            while ( args->vcf_rbuf.n )
            {
                if ( args->chain )
                {
                    print_chain(args);
                    destroy_chain(args);
                }
                bcf1_t *cached = args->vcf_buf[args->vcf_rbuf.f];
                if ( cached->rid!=args->rid ) break;
                if ( args->fa_end_pos && cached->pos > args->fa_end_pos ) break;
                int idx = rbuf_shift(&args->vcf_rbuf);
                apply_variant(args, args->vcf_buf[idx]);
            }
            flush_fa_buffer(args, 0);
            init_region(args, fa_line.s+1);
            continue;
        }

        args->fa_length  += fa_line.l;
        args->fa_src_pos += fa_line.l;

        // the first sequence character decides whether the file is upper- or lowercase
        if ( args->fa_case==-1 )
        {
            if ( toupper(fa_line.s[0])==fa_line.s[0] )
                args->fa_case = 1;
            else
                args->fa_case = 0;
        }

        if ( args->mask && args->rid>=0 ) mask_region(args, fa_line.s, fa_line.l);
        kputs(fa_line.s, &args->fa_buf);

        // Set only when a record needs more sequence than is buffered; in every
        // other case the buffer is flushed after the loop.
        int need_more_fasta = 0;
        while ( args->rid>=0 )
        {
            bcf1_t **slot = next_vcf_line(args);
            if ( !slot ) break;
            bcf1_t *rec = *slot;

            // Record on another sequence, past the region end, or starting at or beyond
            // the end of the buffered sequence: put it back, the buffer can be flushed.
            if ( rec->rid!=args->rid
                 || ( args->fa_end_pos && rec->pos > args->fa_end_pos )
                 || args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos )
            {
                unread_vcf_line(args, slot);
                break;
            }

            // Record starts inside the buffer but extends beyond it: put it back
            // and read more FASTA before flushing.
            if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off < rec->pos + rec->rlen )
            {
                unread_vcf_line(args, slot);
                need_more_fasta = 1;
                break;
            }

            apply_variant(args, rec);
        }
        if ( !need_more_fasta ) flush_fa_buffer(args, 60);
    }

    if ( args->chain )
    {
        print_chain(args);
        destroy_chain(args);
    }
    flush_fa_buffer(args, 0);
    hts_close(ref);
    free(fa_line.s);
}
Пример #21
0
/*
 *  bcf_sr_regions_next() - advance `reg` to the next region.
 *
 *  Sets reg->start and reg->end (reg->iseq too for file-backed regions) and returns 0.
 *  Returns -1 when the regions are exhausted or a line cannot be read or parsed.
 *  Regions come either from the in-memory list reg->regs or from reg->file, read
 *  through the tabix iterator when one is present and as plain text otherwise.
 */
int bcf_sr_regions_next(bcf_sr_regions_t *reg)
{
    if ( reg->iseq<0 ) return -1;
    reg->nals  = 0;
    reg->start = -1;
    reg->end   = -1;

    if ( reg->regs )
    {
        // in-memory regions: step within the current sequence, else move to the next one
        for ( ; reg->iseq < reg->nseqs; reg->iseq++ )
        {
            reg->regs[reg->iseq].creg++;
            if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
        }
        if ( reg->iseq >= reg->nseqs )
        {
            // nothing left
            reg->iseq = -1;
            return -1;
        }
        region1_t *cur = &reg->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
        reg->start = cur->start;
        reg->end   = cur->end;
        return 0;
    }

    // file-backed regions; column indexes are 0-based and default to chr,from,to
    int col_chr = 0, col_beg = 1, col_end = 2, bed_coords = 0;
    if ( reg->tbx )
    {
        col_chr = reg->tbx->conf.sc-1;
        col_beg = reg->tbx->conf.bc-1;
        col_end = reg->tbx->conf.ec-1;
        if ( col_end<0 ) col_end = col_beg;
        bed_coords = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0;
    }

    char *seq_beg, *seq_end;
    int beg, end, status;
    do
    {
        if ( reg->itr )
        {
            // indexed: read the next line of the current chromosome block
            status = tbx_itr_next(reg->file, reg->tbx, reg->itr, &reg->line);
        }
        else
        {
            if ( reg->is_bin )
            {
                // No seek was requested: reopen in text mode so that the file
                // can be streamed with hts_getline
                hts_close(reg->file);
                reg->file = hts_open(reg->fname, "r");
                if ( !reg->file )
                {
                    fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname);
                    reg->file = NULL;
                    bcf_sr_regions_destroy(reg);
                    return -1;
                }
                reg->is_bin = 0;
            }

            // not indexed: stream the whole file
            status = hts_getline(reg->file, KS_SEP_LINE, &reg->line);
        }
        if ( status<0 )
        {
            reg->iseq = -1;
            return -1;
        }

        // a zero result makes the loop read another line
        status = _regions_parse_line(reg->line.s, col_chr,col_beg,col_end, &seq_beg,&seq_end,&beg,&end);
        if ( status<0 )
        {
            fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,col_chr+1,col_beg+1,col_end+1);
            return -1;
        }
    }
    while ( status==0 );

    if ( bed_coords ) beg++;

    // terminate the sequence name in place for the lookup, then restore the separator
    *seq_end = 0;
    if ( khash_str2int_get(reg->seq_hash, seq_beg, &reg->iseq)<0 )
    {
        fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", seq_beg,reg->line.s);
        exit(1);
    }
    *seq_end = '\t';

    reg->start = beg - 1;
    reg->end   = end - 1;
    return 0;
}
Пример #22
0
/*
 *  _reader_fill_buffer() - buffers all records with the same coordinate
 *
 *  Records are stored from reader->buffer[1] on; each new record is read into
 *  buffer[nbuffer+1] and nbuffer is advanced only once it passes the filter.
 *  Reading stops when a record at a position other than buffer[1]->pos has been
 *  buffered, or when the source is exhausted or a record fails to parse; in the
 *  latter two cases the iterator is destroyed and reader->itr is set to NULL.
 */
static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    // Return if the buffer is full: the coordinate of the last buffered record differs
    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;

    // No iterator (sequence not present in this file) and not streaming
    if ( !reader->itr && !files->streaming ) return;

    // Fill the buffer with records starting at the same position
    int i, ret = 0;
    while (1)
    {
        if ( reader->nbuffer+1 >= reader->mbuffer )
        {
            // Increase buffer size
            reader->mbuffer += 8;
            reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
            for (i=8; i>0; i--)     // initialize
            {
                reader->buffer[reader->mbuffer-i] = bcf_init1();
                reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
                reader->buffer[reader->mbuffer-i]->pos = -1;    // for rare cases when VCF starts from 1
            }
        }
        if ( files->streaming )
        {
            if ( reader->file->format.format==vcf )
            {
                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
                // Store the parse status in the loop's own `ret` (it used to be shadowed by a
                // local) so that a parse failure is also seen by the clean-up after the loop
                ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
                if ( ret<0 ) break;
            }
            else if ( reader->file->format.format==bcf )
            {
                if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            }
            else
            {
                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
        }
        else if ( reader->tbx_idx )
        {
            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
            // Do not buffer a record which failed to parse
            ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
            if ( ret<0 ) break;
        }
        else
        {
            if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
        }

        // apply filter
        if ( !reader->nfilter_ids )
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
        else
        {
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
            // rejected: nbuffer is not advanced, the slot is reused by the next read
            if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
        }
        reader->nbuffer++;

        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
    }
    if ( ret<0 )
    {
        // done for this region
        tbx_itr_destroy(reader->itr);
        reader->itr = NULL;
    }
    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
        collapse_buffer(files, reader);
}
Пример #23
0
/*
 *  concat() - write the records of all input files to args->out_fh, in one of three modes:
 *
 *    args->phased_concat  at most two files are kept open in the synced reader; records
 *                         are handed to phased_push() and written via phased_flush()
 *    args->files          overlapping files are combined through the synced reader; with
 *                         args->remove_dups only the first available line per site is written
 *    otherwise            files are streamed one after another; VCF input with VCF output
 *                         is copied line by line without parsing, anything else goes
 *                         through bcf_read()/bcf_write()
 *
 *  In the last mode each chromosome block must be contiguous and sorted across the files,
 *  otherwise error() is called.
 */
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                //  Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                bcf_write1(args->out_fh, args->out_hdr, line);
                if ( args->remove_dups ) break;
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        // prev_chr_id and prev_pos carry over from one file to the next, so the
        // contiguity and sort checks also span file boundaries
        int prev_chr_id = -1, prev_pos = -1;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    // first column: sequence name
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    // second column: 1-based position, converted to 0-based
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;
                    prev_pos    = pos;      // without this update the sort check above could never trigger

                    // fp->line.l is a size_t, it must not be passed to %d
                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %lu bytes\n", (unsigned long)fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;
                    prev_pos    = line->pos;    // without this update the sort check above could never trigger

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
Пример #24
0
/**
 * tabix workhorse function
 *
 * Serves GET requests for files ending in ".gz" which have a ".tbi" index next to them.
 * Query parameters read here:
 *   format   "xml", "json"/"jsonp" (with optional "callback") or "html"; any other value
 *            selects the plain handler. The request is declined when it is missing.
 *   limit    maximum number of records to emit (-1 disables the limit)
 *   region   optional region string; when it is non-empty only the records returned by
 *            the tabix iterator are emitted, otherwise the whole file is streamed
 *
 * The leading lines starting with the index's meta character are emitted as the header,
 * everything after them as the body.
 *
 * Returns DECLINED when the request is not for this handler, HTTP_NOT_FOUND when the
 * index or the file is missing, HTTP_INTERNAL_SERVER_ERROR when the index cannot be
 * loaded, and OK otherwise.
 */
static int tabix_handler(request_rec *r)
    {
    htsFile *fp=NULL;
    hts_itr_t *itr=NULL;
    kstring_t line = {0,0,0};
    int print_header=1;
    int print_body=1;
    struct tabix_callback_t handler;
    int http_status=OK;

    memset((void*)&handler,0,sizeof(struct tabix_callback_t));
    handler.r=r;
    handler.limit=DEFAULT_LIMIT_RECORDS;

    if (!r->handler || strcmp(r->handler, "tabix-handler")) return (DECLINED);
    if (strcmp(r->method, "GET")!=0) return DECLINED;
    if(r->canonical_filename==NULL)  return DECLINED;
     /* file must be b-gzipped */
    if( !(
    	str_ends_with(r->canonical_filename,".gz")
       	))  return DECLINED;
    /* file must be indexed with tabix */
    if( !(
    	fileExtExists(r->canonical_filename,".tbi")
       	))  return HTTP_NOT_FOUND;

    handler.httParams = HttpParamParseGET(r);
    if(handler.httParams==NULL) return DECLINED;
    handler.file_format=E_FORMAT_UNDEFINED;
    if(str_ends_with(r->canonical_filename,".vcf.gz"))
    	{
    	handler.file_format=E_FORMAT_VCF;
    	}
    else if(str_ends_with(r->canonical_filename,".bed.gz"))
    	{
    	handler.file_format=E_FORMAT_BED;
    	}

    /* only one loop, we use this to cleanup the code, instead of using a goto statement */
    do	{
    	const char* format=HttpParamGet(handler.httParams,"format");
    	const char* limit=HttpParamGet(handler.httParams,"limit");
    	const char* region=HttpParamGet(handler.httParams,"region");
    	int iterator_was_requested=FALSE;
    	/* TRUE when the header loop stopped on a line which belongs to the body */
    	int pending_record=FALSE;

    	if(limit!=NULL)
    		{
    		handler.limit=atol(limit);
    		}

    	if(format==NULL)
    		{
    		http_status=DECLINED;
    		break;
    		}
    	else if(strcmp(format,"xml")==0)
    	 	{
    	 	SETUP_HANDLER(xml);
    	 	}
    	else if(strcmp(format,"json")==0 || strcmp(format,"jsonp")==0)
    	 	{
    	 	handler.jsonp_callback=HttpParamGet(handler.httParams,"callback");
    	 	SETUP_HANDLER(json);
    	 	}
    	else if(strcmp(format,"html")==0)
    	 	{
    	 	SETUP_HANDLER(html);
    	 	}
    	else
    	 	{
    	 	SETUP_HANDLER(plain);
    	 	}

    	fp=hts_open(r->canonical_filename,"r");
    	if(fp==NULL)
    		{
    		http_status=HTTP_NOT_FOUND;
    		break;
    		}
    	//read index
    	handler.tbx = tbx_index_load(r->canonical_filename);
    	if(handler.tbx==NULL)
    		{
    		http_status=HTTP_INTERNAL_SERVER_ERROR;
    		break;
    		}
    	if(region!=NULL && !str_is_empty(region))
    		{
    		iterator_was_requested=TRUE;
    		itr = tbx_itr_querys(handler.tbx,region);
    		}

    	handler.startdocument(&handler);
    	if(print_header)
    	    {
    	    handler.startheader(&handler);
    	    while ( hts_getline(fp, KS_SEP_LINE, &line) >= 0 )
    	        {
    	        if ( !line.l || line.s[0]!=handler.tbx->conf.meta_char )
    	            {
    	            /* not a header line: it has been consumed from the stream, so
    	               remember it, or the first record would be lost when streaming */
    	            pending_record=TRUE;
    	            break;
    	            }
    	        handler.header(&handler,&line);
    	        handler.count++;
    	        }
    	    handler.enddheader(&handler);
    	    }
    	handler.count=0;//Reset
    	if(print_body)
    	    {
    	    handler.startbody(&handler);
    	    if(iterator_was_requested)
    	        {
    	        /* a region which could not be resolved yields an empty body */
    	        if(itr!=NULL)
    	            {
    	            while ((handler.limit==-1 || handler.count< handler.limit) && tbx_itr_next(fp, handler.tbx, itr, &line) >= 0)
    	                {
    	                if(handler.show(&handler,&line)<0) break;
    	                handler.count++;
    	                }
    	            }
    	        }
    	    else
    	        {
    	        int keep_going=TRUE;
    	        /* emit the line read ahead by the header loop before reading further */
    	        if(pending_record && (handler.limit==-1 || handler.count< handler.limit))
    	            {
    	            if(handler.show(&handler,&line)<0) keep_going=FALSE;
    	            else handler.count++;
    	            }
    	        while (keep_going && (handler.limit==-1 || handler.count< handler.limit) &&
    	               hts_getline(fp, KS_SEP_LINE, &line) >= 0)
    	            {
    	            if(handler.show(&handler,&line)<0) break;
    	            handler.count++;
    	            }
    	        }
    	    handler.endbody(&handler);
    	    }
    	handler.enddocument(&handler);
    	} while(0);/* always abort */

    //cleanup
    if(itr!=NULL) tbx_itr_destroy(itr);
    HttpParamFree(handler.httParams);
    free(line.s);
    if(fp!=NULL) hts_close(fp);
    if(handler.tbx!=NULL) tbx_destroy(handler.tbx);
    return http_status;
    }