Example #1
0
/**
 * Initialize next interval.
 *
 * Walks the remaining entries of `intervals`, starting at `interval_index`,
 * and stops at the first one for which the index query returns an iterator.
 * Every interval that is examined is consumed (interval_index is advanced
 * past it), whether or not an iterator could be created for it.
 *
 * Returns false only if all intervals are accessed.
 */
bool BCFOrderedReader::initialize_next_interval()
{
    while (interval_index!=intervals.size())
    {
        // Always consume the interval, even for file types that cannot be
        // queried below; otherwise interval_index would never advance and
        // this loop would not terminate for such files.
        intervals[interval_index++].to_string(&s);

        if (ftype.format==bcf)
        {
            itr = bcf_itr_querys(idx, hdr, s.s);
        }
        else if (ftype.format==vcf && ftype.compression==bgzf)
        {
            itr = tbx_itr_querys(tbx, s.s);
        }
        else
        {
            // file type not handled here: leave itr untouched
            continue;
        }

        if (itr)
        {
            return true;
        }
    }

    return false;
};
Example #2
0
/**
 * Restrict reading to a single interval.
 *
 * When an index is loaded, the interval list is replaced by @interval alone
 * and an iterator over it is requested (BCF index for bcf files, tabix index
 * for bgzf-compressed vcf files).
 *
 * @interval - the genome interval to jump to.
 *
 * Returns true if an iterator was obtained; false if no index is loaded,
 * the file type is not one of the two handled here, or the query failed.
 */
bool BCFOrderedReader::jump_to_interval(GenomeInterval& interval)
{
    if (!index_loaded)
    {
        return false;
    }

    intervals_present = true;
    random_access_enabled = true;
    intervals.clear();
    intervals.push_back(interval);
    interval_index = 0;

    const bool is_bcf = (ftype.format==bcf);
    const bool is_bgzf_vcf = !is_bcf && (ftype.format==vcf && ftype.compression==bgzf);
    if (!is_bcf && !is_bgzf_vcf)
    {
        // nothing is consumed for file types without random access support
        return false;
    }

    intervals[interval_index++].to_string(&s);
    if (is_bcf)
    {
        itr = bcf_itr_querys(idx, hdr, s.s);
    }
    else
    {
        itr = tbx_itr_querys(tbx, s.s);
    }

    if (itr)
    {
        return true;
    }

    return false;
};
Example #3
0
/**
 * Read the next record into `line`.
 *
 * After a jump (has_jumped set) only the current iterator is consumed.
 * Otherwise the whole file is walked: when the iterator for the current
 * sequence is exhausted, the following entries of `chroms` are queried in
 * turn until one of them yields a record.
 *
 * @param line  receives the record text; left untouched when false is returned.
 * @return true if a record was read, false if there is nothing (more) to read.
 */
bool Tabix::getNextLine(string& line) {
    if (iter == NULL) {
        return false;
    }

    kstring_t str = {0,0,0};
    bool found = tbx_itr_next(fn, tbx, iter, &str) >= 0;

    if (!found && !has_jumped) {
        // we've never jumped, so step through all remaining sequences in the file.
        // The increment is guarded so that calling this function again after the
        // last sequence was exhausted does not advance past chroms.end().
        if (current_chrom != chroms.end()) {
            ++current_chrom;
        }
        while (!found && current_chrom != chroms.end()) {
            tbx_itr_destroy(iter);
            iter = tbx_itr_querys(tbx, current_chrom->c_str());
            if (iter && tbx_itr_next(fn, tbx, iter, &str) >= 0) {
                found = true;
            } else {
                ++current_chrom;
            }
        }
    }

    if (found) {
        line = string(str.s);
    }
    // the kstring buffer is owned by this function; release it on every path
    free(str.s);
    return found;
}
Example #4
0
/*
 * Advance the targets to their next sequence.
 *
 * Bumps tgt->cseq; if that runs past the last sequence, NULL is returned and
 * nothing else is touched. Otherwise any existing iterator is destroyed, a
 * new one is created for the sequence, tpos.to is reset to -1, and the
 * sequence name is returned.
 */
static char *tgt_next_seq(regions_t *tgt)
{
    tgt->cseq++;
    if ( tgt->cseq >= tgt->nseqs )
        return NULL;

    if ( tgt->itr )
    {
        tbx_itr_destroy(tgt->itr);
    }

    tgt->itr = tbx_itr_querys(tgt->tbx, tgt->seq_names[tgt->cseq]);
    tgt->tpos.to = -1;

    return tgt->seq_names[tgt->cseq];
}
Example #5
0
/// Point the streamer at `region`.
///
/// Any iterator currently held is destroyed and replaced by a new tabix
/// query for `region`. The stream is flagged as ended when the query does
/// not produce an iterator.
void
hts_streamer::
resetRegion(
    const char* region)
{
    if (_titr)
    {
        tbx_itr_destroy(_titr);
    }

    _titr = tbx_itr_querys(_tidx, region);

    if (_titr)
    {
        _is_stream_end = false;
    }
    else
    {
        _is_stream_end = true;
    }
}
Example #6
0
/**
 * Open a bgzip-compressed file together with its tabix index.
 *
 * The constructor terminates the process with exit(1) when the file is not
 * bgzip-compressed, when the ".tbi" index is older than the data file, or
 * when the data file or its index cannot be opened/loaded.
 *
 * On success the sequence names from the index are copied into `chroms` and
 * the iterator is positioned on the first sequence (or left NULL when the
 * index lists no sequences).
 *
 * @param file  path of the data file; the index is looked up as file + ".tbi"
 *              for the timestamp check.
 */
Tabix::Tabix(string& file) {
    has_jumped = false;
    filename = file;
    const char* cfilename = file.c_str();

    if ( bgzf_is_bgzf(cfilename)!=1 )
    {
        cerr << "[tabix++] was bgzip used to compress this file? " << file << endl;
        exit(1);
    }

    // Common source of errors: new VCF is used with an old index.
    // The timestamps are only compared when both stat() calls succeed; if either
    // file cannot be stat'ed the structs are not filled in, and the open/load
    // calls below report the actual problem.
    const string fnidx = file + ".tbi";
    struct stat stat_tbi,stat_vcf;
    if ( stat(fnidx.c_str(), &stat_tbi)==0 && stat(cfilename, &stat_vcf)==0
         && stat_vcf.st_mtime > stat_tbi.st_mtime )
    {
        cerr << "[tabix++] the index file is older than the vcf file. Please use '-f' to overwrite or reindex." << endl;
        exit(1);
    }

    if ((fn = hts_open(cfilename, "r")) == 0) {
        cerr << "[tabix++] fail to open the data file." << endl;
        exit(1);
    }

    if ((tbx = tbx_index_load(cfilename)) == NULL) {
        cerr << "[tabix++] failed to load the index file." << endl;
        exit(1);
    }

    // copy the sequence names out of the index; only the array returned by
    // tbx_seqnames is freed here, not the strings it points to
    int nseq;
    const char** seq = tbx_seqnames(tbx, &nseq);
    for (int i=0; i<nseq; i++) {
        chroms.push_back(seq[i]);
    }
    free(seq);

    idxconf = &tbx_conf_vcf;

    // set up the iterator, defaults to the beginning
    if (nseq == 0){
        // the vcf file contains only the header according
        // to the index
        iter = NULL;
        current_chrom = chroms.end();
    }else{
        current_chrom = chroms.begin();
        iter = tbx_itr_querys(tbx, current_chrom->c_str());
    }
}
Example #7
0
/**
 * Initialize buffer for next interval.
 * This should only be invoked if the buffer is empty.
 * Returns true if successful.
 */
bool BCFSyncedReader::initialize_next_interval()
{
    if (random_access)
    {
        while (intervals_index < intervals.size())
        {
            GenomeInterval interval = intervals[intervals_index++];

            for (size_t i=0; i<nfiles; ++i)
            {
                hts_itr_destroy(itrs[i]);
                itrs[i] = 0;
                interval.to_string(&s);

                if (ftypes[i].format==bcf)
                {
                    itrs[i] = bcf_itr_querys(idxs[i], hdrs[i], s.s);
                }
                else if (ftypes[i].format==vcf)
                {
                    itrs[i] = tbx_itr_querys(tbxs[i], s.s);
                }

                fill_buffer(i);
            }

            //make sure pq is not empty
            //it is possible for the pq to be empty as iterators may be returned
            //as the sequence might be a valid sequence stated in the header
            if (pq.size()!=0)
            {
                return true;
            }
        }

        return false;
    }
    else
    {
        for (size_t i=0; i<nfiles; ++i)
        {
            fill_buffer(i);
        }

        if (pq.size()!=0)
        {
            return true;
        }

        return false;
    }
}
Example #8
0
/**
 * Move on to the next interval that the tabix index can serve.
 *
 * Intervals are consumed in order; each one is turned into a region string
 * and queried. The first successful query wins.
 *
 * Returns false only once every interval has been consumed.
 */
bool TBXOrderedReader::initialize_next_interval()
{
    for (;;)
    {
        if (interval_index==intervals.size())
        {
            return false;
        }

        intervals[interval_index++].to_string(&s);
        itr = tbx_itr_querys(tbx, s.s);
        if (!itr)
        {
            continue;
        }

        return true;
    }
};
Example #9
0
/*
 * Position the regions reader on sequence `seq`.
 *
 * The current position (iseq/start/end) is reset first. The sequence is then
 * looked up in reg->seq_hash. For in-memory regions (reg->regs set) that is
 * all; for tabix-backed regions a fresh iterator over `seq` replaces the
 * old one.
 *
 * Returns 0 on success, -1 if the sequence is not among the regions or the
 * tabix query did not yield an iterator.
 */
int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
{
    reg->iseq = reg->start = reg->end = -1;

    int lookup = khash_str2int_get(reg->seq_hash, seq, &reg->iseq);
    if ( lookup < 0 )
    {
        // sequence seq not in regions
        return -1;
    }

    if ( reg->regs )
    {
        // using in-memory regions
        return 0;
    }

    // reading regions from tabix
    if ( reg->itr )
    {
        tbx_itr_destroy(reg->itr);
    }
    reg->itr = tbx_itr_querys(reg->tbx, seq);

    return reg->itr ? 0 : -1;
}
Example #10
0
/**
 * Collect the header lines of the file into `header`.
 *
 * Lines are read from `fn` for as long as they are non-empty and start with
 * the index's meta character; each is appended followed by a newline.
 * Afterwards, if an iterator exists, it is reset to the first sequence.
 *
 * @param header  cleared on entry; receives the concatenated header lines.
 */
void Tabix::getHeader(string& header) {
    header.clear();
    kstring_t str = {0,0,0};
    while ( hts_getline(fn, KS_SEP_LINE, &str) >= 0 ) {
        if ( !str.l || str.s[0]!=tbx->conf.meta_char ) {
            break;
        } else {
            header += string(str.s);
            header += "\n";
        }
    }
    // the line buffer is owned by this function; release it
    free(str.s);

    // set back to start
    if (iter != NULL){
        current_chrom = chroms.begin();
        tbx_itr_destroy(iter);
        iter = tbx_itr_querys(tbx, current_chrom->c_str());
    }
}
Example #11
0
/**
 * Restrict reading to a single interval.
 *
 * When an index is loaded, the interval list is replaced by @interval alone
 * and the tabix index is queried for it.
 *
 * @interval - the genome interval to jump to.
 *
 * Returns true if an iterator was obtained; false if no index is loaded or
 * the query failed.
 */
bool TBXOrderedReader::jump_to_interval(GenomeInterval& interval)
{
    if (!index_loaded)
    {
        return false;
    }

    intervals_present = true;
    random_access_enabled = true;

    // the requested interval becomes the one and only entry
    intervals.clear();
    intervals.push_back(interval);
    interval_index = 0;

    intervals[interval_index++].to_string(&s);
    itr = tbx_itr_querys(tbx, s.s);

    if (!itr)
    {
        return false;
    }

    return true;
};
Example #12
0
/**
 * Initialize buffer for next interval.
 * This should only be invoked if the buffer is empty.
 * Returns true if successful.
 */
bool BCFSyncedStreamReader::initialize_next_interval()
{
    while (intervals_index < intervals.size())
    {
        GenomeInterval interval = intervals[intervals_index++];

        for (int32_t i = 0; i<nfiles; ++i)
        {
            int32_t ftype = hts_file_type(vcf_files[i].c_str());
            hts_itr_destroy(itrs[i]);
            itrs[i] = 0;
            interval.to_string(&s);

            if (ftype==FT_BCF_GZ)
            {
                itrs[i] = bcf_itr_querys(idxs[i], hdrs[i], s.s);
            }
            else if (ftype==FT_VCF_GZ)
            {
                itrs[i] = tbx_itr_querys(tbxs[i], s.s);
            }

            fill_buffer(i);
        }

        //make sure pq is not empty
        //it is possible for the pq to be empty as iterators may be returned
        //as the sequence might be a valid sequence stated in the header
        if (pq.size()!=0)
        {
            return true;
        }
    }

    return false;
}
Example #13
0
/*
 * Advance the synchronized readers to the next position and, for every reader
 * that has a matching record there, move that record into reader->buffer[0].
 *
 * Buffer layout as used below: buffer[0] is the slot for the selected
 * ("current") record, buffer[1..nbuffer] hold the records read ahead.
 *
 * The first loop runs until some reader has a buffered record (min_pos set).
 * When every reader has neither an iterator nor buffered records, the next
 * sequence is opened on all readers (taken from files->targets if set,
 * otherwise from files->seqs). When targets are set, the candidate position
 * is additionally checked with tgt_has_position().
 *
 * Returns 0 once all sequences have been scanned; otherwise a bit mask in
 * which bit i is set if reader i had a record selected into its buffer[0].
 */
int bcf_sr_next_line(readers_t *files)
{
    // INT_MAX doubles as the "no position found yet" marker
    int32_t min_pos = INT_MAX;
    int ret,i,j;
    kstring_t *str = &files->tmps;

    while ( min_pos==INT_MAX )
    {
        // Need to open new chromosome?
        // eos counts the readers that have no iterator and an empty buffer
        int eos = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
        if ( eos==files->nreaders )
        {
            const char *seq;
            if ( files->targets )
            {
                seq = tgt_next_seq(files->targets);
                if ( !seq ) return 0;   // all chroms scanned
            }
            else
            {
                if ( ++files->iseq >= files->nseqs ) return 0;  // all chroms scanned
                seq = files->seqs[files->iseq];
            }
            // create an iterator over the new sequence for every reader;
            // readers with a tabix index use it, the others use the BCF query
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                if ( reader->tbx )
                    reader->itr = tbx_itr_querys(reader->tbx,seq);
                else
                    reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq);
            }
        }

        // Find the smallest coordinate
        for (i=0; i<files->nreaders; i++)
        {
            reader_t *reader = &files->readers[i];
            // the buffer counts as full once its last record sits at a different position than its first
            int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0;
            if ( reader->itr && !buffer_full )
            {
                // Fill the buffer with records starting at the same position
                while (1)
                {
                    // grow the buffer in steps of 8 slots, pre-allocating the new records
                    if ( reader->nbuffer+1 >= reader->mbuffer ) 
                    {
                        reader->mbuffer += 8;
                        reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
                        for (j=8; j>0; j--)
                            reader->buffer[reader->mbuffer-j] = bcf_init1();
                    }
                    // read the next record into the first free slot, buffer[nbuffer+1]
                    if ( reader->tbx )
                    {
                        ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str);
                        if ( ret<0 ) break;
                        vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]);
                    }
                    else
                    {
                        ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]);
                        if ( ret<0 ) break;
                    }
                    bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
                    // apply filter
                    // (a record is skipped when a filter id is set, the record has filters,
                    //  and its first filter differs from that id; nbuffer is not advanced,
                    //  so the slot is reused by the next read)
                    if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue;
                    set_variant_types(reader->buffer[reader->nbuffer+1]);
                    reader->nbuffer++;
                    // stop as soon as a record at a new position has been buffered
                    if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;
                }
                if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome
            }
            if ( reader->nbuffer )
            {
                if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos; 
            }
            // The buffer is full - either there is nothing else to read or the last record has a different coordinate
            if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
            {
                collapse_buffer(files, reader);
            }
        }
        if ( files->targets && min_pos!=INT_MAX )
        {
            // NOTE: this 'ret' shadows the function-level 'ret' within this block
            int ret = tgt_has_position(files->targets, min_pos);
            // ret==1: keep min_pos; the loop condition then ends the outer while
            if ( ret==1 ) continue;

            // The position must be skipped
            if ( ret==-1 )
            {
                // done for this chromosome, don't read the rest
                for (i=0; i<files->nreaders; i++) 
                {
                    files->readers[i].nbuffer = 0;
                    if ( files->readers[i].itr )
                    {
                        tbx_itr_destroy(files->readers[i].itr);
                        files->readers[i].itr = NULL;
                    }
                }
                min_pos = INT_MAX;
                continue;
            }

            // remove the active line, save the buffer line
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                // j ends up as the index of the first buffered record not at min_pos
                for (j=1; j<=reader->nbuffer; j++)
                    if ( reader->buffer[j]->pos!=min_pos ) break;
                if ( j==1 ) continue;
                if ( j<=reader->nbuffer )
                {
                    // keep only that record, swapped into slot 1
                    bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp;
                    reader->nbuffer = 1;
                }
                else 
                    reader->nbuffer = 0;
            }
            min_pos = INT_MAX;
        }
    }

    //printf("[next_line] min_pos=%d\n", min_pos+1);
    //debug_buffers(files);

    // Set the current line
    ret = 0;
    // 'first' is the record of the first reader found at min_pos; later readers are matched against it
    bcf1_t *first = NULL;
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue;

        // Match the records by REF and ALT
        // irec is the buffer index of the matching record, -1 if none
        int j, irec = -1;
        if ( first )
        {
            for (j=1; j<=reader->nbuffer; j++)
            {
                bcf1_t *line = reader->buffer[j];
                if ( min_pos != line->pos ) break;  // done with this buffer

                if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; }  // checking position only
                if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; }
                if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; }

                if ( first->rlen != line->rlen ) continue;  // REFs do not match
                if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match
                int ial,jal;
                if ( files->collapse==COLLAPSE_NONE )
                {
                    // require exact match, all alleles must be identical
                    if ( first->n_allele!=line->n_allele ) continue;   // different number of alleles
                    int nmatch = 1; // REF has been already checked
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                    }
                    if ( nmatch>=first->n_allele ) { irec=j; break; }
                }
                else
                {
                    // thorough check: the REFs and some of the alleles have to be shared
                    // (neglecting different representations of the same indel for now)
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; }
                        if ( irec>=1 ) break;
                    }
                }
                if ( irec>=1 ) break;
            }
            if ( irec==-1 ) continue;
        }
        else 
        {
            first = reader->buffer[1];
            irec  = 1;
        }
        // Move the selected record into slot 0, shift the records after it down by one,
        // and park the old slot-0 record at the end so its allocation is reused.
        bcf1_t *tmp = reader->buffer[0];
        reader->buffer[0] = reader->buffer[irec];
        for (j=irec+1; j<=reader->nbuffer; j++)
            reader->buffer[j-1] = reader->buffer[j];
        reader->buffer[ reader->nbuffer ] = tmp;
        reader->nbuffer--;
        // flag reader i as having a record at this position
        ret |= 1<<i;
    }
    // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret);
    // debug_buffers(stdout,files);

    return ret;
}
Example #14
0
/**
 * tabix workhorse function
 *
 * Apache request handler. It serves GET requests handled by "tabix-handler"
 * whose canonical filename ends in ".gz" and has a ".tbi" companion.
 *
 * Query parameters read below:
 *   format - selects the output callbacks (xml, json/jsonp, html; anything
 *            else falls back to plain). A missing format declines the request.
 *   limit  - maximum number of records; -1 is treated as "no limit".
 *   region - if present and non-empty, records are fetched through a tabix
 *            iterator for that region; otherwise the file is read sequentially.
 *   callback - stored as jsonp_callback for the json/jsonp formats.
 *
 * Returns DECLINED for requests this handler does not serve, 404 when the
 * index file is missing, HTTP_NOT_FOUND / HTTP_INTERNAL_SERVER_ERROR when the
 * data file / index cannot be opened, and OK otherwise.
 */
static int tabix_handler(request_rec *r)
    {
    htsFile *fp=NULL;
    hts_itr_t *itr=NULL;
    kstring_t line = {0,0,0};
    int print_header=1;
    int print_body=1;
    struct tabix_callback_t handler;
    int http_status=OK;
	
	/* zero the callback struct so that the cleanup code below can test its pointers */
	memset((void*)&handler,0,sizeof(struct tabix_callback_t));
    handler.r=r;
    handler.limit=DEFAULT_LIMIT_RECORDS;
	
    /* early exits: nothing has been allocated yet, so a plain return is fine */
    if (!r->handler || strcmp(r->handler, "tabix-handler")) return (DECLINED);
    if (strcmp(r->method, "GET")!=0) return DECLINED;
    if(r->canonical_filename==NULL)  return DECLINED;
     /* file must be b-gzipped */
    if( !(
    	str_ends_with(r->canonical_filename,".gz")
       	))  return DECLINED;
    /* file must be indexed with tabix */
    if( !(
    	fileExtExists(r->canonical_filename,".tbi")
       	))  return 404;
   
    
   
    handler.httParams = HttpParamParseGET(r); 
    if(handler.httParams==NULL) return DECLINED;
    /* guess the file format from the file name; stays undefined for other suffixes */
    handler.file_format=E_FORMAT_UNDEFINED;
    if(str_ends_with(r->canonical_filename,".vcf.gz"))
    	{
    	handler.file_format=E_FORMAT_VCF;
    	}
    else if(str_ends_with(r->canonical_filename,".bed.gz"))
    	{
    	handler.file_format=E_FORMAT_BED;
    	}
    
    /* only one loop, we use this to cleanup the code, instead of using a goto statement */
    do	{
    	const char* format=HttpParamGet(handler.httParams,"format");
    	const char* limit=HttpParamGet(handler.httParams,"limit");
    	const char* region=HttpParamGet(handler.httParams,"region");
    	int iterator_was_requested=FALSE;
    	
    	
    	if(limit!=NULL)
    		{
    		handler.limit=atol(limit);
    		}
    	
    	/* choose the output callbacks from the requested format */
    	if(format==NULL)
    		{
    		http_status=DECLINED;
    		break;
    		}
    	else if(strcmp(format,"xml")==0)
    	 	{
    	 	SETUP_HANDLER(xml);
    	 	}

    	 else if(strcmp(format,"json")==0 || strcmp(format,"jsonp")==0)
    	 	{
    	 	handler.jsonp_callback=HttpParamGet(handler.httParams,"callback");
    	 	SETUP_HANDLER(json);
    	 	}
    	 else if(strcmp(format,"html")==0)
    	 	{
    	 	SETUP_HANDLER(html);
    	 	}
    	 else
    	 	{
    	 	SETUP_HANDLER(plain);
    	 	}
    	
    	fp=hts_open(r->canonical_filename,"r");
    	if(fp==NULL)
    		{
    		http_status=HTTP_NOT_FOUND;
    		break;
    		}
    	//read index
    	handler.tbx = tbx_index_load(r->canonical_filename);
    	if(handler.tbx==NULL)
			{
			http_status=HTTP_INTERNAL_SERVER_ERROR;
			break;
			}
    	/* a non-empty region switches to indexed access; itr may still be NULL if the query fails */
    	if(region!=NULL && !str_is_empty(region))
    		{
    		iterator_was_requested=TRUE;
    		itr = tbx_itr_querys(handler.tbx,region);
    		}

	
    	handler.startdocument(&handler);
    	if(print_header)
    	    {
    	    handler.startheader(&handler);
    	    /* header lines: non-empty lines starting with the index's meta character */
    	    while ( hts_getline(fp, KS_SEP_LINE, &line) >= 0 )
    	            {
		    if ( !line.l || line.s[0]!=handler.tbx->conf.meta_char ) break;
		    handler.header(&handler,&line);
		    handler.count++;
    	            }
    	    handler.enddheader(&handler);
    	    }
    	handler.count=0;//Reset 
    	if(print_body)
    	    {
    	    handler.startbody(&handler);
		    if(iterator_was_requested)
				{
				/* region requested: emit records from the iterator; nothing is emitted if the query failed */
				if(itr!=NULL)
					{
					while ((handler.limit==-1 || handler.count< handler.limit) && tbx_itr_next(fp, handler.tbx, itr, &line) >= 0)
						{
						if(handler.show(&handler,&line)<0) break;
						handler.count++;
						}
					}
		
				}
		    else
				{
				/* no region: continue reading the file sequentially from where the header loop stopped */
				while ((handler.limit==-1 || handler.count< handler.limit) && \
					hts_getline(fp, KS_SEP_LINE, &line) >= 0)
					{
					if(handler.show(&handler,&line)<0) break;
					handler.count++;
					}
				}
	   	 handler.endbody(&handler);
    	    }
	handler.enddocument(&handler);
    	} while(0);/* always abort */
    
    
    //cleanup
    /* reached both on success and after any 'break' above */
    if(itr!=NULL) tbx_itr_destroy(itr);
    HttpParamFree(handler.httParams);
    free(line.s);
    if(fp!=NULL) hts_close(fp);
    if(handler.tbx!=NULL) tbx_destroy(handler.tbx);
    return http_status;
    }
Example #15
0
File: tabix.c Project: Illumina/akt
/*
 * Print the records of `fname` that overlap the given regions.
 *
 * For BCF input the records are written as BCF to standard output through
 * htslib; for VCF/SAM/unknown text input the matching lines are printed
 * with puts(). If args->targets_fname is set, a record is only printed when
 * it also overlaps the loaded targets index. Regions for which no iterator
 * can be created are skipped in both branches.
 *
 * args   - options read here: targets_fname, print_header, header_only
 * fname  - path of the indexed input file
 * regs   - array of `nregs` region strings; each string and the array itself
 *          are freed before returning
 *
 * Returns 0. Failures are reported through error().
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
            bcf_hdr_write(out,hdr);
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                // a region that cannot be resolved yields no iterator; skip it
                // instead of handing NULL to bcf_itr_next (same as the text branch)
                if ( !itr ) continue;
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    bcf_write(out,hdr,rec);
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            // header lines: non-empty lines starting with the index's meta character
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            // sequence names are only needed to check against the targets index
            const char **seq = NULL;
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    // the region strings are owned by this function from here on
    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
Example #16
0
/*
 * Entry point of the "tabix" sub-command.
 *
 * Depending on the arguments it works in one of three modes:
 *   -a                    print every line of the bgzf file, no index needed
 *   <in.gz>               build an index for the file (.tbi, or .csi if -m > 0)
 *   <in.gz> reg1 [...]    print the records overlapping each region
 *
 * The preset (-p) overrides the column options; without -p the preset is
 * guessed from the file name suffix when indexing.
 *
 * Returns 0 on success and 1 on usage errors or when a file or index cannot
 * be opened, loaded or built.
 */
int main_tabix(int argc, char *argv[])
{
    int c, min_shift = -1, is_force = 0, is_all = 0;
    tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
    while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
        if (c == '0') conf.preset |= TBX_UCSC;
        else if (c == 'f') is_force = 1;
        else if (c == 'a') is_all = 1;
        else if (c == 'm') min_shift = atoi(optarg);
        else if (c == 's') conf.sc = atoi(optarg);
        else if (c == 'b') conf.bc = atoi(optarg);
        else if (c == 'e') conf.ec = atoi(optarg);
        else if (c == 'c') conf.meta_char = *optarg;
        else if (c == 'S') conf.line_skip = atoi(optarg);
        else if (c == 'p') {
            if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
            else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
            else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
            else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
            else {
                fprintf(stderr, "The type '%s' not recognised\n", optarg);
                return 1;
            }

        }
    if (optind == argc) {
        fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n");
        fprintf(stderr, "Options: -p STR    preset: gff, bed, sam or vcf [gff]\n");
        fprintf(stderr, "         -s INT    column number for sequence names (suppressed by -p) [1]\n");
        fprintf(stderr, "         -b INT    column number for region start [4]\n");
        fprintf(stderr, "         -e INT    column number for region end (if no end, set INT to -b) [5]\n");
        fprintf(stderr, "         -0        specify coordinates are zero-based\n");
        fprintf(stderr, "         -S INT    skip first INT lines [0]\n");
        fprintf(stderr, "         -c CHAR   skip lines starting with CHAR [null]\n");
        fprintf(stderr, "         -a        print all records\n");
        fprintf(stderr, "         -f        force to overwrite existing index\n");
        fprintf(stderr, "         -m INT    set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
        fprintf(stderr, "\n");
        return 1;
    }
    if (is_all) { // read without random access
        kstring_t s;
        BGZF *fp;
        s.l = s.m = 0; s.s = 0;
        fp = bgzf_open(argv[optind], "r");
        if (fp == 0) {
            // do not hand a NULL handle to bgzf_getline/bgzf_close
            fprintf(stderr, "[E::%s] could not open '%s'\n", __func__, argv[optind]);
            return 1;
        }
        while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s);
        bgzf_close(fp);
        free(s.s);
    } else if (optind + 2 > argc) { // create index
        if ( !conf_ptr )
        {
            // auto-detect file type by file name
            int l = strlen(argv[optind]);
            int strcasecmp(const char *s1, const char *s2);
            if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
            else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
            else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
            else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
        }
        // a preset (explicit or auto-detected) replaces all column options given so far
        if ( conf_ptr ) conf = *conf_ptr;

        if (!is_force) {
            // refuse to overwrite an existing index unless -f was given
            char *fn;
            FILE *fp;
            fn = (char*)alloca(strlen(argv[optind]) + 5);
            strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
            if ((fp = fopen(fn, "rb")) != 0) {
                fclose(fp);
                fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
                return 1;
            }
        }
        if ( tbx_index_build(argv[optind], min_shift, &conf) )
        {
            fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n");
            return 1;
        }
    } else { // read with random access
        tbx_t *tbx;
        BGZF *fp;
        kstring_t s;
        int i;
        if ((tbx = tbx_index_load(argv[optind])) == 0) return 1;
        if ((fp = bgzf_open(argv[optind], "r")) == 0) {
            // release the already loaded index before bailing out
            tbx_destroy(tbx);
            return 1;
        }
        s.s = 0; s.l = s.m = 0;
        for (i = optind + 1; i < argc; ++i) {
            hts_itr_t *itr;
            // regions that cannot be resolved are silently skipped
            if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue;
            while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s);
            tbx_itr_destroy(itr);
        }
        free(s.s);
        bgzf_close(fp);
        tbx_destroy(tbx);
    }
    return 0;
}
Example #17
0
int 
main_vcfset(int argc, char *argv[])
{
     vcfset_conf_t vcfset_conf;
     char *vcf_header = NULL;
     int rc = 0;
     char *vcf_in1, *vcf_in2, *vcf_out;
     long int num_vars_vcf1;
     long int num_vars_vcf1_ign, num_vars_out;
     static int only_passed = 0;
     static int only_pos = 0;
     static int only_snvs = 0;
     static int only_indels = 0;
     static int count_only = 0;
     tbx_t *vcf2_tbx = NULL; /* index for second vcf file */
     htsFile *vcf2_hts = NULL;
     char *add_info_field = NULL;
     int vcf_concat_findex = 0;
     vcf_in1 = vcf_in2 = vcf_out = NULL;
     num_vars_vcf1 = 0;
     num_vars_vcf1_ign = num_vars_out = 0;

     /* default vcfset options */
     memset(&vcfset_conf, 0, sizeof(vcfset_conf_t));
     /* vcfset_conf.vcf_in1 = NULL; */
     /* vcfset_conf.vcf_in2 = NULL; */
     /* vcfset_conf.vcf_out = stdout;*/


    /* keep in sync with long_opts_str and usage 
     *
     * getopt is a pain in the whole when it comes to syncing of long
     * and short args and usage. check out gopt, libcfu...
     */
    while (1) {
         int c;
         static struct option long_opts[] = {
              /* see usage sync */
              {"help", no_argument, NULL, 'h'},
              {"verbose", no_argument, &verbose, 1},
              {"debug", no_argument, &debug, 1},
              {"only-passed", no_argument, &only_passed, 1},
              {"only-pos", no_argument, &only_pos, 1},
              {"only-indels", no_argument, &only_indels, 1},
              {"only-snvs", no_argument, &only_snvs, 1},
              {"count-only", no_argument, &count_only, 1},

              {"vcf1", required_argument, NULL, '1'},
              {"vcf2", required_argument, NULL, '2'},
              {"vcfout", required_argument, NULL, 'o'},
              {"action", required_argument, NULL, 'a'},
              {"add-info", required_argument, NULL, 'I'},

              {0, 0, 0, 0} /* sentinel */
         };

         /* keep in sync with long_opts and usage */
         static const char *long_opts_str = "h1:2:o:a:I:";

         /* getopt_long stores the option index here. */
         int long_opts_index = 0;
         c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                         long_opts_str, long_opts, & long_opts_index);
         if (c == -1) {
              break;
         }

         switch (c) {
         /* keep in sync with long_opts etc */
         case 'h': 
              usage(& vcfset_conf); 
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 0;

         case '1': 
              vcf_in1 = strdup(optarg);
              break;

         case '2': 
              vcf_in2 = strdup(optarg);
              break;

         case 'o':
              if (0 != strcmp(optarg, "-")) {
                   if (file_exists(optarg)) {
                        LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                        free(vcf_in1); free(vcf_in2);
                        return 1;
                   }
              }
              vcf_out = strdup(optarg);
              break;

         case 'a': 
              if (0 == strcmp(optarg, "intersect")) {
                   vcfset_conf.vcf_setop = SETOP_INTERSECT;

              } else if (0 == strcmp(optarg, "complement")) {
                   vcfset_conf.vcf_setop = SETOP_COMPLEMENT;

              } else if (0 == strcmp(optarg, "concat")) {
                   vcfset_conf.vcf_setop = SETOP_CONCAT;

              } else {
                   LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg);
                   free(vcf_in1); free(vcf_in2); free(vcf_out);
                   return 1;
              }
              break;

         case 'I': 
              add_info_field = strdup(optarg);
              break;

         case '?': 
              LOG_FATAL("%s\n", "unrecognized arguments found. Exiting...\n"); 
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;

         default:
              break;
         }
    }

    vcfset_conf.only_passed = only_passed;
    vcfset_conf.only_pos = only_pos;
    vcfset_conf.only_snvs = only_snvs;
    vcfset_conf.only_indels = only_indels;

    if (vcfset_conf.only_indels && vcfset_conf.only_snvs) {
         LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account");
         return 1;
    }

    if (0 != argc - optind - 1) {
         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              vcf_concat_findex = optind;
         } else {
              LOG_FATAL("%s\n", "Unrecognized arguments found\n");
              return 1;
         }
    } else {
         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              LOG_FATAL("%s\n", "No extra files for concat given\n");
              return 1;
         }
    }
#if 0
    int i; for (i=optind+1; i<argc; i++) {
         LOG_FIXME("argv[%d]=%s\n", i, argv[i]);
    }
#endif

    if (argc == 2) {
        fprintf(stderr, "\n");
        usage(& vcfset_conf);
        free(vcf_in1); free(vcf_in2); free(vcf_out);
        return 1;
    }

    if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) {
         LOG_FATAL("%s\n", "No set operation specified");
         usage(& vcfset_conf);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;
    }

    if  (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) {
         LOG_FATAL("%s\n\n", "At least one vcf input file not specified");
         usage(& vcfset_conf);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;
    }
    if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) {
         LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2");
         usage(& vcfset_conf);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;         
    }

    if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, 
                      HAS_GZIP_EXT(vcf_in1), 'r')) {
         LOG_ERROR("Couldn't open %s\n", vcf_in1);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;
    }

    if (vcf_in2) {
         vcf2_hts = hts_open(vcf_in2, "r");
         if (!vcf2_hts) {
              LOG_FATAL("Couldn't load %s\n", vcf_in2);
              return 1;
         }
         vcf2_tbx = tbx_index_load(vcf_in2);
         if (!vcf2_tbx) {
              LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2);
              return 1;
         }
    }

    /* vcf_out default if not set: stdout==- */
    if (! vcf_out) {
         vcf_out = malloc(2 * sizeof(char));
         strcpy(vcf_out, "-");
    }

    if (! count_only) {
         if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out, 
                           HAS_GZIP_EXT(vcf_out), 'w')) {
              LOG_ERROR("Couldn't open %s\n", vcf_out);
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;
         }
    }

    /* use meta-data/header of vcf_in1 for output
     */
    LOG_DEBUG("Getting header from %s\n", vcf_in1);
    if (0 !=  vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) {
         LOG_WARN("%s\n", "vcf_parse_header() failed");
         if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) {
              LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                        " after header parsing failed");
              return -1;
         }
    } else {
         if (! count_only) {
              /* vcf_write_header would write *default* header */
              vcf_write_header(& vcfset_conf.vcf_out, vcf_header);
         }
         free(vcf_header);
    }

    
    /* parse first vcf file
     */
    LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1);
    while (1) {
         var_t *var1 = NULL;
         int rc;
         int is_indel;
         kstring_t var2_kstr = {0, 0, 0};
         hts_itr_t *var2_itr = NULL;
         char regbuf[1024];
         int var2_match = 0;

         vcf_new_var(&var1);
         rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1);
         if (rc) {
              free(var1);
              
              if (vcfset_conf.vcf_setop != SETOP_CONCAT) {
                   break;
              } else {
                   vcf_concat_findex++;
                   if (vcf_concat_findex==argc) {
                        break;
                   }
                   /* set vcf1 up anew and simply continue as if nothing happened 
                    */
                   vcf_file_close(& vcfset_conf.vcf_in1);
                   free(vcf_in1);

                   vcf_in1 = strdup(argv[vcf_concat_findex]);
                   LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1);
                   if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, 
                                     HAS_GZIP_EXT(vcf_in1), 'r')) {
                        LOG_ERROR("Couldn't open %s\n", vcf_in1);
                        free(vcf_in1); free(vcf_in2); free(vcf_out);
                        return 1;
                   }
                   if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) {
                        LOG_WARN("skip header failed for %s\n", vcf_in1);
                   }
                   continue;
              }
         }

         is_indel = vcf_var_is_indel(var1);
         if (vcfset_conf.only_snvs && is_indel) {
              free(var1);
              continue;
         } else if (vcfset_conf.only_indels && ! is_indel) {
              free(var1);
              continue;
         }

         if (! vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) {
              LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1");
              return -1;
         }
         if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) {
#ifdef TRACE
              LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif
              num_vars_vcf1_ign += 1;
              vcf_free_var(& var1);
              continue;
         }
         if (add_info_field) {
              vcf_var_add_to_info(var1, add_info_field);
         }
         num_vars_vcf1 += 1;
#ifdef TRACE
         LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif

         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              num_vars_out += 1;
              if (! count_only) {
                   vcf_write_var(& vcfset_conf.vcf_out, var1);
              }
              vcf_free_var(& var1);
              /* skip comparison against vcf2 */
              continue;
         }

         /* use index access to vcf2 */
         snprintf(regbuf, 1024, "%s:%ld-%ld", var1->chrom, var1->pos+1, var1->pos+1);
         var2_itr = tbx_itr_querys(vcf2_tbx, regbuf);
         if (! var2_itr) {
              var2_match = 0;
         } else {
              var2_match = 0;
              while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) {
                   var_t *var2 = NULL;
                   int var2_is_indel = 0;

                   vcf_new_var(&var2);
                   rc = vcf_parse_var_from_line(var2_kstr.s, var2);
                   /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n", 
                             var1->pos+1, var1->ref, var1->alt,
                             var2->pos+1, var2->ref, var2->alt, regbuf); */
                   if (rc) {
                        LOG_FATAL("%s\n", "Error while parsing variant returned from tabix");
                        return -1;
                   }

                   var2_is_indel = vcf_var_is_indel(var2);

                   /* iterator returns anything overlapping with that 
                    * position, i.e. this also includes up/downstream
                    * indels, so make sure actual position matches */
                   if (var1->pos != var2->pos) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_snvs && var2_is_indel) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_indels && ! var2_is_indel) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_pos) {
#ifdef TRACE
                        LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                        var2_match = 1;

                   } else {
                        if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) {
#ifdef TRACE
                             LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                             var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */                             
                        }
                   }
                   vcf_free_var(&var2);
                   if (var2_match) {
                        break;/* no need to continue */
                   }
              }
         }

         if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) {
              /* relative complement : elements in A but not B */
              if (!var2_match) {
                   num_vars_out += 1;
                   if (! count_only) {
                        vcf_write_var(& vcfset_conf.vcf_out, var1);
                   }
              }
         } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) {
              if (var2_match) {
                   num_vars_out += 1;
                   if (! count_only) {
                        vcf_write_var(& vcfset_conf.vcf_out, var1);
                   }
              }

         } else {
              LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop);
              return 1;
         }

         vcf_free_var(& var1);
         tbx_itr_destroy(var2_itr);
    }/* while (1) */

    vcf_file_close(& vcfset_conf.vcf_in1);
    if (vcf_in2) {
         hts_close(vcf2_hts);
         tbx_destroy(vcf2_tbx);
    }
    LOG_VERBOSE("Parsed %d variants from 1st vcf file (ignoring %d non-passed of those)\n", 
                num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign);
    LOG_VERBOSE("Wrote %d variants to output\n", 
                num_vars_out);
    if (! count_only) {
         vcf_file_close(& vcfset_conf.vcf_out);
    }

    if (0==rc) {
         if (count_only) {
              printf("%ld\n", num_vars_out);
         }

         LOG_VERBOSE("%s\n", "Successful exit.");
    }

    free(vcf_in1);
    free(vcf_in2);
    free(vcf_out);


    return rc;
}
/*
 * Load the genotype records overlapping `region` from a tabix-indexed BED file.
 *
 * Every genotype container (ids, chromosomes, starts, ends, values and the
 * id -> index map) is cleared first and then refilled with the records whose
 * ID (4th column) is accepted by filter_genotype.
 *
 * The header line supplies the sample names from the 7th column onwards. Each
 * one is mapped to an internal sample index with findSample(); samples
 * rejected by filter_sample, or for which findSample() returns a negative
 * value, are mapped to -1 and their values are not stored.
 *
 * fbed   : path of the BED file (its tabix index is loaded with tbx_index_load)
 * region : region string handed to tbx_itr_querys
 *
 * NOTE(review): vrb.error() is assumed not to return (the original code
 * dereferences fp/tbx right after such checks) -- confirm.
 */
void union_data::readGenotypesBED(string fbed,string region) {
	int n_includedG = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	int n_excludedS = 0;
	int n_missingS = 0;
	vector < int > mappingS;
	genotype_id.clear();
	genotype_chr.clear();
	genotype_start.clear();
	genotype_end.clear();
	genotype_val.clear();
	genotype_count=0;
	genotype_id_to_idx.clear();

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names (header columns 7+)
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t h = 6 ; h < tokens.size() ; h ++) {
		const string & sid = tokens[h];
		if (filter_sample.check(sid)) {
			mappingS.push_back(findSample(sid));
			if (mappingS.back() >= 0) n_includedS ++;
			else n_missingS ++;
		} else {
			mappingS.push_back(-1);
			n_excludedS ++;
		}
	}
	//vrb.bullet(stb.str(n_includedS) + " samples included");
	//if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
	//if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
	//if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");

	unsigned int linecount = 0;

	//Jump to interesting region.
	//The "Cannot jump to region" check is deliberately left disabled here.
	//NOTE(review): this relies on htslib treating a NULL iterator as exhausted
	//in tbx_itr_next/tbx_itr_destroy -- confirm for the htslib version in use.
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionGenotype.get() + "]");
	//if (!itr) vrb.error("Cannot jump to region!");
	while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (filter_genotype.check(tokens[3])) {
			genotype_id.push_back(tokens[3]);
			genotype_chr.push_back(tokens[0]);
			genotype_start.push_back(atoi(tokens[1].c_str()) + 1);	//stored as BED start + 1
			genotype_end.push_back(atoi(tokens[2].c_str()));
			genotype_val.push_back(vector < float > (sample_count, 0.0));
			//Bounded by the header width as well as the row width, so a row with
			//more columns than the header can no longer index past mappingS.
			for (size_t s = 0 ; s < mappingS.size() && s + 6 < tokens.size() ; s ++) {
				if (mappingS[s] < 0) continue;
				const string & field = tokens[s + 6];
				if (field == "NA") genotype_val.back()[mappingS[s]] = bcf_float_missing;
				else genotype_val.back()[mappingS[s]] = stof(field);
			}
			pair < string, int > temp (tokens[3],n_includedG);
			genotype_id_to_idx.insert(temp);
			n_includedG++;
		} else n_excludedG_user ++;
	}
	tbx_itr_destroy(itr);

	//Finalize & verbose
	free(str.s);	//the line buffer is grown by htslib but owned by us
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
}
Example #19
0
/**
 * Position the reader on `region`.
 *
 * Any previous iterator is destroyed and replaced by the result of
 * tbx_itr_querys(). has_jumped is set whether or not the query succeeds.
 *
 * @param region region string passed to tbx_itr_querys
 * @return true if the query produced an iterator, false if it returned NULL
 *         (previously true was returned unconditionally, so callers could not
 *         detect a failed jump).
 */
bool Tabix::setRegion(string& region) {
    tbx_itr_destroy(iter);
    iter = tbx_itr_querys(tbx, region.c_str());
    has_jumped = true;
    return iter != NULL;
}
/*
 * Scan a tabix-indexed BED file and register every genotype record that is
 * not yet present in genotype_id_to_idx and is accepted by filter_genotype.
 *
 * Only id, chromosome, start (stored as BED start + 1) and end are recorded;
 * no per-sample values are read. When regionGenotype.chr differs from "NA"
 * only the records returned by the tabix query for that region are visited,
 * otherwise the whole file is read and empty lines / lines starting with the
 * index meta character are skipped.
 *
 * genotype_count is increased by the number of newly registered records.
 *
 * fbed : path of the BED file (its tabix index is loaded with tbx_index_load)
 *
 * NOTE(review): vrb.error() is assumed not to return -- confirm.
 */
void union_data::scanGenotypesBED(string fbed) {
	int n_includedG = 0;
	int n_excludedG_user = 0;

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	vector < string > tokens;
	unsigned int linecount = 0;

	//Shared handling of one data line currently held in str (used by both read modes)
	auto registerRecord = [&] () {
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (genotype_id_to_idx.count(tokens[3])) return;	//already known: neither counted nor re-added
		if (filter_genotype.check(tokens[3])) {
			genotype_id.push_back(tokens[3]);
			genotype_chr.push_back(tokens[0]);
			genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			genotype_end.push_back(atoi(tokens[2].c_str()));
			pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
			genotype_id_to_idx.insert(temp);
			n_includedG++;
		} else n_excludedG_user ++;
	};

	if (regionGenotype.chr != "NA"){
		//Jump to interesting region
		hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
		vrb.bullet("target region [" + regionGenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			registerRecord();
		}
		tbx_itr_destroy(itr);
	}else{
		//No region: read every remaining line of the file
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			//Check the line before touching str.s so empty/meta lines are never tokenised
			if (str.l && str.s[0] != tbx->conf.meta_char) registerRecord();
		}
	}

	//Finalize & verbose
	free(str.s);	//the line buffer is grown by htslib but owned by us
	tbx_destroy(tbx);
	genotype_count += n_includedG;
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	vrb.bullet(stb.str(n_includedG) + " new variants included");
	if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	if (n_includedG  == 0) vrb.leave("Cannot find variants in target region!");
}
/*
 * Load the phenotype records overlapping `region` from a tabix-indexed BED file.
 *
 * Every phenotype container (ids, chromosomes, starts, ends, values and the
 * id -> index map) is cleared first and then refilled with the records whose
 * ID (4th column) is accepted by filter_phenotype.
 *
 * The header line supplies the sample names from the 7th column onwards; each
 * is mapped to an internal sample index with findSample(), and values of
 * samples with a negative index are not stored. A value of "NA" is stored as
 * bcf_float_missing, anything else is parsed with stof().
 *
 * fbed   : path of the BED file (its tabix index is loaded with tbx_index_load)
 * region : region string handed to tbx_itr_querys
 *
 * NOTE(review): vrb.error() is assumed not to return -- confirm.
 */
void union_data::readPhenotypes(string fbed, string region) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	vector < int > mappingS;
	phenotype_id.clear();
	phenotype_chr.clear();
	phenotype_start.clear();
	phenotype_end.clear();
	phenotype_val.clear();
	phenotype_count=0;
	phenotype_id_to_idx.clear();

	//Open BED file
	//vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names (header columns 7+)
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t h = 6 ; h < tokens.size() ; h ++) {
		mappingS.push_back(findSample(tokens[h]));
		if (mappingS.back() >= 0) n_includedS++;
	}
	unsigned int linecount =0;

	//Read phenotypes.
	//The "Cannot jump to region" check is deliberately left disabled here.
	//NOTE(review): this relies on htslib treating a NULL iterator as exhausted
	//in tbx_itr_next/tbx_itr_destroy -- confirm for the htslib version in use.
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionPhenotype.get() + "]");
	//if (!itr) vrb.error("Cannot jump to region!");
	while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);	//stored as BED start + 1
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			phenotype_val.push_back(vector < float > (sample_count, 0.0));
			//Bounded by the header width as well as the row width, so a row with
			//more columns than the header can no longer index past mappingS.
			for (size_t s = 0 ; s < mappingS.size() && s + 6 < tokens.size() ; s ++) {
				if (mappingS[s] < 0) continue;
				const string & field = tokens[s + 6];
				if (field == "NA") phenotype_val.back()[mappingS[s]] = bcf_float_missing;
				else phenotype_val.back()[mappingS[s]] = stof(field);
			}
			pair < string, int > temp (tokens[3],n_includedP);
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	}
	tbx_itr_destroy(itr);

	//Finalize & verbose
	free(str.s);	//the line buffer is grown by htslib but owned by us
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	//vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	//if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	//if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
Example #22
0
/*
 * Read phenotype records, including per-sample values, from a tabix-indexed
 * BED file and append them to the phenotype containers.
 *
 * The header line supplies the sample names from the 7th column onwards; each
 * is mapped to an internal sample index with findSample(), and values of
 * samples with a negative index are not stored. A record is kept when
 * filter_phenotype accepts its 4th column (grp_mode == GRP_NONE) or its 5th
 * column (any other grp_mode). For kept records the 6th column is compared
 * with "-" to fill phenotype_neg, and a group label is pushed when
 * grp_mode > 0 ("ALL_GENES" if full_test is set, else the 5th column).
 *
 * When regionPhenotype.chr differs from "NA" only the records returned by the
 * tabix query for that region are visited, otherwise the whole file is read
 * and empty lines / lines starting with the index meta character are skipped.
 *
 * fbed : path of the BED file (its tabix index is loaded with tbx_index_load)
 *
 * NOTE(review): vrb.error() is assumed not to return -- confirm.
 */
void cis_data::readPhenotypes(string fbed) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;
	vector < int > mappingS;

	//Open BED file
	vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names (header columns 7+)
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t h = 6 ; h < tokens.size() ; h ++) {
		mappingS.push_back(findSample(tokens[h]));
		if (mappingS.back() >= 0) n_includedS++;
	}

	unsigned int linecount =0;

	//Shared handling of one data line currently held in str (used by both read modes)
	auto storeRecord = [&] () {
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		//Filter on the phenotype ID, or on the group column when grouping is active
		if (filter_phenotype.check(grp_mode == GRP_NONE ? tokens[3] : tokens[4])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);	//stored as BED start + 1
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
			if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
			phenotype_neg.push_back(tokens[5] == "-");
			if (phenotype_neg.back()) n_negativeStrd ++;
			phenotype_val.push_back(vector < float > (sample_count, 0.0));
			//Bounded by the header width as well as the row width, so a row with
			//more columns than the header can no longer index past mappingS.
			for (size_t s = 0 ; s < mappingS.size() && s + 6 < tokens.size() ; s ++) {
				if (mappingS[s] < 0) continue;
				const string & field = tokens[s + 6];
				if (field == "NA") phenotype_val.back()[mappingS[s]] = bcf_float_missing;
				else phenotype_val.back()[mappingS[s]] = stof(field);
			}
			n_includedP++;
		} else n_excludedP ++;
	};

	//Read phenotypes
	if (regionPhenotype.chr != "NA"){
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			storeRecord();
		}
		tbx_itr_destroy(itr);
	}else{
		//No region: read every remaining line of the file
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			//Check the line before touching str.s so empty/meta lines are never tokenised
			if (str.l && str.s[0] != tbx->conf.meta_char) storeRecord();
		}
	}

	//Finalize & verbose
	free(str.s);	//the line buffer is grown by htslib but owned by us
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
/*
 * Scan a tabix-indexed BED file and register every phenotype record that is
 * not yet present in phenotype_id_to_idx and is accepted by filter_phenotype.
 *
 * Only id, chromosome, start (stored as BED start + 1) and end are recorded;
 * no per-sample values are read. When regionPhenotype.chr differs from "NA"
 * only the records returned by the tabix query for that region are visited,
 * otherwise the whole file is read and empty lines / lines starting with the
 * index meta character are skipped.
 *
 * phenotype_count is set to the total number of registered phenotype IDs.
 *
 * fbed : path of the BED file (its tabix index is loaded with tbx_index_load)
 *
 * NOTE(review): vrb.error() is assumed not to return -- confirm.
 */
void union_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header. A non-positive return (empty line, EOF or read error) is a
	//failure, matching the check used by the other BED readers.
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	vector < string > tokens;
	unsigned int linecount =0;

	//Shared handling of one data line currently held in str (used by both read modes)
	auto registerRecord = [&] () {
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (phenotype_id_to_idx.count(tokens[3])) return;	//already known: neither counted nor re-added
		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	};

	//Scan file
	if (regionPhenotype.chr != "NA"){
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			registerRecord();
		}
		tbx_itr_destroy(itr);
	}else{
		//No region: read every remaining line of the file
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			if (str.l && str.s[0] != tbx->conf.meta_char) registerRecord();
		}
	}

	//Finalize & verbose
	free(str.s);	//the line buffer is grown by htslib but owned by us
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " new phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}