Beispiel #1
0
/**
 * Reads next record, hides the random access of different regions from the user.
 */
bool BCFOrderedReader::read(bcf1_t *v)
{
    if (random_access_enabled)
    {
        if (ftype.format==bcf)
        {
            while(true)
            {
                if (itr && bcf_itr_next(file, itr, v)>=0)
                {
                    return true;
                }
                else if (!initialize_next_interval())
                {
                    return false;
                }
            }
        }
        else
        {
            while(true)
            {
                if (itr && tbx_itr_next(file, tbx, itr, &s)>=0)
                {
                    vcf_parse1(&s, hdr, v);
                    return true;
                }
                else if (!initialize_next_interval())
                {
                    return false;
                }
            }
        }
    }
    else
    {
        if (bcf_read(file, hdr, v)==0)
        {
            return true;
        }
        else
        {
            return false;
        }
    }

    return false;
};
Beispiel #2
0
int bcf_sr_next_line(readers_t *files)
{
    int32_t min_pos = INT_MAX;
    int ret,i,j;
    kstring_t *str = &files->tmps;

    while ( min_pos==INT_MAX )
    {
        // Need to open new chromosome?
        int eos = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
        if ( eos==files->nreaders )
        {
            const char *seq;
            if ( files->targets )
            {
                seq = tgt_next_seq(files->targets);
                if ( !seq ) return 0;   // all chroms scanned
            }
            else
            {
                if ( ++files->iseq >= files->nseqs ) return 0;  // all chroms scanned
                seq = files->seqs[files->iseq];
            }
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                if ( reader->tbx )
                    reader->itr = tbx_itr_querys(reader->tbx,seq);
                else
                    reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq);
            }
        }

        // Find the smallest coordinate
        for (i=0; i<files->nreaders; i++)
        {
            reader_t *reader = &files->readers[i];
            int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0;
            if ( reader->itr && !buffer_full )
            {
                // Fill the buffer with records starting at the same position
                while (1)
                {
                    if ( reader->nbuffer+1 >= reader->mbuffer ) 
                    {
                        reader->mbuffer += 8;
                        reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
                        for (j=8; j>0; j--)
                            reader->buffer[reader->mbuffer-j] = bcf_init1();
                    }
                    if ( reader->tbx )
                    {
                        ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str);
                        if ( ret<0 ) break;
                        vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]);
                    }
                    else
                    {
                        ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]);
                        if ( ret<0 ) break;
                    }
                    bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
                    // apply filter
                    if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue;
                    set_variant_types(reader->buffer[reader->nbuffer+1]);
                    reader->nbuffer++;
                    if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;
                }
                if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome
            }
            if ( reader->nbuffer )
            {
                if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos; 
            }
            // The buffer is full - either there is nothing else to read or the last record has a different coordinate
            if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
            {
                collapse_buffer(files, reader);
            }
        }
        if ( files->targets && min_pos!=INT_MAX )
        {
            int ret = tgt_has_position(files->targets, min_pos);
            if ( ret==1 ) continue;

            // The position must be skipped
            if ( ret==-1 )
            {
                // done for this chromosome, don't read the rest
                for (i=0; i<files->nreaders; i++) 
                {
                    files->readers[i].nbuffer = 0;
                    if ( files->readers[i].itr )
                    {
                        tbx_itr_destroy(files->readers[i].itr);
                        files->readers[i].itr = NULL;
                    }
                }
                min_pos = INT_MAX;
                continue;
            }

            // remove the active line, save the buffer line
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                for (j=1; j<=reader->nbuffer; j++)
                    if ( reader->buffer[j]->pos!=min_pos ) break;
                if ( j==1 ) continue;
                if ( j<=reader->nbuffer )
                {
                    bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp;
                    reader->nbuffer = 1;
                }
                else 
                    reader->nbuffer = 0;
            }
            min_pos = INT_MAX;
        }
    }

    //printf("[next_line] min_pos=%d\n", min_pos+1);
    //debug_buffers(files);

    // Set the current line
    ret = 0;
    bcf1_t *first = NULL;
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue;

        // Match the records by REF and ALT
        int j, irec = -1;
        if ( first )
        {
            for (j=1; j<=reader->nbuffer; j++)
            {
                bcf1_t *line = reader->buffer[j];
                if ( min_pos != line->pos ) break;  // done with this buffer

                if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; }  // checking position only
                if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; }
                if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; }

                if ( first->rlen != line->rlen ) continue;  // REFs do not match
                if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match
                int ial,jal;
                if ( files->collapse==COLLAPSE_NONE )
                {
                    // require exact match, all alleles must be identical
                    if ( first->n_allele!=line->n_allele ) continue;   // different number of alleles
                    int nmatch = 1; // REF has been already checked
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                    }
                    if ( nmatch>=first->n_allele ) { irec=j; break; }
                }
                else
                {
                    // thorough check: the REFs and some of the alleles have to be shared
                    // (neglecting different representations of the same indel for now)
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; }
                        if ( irec>=1 ) break;
                    }
                }
                if ( irec>=1 ) break;
            }
            if ( irec==-1 ) continue;
        }
        else 
        {
            first = reader->buffer[1];
            irec  = 1;
        }
        bcf1_t *tmp = reader->buffer[0];
        reader->buffer[0] = reader->buffer[irec];
        for (j=irec+1; j<=reader->nbuffer; j++)
            reader->buffer[j-1] = reader->buffer[j];
        reader->buffer[ reader->nbuffer ] = tmp;
        reader->nbuffer--;
        ret |= 1<<i;
    }
    // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret);
    // debug_buffers(stdout,files);

    return ret;
}
/*
 *  _reader_fill_buffer() - buffers all records with the same coordinate
 */
static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    // Return if the buffer is full: the coordinate of the last buffered record differs
    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;

    // No iterator (sequence not present in this file) and not streaming
    if ( !reader->itr && !files->streaming ) return;

    // Fill the buffer with records starting at the same position
    int i, ret = 0;
    while (1)
    {
        if ( reader->nbuffer+1 >= reader->mbuffer )
        {
            // Increase buffer size
            reader->mbuffer += 8;
            reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
            for (i=8; i>0; i--)     // initialize
            {
                reader->buffer[reader->mbuffer-i] = bcf_init1();
                reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
                reader->buffer[reader->mbuffer-i]->pos = -1;    // for rare cases when VCF starts from 1
            }
        }
        if ( files->streaming )
        {
            if ( reader->file->format.format==vcf )
            {
                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
                int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
                if ( ret<0 ) break;
            }
            else if ( reader->file->format.format==bcf )
            {
                if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            }
            else
            {
                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
        }
        else if ( reader->tbx_idx )
        {
            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
            vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
        }
        else
        {
            if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
        }

        // apply filter
        if ( !reader->nfilter_ids )
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
        else
        {
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
            if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
        }
        reader->nbuffer++;

        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
    }
    if ( ret<0 )
    {
        // done for this region
        tbx_itr_destroy(reader->itr);
        reader->itr = NULL;
    }
    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
        collapse_buffer(files, reader);
}