/**
 * Fetches the next record into v.
 *
 * In random access mode the iterator over the current interval is drained
 * first; once it yields nothing more, initialize_next_interval() is asked for
 * another one, so the caller sees one continuous stream of records.
 *
 * @param v  record that receives the data
 * @return   true if a record was read, false otherwise
 */
bool BCFOrderedReader::read(bcf1_t *v)
{
    // Sequential mode: simply pull the next record from the file.
    if (!random_access_enabled)
    {
        return bcf_read(file, hdr, v)==0;
    }

    // The format is examined once per call; it selects the iterator flavour.
    const bool is_bcf = (ftype.format==bcf);

    for (;;)
    {
        if (itr)
        {
            if (is_bcf)
            {
                if (bcf_itr_next(file, itr, v)>=0)
                {
                    return true;
                }
            }
            else if (tbx_itr_next(file, tbx, itr, &s)>=0)
            {
                // The tabix iterator yields a text line in s; parse it into v.
                vcf_parse1(&s, hdr, v);
                return true;
            }
        }

        // No iterator, or the current one produced nothing: try the next interval.
        if (!initialize_next_interval())
        {
            return false;
        }
    }
}
/*
 *  bcf_sr_next_line() - advance the synchronized readers to the next position
 *  @files:  the set of readers; each reader keeps its pending records in
 *           buffer[1..nbuffer], slot buffer[0] holds the current record
 *
 *  Finds the smallest position at the head of any reader's buffer, then moves
 *  the matching record of every reader that has one at that position into
 *  that reader's buffer[0].
 *
 *  Returns 0 once all sequences (or all target sequences) have been scanned,
 *  otherwise a bit mask in which bit i is set when reader i supplied a record.
 *
 *  NOTE(review): the mask is built with `1<<i` on an int, so the number of
 *  readers that can be reported is limited by the width of int - confirm that
 *  callers respect this.
 */
int bcf_sr_next_line(readers_t *files)
{
    int32_t min_pos = INT_MAX;
    int ret,i,j;
    kstring_t *str = &files->tmps;

    // Loop until some reader has a buffered record, i.e. min_pos has been set
    while ( min_pos==INT_MAX )
    {
        // Need to open new chromosome?
        // A reader is exhausted when it has neither an iterator nor buffered records.
        int eos = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;

        if ( eos==files->nreaders )
        {
            // Every reader is exhausted: pick the next sequence name, either
            // from the targets or from the files->seqs list
            const char *seq;
            if ( files->targets )
            {
                seq = tgt_next_seq(files->targets);
                if ( !seq ) return 0;   // all chroms scanned
            }
            else
            {
                if ( ++files->iseq >= files->nseqs ) return 0;  // all chroms scanned
                seq = files->seqs[files->iseq];
            }

            // Open an iterator on the new sequence in each reader (tabix or BCF index)
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                if ( reader->tbx )
                    reader->itr = tbx_itr_querys(reader->tbx,seq);
                else
                    reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq);
            }
        }

        // Find the smallest coordinate
        for (i=0; i<files->nreaders; i++)
        {
            reader_t *reader = &files->readers[i];

            // The buffer counts as full when its last record sits at a different
            // position than its first one
            int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0;
            if ( reader->itr && !buffer_full )
            {
                // Fill the buffer with records starting at the same position
                while (1)
                {
                    if ( reader->nbuffer+1 >= reader->mbuffer )
                    {
                        // Grow the buffer by 8 slots and allocate a record for each new slot.
                        // NOTE(review): the results of realloc() and bcf_init1() are not checked.
                        reader->mbuffer += 8;
                        reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
                        for (j=8; j>0; j--)
                            reader->buffer[reader->mbuffer-j] = bcf_init1();
                    }

                    // Read the next record into the first free slot, buffer[nbuffer+1]
                    if ( reader->tbx )
                    {
                        ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str);
                        if ( ret<0 ) break;
                        // NOTE(review): the return value of vcf_parse1() is ignored here
                        vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]);
                    }
                    else
                    {
                        ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]);
                        if ( ret<0 ) break;
                    }
                    bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);

                    // apply filter: when a filter id is set and the record has filters, the
                    // record is dropped unless its first filter equals that id. nbuffer is not
                    // advanced, so the same slot is reused by the next read.
                    if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue;

                    set_variant_types(reader->buffer[reader->nbuffer+1]);
                    reader->nbuffer++;

                    // Stop as soon as a record at a different position has been buffered
                    if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;
                }
                if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome
            }
            if ( reader->nbuffer )
            {
                if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos;
            }

            // The buffer is full - either there is nothing else to read or the last record has a different coordinate
            // NOTE(review): with exactly two buffered records at the same position (nbuffer==2)
            // this condition is false and collapse_buffer() is not called - confirm whether
            // `>=2` was intended.
            if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
            {
                collapse_buffer(files, reader);
            }
        }

        if ( files->targets && min_pos!=INT_MAX )
        {
            // This `ret` shadows the function-level one; the outer variable is reset to 0
            // after the loop before it is used for the mask.
            int ret = tgt_has_position(files->targets, min_pos);

            // ret==1: min_pos stays set, so the loop condition fails and the position is emitted
            if ( ret==1 ) continue;

            // The position must be skipped
            if ( ret==-1 )
            {
                // done for this chromosome, don't read the rest
                for (i=0; i<files->nreaders; i++)
                {
                    files->readers[i].nbuffer = 0;
                    if ( files->readers[i].itr )
                    {
                        tbx_itr_destroy(files->readers[i].itr);
                        files->readers[i].itr = NULL;
                    }
                }
                min_pos = INT_MAX;
                continue;
            }

            // remove the active line, save the buffer line
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];

                // j becomes the index of the first buffered record that is not at min_pos
                for (j=1; j<=reader->nbuffer; j++)
                    if ( reader->buffer[j]->pos!=min_pos ) break;

                // Empty buffer, or this reader's first record is not at min_pos: leave it alone
                if ( j==1 ) continue;

                if ( j<=reader->nbuffer )
                {
                    // Keep only the record at the other position by swapping it into slot 1
                    bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp;
                    reader->nbuffer = 1;
                }
                else
                    reader->nbuffer = 0;    // every buffered record was at min_pos
            }
            min_pos = INT_MAX;
        }
    }

    //printf("[next_line] min_pos=%d\n", min_pos+1);
    //debug_buffers(files);

    // Set the current line
    ret = 0;
    bcf1_t *first = NULL;   // record of the first reader found at min_pos; the others are matched against it
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue;

        // Match the records by REF and ALT
        // (this `j` shadows the function-level one; irec is the index of the matching record, -1 if none)
        int j, irec = -1;
        if ( first )
        {
            for (j=1; j<=reader->nbuffer; j++)
            {
                bcf1_t *line = reader->buffer[j];
                if ( min_pos != line->pos ) break;  // done with this buffer

                if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; }  // checking position only
                // With the SNP or indel collapse flag set, two records match when both carry that variant type
                if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; }
                if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; }

                if ( first->rlen != line->rlen ) continue;  // REFs do not match
                if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match

                int ial,jal;
                if ( files->collapse==COLLAPSE_NONE )
                {
                    // require exact match, all alleles must be identical
                    if ( first->n_allele!=line->n_allele ) continue;   // different number of alleles
                    int nmatch = 1; // REF has been already checked
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                    }
                    if ( nmatch>=first->n_allele ) { irec=j; break; }
                }
                else
                {
                    // thorough check: the REFs and some of the alleles have to be shared
                    // (neglecting different representations of the same indel for now)
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; }
                        if ( irec>=1 ) break;
                    }
                }
                if ( irec>=1 ) break;
            }
            if ( irec==-1 ) continue;   // nothing in this reader matches the first record
        }
        else
        {
            first = reader->buffer[1];
            irec = 1;
        }

        // Move the selected record into slot 0, shift the records behind it down by one
        // and park the old slot-0 record at the end so that it can be reused
        bcf1_t *tmp = reader->buffer[0];
        reader->buffer[0] = reader->buffer[irec];
        for (j=irec+1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
        reader->buffer[ reader->nbuffer ] = tmp;
        reader->nbuffer--;
        ret |= 1<<i;
    }
    // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret);
    // debug_buffers(stdout,files);

    return ret;
}
/* * _reader_fill_buffer() - buffers all records with the same coordinate */ static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { // Return if the buffer is full: the coordinate of the last buffered record differs if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return; // No iterator (sequence not present in this file) and not streaming if ( !reader->itr && !files->streaming ) return; // Fill the buffer with records starting at the same position int i, ret = 0; while (1) { if ( reader->nbuffer+1 >= reader->mbuffer ) { // Increase buffer size reader->mbuffer += 8; reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer); for (i=8; i>0; i--) // initialize { reader->buffer[reader->mbuffer-i] = bcf_init1(); reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 } } if ( files->streaming ) { if ( reader->file->format.format==vcf ) { if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) break; } else if ( reader->file->format.format==bcf ) { if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines } else { fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__); exit(1); } } else if ( reader->tbx_idx ) { if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); } else { if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } // apply filter if ( !reader->nfilter_ids ) 
bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR); else { bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT); if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue; } reader->nbuffer++; if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) { // done for this region tbx_itr_destroy(reader->itr); reader->itr = NULL; } if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos ) collapse_buffer(files, reader); }