Beispiel #1
0
// Build a new BAM record from in_bam that keeps only the bases whose
// observation is flagged as "real" (non-zero) in variants.
//
// in_bam   - the original alignment record; not modified
// variants - per-column observation tables; an observation whose mapped value
//            is non-zero is kept, a zero one is punched out
// read     - the aligned_t view of in_bam, walked in step with variants
//
// Returns a freshly allocated record ( from bam_init1() ) that the caller must
// release with bam_destroy1(). Punched-out matches and insertions become
// deletions in the CIGAR. Any inconsistency between read, variants and the
// CIGAR is treated as fatal: a message is written to cerr and the process
// exits with status 1.
//
// NOTE(review): CIGAR operations other than M/=/X, I and D ( e.g. soft clips )
// are dropped from the output CIGAR and consume nothing from keep; whether the
// sequence offsets stay correct for such reads depends on how aligned_t treats
// them -- confirm against aligned_t.
// NOTE(review): core.pos is moved to the first kept column while punched-out
// leading positions still contribute leading D operations to the CIGAR --
// confirm this is the intended coordinate convention.
bam1_t * punchout_read(
        const bam1_t * const in_bam,
        const vector< cov_t > & variants,
        const aligned_t & read
        )
{
    aligned_t::const_iterator rit = read.begin();
    var_citer vit = variants.begin();
    // one entry per read element, in read order: ( kept?, number of bases )
    vector< pair< bool, unsigned > > keep;
    // column of the first kept element, used as the new alignment start
    bool have_first_col = false;
    int first_col = 0;

    bam1_t * const out_bam = bam_init1();

    if ( !out_bam ) {
        cerr << "memory allocation error" << endl;
        exit( 1 );
    }

    out_bam->core = in_bam->core;
    out_bam->core.l_qseq = 0;

    // build a vector of which positions we're keeping,
    // and how long our new seq will be
    for ( ; rit != read.end() && vit != variants.end(); ) {

#if 0
        cerr << "r: ( " << rit->col << ", " << ( rit->op == INS ) << ", ";
        for ( unsigned j = 0; j < rit->elem.size(); ++j )
            cerr << bits2nuc( rit->elem[ j ] );
        cerr << " )" << endl;
        cerr << "v: ( " << vit->col << ", " << ( vit->op == INS ) << " )" << endl;
#endif

        if ( rit->col == vit->col && rit->op == vit->op ) {
            elem_t elem;
            obs_citer it;

            rit->get_seq( elem );
            it = vit->obs.find( elem );

            if ( it == vit->obs.end() ) {
                cerr << "unknown variant observed, which is weird...( 1 )" << endl;
                exit( 1 );
            }

            if ( it->second ) {
                keep.push_back( make_pair( true, it->first.size() ) );
                if ( !have_first_col ) {
                    first_col = rit->col;
                    have_first_col = true;
                }
                out_bam->core.l_qseq += it->first.size();
            }
            else
                keep.push_back( make_pair( false, it->first.size() ) );

            ++rit;
            ++vit;
        }
        // matching column in variants, but insertion in read
        else if ( rit->col == vit->col && rit->op == INS ) {
            ++vit;
        }
        // matching column in read, but insertion in variants:
        // we got ahead in read somehow, which should never happen
        else if ( rit->col == vit->col && vit->op == INS ) {
            cerr << "unknown variant observed, which is weird...( 2 )" << endl;
            exit( 1 );
        }
        // read is behind variants for some reason, which should never happen
        else if ( rit->col < vit->col ) {
            cerr << "unknown variant observed, which is weird...( 3 )" << endl;
            exit( 1 );
        }
        else // ( vit->col < rit->col )
            ++vit;
    }

    // cerr << "seqlen: " << out_bam->core.l_qseq << endl;

    // Build the new CIGAR in a vector first. Punching out bases can split a
    // single run into several operations, so the output may need more CIGAR
    // slots than the input had; sizing the record only once the final count is
    // known avoids writing past a buffer sized from the input record.
    vector< uint32_t > packed_cigar;
    {
        const uint32_t * cigar = bam1_cigar( in_bam );
        unsigned in_idx = 0;
        // one operation code per reference/query position, before run-length packing
        vector< int > expanded;

        for ( int i = 0; i < in_bam->core.n_cigar; ++i ) {
            const int op = cigar[ i ] & BAM_CIGAR_MASK;
            const int nop = cigar[ i ] >> BAM_CIGAR_SHIFT;

            if ( op == BAM_CDEL )
                for ( int j = 0; j < nop; ++j )
                    expanded.push_back( BAM_CDEL );
            else if ( op == BAM_CINS ) {
                // keep can be shorter than the CIGAR implies if variants ran
                // out before the read did; never index past its end
                if ( in_idx >= keep.size() ) {
                    cerr << "assumptions violated ( 3 )" << endl;
                    exit( 1 );
                }
                if ( keep[ in_idx ].second != unsigned( nop ) ) {
                    cerr << "assumptions violated ( 1 )" << endl;
                    exit( 1 );
                }
                // a punched-out insertion is recorded as deletions, exactly
                // like a punched-out match
                const int out_op = keep[ in_idx ].first ? op : int( BAM_CDEL );
                for ( int j = 0; j < nop; ++j )
                    expanded.push_back( out_op );
                ++in_idx;
            }
            else if ( op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF ) {
                for ( int j = 0; j < nop; ++j, ++in_idx ) {
                    if ( in_idx >= keep.size() ) {
                        cerr << "assumptions violated ( 3 )" << endl;
                        exit( 1 );
                    }
                    if ( keep[ in_idx ].second != 1 ) {
                        cerr << "assumptions violated ( 2 )" << endl;
                        exit( 1 );
                    }
                    if ( keep[ in_idx ].first )
                        expanded.push_back( op );
                    else
                        expanded.push_back( BAM_CDEL );
                }
            }
        }

        // run-length encode the per-position operations
        if ( !expanded.empty() ) {
            int op = expanded[ 0 ];
            int nop = 1;

            for ( size_t i = 1; i < expanded.size(); ++i ) {
                if ( expanded[ i ] == op )
                    ++nop;
                else {
                    packed_cigar.push_back( cigval( op, nop ) );
                    op = expanded[ i ];
                    nop = 1;
                }
            }

            packed_cigar.push_back( cigval( op, nop ) );
        }
    }

    out_bam->core.n_cigar = unsigned( packed_cigar.size() );
    out_bam->l_aux = in_bam->l_aux;

    // record layout: qname | cigar | 4-bit packed seq | qual | aux
    out_bam->data_len = (
            out_bam->core.l_qname +
            4 * out_bam->core.n_cigar +
            ( out_bam->core.l_qseq + 1 ) / 2 +
            out_bam->core.l_qseq +
            out_bam->l_aux
            );

    // assumes realloc_data() (re)allocates out_bam->data to m_data bytes, as
    // its use right after setting m_data implies -- TODO confirm
    out_bam->m_data = out_bam->data_len;
    out_bam->data = NULL;

    realloc_data( out_bam );

    if ( !out_bam->data ) {
        cerr << "memory allocation error" << endl;
        exit( 1 );
    }

    // copy the read name
    memcpy( bam1_qname( out_bam ), bam1_qname( in_bam ), in_bam->core.l_qname );

    // write the packed cigar string
    for ( size_t i = 0; i < packed_cigar.size(); ++i )
        bam1_cigar( out_bam )[ i ] = packed_cigar[ i ];

    // copy the auxiliary data unchanged
    memcpy( bam1_aux( out_bam ), bam1_aux( in_bam ), in_bam->l_aux );

    // copy the sequence and quality scores
    {
        // a leading 0xFF marks "no qualities stored" for the whole read
        const bool qual_missing =
            in_bam->core.l_qseq <= 0 || bam1_qual( in_bam )[ 0 ] == 0xFF;
        int in_idx = 0, out_idx = 0;

        // start from zeroed nibbles so an odd-length sequence does not leave
        // uninitialised bits in its last byte
        memset( bam1_seq( out_bam ), 0, ( out_bam->core.l_qseq + 1 ) / 2 );

        // fill every quality byte ( not just the first ) when absent; this is
        // also a no-op when nothing was kept, so nothing is written past the
        // quality array
        if ( qual_missing )
            memset( bam1_qual( out_bam ), 0xFF, out_bam->core.l_qseq );

        for ( size_t i = 0; i < keep.size(); ++i ) {
            if ( !keep[ i ].first ) {
                // punched out: skip these bases in the input
                in_idx += keep[ i ].second;
                continue;
            }

            for ( unsigned k = 0; k < keep[ i ].second; ++k, ++in_idx, ++out_idx ) {
                bam1_seq_seti( bam1_seq( out_bam ), out_idx, bam1_seqi( bam1_seq( in_bam ), in_idx ) );
                if ( !qual_missing )
                    bam1_qual( out_bam )[ out_idx ] = bam1_qual( in_bam )[ in_idx ];
            }
        }
    }

    if ( have_first_col )
        out_bam->core.pos = first_col;

    out_bam->core.bin = bam_reg2bin( out_bam->core.pos, bam_calend( &out_bam->core, bam1_cigar( out_bam ) ) );

    if ( bam_cigar2qlen( &out_bam->core, bam1_cigar( out_bam ) ) != out_bam->core.l_qseq ) {
        cerr << "cig2qlen: " << bam_cigar2qlen( &out_bam->core, bam1_cigar( out_bam ) ) << endl;
        cerr << "l_qseq:   " << out_bam->core.l_qseq << endl;
        cerr << "invalid CIGAR string for sequence length" << endl;
        exit( 1 );
    }

    if ( !bam_validate1( NULL, out_bam ) ) {
        cerr << "record failed validation" << endl;
        exit( 1 );
    }

    return out_bam;
}
Beispiel #2
0
int main( int argc, const char * argv[] )
{
    args_t args = args_t( argc, argv );
    coverage_t coverage;
    vector< cov_t > variants;
    vector< pair< int, int > > data;

    // accumulate the data at each position in a linked list
    {
        cov_citer cit;
        bam1_t * in_bam = bam_init1();

        while ( args.bamin->next( in_bam ) ) {
            aligned_t read( in_bam );
            coverage.include( read );
        }

        for ( cit = coverage.begin(); cit != coverage.end(); ++cit ) {
            int cov = 0;

            for ( obs_citer it = cit->obs.begin(); it != cit->obs.end(); ++it )
                cov += it->second;
           
            for ( obs_citer it = cit->obs.begin(); it != cit->obs.end(); ++it )
                if ( it->second )
                    data.push_back( make_pair( cov, it->second ) );

#if 0
            obs_citer it = cit->obs.begin();
            int cov = 0, maj;

            if ( it == cit->obs.end() )
                continue;

            maj = it->second;
            cov += maj;

            for ( ++it; it != cit->obs.end(); ++it ) {
                if ( it->second > maj )
                    maj = it->second;
                cov += it->second;
            }

            data.push_back( make_pair( cov, maj ) );
#endif
        }

        bam_destroy1( in_bam );
    }

    // learn a joint multi-binomial model for the mutation rate classes
    {
        cov_iter cit;
        double lg_L, aicc, bg, lg_bg, lg_invbg;
        rateclass_t rc( data );
        vector< pair< double, double > > params;

        rc( lg_L, aicc, params );

        bg = params[ 0 ].second;
        lg_bg = log( bg );
        lg_invbg = log( 1.0 - bg );

        params_json_dump( stderr, lg_L, aicc, params );

        // cerr << "background: " << bg << endl;

        // determine which variants are above background and those which are not
        for ( cit = coverage.begin(); cit != coverage.end(); ++cit ) {
            if ( cit->op == INS )
                continue;

            int cov = 0;

            for ( obs_citer it = cit->obs.begin(); it != cit->obs.end(); ++it )
                cov += it->second;

            for ( obs_iter it = cit->obs.begin(); it != cit->obs.end(); ++it ) {
                const double p = prob_background( lg_bg, lg_invbg, cov, it->second );
                if ( p < args.cutoff ) {
                    cout << cit->col << "\t" << cov << "\t" << it->second;
                    for ( unsigned i = 0; i < it->first.size(); ++i )
                        cout << bits2nuc( it->first[ i ] );
                    cout << ":" << p << endl;
                    it->second = 1;
                }
                else {
                    it->second = 0;
                }
            }

#if 0
            variants.push_back( *cit );
#endif
        }
    }

    return 0;

    // write out the input reads, but only with "real" variants this time
    {
        bam1_t * const in_bam = bam_init1();

        if ( !args.bamin->seek0() ) {
            cerr << "unable to seek( 0 )" << endl;
            exit( 1 );
        }

        if ( !args.bamout->write_header( args.bamin->hdr ) ) {
            cerr << "error writing out BAM header" << endl;
            exit( 1 );
        }

        while ( args.bamin->next( in_bam ) ) {
            aligned_t read( in_bam );

            bam1_t * const out_bam = punchout_read( in_bam, variants, read );

            if ( !out_bam->core.l_qseq )
                continue;

            if ( !args.bamout->write( out_bam ) ) {
                cerr << "error writing out read" << endl;
                exit( 1 );
            }

            bam_destroy1( out_bam );
        }

        bam_destroy1( in_bam );
    }

    return 0;
}