//------------------------------------------------------------------------------
void CheckDuplicates( const vector< string > & input,
                      const string & infmt,
                      const CWinMaskUtil::CIdSet * ids,
                      const CWinMaskUtil::CIdSet * exclude_ids )
{
    typedef vector< string >::const_iterator input_iterator;

    dup_lookup_table table;
    CRef<CObjectManager> om(CObjectManager::GetInstance());

    for( input_iterator i( input.begin() ); i != input.end(); ++i )
    {
        Uint4 seqnum( 0 );

        for(CWinMaskUtil::CInputBioseq_CI bs_iter(*i, infmt); bs_iter; ++bs_iter)
        {
            CBioseq_Handle bsh = *bs_iter;

            if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
            {
                TSeqPos data_len = bsh.GetBioseqLength();
                if( data_len < MIN_SEQ_LENGTH )
                    continue;

                string id;
                sequence::GetId(bsh, sequence::eGetId_Best)
                    .GetSeqId()->GetLabel(&id);
                data_len -= SAMPLE_SKIP;
                tracker track( table, id );

                string index;
                CSeqVector data =
                    bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
                for( TSeqPos i = 0;  i < data_len;  ++i )
                {
                    index.erase();
                    data.GetSeqData(i, i + SAMPLE_LENGTH, index);
                    const dup_lookup_table::sample * sample( table[index] );

                    if( sample != 0 )
                        track( index, seqnum, i, sample->begin(), sample->end() );
                }

                table.add_seq_info( id, data );
                ++seqnum;
            }
        }
    }
}
Example #2
0
//-------------------------------------------------------------------------
int CWinMaskApplication::Run (void)
{
    SetDiagPostLevel( eDiag_Warning );
    CWinMaskConfig aConfig( GetArgs() );

    // Branch away immediately if the converter is called.
    //
    // if( GetArgs()["convert"].AsBoolean() ) {
    if( aConfig.AppType() == CWinMaskConfig::eConvertCounts )
    {
        if( aConfig.Output() == "-" ) {
            CWinMaskCountsConverter converter( 
                    aConfig.Input(),
                    NcbiCout,
                    aConfig.SFormat(),
                    aConfig.GetMetaData() );
            return converter();
        }
        else {
            CWinMaskCountsConverter converter( 
                    aConfig.Input(),
                    aConfig.Output(),
                    aConfig.SFormat(),
                    aConfig.GetMetaData() );
            return converter();
        }
    }

    CRef<CObjectManager> om(CObjectManager::GetInstance());
    if(aConfig.InFmt() == "seqids")
        CGBDataLoader::RegisterInObjectManager(
            *om, 0, CObjectManager::eDefault );

    // Read and validate configuration values.
    if( aConfig.AppType() == CWinMaskConfig::eComputeCounts )
    {
        if( aConfig.Output() == "-" ) {
            CWinMaskCountsGenerator cg( aConfig.Input(),
                                        NcbiCout,
                                        aConfig.InFmt(),
                                        aConfig.SFormat(),
                                        aConfig.Th(),
                                        aConfig.Mem(),
                                        aConfig.UnitSize(),
                                        aConfig.GenomeSize(),
                                        aConfig.MinScore(),
                                        aConfig.MaxScore(),
                                        aConfig.CheckDup(),
                                        aConfig.FaList(),
                                        aConfig.Ids(),
                                        aConfig.ExcludeIds(),
                                        aConfig.UseBA(),
                                        aConfig.GetMetaData() );
            cg();
        }
        else {
            CWinMaskCountsGenerator cg( aConfig.Input(),
                                        aConfig.Output(),
                                        aConfig.InFmt(),
                                        aConfig.SFormat(),
                                        aConfig.Th(),
                                        aConfig.Mem(),
                                        aConfig.UnitSize(),
                                        aConfig.GenomeSize(),
                                        aConfig.MinScore(),
                                        aConfig.MaxScore(),
                                        aConfig.CheckDup(),
                                        aConfig.FaList(),
                                        aConfig.Ids(),
                                        aConfig.ExcludeIds(),
                                        aConfig.UseBA(),
                                        aConfig.GetMetaData() );
            cg();
        }

        return 0;
    }

    if(aConfig.InFmt() == "seqids"){
        LOG_POST(Error << "windowmasker with seqids input not implemented yet");
        return 1;
    }

    CMaskReader & theReader = aConfig.Reader();
    CMaskWriter & theWriter = aConfig.Writer();
    CSeqMasker theMasker( aConfig.LStatName(),
                          aConfig.WindowSize(),
                          aConfig.WindowStep(),
                          aConfig.UnitStep(),
                          aConfig.Textend(),
                          aConfig.CutoffScore(),
                          aConfig.MaxScore(),
                          aConfig.MinScore(),
                          aConfig.SetMaxScore(),
                          aConfig.SetMinScore(),
                          aConfig.MergePass(),
                          aConfig.MergeCutoffScore(),
                          aConfig.AbsMergeCutoffDist(),
                          aConfig.MeanMergeCutoffDist(),
                          aConfig.MergeUnitStep(),
                          aConfig.Trigger(),
                          aConfig.TMin_Count(),
                          aConfig.Discontig(),
                          aConfig.Pattern(),
                          aConfig.UseBA() );
    CRef< CSeq_entry > aSeqEntry( 0 );
    Uint4 total = 0, total_masked = 0;
    CSDustMasker * duster( 0 );
    const CWinMaskConfig::CIdSet * ids( aConfig.Ids() );
    const CWinMaskConfig::CIdSet * exclude_ids( aConfig.ExcludeIds() );

    if( aConfig.AppType() == CWinMaskConfig::eGenerateMasksWithDuster )
        duster = new CSDustMasker( aConfig.DustWindow(),
                                   aConfig.DustLevel(),
                                   aConfig.DustLinker() );

    while( (aSeqEntry = theReader.GetNextSequence()).NotEmpty() )
    {
        if( aSeqEntry->Which() == CSeq_entry::e_not_set ) continue;
        CScope scope(*om);
        CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*aSeqEntry);
        Uint4 masked = 0;
        CBioseq_CI bs_iter(seh, CSeq_inst::eMol_na);
        for ( ;  bs_iter;  ++bs_iter) {
            CBioseq_Handle bsh = *bs_iter;
            if (bsh.GetBioseqLength() == 0) {
                continue;
            }

            if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
            {
                TSeqPos len = bsh.GetBioseqLength();
                total += len;
                _TRACE( "Sequence length " << len );
                CSeqVector data =
                    bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
                auto_ptr< CSeqMasker::TMaskList > mask_info( theMasker( data ) );
                CSeqMasker::TMaskList dummy;

                if( duster != 0 ) // Dust and merge with mask_info
                {
                    auto_ptr< CSeqMasker::TMaskList > dust_info( 
                        (*duster)( data, *mask_info.get() ) );
                    CSeqMasker::MergeMaskInfo( mask_info.get(), dust_info.get() );
                }

                // theWriter.Print( bsh, *mask_info, aConfig.MatchId() );
                theWriter.Print( bsh, *mask_info, GetArgs()["parse_seqids"] );

                for( CSeqMasker::TMaskList::const_iterator i = mask_info->begin();
                     i != mask_info->end(); ++i )
                    masked += i->second - i->first + 1;

                total_masked += masked;
                _TRACE( "Number of positions masked: " << masked );
            }
        }
    }

    _TRACE( "Total number of positions: " << total );
    _TRACE( "Total number of positions masked: " << total_masked );
    return 0;
}