Beispiel #1
0
CRef<CSeq_entry> CAlnReader::GetSeqEntry()
{
    if (m_Entry) {
        return m_Entry;
    } else if ( !m_ReadDone ) {
        NCBI_THROW2(CObjReaderParseException, eFormat,
                   "CAlnReader::GetSeqEntry(): "
                   "Seq_entry is not available until after Read()", 0);
    }
    m_Entry = new CSeq_entry();
    CRef<CSeq_annot> seq_annot (new CSeq_annot);
    seq_annot->SetData().SetAlign().push_back(GetSeqAlign());

    m_Entry->SetSet().SetClass(CBioseq_set::eClass_pop_set);
    m_Entry->SetSet().SetAnnot().push_back(seq_annot);

    CBioseq_set::TSeq_set& seq_set = m_Entry->SetSet().SetSeq_set();

    typedef CDense_seg::TDim TNumrow;
    for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
        const string& seq_str     = m_SeqVec[row_i];
        const size_t& seq_str_len = seq_str.size();

        CRef<CSeq_entry> seq_entry (new CSeq_entry);

        // seq-id(s)
        CBioseq::TId& ids = seq_entry->SetSeq().SetId();
        CSeq_id::ParseFastaIds(ids, m_Ids[row_i], true);
        if (ids.empty()) {
            ids.push_back(CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local,
                                                    m_Ids[row_i])));
        }

        // mol
        CSeq_inst::EMol mol   = CSeq_inst::eMol_not_set;
        CSeq_id::EAccessionInfo ai = ids.front()->IdentifyAccession();
        if (ai & CSeq_id::fAcc_nuc) {
            mol = CSeq_inst::eMol_na;
        } else if (ai & CSeq_id::fAcc_prot) {
            mol = CSeq_inst::eMol_aa;
        } else {
            switch (CFormatGuess::SequenceType(seq_str.data(), seq_str_len)) {
            case CFormatGuess::eNucleotide:  mol = CSeq_inst::eMol_na;  break;
            case CFormatGuess::eProtein:     mol = CSeq_inst::eMol_aa;  break;
            default:                         break;
            }
        }

        // seq-inst
        CRef<CSeq_inst> seq_inst (new CSeq_inst);
        seq_entry->SetSeq().SetInst(*seq_inst);
        seq_set.push_back(seq_entry);

        // repr
        seq_inst->SetRepr(CSeq_inst::eRepr_raw);

        // mol
        seq_inst->SetMol(mol);

        // len
        _ASSERT(seq_str_len == m_SeqLen[row_i]);
        seq_inst->SetLength(seq_str_len);

        // data
        CSeq_data& data = seq_inst->SetSeq_data();
        if (mol == CSeq_inst::eMol_aa) {
            data.SetIupacaa().Set(seq_str);
        } else {
            data.SetIupacna().Set(seq_str);
            CSeqportUtil::Pack(&data);
        }

    }
    
    
    return m_Entry;
}
int CLocalFinderApp::Run(void)
{
    CArgs myargs = GetArgs();

    int left            = myargs["from"].AsInteger();
    int right           = myargs["to"].AsInteger();
    bool repeats        = myargs["rep"];


    //
    // read our sequence data
    //
    CFastaReader fastareader(myargs["input"].AsString());
    CRef<CSeq_loc> masked_regions;
    masked_regions = fastareader.SaveMask();
    CRef<CSeq_entry> se = fastareader.ReadOneSeq();
    
    if(masked_regions) {
        CBioseq& bioseq = se->SetSeq();     // assumes that reader gets only one sequence per fasta id (no [] in file)
        CRef<CSeq_annot> seq_annot(new CSeq_annot);
        seq_annot->SetNameDesc("NCBI-FASTA-Lowercase");
        bioseq.SetAnnot().push_back(seq_annot);
        CSeq_annot::C_Data::TFtable* feature_table = &seq_annot->SetData().SetFtable();
        for(CSeq_loc_CI i(*masked_regions); i; ++i) {
            CRef<CSeq_feat> repeat(new CSeq_feat);
            CRef<CSeq_id> id(new CSeq_id);
            id->Assign(i.GetSeq_id());
            CRef<CSeq_loc> loc(new CSeq_loc(*id, i.GetRange().GetFrom(), i.GetRange().GetTo()));
            repeat->SetLocation(*loc);
            repeat->SetData().SetImp().SetKey("repeat_region");
            feature_table->push_back(repeat);
        }
    }

    CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
    CScope scope(*objmgr);
    scope.AddTopLevelSeqEntry(*se);       

    CRef<CSeq_id> cntg(new CSeq_id);
    cntg->Assign(*se->GetSeq().GetFirstId());
    CSeq_loc loc;
    loc.SetWhole(*cntg);
    CSeqVector vec(loc, scope);
    vec.SetIupacCoding();

    CResidueVec seq;
    ITERATE(CSeqVector,i,vec)
        seq.push_back(*i);

    // read the alignment information
    TGeneModelList alignments;
    if(myargs["align"]) {
        CNcbiIstream& alignmentfile = myargs["align"].AsInputFile();
        string our_contig = cntg->GetSeqIdString(true);
        string cur_contig; 
        CAlignModel algn;
        
        while(alignmentfile >> algn >> getcontig(cur_contig)) {
            if (cur_contig==our_contig)
                alignments.push_back(algn);
        }
    }

    // create engine
    CRef<CHMMParameters> hmm_params(new CHMMParameters(myargs["model"].AsInputFile()));
    CGnomonEngine gnomon(hmm_params, seq, TSignedSeqRange(left, right));

    // run!
    gnomon.Run(alignments, repeats, true, true, false, false, 10.0);

    // dump the annotation
    CRef<CSeq_annot> annot = gnomon.GetAnnot(*cntg);
    auto_ptr<CObjectOStream> os(CObjectOStream::Open(eSerial_AsnText, cout));
    *os << *annot;

    return 0;

}