Ejemplo n.º 1
0
/// Read one FASTA record from a line reader and return it as a Seq-entry.
///
/// On entry the reader's current line is taken to be the defline of the
/// record. The following lines are concatenated into the member scratch
/// buffer m_Sequence until a line starting with '>' or the end of input is
/// reached; on return the reader is left on that line. m_BasesAdded is
/// increased by the number of residues read.
///
/// @param line_reader
///   Source of input lines, positioned on a defline.
/// @return
///   A raw nucleotide Bioseq with a local id parsed from the defline, or an
///   empty CRef when the record contains no sequence characters.
CRef<CSeq_entry>
CShortReadFastaInputSource::x_ReadFastaOneSeq(CRef<ILineReader> line_reader)
{
    int start = 0;

    // The code below addresses m_Sequence by index up to m_SeqBuffLen, so the
    // string must actually have that many characters (capacity alone is not
    // enough: writing past size() is undefined behaviour).
    if (m_Sequence.size() < m_SeqBuffLen) {
        m_Sequence.resize(m_SeqBuffLen);
    }

    // parse the last read defline
    CTempString line = **line_reader;
    CTempString id = x_ParseDefline(line);
    CRef<CSeq_id> seqid(new CSeq_id);
    seqid->Set(CSeq_id::e_Local, id);
    ++(*line_reader);
    line = **line_reader;
    while (line[0] != '>') {

        // ignore empty lines
        if (line.empty() && !line_reader->AtEOF()) {
            ++(*line_reader);
            line = **line_reader;
            continue;
        }

        // copy the sequence
        // increase the sequence buffer if necessary; the extra character
        // leaves room for the terminating zero written further down
        if (start + line.length() + 1 > m_SeqBuffLen) {
            m_SeqBuffLen = 2 * (start + line.length() + 1);
            // resize() keeps the characters collected so far and gives the
            // buffer a real size, unlike reserve() followed by raw writes
            m_Sequence.resize(m_SeqBuffLen);
        }
        memcpy(&m_Sequence[start], line.data(), line.length());
        start += line.length();

        if (line_reader->AtEOF()) {
            break;
        }

        // read next line
        ++(*line_reader);
        line = **line_reader;
    }

    // set up sequence
    if (start > 0) {
        CRef<CSeq_entry> seq_entry(new CSeq_entry);
        CBioseq& bioseq = seq_entry->SetSeq();
        bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
        bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
        bioseq.SetId().clear();
        bioseq.SetId().push_back(seqid);
        bioseq.SetInst().SetLength(start);
        // terminate the collected residues so they can be read as a C string
        m_Sequence[start] = 0;
        bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(&m_Sequence[0]));
        bioseq.SetDescr();

        m_BasesAdded += start;
        return seq_entry;
    }

    return CRef<CSeq_entry>();
}
Ejemplo n.º 2
0
/// Return the Seq-entry for the alignment, building and caching it on the
/// first call.
///
/// The entry is a Bioseq-set of class pop-set that carries the alignment
/// (from GetSeqAlign()) as an annotation and one raw Bioseq per alignment
/// row.
///
/// @return
///   The cached or newly built entry (stored in m_Entry).
/// @throw CObjReaderParseException
///   With code eFormat when no entry is cached and m_ReadDone is not set.
CRef<CSeq_entry> CAlnReader::GetSeqEntry()
{
    // A previously assembled entry is handed out as is.
    if (m_Entry) {
        return m_Entry;
    }
    if ( !m_ReadDone ) {
        NCBI_THROW2(CObjReaderParseException, eFormat,
                   "CAlnReader::GetSeqEntry(): "
                   "Seq_entry is not available until after Read()", 0);
    }

    m_Entry = new CSeq_entry();
    CRef<CSeq_annot> align_annot(new CSeq_annot);
    align_annot->SetData().SetAlign().push_back(GetSeqAlign());

    CBioseq_set& pop_set = m_Entry->SetSet();
    pop_set.SetClass(CBioseq_set::eClass_pop_set);
    pop_set.SetAnnot().push_back(align_annot);
    CBioseq_set::TSeq_set& members = pop_set.SetSeq_set();

    typedef CDense_seg::TDim TRow;
    for (TRow row = 0; row < m_Dim; ++row) {
        const string& residues      = m_SeqVec[row];
        const size_t  residue_count = residues.size();

        CRef<CSeq_entry> member(new CSeq_entry);
        CBioseq& bioseq = member->SetSeq();

        // Ids: parse the stored id string; fall back to a local id when
        // nothing could be parsed from it.
        CBioseq::TId& ids = bioseq.SetId();
        CSeq_id::ParseFastaIds(ids, m_Ids[row], true);
        if (ids.empty()) {
            CRef<CSeq_id> local_id(new CSeq_id(CSeq_id::e_Local, m_Ids[row]));
            ids.push_back(local_id);
        }

        // Molecule type: the accession flags of the first id decide; when
        // they say neither nucleotide nor protein, guess from the residues.
        CSeq_inst::EMol mol = CSeq_inst::eMol_not_set;
        const CSeq_id::EAccessionInfo acc_info =
            ids.front()->IdentifyAccession();
        if (acc_info & CSeq_id::fAcc_nuc) {
            mol = CSeq_inst::eMol_na;
        } else if (acc_info & CSeq_id::fAcc_prot) {
            mol = CSeq_inst::eMol_aa;
        } else {
            switch (CFormatGuess::SequenceType(residues.data(),
                                               residue_count)) {
            case CFormatGuess::eNucleotide:
                mol = CSeq_inst::eMol_na;
                break;
            case CFormatGuess::eProtein:
                mol = CSeq_inst::eMol_aa;
                break;
            default:
                break;
            }
        }

        // Attach the instance and register the Bioseq in the set before the
        // instance fields are filled in.
        CRef<CSeq_inst> inst(new CSeq_inst);
        bioseq.SetInst(*inst);
        members.push_back(member);

        inst->SetRepr(CSeq_inst::eRepr_raw);
        inst->SetMol(mol);

        _ASSERT(residue_count == m_SeqLen[row]);
        inst->SetLength(residue_count);

        // Residues: everything that is not protein is stored as IUPACna and
        // then packed.
        CSeq_data& data = inst->SetSeq_data();
        if (mol != CSeq_inst::eMol_aa) {
            data.SetIupacna().Set(residues);
            CSeqportUtil::Pack(&data);
        } else {
            data.SetIupacaa().Set(residues);
        }
    }

    return m_Entry;
}
Ejemplo n.º 3
0
/// Read one FASTQ record (four logical lines) and return its sequence as a
/// Seq-entry.
///
/// The reader is advanced before each of the four parts of the record is
/// examined, and empty lines in front of each part are skipped: the '@'
/// defline, the sequence line, the '+' separator line and the quality line.
/// Quality scores are read and discarded. m_BasesAdded is increased by the
/// length of the sequence line.
///
/// @param line_reader
///   Source of input lines; it is advanced to the quality line of the record.
/// @return
///   A raw nucleotide Bioseq with a local id parsed from the defline, or an
///   empty CRef when the sequence line is empty.
/// @throw CInputException
///   With code eInvalidInput when the '@' or the '+' line is not found where
///   expected.
CRef<CSeq_entry>
CShortReadFastaInputSource::x_ReadFastqOneSeq(CRef<ILineReader> line_reader)
{
    CTempString line;
    CTempString id;
    CRef<CSeq_entry> retval;

    // first read defline
    ++(*line_reader);
    line = **line_reader;

    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    if (line[0] != '@') {
        NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:"
                   " defline expected at line: " +
                   NStr::IntToString(line_reader->GetLineNumber()));
    }

    // the id is copied into the Seq-id here, before the reader moves on
    id = x_ParseDefline(line);
    CRef<CSeq_id> seqid(new CSeq_id);
    seqid->Set(CSeq_id::e_Local, id);

    // read sequence
    ++(*line_reader);
    line = **line_reader;
    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    // set up sequence
    if (line.length() > 0) {
        CRef<CSeq_entry> seq_entry(new CSeq_entry);
        CBioseq& bioseq = seq_entry->SetSeq();
        bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
        bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
        bioseq.SetId().clear();
        bioseq.SetId().push_back(seqid);
        bioseq.SetInst().SetLength(line.length());
        // A CTempString is a (pointer, length) view and need not be
        // zero-terminated, so copy exactly line.length() characters instead
        // of treating data() as a C string.
        bioseq.SetInst().SetSeq_data().SetIupacna(
            CIUPACna(string(line.data(), line.length())));
        bioseq.SetDescr();

        m_BasesAdded += line.length();
        retval = seq_entry;
    }
    
    // read and skip second defline
    ++(*line_reader);
    line = **line_reader;
    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    if (line[0] != '+') {
        NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:"
                   " defline expected at line: " +
                   NStr::IntToString(line_reader->GetLineNumber()));
    }

    // read and skip quality scores
    ++(*line_reader);
    line = **line_reader;
    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    return retval;
}
Ejemplo n.º 4
0
/// Read paired reads in FASTC format into a Bioseq-set.
///
/// Each record is a '>' defline followed by one line holding both reads of a
/// pair separated by "><". For every record two raw nucleotide Bioseqs are
/// appended to bioseq_set, with local ids "<id>.1" and "<id>.2", tagged with
/// a "Mapping" user object whose "has_pair" field is eFirstSegment or
/// eLastSegment respectively. Reading stops once m_BasesAdded reaches
/// batch_size or the input is exhausted.
///
/// @param bioseq_set
///   Receives the sequences that were read.
/// @param batch_size
///   Number of bases after which no further records are read.
/// @throw CInputException
///   With code eInvalidInput when a sequence line is not preceded by a
///   defline or does not contain the "><" separator.
void
CShortReadFastaInputSource::x_ReadFastc(CBioseq_set& bioseq_set,
                                        TSeqPos batch_size)
{
    string id;

    // tags to indicate paired sequences; the same two descriptor objects are
    // shared by all sequences added in this call
    CRef<CSeqdesc> seqdesc_first(new CSeqdesc);
    seqdesc_first->SetUser().SetType().SetStr("Mapping");
    seqdesc_first->SetUser().AddField("has_pair", eFirstSegment);

    CRef<CSeqdesc> seqdesc_last(new CSeqdesc);
    seqdesc_last->SetUser().SetType().SetStr("Mapping");
    seqdesc_last->SetUser().AddField("has_pair", eLastSegment);

    m_BasesAdded = 0;
    while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) {
        ++(*m_LineReader);
        m_Line = **m_LineReader;

        // ignore empty lines
        if (m_Line.empty()) {
            continue;
        }

        // if defline
        if (m_Line[0] == '>') {
            id = x_ParseDefline(m_Line);
        }
        else {
            // otherwise sequence

            // make sure that a defline was read first
            if (id.empty()) {
                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"Missing defline before line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // find '><' that separate reads of a pair
            size_t p = m_Line.find('>');
            if (p == CTempString::npos || m_Line[p + 1] != '<') {

                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"FASTC parse error: Sequence separator '><'"
                           " was not found in line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // set up reads, there are two sequences in the same line separated
            // by '><'. The reads are copied out with explicit lengths, so the
            // line buffer is neither modified nor required to be
            // zero-terminated.
            const char* first = m_Line.data();
            const char* second = m_Line.data() + p + 2;
            const size_t first_len = p;
            const size_t second_len = m_Line.length() - p - 2;

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".1");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(first_len);
                bioseq.SetInst().SetSeq_data().SetIupacna(
                    CIUPACna(string(first, first_len)));
                bioseq.SetDescr().Set().push_back(seqdesc_first);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".2");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(second_len);
                bioseq.SetInst().SetSeq_data().SetIupacna(
                    CIUPACna(string(second, second_len)));
                bioseq.SetDescr().Set().push_back(seqdesc_last);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}
            m_BasesAdded += first_len + second_len;
            // a new defline is required before the next sequence line
            id.clear();
        }
    }
}
Ejemplo n.º 5
0
/// Read the next sequence from the input and describe it as a Seq-loc
/// interval.
///
/// The sequence is read with m_InputReader->ReadOneSeq() and added to the
/// scope as a top level entry. The returned location carries the strand and
/// range taken from m_Config (see the inline comments for the defaults) and
/// the best ranked id of the sequence.
///
/// @param lcase_mask
///   When m_Config.GetLowercaseMask() is set, receives the mask object
///   obtained from m_InputReader->SaveMask(); if it is set to something, its
///   strand is forced to plus.
/// @param scope
///   Scope the sequence is added to; when the Bioseq has no raw sequence
///   data, the query scope source's data loaders are added to it as well.
/// @return
///   Interval location for the sequence that was read.
/// @throw CInputException
///   eSequenceMismatch when the molecule type does not match
///   m_ReadProteins, eInvalidStrand when a nucleotide strand is configured
///   for protein input, eInvalidRange when the configured range is
///   inconsistent with itself or with the sequence length.
CRef<CSeq_loc>
CBlastFastaInputSource::x_FastaToSeqLoc(CRef<objects::CSeq_loc>& lcase_mask,
                                        CScope& scope)
{
    static const TSeqRange kEmptyRange(TSeqRange::GetEmpty());

    // SaveMask() is called before the sequence is read
    if (m_Config.GetLowercaseMask())
        lcase_mask = m_InputReader->SaveMask();

    CRef<CSeq_entry> seq_entry(m_InputReader->ReadOneSeq());
    if (lcase_mask) {
        if (lcase_mask->Which() != CSeq_loc::e_not_set) {
            lcase_mask->SetStrand(eNa_strand_plus);
        }
        _ASSERT(lcase_mask->GetStrand() == eNa_strand_plus ||
                lcase_mask->GetStrand() == eNa_strand_unknown);
    }
    _ASSERT(seq_entry.NotEmpty());
    scope.AddTopLevelSeqEntry(*seq_entry);

    CTypeConstIterator<CBioseq> itr(ConstBegin(*seq_entry));

    CRef<CSeq_loc> retval(new CSeq_loc());

    if ( !blast::HasRawSequenceData(*itr) ) {
        // NOTE(review): the cast result is only checked by _ASSERT, which is
        // presumably compiled out in release builds - confirm that
        // m_InputReader is always a CBlastInputReader on this path.
        CBlastInputReader* blast_reader = 
            dynamic_cast<CBlastInputReader*>(m_InputReader.get());
        _ASSERT(blast_reader);
        CRef<CBlastScopeSource> query_scope_source =
            blast_reader->GetQueryScopeSource();
        query_scope_source->AddDataLoaders(CRef<CScope>(&scope));
    }

    if (m_ReadProteins && itr->IsNa()) {
        NCBI_THROW(CInputException, eSequenceMismatch,
                   "Nucleotide FASTA provided for protein sequence");
    } else if ( !m_ReadProteins && itr->IsAa() ) {
        NCBI_THROW(CInputException, eSequenceMismatch,
                   "Protein FASTA provided for nucleotide sequence");
    }

    // set strand: with no specific strand configured, proteins get
    // "unknown" and nucleotides get "both"
    if (m_Config.GetStrand() == eNa_strand_other ||
        m_Config.GetStrand() == eNa_strand_unknown) {
        if (m_ReadProteins)
            retval->SetInt().SetStrand(eNa_strand_unknown);
        else
            retval->SetInt().SetStrand(eNa_strand_both);
    } else {
        if (m_ReadProteins) {
            NCBI_THROW(CInputException, eInvalidStrand,
                       "Cannot assign nucleotide strand to protein sequence");
        }
        retval->SetInt().SetStrand(m_Config.GetStrand());
    }

    // sanity checks for the range; coordinates equal to those of the empty
    // range are treated as 0
    const TSeqPos from = m_Config.GetRange().GetFrom() == kEmptyRange.GetFrom()
        ? 0 : m_Config.GetRange().GetFrom();
    const TSeqPos to = m_Config.GetRange().GetTo() == kEmptyRange.GetTo()
        ? 0 : m_Config.GetRange().GetTo();

    // Get the sequence length
    const TSeqPos seqlen = seq_entry->GetSeq().GetInst().GetLength();
    //if (seqlen == 0) {
    //    NCBI_THROW(CInputException, eEmptyUserInput,
    //               "Query contains no sequence data");
    //}
    _ASSERT(seqlen != numeric_limits<TSeqPos>::max());
    if (to > 0 && to < from) {
        NCBI_THROW(CInputException, eInvalidRange, 
                   "Invalid sequence range");
    }
    if (from > seqlen) {
        NCBI_THROW(CInputException, eInvalidRange, 
                   "Invalid from coordinate (greater than sequence length)");
    }
    // N.B.: if the to coordinate is greater than or equal to the sequence
    // length, we fix that silently


    // set sequence range
    // NOTE(review): with the zero-length check above disabled, seqlen == 0
    // makes (seqlen-1) wrap around if TSeqPos is unsigned - confirm that
    // callers cope with such a location.
    retval->SetInt().SetFrom(from);
    retval->SetInt().SetTo((to > 0 && to < seqlen) ? to : (seqlen-1));

    // set ID
    retval->SetInt().SetId().Assign(*FindBestChoice(itr->GetId(), CSeq_id::BestRank));

    return retval;
}
Ejemplo n.º 6
0
/// Create protein products for the coding regions of a feature table.
///
/// Every Cdregion feature in the annotation that has no product yet is
/// translated; the translation becomes a new protein Bioseq with a local id
/// of the form "tpNNNNN" (number taken from sm_Counter), and the feature's
/// product is set to point at that Bioseq.
///
/// @param annot
///   Annotation whose features are examined and updated in place.
/// @param handle
///   Its scope is used for the translation.
/// @return
///   Bioseq-set holding the newly created protein sequences. It is empty
///   (but not null) when the annotation is not a feature table.
CRef<CBioseq_set> CMakeCdrProds::MakeCdrProds(CRef<CSeq_annot> annot,
                                              CBioseq_Handle handle)
{
    CRef<CBioseq_set> products(new CBioseq_set);

    // Anything other than a feature table yields an empty set rather than
    // an error or a null reference.
    if ( !annot->GetData().IsFtable() ) {
        return products;
    }

    typedef list<CRef<CSeq_feat> > TFeatures;
    TFeatures& features = annot->SetData().SetFtable();

    for (TFeatures::iterator it = features.begin();
         it != features.end();  ++it) {
        CSeq_feat& cds = **it;

        // Only coding regions that do not have a product yet are processed.
        if ( !cds.GetData().IsCdregion()  ||  cds.IsSetProduct() ) {
            continue;
        }

        // Translate and wrap the residues in a raw protein instance.
        string protein;
        CSeqTranslator::Translate(cds, handle.GetScope(), protein);
        CRef<CSeq_data> residues(new CSeq_data(protein,
                                               CSeq_data::e_Iupacaa));
        CRef<CSeq_inst> inst(new CSeq_inst);
        inst->SetSeq_data(*residues);
        inst->SetRepr(CSeq_inst_Base::eRepr_raw);
        inst->SetMol(CSeq_inst_Base::eMol_aa);
        inst->SetLength(protein.size());

        CRef<CBioseq> protein_seq(new CBioseq);

        // Accession: "tp" followed by the counter value, left-padded with
        // zeros to at least five digits.
        string serial = NStr::NumericToString(sm_Counter.Add(1));
        if (serial.size() < 5) {
            serial = string(5 - serial.size(), '0') + serial;
        }
        const string accession = "tp" + serial;
        const string fasta_id  = "lcl|" + accession;
        CRef<CSeq_id> product_id(new CSeq_id(fasta_id));
        protein_seq->SetId().push_back(product_id);

        // Descriptors: title first, then the molecule type.
        CRef<CSeqdesc> title_desc(new CSeqdesc);
        title_desc->SetTitle("Translation product " + accession);
        protein_seq->SetDescr().Set().push_back(title_desc);

        CRef<CSeqdesc> moltype_desc(new CSeqdesc);
        moltype_desc->SetMol_type(eGIBB_mol_peptide);
        protein_seq->SetDescr().Set().push_back(moltype_desc);

        protein_seq->SetInst(*inst);

        // Wrap the Bioseq in an entry and add it to the result.
        CRef<CSeq_entry> wrapper(new CSeq_entry);
        wrapper->SetSeq(*protein_seq);
        products->SetSeq_set().push_back(wrapper);

        // Finally point the feature at its new product.
        CRef<CSeq_loc> product_loc(new CSeq_loc);
        product_loc->SetWhole(*product_id);
        cds.SetProduct(*product_loc);
    }

    return products;
}
Ejemplo n.º 7
0
/// Build a Seq-entry for one spot of the run.
///
/// Each biological read of non-zero length becomes a raw nucleotide Bioseq
/// with a general id (db "SRA", tag "<accession>.<spot>.<read number>"), the
/// spot name as title and a "Phred Quality" byte graph. When m_Trim is set,
/// reads are clipped to the trim start and to the spot's clip_qual_right,
/// and reads that become empty are dropped.
///
/// @param spot_id
///   Spot to fetch.
/// @return
///   Null when no read qualifies, the single Bioseq entry when exactly one
///   does, otherwise a Bioseq-set (level 0, class other) holding all of them.
CRef<CSeq_entry> CSraRun::GetSpotEntry(spotid_t spot_id) const
{
    CSraStringValue name(m_Name, spot_id);

    CRef<CSeq_entry> entry(new CSeq_entry());
    CBioseq_set& spot_set = entry->SetSet();
    spot_set.SetLevel(0);
    spot_set.SetClass(CBioseq_set::eClass_other);

    CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);
    TSeqPos trim_start = 0;
    if ( m_Trim && m_TrimStart ) {
        trim_start =
            CSraValueFor<INSDC_coord_zero>(m_TrimStart, spot_id).Value();
    }
    TSeqPos trim_end = sdesc->clip_qual_right;

    CSraValueFor<SRAReadDesc> rdesc(m_RDesc, spot_id);
    CSraStringValue read(m_Read, spot_id);
    CSraBytesValue qual(m_Qual, spot_id);

    const string id_prefix =
        GetAccession() + '.' + NStr::UIntToString(spot_id) + '.';
    int added = 0;

    for ( int r = 0; r < sdesc->num_reads; ++r ) {
        // Only biological reads that actually contain bases are of interest.
        if ( rdesc[r].type != SRA_READ_TYPE_BIOLOGICAL  ||
             rdesc[r].seg.len == 0 ) {
            continue;
        }

        TSeqPos len   = rdesc[r].seg.len;
        TSeqPos start = rdesc[r].seg.start;
        TSeqPos end   = start + len;
        if ( m_Trim ) {
            // Clip to the trimmed region; skip reads left with nothing.
            start = max(start, trim_start);
            end   = min(end, trim_end);
            if ( start >= end ) {
                continue;
            }
            len = end - start;
        }

        CRef<CSeq_entry> read_entry(new CSeq_entry);
        CBioseq& bioseq = read_entry->SetSeq();

        // Id: general "SRA" id, reads are numbered from 1.
        CRef<CSeq_id> read_id(new CSeq_id);
        CDbtag& dbtag = read_id->SetGeneral();
        dbtag.SetDb("SRA");
        dbtag.SetTag().SetStr(id_prefix + NStr::UIntToString(r+1));
        bioseq.SetId().push_back(read_id);

        // Title descriptor carrying the spot name.
        CRef<CSeqdesc> title(new CSeqdesc);
        title->SetTitle(name.Value());
        bioseq.SetDescr().Set().push_back(title);

        // Sequence data.
        CSeq_inst& inst = bioseq.SetInst();
        inst.SetRepr(CSeq_inst::eRepr_raw);
        inst.SetMol(CSeq_inst::eMol_na);
        inst.SetLength(len);
        inst.SetSeq_data().SetIupacna().Set().assign(read.data()+start, len);

        // Quality graph covering the whole read.
        CRef<CSeq_annot> annot(new CSeq_annot);
        CRef<CSeq_graph> graph(new CSeq_graph);
        annot->SetData().SetGraph().push_back(graph);
        graph->SetTitle("Phred Quality");
        graph->SetLoc().SetWhole(*read_id);
        graph->SetNumval(len);
        CByte_graph& bytes = graph->SetGraph().SetByte();
        bytes.SetAxis(0);
        CByte_graph::TValues& values = bytes.SetValues();
        values.reserve(len);
        int lowest  = kMax_Int;
        int highest = kMin_Int;
        for ( size_t i = 0; i < len; ++i ) {
            int score = qual[start+i];
            values.push_back(score);
            if ( score < lowest ) {
                lowest = score;
            }
            if ( score > highest ) {
                highest = score;
            }
        }
        bytes.SetMin(lowest);
        bytes.SetMax(highest);
        bioseq.SetAnnot().push_back(annot);

        spot_set.SetSeq_set().push_back(read_entry);
        ++added;
    }

    // Collapse the result: nothing -> null, a single read -> that read's
    // entry, several reads -> the set itself.
    if ( added == 0 ) {
        entry.Reset();
    }
    else if ( added == 1 ) {
        entry = spot_set.GetSeq_set().front();
    }
    return entry;
}