Пример #1
0
/// Read a single FASTA record starting at the reader's current line.
///
/// The current line of @a line_reader is parsed as the defline. The lines
/// that follow are concatenated into the m_Sequence scratch buffer until
/// either a line starting with '>' or the end of input is reached. On
/// return the reader is positioned on the terminating line (the next
/// defline, or the last line of the input).
///
/// @param line_reader
///   Reader positioned on the defline of the record to read.
/// @return
///   A Seq-entry holding a raw nucleotide Bioseq with a local id, or an
///   empty CRef if no sequence characters followed the defline.
CRef<CSeq_entry>
CShortReadFastaInputSource::x_ReadFastaOneSeq(CRef<ILineReader> line_reader)
{
    // number of sequence characters copied into m_Sequence so far
    int start = 0;

    // parse the last read defline
    CTempString line = **line_reader;
    CTempString id = x_ParseDefline(line);
    CRef<CSeq_id> seqid(new CSeq_id);
    seqid->Set(CSeq_id::e_Local, id);
    ++(*line_reader);
    line = **line_reader;
    while (line[0] != '>') {

        // ignore empty lines
        if (line.empty() && !line_reader->AtEOF()) {
            ++(*line_reader);
            line = **line_reader;
            continue;
        }

        // make sure the buffer can hold this line plus a terminating NUL
        const size_t needed = start + line.length() + 1;
        if (needed > m_SeqBuffLen) {
            m_SeqBuffLen = 2 * needed;
        }
        // Grow with resize(), not reserve(): every byte touched below must
        // lie within size(), otherwise indexing the string is undefined
        // behavior. resize() keeps the characters already copied; on the
        // first chunk (start == 0) there is nothing to keep.
        if (m_Sequence.size() < needed) {
            m_Sequence.resize(m_SeqBuffLen);
        }

        // copy the sequence
        memcpy(&m_Sequence[start], line.data(), line.length());
        start += static_cast<int>(line.length());

        if (line_reader->AtEOF()) {
            break;
        }

        // read next line
        ++(*line_reader);
        line = **line_reader;
    }

    // set up sequence
    if (start > 0) {
        CRef<CSeq_entry> seq_entry(new CSeq_entry);
        CBioseq& bioseq = seq_entry->SetSeq();
        bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
        bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
        bioseq.SetId().clear();
        bioseq.SetId().push_back(seqid);
        bioseq.SetInst().SetLength(start);
        // terminate the buffer so it can be handed over as a C string
        m_Sequence[start] = 0;
        bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(&m_Sequence[0]));
        bioseq.SetDescr();

        m_BasesAdded += start;
        return seq_entry;
    }

    return CRef<CSeq_entry>();
}
Пример #2
0
/// Initialize this Seq-data from a string in the requested encoding.
///
/// @param value
///   Sequence text to store.
/// @param index
///   Choice variant to select; only e_Iupacna, e_Iupacaa and e_Ncbieaa
///   are accepted.
/// @throws CException (eUnknown) for any other choice.
void CSeq_data::DoConstruct(const string& value, E_Choice index)
{
    if (index == e_Iupacna) {
        SetIupacna() = CIUPACna(value);
        return;
    }

    if (index == e_Iupacaa) {
        SetIupacaa() = CIUPACaa(value);
        return;
    }

    if (index == e_Ncbieaa) {
        SetNcbieaa() = CNCBIeaa(value);
        return;
    }

    // none of the supported variants matched
    NCBI_THROW (CException, eUnknown,
        "CSeq_data::DoConstruct: Invalid E_Choice index");
}
Пример #3
0
CRef<CSeq_entry>
CShortReadFastaInputSource::x_ReadFastqOneSeq(CRef<ILineReader> line_reader)
{
    CTempString line;
    CTempString id;
    CRef<CSeq_entry> retval;

    // first read defline
    ++(*line_reader);
    line = **line_reader;

    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    if (line[0] != '@') {
        NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:"
                   " defline expected at line: " +
                   NStr::IntToString(line_reader->GetLineNumber()));
    }

    id = x_ParseDefline(line);
    CRef<CSeq_id> seqid(new CSeq_id);
    seqid->Set(CSeq_id::e_Local, id);

    // read sequence
    ++(*line_reader);
    line = **line_reader;
    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    // set up sequence
    if (line.length() > 0) {
        CRef<CSeq_entry> seq_entry(new CSeq_entry);
        CBioseq& bioseq = seq_entry->SetSeq();
        bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
        bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
        bioseq.SetId().clear();
        bioseq.SetId().push_back(seqid);
        bioseq.SetInst().SetLength(line.length());
        bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(line.data()));
        bioseq.SetDescr();

        m_BasesAdded += line.length();
        retval = seq_entry;
    }
    
    // read and skip second defline
    ++(*line_reader);
    line = **line_reader;
    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    if (line[0] != '+') {
        NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:"
                   " defline expected at line: " +
                   NStr::IntToString(line_reader->GetLineNumber()));
    }

    // read and skip quality scores
    ++(*line_reader);
    line = **line_reader;
    // skip empty lines
    while (!line_reader->AtEOF() && line.empty()) {
        ++(*line_reader);
        line = **line_reader;
    }

    return retval;
}
Пример #4
0
void
CShortReadFastaInputSource::x_ReadFastc(CBioseq_set& bioseq_set,
                                        TSeqPos batch_size)
{
    string id;

    // tags to indicate paired sequences
    CRef<CSeqdesc> seqdesc_first(new CSeqdesc);
    seqdesc_first->SetUser().SetType().SetStr("Mapping");
    seqdesc_first->SetUser().AddField("has_pair", eFirstSegment);

    CRef<CSeqdesc> seqdesc_last(new CSeqdesc);
    seqdesc_last->SetUser().SetType().SetStr("Mapping");
    seqdesc_last->SetUser().AddField("has_pair", eLastSegment);

    m_BasesAdded = 0;
    while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) {
        ++(*m_LineReader);
        m_Line = **m_LineReader;

        // ignore empty lines
        if (m_Line.empty()) {
            continue;
        }

        // if defline
        if (m_Line[0] == '>') {
            id = x_ParseDefline(m_Line);
        }
        else {
            // otherwise sequence

            // make sure that a defline was read first
            if (id.empty()) {
                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"Missing defline before line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // find '><' that separate reads of a pair
            size_t p = m_Line.find('>');
            if (p == CTempString::npos || m_Line[p + 1] != '<') {

                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"FASTC parse error: Sequence separator '><'"
                           " was not found in line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // set up reads, there are two sequences in the same line separated
            char* first = (char*)m_Line.data();
            char* second = (char*)m_Line.data() + p + 2;
            size_t first_len = p;
            size_t second_len = m_Line.length() - p - 2;

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".1");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(first_len);
                first[first_len] = 0;
                bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(first));
                bioseq.SetDescr().Set().push_back(seqdesc_first);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".2");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(second_len);
                second[second_len] = 0;
                bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(second));
                bioseq.SetDescr().Set().push_back(seqdesc_last);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}
            m_BasesAdded += first_len + second_len;
            id.clear();
        }
    }
}
Пример #5
0
//
// CreateConsensus()
//
// Builds a consensus sequence for this alignment and returns a new
// Dense-seg that contains all original rows plus the consensus as an
// additional, last row.
//
// The per-segment consensus strings are produced by the
// CreateConsensus(vector<string>&) overload.  A segment whose consensus
// string is empty is recorded as a gap (start == -1) in the consensus row.
//
//   consensus_row  - receives the index of the consensus row in the
//                    returned Dense-seg
//   consensus_seq  - reset, then filled with a raw Bioseq holding the
//                    concatenated consensus data
//   consensus_id   - id assigned to the consensus sequence
//
// Returns an empty CRef when there is no Dense-seg or no rows.
//
CRef<CDense_seg>
CAlnVec::CreateConsensus(int& consensus_row, CBioseq& consensus_seq,
                         const CSeq_id& consensus_id) const
{
    consensus_seq.Reset();
    if ( !m_DS || m_NumRows < 1) {
        return CRef<CDense_seg>();
    }

    const bool is_na = GetBioseqHandle(0).IsNucleotide();
    const bool has_strands = !m_Strands.empty();

    // one consensus string per alignment segment
    vector<string> seg_consensus(m_NumSegs);
    CreateConsensus(seg_consensus);

    // the result has the same segments as the source plus one more row
    CRef<CDense_seg> result(new CDense_seg());
    result->SetDim(m_NumRows + 1);
    result->SetNumseg(m_NumSegs);
    result->SetLens() = m_Lens;
    result->SetStarts().reserve(m_Starts.size() + m_NumSegs);
    if (has_strands) {
        result->SetStrands().reserve(m_Strands.size() + m_NumSegs);
    }

    string consensus_data;
    TSignedSeqPos next_start = 0;

    for (size_t seg = 0;  seg < seg_consensus.size();  ++seg) {
        // carry over the existing rows of this segment
        for (size_t row = 0;  row < (size_t)m_NumRows;  ++row) {
            int src = seg * m_NumRows + row;
            result->SetStarts().push_back(m_Starts[src]);
            if (has_strands) {
                result->SetStrands().push_back(m_Strands[src]);
            }
        }

        // Append the consensus as the last row of the segment.  Putting it
        // first would be preferable, but that would shift every row number
        // and require adjusting the bioseq handle and seqvector caches.
        const string& seg_seq = seg_consensus[seg];
        result->SetStarts().push_back(seg_seq.empty() ? -1 : next_start);
        if (has_strands) {
            result->SetStrands().push_back(eNa_strand_unknown);
        }

        next_start += seg_seq.length();
        consensus_data += seg_seq;
    }

    // ids of the original rows
    for (size_t k = 0;  k < m_Ids.size();  ++k) {
        result->SetIds().push_back(m_Ids[k]);
    }

    // id of the consensus row; the same object goes into the Bioseq
    CRef<CSeq_id> cons_id(new CSeq_id());
    cons_id->Assign(consensus_id);
    consensus_seq.SetId().push_back(cons_id);
    result->SetIds().push_back(cons_id);

    // descriptor marking the sequence as generated
    CRef<CSeqdesc> comment(new CSeqdesc);
    consensus_seq.SetDescr().Set().push_back(comment);
    comment->SetComment("This is a generated consensus sequence");

    // Seq-inst with the concatenated consensus data
    CSeq_inst& inst = consensus_seq.SetInst();
    inst.SetRepr(CSeq_inst::eRepr_raw);
    inst.SetMol(is_na ? CSeq_inst::eMol_na : CSeq_inst::eMol_aa);
    inst.SetLength(consensus_data.length());

    CSeq_data& seq_data = inst.SetSeq_data();
    if (is_na) {
        seq_data.SetIupacna() = CIUPACna(consensus_data);
    } else {
        seq_data.SetIupacaa() = CIUPACaa(consensus_data);
    }

    consensus_row = result->GetIds().size() - 1;
    return result;
}