/// Read a single FASTA record from a line reader.
///
/// The reader's current line is treated as the record's defline and passed
/// to x_ParseDefline(). The following lines are concatenated into the
/// m_Sequence buffer until a line starting with '>' is found or the reader
/// reports end of input; the reader is left on that line.
///
/// @param line_reader
///   Line reader whose current line is the defline of the record to read
/// @return
///   A Seq-entry holding a raw nucleotide Bioseq (IUPACna, local id), or an
///   empty CRef if no sequence characters followed the defline
CRef<CSeq_entry>
CShortReadFastaInputSource::x_ReadFastaOneSeq(CRef<ILineReader> line_reader)
{
    // number of sequence characters collected so far
    size_t start = 0;

    // parse the last read defline
    CTempString line = **line_reader;
    CTempString id = x_ParseDefline(line);
    CRef<CSeq_id> seqid(new CSeq_id);
    seqid->Set(CSeq_id::e_Local, id);

    ++(*line_reader);
    line = **line_reader;
    while (line.empty() || line[0] != '>') {

        // ignore empty lines
        if (line.empty() && !line_reader->AtEOF()) {
            ++(*line_reader);
            line = **line_reader;
            continue;
        }

        // copy the sequence
        // increase the sequence buffer if necessary; one extra byte is kept
        // for the terminating NUL written after the loop
        const size_t needed = start + line.length() + 1;
        if (needed > m_SeqBuffLen) {
            m_SeqBuffLen = 2 * needed;
        }
        // The buffer must be *sized*, not merely reserved: writing through
        // operator[] beyond size() is undefined behaviour. resize() keeps
        // the characters copied so far, because every earlier pass through
        // this point left size() >= start.
        if (m_Sequence.size() < m_SeqBuffLen) {
            m_Sequence.resize(m_SeqBuffLen);
        }
        if ( !line.empty() ) {
            memcpy(&m_Sequence[start], line.data(), line.length());
            start += line.length();
        }

        if (line_reader->AtEOF()) {
            break;
        }

        // read next line
        ++(*line_reader);
        line = **line_reader;
    }

    // set up sequence
    if (start > 0) {
        CRef<CSeq_entry> seq_entry(new CSeq_entry);
        CBioseq& bioseq = seq_entry->SetSeq();
        bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
        bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
        bioseq.SetId().clear();
        bioseq.SetId().push_back(seqid);
        bioseq.SetInst().SetLength(static_cast<TSeqPos>(start));
        // terminate the collected characters so that they can be handed
        // over as a C string
        m_Sequence[start] = 0;
        bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(&m_Sequence[0]));
        bioseq.SetDescr();

        m_BasesAdded += static_cast<TSeqPos>(start);
        return seq_entry;
    }

    return CRef<CSeq_entry>();
}
void CSeq_data::DoConstruct(const string& value, E_Choice index) { switch (index) { case e_Iupacna: SetIupacna() = CIUPACna(value); break; case e_Iupacaa: SetIupacaa() = CIUPACaa(value); break; case e_Ncbieaa: SetNcbieaa() = CNCBIeaa(value); break; default: // throw an error NCBI_THROW (CException, eUnknown, "CSeq_data::DoConstruct: Invalid E_Choice index"); } }
CRef<CSeq_entry> CShortReadFastaInputSource::x_ReadFastqOneSeq(CRef<ILineReader> line_reader) { CTempString line; CTempString id; CRef<CSeq_entry> retval; // first read defline ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } if (line[0] != '@') { NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:" " defline expected at line: " + NStr::IntToString(line_reader->GetLineNumber())); } id = x_ParseDefline(line); CRef<CSeq_id> seqid(new CSeq_id); seqid->Set(CSeq_id::e_Local, id); // read sequence ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } // set up sequence if (line.length() > 0) { CRef<CSeq_entry> seq_entry(new CSeq_entry); CBioseq& bioseq = seq_entry->SetSeq(); bioseq.SetInst().SetMol(CSeq_inst::eMol_na); bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw); bioseq.SetId().clear(); bioseq.SetId().push_back(seqid); bioseq.SetInst().SetLength(line.length()); bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(line.data())); bioseq.SetDescr(); m_BasesAdded += line.length(); retval = seq_entry; } // read and skip second defline ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } if (line[0] != '+') { NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:" " defline expected at line: " + NStr::IntToString(line_reader->GetLineNumber())); } // read and skip quality scores ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } return retval; }
void CShortReadFastaInputSource::x_ReadFastc(CBioseq_set& bioseq_set, TSeqPos batch_size) { string id; // tags to indicate paired sequences CRef<CSeqdesc> seqdesc_first(new CSeqdesc); seqdesc_first->SetUser().SetType().SetStr("Mapping"); seqdesc_first->SetUser().AddField("has_pair", eFirstSegment); CRef<CSeqdesc> seqdesc_last(new CSeqdesc); seqdesc_last->SetUser().SetType().SetStr("Mapping"); seqdesc_last->SetUser().AddField("has_pair", eLastSegment); m_BasesAdded = 0; while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) { ++(*m_LineReader); m_Line = **m_LineReader; // ignore empty lines if (m_Line.empty()) { continue; } // if defline if (m_Line[0] == '>') { id = x_ParseDefline(m_Line); } else { // otherwise sequence // make sure that a defline was read first if (id.empty()) { NCBI_THROW(CInputException, eInvalidInput, (string)"Missing defline before line: " + NStr::IntToString(m_LineReader->GetLineNumber())); } // find '><' that separate reads of a pair size_t p = m_Line.find('>'); if (p == CTempString::npos || m_Line[p + 1] != '<') { NCBI_THROW(CInputException, eInvalidInput, (string)"FASTC parse error: Sequence separator '><'" " was not found in line: " + NStr::IntToString(m_LineReader->GetLineNumber())); } // set up reads, there are two sequences in the same line separated char* first = (char*)m_Line.data(); char* second = (char*)m_Line.data() + p + 2; size_t first_len = p; size_t second_len = m_Line.length() - p - 2; {{ CRef<CSeq_id> seqid(new CSeq_id); seqid->Set(CSeq_id::e_Local, id + ".1"); CRef<CSeq_entry> seq_entry(new CSeq_entry); CBioseq& bioseq = seq_entry->SetSeq(); bioseq.SetInst().SetMol(CSeq_inst::eMol_na); bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw); bioseq.SetId().clear(); bioseq.SetId().push_back(seqid); bioseq.SetInst().SetLength(first_len); first[first_len] = 0; bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(first)); bioseq.SetDescr().Set().push_back(seqdesc_first); // add a sequence to the batch 
bioseq_set.SetSeq_set().push_back(seq_entry); }} {{ CRef<CSeq_id> seqid(new CSeq_id); seqid->Set(CSeq_id::e_Local, id + ".2"); CRef<CSeq_entry> seq_entry(new CSeq_entry); CBioseq& bioseq = seq_entry->SetSeq(); bioseq.SetInst().SetMol(CSeq_inst::eMol_na); bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw); bioseq.SetId().clear(); bioseq.SetId().push_back(seqid); bioseq.SetInst().SetLength(second_len); second[second_len] = 0; bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(second)); bioseq.SetDescr().Set().push_back(seqdesc_last); // add a sequence to the batch bioseq_set.SetSeq_set().push_back(seq_entry); }} m_BasesAdded += first_len + second_len; id.clear(); } } }
// // CreateConsensus() // // compute a consensus sequence given a particular alignment // the rules for a consensus are: // - a segment is consensus gap if > 50% of the sequences are gap at this // segment. 50% exactly is counted as sequence // - for a segment counted as sequence, for each position, the most // frequently occurring base is counted as consensus. in the case of // a tie, the consensus is considered muddied, and the consensus is // so marked // CRef<CDense_seg> CAlnVec::CreateConsensus(int& consensus_row, CBioseq& consensus_seq, const CSeq_id& consensus_id) const { consensus_seq.Reset(); if ( !m_DS || m_NumRows < 1) { return CRef<CDense_seg>(); } bool isNucleotide = GetBioseqHandle(0).IsNucleotide(); size_t i; size_t j; // temporary storage for our consensus vector<string> consens(m_NumSegs); CreateConsensus(consens); // // now, create a new CDense_seg // we create a new CBioseq for our data and // copy the contents of the CDense_seg // string data; TSignedSeqPos total_bases = 0; CRef<CDense_seg> new_ds(new CDense_seg()); new_ds->SetDim(m_NumRows + 1); new_ds->SetNumseg(m_NumSegs); new_ds->SetLens() = m_Lens; new_ds->SetStarts().reserve(m_Starts.size() + m_NumSegs); if ( !m_Strands.empty() ) { new_ds->SetStrands().reserve(m_Strands.size() + m_NumSegs); } for (i = 0; i < consens.size(); ++i) { // copy the old entries for (j = 0; j < (size_t)m_NumRows; ++j) { int idx = i * m_NumRows + j; new_ds->SetStarts().push_back(m_Starts[idx]); if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(m_Strands[idx]); } } // add our new entry // this places the consensus as the last sequence // it should preferably be the first, but this would mean adjusting // the bioseq handle and seqvector caches, and all row numbers would // shift if (consens[i].length() != 0) { new_ds->SetStarts().push_back(total_bases); } else { new_ds->SetStarts().push_back(-1); } if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(eNa_strand_unknown); } total_bases += 
consens[i].length(); data += consens[i]; } // copy our IDs for (i = 0; i < m_Ids.size(); ++i) { new_ds->SetIds().push_back(m_Ids[i]); } // now, we construct a new Bioseq {{ // sequence ID CRef<CSeq_id> id(new CSeq_id()); id->Assign(consensus_id); consensus_seq.SetId().push_back(id); new_ds->SetIds().push_back(id); // add a description for this sequence CSeq_descr& desc = consensus_seq.SetDescr(); CRef<CSeqdesc> d(new CSeqdesc); desc.Set().push_back(d); d->SetComment("This is a generated consensus sequence"); // the main one: Seq-inst CSeq_inst& inst = consensus_seq.SetInst(); inst.SetRepr(CSeq_inst::eRepr_raw); inst.SetMol(isNucleotide ? CSeq_inst::eMol_na : CSeq_inst::eMol_aa); inst.SetLength(data.length()); CSeq_data& seq_data = inst.SetSeq_data(); if (isNucleotide) { CIUPACna& na = seq_data.SetIupacna(); na = CIUPACna(data); } else { CIUPACaa& aa = seq_data.SetIupacaa(); aa = CIUPACaa(data); } }} consensus_row = new_ds->GetIds().size() - 1; return new_ds; }