/// Read one FASTA record whose defline is the reader's current line.
///
/// Sequence lines following the defline are concatenated into the reusable
/// m_Sequence buffer until a line starting with '>' is seen or the reader
/// reports end of input. Empty lines are skipped. m_BasesAdded is increased
/// by the number of bases read.
///
/// @param line_reader  Line reader whose current line is the record's defline.
/// @return A raw nucleotide Seq-entry with a local Seq-id taken from the
///         defline, or an empty CRef if no sequence data followed the defline.
CRef<CSeq_entry>
CShortReadFastaInputSource::x_ReadFastaOneSeq(CRef<ILineReader> line_reader)
{
    // number of bases copied into m_Sequence so far
    int start = 0;

    // parse the last read defline
    CTempString line = **line_reader;
    CTempString id = x_ParseDefline(line);
    CRef<CSeq_id> seqid(new CSeq_id);
    seqid->Set(CSeq_id::e_Local, id);

    ++(*line_reader);
    line = **line_reader;
    while (line[0] != '>') {

        // ignore empty lines
        if (line.empty() && !line_reader->AtEOF()) {
            ++(*line_reader);
            line = **line_reader;
            continue;
        }

        // Make sure the buffer can hold this line plus the terminating NUL.
        const size_t needed = start + line.length() + 1;
        if (needed > m_SeqBuffLen) {
            m_SeqBuffLen = 2 * needed;
        }
        // Grow by resize(), not reserve(): the bytes written below (and the
        // terminator written after the loop) must lie within size() for the
        // accesses to be valid. resize() keeps the bases already stored.
        if (m_Sequence.size() < m_SeqBuffLen) {
            m_Sequence.resize(m_SeqBuffLen);
        }

        // copy the sequence
        memcpy(&m_Sequence[start], line.data(), line.length());
        start += line.length();

        if (line_reader->AtEOF()) {
            break;
        }

        // read next line
        ++(*line_reader);
        line = **line_reader;
    }

    // set up sequence
    if (start > 0) {
        CRef<CSeq_entry> seq_entry(new CSeq_entry);
        CBioseq& bioseq = seq_entry->SetSeq();
        bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
        bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
        bioseq.SetId().clear();
        bioseq.SetId().push_back(seqid);
        bioseq.SetInst().SetLength(start);
        // terminate so the buffer can be read as a C string of `start` bases
        m_Sequence[start] = 0;
        bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(&m_Sequence[0]));
        bioseq.SetDescr();

        m_BasesAdded += start;
        return seq_entry;
    }

    return CRef<CSeq_entry>();
}
/// Return the alignment as a pop-set Seq-entry, building it on first use.
///
/// The entry holds one Seq-annot with the alignment from GetSeqAlign() and
/// one raw Bioseq per alignment row. The result is cached in m_Entry and
/// returned directly on later calls.
///
/// @return The cached or newly built Seq-entry.
/// @throws CObjReaderParseException (eFormat) if called before Read() has
///         completed and no entry is cached.
CRef<CSeq_entry> CAlnReader::GetSeqEntry()
{
    if (m_Entry) {
        return m_Entry;
    } else if ( !m_ReadDone ) {
        NCBI_THROW2(CObjReaderParseException, eFormat,
                   "CAlnReader::GetSeqEntry(): "
                   "Seq_entry is not available until after Read()", 0);
    }

    // Build into a local and publish to m_Entry only when complete, so that
    // an exception part-way through cannot leave a half-built entry cached
    // (which later calls would return as if it were valid).
    CRef<CSeq_entry> entry(new CSeq_entry());

    CRef<CSeq_annot> seq_annot (new CSeq_annot);
    seq_annot->SetData().SetAlign().push_back(GetSeqAlign());

    entry->SetSet().SetClass(CBioseq_set::eClass_pop_set);
    entry->SetSet().SetAnnot().push_back(seq_annot);

    CBioseq_set::TSeq_set& seq_set = entry->SetSet().SetSeq_set();

    typedef CDense_seg::TDim TNumrow;
    for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
        const string& seq_str     = m_SeqVec[row_i];
        const size_t  seq_str_len = seq_str.size();

        CRef<CSeq_entry> seq_entry (new CSeq_entry);

        // seq-id(s): fall back to a local id when nothing could be parsed
        CBioseq::TId& ids = seq_entry->SetSeq().SetId();
        CSeq_id::ParseFastaIds(ids, m_Ids[row_i], true);
        if (ids.empty()) {
            ids.push_back(CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local,
                                                    m_Ids[row_i])));
        }

        // mol: prefer what the accession says, otherwise guess from the data
        CSeq_inst::EMol mol = CSeq_inst::eMol_not_set;
        CSeq_id::EAccessionInfo ai = ids.front()->IdentifyAccession();
        if (ai & CSeq_id::fAcc_nuc) {
            mol = CSeq_inst::eMol_na;
        } else if (ai & CSeq_id::fAcc_prot) {
            mol = CSeq_inst::eMol_aa;
        } else {
            switch (CFormatGuess::SequenceType(seq_str.data(), seq_str_len)) {
            case CFormatGuess::eNucleotide:
                mol = CSeq_inst::eMol_na;
                break;
            case CFormatGuess::eProtein:
                mol = CSeq_inst::eMol_aa;
                break;
            default:
                break;
            }
        }

        // seq-inst
        CRef<CSeq_inst> seq_inst (new CSeq_inst);
        seq_entry->SetSeq().SetInst(*seq_inst);
        seq_set.push_back(seq_entry);

        // repr
        seq_inst->SetRepr(CSeq_inst::eRepr_raw);

        // mol
        seq_inst->SetMol(mol);

        // len
        _ASSERT(seq_str_len == m_SeqLen[row_i]);
        seq_inst->SetLength(seq_str_len);

        // data: proteins stay as IUPACaa, everything else is stored as
        // IUPACna and then packed
        CSeq_data& data = seq_inst->SetSeq_data();
        if (mol == CSeq_inst::eMol_aa) {
            data.SetIupacaa().Set(seq_str);
        } else {
            data.SetIupacna().Set(seq_str);
            CSeqportUtil::Pack(&data);
        }
    }

    m_Entry = entry;
    return m_Entry;
}
CRef<CSeq_entry> CShortReadFastaInputSource::x_ReadFastqOneSeq(CRef<ILineReader> line_reader) { CTempString line; CTempString id; CRef<CSeq_entry> retval; // first read defline ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } if (line[0] != '@') { NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:" " defline expected at line: " + NStr::IntToString(line_reader->GetLineNumber())); } id = x_ParseDefline(line); CRef<CSeq_id> seqid(new CSeq_id); seqid->Set(CSeq_id::e_Local, id); // read sequence ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } // set up sequence if (line.length() > 0) { CRef<CSeq_entry> seq_entry(new CSeq_entry); CBioseq& bioseq = seq_entry->SetSeq(); bioseq.SetInst().SetMol(CSeq_inst::eMol_na); bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw); bioseq.SetId().clear(); bioseq.SetId().push_back(seqid); bioseq.SetInst().SetLength(line.length()); bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(line.data())); bioseq.SetDescr(); m_BasesAdded += line.length(); retval = seq_entry; } // read and skip second defline ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } if (line[0] != '+') { NCBI_THROW(CInputException, eInvalidInput, (string)"FASTQ parse error:" " defline expected at line: " + NStr::IntToString(line_reader->GetLineNumber())); } // read and skip quality scores ++(*line_reader); line = **line_reader; // skip empty lines while (!line_reader->AtEOF() && line.empty()) { ++(*line_reader); line = **line_reader; } return retval; }
void CShortReadFastaInputSource::x_ReadFastc(CBioseq_set& bioseq_set, TSeqPos batch_size) { string id; // tags to indicate paired sequences CRef<CSeqdesc> seqdesc_first(new CSeqdesc); seqdesc_first->SetUser().SetType().SetStr("Mapping"); seqdesc_first->SetUser().AddField("has_pair", eFirstSegment); CRef<CSeqdesc> seqdesc_last(new CSeqdesc); seqdesc_last->SetUser().SetType().SetStr("Mapping"); seqdesc_last->SetUser().AddField("has_pair", eLastSegment); m_BasesAdded = 0; while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) { ++(*m_LineReader); m_Line = **m_LineReader; // ignore empty lines if (m_Line.empty()) { continue; } // if defline if (m_Line[0] == '>') { id = x_ParseDefline(m_Line); } else { // otherwise sequence // make sure that a defline was read first if (id.empty()) { NCBI_THROW(CInputException, eInvalidInput, (string)"Missing defline before line: " + NStr::IntToString(m_LineReader->GetLineNumber())); } // find '><' that separate reads of a pair size_t p = m_Line.find('>'); if (p == CTempString::npos || m_Line[p + 1] != '<') { NCBI_THROW(CInputException, eInvalidInput, (string)"FASTC parse error: Sequence separator '><'" " was not found in line: " + NStr::IntToString(m_LineReader->GetLineNumber())); } // set up reads, there are two sequences in the same line separated char* first = (char*)m_Line.data(); char* second = (char*)m_Line.data() + p + 2; size_t first_len = p; size_t second_len = m_Line.length() - p - 2; {{ CRef<CSeq_id> seqid(new CSeq_id); seqid->Set(CSeq_id::e_Local, id + ".1"); CRef<CSeq_entry> seq_entry(new CSeq_entry); CBioseq& bioseq = seq_entry->SetSeq(); bioseq.SetInst().SetMol(CSeq_inst::eMol_na); bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw); bioseq.SetId().clear(); bioseq.SetId().push_back(seqid); bioseq.SetInst().SetLength(first_len); first[first_len] = 0; bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(first)); bioseq.SetDescr().Set().push_back(seqdesc_first); // add a sequence to the batch 
bioseq_set.SetSeq_set().push_back(seq_entry); }} {{ CRef<CSeq_id> seqid(new CSeq_id); seqid->Set(CSeq_id::e_Local, id + ".2"); CRef<CSeq_entry> seq_entry(new CSeq_entry); CBioseq& bioseq = seq_entry->SetSeq(); bioseq.SetInst().SetMol(CSeq_inst::eMol_na); bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw); bioseq.SetId().clear(); bioseq.SetId().push_back(seqid); bioseq.SetInst().SetLength(second_len); second[second_len] = 0; bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(second)); bioseq.SetDescr().Set().push_back(seqdesc_last); // add a sequence to the batch bioseq_set.SetSeq_set().push_back(seq_entry); }} m_BasesAdded += first_len + second_len; id.clear(); } } }
/// Read the next sequence from the input reader and describe it as a
/// Seq-loc interval.
///
/// The sequence is added to `scope` as a top-level Seq-entry. The returned
/// interval takes its strand and range from m_Config and its id from the
/// best-ranked Seq-id of the sequence that was read.
///
/// @param lcase_mask  Set to the reader's saved mask when lowercase masking
///                    is configured; if it is set to a choice, its strand is
///                    forced to plus.
/// @param scope       Scope that receives the sequence (and, when the
///                    sequence has no raw data, the query data loaders).
/// @return The interval Seq-loc for the sequence.
/// @throws CInputException  eSequenceMismatch when the molecule type does not
///         match m_ReadProteins; eInvalidStrand when a nucleotide strand is
///         configured for protein input; eInvalidRange when the configured
///         range is inconsistent with the sequence.
CRef<CSeq_loc>
CBlastFastaInputSource::x_FastaToSeqLoc(CRef<objects::CSeq_loc>& lcase_mask,
                                        CScope& scope)
{
    static const TSeqRange kEmptyRange(TSeqRange::GetEmpty());

    if (m_Config.GetLowercaseMask()) {
        lcase_mask = m_InputReader->SaveMask();
    }

    CRef<CSeq_entry> seq_entry(m_InputReader->ReadOneSeq());

    if (lcase_mask) {
        if (lcase_mask->Which() != CSeq_loc::e_not_set) {
            lcase_mask->SetStrand(eNa_strand_plus);
        }
        _ASSERT(lcase_mask->GetStrand() == eNa_strand_plus ||
                lcase_mask->GetStrand() == eNa_strand_unknown);
    }
    _ASSERT(seq_entry.NotEmpty());
    scope.AddTopLevelSeqEntry(*seq_entry);

    CTypeConstIterator<CBioseq> itr(ConstBegin(*seq_entry));

    CRef<CSeq_loc> retval(new CSeq_loc());

    // Without raw sequence data the scope needs the query data loaders.
    if ( !blast::HasRawSequenceData(*itr) ) {
        CBlastInputReader* blast_reader =
            dynamic_cast<CBlastInputReader*>(m_InputReader.get());
        _ASSERT(blast_reader);
        CRef<CBlastScopeSource> scope_source =
            blast_reader->GetQueryScopeSource();
        scope_source->AddDataLoaders(CRef<CScope>(&scope));
    }

    // The molecule type read must match what the caller asked for.
    if (m_ReadProteins) {
        if (itr->IsNa()) {
            NCBI_THROW(CInputException, eSequenceMismatch,
                       "Nucleotide FASTA provided for protein sequence");
        }
    } else if (itr->IsAa()) {
        NCBI_THROW(CInputException, eSequenceMismatch,
                   "Protein FASTA provided for nucleotide sequence");
    }

    // set strand
    const ENa_strand cfg_strand = m_Config.GetStrand();
    const bool strand_unspecified =
        (cfg_strand == eNa_strand_other || cfg_strand == eNa_strand_unknown);
    if (strand_unspecified) {
        retval->SetInt().SetStrand(m_ReadProteins ? eNa_strand_unknown
                                                  : eNa_strand_both);
    } else {
        if (m_ReadProteins) {
            NCBI_THROW(CInputException, eInvalidStrand,
                       "Cannot assign nucleotide strand to protein sequence");
        }
        retval->SetInt().SetStrand(cfg_strand);
    }

    // sanity checks for the range: an endpoint equal to the empty range's
    // endpoint is treated as 0
    TSeqPos from = m_Config.GetRange().GetFrom();
    if (from == kEmptyRange.GetFrom()) {
        from = 0;
    }
    TSeqPos to = m_Config.GetRange().GetTo();
    if (to == kEmptyRange.GetTo()) {
        to = 0;
    }

    // Get the sequence length
    const TSeqPos seqlen = seq_entry->GetSeq().GetInst().GetLength();
    //if (seqlen == 0) {
    //    NCBI_THROW(CInputException, eEmptyUserInput,
    //               "Query contains no sequence data");
    //}
    _ASSERT(seqlen != numeric_limits<TSeqPos>::max());

    if (to > 0 && to < from) {
        NCBI_THROW(CInputException, eInvalidRange,
                   "Invalid sequence range");
    }
    if (from > seqlen) {
        NCBI_THROW(CInputException, eInvalidRange,
                   "Invalid from coordinate (greater than sequence length)");
    }

    // N.B.: if the to coordinate is greater than or equal to the sequence
    // length, we fix that silently
    TSeqPos last_pos = seqlen - 1;
    if (to > 0 && to < seqlen) {
        last_pos = to;
    }

    // set sequence range
    retval->SetInt().SetFrom(from);
    retval->SetInt().SetTo(last_pos);

    // set ID
    retval->SetInt().SetId().Assign(*FindBestChoice(itr->GetId(),
                                                    CSeq_id::BestRank));
    return retval;
}
/// Create protein product Bioseqs for coding regions that lack one.
///
/// For every Cdregion feature in the annotation's feature table that has no
/// product set, the feature is translated, a raw protein Bioseq with a local
/// id of the form "tpNNNNN" (counter from sm_Counter, zero-padded to at least
/// five digits) is created, and the feature's product is set to the whole of
/// that new Bioseq.
///
/// @param annot   Annotation whose Cdregion features are processed; features
///                are modified in place.
/// @param handle  Bioseq handle whose scope is used for translation.
/// @return A Bioseq-set with the newly created products; empty when the
///         annotation is not a feature table.
CRef<CBioseq_set> CMakeCdrProds::MakeCdrProds(CRef<CSeq_annot> annot,
                                              CBioseq_Handle handle)
{
    CRef<CBioseq_set> products(new CBioseq_set);

    // Is this the right thing to do?
    // Could throw, or could return null CRef instead.
    if (!annot->GetData().IsFtable()) {
        return products;
    }

    typedef list<CRef<CSeq_feat> > TFeatList;
    TFeatList& features = annot->SetData().SetFtable();

    for (TFeatList::iterator it = features.begin();
         it != features.end();  ++it) {
        CSeq_feat& cds = **it;

        // only coding regions that do not yet have a product are of interest
        if (!cds.GetData().IsCdregion()  ||  cds.IsSetProduct()) {
            continue;
        }

        // translate and wrap the protein in a raw amino-acid Seq-inst
        string protein;
        CSeqTranslator::Translate(cds, handle.GetScope(), protein);

        CRef<CSeq_data> prot_data(new CSeq_data(protein,
                                                CSeq_data::e_Iupacaa));
        CRef<CSeq_inst> prot_inst(new CSeq_inst);
        prot_inst->SetSeq_data(*prot_data);
        prot_inst->SetRepr(CSeq_inst_Base::eRepr_raw);
        prot_inst->SetMol(CSeq_inst_Base::eMol_aa);
        prot_inst->SetLength(protein.size());

        // accession: "tp" + counter padded to five digits
        string serial = NStr::NumericToString(sm_Counter.Add(1));
        if (serial.size() < 5) {
            serial = string(5 - serial.size(), '0') + serial;
        }
        const string accession = "tp" + serial;
        const string fasta_id = "lcl|" + accession;
        CRef<CSeq_id> prot_id(new CSeq_id(fasta_id));

        CRef<CBioseq> prot_seq(new CBioseq);
        prot_seq->SetId().push_back(prot_id);

        // a title
        CRef<CSeqdesc> title_desc(new CSeqdesc);
        title_desc->SetTitle(string("Translation product ") + accession);
        prot_seq->SetDescr().Set().push_back(title_desc);

        // Mol_type
        CRef<CSeqdesc> moltype_desc(new CSeqdesc);
        moltype_desc->SetMol_type(eGIBB_mol_peptide);
        prot_seq->SetDescr().Set().push_back(moltype_desc);

        // set the instance
        prot_seq->SetInst(*prot_inst);

        // wrap this Bioseq in an entry and add it to our Bioseq_set
        CRef<CSeq_entry> prot_entry(new CSeq_entry);
        prot_entry->SetSeq(*prot_seq);
        products->SetSeq_set().push_back(prot_entry);

        // record it as product in the annot we're handed
        CRef<CSeq_loc> product_loc(new CSeq_loc);
        product_loc->SetWhole(*prot_id);
        cds.SetProduct(*product_loc);
    }

    return products;
}
/// Build a Seq-entry for the biological reads of one spot.
///
/// Each biological read with non-zero length (after optional trimming when
/// m_Trim is set) becomes a raw nucleotide Bioseq with a general "SRA" id of
/// the form "<accession>.<spot_id>.<read number>", a title descriptor holding
/// the spot name, and a "Phred Quality" byte graph.
///
/// @param spot_id  Spot to read.
/// @return A null CRef when no read qualifies, the single Bioseq entry when
///         exactly one qualifies, otherwise a Bioseq-set entry holding all of
///         them.
CRef<CSeq_entry> CSraRun::GetSpotEntry(spotid_t spot_id) const
{
    CRef<CSeq_entry> entry;

    CSraStringValue name(m_Name, spot_id);

    entry = new CSeq_entry();
    CBioseq_set& seqset = entry->SetSet();
    seqset.SetLevel(0);
    seqset.SetClass(CBioseq_set::eClass_other);

    CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);

    // trimming window; only applied to reads when m_Trim is set
    TSeqPos trim_start = 0;
    if ( m_Trim && m_TrimStart ) {
        trim_start = CSraValueFor<INSDC_coord_zero>(m_TrimStart,
                                                    spot_id).Value();
    }
    TSeqPos trim_end = sdesc->clip_qual_right;

    CSraValueFor<SRAReadDesc> rdesc(m_RDesc, spot_id);
    CSraStringValue read(m_Read, spot_id);
    CSraBytesValue qual(m_Qual, spot_id);

    int seq_count = 0;
    string id_start = GetAccession()+'.'+NStr::UIntToString(spot_id)+'.';

    for ( int r = 0; r < sdesc->num_reads; ++r ) {
        // skip non-biological and empty reads
        if ( rdesc[r].type != SRA_READ_TYPE_BIOLOGICAL ) {
            continue;
        }
        TSeqPos len = rdesc[r].seg.len;
        if ( len == 0 ) {
            continue;
        }

        TSeqPos start = rdesc[r].seg.start;
        TSeqPos end = start + len;
        if ( m_Trim ) {
            // clip the read to the trimming window; drop it if nothing is left
            start = max(start, trim_start);
            end = min(end, trim_end);
            if ( start >= end ) {
                continue;
            }
            len = end - start;
        }

        CRef<CSeq_entry> seq_entry(new CSeq_entry);
        CBioseq& seq = seq_entry->SetSeq();

        // id
        CRef<CSeq_id> id(new CSeq_id);
        id->SetGeneral().SetDb("SRA");
        id->SetGeneral().SetTag().SetStr(id_start+NStr::UIntToString(r+1));
        seq.SetId().push_back(id);

        // title descriptor
        CRef<CSeqdesc> title_desc(new CSeqdesc);
        title_desc->SetTitle(name.Value());
        seq.SetDescr().Set().push_back(title_desc);

        // sequence data
        CSeq_inst& inst = seq.SetInst();
        inst.SetRepr(CSeq_inst::eRepr_raw);
        inst.SetMol(CSeq_inst::eMol_na);
        inst.SetLength(len);
        inst.SetSeq_data().SetIupacna().Set().assign(read.data()+start, len);

        // quality graph
        {{
            CRef<CSeq_annot> annot(new CSeq_annot);
            CRef<CSeq_graph> graph(new CSeq_graph);
            annot->SetData().SetGraph().push_back(graph);
            graph->SetTitle("Phred Quality");
            graph->SetLoc().SetWhole(*id);
            graph->SetNumval(len);

            CByte_graph& bytes = graph->SetGraph().SetByte();
            bytes.SetAxis(0);
            CByte_graph::TValues& values = bytes.SetValues();
            values.reserve(len);

            // copy the quality values while tracking their range
            int lowest = kMax_Int;
            int highest = kMin_Int;
            for ( size_t i = 0; i < len; ++i ) {
                int v = qual[start+i];
                values.push_back(v);
                if ( v < lowest ) {
                    lowest = v;
                }
                if ( v > highest ) {
                    highest = v;
                }
            }
            bytes.SetMin(lowest);
            bytes.SetMax(highest);

            seq.SetAnnot().push_back(annot);
        }}

        seqset.SetSeq_set().push_back(seq_entry);
        ++seq_count;
    }

    // no reads: return null; one read: return it without the set wrapper
    if ( seq_count == 0 ) {
        entry.Reset();
    }
    else if ( seq_count == 1 ) {
        entry = seqset.GetSeq_set().front();
    }
    return entry;
}