/// Read a batch of paired reads, taking the first mate of every pair from
/// m_LineReader and the second mate from m_SecondLineReader.
///
/// @param bioseq_set  Container the sequences read are appended to.
/// @param batch_size  Reading stops once m_BasesAdded reaches this value.
/// @param format      eFasta selects the FASTA reader; any other accepted
///                    value selects the FASTQ reader. eFastc is rejected.
/// @return Always true.
/// @throws CInputException (eInvalidInput) when format is eFastc.
bool CShortReadFastaInputSource::x_ReadFromTwoFiles(
    CBioseq_set& bioseq_set,
    TSeqPos batch_size,
    CShortReadFastaInputSource::EInputFormat format)
{
    if (format == eFastc) {
        NCBI_THROW(CInputException, eInvalidInput, "FASTC format cannot be "
                   "used with two files");
    }

    // Descriptors attached to the mates of a complete pair.
    CRef<CSeqdesc> first_mate_tag(new CSeqdesc);
    first_mate_tag->SetUser().SetType().SetStr("Mapping");
    first_mate_tag->SetUser().AddField("has_pair", eFirstSegment);

    CRef<CSeqdesc> last_mate_tag(new CSeqdesc);
    last_mate_tag->SetUser().SetType().SetStr("Mapping");
    last_mate_tag->SetUser().AddField("has_pair", eLastSegment);

    const bool is_fasta = (format == eFasta);

    m_BasesAdded = 0;
    // The loop ends as soon as either input is exhausted or the batch is
    // full. NOTE(review): m_BasesAdded is presumably advanced by the
    // x_Read*OneSeq helpers; that is not visible in this function.
    while (m_BasesAdded < batch_size
           && !m_LineReader->AtEOF()
           && !m_SecondLineReader->AtEOF()) {

        // Always read from the first file before the second one.
        CRef<CSeq_entry> mate1;
        if (is_fasta) {
            mate1 = x_ReadFastaOneSeq(m_LineReader);
        }
        else {
            mate1 = x_ReadFastqOneSeq(m_LineReader);
        }

        CRef<CSeq_entry> mate2;
        if (is_fasta) {
            mate2 = x_ReadFastaOneSeq(m_SecondLineReader);
        }
        else {
            mate2 = x_ReadFastqOneSeq(m_SecondLineReader);
        }

        const bool have_mate1 = mate1.NotEmpty();
        const bool have_mate2 = mate2.NotEmpty();
        // Only tag the reads when both mates were actually produced.
        const bool complete_pair = have_mate1 && have_mate2;

        if (have_mate1) {
            if (complete_pair) {
                mate1->SetSeq().SetDescr().Set().push_back(first_mate_tag);
            }
            bioseq_set.SetSeq_set().push_back(mate1);
        }

        if (have_mate2) {
            if (complete_pair) {
                mate2->SetSeq().SetDescr().Set().push_back(last_mate_tag);
            }
            bioseq_set.SetSeq_set().push_back(mate2);
        }
    }

    return true;
}
// Recursively walk a Bioseq-set and append a newly allocated Sequence to
// `seqlist` for every Bioseq whose molecule type is aa, dna, rna or na.
// Bioseqs of any other molecule type are skipped; nested Bioseq-sets are
// descended into with the same `parent` and `seqlist`.
static void UnpackSeqSet(CBioseq_set& bss, SequenceSet *parent,
                         SequenceSet::SequenceList& seqlist)
{
    for (CBioseq_set::TSeq_set::iterator entry = bss.SetSeq_set().begin(),
                                         stop = bss.SetSeq_set().end();
         entry != stop; ++entry) {

        // Anything that is not a Bioseq is a nested Bioseq-set: recurse.
        if (!entry->GetObject().IsSeq()) {
            UnpackSeqSet(entry->GetObject().SetSet(), parent, seqlist);
            continue;
        }

        // only store amino acid or nucleotide sequences
        switch (entry->GetObject().GetSeq().GetInst().GetMol()) {
        case CSeq_inst::eMol_aa:
        case CSeq_inst::eMol_dna:
        case CSeq_inst::eMol_rna:
        case CSeq_inst::eMol_na:
            break;
        default:
            continue;   // moves on to the next entry of the enclosing loop
        }

        const Sequence *created =
            new Sequence(parent, entry->GetObject().SetSeq());
        if (!created || !created->identifier) {
            FATALMSG("Can't create Sequence object, aborting...");
        }
        seqlist.push_back(created);
    }
}
/// Read a batch of paired reads in FASTC format from m_LineReader.
///
/// Empty lines are ignored. A line starting with '>' is a defline and
/// supplies the id for the next sequence line. Every other non-empty line
/// must hold both reads of a pair, separated by "><". The two reads are
/// added to the batch as raw nucleic-acid Bioseqs with local ids
/// "<id>.1" and "<id>.2", tagged with a "Mapping" user object whose
/// "has_pair" field is eFirstSegment and eLastSegment respectively.
///
/// m_BasesAdded is reset to zero and then grows by the combined length of
/// each pair read; reading stops once it reaches batch_size or the reader
/// is at EOF.
///
/// @param bioseq_set  Container the sequences read are appended to.
/// @param batch_size  Number of bases after which reading stops.
/// @throws CInputException (eInvalidInput) when a sequence line is not
///         preceded by a defline, or when the "><" separator is missing.
void CShortReadFastaInputSource::x_ReadFastc(CBioseq_set& bioseq_set,
                                             TSeqPos batch_size)
{
    string id;

    // tags to indicate paired sequences
    CRef<CSeqdesc> seqdesc_first(new CSeqdesc);
    seqdesc_first->SetUser().SetType().SetStr("Mapping");
    seqdesc_first->SetUser().AddField("has_pair", eFirstSegment);

    CRef<CSeqdesc> seqdesc_last(new CSeqdesc);
    seqdesc_last->SetUser().SetType().SetStr("Mapping");
    seqdesc_last->SetUser().AddField("has_pair", eLastSegment);

    m_BasesAdded = 0;
    while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) {
        ++(*m_LineReader);
        m_Line = **m_LineReader;

        // ignore empty lines
        if (m_Line.empty()) {
            continue;
        }

        // if defline
        if (m_Line[0] == '>') {
            id = x_ParseDefline(m_Line);
        }
        else {
            // otherwise sequence

            // make sure that a defline was read first
            if (id.empty()) {
                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"Missing defline before line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // find '><' that separate reads of a pair; the explicit length
            // check keeps the separator test from relying on what indexing
            // one past the last character returns
            size_t p = m_Line.find('>');
            if (p == CTempString::npos || p + 1 >= m_Line.length() ||
                m_Line[p + 1] != '<') {

                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"FASTC parse error: Sequence separator '><'"
                           " was not found in line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // set up reads, there are two sequences in the same line
            // separated by '><'. The reads are copied out by (pointer,
            // length) so that the line buffer is never written to: the
            // previous approach NUL-terminated the reads in place, which
            // modified data this function does not own and wrote one
            // element past the end of the line for the second read.
            const char* line_data = m_Line.data();
            const size_t first_len = p;
            const size_t second_len = m_Line.length() - p - 2;

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".1");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(first_len);
                bioseq.SetInst().SetSeq_data().SetIupacna(
                    CIUPACna(string(line_data, first_len)));
                bioseq.SetDescr().Set().push_back(seqdesc_first);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".2");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(second_len);
                bioseq.SetInst().SetSeq_data().SetIupacna(
                    CIUPACna(string(line_data + p + 2, second_len)));
                bioseq.SetDescr().Set().push_back(seqdesc_last);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}

            m_BasesAdded += first_len + second_len;
            // the defline is consumed; the next pair needs its own
            id.clear();
        }
    }
}
/// Read a batch of sequences from m_LineReader in the format given by
/// m_Format (FASTA or FASTQ).
///
/// When m_IsPaired is set, sequences are read two at a time and, if both
/// reads of a pair were produced, tagged with a "Mapping" user object whose
/// "has_pair" field is eFirstSegment / eLastSegment. Otherwise every
/// sequence read is appended on its own.
///
/// @param bioseq_set  Container the sequences read are appended to.
/// @param batch_size  Reading stops once m_BasesAdded reaches this value.
/// @throws CInputException (eInvalidInput) when m_Format is neither eFasta
///         nor eFastq and at least one read is attempted.
void CShortReadFastaInputSource::x_ReadFastaOrFastq(CBioseq_set& bioseq_set,
                                                    TSeqPos batch_size)
{
    // Descriptors attached to the mates of a complete pair.
    CRef<CSeqdesc> first_mate_tag(new CSeqdesc);
    first_mate_tag->SetUser().SetType().SetStr("Mapping");
    first_mate_tag->SetUser().AddField("has_pair", eFirstSegment);

    CRef<CSeqdesc> last_mate_tag(new CSeqdesc);
    last_mate_tag->SetUser().SetType().SetStr("Mapping");
    last_mate_tag->SetUser().AddField("has_pair", eLastSegment);

    m_BasesAdded = 0;
    // NOTE(review): m_BasesAdded is presumably advanced by the
    // x_Read*OneSeq helpers; that is not visible in this function.
    while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) {

        CRef<CSeq_entry> mate1;
        if (m_Format == eFasta) {
            mate1 = x_ReadFastaOneSeq(m_LineReader);
        }
        else if (m_Format == eFastq) {
            mate1 = x_ReadFastqOneSeq(m_LineReader);
        }
        else {
            NCBI_THROW(CInputException, eInvalidInput, "Invalid input file "
                       "format x_ReadFastaOrFastq read either FASTA or FASTQ");
        }

        // Unpaired input: just add the sequence that was read, if any.
        if (!m_IsPaired) {
            if (mate1.NotEmpty()) {
                bioseq_set.SetSeq_set().push_back(mate1);
            }
            continue;
        }

        // Paired input: the next record in the same file is the mate.
        CRef<CSeq_entry> mate2;
        if (m_Format == eFasta) {
            mate2 = x_ReadFastaOneSeq(m_LineReader);
        }
        else if (m_Format == eFastq) {
            mate2 = x_ReadFastqOneSeq(m_LineReader);
        }
        else {
            NCBI_THROW(CInputException, eInvalidInput, "Invalid input file "
                       "format x_ReadFastaOrFastq read either FASTA or "
                       "FASTQ");
        }

        const bool have_mate1 = mate1.NotEmpty();
        const bool have_mate2 = mate2.NotEmpty();
        // Only tag the reads when both mates were actually produced.
        const bool complete_pair = have_mate1 && have_mate2;

        if (have_mate1) {
            if (complete_pair) {
                mate1->SetSeq().SetDescr().Set().push_back(first_mate_tag);
            }
            bioseq_set.SetSeq_set().push_back(mate1);
        }

        if (have_mate2) {
            if (complete_pair) {
                mate2->SetSeq().SetDescr().Set().push_back(last_mate_tag);
            }
            bioseq_set.SetSeq_set().push_back(mate2);
        }
    }
}