/// Read a batch of reads from two input files, taking one sequence from
/// each file per iteration, and append them to a Bioseq-set.
///
/// When both files yield a sequence in the same iteration, the two entries
/// are tagged as the first and last segment of a pair through a "Mapping"
/// user-object descriptor carrying a "has_pair" field.
///
/// @param bioseq_set Container that receives the sequences [in|out]
/// @param batch_size Reading stops once m_BasesAdded reaches this value [in]
/// @param format Input format; eFasta selects the FASTA reader, any other
///               accepted value selects the FASTQ reader [in]
/// @return Always true
/// @throws CInputException (eInvalidInput) if format is eFastc
bool
CShortReadFastaInputSource::x_ReadFromTwoFiles(CBioseq_set& bioseq_set,
                            TSeqPos batch_size,
                            CShortReadFastaInputSource::EInputFormat format)
{
    // FASTC is rejected up front for the two-file mode
    if (format == eFastc) {
        NCBI_THROW(CInputException, eInvalidInput, "FASTC format cannot be "
                   "used with two files");
    }

    // descriptor shared by every sequence that is the first of a pair
    CRef<CSeqdesc> first_tag(new CSeqdesc);
    first_tag->SetUser().SetType().SetStr("Mapping");
    first_tag->SetUser().AddField("has_pair", eFirstSegment);

    // descriptor shared by every sequence that is the last of a pair
    CRef<CSeqdesc> last_tag(new CSeqdesc);
    last_tag->SetUser().SetType().SetStr("Mapping");
    last_tag->SetUser().AddField("has_pair", eLastSegment);

    // NOTE(review): m_BasesAdded is only reset in this function; it is
    // presumably advanced by the one-sequence readers -- confirm.
    m_BasesAdded = 0;
    while (m_BasesAdded < batch_size &&
           !(m_LineReader->AtEOF() || m_SecondLineReader->AtEOF())) {

        const bool is_fasta = (format == eFasta);

        // always consume from the first file before the second one
        CRef<CSeq_entry> mate1 = is_fasta
            ? x_ReadFastaOneSeq(m_LineReader)
            : x_ReadFastqOneSeq(m_LineReader);
        CRef<CSeq_entry> mate2 = is_fasta
            ? x_ReadFastaOneSeq(m_SecondLineReader)
            : x_ReadFastqOneSeq(m_SecondLineReader);

        const bool have_mate1 = mate1.NotEmpty();
        const bool have_mate2 = mate2.NotEmpty();

        // mark the sequences as a pair only if both were actually read
        if (have_mate1 && have_mate2) {
            mate1->SetSeq().SetDescr().Set().push_back(first_tag);
            mate2->SetSeq().SetDescr().Set().push_back(last_tag);
        }

        // store whatever was read, first mate before second mate
        if (have_mate1) {
            bioseq_set.SetSeq_set().push_back(mate1);
        }
        if (have_mate2) {
            bioseq_set.SetSeq_set().push_back(mate2);
        }
    }

    return true;
}
Beispiel #2
0
// Walk a Bioseq-set recursively and append a new Sequence object to
// 'seqlist' for every Bioseq whose molecule type is aa, dna, rna or na.
// Entries that are themselves Bioseq-sets are descended into with the same
// 'parent' and 'seqlist'; Bioseqs of any other molecule type are skipped.
static void UnpackSeqSet(CBioseq_set& bss, SequenceSet *parent, SequenceSet::SequenceList& seqlist)
{
    CBioseq_set::TSeq_set& entries = bss.SetSeq_set();

    for (CBioseq_set::TSeq_set::iterator it = entries.begin(), last = entries.end();
         it != last; ++it) {

        // nested Bioseq-set: recurse and move on to the next entry
        if (!it->GetObject().IsSeq()) {
            UnpackSeqSet(it->GetObject().SetSet(), parent, seqlist);
            continue;
        }

        // only store amino acid or nucleotide sequences
        switch (it->GetObject().GetSeq().GetInst().GetMol()) {
        case CSeq_inst::eMol_aa:
        case CSeq_inst::eMol_dna:
        case CSeq_inst::eMol_rna:
        case CSeq_inst::eMol_na:
            break;
        default:
            continue;   // applies to the enclosing for loop
        }

        const Sequence *created = new Sequence(parent, it->GetObject().SetSeq());
        if (!(created && created->identifier)) {
            FATALMSG("Can't create Sequence object, aborting...");
        }
        seqlist.push_back(created);
    }
}
void
CShortReadFastaInputSource::x_ReadFastc(CBioseq_set& bioseq_set,
                                        TSeqPos batch_size)
{
    string id;

    // tags to indicate paired sequences
    CRef<CSeqdesc> seqdesc_first(new CSeqdesc);
    seqdesc_first->SetUser().SetType().SetStr("Mapping");
    seqdesc_first->SetUser().AddField("has_pair", eFirstSegment);

    CRef<CSeqdesc> seqdesc_last(new CSeqdesc);
    seqdesc_last->SetUser().SetType().SetStr("Mapping");
    seqdesc_last->SetUser().AddField("has_pair", eLastSegment);

    m_BasesAdded = 0;
    while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) {
        ++(*m_LineReader);
        m_Line = **m_LineReader;

        // ignore empty lines
        if (m_Line.empty()) {
            continue;
        }

        // if defline
        if (m_Line[0] == '>') {
            id = x_ParseDefline(m_Line);
        }
        else {
            // otherwise sequence

            // make sure that a defline was read first
            if (id.empty()) {
                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"Missing defline before line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // find '><' that separate reads of a pair
            size_t p = m_Line.find('>');
            if (p == CTempString::npos || m_Line[p + 1] != '<') {

                NCBI_THROW(CInputException, eInvalidInput,
                           (string)"FASTC parse error: Sequence separator '><'"
                           " was not found in line: " +
                           NStr::IntToString(m_LineReader->GetLineNumber()));
            }

            // set up reads, there are two sequences in the same line separated
            char* first = (char*)m_Line.data();
            char* second = (char*)m_Line.data() + p + 2;
            size_t first_len = p;
            size_t second_len = m_Line.length() - p - 2;

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".1");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(first_len);
                first[first_len] = 0;
                bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(first));
                bioseq.SetDescr().Set().push_back(seqdesc_first);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}

            {{
                CRef<CSeq_id> seqid(new CSeq_id);
                seqid->Set(CSeq_id::e_Local, id + ".2");

                CRef<CSeq_entry> seq_entry(new CSeq_entry);
                CBioseq& bioseq = seq_entry->SetSeq();
                bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
                bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
                bioseq.SetId().clear();
                bioseq.SetId().push_back(seqid);
                bioseq.SetInst().SetLength(second_len);
                second[second_len] = 0;
                bioseq.SetInst().SetSeq_data().SetIupacna(CIUPACna(second));
                bioseq.SetDescr().Set().push_back(seqdesc_last);

                // add a sequence to the batch
                bioseq_set.SetSeq_set().push_back(seq_entry);
            }}
            m_BasesAdded += first_len + second_len;
            id.clear();
        }
    }
}
/// Read a batch of sequences in FASTA or FASTQ format (selected by
/// m_Format) from m_LineReader and append them to a Bioseq-set.
///
/// If m_IsPaired is set, sequences are read two at a time from the same
/// reader; when both reads of an iteration are non-empty they are tagged as
/// first and last segment of a pair through a "Mapping" user-object
/// descriptor carrying a "has_pair" field.
///
/// @param bioseq_set Container that receives the sequences [in|out]
/// @param batch_size Reading stops once m_BasesAdded reaches this value [in]
/// @throws CInputException (eInvalidInput) if m_Format is neither eFasta
///         nor eFastq when a sequence is about to be read
void
CShortReadFastaInputSource::x_ReadFastaOrFastq(CBioseq_set& bioseq_set,
                                               TSeqPos batch_size)
{
    // descriptor for the first sequence of a pair
    CRef<CSeqdesc> first_tag(new CSeqdesc);
    first_tag->SetUser().SetType().SetStr("Mapping");
    first_tag->SetUser().AddField("has_pair", eFirstSegment);

    // descriptor for the last sequence of a pair
    CRef<CSeqdesc> last_tag(new CSeqdesc);
    last_tag->SetUser().SetType().SetStr("Mapping");
    last_tag->SetUser().AddField("has_pair", eLastSegment);

    // NOTE(review): m_BasesAdded is only reset in this function; it is
    // presumably advanced by the one-sequence readers -- confirm.
    m_BasesAdded = 0;
    while (m_BasesAdded < batch_size && !m_LineReader->AtEOF()) {

        CRef<CSeq_entry> mate1;
        CRef<CSeq_entry> mate2;

        if (m_Format == eFasta) {
            mate1 = x_ReadFastaOneSeq(m_LineReader);
        }
        else if (m_Format == eFastq) {
            mate1 = x_ReadFastqOneSeq(m_LineReader);
        }
        else {
            NCBI_THROW(CInputException, eInvalidInput, "Invalid input file "
                       "format x_ReadFastaOrFastq read either FASTA or FASTQ");
        }

        // if paired read the next sequence from the same reader
        if (m_IsPaired) {
            if (m_Format == eFasta) {
                mate2 = x_ReadFastaOneSeq(m_LineReader);
            }
            else if (m_Format == eFastq) {
                mate2 = x_ReadFastqOneSeq(m_LineReader);
            }
            else {
                NCBI_THROW(CInputException, eInvalidInput, "Invalid input file "
                           "format x_ReadFastaOrFastq read either FASTA or "
                           "FASTQ");
            }
        }

        // In unpaired mode mate2 is never assigned, so the code below
        // reduces to "store mate1 if it is non-empty".
        const bool have_mate1 = mate1.NotEmpty();
        const bool have_mate2 = mate2.NotEmpty();

        // mark a pair only when both reads are present
        if (have_mate1 && have_mate2) {
            mate1->SetSeq().SetDescr().Set().push_back(first_tag);
            mate2->SetSeq().SetDescr().Set().push_back(last_tag);
        }

        if (have_mate1) {
            bioseq_set.SetSeq_set().push_back(mate1);
        }
        if (have_mate2) {
            bioseq_set.SetSeq_set().push_back(mate2);
        }
    }
}