示例#1
0
/**
 * Builds a document holding a single position frequency matrix object.
 *
 * The matrix is read through WeightMatrixIO::readPFMatrix() from the URL of
 * the given adapter. The resulting object is named after the base name of
 * that file.
 *
 * @param io      adapter that supplies the source URL, adapter id and factory
 * @param dbiRef  database the matrix object is created in
 * @param fs      hints forwarded to the matrix object and to the document
 * @param os      receives the error when the file cannot be read as PFM or
 *                when the matrix has zero length
 * @return the new document, or NULL when os reports an error or cancellation
 */
Document* PFMatrixFormat::loadDocument(IOAdapter* io, const U2DbiRef& dbiRef, const QVariantMap& fs, U2OpStatus& os) {
    DbiOperationsBlock opBlock(dbiRef, os);
    CHECK_OP(os, NULL);
    Q_UNUSED(opBlock);

    IOAdapterFactory* adapterFactory = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(io->getAdapterId());

    // The reader reports problems through its own state object; they are
    // translated into a format-level message on the caller's status below.
    TaskStateInfo readState;
    PFMatrix matrix = WeightMatrixIO::readPFMatrix(adapterFactory, io->getURL().getURLString(), readState);
    if (readState.hasError()) {
        os.setError(tr("The file format is not PFM"));
    } else if (matrix.getLength() == 0) {
        os.setError(tr("Zero length or corrupted model\nMaybe model data are not enough for selected algorithm"));
    }
    CHECK_OP(os, NULL);

    const QString objectName = QFileInfo(io->getURL().getURLString()).baseName();
    PFMatrixObject* matrixObject = PFMatrixObject::createInstance(matrix, objectName, dbiRef, os, fs);
    CHECK_OP(os, NULL);

    QList<GObject*> documentObjects;
    documentObjects.append(matrixObject);
    return new Document(this, io->getFactory(), io->getURL(), dbiRef, documentObjects, fs);
}
示例#2
0
/**
 * Reads the whole stream as local-8-bit text and wraps it into a document
 * with a single TextObject named after the base file name of the URL.
 *
 * @param io      adapter to read from
 * @param dbiRef  database the text object is created in
 * @param fs      document hints; only DBI_FOLDER_HINT is forwarded to the
 *                text object, the full map is passed to the document
 * @param os      receives read errors and progress updates
 * @return the new document, or NULL when os reports an error or cancellation
 */
Document* PlainTextFormat::loadDocument(IOAdapter* io, const U2DbiRef& dbiRef, const QVariantMap& fs, U2OpStatus& os) {
    DbiOperationsBlock opBlock(dbiRef, os);
    CHECK_OP(os, NULL);
    Q_UNUSED(opBlock);

    QString content;
    const int bytesLeft = io->left();
    if (bytesLeft > 0) {
        // Pre-size the string when the adapter reports the remaining amount.
        content.reserve(bytesLeft);
    }

    QByteArray chunk(BUFF_SIZE, '\0');
    for (;;) {
        const int chunkLen = io->readBlock(chunk.data(), BUFF_SIZE);
        if (chunkLen <= 0) {
            break;
        }

        // Every byte read is expected to become exactly one character;
        // any other outcome is reported as a read error.
        const int expectedLength = content.length() + chunkLen;
        content.append(QString::fromLocal8Bit(chunk.data(), chunkLen));
        if (content.length() != expectedLength) {
            os.setError(L10N::errorReadingFile(io->getURL()));
            break;
        }
        os.setProgress(io->getProgress());
    }

    CHECK_OP(os, NULL);

    //todo: check file-readonly status?

    QVariantMap objectHints;
    objectHints.insert(DBI_FOLDER_HINT, fs.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER));

    TextObject* textObject = TextObject::createInstance(content, io->getURL().baseFileName(), dbiRef, os, objectHints);
    CHECK_OP(os, NULL);

    QList<GObject*> documentObjects;
    documentObjects.append(textObject);
    return new Document(this, io->getFactory(), io->getURL(), dbiRef, documentObjects, fs);
}
示例#3
0
/**
 * Reads all FASTQ records from the stream and imports them into the database.
 * FASTQ format specification: http://maq.sourceforge.net/fastq.shtml
 *
 * In separate mode (gapSize == -1) every record becomes its own
 * U2SequenceObject with quality scores attached. In merge mode all records
 * are concatenated into one sequence, separated by gapSize default symbols,
 * and an annotation object describing the original records is appended.
 *
 * @param io               adapter to read from
 * @param dbiRef           database the sequences are imported into
 * @param hints            reading hints; DBI_FOLDER_HINT,
 *                         DocumentReadingMode_MaxObjectsInDoc and
 *                         DocumentReadingMode_DontMakeUniqueNames are read here
 * @param docUrl           used only to build error messages
 * @param objects          receives the created objects; on error or
 *                         cancellation every object in the list is deleted
 *                         and the list is cleared
 * @param os               error / cancellation / progress status
 * @param gapSize          -1 disables merging, otherwise the gap length
 * @param predictedSize    capacity reserved for the sequence and quality buffers
 * @param writeLockReason  cleared on entry; set to MERGED_SEQ_LOCK when more
 *                         than one record was merged
 */
static void load(IOAdapter* io, const U2DbiRef& dbiRef, const QVariantMap& hints, const GUrl& docUrl, QList<GObject*>& objects, U2OpStatus& os,
                 int gapSize, int predictedSize, QString& writeLockReason) {
    DbiOperationsBlock opBlock(dbiRef, os);
    CHECK_OP(os, );
    Q_UNUSED(opBlock);
    writeLockReason.clear();

    const bool merge = gapSize != -1;
    QByteArray sequence;
    QByteArray qualityScores;
    QStringList headers;
    QSet<QString> uniqueNames;

    QVector<U2Region> mergedMapping;
    sequence.reserve(predictedSize);
    qualityScores.reserve(predictedSize);

    // for lower case annotations
    GObjectReference sequenceRef;
    qint64 sequenceStart = 0;

    U2SequenceImporter seqImporter(hints, true);
    const QString folder = hints.value(DocumentFormat::DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();
    int seqNumber = 0;
    int progressUpNum = 0;

    const int objectsCountLimit = hints.contains(DocumentReadingMode_MaxObjectsInDoc) ? hints[DocumentReadingMode_MaxObjectsInDoc].toInt() : -1;
    const bool settingsMakeUniqueName = !hints.value(DocumentReadingMode_DontMakeUniqueNames, false).toBool();
    while (!os.isCoR()) {
        //read header
        QString sequenceName = readSequenceName(os, io, '@');
        // check for eof while trying to read another FASTQ block
        if (io->isEof()) {
            break;
        }

        CHECK_OP_BREAK(os);

        if (sequenceName.isEmpty()) {
            sequenceName = "Sequence";
        }

        // In merge mode only the first record opens a sequence; later records
        // are appended to it.
        if (!merge || seqNumber == 0) {
            QString objName = sequenceName;
            if (settingsMakeUniqueName) {
                objName = (merge) ? "Sequence" : TextUtils::variate(sequenceName, "_", uniqueNames);
                objName.squeeze();
                uniqueNames.insert(objName);
            }
            seqImporter.startSequence(dbiRef, folder, objName, false, os);
            CHECK_OP_BREAK(os);
        }

        //read sequence
        // 'sequence' still holds the previous record here, so its length plus
        // the gap gives the start offset of the record about to be read.
        if (merge && sequence.length() > 0) {
            seqImporter.addDefaultSymbolsBlock(gapSize, os);
            sequenceStart += sequence.length();
            sequenceStart += gapSize;
            CHECK_OP_BREAK(os);
        }

        sequence.clear();
        readSequence(os, io, sequence);
        // NOTE(review): the division is integral, so qCeil() has no effect and
        // records shorter than 1 000 000 bytes pre-lock 0 MB. Left unchanged
        // to keep the memory accounting as it was — confirm the intent.
        MemoryLocker lSequence(os, qCeil(sequence.size()/(1000*1000)));
        CHECK_OP_BREAK(os);
        Q_UNUSED(lSequence);

        seqImporter.addBlock(sequence.data(), sequence.length(), os);
        CHECK_OP_BREAK(os);

        // The name on the '+' line is optional, but when present it has to
        // repeat the record name. The message is built at the failure site so
        // that it always names the file of the current call.
        QString qualSequenceName = readSequenceName(os, io, '+');
        if (!qualSequenceName.isEmpty() && !(sequenceName == qualSequenceName)) {
            os.setError(U2::FastqFormat::tr("Not a valid FASTQ file: %1, sequence name differs from quality scores name: %2 and %3")
                .arg(docUrl.getURLString()).arg(sequenceName).arg(qualSequenceName));
            break;
        }

        // read qualities
        qualityScores.clear();
        readQuality(os, io, qualityScores, sequence.size());
        CHECK_OP_BREAK(os);

        // Previously this message lived in a function-local static that
        // captured docUrl of the very first call, so later files were
        // reported under the wrong name.
        if (sequence.length() != qualityScores.length()) {
            os.setError(U2::FastqFormat::tr("Not a valid FASTQ file: %1. Bad quality scores: inconsistent size.").arg(docUrl.getURLString()));
            break;
        }

        seqNumber++;
        progressUpNum++;
        if (merge) {
            headers.append(sequenceName);
            mergedMapping.append(U2Region(sequenceStart, sequence.length()));
        }
        else {
            if (objectsCountLimit > 0 && objects.size() >= objectsCountLimit) {
                os.setError(FastqFormat::tr("File \"%1\" contains too many sequences to be displayed. "
                    "However, you can process these data using instruments from the menu <i>Tools -> NGS data analysis</i> "
                    "or pipelines built with Workflow Designer.")
                    .arg(io->getURL().getURLString()));
                break;
            }

            U2Sequence u2seq = seqImporter.finalizeSequenceAndValidate(os);
            CHECK_OP_BREAK(os);
            sequenceRef = GObjectReference(io->getURL().getURLString(), u2seq.visualName, GObjectTypes::SEQUENCE, U2EntityRef(dbiRef, u2seq.id));

            U2SequenceObject* seqObj = new U2SequenceObject(u2seq.visualName, U2EntityRef(dbiRef, u2seq.id));
            seqObj->setQuality(DNAQuality(qualityScores));
            objects << seqObj;

            U1AnnotationUtils::addAnnotations(objects, seqImporter.getCaseAnnotations(), sequenceRef, NULL, hints);
        }
        if (PROGRESS_UPDATE_STEP == progressUpNum) {
            progressUpNum = 0;
            os.setProgress(io->getProgress());
        }
    }

    // On failure nothing is handed back to the caller.
    CHECK_OP_EXT(os, qDeleteAll(objects); objects.clear(), );
    bool emptyObjects = objects.isEmpty();
    CHECK_EXT(!emptyObjects || merge, os.setError(Document::tr("Document is empty.")), );
    SAFE_POINT(headers.size() == mergedMapping.size(), "headers <-> regions mapping failed!", );

    if (!merge) {
        return;
    }

    // Merge mode: close the single concatenated sequence and describe the
    // original records through the headers / regions mapping.
    U2Sequence u2seq = seqImporter.finalizeSequenceAndValidate(os);
    CHECK_OP(os,);

    sequenceRef = GObjectReference(io->getURL().getURLString(), u2seq.visualName, GObjectTypes::SEQUENCE, U2EntityRef(dbiRef, u2seq.id));

    U1AnnotationUtils::addAnnotations(objects, seqImporter.getCaseAnnotations(), sequenceRef, NULL, hints);
    objects << new U2SequenceObject(u2seq.visualName, U2EntityRef(dbiRef, u2seq.id));
    objects << DocumentFormatUtils::addAnnotationsForMergedU2Sequence(sequenceRef, dbiRef, headers, mergedMapping, hints);
    if (headers.size() > 1) {
        writeLockReason = DocumentFormat::MERGED_SEQ_LOCK;
    }
}
void EMBLGenbankAbstractDocument::load(const U2DbiRef& dbiRef, IOAdapter* io, QList<GObject*>& objects, QVariantMap& fs, U2OpStatus& os, QString& writeLockReason) {
    DbiOperationsBlock opBlock(dbiRef, os);
    CHECK_OP(os, );
    Q_UNUSED(opBlock);
    writeLockReason.clear();

    //get settings
    int gapSize = qBound(-1, DocumentFormatUtils::getMergeGap(fs), 1000*1000);
    bool merge = gapSize!=-1;

    QScopedPointer<AnnotationTableObject> mergedAnnotations(NULL);
    QStringList contigs;
    QVector<U2Region> mergedMapping;

    // Sequence loading is 'lazy', so, if there is no sequence, it won't be created and there is no need to remove it.
    U2SequenceImporter seqImporter(fs, true);
    const QString folder = fs.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();

    QSet<QString> usedNames;

    GObjectReference sequenceRef(GObjectReference(io->getURL().getURLString(), "", GObjectTypes::SEQUENCE));

    QByteArray readBuffer(ParserState::LOCAL_READ_BUFFER_SIZE, '\0');
    ParserState st(isNcbiLikeFormat() ? 12 : 5, io, NULL, os);
    st.buff = readBuffer.data();

    TmpDbiObjects dbiObjects(dbiRef, os);
    int num_sequence = 0;

    qint64 sequenceStart = 0;
    int sequenceSize = 0;
    int fullSequenceSize = 0;
    const int objectsCountLimit = fs.contains(DocumentReadingMode_MaxObjectsInDoc) ? fs[DocumentReadingMode_MaxObjectsInDoc].toInt() : -1;

    for (int i=0; !os.isCoR(); i++, ++num_sequence) {
        if (objectsCountLimit > 0 && objects.size() >= objectsCountLimit) {
            os.setError(EMBLGenbankAbstractDocument::tr("File \"%1\" contains too many sequences to be displayed. "
                "However, you can process these data using instruments from the menu <i>Tools -> NGS data analysis</i> "
                "or pipelines built with Workflow Designer.")
                .arg(io->getURL().getURLString()));
            break;
        }

        //TODO: reference to a local variable??? Such a pointer will become invalid
        EMBLGenbankDataEntry data;
        st.entry = &data;

        if (num_sequence == 0 || merge == false){
            seqImporter.startSequence(dbiRef, folder, "default sequence name", false, os); //change name and circularity after finalize method
            CHECK_OP(os, );
        }

        sequenceSize = 0;
        os.setDescription(tr("Reading entry header"));
        int offset = 0;
        if (merge && num_sequence > 0) {
            offset = gapSize;
        }
        if (!readEntry(&st,seqImporter,sequenceSize,fullSequenceSize,merge,offset, os)) {
            break;
        }

        if (merge && sequenceSize > 0 && num_sequence > 0) {
                sequenceStart = fullSequenceSize - sequenceSize;
                sequenceStart += gapSize;
                fullSequenceSize += gapSize;
        }

        // tolerate blank lines between records
        char ch;
        bool b;
        while ((b = st.io->getChar(&ch)) && (ch == '\n' || ch == '\r')){}
        if (b) {
            st.io->skip(-1);
        }

        AnnotationTableObject *annotationsObject = NULL;

        if (data.hasAnnotationObjectFlag) {
            QString annotationName = genObjectName(usedNames, data.name, data.tags, i+1, GObjectTypes::ANNOTATION_TABLE);

            QVariantMap hints;
            hints.insert(DBI_FOLDER_HINT, fs.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER));
            if (Q_UNLIKELY(merge && NULL == mergedAnnotations)) {
                mergedAnnotations.reset(new AnnotationTableObject(annotationName, dbiRef, hints));
            }
            annotationsObject = merge ? mergedAnnotations.data() : new AnnotationTableObject(annotationName, dbiRef, hints);

            QStringList groupNames;
            QMap<QString, QList<SharedAnnotationData> > groupName2Annotations;
            for (int i = 0, n = data.features.size(); i < n; ++i) {
                SharedAnnotationData &d = data.features[i];
                if (!d->location->regions.isEmpty()) {
                    for (int i = 0, n = d->location->regions.size(); i < n; ++i) {
                        // for some reason larger numbers cannot be stored within rtree SQLite tables
                        if (d->location->regions[i].endPos() > 9223371036854775807LL) {
                            d->location->regions[i].length = 9223371036854775807LL - d->location->regions[i].startPos;
                        }
                    }
                }
                groupNames.clear();
                d->removeAllQualifiers(GBFeatureUtils::QUALIFIER_GROUP, groupNames);
                if (groupNames.isEmpty()) {
                    groupName2Annotations[""].append(d);
                } else {
                    foreach(const QString &gName, groupNames) {
                        groupName2Annotations[gName].append(d);
                    }
                }
                CHECK_OP(os, );
            }
示例#5
0
static void load(IOAdapter* io, const U2DbiRef& dbiRef,  QList<GObject*>& objects, const QVariantMap& fs, U2OpStatus& os) {
    DbiOperationsBlock opBlock(dbiRef, os);
    CHECK_OP(os, );
    Q_UNUSED(opBlock);

    U2SequenceImporter seqImporter(fs, true);
    const QString folder = fs.value(DocumentFormat::DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();

    QByteArray readBuffer(DocumentFormat::READ_BUFF_SIZE, '\0');
    char* buff  = readBuffer.data();

    QBitArray ALPHAS = TextUtils::ALPHA_NUMS;
    ALPHAS['-'] = true;

    QByteArray seq;
    QString seqName(io->getURL().baseFileName());
    //reading sequence
    QBuffer writer(&seq);
    writer.open(QIODevice::WriteOnly);
    TmpDbiObjects dbiObjects(dbiRef, os);
    bool ok = true;
    int len = 0;
    bool isStarted = false;
    int sequenceCounter = 0;
    bool terminatorFound = false;
    bool isSplit = fs.value((DocumentReadingMode_SequenceAsSeparateHint), false).toBool();


    while (ok && !io->isEof()) {
        len = io->readLine(buff, DocumentFormat::READ_BUFF_SIZE, &terminatorFound);
        if (len <= 0){
            continue;
        }

        seq.clear();
        bool isSeek = writer.seek(0);
                assert(isSeek); Q_UNUSED(isSeek);
        if (os.isCoR()) {
            break;
        }

        for (int i=0; i<len && ok; i++) {
            char c = buff[i];
            if (ALPHAS[(uchar)c]) {
                ok = writer.putChar(c);
            }
        }
        if(seq.size()>0 && isStarted == false ){
            QString name = sequenceCounter == 0 ? seqName : seqName + QString("_%1").arg(sequenceCounter);
            isStarted = true;
            seqImporter.startSequence(dbiRef, folder, name, false, os);
        }
        if(isStarted){
            seqImporter.addBlock(seq.data(),seq.size(),os);
        }
        if (seq.size()>0 && isStarted && terminatorFound && isSplit){
            finishSequence(objects, io, os, dbiRef, fs, dbiObjects, seqImporter);
            sequenceCounter++;
            isStarted = false;
        }
        if (os.isCoR()) {
            break;
        }
        os.setProgress(io->getProgress());
    }
    writer.close();

    CHECK_OP(os, );
    if (sequenceCounter == 0){
        CHECK_EXT(isStarted == true, os.setError(RawDNASequenceFormat::tr("Sequence is empty")), );
    }
示例#6
0
/**
 * Reads all FASTA records from the stream and imports them into the database.
 *
 * In separate mode (gapSize == -1) every record becomes its own
 * U2SequenceObject and its header line is stored as a DNAInfo::FASTA_HDR
 * string attribute. Records whose sequence is empty are skipped and listed in
 * a warning. In merge mode all records are concatenated into one sequence,
 * separated by gapSize default symbols, and an annotation object describing
 * the original records is appended.
 *
 * @param io               adapter to read from
 * @param dbiRef           database the sequences are imported into
 * @param fs               reading hints; DBI_FOLDER_HINT,
 *                         DocumentReadingMode_MaxObjectsInDoc and
 *                         DocumentReadingMode_DontMakeUniqueNames are read here
 * @param objects          receives the created objects; when the main loop
 *                         ends with an error or cancellation every object in
 *                         the list is deleted and the list is cleared
 * @param gapSize          -1 disables merging, otherwise the gap length
 * @param writeLockReason  cleared during setup; set when more than one record
 *                         was merged
 * @param os               error / cancellation / progress / warning status
 */
static void load(IOAdapter* io, const U2DbiRef& dbiRef, const QVariantMap& fs, QList<GObject*>& objects,
                 int gapSize, QString& writeLockReason, U2OpStatus& os)
{
    DbiOperationsBlock opBlock(dbiRef, os);
    CHECK_OP(os, );
    Q_UNUSED(opBlock);

    static char fastaCommentStartChar = FastaFormat::FASTA_COMMENT_START_SYMBOL;

    MemoryLocker memoryLocker(os, 1);
    CHECK_OP(os, );

    writeLockReason.clear();
    // One extra byte so that buff[len] can always be zero-terminated below.
    QByteArray readBuff(DocumentFormat::READ_BUFF_SIZE + 1, 0);
    char* buff = readBuff.data();
    qint64 len = 0;

    bool merge = gapSize != -1;
    QStringList headers;
    QSet<QString> uniqueNames;
    QVector<U2Region> mergedMapping;

    // for lower case annotations
    GObjectReference sequenceRef;

    //skip leading whites if present
    bool lineOk = true;
    static QBitArray nonWhites = ~TextUtils::WHITES;
    io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, nonWhites, IOAdapter::Term_Exclude, &lineOk);
    CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );

    U2SequenceImporter seqImporter(fs, true);
    const QString folder = fs.value(DocumentFormat::DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();

    qint64 sequenceStart = 0;
    int sequenceNumber = 0;
    DbiConnection con(dbiRef, os);
    // Set when the inner loop has already consumed the next record's header
    // line; that line is then still in 'buff' with its length in 'len'.
    bool headerReaded = false;
    QStringList emptySeqNames;

    const int objectsCountLimit = fs.contains(DocumentReadingMode_MaxObjectsInDoc) ? fs[DocumentReadingMode_MaxObjectsInDoc].toInt() : -1;
    const bool settingsMakeUniqueName = !fs.value(DocumentReadingMode_DontMakeUniqueNames, false).toBool();
    while (!os.isCoR()) {
        //skip start comments and read header
        if(!headerReaded){
            do{
                len = io->readLine(buff, DocumentFormat::READ_BUFF_SIZE);
                CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );
            }while(buff[0] == fastaCommentStartChar && len > 0);
        }

        if (len == 0 && io->isEof()) { //end if stream
            break;
        }
        CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );
        CHECK_EXT_BREAK(lineOk, os.setError(FastaFormat::tr("Line is too long")));

        // Header text without the leading start symbol.
        QString headerLine = QString(QByteArray(buff+1, len-1)).trimmed();
        CHECK_EXT_BREAK(buff[0] == FastaFormat::FASTA_HEADER_START_SYMBOL, os.setError(FastaFormat::tr("First line is not a FASTA header")));

        //read sequence
        // In merge mode only the first record opens a sequence; later records
        // are appended to it.
        if (sequenceNumber == 0 || !merge) {
            QString objName = headerLine;
            if(objName.isEmpty()){
                objName = "Sequence";
            }
            if (settingsMakeUniqueName) {
                objName = (merge) ? "Sequence" : TextUtils::variate(objName, "_", uniqueNames);
                objName.squeeze();
                memoryLocker.tryAcquire(2*objName.size());
                CHECK_OP_BREAK(os);
                uniqueNames.insert(objName);
            }
            // NOTE(review): 'os' is passed first here while the other loaders
            // in this file pass it last — confirm against the
            // U2SequenceImporter::startSequence signature in use.
            seqImporter.startSequence(os, dbiRef, folder, objName, false);
            CHECK_OP_BREAK(os);

            sequenceRef = GObjectReference(io->getURL().getURLString(), objName, GObjectTypes::SEQUENCE);
        }
        if (sequenceNumber >= 1 && merge) {
            seqImporter.addDefaultSymbolsBlock(gapSize, os);
            sequenceStart += gapSize;
            CHECK_OP_BREAK(os);
        }
        int sequenceLen = 0;
        while (!os.isCoR()) {
            // Skip empty lines until data or the end of the stream.
            do {
                len = io->readLine(buff, DocumentFormat::READ_BUFF_SIZE);
                CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );
            } while (len <= 0 && !io->isEof());
            CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );

            if (len <= 0 && io->isEof()) {
                break;
            }
            CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );

            buff[len] = 0;

            if(buff[0] != fastaCommentStartChar && buff[0] != FastaFormat::FASTA_HEADER_START_SYMBOL){
                // Sequence data line: strip whitespace and append the rest.
                len = TextUtils::remove(buff, len, TextUtils::WHITES);
                if(len > 0){
                    seqImporter.addBlock(buff, len, os);
                    sequenceLen += len;
                }
            }else if( buff[0] == FastaFormat::FASTA_HEADER_START_SYMBOL){
                // Header of the next record: leave it in 'buff' for the outer loop.
                headerReaded = true;
                break;
            }

            CHECK_OP_BREAK(os);
            os.setProgress(io->getProgress());
        }

        if (merge) {
            memoryLocker.tryAcquire(headerLine.size());
            CHECK_OP_BREAK(os);
            headers.append(headerLine);
            mergedMapping.append(U2Region(sequenceStart, sequenceLen));
        } else {
            if (objectsCountLimit > 0 && objects.size() >= objectsCountLimit) {
                os.setError(FastaFormat::tr("File \"%1\" contains too many sequences to be displayed. "
                    "However, you can process these data using instruments from the menu <i>Tools -> NGS data analysis</i> "
                    "or pipelines built with Workflow Designer.")
                    .arg(io->getURL().getURLString()));
                break;
            }
            memoryLocker.tryAcquire(800);
            CHECK_OP_BREAK(os);
            U2Sequence seq = seqImporter.finalizeSequenceAndValidate(os);
            // An empty record is not fatal: reset the error, remember the
            // name for the warning below and go on with the next record.
            if (os.hasError() && os.getError() == U2SequenceImporter::EMPTY_SEQUENCE_ERROR) {
                os.setError("");
                emptySeqNames << headerLine;
                continue;
            }
            // Any other finalisation failure stops here, before the
            // unvalidated sequence id is used for a database write.
            CHECK_OP_BREAK(os);
            sequenceRef.entityRef = U2EntityRef(dbiRef, seq.id);

            //TODO parse header
            U2StringAttribute attr(seq.id, DNAInfo::FASTA_HDR, headerLine);
            con.dbi->getAttributeDbi()->createStringAttribute(attr, os);
            CHECK_OP_BREAK(os);

            objects << new U2SequenceObject(seq.visualName, U2EntityRef(dbiRef, seq.id));
            CHECK_OP_BREAK(os);

            U1AnnotationUtils::addAnnotations(objects, seqImporter.getCaseAnnotations(), sequenceRef, NULL, fs);
        }
        sequenceStart += sequenceLen;
        sequenceNumber++;
        ioLog.trace(QString("Sequence #%1 is processed").arg(sequenceNumber));
    }

    // On failure nothing is handed back to the caller.
    CHECK_OP_EXT(os, qDeleteAll(objects); objects.clear(), );
    CHECK_EXT(!objects.isEmpty() || merge, os.setError(Document::tr("Document is empty.")), );
    SAFE_POINT(headers.size() == mergedMapping.size(), "headers <-> regions mapping failed!", );
    ioLog.trace("All sequences are processed");

    if (!emptySeqNames.isEmpty()) {
        QString warningMessage;
        warningMessage.append(FastaFormat::tr("Loaded sequences: %1.\n").arg(sequenceNumber));
        warningMessage.append(FastaFormat::tr("Skipped sequences: %1.\n").arg(emptySeqNames.size()));
        warningMessage.append(FastaFormat::tr("The following sequences are empty:\n%1").arg(emptySeqNames.join(",\n")));
        os.addWarning(warningMessage);
    }

    if (!merge) {
        return;
    }

    // Merge mode: close the single concatenated sequence and describe the
    // original records through the headers / regions mapping.
    U2Sequence seq = seqImporter.finalizeSequenceAndValidate(os);
    CHECK_OP(os, );
    sequenceRef.entityRef = U2EntityRef(dbiRef, seq.id);

    U1AnnotationUtils::addAnnotations(objects, seqImporter.getCaseAnnotations(), sequenceRef, NULL, fs);
    objects << new U2SequenceObject(seq.visualName, U2EntityRef(dbiRef, seq.id));
    objects << DocumentFormatUtils::addAnnotationsForMergedU2Sequence( sequenceRef, dbiRef, headers, mergedMapping, fs );
    if (headers.size() > 1) {
        writeLockReason = QObject::tr("Document sequences were merged");
    }
}