示例#1
0
// diagnose
//   SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
//   ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
//   ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
//   .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
//   LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
//   !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
//   |                         |    |        |                              |                     |
//  33                        59   64       73                            104                   126 <- maxValue is value from here
// S 0........................26...31.......40
// X                          -5....0........9.............................40
// I                                0........9.............................40
// J                                   3.....9.............................40
// L 0.2......................26...31........41
//
//  S - Sanger        Phred+33,  raw reads typically (0, 40)
//  X - Solexa        Solexa+64, raw reads typically (-5, 40)
//  I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
//  J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40) with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
//  L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
// Guesses the quality encoding of the input FASTQ file.
//
// Scans the quality strings of the leading reads of settings.inputUrl, tracks the
// smallest and largest raw quality character codes seen, and passes that range to
// DNAQuality::detectTypeByMinMaxQualityValues().
//
// The CHECK guards use DNAQualityType_Sanger as the fallback value when the iterator
// reports an error or the task is cancelled/failed while scanning.
DNAQualityType FastqQualityTrimTask::detectQualityType(){
    // Start with an inverted range (max at the lowest printable code, min at the
    // highest) so the first inspected character initialises both bounds.
    int maxValue = 33;
    int minValue = 126;
    FASTQIterator iter_qual(settings.inputUrl, stateInfo);
    CHECK(!stateInfo.hasError(), DNAQualityType_Sanger);

    const int maxReadsToCheck = 1000;   // check only first 1000 reads in file
    int counter = 0;
    while (iter_qual.hasNext()) {
        CHECK(!stateInfo.isCoR(), DNAQualityType_Sanger);

        // ">=" so that exactly maxReadsToCheck reads are sampled (the previous
        // "> 1000" test let a 1001st read through).
        if (counter >= maxReadsToCheck) {
            break;
        }

        DNASequence dna = iter_qual.next();
        const int seqLen = dna.length();
        if (seqLen > dna.quality.qualCodes.length()) {
            // Quality string is shorter than the sequence: ignore this read.
            // Skipped reads do not count towards the sampling limit.
            continue;
        }
        for (int pos = 0; pos < seqLen; pos++) {
            const int code = static_cast<int>(dna.quality.qualCodes.at(pos));
            maxValue = qMax(code, maxValue);
            minValue = qMin(code, minValue);
        }
        counter++;
    }
    return DNAQuality::detectTypeByMinMaxQualityValues(minValue, maxValue);
}
void CASAVAFilterTask::runStep(){
    int ncount = 0;
    int ycount = 0;

    QScopedPointer<IOAdapter> io  (IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append));

    //1:N:0:TAAGGG
    QRegExp pattern (":Y:[^:]:");
    FASTQIterator iter(settings.inputUrl);
    while(iter.hasNext()){
        if(stateInfo.isCoR()){
            return;
        }
        DNASequence seq = iter.next();
        QString comment = DNAInfo::getFastqComment(seq.info);
        if(pattern.indexIn(comment) != -1){
            ycount++;
        }else{
            FastqFormat::writeEntry(seq.getName() + " " + comment, seq, io.data(), "Writing error", stateInfo, false);
            ncount++;
        }
    }

    algoLog.info(QString("Discarded by CASAVA filter %1").arg(ycount));
    algoLog.info(QString("Accepted by CASAVA filter %1").arg(ncount));
    algoLog.info(QString("Total by CASAVA FILTER: %1").arg(ncount + ycount));
}
// Allocate(n) must leave the sequence reporting a length of n, including n == 0.
TEST_F(DNASequenceTest, Allocate) {
    // Zero-length allocation on the fixture sequence.
    dnaOne.Allocate(0);
    EXPECT_EQ(dnaOne.length, 0);

    // Non-empty allocation on a fresh local sequence.
    DNASequence hundredBases;
    hundredBases.Allocate(100);
    EXPECT_EQ(hundredBases.length, 100);
}
// Exercises DNASequence::Copy(rhs, rhsPos, rhsLength): a whole-sequence copy,
// a copy from an empty source, and a copy starting at an offset.
TEST_F(DNASequenceTest, Copy) {
    // Give the fixture sequence ten known bases to copy from.
    DNALength srcLen = 10;
    Nucleotide * srcBases = new Nucleotide [srcLen];

    string bases("AGAAAAACAA");
    for (DNALength k = 0; k < srcLen; ++k) {
        srcBases[k] = bases[k];
    }

    dnaOne.length = srcLen;
    dnaOne.seq = srcBases;

    // Whole-sequence copy: same content in a different buffer, flagged for deletion.
    DNASequence copyTarget;
    copyTarget.Copy(dnaOne);

    EXPECT_EQ(copyTarget.length, dnaOne.length);
    EXPECT_NE(copyTarget.seq   , dnaOne.seq);
    EXPECT_TRUE(copyTarget.deleteOnExit);
    EXPECT_EQ(memcmp(copyTarget.seq, dnaOne.seq, dnaOne.length), 0);

    // Copying from an empty source: the assertions below expect the target to
    // end up empty (length 0, NULL buffer).
    DNASequence emptySource;
    copyTarget.Copy(emptySource);
    EXPECT_EQ(copyTarget.length, 0);
    EXPECT_NE(copyTarget.seq, dnaOne.seq);
    EXPECT_TRUE(copyTarget.deleteOnExit);
    EXPECT_TRUE(copyTarget.seq == NULL);

    // Offset given, length omitted: everything from position 2 onwards is copied.
    copyTarget.Copy(dnaOne, 2);
    EXPECT_EQ(copyTarget.length, dnaOne.length - 2);
    EXPECT_TRUE(copyTarget.deleteOnExit);
    EXPECT_EQ(memcmp(copyTarget.seq, dnaOne.seq + 2, copyTarget.length), 0);


    // Out-of-bounds start position (death test currently disabled).
    EXPECT_GT(200, dnaOne.length);
    //EXPECT_EXIT(copyTarget.Copy(dnaOne, 200), ::testing::ExitedWithCode(1), "");


    // Start and length are each below UINT_MAX, but their sum is not
    // (death test currently disabled).
    DNALength startPos = 3;
    DNALength spanLength = UINT_MAX -1;
    EXPECT_TRUE(startPos < UINT_MAX && spanLength < UINT_MAX);
    EXPECT_TRUE(spanLength > dnaOne.length + 1);
    //EXPECT_EXIT(copyTarget.Copy(dnaOne, startPos, spanLength), ::testing::ExitedWithCode(1), "");


    // Start position beyond the end of the source (disabled).
    //EXPECT_EXIT(copyTarget.Copy(dnaOne, dnaOne.length + 1), ::testing::ExitedWithCode(1), "")
    //    << "Copy a subsequence which is out of bounds. This needs to be taken care of. See bug 21867.";

}
// Trims low-quality bases from every read of the input FASTQ file and appends the
// surviving reads to the output file.
//
// Parameters are read from settings.customParameters:
//   QUALITY_ID - quality threshold (default 20),
//   LEN_ID     - minimum length of a trimmed read to keep (default 0),
//   BOTH_ID    - when non-zero, the 5' end is trimmed as well as the 3' end.
// A read is discarded when its quality string is shorter than its sequence, when no
// base reaches the threshold, or when the trimmed read is shorter than the minimum.
void QualityTrimTask::runStep(){
    int ncount = 0;   // discarded reads
    int ycount = 0;   // accepted reads

    QScopedPointer<IOAdapter> io  (IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append));

    int quality = settings.customParameters.value(QUALITY_ID, 20).toInt();
    int minLen = settings.customParameters.value(LEN_ID, 0).toInt();
    bool bothEnds = settings.customParameters.value(BOTH_ID, false).toInt();

    FASTQIterator iter(settings.inputUrl);
    while(iter.hasNext()){
        if(stateInfo.isCoR()){
            return;
        }
        DNASequence dna = iter.next();
        const int seqLen = dna.length();
        if(seqLen > dna.quality.qualCodes.length()){
            // Not every base has a quality value: the read cannot be trimmed.
            ncount++;
            continue;
        }

        // Walk back from the 3' end to the last base that reaches the threshold.
        int endPosition = seqLen-1;
        for (; endPosition>=0; endPosition--){
            if(dna.quality.getValue(endPosition) >= quality){
                break;
            }
        }

        // Optionally walk forward from the 5' end to the first good base.
        int beginPosition = 0;
        if (bothEnds) {
            for (; beginPosition<=endPosition; beginPosition++) {
                if (dna.quality.getValue(beginPosition) >= quality) {
                    break;
                }
            }
        }

        const bool keep = endPosition>=beginPosition && endPosition-beginPosition+1 >= minLen;
        if (!keep) {
            ncount++;
            continue;
        }

        // Cut the sequence and its quality codes down to [beginPosition, endPosition].
        DNASequence trimmed(dna.getName(), dna.seq.left(endPosition+1).mid(beginPosition), dna.alphabet);
        trimmed.quality = dna.quality;
        trimmed.quality.qualCodes = trimmed.quality.qualCodes.left(endPosition+1).mid(beginPosition);
        FastqFormat::writeEntry(trimmed.getName(), trimmed, io.data(), "Writing error", stateInfo, false);
        ycount++;
    }

    algoLog.info(QString("Discarded by trimmer %1").arg(ncount));
    algoLog.info(QString("Accepted by trimmer %1").arg(ycount));
    algoLog.info(QString("Total by trimmer %1").arg(ncount + ycount));
}
示例#6
0
int main(int argc, char* argv[]) {
	string refFileName, queryFileName;
	int maxHammingDistance;
	if (argc < 4) {
		cout << "usage: hammer ref query maxHam " << endl;
		exit(1);
	}
	refFileName = argv[1];
	queryFileName = argv[2];
	maxHammingDistance = atoi(argv[3]);

	FASTAReader reader;
	reader.Initialize(refFileName);
	FASTASequence ref, refRC;
	reader.GetNext(ref);
	ref.MakeRC(refRC);
	
	FASTAReader queryReader;
	queryReader.Initialize(queryFileName);
	FASTASequence query;
	queryReader.GetNext(query);
	DNALength p;
	for(p=0; p < ref.length-query.length-1; p++ ){
		DNASequence subseq;
		subseq.seq = &ref.seq[p];
		subseq.length = query.length;
		//		cout << "t "; subseq.PrintSeq(cout);
		//		cout << "q "; ((DNASequence*)&query)->PrintSeq(cout);
		if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) {
			cout << ">" << p << endl;
			subseq.PrintSeq(cout);
		}
		int i;
		for (i =0; i < query.length; i++) {
			subseq.seq[i] = toupper(subseq.seq[i]);
		}
	}

	for(p=0; p < ref.length-query.length-1; p++ ){
		DNASequence subseq;
		subseq.seq = &refRC.seq[p];
		subseq.length = query.length;
		if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) {
			cout << ">" << p << "rc" << endl;
			subseq.PrintSeq(cout);
		}
		int i;
		for (i =0; i < query.length; i++) {
			subseq.seq[i] = toupper(subseq.seq[i]);
		}
	}

}
// ReferenceSubstring(rhs) without position/length must alias the whole of rhs:
// same buffer, same length, and no ownership of the buffer.
TEST_F(DNASequenceTest, ReferenceSubstring) {
    DNALength oneLen = 10;
    dnaOne.seq = new Nucleotide[oneLen];
    dnaOne.length = oneLen;

    DNASequence dnaTwo;
    dnaTwo.ReferenceSubstring(dnaOne);

    EXPECT_EQ(dnaOne.seq, dnaTwo.seq);
    EXPECT_EQ(dnaOne.length, dnaTwo.length);
    EXPECT_FALSE(dnaTwo.deleteOnExit);

//    EXPECT_DEATH_IF_SUPPORTED(dnaTwo.ReferenceSubstring(dnaOne, 100), "");

    // The buffer came from new[], so it must be released with delete[].
    // Clear both views first so that nothing keeps a dangling pointer.
    Nucleotide * buffer = dnaOne.seq;
    dnaTwo.seq = NULL;
    dnaTwo.length = 0;
    dnaOne.seq = NULL;
    dnaOne.length = 0;
    delete[] buffer;
}
示例#8
0
// Runs DNASequence::longestDNASequence on `sequence`, prints PASSED/FAILED together
// with the CPU time used, and returns whether the result equals `__expected`.
bool do_test(string sequence, int __expected) {
    // clock() returns clock_t; keep the start value in the matching type.
    const clock_t startClock = clock();
    // Automatic storage: nothing to delete, nothing to leak if the call throws.
    DNASequence instance = DNASequence();
    const int __result = instance.longestDNASequence(sequence);
    const double elapsed = static_cast<double>(clock() - startClock) / CLOCKS_PER_SEC;

    if (__result == __expected) {
        cout << "PASSED!" << " (" << elapsed << " seconds)" << endl;
        return true;
    }

    cout << "FAILED!" << " (" << elapsed << " seconds)" << endl;
    cout << "           Expected: " << to_string(__expected) << endl;
    cout << "           Received: " << to_string(__result) << endl;
    return false;
}
// ShallowCopy must share the source buffer (same pointer), and carry over the
// source's length and deleteOnExit flag.
TEST_F(DNASequenceTest, ShallowCopy) {
    DNALength baseCount = 10;
    Nucleotide * bases = new Nucleotide [baseCount];

    string allAs("AAAAAAAAAA");
    for (DNALength k = 0; k < baseCount; ++k) {
        bases[k] = allAs[k];
    }
    dnaOne.length = baseCount;
    dnaOne.seq = bases;

    DNASequence alias;
    alias.ShallowCopy(dnaOne);

    EXPECT_EQ(alias.length, dnaOne.length);
    EXPECT_EQ(alias.seq   , dnaOne.seq);
    EXPECT_EQ(alias.deleteOnExit, dnaOne.deleteOnExit);
}
// Appends the reads of several FASTQ files, one file after another, to a single
// output file. The input files come from the INPUT_URLS_ID custom parameter, which
// is read as one comma-separated string.
void MergeFastqTask::runStep(){
    QScopedPointer<IOAdapter> io  (IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append));

    QStringList urls = settings.customParameters.value(INPUT_URLS_ID, "").toString().split(",");
    // Running totals of the reads written and the input files processed.
    qint64 numberOfSeqs = 0;
    qint64 numberOfFiles = 0;

    foreach (QString url, urls){
        FASTQIterator iter(url);
        while(iter.hasNext()){
            // Stop without further output once the task is cancelled or has failed.
            if(stateInfo.isCoR()){
                return;
            }
            DNASequence dna = iter.next();
            FastqFormat::writeEntry(dna.getName(), dna, io.data(), "Writing error", stateInfo, false);
            numberOfSeqs++;
        }
        numberOfFiles++;

    }
// Checks the state of a default-constructed DNASequence and that sequences can be
// backed by caller-provided buffers (a stack array and heap arrays).
TEST_F(DNASequenceTest, Constructor) {
    // A default-constructed sequence is empty, 8 bits per base, and owns nothing.
    DNASequence dnaSeq;
    EXPECT_TRUE(dnaSeq.seq == NULL);
    EXPECT_TRUE(dnaSeq.length == 0);
    EXPECT_TRUE(dnaSeq.size() == dnaSeq.length);
    EXPECT_TRUE(dnaSeq.bitsPerNuc == 8);
    EXPECT_FALSE(dnaSeq.deleteOnExit);

    // Back the sequence with a stack array (the trailing '\0' is excluded).
    Nucleotide HKITTY[] = "HELLO,KITTY!";
    dnaSeq.seq = HKITTY;
    dnaSeq.length = sizeof(HKITTY)/sizeof(Nucleotide) - 1;
//    dnaSeq.Print(cout);
    EXPECT_EQ(dnaSeq.size(), 12);

    
    // Back a second sequence with a heap copy of the same bytes.
    DNALength thisLen = 12;
    Nucleotide * thisNuc = new Nucleotide [thisLen];
    memcpy(thisNuc, HKITTY, thisLen);
    DNASequence newDnaSeq; 
    newDnaSeq.seq = thisNuc;
    newDnaSeq.length = thisLen;
//    newDnaSeq.Print(cout);
    EXPECT_EQ(memcmp(newDnaSeq.seq, dnaSeq.seq, thisLen), 0);
    EXPECT_EQ(newDnaSeq.length, thisLen);
    // The sequence does not own the buffer (deleteOnExit defaults to false, as
    // asserted above), so release it here. The previous "if (!thisNuc) delete
    // thisNuc;" never ran and used the non-array form of delete.
    newDnaSeq.seq = NULL;
    newDnaSeq.length = 0;
    delete[] thisNuc;

    // Fill a fresh heap buffer base by base and read it back through the sequence.
    DNASequence nnewDnaSeq;
    thisLen = 12;
    string atgc ("atgcatgcatgc");
    thisNuc = new Nucleotide [thisLen];
    for (DNALength i = 0 ; i < thisLen; i++) {
        thisNuc[i] = atgc[i];
    }
    string ret;
    nnewDnaSeq.seq = thisNuc;
    nnewDnaSeq.length = thisLen;
    for (DNALength i = 0 ; i < thisLen; i++) {
        ret += nnewDnaSeq.seq[i];
    }
    EXPECT_STREQ(ret.c_str(), atgc.c_str());

    // Release the second buffer as well; it was leaked before.
    nnewDnaSeq.seq = NULL;
    nnewDnaSeq.length = 0;
    delete[] thisNuc;
}
示例#12
0
// Writes one sequence to `io`: first the header built from the sequence name, then
// the sequence data in consecutive blocks of at most SAVE_LINE_LEN characters.
// Returns as soon as `os` reports a problem.
static void saveSequence(IOAdapter* io, const DNASequence& sequence, U2OpStatus& os) {
    writeHeaderToFile(io, sequence.getName(), os);
    CHECK_OP(os, );

    const char *data = sequence.seq.constData();
    const int total = sequence.seq.length();
    int offset = 0;
    while (offset < total) {
        // The last block may be shorter than a full line.
        const int blockLen = qMin(SAVE_LINE_LEN, total - offset);
        writeBlockToFile(io, data + offset, blockLen, os);
        CHECK_OP(os, );
        offset += SAVE_LINE_LEN;
    }
}
// After TakeOwnership(rhs) the receiver must report rhs's buffer, length and
// deleteOnExit flag.
TEST_F(DNASequenceTest, TakeOwnership) {
    DNALength bufferLen = 10;
    Nucleotide * buffer = new Nucleotide [bufferLen];

    dnaOne.length = bufferLen;
    dnaOne.seq = buffer;

    DNASequence receiver;

    // A bug may occur if deleteOnExit is true and TakeOwnership() is called
    // twice: in that case both sequences would be left with wild pointers.
    receiver.deleteOnExit = true;
    receiver.TakeOwnership(dnaOne);

    EXPECT_EQ(receiver.length, dnaOne.length);
    EXPECT_EQ(receiver.deleteOnExit, dnaOne.deleteOnExit);
    EXPECT_EQ(receiver.seq, dnaOne.seq);

    // NOTE(review): this condition is only true for a null pointer, so the
    // buffer is never released here. Who owns the buffer after TakeOwnership
    // is not visible from this test - confirm before changing it.
    if (!buffer) delete buffer;
}
// One scheduling step of the worker: first collects every weight matrix from the
// model port, then, once that port has ended, takes one sequence message from the
// data port and creates one search subtask per collected matrix.
Task* PWMatrixSearchWorker::tick() {
    // Drain all pending model messages into `models`.
    while (modelPort->hasMessage()) {
        models << modelPort->get().getData().toMap().value(PWMatrixWorkerFactory::WMATRIX_SLOT.getId()).value<PWMatrix>();
    }
    // Do not start searching until the model port has ended.
    if (!modelPort->isEnded()) {
        return NULL;
    }

    if (dataPort->hasMessage()) {
        Message inputMessage = getMessageAndSetupScriptValues(dataPort);
        // Nothing to search with or in: call transit() on the output and do no work.
        if (inputMessage.isEmpty() || models.isEmpty()) {
            output->transit();
            return NULL;
        }
        QVariantMap map = inputMessage.getData().toMap();
        SharedDbiDataHandler seqId = map.value(BaseSlots::DNA_SEQUENCE_SLOT().getId()).value<SharedDbiDataHandler>();
        QScopedPointer<U2SequenceObject> seqObj(StorageUtils::getSequenceObject(context->getDataStorage(), seqId));
        if (seqObj.isNull()) {
            return NULL;
        }
        U2OpStatusImpl os;
        DNASequence seq = seqObj->getWholeSequence(os);
        // A failure to load the sequence is reported through a FailTask.
        CHECK_OP(os, new FailTask(os.getError()));

        // Only non-null nucleotide sequences are searched.
        if (!seq.isNull() && seq.alphabet->getType() == DNAAlphabet_NUCL) {
            WeightMatrixSearchCfg config(cfg);
            // strand < 0 selects complement-only search; strand <= 0 additionally
            // needs the complement translation for the sequence alphabet.
            config.complOnly = (strand < 0);
            if (strand <= 0) {
                DNATranslation* compTT = AppContext::getDNATranslationRegistry()->
                    lookupComplementTranslation(seq.alphabet);
                if (compTT  != NULL) {
                    config.complTT = compTT  ;
                }
            }
            // One single-matrix search task per collected model.
            QList<Task*> subtasks;
            foreach(PWMatrix model, models) {
                subtasks << new WeightMatrixSingleSearchTask(model, seq.seq, config, 0);
            }
示例#15
0
void FastqQualityTrimTask::runStep(){
    int ncount = 0;
    int ycount = 0;

    QScopedPointer<IOAdapter> io(IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append));

    int quality = settings.customParameters.value(QUALITY_ID, 20).toInt();
    int minLen = settings.customParameters.value(LEN_ID, 0).toInt();
    bool bothEnds = settings.customParameters.value(BOTH_ID, false).toInt();
    DNAQualityType qualityType = detectQualityType();
    CHECK_OP(stateInfo, );

    FASTQIterator iter(settings.inputUrl, stateInfo);
    CHECK_OP(stateInfo, );

    while (iter.hasNext()) {
        CHECK_OP(stateInfo, );

        DNASequence dna = iter.next();
        dna.quality.type = qualityType;
        const U2Region acceptedRegion = DNASequenceUtils::trimByQuality(dna, quality, minLen, bothEnds);

        if (0 < acceptedRegion.length) {
            ycount++;
        } else {
            ncount++;
            continue;
        }

        FastqFormat::writeEntry(dna.getName(), dna, io.data(), "Writing error", stateInfo, false);
    }

    algoLog.info(QString("Discarded by trimmer %1").arg(ncount));
    algoLog.info(QString("Accepted by trimmer %1").arg(ycount));
    algoLog.info(QString("Total by trimmer %1").arg(ncount + ycount));
}
示例#16
0
// Decides whether the annotation named `annName` marks `seq` as containing the gene.
//
// Only the first annotation with that name is examined. If it consists of a single
// region whose length is at least `identity` percent of the sequence length:
//   - with a BLAST_IDENT qualifier, the gene counts as identical when the parsed
//     identity is valid and >= `identity`; identityString then becomes
//     IDENTICAL_YES, "\<identity>" and "\<gaps>" ("\0" when there is no BLAST_GAPS
//     qualifier);
//   - without a BLAST_IDENT qualifier, the length check alone is sufficient.
// When the gene is not identical but a length ratio was computed, that ratio is
// appended to identityString as "\<ratio>".
GeneByGeneCompareResult GeneByGeneComparator::compareGeneAnnotation(const DNASequence& seq, const QList<SharedAnnotationData> &annData,
    const QString& annName, float identity)
{
    GeneByGeneCompareResult result;

    float maxIdentity = -1.0F;
    foreach (const SharedAnnotationData &adata, annData) {
        if (adata->name == annName) {
            U2Location location = adata->location;
            if (location->isSingleRegion()) {
                // 64-bit arithmetic: "length * 100" overflows a 32-bit int for
                // regions longer than about 21 Mbp.
                const qint64 reglen = location->regions.first().length;
                float lenRatio  = static_cast<float>(reglen * 100) / static_cast<float>(seq.length());
                maxIdentity = qMax(maxIdentity, lenRatio);
                if(lenRatio >= identity){ //check length ratio
                    QString ident = adata->findFirstQualifierValue(BLAST_IDENT);
                    if (!ident.isEmpty()){
                        //create BLAST string  YES/identity/gaps
                        float blastIdent = parseBlastQual(ident);
                        if (blastIdent != -1.0f && blastIdent >= identity){
                            result.identical = true;
                            result.identityString = GeneByGeneCompareResult::IDENTICAL_YES;
                            result.identityString.append(QString("\\%1").arg(blastIdent));
                            QString gaps = adata->findFirstQualifierValue(BLAST_GAPS);
                            if (!gaps.isEmpty()){
                                float blastGaps = parseBlastQual(gaps);
                                // -1 is treated as the "could not parse" value, as for
                                // the identity above. The previous comparison against
                                // 1.0f dropped a real value of 1 and printed -1.
                                if (blastGaps != -1.0f){
                                    result.identityString.append(QString("\\%1").arg(blastGaps));
                                }
                            }else{
                                result.identityString.append(QString("\\0"));
                            }
                        }
                    }else{ //not a blast annotation
                        result.identical = true;
                        result.identityString = GeneByGeneCompareResult::IDENTICAL_YES;
                    }
                }
            }
            // Only the first annotation with the requested name is considered.
            break;
        }
    }

    if (result.identical == false && maxIdentity != -1.0f){
        result.identityString.append(QString("\\%1").arg(maxIdentity));
    }

    return result;
}
// Read simulator: samples fixed-length reads from a reference FASTA file and writes
// them as FASTA or as FASTQ with a constant quality value.
//
// Reads are taken either at random positions (a count given by nReads, or derived
// from coverage and readLength) or, with "stratify", back to back along every
// reference sequence.
int main(int argc, char* argv[]) {
    string inFileName, readsFileName;
    // Initialised so that a missing readLength option is detectable; it used to be
    // read uninitialised.
    DNALength readLength = 0;
    float coverage = 0;
    bool noRandInit = false;
    int numReads = -1;
    CommandLineParser clp;
    int qualityValue = 20;
    bool printFastq = false;
    int stratify = 0;
    string titleType = "pacbio";
    string fastqType = "illumina"; // or "sanger"
    clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0);
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterIntOption("readLength", (int*) &readLength, "The length of reads to simulate.  The length is fixed.",
                          CommandLineParser::PositiveInteger, "Length of every read.", 0);
    clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated",
                            CommandLineParser::PositiveFloat, 0);
    clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time.");
    clp.RegisterIntOption("nReads", &numReads, "Total number of reads (from which coverage is calculated)", CommandLineParser::PositiveInteger, 0);
    clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0);
    clp.RegisterFlagOption("fastq", &printFastq, "Fake fastq output with constant quality value (20)");
    clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality", CommandLineParser::PositiveInteger);
    clp.RegisterIntOption("stratify", &stratify, "Sample a read every 'stratify' bases, rather than randomly.", CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("titleType", &titleType, "Set the name of the title: 'pacbio'|'illumina'");
    clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'");
    vector<string> leftovers;
    clp.ParseCommandLine(argc, argv, leftovers);

    if (!noRandInit) {
        InitializeRandomGeneratorWithTime();
    }

    FASTAReader inReader;
    inReader.Init(inFileName);
    vector<FASTASequence> reference;

    inReader.ReadAllSequences(reference);
    ofstream readsFile;
    if (readsFileName == "") {
        cout << "ERROR.  You must specify a reads file." << endl;
        exit(0);
    }
    CrucialOpen(readsFileName, readsFile, std::ios::out);

    // With the "sanger" type, qualities go to a separate <readsFile>.fastq file.
    ofstream sangerFastqFile;
    if (fastqType == "sanger") {
        string sangerFastqFileName = readsFileName + ".fastq";
        CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out);
    }

    DNALength refLength = 0;
    for (size_t r = 0; r < reference.size(); r++) {
        refLength += reference[r].length;
    }
    if (numReads == -1 and coverage == 0 and stratify == 0) {
        cout << "Error, you must specify either coverage, nReads, or stratify." << endl;
        exit(1);
    }

    // Validate the read length before it is used as a divisor below.
    if (stratify) {
        if (!readLength) {
            cout << "ERROR. If you are using stratification, a read length must be specified." << endl;
            exit(1);
        }
    }

    if (numReads == -1) {
        if (readLength == 0) {
            cout << "ERROR. A read length must be specified to compute the number of reads from coverage." << endl;
            exit(1);
        }
        numReads = (refLength / readLength) * coverage;
    }

    DNASequence sampleSeq;
    sampleSeq.length = readLength;
    // In random mode both values are filled in by FindRandomPos for every read.
    DNALength seqIndex = 0, seqPos = 0;
    DNALength origReadLength = readLength;
    int i;
    for (i = 0; stratify or i < numReads; i++) {
        if (stratify == 0) {
            FindRandomPos(reference, seqIndex, seqPos, readLength );
        }
        else {
            //
            // find the next start pos, or bail if done
            //
            if (seqPos >= reference[seqIndex].length) {
                if (seqIndex == reference.size() - 1) {
                    break;
                }
                else {
                    seqIndex = seqIndex + 1;
                    seqPos   = 0;
                    continue;
                }
            }
            // The last read of a reference sequence may be shorter.
            readLength = min(reference[seqIndex].length - seqPos, origReadLength);
        }
        // The sample only references the bases inside the reference; nothing is copied.
        sampleSeq.seq = &reference[seqIndex].seq[seqPos];
        string title;
        stringstream titleStrm;
        if (titleType == "pacbio") {
            titleStrm << i << "|"<< reference[seqIndex].GetName() << "|" << seqPos << "|" << seqPos + readLength;
        }
        else if (titleType == "illumina") {
            titleStrm << "SE_" << i << "_0@" << seqPos << "-"<<seqPos+readLength <<"/1";
        }
        else {
            cout << "ERROR. Bad title type " << titleType << endl;
            exit(0);
        }
        title = titleStrm.str();
        sampleSeq.length = readLength;
        if (!printFastq) {
            readsFile << ">" << title << endl;
            sampleSeq.PrintSeq(readsFile);
        }
        else {
            FASTQSequence fastqSampleSeq;
            fastqSampleSeq.CopyTitle(title);
            fastqSampleSeq.seq = sampleSeq.seq;
            fastqSampleSeq.length = sampleSeq.length;
            // Constant quality value for every base of the read.
            fastqSampleSeq.qual.data = new unsigned char[sampleSeq.length];
            fill(fastqSampleSeq.qual.data, fastqSampleSeq.qual.data + sampleSeq.length, qualityValue);
            if (fastqType == "illumina") {
                fastqSampleSeq.PrintFastq(readsFile, fastqSampleSeq.length+1);
            }
            else {
                fastqSampleSeq.PrintSeq(readsFile);
                fastqSampleSeq.PrintQual(sangerFastqFile);
            }
            delete[] fastqSampleSeq.qual.data;
            delete[] fastqSampleSeq.title;
        }

        if (stratify) {
            seqPos += readLength;
        }

    }
    return 0;
}
示例#18
0
int main(int argc, char* argv[1]) {
	if (argc < 3) {
		cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl;
		cout << "  genome.fasta.sa must exist." << endl;
		cout << "  Finds sequences at least effective_k in length that are unique." << endl;
		cout << "  -max m       Allow up to m matches" << endl;
		cout << "  -minLength l Ensure the length of the match is at least this." << endl;
		cout << "  -prefix p n  Allow up to n matches across a prefix of length p" << endl;
		cout << "  -suffix s n  Allow up to n matches across a suffix of length s" << endl;
		cout << "               Prefix and suffix options override max." << endl;
		cout << "  -out file    Print queries to this output file (query.fasta.queries)" << endl;
		exit(0);
	}

	DNASuffixArray sarray;
	
	string genomeFileName = argv[1];
	string suffixArrayFileName = genomeFileName + ".sa";
	
	FASTAReader reader;
	FASTASequence genome;

	int maxN = 0;

	int prefix = 0;
	int suffix = 0;
	int prefixN = 0;
	int suffixN = 0;
	int argi = 4;
	string outputFileName = "";
	int minLength = 0;
	while (argi < argc) {
		if (strcmp(argv[argi], "-max") == 0) {
			++argi;
			maxN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-prefix") == 0) {
			++argi;
			prefix = atoi(argv[argi]);
			++argi;
			prefixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-suffix") == 0) {
			++argi;
			suffix = atoi(argv[argi]);
			++argi;
			suffixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-out") == 0) {
			++argi;
			outputFileName = argv[argi];
		}
		else if (strcmp(argv[argi], "-minLength") == 0) {
			++argi;
			minLength = atoi(argv[argi]);
		}
		++argi;
	}

	reader.Initialize(genomeFileName);
	reader.ReadAllSequencesIntoOne(genome);
	sarray.Read(suffixArrayFileName);

	FASTAReader queryReader;
	FASTASequence querySequence;
	string queryFileName = argv[2];
	int maxLength = atoi(argv[3]);
	string summaryTableFileName = queryFileName + ".summary";
	if (outputFileName == "") {
		outputFileName = queryFileName + ".queries";
	}
		
	
	ofstream summaryTable(summaryTableFileName.c_str());
	ofstream outputFile(outputFileName.c_str());

	queryReader.Initialize(queryFileName);

	while (queryReader.GetNext(querySequence)) {
		int i;
		cerr << "searching " << querySequence.title << endl;
		if (querySequence.length < maxLength) {
			continue;
		}

		int nMatches = 0;
		querySequence.ToUpper();
		int localMax;
		for (i = 0; i < querySequence.length - maxLength + 1; i++) {
			if ((i + 1) % 100000 == 0) {
				cerr << "processed: " << i + 1 << endl;
			}

			int lcpLength;
			vector<SAIndex> lcpLeftBounds, lcpRightBounds;
			vector<SAIndex> rclcpLeftBounds, rclcpRightBounds;
			localMax = maxN;
			if (i < prefix) {
				localMax = prefixN;
			}
			if (i >= querySequence.length - suffix) {
				localMax = suffixN;
			}
			if (querySequence.length - i <= maxLength) {
				continue;
			}
			if (querySequence.seq[i] == 'N') {
				continue;
			}
			lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																				&querySequence.seq[i], querySequence.length-i,
																				true,
																				maxLength,
																				lcpLeftBounds, lcpRightBounds,
																				false);
			if (lcpLength < minLength) {
				continue;
			}
			if (lcpLength < maxLength or 
					lcpRightBounds.size() == 0 or 
					(lcpRightBounds.size() > 0 and 
					 lcpLeftBounds.size() > 0 and  
					 lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) {

				FASTASequence rc;
				DNASequence subseq;
				subseq.ReferenceSubstring(querySequence, i, maxLength);
				subseq.MakeRC(rc);
				int rclcpLength;
				int numForwardMatches;
				if (lcpLength == 0) {
					numForwardMatches = 0;
				}
				else {
					numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1];
				}
				rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																						rc.seq, maxLength,
																						true,
																						rclcpLength,
																						rclcpLeftBounds, rclcpRightBounds,
																						false);

				string rcstr((const char*)rc.seq, rc.length);

				if (rclcpLength < maxLength or 
						rclcpRightBounds.size() == 0 or
						(numForwardMatches + 
						 rclcpRightBounds[rclcpRightBounds.size() - 1] -
						 rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) 
					{
						char* substr = new char[maxLength+1];
						substr[maxLength] = '\0';
						memcpy(substr, &querySequence.seq[i], maxLength);

						//						string substr = string((const char*) querySequence.seq, i, maxLength);
						
						outputFile << querySequence.title << "\t" << substr << "\t" << i << endl;

						++nMatches;
						delete[] substr;
						//					}
					}
				rc.Free();
			}

		}
		summaryTable << querySequence.title << "\t" << nMatches << endl;
		querySequence.Free();
	}
	outputFile.close();
	genome.Free();
}
//
// Convert one SAM record into AlignmentCandidates and append them to
// 'candidates'.
//
// The CIGAR string of 'sam' is vectorized and then walked in a loop.
// Each iteration steps over clipping and skipped operations, then
// turns the run of operations up to the position reported by
// AdvancePosToAlignmentEnd into one candidate (via CIGAROpsToBlocks).
// A candidate is appended only if it ends up with at least one block.
//
// Parameters:
//   sam                   - the SAM record.  Its cigar, seq, pos, flag,
//                           qName, rName, xs and mapQV are read.
//   referenceSequences    - reference contigs, indexed by the values
//                           stored in refNameToRefListIndex.
//   refNameToRefListIndex - maps a reference name to its index in
//                           referenceSequences.
//   candidates            - output; new candidates are appended.
//   parseSmrtTitle        - if true, sam.qName is parsed as a SMRTTitle
//                           and its 'start' is used as the offset of
//                           the query in the full read.  Otherwise
//                           sam.xs - 1 is used when sam.xs is nonzero.
//   keepRefAsForward      - if false and the SAM flag marks the read as
//                           reverse complemented, the CIGAR operations
//                           and optional QVs are reversed, the query and
//                           the copied reference substring are reverse
//                           complemented, and the candidate is stored
//                           with tStrand = 1, qStrand = 0.
//   copyQVs               - if true, optional QVs are copied from the
//                           SAM record into every candidate.
//
// The process exits with status 1 if the title cannot be parsed (when
// parseSmrtTitle is set) or if sam.rName is not a key of
// refNameToRefListIndex.  Alignments that fall outside the reference
// are reported with a warning on stdout and skipped.
//
void SAMAlignmentsToCandidates(SAMAlignment &sam,
                               std::vector<FASTASequence> &referenceSequences,
                               std::map<std::string,int> & refNameToRefListIndex,
                               std::vector<AlignmentCandidate<> > &candidates, 
                               bool parseSmrtTitle,
                               bool keepRefAsForward,
                               bool copyQVs) {
  //
  // Expand the CIGAR string into parallel vectors of operation
  // lengths and operation characters.
  //
  std::vector<int> lengths;
  std::vector<char> ops;
  sam.cigar.Vectorize(lengths, ops);

  DNASequence querySeq;
  // For now just reference the query sequence.
  querySeq.deleteOnExit = false;
  querySeq.seq = (Nucleotide*) sam.seq.c_str();
  querySeq.length = sam.seq.size();

  DNALength samTEnd = 0;
  DNALength samTStart = sam.pos - 1;

  std::vector<std::string> optionalQVs;
  if (copyQVs) {
    sam.CopyQVs(&optionalQVs);
  }

  if (keepRefAsForward == false and IsReverseComplement(sam.flag)) {
    ReverseAlignmentOperations(lengths, ops);
    DNASequence rcQuerySeq;
    querySeq.CopyAsRC(rcQuerySeq);
    //
    // Zero out the query seq so that the string memory is not
    // deleted.
    //
    querySeq.seq = NULL;
    querySeq.length = 0;
    querySeq = rcQuerySeq;
    rcQuerySeq.Free();
    samTEnd = GetAlignedReferenceLengthByCIGARSum(ops, lengths);

    // We also need to reverse any optional QVs
    if (copyQVs) {
      for (size_t qvIndex = 0; qvIndex < optionalQVs.size(); qvIndex++) {
        std::reverse(optionalQVs[qvIndex].begin(), optionalQVs[qvIndex].end());
      }
    }
  }

  if (ops.size() == 0) {
    //
    // Nothing to convert.  Release the query the same way the normal
    // exit path does, so that the reverse complemented copy made
    // above is not left behind.
    //
    querySeq.Free();
    return;
  }

  //
  // Current position in the CIGAR operations, in the query, and in
  // the reference, all relative to the start of the SAM alignment.
  //
  int cigarPos = 0;
  int qPos = 0;
  int tPos = 0;

  DNALength queryPosOffset = 0;
  if (parseSmrtTitle) {
    //
    // The aligned sequence is really a subread of a full
    // sequence. The positions of the alignments start at 0, the
    // beginning of the query sequence, but in the sam file, they
    // may appear as subreads, and are offset from the start of the
    // subread.  By convention, the subread coordinates are embedded
    // in the title of the query, if it is a smrtTitle.
    // Two types of smrtTitle are supported:
    // movie/zmw/start_end
    // movie/zmw/start_end/start2_end2
    SMRTTitle stitle = SMRTTitle(sam.qName);

    if (not stitle.isSMRTTitle) {
      std::cout << "ERROR. Could not parse title " << sam.qName << std::endl;
      exit(1);
    }
    queryPosOffset = stitle.start;
  }
  else if (sam.xs) {
    queryPosOffset += sam.xs - 1;
  }

  while (static_cast<size_t>(cigarPos) < lengths.size()) {
    //
    // Advance past any clipping.  Soft clipped bases become an
    // offset into the query; the total clipped count returned by
    // the helper is not needed here.
    //
    int numSoftClipped = 0;
    AdvancePastClipping(lengths, ops, cigarPos, numSoftClipped);

    //
    // End loop now.
    //
    if (static_cast<size_t>(cigarPos) >= lengths.size()) {
      break;
    }
    qPos += numSoftClipped;

    //
    // Skipped sequences are just advances in the tPos.
    //
    tPos += AdvancePastSkipped(lengths, ops, cigarPos);

    if (static_cast<size_t>(cigarPos) >= lengths.size()) {
      break;
    }

    AlignmentCandidate<> alignment;
    //
    // The aligned sequence must start at a match therefore the tpos
    // and qpos are 0.
    //
    alignment.qPos = 0;
    alignment.tPos = 0;

    // qAlignStart is the start of the alignment relative to the sequence in the SAM file.
    DNALength qAlignStart = qPos;
    // tAlignStart is the start of the alignment in the genome.
    DNALength tAlignStart = tPos;

    int cigarEnd = cigarPos;
    AdvancePosToAlignmentEnd(ops, cigarEnd);

    CIGAROpsToBlocks(lengths, ops,
                     cigarPos, cigarEnd,
                     qPos, tPos,
                     alignment);

    // qPos and tPos were passed to CIGAROpsToBlocks; the distance each
    // has moved from its start is used as the aligned length.
    alignment.qAlignedSeqLength = qPos - qAlignStart;
    alignment.tAlignedSeqLength = tPos - tAlignStart;

    //
    // Assign candidate sequences.
    //
    // First, the query sequence is straight from the SAM line.
    ((DNASequence*)&alignment.qAlignedSeq)->Copy(querySeq, qAlignStart, alignment.qAlignedSeqLength);
    if (copyQVs) {
      alignment.ReadOptionalQVs(optionalQVs, qAlignStart, alignment.qAlignedSeqLength);
    }

    // Strand and mapping quality come from the SAM record; the
    // target starts out on the forward strand.
    alignment.qStrand = IsReverseComplement(sam.flag);
    alignment.tStrand = 0;
    alignment.mapQV   = sam.mapQV;

    //
    // Assign the offsets into the original sequence where the
    // subsequence starts.
    //
    alignment.qAlignedSeqPos = queryPosOffset + qAlignStart;
    alignment.tAlignedSeqPos = samTStart + tAlignStart;

    if (sam.rName == "*") {
      //
      // No reference, do not add the alignment to the list of
      // candidates.
      //
      continue;
    }

    // Single lookup of the reference name.
    std::map<std::string,int>::iterator refIt = refNameToRefListIndex.find(sam.rName);
    if (refIt == refNameToRefListIndex.end()) {
      std::cout <<" ERROR.  SAM Reference " << sam.rName << " is not found in the list of reference contigs." << std::endl;
      exit(1);
    }
    const int refIndex = refIt->second;
    FASTASequence &reference = referenceSequences[refIndex];

    alignment.tLength = reference.length;
    alignment.qLength = sam.seq.size();
    alignment.qName = sam.qName;
    alignment.tName = sam.rName;

    //
    // Either ref or read is defined as being in the forward
    // orientation.  When keepRefAsForward is false and the read was
    // reverse complemented, the read is treated as forward and the
    // reference substring is stored reverse complemented.
    //
    const bool reverseTarget = (keepRefAsForward == false and alignment.qStrand == 1);
    if (reverseTarget) {
      // The CIGAR operations were reversed above, so the start in the
      // reference is measured back from the end of the alignment.
      alignment.tAlignedSeqPos = samTStart + (samTEnd - tAlignStart - alignment.tAlignedSeqLength);
    }

    // Original note: alignment.tAlignedSeqPos is 1 based and unsigned.
    // NOTE(review): the "+ 2" slack accepts StartPos + AlnLength up to
    // RefLength + 2 -- confirm that Copy tolerates this.
    if (alignment.tAlignedSeqLength > reference.length ||
        alignment.tAlignedSeqPos    > reference.length ||
        alignment.tAlignedSeqLength + alignment.tAlignedSeqPos > reference.length + 2) {
      std::cout << "WARNING. The mapping of read " << alignment.qName
                << " to reference "      << alignment.tName
                << " is out of bounds."  << std::endl
                << "         StartPos (" << alignment.tAlignedSeqPos
                << ") + AlnLength (" << alignment.tAlignedSeqLength
                << ") > RefLength (" << reference.length
                << ") + 2 "          << std::endl;
      continue;
    }

    ((DNASequence*)&alignment.tAlignedSeq)->Copy(reference,
                                                 alignment.tAlignedSeqPos,
                                                 alignment.tAlignedSeqLength);

    if (reverseTarget) {
      //
      // Now that the reference sequence has been copied, make the
      // reverse complement for proper printing, and record that the
      // target (not the query) is the reversed one.
      //
      alignment.tAlignedSeq.ReverseComplementSelf();
      alignment.tStrand = 1;
      alignment.qStrand = 0;
    }

    if (alignment.blocks.size() > 0) {
      candidates.push_back(alignment);
    }
  }

  // When the first candidate is on the reverse target strand, the
  // whole candidate list is put in reverse order.
  if (candidates.size() > 0 and keepRefAsForward == false and candidates[0].tStrand == 1) {
    std::reverse(candidates.begin(), candidates.end());
  }
  querySeq.Free();
}