// diagnose // SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS..................................................... // ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...................... // ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...................... // .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ...................... // LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL.................................................... // !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ // | | | | | | // 33 59 64 73 104 126 <- maxValue is value from here // S 0........................26...31.......40 // X -5....0........9.............................40 // I 0........9.............................40 // J 3.....9.............................40 // L 0.2......................26...31........41 // // S - Sanger Phred+33, raw reads typically (0, 40) // X - Solexa Solexa+64, raw reads typically (-5, 40) // I - Illumina 1.3+ Phred+64, raw reads typically (0, 40) // J - Illumina 1.5+ Phred+64, raw reads typically (3, 40) with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) // L - Illumina 1.8+ Phred+33, raw reads typically (0, 41) DNAQualityType FastqQualityTrimTask::detectQualityType(){ int maxValue = 33; int minValue = 126; FASTQIterator iter_qual(settings.inputUrl, stateInfo); CHECK(!stateInfo.hasError(), DNAQualityType_Sanger); int counter = 0; while (iter_qual.hasNext()) { CHECK(!stateInfo.isCoR(), DNAQualityType_Sanger); if (counter > 1000) { // check only first 1000 reads in file break; } DNASequence dna = iter_qual.next(); int seqLen = dna.length(); if (seqLen > dna.quality.qualCodes.length()) { continue; } else { for (int pos = 0; pos <= seqLen - 1; pos++) { maxValue = qMax(static_cast<int>(dna.quality.qualCodes.at(pos)), maxValue); minValue = qMin(static_cast<int>(dna.quality.qualCodes.at(pos)), minValue); } } counter++; } return 
DNAQuality::detectTypeByMinMaxQualityValues(minValue, maxValue); }
void CASAVAFilterTask::runStep(){ int ncount = 0; int ycount = 0; QScopedPointer<IOAdapter> io (IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append)); //1:N:0:TAAGGG QRegExp pattern (":Y:[^:]:"); FASTQIterator iter(settings.inputUrl); while(iter.hasNext()){ if(stateInfo.isCoR()){ return; } DNASequence seq = iter.next(); QString comment = DNAInfo::getFastqComment(seq.info); if(pattern.indexIn(comment) != -1){ ycount++; }else{ FastqFormat::writeEntry(seq.getName() + " " + comment, seq, io.data(), "Writing error", stateInfo, false); ncount++; } } algoLog.info(QString("Discarded by CASAVA filter %1").arg(ycount)); algoLog.info(QString("Accepted by CASAVA filter %1").arg(ncount)); algoLog.info(QString("Total by CASAVA FILTER: %1").arg(ncount + ycount)); }
//Test DNASequence Allocate(DNALength) TEST_F(DNASequenceTest, Allocate) { dnaOne.Allocate(0); EXPECT_EQ(dnaOne.length, 0); DNASequence dnaTwo; dnaTwo.Allocate(100); EXPECT_EQ(dnaTwo.length, 100); }
//Test DNASequence.Copy(const DNASequence rhs, // DNALength rhsPos, // DNALength rhsLength) TEST_F(DNASequenceTest, Copy) { DNALength oneLen = 10; Nucleotide * one = new Nucleotide [oneLen]; string As("AGAAAAACAA"); for (int i = 0; i < oneLen; i++) { one[i] = As[i]; } dnaOne.seq = one; dnaOne.length = oneLen; DNASequence dnaTwo; dnaTwo.Copy(dnaOne); EXPECT_EQ(dnaTwo.length, dnaOne.length); EXPECT_NE(dnaTwo.seq , dnaOne.seq); EXPECT_TRUE(dnaTwo.deleteOnExit); EXPECT_EQ(memcmp(dnaTwo.seq, dnaOne.seq, dnaOne.length), 0); //if rhs.length is 0, return this * DNASequence dnaThree; dnaTwo.Copy(dnaThree); //dnaTwo remains unchanged EXPECT_EQ(dnaTwo.length, 0); EXPECT_NE(dnaTwo.seq, dnaOne.seq); EXPECT_TRUE(dnaTwo.deleteOnExit); EXPECT_TRUE(dnaTwo.seq == NULL); //if rhsPos is not 0 and rhsLength is 0 dnaTwo.Copy(dnaOne, 2); EXPECT_EQ(dnaTwo.length, dnaOne.length - 2); EXPECT_TRUE(dnaTwo.deleteOnExit); EXPECT_EQ(memcmp(dnaTwo.seq, dnaOne.seq + 2, dnaTwo.length), 0); //if the subsequence to copy is out of bounds EXPECT_GT(200, dnaOne.length); //EXPECT_EXIT(dnaTwo.Copy(dnaOne, 200), ::testing::ExitedWithCode(1), ""); //if both rhsPos and rhsLength are less than MAXINT, //but rhsPos+ rhsLength > MAXINT DNALength rhsPos = 3; DNALength rhsLength = UINT_MAX -1; EXPECT_TRUE(rhsPos < UINT_MAX && rhsLength < UINT_MAX); EXPECT_TRUE(rhsLength > dnaOne.length + 1); //EXPECT_EXIT(dnaTwo.Copy(dnaOne, rhsPos, rhsLength), ::testing::ExitedWithCode(1), ""); //if rhsPos > rhs.length //EXPECT_EXIT(dnaTwo.Copy(dnaOne, dnaOne.length + 1), ::testing::ExitedWithCode(1), "") // << "Copy a subsequence which is out of bounds. This needs to be taken care of. See bug 21867."; }
void QualityTrimTask::runStep(){ int ncount = 0; int ycount = 0; QScopedPointer<IOAdapter> io (IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append)); int quality = settings.customParameters.value(QUALITY_ID, 20).toInt(); int minLen = settings.customParameters.value(LEN_ID, 0).toInt(); bool bothEnds = settings.customParameters.value(BOTH_ID, false).toInt(); FASTQIterator iter(settings.inputUrl); while(iter.hasNext()){ if(stateInfo.isCoR()){ return; } DNASequence dna = iter.next(); QString comment = DNAInfo::getFastqComment(dna.info); int seqLen = dna.length(); if(seqLen > dna.quality.qualCodes.length()){ ncount++; continue; }else{ int endPosition = seqLen-1; for (; endPosition>=0; endPosition--){ if(dna.quality.getValue(endPosition) >= quality){ break; } } int beginPosition = 0; if (bothEnds) { for (; beginPosition<=endPosition; beginPosition++) { if (dna.quality.getValue(beginPosition) >= quality) { break; } } } if(endPosition>=beginPosition && endPosition-beginPosition+1 >= minLen){ DNASequence trimmed(dna.getName(), dna.seq.left(endPosition+1).mid(beginPosition), dna.alphabet); trimmed.quality = dna.quality; trimmed.quality.qualCodes = trimmed.quality.qualCodes.left(endPosition+1).mid(beginPosition); FastqFormat::writeEntry(trimmed.getName(), trimmed, io.data(), "Writing error", stateInfo, false); ycount++; }else{ ncount++; continue; } } } algoLog.info(QString("Discarded by trimmer %1").arg(ncount)); algoLog.info(QString("Accepted by trimmer %1").arg(ycount)); algoLog.info(QString("Total by trimmer %1").arg(ncount + ycount)); }
int main(int argc, char* argv[]) { string refFileName, queryFileName; int maxHammingDistance; if (argc < 4) { cout << "usage: hammer ref query maxHam " << endl; exit(1); } refFileName = argv[1]; queryFileName = argv[2]; maxHammingDistance = atoi(argv[3]); FASTAReader reader; reader.Initialize(refFileName); FASTASequence ref, refRC; reader.GetNext(ref); ref.MakeRC(refRC); FASTAReader queryReader; queryReader.Initialize(queryFileName); FASTASequence query; queryReader.GetNext(query); DNALength p; for(p=0; p < ref.length-query.length-1; p++ ){ DNASequence subseq; subseq.seq = &ref.seq[p]; subseq.length = query.length; // cout << "t "; subseq.PrintSeq(cout); // cout << "q "; ((DNASequence*)&query)->PrintSeq(cout); if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) { cout << ">" << p << endl; subseq.PrintSeq(cout); } int i; for (i =0; i < query.length; i++) { subseq.seq[i] = toupper(subseq.seq[i]); } } for(p=0; p < ref.length-query.length-1; p++ ){ DNASequence subseq; subseq.seq = &refRC.seq[p]; subseq.length = query.length; if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) { cout << ">" << p << "rc" << endl; subseq.PrintSeq(cout); } int i; for (i =0; i < query.length; i++) { subseq.seq[i] = toupper(subseq.seq[i]); } } }
//Test DNASequence ReferenceSubstring(rhs, pos, substrLength) TEST_F(DNASequenceTest, ReferenceSubstring) { DNALength oneLen = 10; dnaOne.seq = new Nucleotide[oneLen]; dnaOne.length = oneLen; DNASequence dnaTwo; dnaTwo.ReferenceSubstring(dnaOne); EXPECT_EQ(dnaOne.seq, dnaTwo.seq); EXPECT_EQ(dnaOne.length, dnaTwo.length); EXPECT_FALSE(dnaTwo.deleteOnExit); // EXPECT_DEATH_IF_SUPPORTED(dnaTwo.ReferenceSubstring(dnaOne, 100), ""); delete dnaOne.seq; }
// Runs DNASequence::longestDNASequence on `sequence`, prints PASSED/FAILED with
// the elapsed CPU time, and on failure also prints the expected and received
// values. Returns true when the result equals `__expected`.
bool do_test(string sequence, int __expected) {
    // clock() returns clock_t; keeping it in a time_t mixed two unrelated types.
    const clock_t startClock = clock();

    DNASequence *instance = new DNASequence();
    int __result = instance->longestDNASequence(sequence);
    const double elapsed = (double)(clock() - startClock) / CLOCKS_PER_SEC;
    delete instance;

    if (__result == __expected) {
        cout << "PASSED!" << " (" << elapsed << " seconds)" << endl;
        return true;
    }

    cout << "FAILED!" << " (" << elapsed << " seconds)" << endl;
    cout << " Expected: " << to_string(__expected) << endl;
    cout << " Received: " << to_string(__result) << endl;
    return false;
}
//Test DNASequence ShallowCopy TEST_F(DNASequenceTest, ShallowCopy) { DNALength oneLen = 10; Nucleotide * one = new Nucleotide [oneLen]; string As("AAAAAAAAAA"); for (int i = 0; i < oneLen; i++) { one[i] = As[i]; } dnaOne.seq = one; dnaOne.length = oneLen; DNASequence dnaTwo; dnaTwo.ShallowCopy(dnaOne); EXPECT_EQ(dnaTwo.length, dnaOne.length); EXPECT_EQ(dnaTwo.seq , dnaOne.seq); EXPECT_EQ(dnaTwo.deleteOnExit, dnaOne.deleteOnExit); }
// Appends the reads of several FASTQ files to one output file.
// The input list is the comma-separated string stored under INPUT_URLS_ID in
// settings.customParameters; the output is settings.outDir + settings.outName,
// opened in append mode. Every read of every input is written with
// FastqFormat::writeEntry, and the numbers of sequences and files processed
// are counted. The step returns early as soon as stateInfo.isCoR() is true.
// NOTE(review): the adapter returned by IOAdapterUtils::open is used without a
// null check - confirm that open() cannot return NULL here.
void MergeFastqTask::runStep(){ QScopedPointer<IOAdapter> io (IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append)); QStringList urls = settings.customParameters.value(INPUT_URLS_ID, "").toString().split(","); qint64 numberOfSeqs = 0; qint64 numberOfFiles = 0; foreach (QString url, urls){ FASTQIterator iter(url); while(iter.hasNext()){ if(stateInfo.isCoR()){ return; } DNASequence dna = iter.next(); FastqFormat::writeEntry(dna.getName(), dna, io.data(), "Writing error", stateInfo, false); numberOfSeqs++; } numberOfFiles++; }
//Test DNASequence constructor TEST_F(DNASequenceTest, Constructor) { DNASequence dnaSeq; EXPECT_TRUE(dnaSeq.seq == NULL); EXPECT_TRUE(dnaSeq.length == 0); EXPECT_TRUE(dnaSeq.size() == dnaSeq.length); EXPECT_TRUE(dnaSeq.bitsPerNuc == 8); EXPECT_FALSE(dnaSeq.deleteOnExit); Nucleotide HKITTY[] = "HELLO,KITTY!"; dnaSeq.seq = HKITTY; dnaSeq.length = sizeof(HKITTY)/sizeof(Nucleotide) - 1; // dnaSeq.Print(cout); EXPECT_EQ(dnaSeq.size(), 12); DNALength thisLen = 12; Nucleotide * thisNuc = new Nucleotide [thisLen]; memcpy(thisNuc, HKITTY, thisLen); DNASequence newDnaSeq; newDnaSeq.seq = thisNuc; newDnaSeq.length = thisLen; // newDnaSeq.Print(cout); EXPECT_EQ(memcmp(newDnaSeq.seq, dnaSeq.seq, thisLen), 0); EXPECT_EQ(newDnaSeq.length, thisLen); if (!thisNuc) delete thisNuc; DNASequence nnewDnaSeq; thisLen = 12; string atgc ("atgcatgcatgc"); thisNuc = new Nucleotide [thisLen]; for (int i = 0 ; i < thisLen; i++) { thisNuc[i] = atgc[i]; } string ret; nnewDnaSeq.seq = thisNuc; nnewDnaSeq.length = thisLen; for (int i = 0 ; i < thisLen; i++) { ret += nnewDnaSeq.seq[i]; } EXPECT_STREQ(ret.c_str(), atgc.c_str()); }
// Writes one sequence record: first the header built from the sequence name,
// then the sequence bytes in consecutive chunks of at most SAVE_LINE_LEN.
// Stops (returning nothing) as soon as `os` reports an error.
static void saveSequence(IOAdapter* io, const DNASequence& sequence, U2OpStatus& os) {
    writeHeaderToFile(io, sequence.getName(), os);
    CHECK_OP(os, );

    const char *data = sequence.seq.constData();
    const int total = sequence.seq.length();

    int offset = 0;
    while (offset < total) {
        // The last chunk may be shorter than a full line.
        const int remaining = total - offset;
        const int chunkSize = (remaining < SAVE_LINE_LEN) ? remaining : SAVE_LINE_LEN;

        writeBlockToFile(io, data + offset, chunkSize, os);
        CHECK_OP(os, );

        offset += SAVE_LINE_LEN;
    }
}
//Test DNASequence TakeOwnership TEST_F(DNASequenceTest, TakeOwnership) { DNALength oneLen = 10; Nucleotide * one = new Nucleotide [oneLen]; dnaOne.seq = one; dnaOne.length = oneLen; DNASequence dnaTwo; //a bug may occur if deleteOneExit is true and //TakeOwnership() is called twice. In that case, both //dnaOne and dnaTwo will become wild pointers dnaTwo.deleteOnExit = true; dnaTwo.TakeOwnership(dnaOne); EXPECT_EQ(dnaTwo.length, dnaOne.length); EXPECT_EQ(dnaTwo.deleteOnExit, dnaOne.deleteOnExit); EXPECT_EQ(dnaTwo.seq, dnaOne.seq); if(!one) delete one; }
// One step of the weight-matrix search worker.
//  - Drains every pending message on modelPort, appending the PWMatrix stored
//    under WMATRIX_SLOT to `models`; returns NULL until modelPort has ended.
//  - For a pending message on dataPort: if the message is empty or no models
//    were collected, calls output->transit() and returns NULL.
//  - Otherwise resolves the sequence object referenced by DNA_SEQUENCE_SLOT
//    (returns NULL if it cannot be obtained) and loads the whole sequence;
//    a FailTask carrying the error text is returned if loading fails.
//  - For a non-null nucleotide sequence, copies `cfg` into a search config,
//    sets complOnly when strand < 0, looks up the complement translation when
//    strand <= 0, and creates one WeightMatrixSingleSearchTask per model.
Task* PWMatrixSearchWorker::tick() { while (modelPort->hasMessage()) { models << modelPort->get().getData().toMap().value(PWMatrixWorkerFactory::WMATRIX_SLOT.getId()).value<PWMatrix>(); } if (!modelPort->isEnded()) { return NULL; } if (dataPort->hasMessage()) { Message inputMessage = getMessageAndSetupScriptValues(dataPort); if (inputMessage.isEmpty() || models.isEmpty()) { output->transit(); return NULL; } QVariantMap map = inputMessage.getData().toMap(); SharedDbiDataHandler seqId = map.value(BaseSlots::DNA_SEQUENCE_SLOT().getId()).value<SharedDbiDataHandler>(); QScopedPointer<U2SequenceObject> seqObj(StorageUtils::getSequenceObject(context->getDataStorage(), seqId)); if (seqObj.isNull()) { return NULL; } U2OpStatusImpl os; DNASequence seq = seqObj->getWholeSequence(os); CHECK_OP(os, new FailTask(os.getError())); if (!seq.isNull() && seq.alphabet->getType() == DNAAlphabet_NUCL) { WeightMatrixSearchCfg config(cfg); config.complOnly = (strand < 0); if (strand <= 0) { DNATranslation* compTT = AppContext::getDNATranslationRegistry()-> lookupComplementTranslation(seq.alphabet); if (compTT != NULL) { config.complTT = compTT ; } } QList<Task*> subtasks; foreach(PWMatrix model, models) { subtasks << new WeightMatrixSingleSearchTask(model, seq.seq, config, 0); }
void FastqQualityTrimTask::runStep(){ int ncount = 0; int ycount = 0; QScopedPointer<IOAdapter> io(IOAdapterUtils::open(settings.outDir + settings.outName, stateInfo, IOAdapterMode_Append)); int quality = settings.customParameters.value(QUALITY_ID, 20).toInt(); int minLen = settings.customParameters.value(LEN_ID, 0).toInt(); bool bothEnds = settings.customParameters.value(BOTH_ID, false).toInt(); DNAQualityType qualityType = detectQualityType(); CHECK_OP(stateInfo, ); FASTQIterator iter(settings.inputUrl, stateInfo); CHECK_OP(stateInfo, ); while (iter.hasNext()) { CHECK_OP(stateInfo, ); DNASequence dna = iter.next(); dna.quality.type = qualityType; const U2Region acceptedRegion = DNASequenceUtils::trimByQuality(dna, quality, minLen, bothEnds); if (0 < acceptedRegion.length) { ycount++; } else { ncount++; continue; } FastqFormat::writeEntry(dna.getName(), dna, io.data(), "Writing error", stateInfo, false); } algoLog.info(QString("Discarded by trimmer %1").arg(ncount)); algoLog.info(QString("Accepted by trimmer %1").arg(ycount)); algoLog.info(QString("Total by trimmer %1").arg(ncount + ycount)); }
// Decides whether the gene named `annName` is present in `seq`, based on the
// first annotation in `annData` that carries that name.
//
// Only single-region annotations are evaluated. The region must cover at least
// `identity` percent of the sequence length. If the annotation has a
// BLAST_IDENT qualifier, the parsed identity must also reach `identity`; the
// result string then becomes IDENTICAL_YES followed by "\<identity>" and
// "\<gaps>" ("\0" when there is no BLAST_GAPS qualifier). Without a BLAST_IDENT
// qualifier the length ratio alone is sufficient.
//
// When the gene is not judged identical but a length ratio was computed, that
// ratio is appended to the result's identity string as "\<ratio>".
GeneByGeneCompareResult GeneByGeneComparator::compareGeneAnnotation(const DNASequence& seq, const QList<SharedAnnotationData> &annData, const QString& annName, float identity) {
    GeneByGeneCompareResult result;

    float maxIdentity = -1.0F;
    foreach (const SharedAnnotationData &adata, annData) {
        if (!(adata->name == annName)) {
            continue;
        }

        U2Location location = adata->location;
        if (location->isSingleRegion()) {
            // 64-bit arithmetic: "length * 100" overflows a 32-bit int for
            // regions longer than ~21.4 million bases.
            const qint64 reglen = location->regions.first().length;
            const float lenRatio = reglen * 100 / static_cast<float>(seq.length());
            maxIdentity = qMax(maxIdentity, lenRatio);

            if (lenRatio >= identity) {  // check length ratio
                const QString ident = adata->findFirstQualifierValue(BLAST_IDENT);
                if (ident.isEmpty()) {
                    // not a blast annotation
                    result.identical = true;
                    result.identityString = GeneByGeneCompareResult::IDENTICAL_YES;
                } else {
                    // create BLAST string YES/identity/gaps
                    const float blastIdent = parseBlastQual(ident);
                    if (blastIdent != -1.0f && blastIdent >= identity) {
                        result.identical = true;
                        result.identityString = GeneByGeneCompareResult::IDENTICAL_YES;
                        result.identityString.append(QString("\\%1").arg(blastIdent));

                        const QString gaps = adata->findFirstQualifierValue(BLAST_GAPS);
                        if (gaps.isEmpty()) {
                            result.identityString.append(QString("\\0"));
                        } else {
                            const float blastGaps = parseBlastQual(gaps);
                            // Same "could not parse" sentinel as for the identity
                            // value above; the previous comparison against 1.0f
                            // suppressed a genuine 1% gaps value instead.
                            if (blastGaps != -1.0f) {
                                result.identityString.append(QString("\\%1").arg(blastGaps));
                            }
                        }
                    }
                }
            }
        }

        // Only the first annotation with a matching name is considered.
        break;
    }

    if (result.identical == false && maxIdentity != -1.0f) {
        result.identityString.append(QString("\\%1").arg(maxIdentity));
    }
    return result;
}
int main(int argc, char* argv[]) { string inFileName, readsFileName; DNALength readLength; float coverage = 0; bool noRandInit = false; int numReads = -1; CommandLineParser clp; int qualityValue = 20; bool printFastq = false; int stratify = 0; string titleType = "pacbio"; string fastqType = "illumina"; // or "sanger" clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("readLength", (int*) &readLength, "The length of reads to simulate. The length is fixed.", CommandLineParser::PositiveInteger, "Length of every read.", 0); clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated", CommandLineParser::PositiveFloat, 0); clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time."); clp.RegisterIntOption("nReads", &numReads, "Total number of reads (from which coverage is calculated)", CommandLineParser::PositiveInteger, 0); clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0); clp.RegisterFlagOption("fastq", &printFastq, "Fake fastq output with constant quality value (20)"); clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality", CommandLineParser::PositiveInteger); clp.RegisterIntOption("stratify", &stratify, "Sample a read every 'stratify' bases, rather than randomly.", CommandLineParser::PositiveInteger); clp.RegisterStringOption("titleType", &titleType, "Set the name of the title: 'pacbio'|'illumina'"); clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'"); vector<string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); if (!noRandInit) { InitializeRandomGeneratorWithTime(); } FASTAReader inReader; inReader.Init(inFileName); vector<FASTASequence> reference; inReader.ReadAllSequences(reference); ofstream readsFile; if (readsFileName == "") { cout << "ERROR. You must specify a reads file." 
<< endl; exit(0); } CrucialOpen(readsFileName, readsFile, std::ios::out); ofstream sangerFastqFile; if (fastqType == "sanger") { string sangerFastqFileName = readsFileName + ".fastq"; CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out); } DNALength refLength = 0; int i; for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } if (numReads == -1 and coverage == 0 and stratify == 0) { cout << "Error, you must specify either coverage, nReads, or stratify." << endl; exit(1); } else if (numReads == -1) { numReads = (refLength / readLength) * coverage; } if (stratify) { if (!readLength) { cout << "ERROR. If you are using stratification, a read length must be specified." << endl; exit(1); } } DNASequence sampleSeq; sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; DNALength seqIndex, seqPos; if (stratify) { seqIndex = 0; seqPos = 0; } DNALength origReadLength = readLength; for (i = 0; stratify or i < numReads; i++) { if (stratify == 0) { FindRandomPos(reference, seqIndex, seqPos, readLength ); } else { // // find the next start pos, or bail if done // if (seqPos >= reference[seqIndex].length) { if (seqIndex == reference.size() - 1) { break; } else { seqIndex = seqIndex + 1; seqPos = 0; continue; } } readLength = min(reference[seqIndex].length - seqPos, origReadLength); } sampleSeq.seq = &reference[seqIndex].seq[seqPos]; int j; int gappedRead = 0; string title; stringstream titleStrm; if (titleType == "pacbio") { titleStrm << i << "|"<< reference[seqIndex].GetName() << "|" << seqPos << "|" << seqPos + readLength; } else if (titleType == "illumina") { titleStrm << "SE_" << i << "_0@" << seqPos << "-"<<seqPos+readLength <<"/1"; } else { cout << "ERROR. 
Bad title type " << titleType << endl; exit(0); } title = titleStrm.str(); sampleSeq.length = readLength; if (!printFastq) { readsFile << ">" << title << endl; sampleSeq.PrintSeq(readsFile); } else { FASTQSequence fastqSampleSeq; fastqSampleSeq.CopyTitle(title); fastqSampleSeq.seq = sampleSeq.seq; fastqSampleSeq.length = sampleSeq.length; fastqSampleSeq.qual.data = new unsigned char[sampleSeq.length]; fill(fastqSampleSeq.qual.data, fastqSampleSeq.qual.data + sampleSeq.length, qualityValue); if (fastqType == "illumina") { fastqSampleSeq.PrintFastq(readsFile, fastqSampleSeq.length+1); } else { fastqSampleSeq.PrintSeq(readsFile); fastqSampleSeq.PrintQual(sangerFastqFile); } delete[] fastqSampleSeq.qual.data; delete[] fastqSampleSeq.title; } if (stratify) { seqPos += readLength; } } return 0; }
int main(int argc, char* argv[1]) { if (argc < 3) { cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl; cout << " genome.fasta.sa must exist." << endl; cout << " Finds sequences at least effective_k in length that are unique." << endl; cout << " -max m Allow up to m matches" << endl; cout << " -minLength l Ensure the length of the match is at least this." << endl; cout << " -prefix p n Allow up to n matches across a prefix of length p" << endl; cout << " -suffix s n Allow up to n matches across a suffix of length s" << endl; cout << " Prefix and suffix options override max." << endl; cout << " -out file Print queries to this output file (query.fasta.queries)" << endl; exit(0); } DNASuffixArray sarray; string genomeFileName = argv[1]; string suffixArrayFileName = genomeFileName + ".sa"; FASTAReader reader; FASTASequence genome; int maxN = 0; int prefix = 0; int suffix = 0; int prefixN = 0; int suffixN = 0; int argi = 4; string outputFileName = ""; int minLength = 0; while (argi < argc) { if (strcmp(argv[argi], "-max") == 0) { ++argi; maxN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-prefix") == 0) { ++argi; prefix = atoi(argv[argi]); ++argi; prefixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-suffix") == 0) { ++argi; suffix = atoi(argv[argi]); ++argi; suffixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-out") == 0) { ++argi; outputFileName = argv[argi]; } else if (strcmp(argv[argi], "-minLength") == 0) { ++argi; minLength = atoi(argv[argi]); } ++argi; } reader.Initialize(genomeFileName); reader.ReadAllSequencesIntoOne(genome); sarray.Read(suffixArrayFileName); FASTAReader queryReader; FASTASequence querySequence; string queryFileName = argv[2]; int maxLength = atoi(argv[3]); string summaryTableFileName = queryFileName + ".summary"; if (outputFileName == "") { outputFileName = queryFileName + ".queries"; } ofstream summaryTable(summaryTableFileName.c_str()); ofstream outputFile(outputFileName.c_str()); 
queryReader.Initialize(queryFileName); while (queryReader.GetNext(querySequence)) { int i; cerr << "searching " << querySequence.title << endl; if (querySequence.length < maxLength) { continue; } int nMatches = 0; querySequence.ToUpper(); int localMax; for (i = 0; i < querySequence.length - maxLength + 1; i++) { if ((i + 1) % 100000 == 0) { cerr << "processed: " << i + 1 << endl; } int lcpLength; vector<SAIndex> lcpLeftBounds, lcpRightBounds; vector<SAIndex> rclcpLeftBounds, rclcpRightBounds; localMax = maxN; if (i < prefix) { localMax = prefixN; } if (i >= querySequence.length - suffix) { localMax = suffixN; } if (querySequence.length - i <= maxLength) { continue; } if (querySequence.seq[i] == 'N') { continue; } lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. &querySequence.seq[i], querySequence.length-i, true, maxLength, lcpLeftBounds, lcpRightBounds, false); if (lcpLength < minLength) { continue; } if (lcpLength < maxLength or lcpRightBounds.size() == 0 or (lcpRightBounds.size() > 0 and lcpLeftBounds.size() > 0 and lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) { FASTASequence rc; DNASequence subseq; subseq.ReferenceSubstring(querySequence, i, maxLength); subseq.MakeRC(rc); int rclcpLength; int numForwardMatches; if (lcpLength == 0) { numForwardMatches = 0; } else { numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1]; } rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. 
rc.seq, maxLength, true, rclcpLength, rclcpLeftBounds, rclcpRightBounds, false); string rcstr((const char*)rc.seq, rc.length); if (rclcpLength < maxLength or rclcpRightBounds.size() == 0 or (numForwardMatches + rclcpRightBounds[rclcpRightBounds.size() - 1] - rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) { char* substr = new char[maxLength+1]; substr[maxLength] = '\0'; memcpy(substr, &querySequence.seq[i], maxLength); // string substr = string((const char*) querySequence.seq, i, maxLength); outputFile << querySequence.title << "\t" << substr << "\t" << i << endl; ++nMatches; delete[] substr; // } } rc.Free(); } } summaryTable << querySequence.title << "\t" << nMatches << endl; querySequence.Free(); } outputFile.close(); genome.Free(); }
void SAMAlignmentsToCandidates(SAMAlignment &sam, std::vector<FASTASequence> &referenceSequences, std::map<std::string,int> & refNameToRefListIndex, std::vector<AlignmentCandidate<> > &candidates, bool parseSmrtTitle, bool keepRefAsForward, bool copyQVs) { // // First determine how many alignments there are from CIGAR string. // std::vector<int> lengths; std::vector<char> ops; sam.cigar.Vectorize(lengths, ops); DNASequence querySeq; // For now just reference the query sequence. querySeq.deleteOnExit = false; querySeq.seq = (Nucleotide*) sam.seq.c_str(); querySeq.length = sam.seq.size(); DNALength samTEnd = 0; DNALength samTStart = sam.pos - 1; std::vector<std::string> optionalQVs; if (copyQVs) { sam.CopyQVs(&optionalQVs); } if (keepRefAsForward == false and IsReverseComplement(sam.flag)) { ReverseAlignmentOperations(lengths, ops); DNASequence rcQuerySeq; querySeq.CopyAsRC(rcQuerySeq); // // Zero out the query seq so that the string memory is not // deleted. // querySeq.seq = NULL; querySeq.length = 0; querySeq = rcQuerySeq; rcQuerySeq.Free(); samTEnd = GetAlignedReferenceLengthByCIGARSum(ops, lengths); // We also need to reverse any optional QVs if (copyQVs) { for(int i=0; i<optionalQVs.size(); i++) { std::reverse(optionalQVs[i].begin(), optionalQVs[i].end()); } } } int i; int offset = 0; if (ops.size() == 0) { return; } bool alignmentStarted = false; bool onFirstMatch = true; int curAlignment; // // Advance past any clipping. This advances in both query and // reference position. // int cigarPos = 0; int qPos = 0; int tPos = 0; DNALength queryPosOffset = 0; if (parseSmrtTitle) { // // The aligned sequence is really a subread of a full // sequence. The position of the aligments start at 0, the // beginning of the query sequence, but in the sam file, they // may appear as subreads, and are offset from the start of the // subread. By convention, the subread coordinates are embedded // in the title of the query, if it is a smrtTitle. 
// Two types of smrtTitle are supported: // movie/zmw/start_end // movie/zmw/start_end/start2_end2 SMRTTitle stitle = SMRTTitle(sam.qName); if (not stitle.isSMRTTitle) { std::cout << "ERROR. Could not parse title " << sam.qName << std::endl; exit(1); } queryPosOffset = stitle.start; } else if (sam.xs) { queryPosOffset += sam.xs - 1; } while (cigarPos < lengths.size()) { int numClipped; // // Sequence clipping becomes offsets into the q/t alignedSeqPos // int numSoftClipped; numClipped = AdvancePastClipping(lengths, ops, cigarPos, numSoftClipped); // // End loop now. // if (cigarPos >= lengths.size()) { break; } qPos += numSoftClipped; // // Skipped sequences are just advances in the tPos. // int numSkipped = AdvancePastSkipped(lengths, ops, cigarPos); tPos += numSkipped; if (cigarPos >= lengths.size()) { break; } AlignmentCandidate<> alignment; // // The aligned sequence must start at a match therefore the tpos // and qpos are 0. // alignment.qPos = 0; alignment.tPos = 0; // qAlignStart is the start of the alignment relative to the sequence in the SAM file. DNALength qAlignStart = qPos; // tAlignStart is the start of the alignment in the genome. DNALength tAlignStart = tPos; int cigarEnd = cigarPos; AdvancePosToAlignmentEnd(ops, cigarEnd); CIGAROpsToBlocks(lengths, ops, cigarPos, cigarEnd, qPos, tPos, alignment); DNALength queryLengthSum = GetAlignedQueryLengthByCIGARSum(ops, lengths); DNALength refLengthSum = GetAlignedReferenceLengthByCIGARSum(ops, lengths); alignment.qAlignedSeqLength = qPos - qAlignStart; alignment.tAlignedSeqLength = tPos - tAlignStart; // // Assign candidate sequences. // // First, the query sequence is straight from the SAM line. 
((DNASequence*)&alignment.qAlignedSeq)->Copy(querySeq, qAlignStart, alignment.qAlignedSeqLength); if (copyQVs) { alignment.ReadOptionalQVs(optionalQVs, qAlignStart, alignment.qAlignedSeqLength); } // The SAM Alignments a alignment.qStrand = IsReverseComplement(sam.flag); alignment.tStrand = 0; alignment.mapQV = sam.mapQV; // // Assign the offsets into the original sequence where the // subsequence starts. // alignment.qAlignedSeqPos = queryPosOffset + qAlignStart; alignment.tAlignedSeqPos = samTStart + tAlignStart; if (sam.rName == "*") { // // No reference, do not add the alignment to the list of // candidates. // continue; } else { int refIndex; int s = refNameToRefListIndex.size(); if (refNameToRefListIndex.find(sam.rName) == refNameToRefListIndex.end()) { std::cout <<" ERROR. SAM Reference " << sam.rName << " is not found in the list of reference contigs." << std::endl; exit(1); } refIndex = refNameToRefListIndex[sam.rName]; alignment.tLength = referenceSequences[refIndex].length; alignment.qLength = sam.seq.size(); alignment.qName = sam.qName; alignment.tName = sam.rName; if (keepRefAsForward == false and alignment.qStrand == 1) { // // Now that the reference sequence has been copied, if it is // on the reverse strand, make the reverse complement for // proper printing. // alignment.tAlignedSeqPos = samTStart + (samTEnd - tAlignStart - alignment.tAlignedSeqLength); if (alignment.tAlignedSeqLength > referenceSequences[refIndex].length || alignment.tAlignedSeqPos > referenceSequences[refIndex].length || alignment.tAlignedSeqLength + alignment.tAlignedSeqPos > referenceSequences[refIndex].length + 2) { //alignment.tAlignedSeqPos is 1 based and unsigned. std::cout << "WARNING. The mapping of read " << alignment.qName << " to reference " << alignment.tName << " is out of bounds." 
<< std::endl << " StartPos (" << alignment.tAlignedSeqPos << ") + AlnLength (" << alignment.tAlignedSeqLength << ") > RefLength (" << referenceSequences[refIndex].length << ") + 2 " << std::endl; continue; } ((DNASequence*)&alignment.tAlignedSeq)->Copy(referenceSequences[refIndex], alignment.tAlignedSeqPos, alignment.tAlignedSeqLength); alignment.tAlignedSeq.ReverseComplementSelf(); // either ref or read is defined as being in the forward // orientation. Here, since refAsForward is false, the read // is forward. Since the read is forward, the aligned // sequences are stored as the reverse complement of the read // and the references. // alignment.tStrand = 1; alignment.qStrand = 0; } else { if (alignment.tAlignedSeqLength > referenceSequences[refIndex].length || alignment.tAlignedSeqPos > referenceSequences[refIndex].length || alignment.tAlignedSeqLength + alignment.tAlignedSeqPos > referenceSequences[refIndex].length + 2) { //alignment.tAlignedSeqPos is 1 based and unsigned. std::cout << "WARNING. The mapping of read " << alignment.qName << " to reference " << alignment.tName << " is out of bounds." << std::endl << " StartPos (" << alignment.tAlignedSeqPos << ") + AlnLength (" << alignment.tAlignedSeqLength << ") > RefLength (" << referenceSequences[refIndex].length << ") + 2 " << std::endl; continue; } ((DNASequence*)&alignment.tAlignedSeq)->Copy(referenceSequences[refIndex], alignment.tAlignedSeqPos, alignment.tAlignedSeqLength); } } if (alignment.blocks.size() > 0) { candidates.push_back(alignment); } } if (candidates.size() > 0 and keepRefAsForward == false and candidates[0].tStrand == 1) { std::reverse(candidates.begin(), candidates.end()); } querySeq.Free(); }