void tokenizePatternFile(std::ifstream& in) { // tokenize a line from the pattern file. The first part will be the pattern and the second // part is the file to write to. std::string lineptr; while(in.good()) { std::getline(in, lineptr); if(lineptr.empty()) { continue; } std::vector<std::string> fields; split(fields, lineptr, "\t"); switch(fields.size()) { case 0: break; case 1: manager.add(fields[0]); if(opts.r_flag) { std::string rcpattern = fields[0]; reverseComplement(rcpattern); manager.add(rcpattern); } break; default: manager.add(fields[0], fields[1]); if(opts.r_flag) { std::string rcpattern = fields[0]; reverseComplement(rcpattern); manager.add(rcpattern, fields[1]); } break; } } }
static void showOverlap(const bam1_t *leftBam, const bam1_t *rightBam) /* If the two reads overlap, show how. */ { const bam1_core_t *leftCore = &(leftBam->core), *rightCore = &(rightBam->core); int leftStart = leftCore->pos, rightStart = rightCore->pos; int leftLen = bamGetTargetLength(leftBam), rightLen = bamGetTargetLength(rightBam); char *leftSeq = bamGetQuerySequence(leftBam, useStrand); char *rightSeq = bamGetQuerySequence(rightBam, useStrand); if (useStrand && bamIsRc(leftBam)) reverseComplement(leftSeq, strlen(leftSeq)); if (useStrand && bamIsRc(rightBam)) reverseComplement(rightSeq, strlen(rightSeq)); if ((rightStart > leftStart && leftStart + leftLen > rightStart) || (leftStart > rightStart && rightStart+rightLen > leftStart)) { int leftClipLow, rightClipLow; bamGetSoftClipping(leftBam, &leftClipLow, NULL, NULL); bamGetSoftClipping(rightBam, &rightClipLow, NULL, NULL); leftStart -= leftClipLow; rightStart -= rightClipLow; printf("<B>Note: End read alignments overlap:</B><BR>\n<PRE><TT>"); int i = leftStart - rightStart; while (i-- > 0) putc(' ', stdout); puts(leftSeq); i = rightStart - leftStart; while (i-- > 0) putc(' ', stdout); puts(rightSeq); puts("</TT></PRE>"); } }
void loadIfNewSeq(char *nibDir, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand)
/* Keep *pSeq holding the sequence called newName on the requested strand.
 * If the name is unchanged the cached sequence is only flipped when the
 * strand differs; otherwise the old sequence is freed and
 * <nibDir>/<newName>.nib is loaded in its place. *pName and *pStrand are
 * updated to describe what *pSeq now holds. */
{
if (!sameString(newName, *pName))
    {
    char nibPath[512];
    struct dnaSeq *loaded;

    freeDnaSeq(pSeq);
    snprintf(nibPath, sizeof(nibPath), "%s/%s.nib", nibDir, newName);
    *pName = newName;
    loaded = nibLoadAllMasked(NIB_MASK_MIXED, nibPath);
    *pSeq = loaded;
    *pStrand = strand;
    if (strand == '-')
        reverseComplement(loaded->dna, loaded->size);
    uglyf("Loaded %d bases in %s\n", loaded->size, nibPath);
    return;
    }

/* Same sequence as last time: flip it only when the strand changed. */
if (strand != *pStrand)
    {
    struct dnaSeq *cached = *pSeq;
    reverseComplement(cached->dna, cached->size);
    *pStrand = strand;
    }
}
void loadFaSeq(struct hash *faHash, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand) /* retrieve sequence from hash. Reverse complement * if necessary. */ { struct dnaSeq *seq; if (sameString(newName, *pName)) { if (strand != *pStrand) { seq = *pSeq; reverseComplement(seq->dna, seq->size); *pStrand = strand; } } else { *pName = newName; *pSeq = seq = hashFindVal(faHash, newName); *pStrand = strand; if (strand == '-') reverseComplement(seq->dna, seq->size); verbose(1, "Loaded %d bases from %s fa\n", seq->size, newName); } }
static void loadFaSeq(struct hash *faHash, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand, char *fastaFileName) /* retrieve sequence from hash. Reverse complement * if necessary. */ { struct dnaSeq *seq; if (sameString(newName, *pName)) { if (strand != *pStrand) { seq = *pSeq; reverseComplement(seq->dna, seq->size); *pStrand = strand; } } else { *pName = newName; *pSeq = seq = hashFindVal(faHash, newName); if (NULL == seq) errAbort("ERROR: can not find sequence name '%s' from fasta file '%s'\n", newName, fastaFileName); *pStrand = strand; if (strand == '-') reverseComplement(seq->dna, seq->size); verbose(1, "Loaded %d bases from %s fa\n", seq->size, newName); } }
static void makeProfile(DNA *oligo, int oligoSize, int mismatchesAllowed, struct seqList *seqList, boolean considerRc, double profile[16][4])
/* Scan every sequence in seqList for windows of oligoSize bases that match
 * oligo to within mismatchesAllowed mismatches, and turn the per-position
 * base counts of those windows into frequencies in profile[0..oligoSize-1].
 * With considerRc set, windows matching the reverse complement of oligo are
 * counted too (after being flipped back to the oligo's orientation); a window
 * that matches in both orientations is counted twice.
 * If nothing matches, the filled rows of profile are all zero.
 * oligoSize must be at most 16, the number of rows in counts and profile. */
{
int counts[16][4];
int total = 0;
double invTotal;
int i,j;
struct seqList *seqEl;
DNA rcOligo[17];

/* counts[] and profile[] have 16 rows whether or not the reverse strand is
 * considered, so check the size up front. */
assert(oligoSize >= 0 && oligoSize <= 16);

if (considerRc)
    {
    memcpy(rcOligo, oligo, oligoSize);
    reverseComplement(rcOligo, oligoSize);
    }
zeroBytes(counts, sizeof(counts));
for (seqEl = seqList; seqEl != NULL; seqEl = seqEl->next)
    {
    struct dnaSeq *seq = seqEl->seq;
    DNA *dna = seq->dna;
    int size = seq->size;
    int endIx = size-oligoSize;
    for (i=0; i<=endIx; ++i)
        {
        DNA *target = dna+i;
        if (allGoodBases(target, oligoSize))
            {
            if (mismatchCount(oligo, target, oligoSize) <= mismatchesAllowed)
                {
                ++total;
                for (j=0; j<oligoSize; ++j)
                    counts[j][ntVal[(int)target[j]]] += 1;
                }
            if (considerRc && mismatchCount(rcOligo, target, oligoSize) <= mismatchesAllowed)
                {
                ++total;
                /* Flip the window in place to tally it in oligo orientation,
                 * then flip it back so the sequence is left unchanged. */
                reverseComplement(target, oligoSize);
                for (j=0; j<oligoSize; ++j)
                    counts[j][ntVal[(int)target[j]]] += 1;
                reverseComplement(target, oligoSize);
                }
            }
        }
    }

/* Avoid 1.0/0: with no matches all counts are zero, and so is the profile. */
invTotal = (total > 0) ? 1.0/total : 0.0;
for (i=0; i<oligoSize; ++i)
    {
    for (j=0; j<4; ++j)
        profile[i][j] = invTotal * counts[i][j];
    }
}
struct dnaSeq *gfiExpandAndLoadCached(struct gfRange *range, struct hash *tFileCache, char *tSeqDir, int querySize, int *retTotalSeqSize, boolean respectFrame, boolean isRc, int expansion)
/* Expand range by expansion bases on either side, then load and return the
 * target sequence for the expanded range. (Done together because the target
 * size is not known before the file is opened.) Open nib / 2bit handles are
 * kept in tFileCache keyed by file name. With isRc set, the range is flipped
 * for the expansion and load, the loaded DNA is reverse-complemented, and the
 * range is flipped back. The full target size goes to *retTotalSeqSize. */
{
struct dnaSeq *target = NULL;
char fileName[PATH_LEN+256];
int totalSize = 0;

safef(fileName, sizeof(fileName), "%s/%s", tSeqDir, range->tName);
if (nibIsFile(fileName))
    {
    struct nibInfo *nib = hashFindVal(tFileCache, fileName);
    if (nib == NULL)
        {
        nib = nibInfoNew(fileName);
        hashAdd(tFileCache, fileName, nib);
        }
    totalSize = nib->size;
    if (isRc)
        reverseIntRange(&range->tStart, &range->tEnd, totalSize);
    gfiExpandRange(range, querySize, totalSize, respectFrame, isRc, expansion);
    target = nibLdPart(fileName, nib->f, totalSize, range->tStart, range->tEnd - range->tStart);
    }
else
    {
    /* 2bit target: the name is "<file>:<sequence>", split here at the colon. */
    struct twoBitFile *tbf = NULL;
    char *tSeqName = strchr(fileName, ':');
    if (tSeqName == NULL)
        errAbort("No colon in .2bit response from gfServer");
    *tSeqName++ = 0;
    tbf = hashFindVal(tFileCache, fileName);
    if (tbf == NULL)
        {
        tbf = twoBitOpen(fileName);
        hashAdd(tFileCache, fileName, tbf);
        }
    totalSize = twoBitSeqSize(tbf, tSeqName);
    if (isRc)
        reverseIntRange(&range->tStart, &range->tEnd, totalSize);
    gfiExpandRange(range, querySize, totalSize, respectFrame, isRc, expansion);
    target = twoBitReadSeqFragLower(tbf, tSeqName, range->tStart, range->tEnd);
    }

/* Shared tail of both branches: flip the DNA and restore the range. */
if (isRc)
    {
    reverseComplement(target->dna, target->size);
    reverseIntRange(&range->tStart, &range->tEnd, totalSize);
    }
*retTotalSeqSize = totalSize;
return target;
}
void ReadsLayout::print(size_t index, ostream &out, bool dir, unsigned int start, unsigned int maxD, Pairing *P) { if (getNext(index) != 0) { cerr << "void ReadsLayout::print(size_t index) problem\n"; sendBugReportPlease(cerr); } if (!dir) index = reverseComplement(index); size_t p = getBegin(index); size_t tmp; do { unsigned int position=getPosition(p); if (position > maxD) break; if (position < start) { tmp = p; p = getNext(p); continue; } unsigned int pairedRead=0; unsigned int pairedNode=0; int lib=0; if (P->getNLibrary() != 0) { pairedRead = P->getPairing(p); pairedNode = getNodeId(pairedRead); lib = P->getPeLibraryID(p); } if (getDirection(p)) out << '>'; else out << '<'; for (int i = 0; i < getPosition(p) % 120; i++) out << " "; if (getDirection(p)) out << getDirectRead(p) << " " << p << ' ' << lib << ' ' << pairedNode << '\n'; else out << getReverseRead(p) << " " << p << ' ' << lib << ' ' << pairedNode << '\n'; tmp = p; p = getNext(p); } while (tmp != index); out << flush; if (!dir) //back to initial direction index = reverseComplement(index); }
static void makeDirFasta(char *regionsFile, char *hg18FastaFile, char *dir, int num) { FILE *fp, *sq; char buf[500], dirName[500], seqName[500], chr1[500], chr2[500]; int b1, e1, b2, e2, i, len; char ori1, ori2; struct hash *seqHash = NULL; struct dnaSeq *seq1, *seq2; struct stat st; DNA *s1, *s2; seqHash = faReadAllIntoHash(hg18FastaFile, dnaUpper); if (stat(dir, &st) != 0) do_cmd("mkdir %s", dir); fp = mustOpen(regionsFile, "r"); i = 0; while (fgets(buf, 500, fp)) { if (sscanf(buf, "%[^:]:%d-%d %[^:]:%d-%d [%c %c]", chr1, &b1, &e1, chr2, &b2, &e2, &ori1, &ori2) != 8) errAbort("error: %s", buf); ++i; if (i != num) continue; sprintf(dirName, "%s/R%d", dir, i); if (stat(dirName, &st) != 0) do_cmd("mkdir %s", dir); sprintf(seqName, "%s/ref.fa", dirName); sq = mustOpen(seqName, "w"); fprintf(sq, ">%s:%d-%d+%s:%d-%d[%c%c]\n", chr1, b1, e1, chr2, b2, e2, ori1, ori2); seq1 = (struct dnaSeq *)hashFindVal(seqHash, chr1); assert(e1 <= seq1->size); len = e1 - b1 + 1; if (ori1 == '-') { s1 = cloneStringZExt(seq1->dna + b1 - 1, len, len+1); reverseComplement(s1, len); writeSeqWithBreaks(sq, s1, len, 80); freeMem(s1); } else writeSeqWithBreaks(sq, seq1->dna + b1 - 1, e1 - b1 + 1, 80); seq2 = (struct dnaSeq *)hashFindVal(seqHash, chr2); assert(e2 <= seq2->size); len = e2 - b2 + 1; if (ori2 == '-') { s2 = cloneStringZExt(seq2->dna + b2 - 1, len, len+1); reverseComplement(s2, len); writeSeqWithBreaks(sq, s2, len, 80); freeMem(s2); } else writeSeqWithBreaks(sq, seq2->dna + b2 - 1, e2 - b2 + 1, 80); fclose(sq); } fclose(fp); //FIXME: free space }
void CommonTest::test_reverseComplement() { // Case 1: upper case std::string nucs = "ACGATCGTGTCATGCNNACCACG"; std::string rev = reverseComplement(nucs); CPPUNIT_ASSERT_MESSAGE("Incorrect reverse complement", rev == "CGTGGTNNGCATGACACGATCGT"); // Case 2: lower case nucs = "acgaccacagctacgacnacgactan"; rev = reverseComplement(nucs); CPPUNIT_ASSERT_MESSAGE("Incorrect reverse complement", rev == "NTAGTCGTNGTCGTAGCTGTGGTCGT"); }
void showTargetRange(struct xaAli *xa, int tOff, int tLen, char strand, boolean showSym)
/* Print the part of alignment xa that covers tLen target bases starting at
 * target offset tOff, in rows of at most 50 columns: query line, a line with
 * '|' under case-insensitively identical columns, target line, blank line.
 * For strand '-' the displayed region is flipped in place while printing and
 * flipped back before returning. showSym is currently unused (the hidden
 * symbol line is commented out). */
{
char *hidden = xa->hSym, *query = xa->qSym, *target = xa->tSym;
int maxLineLen = 50;
int isMinus = (strand == '-');
int rowStart, col, rowLen;

/* Translate target offsets into alignment columns, taking inserts (dashes)
 * in the target line into account. */
int regionStart = lenWithDashes(target, tOff);
int regionLen = lenWithDashes(target+regionStart, tLen);
int regionEnd = regionStart + regionLen;

if (isMinus)
    {
    reverseComplement(query+regionStart, regionLen);
    reverseComplement(target+regionStart, regionLen);
    reverseBytes(hidden+regionStart, regionLen);
    }

for (rowStart = regionStart; rowStart < regionEnd; rowStart += rowLen)
    {
    rowLen = regionEnd - rowStart;
    if (rowLen > maxLineLen)
        rowLen = maxLineLen;

    mustWrite(stdout, query+rowStart, rowLen);
    fputc('\n', stdout);

    for (col = 0; col < rowLen; ++col)
        {
        char mark = ' ';
        if (toupper(query[rowStart+col]) == toupper(target[rowStart+col]))
            mark = '|';
        fputc(mark, stdout);
        }
    fputc('\n', stdout);

    mustWrite(stdout, target+rowStart, rowLen);
    fputc('\n', stdout);
    //if (showSym) { mustWrite(stdout, hSym+i, lineLen); fputc('\n', stdout); }
    fputc('\n', stdout);
    }

/* Undo the in-place flip so xa is left as it was found. */
if (isMinus)
    {
    reverseComplement(query+regionStart, regionLen);
    reverseComplement(target+regionStart, regionLen);
    reverseBytes(hidden+regionStart, regionLen);
    }
}
bool HapgenUtil::makeFlankingHaplotypes(const HapgenAlignment& aln, const ReadTable* pRefTable, int flanking, const StringVector& inHaplotypes, StringVector& outFlankingHaplotypes, StringVector& outHaplotypes) { std::string upstream; std::string referenceHaplotype; std::string downstream; extractReferenceSubstrings(aln, pRefTable, flanking, upstream, referenceHaplotype, downstream); // Flip reference strings to match the strand of the input haplotypes if(aln.isRC) { // reverse complement each string upstream = reverseComplement(upstream); referenceHaplotype = reverseComplement(referenceHaplotype); downstream = reverseComplement(downstream); // Swap up and downstream upstream.swap(downstream); } // Make the reference haplotype w/ flanking sequence std::string referenceFlanking = upstream + referenceHaplotype + downstream; outFlankingHaplotypes.push_back(referenceFlanking); outHaplotypes.push_back(referenceHaplotype); // Check that all sequences match the reference haplotype properly /* bool checkOk = checkAlignmentsAreConsistent(referenceFlanking, inHaplotypes); if(!checkOk) { outHaplotypes.clear(); return false; } */ // Make the flanking sequences for each haplotype for(size_t i = 0; i < inHaplotypes.size(); ++i) { // Skip if the input haplotype exactly matches the reference if(inHaplotypes[i] != referenceHaplotype) { outFlankingHaplotypes.push_back(upstream + inHaplotypes[i] + downstream); outHaplotypes.push_back(inHaplotypes[i]); } } return true; }
void NGSReadSet::processReadWhileParsing(NGSRead &tempread)
{
    //if (!tempread.flag) return;

    // Reads on the reverse direction are flipped together with their scaffold.
    if (!tempread.direction) {
        reverseComplement(tempread.scaff);
        reverseComplement(tempread.read);
    }
    tempread.convertStateStr(tempread.scaff, SEQ_DNA);
    tempread.convertStateStr(tempread.read, SEQ_DNA);
    assert(tempread.scaff.length() == tempread.read.length());

    // One extra state is used when gaps are not ignored.
    const int nstates = ngs_ignore_gaps ? 4 : 5;

    // 'column' counts only the positions that are kept; positions where
    // either state is >= nstates are skipped and do not advance it.
    int column = 0;
    for (int pos = 0; pos < tempread.scaff.length(); pos++) {
        const int scaffState = tempread.scaff[pos];
        const int readState = tempread.read[pos];
        if (scaffState >= nstates || readState >= nstates)
            continue;

        // Grow the per-column tables with zero-filled rows as needed.
        while (column >= state_freq.size())
            state_freq.push_back(new double[nstates]());
        state_freq[column][readState] += 1.0/tempread.times;

        while (column >= pair_freq.size())
            pair_freq.push_back(new double[(nstates) * (nstates)]());
        pair_freq[column][scaffState*(nstates) + readState] += 1.0/tempread.times;

        column++;
    }

    if (!tree)
        return;

    // With a tree present, also record per-read distance and likelihood,
    // first with the set's homo_rate and then with a rate of zero.
    ReadInfo info;
    tempread.homo_rate = homo_rate;
    tempread.computePairFreq();
    info.homo_distance = tempread.optimizeDist(1.0-tempread.identity);
    info.homo_logl = -tempread.computeFunction(info.homo_distance);
    tempread.homo_rate = 0.0;
    info.distance = tempread.optimizeDist(info.homo_distance);
    info.logl = -tempread.computeFunction(info.distance);
    info.id = tempread.id;
    info.identity = tempread.identity;
    push_back(info);
}
boolean fastFind(DNA *needle, int needleSize, struct patSpace *ps, struct ffAli **retAli, boolean *retRc, int *retScore)
/* Align needle against the sequences indexed by ps, trying it as given and
 * reverse-complemented. On success returns TRUE and hands back the first
 * alignment after sorting with cmpAliList (caller owns it), whether it came
 * from the reverse-complemented pass, and its score. Returns FALSE, leaving
 * the outputs untouched, when nothing aligns. needle is flipped in place
 * during the second pass and restored afterwards. */
{
struct aliList *hits = NULL, *hit, *best;
boolean isRc;

for (isRc = 0; isRc <= 1; ++isRc)
    {
    struct patClump *clumps, *clump;

    if (isRc)
        reverseComplement(needle, needleSize);

    clumps = patSpaceFindOne(ps, needle, needleSize);
    if (clumps != NULL)
        {
        for (clump = clumps; clump != NULL; clump = clump->next)
            {
            /* Restrict the detailed alignment to the clump's window. */
            DNA *hayStart = clump->seq->dna + clump->start;
            DNA *hayEnd = hayStart + clump->size;
            struct ffAli *ff = ffFind(needle, needle+needleSize, hayStart, hayEnd, ffCdna);
            if (ff == NULL)
                continue;
            AllocVar(hit);
            hit->ali = ff;
            hit->score = ffScoreCdna(ff);
            hit->isRc = isRc;
            slAddHead(&hits, hit);
            }
        slFreeList(&clumps);
        }

    if (isRc)
        reverseComplement(needle, needleSize);
    }

if (hits == NULL)
    return FALSE;

slSort(&hits, cmpAliList);
best = hits;
*retAli = best->ali;
best->ali = NULL;   /* ownership passes to the caller */
*retRc = best->isRc;
*retScore = best->score;
for (hit = best->next; hit != NULL; hit = hit->next)
    ffFreeAli(&hit->ali);
slFreeList(&hits);
return TRUE;
}
// Merging ARBRC: R and B into RBR // First, the sequence of the vertex is extended // by the the content of the edge label void Vertex::mergeTipVertex(Edge* pEdge) { Edge* pTwin = pEdge->getTwin(); //std::cout << "Adding label to " << getID() << " str: " << pSE->getLabel() << "\n"; // Merge the sequence DNAEncodedString label1 = pEdge->getLabel(); DNAEncodedString label2 = pTwin->getLabel(); size_t RB_len = label1.length()+label2.length(); //merge R and B into RBR if(pEdge->getDir() == ED_SENSE && pTwin->getComp()==EC_SAME) { m_seq.append(label1); m_seq.append(label2); } else if(pEdge->getDir() == ED_SENSE && pTwin->getComp()==EC_REVERSE) { m_seq.append(label1); DNAEncodedString tmp(reverseComplement(label2.toString())); m_seq.append(tmp); } else if(pEdge->getDir() == ED_ANTISENSE && pTwin->getComp()==EC_SAME) { label2.append(label1); label2.append(m_seq); m_seq=label2; } else { DNAEncodedString tmp(reverseComplement(label2.toString())); tmp.append(label1); tmp.append(m_seq); m_seq=tmp; } // All the SeqCoords for the edges must have their seqlen field updated // Also, if we prepended sequence to this edge, all the matches in the // SENSE direction must have their coordinates offset size_t newLen = m_seq.length(); for(EdgePtrVecIter iter = m_edges.begin(); iter != m_edges.end(); ++iter) { Edge* pUpdateEdge = *iter; pUpdateEdge->updateSeqLen(newLen); //add offset RB to each sense edge if(pUpdateEdge->getDir() == ED_SENSE && pEdge != pUpdateEdge) pUpdateEdge->offsetMatch(RB_len); } }
void correctEst(struct psl *psl, struct dnaSeq *est, struct dnaSeq *geno)
/* Overwrite each aligned block of the EST with the genome bases it aligns to.
 * For a '-' strand alignment the EST is reverse-complemented for the copy and
 * flipped back afterwards. */
{
const int nBlocks = psl->blockCount;
const int isMinus = (psl->strand[0] == '-');
int block;

if (isMinus)
    reverseComplement(est->dna, est->size);

for (block = 0; block < nBlocks; ++block)
    memcpy(est->dna + psl->qStarts[block], geno->dna + psl->tStarts[block], psl->blockSizes[block]);

if (isMinus)
    reverseComplement(est->dna, est->size);
}
//printing all variables void printTable(string transcriptID, string mispos, string ref, int cov, int insertion, int deletion, baseCounter counter) { int same = counter["A"] + counter["C"] + counter["G"] + counter["T"]; int reverse = counter["a"] + counter["c"] + counter["g"] + counter["t"]; if (same > 0){ cout << transcriptID << "\t" ; cout << mispos << "\t"; cout << atoi(mispos.c_str())+1 << '\t'; cout << ref << "\t"; cout << cov << "\t"; cout << '+' << "\t"; cout << counter["A"] << "\t" << counter["C"] << "\t"; cout << counter["T"] << "\t" << counter["G"] << "\t" ; cout << insertion << "\t" << deletion; cout << '\n'; } if (reverse > 0){ cout << transcriptID << "\t" ; cout << mispos << "\t"; cout << atoi(mispos.c_str())+1 << '\t'; cout << reverseComplement(ref) << "\t"; cout << cov << "\t"; cout << '-' << "\t"; cout << counter["a"] << "\t" << counter["c"] << "\t"; cout << counter["t"] << "\t" << counter["g"] << "\t" ; cout << insertion << "\t" << deletion; cout << '\n'; } }
void BamToFastq::SingleFastq() { // open the 1st fastq file for writing ofstream fq(_fastq1.c_str(), ios::out); if ( !fq ) { cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened. Exiting!" << endl; exit (1); } // open the BAM file BamReader reader; reader.Open(_bamFile); BamAlignment bam; while (reader.GetNextAlignment(bam)) { // extract the sequence and qualities for the BAM "query" string seq = bam.QueryBases; string qual = bam.Qualities; if (bam.IsReverseStrand() == true) { reverseComplement(seq); reverseSequence(qual); } fq << "@" << bam.Name << endl; fq << seq << endl; fq << "+" << endl; fq << qual << endl; } }
struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed) /* Get sequence defined by bed. Exclude introns. */ { struct dnaSeq *seq; if (bed->blockCount <= 1) { seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd); freeMem(seq->name); seq->name = cloneString(bed->name); } else { int totalBlockSize = bedTotalBlockSize(bed); AllocVar(seq); seq->name = cloneString(bed->name); seq->dna = needMem(totalBlockSize+1); seq->size = totalBlockSize; int i; int seqOffset = 0; for (i=0; i<bed->blockCount; ++i) { int exonSize = bed->blockSizes[i]; int exonStart = bed->chromStart + bed->chromStarts[i]; struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize); memcpy(seq->dna + seqOffset, exon->dna, exonSize); seqOffset += exonSize; dnaSeqFree(&exon); } } if (bed->strand[0] == '-') reverseComplement(seq->dna, seq->size); return seq; }
void foldPslIntoStats(struct psl *psl, struct dnaSeq *tSeq, struct hash *otherHash, struct stats *stats) /* Load sequence corresponding to bed and add alignment stats. */ { struct dnaSeq *qSeq = loadSomeSeq(otherHash, psl->qName, psl->qStart, psl->qEnd); int i, bCount = psl->blockCount; int qOffset; // uglyf("%s:%d-%d %s %s:%d-%d\n", psl->qName, psl->qStart, psl->qEnd, psl->strand, psl->tName, psl->tStart, psl->tEnd); if (qSeq != NULL && tSeq != NULL) { if (psl->strand[0] == '-') { reverseComplement(qSeq->dna, qSeq->size); qOffset = psl->qSize - psl->qEnd; } else qOffset = psl->qStart; if (psl->strand[1] == '-') errAbort("Can't yet handle reverse complemented targets"); for (i=0; i<bCount; ++i) { int bSize = psl->blockSizes[i]; stats->bedBaseAli += bSize; stats->bedBaseMatch += baseMatch(qSeq->dna + psl->qStarts[i] - qOffset, tSeq->dna + psl->tStarts[i], bSize); } } freeDnaSeq(&qSeq); }
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceBWASW(const std::string& haplotype, const BWTIndexSet& referenceIndex, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceBWASW") LRAlignment::LRParams params; params.zBest = 20; for(size_t i = 0; i <= 1; ++i) { LRAlignment::LRHitVector hits; std::string query = (i == 0) ? haplotype : reverseComplement(haplotype); LRAlignment::bwaswAlignment(query, referenceIndex.pBWT, referenceIndex.pSSA, params, hits); // Convert the hits into alignments for(size_t j = 0; j < hits.size(); ++j) { int q_alignment_length = hits[j].q_end - hits[j].q_start; // Skip non-complete alignments if((int)haplotype.length() == q_alignment_length) { HapgenAlignment aln(hits[j].targetID, hits[j].t_start, hits[j].length, hits[j].G, i == 1); outAlignments.push_back(aln); } } } }
boolean sameStickyEnd(struct cutter *enz1, struct cutter *enz2) /* Check to see if two enzymes make the same sticky ends. If either of the enzymes have sticky ends that isn't all ACGT, then this returns false. */ { boolean ret = FALSE; struct dnaSeq *sticky1 = stickyEnd(enz1); struct dnaSeq *sticky2 = stickyEnd(enz2); if (sticky1 && sticky2) if (sticky1 && sticky2 && (sticky1->size == sticky2->size) && (acgtCount(sticky1->dna) == sticky1->size) && (acgtCount(sticky2->dna) == sticky2->size)) { if (sameString(sticky1->dna, sticky2->dna)) ret = TRUE; else { reverseComplement(sticky2->dna, sticky2->size); if (sameString(sticky1->dna, sticky2->dna)) ret = TRUE; } } freeDnaSeq(&sticky1); freeDnaSeq(&sticky2); return ret; }
struct dnaSeq *genePredToGenomicSequence(struct genePred *pred, char *chromSeq, struct lm *lm) /* Return concatenated genomic sequence of exons of pred. */ { int txLen = 0; int i; for (i=0; i < pred->exonCount; i++) txLen += (pred->exonEnds[i] - pred->exonStarts[i]); char *seq = lmAlloc(lm, txLen + 1); int offset = 0; for (i=0; i < pred->exonCount; i++) { int blockStart = pred->exonStarts[i]; int blockSize = pred->exonEnds[i] - blockStart; memcpy(seq+offset, chromSeq+blockStart, blockSize*sizeof(*seq)); offset += blockSize; } if(pred->strand[0] == '-') reverseComplement(seq, txLen); struct dnaSeq *txSeq = NULL; lmAllocVar(lm, txSeq); txSeq->name = lmCloneString(lm, pred->name); txSeq->dna = seq; txSeq->size = txLen; return txSeq; }
static char *gpFxModifyCodingSequence(char *oldCodingSeq, struct genePred *pred, int startInCds, int endInCds, struct allele *allele, int *retCdsBasesAdded, struct lm *lm)
/* Return a new coding sequence: oldCodingSeq with the bases from startInCds
 * to endInCds replaced by the allele's sequence, truncated at an early stop
 * codon if one appears. An allele that is not all nucleotides is treated as
 * empty. For a '-' strand gene the allele is reverse-complemented (on a copy
 * from lm) first. If retCdsBasesAdded is non-NULL it receives allele->length
 * minus the variant's size on the reference. */
{
char *alleleSeq = allele->sequence;
int alleleLen = strlen(alleleSeq);

if (isAllNt(alleleSeq, alleleLen))
    {
    if (alleleLen > 0 && pred->strand[0] == '-')
        {
        /* Work on a copy so allele->sequence itself is not modified. */
        alleleSeq = lmCloneString(lm, alleleSeq);
        reverseComplement(alleleSeq, alleleLen);
        }
    }
else
    {
    // symbolic -- may be deletion or insertion, but we can't tell.  :(
    alleleSeq = "";
    alleleLen = 0;
    }

int variantSizeOnCds = endInCds - startInCds;
if (variantSizeOnCds < 0)
    errAbort("gpFx: endInCds (%d) < startInCds (%d)", endInCds, startInCds);

char *newCodingSeq = mergeAllele(oldCodingSeq, startInCds, variantSizeOnCds, alleleSeq, alleleLen, lm);
// If newCodingSequence has an early stop, truncate there:
truncateAtStopCodon(newCodingSeq);

int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart;
if (retCdsBasesAdded)
    *retCdsBasesAdded = allele->length - variantSizeOnRef;
return newCodingSeq;
}
/**********************************************************************************************************************
	Search a read in the dataset using binary search.

	The search key is whichever of the read and its reverse complement compares
	smaller, and it is compared against each stored read's forward string, so
	the reads are expected to be ordered by forward string.

	Returns the matching Read. When nothing matches (including an empty
	dataset) MYEXIT is invoked with a "String not found" message.
**********************************************************************************************************************/
Read * Dataset::getReadFromString(const string & read)
{
	const string readReverse = reverseComplement(read);
	const string & key = (read.compare(readReverse) < 0) ? read : readReverse;

	// Half-open interval [low, high). With unsigned indices this cannot wrap
	// around below zero the way a closed interval with "max = mid - 1" does
	// when mid is 0, and it copes with a dataset that has no reads at all.
	UINT64 low = 0, high = getNumberOfUniqueReads();
	while (low < high)
	{
		const UINT64 mid = low + (high - low) / 2;	// Cannot overflow, unlike (low + high) / 2.
		Read * candidate = reads->at(mid);
		const int comparator = candidate->getStringForward().compare(key);
		if(comparator == 0)
			return candidate;
		if(comparator < 0)	// Candidate sorts before the key: search the upper part.
			low = mid + 1;
		else			// Candidate sorts after the key: search the lower part.
			high = mid;
	}
	MYEXIT("String not found in Dataset: "+read);
	return NULL;	// Only reached if MYEXIT returns.
}
// Validate that the edge members are sane void Edge::validate() const { const Edge* pTwin = getTwin(); std::string m_v1 = getMatchStr(); std::string m_v2 = pTwin->getMatchStr(); if(getComp() == EC_REVERSE) m_v2 = reverseComplement(m_v2); bool error = false; if(m_v1.length() != m_v2.length()) { std::cerr << "Error, matching strings are not the same length\n"; error = true; } if(error) { std::cerr << "V1M: " << m_v1 << "\n"; std::cerr << "V2M: " << m_v2 << "\n"; std::cerr << "V1MC: " << getMatchCoord() << "\n"; std::cerr << "V2MC: " << pTwin->getMatchCoord() << "\n"; std::cerr << "V1: " << getStart()->getSeq() << "\n"; std::cerr << "Validation failed for edge " << *this << "\n"; assert(false); } }
void verify_node_orig(kg_node_t * node, unsigned kmer_length) { assert( false && "TODO FIX! REVERSED KMER ENDIANNESS" ); int double_kmer_length = kmer_length << 1; #ifdef LARGE_KMERS Kmer mask; mask.createMask(double_kmer_length); #else Kmer mask = (Kmer(1) << double_kmer_length) - 1; #endif Kmer kmer = node->kmer; Kmer rc_kmer = reverseComplement(kmer, kmer_length); char leftmost_base = (kmer >> (double_kmer_length - 2)) & 0x3; char rightmost_base = kmer & 0x3; for (int i = 0 ; i < 4 ; ++ i) { // check on the left side kg_node_t * node2 = node->left[i]; int count = node->left_count[i]; if (node2) { assert (count != 0); if (count > 0) { Kmer kmer2 = KMER_PREPEND(kmer, i, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->right[(int)rightmost_base] == node); assert(node2->right_count[(int)rightmost_base] == count); } else { Kmer kmer2 = KMER_APPEND(rc_kmer, i ^ 0x3, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->left[rightmost_base ^ 0x3] == node); assert(node2->left_count[rightmost_base ^ 0x3] == count); } } else { assert (count == 0); } // check on the right side node2 = node->right[i]; count = node->right_count[i]; if (node2) { assert (count != 0); if (count > 0) { Kmer kmer2 = KMER_APPEND(kmer, i, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->left[(int)leftmost_base] == node); assert(node2->left_count[(int)leftmost_base] == count); } else { Kmer kmer2 = KMER_PREPEND(rc_kmer, i ^ 0x3, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->right[leftmost_base ^ 0x3] == node); assert(node2->right_count[leftmost_base ^ 0x3] == count); } } else { assert (count == 0); } } }
std::string SGPairedAlgorithms::pathToString(const Vertex* pX, const Path& path) { std::string out = pX->getSeq().toString(); EdgeComp currComp = EC_SAME; for(size_t i = 0; i < path.size(); ++i) { Edge* pYZ = path[i]; EdgeComp ecYZ = pYZ->getComp(); // Calculate the next comp, between X and Z EdgeComp ecXZ; if(ecYZ == EC_SAME) ecXZ = currComp; else ecXZ = !currComp; std::string edge_str = pYZ->getLabel(); assert(edge_str.size() != 0); if(currComp == EC_REVERSE) edge_str = reverseComplement(edge_str); out.append(edge_str); currComp = ecXZ; } return out; }
// Returns true if changing the given reference base is detectable with kmers KmerCounts computeChangeCounts(const BWTIndexSet& ref_index, std::string& sequence, size_t base_idx, char new_base) { // Introduce the change char old_base = sequence[base_idx]; sequence[base_idx] = new_base; size_t l = sequence.length(); // Iterate over kmers covering this position size_t start_k_idx = (base_idx + 1) > opt::kmer ? base_idx + 1 - opt::kmer : 0; size_t end_k_idx = (base_idx + opt::kmer) < l ? base_idx : l - opt::kmer; assert(end_k_idx - start_k_idx <= opt::kmer); KmerCounts counts; counts.total = 0; counts.zero = 0; for(size_t ki = start_k_idx; ki <= end_k_idx; ++ki) { std::string ks = sequence.substr(ki, opt::kmer); size_t occ = BWTAlgorithms::countSequenceOccurrences(ks, ref_index) + BWTAlgorithms::countSequenceOccurrences(reverseComplement(ks), ref_index); counts.total += 1; counts.zero += (occ == 0) ? 1 : 0; } // Reset the base sequence[base_idx] = old_base; return counts; }
static void simpleFillInSequence(char *seqDir, struct agpFrag *agpList, DNA *dna, int dnaSize)
/* Fill in DNA array with sequences from simple clones. For every fragment in
 * agpList the clone name is the fragment name up to its first '_'; the clone
 * is read from <seqDir>/<clone>.fa (which must hold exactly one sequence),
 * the fragStart..fragEnd part is reverse-complemented for '-' strand
 * fragments and copied to dna at chromStart.
 * Aborts on over-long names and on fragments that do not fit the clone or
 * the destination (dnaSize is taken to be the number of bases dna can hold). */
{
struct agpFrag *agp;
char underline = '_';

for (agp = agpList; agp != NULL; agp = agp->next)
    {
    char clone[128];
    char path[512];
    struct dnaSeq *seq;
    int fragStart = agp->fragStart;
    int fragEnd = agp->fragEnd;
    int chromStart = agp->chromStart;
    int size = fragEnd - fragStart;
    int n;

    /* Bounded copies: fail loudly instead of overrunning the buffers. */
    n = snprintf(clone, sizeof(clone), "%s", agp->frag);
    if (n < 0 || n >= (int)sizeof(clone))
        errAbort("Fragment name %s is too long.", agp->frag);
    chopSuffixAt(clone,underline);
    n = snprintf(path, sizeof(path), "%s/%s.fa", seqDir, clone);
    if (n < 0 || n >= (int)sizeof(path))
        errAbort("Path %s/%s.fa is too long.", seqDir, clone);

    seq = faReadAllDna(path);
    if (slCount(seq) != 1)
        errAbort("Can only handle exactly one clone in %s.", path);

    /* Make sure both the source range and the destination range are valid
     * before touching memory. */
    if (fragStart < 0 || size < 0 || fragEnd > seq->size)
        errAbort("Fragment %s range %d-%d does not fit the %d bases in %s.",
                 agp->frag, fragStart, fragEnd, seq->size, path);
    if (chromStart < 0 || size > dnaSize - chromStart)
        errAbort("Fragment %s (%d bases at %d) does not fit in %d bases of output.",
                 agp->frag, size, chromStart, dnaSize);

    if (agp->strand[0] == '-')
        reverseComplement(seq->dna + fragStart, size);
    memcpy(dna + chromStart, seq->dna + fragStart, size);
    freeDnaSeq(&seq);
    }
}