Example #1
0
void tokenizePatternFile(std::ifstream& in) {
    // Read the pattern file line by line.  Every non-empty line is split on tabs:
    // field 0 is the pattern and, when a second field exists, field 1 is the file
    // to write to.  Each pattern is registered with the manager; when opts.r_flag
    // is set its reverse complement is registered the same way.

    std::string line;

    while(std::getline(in, line)) {
        if(line.empty()) {
            continue;
        }

        std::vector<std::string> parts;
        split(parts, line, "\t");
        if(parts.empty()) {
            continue;
        }

        // Any fields beyond the second are ignored.
        const bool hasTarget = parts.size() > 1;

        if(hasTarget) {
            manager.add(parts[0], parts[1]);
        } else {
            manager.add(parts[0]);
        }

        if(opts.r_flag) {
            std::string rcpattern = parts[0];
            reverseComplement(rcpattern);
            if(hasTarget) {
                manager.add(rcpattern, parts[1]);
            } else {
                manager.add(rcpattern);
            }
        }
    }
}
Example #2
0
static void showOverlap(const bam1_t *leftBam, const bam1_t *rightBam)
/* When the aligned ranges of the two reads overlap, print both read
 * sequences on consecutive lines, each padded with spaces so that they
 * line up by (soft-clip adjusted) start position.  Prints nothing when
 * the ranges do not overlap.
 * NOTE(review): leftSeq and rightSeq are never released in this function --
 * confirm whether bamGetQuerySequence returns memory the caller must free. */
{
int leftStart = leftBam->core.pos;
int rightStart = rightBam->core.pos;
int leftLen = bamGetTargetLength(leftBam);
int rightLen = bamGetTargetLength(rightBam);
char *leftSeq = bamGetQuerySequence(leftBam, useStrand);
char *rightSeq = bamGetQuerySequence(rightBam, useStrand);
int rightStartsInLeft, leftStartsInRight;

if (useStrand)
    {
    if (bamIsRc(leftBam))
        reverseComplement(leftSeq, strlen(leftSeq));
    if (bamIsRc(rightBam))
        reverseComplement(rightSeq, strlen(rightSeq));
    }

/* Note that two reads starting at the same position satisfy neither test. */
rightStartsInLeft = (rightStart > leftStart && leftStart + leftLen > rightStart);
leftStartsInRight = (leftStart > rightStart && rightStart + rightLen > leftStart);
if (!rightStartsInLeft && !leftStartsInRight)
    return;

int leftClipLow, rightClipLow;
bamGetSoftClipping(leftBam, &leftClipLow, NULL, NULL);
bamGetSoftClipping(rightBam, &rightClipLow, NULL, NULL);
leftStart -= leftClipLow;
rightStart -= rightClipLow;
printf("<B>Note: End read alignments overlap:</B><BR>\n<PRE><TT>");

/* Only the read that starts later gets a positive pad. */
int pad;
for (pad = leftStart - rightStart;  pad > 0;  --pad)
    putc(' ', stdout);
puts(leftSeq);
for (pad = rightStart - leftStart;  pad > 0;  --pad)
    putc(' ', stdout);
puts(rightSeq);
puts("</TT></PRE>");
}
void loadIfNewSeq(char *nibDir, char *newName, char strand, 
	char **pName, struct dnaSeq **pSeq, char *pStrand)
/* Make *pSeq hold sequence newName on the requested strand.  When newName
 * is already the current *pName the sequence is only reverse complemented
 * in place if the strand differs.  Otherwise the old sequence is freed and
 * nibDir/newName.nib is loaded, then reverse complemented for '-'.
 * *pName and *pStrand are updated to describe what *pSeq now holds. */
{
if (!sameString(newName, *pName))
    {
    char fileName[512];
    struct dnaSeq *loaded;

    freeDnaSeq(pSeq);
    snprintf(fileName, sizeof(fileName), "%s/%s.nib", nibDir, newName);
    *pName = newName;
    loaded = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
    *pSeq = loaded;
    *pStrand = strand;
    if (strand == '-')
        reverseComplement(loaded->dna, loaded->size);
    uglyf("Loaded %d bases in %s\n", loaded->size, fileName);
    return;
    }

/* Same sequence as last time: flip it only if the strand changed. */
if (strand != *pStrand)
    {
    struct dnaSeq *current = *pSeq;
    reverseComplement(current->dna, current->size);
    *pStrand = strand;
    }
}
void loadFaSeq(struct hash *faHash, char *newName, char strand, 
	char **pName, struct dnaSeq **pSeq, char *pStrand)
/* retrieve sequence from hash.  Reverse complement
 * if necessary. */
{
struct dnaSeq *seq;
if (sameString(newName, *pName))
    {
    if (strand != *pStrand)
        {
	seq = *pSeq;
	reverseComplement(seq->dna, seq->size);
	*pStrand = strand;
	}
    }
else
    {
    *pName = newName;
    *pSeq = seq = hashFindVal(faHash, newName);
    *pStrand = strand;
    if (strand == '-')
        reverseComplement(seq->dna, seq->size);
    verbose(1, "Loaded %d bases from %s fa\n", seq->size, newName);
    }
}
static void loadFaSeq(struct hash *faHash, char *newName, char strand,
	char **pName, struct dnaSeq **pSeq, char *pStrand, char *fastaFileName)
/* Point *pSeq at sequence newName from faHash, on the requested strand.
 * If newName is already current, the sequence is reverse complemented in
 * place only when the strand changes.  Otherwise it is looked up in faHash
 * (aborting with a message naming fastaFileName if absent) and reverse
 * complemented for '-'.  *pName and *pStrand track what *pSeq holds. */
{
if (!sameString(newName, *pName))
    {
    struct dnaSeq *found;

    *pName = newName;
    found = hashFindVal(faHash, newName);
    *pSeq = found;
    if (found == NULL)
        errAbort("ERROR: can not find sequence name '%s' from fasta file '%s'\n", newName, fastaFileName);
    *pStrand = strand;
    if (strand == '-')
        reverseComplement(found->dna, found->size);
    verbose(1, "Loaded %d bases from %s fa\n", found->size, newName);
    return;
    }

/* Same sequence as before: flip only on a strand change. */
if (strand != *pStrand)
    {
    struct dnaSeq *current = *pSeq;
    reverseComplement(current->dna, current->size);
    *pStrand = strand;
    }
}
Example #6
0
static void makeProfile(DNA *oligo, int oligoSize, int mismatchesAllowed, 
    struct seqList *seqList, boolean considerRc, double profile[16][4])
/* Scan every sequence in seqList for windows of oligoSize bases that match
 * oligo with at most mismatchesAllowed mismatches and, when considerRc is
 * set, windows that match the reverse complement of oligo (those windows are
 * tallied in reverse-complemented orientation).  A window matching both ways
 * is tallied twice.  On return profile[i][j] is the fraction of tallied
 * windows whose base at position i has ntVal code j, for i < oligoSize.
 * If no window matches, those rows are set to zero rather than to NaN.
 * oligoSize must not exceed 16. */
{
int counts[16][4];
int total = 0;
double invTotal;
int i,j;
struct seqList *seqEl;
DNA rcOligo[17];

/* counts[] and profile[] have 16 rows; a longer oligo would overrun them. */
assert(oligoSize >= 0 && oligoSize <= 16);

if (considerRc)
    {
    memcpy(rcOligo, oligo, oligoSize);
    reverseComplement(rcOligo, oligoSize);
    }

zeroBytes(counts, sizeof(counts));
for (seqEl = seqList; seqEl != NULL; seqEl = seqEl->next)
    {
    struct dnaSeq *seq = seqEl->seq;
    DNA *dna = seq->dna;
    int endIx = seq->size - oligoSize;

    for (i=0; i<=endIx; ++i)
        {
        DNA *target = dna+i;
        if (!allGoodBases(target, oligoSize))
            continue;
        if (mismatchCount(oligo, target, oligoSize) <= mismatchesAllowed)
            {
            ++total;
            for (j=0; j<oligoSize; ++j)
                counts[j][ntVal[(int)target[j]]] += 1;
            }
        if (considerRc && mismatchCount(rcOligo, target, oligoSize) <= mismatchesAllowed)
            {
            ++total;
            /* Flip the window in place to count it in oligo orientation,
             * then flip it back so the sequence is left unchanged. */
            reverseComplement(target, oligoSize);
            for (j=0; j<oligoSize; ++j)
                counts[j][ntVal[(int)target[j]]] += 1;
            reverseComplement(target, oligoSize);
            }
        }
    }

/* With no matches all counts are zero; avoid 1.0/0 turning them into NaN. */
invTotal = (total > 0 ? 1.0/total : 0.0);
for (i=0; i<oligoSize; ++i)
    {
    for (j=0; j<4; ++j)
        profile[i][j] = invTotal * counts[i][j];
    }
}
Example #7
0
struct dnaSeq *gfiExpandAndLoadCached(struct gfRange *range, 
	struct hash *tFileCache, char *tSeqDir, int querySize, 
	int *retTotalSeqSize, boolean respectFrame, boolean isRc, int expansion)
/* Expand range to cover an additional expansion bases on either side.
 * Load up target sequence and return. (Done together because don't
 * know target size before loading.)
 *   range           - range->tName names the target relative to tSeqDir:
 *                     either a nib file, or "file:seqName" for a .2bit file.
 *                     range->tStart/tEnd are updated by the expansion.
 *   tFileCache      - hash keyed by file name holding the nibInfo or
 *                     twoBitFile opened for it; entries are added here on
 *                     first use and are not closed by this function.
 *   retTotalSeqSize - receives the full size of the target sequence.
 *   isRc            - when set, the range is passed through reverseIntRange
 *                     before expansion and again after loading, and the
 *                     loaded DNA is reverse complemented.
 * Returns the loaded fragment covering the expanded range. */
{
struct dnaSeq *target = NULL;
char fileName[PATH_LEN+256];

safef(fileName, sizeof(fileName), "%s/%s", tSeqDir, range->tName);
if (nibIsFile(fileName))
    {
    /* Reuse the cached nib handle for this file, opening it on first use. */
    struct nibInfo *nib = hashFindVal(tFileCache, fileName);
    if (nib == NULL)
        {
	nib = nibInfoNew(fileName);
	hashAdd(tFileCache, fileName, nib);
	}
    if (isRc)
	reverseIntRange(&range->tStart, &range->tEnd, nib->size);
    gfiExpandRange(range, querySize, nib->size, respectFrame, isRc, expansion);
    target = nibLdPart(fileName, nib->f, nib->size, 
    	range->tStart, range->tEnd - range->tStart);
    if (isRc)
	{
	reverseComplement(target->dna, target->size);
	/* Second flip, applied to the now-expanded range. */
	reverseIntRange(&range->tStart, &range->tEnd, nib->size);
	}
    *retTotalSeqSize = nib->size;
    }
else
    {
    struct twoBitFile *tbf = NULL;
    char *tSeqName = strchr(fileName, ':');
    int tSeqSize = 0;
    if (tSeqName == NULL)
        errAbort("No colon in .2bit response from gfServer");
    /* Split fileName in place at the colon: what is left in fileName is the
     * .2bit path, and tSeqName points at the sequence name after it. */
    *tSeqName++ = 0;
    /* Reuse the cached .2bit handle for this file, opening it on first use. */
    tbf = hashFindVal(tFileCache, fileName);
    if (tbf == NULL)
        {
	tbf = twoBitOpen(fileName);
	hashAdd(tFileCache, fileName, tbf);
	}
    tSeqSize = twoBitSeqSize(tbf, tSeqName);
    if (isRc)
	reverseIntRange(&range->tStart, &range->tEnd, tSeqSize);
    gfiExpandRange(range, querySize, tSeqSize, respectFrame, isRc, expansion);
    target = twoBitReadSeqFragLower(tbf, tSeqName, range->tStart, range->tEnd);
    if (isRc)
	{
	reverseComplement(target->dna, target->size);
	/* Second flip, applied to the now-expanded range. */
	reverseIntRange(&range->tStart, &range->tEnd, tSeqSize);
	}
    *retTotalSeqSize = tSeqSize;
    }
return target;
}
Example #8
0
void ReadsLayout::print(size_t index, ostream &out, bool dir, unsigned int start, unsigned int maxD, Pairing *P) {
    // Write one line per read of the layout that ends at 'index', restricted to
    // reads whose position lies in [start, maxD].  Each line holds a strand marker
    // ('>' or '<'), position % 120 spaces, the read text, then the read id, its
    // library id and the node id of its paired read.  When 'dir' is false the
    // layout is reverse complemented before printing and again afterwards.

    if (getNext(index) != 0) {
        cerr << "void ReadsLayout::print(size_t index) problem\n";
        sendBugReportPlease(cerr);
    }
    if (!dir)
        index = reverseComplement(index);

    size_t cur = getBegin(index);
    size_t visited;

    do {
        const unsigned int position = getPosition(cur);

        // Stop at the first read beyond the requested window.
        if (position > maxD)
            break;

        // Reads before the window are stepped over without output.
        if (position >= start) {
            unsigned int pairedRead = 0;
            unsigned int pairedNode = 0;
            int lib = 0;

            if (P->getNLibrary() != 0) {
                pairedRead = P->getPairing(cur);
                pairedNode = getNodeId(pairedRead);
                lib = P->getPeLibraryID(cur);
            }

            out << (getDirection(cur) ? '>' : '<');

            for (int i = 0; i < getPosition(cur) % 120; i++)
                out << " ";

            if (getDirection(cur))
                out << getDirectRead(cur) << " " << cur << ' ' << lib << ' ' << pairedNode << '\n';
            else
                out << getReverseRead(cur) << " " << cur << ' ' << lib << ' ' << pairedNode << '\n';
        }

        visited = cur;
        cur = getNext(cur);

    } while (visited != index);

    out << flush;

    if (!dir) //back to initial direction
        index = reverseComplement(index);
}
static void makeDirFasta(char *regionsFile, char *hg18FastaFile, char *dir, int num) {
	FILE *fp, *sq;
	char buf[500], dirName[500], seqName[500], chr1[500], chr2[500];
	int b1, e1, b2, e2, i, len;
	char ori1, ori2;
	struct hash *seqHash = NULL;
	struct dnaSeq *seq1, *seq2;
	struct stat st;
	DNA *s1, *s2;

	seqHash = faReadAllIntoHash(hg18FastaFile, dnaUpper);
	if (stat(dir, &st) != 0)
		do_cmd("mkdir %s", dir);

	fp = mustOpen(regionsFile, "r");
	i = 0;
	while (fgets(buf, 500, fp)) {
		if (sscanf(buf, "%[^:]:%d-%d %[^:]:%d-%d [%c %c]", chr1, &b1, &e1, chr2, &b2, &e2, &ori1, &ori2) != 8)
			errAbort("error: %s", buf);
		++i;
		if (i != num) 
			continue;
		sprintf(dirName, "%s/R%d", dir, i);
		if (stat(dirName, &st) != 0)
			do_cmd("mkdir %s", dir);
		sprintf(seqName, "%s/ref.fa", dirName);
		sq = mustOpen(seqName, "w");
		fprintf(sq, ">%s:%d-%d+%s:%d-%d[%c%c]\n", chr1, b1, e1, chr2, b2, e2, ori1, ori2);
		seq1 = (struct dnaSeq *)hashFindVal(seqHash, chr1);
		assert(e1 <= seq1->size);
		len = e1 - b1 + 1;
		if (ori1 == '-') {
			s1 = cloneStringZExt(seq1->dna + b1 - 1, len, len+1);
			reverseComplement(s1, len);
			writeSeqWithBreaks(sq, s1, len, 80);
			freeMem(s1);
		}
		else
			writeSeqWithBreaks(sq, seq1->dna + b1 - 1, e1 - b1 + 1, 80);
		seq2 = (struct dnaSeq *)hashFindVal(seqHash, chr2);
		assert(e2 <= seq2->size);
		len = e2 - b2 + 1;
		if (ori2 == '-') {
			s2 = cloneStringZExt(seq2->dna + b2 - 1, len, len+1);
			reverseComplement(s2, len);
			writeSeqWithBreaks(sq, s2, len, 80);
			freeMem(s2);
		}
		else
			writeSeqWithBreaks(sq, seq2->dna + b2 - 1, e2 - b2 + 1, 80);
		fclose(sq);
	}
	fclose(fp);
	//FIXME: free space
} 
Example #10
0
void CommonTest::test_reverseComplement() {
  // Upper-case input, including N characters.
  std::string input = "ACGATCGTGTCATGCNNACCACG";
  std::string expected = "CGTGGTNNGCATGACACGATCGT";
  std::string actual = reverseComplement(input);
  CPPUNIT_ASSERT_MESSAGE("Incorrect reverse complement", actual == expected);

  // Lower-case input; the expected result is upper case.
  input = "acgaccacagctacgacnacgactan";
  expected = "NTAGTCGTNGTCGTAGCTGTGGTCGT";
  actual = reverseComplement(input);
  CPPUNIT_ASSERT_MESSAGE("Incorrect reverse complement", actual == expected);
}
void showTargetRange(struct xaAli *xa, int tOff, int tLen, char strand, boolean showSym)
/* Print the part of alignment xa that covers tLen target bases starting at
 * target offset tOff, in rows of at most 50 columns: query line, a line with
 * '|' under case-insensitively equal columns, target line, and the hSym line.
 * For strand '-' the affected stretch of the three symbol arrays is flipped
 * in place before printing and flipped back afterwards.
 * The showSym argument is currently not consulted; the hSym line is always
 * written. */
{
char *hSym = xa->hSym, *qSym = xa->qSym, *tSym = xa->tSym;
int maxLineLen = 50;
int startIx, fullLen, endIx;
int pos;

/* Translate target coordinates into symbol indexes, accounting for
 * inserts (dashes) in the target line. */
startIx = lenWithDashes(tSym, tOff);
fullLen = lenWithDashes(tSym+startIx, tLen);
endIx = startIx + fullLen;

if (strand == '-')
    {
    reverseComplement(qSym+startIx, fullLen);
    reverseComplement(tSym+startIx, fullLen);
    reverseBytes(hSym+startIx, fullLen);
    }

pos = startIx;
while (pos < endIx)
    {
    int col;
    int lineLen = endIx - pos;
    if (lineLen > maxLineLen)
        lineLen = maxLineLen;

    mustWrite(stdout, qSym+pos, lineLen);
    fputc('\n', stdout);
    for (col = 0; col < lineLen; ++col)
        {
        if (toupper(qSym[pos+col]) == toupper(tSym[pos+col]))
            fputc('|', stdout);
        else
            fputc(' ', stdout);
        }
    fputc('\n', stdout);
    mustWrite(stdout, tSym+pos, lineLen);
    fputc('\n', stdout);
    //if (showSym)
        {
        mustWrite(stdout, hSym+pos, lineLen);
        fputc('\n', stdout);
        }
    fputc('\n', stdout);

    pos += lineLen;
    }

/* Undo the in-place flip so xa is left as it was found. */
if (strand == '-')
    {
    reverseComplement(qSym+startIx, fullLen);
    reverseComplement(tSym+startIx, fullLen);
    reverseBytes(hSym+startIx, fullLen);
    }
}
Example #12
0
bool HapgenUtil::makeFlankingHaplotypes(const HapgenAlignment& aln, 
                                        const ReadTable* pRefTable, 
                                        int flanking,
                                        const StringVector& inHaplotypes,
                                        StringVector& outFlankingHaplotypes,
                                        StringVector& outHaplotypes)
{
    // Pull the reference haplotype and its two flanks out of the reference.
    std::string upstream;
    std::string referenceHaplotype;
    std::string downstream;
    extractReferenceSubstrings(aln, pRefTable, flanking, upstream, referenceHaplotype, downstream);

    // Bring the reference strings onto the strand of the input haplotypes:
    // every piece is reverse complemented and the two flanks trade places.
    if(aln.isRC)
    {
        upstream = reverseComplement(upstream);
        referenceHaplotype = reverseComplement(referenceHaplotype);
        downstream = reverseComplement(downstream);
        downstream.swap(upstream);
    }

    // The reference always comes first in both output vectors.
    outFlankingHaplotypes.push_back(upstream + referenceHaplotype + downstream);
    outHaplotypes.push_back(referenceHaplotype);

    // Check that all sequences match the reference haplotype properly
    /*
    bool checkOk = checkAlignmentsAreConsistent(referenceFlanking, inHaplotypes);
    if(!checkOk)
    {
        outHaplotypes.clear();
        return false;
    }
    */

    // Append every input haplotype with the same flanks attached; inputs that
    // are identical to the reference haplotype are left out.
    for(size_t idx = 0; idx < inHaplotypes.size(); ++idx)
    {
        if(inHaplotypes[idx] == referenceHaplotype)
            continue;

        outFlankingHaplotypes.push_back(upstream + inHaplotypes[idx] + downstream);
        outHaplotypes.push_back(inHaplotypes[idx]);
    }

    return true;
}
Example #13
0
void NGSReadSet::processReadWhileParsing(NGSRead &tempread) {

    //if (!tempread.flag) return;

    // Reads flagged with direction == false are reverse complemented (both
    // the scaffold and the read string) before being converted to states.
    if (!tempread.direction) {
        reverseComplement(tempread.scaff);
        reverseComplement(tempread.read);
    }
    tempread.convertStateStr(tempread.scaff, SEQ_DNA);
    tempread.convertStateStr(tempread.read, SEQ_DNA);
    assert(tempread.scaff.length() == tempread.read.length());

    // Four states, plus one more unless ngs_ignore_gaps is set.
    const int nstates = 4 + (!ngs_ignore_gaps);

    // 'slot' counts only the columns that were kept, so skipped columns do
    // not leave holes in state_freq / pair_freq.
    int slot = 0;
    for (int col = 0; col < tempread.scaff.length(); col++) {
        const int scaffState = tempread.scaff[col];
        const int readState = tempread.read[col];
        if (scaffState >= nstates || readState >= nstates)
            continue;

        // Grow the per-slot state table on demand, zero-filled.
        while (slot >= state_freq.size()) {
            double *fresh = new double[nstates];
            memset(fresh, 0, sizeof(double)*(nstates));
            state_freq.push_back(fresh);
        }
        state_freq[slot][readState] += 1.0/tempread.times;

        // Same for the per-slot (scaffold state, read state) pair table.
        while (slot >= pair_freq.size()) {
            double *fresh = new double[(nstates) * (nstates)];
            memset(fresh, 0, sizeof(double)*(nstates) * (nstates));
            pair_freq.push_back(fresh);
        }
        pair_freq[slot][scaffState*(nstates) + readState] += 1.0/tempread.times;

        slot++;
    }

    if (!tree)
        return;

    // With a tree available, fit a distance twice (first with the set's
    // homo_rate, then with homo_rate 0 starting from the first result) and
    // store both fits for this read.
    ReadInfo read_info;
    tempread.homo_rate = homo_rate;
    tempread.computePairFreq();
    read_info.homo_distance = tempread.optimizeDist(1.0-tempread.identity);
    read_info.homo_logl = -tempread.computeFunction(read_info.homo_distance);
    tempread.homo_rate = 0.0;
    read_info.distance = tempread.optimizeDist(read_info.homo_distance);
    read_info.logl = -tempread.computeFunction(read_info.distance);
    read_info.id = tempread.id;
    read_info.identity = tempread.identity;
    push_back(read_info);
}
Example #14
0
boolean fastFind(DNA *needle, int needleSize, 
    struct patSpace *ps, struct ffAli **retAli, boolean *retRc, int *retScore)
/* Align needle against the sequences indexed by ps, trying the needle as
 * given and then reverse complemented.  Every clump reported by the pattern
 * space is aligned with ffFind; the alignment that sorts first under
 * cmpAliList is returned in *retAli together with its orientation (*retRc)
 * and score (*retScore), and all other alignments are freed.
 * Returns FALSE, leaving the outputs untouched, when nothing aligned.
 * needle is flipped in place during the second pass and restored after. */
{
struct aliList *hits = NULL, *hit;
boolean isRc;

for (isRc = 0; isRc <= 1; ++isRc)
    {
    struct patClump *clumpList, *clump;

    if (isRc)
        reverseComplement(needle, needleSize);

    clumpList = patSpaceFindOne(ps, needle, needleSize);
    if (clumpList != NULL)
        {
        for (clump = clumpList; clump != NULL; clump = clump->next)
            {
            DNA *hayStart = clump->seq->dna + clump->start;
            struct ffAli *ffAli = ffFind(needle, needle+needleSize, 
                hayStart, hayStart+clump->size, ffCdna);
            if (ffAli == NULL)
                continue;
            AllocVar(hit);
            hit->ali = ffAli;
            hit->score = ffScoreCdna(ffAli);
            hit->isRc = isRc;
            slAddHead(&hits, hit);
            }
        slFreeList(&clumpList);
        }

    if (isRc)
        reverseComplement(needle, needleSize);
    }

if (hits == NULL)
    return FALSE;

slSort(&hits, cmpAliList);
*retAli = hits->ali;
hits->ali = NULL;      /* ownership of the winning alignment passes to the caller */
*retRc = hits->isRc;
*retScore = hits->score;
for (hit = hits->next; hit != NULL; hit = hit->next)
    ffFreeAli(&hit->ali);
slFreeList(&hits);
return TRUE;
}
Example #15
0
// Merging ARBRC: R and B into RBR
// The sequence of this vertex is extended by the label of the edge
// followed (or, for antisense edges, preceded) by the label of its twin.
// A twin label on the reverse strand is reverse complemented first.
void Vertex::mergeTipVertex(Edge* pEdge)
{
    Edge* pTwin = pEdge->getTwin();

    DNAEncodedString edgeLabel = pEdge->getLabel();
    DNAEncodedString twinLabel = pTwin->getLabel();
    size_t RB_len = edgeLabel.length()+twinLabel.length();

    const bool isSense = (pEdge->getDir() == ED_SENSE);
    const bool sameStrand = (pTwin->getComp() == EC_SAME);

    // Twin label in the orientation of this vertex.
    DNAEncodedString twinPart = twinLabel;
    if(!sameStrand)
        twinPart = DNAEncodedString(reverseComplement(twinLabel.toString()));

    if(isSense)
    {
        // Extend to the right: seq + edge label + twin label
        m_seq.append(edgeLabel);
        m_seq.append(twinPart);
    }
    else
    {
        // Extend to the left: twin label + edge label + seq
        twinPart.append(edgeLabel);
        twinPart.append(m_seq);
        m_seq=twinPart;
    }

    // Every edge of this vertex needs the new sequence length.  Sense edges
    // other than the merged one additionally get their match coordinates
    // shifted by the combined length of the two labels.
    size_t newLen = m_seq.length();
    for(EdgePtrVecIter iter = m_edges.begin(); iter != m_edges.end(); ++iter)
    {
        Edge* pCurrent = *iter;
        pCurrent->updateSeqLen(newLen);
        if(pCurrent != pEdge && pCurrent->getDir() == ED_SENSE)
            pCurrent->offsetMatch(RB_len);
    }
}
Example #16
0
void correctEst(struct psl *psl, struct dnaSeq *est, struct dnaSeq *geno)
/* Overwrite the EST bases inside every aligned block of psl with the genome
 * bases they align to.  For a '-' strand alignment the EST is reverse
 * complemented for the duration of the copy and restored afterwards. */
{
int blockTotal = psl->blockCount;
int isMinus = (psl->strand[0] == '-');
int blockIx = 0;

if (isMinus)
    reverseComplement(est->dna, est->size);

while (blockIx < blockTotal)
    {
    memcpy(est->dna + psl->qStarts[blockIx],
        geno->dna + psl->tStarts[blockIx],
        psl->blockSizes[blockIx]);
    ++blockIx;
    }

if (isMinus)
    reverseComplement(est->dna, est->size);
}
Example #17
0
// Print up to two tab-separated rows for one position: a '+' row when any
// upper-case base was counted and a '-' row when any lower-case base was.
// Columns: transcript, position, position+1, reference base (reverse
// complemented on the '-' row), coverage, strand, counts in the order
// A C T G (a c t g on the '-' row), insertions, deletions.
void printTable(string transcriptID, string mispos, string ref, int cov,
				int insertion, int deletion, baseCounter counter)
{
    const int forwardTotal = counter["A"] + counter["C"] + counter["G"] + counter["T"];
    const int reverseTotal = counter["a"] + counter["c"] + counter["g"] + counter["t"];
    const int endPos = atoi(mispos.c_str()) + 1;

    if (forwardTotal > 0) {
        cout << transcriptID << "\t" << mispos << "\t" << endPos << '\t'
             << ref << "\t" << cov << "\t" << '+' << "\t"
             << counter["A"] << "\t" << counter["C"] << "\t"
             << counter["T"] << "\t" << counter["G"] << "\t"
             << insertion << "\t" << deletion << '\n';
    }

    if (reverseTotal > 0) {
        cout << transcriptID << "\t" << mispos << "\t" << endPos << '\t'
             << reverseComplement(ref) << "\t" << cov << "\t" << '-' << "\t"
             << counter["a"] << "\t" << counter["c"] << "\t"
             << counter["t"] << "\t" << counter["g"] << "\t"
             << insertion << "\t" << deletion << '\n';
    }
}
Example #18
0
void BamToFastq::SingleFastq() {
    // open the 1st fastq file for writing
    ofstream fq(_fastq1.c_str(), ios::out);
    if ( !fq ) {
        cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened.  Exiting!" << endl;
        exit (1);
    }
    // open the BAM file
    BamReader reader;
    reader.Open(_bamFile);
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // extract the sequence and qualities for the BAM "query"
        string seq  = bam.QueryBases;
        string qual = bam.Qualities;
        if (bam.IsReverseStrand() == true) {
            reverseComplement(seq);
            reverseSequence(qual);
        }
        fq << "@" << bam.Name << endl;
        fq << seq << endl;
        fq << "+" << endl;
        fq << qual << endl;
    }
}
Example #19
0
struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed)
/* Return the sequence covered by bed, read from tbf and named bed->name.
 * A bed with more than one block yields its blocks concatenated (so the
 * gaps between blocks are left out); otherwise the whole
 * chromStart..chromEnd span is read.  The result is reverse complemented
 * when the bed is on the '-' strand. */
{
struct dnaSeq *seq;

if (bed->blockCount > 1)
    {
    int totalBlockSize = bedTotalBlockSize(bed);
    int filled = 0;
    int blockIx;

    AllocVar(seq);
    seq->name = cloneString(bed->name);
    seq->dna = needMem(totalBlockSize+1);
    seq->size = totalBlockSize;
    for (blockIx = 0; blockIx < bed->blockCount; ++blockIx)
        {
        int size = bed->blockSizes[blockIx];
        int start = bed->chromStart + bed->chromStarts[blockIx];
        struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, start, start+size);
        memcpy(seq->dna + filled, exon->dna, size);
        filled += size;
        dnaSeqFree(&exon);
        }
    }
else
    {
    seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd);
    /* Replace the name the reader assigned with the bed's own name. */
    freeMem(seq->name);
    seq->name = cloneString(bed->name);
    }

if (bed->strand[0] == '-')
    reverseComplement(seq->dna, seq->size);
return seq;
}
void foldPslIntoStats(struct psl *psl, struct dnaSeq *tSeq,
                      struct hash *otherHash, struct stats *stats)
/* Load the query fragment psl->qStart..qEnd of psl->qName from otherHash and,
 * for each aligned block, add the block size to stats->bedBaseAli and the
 * number of matching bases against tSeq to stats->bedBaseMatch.  Nothing is
 * added when either sequence is missing.  Aborts on a '-' target strand. */
{
    struct dnaSeq *qSeq = loadSomeSeq(otherHash,
                                      psl->qName, psl->qStart, psl->qEnd);
    int blockTotal = psl->blockCount;
    int blockIx;
    int qOffset;

    if (qSeq == NULL || tSeq == NULL)
    {
        freeDnaSeq(&qSeq);
        return;
    }

    /* qOffset converts psl query coordinates into offsets within the loaded
     * fragment; on the '-' strand the fragment is flipped and coordinates
     * count from the other end of the query. */
    if (psl->strand[0] == '-')
    {
        reverseComplement(qSeq->dna, qSeq->size);
        qOffset = psl->qSize - psl->qEnd;
    }
    else
        qOffset = psl->qStart;

    if (psl->strand[1] == '-')
        errAbort("Can't yet handle reverse complemented targets");

    for (blockIx = 0; blockIx < blockTotal; ++blockIx)
    {
        int size = psl->blockSizes[blockIx];
        stats->bedBaseAli += size;
        stats->bedBaseMatch += baseMatch(qSeq->dna + psl->qStarts[blockIx] - qOffset,
                                         tSeq->dna + psl->tStarts[blockIx],  size);
    }

    freeDnaSeq(&qSeq);
}
Example #21
0
// Align the haplotype, and then its reverse complement, to the reference
// genome represented by the BWT/SSA pair.  Every hit that spans the whole
// haplotype is appended to outAlignments, flagged with the orientation of
// the query that produced it.
void HapgenUtil::alignHaplotypeToReferenceBWASW(const std::string& haplotype,
        const BWTIndexSet& referenceIndex,
        HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceBWASW")
    LRAlignment::LRParams params;

    params.zBest = 20;

    const int haplotypeLength = (int)haplotype.length();

    for(int pass = 0; pass < 2; ++pass)
    {
        // Second pass queries the reverse complement
        const bool isRC = (pass == 1);

        LRAlignment::LRHitVector hits;
        std::string query = isRC ? reverseComplement(haplotype) : haplotype;
        LRAlignment::bwaswAlignment(query, referenceIndex.pBWT, referenceIndex.pSSA, params, hits);

        // Convert the hits into alignments
        for(size_t hitIdx = 0; hitIdx < hits.size(); ++hitIdx)
        {
            // Skip non-complete alignments
            if(hits[hitIdx].q_end - hits[hitIdx].q_start != haplotypeLength)
                continue;

            HapgenAlignment aln(hits[hitIdx].targetID, hits[hitIdx].t_start, hits[hitIdx].length, hits[hitIdx].G, isRC);
            outAlignments.push_back(aln);
        }
    }
}
boolean sameStickyEnd(struct cutter *enz1, struct cutter *enz2)
/* Return TRUE if the two enzymes leave the same sticky end, either as is or
 * after reverse complementing one of them.  Returns FALSE when either enzyme
 * has no sticky end, when the ends differ in length, or when either end
 * contains anything other than ACGT. */
{
boolean ret = FALSE;
struct dnaSeq *sticky1 = stickyEnd(enz1);
struct dnaSeq *sticky2 = stickyEnd(enz2);
boolean comparable = (sticky1 && sticky2 && (sticky1->size == sticky2->size) &&
    (acgtCount(sticky1->dna) == sticky1->size) && (acgtCount(sticky2->dna) == sticky2->size));

if (comparable)
    {
    if (sameString(sticky1->dna, sticky2->dna))
        ret = TRUE;
    else
        {
        /* sticky2 is a scratch copy freed below, so it can be flipped in place. */
        reverseComplement(sticky2->dna, sticky2->size);
        if (sameString(sticky1->dna, sticky2->dna))
            ret = TRUE;
        }
    }

freeDnaSeq(&sticky1);
freeDnaSeq(&sticky2);
return ret;
}
Example #23
0
struct dnaSeq *genePredToGenomicSequence(struct genePred *pred, char *chromSeq, struct lm *lm)
/* Return the exons of pred copied out of chromSeq and joined end to end,
 * reverse complemented when pred is on the '-' strand.  The sequence, its
 * name and the dnaSeq itself are all allocated from lm. */
{
int exonIx;
int txLen = 0;

/* First pass: total exon length, to size the buffer. */
for (exonIx = 0; exonIx < pred->exonCount; exonIx++)
    txLen += (pred->exonEnds[exonIx] - pred->exonStarts[exonIx]);

char *seq = lmAlloc(lm, txLen + 1);
char *writePos = seq;

/* Second pass: copy each exon behind the previous one. */
for (exonIx = 0; exonIx < pred->exonCount; exonIx++)
    {
    int exonStart = pred->exonStarts[exonIx];
    int exonSize = pred->exonEnds[exonIx] - exonStart;
    memcpy(writePos, chromSeq+exonStart, exonSize*sizeof(*seq));
    writePos += exonSize;
    }

if (pred->strand[0] == '-')
    reverseComplement(seq, txLen);

struct dnaSeq *txSeq = NULL;
lmAllocVar(lm, txSeq);
txSeq->name = lmCloneString(lm, pred->name);
txSeq->dna = seq;
txSeq->size = txLen;
return txSeq;
}
Example #24
0
static char *gpFxModifyCodingSequence(char *oldCodingSeq, struct genePred *pred,
				      int startInCds, int endInCds, struct allele *allele,
				      int *retCdsBasesAdded, struct lm *lm)
/* Return a copy of oldCodingSeq in which the bases startInCds..endInCds are
 * replaced by the allele's sequence, cut off at the first stop codon.
 * An allele sequence that is not all nucleotides is treated as empty.  For a
 * gene on the '-' strand the allele is reverse complemented first (on a copy
 * allocated from lm).  If retCdsBasesAdded is non-NULL it receives
 * allele->length minus the size of the variant on the reference.
 * Aborts if endInCds < startInCds. */
{
char *alleleSeq = allele->sequence;
int alleleLen = strlen(alleleSeq);

if (! isAllNt(alleleSeq, alleleLen))
    {
    // symbolic -- may be deletion or insertion, but we can't tell. :(
    alleleSeq = "";
    alleleLen = 0;
    }

/* Only a non-empty allele on a '-' strand gene needs flipping. */
if (alleleLen > 0 && pred->strand[0] == '-')
    {
    alleleSeq = lmCloneString(lm, alleleSeq);
    reverseComplement(alleleSeq, alleleLen);
    }

int replacedLen = endInCds - startInCds;
if (replacedLen < 0)
    errAbort("gpFx: endInCds (%d) < startInCds (%d)", endInCds, startInCds);

char *newCodingSeq = mergeAllele(oldCodingSeq, startInCds, replacedLen,
				 alleleSeq, alleleLen, lm);
// If newCodingSequence has an early stop, truncate there:
truncateAtStopCodon(newCodingSeq);

if (retCdsBasesAdded)
    {
    int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart;
    *retCdsBasesAdded = allele->length - variantSizeOnRef;
    }
return newCodingSeq;
}
Example #25
0
/**********************************************************************************************************************
	Search a read in the dataset using binary search.

	The lookup key is whichever of the read and its reverse complement compares smaller; that key is
	compared against getStringForward() of the stored reads.
	NOTE(review): this presumes 'reads' is sorted by getStringForward() -- confirm against the code that builds it.

	Returns the matching Read. If no stored read matches, MYEXIT is invoked with an error message.
**********************************************************************************************************************/
Read * Dataset::getReadFromString(const string & read)
{
	string readReverse = reverseComplement(read);
	const string & key = (read.compare(readReverse) < 0) ? read : readReverse;

	// Half-open interval [low, high). The previous closed interval computed "mid - 1" and
	// "count - 1" on unsigned values, which wrapped around when mid or the count was 0.
	UINT64 low = 0, high = getNumberOfUniqueReads();
	while (low < high)
	{
		UINT64 mid = low + (high - low) / 2; 	// Written this way so that low + high cannot overflow.
		int comparator = reads->at(mid)->getStringForward().compare(key.c_str());
		if(comparator == 0)
			return reads->at(mid);
		if(comparator < 0) 	// Stored read sorts before the key: search the upper part.
			low = mid + 1;
		else 				// Stored read sorts after the key: search the lower part.
			high = mid;
	}
	MYEXIT("String not found in Dataset: "+read);
	return NULL; 	// Only reached if MYEXIT returns.
}
// Sanity check for an edge: the matched string on this edge and the matched
// string on its twin (reverse complemented when the edge joins opposite
// strands) must have the same length.  On a mismatch the details are written
// to stderr and an assertion fails.
void Edge::validate() const
{
    const Edge* pTwin = getTwin();
    std::string ownMatch = getMatchStr();
    std::string twinMatch = pTwin->getMatchStr();

    if(getComp() == EC_REVERSE)
        twinMatch = reverseComplement(twinMatch);

    if(ownMatch.length() == twinMatch.length())
        return;

    std::cerr << "Error, matching strings are not the same length\n";
    std::cerr << "V1M: " << ownMatch << "\n";
    std::cerr << "V2M: " << twinMatch << "\n";
    std::cerr << "V1MC: " << getMatchCoord() << "\n";
    std::cerr << "V2MC: " << pTwin->getMatchCoord() << "\n";
    std::cerr << "V1: " << getStart()->getSeq() << "\n";
    std::cerr << "Validation failed for edge " << *this << "\n";
    assert(false);
}
Example #27
0
// Consistency check for one k-mer graph node: for each of the four possible
// bases on the left and on the right, asserts that the neighbour pointer and
// its count agree with each other, that the neighbour's k-mer is the one
// obtained by extending this node's k-mer (or its reverse complement) with
// that base, and that the neighbour links back to this node with the same
// count.
// NOTE: the function currently asserts false on entry (see the TODO text in
// the first assert), so with assertions enabled nothing below it runs.
void
verify_node_orig(kg_node_t * node, unsigned kmer_length) {
    assert( false && "TODO FIX! REVERSED KMER ENDIANNESS" );
    // Two bits per base.
    int double_kmer_length = kmer_length << 1;
#ifdef LARGE_KMERS
    Kmer mask;
    mask.createMask(double_kmer_length);
#else
    // Low double_kmer_length bits set.
    Kmer mask = (Kmer(1) << double_kmer_length) - 1;
#endif
    Kmer kmer = node->kmer;
    Kmer rc_kmer = reverseComplement(kmer, kmer_length);
    // Base codes stored in the top and bottom two bits of the k-mer.
    char leftmost_base = (kmer >> (double_kmer_length - 2)) & 0x3;
    char rightmost_base = kmer & 0x3;

    for (int i = 0 ; i < 4 ; ++ i) {
        // check on the left side
        kg_node_t * node2 = node->left[i];
        int count = node->left_count[i];

        if (node2) {
            assert (count != 0);
            if (count > 0) {
                // Positive count: neighbour is checked against this k-mer with
                // base i prepended, and must point back through its right side.
                Kmer kmer2 = KMER_PREPEND(kmer, i, double_kmer_length, mask);
                assert(kmer2 == node2->kmer);
                assert(node2->right[(int)rightmost_base] == node);
                assert(node2->right_count[(int)rightmost_base] == count);
            } else {
                // Negative count: neighbour is checked against the reverse
                // complement with base (i ^ 0x3) appended, and must point back
                // through its left side (appears to mark a link that crosses
                // strands -- TODO confirm).
                Kmer kmer2 = KMER_APPEND(rc_kmer, i ^ 0x3, double_kmer_length, mask);
                assert(kmer2 == node2->kmer);
                assert(node2->left[rightmost_base ^ 0x3] == node);
                assert(node2->left_count[rightmost_base ^ 0x3] == count);
            }
        } else {
            // No neighbour means the count must be zero.
            assert (count == 0);
        }


        // check on the right side
        node2 = node->right[i];
        count = node->right_count[i];

        if (node2) {
            assert (count != 0);
            if (count > 0) {
                // Mirror of the left-side case: append base i, back-link
                // through the neighbour's left side.
                Kmer kmer2 = KMER_APPEND(kmer, i, double_kmer_length, mask);
                assert(kmer2 == node2->kmer);
                assert(node2->left[(int)leftmost_base] == node);
                assert(node2->left_count[(int)leftmost_base] == count);
            } else {
                // Mirror of the left-side negative case: prepend (i ^ 0x3) to
                // the reverse complement, back-link through the right side.
                Kmer kmer2 = KMER_PREPEND(rc_kmer, i ^ 0x3, double_kmer_length, mask);
                assert(kmer2 == node2->kmer);
                assert(node2->right[leftmost_base ^ 0x3] == node);
                assert(node2->right_count[leftmost_base ^ 0x3] == count);
            }
        } else {
            // No neighbour means the count must be zero.
            assert (count == 0);
        }
    }
}
Example #28
0
std::string SGPairedAlgorithms::pathToString(const Vertex* pX, const Path& path)
{
    std::string out = pX->getSeq().toString();
    EdgeComp currComp = EC_SAME;

    for(size_t i = 0; i < path.size(); ++i)
    {
        Edge* pYZ = path[i];
        EdgeComp ecYZ = pYZ->getComp();

        // Calculate the next comp, between X and Z
        EdgeComp ecXZ;
        if(ecYZ == EC_SAME)
            ecXZ = currComp;
        else
            ecXZ = !currComp;
        
        std::string edge_str = pYZ->getLabel();
        assert(edge_str.size() != 0);
        if(currComp == EC_REVERSE)
            edge_str = reverseComplement(edge_str);

        out.append(edge_str);
        currComp = ecXZ;
    }
    return out;
}
Example #29
0
// Count how detectable a single-base change is with kmers.
//
// The base at sequence[base_idx] is temporarily replaced by new_base, every
// kmer of length opt::kmer that covers that position is looked up in
// ref_index (both the kmer and its reverse complement), and the original
// base is restored before returning.
//
// ref_index - index the kmers are counted against
// sequence  - sequence to test; modified in place during the call and
//             restored to its original contents on return
// base_idx  - position of the base to change; must be < sequence.length()
// new_base  - replacement base
//
// Returns a KmerCounts where `total` is the number of kmers examined and
// `zero` is the number of those with no occurrence on either strand.
// A sequence shorter than opt::kmer contains no kmers, so both are zero.
KmerCounts computeChangeCounts(const BWTIndexSet& ref_index, std::string& sequence, size_t base_idx, char new_base)
{
    KmerCounts counts;
    counts.total = 0;
    counts.zero = 0;

    const size_t k = opt::kmer;
    const size_t l = sequence.length();

    // Without this guard `l - k` below wraps around (unsigned arithmetic),
    // the loop runs past the end of the sequence and substr() throws,
    // leaving the caller's sequence modified.
    if(l < k)
        return counts;
    assert(base_idx < l);

    // Introduce the change
    const char old_base = sequence[base_idx];
    sequence[base_idx] = new_base;

    // Iterate over kmers covering this position: the first one ends at
    // base_idx, the last one starts at base_idx (both clamped to the sequence)
    const size_t start_k_idx = (base_idx + 1) > k ? base_idx + 1 - k : 0;
    const size_t end_k_idx = (base_idx + k) < l ? base_idx : l - k;
    assert(end_k_idx - start_k_idx <= k);

    for(size_t ki = start_k_idx; ki <= end_k_idx; ++ki) {
        std::string ks = sequence.substr(ki, k);
        size_t occ = BWTAlgorithms::countSequenceOccurrences(ks, ref_index) +
                     BWTAlgorithms::countSequenceOccurrences(reverseComplement(ks), ref_index);

        counts.total += 1;
        counts.zero += (occ == 0) ? 1 : 0;
    }

    // Reset the base
    sequence[base_idx] = old_base;
    return counts;
}
Example #30
0
static void simpleFillInSequence(char *seqDir, struct agpFrag *agpList,
    DNA *dna, int dnaSize)
/* Fill in DNA array with sequences from simple clones.
 * For each fragment in agpList the clone name is the fragment name with
 * everything from the first '_' chopped off.  The file seqDir/clone.fa is
 * read (it must hold exactly one sequence), the bases fragStart..fragEnd are
 * reverse-complemented when the strand is '-', and copied into dna at
 * chromStart.  Aborts via errAbort if a name or path does not fit the local
 * buffers, or if the fragment falls outside the clone or outside the
 * dnaSize-long destination. */
{
struct agpFrag *agp;
char underline = '_';

for (agp = agpList; agp != NULL; agp = agp->next)
    {
    char clone[128];
    char path[512];
    struct dnaSeq *seq;
    int size;
    int pathLen;
    /* Work on int copies so the range checks behave the same whether the
     * agp coordinates are declared signed or unsigned. */
    int fragStart = agp->fragStart;
    int fragEnd = agp->fragEnd;
    int chromStart = agp->chromStart;

    /* Refuse names that would overflow the fixed-size clone buffer. */
    if (strlen(agp->frag) >= sizeof(clone))
	errAbort("Fragment name %s is too long.", agp->frag);
    strcpy(clone, agp->frag);
    chopSuffixAt(clone,underline);
    pathLen = snprintf(path, sizeof(path), "%s/%s.fa", seqDir, clone);
    if (pathLen < 0 || pathLen >= (int)sizeof(path))
	errAbort("Path to clone %s in %s is too long.", clone, seqDir);
    seq = faReadAllDna(path);
    if (slCount(seq) != 1)
	errAbort("Can only handle exactly one clone in %s.", path);
    size = fragEnd - fragStart;
    /* The fragment must lie inside the sequence that was just loaded. */
    if (size < 0 || fragStart < 0 || fragEnd > seq->size)
	errAbort("Fragment %s (%d-%d) is outside of the %d bases in %s.",
	    agp->frag, fragStart, fragEnd, seq->size, path);
    /* The copy must lie inside the destination array. */
    if (chromStart < 0 || size > dnaSize || chromStart > dnaSize - size)
	errAbort("Fragment %s (%d bases at %d) does not fit in %d bases of output.",
	    agp->frag, size, chromStart, dnaSize);
    if (agp->strand[0] == '-')
	reverseComplement(seq->dna + fragStart, size);
    memcpy(dna + chromStart, seq->dna + fragStart, size);
    freeDnaSeq(&seq);
    }
}