void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f) /* Output one Ra record to file. */ { fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end); fprintf(f, "txName %s\n", seq->name); fprintf(f, "txSize %d\n", seq->size); fprintf(f, "cdsStart %d\n", start); fprintf(f, "cdsEnd %d\n", end); fprintf(f, "cdsSize %d\n", end-start); fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start)); fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3)); boolean gotKozak1 = FALSE; if (start >= 3) { char c = seq->dna[start-3]; gotKozak1 = (c == 'a' || c == 'g'); } fprintf(f, "gotKozak1 %d\n", gotKozak1); boolean gotKozak2 = FALSE; if (start+3 < seq->size) gotKozak2 = (seq->dna[start+3] == 'g'); fprintf(f, "gotKozak2 %d\n", gotKozak2); fprintf(f, "gotKozak %d\n", gotKozak1 + gotKozak2); /* Count up upstream ATG and Kozak */ struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew(); int upAtg = 0, upKozak = 0; int i; for (i=0; i<start; ++i) { if (startsWith("atg", seq->dna + i)) { int orfEnd = findOrfEnd(seq, i); if (orfEnd < start) rangeTreeAdd(upAtgRanges, i, orfEnd); ++upAtg; if (isKozak(seq->dna, seq->size, i)) { ++upKozak; if (orfEnd < start) rangeTreeAdd(upKozakRanges, i, orfEnd); } } } fprintf(f, "upstreamAtgCount %d\n", upAtg); fprintf(f, "upstreamKozakCount %d\n", upKozak); fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start)); fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start)); fprintf(f, "\n"); /* Cluen up and go home. */ rangeTreeFree(&upAtgRanges); rangeTreeFree(&upKozakRanges); }
int orfEndInSeq(struct dnaSeq *seq, int start)
/* Return the end of the ORF that begins at offset start in seq.  This is a
 * convenience wrapper that hands the sequence's dna and size fields to
 * findOrfEnd. */
{
    int dnaSize = seq->size;
    int orfEnd = findOrfEnd(seq->dna, dnaSize, start);
    return orfEnd;
}
void fillInArrayFromPair(struct lm *lm, struct mafComp *native, struct mafComp *xeno, struct orthoCds *array, int arraySize, int symCount) /* Figure out the CDS in xeno for each position in native. */ { char *nText = native->text, *xText = xeno->text; int nSize = arraySize, xSize = symCount - countChars(xText, '-'); /* Create an array that for each point in native gives you the index of corresponding * point in xeno, and another array that does the opposite. */ int *nToX, *xToN; lmAllocArray(lm, nToX, nSize+1); lmAllocArray(lm, xToN, xSize+1); int i; int nIx = 0, xIx = 0; for (i=0; i<symCount; ++i) { char n = nText[i], x = xText[i]; if (n == '.') errAbort("Dot in native component %s of maf. Can't handle it.", native->src); nToX[nIx] = xIx; xToN[xIx] = nIx; if (n != '-') { array[nIx].base = x; nToX[nIx] = xIx; ++nIx; } if (x != '-') ++xIx; } assert(xIx == xSize); assert(nIx == nSize); /* Put an extra value at end of arrays to simplify logic. */ nToX[nSize] = xSize; xToN[xSize] = nSize; /* Create xeno sequence without the '-' chars */ char *xDna = lmCloneString(lm, xText); tolowers(xDna); stripChar(xDna, '-'); #ifdef DEBUG uglyf("xToN:"); for (i=0; i<xSize; ++i) uglyf(" %d", xToN[i]); uglyf("\n"); #endif /* DEBUG */ /* Step through this, one frame at a time, looking for best ORF */ int frame; for (frame=0; frame<3; ++frame) { /* Calculate some things constant for this frame, and deal with * ORF that starts at beginning (may not have ATG) */ int lastPos = xSize-3; int frameDnaSize = xSize-frame; int start = frame, end = findOrfEnd(xDna, frameDnaSize, frame); applyOrf(start, end, xDna, xToN, array, arraySize); for (start = end; start<=lastPos; ) { // uglyf("start %d %c%c%c\n", start, xDna[start], xDna[start+1], xDna[start+2]); if (startsWith("atg", xDna+start)) { end = findOrfEnd(xDna, frameDnaSize, start); applyOrf(start, end, xDna, xToN, array, arraySize); start = end; } else start += 3; } } }