struct cdsEvidence *createCds(struct dnaSeq *seq, int start, int end,
        int *upAtgCount, int *upKozakCount, int lastIntronPos,
        struct orthoCdsArray *orthoList, double orthoWeightPer)
/* Build and return a newly allocated single-block cdsEvidence for the
 * region of seq beginning at start.  The region is shortened at its end
 * so that its length is a multiple of three.  The upstream-count arrays,
 * lastIntronPos, orthoList and orthoWeightPer are passed straight through
 * to orfScore(). */
{
    /* Drop any trailing partial codon; start stays fixed. */
    int rawSize = end - start;
    int codingSize = rawSize - rawSize % 3;
    int codingEnd = start + codingSize;

    struct cdsEvidence *orf;
    AllocVar(orf);

    /* Identity and coordinates are filled in before scoring, since
     * orfScore() is handed the partially built record. */
    orf->name = cloneString(seq->name);
    orf->start = start;
    orf->end = codingEnd;
    orf->source = cloneString("txCdsPredict");
    orf->accession = cloneString(".");
    orf->score = orfScore(orf, seq, upAtgCount, upKozakCount, lastIntronPos,
                          orthoList, orthoWeightPer);

    /* Complete at the start if it begins with atg, complete at the end
     * if the last codon is a stop. */
    orf->startComplete = startsWith("atg", seq->dna + start);
    orf->endComplete = isStopCodon(seq->dna + codingEnd - 3);

    /* The whole ORF is described as one block. */
    orf->cdsCount = 1;
    AllocArray(orf->cdsStarts, 1);
    orf->cdsStarts[0] = start;
    AllocArray(orf->cdsSizes, 1);
    orf->cdsSizes[0] = codingSize;

    return orf;
}
void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f) /* Output one Ra record to file. */ { fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end); fprintf(f, "txName %s\n", seq->name); fprintf(f, "txSize %d\n", seq->size); fprintf(f, "cdsStart %d\n", start); fprintf(f, "cdsEnd %d\n", end); fprintf(f, "cdsSize %d\n", end-start); fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start)); fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3)); boolean gotKozak1 = FALSE; if (start >= 3) { char c = seq->dna[start-3]; gotKozak1 = (c == 'a' || c == 'g'); } fprintf(f, "gotKozak1 %d\n", gotKozak1); boolean gotKozak2 = FALSE; if (start+3 < seq->size) gotKozak2 = (seq->dna[start+3] == 'g'); fprintf(f, "gotKozak2 %d\n", gotKozak2); fprintf(f, "gotKozak %d\n", gotKozak1 + gotKozak2); /* Count up upstream ATG and Kozak */ struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew(); int upAtg = 0, upKozak = 0; int i; for (i=0; i<start; ++i) { if (startsWith("atg", seq->dna + i)) { int orfEnd = findOrfEnd(seq, i); if (orfEnd < start) rangeTreeAdd(upAtgRanges, i, orfEnd); ++upAtg; if (isKozak(seq->dna, seq->size, i)) { ++upKozak; if (orfEnd < start) rangeTreeAdd(upKozakRanges, i, orfEnd); } } } fprintf(f, "upstreamAtgCount %d\n", upAtg); fprintf(f, "upstreamKozakCount %d\n", upKozak); fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start)); fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start)); fprintf(f, "\n"); /* Cluen up and go home. */ rangeTreeFree(&upAtgRanges); rangeTreeFree(&upKozakRanges); }
int findOrfEnd(char *dna, int dnaSize, int start)
/* Return the end of the ORF that begins at start: the position just past
 * the first in-frame stop codon found after the start codon, or dnaSize
 * if the scan finds none.  A codon occupying exactly the last three bases
 * is not tested, but a stop there would yield dnaSize as well. */
{
    int scanLimit = dnaSize - 3;
    int codonPos = start + 3;    /* skip over the start codon itself */

    while (codonPos < scanLimit) {
        if (isStopCodon(dna + codonPos))
            return codonPos + 3;
        codonPos += 3;
    }
    return dnaSize;
}
int findOrfEnd(struct dnaSeq *seq, int start)
/* Return the end of the ORF in seq that begins at start: the position
 * just past the first in-frame stop codon found after the start codon,
 * or seq->size if the scan finds none.  A codon occupying exactly the
 * last three bases is not tested, but a stop there would yield seq->size
 * as well. */
{
    int scanLimit = seq->size - 3;
    int codonPos = start + 3;    /* skip over the start codon itself */

    while (codonPos < scanLimit) {
        if (isStopCodon(seq->dna + codonPos))
            return codonPos + 3;
        codonPos += 3;
    }
    return seq->size;
}
static void truncateAtStopCodon(char *codingSeq)
/* Walk codingSeq one codon at a time from its first base; at the first
 * stop codon, terminate the string right after that codon so the stop
 * itself is kept and everything following is dropped.  The string is
 * modified in place.  Calls errAbort on NULL input. */
{
    char *codon;

    if (codingSeq == NULL)
        errAbort("truncateAtStopCodon: null input");

    /* Continue only while three more bases remain before the terminator. */
    for (codon = codingSeq;
         codon[0] != '\0' && codon[1] != '\0' && codon[2] != '\0';
         codon += 3) {
        if (isStopCodon(codon)) {
            codon[3] = '\0';
            return;
        }
    }
}
double orfScore(struct cdsEvidence *orf, struct dnaSeq *seq, int *upAtgCount, int *upKozakCount, int lastIntronPos, struct orthoCdsArray *orthoList, double orthoWeightPer) /* Return a fairly ad-hoc score for orf. Each base in ORF * is worth one point, and we go from there.... */ { double score = orf->end - orf->start; DNA *dna = seq->dna; /* If we really have start, that's worth 50 bases */ if (startsWith("atg", dna + orf->start)) { score += 50; /* Kozak condition worth 100 more. */ if ((orf->start + 4 <= seq->size && dna[orf->start+3] == 'g') || (orf->start >=3 && (dna[orf->start-3] == 'a' || dna[orf->start-3] == 'g'))) score += 100; } /* A stop codon is also worth 50 */ if (isStopCodon(dna + orf->end - 3)) score += 50; /* Penalize by upstream bases. */ score -= upAtgCount[orf->start]*0.5; score -= upKozakCount[orf->start]*0.5; /* Penalize NMD */ if (lastIntronPos > 0) { int nmdDangle = lastIntronPos - orf->end; if (nmdDangle >= 55) score -= 400; } /* Add in bits from ortho species. */ struct orthoCdsArray *ortho; for (ortho = orthoList; ortho != NULL; ortho = ortho->next) { score += orthoWeightPer * orthoScore(ortho, orf); } return score; }
void txCdsEvFromBorf(char *inBorf, char *txFa, char *outTce) /* txCdsEvFromBorf - Convert borfBig format to txCdsEvidence (tce) in an effort * to annotate the coding regions.. */ { struct lineFile *lf = lineFileOpen(inBorf, TRUE); struct hash *txHash = faReadAllIntoHash(txFa, dnaLower); char *row[BORF_NUM_COLS]; FILE *f = mustOpen(outTce, "w"); while (lineFileRowTab(lf, row)) { struct borf b; borfStaticLoad(row, &b); if (b.strand[0] == '+' && b.score >= 50) { struct dnaSeq *txSeq = hashFindVal(txHash, b.name); boolean hasStop = FALSE; if (b.cdsEnd + 3 < txSeq->size) { hasStop = isStopCodon(txSeq->dna + b.cdsEnd); b.cdsEnd += 3; } if (txSeq == NULL) errAbort("%s is in %s but not %s", b.name, inBorf, txFa); int score = (b.score - 45)*5; if (score > 1000) score = 1000; if (score < 0) score = 0; fprintf(f, "%s\t", b.name); fprintf(f, "%d\t", b.cdsStart); fprintf(f, "%d\t", b.cdsEnd); fprintf(f, "%s\t", "bestorf"); fprintf(f, "%s\t", "."); fprintf(f, "%d\t", score); fprintf(f, "%d\t", startsWith("atg", txSeq->dna + b.cdsStart)); fprintf(f, "%d\t", hasStop); fprintf(f, "%d\t", 1); fprintf(f, "%d,\t", b.cdsStart); fprintf(f, "%d,\n", b.cdsEnd - b.cdsStart); } } lineFileClose(&lf); carefulClose(&f); }