static struct codonCoords findLastCodon(struct genePred *gp, int *frames) /* get the coordinates of the last codon (start or stop). It must be in * correct frame, or zero is returned. */ { if (honorCdsStat && (gp->optFields & genePredCdsStatFld) && (gp->cdsEndStat != cdsComplete)) return zeroCodonCoords; // not flagged as complete // find last CDS exon int iExon, cdsStart = 0, cdsEnd = 0; for (iExon = gp->exonCount-1; iExon >= 0; iExon--) { if (genePredCdsExon(gp, iExon, &cdsStart, &cdsEnd)) break; } if (iExon == -1) return zeroCodonCoords; // no CDS // get frame of last base and validate that we are on a bound. int frame = (gp->strand[0] == '-') ? frames[iExon] : frameIncr(frames[iExon], (cdsEnd-cdsStart)); if (frame != 0) return zeroCodonCoords; // not on a frame boundary /* get last part of codon */ struct codonCoords codon = zeroCodonCoords; codon.start= codon.start1 = max(cdsStart, cdsEnd-3); codon.end = codon.end1 = cdsEnd; codon.iExon1 = iExon; /* first part, if spliced */ if ((codon.end1 - codon.start1) < 3) { codon.start2 = codon.start1; codon.end = codon.end2 = codon.end1; codon.iExon2 = iExon; iExon--; codon.iExon1 = iExon; if ((iExon == -1) || !genePredCdsExon(gp, iExon, &cdsStart, &cdsEnd)) return zeroCodonCoords; // no more int needed = 3 - (codon.end2 - codon.start2); if ((cdsEnd - cdsStart) < needed) return zeroCodonCoords; // not enough space codon.start = codon.start1 = cdsEnd-needed; codon.end1 = cdsEnd; } return codon; }
static int getCorrectedCdsOffset(struct genePred *pred, int cdsOffsetIn)
/* Shift cdsOffsetIn forward by one for each 'N' that getCodingSequence added
 * ahead of it.  When pred has no exonFrames, cdsOffsetIn is returned unchanged. */
{
if (!(pred->optFields & genePredExonFramesFld))
    return cdsOffsetIn;

/* visit exons from the last index down on the '-' strand, otherwise from index 0 up */
int step = (pred->strand[0] == '-') ? -1 : 1;
int ix = (step < 0) ? pred->exonCount - 1 : 0;
int addedBases = 0;   /* missing bases accumulated so far */
int cdsConsumed = 0;  /* CDS bases covered so far, not counting added bases */
for (;  ix >= 0 && ix < pred->exonCount;  ix += step)
    {
    int start, end;
    if (!genePredCdsExon(pred, ix, &start, &end))
        continue;
    // Missing bases beyond cdsOffsetIn do not affect it, so stop here:
    if (cdsConsumed > cdsOffsetIn)
        break;
    addedBases += calcMissingBases(pred, ix, cdsConsumed + addedBases);
    cdsConsumed += end - start;
    }
return cdsOffsetIn + addedBases;
}
static void loadGene(struct orgGenes *genes, struct genePred *gp) /* break one gene into cds exons and add to bins. check for overlapping CDS */ { struct binKeeper *chrBins = chromBinsGet(genes->bins, gp->chrom, TRUE); struct gene *gene; int iExon, start, end; int cdsOff = 0; lmAllocVar(genes->memPool, gene); gene->genes = genes; gene->chrom = chromStrAlloc(genes, gp->chrom); gene->strand = gp->strand[0]; slAddHead(&genes->genes, gene); gene->name = lmCloneString(genes->memPool, gp->name); /* process in transcription order so we get the cdsOff set */ if (gp->strand[0] == '+') { for (iExon = 0; iExon < gp->exonCount; iExon++) if (genePredCdsExon(gp, iExon, &start, &end)) { loadExon(gene, chrBins, gp, iExon, start, end, cdsOff); cdsOff += (end - start); } } else { for (iExon = gp->exonCount-1; iExon >= 0; iExon--) if (genePredCdsExon(gp, iExon, &start, &end)) { loadExon(gene, chrBins, gp, iExon, start, end, cdsOff); cdsOff += (end - start); } } slReverse(&gene->exons); }
static char *getCodingSequence(struct genePred *pred, char *transcriptSequence,
                               boolean *retAddedBases, struct lm *lm)
/* Build the CDS sequence for pred out of transcriptSequence, allocating the
 * result from lm.  When pred has exonFrames, each CDS exon is preceded by as
 * many 'N's as calcMissingBases() reports for it, and *retAddedBases (if
 * retAddedBases is non-NULL) is set to TRUE when any 'N' was inserted; it is
 * FALSE otherwise.  When pred has no exonFrames, the work is delegated to
 * getCodingSequenceSimple(). */
{
if (retAddedBases)
    *retAddedBases = FALSE;
if (!(pred->optFields & genePredExonFramesFld))
    return getCodingSequenceSimple(pred, transcriptSequence, lm);

/* visit exons from the last index down on the '-' strand, otherwise from index 0 up */
int step = (pred->strand[0] == '-') ? -1 : 1;
int ix = (step < 0) ? pred->exonCount - 1 : 0;
/* buffer is sized as CDS length plus three extra bytes per exon */
char *cdsSeq = lmAlloc(lm, genePredCdsSize(pred) + 3 * pred->exonCount);
int txOffset = getCodingOffsetInTx(pred, pred->strand[0]);
int cdsOffset = 0;
for (;  ix >= 0 && ix < pred->exonCount;  ix += step)
    {
    int start, end;
    if (!genePredCdsExon(pred, ix, &start, &end))
        continue;
    int exonCdsSize = end - start;
    int padCount = calcMissingBases(pred, ix, cdsOffset);
    if (padCount > 0)
        {
        if (retAddedBases)
            *retAddedBases = TRUE;
        for (;  padCount > 0;  padCount--)
            cdsSeq[cdsOffset++] = 'N';
        }
    memcpy(cdsSeq + cdsOffset, transcriptSequence + txOffset, exonCdsSize);
    cdsOffset += exonCdsSize;
    txOffset += exonCdsSize;
    }
return cdsSeq;
}
static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh)
/* Compute the CDS range of gp as offsets into the concatenated exons and
 * write "name<TAB>start..end" to cdsFh.  For '-' strand genes the range is
 * reversed within qSize before printing. */
{
int qCdsStart = -1, qCdsEnd = -1;
int txOff = 0;   /* total length of the exons before the current one */
int ix;
for (ix = 0;  ix < gp->exonCount;  ix++)
    {
    int exonCdsStart, exonCdsEnd;
    if (genePredCdsExon(gp, ix, &exonCdsStart, &exonCdsEnd))
        {
        /* only the first CDS exon sets the start; every CDS exon moves the end */
        if (qCdsStart < 0)
            qCdsStart = txOff + (exonCdsStart - gp->exonStarts[ix]);
        qCdsEnd = txOff + (exonCdsEnd - gp->exonStarts[ix]);
        }
    txOff += gp->exonEnds[ix] - gp->exonStarts[ix];
    }
if (gp->strand[0] == '-')
    reverseIntRange(&qCdsStart, &qCdsEnd, qSize);
/* genbank cds is closed 1-based, hence the +1 on the start only */
fprintf(cdsFh, "%s\t%d..%d\n", gp->name, qCdsStart + 1, qCdsEnd);
}
static int *calcFrames(struct genePred *gp)
/* Compute frames for a genePred that doesn't have them.  Returns an array
 * with one entry per exon: the number of CDS bases preceding the exon in
 * traversal order, modulo 3, or -1 for an exon with no CDS.  Traversal runs
 * from exon 0 upward on '+' strand and from the last exon downward otherwise.
 * The caller should free the returned array. */
{
int *frames = needMem(gp->exonCount * sizeof *frames);
int isPlus = (gp->strand[0] == '+');
int ix = isPlus ? 0 : gp->exonCount - 1;
int stopIx = isPlus ? gp->exonCount : -1;
int step = isPlus ? 1 : -1;
int cdsBases = 0;   /* CDS bases seen in the exons already visited */
while (ix != stopIx)
    {
    int cdsStart, cdsEnd;
    int hasCds = genePredCdsExon(gp, ix, &cdsStart, &cdsEnd) ? 1 : 0;
    frames[ix] = hasCds ? (cdsBases % 3) : -1;
    if (hasCds)
        cdsBases += (cdsEnd - cdsStart);
    ix += step;
    }
return frames;
}
void kgGetCds(char *db, char *spDb, char *geneTable, FILE *outf) /* get CDS info */ { struct sqlConnection *conn = NULL; struct genePred *gp; int cdsCnt; struct genePredReader *gpr; int iExon, exonStart, exonEnd; if (db != NULL) conn = sqlConnect(db); gpr = genePredReaderQuery(conn, geneTable, NULL); while ((gp = genePredReaderNext(gpr)) != NULL) { cdsCnt = 0; for (iExon = 0; iExon < gp->exonCount; iExon++) { if (genePredCdsExon(gp, iExon, &exonStart, &exonEnd)) { sprintf(cdsBloc[cdsCnt], "%d-%d;", exonStart, exonEnd); cdsCnt++; } } if (cdsCnt > 0) { processAlign(db, spDb, gp->name, cdsCnt, outf); } else { fprintf(stderr, "%s does not have cds.\n", gp->name); } } sqlDisconnect(&conn); }