Exemple #1
0
static struct codonCoords findLastCodon(struct genePred *gp, int *frames)
/* Locate the last codon (start or stop) of the CDS.  frames is indexed by
 * exon.  zeroCodonCoords is returned when the CDS end is not flagged complete
 * (and honorCdsStat is on), when no exon has CDS, when the CDS does not end
 * on a codon boundary, or when a split codon can't be completed from the
 * preceding exon. */
{
/* an end that is not marked complete has no trustworthy final codon */
if (honorCdsStat && ((gp->optFields & genePredCdsStatFld) != 0)
    && (gp->cdsEndStat != cdsComplete))
    return zeroCodonCoords;

/* scan from the last exon backwards for the first one containing CDS */
int segStart = 0, segEnd = 0;
int exonIx = gp->exonCount - 1;
while ((exonIx >= 0) && !genePredCdsExon(gp, exonIx, &segStart, &segEnd))
    exonIx--;
if (exonIx < 0)
    return zeroCodonCoords;  // nothing is coding

/* frame at the last CDS base; non-zero means we are inside a codon */
int endFrame;
if (gp->strand[0] == '-')
    endFrame = frames[exonIx];
else
    endFrame = frameIncr(frames[exonIx], (segEnd - segStart));
if (endFrame != 0)
    return zeroCodonCoords;

/* the piece of the codon that sits in this exon (up to 3 bases) */
struct codonCoords codon = zeroCodonCoords;
codon.iExon1 = exonIx;
codon.start1 = max(segStart, segEnd-3);
codon.start = codon.start1;
codon.end1 = segEnd;
codon.end = segEnd;
if ((codon.end1 - codon.start1) >= 3)
    return codon;  // codon is not spliced

/* Spliced codon: what was found becomes the second piece, and the first
 * piece is taken from the end of the CDS in the preceding exon. */
codon.iExon2 = exonIx;
codon.start2 = codon.start1;
codon.end2 = codon.end1;
codon.end = codon.end2;
exonIx--;
codon.iExon1 = exonIx;
if ((exonIx < 0) || !genePredCdsExon(gp, exonIx, &segStart, &segEnd))
    return zeroCodonCoords;  // no earlier CDS exon to borrow from
int stillNeeded = 3 - (codon.end2 - codon.start2);
if ((segEnd - segStart) < stillNeeded)
    return zeroCodonCoords;  // earlier exon is too short
codon.start1 = segEnd - stillNeeded;
codon.start = codon.start1;
codon.end1 = segEnd;
return codon;
}
Exemple #2
0
static int getCorrectedCdsOffset(struct genePred *pred, int cdsOffsetIn)
/* Return cdsOffsetIn shifted right by the number of missing bases (per
 * calcMissingBases) found in CDS exons that begin at or before it, walking
 * the exons in transcription order.  Without exonFrames nothing is added. */
{
if (!(pred->optFields & genePredExonFramesFld))
    return cdsOffsetIn;

/* minus-strand genes are walked from the last exon down to the first */
int minusStrand = (pred->strand[0] == '-');
int step = minusStrand ? -1 : 1;
int exonIx = minusStrand ? pred->exonCount-1 : 0;
int padTotal = 0;     /* missing bases counted so far */
int cdsConsumed = 0;  /* CDS bases covered by the exons already visited */
for ( ;  exonIx >= 0 && exonIx < pred->exonCount;  exonIx += step)
    {
    int cdsStart, cdsEnd;
    if (!genePredCdsExon(pred, exonIx, &cdsStart, &cdsEnd))
        continue;
    /* padding that lies beyond cdsOffsetIn does not move it */
    if (cdsConsumed > cdsOffsetIn)
        break;
    padTotal += calcMissingBases(pred, exonIx, cdsConsumed + padTotal);
    cdsConsumed += cdsEnd - cdsStart;
    }
return cdsOffsetIn + padTotal;
}
Exemple #3
0
static void loadGene(struct orgGenes *genes, struct genePred *gp)
/* Create a gene record for gp, add it to genes, and hand each of its CDS
 * exons to loadExon along with the exon's offset within the CDS. */
{
struct binKeeper *chrBins = chromBinsGet(genes->bins, gp->chrom, TRUE);
struct gene *gene;

lmAllocVar(genes->memPool, gene);
gene->genes = genes;
gene->chrom = chromStrAlloc(genes, gp->chrom);
gene->strand = gp->strand[0];
slAddHead(&genes->genes, gene);
gene->name = lmCloneString(genes->memPool, gp->name);

/* Visit exons in transcription order (ascending for '+', descending
 * otherwise) so that cdsOff accumulates from the start of the CDS. */
int forward = (gp->strand[0] == '+');
int step = forward ? 1 : -1;
int exonIx = forward ? 0 : gp->exonCount-1;
int cdsOff = 0;
for ( ; exonIx >= 0 && exonIx < gp->exonCount; exonIx += step)
    {
    int cdsStart, cdsEnd;
    if (!genePredCdsExon(gp, exonIx, &cdsStart, &cdsEnd))
        continue;
    loadExon(gene, chrBins, gp, exonIx, cdsStart, cdsEnd, cdsOff);
    cdsOff += (cdsEnd - cdsStart);
    }
slReverse(&gene->exons);
}
Exemple #4
0
static char *getCodingSequence(struct genePred *pred, char *transcriptSequence,
			       boolean *retAddedBases, struct lm *lm)
/* Pull the CDS out of transcriptSequence.  When pred carries exonFrames, each
 * CDS exon is preceded by as many 'N's as calcMissingBases reports, and
 * *retAddedBases (if not NULL) is set to TRUE when any 'N' was inserted; the
 * result is allocated from lm.  When pred has no exonFrames the work is
 * delegated to getCodingSequenceSimple. */
{
if (retAddedBases)
    *retAddedBases = FALSE;
if (!(pred->optFields & genePredExonFramesFld))
    return getCodingSequenceSimple(pred, transcriptSequence, lm);

/* exons are taken in transcription order */
boolean minusStrand = (pred->strand[0] == '-');
int step = minusStrand ? -1 : 1;
int exonIx = minusStrand ? pred->exonCount-1 : 0;
/* room for the CDS itself plus up to 3 extra bytes per exon for padding */
char *cdsSeq = lmAlloc(lm, genePredCdsSize(pred) + 3 * pred->exonCount);
int txPos = getCodingOffsetInTx(pred, pred->strand[0]);
int cdsPos = 0;
for ( ;  exonIx >= 0 && exonIx < pred->exonCount;  exonIx += step)
    {
    int cdsStart, cdsEnd;
    if (!genePredCdsExon(pred, exonIx, &cdsStart, &cdsEnd))
        continue;
    int padCount = calcMissingBases(pred, exonIx, cdsPos);
    if (padCount > 0 && retAddedBases)
        *retAddedBases = TRUE;
    int k;
    for (k = 0;  k < padCount;  k++)
        cdsSeq[cdsPos++] = 'N';
    /* copy this exon's coding bases right after any padding */
    int exonCdsLen = cdsEnd - cdsStart;
    memcpy(cdsSeq + cdsPos, transcriptSequence + txPos, exonCdsLen);
    cdsPos += exonCdsLen;
    txPos += exonCdsLen;
    }
return cdsSeq;
}
static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh)
/* Compute the CDS range of gp in transcript coordinates and write it to
 * cdsFh as "name<TAB>start..end". */
{
int qCdsStart = -1, qCdsEnd = -1;  /* stay -1 if no exon has CDS */
int txOff = 0;                     /* summed length of the exons before this one */
int exonIx;

for (exonIx = 0; exonIx < gp->exonCount; exonIx++)
    {
    int cdsStart, cdsEnd;
    if (genePredCdsExon(gp, exonIx, &cdsStart, &cdsEnd))
        {
        /* the first CDS exon fixes the start; every CDS exon moves the end */
        if (qCdsStart < 0)
            qCdsStart = txOff + (cdsStart - gp->exonStarts[exonIx]);
        qCdsEnd = txOff + (cdsEnd - gp->exonStarts[exonIx]);
        }
    txOff += gp->exonEnds[exonIx] - gp->exonStarts[exonIx];
    }

/* minus-strand range is flipped within a sequence of length qSize */
if (gp->strand[0] == '-')
    reverseIntRange(&qCdsStart, &qCdsEnd, qSize);

/* genbank cds is closed 1-based, hence the +1 on the start only */
fprintf(cdsFh,"%s\t%d..%d\n", gp->name, qCdsStart+1, qCdsEnd);
}
Exemple #6
0
static int *calcFrames(struct genePred *gp)
/* Compute frames for a genePred that doesn't have them: one entry per exon,
 * holding (CDS bases preceding the exon in transcription order) % 3, or -1
 * for an exon without CDS.  Caller frees the resulting array. */
{
int nExons = gp->exonCount;
int *frames = needMem(nExons*sizeof(int));

/* '+' genes are walked first-to-last, everything else last-to-first */
int forward = (gp->strand[0] == '+');
int step = forward ? 1 : -1;
int exonIx = forward ? 0 : nExons - 1;
int codingBases = 0;
int visited;
for (visited = 0; visited < nExons; visited++, exonIx += step)
    {
    int cdsStart, cdsEnd;
    if (!genePredCdsExon(gp, exonIx, &cdsStart, &cdsEnd))
        {
        frames[exonIx] = -1;
        continue;
        }
    frames[exonIx] = codingBases % 3;
    codingBases += (cdsEnd - cdsStart);
    }
return frames;
}
Exemple #7
0
void kgGetCds(char *db, char *spDb, char *geneTable, FILE *outf)
/* Read every gene from geneTable, format the CDS range of each coding exon
 * into successive cdsBloc entries, and call processAlign for genes that have
 * CDS.  Genes with no CDS are reported on stderr. */
{
struct sqlConnection *conn = NULL;
struct genePredReader *gpr;
struct genePred *gp;

/* conn stays NULL when no database name is given */
if (db != NULL)
    conn = sqlConnect(db);

gpr = genePredReaderQuery(conn, geneTable, NULL);
/* NOTE(review): neither the genePreds returned by the reader nor the reader
 * itself is released in this function -- confirm whether that is intended. */
for (gp = genePredReaderNext(gpr); gp != NULL; gp = genePredReaderNext(gpr))
    {
    int cdsCnt = 0;
    int iExon;
    for (iExon = 0; iExon < gp->exonCount; iExon++)
        {
        int cdsStart, cdsEnd;
        if (!genePredCdsExon(gp, iExon, &cdsStart, &cdsEnd))
            continue;
        /* NOTE(review): cdsBloc is declared outside this block; nothing here
         * bounds cdsCnt or the formatted length -- verify its dimensions. */
        sprintf(cdsBloc[cdsCnt], "%d-%d;", cdsStart, cdsEnd);
        cdsCnt++;
        }
    if (cdsCnt == 0)
        {
        fprintf(stderr, "%s does not have cds.\n", gp->name);
        continue;
        }
    /* presumably processAlign picks up the cdsCnt entries just written to
     * cdsBloc -- confirm against its definition */
    processAlign(db, spDb, gp->name, cdsCnt, outf);
    }
sqlDisconnect(&conn);
}