예제 #1
0
struct cdsEvidence *createCds(struct dnaSeq *seq, int start, int end,
	int *upAtgCount, int *upKozakCount, int lastIntronPos,
	struct orthoCdsArray *orthoList, double orthoWeightPer)
/* Allocate and fill in a cdsEvidence record for the ORF that begins at start
 * on seq.  The requested end is pulled back so the ORF spans a whole number
 * of codons.  The record holds a single CDS block covering the trimmed range
 * and is scored with orfScore().  Returns the newly allocated record. */
{
/* Round the span down to a multiple of three bases. */
int codingSize = ((end - start) / 3) * 3;
int codingEnd = start + codingSize;
struct cdsEvidence *orf;

AllocVar(orf);

/* Identity and coordinates are set first since orfScore() is handed
 * the partially filled record. */
orf->name = cloneString(seq->name);
orf->start = start;
orf->end = codingEnd;
orf->source = cloneString("txCdsPredict");
orf->accession = cloneString(".");
orf->score = orfScore(orf, seq, upAtgCount, upKozakCount, lastIntronPos,
	orthoList, orthoWeightPer);

/* Start is complete when the ORF opens with atg; end is complete when
 * its last codon is a stop codon. */
orf->startComplete = startsWith("atg", seq->dna + start);
orf->endComplete = isStopCodon(seq->dna + codingEnd - 3);

/* One CDS block that covers the whole ORF. */
orf->cdsCount = 1;
AllocArray(orf->cdsStarts, 1);
AllocArray(orf->cdsSizes, 1);
orf->cdsStarts[0] = start;
orf->cdsSizes[0] = codingSize;
return orf;
}
void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f)
/* Output one Ra record to file. */
{
    fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end);
    fprintf(f, "txName %s\n", seq->name);
    fprintf(f, "txSize %d\n", seq->size);
    fprintf(f, "cdsStart %d\n", start);
    fprintf(f, "cdsEnd %d\n", end);
    fprintf(f, "cdsSize %d\n", end-start);
    fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start));
    fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3));
    boolean gotKozak1 = FALSE;
    if (start >= 3)
    {
        char c = seq->dna[start-3];
        gotKozak1 = (c == 'a' || c == 'g');
    }
    fprintf(f, "gotKozak1 %d\n", gotKozak1);
    boolean gotKozak2 = FALSE;
    if (start+3 < seq->size)
        gotKozak2 = (seq->dna[start+3] == 'g');
    fprintf(f, "gotKozak2 %d\n", gotKozak2);
    fprintf(f, "gotKozak %d\n", gotKozak1 + gotKozak2);

    /* Count up upstream ATG and Kozak */
    struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew();
    int upAtg = 0, upKozak = 0;
    int i;
    for (i=0; i<start; ++i)
    {
        if (startsWith("atg", seq->dna + i))
        {
            int orfEnd = findOrfEnd(seq, i);
            if (orfEnd < start)
                rangeTreeAdd(upAtgRanges, i, orfEnd);
            ++upAtg;
            if (isKozak(seq->dna, seq->size, i))
            {
                ++upKozak;
                if (orfEnd < start)
                    rangeTreeAdd(upKozakRanges, i, orfEnd);
            }
        }
    }
    fprintf(f, "upstreamAtgCount %d\n", upAtg);
    fprintf(f, "upstreamKozakCount %d\n", upKozak);
    fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start));
    fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start));
    fprintf(f, "\n");

    /* Cluen up and go home. */
    rangeTreeFree(&upAtgRanges);
    rangeTreeFree(&upKozakRanges);
}
예제 #3
0
int findOrfEnd(char *dna, int dnaSize, int start)
/* Starting with the codon after the one at start, step through dna three
 * bases at a time looking for a stop codon.  Returns the offset just past
 * the first stop codon found, or dnaSize when the scan runs out first.
 * The scan stops short of offset dnaSize-3. */
{
int codonPos = start + 3;
while (codonPos < dnaSize - 3)
    {
    if (isStopCodon(dna + codonPos))
        return codonPos + 3;
    codonPos += 3;
    }
return dnaSize;
}
int findOrfEnd(struct dnaSeq *seq, int start)
/* Locate the end of the ORF that opens at start in seq.  Codons after the
 * one at start are examined in frame; the offset just past the first stop
 * codon is returned.  If the scan reaches offset seq->size-3 without
 * finding one, seq->size is returned. */
{
    int scanLimit = seq->size - 3;
    int codonPos = start + 3;
    while (codonPos < scanLimit)
    {
        if (isStopCodon(seq->dna + codonPos))
            return codonPos + 3;
        codonPos += 3;
    }
    return seq->size;
}
예제 #5
0
static void truncateAtStopCodon(char *codingSeq)
/* Cut codingSeq off right after its first in-frame stop codon, if it has one.
 * The string is modified in place; the stop codon itself is kept.  Only
 * complete codons are examined.  Aborts on NULL input. */
{
char *codon;
if (codingSeq == NULL)
    errAbort("truncateAtStopCodon: null input");
/* Advance one codon at a time while three more bases remain. */
for (codon = codingSeq;
     codon[0] != '\0' && codon[1] != '\0' && codon[2] != '\0';
     codon += 3)
    {
    if (isStopCodon(codon))
	{
	codon[3] = '\0';
	return;
	}
    }
}
예제 #6
0
double orfScore(struct cdsEvidence *orf, struct dnaSeq *seq, int *upAtgCount, int *upKozakCount,
	int lastIntronPos, struct orthoCdsArray *orthoList, double orthoWeightPer)
/* Compute an ad-hoc score for orf on seq.  The baseline is one point per
 * base of the ORF, adjusted as follows:
 *    +50   ORF begins with atg
 *    +100  and that atg meets either Kozak position check
 *    +50   ORF ends with a stop codon
 *    -0.5  times each of upAtgCount and upKozakCount at the ORF start
 *    -400  lastIntronPos is positive and lies 55 or more bases past the ORF end
 *    +orthoWeightPer * orthoScore() for every entry in orthoList */
{
DNA *dna = seq->dna;
double score = orf->end - orf->start;

/* Start codon bonus, with an extra bonus for Kozak context. */
if (startsWith("atg", dna + orf->start))
    {
    int gAfterStart = (orf->start + 4 <= seq->size && dna[orf->start+3] == 'g');
    int purineBeforeStart = (orf->start >=3 &&
	(dna[orf->start-3] == 'a' || dna[orf->start-3] == 'g'));
    score += 50;
    if (gAfterStart || purineBeforeStart)
	score += 100;
    }

/* Stop codon bonus. */
if (isStopCodon(dna + orf->end - 3))
    score += 50;

/* Penalties for atgs upstream of the ORF start. */
score -= upAtgCount[orf->start]*0.5;
score -= upKozakCount[orf->start]*0.5;

/* NMD penalty: the last intron sits well downstream of the ORF end. */
if (lastIntronPos > 0 && lastIntronPos - orf->end >= 55)
    score -= 400;

/* Contributions from each ortho species entry. */
struct orthoCdsArray *ortho = orthoList;
while (ortho != NULL)
    {
    score += orthoWeightPer * orthoScore(ortho, orf);
    ortho = ortho->next;
    }

return score;
}
void txCdsEvFromBorf(char *inBorf, char *txFa, char *outTce)
/* txCdsEvFromBorf - Convert borfBig format to txCdsEvidence (tce) in an effort 
 * to annotate the coding regions.
 *    inBorf - tab-separated borf input, read one row at a time
 *    txFa   - fasta file of transcripts; sequences are looked up by borf name
 *    outTce - output file, one tab-separated tce line per accepted row
 * Only rows on the '+' strand with a borf score of at least 50 are written.
 * Aborts if an accepted row names a transcript that is missing from txFa. */
{
struct lineFile *lf = lineFileOpen(inBorf, TRUE);
struct hash *txHash = faReadAllIntoHash(txFa, dnaLower);
char *row[BORF_NUM_COLS];
FILE *f = mustOpen(outTce, "w");
while (lineFileRowTab(lf, row))
    {
    struct borf b;
    borfStaticLoad(row, &b);
    if (b.strand[0] == '+' && b.score >= 50)
	{
	struct dnaSeq *txSeq = hashFindVal(txHash, b.name);

	/* The lookup must be validated before txSeq is dereferenced below;
	 * otherwise a transcript missing from txFa crashes instead of
	 * producing this error message. */
	if (txSeq == NULL)
	    errAbort("%s is in %s but not %s", b.name, inBorf, txFa);

	/* When at least three more bases follow cdsEnd, extend the CDS by one
	 * codon and record whether that codon is a stop codon.
	 * NOTE(review): the strict '<' skips the case where the extra codon
	 * is exactly the last three bases of the transcript - confirm this
	 * is intended. */
	boolean hasStop = FALSE;
	if (b.cdsEnd + 3 < txSeq->size)
	    {
	    hasStop = isStopCodon(txSeq->dna + b.cdsEnd);
	    b.cdsEnd += 3;
	    }

	/* Rescale the borf score and clamp it to the range 0..1000. */
	int score = (b.score - 45)*5;
	if (score > 1000) score = 1000;
	if (score < 0) score = 0;

	fprintf(f, "%s\t", b.name);
	fprintf(f, "%d\t", b.cdsStart);
	fprintf(f, "%d\t", b.cdsEnd);
	fprintf(f, "%s\t", "bestorf");
	fprintf(f, "%s\t", ".");
	fprintf(f, "%d\t", score);
	fprintf(f, "%d\t", startsWith("atg", txSeq->dna + b.cdsStart));
	fprintf(f, "%d\t", hasStop);
	fprintf(f, "%d\t", 1);	
	fprintf(f, "%d,\t", b.cdsStart);
	fprintf(f, "%d,\n", b.cdsEnd - b.cdsStart);
	}
    }
lineFileClose(&lf);
carefulClose(&f);
}