struct hash *loadAndWeighTce(char *fileName, struct hash *refToPepHash,
	struct hash *pepToRefHash)
/* Read the cdsEvidence rows in fileName and group them by transcript name.
 * Returns a hash keyed by that name with txCdsInfo values.  Each txCdsInfo
 * carries a cdsList of the evidence for the transcript, with source-dependent
 * weights added to the scores, sorted using cdsEvidenceCmpScore. */
{
struct hash *txHash = hashNew(18);
struct txCdsInfo *txAll = NULL, *info;
char *words[CDSEVIDENCE_NUM_COLS];
struct lineFile *lf = lineFileOpen(fileName, TRUE);

while (lineFileRow(lf, words))
    {
    /* Turn the row into a cdsEvidence and pull the accession out of its name. */
    struct cdsEvidence *ev = cdsEvidenceLoad(words);
    char *acc = txAccFromTempName(ev->name);

    /* Fetch the record for this transcript, creating it the first time
     * the name is seen. */
    info = hashFindVal(txHash, ev->name);
    if (info == NULL)
	{
	AllocVar(info);
	hashAddSaveName(txHash, ev->name, info, &info->name);
	slAddHead(&txAll, info);
	}

    /* Look the accession up in both directions.  If only one lookup hits,
     * the accession itself fills in for the side that missed. */
    char *pepAcc = hashFindVal(refToPepHash, acc);
    char *rnaAcc = hashFindVal(pepToRefHash, acc);
    if (pepAcc == NULL)
	{
	if (rnaAcc != NULL)
	    pepAcc = acc;
	}
    else if (rnaAcc == NULL)
	rnaAcc = acc;

    /* Evidence that comes from the transcript's own RefSeq protein or
     * mRNA gets a very large bonus. */
    if (pepAcc != NULL && startsWith("RefPep", ev->source))
	{
	if (sameString(ev->accession, pepAcc))
	    ev->score += refSeqWeight;
	}
    if (rnaAcc != NULL && startsWith("RefSeq", ev->source))
	{
	if (sameString(ev->accession, rnaAcc))
	    ev->score += refSeqWeight + 100;
	}

    /* CCDS evidence is weighted up as well. */
    if (sameString("ccds", ev->source))
	ev->score += ccdsWeight;

    /* So is txCdsPredict evidence.  Only RefSeq and txCdsPredict
     * can actually possibly make it over real threshold. */
    if (sameString("txCdsPredict", ev->source))
	ev->score += txCdsPredictWeight;

    slAddHead(&info->cdsList, ev);
    }
lineFileClose(&lf);
slReverse(&txAll);

/* Put every transcript's evidence list in score order. */
info = txAll;
while (info != NULL)
    {
    slSort(&info->cdsList, cdsEvidenceCmpScore);
    info = info->next;
    }

return txHash;
}
Esempio n. 2
0
void showCdsEvidence(char *geneName, struct trackDb *tdb, char *evTable)
/* Print out stuff from cdsEvidence table.  If evTable exists, starts a
 * "CDS Prediction Information" section and prints one table row for each
 * row of evTable whose name is geneName, in descending score order,
 * followed by a paragraph explaining the scores.  If evTable has no rows
 * for geneName a short "likely noncoding" message is printed instead.
 * Nothing is printed when evTable does not exist.  The tdb parameter is
 * not used by this function. */
{
struct sqlConnection *conn = hAllocConn(database);
if (sqlTableExists(conn, evTable))
    {
    webNewSection("CDS Prediction Information");
    char query[512];
    sqlSafef(query, sizeof(query), 
	    "select count(*) from %s where name='%s'", evTable, geneName);
    if (sqlQuickNum(conn, query) > 0)
	{
	sqlSafef(query, sizeof(query), 
		"select * from %s where name='%s' order by score desc", evTable, geneName);
	struct sqlResult *sr = sqlGetResult(conn, query);
	char **row;

	/* Header row. */
	webPrintLinkTableStart();
	webPrintLabelCell("ORF<BR>size");
	webPrintLabelCell("start in<BR>transcript");
	webPrintLabelCell("end in<BR>transcript");
	webPrintLabelCell("source");
	webPrintLabelCell("accession");
	webPrintLabelCell("ad-hoc<BR>score");
	webPrintLabelCell("start<BR>codon");
	webPrintLabelCell("end<BR>codon");
	webPrintLabelCell("piece<BR>count");
	webPrintLabelCell("piece list");
	webPrintLabelCell("frame");
	webPrintLinkTableNewRow();

	/* One table row per piece of evidence. */
	while ((row = sqlNextRow(sr)) != NULL)
	    {
	    /* NOTE(review): each ev loaded here is never released in this
	     * function. */
	    struct cdsEvidence *ev = cdsEvidenceLoad(row);
	    int i;

	    /* Starts are printed one higher than stored, ends as stored. */
	    webPrintIntCell(ev->end - ev->start);
	    webPrintIntCell(ev->start+1);
	    webPrintIntCell(ev->end);
	    webPrintLinkCell(ev->source);
	    webPrintLinkCell(ev->accession);
	    webPrintLinkCellRightStart();
	    printf("%3.2f", ev->score);
	    webPrintLinkCellEnd();
	    webPrintLinkCell(ev->startComplete ? "yes" : "no");
	    webPrintLinkCell(ev->endComplete ? "yes" : "no");
	    webPrintIntCell(ev->cdsCount);

	    /* Piece list: space separated start-end ranges. */
	    webPrintLinkCellRightStart();
	    for (i=0; i<ev->cdsCount; ++i)
		{
		int start = ev->cdsStarts[i];
		int end = start + ev->cdsSizes[i];
		printf("%d-%d ", start+1, end);
		}
	    webPrintLinkCellEnd();

	    /* Frame: comma separated, each piece start modulo 3, plus one. */
	    webPrintLinkCellRightStart();
	    for (i=0; i<ev->cdsCount; ++i)
	        {
		if (i>0) printf(",");
	        printf("%d", ev->cdsStarts[i]%3 + 1);
		}
	    webPrintLinkCellEnd();
	    webPrintLinkTableNewRow();
	    }
	sqlFreeResult(&sr);
	webPrintLinkTableEnd();
	printf("This table shows CDS predictions for this transcript from a number of "
	    "sources including alignments against UniProtKB proteins, alignments against Genbank "
	    "mRNAs with CDS regions annotated by the sequence submitter, and "
	    "Victor Solovyev's bestorf program. Each prediction is assigned an ad-hoc score. "
	    "The score is based on several factors including the quality of "
	    "any associated alignments, the quality of the source, and the length of the "
	    "prediction.  For RefSeq transcripts with annotated CDSs the ad-hoc score "
	    "is over a million unless there are severe problems mapping the mRNA to the "
	    "genome.  In other cases the score generally ranges from 0 to 50,000. "
	    "The highest scoring prediction in this table is used to define the CDS "
	    "boundaries for this transcript.<P>If no score is 2000 or more, the transcript "
	    "is considered non-coding. In cases where the CDS is subject to "
	    "nonsense-mediated decay the CDS is removed.  The CDS is also removed "
	    "from transcripts when evidence points to it being in an artifact of an "
	    "incompletely processed transcript.  Specifically if the CDS is entirely "
	    "enclosed in the 3' UTR or an intron of a refSeq or other high quality "
	    "transcript, the CDS is removed.");
	}
    else
        {
	printf("no significant CDS prediction found, likely %s is noncoding",
		geneName);
	}
    }
hFreeConn(&conn);
}