struct hash *loadAndWeighTce(char *fileName, struct hash *refToPepHash, struct hash *pepToRefHash) /* Load transcript cds evidence from file into hash of * txCdsInfo. */ { /* Read all tce's in file into list and hash of txCdsInfo. */ struct txCdsInfo *tx, *txList = NULL; struct lineFile *lf = lineFileOpen(fileName, TRUE); struct hash *hash = hashNew(18); char *row[CDSEVIDENCE_NUM_COLS]; while (lineFileRow(lf, row)) { /* Convert row to cdsEvidence structure. */ struct cdsEvidence *cds = cdsEvidenceLoad(row); char *acc = txAccFromTempName(cds->name); tx = hashFindVal(hash, cds->name); if (tx == NULL) { AllocVar(tx); hashAddSaveName(hash, cds->name, tx, &tx->name); slAddHead(&txList, tx); } /* Track whether it's refSeq, and the associated protein. */ char *refSeqAcc = NULL, *refPepAcc = NULL; refPepAcc = hashFindVal(refToPepHash, acc); refSeqAcc = hashFindVal(pepToRefHash, acc); if (refPepAcc != NULL && refSeqAcc == NULL) refSeqAcc = acc; if (refSeqAcc != NULL && refPepAcc == NULL) refPepAcc = acc; /* If we are refSeq, bump our score for matches to our own * bits by a huge factor. */ if (refPepAcc != NULL && startsWith("RefPep", cds->source) && sameString(cds->accession, refPepAcc)) cds->score += refSeqWeight; if (refSeqAcc != NULL && startsWith("RefSeq", cds->source) && sameString(cds->accession, refSeqAcc)) cds->score += refSeqWeight + 100; /* If we are CCDS then that's great too. */ if (sameString("ccds", cds->source)) cds->score += ccdsWeight; /* If we are txCdsPredict bump weight too. Only RefSeq and txCdsPredict * can actually possibly make it over real threshold. */ if (sameString("txCdsPredict", cds->source)) cds->score += txCdsPredictWeight; slAddHead(&tx->cdsList, cds); } lineFileClose(&lf); slReverse(&txList); /* Sort all cdsLists by score. */ for (tx = txList; tx != NULL; tx = tx->next) slSort(&tx->cdsList, cdsEvidenceCmpScore); return hash; }
void showCdsEvidence(char *geneName, struct trackDb *tdb, char *evTable) /* Print out stuff from cdsEvidence table. */ { struct sqlConnection *conn = hAllocConn(database); double bestScore = 0; if (sqlTableExists(conn, evTable)) { webNewSection("CDS Prediction Information"); char query[512]; sqlSafef(query, sizeof(query), "select count(*) from %s where name='%s'", evTable, geneName); if (sqlQuickNum(conn, query) > 0) { sqlSafef(query, sizeof(query), "select * from %s where name='%s' order by score desc", evTable, geneName); struct sqlResult *sr = sqlGetResult(conn, query); char **row; webPrintLinkTableStart(); webPrintLabelCell("ORF<BR>size"); webPrintLabelCell("start in<BR>transcript"); webPrintLabelCell("end in<BR>transcript"); webPrintLabelCell("source"); webPrintLabelCell("accession"); webPrintLabelCell("ad-hoc<BR>score"); webPrintLabelCell("start<BR>codon"); webPrintLabelCell("end<BR>codon"); webPrintLabelCell("piece<BR>count"); webPrintLabelCell("piece list"); webPrintLabelCell("frame"); webPrintLinkTableNewRow(); while ((row = sqlNextRow(sr)) != NULL) { struct cdsEvidence *ev = cdsEvidenceLoad(row); webPrintIntCell(ev->end - ev->start); int i; webPrintIntCell(ev->start+1); webPrintIntCell(ev->end); webPrintLinkCell(ev->source); webPrintLinkCell(ev->accession); webPrintLinkCellRightStart(); printf("%3.2f", ev->score); bestScore = max(ev->score, bestScore); webPrintLinkCellEnd(); webPrintLinkCell(ev->startComplete ? "yes" : "no"); webPrintLinkCell(ev->endComplete ? "yes" : "no"); webPrintIntCell(ev->cdsCount); webPrintLinkCellRightStart(); for (i=0; i<ev->cdsCount; ++i) { int start = ev->cdsStarts[i]; int end = start + ev->cdsSizes[i]; printf("%d-%d ", start+1, end); } webPrintLinkCellEnd(); webPrintLinkCellRightStart(); for (i=0; i<ev->cdsCount; ++i) { if (i>0) printf(","); printf("%d", ev->cdsStarts[i]%3 + 1); } webPrintLinkCellEnd(); webPrintLinkTableNewRow(); } sqlFreeResult(&sr); webPrintLinkTableEnd(); printf("This table shows CDS predictions for this transcript from a number of " "sources including alignments against UniProtKB proteins, alignments against Genbank " "mRNAs with CDS regions annotated by the sequence submitter, and " "Victor Solovyev's bestorf program. Each prediction is assigned an ad-hoc score " "score is based on several factors including the quality of " "any associated alignments, the quality of the source, and the length of the " "prediction. For RefSeq transcripts with annotated CDSs the ad-hoc score " "is over a million unless there are severe problems mapping the mRNA to the " "genome. In other cases the score generally ranges from 0 to 50,000. " "The highest scoring prediction in this table is used to define the CDS " "boundaries for this transcript.<P>If no score is 2000 or more, the transcript " "is considered non-coding. In cases where the CDS is subject to " "nonsense-mediated decay the CDS is removed. The CDS is also removed " "from transcripts when evidence points to it being in an artifact of an " "incompletely processed transcript. Specifically if the CDS is entirely " "enclosed in the 3' UTR or an intron of a refSeq or other high quality " "transcript, the CDS is removed."); } else { printf("no significant CDS prediction found, likely %s is noncoding", geneName); } } hFreeConn(&conn); }