void txCdsRefBestEvOnly(char *inFile, char *outFile)
/* txCdsRefBestEvOnly - Go through a cdsEvidence file, and extract only the
 * bits that refer to the native orf for a RefSeqReviewed transcript.
 *   inFile  - cdsEvidence file to read.
 *   outFile - file that receives the selected records, tab-separated. */
{
    struct cdsEvidence *evList = cdsEvidenceLoadAll(inFile);
    struct hash *nativeEvHash = hashNew(18);
    FILE *f = mustOpen(outFile, "w");
    struct cdsEvidence *ev;

    /* Pass 1: remember, keyed by name, every RefSeqReviewed record whose
     * accession equals the text after the last '.' in its name. */
    for (ev = evList; ev != NULL; ev = ev->next) {
        if (!sameString(ev->source, "RefSeqReviewed"))
            continue;
        char *lastDot = strrchr(ev->name, '.');
        assert(lastDot != NULL);
        if (sameString(lastDot + 1, ev->accession))
            hashAdd(nativeEvHash, ev->name, ev);
    }

    /* Pass 2: write out every record (from any source) that shares a name
     * with a remembered native record and has the same start and end. */
    for (ev = evList; ev != NULL; ev = ev->next) {
        struct cdsEvidence *native = hashFindVal(nativeEvHash, ev->name);
        if (native == NULL)
            continue;
        if (ev->start == native->start && ev->end == native->end)
            cdsEvidenceTabOut(ev, f);
    }

    carefulClose(&f);
}
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile) /* txInfoAssemble - Assemble information from various sources into txInfo table.. */ { /* Build up hash of evidence keyed by transcript name. */ struct hash *cdsEvHash = hashNew(18); struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile); for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next) hashAddUnique(cdsEvHash, cdsEv->name, cdsEv); verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile); /* Build up hash of bestorf structures keyed by transcript name */ struct hash *predictHash = hashNew(18); struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile); for (predict = predictList; predict != NULL; predict = predict->next) hashAddUnique(predictHash, predict->name, predict); verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile); /* Build up structure for random access of retained introns */ struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6); verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile); struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList); /* Read in exception info. */ struct hash *selenocysteineHash, *altStartHash; genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash); /* Read in polyA sizes */ struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile); verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile); /* Read in psls */ struct hash *pslHash = hashNew(20); struct psl *psl, *pslList = pslLoadAll(pslFile); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(pslHash, psl->qName, psl); verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile); /* Read in accessions that we flipped for better splice sites. 
*/ struct hash *flipHash = hashWordsInFile(flipFile, 0); /* Open primary gene input and output. */ struct lineFile *lf = lineFileOpen(txBedFile, TRUE); FILE *f = mustOpen(outFile, "w"); /* Main loop - process each gene */ char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(3, "Processing %s\n", bed->name); /* Initialize info to zero */ struct txInfo info; ZeroVar(&info); /* Figure out name, sourceAcc, and isRefSeq from bed->name */ info.name = bed->name; info.category = "n/a"; if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL) { info.sourceAcc = cloneString(bed->name); } else { info.sourceAcc = txAccFromTempName(bed->name); } info.isRefSeq = startsWith("NM_", info.sourceAcc); if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL) { /* Fake up some things for antibody frag and CCDS that don't have alignments. */ info.sourceSize = bedTotalBlockSize(bed); info.aliCoverage = 1.0; info.aliIdRatio = 1.0; info. genoMapCount = 1; } else { /* Loop through all psl's associated with our RNA. Figure out * our overlap with each, and pick best one. */ struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc); if (firstPslHel == NULL) errAbort("%s is not in %s", info.sourceAcc, pslFile); int mapCount = 0; struct psl *psl, *bestPsl = NULL; int coverage, bestCoverage = 0; boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL); for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel)) { psl = hel->val; mapCount += 1; coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } /* If we flipped it, try it on the opposite strand too. */ if (isFlipped) { psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+'); coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } psl->strand[0] = (psl->strand[0] == '+' ? 
'-' : '+'); } } if (bestPsl == NULL) errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile); /* Figure out and save alignment statistics. */ int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0); info.sourceSize = bestPsl->qSize - polyA; info.aliCoverage = (double)bestCoverage / info.sourceSize; info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch); info. genoMapCount = mapCount; } /* Get orf size and start/end complete from cdsEv. */ if (bed->thickStart < bed->thickEnd) { cdsEv = hashFindVal(cdsEvHash, bed->name); if (cdsEv != NULL) { info.orfSize = cdsEv->end - cdsEv->start; info.startComplete = cdsEv->startComplete; info.endComplete = cdsEv->endComplete; } } /* Get score from prediction. */ predict = hashFindVal(predictHash, bed->name); if (predict != NULL) info.cdsScore = predict->score; /* Figure out nonsense-mediated-decay from bed itself. */ info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed); /* Figure out if retained intron from bed and alt-splice keeper hash */ info.retainedIntron = hasRetainedIntron(bed, altSpliceHash); info.strangeSplice = countStrangeSplices(bed, altSpliceHash); info.atacIntrons = countAtacIntrons(bed, altSpliceHash); info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash); /* Look up selenocysteine info. */ info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL); /* Loop through bed looking for small gaps indicative of frame shift/stop */ int i, lastBlock = bed->blockCount-1; int exonCount = 1; for (i=0; i < lastBlock; ++i) { int gapStart = bed->chromStarts[i] + bed->blockSizes[i]; int gapEnd = bed->chromStarts[i+1]; int gapSize = gapEnd - gapStart; switch (gapSize) { case 1: case 2: info.genomicFrameShift = TRUE; break; case 3: info.genomicStop = TRUE; break; default: exonCount += 1; break; } } info.exonCount = exonCount; /* Write info, free bed. 
*/ txInfoTabOut(&info, f); bedFree(&bed); } /* Clean up and go home. */ carefulClose(&f); }
void txCdsRepick(char *inputBed, char *inputTxg, char *inputCluster, char *inputInfo, char *inputCds, char *outputCds, char *outputPp) /* txCdsRepick - After we have clustered based on the preliminary coding * regions we can make a more intelligent choice here about the final coding * regions. */ { /* Read input bed into hash. Also calculate number with CDS set. */ struct hash *bedHash = hashNew(16); struct bed *bed, *bedList = bedLoadNAll(inputBed, 12); int txWithCdsCount = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) txWithCdsCount += 1; hashAdd(bedHash, bed->name, bed); } verbose(2, "Read %d beds from %s\n", bedHash->elCount, inputBed); /* Read input transcript graphs into list, and into a hash * keyed by transcript names. */ struct hash *graphHash = hashNew(16); struct txGraph *txg, *txgList = txGraphLoadAll(inputTxg); for (txg = txgList; txg != NULL; txg = txg->next) { int i; for (i=0; i<txg->sourceCount; ++i) hashAdd(graphHash, txg->sources[i].accession, txg); } verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList), graphHash->elCount, inputTxg); /* Read input protein cluster into list, and into a hash * keyed by transcript name */ struct hash *clusterHash = hashNew(16); struct txCluster *cluster, *clusterList = txClusterLoadAll(inputCluster); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { int i; for (i=0; i<cluster->txCount; ++i) hashAdd(clusterHash, cluster->txArray[i], cluster); } verbose(2, "Read %d protein clusters (%d transcripts) from %s\n", slCount(clusterList), clusterHash->elCount, inputCluster); /* Read in txInfo into a hash keyed by transcript name */ struct hash *infoHash = hashNew(16); struct txInfo *info, *infoList = txInfoLoadAll(inputInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, inputInfo); /* Read in input cds evidence into a hash 
keyed by transcript name * who's values are a sorted *list* of evidence. */ struct hash *evHash = hashNew(16); struct cdsEvidence *ev, *nextEv, *evList = cdsEvidenceLoadAll(inputCds); int evCount = 0; for (ev = evList; ev != NULL; ev = nextEv) { nextEv = ev->next; struct hashEl *hel = hashLookup(evHash, ev->name); if (hel == NULL) hel = hashAdd(evHash, ev->name, NULL); slAddTail(&hel->val, ev); ++evCount; } verbose(2, "Read %d pieces of cdsEvidence on %d transcripts from %s\n", evCount, evHash->elCount, inputCds); /* Create a hash containing what looks to be the best protein-coding * transcript in each protein cluster. This is keyed by cluster name * with transcript names for values. */ FILE *f = mustOpen(outputPp, "w"); struct hash *bestInClusterHash = hashNew(16); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { double bestScore = -BIGNUM; char *bestTx = NULL; int i; for (i=0; i<cluster->txCount; ++i) { char *tx = cluster->txArray[i]; info = hashMustFindVal(infoHash, tx); double score = infoCodingScore(info, TRUE); if (score > bestScore) { bestTx = tx; bestScore = score; } } hashAdd(bestInClusterHash, cluster->name, bestTx); fprintf(f, "%s\t%s\n", cluster->name, bestTx); } carefulClose(&f); verbose(2, "Picked best protein for each protein cluster\n"); /* Loop through each transcript cluster (graph). Make a list of * protein clusters associated with that graph. Armed with this * information call repick routine on each transcript in the graph. */ f = mustOpen(outputCds, "w"); for (txg = txgList; txg != NULL; txg = txg->next) { /* Build up list of protein clusters associated with transcript cluster. 
*/ struct slRef *protClusterRefList = NULL, *protClusterRef; int i; for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct txCluster *protCluster = hashFindVal(clusterHash, tx); if (protCluster != NULL) refAddUnique(&protClusterRefList, protCluster); } /* Figure out best scoring protein in RNA cluster, and set threshold * to eliminate ones scoring less than half this much. */ double bestProtScore = 0; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); bestProtScore = max(score, bestProtScore); } double protScoreThreshold = bestProtScore * 0.5; /* Get list of references to beds of proteins over that threshold. */ struct slRef *protRefList = NULL; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); if (score >= protScoreThreshold) { struct bed *bed = hashMustFindVal(bedHash, protTx); refAdd(&protRefList, bed); } } /* Go repick each CDS in RNA cluster */ for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct bed *bed = hashMustFindVal(bedHash, tx); struct cdsEvidence *evList = hashFindVal(evHash, tx); if (evList != NULL && bed->thickStart < bed->thickEnd) { info = hashMustFindVal(infoHash, bed->name); pickCompatableCds(bed, protRefList, evList, info, f); } } slFreeList(&protClusterRefList); } carefulClose(&f); verbose(1, "repicked %d, removed %d, no change to %d\n", pickedBetter, pickedNone, txWithCdsCount - pickedBetter - pickedNone); }