void txOrtho(char *inAgx, char *inChain, char *inNet, char *orthoAgx, char *outEdges) /* txOrtho - Produce list of shared edges between two transcription graphs in two species. */ { /* Load up input and create output file */ struct txGraph *inGraphList = txGraphLoadAll(inAgx); verbose(1, "Loaded %d input graphs in %s\n", slCount(inGraphList), inAgx); struct hash *chainHash = allChainsHash(inChain); verbose(1, "Read %d chains from %s\n", chainHash->elCount, inChain); struct hash *netHash = netToBkHash(inNet); verbose(1, "Read %d nets from %s\n", netHash->elCount, inNet); struct txGraph *orthoGraphList = txGraphLoadAll(orthoAgx); verbose(1, "Loaded %d ortho graphs in %s\n", slCount(orthoGraphList), orthoAgx); struct hash *orthoGraphHash = txgIntoKeeperHash(orthoGraphList); verbose(1, "%d ortho chromosomes/scaffolds\n", orthoGraphHash->elCount); FILE *f = mustOpen(outEdges, "w"); /* Loop through inGraphList. */ struct txGraph *inGraph; for (inGraph = inGraphList; inGraph != NULL; inGraph = inGraph->next) { verbose(2, "Processing %s %s:%d-%d strand %s\n", inGraph->name, inGraph->tName, inGraph->tStart, inGraph->tEnd, inGraph->strand); writeOrthoEdges(inGraph, chainHash, netHash, orthoGraphHash, f); } carefulClose(&f); }
void txgGoodEdges(char *inTxg, char *inWeights, char *asciiThreshold, char *outType, char *outEdges) /* txgGoodEdges - Get edges that are above a certain threshold.. */ { struct txGraph *txgList = txGraphLoadAll(inTxg); verbose(2, "LOaded %d txGraphs from %s\n", slCount(txgList), inTxg); struct hash *weightHash = hashWeights(inWeights); verbose(2, "Loaded %d weights from %s\n", weightHash->elCount, inWeights); double threshold = sqlDouble(asciiThreshold); verbose(2, "Threshold %f\n", threshold); struct txGraph *txg; FILE *f = mustOpen(outEdges, "w"); for (txg = txgList; txg != NULL; txg = txg->next) { verbose(2, "%s edgeCount %d\n", txg->name, txg->edgeCount); processOneGraph(txg, weightHash, threshold, outType, f); } carefulClose(&f); }
void txgTrim(char *inTxg, char *inWeights, char *asciiThreshold, char *outTxg) /* txgTrim - Trim out parts of txGraph that are not of sufficient weight.. */ { struct txGraph *txgList = txGraphLoadAll(inTxg); verbose(2, "LOaded %d txGraphs from %s\n", slCount(txgList), inTxg); struct hash *weightHash = hashWeights(inWeights); verbose(2, "Loaded %d weights from %s\n", weightHash->elCount, inWeights); double threshold = sqlDouble(asciiThreshold); verbose(2, "Threshold %f\n", threshold); struct txGraph *txg; FILE *f = mustOpen(outTxg, "w"); for (txg = txgList; txg != NULL; txg = txg->next) { verbose(2, "%s edgeCount %d\n", txg->name, txg->edgeCount); txGraphTrimOne(txg, weightHash, threshold); if (txg->edgeCount > 0) txGraphTabOut(txg, f); } carefulClose(&f); }
void txGeneCanonical(char *codingCluster, char *infoFile, char *noncodingGraph, char *genesBed, char *nearCoding, char *outCanonical, char *outIsoforms, char *outClusters) /* txGeneCanonical - Pick a canonical version of each gene - that is the form * to use when just interested in a single splicing varient. Produces final * transcript clusters as well. */ { /* Read in input into lists in memory. */ struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster); struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph); struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12); struct txInfo *info, *infoList = txInfoLoadAll(infoFile); struct bed *nearList = bedLoadNAll(nearCoding, 12); /* Make hash of all beds. */ struct hash *bedHash = hashNew(18); for (bed = bedList; bed != NULL; bed = bed->next) hashAdd(bedHash, bed->name, bed); /* Make has of all info. */ struct hash *infoHash = hashNew(18); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Make a binKeeper structure that we'll populate with coding genes. */ struct hash *sizeHash = minChromSizeFromBeds(bedList); struct hash *keeperHash = minChromSizeKeeperHash(sizeHash); /* Make list of coding genes and toss them into binKeeper. * This will eat up bed list, but bedHash is ok. */ struct gene *gene, *geneList = NULL; for (coding = codingList; coding != NULL; coding = coding->next) { gene = geneFromCluster(coding, bedHash, infoHash); slAddHead(&geneList, gene); struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom); binKeeperAdd(bk, gene->start, gene->end, gene); } /* Go through near-coding genes and add them to the coding gene * they most overlap. */ for (bed = nearList; bed != NULL; bed = nextBed) { nextBed = bed->next; gene = mostOverlappingGene(keeperHash, bed); if (gene == NULL) errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name); geneAddBed(gene, bed); } /* Add non-coding genes. */ for (graph = graphList; graph != NULL; graph = graph->next) { gene = geneFromGraph(graph, bedHash); slAddHead(&geneList, gene); } /* Sort so it all looks nicer. */ slSort(&geneList, geneCmp); /* Open up output files. */ FILE *fCan = mustOpen(outCanonical, "w"); FILE *fIso = mustOpen(outIsoforms, "w"); FILE *fClus = mustOpen(outClusters, "w"); /* Loop through, making up gene name, and writing output. */ int geneId = 0; for (gene = geneList; gene != NULL; gene = gene->next) { /* Make up name. */ char name[16]; safef(name, sizeof(name), "g%05d", ++geneId); /* Reverse transcript list just to make it look better. */ slReverse(&gene->txList); /* Write out canonical file output */ bed = hashMustFindVal(bedHash, gene->niceTx->name); fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n", bed->chrom, bed->chromStart, bed->chromEnd, geneId, gene->niceTx->name, gene->niceTx->name); /* Write out isoforms output. */ for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fIso, "%d\t%s\n", geneId, bed->name); /* Write out cluster output, starting with bed 6 standard fields. */ fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t", gene->chrom, gene->start, gene->end, name, 0, gene->strand); /* Write out thick-start/thick end. */ if (gene->isCoding) { int thickStart = gene->end, thickEnd = gene->start; for (bed = gene->txList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { thickStart = min(thickStart, bed->thickStart); thickEnd = max(thickEnd, bed->thickEnd); } } fprintf(fClus, "%d\t%d\t", thickStart, thickEnd); } else { fprintf(fClus, "%d\t%d\t", gene->start, gene->start); } /* We got no rgb value, just write out zero. */ fprintf(fClus, "0\t"); /* Get exons from exonTree. */ struct range *exon, *exonList = rangeTreeList(gene->exonTree); fprintf(fClus, "%d\t", slCount(exonList)); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->start - gene->start); fprintf(fClus, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->end - exon->start); fprintf(fClus, "\t"); /* Write out associated transcripts. */ fprintf(fClus, "%d\t", slCount(gene->txList)); for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fClus, "%s,", bed->name); fprintf(fClus, "\t"); /* Write out nice value */ fprintf(fClus, "%s\t", gene->niceTx->name); /* Write out coding/noncoding value. */ fprintf(fClus, "%d\n", gene->isCoding); } /* Close up files. */ carefulClose(&fCan); carefulClose(&fIso); carefulClose(&fClus); }
void txCdsRepick(char *inputBed, char *inputTxg, char *inputCluster, char *inputInfo, char *inputCds, char *outputCds, char *outputPp) /* txCdsRepick - After we have clustered based on the preliminary coding * regions we can make a more intelligent choice here about the final coding * regions. */ { /* Read input bed into hash. Also calculate number with CDS set. */ struct hash *bedHash = hashNew(16); struct bed *bed, *bedList = bedLoadNAll(inputBed, 12); int txWithCdsCount = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) txWithCdsCount += 1; hashAdd(bedHash, bed->name, bed); } verbose(2, "Read %d beds from %s\n", bedHash->elCount, inputBed); /* Read input transcript graphs into list, and into a hash * keyed by transcript names. */ struct hash *graphHash = hashNew(16); struct txGraph *txg, *txgList = txGraphLoadAll(inputTxg); for (txg = txgList; txg != NULL; txg = txg->next) { int i; for (i=0; i<txg->sourceCount; ++i) hashAdd(graphHash, txg->sources[i].accession, txg); } verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList), graphHash->elCount, inputTxg); /* Read input protein cluster into list, and into a hash * keyed by transcript name */ struct hash *clusterHash = hashNew(16); struct txCluster *cluster, *clusterList = txClusterLoadAll(inputCluster); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { int i; for (i=0; i<cluster->txCount; ++i) hashAdd(clusterHash, cluster->txArray[i], cluster); } verbose(2, "Read %d protein clusters (%d transcripts) from %s\n", slCount(clusterList), clusterHash->elCount, inputCluster); /* Read in txInfo into a hash keyed by transcript name */ struct hash *infoHash = hashNew(16); struct txInfo *info, *infoList = txInfoLoadAll(inputInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, inputInfo); /* Read in input cds evidence into a hash keyed by transcript name * who's values are a sorted *list* of evidence. */ struct hash *evHash = hashNew(16); struct cdsEvidence *ev, *nextEv, *evList = cdsEvidenceLoadAll(inputCds); int evCount = 0; for (ev = evList; ev != NULL; ev = nextEv) { nextEv = ev->next; struct hashEl *hel = hashLookup(evHash, ev->name); if (hel == NULL) hel = hashAdd(evHash, ev->name, NULL); slAddTail(&hel->val, ev); ++evCount; } verbose(2, "Read %d pieces of cdsEvidence on %d transcripts from %s\n", evCount, evHash->elCount, inputCds); /* Create a hash containing what looks to be the best protein-coding * transcript in each protein cluster. This is keyed by cluster name * with transcript names for values. */ FILE *f = mustOpen(outputPp, "w"); struct hash *bestInClusterHash = hashNew(16); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { double bestScore = -BIGNUM; char *bestTx = NULL; int i; for (i=0; i<cluster->txCount; ++i) { char *tx = cluster->txArray[i]; info = hashMustFindVal(infoHash, tx); double score = infoCodingScore(info, TRUE); if (score > bestScore) { bestTx = tx; bestScore = score; } } hashAdd(bestInClusterHash, cluster->name, bestTx); fprintf(f, "%s\t%s\n", cluster->name, bestTx); } carefulClose(&f); verbose(2, "Picked best protein for each protein cluster\n"); /* Loop through each transcript cluster (graph). Make a list of * protein clusters associated with that graph. Armed with this * information call repick routine on each transcript in the graph. */ f = mustOpen(outputCds, "w"); for (txg = txgList; txg != NULL; txg = txg->next) { /* Build up list of protein clusters associated with transcript cluster. */ struct slRef *protClusterRefList = NULL, *protClusterRef; int i; for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct txCluster *protCluster = hashFindVal(clusterHash, tx); if (protCluster != NULL) refAddUnique(&protClusterRefList, protCluster); } /* Figure out best scoring protein in RNA cluster, and set threshold * to eliminate ones scoring less than half this much. */ double bestProtScore = 0; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); bestProtScore = max(score, bestProtScore); } double protScoreThreshold = bestProtScore * 0.5; /* Get list of references to beds of proteins over that threshold. */ struct slRef *protRefList = NULL; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); if (score >= protScoreThreshold) { struct bed *bed = hashMustFindVal(bedHash, protTx); refAdd(&protRefList, bed); } } /* Go repick each CDS in RNA cluster */ for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct bed *bed = hashMustFindVal(bedHash, tx); struct cdsEvidence *evList = hashFindVal(evHash, tx); if (evList != NULL && bed->thickStart < bed->thickEnd) { info = hashMustFindVal(infoHash, bed->name); pickCompatableCds(bed, protRefList, evList, info, f); } } slFreeList(&protClusterRefList); } carefulClose(&f); verbose(1, "repicked %d, removed %d, no change to %d\n", pickedBetter, pickedNone, txWithCdsCount - pickedBetter - pickedNone); }