void txGeneColor(char *uniProtDb, char *infoFile, char *pickFile, char *outFile) /* txGeneColor - Figure out color to draw gene in.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } /* Open uniprot database connection. */ struct sqlConnection *uConn = sqlConnect(uniProtDb); #ifdef OLD /* Figure out our light and medium colors. */ mediumBlue.r = (6*trueBlue.r + 4*255)/10; mediumBlue.g = (6*trueBlue.g + 4*255)/10; mediumBlue.b = (6*trueBlue.b + 4*255)/10; lightBlue.r = (1*trueBlue.r + 2*255)/3; lightBlue.g = (1*trueBlue.g + 2*255)/3; lightBlue.b = (1*trueBlue.b + 2*255)/3; #endif /* OLD */ /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { struct rgbColor *col; pick = hashFindVal(pickHash, info->name); if (pick != NULL) { char *source = pick->source; if (sameString(source, "RefPepValidated")) col = &trueBlue; else if (sameString(source, "ccds")) col = &trueBlue; else if (sameString(source, "RefPepReviewed")) col = &trueBlue; else if (sameString(source, "RefSeqValidated")) col = &trueBlue; else if (sameString(source, "RefSeqReviewed")) col = &trueBlue; else if (sameString(source, "swissProt")) col = &trueBlue; else if (startsWith("Ref", source)) col = &mediumBlue; else col = &lightBlue; if (pick->swissProt[0] != 0) { char *acc = spLookupPrimaryAcc(uConn, pick->swissProt); struct slName *pdbList = spPdbAccs(uConn, acc); if (pdbList != NULL) col = &black; slFreeList(&pdbList); } } else col = &lightBlue; fprintf(f, "%s\t%d\t%d\t%d\n", info->name, col->r, col->g, col->b); } carefulClose(&f); }
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome) /* txGeneCdsMap - Create mapping between CDS region of gene and genome. */ { /* Load info into hash. */ struct hash *infoHash = hashNew(18); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } lineFileClose(&lf); /* Load refPep/tx alignments into hash keyed by tx. */ struct hash *refPslHash = hashNew(18); struct psl *psl, *pslList = pslLoadAll(refPepToTxPsl); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(refPslHash, psl->tName, psl); struct hash *refToPepHash = hashTwoColumnFile(refToPepTab); struct hash *chromSizeHash = hashNameIntFile(chromSizes); /* Load in bed. */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Open output, and stream through bedList, writing output. */ FILE *fCdsToRna = mustOpen(cdsToRna, "w"); FILE *fRnaToGenome = mustOpen(rnaToGenome, "w"); int refTotal = 0, refFound = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { char *chrom = bed->chrom; int chromSize = hashIntVal(chromSizeHash, chrom); info = hashMustFindVal(infoHash, bed->name); pick = hashMustFindVal(pickHash, bed->name); if (info->isRefSeq) { char *refAcc = txAccFromTempName(bed->name); if (!startsWith("NM_", refAcc)) errAbort("Don't think I did find that refSeq acc, got %s", refAcc); char *protAcc = hashMustFindVal(refToPepHash, refAcc); ++refTotal; if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna)) ++refFound; } else { fakeCdsToMrna(bed, fCdsToRna); } fakeRnaToGenome(bed, chromSize, fRnaToGenome); } } verbose(1, "Missed %d of %d refSeq protein mappings. A small number of RefSeqs just map\n" "to genome in the UTR.\n", refTotal - refFound, refTotal); carefulClose(&fCdsToRna); carefulClose(&fRnaToGenome); }
void txGeneSeparateNoncoding(char *inBed, char *inInfo, char *outCoding, char *outNearCoding, char *outNearCodingJunk, char *outAntisense, char *outNoncoding, char *outInfo) /* txGeneSeparateNoncoding - Separate genes into four piles - coding, * non-coding that overlap coding, antisense to coding, and independent non-coding. */ { /* Read in txInfo into a hash keyed by transcript name */ struct hash *infoHash = hashNew(16); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, inInfo); /* Read in bed, and sort so we can process it easily a * strand of one chromosome at a time. */ struct bed *inBedList = bedLoadNAll(inBed, 12); slSort(&inBedList, bedCmpChromStrandStart); /* Open up output files. */ FILE *fCoding = mustOpen(outCoding, "w"); FILE *fNearCoding = mustOpen(outNearCoding, "w"); FILE *fNearCodingJunk = mustOpen(outNearCodingJunk, "w"); FILE *fNoncoding = mustOpen(outNoncoding, "w"); FILE *fAntisense = mustOpen(outAntisense, "w"); /* Go through input one chromosome strand at a time. */ struct chrom *chrom, *chromList = chromsForBeds(inBedList); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { verbose(2, "chrom %s\n", chrom->name); /* Do the separation. */ struct bed *codingList, *nearCodingList, *nearCodingJunkList, *antisenseList, *noncodingList; separateChrom(chrom, infoHash, &codingList, &nearCodingList, &nearCodingJunkList, &antisenseList, &noncodingList); verbose(2, "%d coding, %d near, %d anti, %d non\n", slCount(codingList), slCount(nearCodingList), slCount(antisenseList), slCount(noncodingList)); /* Write lists to respective files. */ writeBedList(codingList, fCoding); writeBedList(nearCodingList, fNearCoding); writeBedList(nearCodingJunkList, fNearCodingJunk); writeBedList(antisenseList, fAntisense); writeBedList(noncodingList, fNoncoding); } carefulClose(&fCoding); carefulClose(&fNearCoding); carefulClose(&fNearCodingJunk); carefulClose(&fNoncoding); carefulClose(&fAntisense); verbose(1, "coding %d, codingJunk %d, nearCoding %d, junk %d, antisense %d, noncoding %d\n", codingCount, codingJunkCount, nearCodingCount, junkCount, antisenseCount, noncodingCount); /* Write out updated info file */ FILE *f = mustOpen(outInfo, "w"); for (info = infoList; info != NULL; info = info->next) { txInfoTabOut(info, f); } carefulClose(&f); }
void txGeneCanonical(char *codingCluster, char *infoFile, char *noncodingGraph, char *genesBed, char *nearCoding, char *outCanonical, char *outIsoforms, char *outClusters) /* txGeneCanonical - Pick a canonical version of each gene - that is the form * to use when just interested in a single splicing varient. Produces final * transcript clusters as well. */ { /* Read in input into lists in memory. */ struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster); struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph); struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12); struct txInfo *info, *infoList = txInfoLoadAll(infoFile); struct bed *nearList = bedLoadNAll(nearCoding, 12); /* Make hash of all beds. */ struct hash *bedHash = hashNew(18); for (bed = bedList; bed != NULL; bed = bed->next) hashAdd(bedHash, bed->name, bed); /* Make has of all info. */ struct hash *infoHash = hashNew(18); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Make a binKeeper structure that we'll populate with coding genes. */ struct hash *sizeHash = minChromSizeFromBeds(bedList); struct hash *keeperHash = minChromSizeKeeperHash(sizeHash); /* Make list of coding genes and toss them into binKeeper. * This will eat up bed list, but bedHash is ok. */ struct gene *gene, *geneList = NULL; for (coding = codingList; coding != NULL; coding = coding->next) { gene = geneFromCluster(coding, bedHash, infoHash); slAddHead(&geneList, gene); struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom); binKeeperAdd(bk, gene->start, gene->end, gene); } /* Go through near-coding genes and add them to the coding gene * they most overlap. */ for (bed = nearList; bed != NULL; bed = nextBed) { nextBed = bed->next; gene = mostOverlappingGene(keeperHash, bed); if (gene == NULL) errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name); geneAddBed(gene, bed); } /* Add non-coding genes. */ for (graph = graphList; graph != NULL; graph = graph->next) { gene = geneFromGraph(graph, bedHash); slAddHead(&geneList, gene); } /* Sort so it all looks nicer. */ slSort(&geneList, geneCmp); /* Open up output files. */ FILE *fCan = mustOpen(outCanonical, "w"); FILE *fIso = mustOpen(outIsoforms, "w"); FILE *fClus = mustOpen(outClusters, "w"); /* Loop through, making up gene name, and writing output. */ int geneId = 0; for (gene = geneList; gene != NULL; gene = gene->next) { /* Make up name. */ char name[16]; safef(name, sizeof(name), "g%05d", ++geneId); /* Reverse transcript list just to make it look better. */ slReverse(&gene->txList); /* Write out canonical file output */ bed = hashMustFindVal(bedHash, gene->niceTx->name); fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n", bed->chrom, bed->chromStart, bed->chromEnd, geneId, gene->niceTx->name, gene->niceTx->name); /* Write out isoforms output. */ for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fIso, "%d\t%s\n", geneId, bed->name); /* Write out cluster output, starting with bed 6 standard fields. */ fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t", gene->chrom, gene->start, gene->end, name, 0, gene->strand); /* Write out thick-start/thick end. */ if (gene->isCoding) { int thickStart = gene->end, thickEnd = gene->start; for (bed = gene->txList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { thickStart = min(thickStart, bed->thickStart); thickEnd = max(thickEnd, bed->thickEnd); } } fprintf(fClus, "%d\t%d\t", thickStart, thickEnd); } else { fprintf(fClus, "%d\t%d\t", gene->start, gene->start); } /* We got no rgb value, just write out zero. */ fprintf(fClus, "0\t"); /* Get exons from exonTree. */ struct range *exon, *exonList = rangeTreeList(gene->exonTree); fprintf(fClus, "%d\t", slCount(exonList)); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->start - gene->start); fprintf(fClus, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->end - exon->start); fprintf(fClus, "\t"); /* Write out associated transcripts. */ fprintf(fClus, "%d\t", slCount(gene->txList)); for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fClus, "%s,", bed->name); fprintf(fClus, "\t"); /* Write out nice value */ fprintf(fClus, "%s\t", gene->niceTx->name); /* Write out coding/noncoding value. */ fprintf(fClus, "%d\n", gene->isCoding); } /* Close up files. */ carefulClose(&fCan); carefulClose(&fIso); carefulClose(&fClus); }
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, char *evFile, char *outFile) /* txGeneXref - Make kgXref type table for genes.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct hash *geneToProtHash = makeGeneToProtHash(genePredFile); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); removePickVersions(pick); hashAdd(pickHash, pick->name, pick); } /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { char *kgID = info->name; char *mRNA = ""; char *spID = ""; char *spDisplayID = ""; char *geneSymbol = NULL; char *refseq = ""; char *protAcc = ""; char *description = NULL; char query[256]; char *proteinId = hashMustFindVal(geneToProtHash, info->name); boolean isAb = sameString(info->category, "antibodyParts"); pick = hashFindVal(pickHash, info->name); ev = hashFindVal(evHash, info->name); if (pick != NULL) { /* Fill in the relatively straightforward fields. */ refseq = pick->refSeq; if (info->orfSize > 0) { protAcc = pick->refProt; spID = proteinId; if (sameString(protAcc, spID)) spID = pick->uniProt; if (spID[0] != 0) spDisplayID = spAnyAccToId(uConn, spID); } /* Fill in gene symbol and description from refseq if possible. */ if (refseq[0] != 0) { struct sqlResult *sr; safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'", refseq); sr = sqlGetResult(gConn, query); char **row = sqlNextRow(sr); if (row != NULL) { geneSymbol = cloneString(row[0]); if (!sameWord("unknown protein", row[1])) description = cloneString(row[1]); } sqlFreeResult(&sr); } /* If need be try uniProt for gene symbol and description. */ if (spID[0] != 0 && (geneSymbol == NULL || description == NULL)) { char *acc = spLookupPrimaryAcc(uConn, spID); if (description == NULL) description = spDescription(uConn, acc); if (geneSymbol == NULL) { struct slName *nameList = spGenes(uConn, acc); if (nameList != NULL) geneSymbol = cloneString(nameList->name); slFreeList(&nameList); } } } /* If it's an antibody fragment use that as name. */ if (isAb) { geneSymbol = cloneString("abParts"); description = cloneString("Parts of antibodies, mostly variable regions."); isAb = TRUE; } if (ev == NULL) { mRNA = cloneString(""); if (!isAb) { errAbort("%s is %s but not %s\n", info->name, infoFile, evFile); } } else { mRNA = cloneString(ev->primary); chopSuffix(mRNA); } /* Still no joy? Try genbank RNA records. */ if (geneSymbol == NULL || description == NULL) { if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { char *acc = ev->accs[i]; chopSuffix(acc); if (geneSymbol == NULL) { safef(query, sizeof(query), "select geneName.name from gbCdnaInfo,geneName " "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc); geneSymbol = sqlQuickString(gConn, query); if (geneSymbol != NULL) { if (sameString(geneSymbol, "n/a")) geneSymbol = NULL; } } if (description == NULL) { safef(query, sizeof(query), "select description.name from gbCdnaInfo,description " "where description.id=gbCdnaInfo.description " "and gbCdnaInfo.acc = '%s'", acc); description = sqlQuickString(gConn, query); if (description != NULL) { if (sameString(description, "n/a")) description = NULL; } } } } } if (geneSymbol == NULL) geneSymbol = mRNA; if (description == NULL) description = mRNA; /* Get rid of some characters that will cause havoc downstream. */ stripChar(geneSymbol, '\''); subChar(geneSymbol, '<', '['); subChar(geneSymbol, '>', ']'); /* Abbreviate geneSymbol if too long */ if (strlen(geneSymbol) > 40) strcpy(geneSymbol+37, "..."); fprintf(f, "%s\t", kgID); fprintf(f, "%s\t", mRNA); fprintf(f, "%s\t", spID); fprintf(f, "%s\t", spDisplayID); fprintf(f, "%s\t", geneSymbol); fprintf(f, "%s\t", refseq); fprintf(f, "%s\t", protAcc); fprintf(f, "%s\n", description); } carefulClose(&f); }
void txCdsRepick(char *inputBed, char *inputTxg, char *inputCluster, char *inputInfo, char *inputCds, char *outputCds, char *outputPp) /* txCdsRepick - After we have clustered based on the preliminary coding * regions we can make a more intelligent choice here about the final coding * regions. */ { /* Read input bed into hash. Also calculate number with CDS set. */ struct hash *bedHash = hashNew(16); struct bed *bed, *bedList = bedLoadNAll(inputBed, 12); int txWithCdsCount = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) txWithCdsCount += 1; hashAdd(bedHash, bed->name, bed); } verbose(2, "Read %d beds from %s\n", bedHash->elCount, inputBed); /* Read input transcript graphs into list, and into a hash * keyed by transcript names. */ struct hash *graphHash = hashNew(16); struct txGraph *txg, *txgList = txGraphLoadAll(inputTxg); for (txg = txgList; txg != NULL; txg = txg->next) { int i; for (i=0; i<txg->sourceCount; ++i) hashAdd(graphHash, txg->sources[i].accession, txg); } verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList), graphHash->elCount, inputTxg); /* Read input protein cluster into list, and into a hash * keyed by transcript name */ struct hash *clusterHash = hashNew(16); struct txCluster *cluster, *clusterList = txClusterLoadAll(inputCluster); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { int i; for (i=0; i<cluster->txCount; ++i) hashAdd(clusterHash, cluster->txArray[i], cluster); } verbose(2, "Read %d protein clusters (%d transcripts) from %s\n", slCount(clusterList), clusterHash->elCount, inputCluster); /* Read in txInfo into a hash keyed by transcript name */ struct hash *infoHash = hashNew(16); struct txInfo *info, *infoList = txInfoLoadAll(inputInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, inputInfo); /* Read in input cds evidence into a hash keyed by transcript name * who's values are a sorted *list* of evidence. */ struct hash *evHash = hashNew(16); struct cdsEvidence *ev, *nextEv, *evList = cdsEvidenceLoadAll(inputCds); int evCount = 0; for (ev = evList; ev != NULL; ev = nextEv) { nextEv = ev->next; struct hashEl *hel = hashLookup(evHash, ev->name); if (hel == NULL) hel = hashAdd(evHash, ev->name, NULL); slAddTail(&hel->val, ev); ++evCount; } verbose(2, "Read %d pieces of cdsEvidence on %d transcripts from %s\n", evCount, evHash->elCount, inputCds); /* Create a hash containing what looks to be the best protein-coding * transcript in each protein cluster. This is keyed by cluster name * with transcript names for values. */ FILE *f = mustOpen(outputPp, "w"); struct hash *bestInClusterHash = hashNew(16); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { double bestScore = -BIGNUM; char *bestTx = NULL; int i; for (i=0; i<cluster->txCount; ++i) { char *tx = cluster->txArray[i]; info = hashMustFindVal(infoHash, tx); double score = infoCodingScore(info, TRUE); if (score > bestScore) { bestTx = tx; bestScore = score; } } hashAdd(bestInClusterHash, cluster->name, bestTx); fprintf(f, "%s\t%s\n", cluster->name, bestTx); } carefulClose(&f); verbose(2, "Picked best protein for each protein cluster\n"); /* Loop through each transcript cluster (graph). Make a list of * protein clusters associated with that graph. Armed with this * information call repick routine on each transcript in the graph. */ f = mustOpen(outputCds, "w"); for (txg = txgList; txg != NULL; txg = txg->next) { /* Build up list of protein clusters associated with transcript cluster. */ struct slRef *protClusterRefList = NULL, *protClusterRef; int i; for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct txCluster *protCluster = hashFindVal(clusterHash, tx); if (protCluster != NULL) refAddUnique(&protClusterRefList, protCluster); } /* Figure out best scoring protein in RNA cluster, and set threshold * to eliminate ones scoring less than half this much. */ double bestProtScore = 0; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); bestProtScore = max(score, bestProtScore); } double protScoreThreshold = bestProtScore * 0.5; /* Get list of references to beds of proteins over that threshold. */ struct slRef *protRefList = NULL; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); if (score >= protScoreThreshold) { struct bed *bed = hashMustFindVal(bedHash, protTx); refAdd(&protRefList, bed); } } /* Go repick each CDS in RNA cluster */ for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct bed *bed = hashMustFindVal(bedHash, tx); struct cdsEvidence *evList = hashFindVal(evHash, tx); if (evList != NULL && bed->thickStart < bed->thickEnd) { info = hashMustFindVal(infoHash, bed->name); pickCompatableCds(bed, protRefList, evList, info, f); } } slFreeList(&protClusterRefList); } carefulClose(&f); verbose(1, "repicked %d, removed %d, no change to %d\n", pickedBetter, pickedNone, txWithCdsCount - pickedBetter - pickedNone); }