void gensatFixFull(char *captionFile) /* Fix missing captions. */ { struct lineFile *lf = lineFileOpen(captionFile, TRUE); char *row[2]; struct dyString *sql = dyStringNew(0); struct sqlConnection *conn = sqlConnect(database); struct hash *capHash = newHash(16); while (lineFileRowTab(lf, row)) { int captionId; char *submitId = row[0]; char *caption = row[1]; captionId = hashIntValDefault(capHash, caption, 0); if (captionId == 0) { dyStringClear(sql); dyStringAppend(sql, "insert into caption values(default, \""); dyStringAppend(sql, caption); dyStringAppend(sql, "\")"); sqlUpdate(conn, sql->string); verbose(1, "%s\n", sql->string); captionId = sqlLastAutoId(conn); hashAddInt(capHash, caption, captionId); } dyStringClear(sql); dyStringPrintf(sql, "update imageFile set caption=%d ", captionId); dyStringPrintf(sql, "where submissionSet=%d ", gensatId); dyStringPrintf(sql, "and submitId = \"%s\"", submitId); sqlUpdate(conn, sql->string); verbose(1, "%s\n", sql->string); } dyStringFree(&sql); }
void scopCollapse(char *inFeat, char *inModel, char *outFeat, char *outDesc, char *outKnownTo) /* scopCollapse - Convert SCOP model to SCOP ID. Also make id/name converter file.. */ { /* Process inModel file, writing three columns to output, and keeping * a couple of columns in a hash */ struct hash *modelToSeed = hashNew(18); struct hash *seedToScop = hashNew(16); struct lineFile *lf = lineFileOpen(inModel, TRUE); FILE *f = mustOpen(outDesc, "w"); char *modRow[5]; while (lineFileRowTab(lf, modRow)) { char *seedId = modRow[2]; hashAdd(modelToSeed, modRow[0], cloneString(seedId) ); if (!hashLookup(seedToScop, seedId)) { char *scopId = modRow[1]; hashAdd(seedToScop, seedId, cloneString(scopId)); fprintf(f, "%s\t%s\t%s\n", scopId, seedId, modRow[4]); } } carefulClose(&f); lineFileClose(&lf); /* Process in-feature. We make up a structure for each protein here. */ struct hash *protHash = hashNew(18); struct protInfo *prot, *protList = NULL; lf = lineFileOpen(inFeat, TRUE); char *featRow[6]; while (lineFileRow(lf, featRow)) { prot = hashFindVal(protHash, featRow[0]); if (prot == NULL) { AllocVar(prot); hashAddSaveName(protHash, featRow[0], prot, &prot->name); slAddHead(&protList, prot); } struct protFeature *feature; AllocVar(feature); feature->protein = prot->name; feature->start = lineFileNeedNum(lf, featRow, 1); feature->end = lineFileNeedNum(lf, featRow, 2); feature->name = hashMustFindVal(modelToSeed, featRow[3]); feature->eVal = lineFileNeedDouble(lf, featRow, 4); feature->score = lineFileNeedDouble(lf, featRow, 5); slAddHead(&prot->featureList, feature); } lineFileClose(&lf); slReverse(&protList); f = mustOpen(outFeat, "w"); FILE *fKnownTo = mustOpen(outKnownTo, "w"); for (prot = protList; prot != NULL; prot = prot->next) outputProt(prot, seedToScop, f, fKnownTo); carefulClose(&f); carefulClose(&fKnownTo); }
struct hash *readCsizeHash(char *filename)
/* Load a chrom sizes file: two tab-separated columns, name then size.
 * Returns a hash of integer sizes keyed by the first column. */
{
struct lineFile *in = lineFileOpen(filename, TRUE);
struct hash *sizes = hashNew(10);
char *cols[2];

while (lineFileRowTab(in, cols))
    {
    char *chrom = cols[0];
    int size = sqlSigned(cols[1]);
    hashAddInt(sizes, chrom, size);
    }
lineFileClose(&in);
return sizes;
}
struct hash *makeGeneToProtHash(char *fileName)
/* Build a hash mapping gene name (column 1) to protein name (column 11).
 * The input is expected to be an extended gene pred file; the stored
 * protein names are private copies. */
{
struct hash *geneToProt = newHash(18);
char *cols[11];
struct lineFile *in = lineFileOpen(fileName, TRUE);

while (lineFileRowTab(in, cols))
    {
    char *gene = cols[0];
    char *protein = cloneString(cols[10]);
    hashAdd(geneToProt, gene, protein);
    }
lineFileClose(&in);
return geneToProt;
}
void hprdP2p(char *hprdBinaryPPI, char *hprdComplexes, char *outTab)
/* hprdP2p - Create hprd.p2p tab file from HPRD flat files for use with hgNetDist.
 * Binary interactions: columns 2 and 5 of each line are written as a pair
 * with the fixed weight 1.0.
 * Complexes: consecutive lines sharing the same complex id (column 1) are
 * collected, skipping members whose HPRD id (column 2) is "None", and each
 * collected group is handed to iterateComplex for output. */
{
FILE *f = mustOpen(outTab, "w");
char *row[8];
char *row2[6];
char *ids[100];
int maxIds = (int)(sizeof(ids) / sizeof(ids[0]));

struct lineFile *lf = lineFileOpen(hprdBinaryPPI, TRUE);
while (lineFileRowTab(lf, row))
    {
    char *hprdId1 = row[1];
    char *hprdId2 = row[4];
    fprintf(f,"%s\t%s\t1.0\n",hprdId1,hprdId2);
    }
lineFileClose(&lf);

lf = lineFileOpen(hprdComplexes, TRUE);
/* Row fields point into the line reader's buffer, so anything kept across
 * iterations must be a private copy.  That is why the ids are cloned, and
 * the remembered complex id needs the same treatment. */
char *lastComplex = cloneString("");
int i = 0;
while (lineFileRowTab(lf, row2))
    {
    char *complexId = row2[0];
    char *hprdId = row2[1];
    if (sameString(hprdId,"None"))
        continue;
    if (!sameString(complexId,lastComplex))
        {
        /* A new complex starts: flush the previous one (empty first time). */
        iterateComplex(ids, i, f, lastComplex);
        i = 0;
        freeMem(lastComplex);
        lastComplex = cloneString(complexId);
        }
    if (i >= maxIds)
        errAbort("Too many members (more than %d) in complex %s of %s",
                 maxIds, complexId, hprdComplexes);
    ids[i++] = cloneString(hprdId);
    }
iterateComplex(ids, i, f, lastComplex);
freeMem(lastComplex);
lineFileClose(&lf);
carefulClose(&f);
}
struct hash *loadNewToOldHash(char *oldToNewFile)
/* Read through 4 column file <position> <old> <new> <type> and make hash of
 * old accessions keyed by new accession, only containing elements where new
 * and old are different.  Lines with an empty new accession are skipped.
 * The stored old accessions are private copies. */
{
struct lineFile *lf = lineFileOpen(oldToNewFile, TRUE);
char *row[4];
struct hash *hash = hashNew(16);
while (lineFileRowTab(lf, row))
    {
    char *oldAcc = row[1], *newAcc = row[2];
    if (newAcc[0] != 0 && !sameString(oldAcc, newAcc))
        hashAdd(hash, newAcc, cloneString(oldAcc));
    }
/* Release the reader; the hash holds its own copies of the values. */
lineFileClose(&lf);
return hash;
}
void txCdsEvFromBorf(char *inBorf, char *txFa, char *outTce) /* txCdsEvFromBorf - Convert borfBig format to txCdsEvidence (tce) in an effort * to annotate the coding regions.. */ { struct lineFile *lf = lineFileOpen(inBorf, TRUE); struct hash *txHash = faReadAllIntoHash(txFa, dnaLower); char *row[BORF_NUM_COLS]; FILE *f = mustOpen(outTce, "w"); while (lineFileRowTab(lf, row)) { struct borf b; borfStaticLoad(row, &b); if (b.strand[0] == '+' && b.score >= 50) { struct dnaSeq *txSeq = hashFindVal(txHash, b.name); boolean hasStop = FALSE; if (b.cdsEnd + 3 < txSeq->size) { hasStop = isStopCodon(txSeq->dna + b.cdsEnd); b.cdsEnd += 3; } if (txSeq == NULL) errAbort("%s is in %s but not %s", b.name, inBorf, txFa); int score = (b.score - 45)*5; if (score > 1000) score = 1000; if (score < 0) score = 0; fprintf(f, "%s\t", b.name); fprintf(f, "%d\t", b.cdsStart); fprintf(f, "%d\t", b.cdsEnd); fprintf(f, "%s\t", "bestorf"); fprintf(f, "%s\t", "."); fprintf(f, "%d\t", score); fprintf(f, "%d\t", startsWith("atg", txSeq->dna + b.cdsStart)); fprintf(f, "%d\t", hasStop); fprintf(f, "%d\t", 1); fprintf(f, "%d,\t", b.cdsStart); fprintf(f, "%d,\n", b.cdsEnd - b.cdsStart); } } lineFileClose(&lf); carefulClose(&f); }
struct hash *getFreqHash(char *freqFile) /* Read the frequency file in, and store it in a hash and return that. */ { struct hash *freqHash = newHash(23); struct lineFile *lf = lineFileOpen(freqFile, TRUE); char *words[3]; /* Assume there's a header and skip it. */ lineFileSkip(lf, 1); while (lineFileRowTab(lf, words)) { int val; lineFileNeedFullNum(lf, words, 1); lineFileNeedFullNum(lf, words, 2); val = (int)sqlUnsigned(words[2]); addFreqToHash(freqHash, words[0], words[1], val); } lineFileClose(&lf); hashTraverseEls(freqHash, sortSlPairList); return freqHash; }
void encodeUserDbCrawl(char *input, char *output) /* encodeUserDbCrawl - Mine user DB for ENCODE info.. */ { struct lineFile *lf = lineFileOpen(input, TRUE); FILE *f = mustOpen(output, "w"); char *row[6]; struct hash *varHash = hashNew(0); struct trackVar *tvList = NULL, *tv; int totalCount = 0; int wgEncodeCount = 0; while (lineFileRowTab(lf, row)) { char *contents; contents = row[1]; int useCount; char *lastUse = row[4]; useCount = atoi(row[5]); if (useCount > 1 && startsWith("2011-1", lastUse) && (stringIn("db=hg18", contents) || stringIn("db=hg19", contents))) { boolean anyTrack, isEncode; parseContents(contents, varHash, &tvList, &anyTrack, &isEncode); if (isEncode) wgEncodeCount++; if (anyTrack) ++totalCount; } } slSort(&tvList, trackVarCmp); for (tv = tvList; tv != NULL; tv = tv->next) { fprintf(f, "%s\t%f\t%d\t%d\t%d\t%d\t%d\t%d\n", tv->name, percentOn(tv), tv->full, tv->pack, tv->squish, tv->dense, tv->show, tv->hide); } printf("wgEncode in %d of %d\n", wgEncodeCount, totalCount); carefulClose(&f); }
void txGeneColor(char *uniProtDb, char *infoFile, char *pickFile, char *outFile) /* txGeneColor - Figure out color to draw gene in.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } /* Open uniprot database connection. */ struct sqlConnection *uConn = sqlConnect(uniProtDb); #ifdef OLD /* Figure out our light and medium colors. */ mediumBlue.r = (6*trueBlue.r + 4*255)/10; mediumBlue.g = (6*trueBlue.g + 4*255)/10; mediumBlue.b = (6*trueBlue.b + 4*255)/10; lightBlue.r = (1*trueBlue.r + 2*255)/3; lightBlue.g = (1*trueBlue.g + 2*255)/3; lightBlue.b = (1*trueBlue.b + 2*255)/3; #endif /* OLD */ /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { struct rgbColor *col; pick = hashFindVal(pickHash, info->name); if (pick != NULL) { char *source = pick->source; if (sameString(source, "RefPepValidated")) col = &trueBlue; else if (sameString(source, "ccds")) col = &trueBlue; else if (sameString(source, "RefPepReviewed")) col = &trueBlue; else if (sameString(source, "RefSeqValidated")) col = &trueBlue; else if (sameString(source, "RefSeqReviewed")) col = &trueBlue; else if (sameString(source, "swissProt")) col = &trueBlue; else if (startsWith("Ref", source)) col = &mediumBlue; else col = &lightBlue; if (pick->swissProt[0] != 0) { char *acc = spLookupPrimaryAcc(uConn, pick->swissProt); struct slName *pdbList = spPdbAccs(uConn, acc); if (pdbList != NULL) col = &black; slFreeList(&pdbList); } } else col = &lightBlue; fprintf(f, "%s\t%d\t%d\t%d\n", info->name, col->r, col->g, col->b); } carefulClose(&f); }
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome) /* txGeneCdsMap - Create mapping between CDS region of gene and genome. */ { /* Load info into hash. */ struct hash *infoHash = hashNew(18); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } lineFileClose(&lf); /* Load refPep/tx alignments into hash keyed by tx. */ struct hash *refPslHash = hashNew(18); struct psl *psl, *pslList = pslLoadAll(refPepToTxPsl); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(refPslHash, psl->tName, psl); struct hash *refToPepHash = hashTwoColumnFile(refToPepTab); struct hash *chromSizeHash = hashNameIntFile(chromSizes); /* Load in bed. */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Open output, and stream through bedList, writing output. 
*/ FILE *fCdsToRna = mustOpen(cdsToRna, "w"); FILE *fRnaToGenome = mustOpen(rnaToGenome, "w"); int refTotal = 0, refFound = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { char *chrom = bed->chrom; int chromSize = hashIntVal(chromSizeHash, chrom); info = hashMustFindVal(infoHash, bed->name); pick = hashMustFindVal(pickHash, bed->name); if (info->isRefSeq) { char *refAcc = txAccFromTempName(bed->name); if (!startsWith("NM_", refAcc)) errAbort("Don't think I did find that refSeq acc, got %s", refAcc); char *protAcc = hashMustFindVal(refToPepHash, refAcc); ++refTotal; if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna)) ++refFound; } else { fakeCdsToMrna(bed, fCdsToRna); } fakeRnaToGenome(bed, chromSize, fRnaToGenome); } } verbose(1, "Missed %d of %d refSeq protein mappings. A small number of RefSeqs just map\n" "to genome in the UTR.\n", refTotal - refFound, refTotal); carefulClose(&fCdsToRna); carefulClose(&fRnaToGenome); }
void txGeneFromBed(char *inBed, char *inPicks, char *ucscFa, char *uniProtFa, char *refPepFa, char *outKg) /* txGeneFromBed - Convert from bed to knownGenes format table (genePred + uniProt ID). */ { /* Load protein sequence into hashes */ struct hash *uniProtHash = faReadAllIntoHash(uniProtFa, dnaUpper); struct hash *ucscProtHash = faReadAllIntoHash(ucscFa, dnaUpper); struct hash *refProtHash =faReadAllIntoHash(refPepFa, dnaUpper); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } /* Load in bed */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Do reformatting and write output. */ FILE *f = mustOpen(outKg, "w"); for (bed = bedList; bed != NULL; bed = bed->next) { char *protAcc = NULL; if (bed->thickStart < bed->thickEnd) { pick = hashMustFindVal(pickHash, bed->name); struct dnaSeq *spSeq = NULL, *uniSeq = NULL, *refPep = NULL, *ucscSeq; ucscSeq = hashMustFindVal(ucscProtHash, bed->name); if (pick->swissProt[0]) spSeq = hashMustFindVal(uniProtHash, pick->swissProt); if (pick->uniProt[0]) uniSeq = hashMustFindVal(uniProtHash, pick->uniProt); if (pick->refProt[0]) refPep = hashMustFindVal(refProtHash, pick->refProt); /* First we look for an exact match between the ucsc protein and * something from swissProt/uniProt. 
*/ if (spSeq != NULL && sameString(ucscSeq->dna, spSeq->dna)) protAcc = pick->swissProt; if (protAcc == NULL && uniSeq != NULL && sameString(ucscSeq->dna, uniSeq->dna)) protAcc = pick->uniProt; if (protAcc == NULL && refPep != NULL && sameString(ucscSeq->dna, refPep->dna)) { protAcc = cloneString(pick->refProt); chopSuffix(protAcc); } if (protAcc == NULL) { if (pick->uniProt[0]) protAcc = pick->uniProt; else { protAcc = cloneString(pick->refProt); chopSuffix(protAcc); } } } outputKg(bed, emptyForNull(protAcc), f); } carefulClose(&f); }
void writeTab(struct hash *stageHash, struct hash *seqHash, char *sourceImageDir, char *parsedTab, struct hash *nameHash, char *outName) /* Synthasize data and write out tab-separated file with one line for * each image. */ { char sourceImage[PATH_LEN]; FILE *f = mustOpen(outName, "w"); struct lineFile *lf = lineFileOpen(parsedTab, TRUE); char *row[6]; /* Write header. */ fprintf(f, "#"); fprintf(f, "gene\t"); fprintf(f, "submitId\t"); fprintf(f, "fileName\t"); fprintf(f, "imageWidth\t"); fprintf(f, "imageHeight\t"); fprintf(f, "bodyPart\t"); fprintf(f, "age\t"); fprintf(f, "minAge\t"); fprintf(f, "maxAge\t"); fprintf(f, "seq\n"); while (lineFileRowTab(lf, row)) { char *clone = row[0]; char *stage = row[1]; char *part = row[2]; char *dir = row[3]; char *subdir = row[4]; char *file = row[5]; char *gene = hashFindVal(nameHash, clone); struct dnaSeq *seq = hashFindVal(seqHash, clone); int width, height; safef(sourceImage, sizeof(sourceImage), "%s/%s/%s/%s", sourceImageDir, dir, subdir, file); jpegSize(sourceImage, &width, &height); if (gene == NULL) gene = clone; fprintf(f, "%s\t", gene); fprintf(f, "%s\t", clone); fprintf(f, "%s/%s\t", subdir, file); fprintf(f, "%d\t", width); fprintf(f, "%d\t", height); fprintf(f, "%s\t", part); if (sameString(stage, "mixed")) fprintf(f, "1\t0\t3\t"); else { char *age = hashMustFindVal(stageHash, stage); fprintf(f, "%s\t", age); fprintf(f, "%s\t", age); fprintf(f, "%s\t", age); } if (seq != NULL) fprintf(f, "%s\n", seq->dna); else fprintf(f, "\n"); } carefulClose(&f); }
void writeTab( struct hash *imageHash, struct hash *seqHash, char *sourceImageDir, char *allenTab, struct hash *nameHash, char *outName) /* Synthesize data and write out tab-separated file with one line for * each image. */ { char sourceImage[PATH_LEN]; FILE *f = mustOpen(outName, "w"); struct lineFile *lf = lineFileOpen(allenTab, TRUE); char *row[5]; /* Write header. */ fprintf(f, "#"); fprintf(f, "gene\t"); fprintf(f, "refSeq\t"); fprintf(f, "locusLink\t"); fprintf(f, "submitId\t"); /* egeneid=68323 or genesym=1110003F05Rik */ fprintf(f, "fileName\t"); fprintf(f, "imageWidth\t"); fprintf(f, "imageHeight\t"); fprintf(f, "probeId\t"); /* actually, this not supported yet but would be great. */ fprintf(f, "seq\n"); while (lineFileRowTab(lf, row)) { char *gene = row[0]; /* char *geneName = row[1]; */ char *entrez = row[2]; char *refSeq = row[3]; char *url = row[4]; char *probeId = hashFindVal(nameHash, refSeq); struct dnaSeq *seq = NULL; int width=0, height=0; char *relPath = hashFindVal(imageHash, gene); char *submitId = strchr(url,'='); if (probeId) seq = hashFindVal(seqHash, probeId); if (submitId) ++submitId; /* we want the string following first '=' */ if (sameString(entrez,"0")) entrez = NULL; if (relPath) { safef(sourceImage, sizeof(sourceImage), "%s/%s", sourceImageDir, relPath); jpegSize(sourceImage, &width, &height); fprintf(f, "%s\t", gene); fprintf(f, "%s\t", refSeq); fprintf(f, "%s\t", entrez?entrez:""); fprintf(f, "%s\t", submitId); fprintf(f, "%s\t", relPath); fprintf(f, "%d\t", width); fprintf(f, "%d\t", height); fprintf(f, "%s\t", probeId?probeId:""); if (seq != NULL) fprintf(f, "%s\n", seq->dna); else fprintf(f, "\n"); } } lineFileClose(&lf); carefulClose(&f); }
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, char *evFile, char *outFile) /* txGeneXref - Make kgXref type table for genes.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct hash *geneToProtHash = makeGeneToProtHash(genePredFile); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); removePickVersions(pick); hashAdd(pickHash, pick->name, pick); } /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { char *kgID = info->name; char *mRNA = ""; char *spID = ""; char *spDisplayID = ""; char *geneSymbol = NULL; char *refseq = ""; char *protAcc = ""; char *description = NULL; char query[256]; char *proteinId = hashMustFindVal(geneToProtHash, info->name); boolean isAb = sameString(info->category, "antibodyParts"); pick = hashFindVal(pickHash, info->name); ev = hashFindVal(evHash, info->name); if (pick != NULL) { /* Fill in the relatively straightforward fields. */ refseq = pick->refSeq; if (info->orfSize > 0) { protAcc = pick->refProt; spID = proteinId; if (sameString(protAcc, spID)) spID = pick->uniProt; if (spID[0] != 0) spDisplayID = spAnyAccToId(uConn, spID); } /* Fill in gene symbol and description from refseq if possible. 
*/ if (refseq[0] != 0) { struct sqlResult *sr; safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'", refseq); sr = sqlGetResult(gConn, query); char **row = sqlNextRow(sr); if (row != NULL) { geneSymbol = cloneString(row[0]); if (!sameWord("unknown protein", row[1])) description = cloneString(row[1]); } sqlFreeResult(&sr); } /* If need be try uniProt for gene symbol and description. */ if (spID[0] != 0 && (geneSymbol == NULL || description == NULL)) { char *acc = spLookupPrimaryAcc(uConn, spID); if (description == NULL) description = spDescription(uConn, acc); if (geneSymbol == NULL) { struct slName *nameList = spGenes(uConn, acc); if (nameList != NULL) geneSymbol = cloneString(nameList->name); slFreeList(&nameList); } } } /* If it's an antibody fragment use that as name. */ if (isAb) { geneSymbol = cloneString("abParts"); description = cloneString("Parts of antibodies, mostly variable regions."); isAb = TRUE; } if (ev == NULL) { mRNA = cloneString(""); if (!isAb) { errAbort("%s is %s but not %s\n", info->name, infoFile, evFile); } } else { mRNA = cloneString(ev->primary); chopSuffix(mRNA); } /* Still no joy? Try genbank RNA records. 
*/ if (geneSymbol == NULL || description == NULL) { if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { char *acc = ev->accs[i]; chopSuffix(acc); if (geneSymbol == NULL) { safef(query, sizeof(query), "select geneName.name from gbCdnaInfo,geneName " "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc); geneSymbol = sqlQuickString(gConn, query); if (geneSymbol != NULL) { if (sameString(geneSymbol, "n/a")) geneSymbol = NULL; } } if (description == NULL) { safef(query, sizeof(query), "select description.name from gbCdnaInfo,description " "where description.id=gbCdnaInfo.description " "and gbCdnaInfo.acc = '%s'", acc); description = sqlQuickString(gConn, query); if (description != NULL) { if (sameString(description, "n/a")) description = NULL; } } } } } if (geneSymbol == NULL) geneSymbol = mRNA; if (description == NULL) description = mRNA; /* Get rid of some characters that will cause havoc downstream. */ stripChar(geneSymbol, '\''); subChar(geneSymbol, '<', '['); subChar(geneSymbol, '>', ']'); /* Abbreviate geneSymbol if too long */ if (strlen(geneSymbol) > 40) strcpy(geneSymbol+37, "..."); fprintf(f, "%s\t", kgID); fprintf(f, "%s\t", mRNA); fprintf(f, "%s\t", spID); fprintf(f, "%s\t", spDisplayID); fprintf(f, "%s\t", geneSymbol); fprintf(f, "%s\t", refseq); fprintf(f, "%s\t", protAcc); fprintf(f, "%s\n", description); } carefulClose(&f); }
void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile) /* txGeneAlias - Make kgAlias and kgProtAlias tables.. */ { /* Read and hash oldToNew */ struct hash *newToOldHash = loadNewToOldHash(oldToNew); /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); struct sqlResult *sr; char **row; char query[256]; /* Open files. */ struct lineFile *lf = lineFileOpen(xrefFile, TRUE); FILE *fAlias = mustOpen(aliasFile, "w"); FILE *fProt = mustOpen(protAliasFile, "w"); /* Stream through xref file, which has much of the info we need, * and which contains a line for each gene. */ char *words[KGXREF_NUM_COLS]; while (lineFileRowTab(lf, words)) { /* Load the xref, and output most of it's fields as aliases. */ struct kgXref *x = kgXrefLoad(words); char *id = x->kgID; outAlias(fAlias, id, x->kgID); outAlias(fAlias, id, x->mRNA); outAlias(fAlias, id, x->spID); outAlias(fAlias, id, x->spDisplayID); outAlias(fAlias, id, x->geneSymbol); outAlias(fAlias, id, x->refseq); outAlias(fAlias, id, x->protAcc); char *old = hashFindVal(newToOldHash, id); if (old != NULL) outAlias(fAlias, id, old); /* If we've got a uniProt ID, use that to get more info from uniProt. */ char *acc = x->spID; if ((acc[0] != 0) && (acc = spLookupPrimaryAccMaybe(uConn, acc)) != NULL) { /* Get current accession and output a bunch of easy protein aliases. */ outProt(fProt, id, acc, acc); outProt(fProt, id, acc, x->spDisplayID); outProt(fProt, id, acc, x->geneSymbol); outProt(fProt, id, acc, x->protAcc); if (old != NULL) outProt(fProt, id, acc, old); /* Throw in old swissProt accessions. 
*/ sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc); sr = sqlGetResult(uConn, query); while ((row = sqlNextRow(sr)) != NULL) { outAlias(fAlias, id, row[0]); outProt(fProt, id, acc, row[0]); } /* Throw in gene names that SwissProt knows about */ struct slName *gene, *geneList = spGenes(uConn, acc); for (gene = geneList; gene != NULL; gene = gene->next) { outAlias(fAlias, id, gene->name); outProt(fProt, id, acc, gene->name); } slFreeList(&geneList); } /* Throw in gene names from genbank. */ /* At some point we may want to restrict this to the primary transcript in a cluster. */ ev = hashFindVal(evHash, id); if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", acc); int nameId = sqlQuickNum(gConn, query); if (nameId != 0) { char name[64]; sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId); if (sqlQuickQuery(gConn, query, name, sizeof(name))) outAlias(fAlias, id, name); } } } kgXrefFree(&x); } carefulClose(&fAlias); carefulClose(&fProt); }
void edwFixReplaced(char *database, char *inTab, char *spikedTab, char *outSql, char *outRa) /* edwFixReplaced - Clean up files that were replaced in ENCODE2. */ { struct sqlConnection *conn = edwConnect(); struct lineFile *lf = lineFileOpen(inTab, TRUE); FILE *fSql = mustOpen(outSql, "w"); FILE *fRa = mustOpen(outRa, "w"); char *row[2]; struct hash *renameHash = rootRenameHash(); struct hash *spikedHash = hashTwoColumnFile(spikedTab); int depCount = 0, repCount = 0; while (lineFileRowTab(lf, row)) { /* Get fields in local variables. */ char *oldFileName = row[0]; char *objStatus = row[1]; /* Do spikein rename lookup. */ char *spiked = hashFindVal(spikedHash, oldFileName); if (spiked != NULL) { verbose(2, "renaming spikeing %s to %s\n", oldFileName, spiked); oldFileName = spiked; } /* Get rid of bai name for bam,bai pairs. */ char *comma = strchr(oldFileName, ','); if (comma != NULL) { if (!endsWith(comma, ".bai")) errAbort("Unexpected conjoining of files line %d of %s", lf->lineIx, lf->fileName); *comma = 0; } /* For .fastq.tgz files we got to unpack them. */ if (endsWith(oldFileName, ".fastq.tgz")) { /* Get root name - name minus suffix */ char *oldRoot = cloneString(oldFileName); chopSuffix(oldRoot); chopSuffix(oldRoot); verbose(2, "Processing fastq.tgz %s %s\n", oldFileName, oldRoot); // Find records for old version. char query[512]; sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%s.fastq.tgz.dir/%%'" " order by submitFileName", database, oldRoot); struct edwFile *oldList = edwFileLoadByQuery(conn, query); int oldCount = slCount(oldList); if (oldCount == 0) errAbort("No records match %s", query); // Find record for replaced version. 
// Fortunately all of the fastq.tgz's are just V2, which simplifies code a bit sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%sV2.fastq.tgz.dir/%%'" " order by submitFileName", database, oldRoot); struct edwFile *newList = edwFileLoadByQuery(conn, query); int newCount = slCount(newList); if (newCount == 0) errAbort("No records match %s", query); // Make a hash of new records keyed by new file name inside of tgz struct edwFile *newEf; struct hash *newHash = hashNew(0); for (newEf = newList; newEf != NULL; newEf = newEf->next) { char fileName[FILENAME_LEN]; splitPath(newEf->submitFileName, NULL, fileName, NULL); hashAdd(newHash, fileName, newEf); verbose(2, " %s\n", fileName); } verbose(2, "%d in oldList, %d in newList\n", oldCount, newCount); // Loop through old records trying to find corresponding new record struct edwFile *oldEf; for (oldEf = oldList; oldEf != NULL; oldEf = oldEf->next) { char fileName[FILENAME_LEN]; splitPath(oldEf->submitFileName, NULL, fileName, NULL); struct edwFile *newEf = hashFindVal(newHash, fileName); char *newName = "n/a"; fprintf(fSql, "update edwFile set deprecated='%s' where id=%u;\n", objStatus, oldEf->id); ++depCount; if (newEf != NULL) { fprintf(fSql, "update edwFile set replacedBy=%u where id=%u;\n", newEf->id, oldEf->id); newName = newEf->submitFileName; ++repCount; } fprintf(fRa, "objStatus %s\n", objStatus); fprintf(fRa, "oldFile %s\n", oldEf->submitFileName); fprintf(fRa, "newFile %s\n", newName); fprintf(fRa, "\n"); verbose(2, "%s -> %s\n", oldEf->submitFileName, newName); } } else { /* Figure out new file name by either adding V2 at end, or if there is already a V#, * replacing it. */ #ifdef SOON #endif /* SOON */ int oldVersion = 1; char *noVersion = NULL; { /* Split old file name into root and suffix. 
*/ char *suffix = edwFindDoubleFileSuffix(oldFileName); if (suffix == NULL) errAbort("No suffix in %s line %d of %s", oldFileName, lf->lineIx, lf->fileName); char *oldRoot = cloneStringZ(oldFileName, suffix - oldFileName); char *renamed = hashFindVal(renameHash, oldRoot); if (renamed != NULL) { verbose(2, "Overriding %s with %s\n", oldRoot, renamed); oldRoot = cloneString(renamed); } /* Look for V# at end of old root, and if it's there chop it off and update oldVersion */ noVersion = oldRoot; // If no V, we done. */ char *vPos = strrchr(oldRoot, 'V'); if (vPos != NULL) { char *numPos = vPos + 1; int numSize = strlen(numPos); if (numSize == 1 || numSize == 2) { if (isAllDigits(numPos)) { oldVersion = atoi(numPos); *vPos = 0; } else errAbort("Expecting numbers after V in file name got %s line %d of %s", numPos, lf->lineIx, lf->fileName); } } verbose(2, "%s parses to %s %d %s\n", oldFileName, noVersion, oldVersion, suffix); /* Find record for old file. */ char query[512]; sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%s'", database, oldFileName); struct edwFile *oldEf = edwFileLoadByQuery(conn, query); if (slCount(oldEf) != 1) errAbort("Expecting one result got %d for %s\n", slCount(oldEf), query); fprintf(fSql, "# %s %s\n", oldFileName, objStatus); verbose(2, "%s: %s\n", oldFileName, objStatus); /* Find record for new file. 
*/ struct edwFile *newEf = NULL; int newVersion; for (newVersion = oldVersion+1; newVersion < 7; ++newVersion) { sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%sV%d%s'", database, noVersion, newVersion, suffix); newEf = edwFileLoadByQuery(conn, query); if (newEf != NULL) break; } if (newEf == NULL) verbose(2, "Could not find next version of %s (%s)", oldFileName, oldRoot); if (slCount(newEf) > 1) errAbort("Expecting one result got %d for %s\n", slCount(newEf), query); long long oldId = oldEf->id; fprintf(fSql, "update edwFile set deprecated='%s' where id=%lld;\n", objStatus, oldId); ++depCount; char *newName = "n/a"; if (newEf != NULL) { long long newId = newEf->id; fprintf(fSql, "update edwFile set replacedBy=%lld where id=%lld;\n", newId, oldId); newName = newEf->submitFileName; ++repCount; } fprintf(fRa, "objStatus %s\n", objStatus); fprintf(fRa, "oldFile %s\n", oldEf->submitFileName); fprintf(fRa, "newFile %s\n", newName); fprintf(fRa, "\n"); verbose(2, "%s -> %s\n", oldEf->submitFileName, newName); } } } verbose(1, "%d deprecated, %d replaced\n", depCount, repCount); carefulClose(&fSql); carefulClose(&fRa); }