void subColumn(char *asciiColumn, char *inFile, char *subFile, char *outFile) /* subColumn - Substitute one column in a tab-separated file.. */ { struct hash *subHash = hashTwoColumnFile(subFile); int column = atoi(asciiColumn); if (column == 0) usage(); else column -= 1; char *row[1024*4]; struct lineFile *lf = lineFileOpen(inFile, TRUE); FILE *f = mustOpen(outFile, "w"); int rowCount; while ((rowCount = lineFileChopNextTab(lf, row, ArraySize(row))) > 0) { if (rowCount == ArraySize(row)) errAbort("Too many columns (%d) line %d of %s.", rowCount, lf->lineIx, lf->fileName); if (column >= rowCount) errAbort("Not enough columns (%d) line %d of %s.", rowCount, lf->lineIx, lf->fileName); int i; for (i=0; i<rowCount; ++i) { char *s = row[i]; if (i == column) { if (isList) { s = subCommaList(subHash, s); } else { char *sub = hashFindVal(subHash, s); if (sub == NULL) { if (fMiss) { fprintf(fMiss, "%s\n", s); ++missCount; } else errAbort("%s not in %s line %d of %s", s, subFile, lf->lineIx, lf->fileName); } else s = sub; } } fputs(s, f); if (i == rowCount-1) fputc('\n', f); else fputc('\t', f); } } carefulClose(&f); }
void regClusterBedExpCfg(char *input, char *output)
/* regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.
 * The globals encodeList and tabList select which of the three makeConfigFrom*
 * routines interprets input; encodeList takes precedence over tabList. */
{
/* Optional cell letter lookup, loaded before any config is produced. */
if (cellLetter)
    cellLetterHash = hashTwoColumnFile(cellLetter);

if (encodeList)
    {
    makeConfigFromEncodeList(input, output);
    return;
    }
if (tabList)
    {
    makeConfigFromTabList(input, output, useTarget);
    return;
    }
makeConfigFromFileList(input, output);
}
void kgAttachKegg(char *database, char *locusLinkToPathway, char *knownToKegg)
/* kgAttachKegg - Attach UCSC genes to KEGG pathways via locusLink IDs.
 *    database           - database holding knownToRefSeq, refLink and kgTxInfo.
 *    locusLinkToPathway - file loaded with hashTwoColumnFile(), looked up by
 *                         locusLink ID to get a KEGG pathway ID.
 *    knownToKegg        - output, one "ucsc <tab> locusLink <tab> kegg" line
 *                         for each gene that resolves through all three lookups. */
{
char **row;

/* LocusLink ID -> KEGG pathway ID, from the input file. */
struct hash *keggForLl = hashTwoColumnFile(locusLinkToPathway);
verbose(1, "Got %d items in %s\n", keggForLl->elCount, locusLinkToPathway);

/* First column of knownToRefSeq -> second column.  Presumably that is
 * UCSC known gene ID -> refSeq accession; the select * relies on the
 * table's column order. */
struct sqlConnection *conn = sqlConnect(database);
struct hash *refSeqForUcsc = hashNew(16);
struct sqlResult *sr = sqlGetResult(conn, "select * from knownToRefSeq");
while ((row = sqlNextRow(sr)) != NULL)
    hashAdd(refSeqForUcsc, row[0], cloneString(row[1]));
verbose(1, "Got %d items in %s.knownToRefSeq\n", refSeqForUcsc->elCount, database);
sqlFreeResult(&sr);

/* refLink.mrnaAcc -> refLink.locusLinkId. */
struct hash *llForRefSeq = hashNew(16);
sr = sqlGetResult(conn, "select mrnaAcc,locusLinkId from refLink");
while ((row = sqlNextRow(sr)) != NULL)
    hashAdd(llForRefSeq, row[0], cloneString(row[1]));
sqlFreeResult(&sr);
verbose(1, "Got %d items in %s.refLink\n", llForRefSeq->elCount, database);

/* Walk the kgTxInfo rows flagged isRefSeq=1 and chain the three lookups,
 * skipping a gene as soon as one link is missing. */
sr = sqlGetResult(conn, "select name from kgTxInfo where isRefSeq=1");
FILE *out = mustOpen(knownToKegg, "w");
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *ucscId = row[0];
    char *refSeqAcc = hashFindVal(refSeqForUcsc, ucscId);
    if (refSeqAcc == NULL)
        continue;
    char *llId = hashFindVal(llForRefSeq, refSeqAcc);
    if (llId == NULL)
        continue;
    char *keggId = hashFindVal(keggForLl, llId);
    if (keggId != NULL)
        fprintf(out, "%s\t%s\t%s\n", ucscId, llId, keggId);
    }
sqlFreeResult(&sr);
carefulClose(&out);
sqlDisconnect(&conn);
}
struct hash *loadGeneToModule(struct sqlConnection *conn, char *fileName, char *table)
/* Remake table with gene and module columns, load fileName into it as a tab
 * file, and return the same file read into a hash by hashTwoColumnFile(). */
{
/* NOTE(review): createSql is never freed in this routine. */
struct dyString *createSql = dyStringNew(512);
dyStringPrintf(createSql,
    "CREATE TABLE %s (\n"
    " gene varchar(255) not null,\n"
    " module int not null,\n"
    " #Indices\n"
    " PRIMARY KEY(gene(12)),\n"
    " INDEX(module)\n"
    ")\n",
    table);

/* Recreate the table, then fill it from the file. */
sqlRemakeTable(conn, table, createSql->string);
sqlLoadTabFile(conn, fileName, table, 0);
verbose(1, "Loaded %s table\n", table);

/* The in-memory lookup is built from the file, not from the table. */
struct hash *geneToModule = hashTwoColumnFile(fileName);
return geneToModule;
}
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome) /* txGeneCdsMap - Create mapping between CDS region of gene and genome. */ { /* Load info into hash. */ struct hash *infoHash = hashNew(18); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } lineFileClose(&lf); /* Load refPep/tx alignments into hash keyed by tx. */ struct hash *refPslHash = hashNew(18); struct psl *psl, *pslList = pslLoadAll(refPepToTxPsl); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(refPslHash, psl->tName, psl); struct hash *refToPepHash = hashTwoColumnFile(refToPepTab); struct hash *chromSizeHash = hashNameIntFile(chromSizes); /* Load in bed. */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Open output, and stream through bedList, writing output. 
*/ FILE *fCdsToRna = mustOpen(cdsToRna, "w"); FILE *fRnaToGenome = mustOpen(rnaToGenome, "w"); int refTotal = 0, refFound = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { char *chrom = bed->chrom; int chromSize = hashIntVal(chromSizeHash, chrom); info = hashMustFindVal(infoHash, bed->name); pick = hashMustFindVal(pickHash, bed->name); if (info->isRefSeq) { char *refAcc = txAccFromTempName(bed->name); if (!startsWith("NM_", refAcc)) errAbort("Don't think I did find that refSeq acc, got %s", refAcc); char *protAcc = hashMustFindVal(refToPepHash, refAcc); ++refTotal; if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna)) ++refFound; } else { fakeCdsToMrna(bed, fCdsToRna); } fakeRnaToGenome(bed, chromSize, fRnaToGenome); } } verbose(1, "Missed %d of %d refSeq protein mappings. A small number of RefSeqs just map\n" "to genome in the UTR.\n", refTotal - refFound, refTotal); carefulClose(&fCdsToRna); carefulClose(&fRnaToGenome); }
void edwFixReplaced(char *database, char *inTab, char *spikedTab, char *outSql, char *outRa) /* edwFixReplaced - Clean up files that were replaced in ENCODE2. */ { struct sqlConnection *conn = edwConnect(); struct lineFile *lf = lineFileOpen(inTab, TRUE); FILE *fSql = mustOpen(outSql, "w"); FILE *fRa = mustOpen(outRa, "w"); char *row[2]; struct hash *renameHash = rootRenameHash(); struct hash *spikedHash = hashTwoColumnFile(spikedTab); int depCount = 0, repCount = 0; while (lineFileRowTab(lf, row)) { /* Get fields in local variables. */ char *oldFileName = row[0]; char *objStatus = row[1]; /* Do spikein rename lookup. */ char *spiked = hashFindVal(spikedHash, oldFileName); if (spiked != NULL) { verbose(2, "renaming spikeing %s to %s\n", oldFileName, spiked); oldFileName = spiked; } /* Get rid of bai name for bam,bai pairs. */ char *comma = strchr(oldFileName, ','); if (comma != NULL) { if (!endsWith(comma, ".bai")) errAbort("Unexpected conjoining of files line %d of %s", lf->lineIx, lf->fileName); *comma = 0; } /* For .fastq.tgz files we got to unpack them. */ if (endsWith(oldFileName, ".fastq.tgz")) { /* Get root name - name minus suffix */ char *oldRoot = cloneString(oldFileName); chopSuffix(oldRoot); chopSuffix(oldRoot); verbose(2, "Processing fastq.tgz %s %s\n", oldFileName, oldRoot); // Find records for old version. char query[512]; sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%s.fastq.tgz.dir/%%'" " order by submitFileName", database, oldRoot); struct edwFile *oldList = edwFileLoadByQuery(conn, query); int oldCount = slCount(oldList); if (oldCount == 0) errAbort("No records match %s", query); // Find record for replaced version. 
// Fortunately all of the fastq.tgz's are just V2, which simplifies code a bit sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%sV2.fastq.tgz.dir/%%'" " order by submitFileName", database, oldRoot); struct edwFile *newList = edwFileLoadByQuery(conn, query); int newCount = slCount(newList); if (newCount == 0) errAbort("No records match %s", query); // Make a hash of new records keyed by new file name inside of tgz struct edwFile *newEf; struct hash *newHash = hashNew(0); for (newEf = newList; newEf != NULL; newEf = newEf->next) { char fileName[FILENAME_LEN]; splitPath(newEf->submitFileName, NULL, fileName, NULL); hashAdd(newHash, fileName, newEf); verbose(2, " %s\n", fileName); } verbose(2, "%d in oldList, %d in newList\n", oldCount, newCount); // Loop through old records trying to find corresponding new record struct edwFile *oldEf; for (oldEf = oldList; oldEf != NULL; oldEf = oldEf->next) { char fileName[FILENAME_LEN]; splitPath(oldEf->submitFileName, NULL, fileName, NULL); struct edwFile *newEf = hashFindVal(newHash, fileName); char *newName = "n/a"; fprintf(fSql, "update edwFile set deprecated='%s' where id=%u;\n", objStatus, oldEf->id); ++depCount; if (newEf != NULL) { fprintf(fSql, "update edwFile set replacedBy=%u where id=%u;\n", newEf->id, oldEf->id); newName = newEf->submitFileName; ++repCount; } fprintf(fRa, "objStatus %s\n", objStatus); fprintf(fRa, "oldFile %s\n", oldEf->submitFileName); fprintf(fRa, "newFile %s\n", newName); fprintf(fRa, "\n"); verbose(2, "%s -> %s\n", oldEf->submitFileName, newName); } } else { /* Figure out new file name by either adding V2 at end, or if there is already a V#, * replacing it. */ #ifdef SOON #endif /* SOON */ int oldVersion = 1; char *noVersion = NULL; { /* Split old file name into root and suffix. 
*/ char *suffix = edwFindDoubleFileSuffix(oldFileName); if (suffix == NULL) errAbort("No suffix in %s line %d of %s", oldFileName, lf->lineIx, lf->fileName); char *oldRoot = cloneStringZ(oldFileName, suffix - oldFileName); char *renamed = hashFindVal(renameHash, oldRoot); if (renamed != NULL) { verbose(2, "Overriding %s with %s\n", oldRoot, renamed); oldRoot = cloneString(renamed); } /* Look for V# at end of old root, and if it's there chop it off and update oldVersion */ noVersion = oldRoot; // If no V, we done. */ char *vPos = strrchr(oldRoot, 'V'); if (vPos != NULL) { char *numPos = vPos + 1; int numSize = strlen(numPos); if (numSize == 1 || numSize == 2) { if (isAllDigits(numPos)) { oldVersion = atoi(numPos); *vPos = 0; } else errAbort("Expecting numbers after V in file name got %s line %d of %s", numPos, lf->lineIx, lf->fileName); } } verbose(2, "%s parses to %s %d %s\n", oldFileName, noVersion, oldVersion, suffix); /* Find record for old file. */ char query[512]; sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%s'", database, oldFileName); struct edwFile *oldEf = edwFileLoadByQuery(conn, query); if (slCount(oldEf) != 1) errAbort("Expecting one result got %d for %s\n", slCount(oldEf), query); fprintf(fSql, "# %s %s\n", oldFileName, objStatus); verbose(2, "%s: %s\n", oldFileName, objStatus); /* Find record for new file. 
*/ struct edwFile *newEf = NULL; int newVersion; for (newVersion = oldVersion+1; newVersion < 7; ++newVersion) { sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%sV%d%s'", database, noVersion, newVersion, suffix); newEf = edwFileLoadByQuery(conn, query); if (newEf != NULL) break; } if (newEf == NULL) verbose(2, "Could not find next version of %s (%s)", oldFileName, oldRoot); if (slCount(newEf) > 1) errAbort("Expecting one result got %d for %s\n", slCount(newEf), query); long long oldId = oldEf->id; fprintf(fSql, "update edwFile set deprecated='%s' where id=%lld;\n", objStatus, oldId); ++depCount; char *newName = "n/a"; if (newEf != NULL) { long long newId = newEf->id; fprintf(fSql, "update edwFile set replacedBy=%lld where id=%lld;\n", newId, oldId); newName = newEf->submitFileName; ++repCount; } fprintf(fRa, "objStatus %s\n", objStatus); fprintf(fRa, "oldFile %s\n", oldEf->submitFileName); fprintf(fRa, "newFile %s\n", newName); fprintf(fRa, "\n"); verbose(2, "%s -> %s\n", oldEf->submitFileName, newName); } } } verbose(1, "%d deprecated, %d replaced\n", depCount, repCount); carefulClose(&fSql); carefulClose(&fRa); }