void loadDatabase()
/* Load ContigLocusIdCondense.tab (opened relative to the current directory)
 * into the ContigLocusIdCondense table. */
{
char *table = "ContigLocusIdCondense";
struct sqlConnection *dbConn = hAllocConn();
FILE *tabFh = mustOpen("ContigLocusIdCondense.tab", "r");

/* The open handle is handed over by address.
 * NOTE(review): presumably hgLoadTabFile closes it - confirm. */
hgLoadTabFile(dbConn, ".", table, &tabFh);
hFreeConn(&dbConn);
}
void fakeCloneOldTable(struct sqlConnection *oldConn, struct sqlConnection *newConn, char *table) /* Clone cart table in newConn from oldConn. Add fake prefix to * contents field to help mark it as fake. */ { char query[256]; sqlSafef(query, sizeof(query), "select * from %s", table); struct sqlResult *sr = sqlGetResult(oldConn, query); char **row; FILE *f = hgCreateTabFile(NULL, table); while ((row = sqlNextRow(sr)) != NULL) { int i; for (i=0; i<cartNumFields; ++i) { if (i != 0) fprintf(f, "\t"); if (i == 1) fprintf(f, "%s", fakePrefix); fprintf(f, "%s", row[i]); } fprintf(f, "\n"); } hgLoadTabFile(newConn, NULL, table, &f); hgUnlinkTabFile(NULL, table); }
void makeTableDescriptions(char *database, char *asFile) /* makeTableDescriptions - Add table descriptions to database.. */ { struct sqlConnection *conn = sqlConnect(database); struct lineFile *lf = lineFileOpen(asFile, TRUE); FILE *f = hgCreateTabFile(".", "tableDescriptions"); /* Open a tab file with name corresponding to tableName in tmpDir. */ char *line; /* struct asObject *asList = */ asParseFile(asFile); /* Just to check syntax */ if (sqlTableExists(conn, "chromInfo")) errAbort("%s looks like a genome database, has chromInfo, aborting", database); sqlRemakeTable(conn, "tableDescriptions", "NOSQLINJ CREATE TABLE tableDescriptions (\n" " tableName varchar(255) not null,\n" " autoSqlDef longblob not null,\n" " gbdAnchor varchar(255) not null,\n" " PRIMARY KEY(tableName(32))\n" ")" ); while (lineFileNextReal(lf, &line)) { if (startsWith("table", line)) { struct dyString *as = dyStringNew(0); char *name = trimSpaces(line + 6); /* Skip over table. */ char *escaped = NULL; fprintf(f, "%s\t", name); /* Putting lines into as. */ for (;;) { char *s; dyStringAppend(as, line); dyStringAppendC(as, '\n'); s = skipLeadingSpaces(line); if (s[0] == ')') break; if (!lineFileNext(lf, &line, NULL)) errAbort("Unexpected end of file, missing closing paren in %s", lf->fileName); } escaped = needMem(2*as->stringSize+1); fprintf(f, "%s\t", sqlEscapeTabFileString2(escaped, as->string)); fprintf(f, "\n"); freez(&escaped); dyStringFree(&as); } else errAbort("Expecting table line %d of %s", lf->lineIx, lf->fileName); } hgLoadTabFile(conn, ".", "tableDescriptions", &f); }
void hgCeOrfToGene(char *database, char *geneNames, char *geneTable, char *table) /* hgCeOrfToGene - Make orfToGene table for C.elegans from * GENE_DUMPS/gene_names.txt. */ { struct lineFile *lf = lineFileOpen(geneNames, TRUE); struct sqlConnection *conn; struct sqlResult *sr; char query[256]; char **row; char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, table); char *words[4]; struct hash *orfHash = newHash(17); /* Make hash to look up gene names. */ while (lineFileNextRowTab(lf, words, ArraySize(words))) { char *gene = words[0]; char *orfs = words[3]; char *type = words[2]; char *orf[128]; int i, orfCount; if (sameString(type, "Gene")) { orfCount = chopString(orfs, ",", orf, ArraySize(orf)); if (orfCount >= ArraySize(orf)) errAbort("Too many ORFs line %d of %s", lf->lineIx, lf->fileName); for (i=0; i<orfCount; ++i) hashAdd(orfHash, orf[i], cloneString(gene)); } } lineFileClose(&lf); /* For each orf in gene table write out gene name if possible, * otherwise orf name. */ conn = sqlConnect(database); safef(query, sizeof(query), "select name from %s", geneTable); sr = sqlGetResult(conn,query); while ((row = sqlNextRow(sr)) != NULL) { char *orf = row[0]; char *gene = hashFindVal(orfHash, orf); if (gene == NULL) gene = orf; fprintf(f, "%s\t%s\n", orf, gene); } sqlFreeResult(&sr); createTable(conn, table, unique); hgLoadTabFile(conn, tempDir, table, &f); }
void saveDataTable(struct expData *data)
/* Write every record of the data list to a tab file in the current
 * directory, create the expression table, load the tab file into it and
 * then remove the tab file.  Table and database names come from the
 * file-level table and database variables. */
{
FILE *tabFh = hgCreateTabFile(".", table);
struct sqlConnection *dbConn = sqlConnect(database);
struct expData *rec = data;

expDataCreateTable(dbConn, table);
while (rec != NULL)
    {
    expDataTabOut(rec, tabFh);
    rec = rec->next;
    }
hgLoadTabFile(dbConn, ".", table, &tabFh);
hgRemoveTabFile(".", table);
sqlDisconnect(&dbConn);
}
static void loadTable(struct bed4 *beds, char *db, char *parTable) /* create and load table */ { struct sqlConnection *conn = sqlConnect(db); char sqlCmd[256]; sqlSafef(sqlCmd, sizeof(sqlCmd), createSql, parTable); sqlRemakeTable(conn, parTable, sqlCmd); FILE *tabFh = hgCreateTabFile(NULL, parTable); writeBeds(beds, tabFh); hgLoadTabFile(conn, NULL, parTable, &tabFh); hgUnlinkTabFile(NULL, parTable); sqlDisconnect(&conn); }
void ldGencodeIntron(char *database, char *table, int gtfCount, char *gtfNames[]) /* Load Gencode intron status table from GTF files with * intron_id and intron_status keywords */ { struct gffFile *gff, *gffList = NULL; struct gffLine *gffLine; struct gencodeIntron *intron, *intronList = NULL; struct sqlConnection *conn; FILE *f; int i; int introns = 0; for (i=0; i<gtfCount; i++) { verbose(1, "Reading %s\n", gtfNames[i]); gff = gffRead(gtfNames[i]); for (gffLine = gff->lineList; gffLine != NULL; gffLine = gffLine->next) { if (sameWord(gffLine->feature, "intron")) { AllocVar(intron); intron->chrom = gffLine->seq; intron->chromStart = gffLine->start; intron->chromEnd = gffLine->end; intron->name = gffLine->intronId; intron->strand[0] = gffLine->strand; intron->strand[1] = 0; intron->status = gffLine->intronStatus; intron->transcript = gffLine->group; intron->geneId = gffLine->geneId; slAddHead(&intronList, intron); verbose(2, "%s %s\n", intron->chrom, intron->name); introns++; } } } slSort(&intronList, bedCmp); f = hgCreateTabFile(".", table); for (intron = intronList; intron != NULL; intron = intron->next) gencodeIntronTabOut(intron, f); carefulClose(&f); verbose(1, "%d introns in %d files\n", introns, gtfCount); hSetDb(database); conn = sqlConnect(database); gencodeIntronTableCreate(conn, table, hGetMinIndexLength()); hgLoadTabFile(conn, ".", table, &f); sqlDisconnect(&conn); }
void hgLoadGenePred(char *db, char *table, int numGenePreds, char **genePredFiles)
/* hgLoadGenePred - Load up a mySQL database genePred table.
 * The genePred files are read into memory, written out as a single tab
 * file in the current directory, loaded into table, and the tab file is
 * removed afterwards. */
{
char *workDir = ".";
struct genePred *gpList = loadGenes(numGenePreds, genePredFiles);
struct sqlConnection *dbConn = sqlConnect(db);
FILE *tabOut = hgCreateTabFile(workDir, table);

/* The in-memory list is not needed once the tab file is written. */
mkTabFile(db, gpList, tabOut);
genePredFreeList(&gpList);

setupTable(db, dbConn, table);
hgLoadTabFile(dbConn, workDir, table, &tabOut);
sqlDisconnect(&dbConn);
hgRemoveTabFile(workDir, table);
}
void loadDatabase(char *chromName)
/* Load <chromName>_snpTmp.tab (opened relative to the current directory)
 * into the <chromName>_snpTmp table. */
{
char sqlTable[64], tabPath[64];
struct sqlConnection *dbConn = hAllocConn();
FILE *tabFh = NULL;

/* Both names are derived from the chromosome name. */
safef(sqlTable, sizeof(sqlTable), "%s_snpTmp", chromName);
safef(tabPath, sizeof(tabPath), "%s_snpTmp.tab", chromName);

tabFh = mustOpen(tabPath, "r");
hgLoadTabFile(dbConn, ".", sqlTable, &tabFh);
hFreeConn(&dbConn);
}
int lineToExpTable(char *line, char *table)
/* Create expression format table from line.
 * Writes the records produced by lineToExp to a tab file in tabDir and
 * returns the count lineToExp reports.  When doLoad is set the table is
 * created and loaded and the tab file removed; otherwise the tab file is
 * left in place. */
{
FILE *f = hgCreateTabFile(tabDir, table);
int count = lineToExp(line, f);
if (doLoad)
    {
    struct sqlConnection *conn = sqlConnect(database);
    expRecordCreateTable(conn, table);
    hgLoadTabFile(conn, tabDir, table, &f);
    hgRemoveTabFile(tabDir, table);
    sqlDisconnect(&conn);
    }
else
    {
    /* Nothing else will close the handle on this path; close it so the
     * tab file is complete on disk when we return. */
    carefulClose(&f);
    }
return count;
}
void makeNewExpTable(char *oldTable, struct maMedSpec *medList, char *newTable) /* Create new expTable in hgFixed that is very similar * to oldExpTable, but with rows defined by medList. */ { struct maMedSpec *med; struct expRecord *oldExp, newExp; struct sqlConnection *conn = sqlConnect("hgFixed"); FILE *f = hgCreateTabFile(tabDir, newTable); char query[256], **row; struct sqlResult *sr; int curId = 0; for (med = medList; med != NULL; med = med->next) { /* Load expression record from old table of first * thing in median. */ sqlSafef(query, sizeof(query), "select * from %s where id = %d", oldTable, med->ids[0]); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) == NULL) errAbort("Can't find id %d in %s\n", med->ids[0], oldTable); oldExp = expRecordLoad(row); sqlFreeResult(&sr); if (oldExp->numExtras < 3) errAbort("Can only deal with old tables with 3 extras or more"); /* Create new expression record, mostly just a shallow copy of old. */ newExp = *oldExp; newExp.id = curId; ++curId; newExp.name = newExp.description = med->name; newExp.extras[2] = med->group; /* Save new one, free old one. */ expRecordTabOut(&newExp, f); expRecordFree(&oldExp); } if (doLoad) { expRecordCreateTable(conn, newTable); hgLoadTabFile(conn, tabDir, newTable, &f); hgRemoveTabFile(tabDir, newTable); } sqlDisconnect(&conn); }
void hgRatioMicroarray(char *absTable, char *relTable) /* hgRatioMicroarray - Create a ratio form of microarray. */ { struct maMedSpec *clumpList = NULL; struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char **row; char query[512]; struct expData *ex; struct expData *expList = NULL; FILE *f = hgCreateTabFile(tabDir, relTable); int rowCount = 0; if (clump != NULL) clumpList = maMedSpecReadAll(clump); sqlSafef(query, sizeof(query), "select * from %s", absTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { ex = expDataLoad(row); slAddHead(&expList, ex); if (limit != 0 && rowCount >= limit) break; } sqlFreeResult(&sr); slReverse(&expList); maExpDataClipMin(expList, minAbsVal, minAbsVal * 0.5); maExpDataAddConstant(expList, c); if (transpose) maExpDataDoLogRatioTranspose(expList, doAverage); else maExpDataDoLogRatioGivenMedSpec(expList, clumpList, (doAverage) ? useMean : useMedian); for (ex = expList; ex != NULL; ex = ex->next) expDataTabOut(ex, f); if (doLoad) { expDataCreateTable(conn, relTable); hgLoadTabFile(conn, tabDir, relTable, &f); hgRemoveTabFile(tabDir, relTable); } expDataFreeList(&expList); sqlDisconnect(&conn); }
struct hash *loadModuleToMotif(struct sqlConnection *conn, char *fileName, char *table) /* Load up file which has a line per module. The first word is the module * number, the rest of the tab-separated fields are motif names. * Return hash keyed by module&motif. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *module, *motif; FILE *f = hgCreateTabFile(tmpDir, table); struct dyString *dy = dyStringNew(512); int motifCount = 0, moduleCount = 0; struct hash *hash = newHash(18); while (lineFileNextReal(lf, &line)) { ++moduleCount; subChar(line, ' ', '_'); module = nextWord(&line); while ((motif = nextWord(&line)) != NULL) { ++motifCount; fprintf(f, "%s\t%s\n", module, motif); hashAdd2(hash, module, motif, NULL); } } dyStringPrintf(dy, "CREATE TABLE %s (\n" " module int not null,\n" " motif varchar(255) not null,\n" " #Indices\n" " INDEX(module),\n" " INDEX(motif(16))\n" ")\n", table); sqlRemakeTable(conn, table, dy->string); verbose(1, "%d modules, %d motifs in modules\n", moduleCount, motifCount); hgLoadTabFile(conn, tmpDir, table, &f); hgRemoveTabFile(tmpDir, table); verbose(1, "Loaded %s table\n", table); lineFileClose(&lf); return hash; }
struct hash *makeExpsTable(char *database, char *expTable, char *expFile, int *expCount) /* Open experiment file and use it to create experiment table. Use optional fields if present, otherwise defaults. Return a hash of expId's, keyed by name */ { struct lineFile *lf = lineFileOpen(expFile, TRUE); FILE *f = hgCreateTabFile(tabDir, expTable); int expId = 0; char *words[6]; int wordCt; struct hash *expHash = newHash(0); while ((wordCt = lineFileChopNext(lf, words, ArraySize(words)))) { char *name = words[0]; hashAddInt(expHash, name, expId); fprintf(f, "%d\t%s\t", expId++, name); fprintf(f, "%s\t", wordCt > 1 ? words[1] : name); fprintf(f, "%s\t", wordCt > 2 ? words[2] : expUrl); fprintf(f, "%s\t", wordCt > 3 ? words[3] : expRef); fprintf(f, "%s\t", wordCt > 4 ? words[4] : expCredit); fprintf(f, "0\n"); /* extras */ } if (expId <= 0) errAbort("No experiments in %s", lf->fileName); verbose(2, "%d experiments\n", expId); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); expRecordCreateTable(conn, expTable); hgLoadTabFile(conn, tabDir, expTable, &f); sqlDisconnect(&conn); } lineFileClose(&lf); if (expCount) *expCount = expId; return expHash; }
void makeNewDataTable(char *database, char *oldTable, struct maMedSpec *medList, char *newTable) /* Create new table in database based on medians of data * in old table as defined by medList. */ { struct sqlConnection *conn = sqlConnect(database); FILE *f = hgCreateTabFile(tabDir, newTable); struct expData *expList, *medianExpList, *exp; expList = expDataLoadTableLimit(conn, oldTable, limit); medianExpList = maExpDataMedianFromSpec(expList, medList, minExps); for (exp = medianExpList; exp != NULL; exp = exp->next) expDataTabOut(exp, f); if (doLoad) { expDataCreateTable(conn, newTable); hgLoadTabFile(conn, tabDir, newTable, &f); hgRemoveTabFile(tabDir, newTable); } expDataFreeList(&expList); expDataFreeList(&medianExpList); sqlDisconnect(&conn); }
void hgGnfMicroarray(char *expTable, char *dataTable, char *atlasFile) /** Main function that does all the work for new-style*/ { struct lineFile *lf = lineFileOpen(atlasFile, TRUE); char *line; int i, wordCount, expCount; char **row; float *data; char *affyId; struct hash *hash = newHash(17); FILE *f = NULL; int dataCount = 0; /* Open Atlas file and use first line to create experiment table. */ if (!lineFileNextReal(lf, &line)) errAbort("%s is empty", lf->fileName); if (startsWith("Affy", line)) line += 4; if (startsWith("Gene Name", line)) line += 9; if (line[0] != '\t') errAbort("%s doesn't seem to be a new format atlas file", lf->fileName); expCount = lineToExpTable(line+1, expTable); if (expCount <= 0) errAbort("No experiments in %s it seems", lf->fileName); warn("%d experiments\n", expCount); f = hgCreateTabFile(tabDir, dataTable); AllocArray(row, expCount); AllocArray(data, expCount); while (lineFileNextReal(lf, &line)) { affyId = nextWord(&line); wordCount = chopByWhite(line, row, expCount); if (wordCount != expCount) errAbort("Expecting %d data points, got %d line %d of %s", expCount, wordCount, lf->lineIx, lf->fileName); if (chopName != NULL) { char *e = stringIn(chopName, affyId); if (e != NULL) *e = 0; } if (hashLookup(hash, affyId)) { warn("Duplicate %s, skipping all but first.", affyId); continue; } for (i=0; i<expCount; ++i) { data[i] = sqlFloat(row[i]); } shortDataOut(f, affyId, expCount, data); ++dataCount; if (limit != 0 && dataCount >= limit) break; } lineFileClose(&lf); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); expDataCreateTable(conn, dataTable); hgLoadTabFile(conn, tabDir, dataTable, &f); hgRemoveTabFile(tabDir, dataTable); sqlDisconnect(&conn); } }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *gene; int rc, t; pthread_t *threads = NULL; pthread_attr_t attr; int *threadID = NULL; void *status; char *tempDir = "."; int arrayNum; struct microDataDistance *geneDistPtr = NULL; struct microDataDistance *geneDistArray = NULL; int geneIx; FILE *f = NULL; /* Get list/hash of all items with expression values. */ safef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); f = hgCreateTabFile(tempDir, outTable); synQ = synQueueNew(); /* instantiate threads */ AllocArray( threadID, numThreads ); AllocArray( threads, numThreads ); pthread_attr_init( &attr ); pthread_mutex_init( &mutexDotOut, NULL ); pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ); for (t = 0; t < numThreads; t++) { threadID[t] = t; rc = pthread_create( &threads[t], &attr, computeDistance, (void *) &threadID[t]); if (rc) errAbort("ERROR: in pthread_create() %d\n", rc ); } /* this thread will write to the file from the queue */ for (arrayNum = 0; arrayNum < geneCount; arrayNum++) { geneDistArray = (struct microDataDistance *)synQueueGet( synQ ); geneDistPtr = geneDistArray; /* Print out closest GENEDISTS distances in tab file. */ for (geneIx=0; geneIx < GENEDISTS && geneIx < geneCount; ++geneIx, geneDistPtr++) if (geneDistPtr != NULL) fprintf(f, "%s\t%s\t%f\n", geneDistPtr->name1, geneDistPtr->name2, geneDistPtr->distance); else errAbort("ERROR: writing distance %d to file\n", geneIx); freeMem( geneDistArray ); } /* synchronize all threads */ for (t = 0; t < numThreads; t++) { rc = pthread_join( threads[t], &status); if (rc) errAbort("ERROR: in pthread_join() %d\n", rc ); } printf("Made %s.tab\n", outTable); slFreeList( &geneList ); pthread_mutex_destroy( &mutexDotOut ); pthread_attr_destroy( &attr ); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ safef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { safef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); }
void hgLoadRnaFold(char *database, char *table, char *foldDir) /* hgLoadRnaFold - Load a directory full of RNA fold files into database. */ { char path[PATH_LEN]; struct slName *dirList, *dirEl; struct lineFile *lf; char *line, *word, *s, c; FILE *f = hgCreateTabFile(tabDir, table); int count = 0; dirList = listDir(foldDir, "*"); for (dirEl = dirList; dirEl != NULL; dirEl = dirEl->next) { char *name = dirEl->name; if (sameString(name, "CVS")) continue; safef(path, sizeof(path), "%s/%s", foldDir, name); lf = lineFileOpen(path, TRUE); if (!lineFileNext(lf, &line, NULL)) { if (warnEmpty) { warn("%s is empty, skipping\n", name); lineFileClose(&lf); continue; } else errAbort("%s is empty\n", name); } if (!isupper(line[0])) notFold(path, 1); fprintf(f, "%s\t", name); /* Save name */ fprintf(f, "%s\t", line); /* Save sequence */ lineFileNeedNext(lf, &line, NULL); c = line[0]; if (c != '.' && c != '(') notFold(path, 2); word = nextWord(&line); fprintf(f, "%s\t", word); /* Save nested parenthesis */ /* Parse out (energy) term at end of line. */ s = strchr(line, '('); if (s == NULL) notFold(path, 3); word = skipLeadingSpaces(s+1); if (word == NULL || (!word[0] == '-' && !isdigit(word[0]))) notFold(path, 4); if ((s = strchr(word, ')')) == NULL) notFold(path, 5); *s = 0; fprintf(f, "%s\n", word); lineFileClose(&lf); ++count; } printf("Parsed %d files\n", count); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); rnaFoldCreateTable(conn, table); hgLoadTabFile(conn, tabDir, table, &f); hgRemoveTabFile(tabDir, table); sqlDisconnect(&conn); } }
void processRefSeq(char *database, char *faFile, char *raFile, char *pslFile, char *loc2refFile, char *pepFile, char *mim2locFile) /* hgRefSeqMrna - Load refSeq mRNA alignments and other info into * refSeqGene table. */ { struct lineFile *lf; struct hash *raHash, *rsiHash = newHash(0); struct hash *loc2mimHash = newHash(0); struct refSeqInfo *rsiList = NULL, *rsi; char *s, *line, *row[5]; int wordCount, dotMod = 0; int noLocCount = 0; int rsiCount = 0; int noProtCount = 0; struct psl *psl; struct sqlConnection *conn = hgStartUpdate(database); struct hash *productHash = loadNameTable(conn, "productName", 16); struct hash *geneHash = loadNameTable(conn, "geneName", 16); char *kgName = "refGene"; FILE *kgTab = hgCreateTabFile(".", kgName); FILE *productTab = hgCreateTabFile(".", "productName"); FILE *geneTab = hgCreateTabFile(".", "geneName"); FILE *refLinkTab = hgCreateTabFile(".", "refLink"); FILE *refPepTab = hgCreateTabFile(".", "refPep"); FILE *refMrnaTab = hgCreateTabFile(".", "refMrna"); struct exon *exonList = NULL, *exon; char *answer; char cond_str[200]; /* Make refLink and other tables table if they don't exist already. */ sqlMaybeMakeTable(conn, "refLink", refLinkTableDef); sqlUpdate(conn, "NOSQLINJ delete from refLink"); sqlMaybeMakeTable(conn, "refGene", refGeneTableDef); sqlUpdate(conn, "NOSQLINJ delete from refGene"); sqlMaybeMakeTable(conn, "refPep", refPepTableDef); sqlUpdate(conn, "NOSQLINJ delete from refPep"); sqlMaybeMakeTable(conn, "refMrna", refMrnaTableDef); sqlUpdate(conn, "NOSQLINJ delete from refMrna"); /* Scan through locus link to omim ID file and put in hash. */ { char *row[2]; printf("Scanning %s\n", mim2locFile); lf = lineFileOpen(mim2locFile, TRUE); while (lineFileRow(lf, row)) { hashAdd(loc2mimHash, row[1], intToPt(atoi(row[0]))); } lineFileClose(&lf); } /* Scan through .ra file and make up start of refSeqInfo * objects in hash and list. 
*/ printf("Scanning %s\n", raFile); lf = lineFileOpen(raFile, TRUE); while ((raHash = hashNextRa(lf)) != NULL) { if (clDots > 0 && ++dotMod == clDots ) { dotMod = 0; dotOut(); } AllocVar(rsi); slAddHead(&rsiList, rsi); if ((s = hashFindVal(raHash, "acc")) == NULL) errAbort("No acc near line %d of %s", lf->lineIx, lf->fileName); rsi->mrnaAcc = cloneString(s); if ((s = hashFindVal(raHash, "siz")) == NULL) errAbort("No siz near line %d of %s", lf->lineIx, lf->fileName); rsi->size = atoi(s); if ((s = hashFindVal(raHash, "gen")) != NULL) rsi->geneName = cloneString(s); //!!!else //!!! warn("No gene name for %s", rsi->mrnaAcc); if ((s = hashFindVal(raHash, "cds")) != NULL) parseCds(s, 0, rsi->size, &rsi->cdsStart, &rsi->cdsEnd); else rsi->cdsEnd = rsi->size; if ((s = hashFindVal(raHash, "ngi")) != NULL) rsi->ngi = atoi(s); rsi->geneNameId = putInNameTable(geneHash, geneTab, rsi->geneName); s = hashFindVal(raHash, "pro"); if (s != NULL) rsi->productName = cloneString(s); rsi->productNameId = putInNameTable(productHash, productTab, s); hashAdd(rsiHash, rsi->mrnaAcc, rsi); freeHashAndVals(&raHash); } lineFileClose(&lf); if (clDots) printf("\n"); /* Scan through loc2ref filling in some gaps in rsi. */ printf("Scanning %s\n", loc2refFile); lf = lineFileOpen(loc2refFile, TRUE); while (lineFileNext(lf, &line, NULL)) { char *mrnaAcc; if (line[0] == '#') continue; wordCount = chopTabs(line, row); if (wordCount < 5) errAbort("Expecting at least 5 tab-separated words line %d of %s", lf->lineIx, lf->fileName); mrnaAcc = row[1]; mrnaAcc = accWithoutSuffix(mrnaAcc); if (mrnaAcc[2] != '_') warn("%s is and odd name %d of %s", mrnaAcc, lf->lineIx, lf->fileName); if ((rsi = hashFindVal(rsiHash, mrnaAcc)) != NULL) { rsi->locusLinkId = lineFileNeedNum(lf, row, 0); rsi->omimId = ptToInt(hashFindVal(loc2mimHash, row[0])); rsi->proteinAcc = cloneString(accWithoutSuffix(row[4])); } } lineFileClose(&lf); /* Report how many seem to be missing from loc2ref file. * Write out knownInfo file. 
*/ printf("Writing %s\n", "refLink.tab"); for (rsi = rsiList; rsi != NULL; rsi = rsi->next) { ++rsiCount; if (rsi->locusLinkId == 0) ++noLocCount; if (rsi->proteinAcc == NULL) ++noProtCount; fprintf(refLinkTab, "%s\t%s\t%s\t%s\t%u\t%u\t%u\t%u\n", emptyForNull(rsi->geneName), emptyForNull(rsi->productName), emptyForNull(rsi->mrnaAcc), emptyForNull(rsi->proteinAcc), rsi->geneNameId, rsi->productNameId, rsi->locusLinkId, rsi->omimId); } if (noLocCount) printf("Missing locusLinkIds for %d of %d\n", noLocCount, rsiCount); if (noProtCount) printf("Missing protein accessions for %d of %d\n", noProtCount, rsiCount); /* Process alignments and write them out as genes. */ lf = pslFileOpen(pslFile); dotMod = 0; while ((psl = pslNext(lf)) != NULL) { if (hashFindVal(rsiHash, psl->qName) != NULL) { if (clDots > 0 && ++dotMod == clDots ) { dotMod = 0; dotOut(); } sqlSafefFrag(cond_str, sizeof cond_str, "extAC='%s'", psl->qName); answer = sqlGetField(proteinDB, "spXref2", "displayID", cond_str); if (answer == NULL) { fprintf(stderr, "%s NOT FOUND.\n", psl->qName); fflush(stderr); } if (answer != NULL) { struct genePred *gp = NULL; exonList = pslToExonList(psl); fprintf(kgTab, "%s\t%s\t%c\t%d\t%d\t", psl->qName, psl->tName, psl->strand[0], psl->tStart, psl->tEnd); rsi = hashMustFindVal(rsiHash, psl->qName); gp = genePredFromPsl(psl, rsi->cdsStart, rsi->cdsEnd, genePredStdInsertMergeSize); if (!gp) errAbort("Cannot convert psl (%s) to genePred.\n", psl->qName); fprintf(kgTab, "%d\t%d\t", gp->cdsStart, gp->cdsEnd); fprintf(kgTab, "%d\t", slCount(exonList)); fflush(kgTab); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(kgTab, "%d,", exon->start); fprintf(kgTab, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(kgTab, "%d,", exon->end); fprintf(kgTab, "\n"); slFreeList(&exonList); } } else { fprintf(stderr, "%s found in psl, but not in .fa or .ra data files.\n", psl->qName); fflush(stderr); } } if (clDots) printf("\n"); if (!clTest) { 
writeSeqTable(pepFile, refPepTab, FALSE, TRUE); writeSeqTable(faFile, refMrnaTab, FALSE, FALSE); } carefulClose(&kgTab); carefulClose(&productTab); carefulClose(&geneTab); carefulClose(&refLinkTab); carefulClose(&refPepTab); carefulClose(&refMrnaTab); if (!clTest) { printf("Loading database with %s\n", kgName); fflush(stdout); hgLoadTabFile(conn, ".", kgName, NULL); printf("Loading database with %s\n", "productName"); fflush(stdout); hgLoadTabFile(conn, ".", "productName", NULL); printf("Loading database with %s\n", "geneName"); fflush(stdout); hgLoadTabFile(conn, ".", "geneName", NULL); printf("Loading database with %s\n", "refLink"); fflush(stdout); hgLoadTabFile(conn, ".", "refLink", NULL); printf("Loading database with %s\n", "refPep"); fflush(stdout); hgLoadTabFile(conn, ".", "refPep", NULL); printf("Loading database with %s\n", "refMrna"); fflush(stdout); hgLoadTabFile(conn, ".", "refMrna", NULL); } }
void hgKnownToSuper(char *database, char *org, char *assFile) /* hgKnownToSuper - Load knownToSuperfamily table. */ { struct sqlConnection *conn = sqlConnect(database); struct hash *pepToKnown = ensPepToKnown(conn, TRUE); char *table = "knownToSuper"; FILE *f = hgCreateTabFile(tempDir, table); struct lineFile *lf = lineFileOpen(assFile, TRUE); boolean gotOrg = FALSE; int outCount = 0; char *row[6]; while (lineFileRow(lf, row)) { if (sameString(row[0], org)) { char *pepName = row[1]; char *regions = row[3]; char *eVal = row[4]; char *supId = row[5]; char *knownId = hashFindVal(pepToKnown, pepName); if (knownId != NULL) { char *region, *e; int start,end; /* Loop through comma-separated region string. */ for (region = regions; region != NULL; region = e) { e = strchr(region, ','); if (e != NULL) { *e++ = 0; if (e[0] == 0) e = NULL; } if (sscanf(region, "%d-%d", &start, &end) < 2) errAbort("bad region %s line %d of %s", region, lf->lineIx, lf->fileName); fprintf(f, "%s\t%s\t%d\t%d\t%s\n", knownId, supId, start-1, end, eVal); ++outCount; } } gotOrg = TRUE; } } lineFileClose(&lf); if (!gotOrg) errAbort("Looks like '%s' is not a recognized organism", org); if (outCount <= 0) errAbort("No good records found in %s", assFile); printf("%d records output\n", outCount); /* Refresh connection in case things took a while. */ sqlDisconnect(&conn); conn = sqlConnect(database); /* Load up database. */ createTable(conn, table); hgLoadTabFile(conn, tempDir, table, &f); hgRemoveTabFile(tempDir, table); }
void hgStsAlias(char *database, char *inFile) /* hgStsAlias - Make table of STS aliases. */ { struct lineFile *lf = lineFileOpen(inFile, TRUE); char *words[16],*parts[64]; int partCount, wordCount; char *table = "stsAlias"; struct sqlConnection *conn = sqlConnect(database); FILE *f = hgCreateTabFile(".", table); struct hash *trueHash = makeTrueHash(conn); int i; char *alias, *trueName; int aliasCount = 0; while ((wordCount = lineFileChop(lf, words)) != 0) { trueName = NULL; if (wordCount != 2) { static boolean warned = FALSE; if (!warned) { warn("Got %d words line %d of %s, skipping", wordCount, lf->lineIx, lf->fileName); warn("There may be other lines like this as well"); warned = TRUE; } continue; } lineFileExpectWords(lf, 2, wordCount); partCount = chopByChar(words[1], ';', parts, ArraySize(parts)); if (partCount >= ArraySize(parts)) errAbort("Too many aliases line %d of %s\n", lf->lineIx, lf->fileName); /* Figure out which one we actually have a name for. */ for (i=0; i<partCount; ++i) { alias = parts[i]; if (hashLookup(trueHash, alias)) { trueName = alias; break; } } /* If we have a true name then write out alias/trueName pairs. */ if (trueName != NULL) { for (i=0; i<partCount; ++i) { alias = parts[i]; if (alias != trueName) { ++aliasCount; fprintf(f, "%s\t%s\n", alias, trueName); } } } } lineFileClose(&lf); printf("Found %d aliases in %s\n", aliasCount, inFile); hgLoadTabFile(conn, ".", table, &f); sqlDisconnect(&conn); printf("Loaded table %s in database %s\n", table, database); }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *geneList = NULL, *curGene, *gene; int geneIx, geneCount = 0; struct microData **geneArray = NULL; float *weights = NULL; char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); /* Get list/hash of all items with expression values. */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. */ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); /* Get an array for sorting. 
*/ AllocArray(geneArray, geneCount); for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx) geneArray[geneIx] = gene; /* Print out closest 1000 in tab file. */ for (curGene = geneList; curGene != NULL; curGene = curGene->next) { calcDistances(curGene, geneList, weights); qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance); for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx) { gene = geneArray[geneIx]; fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance); } dotOut(); } printf("Made %s.tab\n", outTable); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query)", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target)", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); }
void knownToVisiGene(char *database) /* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */ { char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); struct sqlConnection *hConn = sqlConnect(database); struct sqlConnection *iConn = sqlConnect(visiDb); struct sqlResult *sr; char **row; struct hash *geneImageHash = newHash(18); struct hash *locusLinkImageHash = newHash(18); struct hash *refSeqImageHash = newHash(18); struct hash *genbankImageHash = newHash(18); struct hash *probeImageHash = newHash(18); struct hash *knownToLocusLinkHash = newHash(18); struct hash *knownToRefSeqHash = newHash(18); struct hash *knownToGeneHash = newHash(18); struct hash *favorHugoHash = newHash(18); struct hash *knownToProbeHash = newHash(18); struct hash *knownToAllProbeHash = newHash(18); struct genePred *knownList = NULL, *known; struct hash *dupeHash = newHash(17); probesDb = optionVal("probesDb", database); struct sqlConnection *probesConn = sqlConnect(probesDb); vgProbes = sqlTableExists(probesConn,"vgProbes"); vgAllProbes = sqlTableExists(probesConn,"vgAllProbes"); /* Go through and make up hashes of images keyed by various fields. 
*/ sr = sqlGetResult(iConn, NOSQLINJ "select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank" ",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id" " from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap" " where image.imageFile = imageFile.id" " and image.id = imageProbe.image" " and imageProbe.probe = probe.id" " and probe.gene = gene.id" " and image.submissionSet=submissionSet.id" " and vgPrbMap.probe = probe.id"); while ((row = sqlNextRow(sr)) != NULL) { int id = sqlUnsigned(row[0]); float priority = atof(row[1]); int privateUser = sqlSigned(row[7]); char vgPrb_Id[256]; safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]); int geneId = sqlUnsigned(row[9]); if (privateUser == 0) { addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id); addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]); addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]); addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]); addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]); } } verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d" ", genbankImageHash %d probeImageHash %d\n", geneImageHash->elCount, locusLinkImageHash->elCount, refSeqImageHash->elCount, genbankImageHash->elCount, probeImageHash->elCount); sqlFreeResult(&sr); /* Build up list of known genes. */ sr = sqlGetResult(hConn, NOSQLINJ "select * from knownGene"); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *known = genePredLoad(row); if (!hashLookup(dupeHash, known->name)) { hashAdd(dupeHash, known->name, NULL); slAddHead(&knownList, known); } } slReverse(&knownList); sqlFreeResult(&sr); verbose(2, "Got %d known genes\n", slCount(knownList)); /* Build up hashes from knownGene to other things. 
*/ if (vgProbes) bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash); if (vgAllProbes) bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash); foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE); foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE); foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE); foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE); foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE); verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n", knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount, knownToProbeHash->elCount, knownToAllProbeHash->elCount); /* Try and find an image for each gene. */ for (known = knownList; known != NULL; known = known->next) { char *name = known->name; struct prioritizedImage *best = NULL; { best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash); if (!best) best = bestImage(name, knownToRefSeqHash, refSeqImageHash); if (!best) { best = hashFindVal(genbankImageHash, name); } if (!best) best = bestImage(name, knownToGeneHash, geneImageHash); if (vgProbes && !best) best = bestImage(name, knownToProbeHash, probeImageHash); if (vgAllProbes && !best) best = bestImage(name, knownToAllProbeHash, probeImageHash); } if (best) { fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId); } } createTable(hConn, outTable); hgLoadTabFile(hConn, tempDir, outTable, &f); hgRemoveTabFile(tempDir, outTable); }
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table, struct hash *geneToModuleHash, struct hash *moduleAndMotifHash, struct hash *motifHash, struct hash *positionsHash, char *regionTable) /* Load file which is a big matrix with genes for rows and motifs for * columns. There is a semicolon-separated list of numbers in the matrix * where a gene has the motif, and an empty (tab separated) field * where there is no motif. The numbers are relative to the * region associated with the gene in the positionsHash. * Only load bits of this where motif actually occurs in module associated * with gene. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; FILE *f = hgCreateTabFile(tmpDir, table); char *motifNames[32*1024], *row[32*1024]; int motifCount, rowSize, i; char *gene, *module; int geneCount = 0, total = 0; struct dyString *dy = dyStringNew(512); struct genomePos *motifPosList = NULL, *motifPosForGene; struct genomePos *regionPosList = NULL, *regionPos; /* Read first line, which is labels. */ if (!lineFileNextReal(lf, &line)) errAbort("Empty file %s", fileName); subChar(line, ' ', '_'); motifCount = chopLine(line, motifNames); if (motifCount >= ArraySize(motifNames)) errAbort("Too many motifs line 1 of %s", fileName); lineFileExpectAtLeast(lf, 2, motifCount); motifNames[0] = NULL; for (i=1; i<motifCount; ++i) { char name[64]; motifNames[i] = cloneString(fixMotifName(motifNames[i],name,sizeof(name))); if (!hashLookup(motifHash, motifNames[i])) errAbort("Motif %s is in %s but not modules_motifs.gxm", motifNames[i], fileName); } /* Read subsequent lines. 
*/ while ((rowSize = lineFileChopTab(lf, row)) != 0) { lineFileExpectWords(lf, motifCount, rowSize); gene = row[0]; module = hashFindVal(geneToModuleHash, gene); if (module == NULL) { warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", gene, lf->lineIx, lf->fileName); continue; } regionPos = NULL; for (i=1; i<rowSize; ++i) { if (row[i][0] != 0) { if (hashLookup2(moduleAndMotifHash, module, motifNames[i])) { regionPos = hashFindVal(positionsHash, gene); if (regionPos == NULL) { warn("WARNING: %s in %s but not gene_positions.tab", gene, fileName); i = rowSize; continue; } motifPosForGene = convertMotifPos(row[i], regionPos, hashMustFindVal(motifHash, motifNames[i]), lf); motifPosList = slCat(motifPosForGene, motifPosList); ++total; } } } if (regionPos != NULL) { slAddHead(®ionPosList, regionPos); } ++geneCount; } lineFileClose(&lf); /* Output sorted table of all motif hits. */ { struct genomePos *pos; slSort(&motifPosList, genomePosCmp); for (pos = motifPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->motif); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\t", pos->strand); fprintf(f, "%s\n", pos->name); } dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " gene varchar(255) not null,\n" " #Indices\n" " INDEX(gene(12)),\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", table); sqlRemakeTable(conn, table, dy->string); verbose(1, "%d genes, %d motifs, %d motifs in genes\n", geneCount, motifCount-1, total); hgLoadTabFile(conn, tmpDir, table, &f); // hgRemoveTabFile(tmpDir, table); verbose(1, "Loaded %s table\n", table); 
slFreeList(&motifPosList); } /* Now output sorted table of upstream regions. */ { FILE *f = hgCreateTabFile(tmpDir, regionTable); struct genomePos *pos; dyStringClear(dy); dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " #Indices\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", regionTable); sqlRemakeTable(conn, regionTable, dy->string); slSort(®ionPosList, genomePosCmp); for (pos = regionPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->name); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\n", pos->strand); } hgLoadTabFile(conn, tmpDir, regionTable, &f); // hgRemoveTabFile(tmpDir, regionTable); } }
void hgLoadChromGraph(boolean doLoad, char *db, char *track, char *fileName)
/* hgLoadChromGraph - Load up chromosome graph.
 *   doLoad   - when FALSE only the sorted tab file is written.
 *   db       - database to load into; also used in the default gbdb path.
 *   track    - table name, tab file name and base name of the .cgb file.
 *   fileName - input file of chromGraph values.
 * Options read here: -idTable, -minusLog10, -pathPrefix.
 * Aborts via errAbort when the input yields no elements. */
{
double minVal,maxVal;
struct chromGraph *el, *list;
FILE *f;
char *tempDir = ".";
char path[PATH_LEN], gbdbPath[PATH_LEN];
char *idTable = optionVal("idTable", NULL);
char *pathPrefix = NULL;
boolean minusLog10 = optionExists("minusLog10");

if (idTable == NULL)
    list = chromGraphLoadAll(fileName);
else
    list = chromGraphListWithTable(fileName, db, idTable);
if (list == NULL)
    errAbort("%s is empty", fileName);

/* Apply the -minusLog10 transform to EVERY element, including the head of
 * the list.  Previously the loop started at list->next, so the first value
 * was left untransformed and also seeded min/max with a raw value. */
if (minusLog10)
    {
    for (el = list; el != NULL; el = el->next)
        {
        if (el->val == 1)
            el->val = 0;        /* Avoids storing a negative zero. */
        else if (el->val > 0)
            el->val = -1 * log(el->val)/log(10);
        /* Values <= 0 have no logarithm and are left as they are. */
        }
    }

/* Figure out min/max values */
minVal = maxVal = list->val;
for (el = list->next; el != NULL; el = el->next)
    {
    if (el->val < minVal)
        minVal = el->val;
    if (el->val > maxVal)
        maxVal = el->val;
    }

/* Sort and write out temp file. */
slSort(&list, chromGraphCmp);
f = hgCreateTabFile(tempDir, track);
for (el = list; el != NULL; el = el->next)
    chromGraphTabOut(el, f);

if (doLoad)
    {
    struct dyString *dy = dyStringNew(0);
    struct sqlConnection *conn;

    /* Set up connection to database and create main table. */
    conn = hAllocConn(db);
    sqlDyStringPrintf(dy, createString, track, hGetMinIndexLength(db));
    sqlRemakeTable(conn, track, dy->string);

    /* Load main table and clean up file handle. */
    hgLoadTabFile(conn, tempDir, track, &f);
    hgRemoveTabFile(tempDir, track);

    /* If need be create meta table.  If need be delete old row. */
    if (!sqlTableExists(conn, "metaChromGraph"))
        sqlUpdate(conn, metaCreateString);
    else
        {
        dyStringClear(dy);
        sqlDyStringPrintf(dy, "delete from metaChromGraph where name = '%s'", track);
        sqlUpdate(conn, dy->string);
        }

    /* Make chrom graph file */
    safef(path, sizeof(path), "%s.cgb", track);
    chromGraphToBin(list, path);
    /* path is reused to hold the default prefix for the -pathPrefix option. */
    safef(path, sizeof(path), "/gbdb/%s/chromGraph", db);
    pathPrefix = optionVal("pathPrefix", path);
    safef(gbdbPath, sizeof(gbdbPath), "%s/%s.cgb", pathPrefix, track);

    /* Create new line in meta table */
    dyStringClear(dy);
    sqlDyStringPrintf(dy, "insert into metaChromGraph values('%s',%f,%f,'%s');",
                      track, minVal, maxVal, gbdbPath);
    sqlUpdate(conn, dy->string);
    }
}
void hgLoadMafSummary(char *db, char *table, char *fileName) /* hgLoadMafSummary - Load a summary table of pairs in a maf into a database. */ { long mafCount = 0, allMafCount = 0; struct mafComp *mcMaster = NULL; struct mafAli *maf; struct mafFile *mf = mafOpen(fileName); struct sqlConnection *conn; FILE *f = hgCreateTabFile(".", table); long componentCount = 0; struct hash *componentHash = newHash(0); if (!test) { conn = sqlConnect(database); mafSummaryTableCreate(conn, table, hGetMinIndexLength(db)); } verbose(1, "Indexing and tabulating %s\n", fileName); /* process mafs */ while ((maf = mafNext(mf)) != NULL) { mcMaster = mafMaster(maf, mf, fileName); allMafCount++; if (mcMaster->srcSize < minSeqSize) continue; while (mcMaster->size > maxSize) { /* break maf into maxSize pieces */ int end = mcMaster->start + maxSize; struct mafAli *subMaf = mafSubset(maf, mcMaster->src, mcMaster->start, end); verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src, mcMaster->start, mcMaster->size); componentCount += processMaf(subMaf, componentHash, f, mf, fileName); mafAliFree(&subMaf); subMaf = mafSubset(maf, mcMaster->src, end, end + (mcMaster->size - maxSize)); mafAliFree(&maf); maf = subMaf; mcMaster = mafMaster(maf, mf, fileName); } if (mcMaster->size != 0) { /* remainder of maf after splitting off maxSize submafs */ componentCount += processMaf(maf, componentHash, f, mf, fileName); } mafAliFree(&maf); mafCount++; } mafFileFree(&mf); flushSummaryBlocks(componentHash, f); verbose(1, "Created %ld summary blocks from %ld components and %ld mafs from %s\n", summaryCount, componentCount, allMafCount, fileName); if (test) return; verbose(1, "Loading into %s table %s...\n", database, table); hgLoadTabFile(conn, ".", table, &f); verbose(1, "Loading complete"); hgEndUpdate(&conn, "Add %ld maf summary blocks from %s\n", summaryCount, fileName); }
void hgFlyBase(char *database, char *genesFile) /* hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of * tables. */ { char *tGene = "fbGene"; char *tSynonym = "fbSynonym"; char *tAllele = "fbAllele"; char *tRef = "fbRef"; char *tRole = "fbRole"; char *tPhenotype = "fbPhenotype"; char *tTranscript = "fbTranscript"; char *tGo = "fbGo"; char *tUniProt = "fbUniProt"; FILE *fGene = hgCreateTabFile(tabDir, tGene); FILE *fSynonym = hgCreateTabFile(tabDir, tSynonym); FILE *fAllele = hgCreateTabFile(tabDir, tAllele); FILE *fRef = hgCreateTabFile(tabDir, tRef); FILE *fRole = hgCreateTabFile(tabDir, tRole); FILE *fPhenotype = hgCreateTabFile(tabDir, tPhenotype); FILE *fTranscript = NULL; FILE *fGo = hgCreateTabFile(tabDir, tGo); FILE *fUniProt = hgCreateTabFile(tabDir, tUniProt); struct lineFile *lf = lineFileOpen(genesFile, TRUE); struct hash *refHash = newHash(19); int nextRefId = 0; int nextAlleleId = 0; char *line, sub, type, *rest, *s; char *geneSym = NULL, *geneName = NULL, *geneId = NULL; int recordCount = 0; struct slName *synList = NULL, *syn; int curAllele = 0, curRef = 0; struct ref *ref = NULL; struct sqlConnection *conn; struct hash *goUniqHash = newHash(18); /* Make table from flybase genes to BGDP transcripts. */ if (doTranscript) { fTranscript = hgCreateTabFile(tabDir, tTranscript); getAllSplices(database, fTranscript); } /* Make dummy reference for flybase itself. */ fprintf(fRef, "0\tFlyBase\n"); /* Loop through parsing and writing tab files. */ while (lineFileNext(lf, &line, NULL)) { sub = line[0]; if (sub == '#') { /* End of record. */ ++recordCount; if (geneId == NULL) errAbort("Record without *z line ending line %d of %s", lf->lineIx, lf->fileName); /* Write out synonyms. */ s = naForNull(geneSym); geneSym = ungreek(s); freeMem(s); s = naForNull(geneName); geneName = ungreek(s); if (! 
sameString(s, "n/a")) freeMem(s); if (geneSym != NULL && !sameString(geneSym, "n/a")) slNameStore(&synList, geneSym); if (geneName != NULL && !sameString(geneName, "n/a")) slNameStore(&synList, geneName); for (syn = synList; syn != NULL; syn = syn->next) { s = ungreek(syn->name); fprintf(fSynonym, "%s\t%s\n", geneId, s); freeMem(s); } /* Write out gene record. */ fprintf(fGene, "%s\t%s\t%s\n", geneId, geneSym, geneName); /* Clean up. */ freez(&geneSym); freez(&geneName); freez(&geneId); slFreeList(&synList); ref = NULL; curRef = curAllele = 0; continue; } else if (sub == 0) errAbort("blank line %d of %s, not allowed in gene.txt", lf->lineIx, lf->fileName); else if (isalnum(sub)) errAbort("line %d of %s begins with %c, not allowed", lf->lineIx, lf->fileName, sub); type = line[1]; rest = trimSpaces(line+2); if (sub == '*' && type == 'a') geneSym = cloneString(rest); else if (sub == '*' && type == 'e') geneName = cloneString(rest); else if (sub == '*' && type == 'z') { geneId = cloneString(rest); if (!startsWith("FBgn", geneId)) errAbort("Bad FlyBase gene ID %s line %d of %s", geneId, lf->lineIx, lf->fileName); } else if (type == 'i' && (sub == '*' || sub == '$')) { if (strlen(rest) > 2) /* Avoid short useless ones. 
*/ slNameStore(&synList, rest); } else if (sub == '*' && type == 'A') { if (geneId == NULL) errAbort("Allele before geneId line %d of %s", lf->lineIx, lf->fileName); curAllele = ++nextAlleleId; fprintf(fAllele, "%d\t%s\t%s\n", curAllele, geneId, rest); if (!sameString(rest, "classical") && !sameString(rest, "in vitro") && !sameString(rest, "wild-type") ) { slNameStore(&synList, rest); } } else if (sub == '*' && type == 'm') { if (geneId == NULL) errAbort("*m protein ID before geneId line %d of %s", lf->lineIx, lf->fileName); if (startsWith("UniProt", rest)) { char *ptr = strchr(rest, ':'); if (ptr != NULL) ptr++; else errAbort("Trouble parsing UniProt ID %s like %d of %s", rest, lf->lineIx, lf->fileName); fprintf(fUniProt, "%s\t%s\n", geneId, ptr); } } else if (type == 'E') { ref = hashFindVal(refHash, rest); if (ref == NULL) { AllocVar(ref); ref->id = ++nextRefId; hashAdd(refHash, rest, ref); subChar(rest, '\t', ' '); fprintf(fRef, "%d\t%s\n", ref->id, rest); } curRef = ref->id; } else if ((type == 'k' || type == 'r' || type == 'p') && sub != '@') { FILE *f = (type == 'r' ? fRole : fPhenotype); struct dyString *dy = suckSameLines(lf, line); subChar(dy->string, '\t', ' '); if (geneId == NULL) errAbort("Expecting *z in record before line %d of %s", lf->lineIx, lf->fileName); fprintf(f, "%s\t%d\t%d\t%s\n", geneId, curAllele, curRef, dy->string); dyStringFree(&dy); } else if (type == 'd' || type == 'f' || type == 'F') { FILE *f = fGo; char aspect = (type == 'd') ? 'P' : (type == 'f') ? 
'C' : 'F'; char *goId = rest; char *p = strstr(goId, " ; "); char assoc[128]; if (p == NULL) continue; else goId = firstWordInLine(p + 3); safef(assoc, sizeof(assoc), "%s.%s", geneId, goId); if (hashLookup(goUniqHash, assoc) == NULL) { hashAddInt(goUniqHash, assoc, 1); fprintf(f, "%s\t%s\t%c\n", geneId, goId, aspect); } } } printf("Processed %d records in %d lines\n", recordCount, lf->lineIx); lineFileClose(&lf); conn = sqlConnect(database); remakeTables(conn); if (doLoad) { printf("Loading %s\n", tGene); hgLoadTabFile(conn, tabDir, tGene, &fGene); if (doTranscript) { printf("Loading %s\n", tTranscript); hgLoadTabFile(conn, tabDir, tTranscript, &fTranscript); } printf("Loading %s\n", tSynonym); hgLoadTabFile(conn, tabDir, tSynonym, &fSynonym); printf("Loading %s\n", tAllele); hgLoadTabFile(conn, tabDir, tAllele, &fAllele); printf("Loading %s\n", tRef); hgLoadTabFile(conn, tabDir, tRef, &fRef); printf("Loading %s\n", tRole); hgLoadTabFile(conn, tabDir, tRole, &fRole); printf("Loading %s\n", tPhenotype); hgLoadTabFile(conn, tabDir, tPhenotype, &fPhenotype); printf("Loading %s\n", tGo); hgLoadTabFile(conn, tabDir, tGo, &fGo); printf("Loading %s\n", tUniProt); hgLoadTabFile(conn, tabDir, tUniProt, &fUniProt); hgRemoveTabFile(tabDir, tGene); if (doTranscript) hgRemoveTabFile(tabDir, tTranscript); hgRemoveTabFile(tabDir, tSynonym); hgRemoveTabFile(tabDir, tAllele); hgRemoveTabFile(tabDir, tRef); hgRemoveTabFile(tabDir, tRole); hgRemoveTabFile(tabDir, tPhenotype); hgRemoveTabFile(tabDir, tGo); hgRemoveTabFile(tabDir, tUniProt); } }
struct hash *loadMotifWeights(struct sqlConnection *conn, char *fileName, char *table) /* Load in XML weight motif file and save it in tab-separated format * and in hash keyed by motif name. */ { struct esmMotifs *motifs = esmMotifsLoad(fileName); struct esmMotif *motif; FILE *f = hgCreateTabFile(tmpDir, table); struct dyString *dy = dyStringNew(512); struct hash *hash = newHash(16); for (motif = motifs->esmMotif; motif != NULL; motif = motif->next) { struct esmWeights *weights = motif->esmWeights; int posCount = slCount(weights->esmPosition); struct esmPosition *pos; struct dnaMotif *dm; char name[64]; fixMotifName(motif->Name, name, sizeof(name)); AllocVar(dm); dm->name = cloneString(name); dm->columnCount = posCount; AllocArray(dm->aProb, posCount); AllocArray(dm->cProb, posCount); AllocArray(dm->gProb, posCount); AllocArray(dm->tProb, posCount); for (pos = weights->esmPosition; pos != NULL; pos = pos->next) { char *row[5]; double odds[4], sumOdds = 0; int i; int ix = pos->Num; int rowSize = chopString(pos->Weights, ";", row, ArraySize(row)); if (rowSize != 4) errAbort("Expecting 4 values for weights in position %d of Motif %s", pos->Num, motif->Name); if (ix >= posCount) errAbort("Num %d out of range in Motif %s", ix, motif->Name); for (i=0; i<4; ++i) { odds[i] = exp(atof(row[0])); sumOdds += odds[i]; } dm->aProb[ix] = odds[0]/sumOdds; dm->cProb[ix] = odds[1]/sumOdds; dm->gProb[ix] = odds[2]/sumOdds; dm->tProb[ix] = odds[3]/sumOdds; } dnaMotifTabOut(dm, f); hashAdd(hash, dm->name, dm); } dyStringPrintf(dy, "CREATE TABLE %s (\n" " name varchar(16) not null, # Motif name.\n" " columnCount int not null, # Count of columns in motif.\n" " aProb longblob not null, # Probability of A's in each column.\n" " cProb longblob not null, # Probability of C's in each column.\n" " gProb longblob not null, # Probability of G's in each column.\n" " tProb longblob not null, # Probability of T's in each column.\n" " #Indices\n" " PRIMARY KEY(name)\n" ")\n", table); 
sqlRemakeTable(conn, table, dy->string); hgLoadTabFile(conn, tmpDir, table, &f); hgRemoveTabFile(tmpDir, table); verbose(1, "Processed %d motifs into %s\n", slCount(motifs->esmMotif), table); return hash; }
void hgLoadNetDist(char *inTab, char *db, char *outTable) { char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); struct sqlConnection *hConn = sqlConnect(db); FILE *missingFile=NULL; int missingCount=0; struct lineFile *lf=NULL; char *row[3]; int rowCount=3; if (sqlRemap) { fetchRemapInfo(db); missingHash = newHash(16); missingFile = mustOpen("missing.tab","w"); } /* read edges from file */ lf=lineFileOpen(inTab, TRUE); /* print final values, remapping if needed */ while (lineFileNextRowTab(lf, row, rowCount)) { char *geneI = row[0]; char *geneJ = row[1]; char *dij = row[2]; char *gi=NULL, *gj=NULL; if (sqlRemap) { /* it is possible for each id to have multiple remap values in hash */ struct hashEl *hi=NULL, *hj=NULL, *hjSave=NULL; hi = hashLookup(aliasHash,geneI); hj = hashLookup(aliasHash,geneJ); missingCount += handleMissing(hi, geneI, missingHash, missingFile); missingCount += handleMissing(hj, geneJ, missingHash, missingFile); hjSave = hj; /* do all combinations of i and j */ for(;hi;hi=hashLookupNext(hi)) { gi = (char *)hi->val; for(;hj;hj=hashLookupNext(hj)) { gj = (char *)hj->val; fprintf(f,"%s\t%s\t%s\n",gi,gj,dij); } hj = hjSave; /* reset it */ } } else { gi=geneI; gj=geneJ; fprintf(f,"%s\t%s\t%s\n",gi,gj,dij); } } lineFileClose(&lf); carefulClose(&f); if (sqlRemap) { carefulClose(&missingFile); if (missingCount == 0) unlink("missing.tab"); else printf("hgLoadNetDist %d id-remapping misses, see missing.tab\n", missingCount); } createTable(hConn, outTable); hgLoadTabFile(hConn, tempDir, outTable, &f); hgRemoveTabFile(tempDir, outTable); }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *gene; int rc, t; pthread_t *threads = NULL; pthread_attr_t attr; int *threadID = NULL; void *status; char *tempDir = "."; long time1, time2; time1 = clock1000(); /* Get list/hash of all items with expression values. */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); time2 = clock1000(); verbose(2, "records read time: %.2f seconds\n", (time2 - time1) / 1000.0); f = hgCreateTabFile(tempDir, outTable); /* instantiate threads */ AllocArray( threadID, numThreads ); AllocArray( threads, numThreads ); pthread_attr_init( &attr ); pthread_mutex_init( &mutexfilehandle, NULL ); pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ); for (t = 0; t < numThreads; t++) { threadID[t] = t; rc = pthread_create( &threads[t], &attr, computeDistance, (void *) &threadID[t]); if (rc) errAbort("ERROR: in pthread_create() %d\n", rc ); } /* synchronize all threads */ for (t = 0; t < numThreads; t++) { rc = pthread_join( threads[t], &status); if (rc) errAbort("ERROR: in pthread_join() %d\n", rc ); } printf("Made %s.tab\n", outTable); slFreeList( &geneList ); pthread_mutex_destroy( &mutexfilehandle ); pthread_attr_destroy( &attr ); time1 = time2; time2 = clock1000(); verbose(2, "distance computation time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); time1 = time2; time2 = clock1000(); verbose(2, "table create/load/index time: %.2f seconds\n", (time2 - time1) / 1000.0); }