void saveDataTable(struct expData *data) /* Create the expression table the cheesey way by loading a temp tab file. */ { FILE *f = hgCreateTabFile(".", table); struct expData *cur; struct sqlConnection *conn = sqlConnect(database); expDataCreateTable(conn, table); for (cur = data; cur != NULL; cur = cur->next) expDataTabOut(cur, f); hgLoadTabFile(conn, ".", table, &f); hgRemoveTabFile(".", table); sqlDisconnect(&conn); }
int lineToExpTable(char *line, char *table)
/* Create expression format table from line.  The records are written
 * to a tab file in tabDir; the file is loaded into the database (and
 * then removed) only when doLoad is set.  Returns whatever count
 * lineToExp reports. */
{
FILE *tabFile = hgCreateTabFile(tabDir, table);
int expCount = lineToExp(line, tabFile);
struct sqlConnection *sqlConn = NULL;

if (!doLoad)
    return expCount;

sqlConn = sqlConnect(database);
expRecordCreateTable(sqlConn, table);
hgLoadTabFile(sqlConn, tabDir, table, &tabFile);
hgRemoveTabFile(tabDir, table);
sqlDisconnect(&sqlConn);
return expCount;
}
void hgLoadGenePred(char *db, char *table, int numGenePreds, char **genePredFiles)
/* hgLoadGenePred - Load up a mySQL database genePred table.
 * The genes read from genePredFiles are written to a tab file in the
 * current directory, the table is set up, and the tab file is loaded
 * into it and then removed. */
{
char *scratchDir = ".";
struct genePred *geneList = loadGenes(numGenePreds, genePredFiles);
struct sqlConnection *sqlConn = sqlConnect(db);
FILE *tabFile = hgCreateTabFile(scratchDir, table);

/* The in-memory list is only needed to produce the tab file. */
mkTabFile(db, geneList, tabFile);
genePredFreeList(&geneList);

setupTable(db, sqlConn, table);
hgLoadTabFile(sqlConn, scratchDir, table, &tabFile);
sqlDisconnect(&sqlConn);
hgRemoveTabFile(scratchDir, table);
}
void makeNewExpTable(char *oldTable, struct maMedSpec *medList, char *newTable) /* Create new expTable in hgFixed that is very similar * to oldExpTable, but with rows defined by medList. */ { struct maMedSpec *med; struct expRecord *oldExp, newExp; struct sqlConnection *conn = sqlConnect("hgFixed"); FILE *f = hgCreateTabFile(tabDir, newTable); char query[256], **row; struct sqlResult *sr; int curId = 0; for (med = medList; med != NULL; med = med->next) { /* Load expression record from old table of first * thing in median. */ sqlSafef(query, sizeof(query), "select * from %s where id = %d", oldTable, med->ids[0]); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) == NULL) errAbort("Can't find id %d in %s\n", med->ids[0], oldTable); oldExp = expRecordLoad(row); sqlFreeResult(&sr); if (oldExp->numExtras < 3) errAbort("Can only deal with old tables with 3 extras or more"); /* Create new expression record, mostly just a shallow copy of old. */ newExp = *oldExp; newExp.id = curId; ++curId; newExp.name = newExp.description = med->name; newExp.extras[2] = med->group; /* Save new one, free old one. */ expRecordTabOut(&newExp, f); expRecordFree(&oldExp); } if (doLoad) { expRecordCreateTable(conn, newTable); hgLoadTabFile(conn, tabDir, newTable, &f); hgRemoveTabFile(tabDir, newTable); } sqlDisconnect(&conn); }
void hgRatioMicroarray(char *absTable, char *relTable) /* hgRatioMicroarray - Create a ratio form of microarray. */ { struct maMedSpec *clumpList = NULL; struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char **row; char query[512]; struct expData *ex; struct expData *expList = NULL; FILE *f = hgCreateTabFile(tabDir, relTable); int rowCount = 0; if (clump != NULL) clumpList = maMedSpecReadAll(clump); sqlSafef(query, sizeof(query), "select * from %s", absTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { ex = expDataLoad(row); slAddHead(&expList, ex); if (limit != 0 && rowCount >= limit) break; } sqlFreeResult(&sr); slReverse(&expList); maExpDataClipMin(expList, minAbsVal, minAbsVal * 0.5); maExpDataAddConstant(expList, c); if (transpose) maExpDataDoLogRatioTranspose(expList, doAverage); else maExpDataDoLogRatioGivenMedSpec(expList, clumpList, (doAverage) ? useMean : useMedian); for (ex = expList; ex != NULL; ex = ex->next) expDataTabOut(ex, f); if (doLoad) { expDataCreateTable(conn, relTable); hgLoadTabFile(conn, tabDir, relTable, &f); hgRemoveTabFile(tabDir, relTable); } expDataFreeList(&expList); sqlDisconnect(&conn); }
struct hash *loadModuleToMotif(struct sqlConnection *conn, char *fileName, char *table) /* Load up file which has a line per module. The first word is the module * number, the rest of the tab-separated fields are motif names. * Return hash keyed by module&motif. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *module, *motif; FILE *f = hgCreateTabFile(tmpDir, table); struct dyString *dy = dyStringNew(512); int motifCount = 0, moduleCount = 0; struct hash *hash = newHash(18); while (lineFileNextReal(lf, &line)) { ++moduleCount; subChar(line, ' ', '_'); module = nextWord(&line); while ((motif = nextWord(&line)) != NULL) { ++motifCount; fprintf(f, "%s\t%s\n", module, motif); hashAdd2(hash, module, motif, NULL); } } dyStringPrintf(dy, "CREATE TABLE %s (\n" " module int not null,\n" " motif varchar(255) not null,\n" " #Indices\n" " INDEX(module),\n" " INDEX(motif(16))\n" ")\n", table); sqlRemakeTable(conn, table, dy->string); verbose(1, "%d modules, %d motifs in modules\n", moduleCount, motifCount); hgLoadTabFile(conn, tmpDir, table, &f); hgRemoveTabFile(tmpDir, table); verbose(1, "Loaded %s table\n", table); lineFileClose(&lf); return hash; }
void makeNewDataTable(char *database, char *oldTable, struct maMedSpec *medList, char *newTable) /* Create new table in database based on medians of data * in old table as defined by medList. */ { struct sqlConnection *conn = sqlConnect(database); FILE *f = hgCreateTabFile(tabDir, newTable); struct expData *expList, *medianExpList, *exp; expList = expDataLoadTableLimit(conn, oldTable, limit); medianExpList = maExpDataMedianFromSpec(expList, medList, minExps); for (exp = medianExpList; exp != NULL; exp = exp->next) expDataTabOut(exp, f); if (doLoad) { expDataCreateTable(conn, newTable); hgLoadTabFile(conn, tabDir, newTable, &f); hgRemoveTabFile(tabDir, newTable); } expDataFreeList(&expList); expDataFreeList(&medianExpList); sqlDisconnect(&conn); }
void hgGnfMicroarray(char *expTable, char *dataTable, char *atlasFile) /** Main function that does all the work for new-style*/ { struct lineFile *lf = lineFileOpen(atlasFile, TRUE); char *line; int i, wordCount, expCount; char **row; float *data; char *affyId; struct hash *hash = newHash(17); FILE *f = NULL; int dataCount = 0; /* Open Atlas file and use first line to create experiment table. */ if (!lineFileNextReal(lf, &line)) errAbort("%s is empty", lf->fileName); if (startsWith("Affy", line)) line += 4; if (startsWith("Gene Name", line)) line += 9; if (line[0] != '\t') errAbort("%s doesn't seem to be a new format atlas file", lf->fileName); expCount = lineToExpTable(line+1, expTable); if (expCount <= 0) errAbort("No experiments in %s it seems", lf->fileName); warn("%d experiments\n", expCount); f = hgCreateTabFile(tabDir, dataTable); AllocArray(row, expCount); AllocArray(data, expCount); while (lineFileNextReal(lf, &line)) { affyId = nextWord(&line); wordCount = chopByWhite(line, row, expCount); if (wordCount != expCount) errAbort("Expecting %d data points, got %d line %d of %s", expCount, wordCount, lf->lineIx, lf->fileName); if (chopName != NULL) { char *e = stringIn(chopName, affyId); if (e != NULL) *e = 0; } if (hashLookup(hash, affyId)) { warn("Duplicate %s, skipping all but first.", affyId); continue; } for (i=0; i<expCount; ++i) { data[i] = sqlFloat(row[i]); } shortDataOut(f, affyId, expCount, data); ++dataCount; if (limit != 0 && dataCount >= limit) break; } lineFileClose(&lf); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); expDataCreateTable(conn, dataTable); hgLoadTabFile(conn, tabDir, dataTable, &f); hgRemoveTabFile(tabDir, dataTable); sqlDisconnect(&conn); } }
void hgLoadRnaFold(char *database, char *table, char *foldDir) /* hgLoadRnaFold - Load a directory full of RNA fold files into database. */ { char path[PATH_LEN]; struct slName *dirList, *dirEl; struct lineFile *lf; char *line, *word, *s, c; FILE *f = hgCreateTabFile(tabDir, table); int count = 0; dirList = listDir(foldDir, "*"); for (dirEl = dirList; dirEl != NULL; dirEl = dirEl->next) { char *name = dirEl->name; if (sameString(name, "CVS")) continue; safef(path, sizeof(path), "%s/%s", foldDir, name); lf = lineFileOpen(path, TRUE); if (!lineFileNext(lf, &line, NULL)) { if (warnEmpty) { warn("%s is empty, skipping\n", name); lineFileClose(&lf); continue; } else errAbort("%s is empty\n", name); } if (!isupper(line[0])) notFold(path, 1); fprintf(f, "%s\t", name); /* Save name */ fprintf(f, "%s\t", line); /* Save sequence */ lineFileNeedNext(lf, &line, NULL); c = line[0]; if (c != '.' && c != '(') notFold(path, 2); word = nextWord(&line); fprintf(f, "%s\t", word); /* Save nested parenthesis */ /* Parse out (energy) term at end of line. */ s = strchr(line, '('); if (s == NULL) notFold(path, 3); word = skipLeadingSpaces(s+1); if (word == NULL || (!word[0] == '-' && !isdigit(word[0]))) notFold(path, 4); if ((s = strchr(word, ')')) == NULL) notFold(path, 5); *s = 0; fprintf(f, "%s\n", word); lineFileClose(&lf); ++count; } printf("Parsed %d files\n", count); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); rnaFoldCreateTable(conn, table); hgLoadTabFile(conn, tabDir, table, &f); hgRemoveTabFile(tabDir, table); sqlDisconnect(&conn); } }
struct hash *loadMotifWeights(struct sqlConnection *conn, char *fileName, char *table) /* Load in XML weight motif file and save it in tab-separated format * and in hash keyed by motif name. */ { struct esmMotifs *motifs = esmMotifsLoad(fileName); struct esmMotif *motif; FILE *f = hgCreateTabFile(tmpDir, table); struct dyString *dy = dyStringNew(512); struct hash *hash = newHash(16); for (motif = motifs->esmMotif; motif != NULL; motif = motif->next) { struct esmWeights *weights = motif->esmWeights; int posCount = slCount(weights->esmPosition); struct esmPosition *pos; struct dnaMotif *dm; char name[64]; fixMotifName(motif->Name, name, sizeof(name)); AllocVar(dm); dm->name = cloneString(name); dm->columnCount = posCount; AllocArray(dm->aProb, posCount); AllocArray(dm->cProb, posCount); AllocArray(dm->gProb, posCount); AllocArray(dm->tProb, posCount); for (pos = weights->esmPosition; pos != NULL; pos = pos->next) { char *row[5]; double odds[4], sumOdds = 0; int i; int ix = pos->Num; int rowSize = chopString(pos->Weights, ";", row, ArraySize(row)); if (rowSize != 4) errAbort("Expecting 4 values for weights in position %d of Motif %s", pos->Num, motif->Name); if (ix >= posCount) errAbort("Num %d out of range in Motif %s", ix, motif->Name); for (i=0; i<4; ++i) { odds[i] = exp(atof(row[0])); sumOdds += odds[i]; } dm->aProb[ix] = odds[0]/sumOdds; dm->cProb[ix] = odds[1]/sumOdds; dm->gProb[ix] = odds[2]/sumOdds; dm->tProb[ix] = odds[3]/sumOdds; } dnaMotifTabOut(dm, f); hashAdd(hash, dm->name, dm); } dyStringPrintf(dy, "CREATE TABLE %s (\n" " name varchar(16) not null, # Motif name.\n" " columnCount int not null, # Count of columns in motif.\n" " aProb longblob not null, # Probability of A's in each column.\n" " cProb longblob not null, # Probability of C's in each column.\n" " gProb longblob not null, # Probability of G's in each column.\n" " tProb longblob not null, # Probability of T's in each column.\n" " #Indices\n" " PRIMARY KEY(name)\n" ")\n", table); 
sqlRemakeTable(conn, table, dy->string); hgLoadTabFile(conn, tmpDir, table, &f); hgRemoveTabFile(tmpDir, table); verbose(1, "Processed %d motifs into %s\n", slCount(motifs->esmMotif), table); return hash; }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *gene; int rc, t; pthread_t *threads = NULL; pthread_attr_t attr; int *threadID = NULL; void *status; char *tempDir = "."; int arrayNum; struct microDataDistance *geneDistPtr = NULL; struct microDataDistance *geneDistArray = NULL; int geneIx; FILE *f = NULL; /* Get list/hash of all items with expression values. */ safef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); f = hgCreateTabFile(tempDir, outTable); synQ = synQueueNew(); /* instantiate threads */ AllocArray( threadID, numThreads ); AllocArray( threads, numThreads ); pthread_attr_init( &attr ); pthread_mutex_init( &mutexDotOut, NULL ); pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ); for (t = 0; t < numThreads; t++) { threadID[t] = t; rc = pthread_create( &threads[t], &attr, computeDistance, (void *) &threadID[t]); if (rc) errAbort("ERROR: in pthread_create() %d\n", rc ); } /* this thread will write to the file from the queue */ for (arrayNum = 0; arrayNum < geneCount; arrayNum++) { geneDistArray = (struct microDataDistance *)synQueueGet( synQ ); geneDistPtr = geneDistArray; /* Print out closest GENEDISTS distances in tab file. */ for (geneIx=0; geneIx < GENEDISTS && geneIx < geneCount; ++geneIx, geneDistPtr++) if (geneDistPtr != NULL) fprintf(f, "%s\t%s\t%f\n", geneDistPtr->name1, geneDistPtr->name2, geneDistPtr->distance); else errAbort("ERROR: writing distance %d to file\n", geneIx); freeMem( geneDistArray ); } /* synchronize all threads */ for (t = 0; t < numThreads; t++) { rc = pthread_join( threads[t], &status); if (rc) errAbort("ERROR: in pthread_join() %d\n", rc ); } printf("Made %s.tab\n", outTable); slFreeList( &geneList ); pthread_mutex_destroy( &mutexDotOut ); pthread_attr_destroy( &attr ); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ safef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { safef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *gene; int rc, t; pthread_t *threads = NULL; pthread_attr_t attr; int *threadID = NULL; void *status; char *tempDir = "."; long time1, time2; time1 = clock1000(); /* Get list/hash of all items with expression values. */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); time2 = clock1000(); verbose(2, "records read time: %.2f seconds\n", (time2 - time1) / 1000.0); f = hgCreateTabFile(tempDir, outTable); /* instantiate threads */ AllocArray( threadID, numThreads ); AllocArray( threads, numThreads ); pthread_attr_init( &attr ); pthread_mutex_init( &mutexfilehandle, NULL ); pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ); for (t = 0; t < numThreads; t++) { threadID[t] = t; rc = pthread_create( &threads[t], &attr, computeDistance, (void *) &threadID[t]); if (rc) errAbort("ERROR: in pthread_create() %d\n", rc ); } /* synchronize all threads */ for (t = 0; t < numThreads; t++) { rc = pthread_join( threads[t], &status); if (rc) errAbort("ERROR: in pthread_join() %d\n", rc ); } printf("Made %s.tab\n", outTable); slFreeList( &geneList ); pthread_mutex_destroy( &mutexfilehandle ); pthread_attr_destroy( &attr ); time1 = time2; time2 = clock1000(); verbose(2, "distance computation time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); time1 = time2; time2 = clock1000(); verbose(2, "table create/load/index time: %.2f seconds\n", (time2 - time1) / 1000.0); }
void hgKnownToSuper(char *database, char *org, char *assFile) /* hgKnownToSuper - Load knownToSuperfamily table. */ { struct sqlConnection *conn = sqlConnect(database); struct hash *pepToKnown = ensPepToKnown(conn, TRUE); char *table = "knownToSuper"; FILE *f = hgCreateTabFile(tempDir, table); struct lineFile *lf = lineFileOpen(assFile, TRUE); boolean gotOrg = FALSE; int outCount = 0; char *row[6]; while (lineFileRow(lf, row)) { if (sameString(row[0], org)) { char *pepName = row[1]; char *regions = row[3]; char *eVal = row[4]; char *supId = row[5]; char *knownId = hashFindVal(pepToKnown, pepName); if (knownId != NULL) { char *region, *e; int start,end; /* Loop through comma-separated region string. */ for (region = regions; region != NULL; region = e) { e = strchr(region, ','); if (e != NULL) { *e++ = 0; if (e[0] == 0) e = NULL; } if (sscanf(region, "%d-%d", &start, &end) < 2) errAbort("bad region %s line %d of %s", region, lf->lineIx, lf->fileName); fprintf(f, "%s\t%s\t%d\t%d\t%s\n", knownId, supId, start-1, end, eVal); ++outCount; } } gotOrg = TRUE; } } lineFileClose(&lf); if (!gotOrg) errAbort("Looks like '%s' is not a recognized organism", org); if (outCount <= 0) errAbort("No good records found in %s", assFile); printf("%d records output\n", outCount); /* Refresh connection in case things took a while. */ sqlDisconnect(&conn); conn = sqlConnect(database); /* Load up database. */ createTable(conn, table); hgLoadTabFile(conn, tempDir, table, &f); hgRemoveTabFile(tempDir, table); }
void hgLoadChromGraph(boolean doLoad, char *db, char *track, char *fileName)
/* hgLoadChromGraph - Load up chromosome graph.
 * Reads the graph from fileName (through idTable when that option is
 * given), optionally replaces values by their negated base-10 log,
 * sorts the list and writes it to a tab file in the current directory.
 * When doLoad is set it also remakes the track table, loads the tab
 * file, refreshes the track's row in metaChromGraph and writes the
 * binary <track>.cgb file. */
{
double minVal = 0, maxVal = 0;
struct chromGraph *el, *list;
FILE *f;
char *tempDir = ".";
char path[PATH_LEN], gbdbPath[PATH_LEN];
char *idTable = optionVal("idTable", NULL);
char *pathPrefix = NULL;
boolean minusLog10 = optionExists("minusLog10");

if (idTable == NULL)
    list = chromGraphLoadAll(fileName);
else
    list = chromGraphListWithTable(fileName, db, idTable);
if (list == NULL)
    errAbort("%s is empty", fileName);

/* Transform values if asked to and figure out min/max values.  The scan
 * starts at the head of the list: starting at list->next left the first
 * element untransformed and seeded the range with its raw value. */
for (el = list; el != NULL; el = el->next)
    {
    if (minusLog10)
        {
        if (el->val == 1)
            el->val = 0;
        else if (el->val > 0)
            el->val = -1 * log(el->val)/log(10);
        }
    if (el == list)
        minVal = maxVal = el->val;
    else
        {
        if (el->val < minVal)
            minVal = el->val;
        if (el->val > maxVal)
            maxVal = el->val;
        }
    }

/* Sort and write out temp file. */
slSort(&list, chromGraphCmp);
f = hgCreateTabFile(tempDir, track);
for (el = list; el != NULL; el = el->next)
    chromGraphTabOut(el, f);

if (doLoad)
    {
    struct dyString *dy = dyStringNew(0);
    struct sqlConnection *conn;

    /* Set up connection to database and create main table. */
    conn = hAllocConn(db);
    sqlDyStringPrintf(dy, createString, track, hGetMinIndexLength(db));
    sqlRemakeTable(conn, track, dy->string);

    /* Load main table and clean up file handle. */
    hgLoadTabFile(conn, tempDir, track, &f);
    hgRemoveTabFile(tempDir, track);

    /* If need be create meta table.  If need be delete old row. */
    if (!sqlTableExists(conn, "metaChromGraph"))
        sqlUpdate(conn, metaCreateString);
    else
        {
        dyStringClear(dy);
        sqlDyStringPrintf(dy, "delete from metaChromGraph where name = '%s'", track);
        sqlUpdate(conn, dy->string);
        }

    /* Make chrom graph file */
    safef(path, sizeof(path), "%s.cgb", track);
    chromGraphToBin(list, path);
    /* path is reused: it now holds the default for the pathPrefix option. */
    safef(path, sizeof(path), "/gbdb/%s/chromGraph", db);
    pathPrefix = optionVal("pathPrefix", path);
    safef(gbdbPath, sizeof(gbdbPath), "%s/%s.cgb", pathPrefix, track);

    /* Create new line in meta table */
    dyStringClear(dy);
    sqlDyStringPrintf(dy, "insert into metaChromGraph values('%s',%f,%f,'%s');",
        track, minVal, maxVal, gbdbPath);
    sqlUpdate(conn, dy->string);
    }
}
void hgLoadNetDist(char *inTab, char *db, char *outTable) { char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); struct sqlConnection *hConn = sqlConnect(db); FILE *missingFile=NULL; int missingCount=0; struct lineFile *lf=NULL; char *row[3]; int rowCount=3; if (sqlRemap) { fetchRemapInfo(db); missingHash = newHash(16); missingFile = mustOpen("missing.tab","w"); } /* read edges from file */ lf=lineFileOpen(inTab, TRUE); /* print final values, remapping if needed */ while (lineFileNextRowTab(lf, row, rowCount)) { char *geneI = row[0]; char *geneJ = row[1]; char *dij = row[2]; char *gi=NULL, *gj=NULL; if (sqlRemap) { /* it is possible for each id to have multiple remap values in hash */ struct hashEl *hi=NULL, *hj=NULL, *hjSave=NULL; hi = hashLookup(aliasHash,geneI); hj = hashLookup(aliasHash,geneJ); missingCount += handleMissing(hi, geneI, missingHash, missingFile); missingCount += handleMissing(hj, geneJ, missingHash, missingFile); hjSave = hj; /* do all combinations of i and j */ for(;hi;hi=hashLookupNext(hi)) { gi = (char *)hi->val; for(;hj;hj=hashLookupNext(hj)) { gj = (char *)hj->val; fprintf(f,"%s\t%s\t%s\n",gi,gj,dij); } hj = hjSave; /* reset it */ } } else { gi=geneI; gj=geneJ; fprintf(f,"%s\t%s\t%s\n",gi,gj,dij); } } lineFileClose(&lf); carefulClose(&f); if (sqlRemap) { carefulClose(&missingFile); if (missingCount == 0) unlink("missing.tab"); else printf("hgLoadNetDist %d id-remapping misses, see missing.tab\n", missingCount); } createTable(hConn, outTable); hgLoadTabFile(hConn, tempDir, outTable, &f); hgRemoveTabFile(tempDir, outTable); }
void knownToVisiGene(char *database) /* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */ { char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); struct sqlConnection *hConn = sqlConnect(database); struct sqlConnection *iConn = sqlConnect(visiDb); struct sqlResult *sr; char **row; struct hash *geneImageHash = newHash(18); struct hash *locusLinkImageHash = newHash(18); struct hash *refSeqImageHash = newHash(18); struct hash *genbankImageHash = newHash(18); struct hash *probeImageHash = newHash(18); struct hash *knownToLocusLinkHash = newHash(18); struct hash *knownToRefSeqHash = newHash(18); struct hash *knownToGeneHash = newHash(18); struct hash *favorHugoHash = newHash(18); struct hash *knownToProbeHash = newHash(18); struct hash *knownToAllProbeHash = newHash(18); struct genePred *knownList = NULL, *known; struct hash *dupeHash = newHash(17); probesDb = optionVal("probesDb", database); struct sqlConnection *probesConn = sqlConnect(probesDb); vgProbes = sqlTableExists(probesConn,"vgProbes"); vgAllProbes = sqlTableExists(probesConn,"vgAllProbes"); /* Go through and make up hashes of images keyed by various fields. 
*/ sr = sqlGetResult(iConn, NOSQLINJ "select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank" ",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id" " from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap" " where image.imageFile = imageFile.id" " and image.id = imageProbe.image" " and imageProbe.probe = probe.id" " and probe.gene = gene.id" " and image.submissionSet=submissionSet.id" " and vgPrbMap.probe = probe.id"); while ((row = sqlNextRow(sr)) != NULL) { int id = sqlUnsigned(row[0]); float priority = atof(row[1]); int privateUser = sqlSigned(row[7]); char vgPrb_Id[256]; safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]); int geneId = sqlUnsigned(row[9]); if (privateUser == 0) { addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id); addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]); addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]); addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]); addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]); } } verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d" ", genbankImageHash %d probeImageHash %d\n", geneImageHash->elCount, locusLinkImageHash->elCount, refSeqImageHash->elCount, genbankImageHash->elCount, probeImageHash->elCount); sqlFreeResult(&sr); /* Build up list of known genes. */ sr = sqlGetResult(hConn, NOSQLINJ "select * from knownGene"); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *known = genePredLoad(row); if (!hashLookup(dupeHash, known->name)) { hashAdd(dupeHash, known->name, NULL); slAddHead(&knownList, known); } } slReverse(&knownList); sqlFreeResult(&sr); verbose(2, "Got %d known genes\n", slCount(knownList)); /* Build up hashes from knownGene to other things. 
*/ if (vgProbes) bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash); if (vgAllProbes) bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash); foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE); foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE); foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE); foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE); foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE); verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n", knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount, knownToProbeHash->elCount, knownToAllProbeHash->elCount); /* Try and find an image for each gene. */ for (known = knownList; known != NULL; known = known->next) { char *name = known->name; struct prioritizedImage *best = NULL; { best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash); if (!best) best = bestImage(name, knownToRefSeqHash, refSeqImageHash); if (!best) { best = hashFindVal(genbankImageHash, name); } if (!best) best = bestImage(name, knownToGeneHash, geneImageHash); if (vgProbes && !best) best = bestImage(name, knownToProbeHash, probeImageHash); if (vgAllProbes && !best) best = bestImage(name, knownToAllProbeHash, probeImageHash); } if (best) { fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId); } } createTable(hConn, outTable); hgLoadTabFile(hConn, tempDir, outTable, &f); hgRemoveTabFile(tempDir, outTable); }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *geneList = NULL, *curGene, *gene; int geneIx, geneCount = 0; struct microData **geneArray = NULL; float *weights = NULL; char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); /* Get list/hash of all items with expression values. */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. */ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); /* Get an array for sorting. 
*/ AllocArray(geneArray, geneCount); for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx) geneArray[geneIx] = gene; /* Print out closest 1000 in tab file. */ for (curGene = geneList; curGene != NULL; curGene = curGene->next) { calcDistances(curGene, geneList, weights); qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance); for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx) { gene = geneArray[geneIx]; fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance); } dotOut(); } printf("Made %s.tab\n", outTable); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query)", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target)", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); }
void hgFlyBase(char *database, char *genesFile) /* hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of * tables. */ { char *tGene = "fbGene"; char *tSynonym = "fbSynonym"; char *tAllele = "fbAllele"; char *tRef = "fbRef"; char *tRole = "fbRole"; char *tPhenotype = "fbPhenotype"; char *tTranscript = "fbTranscript"; char *tGo = "fbGo"; char *tUniProt = "fbUniProt"; FILE *fGene = hgCreateTabFile(tabDir, tGene); FILE *fSynonym = hgCreateTabFile(tabDir, tSynonym); FILE *fAllele = hgCreateTabFile(tabDir, tAllele); FILE *fRef = hgCreateTabFile(tabDir, tRef); FILE *fRole = hgCreateTabFile(tabDir, tRole); FILE *fPhenotype = hgCreateTabFile(tabDir, tPhenotype); FILE *fTranscript = NULL; FILE *fGo = hgCreateTabFile(tabDir, tGo); FILE *fUniProt = hgCreateTabFile(tabDir, tUniProt); struct lineFile *lf = lineFileOpen(genesFile, TRUE); struct hash *refHash = newHash(19); int nextRefId = 0; int nextAlleleId = 0; char *line, sub, type, *rest, *s; char *geneSym = NULL, *geneName = NULL, *geneId = NULL; int recordCount = 0; struct slName *synList = NULL, *syn; int curAllele = 0, curRef = 0; struct ref *ref = NULL; struct sqlConnection *conn; struct hash *goUniqHash = newHash(18); /* Make table from flybase genes to BGDP transcripts. */ if (doTranscript) { fTranscript = hgCreateTabFile(tabDir, tTranscript); getAllSplices(database, fTranscript); } /* Make dummy reference for flybase itself. */ fprintf(fRef, "0\tFlyBase\n"); /* Loop through parsing and writing tab files. */ while (lineFileNext(lf, &line, NULL)) { sub = line[0]; if (sub == '#') { /* End of record. */ ++recordCount; if (geneId == NULL) errAbort("Record without *z line ending line %d of %s", lf->lineIx, lf->fileName); /* Write out synonyms. */ s = naForNull(geneSym); geneSym = ungreek(s); freeMem(s); s = naForNull(geneName); geneName = ungreek(s); if (! 
sameString(s, "n/a")) freeMem(s); if (geneSym != NULL && !sameString(geneSym, "n/a")) slNameStore(&synList, geneSym); if (geneName != NULL && !sameString(geneName, "n/a")) slNameStore(&synList, geneName); for (syn = synList; syn != NULL; syn = syn->next) { s = ungreek(syn->name); fprintf(fSynonym, "%s\t%s\n", geneId, s); freeMem(s); } /* Write out gene record. */ fprintf(fGene, "%s\t%s\t%s\n", geneId, geneSym, geneName); /* Clean up. */ freez(&geneSym); freez(&geneName); freez(&geneId); slFreeList(&synList); ref = NULL; curRef = curAllele = 0; continue; } else if (sub == 0) errAbort("blank line %d of %s, not allowed in gene.txt", lf->lineIx, lf->fileName); else if (isalnum(sub)) errAbort("line %d of %s begins with %c, not allowed", lf->lineIx, lf->fileName, sub); type = line[1]; rest = trimSpaces(line+2); if (sub == '*' && type == 'a') geneSym = cloneString(rest); else if (sub == '*' && type == 'e') geneName = cloneString(rest); else if (sub == '*' && type == 'z') { geneId = cloneString(rest); if (!startsWith("FBgn", geneId)) errAbort("Bad FlyBase gene ID %s line %d of %s", geneId, lf->lineIx, lf->fileName); } else if (type == 'i' && (sub == '*' || sub == '$')) { if (strlen(rest) > 2) /* Avoid short useless ones. 
*/ slNameStore(&synList, rest); } else if (sub == '*' && type == 'A') { if (geneId == NULL) errAbort("Allele before geneId line %d of %s", lf->lineIx, lf->fileName); curAllele = ++nextAlleleId; fprintf(fAllele, "%d\t%s\t%s\n", curAllele, geneId, rest); if (!sameString(rest, "classical") && !sameString(rest, "in vitro") && !sameString(rest, "wild-type") ) { slNameStore(&synList, rest); } } else if (sub == '*' && type == 'm') { if (geneId == NULL) errAbort("*m protein ID before geneId line %d of %s", lf->lineIx, lf->fileName); if (startsWith("UniProt", rest)) { char *ptr = strchr(rest, ':'); if (ptr != NULL) ptr++; else errAbort("Trouble parsing UniProt ID %s like %d of %s", rest, lf->lineIx, lf->fileName); fprintf(fUniProt, "%s\t%s\n", geneId, ptr); } } else if (type == 'E') { ref = hashFindVal(refHash, rest); if (ref == NULL) { AllocVar(ref); ref->id = ++nextRefId; hashAdd(refHash, rest, ref); subChar(rest, '\t', ' '); fprintf(fRef, "%d\t%s\n", ref->id, rest); } curRef = ref->id; } else if ((type == 'k' || type == 'r' || type == 'p') && sub != '@') { FILE *f = (type == 'r' ? fRole : fPhenotype); struct dyString *dy = suckSameLines(lf, line); subChar(dy->string, '\t', ' '); if (geneId == NULL) errAbort("Expecting *z in record before line %d of %s", lf->lineIx, lf->fileName); fprintf(f, "%s\t%d\t%d\t%s\n", geneId, curAllele, curRef, dy->string); dyStringFree(&dy); } else if (type == 'd' || type == 'f' || type == 'F') { FILE *f = fGo; char aspect = (type == 'd') ? 'P' : (type == 'f') ? 
'C' : 'F'; char *goId = rest; char *p = strstr(goId, " ; "); char assoc[128]; if (p == NULL) continue; else goId = firstWordInLine(p + 3); safef(assoc, sizeof(assoc), "%s.%s", geneId, goId); if (hashLookup(goUniqHash, assoc) == NULL) { hashAddInt(goUniqHash, assoc, 1); fprintf(f, "%s\t%s\t%c\n", geneId, goId, aspect); } } } printf("Processed %d records in %d lines\n", recordCount, lf->lineIx); lineFileClose(&lf); conn = sqlConnect(database); remakeTables(conn); if (doLoad) { printf("Loading %s\n", tGene); hgLoadTabFile(conn, tabDir, tGene, &fGene); if (doTranscript) { printf("Loading %s\n", tTranscript); hgLoadTabFile(conn, tabDir, tTranscript, &fTranscript); } printf("Loading %s\n", tSynonym); hgLoadTabFile(conn, tabDir, tSynonym, &fSynonym); printf("Loading %s\n", tAllele); hgLoadTabFile(conn, tabDir, tAllele, &fAllele); printf("Loading %s\n", tRef); hgLoadTabFile(conn, tabDir, tRef, &fRef); printf("Loading %s\n", tRole); hgLoadTabFile(conn, tabDir, tRole, &fRole); printf("Loading %s\n", tPhenotype); hgLoadTabFile(conn, tabDir, tPhenotype, &fPhenotype); printf("Loading %s\n", tGo); hgLoadTabFile(conn, tabDir, tGo, &fGo); printf("Loading %s\n", tUniProt); hgLoadTabFile(conn, tabDir, tUniProt, &fUniProt); hgRemoveTabFile(tabDir, tGene); if (doTranscript) hgRemoveTabFile(tabDir, tTranscript); hgRemoveTabFile(tabDir, tSynonym); hgRemoveTabFile(tabDir, tAllele); hgRemoveTabFile(tabDir, tRef); hgRemoveTabFile(tabDir, tRole); hgRemoveTabFile(tabDir, tPhenotype); hgRemoveTabFile(tabDir, tGo); hgRemoveTabFile(tabDir, tUniProt); } }