void loadDatabase()
/* Load ContigLocusIdCondense.tab into the ContigLocusIdCondense table,
 * passing "." as the tab file directory. */
{
struct sqlConnection *dbConn = hAllocConn();
/* The handle is opened only so that it can be handed to hgLoadTabFile;
 * presumably mustOpen aborts when the file is missing - TODO confirm. */
FILE *tabFile = mustOpen("ContigLocusIdCondense.tab", "r");

hgLoadTabFile(dbConn, ".", "ContigLocusIdCondense", &tabFile);
hFreeConn(&dbConn);
}
Пример #2
0
void fakeCloneOldTable(struct sqlConnection *oldConn, struct sqlConnection *newConn, char *table)
/* Clone cart table in newConn from oldConn. Add fake prefix to
 * contents field to help mark it as fake.
 *   oldConn - connection the rows are selected from.
 *   newConn - connection the rows are loaded into.
 *   table   - table name, used for the query, the tab file and the load. */
{
char query[256];
sqlSafef(query, sizeof(query), "select * from %s", table);
struct sqlResult *sr = sqlGetResult(oldConn, query);
char **row;
FILE *f = hgCreateTabFile(NULL, table);
while ((row = sqlNextRow(sr)) != NULL)
    {
    int i;
    for (i=0; i<cartNumFields; ++i)
        {
        if (i != 0)
            fprintf(f, "\t");
        /* Field 1 gets fakePrefix written directly in front of its value. */
        if (i == 1)
            fprintf(f, "%s", fakePrefix);
        fprintf(f, "%s", row[i]);
        }
    fprintf(f, "\n");
    }
/* Every row has been written out, so the result set can be released. */
sqlFreeResult(&sr);
hgLoadTabFile(newConn, NULL, table, &f);
hgUnlinkTabFile(NULL, table);
}
Пример #3
0
void makeTableDescriptions(char *database, char *asFile)
/* makeTableDescriptions - Add table descriptions to database.
 *   database - database that receives a freshly remade tableDescriptions
 *              table; aborts if this database has a chromInfo table.
 *   asFile   - file of "table ..." definitions, each running up to a line
 *              whose first non-blank character is ')'. */
{
struct sqlConnection *conn = sqlConnect(database);
struct lineFile *lf = lineFileOpen(asFile, TRUE);
/* Tab file for tableDescriptions, with "." given as its directory. */
FILE *f = hgCreateTabFile(".", "tableDescriptions");
char *line;

/* struct asObject *asList = */ asParseFile(asFile);	/* Just to check syntax */

if (sqlTableExists(conn, "chromInfo"))
    errAbort("%s looks like a genome database, has chromInfo, aborting", 
    	database);

sqlRemakeTable(conn, "tableDescriptions",
   "NOSQLINJ CREATE TABLE tableDescriptions (\n"
   "  tableName varchar(255) not null,\n"
   "  autoSqlDef longblob not null,\n"
   "  gbdAnchor varchar(255) not null,\n"
   "  PRIMARY KEY(tableName(32))\n"
   ")" );

while (lineFileNextReal(lf, &line))
    {
    if (startsWith("table", line))
        {
        struct dyString *as = dyStringNew(0);
        /* Step over the keyword and the white space after it.  Starting at
         * line + 5 keeps the pointer inside the string even when the line is
         * just "table" with nothing following. */
        char *name = trimSpaces(skipLeadingSpaces(line + 5));
        char *escaped = NULL;

        fprintf(f, "%s\t", name);

        /* Collect the definition, first line included, into as. */
        for (;;)
            {
            char *s;
            dyStringAppend(as, line);
            dyStringAppendC(as, '\n');
            s = skipLeadingSpaces(line);
            if (s[0] == ')')
                break;
            if (!lineFileNext(lf, &line, NULL))
                errAbort("Unexpected end of file, missing closing paren in %s",
                        lf->fileName);
            }
        /* Buffer is sized at twice the input plus a terminator. */
        escaped = needMem(2*as->stringSize+1);
        fprintf(f, "%s\t", sqlEscapeTabFileString2(escaped, as->string));
        /* Nothing is written after the second tab, so the third column
         * (gbdAnchor) is left empty. */
        fprintf(f, "\n");

        freez(&escaped);
        dyStringFree(&as);
        }
    else
        errAbort("Expecting table line %d of %s", lf->lineIx, lf->fileName);
    }
hgLoadTabFile(conn, ".", "tableDescriptions", &f);

/* Release the input file and the connection now that the load is done. */
lineFileClose(&lf);
sqlDisconnect(&conn);
}
void hgCeOrfToGene(char *database, char *geneNames, 
	char *geneTable, char *table)
/* hgCeOrfToGene - Make orfToGene table for C.elegans from 
 * GENE_DUMPS/gene_names.txt.
 *   database  - database holding geneTable and receiving table.
 *   geneNames - tab-separated input; column 0 is the gene name, column 2 the
 *               type and column 3 a comma-separated ORF list.
 *   geneTable - table whose name column supplies the ORFs to write out.
 *   table     - table to create and load. */
{
struct lineFile *lf = lineFileOpen(geneNames, TRUE);
struct sqlConnection *conn;
struct sqlResult *sr;
char query[256];
char **row;
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, table);
char *words[4];
struct hash *orfHash = newHash(17);

/* Make hash to look up gene names. */
while (lineFileNextRowTab(lf, words, ArraySize(words)))
    {
    char *gene = words[0];
    char *orfs = words[3];
    char *type = words[2];
    char *orf[128];
    int i, orfCount;

    /* Only rows whose type column is exactly "Gene" contribute. */
    if (sameString(type, "Gene"))
        {
        orfCount = chopString(orfs, ",", orf, ArraySize(orf));
        /* A completely filled array is treated as too many ORFs. */
        if (orfCount >= ArraySize(orf))
            errAbort("Too many ORFs line %d of %s", lf->lineIx, lf->fileName);
        for (i=0; i<orfCount; ++i)
            hashAdd(orfHash, orf[i], cloneString(gene));
        }
    }
lineFileClose(&lf);

/* For each orf in gene table write out gene name if possible,
 * otherwise orf name. */
conn = sqlConnect(database);
safef(query, sizeof(query), "select name from %s", geneTable);
sr = sqlGetResult(conn,query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *orf = row[0];
    char *gene = hashFindVal(orfHash, orf);
    if (gene == NULL)
        gene = orf;
    fprintf(f, "%s\t%s\n", orf, gene);
    }
sqlFreeResult(&sr);

createTable(conn, table, unique);
hgLoadTabFile(conn, tempDir, table, &f);

/* The cloned gene names are no longer referenced once the tab file has been
 * written; release them together with the connection. */
freeHashAndVals(&orfHash);
sqlDisconnect(&conn);
}
Пример #5
0
void saveDataTable(struct expData *data)
/* Create the expression table the cheesey way by loading a temp tab file. */
{
FILE *f = hgCreateTabFile(".", table);
struct expData *cur;
struct sqlConnection *conn = sqlConnect(database);
expDataCreateTable(conn, table);
for (cur = data; cur != NULL; cur = cur->next)
    expDataTabOut(cur, f);
hgLoadTabFile(conn, ".", table, &f);
hgRemoveTabFile(".", table);
sqlDisconnect(&conn);
}
Пример #6
0
static void loadTable(struct bed4 *beds, char *db, char *parTable)
/* Remake parTable in db from the createSql format string, then fill it with
 * the given beds by way of a tab file that is unlinked afterwards. */
{
struct sqlConnection *conn = sqlConnect(db);
char createStatement[256];
FILE *bedTab = NULL;

/* createSql takes the table name as its single argument. */
sqlSafef(createStatement, sizeof(createStatement), createSql, parTable);
sqlRemakeTable(conn, parTable, createStatement);

bedTab = hgCreateTabFile(NULL, parTable);
writeBeds(beds, bedTab);
hgLoadTabFile(conn, NULL, parTable, &bedTab);
hgUnlinkTabFile(NULL, parTable);

sqlDisconnect(&conn);
}
Пример #7
0
void ldGencodeIntron(char *database, char *table,  
                        int gtfCount, char *gtfNames[])
/* Load Gencode intron status table from GTF files with
 * intron_id and intron_status keywords.
 *   database - database to load into.
 *   table    - table to create and load.
 *   gtfCount - number of entries in gtfNames.
 *   gtfNames - names of the GTF files to read. */
{
struct gffFile *gff;
struct gffLine *gffLine;
struct gencodeIntron *intron, *intronList = NULL;
struct sqlConnection *conn;
FILE *f;
int i;
int introns = 0;

for (i=0; i<gtfCount; i++)
    {
    verbose(1, "Reading %s\n", gtfNames[i]);
    gff = gffRead(gtfNames[i]);
    for (gffLine = gff->lineList; gffLine != NULL; gffLine = gffLine->next)
        {
        if (sameWord(gffLine->feature, "intron"))
            {
            /* The string fields are copied as pointers, not duplicated, so
             * each intron keeps referring to data owned by the gff line. */
            AllocVar(intron);
            intron->chrom = gffLine->seq;
            intron->chromStart = gffLine->start;
            intron->chromEnd = gffLine->end;
            intron->name = gffLine->intronId;
            intron->strand[0] = gffLine->strand;
            intron->strand[1] = 0;
            intron->status = gffLine->intronStatus;
            intron->transcript = gffLine->group;
            intron->geneId = gffLine->geneId;
            slAddHead(&intronList, intron);
            verbose(2, "%s %s\n", intron->chrom, intron->name);
            introns++;
            }
        }
    }
slSort(&intronList, bedCmp);
f = hgCreateTabFile(".", table);
for (intron = intronList; intron != NULL; intron = intron->next)
    gencodeIntronTabOut(intron, f);
/* The tab file is closed here, before the table is created and loaded. */
carefulClose(&f);

verbose(1, "%d introns in %d files\n", introns, gtfCount);
hSetDb(database);
conn = sqlConnect(database);
gencodeIntronTableCreate(conn, table, hGetMinIndexLength());
hgLoadTabFile(conn, ".", table, &f);
sqlDisconnect(&conn);
}
Пример #8
0
void hgLoadGenePred(char *db, char *table, int numGenePreds, char **genePredFiles)
/* hgLoadGenePred - read the given genePred files, write them to a tab file
 * in "." and load that file into table in db.  The tab file is removed
 * after the connection has been closed. */
{
char *workDir = ".";
struct genePred *geneList = loadGenes(numGenePreds, genePredFiles);
struct sqlConnection *conn = sqlConnect(db);
FILE *genesTab = hgCreateTabFile(workDir, table);

/* The gene list is freed as soon as it has been written out. */
mkTabFile(db, geneList, genesTab);
genePredFreeList(&geneList);

setupTable(db, conn, table);
hgLoadTabFile(conn, workDir, table, &genesTab);
sqlDisconnect(&conn);

hgRemoveTabFile(workDir, table);
}
Пример #9
0
void loadDatabase(char *chromName)
/* Load <chromName>_snpTmp.tab into the <chromName>_snpTmp table, passing "."
 * as the tab file directory. */
{
char tableName[64], fileName[64];
FILE *tabFile = NULL;
struct sqlConnection *conn = hAllocConn();

safef(tableName, sizeof(tableName), "%s_snpTmp", chromName);
safef(fileName, sizeof(fileName), "%s_snpTmp.tab", chromName);

/* The handle is opened only so that it can be handed to hgLoadTabFile. */
tabFile = mustOpen(fileName, "r");
hgLoadTabFile(conn, ".", tableName, &tabFile);

hFreeConn(&conn);
}
Пример #10
0
int lineToExpTable(char *line, char *table)
/* Create expression format table from line.
 * The tab file is always written (in tabDir); the database table is created
 * and loaded only when doLoad is set.  Returns the count reported by
 * lineToExp. */
{
FILE *f = hgCreateTabFile(tabDir, table);
int count = lineToExp(line, f);
if (doLoad)
    {
    struct sqlConnection *conn = sqlConnect(database);
    expRecordCreateTable(conn, table);
    hgLoadTabFile(conn, tabDir, table, &f);
    hgRemoveTabFile(tabDir, table);
    sqlDisconnect(&conn);
    }
else
    {
    /* Nothing else will close the tab file on this path. */
    carefulClose(&f);
    }
return count;
}
Пример #11
0
void makeNewExpTable(char *oldTable, struct maMedSpec *medList, char *newTable)
/* Create new expTable in hgFixed that is very similar
 * to oldExpTable, but with rows defined by medList.
 *   oldTable - existing table the template records are read from.
 *   medList  - one new row is written per element, with ids numbered from 0.
 *   newTable - table to create; it is loaded only when doLoad is set. */
{
struct maMedSpec *med;
struct expRecord *oldExp, newExp;
struct sqlConnection *conn = sqlConnect("hgFixed");
FILE *f = hgCreateTabFile(tabDir, newTable);
char query[256], **row;
struct sqlResult *sr;
int curId = 0;

for (med = medList; med != NULL; med = med->next)
    {
    /* Load expression record from old table of first
     * thing in median. */
    sqlSafef(query, sizeof(query),
    	"select * from %s where id = %d", oldTable, med->ids[0]);
    sr = sqlGetResult(conn, query);
    if ((row = sqlNextRow(sr)) == NULL)
        errAbort("Can't find id %d in %s\n", med->ids[0], oldTable);
    oldExp = expRecordLoad(row);
    sqlFreeResult(&sr);
    /* extras[2] is overwritten below, so at least three must exist. */
    if (oldExp->numExtras < 3)
        errAbort("Can only deal with old tables with 3 extras or more");

    /* Create new expression record, mostly just a shallow copy of old. */
    newExp = *oldExp;
    newExp.id = curId;
    ++curId;
    newExp.name = newExp.description = med->name;
    newExp.extras[2] = med->group;

    /* Save new one, free old one. */
    expRecordTabOut(&newExp, f);
    expRecordFree(&oldExp);
    }

if (doLoad)
    {
    expRecordCreateTable(conn, newTable);
    hgLoadTabFile(conn, tabDir, newTable, &f);
    hgRemoveTabFile(tabDir, newTable);
    }
else
    {
    /* Nothing else will close the tab file on this path. */
    carefulClose(&f);
    }
sqlDisconnect(&conn);
}
Пример #12
0
void hgRatioMicroarray(char *absTable, char *relTable)
/* hgRatioMicroarray - Create a ratio form of microarray.
 *   absTable - table the expression rows are read from.
 *   relTable - table for the converted rows; the tab file is always written,
 *              the table is created and loaded only when doLoad is set.
 * When limit is non-zero, reading stops after that many rows. */
{
struct maMedSpec *clumpList = NULL;
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;
char **row;
char query[512];
struct expData *ex;
struct expData *expList = NULL;
FILE *f = hgCreateTabFile(tabDir, relTable);
int rowCount = 0;

if (clump != NULL)
    clumpList = maMedSpecReadAll(clump);

sqlSafef(query, sizeof(query),
	"select * from %s", absTable);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    ex = expDataLoad(row);
    slAddHead(&expList, ex);
    /* Count the row so that the limit test below can actually trigger. */
    ++rowCount;
    if (limit != 0 && rowCount >= limit)
        break;
    }
sqlFreeResult(&sr);
/* Rows were pushed on the head of the list; restore reading order. */
slReverse(&expList);
maExpDataClipMin(expList, minAbsVal, minAbsVal * 0.5);
maExpDataAddConstant(expList, c);
if (transpose)
    maExpDataDoLogRatioTranspose(expList, doAverage);
else
    maExpDataDoLogRatioGivenMedSpec(expList, clumpList, (doAverage) ? useMean : useMedian);
for (ex = expList; ex != NULL; ex = ex->next)
    expDataTabOut(ex, f);
if (doLoad)
    {
    expDataCreateTable(conn, relTable);
    hgLoadTabFile(conn, tabDir, relTable, &f);
    hgRemoveTabFile(tabDir, relTable);
    }
else
    {
    /* Nothing else will close the tab file on this path. */
    carefulClose(&f);
    }
expDataFreeList(&expList);
sqlDisconnect(&conn);
}
struct hash *loadModuleToMotif(struct sqlConnection *conn, char *fileName, 
	char *table)
/* Load up file which has a line per module.  The first word is the module
 * number, the rest of the tab-separated fields are motif names. 
 * Return hash keyed by module&motif.
 * As a side effect the module/motif pairs are loaded into a remade table;
 * the tab file used for that is removed again. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line, *module, *motif;
FILE *f = hgCreateTabFile(tmpDir, table);
struct dyString *dy = dyStringNew(512);
int motifCount = 0, moduleCount = 0;
struct hash *hash = newHash(18);

while (lineFileNextReal(lf, &line))
    {
    ++moduleCount;
    /* Blanks become underscores before the line is split into words. */
    subChar(line, ' ', '_');
    module = nextWord(&line);
    while ((motif = nextWord(&line)) != NULL)
        {
        ++motifCount;
        fprintf(f, "%s\t%s\n", module, motif);
        hashAdd2(hash, module, motif, NULL);
        }
    }
dyStringPrintf(dy,
"CREATE TABLE  %s (\n"
"    module int not null,\n"
"    motif varchar(255) not null,\n"
"              #Indices\n"
"    INDEX(module),\n"
"    INDEX(motif(16))\n"
")\n",  table);
sqlRemakeTable(conn, table, dy->string);
/* The statement text is not needed once the table has been remade. */
dyStringFree(&dy);
verbose(1, "%d modules, %d motifs in modules\n",
	moduleCount, motifCount);
hgLoadTabFile(conn, tmpDir, table, &f);
hgRemoveTabFile(tmpDir, table);
verbose(1, "Loaded %s table\n", table);
lineFileClose(&lf);
return hash;
}
struct hash *makeExpsTable(char *database, char *expTable, char *expFile,
                                 int *expCount)
/* Open experiment file and use it to create experiment table.
   Use optional fields if present, otherwise defaults.
   Return a hash of expId's, keyed by name.
     database - database to load into when doLoad is set.
     expTable - name of the table and of its tab file.
     expFile  - input with one experiment per line: name first, then the
                optional fields.
     expCount - if not NULL, receives the number of experiments read. */
{
struct lineFile *lf = lineFileOpen(expFile, TRUE);
FILE *f = hgCreateTabFile(tabDir, expTable);
int expId = 0;
char *words[6];
int wordCt;
struct hash *expHash = newHash(0);

while ((wordCt = lineFileChopNext(lf, words, ArraySize(words))))
    {
    char *name = words[0];
    hashAddInt(expHash, name, expId);
    /* Ids are handed out in input order, starting at 0. */
    fprintf(f, "%d\t%s\t", expId++, name);
    /* A missing column falls back to the name or to a default value. */
    fprintf(f, "%s\t", wordCt > 1 ? words[1] : name);
    fprintf(f, "%s\t", wordCt > 2 ? words[2] : expUrl);
    fprintf(f, "%s\t", wordCt > 3 ? words[3] : expRef);
    fprintf(f, "%s\t", wordCt > 4 ? words[4] : expCredit);
    fprintf(f, "0\n");          /* extras */
    }
if (expId <= 0)
    errAbort("No experiments in %s", lf->fileName);
verbose(2, "%d experiments\n", expId);

if (doLoad)
    {
    struct sqlConnection *conn = sqlConnect(database);
    expRecordCreateTable(conn, expTable);
    hgLoadTabFile(conn, tabDir, expTable, &f);
    sqlDisconnect(&conn);
    }
else
    {
    /* Nothing else will close the tab file on this path. */
    carefulClose(&f);
    }
lineFileClose(&lf);
if (expCount)
    *expCount = expId;
return expHash;
}
Пример #15
0
void makeNewDataTable(char *database, char *oldTable, struct maMedSpec *medList, char *newTable)
/* Create new table in database based on medians of data
 * in old table as defined by medList.
 * The tab file is always written (in tabDir); the table is created and
 * loaded only when doLoad is set. */
{
struct sqlConnection *conn = sqlConnect(database);
FILE *f = hgCreateTabFile(tabDir, newTable);
struct expData *expList, *medianExpList, *exp;

expList = expDataLoadTableLimit(conn, oldTable, limit);
medianExpList = maExpDataMedianFromSpec(expList, medList, minExps);
for (exp = medianExpList; exp != NULL; exp = exp->next)
    expDataTabOut(exp, f);
if (doLoad)
    {
    expDataCreateTable(conn, newTable);
    hgLoadTabFile(conn, tabDir, newTable, &f);
    hgRemoveTabFile(tabDir, newTable);
    }
else
    {
    /* Nothing else will close the tab file on this path. */
    carefulClose(&f);
    }
expDataFreeList(&expList);
expDataFreeList(&medianExpList);
sqlDisconnect(&conn);
}
Пример #16
0
void hgGnfMicroarray(char *expTable, char *dataTable, char *atlasFile)
/** Main function that does all the work for new-style
 * atlas files.
 *   expTable  - experiment table, built from the header line of atlasFile.
 *   dataTable - data table, built from the remaining lines.
 *   atlasFile - input file; after an optional "Affy" and/or "Gene Name"
 *               prefix the header line must continue with a tab.
 * Rows whose id was already seen are skipped with a warning.  When limit is
 * non-zero, at most that many rows are written. */
{
struct lineFile *lf = lineFileOpen(atlasFile, TRUE);
char *line;
int i, wordCount, expCount;
char **row;
float *data;
char *affyId;
struct hash *hash = newHash(17);
FILE *f = NULL;
int dataCount = 0;

/* Open Atlas file and use first line to create experiment table. */
if (!lineFileNextReal(lf, &line))
    errAbort("%s is empty", lf->fileName);
if (startsWith("Affy", line))
    line += 4;
if (startsWith("Gene Name", line))
    line += 9;
if (line[0] != '\t')
    errAbort("%s doesn't seem to be a new format atlas file", lf->fileName);
expCount = lineToExpTable(line+1, expTable);
if (expCount <= 0)
    errAbort("No experiments in %s it seems", lf->fileName);
warn("%d experiments\n", expCount);

f = hgCreateTabFile(tabDir, dataTable);

AllocArray(row, expCount);
AllocArray(data, expCount);
while (lineFileNextReal(lf, &line))
    {
    affyId = nextWord(&line);
    wordCount = chopByWhite(line, row, expCount);
    if (wordCount != expCount)
        errAbort("Expecting %d data points, got %d line %d of %s", 
                expCount, wordCount, lf->lineIx, lf->fileName);
    /* Optionally cut the id off where chopName first occurs in it. */
    if (chopName != NULL)
        {
        char *e = stringIn(chopName, affyId);
        if (e != NULL)
            *e = 0;
        }
    if (hashLookup(hash, affyId))
        {
        warn("Duplicate %s, skipping all but first.", affyId);
        continue;
        }
    /* Remember the id; without this the duplicate test above never fires. */
    hashAdd(hash, affyId, NULL);
    for (i=0; i<expCount; ++i)
        {
        data[i] = sqlFloat(row[i]);
        }
    shortDataOut(f, affyId, expCount, data);
    ++dataCount;
    if (limit != 0 && dataCount >= limit)
        break;
    }
lineFileClose(&lf);
freez(&row);
freez(&data);

if (doLoad)
    {
    struct sqlConnection *conn = sqlConnect(database);
    expDataCreateTable(conn, dataTable);
    hgLoadTabFile(conn, tabDir, dataTable, &f);
    hgRemoveTabFile(tabDir, dataTable);
    sqlDisconnect(&conn);
    }
else
    {
    /* Nothing else will close the tab file on this path. */
    carefulClose(&f);
    }
}
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable)
/* hgExpDistance - Create table that measures expression distance between pairs.
 *   database - database holding posTable and receiving outTable.
 *   posTable - table providing the name, expCount and expScores columns.
 *   expTable - not referenced in this function.
 *   outTable - table to create, load and index.
 * numThreads workers running computeDistance are started; this thread takes
 * their results off synQ and writes them to the tab file. */
{
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;
char query[256];
char **row;
struct hash *expHash = hashNew(16);
int realExpCount = -1;
struct microData *gene;
int rc, t;
pthread_t *threads = NULL;
pthread_attr_t attr;
int *threadID = NULL;
void *status;
char *tempDir = ".";
int arrayNum; 
struct microDataDistance *geneDistPtr = NULL;	
struct microDataDistance *geneDistArray = NULL;	
int geneIx;
FILE *f = NULL;

/* Get list/hash of all items with expression values. */
safef(query, sizeof(query), "select name,expCount,expScores from %s", posTable);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *name = row[0];
    /* Only the first row seen for a given name is kept. */
    if (!hashLookup(expHash, name))
        {
        int expCount = sqlUnsigned(row[1]);
        int commaCount;
        float *expScores = NULL;

        sqlFloatDynamicArray(row[2], &expScores, &commaCount);
        if (expCount != commaCount)
            errAbort("expCount and expScores don't match on %s in %s", name, posTable);
        /* The first row fixes the experiment count every row must have. */
        if (realExpCount == -1)
            realExpCount = expCount;
        if (expCount != realExpCount)
            errAbort("In %s some rows have %d experiments others %d", 
                name, expCount, realExpCount);
        AllocVar(gene);
        gene->expCount = expCount;
        gene->expScores = expScores;
        hashAddSaveName(expHash, name, gene, &gene->name);
        slAddHead(&geneList, gene);
        }
    }
sqlFreeResult(&sr);
/* Close the first connection before opening a new one, so the old handle is
 * not lost when conn is reassigned. */
sqlDisconnect(&conn);
conn = sqlConnect(database);
slReverse(&geneList);
geneCount = slCount(geneList);
printf("Have %d elements in %s\n", geneCount, posTable);

weights = getWeights(realExpCount);

if (optionExists("lookup"))
    geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList);
geneCount = slCount(geneList);
printf("Got %d unique elements in %s\n", geneCount, posTable);

sqlDisconnect(&conn);	/* Disconnect because next step is slow. */


if (geneCount < 1)
    errAbort("ERROR: unique gene count less than one ?");

f = hgCreateTabFile(tempDir, outTable);
synQ = synQueueNew();

/* instantiate threads */
AllocArray( threadID, numThreads );
AllocArray( threads, numThreads );
pthread_attr_init( &attr );
pthread_mutex_init( &mutexDotOut, NULL );
pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE );

for (t = 0; t < numThreads; t++) {
    /* Each worker gets a pointer to its own slot in threadID. */
    threadID[t] = t;
    rc = pthread_create( &threads[t], &attr, computeDistance, 
                        (void *) &threadID[t]);
    if (rc)
        errAbort("ERROR: in pthread_create() %d\n", rc );
} 

/* this thread will write to the file from the queue */
for (arrayNum = 0; arrayNum < geneCount; arrayNum++) {
    /* One array is taken off the queue per gene and freed once written. */
    geneDistArray = (struct microDataDistance *)synQueueGet( synQ );
    geneDistPtr = geneDistArray;
    /* Print out closest GENEDISTS distances in tab file. */
    for (geneIx=0; geneIx < GENEDISTS && geneIx < geneCount; 
                        ++geneIx, geneDistPtr++)
        if (geneDistPtr != NULL)
            fprintf(f, "%s\t%s\t%f\n", geneDistPtr->name1, 
                geneDistPtr->name2, geneDistPtr->distance);
        else
            errAbort("ERROR: writing distance %d to file\n", 
                        geneIx);
    freeMem( geneDistArray );
}

/* synchronize all threads */
for (t = 0; t < numThreads; t++) {
    rc = pthread_join( threads[t], &status);
    if (rc)
        errAbort("ERROR: in pthread_join() %d\n", rc );
} 

/* Every worker has been joined, so the bookkeeping arrays can go. */
freeMem( threads );
freeMem( threadID );

printf("Made %s.tab\n", outTable);

slFreeList( &geneList );

pthread_mutex_destroy( &mutexDotOut );
pthread_attr_destroy( &attr );

/* Create and load table. */
conn = sqlConnect(database);
distanceTableCreate(conn, outTable);
hgLoadTabFile(conn, tempDir, outTable, &f);
printf("Loaded %s\n", outTable);

/* Add indices. */
safef(query, sizeof(query), "alter table %s add index(query(12))", outTable);
sqlUpdate(conn, query);
printf("Made query index\n");
if (optionExists("targetIndex"))
    {
    safef(query, sizeof(query), "alter table %s add index(target(12))", outTable);
    sqlUpdate(conn, query);
    printf("Made target index\n");
    }

hgRemoveTabFile(tempDir, outTable);
sqlDisconnect(&conn);
}
Пример #18
0
void hgLoadRnaFold(char *database, char *table, char *foldDir)
/* hgLoadRnaFold - Load a directory full of RNA fold files into database.
 *   database - database to load into when doLoad is set.
 *   table    - name of the table and of its tab file.
 *   foldDir  - directory with one fold file per entry; an entry named "CVS"
 *              is skipped.
 * From each file the first line is taken as the sequence and the second as
 * the nested parentheses followed by an "(energy)" term. */
{
char path[PATH_LEN];
struct slName *dirList, *dirEl;
struct lineFile *lf;
char *line, *word, *s, c;
FILE *f = hgCreateTabFile(tabDir, table);
int count = 0;

dirList = listDir(foldDir, "*");
for (dirEl = dirList; dirEl != NULL; dirEl = dirEl->next)
    {
    char *name = dirEl->name;
    if (sameString(name, "CVS"))
        continue;
    safef(path, sizeof(path), "%s/%s", foldDir, name);
    lf = lineFileOpen(path, TRUE);
    if (!lineFileNext(lf, &line, NULL))
        {
        if (warnEmpty)
            {
            warn("%s is empty, skipping\n", name);
            lineFileClose(&lf);
            continue;
            }
        else
            errAbort("%s is empty\n", name);
        }
    /* NOTE(review): the checks below are written as if notFold never
     * returns (s is dereferenced right after the s == NULL test) - confirm. */
    if (!isupper((unsigned char)line[0]))
        notFold(path, 1);
    fprintf(f, "%s\t", name);	/* Save name */
    fprintf(f, "%s\t", line);	/* Save sequence */
    lineFileNeedNext(lf, &line, NULL);
    c = line[0];
    if (c != '.' && c != '(')
        notFold(path, 2);
    word = nextWord(&line);
    fprintf(f, "%s\t", word);	/* Save nested parenthesis */

    /* Parse out (energy) term at end of line.  nextWord may leave line NULL
     * when nothing follows the first word, so guard the search. */
    s = (line == NULL) ? NULL : strchr(line, '(');
    if (s == NULL)
        notFold(path, 3);
    word = skipLeadingSpaces(s+1);
    /* The energy has to start with a minus sign or a digit. */
    if (word == NULL || (word[0] != '-' && !isdigit((unsigned char)word[0])))
        notFold(path, 4);
    if ((s = strchr(word, ')')) == NULL)
        notFold(path, 5);
    *s = 0;
    fprintf(f, "%s\n", word);
    
    lineFileClose(&lf);
    ++count;
    }
slFreeList(&dirList);
printf("Parsed %d files\n", count);
if (doLoad)
    {
    struct sqlConnection *conn = sqlConnect(database);
    rnaFoldCreateTable(conn, table);
    hgLoadTabFile(conn, tabDir, table, &f);
    hgRemoveTabFile(tabDir, table);
    sqlDisconnect(&conn);
    }
else
    {
    /* Nothing else will close the tab file on this path. */
    carefulClose(&f);
    }
}
Пример #19
0
void processRefSeq(char *database, char *faFile, char *raFile, char *pslFile, char *loc2refFile, 
	char *pepFile, char *mim2locFile)
/* hgRefSeqMrna - Load refSeq mRNA alignments and other info into 
 * refSeqGene table.
 *   database    - database to update.
 *   faFile      - mRNA sequence file, written to the refMrna tab file.
 *   raFile      - .ra records (acc, siz, gen, cds, ngi, pro) per mRNA.
 *   pslFile     - alignments; those with a spXref2 match become refGene rows.
 *   loc2refFile - tab-separated file supplying locus link id and protein
 *                 accession per mRNA accession.
 *   pepFile     - peptide sequence file, written to the refPep tab file.
 *   mim2locFile - two-column file: OMIM id, then locus link id.
 * All tab files are written in "."; they are loaded into the database and
 * the sequence tables are written only when clTest is not set. */
{
struct lineFile *lf;
struct hash *raHash, *rsiHash = newHash(0);
struct hash *loc2mimHash = newHash(0);
struct refSeqInfo *rsiList = NULL, *rsi;
char *s, *line, *row[5];
int wordCount, dotMod = 0;
int noLocCount = 0;
int rsiCount = 0;
int noProtCount = 0;
struct psl *psl;
struct sqlConnection *conn = hgStartUpdate(database);
struct hash *productHash = loadNameTable(conn, "productName", 16);
struct hash *geneHash = loadNameTable(conn, "geneName", 16);
char *kgName = "refGene";

FILE *kgTab = hgCreateTabFile(".", kgName);
FILE *productTab = hgCreateTabFile(".", "productName");
FILE *geneTab = hgCreateTabFile(".", "geneName");
FILE *refLinkTab = hgCreateTabFile(".", "refLink");
FILE *refPepTab = hgCreateTabFile(".", "refPep");
FILE *refMrnaTab = hgCreateTabFile(".", "refMrna");

struct exon *exonList = NULL, *exon;
char *answer;
char cond_str[200];

/* Make refLink and the other tables if they don't exist already, and empty
 * each of them. */
sqlMaybeMakeTable(conn, "refLink", refLinkTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refLink");
sqlMaybeMakeTable(conn, "refGene", refGeneTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refGene");
sqlMaybeMakeTable(conn, "refPep", refPepTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refPep");
sqlMaybeMakeTable(conn, "refMrna", refMrnaTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refMrna");

/* Scan through locus link to omim ID file and put in hash. */
    {
    char *row[2];

    printf("Scanning %s\n", mim2locFile);
    lf = lineFileOpen(mim2locFile, TRUE);
    while (lineFileRow(lf, row))
        {
        /* Keyed by the second column, storing the first as an integer. */
        hashAdd(loc2mimHash, row[1], intToPt(atoi(row[0])));
        }
    lineFileClose(&lf);
    }

/* Scan through .ra file and make up start of refSeqInfo
 * objects in hash and list. */
printf("Scanning %s\n", raFile);
lf = lineFileOpen(raFile, TRUE);
while ((raHash = hashNextRa(lf)) != NULL)
    {
    if (clDots > 0 && ++dotMod == clDots )
        {
        dotMod = 0;
        dotOut();
        }
    AllocVar(rsi);
    slAddHead(&rsiList, rsi);
    if ((s = hashFindVal(raHash, "acc")) == NULL)
        errAbort("No acc near line %d of %s", lf->lineIx, lf->fileName);
    rsi->mrnaAcc = cloneString(s);
    if ((s = hashFindVal(raHash, "siz")) == NULL)
        errAbort("No siz near line %d of %s", lf->lineIx, lf->fileName);
    rsi->size = atoi(s);
    if ((s = hashFindVal(raHash, "gen")) != NULL)
        rsi->geneName = cloneString(s);
    //!!!else
      //!!!  warn("No gene name for %s", rsi->mrnaAcc);
    /* Without a cds entry only cdsEnd is set here, to the full size. */
    if ((s = hashFindVal(raHash, "cds")) != NULL)
        parseCds(s, 0, rsi->size, &rsi->cdsStart, &rsi->cdsEnd);
    else
        rsi->cdsEnd = rsi->size;
    if ((s = hashFindVal(raHash, "ngi")) != NULL)
        rsi->ngi = atoi(s);

    rsi->geneNameId = putInNameTable(geneHash, geneTab, rsi->geneName);
    s = hashFindVal(raHash, "pro");
    if (s != NULL)
        rsi->productName = cloneString(s);
    rsi->productNameId = putInNameTable(productHash, productTab, s);
    hashAdd(rsiHash, rsi->mrnaAcc, rsi);

    freeHashAndVals(&raHash);
    }
lineFileClose(&lf);
if (clDots) printf("\n");

/* Scan through loc2ref filling in some gaps in rsi. */
printf("Scanning %s\n", loc2refFile);
lf = lineFileOpen(loc2refFile, TRUE);
while (lineFileNext(lf, &line, NULL))
    {
    char *mrnaAcc;

    if (line[0] == '#')
        continue;
    wordCount = chopTabs(line, row);
    if (wordCount < 5)
        errAbort("Expecting at least 5 tab-separated words line %d of %s",
                lf->lineIx, lf->fileName);
    mrnaAcc = row[1];
    mrnaAcc = accWithoutSuffix(mrnaAcc);

    if (mrnaAcc[2] != '_')
        warn("%s is and odd name %d of %s", 
                mrnaAcc, lf->lineIx, lf->fileName);
    if ((rsi = hashFindVal(rsiHash, mrnaAcc)) != NULL)
        {
        rsi->locusLinkId = lineFileNeedNum(lf, row, 0);
        rsi->omimId = ptToInt(hashFindVal(loc2mimHash, row[0]));
        rsi->proteinAcc = cloneString(accWithoutSuffix(row[4]));
        }
    }
lineFileClose(&lf);

/* Report how many seem to be missing from loc2ref file. 
 * Write out knownInfo file. */
printf("Writing %s\n", "refLink.tab");
for (rsi = rsiList; rsi != NULL; rsi = rsi->next)
    {
    ++rsiCount;
    if (rsi->locusLinkId == 0)
        ++noLocCount;
    if (rsi->proteinAcc == NULL)
        ++noProtCount;
    fprintf(refLinkTab, "%s\t%s\t%s\t%s\t%u\t%u\t%u\t%u\n",
        emptyForNull(rsi->geneName), 
        emptyForNull(rsi->productName),
        emptyForNull(rsi->mrnaAcc), 
        emptyForNull(rsi->proteinAcc),
        rsi->geneNameId, rsi->productNameId, 
        rsi->locusLinkId, rsi->omimId);
    }
if (noLocCount) 
    printf("Missing locusLinkIds for %d of %d\n", noLocCount, rsiCount);
if (noProtCount)
    printf("Missing protein accessions for %d of %d\n", noProtCount, rsiCount);

/* Process alignments and write them out as genes. */
lf = pslFileOpen(pslFile);
dotMod = 0;
while ((psl = pslNext(lf)) != NULL)
  {
  if (hashFindVal(rsiHash, psl->qName) != NULL)
    {
    if (clDots > 0 && ++dotMod == clDots )
        {
        dotMod = 0;
        dotOut();
        }
   
    /* Only alignments whose accession is found in spXref2 are written. */
    sqlSafefFrag(cond_str, sizeof cond_str, "extAC='%s'", psl->qName);
    answer = sqlGetField(proteinDB, "spXref2", "displayID", cond_str);
	       
    if (answer == NULL)
        {
        fprintf(stderr, "%s NOT FOUND.\n", psl->qName);
        fflush(stderr);
        }

    if (answer != NULL)
        {	
        struct genePred *gp = NULL;
        exonList = pslToExonList(psl);
        fprintf(kgTab, "%s\t%s\t%c\t%d\t%d\t",
            psl->qName, psl->tName, psl->strand[0], psl->tStart, psl->tEnd);
        rsi = hashMustFindVal(rsiHash, psl->qName);

        /* The genePred is built only for its cdsStart/cdsEnd values. */
        gp = genePredFromPsl(psl, rsi->cdsStart, rsi->cdsEnd, genePredStdInsertMergeSize);
        if (!gp)
            errAbort("Cannot convert psl (%s) to genePred.\n", psl->qName);

        fprintf(kgTab, "%d\t%d\t", gp->cdsStart, gp->cdsEnd);
        fprintf(kgTab, "%d\t", slCount(exonList));
    
        fflush(kgTab);
     
        /* Exon starts, then exon ends, each as a comma-terminated list. */
        for (exon = exonList; exon != NULL; exon = exon->next)
            fprintf(kgTab, "%d,", exon->start);
        fprintf(kgTab, "\t");
    
        for (exon = exonList; exon != NULL; exon = exon->next)
            fprintf(kgTab, "%d,", exon->end);
        fprintf(kgTab, "\n");
        slFreeList(&exonList);
        }
    }
  else
    {
    fprintf(stderr, "%s found in psl, but not in .fa or .ra data files.\n", psl->qName);
    fflush(stderr);
    }
  }
/* The alignment file has been read to the end; close it like the others. */
lineFileClose(&lf);

if (clDots) printf("\n");

if (!clTest)
    {
    writeSeqTable(pepFile, refPepTab, FALSE, TRUE);
    writeSeqTable(faFile, refMrnaTab, FALSE, FALSE);
    }

/* The tab files are closed here, so NULL is passed as the file handle in
 * the hgLoadTabFile calls below. */
carefulClose(&kgTab);
carefulClose(&productTab);
carefulClose(&geneTab);
carefulClose(&refLinkTab);
carefulClose(&refPepTab);
carefulClose(&refMrnaTab);

if (!clTest)
    {
    printf("Loading database with %s\n", kgName);
    fflush(stdout);
    
    hgLoadTabFile(conn, ".", kgName, NULL);

    printf("Loading database with %s\n", "productName");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "productName", NULL);
    
    printf("Loading database with %s\n", "geneName");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "geneName", NULL);
    
    printf("Loading database with %s\n", "refLink");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refLink", NULL);
    
    printf("Loading database with %s\n", "refPep");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refPep", NULL);
    
    printf("Loading database with %s\n", "refMrna");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refMrna", NULL);
    }
}
Пример #20
0
void hgKnownToSuper(char *database, char *org, char *assFile)
/* hgKnownToSuper - Load knownToSuperfamily table.
 *   database - database to load the knownToSuper table into.
 *   org      - organism to select; only rows whose first column equals it
 *              are used.
 *   assFile  - six-column assignment file: organism, peptide name, an unused
 *              column, comma-separated "start-end" regions, e-value and
 *              superfamily id. */
{
    struct sqlConnection *conn = sqlConnect(database);
    struct hash *pepToKnown = ensPepToKnown(conn, TRUE);
    char *table = "knownToSuper";
    FILE *f = hgCreateTabFile(tempDir, table);
    struct lineFile *lf = lineFileOpen(assFile, TRUE);
    boolean gotOrg = FALSE;
    int outCount = 0;
    char *row[6];

    while (lineFileRow(lf, row))
    {
        if (sameString(row[0], org))
        {
            char *pepName = row[1];
            char *regions = row[3];
            char *eVal = row[4];
            char *supId = row[5];
            char *knownId = hashFindVal(pepToKnown, pepName);
            if (knownId != NULL)
            {
                char *region, *e;
                int start,end;
                /* Loop through comma-separated region string. */
                for (region = regions; region != NULL; region = e)
                {
                    e = strchr(region, ',');
                    if (e != NULL)
                    {
                        /* Terminate this region; a trailing comma with
                         * nothing after it ends the loop. */
                        *e++ = 0;
                        if (e[0] == 0)
                            e = NULL;
                    }
                    if (sscanf(region, "%d-%d", &start, &end) < 2)
                        errAbort("bad region %s line %d of %s", region,
                                 lf->lineIx, lf->fileName);
                    /* The start is written out reduced by one. */
                    fprintf(f, "%s\t%s\t%d\t%d\t%s\n",
                            knownId, supId, start-1, end, eVal);
                    ++outCount;
                }
            }
            /* Set even when the peptide had no known id. */
            gotOrg = TRUE;
        }
    }
    lineFileClose(&lf);
    if (!gotOrg)
        errAbort("Looks like '%s' is not a recognized organism", org);
    if (outCount <= 0)
        errAbort("No good records found in %s", assFile);
    printf("%d records output\n", outCount);

    /* Refresh connection in case things took a while. */
    sqlDisconnect(&conn);
    conn = sqlConnect(database);

    /* Load up database. */
    createTable(conn, table);
    hgLoadTabFile(conn, tempDir, table, &f);
    hgRemoveTabFile(tempDir, table);

    /* Close the refreshed connection as well. */
    sqlDisconnect(&conn);
}
void hgStsAlias(char *database, char *inFile)
/* hgStsAlias - Make table of STS aliases.
 * Lines of inFile that have exactly two words are used; the second word is
 * a ';'-separated alias list.  The first alias found in the hash from
 * makeTrueHash becomes the true name, and every other alias on that line is
 * written out paired with it.  Lines with any other word count are skipped,
 * with a warning for the first such line only. */
{
/* Remembers across lines that the malformed-line warning was given. */
static boolean warned = FALSE;
struct lineFile *lf = lineFileOpen(inFile, TRUE);
char *words[16],*parts[64];
char *table = "stsAlias";
struct sqlConnection *conn = sqlConnect(database);
FILE *f = hgCreateTabFile(".", table);
struct hash *trueHash = makeTrueHash(conn);
int wordCount, partCount, ix;
int aliasCount = 0;

while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    char *trueName = NULL;

    if (wordCount != 2)
        {
        if (!warned)
            {
            warn("Got %d words line %d of %s, skipping", wordCount, lf->lineIx, 
                lf->fileName);
            warn("There may be other lines like this as well");
            warned = TRUE;
            }
        continue;
        }
    lineFileExpectWords(lf, 2, wordCount);
    partCount = chopByChar(words[1], ';', parts, ArraySize(parts));
    if (partCount >= ArraySize(parts))
        errAbort("Too many aliases line %d of %s\n", lf->lineIx, lf->fileName);

    /* Find the first alias that has an entry in trueHash. */
    for (ix = 0; ix < partCount && trueName == NULL; ++ix)
        {
        if (hashLookup(trueHash, parts[ix]))
            trueName = parts[ix];
        }

    /* Lines without any known name contribute nothing. */
    if (trueName == NULL)
        continue;

    /* Pair every other alias with the true name; the true name itself is
     * recognized by pointer identity. */
    for (ix = 0; ix < partCount; ++ix)
        {
        char *alias = parts[ix];
        if (alias == trueName)
            continue;
        ++aliasCount;
        fprintf(f, "%s\t%s\n", alias, trueName);
        }
    }
lineFileClose(&lf);
printf("Found %d aliases in %s\n", aliasCount, inFile);
hgLoadTabFile(conn, ".", table, &f);
sqlDisconnect(&conn);
printf("Loaded table %s in database %s\n", table, database);
}
Пример #22
0
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable)
/* hgExpDistance - Create table that measures expression distance between pairs.
 *   database - database that is queried and that receives outTable.
 *   posTable - table the name, expCount and expScores columns are read from.
 *   expTable - not referenced in this function body.
 *   outTable - name of the tab file (in ".") and of the table created.
 * For each gene, all genes are sorted with cmpMicroDataDistance and the first
 * 1000 (or fewer, if there are fewer genes) are written to the tab file. */
{
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;
char query[256];
char **row;
struct hash *expHash = hashNew(16);
int realExpCount = -1;
struct microData *geneList = NULL, *curGene, *gene;
int geneIx, geneCount = 0;
struct microData **geneArray = NULL;
float *weights = NULL;
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);

/* Get list/hash of all items with expression values. */
sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *name = row[0];
    /* Only the first row seen for a given name is kept. */
    if (!hashLookup(expHash, name))
	{
	int expCount = sqlUnsigned(row[1]);
	int commaCount;
	float *expScores = NULL;

	sqlFloatDynamicArray(row[2], &expScores, &commaCount);
	if (expCount != commaCount)
	    errAbort("expCount and expScores don't match on %s in %s", name, posTable);
	/* The first row fixes the experiment count all others must match. */
	if (realExpCount == -1)
	    realExpCount = expCount;
	if (expCount != realExpCount)
	    errAbort("In %s some rows have %d experiments others %d", 
	    	name, expCount, realExpCount);
	AllocVar(gene);
	gene->expCount = expCount;
	gene->expScores = expScores;
	hashAddSaveName(expHash, name, gene, &gene->name);
	slAddHead(&geneList, gene);
	}
    }
sqlFreeResult(&sr);

/* Reconnect for the lookup step.  The previous connection is closed first;
 * simply overwriting conn would leak it. */
sqlDisconnect(&conn);
conn = sqlConnect(database);
slReverse(&geneList);
geneCount = slCount(geneList);
printf("Have %d elements in %s\n", geneCount, posTable);

weights = getWeights(realExpCount);

if (optionExists("lookup"))
    geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList);
geneCount = slCount(geneList);
printf("Got %d unique elements in %s\n", geneCount, posTable);

sqlDisconnect(&conn);	/* Disconnect because next step is slow. */


if (geneCount < 1)
    errAbort("ERROR: unique gene count less than one ?");
/* Get an array for sorting. */
AllocArray(geneArray, geneCount);
for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx)
    geneArray[geneIx] = gene;

/* Print out closest 1000 in tab file.  geneArray is re-sorted for every
 * curGene after calcDistances() has been run for it. */
for (curGene = geneList; curGene != NULL; curGene = curGene->next)
    {
    calcDistances(curGene, geneList, weights);
    qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance);
    for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx)
        {
	gene = geneArray[geneIx];
	fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance);
	}
    dotOut();
    }
printf("Made %s.tab\n", outTable);

/* Create and load table. */
conn = sqlConnect(database);
distanceTableCreate(conn, outTable);
hgLoadTabFile(conn, tempDir, outTable, &f);
printf("Loaded %s\n", outTable);

/* Add indices. */
sqlSafef(query, sizeof(query), "alter table %s add index(query)", outTable);
sqlUpdate(conn, query);
printf("Made query index\n");
if (optionExists("targetIndex"))
    {
    sqlSafef(query, sizeof(query), "alter table %s add index(target)", outTable);
    sqlUpdate(conn, query);
    printf("Made target index\n");
    }

hgRemoveTabFile(tempDir, outTable);
sqlDisconnect(&conn);	/* Release the connection used for loading/indexing. */
}
Пример #23
0
void knownToVisiGene(char *database)
/* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */
{
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);
struct sqlConnection *hConn = sqlConnect(database);
struct sqlConnection *iConn = sqlConnect(visiDb);
struct sqlResult *sr;
char **row;
struct hash *geneImageHash = newHash(18);
struct hash *locusLinkImageHash = newHash(18);
struct hash *refSeqImageHash = newHash(18);
struct hash *genbankImageHash = newHash(18);
struct hash *probeImageHash = newHash(18);
struct hash *knownToLocusLinkHash = newHash(18);
struct hash *knownToRefSeqHash = newHash(18);
struct hash *knownToGeneHash = newHash(18);
struct hash *favorHugoHash = newHash(18);
struct hash *knownToProbeHash = newHash(18);
struct hash *knownToAllProbeHash = newHash(18);
struct genePred *knownList = NULL, *known;
struct hash *dupeHash = newHash(17);


probesDb  = optionVal("probesDb", database);
struct sqlConnection *probesConn = sqlConnect(probesDb);
vgProbes = sqlTableExists(probesConn,"vgProbes");
vgAllProbes = sqlTableExists(probesConn,"vgAllProbes");

/* Go through and make up hashes of images keyed by various fields. */
sr = sqlGetResult(iConn,
        NOSQLINJ "select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank"
	",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id"
	" from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap"
	" where image.imageFile = imageFile.id"
	" and image.id = imageProbe.image"
	" and imageProbe.probe = probe.id"
	" and probe.gene = gene.id"
	" and image.submissionSet=submissionSet.id"
	" and vgPrbMap.probe = probe.id");

while ((row = sqlNextRow(sr)) != NULL)
    {
    int id = sqlUnsigned(row[0]);
    float priority = atof(row[1]);
    int privateUser = sqlSigned(row[7]);
    char vgPrb_Id[256];
    safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]);
    int geneId = sqlUnsigned(row[9]);
    if (privateUser == 0)
	{
	addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id);
	addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]);
	addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]);
	addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]);
	addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]);
	}
    }
verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d"
           ", genbankImageHash %d probeImageHash %d\n", 
            geneImageHash->elCount, locusLinkImageHash->elCount, refSeqImageHash->elCount, 
	    genbankImageHash->elCount, probeImageHash->elCount);
sqlFreeResult(&sr);

/* Build up list of known genes. */
sr = sqlGetResult(hConn, NOSQLINJ "select * from knownGene");
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *known = genePredLoad(row);
    if (!hashLookup(dupeHash, known->name))
        {
	hashAdd(dupeHash, known->name, NULL);
	slAddHead(&knownList, known);
	}
    }
slReverse(&knownList);
sqlFreeResult(&sr);
verbose(2, "Got %d known genes\n", slCount(knownList));

/* Build up hashes from knownGene to other things. */
if (vgProbes)
    bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash);
if (vgAllProbes)
    bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash);

foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE);
foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE);
foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE);
foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);
foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);

verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n", 
   knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount,
   knownToProbeHash->elCount, knownToAllProbeHash->elCount);

/* Try and find an image for each gene. */
for (known = knownList; known != NULL; known = known->next)
    {
    char *name = known->name;
    struct prioritizedImage *best = NULL;
    {
    best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash);
    if (!best)
	best = bestImage(name, knownToRefSeqHash, refSeqImageHash);
    if (!best)
	{
	best = hashFindVal(genbankImageHash, name);
	}
    if (!best)
	best = bestImage(name, knownToGeneHash, geneImageHash);
    if (vgProbes && !best)
	best = bestImage(name, knownToProbeHash, probeImageHash);
    if (vgAllProbes && !best)
	best = bestImage(name, knownToAllProbeHash, probeImageHash);
    }	    
    if (best)
        {
	fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId);
	}
    }

createTable(hConn, outTable);
hgLoadTabFile(hConn, tempDir, outTable, &f);
hgRemoveTabFile(tempDir, outTable);
}
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table,
	struct hash *geneToModuleHash, struct hash *moduleAndMotifHash,
	struct hash *motifHash, struct hash *positionsHash,
	char *regionTable)
/* Load a matrix file with one gene per row and one motif per column.
 * A non-empty (tab separated) cell holds the hit list for that gene/motif
 * pair and is handed to convertMotifPos() together with the gene's region
 * from positionsHash.  A cell is used only when the (module, motif) pair for
 * the gene's module is present in moduleAndMotifHash.
 * Two tables are written: table with all motif hits, and regionTable with
 * the regions of the genes that contributed hits. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
FILE *motifFile = hgCreateTabFile(tmpDir, table);
char *motifNames[32*1024], *row[32*1024];
int motifCount, rowSize, i;
int geneCount = 0, total = 0;
struct dyString *dy = dyStringNew(512);
struct genomePos *motifPosList = NULL;
struct genomePos *regionPosList = NULL;
struct genomePos *pos;

/* Read first line, which is labels.  Slot 0 is the gene column and carries
 * no motif name. */
if (!lineFileNextReal(lf, &line))
    errAbort("Empty file %s", fileName);
subChar(line, ' ', '_');
motifCount = chopLine(line, motifNames);
if (motifCount >= ArraySize(motifNames))
    errAbort("Too many motifs line 1 of %s", fileName);
lineFileExpectAtLeast(lf, 2, motifCount);
motifNames[0] = NULL;
for (i = 1; i < motifCount; ++i)
    {
    char fixedName[64];
    motifNames[i] = cloneString(fixMotifName(motifNames[i], fixedName, sizeof(fixedName)));
    if (!hashLookup(motifHash, motifNames[i]))
        errAbort("Motif %s is in %s but not modules_motifs.gxm",
		motifNames[i], fileName);
    }

/* Read subsequent lines, one gene each. */
while ((rowSize = lineFileChopTab(lf, row)) != 0)
    {
    char *gene, *module;
    struct genomePos *regionPos = NULL;

    lineFileExpectWords(lf, motifCount, rowSize);
    gene = row[0];
    module = hashFindVal(geneToModuleHash, gene);
    if (module == NULL)
        {
        warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", 
		gene, lf->lineIx, lf->fileName);
        continue;
        }

    for (i = 1; i < rowSize; ++i)
        {
        struct genomePos *hits;

        if (row[i][0] == 0)
            continue;           /* empty cell: gene lacks this motif */
        if (!hashLookup2(moduleAndMotifHash, module, motifNames[i]))
            continue;           /* motif not listed for the gene's module */

        regionPos = hashFindVal(positionsHash, gene);
        if (regionPos == NULL)
            {
            /* Without a region nothing else on this row can be placed. */
            warn("WARNING: %s in %s but not gene_positions.tab",
		    	gene, fileName);
            break;
            }

        hits = convertMotifPos(row[i], regionPos, 
			hashMustFindVal(motifHash, motifNames[i]), lf);
        motifPosList = slCat(hits, motifPosList);
        ++total;
        }

    if (regionPos != NULL)
        slAddHead(&regionPosList, regionPos);
    ++geneCount;
    }
lineFileClose(&lf);

/* Output sorted table of all motif hits.  Negative starts are clamped to 0
 * before the bin is computed. */
slSort(&motifPosList, genomePosCmp);
for (pos = motifPosList; pos != NULL; pos = pos->next)
    {
    int end = pos->end;
    int start = (pos->start < 0) ? 0 : pos->start;
    fprintf(motifFile, "%d\t%s\t%d\t%d\t%s\t%d\t%c\t%s\n",
            binFromRange(start, end), pos->chrom, start, end,
            pos->motif, pos->score, pos->strand, pos->name);
    }
dyStringPrintf(dy,
    "CREATE TABLE  %s (\n"
    "    bin smallInt unsigned not null,\n"
    "    chrom varChar(255) not null,\n"
    "    chromStart int not null,\n"
    "    chromEnd int not null,\n"
    "    name varchar(255) not null,\n"
    "    score int not null,\n"
    "    strand char(1) not null,\n"
    "    gene varchar(255) not null,\n"
    "              #Indices\n"
    "    INDEX(gene(12)),\n"
    "    INDEX(name(16)),\n"
    "    INDEX(chrom(8),bin)\n"
    ")\n",  table);
sqlRemakeTable(conn, table, dy->string);
verbose(1, "%d genes, %d motifs, %d motifs in genes\n",
	    geneCount, motifCount-1, total);
hgLoadTabFile(conn, tmpDir, table, &motifFile);
/* The tab file is not removed after loading. */
verbose(1, "Loaded %s table\n", table);
slFreeList(&motifPosList);

/* Now output sorted table of upstream regions. */
    {
    FILE *regionFile = hgCreateTabFile(tmpDir, regionTable);
    dyStringClear(dy);
    dyStringPrintf(dy,
    "CREATE TABLE  %s (\n"
    "    bin smallInt unsigned not null,\n"
    "    chrom varChar(255) not null,\n"
    "    chromStart int not null,\n"
    "    chromEnd int not null,\n"
    "    name varchar(255) not null,\n"
    "    score int not null,\n"
    "    strand char(1) not null,\n"
    "              #Indices\n"
    "    INDEX(name(16)),\n"
    "    INDEX(chrom(8),bin)\n"
    ")\n",  regionTable);
    sqlRemakeTable(conn, regionTable, dy->string);
    slSort(&regionPosList, genomePosCmp);
    for (pos = regionPosList; pos != NULL; pos = pos->next)
        {
        int end = pos->end;
        int start = (pos->start < 0) ? 0 : pos->start;
        fprintf(regionFile, "%d\t%s\t%d\t%d\t%s\t%d\t%c\n",
                binFromRange(start, end), pos->chrom, start, end,
                pos->name, pos->score, pos->strand);
        }
    hgLoadTabFile(conn, tmpDir, regionTable, &regionFile);
    /* As above, the region tab file is left in place. */
    }
}
Пример #25
0
void hgLoadChromGraph(boolean doLoad, char *db, char *track, char *fileName)
/* hgLoadChromGraph - Load up chromosome graph.
 *   doLoad   - when FALSE only the sorted tab file is written; no database
 *              work and no .cgb file are produced.
 *   db       - database to load into.
 *   track    - name of the table, tab file and .cgb file.
 *   fileName - input file with the graph values.
 * With the "minusLog10" option each value is rewritten before use:
 * 1 becomes 0, other positive values become -log10(value), and values <= 0
 * are left unchanged. */
{
    double minVal,maxVal;
    struct chromGraph *el, *list;
    FILE *f;
    char *tempDir = ".";
    char path[PATH_LEN], gbdbPath[PATH_LEN];
    char *idTable = optionVal("idTable", NULL);
    char *pathPrefix = NULL;

    if (idTable == NULL)
        list = chromGraphLoadAll(fileName);
    else
        list = chromGraphListWithTable(fileName, db, idTable);
    if (list == NULL)
        errAbort("%s is empty", fileName);

    /* Apply the optional -log10 transform to EVERY element, including the
     * head of the list.  Doing this inside a loop that starts at list->next
     * would leave the first value untransformed and would seed min/max with
     * a raw value. */
    if (optionExists("minusLog10"))
    {
        for (el = list; el != NULL; el = el->next)
        {
            if (el->val == 1)
                el->val = 0;
            else if (el->val > 0)
                el->val = -1 * log(el->val)/log(10);
        }
    }

    /* Figure out min/max values */
    minVal = maxVal = list->val;
    for (el = list->next; el != NULL; el = el->next)
    {
        if (el->val < minVal)
            minVal = el->val;
        if (el->val > maxVal)
            maxVal = el->val;
    }


    /* Sort and write out temp file. */
    slSort(&list, chromGraphCmp);
    f = hgCreateTabFile(tempDir, track);
    for (el = list; el != NULL; el = el->next)
        chromGraphTabOut(el, f);

    if (doLoad)
    {
        struct dyString *dy = dyStringNew(0);
        struct sqlConnection *conn;

        /* Set up connection to database and create main table. */
        conn = hAllocConn(db);
        sqlDyStringPrintf(dy, createString, track, hGetMinIndexLength(db));
        sqlRemakeTable(conn, track, dy->string);

        /* Load main table and clean up file handle. */
        hgLoadTabFile(conn, tempDir, track, &f);
        hgRemoveTabFile(tempDir, track);

        /* If need be create meta table.  If need be delete old row. */
        if (!sqlTableExists(conn, "metaChromGraph"))
            sqlUpdate(conn, metaCreateString);
        else
        {
            dyStringClear(dy);
            sqlDyStringPrintf(dy, "delete from metaChromGraph where name = '%s'",
                              track);
            sqlUpdate(conn, dy->string);
        }

        /* Make chrom graph file.  The binary file is written as
         * <track>.cgb relative to the current directory; the path stored in
         * the meta table is <pathPrefix>/<track>.cgb, where pathPrefix
         * defaults to /gbdb/<db>/chromGraph. */
        safef(path, sizeof(path), "%s.cgb", track);
        chromGraphToBin(list, path);
        safef(path, sizeof(path), "/gbdb/%s/chromGraph", db);
        pathPrefix = optionVal("pathPrefix", path);
        safef(gbdbPath, sizeof(gbdbPath), "%s/%s.cgb", pathPrefix, track);

        /* Create new line in meta table */
        dyStringClear(dy);
        sqlDyStringPrintf(dy, "insert into metaChromGraph values('%s',%f,%f,'%s');",
                          track, minVal, maxVal, gbdbPath);
        sqlUpdate(conn, dy->string);
    }
}
void hgLoadMafSummary(char *db, char *table, char *fileName)
/* hgLoadMafSummary - Load a summary table of pairs in a maf into a database.
 *   db       - used to pick the minimum index length for the new table.
 *   table    - name of the summary table and of the tab file written to ".".
 *   fileName - maf file to summarize.
 * When the file-level test flag is set, the tab file is written but no
 * database connection is made and nothing is loaded.
 * NOTE(review): the connection is opened on the file-level `database`
 * variable rather than the db parameter - confirm they always agree. */
{
long mafCount = 0, allMafCount = 0;
struct mafComp *mcMaster = NULL;
struct mafAli *maf;
struct mafFile *mf = mafOpen(fileName);
struct sqlConnection *conn = NULL;	/* stays NULL in test mode */
FILE *f = hgCreateTabFile(".", table);
long componentCount = 0;
struct hash *componentHash = newHash(0);

if (!test)
    {
    conn = sqlConnect(database);
    mafSummaryTableCreate(conn, table, hGetMinIndexLength(db));
    }
verbose(1, "Indexing and tabulating %s\n", fileName);

/* process mafs */
while ((maf = mafNext(mf)) != NULL)
    {
    mcMaster = mafMaster(maf, mf, fileName);
    allMafCount++;
    /* Alignments whose master source is shorter than minSeqSize are
     * counted in allMafCount but otherwise ignored. */
    if (mcMaster->srcSize < minSeqSize)
	continue;
    while (mcMaster->size > maxSize)
        {
        /* break maf into maxSize pieces */
        int end = mcMaster->start + maxSize;
        struct mafAli *subMaf = 
                mafSubset(maf, mcMaster->src, mcMaster->start, end);
        verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src,
                                        mcMaster->start, mcMaster->size);
        componentCount += 
            processMaf(subMaf, componentHash, f, mf, fileName);
        mafAliFree(&subMaf);
        /* Replace maf with whatever is left after the piece just handled. */
        subMaf = mafSubset(maf, mcMaster->src, 
                                end, end + (mcMaster->size - maxSize));
        mafAliFree(&maf);
        maf = subMaf;
        mcMaster = mafMaster(maf, mf, fileName);
        }
    if (mcMaster->size != 0)
        {
        /* remainder of maf after splitting off maxSize submafs */
        componentCount += 
            processMaf(maf, componentHash, f, mf, fileName);
        }
    mafAliFree(&maf);
    mafCount++;
    }
mafFileFree(&mf);
flushSummaryBlocks(componentHash, f);
verbose(1, 
    "Created %ld summary blocks from %ld components and %ld mafs from %s\n",
        summaryCount, componentCount, allMafCount, fileName);
if (test)
    return;
verbose(1, "Loading into %s table %s...\n", database, table);
hgLoadTabFile(conn, ".", table, &f);
/* Terminate the message with a newline like every other verbose() call here,
 * so following output does not run on from it. */
verbose(1, "Loading complete\n");
hgEndUpdate(&conn, "Add %ld maf summary blocks from %s\n", 
                        summaryCount, fileName);
}
Пример #27
0
void hgFlyBase(char *database, char *genesFile)
/* hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of 
 * tables.
 *   database  - database the tables are (re)made in and loaded into.
 *   genesFile - FlyBase genes file to parse.
 * Each input line starts with a one-character "sub" marker followed by a
 * one-character type; a line starting with '#' ends the current record.
 * Tab files for all tables are written to tabDir; they are loaded and then
 * removed only when the file-level doLoad flag is set.  The fbTranscript
 * table is handled only when doTranscript is set. */
{
char *tGene = "fbGene";
char *tSynonym = "fbSynonym";
char *tAllele = "fbAllele";
char *tRef = "fbRef";
char *tRole = "fbRole";
char *tPhenotype = "fbPhenotype";
char *tTranscript = "fbTranscript";
char *tGo = "fbGo";
char *tUniProt = "fbUniProt";
FILE *fGene = hgCreateTabFile(tabDir, tGene);
FILE *fSynonym = hgCreateTabFile(tabDir, tSynonym);
FILE *fAllele = hgCreateTabFile(tabDir, tAllele);
FILE *fRef = hgCreateTabFile(tabDir, tRef);
FILE *fRole = hgCreateTabFile(tabDir, tRole);
FILE *fPhenotype = hgCreateTabFile(tabDir, tPhenotype);
FILE *fTranscript = NULL;
FILE *fGo = hgCreateTabFile(tabDir, tGo);
FILE *fUniProt = hgCreateTabFile(tabDir, tUniProt);
struct lineFile *lf = lineFileOpen(genesFile, TRUE);
struct hash *refHash = newHash(19);
int nextRefId = 0;
int nextAlleleId = 0;
char *line, sub, type, *rest, *s;
char *geneSym = NULL, *geneName = NULL, *geneId = NULL;
int recordCount = 0;
struct slName *synList = NULL, *syn;
int curAllele = 0, curRef = 0;
struct ref *ref = NULL;
struct sqlConnection *conn;
struct hash *goUniqHash = newHash(18);

/* Make table from flybase genes to BGDP transcripts. */
if (doTranscript)
    {
    fTranscript = hgCreateTabFile(tabDir, tTranscript);
    getAllSplices(database, fTranscript);
    }

/* Make dummy reference for flybase itself. */
fprintf(fRef, "0\tFlyBase\n");

/* Loop through parsing and writing tab files. */
while (lineFileNext(lf, &line, NULL))
    {
    sub = line[0];
    if (sub == '#')
	{
	char *rawSym = geneSym, *rawName = geneName;

	/* End of record. */
	++recordCount;
	if (geneId == NULL)
	    errAbort("Record without *z line ending line %d of %s",
		lf->lineIx, lf->fileName);

	/* Replace symbol and name by their ungreek()ed forms, substituting
	 * naForNull()'s placeholder when a field was never seen.  Only the
	 * original heap copies (made with cloneString below) are freed: the
	 * placeholder returned for a NULL input is not ours to free. */
	geneSym = ungreek(naForNull(rawSym));
	if (rawSym != NULL)
	    freeMem(rawSym);
	geneName = ungreek(naForNull(rawName));
	if (rawName != NULL)
	    freeMem(rawName);

	/* Write out synonyms.  Symbol and name count as synonyms too unless
	 * they are the "n/a" placeholder. */
	if (geneSym != NULL && !sameString(geneSym, "n/a"))
	    slNameStore(&synList, geneSym);
	if (geneName != NULL && !sameString(geneName, "n/a"))
	    slNameStore(&synList, geneName);
	for (syn = synList; syn != NULL; syn = syn->next)
	    {
	    s = ungreek(syn->name);
	    fprintf(fSynonym, "%s\t%s\n", geneId, s);
	    freeMem(s);
	    }

	/* Write out gene record. */
	fprintf(fGene, "%s\t%s\t%s\n", geneId, geneSym, geneName);

	/* Clean up so the next record starts from a blank state. */
	freez(&geneSym);
	freez(&geneName);
	freez(&geneId);
	slFreeList(&synList);
	ref = NULL;
	curRef = curAllele = 0;
	continue;
	}
    else if (sub == 0)
       errAbort("blank line %d of %s, not allowed in gene.txt",
	    lf->lineIx, lf->fileName);
    else if (isalnum((unsigned char)sub))	/* ctype needs unsigned char values */
       errAbort("line %d of %s begins with %c, not allowed",
	    lf->lineIx, lf->fileName, sub);
    type = line[1];
    rest = trimSpaces(line+2);
    if (sub == '*' && type == 'a')
	geneSym = cloneString(rest);
    else if (sub == '*' && type == 'e')
        geneName = cloneString(rest);
    else if (sub == '*' && type == 'z')
	{
        geneId = cloneString(rest); 
	if (!startsWith("FBgn", geneId))
	    errAbort("Bad FlyBase gene ID %s line %d of %s", geneId, 
		lf->lineIx, lf->fileName);
	}
    else if (type == 'i' && (sub == '*' || sub == '$'))
	{
	if (strlen(rest) > 2)	/* Avoid short useless ones. */
	    slNameStore(&synList, rest);
	}
    else if (sub == '*' && type == 'A')
        {
	if (geneId == NULL)
	    errAbort("Allele before geneId line %d of %s", 
	    	lf->lineIx, lf->fileName);
	curAllele = ++nextAlleleId;
	fprintf(fAllele, "%d\t%s\t%s\n", curAllele, geneId, rest);
	/* These three generic allele labels are not stored as synonyms. */
	if (!sameString(rest, "classical") &&
	    !sameString(rest, "in vitro") &&
	    !sameString(rest, "wild-type") )
	    {
	    slNameStore(&synList, rest);
	    }
	}
    else if (sub == '*' && type == 'm')
	{
	if (geneId == NULL)
	    errAbort("*m protein ID before geneId line %d of %s", 
	    	lf->lineIx, lf->fileName);
	if (startsWith("UniProt", rest))
	    {
	    /* The ID is whatever follows the first ':' on the line. */
	    char *ptr = strchr(rest, ':');
	    if (ptr != NULL)
		ptr++;
	    else
		errAbort("Trouble parsing UniProt ID %s like %d of %s",
			 rest, lf->lineIx, lf->fileName);
	    fprintf(fUniProt, "%s\t%s\n", geneId, ptr);
	    }
	}
    else if (type == 'E')
        {
	/* References are numbered in order of first appearance; the hash
	 * makes a repeated reference reuse its earlier id. */
	ref = hashFindVal(refHash, rest);
	if (ref == NULL)
	    {
	    AllocVar(ref);
	    ref->id = ++nextRefId;
	    hashAdd(refHash, rest, ref);
	    subChar(rest, '\t', ' ');
	    fprintf(fRef, "%d\t%s\n", ref->id, rest);
	    }
	curRef = ref->id;
	}
    else if ((type == 'k' || type == 'r' || type == 'p') && sub != '@')
        {
	/* Type 'r' goes to the role table, 'k' and 'p' to phenotype. */
	FILE *f = (type == 'r' ? fRole : fPhenotype);
	struct dyString *dy = suckSameLines(lf, line);
	subChar(dy->string, '\t', ' ');
	if (geneId == NULL)
	    errAbort("Expecting *z in record before line %d of %s",
	    	lf->lineIx, lf->fileName);
	fprintf(f, "%s\t%d\t%d\t%s\n", geneId, curAllele, curRef, dy->string);
	dyStringFree(&dy);
	}
    else if (type == 'd' || type == 'f' || type == 'F')
	{
	FILE *f = fGo;
	char aspect = (type == 'd') ? 'P' : (type == 'f') ? 'C' : 'F';
	char *goId = rest;
	char *p = strstr(goId, " ; ");
	char assoc[128];
	/* Lines without a " ; " separator carry no GO id and are skipped. */
	if (p == NULL)
	    continue;
	else
	    goId = firstWordInLine(p + 3);
	/* Same guard the other geneId-using branches have; without it a
	 * NULL geneId would be passed to a %s conversion. */
	if (geneId == NULL)
	    errAbort("Expecting *z in record before line %d of %s",
	    	lf->lineIx, lf->fileName);
	/* Emit each gene/GO id pair once only. */
	safef(assoc, sizeof(assoc), "%s.%s", geneId, goId);
	if (hashLookup(goUniqHash, assoc) == NULL)
	    {
	    hashAddInt(goUniqHash, assoc, 1);
	    fprintf(f, "%s\t%s\t%c\n", geneId, goId, aspect);
	    }
	}
    }
printf("Processed %d records in %d lines\n", recordCount, lf->lineIx);
lineFileClose(&lf);

conn = sqlConnect(database);
remakeTables(conn);

if (doLoad)
    {
    printf("Loading %s\n", tGene);
    hgLoadTabFile(conn, tabDir, tGene, &fGene);
    if (doTranscript)
	{
	printf("Loading %s\n", tTranscript);
	hgLoadTabFile(conn, tabDir, tTranscript, &fTranscript);
	}
    printf("Loading %s\n", tSynonym);
    hgLoadTabFile(conn, tabDir, tSynonym, &fSynonym);
    printf("Loading %s\n", tAllele);
    hgLoadTabFile(conn, tabDir, tAllele, &fAllele);
    printf("Loading %s\n", tRef);
    hgLoadTabFile(conn, tabDir, tRef, &fRef);
    printf("Loading %s\n", tRole);
    hgLoadTabFile(conn, tabDir, tRole, &fRole);
    printf("Loading %s\n", tPhenotype);
    hgLoadTabFile(conn, tabDir, tPhenotype, &fPhenotype);
    printf("Loading %s\n", tGo);
    hgLoadTabFile(conn, tabDir, tGo, &fGo);
    printf("Loading %s\n", tUniProt);
    hgLoadTabFile(conn, tabDir, tUniProt, &fUniProt);
    hgRemoveTabFile(tabDir, tGene);
    if (doTranscript)
	hgRemoveTabFile(tabDir, tTranscript);
    hgRemoveTabFile(tabDir, tSynonym);
    hgRemoveTabFile(tabDir, tAllele);
    hgRemoveTabFile(tabDir, tRef);
    hgRemoveTabFile(tabDir, tRole);
    hgRemoveTabFile(tabDir, tPhenotype);
    hgRemoveTabFile(tabDir, tGo);
    hgRemoveTabFile(tabDir, tUniProt);
    }
}
struct hash *loadMotifWeights(struct sqlConnection *conn, char *fileName, 
	char *table)
/* Load in XML weight motif file and save it in tab-separated format
 * and in hash keyed by motif name.
 *   conn     - connection the table is remade and loaded on.
 *   fileName - motif file handed to esmMotifsLoad().
 *   table    - name of the table (and tab file in tmpDir) to create.
 * Returns a hash of struct dnaMotif keyed by the fixMotifName()ed name.
 * Each position carries four ';'-separated weights; exp(weight) is
 * normalized over the four values to give the a/c/g/t probabilities. */
{
struct esmMotifs *motifs = esmMotifsLoad(fileName);
struct esmMotif *motif;
FILE *f = hgCreateTabFile(tmpDir, table);
struct dyString *dy = dyStringNew(512);
struct hash *hash = newHash(16);

for (motif = motifs->esmMotif; motif != NULL; motif = motif->next)
    {
    struct esmWeights *weights = motif->esmWeights;
    int posCount = slCount(weights->esmPosition);
    struct esmPosition *pos;
    struct dnaMotif *dm;
    char name[64];


    fixMotifName(motif->Name, name, sizeof(name));
    AllocVar(dm);
    dm->name = cloneString(name);
    dm->columnCount = posCount;
    AllocArray(dm->aProb, posCount);
    AllocArray(dm->cProb, posCount);
    AllocArray(dm->gProb, posCount);
    AllocArray(dm->tProb, posCount);
    for (pos = weights->esmPosition; pos != NULL; pos = pos->next)
        {
	/* One spare slot so that a fifth value shows up as rowSize 5. */
	char *row[5];
	double odds[4], sumOdds = 0;
	int i;

	int ix = pos->Num;
	int rowSize = chopString(pos->Weights, ";", row, ArraySize(row));
	if (rowSize != 4)
	    errAbort("Expecting 4 values for weights in position %d of Motif %s",
               pos->Num, motif->Name);
	/* ix indexes the posCount-sized arrays, so both ends are checked. */
	if (ix < 0 || ix >= posCount)
	    errAbort("Num %d out of range in Motif %s", ix, motif->Name);
	for (i=0; i<4; ++i)
	    {
	    /* Each base uses its own weight; reading row[0] for all four
	     * would make every probability 0.25. */
	    odds[i] = exp(atof(row[i]));
	    sumOdds += odds[i];
	    }
	dm->aProb[ix] = odds[0]/sumOdds;
	dm->cProb[ix] = odds[1]/sumOdds;
	dm->gProb[ix] = odds[2]/sumOdds;
	dm->tProb[ix] = odds[3]/sumOdds;
	}
    dnaMotifTabOut(dm, f);
    hashAdd(hash, dm->name, dm);
    }
dyStringPrintf(dy,
"CREATE TABLE %s (\n"
"    name varchar(16) not null,	# Motif name.\n"
"    columnCount int not null,	# Count of columns in motif.\n"
"    aProb longblob not null,	# Probability of A's in each column.\n"
"    cProb longblob not null,	# Probability of C's in each column.\n"
"    gProb longblob not null,	# Probability of G's in each column.\n"
"    tProb longblob not null,	# Probability of T's in each column.\n"
"              #Indices\n"
"    PRIMARY KEY(name)\n"
")\n", table);
sqlRemakeTable(conn, table, dy->string);
hgLoadTabFile(conn, tmpDir, table, &f);
hgRemoveTabFile(tmpDir, table);
verbose(1, "Processed %d motifs into %s\n", slCount(motifs->esmMotif), table);
return hash;
}
Пример #29
0
void hgLoadNetDist(char *inTab, char *db, char *outTable)
/* hgLoadNetDist - Copy the three tab-separated columns of inTab (two ids and
 * a distance) into a tab file and load it as outTable in db.
 * When the file-level sqlRemap flag is set, both ids are looked up in
 * aliasHash and one output row is written for every combination of their
 * remapped values; ids are passed to handleMissing() along with their lookup
 * result, and the returned counts are summed to decide whether missing.tab
 * is kept. */
{
char *tempDir = ".";
FILE *tabFile = hgCreateTabFile(tempDir, outTable);
struct sqlConnection *hConn = sqlConnect(db);
FILE *missingFile = NULL;
int missingCount = 0;
struct lineFile *lf = NULL;
char *cols[3];
int colCount = 3;

if (sqlRemap)
    {
    fetchRemapInfo(db);
    missingHash = newHash(16);  
    missingFile = mustOpen("missing.tab","w");
    }

/* Read the edges and write them straight out, remapping if needed. */
lf = lineFileOpen(inTab, TRUE);
while (lineFileNextRowTab(lf, cols, colCount))
    {
    char *geneI = cols[0];
    char *geneJ = cols[1];
    char *dij = cols[2];

    if (!sqlRemap)
        {
        fprintf(tabFile,"%s\t%s\t%s\n",geneI,geneJ,dij);
        continue;
        }

    /* An id may have several remap values in the hash, so the output is the
     * cross product of both value chains. */
    struct hashEl *hi = hashLookup(aliasHash,geneI);
    struct hashEl *hjFirst = hashLookup(aliasHash,geneJ);
    struct hashEl *hj;
    missingCount += handleMissing(hi, geneI, missingHash, missingFile);
    missingCount += handleMissing(hjFirst, geneJ, missingHash, missingFile);
    for (; hi != NULL; hi = hashLookupNext(hi))
        {
        char *gi = (char *)hi->val;
        for (hj = hjFirst; hj != NULL; hj = hashLookupNext(hj))
            {
            char *gj = (char *)hj->val;
            fprintf(tabFile,"%s\t%s\t%s\n",gi,gj,dij);
            }
        }
    }

lineFileClose(&lf);
carefulClose(&tabFile);    

if (sqlRemap)
    {
    carefulClose(&missingFile);
    /* Keep missing.tab only when it has something to report. */
    if (missingCount != 0)
    	printf("hgLoadNetDist %d id-remapping misses, see missing.tab\n", missingCount);
    else
        unlink("missing.tab");
    }

createTable(hConn, outTable);
hgLoadTabFile(hConn, tempDir, outTable, &tabFile);
hgRemoveTabFile(tempDir, outTable);
}
Пример #30
0
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable)
/* hgExpDistance - Create table that measures expression distance between pairs.
 *   database - database that posTable is read from and outTable is created in.
 *   posTable - table queried for the name, expCount and expScores columns.
 *   expTable - not referenced anywhere in this function.
 *   outTable - name of the table to create; also names the temporary tab file.
 * Relies on geneList, geneCount, weights, f, numThreads and mutexfilehandle,
 * none of which are declared locally (presumably file-scope variables).
 * Calls errAbort if experiment counts are inconsistent, if no genes remain,
 * or if a thread cannot be created or joined. */
{
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;
char query[256];
char **row;
struct hash *expHash = hashNew(16);
int realExpCount = -1;
struct microData *gene;
int rc, t;
pthread_t *threads = NULL;
pthread_attr_t attr;
int *threadID = NULL;
char *tempDir = ".";
long time1, time2;

time1 = clock1000();

/* Get list/hash of all items with expression values.  Only the first row
 * seen for a given name is kept; later duplicates are ignored. */
sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *name = row[0];
    if (!hashLookup(expHash, name))
        {
        int expCount = sqlUnsigned(row[1]);
        int commaCount;
        float *expScores = NULL;

        sqlFloatDynamicArray(row[2], &expScores, &commaCount);
        if (expCount != commaCount)
            errAbort("expCount and expScores don't match on %s in %s", name, posTable);
        /* The first row fixes the experiment count every other row must match. */
        if (realExpCount == -1)
            realExpCount = expCount;
        if (expCount != realExpCount)
            errAbort("In %s some rows have %d experiments others %d", 
                name, expCount, realExpCount);
        AllocVar(gene);
        gene->expCount = expCount;
        gene->expScores = expScores;
        hashAddSaveName(expHash, name, gene, &gene->name);
        slAddHead(&geneList, gene);
        }
    }
sqlFreeResult(&sr);
/* The result set is released, so the existing connection is reused below.
 * Opening a second connection here would orphan the first one. */
slReverse(&geneList);
geneCount = slCount(geneList);
printf("Have %d elements in %s\n", geneCount, posTable);

weights = getWeights(realExpCount);

if (optionExists("lookup"))
    geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList);
geneCount = slCount(geneList);
printf("Got %d unique elements in %s\n", geneCount, posTable);

sqlDisconnect(&conn);	/* Disconnect because next step is slow. */


if (geneCount < 1)
    errAbort("ERROR: unique gene count less than one ?");

time2 = clock1000();
verbose(2, "records read time: %.2f seconds\n", (time2 - time1) / 1000.0);

f = hgCreateTabFile(tempDir, outTable);

/* instantiate threads */
AllocArray( threadID, numThreads );
AllocArray( threads, numThreads );
pthread_attr_init( &attr );
pthread_mutex_init( &mutexfilehandle, NULL );
pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE );

for (t = 0; t < numThreads; t++) {
	/* Each worker receives a pointer to its own slot so the index it
	 * reads is not affected by later iterations of this loop. */
	threadID[t] = t;
	rc = pthread_create( &threads[t], &attr, computeDistance, 
						(void *) &threadID[t]);
	if (rc)
		errAbort("ERROR: in pthread_create() %d\n", rc );
} 

/* synchronize all threads; the workers' return values are not used */
for (t = 0; t < numThreads; t++) {
	rc = pthread_join( threads[t], NULL );
	if (rc)
		errAbort("ERROR: in pthread_join() %d\n", rc );
} 

printf("Made %s.tab\n", outTable);

slFreeList( &geneList );

pthread_mutex_destroy( &mutexfilehandle );
pthread_attr_destroy( &attr );

time1 = time2;
time2 = clock1000();
verbose(2, "distance computation time: %.2f seconds\n", (time2 - time1) / 1000.0);

/* Create and load table. */
conn = sqlConnect(database);
distanceTableCreate(conn, outTable);
hgLoadTabFile(conn, tempDir, outTable, &f);
printf("Loaded %s\n", outTable);

/* Add indices. */
sqlSafef(query, sizeof(query), "alter table %s add index(query(12))", outTable);
sqlUpdate(conn, query);
printf("Made query index\n");
if (optionExists("targetIndex"))
    {
    sqlSafef(query, sizeof(query), "alter table %s add index(target(12))", outTable);
    sqlUpdate(conn, query);
    printf("Made target index\n");
    }

/* All database work is finished; release the connection opened for loading. */
sqlDisconnect(&conn);

hgRemoveTabFile(tempDir, outTable);

time1 = time2;
time2 = clock1000();
verbose(2, "table create/load/index time: %.2f seconds\n", (time2 - time1) / 1000.0);

}