void makeMotifs(char *inFile, struct hash *tfHash, char *outFile) /* Parse input motifs and save them to outFile in dnaMotif format. */ { struct lineFile *lf = lineFileOpen(inFile, TRUE); FILE *f = mustOpen(outFile, "w"); struct hashEl *hel; for (;;) { char *line; char *words[256], *word; int wordCount; struct dnaMotif *motif; if (!lineFileSkipTo(lf, "Probability matrix for")) break; lineFileNeedNext(lf, &line, NULL); wordCount = chopLine(line, words); if (wordCount >= ArraySize(words)) errAbort("Line %d of %s is too long\n", lf->lineIx, lf->fileName); if (!sameString(words[0], "#")) badFormat(lf); AllocVar(motif); motif->columnCount = wordCount-1; readBaseProbs(lf, words, "#A", &motif->aProb, motif->columnCount); readBaseProbs(lf, words, "#C", &motif->cProb, motif->columnCount); readBaseProbs(lf, words, "#T", &motif->tProb, motif->columnCount); readBaseProbs(lf, words, "#G", &motif->gProb, motif->columnCount); if (!lineFileSkipTo(lf, "Source:")) lineFileUnexpectedEnd(lf); lineFileReuse(lf); lineFileNeedNext(lf, &line, NULL); word = nextWord(&line); word = nextWord(&line); if (word == NULL) errAbort("Short Source: line %d of %s", lf->lineIx, lf->fileName); motif->name = cloneString(word); hel = hashLookup(tfHash, motif->name); if (hel == NULL) errAbort("%s in %s but not GFFs", motif->name, lf->fileName); hel->val = motif; dnaMotifTabOut(motif, f); } carefulClose(&f); lineFileClose(&lf); }
void iriToDnaMotif(char *inName, char *outName)
/* iriToDnaMotif - Convert improbRunInfo to dnaMotif.
 * Loads all improbRunInfo records from inName and passes a dnaMotif view
 * of each one to dnaMotifTabOut, writing to outName. */
{
FILE *f = mustOpen(outName, "w");
/* Static storage, so any dnaMotif field not assigned below is zero. */
static struct dnaMotif motif;
struct improbRunInfo *iriList = improbRunInfoLoadAll(inName);
struct improbRunInfo *iri;

for (iri = iriList; iri != NULL; iri = iri->next)
    {
    /* The motif only borrows the record's name and probability arrays;
     * nothing is copied. */
    motif.name = iri->name;
    motif.columnCount = iri->columnCount;
    motif.aProb = iri->aProb;
    motif.cProb = iri->cProb;
    motif.gProb = iri->gProb;
    motif.tProb = iri->tProb;
    dnaMotifTabOut(&motif, f);
    }

/* Close the output explicitly so it is not left open on return. */
carefulClose(&f);
}
void emblMatrixToMotif(char *inName, char *outName)
/* emblMatrixToMotif - Convert transfac matrix in EMBL format to dnaMotif.
 * Reads records from inName one at a time.  A record is converted and
 * written to outName only when it has both an "AC" and a "P0" field and
 * orgFits() accepts it; other records are skipped. */
{
struct hash *hash = NULL;
struct lineFile *lf = emblOpen(inName, NULL);
FILE *f = mustOpen(outName, "w");
struct dnaMotif *motif;

while ((hash = emblRecord(lf)) != NULL)
    {
    char *ac = hashFindVal(hash, "AC");
    char *po = hashFindVal(hash, "P0");	/* Key is P-zero, not P-oh. */

    if (ac != NULL && po != NULL && orgFits(hash))
        {
        motif = emblToMotif(ac, hash);
        dnaMotifTabOut(motif, f);
        dnaMotifFree(&motif);
        }
    /* NOTE(review): the record hash is not released before the next
     * emblRecord call - confirm against emblRecord's ownership contract
     * whether it should be freed here. */
    }

/* Release the output file and the input reader before returning. */
carefulClose(&f);
lineFileClose(&lf);
}
struct hash *loadMotifWeights(struct sqlConnection *conn, char *fileName, char *table) /* Load in XML weight motif file and save it in tab-separated format * and in hash keyed by motif name. */ { struct esmMotifs *motifs = esmMotifsLoad(fileName); struct esmMotif *motif; FILE *f = hgCreateTabFile(tmpDir, table); struct dyString *dy = dyStringNew(512); struct hash *hash = newHash(16); for (motif = motifs->esmMotif; motif != NULL; motif = motif->next) { struct esmWeights *weights = motif->esmWeights; int posCount = slCount(weights->esmPosition); struct esmPosition *pos; struct dnaMotif *dm; char name[64]; fixMotifName(motif->Name, name, sizeof(name)); AllocVar(dm); dm->name = cloneString(name); dm->columnCount = posCount; AllocArray(dm->aProb, posCount); AllocArray(dm->cProb, posCount); AllocArray(dm->gProb, posCount); AllocArray(dm->tProb, posCount); for (pos = weights->esmPosition; pos != NULL; pos = pos->next) { char *row[5]; double odds[4], sumOdds = 0; int i; int ix = pos->Num; int rowSize = chopString(pos->Weights, ";", row, ArraySize(row)); if (rowSize != 4) errAbort("Expecting 4 values for weights in position %d of Motif %s", pos->Num, motif->Name); if (ix >= posCount) errAbort("Num %d out of range in Motif %s", ix, motif->Name); for (i=0; i<4; ++i) { odds[i] = exp(atof(row[0])); sumOdds += odds[i]; } dm->aProb[ix] = odds[0]/sumOdds; dm->cProb[ix] = odds[1]/sumOdds; dm->gProb[ix] = odds[2]/sumOdds; dm->tProb[ix] = odds[3]/sumOdds; } dnaMotifTabOut(dm, f); hashAdd(hash, dm->name, dm); } dyStringPrintf(dy, "CREATE TABLE %s (\n" " name varchar(16) not null, # Motif name.\n" " columnCount int not null, # Count of columns in motif.\n" " aProb longblob not null, # Probability of A's in each column.\n" " cProb longblob not null, # Probability of C's in each column.\n" " gProb longblob not null, # Probability of G's in each column.\n" " tProb longblob not null, # Probability of T's in each column.\n" " #Indices\n" " PRIMARY KEY(name)\n" ")\n", table); 
sqlRemakeTable(conn, table, dy->string); hgLoadTabFile(conn, tmpDir, table, &f); hgRemoveTabFile(tmpDir, table); verbose(1, "Processed %d motifs into %s\n", slCount(motifs->esmMotif), table); return hash; }