void readStsInfo(struct lineFile *sif) /* Read in current stsInfo file */ { struct stsInfo2 *si; struct sts *s; struct primer *p; struct gb *gb; char name[16], *words[52]; int i; stsHash = newHash(20); dbStsIdHash = newHash(20); primerHash = newHash(20); nameHash = newHash(24); gbAccHash = newHash(20); ucscIdHash = newHash(20); /* Read in all rows */ while (lineFileChopTab(sif, words)) { si = stsInfo2Load(words); /* Determine next ucsc id to be used */ if (si->identNo >= nextUcscId) nextUcscId = si->identNo + 1; /* Create sts struct */ if (sameString(si->organism, "H**o sapiens\0")) { AllocVar(s); s->next = NULL; s->si = si; s->fa = NULL; s->mapped = isMapped(si); s->dbstsIdExists = FALSE; slAddHead(&sList, s); safef(name, ArraySize(name), "%d", si->identNo); hashAdd(stsHash, name, s); /* Add ids to dbStsIdHash */ if (si->dbSTSid) hashAddInt(dbStsIdHash, name, si->dbSTSid); /* Add sts records to ucscId hash */ if (si->dbSTSid) { safef(name, ArraySize(name), "%d", si->dbSTSid); hashAdd(ucscIdHash, name, s); } for (i = 0; i < si->otherDbstsCount; i++) { safef(name, ArraySize(name), "%d", si->otherDbSTS[i]); if (!hashLookup(ucscIdHash, name)) hashAdd(ucscIdHash, name, s); } /* Add names to name hash and genbank hash */ hashAdd(nameHash, si->name, s); for (i = 0; i < si->gbCount; i++) { hashAdd(nameHash, si->genbank[i], s); AllocVar(gb); gb->next = NULL; gb->acc = cloneString(si->genbank[i]); gb->s = s; gb->gbSeq = FALSE; hashAdd(gbAccHash, gb->acc, gb); } for (i = 0; i < si->gdbCount; i++) hashAdd(nameHash, si->gdb[i], s); for (i = 0; i < si->nameCount; i++) hashAdd(nameHash, si->otherNames[i], s); /* Create primer info if available and add to hash */ if (differentString(si->leftPrimer, "\0")) { AllocVar(p); p->next = NULL; p->dbStsId = si->dbSTSid; p->left = cloneString(si->leftPrimer); p->right = cloneString(si->rightPrimer); p->dist = cloneString(si->distance); p->ucscId = si->identNo; safef(name, ArraySize(name), "%d", p->dbStsId); hashAdd(primerHash, name, p); } } else { stsInfo2Free(&si); } } }
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table, struct hash *geneToModuleHash, struct hash *moduleAndMotifHash, struct hash *motifHash, struct hash *positionsHash, char *regionTable) /* Load file which is a big matrix with genes for rows and motifs for * columns. There is a semicolon-separated list of numbers in the matrix * where a gene has the motif, and an empty (tab separated) field * where there is no motif. The numbers are relative to the * region associated with the gene in the positionsHash. * Only load bits of this where motif actually occurs in module associated * with gene. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; FILE *f = hgCreateTabFile(tmpDir, table); char *motifNames[32*1024], *row[32*1024]; int motifCount, rowSize, i; char *gene, *module; int geneCount = 0, total = 0; struct dyString *dy = dyStringNew(512); struct genomePos *motifPosList = NULL, *motifPosForGene; struct genomePos *regionPosList = NULL, *regionPos; /* Read first line, which is labels. */ if (!lineFileNextReal(lf, &line)) errAbort("Empty file %s", fileName); subChar(line, ' ', '_'); motifCount = chopLine(line, motifNames); if (motifCount >= ArraySize(motifNames)) errAbort("Too many motifs line 1 of %s", fileName); lineFileExpectAtLeast(lf, 2, motifCount); motifNames[0] = NULL; for (i=1; i<motifCount; ++i) { char name[64]; motifNames[i] = cloneString(fixMotifName(motifNames[i],name,sizeof(name))); if (!hashLookup(motifHash, motifNames[i])) errAbort("Motif %s is in %s but not modules_motifs.gxm", motifNames[i], fileName); } /* Read subsequent lines. */ while ((rowSize = lineFileChopTab(lf, row)) != 0) { lineFileExpectWords(lf, motifCount, rowSize); gene = row[0]; module = hashFindVal(geneToModuleHash, gene); if (module == NULL) { warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", gene, lf->lineIx, lf->fileName); continue; } regionPos = NULL; for (i=1; i<rowSize; ++i) { if (row[i][0] != 0) { if (hashLookup2(moduleAndMotifHash, module, motifNames[i])) { regionPos = hashFindVal(positionsHash, gene); if (regionPos == NULL) { warn("WARNING: %s in %s but not gene_positions.tab", gene, fileName); i = rowSize; continue; } motifPosForGene = convertMotifPos(row[i], regionPos, hashMustFindVal(motifHash, motifNames[i]), lf); motifPosList = slCat(motifPosForGene, motifPosList); ++total; } } } if (regionPos != NULL) { slAddHead(®ionPosList, regionPos); } ++geneCount; } lineFileClose(&lf); /* Output sorted table of all motif hits. */ { struct genomePos *pos; slSort(&motifPosList, genomePosCmp); for (pos = motifPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->motif); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\t", pos->strand); fprintf(f, "%s\n", pos->name); } sqlDyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " gene varchar(255) not null,\n" " #Indices\n" " INDEX(gene(12)),\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", table); sqlRemakeTable(conn, table, dy->string); verbose(1, "%d genes, %d motifs, %d motifs in genes\n", geneCount, motifCount-1, total); hgLoadTabFile(conn, tmpDir, table, &f); // hgRemoveTabFile(tmpDir, table); verbose(1, "Loaded %s table\n", table); slFreeList(&motifPosList); } /* Now output sorted table of upstream regions. */ { FILE *f = hgCreateTabFile(tmpDir, regionTable); struct genomePos *pos; dyStringClear(dy); sqlDyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " #Indices\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", regionTable); sqlRemakeTable(conn, regionTable, dy->string); slSort(®ionPosList, genomePosCmp); for (pos = regionPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->name); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\n", pos->strand); } hgLoadTabFile(conn, tmpDir, regionTable, &f); // hgRemoveTabFile(tmpDir, regionTable); } }
void readZfinMapping(struct lineFile *zmf) /* Read in ZFIN IDs and mapping panel data */ { char *words[6], *name = NULL, *nameLower = NULL, *id = NULL, *panel = NULL; char *chrom = NULL, *units = NULL; double pos; int i = 0, j, index = -1, k; struct zfin *zf = NULL; struct panel *p = NULL; boolean foundPos = FALSE; zfinMarkerHash = newHash(16); zfinIdHash = newHash(16); while (lineFileChopTab(zmf, words) ) { /* get ZFIN Id, name and panel */ id = cloneString(words[0]); name = cloneString(words[1]); nameLower = cloneString(words[1]); panel = cloneString(words[2]); chrom = cloneString(words[3]); pos = sqlDouble(words[4]); units = cloneString(words[5]); /* check if this name exists already in hash */ touppers(name); zf = hashFindVal(zfinMarkerHash, name); if (zf == NULL) { /* allocate memory for zfin struct if not in hash */ AllocVar(zf); /* find panel and add marker information */ /* first initialize */ zf->panelArray = needMem(sizeof(struct panel*) * NUMPANELS); for (i = 0; i < NUMPANELS; i++) { zf->panelArray[i] = NULL; } zf->zfinId = cloneString(id); zf->name = cloneString(nameLower); zf->acc = NULL; zf->zfAlias = NULL; } /* find index number for this panel */ for (j = 0; j < NUMPANELS; j++) { if (sameString(panel, markerPanels[j])) index = j; } if (zf->panelArray[index] == NULL) { /* allocate memory for panel struct for panel data */ AllocVar(p); /* initialize chrom and pos arrays */ for (k = 0; k < NUMPOS; k++) { p->chrom[k] = NULL; p->pos[k] = -1; } p->name = cloneString(nameLower); p->panel = cloneString(panel); p->units = cloneString(units); } else p = zf->panelArray[index]; foundPos = FALSE; for (k = 0; k < NUMPOS && (!foundPos); k++) { if (p->chrom[k] == NULL) { if (!sameString(p->name, nameLower)) fprintf(stderr, "The entry in position %d does not match %s\n", index, nameLower); else { p->chrom[k] = cloneString(chrom); p->pos[k] = pos; foundPos = TRUE; } } } zf->panelArray[index] = p; /* name is already in upper case, add zfin to zfinMarkerHash */ addHashElUnique(zfinMarkerHash, name, zf); /* if new ZFIN struct add the name to the zfinIdHash keyed by ZFIN ID */ if (hashFindVal(zfinIdHash, id) == NULL) addHashElUnique(zfinIdHash, id, nameLower); } }