void readFinfFiles(char *gsDir) /* Read in .finf files and save info in cloneHash/cloneList. */ { struct lineFile *lf; struct clone *clone = NULL; struct endInfo *end; char fileName[512]; int i; char *words[7]; char lastClone[64]; char cloneName[64]; int gsInfoCount = 0; struct frag *frag; boolean isFin; char *s, *e; strcpy(lastClone, ""); for (i=0; i<ArraySize(gsFiles); ++i) { isFin = (i <= 0); sprintf(fileName, "%s/%s", gsDir, gsFiles[i]); printf("Reading info from %s\n", fileName); lf = lineFileOpen(fileName, TRUE); while (lineFileRow(lf, words)) { if (!sameString(words[1], lastClone)) { struct clone *oldClone; strcpy(lastClone, words[1]); strcpy(cloneName, words[1]); AllocVar(clone); s = strchr(cloneName, '.'); if (s == NULL) errAbort("Bad clone name format line %d of %s\n", lf->lineIx, lf->fileName); if (strlen(s) >= sizeof(clone->version)) errAbort("Bad clone name format line %d of %s\n", lf->lineIx, lf->fileName); strcpy(clone->version, s); chopSuffix(cloneName); clone->size = atoi(words[3]); if ((oldClone = hashFindVal(cloneHash, cloneName)) != NULL) { if (isFin && clone->size == oldClone->size && sameString(clone->version, oldClone->version)) warn("Apparently benign duplication of %s line %d of %s", cloneName, lf->lineIx, lf->fileName); else warn("%s duplicated line %d of %s (size %d oldSize %d)", cloneName, lf->lineIx, lf->fileName, clone->size, oldClone->size); } hashAddSaveName(cloneHash, cloneName, clone, &clone->name); clone->isFin = isFin; slAddHead(&cloneList, clone); } frag = newFrag(words[0], lf); slAddTail(&clone->fragList, frag); ++clone->fragCount; if (!clone->isFin && !sameString(words[6], "?") && !sameString(words[6], "i") && !sameString(words[6], "w")) { char *s = strchr(words[0], '~'); char c; if (s == NULL) errAbort("Expecting ~ in fragment name line %d of %s\n", lf->lineIx, lf->fileName); ++s; AllocVar(end); end->contig = cloneString(s); subChar(s, '.', '_'); end->text = cloneString(words[6]); c = lastChar(end->text); if (!(c == 'L' || c 
== 'R')) c = '?'; end->lr = c; slAddHead(&clone->gsList, end); ++gsInfoCount; } } lineFileClose(&lf); } printf("Found %d ends in %d clones\n", gsInfoCount, slCount(cloneList)); }
static void tfBindLevelSection(struct tfData *tfList, struct sqlConnection *conn, char *motifTable, char *tfToConditionTable) /* Print info on individual transcription factors that bind * with e-val between minVal and maxVal. */ { struct tfData *tf; struct transRegCode *trc; webNewSection("Transcription Factors Showing IP Over this Probe "); hTableStart(); printf("<TR>"); colLabel("Transcription", 1); colLabel("Growth Condition", 3); colLabel("Motif Information", 3); printf("</TR>\n"); printf("<TR>"); colLabel("Factor", 1); colLabel("Good IP (P<0.001)", 1); colLabel("Weak IP (P<0.005)", 1); colLabel("No IP (P>0.005)", 1); colLabel("Hits", 1); colLabel("Scores", 1); colLabel("Conservation (2 max)", 1); printf("</TR>\n"); for (tf = tfList; tf != NULL; tf = tf->next) { struct hash *boundHash = newHash(8); slSort(&tf->conditionList, tfCondCmpName); printf("<TR>"); /* Print transcription name. */ printf("<TD>"); sacCerHgGeneLinkName(conn, tf->name); printf("</TD>"); /* Print stong and weak growth conditions. */ ipPrintInRange(tf->conditionList, 0.0, 0.002, boundHash); ipPrintInRange(tf->conditionList, 0.002, 0.006, boundHash); /* Grab list of all conditions tested from database and * print out ones not in strong or weak as none. */ { char query[256], **row; struct sqlResult *sr; boolean isFirst = TRUE; boolean gotAny = FALSE; sqlSafef(query, sizeof(query), "select growthCondition from %s where name='%s'", tfToConditionTable, tf->name); sr = sqlGetResult(conn, query); printf("<TD>"); while ((row = sqlNextRow(sr)) != NULL) { if (!hashLookup(boundHash, row[0])) { if (isFirst) isFirst = FALSE; else printf(", "); printf("%s", row[0]); gotAny = TRUE; } } sqlFreeResult(&sr); if (!gotAny) printf(" "); printf("</TD>"); } /* Print motif info. */ if (tf->trcList == NULL) printf("<TD>0</TD><TD>n/a</TD><TD>n/a</TD>\n"); else { printf("<TD>%d</TD>", slCount(tf->trcList)); /* Print scores. 
*/ printf("<TD>"); for (trc = tf->trcList; trc != NULL; trc = trc->next) { double score; if (trc != tf->trcList) printf(", "); score = motifScoreHere( trc->chrom, trc->chromStart, trc->chromEnd, trc->name, motifTable); transRegCodeAnchor(trc); printf("%3.1f</A>", score); } printf("</TD><TD>"); for (trc = tf->trcList; trc != NULL; trc = trc->next) { if (trc != tf->trcList) printf(", "); printf("%d", trc->consSpecies); } printf("</TD>"); } printf("</TR>\n"); hashFree(&boundHash); } hTableEnd(); }
void printCloneStart(struct clone *clone, FILE *f)
/* Write the leading, tab-terminated columns of a clone output line to f:
 * name, version, phase (prefixed H), gsList length (prefixed G) and
 * spList length (prefixed S). */
{
int gsCount = slCount(clone->gsList);
int spCount = slCount(clone->spList);

fprintf(f, "%s\t%s\tH%d\tG%d\tS%d\t",
    clone->name, clone->version, clone->phase, gsCount, spCount);
}
char *visiGeneHypertextGenotype(struct sqlConnection *conn, int id) /* Return genotype of organism if any in nifty hypertext format. */ { int genotypeId; struct slName *geneIdList, *geneId; char query[256]; struct dyString *html; /* Look up genotype ID. */ sqlSafef(query, sizeof(query), "select specimen.genotype from image,specimen " "where image.id=%d and image.specimen = specimen.id", id); genotypeId = sqlQuickNum(conn, query); if (genotypeId == 0) return NULL; /* Get list of genes involved. */ sqlSafef(query, sizeof(query), "select distinct allele.gene from genotypeAllele,allele " "where genotypeAllele.genotype=%d " "and genotypeAllele.allele = allele.id" , genotypeId); geneIdList = sqlQuickList(conn, query); if (geneIdList == NULL) return cloneString("wild type"); /* Loop through each gene adding information to html. */ html = dyStringNew(0); for (geneId = geneIdList; geneId != NULL; geneId = geneId->next) { char *geneName; struct slName *alleleList, *allele; int alleleCount; boolean needsSlash = FALSE; /* Get gene name. */ sqlSafef(query, sizeof(query), "select name from gene where id='%s'", geneId->name); geneName = sqlQuickString(conn, query); if (geneName == NULL) internalErr(); /* Process each allele of gene. 
*/ sqlSafef(query, sizeof(query), "select allele.name from genotypeAllele,allele " "where genotypeAllele.genotype=%d " "and genotypeAllele.allele = allele.id " "and allele.gene=%s" , genotypeId, geneId->name); alleleList = sqlQuickList(conn, query); alleleCount = slCount(alleleList); for (allele = alleleList; allele != NULL; allele = allele->next) { char *simplifiedAllele = getSimplifiedAllele(geneName, allele->name); int repCount = 1, rep; if (alleleCount == 1) repCount = 2; for (rep = 0; rep < repCount; ++rep) { if (needsSlash) dyStringAppendC(html, '/'); else needsSlash = TRUE; dyStringAppend(html, geneName); dyStringPrintf(html, "<SUP>%s</SUP>", simplifiedAllele); } freeMem(simplifiedAllele); } if (geneId->next != NULL) dyStringAppendC(html, ' '); slFreeList(&alleleList); freeMem(geneName); } slFreeList(&geneIdList); return dyStringCannibalize(&html); }
int bbiWriteZoomLevels(
    struct lineFile *lf,    /* Input file. */
    FILE *f,                /* Output. */
    int blockSize,          /* Size of index block */
    int itemsPerSlot,       /* Number of data points bundled at lowest level. */
    bbiWriteReducedOnceReturnReducedTwice writeReducedOnceReturnReducedTwice,   /* callback */
    int fieldCount,         /* Number of fields in bed (4 for bedGraph) */
    boolean doCompress,     /* Do we compress.  Answer really should be yes! */
    bits64 dataSize,        /* Size of data on disk (after compression if any). */
    struct bbiChromUsage *usageList, /* Result from bbiChromUsageFromBedFile */
    int resTryCount, int resScales[], int resSizes[],   /* How much to zoom at each level */
    bits32 zoomAmounts[bbiMaxZoomLevels],      /* Fills in amount zoomed at each level. */
    bits64 zoomDataOffsets[bbiMaxZoomLevels],  /* Fills in where data starts for each zoom level. */
    bits64 zoomIndexOffsets[bbiMaxZoomLevels], /* Fills in where index starts for each level. */
    struct bbiSummaryElement *totalSum)
/* Write out all the zoom levels and return the number of levels written.  Writes
 * actual zoom amount and the offsets of the zoomed data and index in the last three
 * parameters.  Sorry for all the parameters - it was this or duplicate a big chunk of
 * code between bedToBigBed and bedGraphToBigWig.
 *
 * The first level is produced by the callback, which also returns the list for
 * the next level.  Further levels are written until the reduced list stops
 * shrinking or bbiMaxZoomLevels is reached.  At least one level is always
 * written, so the return value is between 1 and bbiMaxZoomLevels. */
{
/* Write out first zoomed section while storing in memory next zoom level. */
assert(resTryCount > 0);
/* Kept 64 bit: dataSize can exceed what an int holds, and the value is
 * compared against a bits64 below. */
bits64 maxReducedSize = dataSize/2;
int initialReduction = 0, initialReducedCount = 0;

/* Figure out initialReduction for zoom - one that is maxReducedSize or less. */
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    {
    bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
    if (doCompress)
        reducedSize /= 2;       // Estimate!
    if (reducedSize <= maxReducedSize)
        {
        initialReduction = resScales[resTry];
        initialReducedCount = resSizes[resTry];
        break;
        }
    }
verbose(2, "initialReduction %d, initialReducedCount = %d\n",
    initialReduction, initialReducedCount);

/* Force there to always be at least one zoom.  It may waste a little space on small
 * files, but it makes files more uniform, and avoids special case code for calculating
 * overall file summary. */
if (initialReduction == 0)
    {
    initialReduction = resScales[0];
    initialReducedCount = resSizes[0];
    }

/* Call routine to make the initial zoom level and also a bit of work towards further levels. */
struct lm *lm = lmInit(0);
int zoomIncrement = bbiResIncrement;
lineFileRewind(lf);
struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList, fieldCount,
        lf, initialReduction, initialReducedCount,
        zoomIncrement, blockSize, itemsPerSlot, doCompress, lm,
        f, &zoomDataOffsets[0], &zoomIndexOffsets[0], totalSum);
verboseTime(2, "writeReducedOnceReturnReducedTwice");
zoomAmounts[0] = initialReduction;
int zoomLevels = 1;

/* Loop around to do any additional levels of zoom. */
int zoomCount = initialReducedCount;
int reduction = initialReduction * zoomIncrement;
while (zoomLevels < bbiMaxZoomLevels)
    {
    /* Stop as soon as a further reduction no longer shrinks the list. */
    int rezoomCount = slCount(rezoomedList);
    if (rezoomCount >= zoomCount)
        break;
    zoomCount = rezoomCount;
    zoomDataOffsets[zoomLevels] = ftell(f);
    zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
        blockSize, itemsPerSlot, doCompress, f);
    zoomAmounts[zoomLevels] = reduction;
    ++zoomLevels;
    reduction *= zoomIncrement;
    rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
    }
lmCleanup(&lm);
verboseTime(2, "further reductions");
return zoomLevels;
}
static struct chrGapList *createGaps(struct bed *bounds) { struct bed *bedEl = NULL; char *prevChr = NULL; struct chrGapList *gaps = NULL; struct gap *prevGap = NULL; struct bed *prevBedEl = NULL; struct chrGapList *curChrList = NULL; int boundingChrCount = 0; int overlappedBounding = 0; for (bedEl = bounds; bedEl != NULL; bedEl = bedEl->next) { /* the first bedEl does not yet start a new gap, must have a second */ if ((NULL == prevChr) || differentWord(prevChr,bedEl->chrom)) { struct chrGapList *cEl; AllocVar(cEl); cEl->chrom = cloneString(bedEl->chrom); cEl->gList = NULL; if (prevChr) { if (NULL == prevGap) { verbose(2,"WARNING: only one element on %s ! No gap defined.\n", prevChr); slPopHead(&gaps); --boundingChrCount; } freeMem(prevChr); } prevChr = cloneString(bedEl->chrom); prevGap = NULL; prevBedEl = bedEl; /* bounding element before first gap */ verbose(4,"new chrom on bounding gap creation %s, adding %#lx\n", prevChr, (unsigned long) cEl); slAddHead(&gaps,cEl); ++boundingChrCount; curChrList = cEl; } else { struct gap *gEl; AllocVar(gEl); gEl->prev = prevGap; /* first one is NULL */ gEl->upstream = prevBedEl; gEl->isUpstreamBound = TRUE; /* bounding element */ gEl->downstream = bedEl; gEl->isDownstreamBound = TRUE; /* bounding element */ gEl->next = NULL; /* not there yet */ if (prevGap == NULL) /* first one is NULL */ { curChrList->gList = gEl; /* starting the list */ } else { prevGap->next = gEl; } prevGap = gEl; /* gapSize is between downstream and upstream */ gEl->gapSize = bedEl->chromStart - prevBedEl->chromEnd; verbose(5,"gap: %s:%d-%d size %d (%d)\n", bedEl->chrom, gEl->upstream->chromEnd, gEl->downstream->chromStart, gEl->gapSize, gEl->downstream->chromStart - gEl->upstream->chromEnd); if (gEl->gapSize < 0) { ++overlappedBounding; if (verboseLevel()>3) { warn("WARNING: overlapping bounding elements at\n\t" "%s:%d-%d <-> %s:%d-%d", prevBedEl->chrom, prevBedEl->chromStart, prevBedEl->chromEnd, bedEl->chrom, bedEl->chromStart, bedEl->chromEnd); } 
gEl->gapSize = 0; } prevBedEl = bedEl; } } if (prevChr) { /* potentially the last one is a single item on a chrom */ if (NULL == prevGap) { verbose(2,"WARNING: only one element on %s ! No gap defined.\n", prevChr); slPopHead(&gaps); --boundingChrCount; } freeMem(prevChr); } slReverse(&gaps); verbose(3,"bounding chrom count: %d (=? %d), overlapped items: %d\n", boundingChrCount, slCount(gaps), overlappedBounding); return(gaps); }
static void randomPlacement(char *bounding, char *placed) { struct bed *boundingElements = bedLoadAll(bounding); struct bed *placeItems = bedLoadAll(placed); struct bed *nearestNeighbors = NULL; int boundingCount = slCount(boundingElements); int placedCount = slCount(placeItems); int neighborCount = 0; struct chrGapList *boundingGaps = NULL; struct chrGapList *duplicateGapList = NULL; struct chrGapList *neighborGaps = NULL; struct statistic *statsList = NULL; struct statistic *statEl = NULL; if (neighbor) { nearestNeighbors = bedLoadAll(neighbor); slSort(&nearestNeighbors, bedCmp); /* order by chrom,chromStart */ neighborCount = slCount(nearestNeighbors); verbose(2, "neighbor element count: %d\n", neighborCount); neighborGaps = createGaps(nearestNeighbors); } slSort(&boundingElements, bedCmp); /* order by chrom,chromStart */ slSort(&placeItems, bedCmp); /* order by chrom,chromStart */ verbose(2, "bounding element count: %d\n", boundingCount); verbose(2, "placed item count: %d\n", placedCount); boundingGaps = createGaps(boundingElements); if (TRUE) /* display initial placement stats only */ { char *neighborName = NULL; if (neighbor) { neighborName = cloneString(neighbor); duplicateGapList = cloneGapList(neighborGaps); } else { neighborName = cloneString(bounding); duplicateGapList = cloneGapList(boundingGaps); } verbose(2,"stats before initial placement: =================\n"); statEl = gapStats(duplicateGapList, (char *)NULL, (char *)NULL, (char *)NULL); printf("statistics on gaps before any placements:\n\t(%s)\n", neighborName); statsPrint(statEl); slAddHead(&statsList,statEl); initialPlacement(duplicateGapList,placeItems); verbose(2,"stats after initial placement: =================\n"); statEl = gapStats(duplicateGapList, zeroBedOutFile, shoulderBedOutFile, distOut); printf("statistics after initial placement of placed items:\n\t(%s)\n", placed); statsPrint(statEl); slAddHead(&statsList,statEl); freeChrList(&duplicateGapList, FALSE); slReverse(&statsList); 
freeMem(neighborName); } if (trials > 0) { int trial; srand48((long int)seed); /* for default seed=0, same set of randoms */ slSort(&placeItems, bedCmpSize); /* order by size of elements */ slReverse(&placeItems); /* largest ones first */ measurePlaced(placeItems); /* show placed item characteristics */ for (trial = 0; trial < trials; ++trial) { struct bed *randomPlacedBedList; duplicateGapList = cloneGapList(boundingGaps); randomPlacedBedList = randomTrial(duplicateGapList,placeItems); if (neighbor) { struct chrGapList *duplicateNeighborList; slSort(&randomPlacedBedList,bedCmp);/*order by chrom,chromStart*/ duplicateNeighborList = cloneGapList(neighborGaps); initialPlacement(duplicateNeighborList,randomPlacedBedList); statEl = gapStats(duplicateNeighborList, (char *)NULL, (char *)NULL, (char *)NULL); freeChrList(&duplicateNeighborList, FALSE); } else statEl = gapStats(duplicateGapList, (char *)NULL, (char *)NULL, (char *)NULL); slAddHead(&statsList,statEl); /* this gap list has temporary bed elements that were * created by the randomTrial(), they need to be freed as * the list is released, hence the TRUE signal. * It isn't a true freeBedList operation because the chrom * names are left intact in the original copy of the bed * list. (The names were being shared.) */ if ((trial == (trials - 1)) && (bedOutFile != NULL)) { bedListOutput(duplicateGapList, bedOutFile); } freeChrList(&duplicateGapList, TRUE); } slReverse(&statsList); statsPrint(statsList); } if (neighbor) { bedFreeList(&nearestNeighbors); freeChrList(&neighborGaps, FALSE); } bedFreeList(&boundingElements); bedFreeList(&placeItems); freeChrList(&boundingGaps, FALSE); }
void encode2Meta(char *database, char *manifestIn, char *outMetaRa) /* encode2Meta - Create meta files.. */ { int dbIx = stringArrayIx(database, metaDbs, ArraySize(metaDbs)); if (dbIx < 0) errAbort("Unrecognized database %s", database); /* Create a three level meta.ra format file based on hgFixed.encodeExp * and database.metaDb tables. The levels are composite, experiment, file */ struct metaNode *metaTree = metaTreeNew("encode2"); /* Load up the manifest. */ struct encode2Manifest *mi, *miList = encode2ManifestShortLoadAll(manifestIn); struct hash *miHash = hashNew(18); for (mi = miList; mi != NULL; mi = mi->next) hashAdd(miHash, mi->fileName, mi); verbose(1, "%d files in %s\n", miHash->elCount, manifestIn); /* Load up encodeExp info. */ struct sqlConnection *expConn = sqlConnect(expDb); struct encodeExp *expList = encodeExpLoadByQuery(expConn, "NOSQLINJ select * from encodeExp"); sqlDisconnect(&expConn); verbose(1, "%d experiments in encodeExp\n", slCount(expList)); struct hash *compositeHash = hashNew(0); /* Go through each organism database in turn. */ int i; for (i=0; i<ArraySize(metaDbs); ++i) { char *db = metaDbs[i]; if (!sameString(database, db)) continue; verbose(1, "exploring %s\n", db); struct mdbObj *mdb, *mdbList = getMdbList(db); verbose(1, "%d meta objects in %s\n", slCount(mdbList), db); /* Get info on all composites. 
*/ for (mdb = mdbList; mdb != NULL; mdb = mdb->next) { char *objType = mdbVarLookup(mdb->vars, "objType"); if (objType != NULL && sameString(objType, "composite")) { char compositeName[256]; safef(compositeName, sizeof(compositeName), "%s", mdb->obj); struct metaNode *compositeNode = metaNodeNew(compositeName); slAddHead(&metaTree->children, compositeNode); compositeNode->parent = metaTree; struct mdbVar *v; for (v=mdb->vars; v != NULL; v = v->next) { metaNodeAddVar(compositeNode, v->var, v->val); } metaNodeAddVar(compositeNode, "assembly", db); hashAdd(compositeHash, mdb->obj, compositeNode); } } /* Make up one more for experiments with no composite. */ char *noCompositeName = "wgEncodeZz"; struct metaNode *noCompositeNode = metaNodeNew(noCompositeName); slAddHead(&metaTree->children, noCompositeNode); noCompositeNode->parent = metaTree; hashAdd(compositeHash, noCompositeName, noCompositeNode); /* Now go through objects trying to tie experiments to composites. */ struct hash *expToComposite = hashNew(16); for (mdb = mdbList; mdb != NULL; mdb = mdb->next) { char *composite = mdbVarLookup(mdb->vars, "composite"); if (originalData(composite)) { char *dccAccession = mdbVarLookup(mdb->vars, "dccAccession"); if (dccAccession != NULL) { char *oldComposite = hashFindVal(expToComposite, dccAccession); if (oldComposite != NULL) { if (!sameString(oldComposite, composite)) verbose(2, "%s maps to %s ignoring mapping to %s", dccAccession, oldComposite, composite); } else { hashAdd(expToComposite, dccAccession, composite); } } } } /* Now get info on all experiments in this organism. 
*/ struct hash *expHash = hashNew(0); struct encodeExp *exp; for (exp = expList; exp != NULL; exp = exp->next) { if (sameString(exp->organism, organisms[i])) { if (exp->accession != NULL) { char *composite = hashFindVal(expToComposite, exp->accession); struct metaNode *compositeNode; if (composite != NULL) { compositeNode = hashMustFindVal(compositeHash, composite); } else { compositeNode = noCompositeNode; } struct metaNode *expNode = wrapNodeAroundExp(exp); hashAdd(expHash, expNode->name, expNode); slAddHead(&compositeNode->children, expNode); expNode->parent = compositeNode; } } } for (mdb = mdbList; mdb != NULL; mdb = mdb->next) { char *fileName = NULL, *dccAccession = NULL; char *objType = mdbVarLookup(mdb->vars, "objType"); if (objType != NULL && sameString(objType, "composite")) continue; dccAccession = mdbVarLookup(mdb->vars, "dccAccession"); if (dccAccession == NULL) continue; char *composite = hashFindVal(expToComposite, dccAccession); if (composite == NULL) errAbort("Can't find composite for %s", mdb->obj); struct mdbVar *v; for (v = mdb->vars; v != NULL; v = v->next) { char *var = v->var, *val = v->val; if (sameString("fileName", var)) { fileName = val; char path[PATH_LEN]; char *comma = strchr(fileName, ','); if (comma != NULL) *comma = 0; /* Cut off comma separated list. 
*/ safef(path, sizeof(path), "%s/%s/%s", db, composite, fileName); /* Add database path */ fileName = val = v->val = cloneString(path); } } if (fileName != NULL) { if (hashLookup(miHash, fileName)) { struct metaNode *expNode = hashFindVal(expHash, dccAccession); if (expNode != NULL) { struct metaNode *fileNode = metaNodeNew(mdb->obj); slAddHead(&expNode->children, fileNode); fileNode->parent = expNode; struct mdbVar *v; for (v=mdb->vars; v != NULL; v = v->next) { metaNodeAddVar(fileNode, v->var, v->val); } } } } } #ifdef SOON #endif /* SOON */ } struct hash *suppress = makeSuppress(); struct hash *closeEnoughTags = makeCloseEnoughTags(); metaTreeHoist(metaTree, closeEnoughTags); metaTreeSortChildrenSortTags(metaTree); FILE *f = mustOpen(outMetaRa, "w"); struct metaNode *node; for (node = metaTree->children; node != NULL; node = node->next) metaTreeWrite(0, 0, BIGNUM, FALSE, NULL, node, suppress, f); carefulClose(&f); /* Write warning about tags in highest parent. */ struct mdbVar *v; for (v = metaTree->vars; v != NULL; v = v->next) verbose(1, "Omitting universal %s %s\n", v->var, v->val); }