void bigWigAverageOverBed(char *inBw, char *inBed, char *outTab) /* bigWigAverageOverBed - Compute average score of big wig over each bed, which may have introns. */ { struct bed *bedList; int fieldCount; bedLoadAllReturnFieldCount(inBed, &bedList, &fieldCount); checkUniqueNames(bedList); struct bbiFile *bbi = bigWigFileOpen(inBw); FILE *f = mustOpen(outTab, "w"); FILE *bedF = NULL; if (bedOut != NULL) bedF = mustOpen(bedOut, "w"); /* Count up number of blocks in file. It takes about 1/100th of of second to * look up a single block in a bigWig. On the other hand to stream through * the whole file setting a array of doubles takes about 30 seconds, so we change * strategy at 3,000 blocks. * I (Jim) usually avoid having two paths through the code like this, and am tempted * to always go the ~30 second chromosome-at-a-time way. On the other hand the block-way * was developed first, and it was useful to have both ways to test against each other. * (This found a bug where the chromosome way wasn't handling beds in chromosomes not * covered by the bigWig for instance). Since this code is not likely to change too * much, keeping both implementations in seems reasonable. */ int blockCount = countBlocks(bedList, fieldCount); verbose(2, "Got %d blocks, if >= 3000 will use chromosome-at-a-time method\n", blockCount); if (blockCount < 3000) averageFetchingEachBlock(bbi, bedList, fieldCount, f, bedF); else averageFetchingEachChrom(bbi, &bedList, fieldCount, f, bedF); carefulClose(&bedF); carefulClose(&f); }
void regCompanionEnhProCellSpecificPairs(char *enhBed, char *cellDescriptions, char *geneLevels, char *pairsIn, char *outDir) /* regCompanionEnhProCellSpecificPairs - Select enh/pro pairs that are seen in a given cell * lines. */ { /* Load up cell descriptions into cell array */ struct expRecord *cell, *cellList = expRecordLoadAll(cellDescriptions); int cellCount = slCount(cellList); struct expRecord **cellArray; AllocArray(cellArray, cellCount); int i; for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next) cellArray[i] = cell; verbose(2, "Got %d cells in %s\n", cellCount, cellDescriptions); /* Load up enhBed into a hash keyed by name */ struct bed *enh, *enhList; int fieldCount; bedLoadAllReturnFieldCount(enhBed, &enhList, &fieldCount); if (fieldCount != 15) errAbort("Expecting bed 15 format in %s", enhBed); struct hash *enhHash = hashNew(16); for (enh = enhList; enh != NULL; enh = enh->next) { if (enh->expCount != cellCount) errAbort("Inconsistent input: %d cells in %s, but %d in %s\n", cellCount, cellDescriptions, enh->expCount, enhBed); hashAddUnique(enhHash, enh->name, enh); } verbose(2, "Got %d enhancers in %s\n", enhHash->elCount, enhBed); /* Get a hash with key of gene name and value an array of expression values. */ struct hash *geneHash = hashGeneLevels(geneLevels, cellCount); verbose(2, "Got %d genes in %s\n", geneHash->elCount, geneLevels); /* Open inPairs.bed, just to make sure it's there before we do any output. */ struct lineFile *lf = lineFileOpen(pairsIn, TRUE); /* Remove trailing slash from output dir if any */ if (lastChar(outDir) == '/') { int len = strlen(outDir); outDir[len-1] = 0; } /* Make output directory and open all output files. */ makeDirsOnPath(outDir); FILE *outFiles[cellCount]; for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next) { char path[PATH_LEN]; safef(path, sizeof(path), "%s/%s.bed", outDir, cell->description); outFiles[i] = mustOpen(path, "w"); } /* Stream through input file and copy to appropriate outputs. */ char *words[bedKnownFields*2]; // Make a little bigger than any known bed int wordCount, wordsRequired = 0; char *separator = "->"; int separatorSize = strlen(separator); int pairCount = 0; while ((wordCount = lineFileChop(lf, words)) != 0) { /* Make sure all lines have same # of fields, and at least 4. */ if (wordsRequired == 0) { wordsRequired = wordCount; lineFileExpectAtLeast(lf, 4, wordCount); } else lineFileExpectWords(lf, wordsRequired, wordCount); ++pairCount; /* Parse out name field. */ char *name = words[3]; char *sepPos = stringIn(separator, name); if (sepPos == NULL) errAbort("Expecting %s in %s line %d of %s", separator, name, lf->lineIx, lf->fileName); char *enhName = cloneStringZ(name, sepPos-name); char *geneName = sepPos + separatorSize; /* Look up enhancer and gene. */ enh = hashMustFindVal(enhHash, enhName); double *geneLevels = hashMustFindVal(geneHash, geneName); freez(&enhName); /* Output ones over minimum levels. */ for (i=0; i < cellCount; ++i) { double enhLevel = enh->expScores[i]; double geneLevel = geneLevels[i]; if (enhLevel >= minAct && geneLevel >= minExp) { int j; FILE *f = outFiles[i]; fprintf(f, "%s", words[0]); for (j=1; j<wordCount; ++j) fprintf(f, "\t%s", words[j]); fprintf(f, "\n"); } } } verbose(2, "Got %d pairs in %s\n", pairCount, pairsIn); /* Clean up. */ lineFileClose(&lf); for (i=0; i<cellCount; ++i) carefulClose(&outFiles[i]); }