void bigWigAverageOverBed(char *inBw, char *inBed, char *outTab)
/* bigWigAverageOverBed - Compute average score of big wig over each bed, which may
 * have introns.  The beds come from inBed, the scores from the bigWig inBw, and the
 * output stream opened on outTab is handed to the averaging routine.  When the bedOut
 * variable is set, a second output stream is opened on it and handed over as well. */
{
/* Load the beds and check their names up front, before any output file is created. */
int bedFieldCount;
struct bed *beds;
bedLoadAllReturnFieldCount(inBed, &beds, &bedFieldCount);
checkUniqueNames(beds);

struct bbiFile *bigWig = bigWigFileOpen(inBw);
FILE *tabFile = mustOpen(outTab, "w");
FILE *bedFile = (bedOut != NULL) ? mustOpen(bedOut, "w") : NULL;

/* Two strategies are available, picked by the number of blocks in the bed file.
 * Looking up a single block in a bigWig takes about 1/100th of a second, while
 * streaming through the whole file filling an array of doubles takes about 30
 * seconds, so the break-even point is around 3,000 blocks.
 *   Having two paths through the code is normally something to avoid, and always
 * going the ~30 second chromosome-at-a-time way would be simpler.  However the
 * block-at-a-time way was written first, and testing the two against each other
 * proved useful (it exposed a bug where the chromosome way mishandled beds on
 * chromosomes not covered by the bigWig).  Since this code is unlikely to change
 * much, both implementations are kept. */
const int chromAtATimeThreshold = 3000;
int totalBlocks = countBlocks(beds, bedFieldCount);
verbose(2, "Got %d blocks, if >= 3000 will use chromosome-at-a-time method\n", totalBlocks);

if (totalBlocks >= chromAtATimeThreshold)
    averageFetchingEachChrom(bigWig, &beds, bedFieldCount, tabFile, bedFile);
else
    averageFetchingEachBlock(bigWig, beds, bedFieldCount, tabFile, bedFile);

carefulClose(&bedFile);
carefulClose(&tabFile);
}
Exemple #2
0
void regCompanionEnhProCellSpecificPairs(char *enhBed, char *cellDescriptions, 
	char *geneLevels, char *pairsIn, char *outDir)
/* regCompanionEnhProCellSpecificPairs - Select enh/pro pairs that are seen in a given cell 
 * lines. */
{
/* Load up cell descriptions into cell array */
struct expRecord *cell, *cellList = expRecordLoadAll(cellDescriptions);
int cellCount = slCount(cellList);
struct expRecord **cellArray;
AllocArray(cellArray, cellCount);
int i;
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    cellArray[i] = cell;
verbose(2, "Got %d cells in %s\n", cellCount, cellDescriptions);

/* Load up enhBed into a hash keyed by name */
struct bed *enh, *enhList;
int fieldCount;
bedLoadAllReturnFieldCount(enhBed, &enhList, &fieldCount);
if (fieldCount != 15)
   errAbort("Expecting bed 15 format in %s", enhBed);
struct hash *enhHash = hashNew(16);
for (enh = enhList; enh != NULL; enh = enh->next)
    {
    if (enh->expCount != cellCount)
        errAbort("Inconsistent input: %d cells in %s, but %d in %s\n", 
		cellCount, cellDescriptions, enh->expCount, enhBed);
    hashAddUnique(enhHash, enh->name, enh);
    }
verbose(2, "Got %d enhancers in %s\n", enhHash->elCount, enhBed);

/* Get a hash with key of gene name and value an array of expression values. */
struct hash *geneHash = hashGeneLevels(geneLevels, cellCount);
verbose(2, "Got %d genes in %s\n", geneHash->elCount, geneLevels);

/* Open inPairs.bed, just to make sure it's there before we do any output. */
struct lineFile *lf = lineFileOpen(pairsIn, TRUE);

/* Remove trailing slash from output dir if any */
if (lastChar(outDir) == '/')
    {
    int len = strlen(outDir);
    outDir[len-1] = 0;
    }

/* Make output directory and open all output files. */
makeDirsOnPath(outDir);
FILE *outFiles[cellCount];
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    {
    char path[PATH_LEN];
    safef(path, sizeof(path), "%s/%s.bed", outDir, cell->description);
    outFiles[i] = mustOpen(path, "w");
    }

/* Stream through input file and copy to appropriate outputs. */
char *words[bedKnownFields*2];	// Make a little bigger than any known bed
int wordCount, wordsRequired = 0;
char *separator = "->";
int separatorSize = strlen(separator);
int pairCount = 0;
while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    /* Make sure all lines have same # of fields, and at least 4. */
    if (wordsRequired == 0)
	{
        wordsRequired = wordCount;
	lineFileExpectAtLeast(lf, 4, wordCount);
	}
    else
	lineFileExpectWords(lf, wordsRequired, wordCount);
    ++pairCount;

    /* Parse out name field. */
    char *name = words[3];
    char *sepPos = stringIn(separator, name);
    if (sepPos == NULL)
        errAbort("Expecting %s in %s line %d of %s", separator, name, lf->lineIx, lf->fileName);
    char *enhName = cloneStringZ(name, sepPos-name);
    char *geneName = sepPos + separatorSize;

    /* Look up enhancer and gene. */
    enh = hashMustFindVal(enhHash, enhName);
    double *geneLevels = hashMustFindVal(geneHash, geneName);
    freez(&enhName);

    /* Output ones over minimum levels. */
    for (i=0; i < cellCount; ++i)
        {
	double enhLevel = enh->expScores[i];
	double geneLevel = geneLevels[i];
	if (enhLevel >= minAct && geneLevel >= minExp)
	    {
	    int j;
	    FILE *f = outFiles[i];
	    fprintf(f, "%s", words[0]);
	    for (j=1; j<wordCount; ++j)
		fprintf(f, "\t%s", words[j]);
	    fprintf(f, "\n");
	    }
	}
    }
verbose(2, "Got %d pairs in %s\n", pairCount, pairsIn);

/* Clean up. */
lineFileClose(&lf);
for (i=0; i<cellCount; ++i)
    carefulClose(&outFiles[i]);
}