Beispiel #1
0
void scopCollapse(char *inFeat, char *inModel, char *outFeat, char *outDesc, 
	char *outKnownTo)
/* scopCollapse - Convert SCOP model to SCOP ID. Also make id/name converter file.
 * Scans the five-column tab-separated inModel file, writing outDesc as it goes,
 * then scans the six-column inFeat file and hands each protein to outputProt,
 * which receives the outFeat and outKnownTo file handles. */
{
/* Pass over the model table: column 0 is the model, column 1 the SCOP id and
 * column 2 the seed id.  Every model is mapped to its seed; the first time a
 * seed is seen it is also mapped to its SCOP id and described in outDesc. */
struct hash *seedByModel = hashNew(18);
struct hash *scopBySeed = hashNew(16);
struct lineFile *lf = lineFileOpen(inModel, TRUE);
FILE *f = mustOpen(outDesc, "w");
char *modelCols[5];
while (lineFileRowTab(lf, modelCols))
    {
    char *modelId = modelCols[0];
    char *scopId = modelCols[1];
    char *seedId = modelCols[2];
    hashAdd(seedByModel, modelId, cloneString(seedId));
    if (hashLookup(scopBySeed, seedId) != NULL)
        continue;
    hashAdd(scopBySeed, seedId, cloneString(scopId));
    fprintf(f, "%s\t%s\t%s\n", scopId, seedId, modelCols[4]);
    }
carefulClose(&f);
lineFileClose(&lf);

/* Pass over the feature table, grouping features under one protInfo per
 * distinct value of column 0. */
struct hash *proteins = hashNew(18);
struct protInfo *protList = NULL;
lf = lineFileOpen(inFeat, TRUE);
char *featCols[6];
while (lineFileRow(lf, featCols))
    {
    char *protName = featCols[0];
    struct protInfo *owner = hashFindVal(proteins, protName);
    if (owner == NULL)
        {
        AllocVar(owner);
        hashAddSaveName(proteins, protName, owner, &owner->name);
        slAddHead(&protList, owner);
        }
    struct protFeature *feat;
    AllocVar(feat);
    feat->protein = owner->name;
    feat->start = lineFileNeedNum(lf, featCols, 1);
    feat->end = lineFileNeedNum(lf, featCols, 2);
    /* Features are labelled with the seed of their model, not the model itself. */
    feat->name = hashMustFindVal(seedByModel, featCols[3]);
    feat->eVal = lineFileNeedDouble(lf, featCols, 4);
    feat->score = lineFileNeedDouble(lf, featCols, 5);
    slAddHead(&owner->featureList, feat);
    }
lineFileClose(&lf);
slReverse(&protList);	/* Proteins were pushed on the head, so undo that order. */

/* Emit every protein in input order. */
FILE *featOut = mustOpen(outFeat, "w");
FILE *knownToOut = mustOpen(outKnownTo, "w");
struct protInfo *cur = protList;
while (cur != NULL)
    {
    outputProt(cur, scopBySeed, featOut, knownToOut);
    cur = cur->next;
    }
carefulClose(&featOut);
carefulClose(&knownToOut);
}
Beispiel #2
0
void colTransform(char *column, char *input, char *addFactor, char *mulFactor, char *output)
/* colTransform - Add and/or multiply column by constant.
 *   column    - one-based index of the column to transform, as a decimal string
 *   input     - name of file to read; each line is chopped into words by lineFileChop
 *   addFactor - number (as string) added after the multiplication
 *   mulFactor - number (as string) the column is multiplied by
 *   output    - name of file to write; words are joined with tabs
 * Aborts if column is 0, or if a line has fewer words than column, or if the
 * selected word is not a number. */
{
int col = sqlUnsigned(column) - 1;	/* Convert to zero-based index. */
if (col < 0)
    errAbort("column must be 1 or greater, got %s", column);
double add = sqlDouble(addFactor);
double mul = sqlDouble(mulFactor);
struct lineFile *lf = lineFileOpen(input, TRUE);
FILE *f = mustOpen(output, "w");
char *words[512];
int wordCount;
while ((wordCount = lineFileChop(lf, words)) > 0)
    {
    /* words[col] only exists when the line has at least col+1 words. */
    lineFileExpectAtLeast(lf, col+1, wordCount);
    double x = lineFileNeedDouble(lf, words, col);
    int i;
    for (i=0; i<wordCount; ++i)
        {
        if (i != 0)
            fputc('\t', f);
        if (i == col)
            fprintf(f, "%g", x*mul+add);
        else
            fputs(words[i], f);
        }
    fputc('\n', f);
    }
carefulClose(&f);
lineFileClose(&lf);
}
struct wigSection *wigSectionRead(struct lineFile *lf)
/* Parse out next section of wig.  Returns NULL when the input has no more
 * lines.  The section must open with a "fixedStep " line carrying step, start
 * and chrom settings; each following line contributes its number 'step'
 * times, until the next "fixedStep " line or end of input. */
{
    /* Scratch buffer kept between calls; it only ever grows. */
    static double *scratch = NULL;
    static int scratchSize = 0;

    char *header = "fixedStep ";
    int headerLen = 10;

    /* Read and check the declaration line. */
    char *line;
    if (!lineFileNextReal(lf, &line))
        return NULL;
    if (!startsWith(header, line))
        errAbort("Expecting fixedStep line %d of %s", lf->lineIx, lf->fileName);

    /* The rest of the line is a list of settings. */
    struct hash *settings = hashVarLine(line + headerLen, lf->lineIx);
    int step = sqlUnsigned(requiredVal(lf, settings, "step"));
    int start = sqlUnsigned(requiredVal(lf, settings, "start"));
    char *chrom = cloneString(requiredVal(lf, settings, "chrom"));
    hashFree(&settings);

    /* Collect values up to the next declaration line. */
    int count = 0;
    while (lineFileNextReal(lf, &line))
    {
        if (startsWith(header, line))
        {
            /* Leave the next section's header for the next call. */
            lineFileReuse(lf);
            break;
        }
        int repeatsLeft;
        for (repeatsLeft = step; repeatsLeft > 0; --repeatsLeft)
        {
            if (count >= scratchSize)
            {
                int biggerSize = scratchSize + 1024;
                ExpandArray(scratch, scratchSize, biggerSize);
                scratchSize = biggerSize;
            }
            scratch[count] = lineFileNeedDouble(lf, &line, 0);
            ++count;
        }
    }

    /* Package the result; the caller gets its own copy of the values. */
    struct wigSection *result;
    AllocVar(result);
    result->chrom = chrom;
    result->chromStart = start;
    result->chromEnd = start + count;
    result->vals = CloneArray(scratch, count);
    return result;
}
struct hash *hashWeights(char *in)
/* Return hash full of weights.  Each row of the two-column file 'in' becomes
 * one struct weight, stored under the column-0 name with its value taken
 * from column 1. */
{
    struct lineFile *lf = lineFileOpen(in, TRUE);
    char *cols[2];
    struct hash *weights = hashNew(0);
    for (;;)
    {
        if (!lineFileRow(lf, cols))
            break;
        struct weight *w;
        AllocVar(w);
        w->value = lineFileNeedDouble(lf, cols, 1);
        /* hashAddSaveName also fills in w->type. */
        hashAddSaveName(weights, cols[0], w, &w->type);
    }
    lineFileClose(&lf);
    return weights;
}
Beispiel #5
0
void raIntoCdwRepeatQa(char *fileName, struct sqlConnection *conn, long long fileId)
/* Read in two column file and put it into cdwQaRepeat table.  One insert is
 * issued per row: column 0 supplies repeatClass, column 1 (a number) supplies
 * mapRatio, and fileId is stored with every row. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *cols[2];
char sql[512];
for (;;)
    {
    if (!lineFileRow(lf, cols))
        break;
    double ratio = lineFileNeedDouble(lf, cols, 1);
    sqlSafef(sql, sizeof(sql), 
	"insert into cdwQaRepeat (fileId,repeatClass,mapRatio) values (%lld, \"%s\", %g)",
	fileId, cols[0], ratio);
    sqlUpdate(conn, sql);
    }
lineFileClose(&lf);
}
Beispiel #6
0
static struct visiMatch *readMatchFile(char *fileName)
/* Read in match file.  Each two-column row yields one visiMatch holding an
 * image id (column 0) and a weight (column 1).  Returns the matches in file
 * order, or NULL if the file could not be opened. */
{
struct lineFile *lf = lineFileMayOpen(fileName, TRUE);
if (lf == NULL)
    return NULL;

struct visiMatch *head = NULL;
char *cols[2];
while (lineFileRow(lf, cols))
    {
    struct visiMatch *m;
    AllocVar(m);
    m->imageId = lineFileNeedNum(lf, cols, 0);
    m->weight = lineFileNeedDouble(lf, cols, 1);
    slAddHead(&head, m);
    }
lineFileClose(&lf);
slReverse(&head);	/* Head insertion reversed the order; put it back. */
return head;
}
static double minOfCol(char *fileName, int colIx)
/* Return minimum value seen in given column of file.  colIx is zero-based.
 * Aborts if the file yields no rows. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int colsNeeded = colIx+1;
char *fields[colsNeeded];
boolean haveValue = FALSE;
double lowest = 0;
for (;;)
    {
    if (!lineFileNextRow(lf, fields, colsNeeded))
        break;
    double v = lineFileNeedDouble(lf, fields, colIx);
    /* Keep the current minimum unless this is the first value or a smaller one. */
    if (haveValue && !(v < lowest))
        continue;
    haveValue = TRUE;
    lowest = v;
    }
lineFileClose(&lf);
if (!haveValue)
    errAbort("No data in %s", fileName);
return lowest;
}
Beispiel #8
0
void convertVariableStepSection(struct lineFile *lf, struct hash *vars, struct bgOut *out)
/* Read through section and output.
 *   lf   - input, positioned just after the section's declaration line
 *   vars - settings of the section; "chrom" is required, "span" defaults to 1
 *   out  - destination handed to bgOutWrite
 * Each data line must hold exactly two numbers, a start and a value; start-1
 * is passed on as the start coordinate and start-1+span as the end.  Stops at
 * end of input or at the first line that begins with a letter, which is
 * pushed back for the caller. */
{
char *chrom = requiredVar(vars, "chrom", lf);
int span = sqlUnsigned(optionalVar(vars, "span", "1"));
char *line;
while (lineFileNextReal(lf, &line))
    {
    line = skipLeadingSpaces(line);
    /* isalpha() is only defined for values representable as unsigned char (or EOF),
     * so a byte with the high bit set must not be passed as a plain, possibly
     * negative, char. */
    if (isalpha((unsigned char)line[0]))
        {
        lineFileReuse(lf);
        break;
        }
    char *words[3];	/* One more than expected so that a third word is detected. */
    int wordCount = chopLine(line, words);
    if (wordCount != 2)
        errAbort("Expecting exactly two numbers line %d of %s", lf->lineIx, lf->fileName);
    int start = lineFileNeedNum(lf, words, 0) - 1;
    double val = lineFileNeedDouble(lf, words, 1);
    bgOutWrite(out, chrom, start, start+span, val);
    }
}
Beispiel #9
0
static void parseBedGraphSection(struct lineFile *lf, boolean clipDontDie, 
	struct hash *chromSizeHash, struct lm *lm, 
	int itemsPerSlot, struct bwgSection **pSectionList)
/* Parse out bedGraph section until we get to something that is not in bedGraph format.
 *   lf            - input; reading stops at end of file or at a line stepTypeLine()
 *                   accepts, which is pushed back
 *   clipDontDie   - if TRUE, items ending past the chromosome size are warned about
 *                   and dropped; if FALSE such an item aborts
 *   chromSizeHash - chromosome sizes by name, or NULL to skip the size check
 *   lm            - memory pool items and sections are allocated from
 *   itemsPerSlot  - maximum number of items put in one section
 *   pSectionList  - list the new sections are added to (at the head)
 * Aborts on lines without exactly 4 words, on start > end, and on overlapping items. */
{
/* Set up hash and list to store chromosomes. */
struct hash *chromHash = hashNew(0);
struct bedGraphChrom *chrom, *chromList = NULL;

/* Collect lines in items on appropriate chromosomes. */
struct bwgBedGraphItem *item;
char *line;
while (lineFileNextReal(lf, &line))
    {
    /* Check for end of section. */
    if (stepTypeLine(line))
        {
        lineFileReuse(lf);
        break;
        }

    /* Parse out our line and make sure it has exactly 4 columns. */
    char *words[5];
    int wordCount = chopLine(line, words);
    lineFileExpectWords(lf, 4, wordCount);

    /* Get chromosome, creating its record the first time the name is seen. */
    char *chromName = words[0];
    chrom = hashFindVal(chromHash, chromName);
    if (chrom == NULL)
        {
        lmAllocVar(chromHash->lm, chrom);
        hashAddSaveName(chromHash, chromName, chrom, &chrom->name);
        chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM);
        slAddHead(&chromList, chrom);
        }

    /* Convert to item and add to chromosome list. */
    lmAllocVar(lm, item);
    item->start = lineFileNeedNum(lf, words, 1);
    item->end = lineFileNeedNum(lf, words, 2);
    item->val = lineFileNeedDouble(lf, words, 3);

    /* Do sanity checking on coordinates. */
    if (item->start > item->end)
        errAbort("bedGraph error: start (%u) after end (%u) line %d of %s.", 
                item->start, item->end, lf->lineIx, lf->fileName);
    if (item->end > chrom->size)
        {
        warn("bedGraph error line %d of %s: chromosome %s has size %u but item ends at %u",
                lf->lineIx, lf->fileName, chrom->name, chrom->size, item->end);
        if (!clipDontDie)
            noWarnAbort();
        }
    else
        {
        slAddHead(&chrom->itemList, item);
        }
    }
slSort(&chromList, bedGraphChromCmpName);

/* Loop through each chromosome and output the item list, broken into sections
 * for that chrom. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    slSort(&chrom->itemList, bwgBedGraphItemCmp);

    /* Check to make sure no overlap between items.  The list is empty when every
     * item seen for this chromosome was clipped above, so guard the first
     * dereference. */
    struct bwgBedGraphItem *prevItem = chrom->itemList, *nextItem;
    if (prevItem != NULL)
        {
        for (nextItem = prevItem->next; nextItem != NULL; nextItem = nextItem->next)
            {
            if (prevItem->end > nextItem->start)
                errAbort("Overlap between %s %d %d and %s %d %d.\nPlease remove overlaps and try again",
                    chrom->name, prevItem->start, prevItem->end, 
                    chrom->name, nextItem->start, nextItem->end);
            prevItem = nextItem;
            }
        }

    /* Break up into sections of no more than items-per-slot size. */
    struct bwgBedGraphItem *startItem, *endItem, *nextStartItem = chrom->itemList;
    for (startItem = chrom->itemList; startItem != NULL; startItem = nextStartItem)
        {
        /* Find end item of this section, and start item for next section.
         * Terminate list at end item. */
        int sectionSize = 0;
        int i;
        endItem = startItem;
        for (i=0; i<itemsPerSlot; ++i)
            {
            if (nextStartItem == NULL)
                break;
            endItem = nextStartItem;
            nextStartItem = nextStartItem->next;
            ++sectionSize;
            }
        endItem->next = NULL;

        /* Fill in section and add it to section list.  The name is cloned because
         * chrom->name is saved from chromHash, which is freed below. */
        struct bwgSection *section;
        lmAllocVar(lm, section);
        section->chrom = cloneString(chrom->name);
        section->start = startItem->start;
        section->end = endItem->end;
        section->type = bwgTypeBedGraph;
        section->items.bedGraphList = startItem;
        section->itemCount = sectionSize;
        slAddHead(pSectionList, section);
        }
    }

/* Free up hash, no longer needed. Free's chromList as a side effect since chromList is in 
 * hash's memory. */
hashFree(&chromHash);
chromList = NULL;
}
Beispiel #10
0
static void parseVariableStepSection(struct lineFile *lf, boolean clipDontDie, struct lm *lm,
	int itemsPerSlot, char *chrom, int chromSize, bits32 span, struct bwgSection **pSectionList)
/* Read the two-column (position, value) lines of a section until its end, then add
 * the items to pSectionList as packed sections of at most itemsPerSlot items each.
 * Positions in the file must be 1 or more and are stored minus one.  Items that
 * would end past chromSize are warned about and, if clipDontDie is set, dropped;
 * otherwise they abort.  Overlapping items abort. */
{
/* The linked list is only needed while parsing, so it lives in a private pool. */
struct lm *scratch = lmInit(0);

char *words[2];
char *line;
struct bwgVariableStepItem *cur, *following, *kept = NULL;
int keptCount = 0;
for (;;)
    {
    if (!lineFileNextReal(lf, &line))
        break;
    if (steppedSectionEnd(line, 2))
        {
        /* Not ours; leave the line for the caller. */
        lineFileReuse(lf);
        break;
        }
    chopLine(line, words);
    lmAllocVar(scratch, cur);
    int start = lineFileNeedNum(lf, words, 0);
    if (start <= 0)
        {
        errAbort("line %d of %s: zero or negative chromosome coordinate not allowed",
            lf->lineIx, lf->fileName);
        }
    cur->start = start - 1;
    cur->val = lineFileNeedDouble(lf, words, 1);
    if (cur->start + span > chromSize)
        {
        warn("line %d of %s: chromosome %s has %u bases, but item ends at %u",
            lf->lineIx, lf->fileName, chrom, chromSize, cur->start + span);
        if (!clipDontDie)
            noWarnAbort();
        }
    else
        {
        slAddHead(&kept, cur);
        ++keptCount;
        }
    }
slSort(&kept, bwgVariableStepItemCmp);

/* After sorting, neighbours are the only candidates for overlap. */
if (kept != NULL)
    {
    cur = kept;
    following = cur->next;
    while (following != NULL)
        {
        if (cur->start + span > following->start)
            errAbort("Overlap on %s between items starting at %d and %d.\n"
                     "Please remove overlaps and try again",
                    chrom, cur->start, following->start);
        cur = following;
        following = following->next;
        }
    }

/* Carve the sorted list into sections, copying each into an array in lm. */
int remaining = keptCount;
cur = kept;
while (cur != NULL)
    {
    int sectionSize = (remaining > itemsPerSlot) ? itemsPerSlot : remaining;
    remaining -= sectionSize;

    struct bwgVariableStepPacked *packed;
    lmAllocArray(lm, packed, sectionSize);
    int i;
    for (i=0; i<sectionSize; ++i)
        {
        packed[i].start = cur->start;
        packed[i].val = cur->val;
        cur = cur->next;
        }

    struct bwgSection *section;
    lmAllocVar(lm, section);
    section->chrom = chrom;
    section->start = packed[0].start;
    section->end = packed[sectionSize-1].start + span;
    section->type = bwgTypeVariableStep;
    section->items.variableStepPacked = packed;
    section->itemSpan = span;
    section->itemCount = sectionSize;
    slAddHead(pSectionList, section);
    }
lmCleanup(&scratch);
}
Beispiel #11
0
static void parseFixedStepSection(struct lineFile *lf, boolean clipDontDie, struct lm *lm,
	int itemsPerSlot, char *chrom, bits32 chromSize, bits32 span, bits32 sectionStart, 
	bits32 step, struct bwgSection **pSectionList)
/* Read the single column data in section until get to end, then add the values to
 * pSectionList as packed sections of at most itemsPerSlot values each.  The n-th
 * line sits at sectionStart + n*step; a value whose span would run past chromSize
 * is warned about and, if clipDontDie is set, dropped, otherwise it aborts. */
{
/* The linked list is only needed while parsing, so it lives in a private pool. */
struct lm *scratch = lmInit(0);

char *words[1];
char *line;
struct bwgFixedStepItem *cur, *kept = NULL;
int keptCount = 0;
bits32 pos = sectionStart;	/* Start position of the line being read. */
for (;;)
    {
    if (!lineFileNextReal(lf, &line))
        break;
    if (steppedSectionEnd(line, 1))
        {
        /* Not ours; leave the line for the caller. */
        lineFileReuse(lf);
        break;
        }
    chopLine(line, words);
    lmAllocVar(scratch, cur);
    cur->val = lineFileNeedDouble(lf, words, 0);
    if (pos + span > chromSize)
        {
        warn("line %d of %s: chromosome %s has %u bases, but item ends at %u",
            lf->lineIx, lf->fileName, chrom, chromSize, pos + span);
        if (!clipDontDie)
            noWarnAbort();
        }
    else
        {
        slAddHead(&kept, cur);
        ++keptCount;
        }
    pos += step;
    }
slReverse(&kept);	/* Head insertion reversed the order; put it back. */

/* Carve the list into sections, copying each into an array in lm. */
int remaining = keptCount;
cur = kept;
while (cur != NULL)
    {
    int sectionSize = (remaining > itemsPerSlot) ? itemsPerSlot : remaining;
    remaining -= sectionSize;

    /* Describe the section; sectionStart advances to where the next one begins. */
    struct bwgSection *section;
    lmAllocVar(lm, section);
    section->chrom = chrom;
    section->start = sectionStart;
    sectionStart += sectionSize * step;
    section->end = sectionStart - step + span;
    section->type = bwgTypeFixedStep;
    section->itemStep = step;
    section->itemSpan = span;
    section->itemCount = sectionSize;

    /* Copy the values out of the list. */
    struct bwgFixedStepPacked *packed;
    lmAllocArray(lm, packed, sectionSize);
    section->items.fixedStepPacked = packed;
    int i;
    for (i=0; i<sectionSize; ++i)
        {
        packed[i].val = cur->val;
        cur = cur->next;
        }

    slAddHead(pSectionList, section);
    }
lmCleanup(&scratch);
}
Beispiel #12
0
struct bwgSection *bwgParseWig(
	char *fileName,       /* Name of ascii wig file. */
	boolean clipDontDie,  /* Skip items outside chromosome rather than aborting. */
	struct hash *chromSizeHash,  /* If non-NULL items checked to be inside chromosome. */
	int maxSectionSize,   /* Biggest size of a section.  100 - 100,000 is usual range. */
	struct lm *lm)	      /* Memory pool to allocate from. */
/* Parse out ascii wig file - allocating memory in lm.  Returns the sections sorted
 * with bwgSectionCmp.  Aborts on a line that is neither a "chrom=" declaration nor
 * a four-word bedGraph line, and when two neighbouring sections on the same
 * chromosome overlap. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
struct bwgSection *sectionList = NULL;

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

while (lineFileNextReal(lf, &line))
    {
    verbose(2, "processing %s\n", line);
    if (stringIn("chrom=", line))
        parseSteppedSection(lf, clipDontDie, chromSizeHash, line, lm, maxSectionSize, &sectionList);
    else
        {
        /* Check for bed.  Chopping is done on a copy because the line itself is
         * pushed back below for the bedGraph parser to read again. */
        char *dupe = cloneString(line);
        char *words[5];
        int wordCount = chopLine(dupe, words);
        if (wordCount != 4)
            errAbort("Unrecognized line %d of %s:\n%s\n", lf->lineIx, lf->fileName, line);

        /* Parse out a bed graph line just to check numerical format. */
        char *chrom = words[0];
        int start = lineFileNeedNum(lf, words, 1);
        int end = lineFileNeedNum(lf, words, 2);
        double val = lineFileNeedDouble(lf, words, 3);
        verbose(2, "bedGraph %s:%d-%d@%g\n", chrom, start, end, val);

        /* The words point into dupe, so it can only be released now that they
         * are no longer needed. */
        freeMem(dupe);

        /* Push back line and call bed parser. */
        lineFileReuse(lf);
        parseBedGraphSection(lf, clipDontDie, chromSizeHash, lm, maxSectionSize, &sectionList);
        }
    }
lineFileClose(&lf);
slSort(&sectionList, bwgSectionCmp);

/* Check for overlap at section level. */
struct bwgSection *section, *nextSection;
for (section = sectionList; section != NULL; section = nextSection)
    {
    nextSection = section->next;
    if (nextSection != NULL && sameString(section->chrom, nextSection->chrom)
        && section->end > nextSection->start)
        {
        errAbort("There's more than one value for %s base %d (in coordinates that start with 1).\n",
            section->chrom, nextSection->start+1);
        }
    }

return sectionList;
}
Beispiel #13
0
struct encodePeak *encodePeakLineFileLoad(char **row, enum encodePeakType pt, struct lineFile *lf)
/* From a linefile line, load an encodePeak row.  Errors outputted */
/* have line numbers, etc. Does more error checking as well. 
 *   row - the already chopped words of the current line of lf
 *   pt  - peak type; decides which columns are read:
 *           gappedPeak:  6-8 thickStart/thickEnd/itemRgb (must all be 0),
 *                        9-11 blocks, 12-14 signalValue/pValue/qValue
 *           other types: 6-8 signalValue/pValue/qValue; narrowPeak and encodePeak
 *                        also read 9 as peak; encodePeak also reads 10-12 as blocks
 *   lf  - only used for error reporting and number parsing
 * Returns a newly allocated peak.  Aborts on any malformed field. */
{
struct encodePeak *peak;
if (!pt)
    errAbort("Unknown peak type set for track");
AllocVar(peak);
peak->chrom = cloneString(row[0]);
peak->chromStart = lineFileNeedNum(lf, row, 1);
peak->chromEnd = lineFileNeedNum(lf, row, 2);
peak->peak = -1;	/* Stays -1 for types that have no peak column. */
if (peak->chromEnd < 1)
    lineFileAbort(lf, "chromEnd less than 1 (%d)", peak->chromEnd);
if (peak->chromEnd < peak->chromStart)
    lineFileAbort(lf, "chromStart after chromEnd (%d > %d)", 
    	peak->chromStart, peak->chromEnd);
peak->name = cloneString(row[3]);
peak->score = lineFileNeedNum(lf, row, 4);
safecpy(peak->strand, sizeof(peak->strand), row[5]);
if (peak->strand[0] != '+' && peak->strand[0] != '-' && peak->strand[0] != '.')
    lineFileAbort(lf, "Expecting +, -, or . in strand");
if (pt != gappedPeak)
/* deal with signalValue, pValue, qValue, and peak */
    {
    peak->signalValue = (float)lineFileNeedDouble(lf, row, 6);
    peak->pValue = (float)lineFileNeedDouble(lf, row, 7);
    peak->qValue = (float)lineFileNeedDouble(lf, row, 8);
    if ((pt == narrowPeak) || (pt == encodePeak))
        {	
        peak->peak = lineFileNeedNum(lf, row, 9);
        if (peak->peak >= (int)peak->chromEnd)
            lineFileAbort(lf, "peak site past chromEnd (%d > %d)", peak->peak, peak->chromEnd);
        }
    }
else  /* must be gappedPeak */
/* deal with thickStart, thickEnd, itemRgb even though they're not used */
    {
    int thickStart = lineFileNeedNum(lf, row, 6);
    int thickEnd = lineFileNeedNum(lf, row, 7);
    int itemRgb = 0;
    char *comma;
    /*	Allow comma separated list of rgb values here	*/
    comma = strchr(row[8], ',');
    if (comma)
        itemRgb = bedParseRgb(row[8]);
    else
        itemRgb = lineFileNeedNum(lf, row, 8);
    if ((thickStart != 0) || (thickEnd != 0) || (itemRgb != 0))
        lineFileAbort(lf, "thickStart, thickEnd, and itemRgb columns not used in gappedPeak type, set all to 0");
    }
/* Deal with blocks */
if ((pt == gappedPeak) || (pt == encodePeak))
    {
    int i, count;
    int blockCountIx, blockSizesIx, blockStartsIx;
    if (pt == gappedPeak)
        {
        blockCountIx = 9;
        blockSizesIx = 10;
        blockStartsIx = 11;
        }
    else
        {
        blockCountIx = 10;
        blockSizesIx = 11;
        blockStartsIx = 12;
        }
    peak->blockCount = lineFileNeedNum(lf, row, blockCountIx);
    /* The checks below index blockStarts[0] and blockStarts[blockCount-1], so a
     * block count of zero has to be rejected before the arrays are touched. */
    if (peak->blockCount < 1)
        lineFileAbort(lf, "Expecting blockCount (%d) to be 1 or more.", peak->blockCount);
    sqlUnsignedDynamicArray(row[blockSizesIx], &peak->blockSizes, &count);
    if (count != peak->blockCount)
        lineFileAbort(lf,  "expecting %d elements in array", peak->blockCount);
    sqlUnsignedDynamicArray(row[blockStartsIx], &peak->blockStarts, &count);
    if (count != peak->blockCount)
        lineFileAbort(lf, "expecting %d elements in array", peak->blockCount);
    // tell the user if they appear to be using absolute starts rather than 
    // relative... easy to forget!
    for (i=0;  i < peak->blockCount;  i++)
        {
        if (peak->blockStarts[i]+peak->chromStart >= peak->chromEnd)
            {
            if (peak->blockStarts[i] >= peak->chromStart)
                lineFileAbort(lf, 
                    "BED blockStarts offsets must be relative to chromStart, "
                    "not absolute.  Try subtracting chromStart from each offset "
                    "in blockStarts.");
            else
                lineFileAbort(lf, 
                    "BED blockStarts[i]+chromStart must be less than chromEnd.");
            }
        }
    if (peak->blockStarts[0] != 0)
        lineFileAbort(lf, 
            "BED blocks must span chromStart to chromEnd.  "
            "BED blockStarts[0] must be 0 (==%d) so that (chromStart + "
            "blockStarts[0]) equals chromStart.", peak->blockStarts[0]);
    i = peak->blockCount-1;
    if ((peak->chromStart + peak->blockStarts[i] + peak->blockSizes[i]) !=
        peak->chromEnd)
        {
        lineFileAbort(lf, 
            "BED blocks must span chromStart to chromEnd.  (chromStart + "
            "blockStarts[last] + blockSizes[last]) must equal chromEnd.");
        }
    }
if (pt == gappedPeak)
    /* deal with final three columns of a gappedPeak */
    {
    peak->signalValue = (float)lineFileNeedDouble(lf, row, 12);
    peak->pValue = (float)lineFileNeedDouble(lf, row, 13);
    peak->qValue = (float)lineFileNeedDouble(lf, row, 14);    
    }
return peak;
}
Beispiel #14
0
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf, 
	int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
	int resTryCount, int resScales[], int resSizes[], 
	boolean doCompress, bits32 *retMaxSectionSize)
/* Read through lf, chunking it into sections that get written to f.  Save info
 * about sections in bounds. 
 *   usageList    - chromosomes in the order they appear in lf
 *   lf           - four-column input (chrom, start, end, value)
 *   itemsPerSlot - maximum number of items per section
 *   bounds       - array of sectionCount entries, filled in with the file offset
 *                  and range of each section written
 *   resTryCount  - number of entries in resScales and resSizes
 *   resScales    - window size of each candidate zoom level
 *   resSizes     - incremented for every window of that size the input touches
 *   doCompress   - if TRUE each section goes through zCompress before being written
 *   retMaxSectionSize - receives the size of the largest section before compression
 * Aborts if the input is not sorted by start within a chromosome, has overlapping
 * items, or has a chromosome out of step with usageList. */
{
int maxSectionSize = 0;
struct bbiChromUsage *usage = usageList;
int itemIx = 0, sectionIx = 0;
bits32 reserved32 = 0;
UBYTE reserved8 = 0;
struct sectionItem items[itemsPerSlot];
struct sectionItem *lastB = NULL;
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
struct dyString *stream = dyStringNew(0);

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    /* Get next line of input if any. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* Input without any data rows: nothing is buffered, so there is no section
     * to flush.  Without this exit the flush below would read items[-1] and
     * dereference usage, which is NULL for an empty usage list.  After the first
     * row has been stored itemIx is never 0 here, so this cannot cut a real
     * section short. */
    if (rowSize == 0 && itemIx == 0)
        break;

    /* Figure out whether need to output section. */
    boolean sameChrom = FALSE;
    if (rowSize > 0)
        sameChrom = sameString(row[0], usage->name);
    if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom)
        {
        /* Figure out section position. */
        bits32 chromId = usage->id;
        bits32 sectionStart = items[0].start;
        bits32 sectionEnd = items[itemIx-1].end;

        /* Save section info for indexing. */
        assert(sectionIx < sectionCount);
        struct bbiBoundsArray *section = &bounds[sectionIx++];
        section->offset = ftell(f);
        section->range.chromIx = chromId;
        section->range.start = sectionStart;
        section->range.end = sectionEnd;

        /* Output section header to stream. */
        dyStringClear(stream);
        UBYTE type = bwgTypeBedGraph;
        bits16 itemCount = itemIx;
        dyStringWriteOne(stream, chromId);			// chromId
        dyStringWriteOne(stream, sectionStart);		// start
        dyStringWriteOne(stream, sectionEnd);	// end
        dyStringWriteOne(stream, reserved32);		// itemStep
        dyStringWriteOne(stream, reserved32);		// itemSpan
        dyStringWriteOne(stream, type);			// type
        dyStringWriteOne(stream, reserved8);			// reserved
        dyStringWriteOne(stream, itemCount);			// itemCount

        /* Output each item in section to stream. */
        int i;
        for (i=0; i<itemIx; ++i)
            {
            struct sectionItem *item = &items[i];
            dyStringWriteOne(stream, item->start);
            dyStringWriteOne(stream, item->end);
            dyStringWriteOne(stream, item->val);
            }

        /* Save stream to file, compressing if need be. */
        if (stream->stringSize > maxSectionSize)
            maxSectionSize = stream->stringSize;
        if (doCompress)
            {
            size_t maxCompSize = zCompBufSize(stream->stringSize);
            char compBuf[maxCompSize];
            int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
            mustWrite(f, compBuf, compSize);
            }
        else
            mustWrite(f, stream->string, stream->stringSize);


        /* If at end of input we are done. */
        if (rowSize == 0)
            break;

        /* Set up for next section. */
        itemIx = 0;

        if (!sameChrom)
            {
            /* The new row must belong to the next chromosome of the usage list. */
            usage = usage->next;
            assert(usage != NULL);
            if (!sameString(row[0], usage->name))
                errAbort("read %s, expecting %s on line %d in file %s\n", 
                    row[0], usage->name, lf->lineIx, lf->fileName);
            assert(sameString(row[0], usage->name));
            lastB = NULL;
            for (resTry = 0; resTry < resTryCount; ++resTry)
                resEnds[resTry] = 0;
            }
        }

    /* Parse out input. */
    lineFileExpectWords(lf, 4, rowSize);
    bits32 start = lineFileNeedNum(lf, row, 1);
    bits32 end = lineFileNeedNum(lf, row, 2);
    float val = lineFileNeedDouble(lf, row, 3);

    /* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */
    if (start > end)
        errAbort("Start (%u) after end (%u) line %d of %s", start, end, lf->lineIx, lf->fileName);
    if (lastB != NULL)
        {
        if (lastB->start > start)
            errAbort("BedGraph not sorted on start line %d of %s", lf->lineIx, lf->fileName);
        if (lastB->end > start)
            errAbort("Overlapping regions in bedGraph line %d of %s", lf->lineIx, lf->fileName);
        }


    /* Do zoom counting: for each candidate scale count the windows this item
     * touches that have not been counted yet. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        bits32 resEnd = resEnds[resTry];
        if (start >= resEnd)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = start + resScales[resTry];
            }
        while (end > resEnd)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = resEnd + resScales[resTry];
            }
        }

    /* Save values in output array. */
    struct sectionItem *b = &items[itemIx];
    b->start = start;
    b->end = end;
    b->val = val;
    lastB = b;
    itemIx += 1;
    }
assert(sectionIx == sectionCount);

*retMaxSectionSize = maxSectionSize;
}
void ticksToWig(int startTick, char *inTable, char *outDensity, char *outAverage)
/* ticksToWig - Convert tab-delimited file of Unix time ticks, and possibly also 
 * numerical values to wig file(s).
 *   startTick  - tick that maps to position 1; if 0 the first tick read is used
 *   inTable    - input file, which must be sorted by tick
 *   outDensity - gets one line per tick with the number of rows having that tick,
 *                plus a zero line for every tick skipped between two rows
 *   outAverage - if not NULL, gets one line per tick present in the input with
 *                the mean of the value column over the rows having that tick
 * The tick and value columns are selected by tickCol and valCol, which are
 * defined outside this function. */
{
struct lineFile *lf = lineFileOpen(inTable, TRUE);
FILE *densityFile = mustOpen(outDensity, "w");
printVarStepHead(densityFile);
FILE *averageFile = NULL;
if (outAverage != NULL)
    {
    averageFile = mustOpen(outAverage, "w");
    printVarStepHead(averageFile);
    }
int colsToParse = 1 + max(tickCol, valCol);
char *row[colsToParse];

time_t curTick = 0;
int sameTickCount = 0;
double tickTotal = 0;
double val = 0;
time_t tick;
while (lineFileNextRow(lf, row, colsToParse))
    {
    tick = lineFileNeedNum(lf, row, tickCol);
    if (averageFile != NULL)
       val = lineFileNeedDouble(lf, row, valCol);
    if (curTick != tick)
        {
        if (curTick > tick)
            errAbort("Input isn't sorted - %ld > %ld line %d of %s\n", 
                (long)curTick, (long)tick, lf->lineIx, lf->fileName);
        if (startTick == 0)
            startTick = tick;
        if (sameTickCount > 0)
            {
            /* Flush the tick just finished.  time_t need not be long, so every
             * position printed with %ld is cast explicitly. */
            fprintf(densityFile, "%ld\t%d\n", (long)(curTick - startTick + 1), sameTickCount);
            time_t i;
            for (i=curTick+1; i<tick; ++i)
                {
                fprintf(densityFile, "%ld\t%d\n", (long)(i - startTick + 1), 0);
                }
            if (averageFile != NULL)
                {
                fprintf(averageFile, "%ld\t%f\n", 
                        (long)(curTick - startTick + 1), tickTotal/sameTickCount);
                tickTotal = 0;
                }
            sameTickCount = 0;
            }
        curTick = tick;
        }
    tickTotal += val;
    sameTickCount += 1;
    }
/* Flush the last tick. */
if (sameTickCount > 0)
   {
   fprintf(densityFile, "%ld\t%d\n", (long)(curTick - startTick + 1), sameTickCount);
   if (averageFile != NULL)
       fprintf(averageFile, "%ld\t%f\n", 
                (long)(curTick - startTick + 1), tickTotal/sameTickCount);
   }
lineFileClose(&lf);
carefulClose(&densityFile);
carefulClose(&averageFile);
}
void regChromiaMergeWindows(char *input, char *output)
/* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated 
 * by Chromia.
 *   input  - file of windows: chrom, start, end, plus a label and a score in the
 *            columns selected by labelColumn and scoreColumn
 *   output - file the merged regions are written to through outputRegion
 * Windows whose score does not exceed inNoiseThreshold are ignored.  A kept window
 * extends the current region when it is on the same chromosome, has the same label,
 * and starts no more than maxGap past the previous kept window; otherwise it starts
 * a new region.  labelColumn, scoreColumn, inNoiseThreshold and maxGap are defined
 * outside this function.  Aborts if rows differ in word count, lack the needed
 * columns, or are unsorted or overlapping. */
{
struct lineFile *lf = lineFileOpen(input, TRUE);
char *row[32];
int rowSize = 0;
FILE *f = mustOpen(output, "w");
char lastLabel[128];
lastLabel[0] = 0;
char lastChrom[128];
lastChrom[0] = 0;
int lastChromStart = 0, lastChromEnd = 0;
int regionStart = 0, regionEnd = 0;
double sumOfScores = 0.0;

for (;;)
    {
    /* Get next line chopped into words.  Break at end of file. Check to make sure
     * all lines have same number of words. */
    int thisRowSize = lineFileChop(lf, row);
    if (thisRowSize == 0)
        break;
    if (rowSize == 0)
        rowSize = thisRowSize;
    else if (rowSize != thisRowSize)
        {
        errAbort("First line of %s has %d words, but there are %d words in line %d",
                lf->fileName, rowSize, thisRowSize, lf->lineIx);
        }

    /* Make sure every column indexed below is really there. */
    if (thisRowSize < 3 || thisRowSize <= labelColumn || thisRowSize <= scoreColumn)
        errAbort("Not enough words line %d of %s", lf->lineIx, lf->fileName);

    /* Convert row into local variables. */
    char *chrom = row[0];
    int chromStart = lineFileNeedNum(lf, row, 1);
    int chromEnd = lineFileNeedNum(lf, row, 2);
    char *label = row[labelColumn];
    double score = lineFileNeedDouble(lf, row, scoreColumn);
    
    /* Make sure file is sorted with no overlap.*/
    if (sameString(chrom, lastChrom))
        {
        int gapSize = chromStart - lastChromEnd;
        if (gapSize < 0)
            {
            if (chromStart < lastChromStart)
                errAbort("%s is not sorted. %s %d %d followed by %s %d %d line %d",
                    lf->fileName, lastChrom, lastChromStart, lastChromEnd,
                    chrom, chromStart, chromEnd, lf->lineIx);
            else  
                errAbort("%s has overlaps. %s %d %d followed by %s %d %d line %d",
                    lf->fileName, lastChrom, lastChromStart, lastChromEnd,
                    chrom, chromStart, chromEnd, lf->lineIx);
            }
        }

    /* Subtract noise threshold from score, and if not still positive just ignore line. */
    score -= inNoiseThreshold;
    if (score > 0)
        {
        /* See if we have entered a new region. */
        boolean newRegion = FALSE;
        if (sameString(chrom, lastChrom))
            {
            int gapSize = chromStart - lastChromEnd;
            if (gapSize > maxGap)
                 newRegion = TRUE;
            }
        else
            newRegion = TRUE;
        if (!sameString(label, lastLabel))
            newRegion = TRUE;
            
        /* Got new region.  Output old region if any, and initialize new region. */
        if (newRegion)
            {
            if (regionStart != regionEnd)
                outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
            regionStart = chromStart;
            sumOfScores = 0;
            }

        /* Update region. */
        regionEnd = chromEnd;
        sumOfScores += score;

        /* Keep track of this row so can compare it to next row. */
        safecpy(lastChrom, sizeof(lastChrom), chrom);
        safecpy(lastLabel, sizeof(lastLabel), label);
        lastChromStart = chromStart;
        lastChromEnd = chromEnd;
        }

    }
/* Output the final region, using the same "is there a region" test as inside the
 * loop, so that input without any kept window produces no output line. */
if (regionStart != regionEnd)
    outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
carefulClose(&f);
lineFileClose(&lf);
}
Beispiel #17
0
/*	Read through the file and determine min, max and thus range,
 *	then set bin size and minimum value from them.
 */
static void autoScale(char *inFile)
/* autoScale - Scan column 'col' of inFile for its minimum and maximum and
 * set the global binning parameters so that 'autoscale' bins cover the data:
 * minValR/binSizeR when 'real' is set, minVal/binSize otherwise.
 * maxBinCount is set to 'autoscale'.  Aborts if a line has too few words
 * or if the computed range is negative (which is what happens when no
 * data line was read). */
{
int wordCount;
char *row[256];
unsigned long dataCount = 0;
double min = HUGE;
double max = - HUGE;
double range = 0.0;
struct lineFile *lf = lineFileOpen(inFile, TRUE);

/* NOTE(review): textHistogram() discards 'skip' leading lines before it
 * parses data; this pass parses every line.  Confirm whether header lines
 * should be skipped here as well. */
while ((wordCount = lineFileChop(lf, row)))
    {
    double d;
    if ((wordCount <= col) || (wordCount <= aveCol))
        errAbort("Not enough words line %d of %s", lf->lineIx, lf->fileName);
    d = lineFileNeedDouble(lf, row, col);
    if ( d < min ) min = d;
    if ( d > max ) max = d;
    ++dataCount;
    }
lineFileClose(&lf);

range = max - min;

if (range < 0.0)
        errAbort("range of data invalid: %g = [%g:%g]", range, min, max);

maxBinCount = autoscale;
if (real)
    {
    minValR = min;
    /*	need to make binSizeR slightly larger to get the last data point
     *	in the last bin.  This is a floating point round off situation.
     */
    binSizeR = (range + (range/1000000.0)) / maxBinCount;
    /*	All values identical gives a zero range and so a zero bin size,
     *	which would later be used as a divisor.  Any positive size puts
     *	every value in bin 0.
     */
    if (binSizeR <= 0.0) binSizeR = 1.0;
    }
else
    {
    minVal = (int) floor(min);
    /*	The bin size must be strictly greater than range/maxBinCount,
     *	otherwise the largest value lands on index maxBinCount (one past
     *	the last bin) whenever range is an exact multiple of maxBinCount.
     */
    binSize = (int)floor(range / maxBinCount) + 1;
    if (binSize < 1) binSize = 1;
    verbose(1, "#\tautoscale data range: (%d - %d)/%d = %d\n",
        (int) ceil(max), minVal, maxBinCount, binSize);
    }
verbose(2, "#\tautoscale number of data values: %lu\n", dataCount);

verbose(2, "#\tautoscale maxBinCount: %d\n", maxBinCount);
if (real)
    {
    verbose(2, "#\tautoscale data range: %g = [%g:%g]\n",
        range, minValR, max);
    verbose(2, "#\tautoscale minVal: %g\n", minValR);
    verbose(2, "#\tautoscale binSize: %g\n", binSizeR);
    }
else
    {
    verbose(2, "#\tautoscale data range: %g = [%d:%d]\n",
        range, minVal, (int) ceil(max));
    verbose(2, "#\tautoscale minVal: %d\n", minVal);
    verbose(2, "#\tautoscale binSize: %d\n", binSize);
    }
}	/*	autoScale()	*/
Beispiel #18
0
static void textHistogram(char *inFile)
/* textHistogram - Make a histogram in ascii.
 * Discards 'skip' leading lines of inFile, then bins column 'col' of every
 * remaining line into maxBinCount bins using minVal/binSize (or
 * minValR/binSizeR when 'real' is set) and prints one row per bin to stdout.
 * When aveCol is non-negative each row shows the average of column aveCol
 * for that bin instead of the count; 'freq' shows counts as a fraction of
 * the total; 'doLog' takes the log of the displayed value; 'noStar' prints
 * numbers rather than a bar of asterisks, and with 'probValues' also the
 * probability, its log2 and the cumulative distribution.  Values that do
 * not fall in any bin are tallied and reported in a final
 * "<minVal or >=" row.  Aborts on lines with too few words. */
{
double *hist = NULL;
double *total = NULL;
char *row[256];
int wordCount;
struct lineFile *lf = lineFileOpen(inFile, TRUE);
int i,j;
int minData = maxBinCount, maxData = 0;
int totalTooBig = 0;
double maxCount = 0;
double maxCt;
double maxVal = 0;
int truncation = 0;
int begin, end;
unsigned long long totalCounts = 0;
double cpd;

/* Allocate histogram and optionally space for
 * second column totals. */
AllocArray(hist, maxBinCount);
if (aveCol >= 0)
    AllocArray(total, maxBinCount);

/* Discard the requested number of leading lines. */
while (skip-- > 0)
    wordCount = lineFileChop(lf, row);

/* Go through each line of input file accumulating
 * data. */
while ((wordCount = lineFileChop(lf, row)))
    {
    int x = -1;	/*	will become the index into hist[]; -1 = below range */
    if (wordCount <= col || wordCount <= aveCol)
        errAbort("Not enough words line %d of %s", lf->lineIx, lf->fileName);
    if (real)	/*	for real data, work in real space to find index */
        {
        double d;
        d = lineFileNeedDouble(lf, row, col);
        if (d > maxVal)
            maxVal = d;
        if (d >= minValR)
            {
            d -= minValR;
            x = (int) floor(d / binSizeR);
            }
        }
    else
        {
        int val = lineFileNeedNum(lf, row, col);
        if (val > maxVal)
            maxVal = val;
        /* A value below minVal must leave x at -1 so that it is tallied
         * as out of range; using the raw value as the index would credit
         * an unrelated bin. */
        if (val >= minVal)
            x = (val - minVal) / binSize;
        }
    /*	index x is calculated, accumulate it when in range	*/
    if (x >= 0 && x < maxBinCount)
        {
        hist[x] += 1;
        if (aveCol >= 0)
            {
            double a;
            a = lineFileNeedDouble(lf, row, aveCol);
            total[x] += a;
            }
        }
    else
        {
        verbose(2, "truncating index %d\n", x);
        /* Remember the largest index seen so the user can be told how
         * many bins would have been needed. */
        truncation = (x > truncation) ? x : truncation;
        totalTooBig += 1;
        }
    }

lineFileClose(&lf);

if (truncation > 0)
    {
    if (real)
        fprintf(stderr,"large values truncated: need %d bins or larger binSize than %g\n",truncation, binSizeR);
    else
        fprintf(stderr,"large values truncated: need %d bins or larger binSize than %d\n",truncation, binSize);
    printf("Maximum value %f\n", maxVal);
    }

/* Figure out range that has data, maximum data
 * value and optionally compute averages. */
if (aveCol >= 0)
    {
    double ave, maxAve = -BIGNUM;
    for (i=0; i<maxBinCount; ++i)
        {
        int count = hist[i];
        if (count != 0)
            {
            ave = total[i]/count;
            if (maxAve < ave) maxAve = ave;
            if (minData > i) minData = i;
            if (maxData < i) maxData = i;
            }
        }
    maxCt = maxAve;
    }
else
    {
    for (i=0; i<maxBinCount; ++i)
        {
        int count = hist[i];
        if (count != 0)
            {
            if (maxCount < count) maxCount = count;
            if (minData > i) minData = i;
            if (maxData < i) maxData = i;
            }
        }
    maxCt = maxCount;
    }

/* Normally only the bins between the first and last populated ones are
 * shown; at higher verbosity every bin is shown. */
begin = minData;
end = maxData + 1;
if (verboseLevel()>1)
    {
    begin = 0;
    end = maxBinCount;
    }

if (probValues || freq)
    {
    totalCounts = 0;
    for (i=begin; i<end; ++i)
        totalCounts += hist[i];
    verbose(2,"#\ttotal data values: %llu\n", totalCounts);
    if (totalCounts < 1)
        errAbort("ERROR: No bins with any data ?\n");
    }

/* maxCt scales the asterisk bars, so it gets the same transforms as the
 * per-bin values below. */
if (freq)
    maxCt = maxCt/(double)totalCounts;
if (doLog)
    maxCt = log(maxCt);

if (verboseLevel()>1)
    {
    if (noStar) {
        if (probValues)
            printf("# bin\tValue\t\tprob-Value\t\tlog2(prob-Value)\tCPD\t1-CPD\n");
        else
            printf("# bin  Value	ascii graph\n");
    } else
        printf("# bin  Value	ascii graph\n");
    }

cpd = 0.0;	/*	cumulative probability distribution	*/
/* Output results.  The iteration with i == end is the extra row for the
 * out-of-range tally; it is only printed when something was out of range. */
for (i=begin; i<=end; ++i)
    {
    double ct;
    double binStartR = 0.0;
    int binStart = 0;
    long count;

    if (i != end)
        count = hist[i];
    else
        {
        if (totalTooBig == 0)
            break;
        count = totalTooBig;
        }
    if (real)
        binStartR = i*binSizeR + minValR;
    else
        binStart = i*binSize + minVal;

    if (aveCol >= 0)
        {
        /* total[] only covers the real bins.  No sum is kept for the
         * out-of-range row, and end may equal maxBinCount, so it must not
         * be indexed there. */
        if (i != end && count > 0)
            ct = total[i]/count;
        else
            ct = 0;
        }
    else if (freq)
        {
        ct = count/(double)totalCounts;
        }
    else
        {
        ct = count;
        }
    if (doLog)
        ct = log(ct);
    if (noStar)
        {
        if (i == end)
            printf("<minVal or >=");
        if (verboseLevel()>1)
            printf("%02d\t", i);
        if (real)
            {
            if (probValues)
                {
                if (verboseLevel()>1)
                    printf("%g:%g", binStartR, binStartR+binSizeR);
                else
                    printf("%3d %g:%g", i, binStartR, binStartR+binSizeR);
                }
            else
                printf("%3d %g:%g\t%f", i, binStartR, binStartR+binSizeR, ct);
            }
        else
            {
            printf("%d\t%f", binStart, ct);
            }
        if (probValues)
            {
            if (ct > 0)
                {
                cpd += (double)ct/(double)totalCounts;
                printf("\t%f\t%f\t%f\t%f\n", (double)ct/(double)totalCounts,
                    log((double)ct/(double)totalCounts)/log(2.0), cpd, 1.0-cpd);
                }
            else
                printf("\t0.0      \tN/A     \t%f\t%f\n", cpd, 1.0-cpd);
            }
        else
            printf("\n");
        }
    else
        {
        /* Scale the bar so the largest bin gets 60 asterisks.  maxCt can
         * be zero (for instance log of a peak count of 1); dividing by it
         * yields NaN or infinity, which cannot be converted to int, so
         * such rows get an empty bar. */
        double stars = (maxCt != 0) ? ct * 60.0 / maxCt : 0;
        int astCount = (stars > 0) ? round(stars) : 0;
        if (i == end)
            printf("<minVal or >=");
        if (verboseLevel()>1)
            printf("%2d ", i);
        if (real)
            printf("%f ", binStartR);
        else
            printf("%3d ", binStart);
        for (j=0; j<astCount; ++j)
            putchar('*');
        if ((aveCol >= 0) || freq)
            printf(" %f\n", ct);
        else
            printf(" %ld\n", count);
        }
    }
}	/*	textHistogram()	*/