void scopCollapse(char *inFeat, char *inModel, char *outFeat, char *outDesc, char *outKnownTo) /* scopCollapse - Convert SCOP model to SCOP ID. Also make id/name converter file.. */ { /* Process inModel file, writing three columns to output, and keeping * a couple of columns in a hash */ struct hash *modelToSeed = hashNew(18); struct hash *seedToScop = hashNew(16); struct lineFile *lf = lineFileOpen(inModel, TRUE); FILE *f = mustOpen(outDesc, "w"); char *modRow[5]; while (lineFileRowTab(lf, modRow)) { char *seedId = modRow[2]; hashAdd(modelToSeed, modRow[0], cloneString(seedId) ); if (!hashLookup(seedToScop, seedId)) { char *scopId = modRow[1]; hashAdd(seedToScop, seedId, cloneString(scopId)); fprintf(f, "%s\t%s\t%s\n", scopId, seedId, modRow[4]); } } carefulClose(&f); lineFileClose(&lf); /* Process in-feature. We make up a structure for each protein here. */ struct hash *protHash = hashNew(18); struct protInfo *prot, *protList = NULL; lf = lineFileOpen(inFeat, TRUE); char *featRow[6]; while (lineFileRow(lf, featRow)) { prot = hashFindVal(protHash, featRow[0]); if (prot == NULL) { AllocVar(prot); hashAddSaveName(protHash, featRow[0], prot, &prot->name); slAddHead(&protList, prot); } struct protFeature *feature; AllocVar(feature); feature->protein = prot->name; feature->start = lineFileNeedNum(lf, featRow, 1); feature->end = lineFileNeedNum(lf, featRow, 2); feature->name = hashMustFindVal(modelToSeed, featRow[3]); feature->eVal = lineFileNeedDouble(lf, featRow, 4); feature->score = lineFileNeedDouble(lf, featRow, 5); slAddHead(&prot->featureList, feature); } lineFileClose(&lf); slReverse(&protList); f = mustOpen(outFeat, "w"); FILE *fKnownTo = mustOpen(outKnownTo, "w"); for (prot = protList; prot != NULL; prot = prot->next) outputProt(prot, seedToScop, f, fKnownTo); carefulClose(&f); carefulClose(&fKnownTo); }
void colTransform(char *column, char *input, char *addFactor, char *mulFactor, char *output)
/* colTransform - Add and/or multiply column by constant.
 *   column    - 1-based column number, as text
 *   input     - whitespace-separated input file
 *   addFactor - constant added after multiplication, as text
 *   mulFactor - constant the column is multiplied by, as text
 *   output    - tab-separated output; the chosen column is replaced by
 *               x*mul+add printed with %g, other columns are copied as-is */
{
int col = sqlUnsigned(column) - 1;	/* Convert to zero-based index. */
double add = sqlDouble(addFactor);
double mul = sqlDouble(mulFactor);
struct lineFile *lf = lineFileOpen(input, TRUE);
FILE *f = mustOpen(output, "w");
char *words[512];
int wordCount;
while ((wordCount = lineFileChop(lf, words)) > 0)
    {
    /* words[col] is read below, so the line needs col+1 words, not col. */
    lineFileExpectAtLeast(lf, col+1, wordCount);
    double x = lineFileNeedDouble(lf, words, col);
    int i;
    for (i=0; i<wordCount; ++i)
        {
        if (i != 0)
            fputc('\t', f);
        if (i == col)
            fprintf(f, "%g", x*mul+add);
        else
            fputs(words[i], f);
        }
    fputc('\n', f);
    }
/* NOTE(review): lf is not closed in this function, and a column argument of
 * "0" yields col == -1 -- consider handling both at the call site or here. */
carefulClose(&f);
}
struct wigSection *wigSectionRead(struct lineFile *lf) /* Parse out next section of wig. */ { static double *vals = NULL; static int valAlloc = 0; /* Get "fixedStep" line and parse it. */ char *line; if (!lineFileNextReal(lf, &line)) return NULL; char *pattern = "fixedStep "; int patSize = 10; if (!startsWith(pattern, line)) errAbort("Expecting fixedStep line %d of %s", lf->lineIx, lf->fileName); line += patSize; struct hash *varHash = hashVarLine(line, lf->lineIx); int step = sqlUnsigned(requiredVal(lf, varHash, "step")); int start = sqlUnsigned(requiredVal(lf, varHash, "start")); char *chrom = cloneString(requiredVal(lf, varHash, "chrom")); hashFree(&varHash); /* Parse out numbers until next fixedStep. */ int valCount = 0; int i; for (;;) { if (!lineFileNextReal(lf, &line)) break; if (startsWith(pattern, line)) { lineFileReuse(lf); break; } for (i=0; i<step; ++i) { if (valCount >= valAlloc) { int newAlloc = valAlloc + 1024; ExpandArray(vals, valAlloc, newAlloc); valAlloc = newAlloc; } vals[valCount] = lineFileNeedDouble(lf, &line, 0); ++valCount; } } /* Create wigSection. */ struct wigSection *section; AllocVar(section); section->chrom = chrom; section->chromStart = start; section->chromEnd = start + valCount; section->vals = CloneArray(vals, valCount); return section; }
struct hash *hashWeights(char *in)
/* Return hash full of weights.  Each two-column row of file `in` becomes a
 * struct weight stored under the column-0 string; column 1 is parsed as a
 * double into its value field, and its type field points at the hash's copy
 * of the key. */
{
struct lineFile *weightFile = lineFileOpen(in, TRUE);
struct hash *weightsByType = hashNew(0);
char *fields[2];
for (;;)
    {
    if (!lineFileRow(weightFile, fields))
        break;
    struct weight *entry;
    AllocVar(entry);
    entry->value = lineFileNeedDouble(weightFile, fields, 1);
    hashAddSaveName(weightsByType, fields[0], entry, &entry->type);
    }
lineFileClose(&weightFile);
return weightsByType;
}
void raIntoCdwRepeatQa(char *fileName, struct sqlConnection *conn, long long fileId)
/* Read in two column file and put it into cdwQaRepeat table.
 * Column 0 is the repeat class, column 1 the map ratio (must parse as a
 * number); one insert is issued per row, all tagged with fileId. */
{
struct lineFile *src = lineFileOpen(fileName, TRUE);
char *cols[2];
char insertSql[512];
while (lineFileRow(src, cols))
    {
    double ratio = lineFileNeedDouble(src, cols, 1);
    sqlSafef(insertSql, sizeof(insertSql),
        "insert into cdwQaRepeat (fileId,repeatClass,mapRatio) values (%lld, \"%s\", %g)",
        fileId, cols[0], ratio);
    sqlUpdate(conn, insertSql);
    }
lineFileClose(&src);
}
static struct visiMatch *readMatchFile(char *fileName)
/* Read in match file.  Each row is an image ID (integer) and a weight
 * (double).  Returns the matches in file order, or NULL if the file can't be
 * opened or has no rows. */
{
struct lineFile *lf = lineFileMayOpen(fileName, TRUE);
if (lf == NULL)
    return NULL;

struct visiMatch *matches = NULL;
char *fields[2];
while (lineFileRow(lf, fields))
    {
    struct visiMatch *m;
    AllocVar(m);
    m->imageId = lineFileNeedNum(lf, fields, 0);
    m->weight = lineFileNeedDouble(lf, fields, 1);
    slAddHead(&matches, m);
    }
lineFileClose(&lf);
slReverse(&matches);	/* slAddHead built the list backwards. */
return matches;
}
static double minOfCol(char *fileName, int colIx)
/* Return minimum value seen in given column of file.  colIx is zero-based;
 * every row must have at least colIx+1 columns and a number in that column.
 * Aborts if the file has no rows. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int colsNeeded = colIx+1;
char *fields[colsNeeded];
boolean seenRow = FALSE;
double smallest = 0;
while (lineFileNextRow(lf, fields, colsNeeded))
    {
    double cur = lineFileNeedDouble(lf, fields, colIx);
    if (seenRow)
        {
        if (cur < smallest)
            smallest = cur;
        }
    else
        {
        /* First row seeds the minimum. */
        smallest = cur;
        seenRow = TRUE;
        }
    }
lineFileClose(&lf);
if (!seenRow)
    errAbort("No data in %s", fileName);
return smallest;
}
void convertVariableStepSection(struct lineFile *lf, struct hash *vars, struct bgOut *out) /* Read through section and output. */ { char *chrom = requiredVar(vars, "chrom", lf); int span = sqlUnsigned(optionalVar(vars, "span", "1")); char *line; while (lineFileNextReal(lf, &line)) { line = skipLeadingSpaces(line); if (isalpha(line[0])) { lineFileReuse(lf); break; } char *words[3]; int wordCount = chopLine(line, words); if (wordCount != 2) errAbort("Expecting exactly two numbers line %d of %s", lf->lineIx, lf->fileName); int start = lineFileNeedNum(lf, words, 0) - 1; double val = lineFileNeedDouble(lf, words, 1); bgOutWrite(out, chrom, start, start+span, val); } }
static void parseBedGraphSection(struct lineFile *lf, boolean clipDontDie,
        struct hash *chromSizeHash, struct lm *lm,
        int itemsPerSlot, struct bwgSection **pSectionList)
/* Parse out bedGraph section until we get to something that is not in bedGraph format.
 *   lf            - input; the terminating step-type line, if any, is pushed back
 *   clipDontDie   - if TRUE, items ending past the chromosome size are warned
 *                   about and dropped; otherwise they abort
 *   chromSizeHash - if non-NULL, chromosome sizes are looked up here
 *   lm            - local memory pool that items and sections are allocated from
 *   itemsPerSlot  - maximum number of items per output section
 *   pSectionList  - new sections are added to the head of this list */
{
/* Set up hash and list to store chromosomes. */
struct hash *chromHash = hashNew(0);
struct bedGraphChrom *chrom, *chromList = NULL;

/* Collect lines in items on appropriate chromosomes. */
struct bwgBedGraphItem *item;
char *line;
while (lineFileNextReal(lf, &line))
    {
    /* Check for end of section. */
    if (stepTypeLine(line))
        {
        lineFileReuse(lf);
        break;
        }

    /* Parse out our line and make sure it has exactly 4 columns. */
    char *words[5];
    int wordCount = chopLine(line, words);
    lineFileExpectWords(lf, 4, wordCount);

    /* Get chromosome. */
    char *chromName = words[0];
    chrom = hashFindVal(chromHash, chromName);
    if (chrom == NULL)
        {
        lmAllocVar(chromHash->lm, chrom);
        hashAddSaveName(chromHash, chromName, chrom, &chrom->name);
        chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM);
        slAddHead(&chromList, chrom);
        }

    /* Convert to item and add to chromosome list. */
    lmAllocVar(lm, item);
    item->start = lineFileNeedNum(lf, words, 1);
    item->end = lineFileNeedNum(lf, words, 2);
    item->val = lineFileNeedDouble(lf, words, 3);

    /* Do sanity checking on coordinates. */
    if (item->start > item->end)
        errAbort("bedGraph error: start (%u) after end (%u) line %d of %s.",
                item->start, item->end, lf->lineIx, lf->fileName);
    if (item->end > chrom->size)
        {
        warn("bedGraph error line %d of %s: chromosome %s has size %u but item ends at %u",
                lf->lineIx, lf->fileName, chrom->name, chrom->size, item->end);
        if (!clipDontDie)
            noWarnAbort();
        }
    else
        {
        slAddHead(&chrom->itemList, item);
        }
    }
slSort(&chromList, bedGraphChromCmpName);

/* Loop through each chromosome and output the item list, broken into sections
 * for that chrom. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* A chromosome can end up with no items when all of them were clipped;
     * there is nothing to check or output for it. */
    if (chrom->itemList == NULL)
        continue;

    slSort(&chrom->itemList, bwgBedGraphItemCmp);

    /* Check to make sure no overlap between items. */
    struct bwgBedGraphItem *prevItem = chrom->itemList, *nextItem;
    for (nextItem = prevItem->next; nextItem != NULL; nextItem = nextItem->next)
        {
        if (prevItem->end > nextItem->start)
            errAbort("Overlap between %s %d %d and %s %d %d.\nPlease remove overlaps and try again",
                chrom->name, prevItem->start, prevItem->end,
                chrom->name, nextItem->start, nextItem->end);
        prevItem = nextItem;
        }

    /* Break up into sections of no more than items-per-slot size. */
    struct bwgBedGraphItem *startItem, *endItem, *nextStartItem = chrom->itemList;
    for (startItem = chrom->itemList; startItem != NULL; startItem = nextStartItem)
        {
        /* Find end item of this section, and start item for next section.
         * Terminate list at end item. */
        int sectionSize = 0;
        int i;
        endItem = startItem;
        for (i=0; i<itemsPerSlot; ++i)
            {
            if (nextStartItem == NULL)
                break;
            endItem = nextStartItem;
            nextStartItem = nextStartItem->next;
            ++sectionSize;
            }
        endItem->next = NULL;

        /* Fill in section and add it to section list. */
        struct bwgSection *section;
        lmAllocVar(lm, section);
        section->chrom = cloneString(chrom->name);
        section->start = startItem->start;
        section->end = endItem->end;
        section->type = bwgTypeBedGraph;
        section->items.bedGraphList = startItem;
        section->itemCount = sectionSize;
        slAddHead(pSectionList, section);
        }
    }

/* Free up hash, no longer needed. Free's chromList as a side effect since chromList is in
 * hash's memory. */
hashFree(&chromHash);
chromList = NULL;
}
static void parseVariableStepSection(struct lineFile *lf, boolean clipDontDie,
        struct lm *lm, int itemsPerSlot, char *chrom, int chromSize, bits32 span,
        struct bwgSection **pSectionList)
/* Read the two column data in section until get to end.
 *   lf           - input; the line that ends the section, if any, is pushed back
 *   clipDontDie  - if TRUE, items ending past chromSize are warned about and
 *                  dropped; otherwise they abort
 *   lm           - local memory pool the output sections are allocated from
 *   itemsPerSlot - maximum number of items per output section
 *   chrom, chromSize, span - apply to every item in the section
 *   pSectionList - new sections are added to the head of this list
 * Each data line is a one-based position followed by a value. */
{
struct lm *lmLocal = lmInit(0);

/* Stream through section until get to end of file or next section,
 * adding values from the two columns to list. */
char *words[2];
char *line;
struct bwgVariableStepItem *item, *nextItem, *itemList = NULL;
int originalSectionSize = 0;
while (lineFileNextReal(lf, &line))
    {
    if (steppedSectionEnd(line, 2))
        {
        lineFileReuse(lf);
        break;
        }
    /* Both words are read below, so insist that both are present rather than
     * parsing whatever words[1] happens to hold. */
    int wordCount = chopLine(line, words);
    if (wordCount < 2)
        errAbort("line %d of %s: expecting 2 words, got %d",
            lf->lineIx, lf->fileName, wordCount);
    lmAllocVar(lmLocal, item);
    int start = lineFileNeedNum(lf, words, 0);
    if (start <= 0)
        {
        errAbort("line %d of %s: zero or negative chromosome coordinate not allowed",
            lf->lineIx, lf->fileName);
        }
    item->start = start - 1;	/* Convert to zero-based. */
    item->val = lineFileNeedDouble(lf, words, 1);
    if (item->start + span > chromSize)
        {
        warn("line %d of %s: chromosome %s has %u bases, but item ends at %u",
            lf->lineIx, lf->fileName, chrom, chromSize, item->start + span);
        if (!clipDontDie)
            noWarnAbort();
        }
    else
        {
        slAddHead(&itemList, item);
        ++originalSectionSize;
        }
    }
slSort(&itemList, bwgVariableStepItemCmp);

/* Make sure no overlap between items. */
if (itemList != NULL)
    {
    item = itemList;
    for (nextItem = item->next; nextItem != NULL; nextItem = nextItem->next)
        {
        if (item->start + span > nextItem->start)
            errAbort("Overlap on %s between items starting at %d and %d.\n"
                     "Please remove overlaps and try again",
                    chrom, item->start, nextItem->start);
        item = nextItem;
        }
    }

/* Break up into sections of no more than items-per-slot size. */
int sizeLeft = originalSectionSize;
for (item = itemList; item != NULL; )
    {
    /* Figure out size of this section */
    int sectionSize = sizeLeft;
    if (sectionSize > itemsPerSlot)
        sectionSize = itemsPerSlot;
    sizeLeft -= sectionSize;

    /* Convert from list to array representation. */
    struct bwgVariableStepPacked *packed, *p;
    p = lmAllocArray(lm, packed, sectionSize);
    int i;
    for (i=0; i<sectionSize; ++i)
        {
        p->start = item->start;
        p->val = item->val;
        item = item->next;
        ++p;
        }

    /* Fill in section and add it to list. */
    struct bwgSection *section;
    lmAllocVar(lm, section);
    section->chrom = chrom;
    section->start = packed[0].start;
    section->end = packed[sectionSize-1].start + span;
    section->type = bwgTypeVariableStep;
    section->items.variableStepPacked = packed;
    section->itemSpan = span;
    section->itemCount = sectionSize;
    slAddHead(pSectionList, section);
    }
lmCleanup(&lmLocal);
}
static void parseFixedStepSection(struct lineFile *lf, boolean clipDontDie,
        struct lm *lm, int itemsPerSlot, char *chrom, bits32 chromSize, bits32 span,
        bits32 sectionStart, bits32 step, struct bwgSection **pSectionList)
/* Read the single column data in section until get to end.
 *   lf            - input; the line that ends the section, if any, is pushed back
 *   clipDontDie   - if TRUE, values that would end past chromSize are warned
 *                   about and dropped; otherwise they abort
 *   lm            - local memory pool the output sections are allocated from
 *   itemsPerSlot  - maximum number of values per output section
 *   chrom, chromSize, span, step - apply to the whole section
 *   sectionStart  - start of the first value
 *   pSectionList  - new sections are added to the head of this list */
{
struct lm *scratch = lmInit(0);	/* Holds the temporary list only. */

/* Gather the values into a list, tracking where each one would sit. */
char *words[1];
char *line;
struct bwgFixedStepItem *item, *items = NULL;
int keptCount = 0;
bits32 pos = sectionStart;
for (;;)
    {
    if (!lineFileNextReal(lf, &line))
        break;
    if (steppedSectionEnd(line, 1))
        {
        lineFileReuse(lf);
        break;
        }
    chopLine(line, words);
    lmAllocVar(scratch, item);
    item->val = lineFileNeedDouble(lf, words, 0);
    if (pos + span <= chromSize)
        {
        slAddHead(&items, item);
        ++keptCount;
        }
    else
        {
        warn("line %d of %s: chromosome %s has %u bases, but item ends at %u",
            lf->lineIx, lf->fileName, chrom, chromSize, pos + span);
        if (!clipDontDie)
            noWarnAbort();
        }
    pos += step;
    }
slReverse(&items);

/* Carve the list into packed sections of at most itemsPerSlot values. */
int remaining = keptCount;
item = items;
while (item != NULL)
    {
    int sectionSize = (remaining > itemsPerSlot) ? itemsPerSlot : remaining;
    remaining -= sectionSize;

    /* Describe the section; sectionStart advances to the next section. */
    struct bwgSection *section;
    lmAllocVar(lm, section);
    section->chrom = chrom;
    section->start = sectionStart;
    sectionStart += sectionSize * step;
    section->end = sectionStart - step + span;
    section->type = bwgTypeFixedStep;
    section->itemStep = step;
    section->itemSpan = span;
    section->itemCount = sectionSize;

    /* Copy this section's values from the list into an array. */
    struct bwgFixedStepPacked *packed;
    section->items.fixedStepPacked = lmAllocArray(lm, packed, sectionSize);
    int i;
    for (i = 0; i < sectionSize; ++i, item = item->next)
        packed[i].val = item->val;

    slAddHead(pSectionList, section);
    }
lmCleanup(&scratch);
}
struct bwgSection *bwgParseWig(
        char *fileName,       /* Name of ascii wig file. */
        boolean clipDontDie,  /* Skip items outside chromosome rather than aborting. */
        struct hash *chromSizeHash,  /* If non-NULL items checked to be inside chromosome. */
        int maxSectionSize,   /* Biggest size of a section.  100 - 100,000 is usual range. */
        struct lm *lm)        /* Memory pool to allocate from. */
/* Parse out ascii wig file - allocating memory in lm.
 * Lines containing "chrom=" start a stepped section; any other line must be
 * a four-column bedGraph line and starts a bedGraph section.  Returns the
 * sections sorted with bwgSectionCmp, after checking that neighboring
 * sections on the same chromosome do not overlap. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
struct bwgSection *sectionList = NULL;

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

while (lineFileNextReal(lf, &line))
    {
    verbose(2, "processing %s\n", line);
    if (stringIn("chrom=", line))
        parseSteppedSection(lf, clipDontDie, chromSizeHash, line, lm, maxSectionSize, &sectionList);
    else
        {
        /* Check for bed... Chop a copy, since the line itself is pushed back
         * to be parsed again by the bedGraph parser. */
        char *dupe = cloneString(line);
        char *words[5];
        int wordCount = chopLine(dupe, words);
        if (wordCount != 4)
            errAbort("Unrecognized line %d of %s:\n%s\n", lf->lineIx, lf->fileName, line);

        /* Parse out a bed graph line just to check numerical format. */
        char *chrom = words[0];
        int start = lineFileNeedNum(lf, words, 1);
        int end = lineFileNeedNum(lf, words, 2);
        double val = lineFileNeedDouble(lf, words, 3);
        verbose(2, "bedGraph %s:%d-%d@%g\n", chrom, start, end, val);
        /* NOTE(review): dupe is not freed here and lf is not closed below. */

        /* Push back line and call bed parser. */
        lineFileReuse(lf);
        parseBedGraphSection(lf, clipDontDie, chromSizeHash, lm, maxSectionSize, &sectionList);
        }
    }

slSort(&sectionList, bwgSectionCmp);

/* Check for overlap at section level. */
struct bwgSection *section, *nextSection;
for (section = sectionList; section != NULL; section = nextSection)
    {
    nextSection = section->next;
    if (nextSection != NULL)
        {
        if (sameString(section->chrom, nextSection->chrom))
            {
            if (section->end > nextSection->start)
                {
                errAbort("There's more than one value for %s base %d (in coordinates that start with 1).\n",
                    section->chrom, nextSection->start+1);
                }
            }
        }
    }

return sectionList;
}
struct encodePeak *encodePeakLineFileLoad(char **row, enum encodePeakType pt, struct lineFile *lf) /* From a linefile line, load an encodePeak row. Errors outputted */ /* have line numbers, etc. Does more error checking as well. */ { struct encodePeak *peak; if (!pt) errAbort("Unknown peak type set for track"); AllocVar(peak); peak->chrom = cloneString(row[0]); peak->chromStart = lineFileNeedNum(lf, row, 1); peak->chromEnd = lineFileNeedNum(lf, row, 2); peak->peak = -1; if (peak->chromEnd < 1) lineFileAbort(lf, "chromEnd less than 1 (%d)", peak->chromEnd); if (peak->chromEnd < peak->chromStart) lineFileAbort(lf, "chromStart after chromEnd (%d > %d)", peak->chromStart, peak->chromEnd); peak->name = cloneString(row[3]); peak->score = lineFileNeedNum(lf, row, 4); safecpy(peak->strand, sizeof(peak->strand), row[5]); if (peak->strand[0] != '+' && peak->strand[0] != '-' && peak->strand[0] != '.') lineFileAbort(lf, "Expecting +, -, or . in strand"); if (pt != gappedPeak) /* deal with signalValue, pValue, qValue, and peak */ { peak->signalValue = (float)lineFileNeedDouble(lf, row, 6); peak->pValue = (float)lineFileNeedDouble(lf, row, 7); peak->qValue = (float)lineFileNeedDouble(lf, row, 8); if ((pt == narrowPeak) || (pt == encodePeak)) { peak->peak = lineFileNeedNum(lf, row, 9); if (peak->peak >= (int)peak->chromEnd) lineFileAbort(lf, "peak site past chromEnd (%d > %d)", peak->peak, peak->chromEnd); } } else /* must be gappedPeak */ /* deal with thickStart, thickEnd, itemRgb even though they're not used */ { int thickStart = lineFileNeedNum(lf, row, 6); int thickEnd = lineFileNeedNum(lf, row, 7); int itemRgb = 0; char *comma; /* Allow comma separated list of rgb values here */ comma = strchr(row[8], ','); if (comma) itemRgb = bedParseRgb(row[8]); else itemRgb = lineFileNeedNum(lf, row, 8); if ((thickStart != 0) || (thickEnd != 0) || (itemRgb != 0)) lineFileAbort(lf, "thickStart, thickEnd, and itemRgb columns not used in gappedPeak type, set all to 0"); } /* Deal with 
blocks */ if ((pt == gappedPeak) || (pt == encodePeak)) { int i, count; int lastEnd, lastStart; int blockCountIx, blockSizesIx, blockStartsIx; if (pt == gappedPeak) { blockCountIx = 9; blockSizesIx = 10; blockStartsIx = 11; } else { blockCountIx = 10; blockSizesIx = 11; blockStartsIx = 12; } peak->blockCount = lineFileNeedNum(lf, row, blockCountIx); sqlUnsignedDynamicArray(row[blockSizesIx], &peak->blockSizes, &count); if (count != peak->blockCount) lineFileAbort(lf, "expecting %d elements in array", peak->blockCount); sqlUnsignedDynamicArray(row[blockStartsIx], &peak->blockStarts, &count); if (count != peak->blockCount) lineFileAbort(lf, "expecting %d elements in array", peak->blockCount); // tell the user if they appear to be using absolute starts rather than // relative... easy to forget! Also check block order, coord ranges... lastStart = -1; lastEnd = 0; for (i=0; i < peak->blockCount; i++) { if (peak->blockStarts[i]+peak->chromStart >= peak->chromEnd) { if (peak->blockStarts[i] >= peak->chromStart) lineFileAbort(lf, "BED blockStarts offsets must be relative to chromStart, " "not absolute. Try subtracting chromStart from each offset " "in blockStarts."); else lineFileAbort(lf, "BED blockStarts[i]+chromStart must be less than chromEnd."); } lastStart = peak->blockStarts[i]; lastEnd = peak->chromStart + peak->blockStarts[i] + peak->blockSizes[i]; } if (peak->blockStarts[0] != 0) lineFileAbort(lf, "BED blocks must span chromStart to chromEnd. " "BED blockStarts[0] must be 0 (==%d) so that (chromStart + " "blockStarts[0]) equals chromStart.", peak->blockStarts[0]); i = peak->blockCount-1; if ((peak->chromStart + peak->blockStarts[i] + peak->blockSizes[i]) != peak->chromEnd) { lineFileAbort(lf, "BED blocks must span chromStart to chromEnd. 
(chromStart + " "blockStarts[last] + blockSizes[last]) must equal chromEnd."); } } if (pt == gappedPeak) /* deal with final three columns of a gappedPeak */ { peak->signalValue = (float)lineFileNeedDouble(lf, row, 12); peak->pValue = (float)lineFileNeedDouble(lf, row, 13); peak->qValue = (float)lineFileNeedDouble(lf, row, 14); } return peak; }
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf,
        int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
        int resTryCount, int resScales[], int resSizes[],
        boolean doCompress, bits32 *retMaxSectionSize)
/* Read through lf, chunking it into sections that get written to f.  Save info
 * about sections in bounds.
 *   usageList    - chromosomes in the order they appear in lf
 *   lf           - sorted four-column bedGraph input
 *   itemsPerSlot - maximum number of items per section
 *   bounds       - array of sectionCount entries, filled in one per section
 *   f            - output file
 *   resTryCount, resScales, resSizes - for each of resTryCount scales,
 *                  resSizes[] is incremented as items are counted at
 *                  resScales[] resolution
 *   doCompress   - if TRUE each section is passed through zCompress before writing
 *   retMaxSectionSize - receives the largest uncompressed section size in bytes */
{
int maxSectionSize = 0;
struct bbiChromUsage *usage = usageList;
int itemIx = 0, sectionIx = 0;
bits32 reserved32 = 0;
UBYTE reserved8 = 0;
struct sectionItem items[itemsPerSlot];
struct sectionItem *lastB = NULL;
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
struct dyString *stream = dyStringNew(0);

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    /* Get next line of input if any. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* End of input with nothing buffered only happens when the input has no
     * data lines at all.  There is no section to write then, and items[]
     * must not be indexed with itemIx-1. */
    if (rowSize == 0 && itemIx == 0)
        break;

    /* Figure out whether need to output section. */
    boolean sameChrom = FALSE;
    if (rowSize > 0)
        sameChrom = sameString(row[0], usage->name);
    if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom)
        {
        /* Figure out section position. */
        bits32 chromId = usage->id;
        bits32 sectionStart = items[0].start;
        bits32 sectionEnd = items[itemIx-1].end;

        /* Save section info for indexing. */
        assert(sectionIx < sectionCount);
        struct bbiBoundsArray *section = &bounds[sectionIx++];
        section->offset = ftell(f);
        section->range.chromIx = chromId;
        section->range.start = sectionStart;
        section->range.end = sectionEnd;

        /* Output section header to stream. */
        dyStringClear(stream);
        UBYTE type = bwgTypeBedGraph;
        bits16 itemCount = itemIx;
        dyStringWriteOne(stream, chromId);      // chromId
        dyStringWriteOne(stream, sectionStart); // start
        dyStringWriteOne(stream, sectionEnd);   // end
        dyStringWriteOne(stream, reserved32);   // itemStep
        dyStringWriteOne(stream, reserved32);   // itemSpan
        dyStringWriteOne(stream, type);         // type
        dyStringWriteOne(stream, reserved8);    // reserved
        dyStringWriteOne(stream, itemCount);    // itemCount

        /* Output each item in section to stream. */
        int i;
        for (i=0; i<itemIx; ++i)
            {
            struct sectionItem *item = &items[i];
            dyStringWriteOne(stream, item->start);
            dyStringWriteOne(stream, item->end);
            dyStringWriteOne(stream, item->val);
            }

        /* Save stream to file, compressing if need be. */
        if (stream->stringSize > maxSectionSize)
            maxSectionSize = stream->stringSize;
        if (doCompress)
            {
            size_t maxCompSize = zCompBufSize(stream->stringSize);
            char compBuf[maxCompSize];
            int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
            mustWrite(f, compBuf, compSize);
            }
        else
            mustWrite(f, stream->string, stream->stringSize);

        /* If at end of input we are done. */
        if (rowSize == 0)
            break;

        /* Set up for next section. */
        itemIx = 0;

        if (!sameChrom)
            {
            usage = usage->next;
            assert(usage != NULL);
            if (!sameString(row[0], usage->name))
                errAbort("read %s, expecting %s on line %d in file %s\n",
                    row[0], usage->name, lf->lineIx, lf->fileName);
            assert(sameString(row[0], usage->name));
            /* New chromosome: forget the previous item and restart zoom counting. */
            lastB = NULL;
            for (resTry = 0; resTry < resTryCount; ++resTry)
                resEnds[resTry] = 0;
            }
        }

    /* Parse out input. */
    lineFileExpectWords(lf, 4, rowSize);
    bits32 start = lineFileNeedNum(lf, row, 1);
    bits32 end = lineFileNeedNum(lf, row, 2);
    float val = lineFileNeedDouble(lf, row, 3);

    /* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */
    if (start > end)
        errAbort("Start (%u) after end (%u) line %d of %s", start, end, lf->lineIx, lf->fileName);
    if (lastB != NULL)
        {
        if (lastB->start > start)
            errAbort("BedGraph not sorted on start line %d of %s", lf->lineIx, lf->fileName);
        if (lastB->end > start)
            errAbort("Overlapping regions in bedGraph line %d of %s", lf->lineIx, lf->fileName);
        }

    /* Do zoom counting. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        bits32 resEnd = resEnds[resTry];
        if (start >= resEnd)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = start + resScales[resTry];
            }
        while (end > resEnd)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = resEnd + resScales[resTry];
            }
        }

    /* Save values in output array. */
    struct sectionItem *b = &items[itemIx];
    b->start = start;
    b->end = end;
    b->val = val;
    lastB = b;
    itemIx += 1;
    }
assert(sectionIx == sectionCount);

*retMaxSectionSize = maxSectionSize;
}
void ticksToWig(int startTick, char *inTable, char *outDensity, char *outAverage) /* ticksToWig - Convert tab-delimited file of Unix time ticks, and possibly also * numerical values to wig file(s).. */ { struct lineFile *lf = lineFileOpen(inTable, TRUE); FILE *densityFile = mustOpen(outDensity, "w"); printVarStepHead(densityFile); FILE *averageFile = NULL; if (outAverage != NULL) { averageFile = mustOpen(outAverage, "w"); printVarStepHead(averageFile); } int colsToParse = 1 + max(tickCol, valCol); char *row[colsToParse]; time_t curTick = 0; int sameTickCount = 0; double tickTotal = 0; double val = 0; time_t tick; while (lineFileNextRow(lf, row, colsToParse)) { tick = lineFileNeedNum(lf, row, tickCol); if (averageFile != NULL) val = lineFileNeedDouble(lf, row, valCol); if (curTick != tick) { if (curTick > tick) errAbort("Input isn't sorted - %ld > %ld line %d of %s\n", (long)curTick, (long)tick, lf->lineIx, lf->fileName); if (startTick == 0) startTick = tick; if (sameTickCount > 0) { fprintf(densityFile, "%ld\t%d\n", curTick - startTick + 1, sameTickCount); time_t i; for (i=curTick+1; i<tick; ++i) { fprintf(densityFile, "%ld\t%d\n", i - startTick + 1, 0); } if (averageFile != NULL) { fprintf(averageFile, "%ld\t%f\n", (long)curTick - startTick + 1, tickTotal/sameTickCount); tickTotal = 0; } sameTickCount = 0; } curTick = tick; } tickTotal += val; sameTickCount += 1; } if (sameTickCount > 0) { fprintf(densityFile, "%ld\t%d\n", curTick - startTick + 1, sameTickCount); if (averageFile != NULL) fprintf(averageFile, "%ld\t%f\n", (long)curTick - startTick + 1, tickTotal/sameTickCount); } carefulClose(&densityFile); carefulClose(&averageFile); }
void regChromiaMergeWindows(char *input, char *output) /* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated * by Chromia.. */ { struct lineFile *lf = lineFileOpen(input, TRUE); char *row[32]; int rowSize = 0; FILE *f = mustOpen(output, "w"); char lastLabel[128]; lastLabel[0] = 0; char lastChrom[128]; lastChrom[0] = 0; int lastChromStart = 0, lastChromEnd = 0; int regionStart = 0, regionEnd = 0; double sumOfScores = 0.0; for (;;) { /* Get next line chopped into words. Break at end of file. Check to make sure * all lines have same number of words. */ int thisRowSize = lineFileChop(lf, row); if (thisRowSize == 0) break; if (rowSize == 0) rowSize = thisRowSize; else if (rowSize != thisRowSize) { errAbort("First line of %s has %d words, but there are %d words in line %d", lf->fileName, rowSize, thisRowSize, lf->lineIx); } /* Convert row into local variables. */ char *chrom = row[0]; int chromStart = lineFileNeedNum(lf, row, 1); int chromEnd = lineFileNeedNum(lf, row, 2); char *label = row[labelColumn]; double score = lineFileNeedDouble(lf, row, scoreColumn); /* Make sure file is sorted with no overlap.*/ if (sameString(chrom, lastChrom)) { int gapSize = chromStart - lastChromEnd; if (gapSize < 0) { if (chromStart < lastChromStart) errAbort("%s is not sorted. %s %d %d followed by %s %d %d line %d", lf->fileName, lastChrom, lastChromStart, lastChromEnd, chrom, chromStart, chromEnd, lf->lineIx); else errAbort("%s has overlaps. %s %d %d followed by %s %d %d line %d", lf->fileName, lastChrom, lastChromStart, lastChromEnd, chrom, chromStart, chromEnd, lf->lineIx); } } /* Subtract noise threshold from score, and if not still positive just ignore line. */ score -= inNoiseThreshold; if (score > 0) { /* See if we have entered a new region. 
*/ boolean newRegion = FALSE; if (sameString(chrom, lastChrom)) { int gapSize = chromStart - lastChromEnd; if (gapSize > maxGap) newRegion = TRUE; } else newRegion = TRUE; if (!sameString(label, lastLabel)) newRegion = TRUE; /* Got new region. Output old region if any, and initialize new region. */ if (newRegion) { if (regionStart != regionEnd) outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores); regionStart = chromStart; sumOfScores = 0; } /* Update region. */ regionEnd = chromEnd; sumOfScores += score; /* Keep track of this row so can compare it to next row. */ safecpy(lastChrom, sizeof(lastChrom), chrom); safecpy(lastLabel, sizeof(lastLabel), label); lastChromStart = chromStart; lastChromEnd = chromEnd; } } outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores); carefulClose(&f); lineFileClose(&lf); }
/* Read through the file and determine min,max and thus range * set bin size and minimum value */ static void autoScale(char *inFile) { int wordCount; char *row[256]; unsigned long dataCount = 0; double min = HUGE; double max = - HUGE; double range = 0.0; struct lineFile *lf = lineFileOpen(inFile, TRUE); while ((wordCount = lineFileChop(lf, row))) { double d; if ((wordCount <= col) || (wordCount <= aveCol)) errAbort("Not enough words line %d of %s", lf->lineIx, lf->fileName); d = lineFileNeedDouble(lf, row, col); if ( d < min ) min = d; if ( d > max ) max = d; ++dataCount; } lineFileClose(&lf); range = max - min; if (range < 0.0) errAbort("range of data invalid: %g = [%g:%g]", range, min, max); maxBinCount = autoscale; if (real) { minValR = min; /* need to make binSizeR slightly larger to get the last data point * in the last bin. This is a floating point round off situation. */ binSizeR = (range + (range/1000000.0)) / maxBinCount; } else { minVal = (int) floor(min); binSize = (int)ceil(range / maxBinCount); if (binSize < 1) binSize = 1; verbose(1, "#\tautoscale data range: (%d - %d)/%d = %d\n", (int) ceil(max), minVal, maxBinCount, binSize); } verbose(2, "#\tautoscale number of data values: %lu\n", dataCount); verbose(2, "#\tautoscale maxBinCount: %d\n", maxBinCount); if (real) { verbose(2, "#\tautoscale data range: %g = [%g:%g]\n", range, minValR, max); verbose(2, "#\tautoscale minVal: %g\n", minValR); verbose(2, "#\tautoscale binSize: %g\n", binSizeR); } else { verbose(2, "#\tautoscale data range: %g = [%d:%d]\n", range, minVal, (int) ceil(max)); verbose(2, "#\tautoscale minVal: %d\n", minVal); verbose(2, "#\tautoscale binSize: %d\n", binSize); } } /* autoScale() */
static void textHistogram(char *inFile) /* textHistogram - Make a histogram in ascii. */ { double *hist = NULL; double *total = NULL; char *row[256]; int wordCount; struct lineFile *lf = lineFileOpen(inFile, TRUE); int i,j; int minData = maxBinCount, maxData = 0; int totalTooBig = 0; double maxCount = 0; double maxCt; double maxVal = 0; int truncation = 0; int begin, end; unsigned long long totalCounts = 0; double cpd; /* Allocate histogram and optionally space for * second column totals. */ AllocArray(hist, maxBinCount); if (aveCol >= 0) AllocArray(total, maxBinCount); while (skip-- > 0) wordCount = lineFileChop(lf, row); /* Go through each line of input file accumulating * data. */ while ((wordCount = lineFileChop(lf, row))) { int x; /* will become the index into hist[] */ if (wordCount <= col || wordCount <= aveCol) errAbort("Not enough words line %d of %s", lf->lineIx, lf->fileName); x = -1; if (real) /* for real data, work in real space to find index */ { double d; d = lineFileNeedDouble(lf, row, col); if (d > maxVal) maxVal = d; if (d >= minValR) { d -= minValR; x = (int) floor(d / binSizeR); } } else { x = lineFileNeedNum(lf, row, col); if (x > maxVal) maxVal = x; if (x >= minVal) { x -= minVal; x /= binSize; } } /* index x is calculated, accumulate it when in range */ if (x >= 0 && x < maxBinCount) { hist[x] += 1; if (aveCol >= 0) { double a; a = lineFileNeedDouble(lf, row, aveCol); total[x] += a; } } else { verbose(2, "truncating index %d\n", x); truncation = (x > truncation) ? x : truncation; totalTooBig += 1; } } lineFileClose(&lf); if (truncation > 0) { if (real) fprintf(stderr,"large values truncated: need %d bins or larger binSize than %g\n",truncation, binSizeR); else fprintf(stderr,"large values truncated: need %d bins or larger binSize than %d\n",truncation, binSize); printf("Maximum value %f\n", maxVal); } /* Figure out range that has data, maximum data * value and optionally compute averages. 
*/ if (aveCol >= 0) { double ave, maxAve = -BIGNUM; for (i=0; i<maxBinCount; ++i) { int count = hist[i]; if (count != 0) { ave = total[i]/count; if (maxAve < ave) maxAve = ave; if (minData > i) minData = i; if (maxData < i) maxData = i; } } maxCt = maxAve; } else { for (i=0; i<maxBinCount; ++i) { int count = hist[i]; if (count != 0) { if (maxCount < count) maxCount = count; if (minData > i) minData = i; if (maxData < i) maxData = i; } } maxCt = maxCount; } begin = minData; end = maxData + 1; if (verboseLevel()>1) { begin = 0; end = maxBinCount; } if (probValues || freq) { totalCounts = 0; for (i=begin; i<end; ++i) totalCounts += hist[i]; verbose(2,"#\ttotal data values: %llu\n", totalCounts); if (totalCounts < 1) errAbort("ERROR: No bins with any data ?\n"); } if (freq) maxCt = maxCt/(double)totalCounts; if (doLog) maxCt = log(maxCt); if (verboseLevel()>1) { if (noStar) { if (probValues) printf("# bin\tValue\t\tprob-Value\t\tlog2(prob-Value)\tCPD\t1-CPD\n"); else printf("# bin Value ascii graph\n"); } else printf("# bin Value ascii graph\n"); } cpd = 0.0; /* cumulative probability distribution */ /* Output results. 
*/ for (i=begin; i<=end; ++i) { double ct; double binStartR = 0.0; int binStart = 0; long count; if (i != end) count = hist[i]; else { if (totalTooBig == 0) break; count = totalTooBig; } if (real) binStartR = i*binSizeR + minValR; else binStart = i*binSize + minVal; if (aveCol >= 0) { if (count > 0) ct = total[i]/count; else ct = 0; } else if (freq) { ct = count/(double)totalCounts; } else { ct = count; } if (doLog) ct = log(ct); if (noStar) { if (i == end) printf("<minVal or >="); if (verboseLevel()>1) printf("%02d\t", i); if (real) { if (probValues) { if (verboseLevel()>1) printf("%g:%g", binStartR, binStartR+binSizeR); else printf("%3d %g:%g", i, binStartR, binStartR+binSizeR); } else printf("%3d %g:%g\t%f", i, binStartR, binStartR+binSizeR, ct); } else { printf("%d\t%f", binStart, ct); } if (probValues) { if (ct > 0) { cpd += (double)ct/(double)totalCounts; printf("\t%f\t%f\t%f\t%f\n", (double)ct/(double)totalCounts, log((double)ct/(double)totalCounts)/log(2.0), cpd, 1.0-cpd); } else printf("\t0.0 \tN/A \t%f\t%f\n", cpd, 1.0-cpd); } else printf("\n"); } else { int astCount = round(ct * 60.0 / maxCt); if (i == end) printf("<minVal or >="); if (verboseLevel()>1) printf("%2d ", i); if (real) printf("%f ", binStartR); else printf("%3d ", binStart); for (j=0; j<astCount; ++j) putchar('*'); if ((aveCol >= 0) || freq) printf(" %f\n", ct); else printf(" %ld\n", count); } } } /* textHistogram() */