void explainSome(char *database, Bits *homo, Bits *once, Bits *bits, char *chrom,
        int chromSize, struct sqlConnection *conn, char *trackSpec, char *homologyTrack)
/* Explain some of homology: print one report row for a track.
 *
 * homo  - bitmap of the homology track; only read here.
 * once  - bitmap that is narrowed on every call with a track: the bits that
 *         this track newly accounts for are cleared from it (presumably it
 *         starts out as a copy of homo - confirm with the caller).
 * bits  - scratch bitmap of chromSize bits; cleared and overwritten here.
 * trackSpec - track to load via fbOrTableBits, or NULL to print the row for
 *         the homology track itself (labelled with homologyTrack).
 *
 * Columns printed: label, track size, size of track AND homology, track as
 * percent of chromSize, overlap as percent of track, overlap as percent of
 * homology, newly explained percent of homology, cumulative explained
 * percent of homology.  The percentages are plain floating point divisions;
 * there is no guard against a zero chromSize, trackSize or homoSize. */
{
    int trackSize = 0, homoSize = 0, andSize = 0, cumSize = 0, newSize = 0;

    homoSize = bitCountRange(homo, 0, chromSize);
    bitClear(bits, chromSize);
    if (trackSpec != NULL)
    {
        fbOrTableBits(database, bits, trackSpec, chrom, chromSize, conn);
        trackSize = bitCountRange(bits, 0, chromSize);

        /* Part of the track that lies inside homology. */
        bitAnd(bits, homo, chromSize);
        andSize = bitCountRange(bits, 0, chromSize);

        /* Part of that overlap which is still set in once. */
        bitAnd(bits, once, chromSize);
        newSize = bitCountRange(bits, 0, chromSize);

        /* Clear the newly counted bits from once, so whatever is left in
         * once is what has not been accounted for yet. */
        bitNot(bits, chromSize);
        bitAnd(once, bits, chromSize);
        cumSize = homoSize - bitCountRange(once, 0, chromSize);
    }
    else
    {
        trackSpec = homologyTrack;
        trackSize = andSize = homoSize;
        cumSize = newSize = 0;
    }
    printf("%-21s %8d %8d %5.2f%% %6.2f%% %6.2f%% %5.2f%% %5.2f%%\n",
           trackSpec, trackSize, andSize,
           100.0*trackSize/chromSize, 100.0*andSize/trackSize,
           100.0*andSize/homoSize, 100.0*newSize/homoSize,
           100.0*cumSize/homoSize);
}
static int countBasesOverlap(struct bed *bedItem, Bits *bits, boolean hasBlocks, int chromSize)
/* Return the number of bases belonging to bedItem covered by bits.
 * Aborts if the item ends past chromSize.  A zero-length item is scored
 * against the base just before it and the base at its position.  When
 * hasBlocks is set only the item's blocks are counted, otherwise the whole
 * chromStart..chromEnd span is. */
{
    const int itemStart = bedItem->chromStart;
    const int itemEnd = bedItem->chromEnd;
    int covered = 0;

    if (itemEnd > chromSize)
        errAbort("Item out of range [0,%d): %s %s:%d-%d", chromSize,
                 (bedItem->name ? bedItem->name : ""), bedItem->chrom,
                 itemStart, itemEnd);

    if (itemStart == itemEnd)
    {
        /* Zero-size item: look at the neighbouring bases instead, skipping
         * any neighbour that falls outside [0,chromSize). */
        int pos;
        for (pos = itemStart - 1; pos <= itemEnd; pos++)
        {
            if (pos < 0 || pos >= chromSize)
                continue;
            if (bitReadOne(bits, pos))
                covered++;
        }
        return covered;
    }

    if (!hasBlocks)
        return bitCountRange(bits, itemStart, itemEnd - itemStart);

    {
        int blockIx;
        for (blockIx = 0; blockIx < bedItem->blockCount; blockIx++)
        {
            /* Block starts are stored relative to chromStart. */
            int blockStart = bedItem->chromStart + bedItem->chromStarts[blockIx];
            covered += bitCountRange(bits, blockStart, bedItem->blockSizes[blockIx]);
        }
    }
    return covered;
}
boolean chainUsed(struct chain *chain, struct chrom *qChrom, struct chrom *tChrom) /* Look at bitmaps to see if chain intersects any part of * chromosome on either side that is not used. Then mark * newly used parts. */ { struct cBlock *b; boolean anyOpen = FALSE; for (b = chain->blockList; b != NULL; b = b->next) { int size = b->qEnd - b->qStart; if (bitCountRange(qChrom->bits, b->qStart, size) != size) { anyOpen = TRUE; break; } size = b->tEnd - b->tStart; if (bitCountRange(tChrom->bits, b->tStart, size) != size) { anyOpen = TRUE; break; } } if (anyOpen) { for (b = chain->blockList; b != NULL; b = b->next) { setWithPad(qChrom, b->qStart, b->qEnd); setWithPad(tChrom, b->tStart, b->tEnd); } } return anyOpen; }
void bitsToBins(Bits *bits, char *chrom, int chromSize, FILE *binFile, int binSize, int binOverlap)
/* Write out binned counts of bits.
 *
 * One tab-separated line is written per bin: chrom, bin start, bin end,
 * number of set bits in the bin, and a name of the form chrom.N where N is
 * bin/binOverlap+1.  Note that binOverlap is used as the step between
 * consecutive bin starts.  The last line covers whatever remains between
 * the final bin start and chromSize.
 *
 * Does nothing if binFile is NULL.  A non-positive binOverlap would divide
 * by zero in the bin name and would never advance the loop, so it is
 * reported on stderr and nothing is written. */
{
    int bin, count;

    if (!binFile)
        return;
    if (binOverlap <= 0)
    {
        fprintf(stderr, "bitsToBins: binOverlap must be positive (got %d), no bins written for %s\n",
                binOverlap, chrom);
        return;
    }

    /* Full-size bins. */
    for (bin = 0; bin + binSize < chromSize; bin += binOverlap)
    {
        count = bitCountRange(bits, bin, binSize);
        fprintf(binFile, "%s\t%d\t%d\t%d\t%s.%d\n",
                chrom, bin, bin+binSize, count, chrom, bin/binOverlap+1);
    }

    /* Trailing bin, truncated at the end of the chromosome. */
    count = bitCountRange(bits, bin, chromSize-bin);
    fprintf(binFile, "%s\t%d\t%d\t%d\t%s.%d\n",
            chrom, bin, chromSize, count, chrom, bin/binOverlap+1);
}
static void basesCoveredFromBits(struct covStats *cov)
/* Set cov->basesCovered to the number of set bits in cov->bits over the
 * length of cov->region, then free the bitmap. */
{
    int span = cov->region->end - cov->region->start;

    /* The bitmap is indexed from 0, i.e. relative to the region start. */
    cov->basesCovered = bitCountRange(cov->bits, 0, span);
    bitFree(&cov->bits);
}
void makeDeletes(struct sqlConnection *conn, struct chromInfo *chrom, FILE *f) /* Generate SQL that kills tet alignments on simple repeats. */ { struct wabaChromHit *wchList = NULL, *wch; struct rmskOut ro; int tetSize; int repSize; int start, end; int delCount = 0; int totCount = 0; Bits *b = NULL; printf(" Loading all tet alignments on %s...\n", chrom->chrom); wchList = wchLoadAll(conn, chrom->chrom); printf(" Got %d alignments\n", slCount(wchList)); b = getMaskedBits(conn, chrom); for (wch = wchList; wch != NULL; wch = wch->next) { tetSize = wch->chromEnd - wch->chromStart; repSize = bitCountRange(b, wch->chromStart, tetSize); ++totCount; if (repSize * 2 > tetSize) { ++delCount; makeDelete(chrom->chrom, wch, f); } } bitFree(&b); if (totCount > 0) printf("Deleted %d of %d (%4.2f%%)\n", delCount, totCount, (100.0)*delCount/(double)totCount); wchFreeList(&wchList); }
void addBufIntervalInfo(double *valBuf, Bits *covBuf, int start, int end,
        int *pSumSize, int *pSumCoverage, double *pSumVal)
/* Accumulate statistics for the interval [start,end) into the running sums:
 * the interval length into *pSumSize, the number of set bits of covBuf in
 * the interval into *pSumCoverage, and the sum of valBuf over the interval
 * into *pSumVal. */
{
    const int span = end - start;
    double total = 0;
    int ix = start;

    *pSumSize += span;
    *pSumCoverage += bitCountRange(covBuf, start, span);

    /* Sum into a local first and add it to the running total once. */
    while (ix < end)
    {
        total += valBuf[ix];
        ++ix;
    }
    *pSumVal += total;
}
void maskFeatures(char *database, struct sqlConnection *conn, char *chrom,
        int chromSize, Bits *maskBits)
/* Mask out bits we're not interested in for conservation: OR each of a
 * fixed list of track specs into maskBits with fbOrTableBits, then print
 * how many bits of the chromosome ended up set. */
{
    /* Track specs, ORed in this order. */
    static char *maskSpecs[] =
    {
        "gap",
        "refGene:exon:12",
        "mrna:exon:12",
        "ensGene:exon:12",
        "softberryGene:exon:12",
        "twinscan:exon:12",
        "xenoMrna:exon:12",
        "intronEst:exon:12",
        "anyMrnaCov",
        "rmsk",
    };
    const int specCount = (int)(sizeof(maskSpecs) / sizeof(maskSpecs[0]));
    int specIx;

    for (specIx = 0; specIx < specCount; ++specIx)
        fbOrTableBits(database, maskBits, maskSpecs[specIx], chrom, chromSize, conn);

    printf("%s: %d bits masked\n", chrom, bitCountRange(maskBits, 0, chromSize));
}
static void visiSearcherWeedResults(struct visiSearcher *searcher, struct sqlConnection *conn) /* Get rid of images that are just partial matches, and also * images that are private. This leaks a little memory - the * matches that are weeded out.*/ { struct visiMatch *newList = NULL, *match, *next, key; int wordCount = searcher->wordCount; struct dyString *query = dyStringNew(0); struct sqlResult *sr; char **row; int passCount = 0; /* Construct query to fetch all non-private imageId's in matchList. */ dyStringAppend(query, "select image.id from image,submissionSet " "where submissionSet.privateUser = 0 " "and submissionSet.id = image.submissionSet " "and image.id in ("); for (match = searcher->matchList; match != NULL; match = next) { next = match->next; if (bitCountRange(match->wordBits, 0, wordCount) == wordCount) { if (passCount != 0) dyStringAppendC(query, ','); dyStringPrintf(query, "%d", match->imageId); ++passCount; } } dyStringAppendC(query, ')'); /* Execute query, and put corresponding images on newList. */ if (passCount > 0) { sr = sqlGetResult(conn, query->string); while ((row = sqlNextRow(sr)) != NULL) { key.imageId = sqlUnsigned(row[0]); match = rbTreeFind(searcher->tree, &key); if (match == NULL) internalErr(); slAddHead(&newList, match); } slReverse(&newList); } searcher->matchList = newList; dyStringFree(&query); }
double averageInRegion(struct bigWigValsOnChrom *chromVals, int start, int size)
/* Return average value in region where there is data: the sum of valBuf
 * over [start, start+size) divided by the number of set bits of covBuf in
 * that range.  Returns 0.0 when no bit is set. */
{
    int covered = bitCountRange(chromVals->covBuf, start, size);
    double *vals;
    double total = 0;
    int offset;

    if (covered == 0)
        return 0.0;

    /* Every position in the range is added, covered or not. */
    vals = chromVals->valBuf + start;
    for (offset = 0; offset < size; ++offset)
        total += vals[offset];
    return total/covered;
}
void bitsToRegions(Bits *bits, char *chrom, int chromSize, struct bed *bedList, FILE *bedOutFile)
/* Write out counts of bits in regions defined by bed elements.
 * Only bed items on chrom are used.  Each produces one tab-separated line:
 * chrom, start, end, number of set bits in the item's range, and a name of
 * the form chrom.N where N counts the lines written for this call.  Does
 * nothing if bedOutFile is NULL.  chromSize is not used. */
{
    struct bed *item;
    int regionIx = 0;

    if (bedOutFile == NULL)
        return;

    for (item = bedList; item != NULL; item = item->next)
    {
        if (!differentString(item->chrom, chrom))
        {
            int setBits = bitCountRange(bits, item->chromStart,
                                        item->chromEnd - item->chromStart);
            regionIx += 1;
            fprintf(bedOutFile, "%s\t%d\t%d\t%d\t%s.%d\n",
                    chrom, item->chromStart, item->chromEnd, setBits, chrom, regionIx);
        }
    }
}
void chromFeatureBits(struct sqlConnection *conn,char *database, char *chrom, int tableCount, char *tables[], FILE *bedFile, FILE *faFile, FILE *binFile, struct bed *bedRegionList, FILE *bedOutFile, int chromSize, int *retChromBits, int *retFirstTableBits, int *retSecondTableBits) /* featureBits - Correlate tables via bitmap projections and booleans * on one chromosome. */ { int i; Bits *acc = NULL; Bits *bits = NULL; char *table; acc = bitAlloc(chromSize); bits = bitAlloc(chromSize); for (i=0; i<tableCount; ++i) { boolean not = FALSE; table = tables[i]; if (table[0] == '!') { not = TRUE; ++table; } if (i == 0) { orTable(database, acc, table, chrom, chromSize, conn); if (not) bitNot(acc, chromSize); if (retFirstTableBits != NULL) *retFirstTableBits = bitCountRange(acc, 0, chromSize); } else { bitClear(bits, chromSize); orTable(database, bits, table, chrom, chromSize, conn); if (not) bitNot(bits, chromSize); if (i == 1 && retSecondTableBits != NULL) *retSecondTableBits = bitCountRange(bits, 0, chromSize); /* feature/bug - the above does not respect minSize */ if (orLogic) bitOr(acc, bits, chromSize); else bitAnd(acc, bits, chromSize); } } if (notResults) bitNot(acc, chromSize); *retChromBits = bitCountRange(acc, 0, chromSize); if (bedFile != NULL || faFile != NULL) { minSize = optionInt("minSize", minSize); bitsToBed(database, acc, chrom, chromSize, bedFile, faFile, minSize); } if (binFile != NULL) { binSize = optionInt("binSize", binSize); binOverlap = optionInt("binOverlap", binOverlap); bitsToBins(acc, chrom, chromSize, binFile, binSize, binOverlap); } if (bedOutFile != NULL) bitsToRegions(acc, chrom, chromSize, bedRegionList, bedOutFile); bitFree(&acc); bitFree(&bits); }
void addToStats(struct stats *stats, Bits *aliBits, Bits *matchBits, Bits *geneBits,
        Bits *seqBits, struct region *r, FILE *f, struct scoredWindow **pWinList)
/* Step big window through region adding to stats.
 *
 * The region is walked in steps of bigStepSize with windows of bigWinSize
 * (truncated at the region end).  For each window with enough sequence:
 *   - small windows of smallWinSize are scored; one has data when at least
 *     75% of smallWinSize is set in aliBits and passes when
 *     matchCount/aliCount reaches threshold.  With 50 or more small windows
 *     having data the window is added to the conservation histogram.
 *   - gene density (geneBits count over seqBits count) is added to the gene
 *     histogram.
 *   - windows with both values go into the two-dimensional histogram and,
 *     when the window has no gaps, onto *pWinList.
 * Histogram entries are weighted by window size relative to bigWinSize.
 * If printWin is set the region name and the per-window values are written
 * to f.  Aborts if the region end precedes its start. */
{
    char *chrom = r->chrom;
    int chromStart = r->start;
    int chromEnd = r->end;
    int bigStart, bigEnd, smallStart, smallEnd;
    int aliCount, matchCount, geneCount, seqCount;
    int bigWeight;
    double consRatio = 0, geneRatio = 0;

    /* Do some sanity checking/error reporting */
    if (chromEnd < chromStart)
        errAbort("Out of range %s:%d-%d (%s)", chrom, chromStart, chromEnd, r->name);
    if (printWin)
        fprintf(f, "%s\n", r->name);

    for (bigStart = chromStart; bigStart < chromEnd; bigStart += bigStepSize)
    {
        int smallPassing = 0;   /* Count of small windows passing %ID threshold */
        int smallGotData = 0;   /* Count of small windows with alignment data */
        int consIx = -1;        /* Index into conservation histogram */
        int geneIx = -1;        /* Index into gene density histogram */
        int thisBigSize;

        /* Figure out boundaries of big window, and based on
         * size how much to weigh it in histogram */
        bigEnd = bigStart + bigWinSize;
        if (bigEnd > chromEnd)
            bigEnd = chromEnd;
        thisBigSize = bigEnd - bigStart;
        bigWeight = round(10.0 * thisBigSize / bigWinSize);

        /* Figure out number of non-N bases, and skip this window
         * if they amount to less than half of it.  A window with no
         * sequence at all is skipped too: a one-base window makes
         * thisBigSize/2 zero, and the gene density below would then be
         * 0/0, which must not be turned into a histogram index. */
        seqCount = bitCountRange(seqBits, bigStart, thisBigSize);
        if (seqCount == 0 || seqCount < thisBigSize/2)
            continue;

        /* Step through small windows inside big one to calculate
         * what percentage of small windows are conserved over
         * threshold. */
        for (smallStart = bigStart; smallStart < bigEnd; smallStart += smallWinSize)
        {
            smallEnd = smallStart + smallWinSize;
            if (smallEnd > chromEnd)
                smallEnd = chromEnd;
            aliCount = bitCountRange(aliBits, smallStart, smallEnd - smallStart);
            matchCount = bitCountRange(matchBits, smallStart, smallEnd - smallStart);

            /* See if enough of the small window aligns to
             * calculate percentage of bases aligning
             * accurately, and if so add small window to
             * data set. */
            if (aliCount >= 0.75 * smallWinSize)
            {
                double ratio = (double) matchCount/aliCount;
                smallGotData += 1;
                if (ratio >= threshold)
                    smallPassing += 1;
            }
        }

        /* If a reasonable number of small windows have
         * data, add statistics to conservation histogram. */
        if (smallGotData >= 50)
        {
            consRatio = (double) smallPassing/smallGotData;
            consIx = consRatio * histSize;
            if (consIx > histSize)
                consIx = histSize;
            stats->totalConsCount += bigWeight;
            stats->consCounts[consIx] += bigWeight;
        }

        /* Calculate gene density and save. */
        geneCount = bitCountRange(geneBits, bigStart, bigEnd - bigStart);
        geneRatio = (double)geneCount / seqCount;
        geneIx = geneRatio * geneScale * histSize;
        if (geneIx > histSize)
            geneIx = histSize;
        stats->totalGeneCount += bigWeight;
        stats->geneCounts[geneIx] += bigWeight;

        /* If valid gene density and conservation data then
         * add it to two-dimensional histogram */
        if (geneIx >= 0 && consIx >= 0)
        {
            if (printWin)
            {
                fprintf(f, " %s:%d-%d ", chrom, bigStart+1, bigEnd);
                fprintf(f, "gene %4.1f%% consNotTx %4.1f%%\n", 100*geneRatio, 100*consRatio);
            }
            stats->totalBothCount += bigWeight;
            stats->bothCounts[consIx][geneIx] += bigWeight;

            /* If no gaps add it to window list. */
            if (seqCount == bigWinSize)
            {
                struct scoredWindow *win;
                AllocVar(win);
                win->chrom = chrom;
                win->start = bigStart;
                win->geneRatio = geneRatio;
                win->consRatio = consRatio;
                slAddHead(pWinList, win);
            }
        }
    }
}
void splitByGap(char *inName, int pieceSize, char *outRoot, long long estSize) /* Split up file into pieces at most pieceSize bases long, at gap boundaries * if possible. */ { off_t pieces = (estSize + pieceSize-1)/pieceSize; int digits = digitsBaseTen(pieces); int minGapSize = optionInt("minGapSize", 1000); boolean noGapDrops = optionExists("noGapDrops"); int maxN = optionInt("maxN", pieceSize-1); boolean oneFile = optionExists("oneFile"); char fileName[512]; char dirOnly[256], noPath[128]; int pos, pieceIx = 0, writeCount = 0; struct dnaSeq seq; struct lineFile *lf = lineFileOpen(inName, TRUE); FILE *f = NULL; Bits *bits = NULL; int seqCount = 0; char *outFile = optionVal("out", NULL); char *liftFile = optionVal("lift", NULL); FILE *lift = NULL; ZeroVar(&seq); if (minGapSize < 1) errAbort("ERROR: minGapSize must be > 0"); splitPath(outRoot, dirOnly, noPath, NULL); if (oneFile) { sprintf(fileName, "%s.fa", outRoot); f = mustOpen(fileName, "w"); } else fileName[0] = '\0'; if (liftFile) lift = mustOpen(liftFile, "w"); while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { bits = bitAlloc(seq.size); setBitsN(seq.dna, seq.size, bits); ++seqCount; if (outFile != NULL) { if (seqCount > 1) errAbort("Can only handle in files with one sequence using out option"); bitsForOut(outFile, seq.size, bits); } pos = 0; while (pos < seq.size) { boolean gotGap = FALSE; int gapStart = 0; int gapSize = 0; int endSize = seq.size - pos; int thisSize = min(endSize, pieceSize); int startGapLen = 0; if (seq.dna[pos] == 'n' || seq.dna[pos] == 'N') { startGapLen = bitFindClear(bits, pos, endSize) - pos; verbose(3,"#\tstarting gap at %d for length: %d\n", pos, startGapLen ); } /* if a block is all gap for longer than minGapSize, then * keep it all together in one large piece */ if (startGapLen > minGapSize) { if (noGapDrops) { writeOneByGap(oneFile, outRoot, digits, &pieceIx, f, noPath, pos, startGapLen, &seq, lift, &writeCount, fileName); } else verbose(3,"#\tbeginning gap of %d 
size skipped\n", startGapLen); thisSize = startGapLen; } else if (thisSize > 0 && bitCountRange(bits, pos, thisSize) <= maxN) { if (endSize>pieceSize) /* otherwise chops tiny piece at very end */ { gotGap = findLastGap(&(seq.dna[pos]), thisSize, endSize, minGapSize, &gapStart, &gapSize); if (gotGap) thisSize = gapStart; } writeOneByGap(oneFile, outRoot, digits, &pieceIx, f, noPath, pos, thisSize, &seq, lift, &writeCount, fileName); } pos += thisSize; if (gotGap) { /* last block is all gap, write it all out */ /*if ((pos + gapSize) >= seq.size)*/ if (noGapDrops) { writeOneByGap(oneFile, outRoot, digits, &pieceIx, f, noPath, pos, gapSize, &seq ,lift, &writeCount, fileName); verbose(3, "#\tadding gapSize %d to pos %d -> %d and writing gap\n", gapSize, pos, pos+gapSize); } else verbose(3,"#\tadding gapSize %d to pos %d -> %d\n", gapSize, pos, pos+gapSize); pos += gapSize; } } bitFree(&bits); } carefulClose(&f); carefulClose(&lift); lineFileClose(&lf); printf("%d pieces of %d written\n", writeCount, pieceIx); }
void splitByCount(char *inName, int pieceSize, char *outRoot, off_t estSize, int extra) /* Split up file into pieces pieceSize long. */ { off_t pieces = (estSize + pieceSize-1)/pieceSize; int digits = digitsBaseTen(pieces); int maxN = optionInt("maxN", pieceSize-1); boolean oneFile = optionExists("oneFile"); char fileName[PATH_LEN]; char dirOnly[PATH_LEN], noPath[128]; int pos, pieceIx = 0, writeCount = 0; struct dnaSeq seq; struct lineFile *lf = lineFileOpen(inName, TRUE); FILE *f = NULL; Bits *bits = NULL; int seqCount = 0; char *outFile = optionVal("out", NULL); char *liftFile = optionVal("lift", NULL); FILE *lift = NULL; ZeroVar(&seq); splitPath(outRoot, dirOnly, noPath, NULL); if (oneFile) { sprintf(fileName, "%s.fa", outRoot); f = mustOpen(fileName, "w"); } if (liftFile) lift = mustOpen(liftFile, "w"); /* Count number of N's from s[0] to s[size-1]. * Treat any parts past end of string as N's. */ while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { bits = bitAlloc(seq.size); setBitsN(seq.dna, seq.size, bits); ++seqCount; if (outFile != NULL) { if (seqCount > 1) errAbort("Can only handle in files with one sequence using out option"); bitsForOut(outFile, seq.size, bits); } for (pos = 0; pos < seq.size; pos += pieceSize) { char numOut[128]; int thisSize = seq.size - pos; if (thisSize > (pieceSize + extra)) thisSize = pieceSize + extra; if ((thisSize <= extra) && (pos > 0)) break; /* nobody wants duplicate smaller than extra overhang */ if (bitCountRange(bits, pos, thisSize) <= maxN) { if (!oneFile) { mkOutPath(fileName, outRoot, digits, pieceIx); f = mustOpen(fileName, "w"); } sprintf(numOut, "%s%0*d", noPath, digits, pieceIx); faWriteNext(f, numOut, seq.dna + pos, thisSize); if (lift) fprintf(lift, "%d\t%s\t%d\t%s\t%d\n", pos, numOut, thisSize, seq.name, seq.size); ++writeCount; if (!oneFile) carefulClose(&f); } pieceIx++; } bitFree(&bits); } carefulClose(&f); carefulClose(&lift); lineFileClose(&lf); printf("%d pieces of %d written\n", writeCount, 
pieceIx); }
struct bbiInterval *intersectedFilteredBbiIntervalsOnRegion(struct sqlConnection *conn, struct bbiFile *bwf, struct region *region, enum wigCompare filterCmp, double filterLl, double filterUl, struct lm *lm) /* Get list of bbiIntervals (more-or-less bedGraph things from bigWig) out of bigWig file * and if necessary apply filter and intersection. Return list which is allocated in lm. */ { char *chrom = region->chrom; int chromSize = hChromSize(database, chrom); struct bbiInterval *iv, *ivList = bigWigIntervalQuery(bwf, chrom, region->start, region->end, lm); /* Run filter if necessary */ if (filterCmp != wigNoOp_e) { struct bbiInterval *next, *newList = NULL; for (iv = ivList; iv != NULL; iv = next) { next = iv->next; if (wigCompareValFilter(iv->val, filterCmp, filterLl, filterUl)) { slAddHead(&newList, iv); } } slReverse(&newList); ivList = newList; } /* Run intersection if necessary */ if (anyIntersection()) { boolean isBpWise = intersectionIsBpWise(); Bits *bits2 = bitsForIntersectingTable(conn, region, chromSize, isBpWise); struct bbiInterval *next, *newList = NULL; double moreThresh = cartCgiUsualDouble(cart, hgtaMoreThreshold, 0)*0.01; double lessThresh = cartCgiUsualDouble(cart, hgtaLessThreshold, 100)*0.01; char *op = cartString(cart, hgtaIntersectOp); for (iv = ivList; iv != NULL; iv = next) { next = iv->next; int start = iv->start; int size = iv->end - start; int overlap = bitCountRange(bits2, start, size); if (isBpWise) { if (overlap == size) { slAddHead(&newList, iv); } else if (overlap > 0) { /* Here we have to break things up. 
*/ double val = iv->val; struct bbiInterval *partIv = iv; // Reuse memory for first interval int s = iv->start, end = iv->end; for (;;) { s = bitFindSet(bits2, s, end); if (s >= end) break; int bitsSet = bitFindClear(bits2, s, end) - s; if (partIv == NULL) lmAllocVar(lm, partIv); partIv->start = s; partIv->end = s + bitsSet; partIv->val = val; slAddHead(&newList, partIv); partIv = NULL; s += bitsSet; if (s >= end) break; } } } else { double coverage = (double)overlap/size; if (intersectOverlapFilter(op, moreThresh, lessThresh, coverage)) { slAddHead(&newList, iv); } } } slReverse(&newList); ivList = newList; bitFree(&bits2); } return ivList; }