static int countBasesOverlap(struct bed *bedItem, Bits *bits, boolean hasBlocks,
			     int chromSize)
/* Return the number of bases belonging to bedItem covered by bits.
 *   bedItem   - item to score; chromStart/chromEnd are always read, and
 *               blockCount/chromStarts/blockSizes are read when hasBlocks is set.
 *   bits      - bit array indexed by chromosome position.
 *   hasBlocks - if TRUE, count block by block instead of over the whole span.
 *   chromSize - chromosome size; errAborts if bedItem->chromEnd is beyond it.
 * A zero-length item (chromStart == chromEnd) is scored against the two bases
 * adjacent to it, chromStart-1 and chromStart, where those fall inside
 * [0,chromSize), so it yields 0, 1 or 2. */
{
int count = 0;
int i;
if (bedItem->chromEnd > chromSize)
    errAbort("Item out of range [0,%d): %s %s:%d-%d",
	     chromSize, (bedItem->name ? bedItem->name : ""),
	     bedItem->chrom, bedItem->chromStart, bedItem->chromEnd);

if (bedItem->chromStart == bedItem->chromEnd)
    {
    /* Zero-size item: count overlap with adjacent bases.  The position is
     * copied into a signed int first: if chromStart is an unsigned field
     * (NOTE(review): it is in the stock UCSC bed.h - confirm for this tree),
     * chromStart-1 wraps for an item at position 0 and an unsigned loop
     * comparison would then skip the loop entirely, so the base at
     * position 0 would never be tested.  With a signed position the
     * i >= 0 guard below does its intended job of skipping only -1. */
    int pos = bedItem->chromStart;
    for (i = pos - 1; i <= pos; i++)
	if (i >= 0 && i < chromSize && bitReadOne(bits, i))
	    count++;
    }
else if (hasBlocks)
    {
    for (i=0; i < bedItem->blockCount; i++)
	{
	/* Block starts are stored relative to chromStart. */
	int start = bedItem->chromStart + bedItem->chromStarts[i];
	count += bitCountRange(bits, start, bedItem->blockSizes[i]);
	}
    }
else
    {
    count = bitCountRange(bits, bedItem->chromStart,
			  bedItem->chromEnd - bedItem->chromStart);
    }
return(count);
}
struct hash *chainReadUsedSwapLf(char *fileName, boolean swapQ, Bits *bits, struct lineFile *lf)
/* Read every chain from lf and return the kept ones in a hash keyed by
 * chain id (the key is the id formatted as lower-case hex).
 *   fileName - not used by this function; error messages take the file
 *              name from lf instead.
 *   swapQ    - if TRUE each kept chain is passed through chainSwap before
 *              it is stored.
 *   bits     - may be NULL.  When non-NULL only chains whose id has its bit
 *              set are kept; all others are freed as soon as they are read.
 *   lf       - open line file to read chains from.
 * errAborts if two kept chains share the same id. */
{
char nameBuf[16];
struct hash *hash = hashNew(18);
struct chain *chain;
while ((chain = chainRead(lf)) != NULL)
    {
    /* Drop chains that are not marked as used. */
    if (bits != NULL && !bitReadOne(bits, chain->id))
	{
	chainFree(&chain);
	continue;
	}
    safef(nameBuf, sizeof(nameBuf), "%x", chain->id);
    if (hashLookup(hash, nameBuf))
	errAbort("Duplicate chain %d ending line %d of %s", chain->id, lf->lineIx, lf->fileName);
    if (swapQ)
	chainSwap(chain);
    hashAdd(hash, nameBuf, chain);
    }
return hash;
}
int findCrossover(Bits *bits, int overlapStart, int overlapEnd)
/* Look for a clear bit, starting at the middle of the overlap and moving
 * outwards one step at a time; at each step the position above the middle
 * is tried before the one below it.  bits is indexed relative to
 * overlapStart (index 0 is overlapStart itself).
 * Returns the position of the first clear bit found, with overlapStart
 * added back in, or -1 if every bit in the overlap is set. */
{
int span = overlapEnd - overlapStart;
int middle = span/2;
int lastStep = middle + 1;
int step;
for (step = 0; step <= lastStep; ++step)
    {
    int above = middle + step;
    int below = middle - step;
    if (above < span && !bitReadOne(bits, above))
	return above + overlapStart;
    if (below >= 0 && !bitReadOne(bits, below))
	return below + overlapStart;
    }
return -1;
}
static int bitCountBasesOverlap(struct bed *bedItem, Bits *bits, boolean hasBlocks)
/* Count how many bases of bedItem have their bit set in bits, testing one
 * position at a time.
 *   bedItem   - item to score.
 *   bits      - bit array indexed by chromosome position.
 *   hasBlocks - if TRUE only positions inside the item's blocks are tested
 *               (block starts are relative to chromStart); otherwise every
 *               position in [chromStart,chromEnd) is tested.
 * Returns the number of set bits found. */
{
int covered = 0;
int pos;
int blockIx;

if (!hasBlocks)
    {
    /* Single span from chromStart up to, but not including, chromEnd. */
    pos = bedItem->chromStart;
    while (pos < bedItem->chromEnd)
	{
	if (bitReadOne(bits, pos))
	    ++covered;
	++pos;
	}
    return covered;
    }

for (blockIx = 0; blockIx < bedItem->blockCount; ++blockIx)
    {
    int blockStart = bedItem->chromStart + bedItem->chromStarts[blockIx];
    int blockEnd = blockStart + bedItem->blockSizes[blockIx];
    for (pos = blockStart; pos < blockEnd; ++pos)
	{
	if (bitReadOne(bits, pos))
	    ++covered;
	}
    }
return covered;
}
static void savePslx(char *chromName, int chromSize, int chromOffset,
    struct ffAli *ali, struct dnaSeq *tSeq, struct dnaSeq *qSeq,
    boolean isRc, enum ffStringency stringency, int minMatch, FILE *f,
    struct hash *t3Hash, boolean reportTargetStrand, boolean targetIsRc,
    struct hash *maskHash, int minIdentity, boolean qIsProt, boolean tIsProt,
    boolean saveSeq)
/* Tally one alignment and, if its identity score reaches minIdentity, write
 * it to f as one psl line (pslX, i.e. with the block sequences appended,
 * when saveSeq is TRUE).
 *   chromName, chromSize - target name and size as written to the output.
 *   chromOffset          - added to every target coordinate.
 *   ali                  - first block of the alignment; blocks are linked
 *                          through ->right.
 *   tSeq, qSeq           - target and query sequences the blocks point into.
 *   isRc                 - query strand is '-'; query start/end are flipped
 *                          against qSeq->size before output.
 *   stringency           - with ffCdna, target gaps are left out of the
 *                          gap penalty.
 *   f                    - output file.
 *   t3Hash               - if non-NULL, looked up by tSeq->name; the result
 *                          is passed to trans3GenoPos.
 *   reportTargetStrand   - also write the target strand character.
 *   targetIsRc           - target strand is '-'; target start/end are
 *                          flipped against chromSize before output.
 *   maskHash             - if non-NULL, looked up by tSeq->name to get a bit
 *                          array; matches at set positions count as repMatch.
 *   minIdentity          - minimum value of the roundingScale(1000, ...)
 *                          score needed for output.
 *   minMatch, qIsProt, tIsProt - not used in this function (the minMatch
 *                          test is done at a higher level).
 * If the stream is in an error state after writing, perror is called and
 * the function errAborts. */
{
/* Adapted from psLayout and extended to cope with protein as well as DNA
 * alignments. */
struct ffAli *lastBlock = ffRightmost(ali);
DNA *qBase = qSeq->dna;
DNA *tBase = tSeq->dna;
int qStart = ali->nStart - qBase;
int qEnd = lastBlock->nEnd - qBase;
int tStart, tEnd;
int matches = 0, mismatches = 0, repMatches = 0, nCount = 0;
int qGapCount = 0, qGapBases = 0;
int tGapCount = 0, tGapBases = 0;
int gapPenalty, identity;
struct trans3 *t3List = NULL;
Bits *maskBits = NULL;
struct ffAli *block;

if (maskHash != NULL)
    maskBits = hashMustFindVal(maskHash, tSeq->name);
if (t3Hash != NULL)
    t3List = hashMustFindVal(t3Hash, tSeq->name);
tStart = trans3GenoPos(ali->hStart, tSeq, t3List, FALSE) + chromOffset;
tEnd = trans3GenoPos(lastBlock->hEnd, tSeq, t3List, TRUE) + chromOffset;

/* First pass over the blocks: classify every aligned position and measure
 * the gap between each block and the one after it. */
block = ali;
while (block != NULL)
    {
    struct ffAli *next = block->right;
    DNA *qBlock = block->nStart;
    DNA *tBlock = block->hStart;
    int len = block->nEnd - block->nStart;
    int pos;

    for (pos = 0; pos < len; ++pos)
	{
	DNA q = qBlock[pos];
	DNA t = tBlock[pos];
	if (q == 'n' || t == 'n')
	    ++nCount;
	else if (q != t)
	    ++mismatches;
	else if (maskBits == NULL)
	    ++matches;
	else
	    {
	    /* Offset of this base within the target sequence. */
	    int tOff = tBlock + pos - tBase;
	    if (bitReadOne(maskBits, tOff))
		++repMatches;
	    else
		++matches;
	    }
	}

    if (next != NULL)
	{
	int nextTStart = trans3GenoPos(next->hStart, tSeq, t3List, FALSE) + chromOffset;
	int thisTEnd = trans3GenoPos(block->hEnd, tSeq, t3List, TRUE) + chromOffset;
	int tGap = nextTStart - thisTEnd;
	int qGap = next->nStart - block->nEnd;
	if (qGap != 0)
	    {
	    ++qGapCount;
	    qGapBases += qGap;
	    }
	if (tGap != 0)
	    {
	    ++tGapCount;
	    tGapBases += tGap;
	    }
	}
    block = next;
    }

/* Score it: each counted gap costs two matches. */
gapPenalty = qGapCount;
if (stringency != ffCdna)
    gapPenalty += tGapCount;
identity = roundingScale(1000, matches + repMatches - 2*gapPenalty,
	matches + repMatches + mismatches);
if (identity < minIdentity)
    return;

/* Flip coordinates for reverse-strand query and/or target. */
if (isRc)
    {
    int qSize = qSeq->size;
    int oldStart = qStart;
    qStart = qSize - qEnd;
    qEnd = qSize - oldStart;
    }
if (targetIsRc)
    {
    int oldStart = tStart;
    tStart = chromSize - tEnd;
    tEnd = chromSize - oldStart;
    }

/* Fixed columns. */
fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c",
    matches, mismatches, repMatches, nCount,
    qGapCount, qGapBases, tGapCount, tGapBases,
    (isRc ? '-' : '+'));
if (reportTargetStrand)
    fprintf(f, "%c", (targetIsRc ? '-' : '+') );
fprintf(f, "\t%s\t%d\t%d\t%d\t"
	   "%s\t%d\t%d\t%d\t%d\t",
    qSeq->name, qSeq->size, qStart, qEnd,
    chromName, chromSize, tStart, tEnd,
    ffAliCount(ali));

/* Comma separated lists: block sizes, query starts, target starts. */
for (block = ali; block != NULL; block = block->right)
    fprintf(f, "%ld,", (long)(block->nEnd - block->nStart));
fprintf(f, "\t");
for (block = ali; block != NULL; block = block->right)
    fprintf(f, "%ld,", (long)(block->nStart - qBase));
fprintf(f, "\t");
for (block = ali; block != NULL; block = block->right)
    fprintf(f, "%d,", trans3GenoPos(block->hStart, tSeq, t3List, FALSE) + chromOffset);

/* pslX extension: the query and then the target sequence of each block. */
if (saveSeq)
    {
    fputc('\t', f);
    for (block = ali; block != NULL; block = block->right)
	{
	mustWrite(f, block->nStart, block->nEnd - block->nStart);
	fputc(',', f);
	}
    fputc('\t', f);
    for (block = ali; block != NULL; block = block->right)
	{
	mustWrite(f, block->hStart, block->hEnd - block->hStart);
	fputc(',', f);
	}
    }
fprintf(f, "\n");

if (ferror(f))
    {
    perror("");
    errAbort("Write error to .psl");
    }
}
void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew(); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { long startBigQueryTime = clock1000(); boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; if (gotData) { double *valBuf = valsOnChrom->valBuf; Bits *covBuf = valsOnChrom->covBuf; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(targetTree); for (range = rangeList; range != NULL; range = range->next) { int s = range->start, e = range->end, i; for (i=s; i<=e; ++i) { if (bitReadOne(covBuf, i)) { double x = valBuf[i]; target->uniqOverlapBases += 1; target->overlapBases += x; } } 
} } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; } } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bigWigValsOnChromFree(&valsOnChrom); bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }