void indexedChainSubsetOnT(struct indexedChain *ixc, int subStart, int subEnd,
    struct chain **retSubChain, struct chain **retChainToFree)
/* Pull the part of an indexed chain lying in subStart-subEnd on the target.
 * The block index (ixc->blockTree) is queried for ranges overlapping the
 * window; the val of the first range returned is handed to
 * chainFastSubsetOnT as the place to start.  If the index reports no
 * overlap, both *retSubChain and *retChainToFree are set to NULL. */
{
struct range *firstHit = rangeTreeAllOverlapping(ixc->blockTree, subStart, subEnd);

if (firstHit != NULL)
    {
    chainFastSubsetOnT(ixc->chain, firstHit->val, subStart, subEnd,
        retSubChain, retChainToFree);
    return;
    }

/* Nothing in the index touches the requested window. */
*retChainToFree = NULL;
*retSubChain = NULL;
}
int chainBlockCoverage(struct indexedChain *ixc, int start, int end, int* blockStarts, int *blockSizes, int blockCount) /* Calculate how many of the blocks are covered at both block begin and * end by a chain. */ { int blocksCovered = 0; int i=0; /* Find the part of the chain of interest to us. */ struct range *rangeList = rangeTreeAllOverlapping(ixc->blockTree, start, end); /* Check to see how many of our exons the boxInList contains covers. For each block check to see if the blockStart and blockEnd are found in the boxInList. */ for(i=0; i<blockCount; i++) { boolean startFound = FALSE; int blockStart = blockStarts[i]; int blockEnd = blockStarts[i] + blockSizes[i]; struct range *r; /* Skip over bits of range list that are no longer relevant. */ while (rangeList != NULL && rangeList->end <= blockStart) rangeList = rangeList->next; /* Count up blocks covered on both ends. */ for (r = rangeList; r != NULL; r = r->next) { // CCCCCC CCCCC CCCCCC CCC CCCC // BBB BBBB BBB BBBBBBBB BBBB // yes no no no no yes if(r->start <= blockStart && r->end >= blockStart) startFound = TRUE; if(startFound && r->start <= blockEnd && r->end >= blockEnd) { blocksCovered++; break; } if (r->start > blockEnd) break; } } return blocksCovered; }
struct range *rangeTreeMaxOverlapping(struct rbTree *tree, int start, int end)
/* Find the range in tree that shares the most with start-end and return it,
 * or NULL when no range has a positive intersection.  On a tie the range
 * that comes first in the overlap list wins.  Not thread safe.  Trashes the
 * list used by rangeTreeAllOverlapping. */
{
struct range *winner = NULL;
int winnerSize = 0;
struct range *cur = rangeTreeAllOverlapping(tree, start, end);

while (cur != NULL)
    {
    int shared = rangeIntersection(cur->start, cur->end, start, end);
    if (shared > winnerSize)
        {
        winner = cur;
        winnerSize = shared;
        }
    cur = cur->next;
    }

/* Detach the result so it does not drag the rest of the overlap list along
 * (next could be set by calls to List functions). */
if (winner != NULL)
    winner->next = NULL;
return winner;
}
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new broken-up bed that excludes the part of the gene between CDS
 * breaks.  Also jiggles the cds->end coordinate to cope with the sequence we
 * remove.  Deals with transcript to genome coordinate mapping including
 * negative strand.  Be afraid, be very afraid!
 *
 * Parameters:
 *   cds - CDS evidence; its cdsStarts/cdsSizes arrays (cdsCount entries) are
 *         read and cds->end is MODIFIED: it is reduced by the size of every
 *         gap between consecutive CDS pieces.
 *   bed - source bed; read only.
 * Returns a newly allocated bed (chrom and name are cloned) with one block
 * per surviving exon piece.  Only chrom, chromStart, chromEnd, name, score,
 * strand[0], blockCount, blockSizes and chromStarts are assigned here.
 *
 * NOTE(review): newList is dereferenced below without a NULL check, so an
 * input whose exons are all swallowed by gaps (or that has no exons) would
 * crash - confirm callers cannot produce that. */
{
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're at it shrink outer CDS
 * since we are actually shrinking transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm;	/* Convenient place to allocate memory. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    {
    /* A gap runs from the end of CDS piece i to the start of piece i+1. */
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);
    }

/* Get list of exons in bed, flipped to reverse strand if need be.
 * The exon coordinates appear to be offsets from bed->chromStart: bedSize
 * is used as the flip length and bed->chromStart is added back when the new
 * bed is built - TODO confirm against bedToExonList. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates.  Check if exon needs breaking up, and if
 * so do so, as we copy it to new list. */
/* Copy exons to new list, breaking them up if need be.
 * txStartPos/txEndPos track the current exon's extent in transcript
 * coordinates (the running sum of exon sizes). */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;
for (exon = exonList; exon != NULL; exon = nextExon)
    {
    txEndPos = txStartPos + exon->end - exon->start;
    /* Save next now: slAddHead below overwrites exon->next. */
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
        {
	verbose(3, "Splitting exon because of CDS gap\n");
	/* Make up exons from current position up to next gap.  This is a little
	 * complicated by possibly the gap starting before the exon. */
	int exonStart = exon->start;	/* Cursor in exon (genome-offset) space. */
	int txStart = txStartPos;	/* Matching cursor in transcript space. */
	struct range *gap;
	for (gap = gapList; gap != NULL; gap = gap->next)
	    {
	    int txEnd = gap->start;
	    /* Only the part of the gap inside the rest of this exon counts. */
	    int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
	    int exonSize = txEnd - txStart;
	    if (exonSize > 0)
		{
		/* Emit the piece of exon lying before this gap. */
		lmAllocVar(lm, newExon);
		newExon->start = exonStart;
		newExon->end = exonStart + exonSize;
		slAddHead(&newList, newExon);
		}
	    else /* This case happens if gap starts before exon */
	        {
		exonSize = 0;
		}
	    /* Update current position in both transcript and genome space. */
	    exonStart += exonSize + gapSize;
	    txStart += exonSize + gapSize;
	    }

	/* Make up final exon from last gap to end, at least if we don't end in a gap. */
	if (exonStart < exon->end)
	    {
	    lmAllocVar(lm, newExon);
	    newExon->start = exonStart;
	    newExon->end = exon->end;
	    slAddHead(&newList, newExon);
	    }
	}
    else
        {
	/* Easy case where we don't intersect any gaps. */
	slAddHead(&newList, exon);
	}
    txStartPos= txEndPos;
    }
/* Pieces were pushed on the head, so restore the original order. */
slReverse(&newList);

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12 */
struct bed *newBed;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name  = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes,  newBed->blockCount);
AllocArray(newBed->chromStarts,  newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    {
    /* NOTE(review): chromStarts is filled with exon->start, i.e. an offset
     * from the ORIGINAL bed->chromStart rather than from newBed->chromStart.
     * The two agree only while the first surviving exon starts at offset 0 -
     * confirm that always holds. */
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    /* Overwritten each pass, so chromEnd ends up at the last exon's end. */
    newBed->chromEnd = exon->end + bed->chromStart;
    }

/* Clean up and go home.  The exon ranges were allocated from gapTree->lm,
 * so they are presumably released together with the tree - TODO confirm. */
rbTreeFree(&gapTree);
return newBed;
}
void allWriteReadsToDir(char *regionFile, char *dir) { FILE *fp, *rd; char buf[500], readName[500], fileName[500], chr[50], fub[500]; char str[2][500]; char *readStr, *ch; int i, b, e, j, k; struct slName *ali; struct hashEl *el; struct rbTree *tr; struct range *rg; struct hash *localHash = NULL; fp = mustOpen(regionFile, "r"); j = 0; while (fgets(buf, 500, fp)) { if (sscanf(buf, "%[^\t]\t%[^\t]\t%*s", str[0], str[1]) != 2) errAbort("error: %s", buf); ++j; sprintf(fileName, "%s/R%d/reads.fq", dir, j); rd = mustOpen(fileName, "w"); localHash = hashNew(8); for (i = 0; i < 2; i++) { if (sscanf(str[i], "%[^:]:%d-%d", chr, &b, &e) != 3) errAbort("error: %s", str[i]); el = hashLookup(aliHash, chr); tr = (struct rbTree *)(el->val); for (rg = rangeTreeAllOverlapping(tr, b, e); rg; rg = rg->next) { for (ali = (struct slName *)(rg->val); ali; ali = ali->next) { if (hashLookup(localHash, ali->name)) continue; hashStoreName(localHash, ali->name); readStr = (char *)hashFindVal(readsHash, ali->name); if(readStr == NULL) continue; //assert(readStr); strcpy(fub, readStr); ch = strchr(fub, ' '); *ch = '\0'; fprintf(rd, "@%s\n", ali->name); fprintf(rd, "%s\n", fub); ++ch; fprintf(rd, "+%s\n", ali->name); fprintf(rd, "%s\n", ch); strcpy(readName, ali->name); k = strlen(readName); /* if (readName[k-1] == '1') readName[k-1] = '2'; else if (readName[k-1] == '2') readName[k-1] = '1'; else errAbort("read identifier error: %s", readName); if (hashLookup(localHash, readName)) continue; hashStoreName(localHash, readName); readStr = (char *)hashFindVal(readsHash, readName); assert(readStr); strcpy(fub, readStr); ch = strchr(fub, ' '); *ch = '\0'; fprintf(rd, "@%s\n", readName); fprintf(rd, "%s\n", fub); ++ch; fprintf(rd, "+%s\n", readName); fprintf(rd, "%s\n", ch); */ } } } hashFree(&localHash); fclose(rd); } fclose(fp); hashFreeWithVals(&readsHash, freez); hashFreeWithVals(&aliHash, rbTreeFree); }
void rangeTreeAddToCoverageDepth(struct rbTree *tree, int start, int end) /* Add area from start to end to a tree that is being built up to store the * depth of coverage. Recover coverage back out by looking at ptToInt(range->val) * on tree elements. */ { struct range q; q.start = start; q.end = end; struct range *r, *existing = rbTreeFind(tree, &q); if (existing == NULL) { lmAllocVar(tree->lm, r); r->start = start; r->end = end; r->val = intToPt(1); rbTreeAdd(tree, r); } else { if (existing->start <= start && existing->end >= end) /* The existing one completely encompasses us */ { /* Make a new section for the bit before start. */ if (existing->start < start) { lmAllocVar(tree->lm, r); r->start = existing->start; r->end = start; r->val = existing->val; existing->start = start; rbTreeAdd(tree, r); } /* Make a new section for the bit after end. */ if (existing->end > end) { lmAllocVar(tree->lm, r); r->start = end; r->end = existing->end; r->val = existing->val; existing->end = end; rbTreeAdd(tree, r); } /* Increment existing section in overlapping area. */ existing->val = (char *)(existing->val) + 1; } else /* In general case fetch list of regions that overlap us. Remaining cases to handle are: r >> e rrrrrrrrrrrrrrrrrrrr eeeeeeeeee e < r rrrrrrrrrrrrrrr eeeeeeeeeeee r < e rrrrrrrrrrrr eeeeeeeeeeeee */ { struct range *existingList = rangeTreeAllOverlapping(tree, start, end); #ifdef DEBUG /* Make sure that list is really sorted for debugging... 
*/ int lastStart = existingList->start; for (r = existingList; r != NULL; r = r->next) { int start = r->start; if (start < lastStart) internalErr(); } #endif /* DEBUG */ int s = start, e = end; for (existing = existingList; existing != NULL; existing = existing->next) { /* Deal with start of new range that comes before existing */ if (s < existing->start) { lmAllocVar(tree->lm, r); r->start = s; r->end = existing->start; r->val = intToPt(1); s = existing->start; rbTreeAdd(tree, r); } else if (s > existing->start) { lmAllocVar(tree->lm, r); r->start = existing->start; r->end = s; r->val = existing->val; existing->start = s; rbTreeAdd(tree, r); } existing->val = (char *)(existing->val) + 1; s = existing->end; } if (s < e) /* Deal with end of new range that doesn't overlap with anything. */ { lmAllocVar(tree->lm, r); r->start = s; r->end = e; r->val = intToPt(1); rbTreeAdd(tree, r); } } } }