struct bed *findCompatible(struct bed *newBed, struct hash *oldHash, struct hash *usedHash) /* Try and find an old bed compatible with new bed. */ { struct binKeeper *bk = hashFindVal(oldHash, newBed->chrom); int bestDiff = BIGNUM; struct bed *bestBed = NULL; if (bk == NULL) return NULL; struct binElement *bin, *binList = binKeeperFind(bk, newBed->chromStart, newBed->chromEnd); for (bin = binList; bin != NULL; bin = bin->next) { struct bed *oldBed = bin->val; if (oldBed->strand[0] == newBed->strand[0]) { if (!hashLookup(usedHash, oldBed->name)) { if (bedCompatibleExtension(oldBed, newBed) || endUtrChangeOnly(oldBed, newBed)) { int diff = bedTotalBlockSize(oldBed) - bedTotalBlockSize(newBed); if (diff < 0) diff = -diff; if (diff < bestDiff) { bestDiff = diff; bestBed = oldBed; } } } } } slFreeList(&binList); return bestBed; }
double bedOverlapRatio(struct bed *a, struct bed *b)
/* Return how much a and b overlap, as a ratio rather than a boolean.
 * The same-strand overlap size is divided by the total block size of a and
 * by the total block size of b, and the smaller of the two quotients is returned.
 * Special cases:
 *   - exactly 1.0 when both sizes and the overlap are all equal;
 *   - 0.0 when either bed has a total block size of zero, since no ratio
 *     can be formed without dividing by zero. */
{
int aSize = bedTotalBlockSize(a);
int bSize = bedTotalBlockSize(b);
int overlapSize = bedSameStrandOverlap(a,b);
if (aSize == bSize && aSize == overlapSize)
    return 1.0;  /* Avoid rounding here. */

/* A zero size would make the division below 0/0 (NaN), and the result of
 * min() would then depend on which argument happened to be NaN. */
if (aSize == 0 || bSize == 0)
    return 0.0;

double x = overlapSize;
double aCov = x / aSize;
double bCov = x / bSize;
return min(aCov, bCov);
}
struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed) /* Get sequence defined by bed. Exclude introns. */ { struct dnaSeq *seq; if (bed->blockCount <= 1) { seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd); freeMem(seq->name); seq->name = cloneString(bed->name); } else { int totalBlockSize = bedTotalBlockSize(bed); AllocVar(seq); seq->name = cloneString(bed->name); seq->dna = needMem(totalBlockSize+1); seq->size = totalBlockSize; int i; int seqOffset = 0; for (i=0; i<bed->blockCount; ++i) { int exonSize = bed->blockSizes[i]; int exonStart = bed->chromStart + bed->chromStarts[i]; struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize); memcpy(seq->dna + seqOffset, exon->dna, exonSize); seqOffset += exonSize; dnaSeqFree(&exon); } } if (bed->strand[0] == '-') reverseComplement(seq->dna, seq->size); return seq; }
int findLastIntronPos(struct hash *bedHash, char *name)
/* Return the position of the last intron in RNA coordinates for the mRNA
 * called name, using the bed stored under that name in bedHash.
 * The position is the bed's total block size minus the size of its terminal
 * block: the last block in the array for '+' strand, the first block otherwise.
 * Returns 0 when bedHash has no such bed or the bed has fewer than two blocks. */
{
struct bed *bed = hashFindVal(bedHash, name);
if (bed == NULL || bed->blockCount < 2)
    return 0;

/* Pick the block that ends the transcript on its own strand. */
int finalBlockIx = (bed->strand[0] == '+') ? bed->blockCount-1 : 0;
return bedTotalBlockSize(bed) - bed->blockSizes[finalBlockIx];
}
static struct psl *bedToPsl(struct bed *bed, struct hash *chromSizes) /* Convert a single bed to a PSL. */ { int qSize = bedTotalBlockSize(bed); struct psl *psl; if (keepQuery) psl = pslNew(bed->chrom, hashIntVal(chromSizes, bed->chrom), bed->chromStart, bed->chromEnd, bed->chrom, hashIntVal(chromSizes, bed->chrom), bed->chromStart, bed->chromEnd, ((bed->strand[0] == '\0') ? "+" : bed->strand), (bed->blockCount == 0) ? 1 : bed->blockCount, 0); else psl = pslNew(bed->name, qSize, 0, qSize, bed->chrom, hashIntVal(chromSizes, bed->chrom), bed->chromStart, bed->chromEnd, ((bed->strand[0] == '\0') ? "+" : bed->strand), (bed->blockCount == 0) ? 1 : bed->blockCount, 0); psl->match = psl->qSize; if (bed->blockCount == 0) bedToPsl4(bed, psl); else bedToPsl12(bed, psl); return psl; }
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile,
        char *altSpliceFile, char *exceptionFile, char *sizePolyAFile,
        char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.
 * Parameters (all are file names):
 *   txBedFile        - transcripts, read one row of 12 columns at a time; one
 *                      txInfo record is written per row.
 *   cdsEvFile        - cdsEvidence records, hashed by name (names must be unique).
 *   txCdsPredictFile - cdsEvidence records used only for their score, hashed by
 *                      name (names must be unique).
 *   altSpliceFile    - beds loaded with 6 fields, turned into a hash of keepers.
 *   exceptionFile    - passed to genbankExceptionsHash(); only the selenocysteine
 *                      hash it produces is consulted here.
 *   sizePolyAFile    - name/integer pairs giving polyA sizes.
 *   pslFile          - alignments, hashed by qName (several per name allowed).
 *   flipFile         - words naming accessions that were flipped.
 *   outFile          - output, written via txInfoTabOut().
 * Calls errAbort() if a transcript that needs an alignment has none in pslFile,
 * or none that overlaps the transcript's bed.
 * Only the output file is closed at the end; the loaded lists, hashes and the
 * input lineFile are not released in this function. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
    hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info.  altStartHash is filled in but not used below. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls.  hashAdd (not hashAddUnique) is used, and the main loop walks
 * every entry stored under one qName. */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name.
     * Rfam and tRNA names are used whole as the source accession; everything
     * else goes through txAccFromTempName().
     * NOTE(review): the cloneString() result is not freed in this function. */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
        {
        info.sourceAcc = cloneString(bed->name);
        }
    else
        {
        info.sourceAcc = txAccFromTempName(bed->name);
        }
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL)
        {
        /* Fake up some things for antibody frag and CCDS that don't have alignments.
         * Rfam and tRNA entries take this branch too. */
        info.sourceSize = bedTotalBlockSize(bed);
        info.aliCoverage = 1.0;
        info.aliIdRatio = 1.0;
        info. genoMapCount = 1;
        }
    else
        {
        /* Loop through all psl's associated with our RNA.  Figure out
         * our overlap with each, and pick best one. */
        struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
        if (firstPslHel == NULL)
            errAbort("%s is not in %s", info.sourceAcc, pslFile);
        int mapCount = 0;
        /* This psl shadows the function-level psl used while loading. */
        struct psl *psl, *bestPsl = NULL;
        int coverage, bestCoverage = 0;
        boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
        for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
            {
            psl = hel->val;
            mapCount += 1;
            coverage = pslBedOverlap(psl, bed);
            /* Strictly greater: a psl with zero overlap never becomes bestPsl,
             * and the first psl seen wins ties. */
            if (coverage > bestCoverage)
                {
                bestCoverage = coverage;
                bestPsl = psl;
                }
            /* If we flipped it, try it on the opposite strand too.  The strand
             * character is toggled in place, then toggled back afterwards so
             * the stored psl ends up unchanged. */
            if (isFlipped)
                {
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                coverage = pslBedOverlap(psl, bed);
                if (coverage > bestCoverage)
                    {
                    bestCoverage = coverage;
                    bestPsl = psl;
                    }
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                }
            }
        if (bestPsl == NULL)
            errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile);

        /* Figure out and save alignment statistics.  The polyA size (0 when
         * the transcript is not listed) is subtracted from the query size.
         * NOTE(review): nothing here guards against sourceSize being 0 before
         * it is used as a divisor - confirm inputs rule that out. */
        int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
        info.sourceSize = bestPsl->qSize - polyA;
        info.aliCoverage = (double)bestCoverage / info.sourceSize;
        info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
        info. genoMapCount = mapCount;
        }

    /* Get orf size and start/end complete from cdsEv.  Only done when the
     * bed has a non-empty thick region. */
    if (bed->thickStart < bed->thickEnd)
        {
        cdsEv = hashFindVal(cdsEvHash, bed->name);
        if (cdsEv != NULL)
            {
            info.orfSize = cdsEv->end - cdsEv->start;
            info.startComplete = cdsEv->startComplete;
            info.endComplete = cdsEv->endComplete;
            }
        }

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop.
     * A gap of 1 or 2 bases flags a frame shift and a gap of 3 flags a stop;
     * neither counts as an exon boundary.  Every other gap size adds an exon. */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
        int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
        int gapEnd = bed->chromStarts[i+1];
        int gapSize = gapEnd - gapStart;
        switch (gapSize)
            {
            case 1:
            case 2:
                info.genomicFrameShift = TRUE;
                break;
            case 3:
                info.genomicStop = TRUE;
                break;
            default:
                exonCount += 1;
                break;
            }
        }
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home.  Only the output file is closed here. */
carefulClose(&f);
}
void averageFetchingEachChrom(struct bbiFile *bbi, struct bed **pBedList, int fieldCount, FILE *f, FILE *bedF) /* Do the averaging by sorting bedList by chromosome, and then processing each chromosome * at once. Faster for long bedLists. */ { /* Sort by chromosome. */ slSort(pBedList, bedCmpChrom); struct bigWigValsOnChrom *chromVals = bigWigValsOnChromNew(); struct bed *bed, *bedList, *nextChrom; verbose(1, "processing chromosomes"); for (bedList = *pBedList; bedList != NULL; bedList = nextChrom) { /* Figure out which chromosome we're working on, and the last bed using it. */ char *chrom = bedList->chrom; nextChrom = nextChromInList(bedList); verbose(2, "Processing %s\n", chrom); if (bigWigValsOnChromFetchData(chromVals, chrom, bbi)) { double *valBuf = chromVals->valBuf; Bits *covBuf = chromVals->covBuf; /* Loop through beds doing sums and outputting. */ for (bed = bedList; bed != nextChrom; bed = bed->next) { int size = 0, coverage = 0; double sum = 0.0; if (sampleAroundCenter > 0) { int center = (bed->chromStart + bed->chromEnd)/2; int left = center - (sampleAroundCenter/2); addBufIntervalInfo(valBuf, covBuf, left, left+sampleAroundCenter, &size, &coverage, &sum); } else { if (fieldCount < 12) { addBufIntervalInfo(valBuf, covBuf, bed->chromStart, bed->chromEnd, &size, &coverage, &sum); } else { int i; for (i=0; i<bed->blockCount; ++i) { int start = bed->chromStart + bed->chromStarts[i]; int end = start + bed->blockSizes[i]; addBufIntervalInfo(valBuf, covBuf, start, end, &size, &coverage, &sum); } } } /* Print out result, fudging mean to 0 if no coverage at all. 
*/ double mean = 0; if (coverage > 0) mean = sum/coverage; fprintf(f, "%s\t%d\t%d\t%g\t%g\t%g\n", bed->name, size, coverage, sum, sum/size, mean); optionallyPrintBedPlus(bedF, bed, fieldCount, mean); } verboseDot(); } else { /* If no bigWig data on this chromosome, just output as if coverage is 0 */ for (bed = bedList; bed != nextChrom; bed = bed->next) { fprintf(f, "%s\t%d\t0\t0\t0\t0\n", bed->name, bedTotalBlockSize(bed)); optionallyPrintBedPlus(bedF, bed, fieldCount, 0); } } } verbose(1, "\n"); }
int scoreNoncodingBed(struct bed *bed)
/* Score a noncoding bed: 100 points per block plus the bed's total block size. */
{
int blockPoints = bed->blockCount*100;
int sizePoints = bedTotalBlockSize(bed);
return blockPoints + sizePoints;
}