void bedFirstCodingExonSize(char *inBed, char *overBed, char *underBed, char *outSize) /* bedFirstCodingExonSize - Figure out size of first coding exon. */ { FILE *fSize = mustOpen(outSize, "w"); FILE *fOver = NULL, *fUnder = NULL; if (overBed) fOver = mustOpen(overBed, "w"); if (underBed) fUnder = mustOpen(underBed, "w"); struct bed *bed, *bedList = bedLoadNAll(inBed, 12); for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { int firstCdsSize = bedFirstCdsSize(bed); fprintf(fSize, "%s\t%d\n", bed->name, firstCdsSize); if (firstCdsSize >= threshold) { if (fOver != NULL) bedTabOutN(bed, 12, fOver); } else { if (fUnder != NULL) bedTabOutN(bed, 12, fUnder); } } } carefulClose(&fSize); carefulClose(&fOver); carefulClose(&fUnder); }
struct hash *loadRegions(char *file) /* load regions into a hash of lists by chrom */ { struct bed *bed = NULL, *bedList = NULL, *nextBed = NULL, *temp = NULL; struct hash *regionHash = newHash(6); struct bed *regions; regions = bedLoadNAll(file, outDir ? 4 : 3); /* order by chrom, start */ slSort(®ions, bedCmp); verbose(2, "found %d regions\n", slCount(regions)); bedList = regions; for (bed = regions; bed != NULL; bed = nextBed) { verbose(3, "region %s:%d-%d\n", bed->chrom, bed->chromStart+1, bed->chromEnd); nextBed = bed->next; if ((bed->next == NULL) || (differentString(bed->chrom,bed->next->chrom))) { temp = bed->next; bed->next = NULL; hashAdd(regionHash, bed->chrom, bedList); verbose(2, "just added %d regions on %s\n", slCount(bedList), bed->chrom); bedList = temp; } } return regionHash; }
void cgapSageBedAddFreqs(char *oldBedFile, char *freqFile, char *libsFile, char *newBedFile)
/* cgapSageBedAddFreqs - Add frequency data to the bed.
 * Loads the library totals (libsFile), the 8-field tag mappings (oldBedFile)
 * and the frequencies (freqFile) in that order, combines them with
 * makeCgapSageList() and writes the result to newBedFile.  Progress is
 * reported at verbosity level 1. */
{
verbose(1, "Loading libraries...\n");
struct hash *libTotals = getTotTagsHash(libsFile);
verbose(1, "Loaded libraries.\n");

verbose(1, "Loading mappings...\n");
struct bed *tagBeds = bedLoadNAll(oldBedFile, 8);
verbose(1, "Loaded mappings.\n");

verbose(1, "Loading frequencies...\n");
struct hash *tagFreqs = getFreqHash(freqFile);
verbose(1, "Loaded frequencies.\n");

verbose(1, "Building new bed list...\n");
struct cgapSage *combined = makeCgapSageList(tagFreqs, libTotals, tagBeds);
verbose(1, "Built new bed list.\n");

verbose(1, "Writing output...\n");
writeCgapSageFile(combined, newBedFile);
verbose(1, "Wrote output. All done!\n");

/* Release everything that was loaded or built above. */
freeFreqHash(&tagFreqs);
hashFree(&libTotals);
cgapSageFreeList(&combined);
slFreeList(&tagBeds);
}
void txCdsPredict(char *inFa, char *outCds, char *nmdBed, char *mafFile, boolean anyStart)
/* txCdsPredict - Somewhat simple-minded ORF predictor using a weighting scheme.
 * Reads RNA sequences from inFa and, for each one, asks orfsOnRna() for a
 * list of candidate ORFs.  When the list is non-empty it is sorted with
 * cdsEvidenceCmpScore and handed to cdsEvidenceTabOut() for writing to
 * outCds.  nmdBed (12-field beds, hashed by name) and mafFile (alignments
 * hashed by the src of their first component) are optional; pass NULL to
 * leave the corresponding hash empty. */
{
struct dnaSeq *seqList = faReadAllDna(inFa);
struct dnaSeq *seq;
verbose(2, "Read %d sequences from %s\n", slCount(seqList), inFa);

/* Beds for NMD analysis, keyed by bed name. */
struct hash *bedByName = hashNew(18);
if (nmdBed != NULL)
    {
    struct bed *loaded = bedLoadNAll(nmdBed, 12);
    struct bed *b = loaded;
    while (b != NULL)
        {
        hashAdd(bedByName, b->name, b);
        b = b->next;
        }
    verbose(2, "Read %d beds from %s\n", bedByName->elCount, nmdBed);
    }

/* Alignments for conservation analysis, keyed by first component's src. */
struct hash *aliBySrc = hashNew(18);
int otherSpeciesCount = 0;
if (mafFile != NULL)
    {
    struct mafFile *mafIn = mafReadAll(mafFile);
    struct mafAli *ali;

    for (ali = mafIn->alignments; ali != NULL; ali = ali->next)
        hashAdd(aliBySrc, ali->components->src, ali);
    verbose(2, "Read %d alignments from %s\n", aliBySrc->elCount, mafFile);

    /* Count distinct src names among all components after the first. */
    struct hash *seenSrc = hashNew(0);
    for (ali = mafIn->alignments; ali != NULL; ali = ali->next)
        {
        struct mafComp *other = ali->components->next;
        while (other != NULL)
            {
            hashStore(seenSrc, other->src);
            other = other->next;
            }
        }
    otherSpeciesCount = seenSrc->elCount;
    verbose(2, "%d other species in %s\n", otherSpeciesCount, mafFile);
    }

FILE *out = mustOpen(outCds, "w");
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    verbose(3, "%s\n", seq->name);
    struct cdsEvidence *orfs = orfsOnRna(seq, bedByName, aliBySrc,
        otherSpeciesCount, anyStart);
    if (orfs != NULL)
        {
        slSort(&orfs, cdsEvidenceCmpScore);
        cdsEvidenceTabOut(orfs, out);
        }
    cdsEvidenceFreeList(&orfs);
    }
carefulClose(&out);
}
void bedOrBlocks(char *inFile, char *outFile)
/* bedOrBlocks - Create a bed that is the union of all blocks of a list of beds.
 * Loads inFile as 12-field beds, sorts them with bedCmpChromStrandStart and
 * then walks the sorted list in runs that share both chromosome name and
 * first strand character.  Each run is passed to doStrand() as a half-open
 * [start, end) pair of list nodes together with the output file. */
{
struct bed *allBeds = bedLoadNAll(inFile, 12);
FILE *out = mustOpen(outFile, "w");
struct bed *runStart, *runEnd;

slSort(&allBeds, bedCmpChromStrandStart);

runStart = allBeds;
while (runStart != NULL)
    {
    /* Advance runEnd to the first record that differs in chrom or strand
     * (or to NULL at the end of the list). */
    runEnd = runStart->next;
    while (runEnd != NULL
           && sameString(runStart->chrom, runEnd->chrom)
           && runStart->strand[0] == runEnd->strand[0])
        runEnd = runEnd->next;

    doStrand(runStart, runEnd, out);
    runStart = runEnd;
    }

carefulClose(&out);
}
struct genePred *convertBedsToGps(char *bedFile)
/* Load beds from a file and convert to bare bones genePredictions.
 * The file is read as 6-field beds.  Each bed becomes one genePred whose
 * chrom and name are copies of the bed's, whose tx and cds ranges both equal
 * the bed's chromStart/chromEnd, and whose strand is copied from the bed.
 * The result is returned in input order; an empty input gives NULL.  The
 * loaded beds are freed before returning. */
{
struct genePred *gpList = NULL, *gp = NULL;
struct bed *bedList = NULL, *bed = NULL;

bedList = bedLoadNAll(bedFile, 6);

/* Only look at the first record when there is one: with an empty input the
 * list head is NULL and must not be dereferenced. */
if (bedList != NULL && bedList->strand == NULL)
    errAbort("Beds must have strand information.");

for (bed = bedList; bed != NULL; bed = bed->next)
    {
    AllocVar(gp);
    gp->chrom = cloneString(bed->chrom);
    gp->txStart = gp->cdsStart = bed->chromStart;
    gp->txEnd = gp->cdsEnd = bed->chromEnd;
    gp->name = cloneString(bed->name);
    safef(gp->strand, sizeof(gp->strand), "%s", bed->strand);
    /* Built head-first, so the list is reversed once at the end. */
    slAddHead(&gpList, gp);
    }
bedFreeList(&bedList);
slReverse(&gpList);
return gpList;
}
struct hash *bedLoadNInHash(char *filename, int fields)
/* Load a bed file with the given number of fields and return a hash, keyed
 * by chromosome name, whose values are lists of the beds on that chromosome.
 * The beds are sorted with bedCmp before the list is cut at each change of
 * chromosome name. */
{
struct bed *bed = NULL, *currList = NULL, *nextBed = NULL;
struct hash *regionHash = newHash(6);
struct bed *regions = bedLoadNAll(filename, fields);

slSort(&regions, bedCmp);

/* currList always points at the first record of the run being collected. */
currList = regions;
for (bed = regions; bed != NULL; bed = nextBed)
    {
    /* Save the successor first: the link is severed below at run ends. */
    nextBed = bed->next;
    if ((nextBed == NULL) || (differentString(bed->chrom, nextBed->chrom)))
        {
        /* Last record for this chromosome: terminate the run and file it. */
        bed->next = NULL;
        hashAdd(regionHash, bed->chrom, currList);
        currList = nextBed;
        }
    }
return(regionHash);
}
void ultraPcrRegions(char *database, char *bedFile, char *outFa) /* ultraPcrRegions - Get regions to PCR up and some surrounding sequence. */ { int extraSize = 1000; FILE *f = mustOpen(outFa, "w"); struct bed *bed, *bedList = bedLoadNAll(bedFile, 4); hSetDb(database); for (bed = bedList; bed != NULL; bed = bed->next) { int bedSize = bed->chromEnd - bed->chromStart; int chromSize = hChromSize(bed->chrom); int seqSize; int seqStart = bed->chromStart - extraSize; int seqEnd = bed->chromEnd + extraSize; int firstParenPos, secondParenPos; struct dyString *dy; char fileName[512]; struct dnaSeq *seq; if (seqStart < 0) seqStart = 0; if (seqEnd > chromSize) seqEnd = chromSize; seqSize = seqEnd - seqStart; firstParenPos = bed->chromStart - seqStart; secondParenPos = firstParenPos + bedSize; seq = hChromSeqMixed(bed->chrom, seqStart, seqEnd); dy = dyStringNew(seqSize+2); dyStringAppendN(dy, seq->dna, firstParenPos); dyStringAppendC(dy, '('); dyStringAppendN(dy, seq->dna+firstParenPos, secondParenPos-firstParenPos); dyStringAppendC(dy, ')'); dyStringAppendN(dy, seq->dna+secondParenPos, seqSize - secondParenPos); faWriteNext(f, bed->name, dy->string, dy->stringSize); } carefulClose(&f); }
void txGeneAccession(char *oldBedFile, char *lastIdFile, char *newBedFile, char *txToAccFile, char *oldToNewFile) /* txGeneAccession - Assign permanent accession number to genes. */ { /* Read in all input. */ struct bed *oldList = bedLoadNAll(oldBedFile, 12); verbose(2, "Read %d from %s\n", slCount(oldList), oldBedFile); struct bed *newList = bedLoadNAll(newBedFile, 12); verbose(2, "Read %d from %s\n", slCount(newList), newBedFile); int txId = readNumberFromFile(lastIdFile); verbose(2, "Last txId used was %d (from %s)\n", txId, lastIdFile); /* Make a random-access data structure for old list. */ struct hash *oldHash = bedsIntoKeeperHash(oldList); /* Make a little hash to help prevent us from reusing an * old accession twice (which might happen if we extend it * in two incompatible ways). */ struct hash *usedHash = hashNew(16); /* Record our decisions in hash as well as file. */ struct hash *idToAccHash = hashNew(16); /* Loop through new list first looking for exact matches. Record * exact matches in hash so we don't look for them again during * the next, "compatable" match phase. */ struct hash *oldExactHash = hashNew(16), *newExactHash = hashNew(16); struct bed *oldBed, *newBed; FILE *f = mustOpen(txToAccFile, "w"); FILE *fOld = mustOpen(oldToNewFile, "w"); for (newBed = newList; newBed != NULL; newBed = newBed->next) { oldBed = findExact(newBed, oldHash, usedHash); if (oldBed != NULL) { hashAdd(oldExactHash, oldBed->name, oldBed); hashAdd(newExactHash, newBed->name, newBed); hashAdd(usedHash, oldBed->name, NULL); fprintf(f, "%s\t%s\n", newBed->name, oldBed->name); hashAdd(idToAccHash, newBed->name, oldBed->name); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd, oldBed->name, oldBed->name, "exact"); } } /* Loop through new bed looking for compatible things. If * we can't find anything compatable, make up a new accession. 
*/ for (newBed = newList; newBed != NULL; newBed = newBed->next) { if (!hashLookup(newExactHash, newBed->name)) { oldBed = findCompatible(newBed, oldHash, usedHash); if (oldBed == NULL) { char newAcc[16]; txGeneAccFromId(++txId, newAcc); strcat(newAcc, ".1"); fprintf(f, "%s\t%s\n", newBed->name, newAcc); hashAdd(idToAccHash, newBed->name, cloneString(newAcc)); oldBed = findMostOverlapping(newBed, oldHash); char *oldAcc = (oldBed == NULL ? "" : oldBed->name); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd, oldAcc, newAcc, "new"); } else { char *acc = cloneString(oldBed->name); char *ver = strchr(acc, '.'); if (ver == NULL) errAbort("No version found in %s", oldBed->name); *ver++ = 0; int version = sqlUnsigned(ver); char newAcc[16]; safef(newAcc, sizeof(newAcc), "%s.%d", acc, version+1); hashAdd(usedHash, oldBed->name, NULL); fprintf(f, "%s\t%s\n", newBed->name, newAcc); hashAdd(idToAccHash, newBed->name, newAcc); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd, oldBed->name, newAcc, "compatible"); } } } carefulClose(&f); /* Make a random-access data structure for old list. */ struct hash *newHash = bedsIntoKeeperHash(newList); /* Write record of ones that don't map. */ for (oldBed = oldList; oldBed != NULL; oldBed = oldBed->next) { if (!hashLookup(usedHash, oldBed->name)) { char *newAcc = ""; struct bed *newBed = findMostOverlapping(oldBed, newHash); if (newBed != NULL) newAcc = hashMustFindVal(idToAccHash, newBed->name); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd, oldBed->name, newAcc, "lost"); } } carefulClose(&fOld); if (!optionExists("test")) { FILE *fId = mustOpen(lastIdFile, "w"); fprintf(fId, "%d\n", txId); carefulClose(&fId); } }
void liftOverMerge(char *oldFile, char *newFile) /* liftOverMerge - Merge regions in BED5 generated by liftOver -multiple */ { struct bed *bedList = NULL, *bed = NULL, *otherBed = NULL, *nextBed = NULL; struct bedList *bedListHeaders = NULL, *bedListHeader = NULL; FILE *f = mustOpen(newFile, "w"); bedList = bedLoadNAll(oldFile, 5); /* break down bed list into a list of lists, one per "region", where region * is the name field in the bed */ for (bed = bedList; bed != NULL; bed = nextBed) { verbose(3, "%s:%d-%d %s %d\n", bed->chrom, bed->chromStart, bed->chromEnd, bed->name, bed->score); if (bedListHeader == NULL || differentString(bed->name, bedListHeader->name)) { verbose(2, "region %s\n", bed->name); AllocVar(bedListHeader); bedListHeader->name = cloneString(bed->name); slAddHead(&bedListHeaders, bedListHeader); } nextBed = bed->next; slAddHead(&bedListHeader->bed, bed); } slReverse(&bedListHeaders); for (bedListHeader = bedListHeaders; bedListHeader != NULL; bedListHeader = bedListHeader->next) { int ix = 1; verbose(3, "region %s\n", bedListHeader->name); slReverse(&bedListHeader->bed); /* traverse list of bed lists, merging overlapping entries * for each region */ for (bed = bedListHeader->bed; bed != NULL; bed = bed->next) { for (otherBed = bed->next; otherBed != NULL; otherBed = nextBed) { nextBed = otherBed->next; if (sameString(bed->chrom, otherBed->chrom) && (max(bed->chromStart, otherBed->chromStart) <= min(bed->chromEnd, otherBed->chromEnd) + mergeGap)) { /* these regions overlap (or are within the merge gap), * so create one that is a merge, and drop the other */ verbose(2,"merging %s:%d-%d, %s:%d-%d (overlap=%d)", otherBed->chrom, otherBed->chromStart, otherBed->chromEnd, bed->chrom, bed->chromStart, bed->chromEnd, min(bed->chromEnd, otherBed->chromEnd) - max(bed->chromStart, otherBed->chromStart)); bed->chromStart = min(otherBed->chromStart, bed->chromStart); bed->chromEnd = max(otherBed->chromEnd, bed->chromEnd); verbose(2," to %s:%d-%d\n", 
bed->chrom, bed->chromStart, bed->chromEnd); slRemoveEl(&bedListHeader->bed, otherBed); } } } for (otherBed = bedListHeader->bed; otherBed != NULL; otherBed = otherBed->next) { otherBed->score = ix++; bedOutputN(otherBed, 5, f, '\t', '\n'); } } }
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile) /* txInfoAssemble - Assemble information from various sources into txInfo table.. */ { /* Build up hash of evidence keyed by transcript name. */ struct hash *cdsEvHash = hashNew(18); struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile); for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next) hashAddUnique(cdsEvHash, cdsEv->name, cdsEv); verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile); /* Build up hash of bestorf structures keyed by transcript name */ struct hash *predictHash = hashNew(18); struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile); for (predict = predictList; predict != NULL; predict = predict->next) hashAddUnique(predictHash, predict->name, predict); verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile); /* Build up structure for random access of retained introns */ struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6); verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile); struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList); /* Read in exception info. */ struct hash *selenocysteineHash, *altStartHash; genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash); /* Read in polyA sizes */ struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile); verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile); /* Read in psls */ struct hash *pslHash = hashNew(20); struct psl *psl, *pslList = pslLoadAll(pslFile); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(pslHash, psl->qName, psl); verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile); /* Read in accessions that we flipped for better splice sites. 
*/ struct hash *flipHash = hashWordsInFile(flipFile, 0); /* Open primary gene input and output. */ struct lineFile *lf = lineFileOpen(txBedFile, TRUE); FILE *f = mustOpen(outFile, "w"); /* Main loop - process each gene */ char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(3, "Processing %s\n", bed->name); /* Initialize info to zero */ struct txInfo info; ZeroVar(&info); /* Figure out name, sourceAcc, and isRefSeq from bed->name */ info.name = bed->name; info.category = "n/a"; if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL) { info.sourceAcc = cloneString(bed->name); } else { info.sourceAcc = txAccFromTempName(bed->name); } info.isRefSeq = startsWith("NM_", info.sourceAcc); if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL) { /* Fake up some things for antibody frag and CCDS that don't have alignments. */ info.sourceSize = bedTotalBlockSize(bed); info.aliCoverage = 1.0; info.aliIdRatio = 1.0; info. genoMapCount = 1; } else { /* Loop through all psl's associated with our RNA. Figure out * our overlap with each, and pick best one. */ struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc); if (firstPslHel == NULL) errAbort("%s is not in %s", info.sourceAcc, pslFile); int mapCount = 0; struct psl *psl, *bestPsl = NULL; int coverage, bestCoverage = 0; boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL); for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel)) { psl = hel->val; mapCount += 1; coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } /* If we flipped it, try it on the opposite strand too. */ if (isFlipped) { psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+'); coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } psl->strand[0] = (psl->strand[0] == '+' ? 
'-' : '+'); } } if (bestPsl == NULL) errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile); /* Figure out and save alignment statistics. */ int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0); info.sourceSize = bestPsl->qSize - polyA; info.aliCoverage = (double)bestCoverage / info.sourceSize; info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch); info. genoMapCount = mapCount; } /* Get orf size and start/end complete from cdsEv. */ if (bed->thickStart < bed->thickEnd) { cdsEv = hashFindVal(cdsEvHash, bed->name); if (cdsEv != NULL) { info.orfSize = cdsEv->end - cdsEv->start; info.startComplete = cdsEv->startComplete; info.endComplete = cdsEv->endComplete; } } /* Get score from prediction. */ predict = hashFindVal(predictHash, bed->name); if (predict != NULL) info.cdsScore = predict->score; /* Figure out nonsense-mediated-decay from bed itself. */ info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed); /* Figure out if retained intron from bed and alt-splice keeper hash */ info.retainedIntron = hasRetainedIntron(bed, altSpliceHash); info.strangeSplice = countStrangeSplices(bed, altSpliceHash); info.atacIntrons = countAtacIntrons(bed, altSpliceHash); info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash); /* Look up selenocysteine info. */ info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL); /* Loop through bed looking for small gaps indicative of frame shift/stop */ int i, lastBlock = bed->blockCount-1; int exonCount = 1; for (i=0; i < lastBlock; ++i) { int gapStart = bed->chromStarts[i] + bed->blockSizes[i]; int gapEnd = bed->chromStarts[i+1]; int gapSize = gapEnd - gapStart; switch (gapSize) { case 1: case 2: info.genomicFrameShift = TRUE; break; case 3: info.genomicStop = TRUE; break; default: exonCount += 1; break; } } info.exonCount = exonCount; /* Write info, free bed. 
*/ txInfoTabOut(&info, f); bedFree(&bed); } /* Clean up and go home. */ carefulClose(&f); }
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome) /* txGeneCdsMap - Create mapping between CDS region of gene and genome. */ { /* Load info into hash. */ struct hash *infoHash = hashNew(18); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } lineFileClose(&lf); /* Load refPep/tx alignments into hash keyed by tx. */ struct hash *refPslHash = hashNew(18); struct psl *psl, *pslList = pslLoadAll(refPepToTxPsl); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(refPslHash, psl->tName, psl); struct hash *refToPepHash = hashTwoColumnFile(refToPepTab); struct hash *chromSizeHash = hashNameIntFile(chromSizes); /* Load in bed. */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Open output, and stream through bedList, writing output. 
*/ FILE *fCdsToRna = mustOpen(cdsToRna, "w"); FILE *fRnaToGenome = mustOpen(rnaToGenome, "w"); int refTotal = 0, refFound = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { char *chrom = bed->chrom; int chromSize = hashIntVal(chromSizeHash, chrom); info = hashMustFindVal(infoHash, bed->name); pick = hashMustFindVal(pickHash, bed->name); if (info->isRefSeq) { char *refAcc = txAccFromTempName(bed->name); if (!startsWith("NM_", refAcc)) errAbort("Don't think I did find that refSeq acc, got %s", refAcc); char *protAcc = hashMustFindVal(refToPepHash, refAcc); ++refTotal; if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna)) ++refFound; } else { fakeCdsToMrna(bed, fCdsToRna); } fakeRnaToGenome(bed, chromSize, fRnaToGenome); } } verbose(1, "Missed %d of %d refSeq protein mappings. A small number of RefSeqs just map\n" "to genome in the UTR.\n", refTotal - refFound, refTotal); carefulClose(&fCdsToRna); carefulClose(&fRnaToGenome); }
void txGeneSeparateNoncoding(char *inBed, char *inInfo, char *outCoding, char *outNearCoding, char *outNearCodingJunk, char *outAntisense, char *outNoncoding, char *outInfo) /* txGeneSeparateNoncoding - Separate genes into four piles - coding, * non-coding that overlap coding, antisense to coding, and independent non-coding. */ { /* Read in txInfo into a hash keyed by transcript name */ struct hash *infoHash = hashNew(16); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, inInfo); /* Read in bed, and sort so we can process it easily a * strand of one chromosome at a time. */ struct bed *inBedList = bedLoadNAll(inBed, 12); slSort(&inBedList, bedCmpChromStrandStart); /* Open up output files. */ FILE *fCoding = mustOpen(outCoding, "w"); FILE *fNearCoding = mustOpen(outNearCoding, "w"); FILE *fNearCodingJunk = mustOpen(outNearCodingJunk, "w"); FILE *fNoncoding = mustOpen(outNoncoding, "w"); FILE *fAntisense = mustOpen(outAntisense, "w"); /* Go through input one chromosome strand at a time. */ struct chrom *chrom, *chromList = chromsForBeds(inBedList); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { verbose(2, "chrom %s\n", chrom->name); /* Do the separation. */ struct bed *codingList, *nearCodingList, *nearCodingJunkList, *antisenseList, *noncodingList; separateChrom(chrom, infoHash, &codingList, &nearCodingList, &nearCodingJunkList, &antisenseList, &noncodingList); verbose(2, "%d coding, %d near, %d anti, %d non\n", slCount(codingList), slCount(nearCodingList), slCount(antisenseList), slCount(noncodingList)); /* Write lists to respective files. 
*/ writeBedList(codingList, fCoding); writeBedList(nearCodingList, fNearCoding); writeBedList(nearCodingJunkList, fNearCodingJunk); writeBedList(antisenseList, fAntisense); writeBedList(noncodingList, fNoncoding); } carefulClose(&fCoding); carefulClose(&fNearCoding); carefulClose(&fNearCodingJunk); carefulClose(&fNoncoding); carefulClose(&fAntisense); verbose(1, "coding %d, codingJunk %d, nearCoding %d, junk %d, antisense %d, noncoding %d\n", codingCount, codingJunkCount, nearCodingCount, junkCount, antisenseCount, noncodingCount); /* Write out updated info file */ FILE *f = mustOpen(outInfo, "w"); for (info = infoList; info != NULL; info = info->next) { txInfoTabOut(info, f); } carefulClose(&f); }
void txGeneCanonical(char *codingCluster, char *infoFile, char *noncodingGraph, char *genesBed, char *nearCoding, char *outCanonical, char *outIsoforms, char *outClusters) /* txGeneCanonical - Pick a canonical version of each gene - that is the form * to use when just interested in a single splicing varient. Produces final * transcript clusters as well. */ { /* Read in input into lists in memory. */ struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster); struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph); struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12); struct txInfo *info, *infoList = txInfoLoadAll(infoFile); struct bed *nearList = bedLoadNAll(nearCoding, 12); /* Make hash of all beds. */ struct hash *bedHash = hashNew(18); for (bed = bedList; bed != NULL; bed = bed->next) hashAdd(bedHash, bed->name, bed); /* Make has of all info. */ struct hash *infoHash = hashNew(18); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Make a binKeeper structure that we'll populate with coding genes. */ struct hash *sizeHash = minChromSizeFromBeds(bedList); struct hash *keeperHash = minChromSizeKeeperHash(sizeHash); /* Make list of coding genes and toss them into binKeeper. * This will eat up bed list, but bedHash is ok. */ struct gene *gene, *geneList = NULL; for (coding = codingList; coding != NULL; coding = coding->next) { gene = geneFromCluster(coding, bedHash, infoHash); slAddHead(&geneList, gene); struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom); binKeeperAdd(bk, gene->start, gene->end, gene); } /* Go through near-coding genes and add them to the coding gene * they most overlap. */ for (bed = nearList; bed != NULL; bed = nextBed) { nextBed = bed->next; gene = mostOverlappingGene(keeperHash, bed); if (gene == NULL) errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name); geneAddBed(gene, bed); } /* Add non-coding genes. 
*/ for (graph = graphList; graph != NULL; graph = graph->next) { gene = geneFromGraph(graph, bedHash); slAddHead(&geneList, gene); } /* Sort so it all looks nicer. */ slSort(&geneList, geneCmp); /* Open up output files. */ FILE *fCan = mustOpen(outCanonical, "w"); FILE *fIso = mustOpen(outIsoforms, "w"); FILE *fClus = mustOpen(outClusters, "w"); /* Loop through, making up gene name, and writing output. */ int geneId = 0; for (gene = geneList; gene != NULL; gene = gene->next) { /* Make up name. */ char name[16]; safef(name, sizeof(name), "g%05d", ++geneId); /* Reverse transcript list just to make it look better. */ slReverse(&gene->txList); /* Write out canonical file output */ bed = hashMustFindVal(bedHash, gene->niceTx->name); fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n", bed->chrom, bed->chromStart, bed->chromEnd, geneId, gene->niceTx->name, gene->niceTx->name); /* Write out isoforms output. */ for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fIso, "%d\t%s\n", geneId, bed->name); /* Write out cluster output, starting with bed 6 standard fields. */ fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t", gene->chrom, gene->start, gene->end, name, 0, gene->strand); /* Write out thick-start/thick end. */ if (gene->isCoding) { int thickStart = gene->end, thickEnd = gene->start; for (bed = gene->txList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { thickStart = min(thickStart, bed->thickStart); thickEnd = max(thickEnd, bed->thickEnd); } } fprintf(fClus, "%d\t%d\t", thickStart, thickEnd); } else { fprintf(fClus, "%d\t%d\t", gene->start, gene->start); } /* We got no rgb value, just write out zero. */ fprintf(fClus, "0\t"); /* Get exons from exonTree. 
*/ struct range *exon, *exonList = rangeTreeList(gene->exonTree); fprintf(fClus, "%d\t", slCount(exonList)); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->start - gene->start); fprintf(fClus, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->end - exon->start); fprintf(fClus, "\t"); /* Write out associated transcripts. */ fprintf(fClus, "%d\t", slCount(gene->txList)); for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fClus, "%s,", bed->name); fprintf(fClus, "\t"); /* Write out nice value */ fprintf(fClus, "%s\t", gene->niceTx->name); /* Write out coding/noncoding value. */ fprintf(fClus, "%d\n", gene->isCoding); } /* Close up files. */ carefulClose(&fCan); carefulClose(&fIso); carefulClose(&fClus); }
void txCdsRepick(char *inputBed, char *inputTxg, char *inputCluster, char *inputInfo, char *inputCds, char *outputCds, char *outputPp) /* txCdsRepick - After we have clustered based on the preliminary coding * regions we can make a more intelligent choice here about the final coding * regions. */ { /* Read input bed into hash. Also calculate number with CDS set. */ struct hash *bedHash = hashNew(16); struct bed *bed, *bedList = bedLoadNAll(inputBed, 12); int txWithCdsCount = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) txWithCdsCount += 1; hashAdd(bedHash, bed->name, bed); } verbose(2, "Read %d beds from %s\n", bedHash->elCount, inputBed); /* Read input transcript graphs into list, and into a hash * keyed by transcript names. */ struct hash *graphHash = hashNew(16); struct txGraph *txg, *txgList = txGraphLoadAll(inputTxg); for (txg = txgList; txg != NULL; txg = txg->next) { int i; for (i=0; i<txg->sourceCount; ++i) hashAdd(graphHash, txg->sources[i].accession, txg); } verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList), graphHash->elCount, inputTxg); /* Read input protein cluster into list, and into a hash * keyed by transcript name */ struct hash *clusterHash = hashNew(16); struct txCluster *cluster, *clusterList = txClusterLoadAll(inputCluster); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { int i; for (i=0; i<cluster->txCount; ++i) hashAdd(clusterHash, cluster->txArray[i], cluster); } verbose(2, "Read %d protein clusters (%d transcripts) from %s\n", slCount(clusterList), clusterHash->elCount, inputCluster); /* Read in txInfo into a hash keyed by transcript name */ struct hash *infoHash = hashNew(16); struct txInfo *info, *infoList = txInfoLoadAll(inputInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, inputInfo); /* Read in input cds evidence into a hash 
keyed by transcript name * who's values are a sorted *list* of evidence. */ struct hash *evHash = hashNew(16); struct cdsEvidence *ev, *nextEv, *evList = cdsEvidenceLoadAll(inputCds); int evCount = 0; for (ev = evList; ev != NULL; ev = nextEv) { nextEv = ev->next; struct hashEl *hel = hashLookup(evHash, ev->name); if (hel == NULL) hel = hashAdd(evHash, ev->name, NULL); slAddTail(&hel->val, ev); ++evCount; } verbose(2, "Read %d pieces of cdsEvidence on %d transcripts from %s\n", evCount, evHash->elCount, inputCds); /* Create a hash containing what looks to be the best protein-coding * transcript in each protein cluster. This is keyed by cluster name * with transcript names for values. */ FILE *f = mustOpen(outputPp, "w"); struct hash *bestInClusterHash = hashNew(16); for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { double bestScore = -BIGNUM; char *bestTx = NULL; int i; for (i=0; i<cluster->txCount; ++i) { char *tx = cluster->txArray[i]; info = hashMustFindVal(infoHash, tx); double score = infoCodingScore(info, TRUE); if (score > bestScore) { bestTx = tx; bestScore = score; } } hashAdd(bestInClusterHash, cluster->name, bestTx); fprintf(f, "%s\t%s\n", cluster->name, bestTx); } carefulClose(&f); verbose(2, "Picked best protein for each protein cluster\n"); /* Loop through each transcript cluster (graph). Make a list of * protein clusters associated with that graph. Armed with this * information call repick routine on each transcript in the graph. */ f = mustOpen(outputCds, "w"); for (txg = txgList; txg != NULL; txg = txg->next) { /* Build up list of protein clusters associated with transcript cluster. 
*/ struct slRef *protClusterRefList = NULL, *protClusterRef; int i; for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct txCluster *protCluster = hashFindVal(clusterHash, tx); if (protCluster != NULL) refAddUnique(&protClusterRefList, protCluster); } /* Figure out best scoring protein in RNA cluster, and set threshold * to eliminate ones scoring less than half this much. */ double bestProtScore = 0; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); bestProtScore = max(score, bestProtScore); } double protScoreThreshold = bestProtScore * 0.5; /* Get list of references to beds of proteins over that threshold. */ struct slRef *protRefList = NULL; for (protClusterRef = protClusterRefList; protClusterRef != NULL; protClusterRef = protClusterRef->next) { struct txCluster *protCluster = protClusterRef->val; char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name); struct txInfo *info = hashMustFindVal(infoHash, protTx); double score = infoCodingScore(info, FALSE); if (score >= protScoreThreshold) { struct bed *bed = hashMustFindVal(bedHash, protTx); refAdd(&protRefList, bed); } } /* Go repick each CDS in RNA cluster */ for (i=0; i<txg->sourceCount; ++i) { char *tx = txg->sources[i].accession; struct bed *bed = hashMustFindVal(bedHash, tx); struct cdsEvidence *evList = hashFindVal(evHash, tx); if (evList != NULL && bed->thickStart < bed->thickEnd) { info = hashMustFindVal(infoHash, bed->name); pickCompatableCds(bed, protRefList, evList, info, f); } } slFreeList(&protClusterRefList); } carefulClose(&f); verbose(1, "repicked %d, removed %d, no change to %d\n", pickedBetter, pickedNone, txWithCdsCount - pickedBetter - pickedNone); }
void txGeneFromBed(char *inBed, char *inPicks, char *ucscFa, char *uniProtFa, char *refPepFa, char *outKg) /* txGeneFromBed - Convert from bed to knownGenes format table (genePred + uniProt ID). */ { /* Load protein sequence into hashes */ struct hash *uniProtHash = faReadAllIntoHash(uniProtFa, dnaUpper); struct hash *ucscProtHash = faReadAllIntoHash(ucscFa, dnaUpper); struct hash *refProtHash =faReadAllIntoHash(refPepFa, dnaUpper); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } /* Load in bed */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Do reformatting and write output. */ FILE *f = mustOpen(outKg, "w"); for (bed = bedList; bed != NULL; bed = bed->next) { char *protAcc = NULL; if (bed->thickStart < bed->thickEnd) { pick = hashMustFindVal(pickHash, bed->name); struct dnaSeq *spSeq = NULL, *uniSeq = NULL, *refPep = NULL, *ucscSeq; ucscSeq = hashMustFindVal(ucscProtHash, bed->name); if (pick->swissProt[0]) spSeq = hashMustFindVal(uniProtHash, pick->swissProt); if (pick->uniProt[0]) uniSeq = hashMustFindVal(uniProtHash, pick->uniProt); if (pick->refProt[0]) refPep = hashMustFindVal(refProtHash, pick->refProt); /* First we look for an exact match between the ucsc protein and * something from swissProt/uniProt. 
*/ if (spSeq != NULL && sameString(ucscSeq->dna, spSeq->dna)) protAcc = pick->swissProt; if (protAcc == NULL && uniSeq != NULL && sameString(ucscSeq->dna, uniSeq->dna)) protAcc = pick->uniProt; if (protAcc == NULL && refPep != NULL && sameString(ucscSeq->dna, refPep->dna)) { protAcc = cloneString(pick->refProt); chopSuffix(protAcc); } if (protAcc == NULL) { if (pick->uniProt[0]) protAcc = pick->uniProt; else { protAcc = cloneString(pick->refProt); chopSuffix(protAcc); } } } outputKg(bed, emptyForNull(protAcc), f); } carefulClose(&f); }