void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile) /* txInfoAssemble - Assemble information from various sources into txInfo table.. */ { /* Build up hash of evidence keyed by transcript name. */ struct hash *cdsEvHash = hashNew(18); struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile); for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next) hashAddUnique(cdsEvHash, cdsEv->name, cdsEv); verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile); /* Build up hash of bestorf structures keyed by transcript name */ struct hash *predictHash = hashNew(18); struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile); for (predict = predictList; predict != NULL; predict = predict->next) hashAddUnique(predictHash, predict->name, predict); verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile); /* Build up structure for random access of retained introns */ struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6); verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile); struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList); /* Read in exception info. */ struct hash *selenocysteineHash, *altStartHash; genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash); /* Read in polyA sizes */ struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile); verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile); /* Read in psls */ struct hash *pslHash = hashNew(20); struct psl *psl, *pslList = pslLoadAll(pslFile); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(pslHash, psl->qName, psl); verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile); /* Read in accessions that we flipped for better splice sites. */ struct hash *flipHash = hashWordsInFile(flipFile, 0); /* Open primary gene input and output. */ struct lineFile *lf = lineFileOpen(txBedFile, TRUE); FILE *f = mustOpen(outFile, "w"); /* Main loop - process each gene */ char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(3, "Processing %s\n", bed->name); /* Initialize info to zero */ struct txInfo info; ZeroVar(&info); /* Figure out name, sourceAcc, and isRefSeq from bed->name */ info.name = bed->name; info.category = "n/a"; if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL) { info.sourceAcc = cloneString(bed->name); } else { info.sourceAcc = txAccFromTempName(bed->name); } info.isRefSeq = startsWith("NM_", info.sourceAcc); if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL) { /* Fake up some things for antibody frag and CCDS that don't have alignments. */ info.sourceSize = bedTotalBlockSize(bed); info.aliCoverage = 1.0; info.aliIdRatio = 1.0; info. genoMapCount = 1; } else { /* Loop through all psl's associated with our RNA. Figure out * our overlap with each, and pick best one. */ struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc); if (firstPslHel == NULL) errAbort("%s is not in %s", info.sourceAcc, pslFile); int mapCount = 0; struct psl *psl, *bestPsl = NULL; int coverage, bestCoverage = 0; boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL); for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel)) { psl = hel->val; mapCount += 1; coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } /* If we flipped it, try it on the opposite strand too. */ if (isFlipped) { psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+'); coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+'); } } if (bestPsl == NULL) errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile); /* Figure out and save alignment statistics. */ int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0); info.sourceSize = bestPsl->qSize - polyA; info.aliCoverage = (double)bestCoverage / info.sourceSize; info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch); info. genoMapCount = mapCount; } /* Get orf size and start/end complete from cdsEv. */ if (bed->thickStart < bed->thickEnd) { cdsEv = hashFindVal(cdsEvHash, bed->name); if (cdsEv != NULL) { info.orfSize = cdsEv->end - cdsEv->start; info.startComplete = cdsEv->startComplete; info.endComplete = cdsEv->endComplete; } } /* Get score from prediction. */ predict = hashFindVal(predictHash, bed->name); if (predict != NULL) info.cdsScore = predict->score; /* Figure out nonsense-mediated-decay from bed itself. */ info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed); /* Figure out if retained intron from bed and alt-splice keeper hash */ info.retainedIntron = hasRetainedIntron(bed, altSpliceHash); info.strangeSplice = countStrangeSplices(bed, altSpliceHash); info.atacIntrons = countAtacIntrons(bed, altSpliceHash); info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash); /* Look up selenocysteine info. */ info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL); /* Loop through bed looking for small gaps indicative of frame shift/stop */ int i, lastBlock = bed->blockCount-1; int exonCount = 1; for (i=0; i < lastBlock; ++i) { int gapStart = bed->chromStarts[i] + bed->blockSizes[i]; int gapEnd = bed->chromStarts[i+1]; int gapSize = gapEnd - gapStart; switch (gapSize) { case 1: case 2: info.genomicFrameShift = TRUE; break; case 3: info.genomicStop = TRUE; break; default: exonCount += 1; break; } } info.exonCount = exonCount; /* Write info, free bed. */ txInfoTabOut(&info, f); bedFree(&bed); } /* Clean up and go home. */ carefulClose(&f); }
/* main function */ int main_bismark (int argc, char *argv[]) { char *output, *outReportfile, *outCpGfile, *outbedGraphfile, *row[100], *samfilecopy; char *forwardcg, *forwardchg, *forwardchh, *forwardread, *forwardread1; char *reversecg, *reversechg, *reversechh, *reverseread, *reverseread1; unsigned long long int *cnt; unsigned long long int *cnt2 = NULL; int optSam = 0, c, optaddChr = 0, optStats = 0, optBis = 0, optFull = 0, optKeep = 0; unsigned int optisize = 500; int optcov = 5; char *optoutput = NULL; struct hash *cpgHash = newHash(0); struct hash *chgHash = newHash(0); struct hash *chhHash = newHash(0); time_t start_time, end_time; start_time = time(NULL); while ((c = getopt(argc, argv, "SCsbFBo:c:I:h?")) >= 0) { switch (c) { case 'S': optSam = 1; break; case 'C': optaddChr = 1; break; case 's': optStats = 1; break; case 'b': optBis = 1; break; case 'F': optFull = 1; break; case 'B': optKeep = 1; break; case 'c': optcov = (int)strtol(optarg, 0, 0); break; case 'I': optisize = (unsigned int)strtol(optarg, 0, 0); break; case 'o': optoutput = strdup(optarg); break; case 'h': case '?': return bismark_usage(); break; default: return 1; } } if (optind + 3 > argc) return bismark_usage(); char *chr_size_file = argv[optind]; char *cpg_bed_file = argv[optind+1]; char *sam_file = argv[optind+2]; fprintf(stderr, "* CpG file provided: %s\n", cpg_bed_file); fprintf(stderr, "* Insert size cutoff: %u\n", optisize); fprintf(stderr, "* Read coverage threshold: %i\n", optcov); struct hash *chrHash = hashNameIntFile(chr_size_file); samfilecopy = cloneString(sam_file); int numFields = chopByChar(samfilecopy, ',', row, ArraySize(row)); fprintf(stderr, "* Provided %i BAM/SAM file(s)\n", numFields); if(optFull) { fprintf(stderr, "* Warning: will run in Full mode, 8 track files and 1 report file will be generated\n"); fprintf(stderr, "* Warning: will output stats over each C (in CHG)\n"); fprintf(stderr, "* Warning: will output stats over each C (in CHH)\n"); optStats = 0; optBis = 1; } if(optStats) { fprintf(stderr, "* Warning: will report stats only as -s specified\n"); } // if use select bismark like output, read cpgHash at each C stats if(optBis) { fprintf(stderr, "* Warning: will output stats over each C (in CpG)\n"); cpgHash = cpgBed2BinKeeperHashBismark(chrHash, cpg_bed_file); }else{ fprintf(stderr, "* Warning: will output stats over each CpG\n"); cpgHash = cpgBed2BinKeeperHash(chrHash, cpg_bed_file); } if(optoutput) { output = optoutput; } else { output = cloneString(get_filename_without_ext(basename(row[0]))); } if(asprintf(&outCpGfile, "%s.CpG.bedGraph", output) < 0) errAbort("Mem Error.\n"); if(asprintf(&outbedGraphfile, "%s.density.bedGraph", output) < 0) errAbort("Mem Error.\n"); if (asprintf(&outReportfile, "%s.report", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&forwardcg, "%s.forward.CG.bedGraph", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&forwardchg, "%s.forward.CHG.bedGraph", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&forwardchh, "%s.forward.CHH.bedGraph", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&forwardread, "%s.forward.Density.bed", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&forwardread1, "%s.forward.Density.bedGraph", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&reversecg, "%s.reverse.CG.bedGraph", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&reversechg, "%s.reverse.CHG.bedGraph", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&reversechh, "%s.reverse.CHH.bedGraph", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&reverseread, "%s.reverse.Density.bed", output) < 0) errAbort("Preparing output wrong"); if (asprintf(&reverseread1, "%s.reverse.Density.bedGraph", output) < 0) errAbort("Preparing output wrong"); //sam file to bed file //fprintf(stderr, "* Parsing the SAM/BAM file\n"); cnt = bismarkBamParse(sam_file, chrHash, cpgHash, chgHash, chhHash, forwardread, reverseread, optSam, optaddChr, optFull, optisize); //write to file if (optFull){ fprintf(stderr, "* Output CpG methylation calls\n"); writecpgBismarkLite(cpgHash, forwardcg, reversecg, optcov); fprintf(stderr, "* Output CHG methylation calls\n"); writecpgBismarkLiteHash(chgHash, forwardchg, reversechg, optcov); fprintf(stderr, "* Output CHH methylation calls\n"); writecpgBismarkLiteHash(chhHash, forwardchh, reversechh, optcov); fprintf(stderr, "* Sorting methylation calls\n"); sortBedfile(forwardcg); sortBedfile(reversecg); sortBedfile(forwardchg); sortBedfile(reversechg); sortBedfile(forwardchh); sortBedfile(reversechh); fprintf(stderr, "* Sorting density bed\n"); sortBedfile(forwardread); sortBedfile(reverseread); fprintf(stderr, "* Generating density bedGraph\n"); bedItemOverlapCount(chrHash, forwardread, forwardread1); bedItemOverlapCount(chrHash, reverseread, reverseread1); }else{ cnt2 = writecpgBismark(cpgHash, outbedGraphfile, outCpGfile, optStats, optcov); //sort output if(!optStats) { fprintf(stderr, "* Sorting output density\n"); sortBedfile(outbedGraphfile); } //sort output if(!optStats) { fprintf(stderr, "* Sorting output CpG methylation call\n"); sortBedfile(outCpGfile); } } //generate bigWig //fprintf(stderr, "* Generating bigWig\n"); //bigWigFileCreate(outbedGraphfile, chr_size_file, 256, 1024, 0, 1, outbigWigfile); //bedGraphToBigWig(outbedGraphfile, chr_size_file, outbigWigfile); //write report file fprintf(stderr, "* Preparing report file\n"); writeReportBismark(outReportfile, cnt, cnt2, numFields, row, optBis, hashIntSum(chrHash)); if(!optKeep){ fprintf(stderr, "* Deleting (huge) density bed files\n"); unlink(forwardread); unlink(reverseread); } //cleaning hashFree(&chrHash); hashFree(&cpgHash); hashFree(&chgHash); hashFree(&chhHash); free(outCpGfile); free(outbedGraphfile); //free(outbigWigfile); free(outReportfile); free(samfilecopy); free(forwardcg); free(forwardchg); free(forwardchh); free(forwardread); free(forwardread1); free(reversecg); free(reversechg); free(reversechh); free(reverseread); free(reverseread1); end_time = time(NULL); fprintf(stderr, "* Done, time used %.0f seconds.\n", difftime(end_time, start_time)); return 0; }
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome) /* txGeneCdsMap - Create mapping between CDS region of gene and genome. */ { /* Load info into hash. */ struct hash *infoHash = hashNew(18); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } lineFileClose(&lf); /* Load refPep/tx alignments into hash keyed by tx. */ struct hash *refPslHash = hashNew(18); struct psl *psl, *pslList = pslLoadAll(refPepToTxPsl); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(refPslHash, psl->tName, psl); struct hash *refToPepHash = hashTwoColumnFile(refToPepTab); struct hash *chromSizeHash = hashNameIntFile(chromSizes); /* Load in bed. */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Open output, and stream through bedList, writing output. */ FILE *fCdsToRna = mustOpen(cdsToRna, "w"); FILE *fRnaToGenome = mustOpen(rnaToGenome, "w"); int refTotal = 0, refFound = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { char *chrom = bed->chrom; int chromSize = hashIntVal(chromSizeHash, chrom); info = hashMustFindVal(infoHash, bed->name); pick = hashMustFindVal(pickHash, bed->name); if (info->isRefSeq) { char *refAcc = txAccFromTempName(bed->name); if (!startsWith("NM_", refAcc)) errAbort("Don't think I did find that refSeq acc, got %s", refAcc); char *protAcc = hashMustFindVal(refToPepHash, refAcc); ++refTotal; if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna)) ++refFound; } else { fakeCdsToMrna(bed, fCdsToRna); } fakeRnaToGenome(bed, chromSize, fRnaToGenome); } } verbose(1, "Missed %d of %d refSeq protein mappings. A small number of RefSeqs just map\n" "to genome in the UTR.\n", refTotal - refFound, refTotal); carefulClose(&fCdsToRna); carefulClose(&fRnaToGenome); }
/* main filter function */ int main_filter(int argc, char *argv[]){ char *output, *subfam, *out, *outReport; unsigned long long int *cnt; struct hash *hashRmsk = newHash(0); struct hash *hashRep = newHash(0); struct hash *hashFam = newHash(0); struct hash *hashCla = newHash(0); int optSam = 0, optthreshold = 1; char *optoutput = NULL, *optname = NULL, *optclass = NULL, *optfamily = NULL; unsigned int optreadlist = 0, optQual = 10, optisize = 500, optExt = 150; int filterField = 0, c, optDup = 0, optNorm = 0, optaddChr = 0, optDis =0, optTreat = 0; float optCov = 0.0001; time_t start_time, end_time; start_time = time(NULL); while ((c = getopt(argc, argv, "SQ:g:N:n:c:t:f:rRTDCE:I:o:h?")) >= 0) { switch (c) { case 'S': optSam = 1; break; case 'Q': optQual = (unsigned int)strtol(optarg, 0, 0); break; case 'g': optCov = atof(optarg); break; case 'N': optNorm = (unsigned int)strtol(optarg, 0, 0); break; case 't': optthreshold = (unsigned int)strtol(optarg, 0, 0); break; case 'r': optreadlist = 1; break; case 'R': optDup = 1; break; case 'T': optTreat = 1; break; case 'D': optDis = 1; break; case 'C': optaddChr = 1; break; case 'n': optname = strdup(optarg); break; case 'c': optclass = strdup(optarg); break; case 'f': optfamily = strdup(optarg); break; case 'E': optExt = (unsigned int)strtol(optarg, 0, 0); break; case 'I': optisize = (unsigned int)strtol(optarg, 0, 0); break; case 'o': optoutput = strdup(optarg); break; case 'h': case '?': return filter_usage(); break; default: return 1; } } if (optind + 4 > argc) return filter_usage(); char *chr_size_file = argv[optind]; char *rep_size_file = argv[optind+1]; char *rmsk_file = argv[optind+2]; char *sam_file = argv[optind+3]; if ( (optname && optclass) || (optname && optfamily) || (optclass && optfamily) || (optname && optclass && optfamily)) errAbort("Please specify only one filter, either -n, -c or -f."); int nindex = 0; if (optNorm == 0){ nindex = 7; } else if (optNorm == 1){ nindex = 8; } else if (optNorm == 2) { nindex = 6; } else if (optNorm == 3) { nindex = 4; } else{ errAbort("Wrong normalization method specified"); } subfam = cloneString("ALL"); if (optname) { optclass = NULL; optfamily = NULL; subfam = cloneString(optname); filterField = 10; }else if (optclass) { optname = NULL; optfamily = NULL; subfam = cloneString(optclass); filterField = 11; } else if (optfamily) { optname = NULL; optclass= NULL; subfam = cloneString(optfamily); filterField = 12; } if (sameString(subfam, "ALL")){ fprintf(stderr, "* You didn't specify any filter, will output all repeats\n"); filterField = 0; } if(optoutput) { output = optoutput; } else { output = cloneString(get_filename_without_ext(basename(sam_file))); } struct hash *chrHash = hashNameIntFile(chr_size_file); struct hash *repHash = hashNameIntFile(rep_size_file); fprintf(stderr, "* Start to parse the rmsk file\n"); rmsk2binKeeperHash(rmsk_file, chrHash, repHash, &hashRmsk, &hashRep, &hashFam, &hashCla, filterField, subfam); //sam file fprintf(stderr, "* Start to parse the SAM/BAM file\n"); //if (optPair){ // cnt = PEsamFile2nodupRepbedFile(sam_file, chrHash, hashRmsk, hashRep, hashFam, hashCla, optSam, optQual, 1, optDup, optaddChr, optisize); //} else { // cnt = samFile2nodupRepbedFile(sam_file, chrHash, hashRmsk, hashRep, hashFam, hashCla, optSam, optQual, 1, optDup, optaddChr); //} cnt = samFile2nodupRepbedFileNew(sam_file, chrHash, hashRmsk, hashRep, hashFam, hashCla, optSam, optQual, 1, optDup, optaddChr, optDis, optisize, optExt, optCov, optTreat, NULL, NULL, 0); fprintf(stderr, "* Preparing the output file\n"); if (asprintf(&out, "%s_%s.iteres.loci", output, subfam) < 0) errAbort("Preparing output wrong"); if (asprintf(&outReport, "%s_%s.iteres.reportloci", output, subfam) < 0) errAbort("Preparing output wrong"); writeFilterOut(hashRmsk, out, optreadlist, optthreshold, subfam, cnt[nindex]); //write report file fprintf(stderr, "* Preparing report file\n"); writeReport(outReport, cnt, optQual, subfam); hashFree(&chrHash); hashFree(&repHash); hashFree(&hashRmsk); hashFree(&hashRep); hashFree(&hashFam); hashFree(&hashCla); free(out); free(outReport); end_time = time(NULL); fprintf(stderr, "* Done, time used %.0f seconds.\n", difftime(end_time, start_time)); return 0; }