// Attemps to find the next significant CpG site. Returns true if one was found // and flase otherwise. static bool read_next_significant_cpg(istream &cpg_stream, GenomicRegion &cpg, double cutoff, bool &skipped_any, bool &sig_raw, size_t &test_cov, size_t &test_meth, size_t &rest_cov, size_t &rest_meth) { GenomicRegion region; skipped_any = false; sig_raw = false; string cpg_encoding; while (getline(cpg_stream, cpg_encoding)) { string record, chrom, name, sign; size_t position; double raw_pval, adjusted_pval, corrected_pval; std::istringstream iss(cpg_encoding); iss.exceptions(std::ios::failbit); iss >> chrom >> position >> sign >> name >> raw_pval >> adjusted_pval >> corrected_pval >> test_cov >> test_meth >> rest_cov >> rest_meth; if (0 <= corrected_pval && corrected_pval < cutoff) { cpg.set_chrom(chrom); cpg.set_start(position); cpg.set_end(position + 1); sig_raw = (0 <= raw_pval && raw_pval < cutoff); return true; } skipped_any = true; } return false; }
/**
 * \brief Is the given genomic region identical to this exon's region?
 *        The two are considered the same when their chromosome, start,
 *        end and strand all compare equal.
 */
bool
Exon::sameRegion(const GenomicRegion &r) const {
  // Bail out on the first attribute that differs; the order of the checks
  // is chromosome, start, end, strand.
  if (r.get_chrom() != this->region.get_chrom())
    return false;
  if (r.get_start() != this->region.get_start())
    return false;
  if (r.get_end() != this->region.get_end())
    return false;
  return r.get_strand() == this->region.get_strand();
}
/** * \brief Does this exon partially overlap the given genomic region? This * function doesn't consider an exact region match (i.e. same chrom, * start, end, and strand) to be a 'partial overlap'. */ bool Exon::partialOverlap(const GenomicRegion &r) const { if (r.get_chrom() != this->region.get_chrom()) return false; if (r.get_strand() != this->region.get_strand()) return false; // by now they must be one the same chromosome and strand... first see if // they match exactly if ((r.get_start() == this->region.get_start()) && (r.get_end() == this->region.get_end())) return false; // not an exact match return (r.overlaps(this->region)); }
// Rewrites the start and end of region using the lookup table cpgs: each
// coordinate is replaced by the value stored under it in the table.
//
// Throws runtime_error (with the region's string form in the message) when
// either the start or the end of region is not a key of cpgs; region is left
// unmodified in that case.
static void
convert_coordinates(const unordered_map<size_t, size_t> &cpgs,
                    GenomicRegion &region) {
  const unordered_map<size_t, size_t>::const_iterator
    start_itr(cpgs.find(region.get_start()));
  const unordered_map<size_t, size_t>::const_iterator
    end_itr(cpgs.find(region.get_end()));

  // Both endpoints must be resolved before either one is overwritten.
  if (start_itr == cpgs.end() || end_itr == cpgs.end())
    throw runtime_error("could not convert:\n" + region.tostring());

  region.set_start(start_itr->second);
  region.set_end(end_itr->second);
}
static void get_chrom(const MappedRead &mr, const chrom_file_map &chrom_files, GenomicRegion &chrom_region, string &chrom) { const chrom_file_map::const_iterator fn(chrom_files.find(mr.r.get_chrom())); if (fn == chrom_files.end()) throw SMITHLABException("could not find chrom: " + mr.r.get_chrom()); chrom.clear(); read_fasta_file(fn->second, mr.r.get_chrom(), chrom); if (chrom.empty()) throw SMITHLABException("could not find chrom: " + mr.r.get_chrom()); chrom_region.set_chrom(mr.r.get_chrom()); }
static void get_chrom(const MappedRead &mr, const vector<string> &all_chroms, const unordered_map<string, size_t> &chrom_lookup, GenomicRegion &chrom_region, string &chrom) { const unordered_map<string, size_t>::const_iterator the_chrom(chrom_lookup.find(mr.r.get_chrom())); if (the_chrom == chrom_lookup.end()) throw runtime_error("could not find chrom: " + mr.r.get_chrom()); chrom = all_chroms[the_chrom->second]; if (chrom.empty()) throw runtime_error("could not find chrom: " + mr.r.get_chrom()); chrom_region.set_chrom(mr.r.get_chrom()); }
static void get_chrom(const bool VERBOSE, const GenomicRegion &r, const unordered_map<string, string>& chrom_files, GenomicRegion &chrom_region, string &chrom) { const unordered_map<string, string>::const_iterator fn(chrom_files.find(r.get_chrom())); if (fn == chrom_files.end()) throw runtime_error("could not find chrom: " + r.get_chrom()); chrom.clear(); read_fasta_file(fn->second, r.get_chrom(), chrom); if (chrom.empty()) throw runtime_error("could not find chrom: " + r.get_chrom()); else { chrom_region.set_chrom(r.get_chrom()); } }
void merge(istream &cpg_stream, ostream &dmr_stream, double cutoff) { bool skipped_last_cpg, sig_raw; GenomicRegion dmr; dmr.set_name("dmr"); size_t dmr_test_cov = 0; size_t dmr_test_meth = 0; size_t dmr_rest_cov = 0; size_t dmr_rest_meth = 0; size_t test_cov = 0; size_t test_meth = 0; size_t rest_cov = 0; size_t rest_meth = 0; // Find the first significant CpG, or terminate the function if none exist. if (!read_next_significant_cpg(cpg_stream, dmr, cutoff, skipped_last_cpg, sig_raw, test_cov, test_meth, rest_cov, rest_meth)) return; dmr.set_score(sig_raw); dmr_test_cov += test_cov; dmr_test_meth += test_meth; dmr_rest_cov += rest_cov; dmr_rest_meth += rest_meth; GenomicRegion cpg; cpg.set_name("dmr"); while(read_next_significant_cpg(cpg_stream, cpg, cutoff, skipped_last_cpg, sig_raw, test_cov, test_meth, rest_cov, rest_meth)) { if (skipped_last_cpg || cpg.get_chrom() != dmr.get_chrom()) { if (dmr.get_score() != 0) dmr_stream << dmr.get_chrom() << '\t' << dmr.get_start() << '\t' << dmr.get_end() << '\t' << dmr.get_name() << '\t' << dmr.get_score() << '\t' << double(dmr_test_meth)/dmr_test_cov - double(dmr_rest_meth)/dmr_rest_cov << std::endl; dmr = cpg; dmr.set_score(sig_raw); dmr_test_cov = test_cov; dmr_test_meth = test_meth; dmr_rest_cov = rest_cov; dmr_rest_meth = rest_meth; } else { dmr.set_end(cpg.get_end()); dmr.set_score(dmr.get_score() + sig_raw); dmr_test_cov += test_cov; dmr_test_meth += test_meth; dmr_rest_cov += rest_cov; dmr_rest_meth += rest_meth; } } dmr_stream << dmr << std::endl; }
/**
 * \brief Annotates one tab-separated input line describing a split and
 *        writes the annotated line through m_pFileWriter.
 *
 * The very first line passed in is skipped (m_iLines is incremented on every
 * call) -- presumably a header line; confirm against the input format.
 *
 * The function works in two phases:
 *   1. genes returned by findChildrenAt(pSplit, m_pSearchLevel, false) are
 *      evaluated with queryGeneSame;
 *   2. if phase 1 yields no result, genes returned by
 *      findChildrenAt(pSplit, m_pSearchLevel, true) are evaluated with
 *      queryGeneDifferent, together with a region looked up at the split end
 *      that the gene does not contain.
 * In both phases, when several genes are found, a candidate result replaces
 * the current one if it has bSameGene, bExonsOfGene or
 * bSameTranscriptInOneGene set (checked in that order) where the current
 * result does not.
 *
 * If a result is found, the input line without its last character, followed
 * by pResult->toString('\t') and a newline, is written with writeDirect.
 * If no result is found, nothing is written.
 *
 * \param sLine the raw input line
 * \param pData not used by this function
 *
 * NOTE(review): pSplit is allocated with new and is not deleted anywhere in
 * this function; ownership of pSplit, of the vectors returned by
 * findChildrenAt and of the sSplitResults objects is not visible here --
 * confirm whether these leak.
 */
void GffSplitLocator::process(std::string& sLine, void* pData)
{
    // Skip the first line seen; the counter advances on every call.
    if (m_iLines++ == 0)
        return;

    m_pLineElements->clear();
    this->split(sLine, '\t', m_pLineElements);

    // Columns as read below: 0 = chromosome, 1 = start, 2 = end.
    // NOTE(review): the previous comment here read
    // "chrom = 0, strand = 1, start = 2, end = 3", which does not match the
    // indices actually used -- confirm which of the two is correct.
    uint32_t iStart = std::stoi( m_pLineElements->at(1) );
    uint32_t iEnd = std::stoi( m_pLineElements->at(2) );

    GffEntry* pChrom = m_pGffLoader->getChromosome( &(m_pLineElements->at(0)) );

    GenomicRegion* pSplit = new GenomicRegion(iStart, iEnd);

    // Phase 1: candidate genes for the split (third argument false; its exact
    // meaning is defined by findChildrenAt -- TODO confirm).
    std::vector<GffEntry*>* pGenes = pChrom->findChildrenAt(pSplit, m_pSearchLevel, false);

    sSplitResults* pResult = NULL;
    std::stringstream oOutLine;

    if (pGenes->size() == 1)
    {
        /**
         * A single gene explains this split
         */
        pResult = this->queryGeneSame(pGenes->at(0), pSplit);
    }

    if (pGenes->size() > 1)
    {
        // Several candidates: keep the first result and replace it whenever a
        // later candidate has a flag set that the current result lacks.
        for (uint32_t i = 0; i < pGenes->size(); ++i)
        {
            GffEntry* pGene = pGenes->at(i);
            sSplitResults* pGeneResult = this->queryGeneSame(pGene, pSplit);

            if (pResult == NULL)
            {
                pResult = pGeneResult;
            } else {

                if ((pResult->bSameGene == false) && (pGeneResult->bSameGene == true))
                {
                    pResult = pGeneResult;
                    continue;
                }

                if ((pResult->bExonsOfGene == false) && (pGeneResult->bExonsOfGene == true))
                {
                    pResult = pGeneResult;
                    continue;
                }

                if ((pResult->bSameTranscriptInOneGene == false) && (pGeneResult->bSameTranscriptInOneGene == true))
                {
                    pResult = pGeneResult;
                    continue;
                }
            }
        }
    }

    if (pResult != NULL)
    {
        // Drop the last character of the input line (presumably a trailing
        // separator or line-end character -- confirm) and append the result.
        oOutLine << sLine.substr(0, sLine.length()-1) << pResult->toString('\t') << std::endl;

        std::string sFullLine = oOutLine.str();
        m_pFileWriter->writeDirect( sFullLine );

        return;
    }

    // Phase 2: no result so far; repeat the gene lookup with the third
    // argument set to true.
    pGenes = pChrom->findChildrenAt(pSplit, m_pSearchLevel, true);
    std::vector<GffEntry*>* pRegions = NULL;
    GffEntry* pRegion = NULL;

    if (pGenes->size() == 1)
    {
        /**
         * A single gene is found for this split
         */
        GffEntry* pGene = pGenes->at(0);

        // Look up regions at whichever end of the split the gene is not
        // tested to contain: the end if the gene contains the start,
        // otherwise the start.
        if (pGene->contains( pSplit->getStart() ))
            pRegions = pChrom->findChildrenAt(NULL, pSplit->getEnd());
        else
            pRegions = pChrom->findChildrenAt(NULL, pSplit->getStart());

        // pRegion stays NULL when no region is found there.
        if (pRegions->size() > 0)
            pRegion = pRegions->at(0);

        pResult = this->queryGeneDifferent(pGene, pRegion, pSplit);
    }

    if (pGenes->size() > 1)
    {
        for (uint32_t i = 0; i < pGenes->size(); ++i)
        {
            GffEntry* pGene = pGenes->at(i);

            if (pGene->contains( pSplit->getStart() ))
                pRegions = pChrom->findChildrenAt(NULL, pSplit->getEnd());
            else
                pRegions = pChrom->findChildrenAt(NULL, pSplit->getStart());

            // NOTE(review): pRegion is not reset per iteration, so when
            // pRegions is empty the region from an earlier gene is reused --
            // confirm this is intended.
            if (pRegions->size() > 0)
                pRegion = pRegions->at(0);

            sSplitResults* pGeneResult = this->queryGeneDifferent(pGene, pRegion, pSplit);

            // Same replacement rule as in phase 1.
            if (pResult == NULL)
            {
                pResult = pGeneResult;
            } else {

                if ((pResult->bSameGene == false) && (pGeneResult->bSameGene == true))
                {
                    pResult = pGeneResult;
                    continue;
                }

                if ((pResult->bExonsOfGene == false) && (pGeneResult->bExonsOfGene == true))
                {
                    pResult = pGeneResult;
                    continue;
                }

                if ((pResult->bSameTranscriptInOneGene == false) && (pGeneResult->bSameTranscriptInOneGene == true))
                {
                    pResult = pGeneResult;
                    continue;
                }
            }
        }
    }

    if (pResult != NULL)
    {
        oOutLine << sLine.substr(0, sLine.length()-1) << pResult->toString('\t') << std::endl;

        std::string sFullLine = oOutLine.str();
        m_pFileWriter->writeDirect( sFullLine );

        return;
    }

    // Nothing could be said about this split; no output is written.
    m_pLineElements->clear();
}
static bool succeeds(const string &chrom, const size_t position, const GenomicRegion &r) { return r.get_chrom() < chrom || (chrom == r.get_chrom() && r.get_end() <= position); }
static bool precedes(const string &chrom, const size_t position, const GenomicRegion &r) { return chrom < r.get_chrom() || (chrom == r.get_chrom() && position < r.get_start()); }
// Does region a end strictly before region b ends? Only the end coordinates
// are compared.
static inline bool
end_less(const GenomicRegion &a, const GenomicRegion &b) {
  const bool a_ends_first = a.get_end() < b.get_end();
  return a_ends_first;
}
// Do regions a and b have equal end coordinates? Chromosome and strand are
// not looked at.
static inline bool
same_end(const GenomicRegion &a, const GenomicRegion &b) {
  const bool ends_match = (a.get_end() == b.get_end());
  return ends_match;
}
// Does region a start at or before the start of region b? Only the start
// coordinates are compared.
static inline bool
start_leq(const GenomicRegion &a, const GenomicRegion &b) {
  const bool a_starts_no_later = a.get_start() <= b.get_start();
  return a_starts_no_later;
}
// Compares the strands of regions a and b. Despite the name, the comparison
// is non-strict: the result is also true when both strands are equal.
// NOTE(review): a non-strict comparison is not a valid strict weak ordering;
// confirm this is never passed to std::sort or similar.
static inline bool
strand_less(const GenomicRegion &a, const GenomicRegion &b) {
  const bool a_strand_not_after = a.get_strand() <= b.get_strand();
  return a_strand_not_after;
}