void readTruePattern(int start, int end){ //cout << " s is for start position, e is for end position" << endl; string chr, methylString; string dummyLine; getline(truePatternFile, dummyLine); int i = 0; while(!truePatternFile.eof()) { int s = -100, e = -100, cid, pid; float abundance = 0 ; i++; truePatternFile >> chr >> s >> e >> cid >> pid >> abundance >> methylString; //cout << "Ture read : "<< chr << " " << s << " " << e <<endl; if( s >= start && e <= end){ MethylRead* m = new MethylRead(s, e-s+1); m->parseMethyl(methylString); // m->write(); trueMethylData.push_back(m); trueAbundanceData.push_back(abundance); } } //cout << "out of while loop" << endl; float sum =0; for(unsigned int j= 0; j<trueAbundanceData.size(); j++){ sum += trueAbundanceData.at(j); } for(unsigned int j= 0; j<trueAbundanceData.size(); j++){ trueAbundanceData.at(j) = trueAbundanceData.at(j)/sum ; } //cout << "out of read true pattern" << endl; }
// Dissimilarity between the reads attached to graph nodes u and v.
//
// readU->distance(readV, common) supplies two counts: its return value is
// used as the number of matching CpGs and `common` as the number of CpGs the
// two reads share (presumably filled in by reference -- confirm against
// MethylRead::distance).
//
//     totalCpG = |cpgs(u)| + |cpgs(v)| - common
//     mismatch = totalCpG - match
//
// Returns mismatch / totalCpG, or 0 when neither read carries any CpG.
float cost(Graph::Node u, Graph::Node v) {
    MethylRead* readU = read_map[u];
    MethylRead* readV = read_map[v];

    int common = 0;
    const int match = readU->distance(readV, common);

    const int totalCpG = readU->cpgs.size() + readV->cpgs.size() - common;
    const int mismatch = readU->cpgs.size() + readV->cpgs.size() - match - common;

    // With no CpG in either read the ratio would be 0/0 (NaN); two reads with
    // nothing to compare do not disagree, so report zero cost.
    if (totalCpG == 0) {
        return 0.0f;
    }

    return float(mismatch) / totalCpG;
}
void readEstimatedPattern(int start, int end){ // "s" is for start position, "e" is for end position //cout << "start of readEstimated Patterns" << endl; string chr, methylString, regions; string dummyLine; int maxAbd= 0; getline(estimatedPatternFile, dummyLine); //cout << "dummy line = " << dummyLine << endl; int i = 0; while(!estimatedPatternFile.eof()) { int s = -100, e = -100, cid, pid; float abundance = 0 ; i++; estimatedPatternFile >> chr >> s >> e >> cid >> pid >> abundance >> methylString >> regions; // we assume the patterns are sorted by their abundance. // if(i==1) maxAbd = std::max(double(maxAbd), double(abundance)); // cout << "start = " << start << " s= " << s << "end = " << end << " abundance= " << abundance << " methylstring = " << methylString << endl; // cout << "Estimated read : "<< chr << " " << s << " " << e << " " << methylString << endl; if( s >= start -1 && e <= end && abundance > 0.01*maxAbd && methylString!="*"){ MethylRead* m = new MethylRead(s, e-s+1); //cerr << "befor parse" << endl; m->parseMethyl(methylString); //cerr << "befor write" << endl; //m->write(); //cerr << "i = " << i << endl; estimatedMethylData.push_back(m); estimatedAbundanceData.push_back(abundance); } } //cout << "out of while loop" << endl; float sum =0; for(unsigned int j= 0; j<estimatedAbundanceData.size(); j++){ sum += estimatedAbundanceData.at(j); } for(unsigned int j= 0; j<estimatedAbundanceData.size(); j++){ estimatedAbundanceData.at(j) = estimatedAbundanceData.at(j)/sum ; } //cout << "out of read estimated pattern" << endl; }
void readShortRead(int start, int end){ int s, l; string readId, methylString, strand, etc; //string dummyLine; //getline(truePatternFile, dummyLine); int i = 0; while(!shortReadFile.eof()) { i++; shortReadFile >> readId >> s >> l >> strand >> methylString >> etc; //cout << "Ture read : "<< chr << " " << s << " " << methylString <<endl; if( s >= start - 1 && s + l -1 <= end){ MethylRead* m = new MethylRead(s, s + l -1); m->parseMethyl(methylString); //m->write(); readMethylData.push_back(m); } } }
void readEstimatedPattern(int start, int end){ // "s" is for start position, "e" is for end position string chr, methylString; string dummyLine; getline(estimatedPatternFile, dummyLine); int i = 0; while(!estimatedPatternFile.eof()) { int s = -100, e = -100, cid, pid; float abundance = 0 ; i++; estimatedPatternFile >> chr >> s >> e >> cid >> pid >> abundance >> methylString; //cout << "Estimated read : "<< chr << " " << s << " " << e << " " << methylString << endl; if( s >= start -1 && e <= end){ MethylRead* m = new MethylRead(s, e-s+1); //cerr << "befor parse" << endl; m->parseMethyl(methylString); //cerr << "befor write" << endl; //m->write(); //cerr << "after write" << endl; estimatedMethylData.push_back(m); estimatedAbundanceData.push_back(abundance); } } float sum =0; for(unsigned int j= 0; j<estimatedAbundanceData.size(); j++){ sum += estimatedAbundanceData.at(j); } for(unsigned int j= 0; j<estimatedAbundanceData.size(); j++){ estimatedAbundanceData.at(j) = estimatedAbundanceData.at(j)/sum ; } }
void MFGraph::normalize_coverage() { // put nodes in a priority queue by start position // to ensure we iterate in the proper order std::priority_queue<MethylRead*, std::vector<MethylRead*>, CompareReadStarts> position_queue; for (ListDigraph::NodeIt n(mfGraph); n != INVALID; ++n) { MethylRead *read = read_map[n]; // std::cout << read->start() << std::endl; position_queue.push(read); } std::vector<float> coverage_per_position; std::vector<ListDigraph::Node> current_nodes; int current_startpos = -1; float current_coverage = 0.; while (! position_queue.empty() ) { MethylRead *read = position_queue.top(); position_queue.pop(); ListDigraph::Node n = read->node; // keep going if node is invalid if (!mfGraph.valid(n)) continue; #ifndef NDEBUG std::cout << "current node:" << read->start() << " " << coverage_map[n] << std::endl; #endif if (current_startpos == -1 ) { current_startpos = read->start(); current_coverage = coverage_map[n]; current_nodes.push_back(n); continue; } if (read->start() < current_startpos) { coverage_per_position.push_back(current_coverage); #ifndef NDEBUG std::cout << "new position!" 
<< std::endl; std::cout << current_startpos << " " << current_coverage << std::endl; #endif // divide by position coverage for (std::vector<ListDigraph::Node>::iterator it = current_nodes.begin(); it != current_nodes.end(); ++it) { ListDigraph::Node node = *it; normalized_coverage_map[node] = (float) coverage_map[node] / current_coverage; #ifndef NDEBUG std::cout << coverage_map[node] << " " << normalized_coverage_map[node] << std::endl; #endif } #ifndef NDEBUG std::cout << std::endl; #endif current_nodes.clear(); current_nodes.push_back(n); current_coverage = (float) coverage_map[n]; current_startpos = read->start(); } if (read->start() == current_startpos) { current_coverage += (float) coverage_map[n]; current_nodes.push_back(n); } else { std::cerr << "nodes out of order" << std::endl; return; } } // process last position coverage_per_position.push_back(current_coverage); for (std::vector<ListDigraph::Node>::iterator it = current_nodes.begin(); it != current_nodes.end(); ++it) { ListDigraph::Node node = *it; normalized_coverage_map[node] = (float) coverage_map[node] / current_coverage; #ifndef NDEBUG std::cout << coverage_map[node] << " " << normalized_coverage_map[node] << std::endl; #endif } float median_coverage = calculate_median(coverage_per_position); #ifndef NDEBUG std::cout << "median coverage: " << median_coverage << std::endl; #endif for (ListDigraph::NodeIt n(mfGraph); n != INVALID; ++n) { normalized_coverage_map[n] *= median_coverage; } is_normalized = true; }
int MFGraph::decompose(const int componentID, std::ostream & patt_stream, std::string chr) { IdMap<ListDigraph, ListDigraph::Node> idmap(mfGraph); // compute total flow float total_flow = this->total_flow(); #ifndef NDEBUG std::cout << "total flow:: " << total_flow << std::endl; #endif int flownum = 0; // iterate while residual flow while (total_flow > 0.00001) { // compute residual flow for each arc ListDigraph::ArcMap<float> residual_flow(mfGraph); for (ListDigraph::ArcIt arc(mfGraph); arc != INVALID; ++arc) { residual_flow[arc] = (total_flow - flow_map[arc]); if(flow_map[arc] != 0){ // #ifndef NDEBUG // std::cout << "source: " << mfGraph.id(mfGraph.source(arc))<< ", target: " << mfGraph.id(mfGraph.target(arc)) << ", flow: " << flow_map[arc] <<std::endl; // #endif } } #ifndef NDEBUG std::cout << "run the min-max dijkstra algorithm " << std::endl; #endif // run the min-max dijkstra algorithm ListDigraph::NodeMap<float> dist(mfGraph); Dijkstra<ListDigraph, ListDigraph::ArcMap<float> > ::SetOperationTraits<DijkstraMinMaxOperationTraits<float> > ::Create dijkstra(mfGraph, residual_flow); dijkstra.distMap(dist); dijkstra.run(source, sink); #ifndef NDEBUG std::cout << "get the resulting path and it's flow " << std::endl; #endif // get the resulting path and it's flow Path<ListDigraph> shortestPath = dijkstra.path(sink); float path_flow = total_flow - dijkstra.dist(sink); // break out if this is not a valid path if (path_flow == 0) { #ifndef NDEBUG std::cout << "Found invalid path" << std::endl; #endif break; } // construct a meth fragment from path here // and remove path flow from each arc in path #ifndef NDEBUG std::cout << "dijkstra.dist: " << dijkstra.dist(sink) << ", path_flow:" << path_flow << ", total flow: " << total_flow << " length: " << shortestPath.length() << std::endl; #endif // it's a valid pattern, so increase number of paths found flownum++; std::stringstream region_list; MethylRead* pattern = NULL; int start, end; //MethylRead pattern = 
MethylRead(*read_map[source]); //int start = read_map[source]->start(); //int end = read_map[sink]->end(); for(Path<ListDigraph>::ArcIt arc(shortestPath); arc != INVALID; ++arc) { ListDigraph::Node s = mfGraph.source(arc); ListDigraph::Node t = mfGraph.target(arc); if (s == source) { pattern = new MethylRead(*read_map[t]); start = read_map[t]->start(); end = read_map[t]->end(); // end = read_map[sink]->end(); #ifndef NDEBUG std::cout << "Original pattern = " << pattern->getMethString() << std::endl; #endif break; } } #ifndef NDEBUG if (!pattern) { std::cout << "We should not hit this" << std::endl; for (Path<ListDigraph>::ArcIt arc(shortestPath); arc != INVALID; ++arc) { ListDigraph::Node s = mfGraph.source(arc); ListDigraph::Node t = mfGraph.target(arc); std::cout << mfGraph.id(s) << " -> " << mfGraph.id(t) << std::endl; } } #endif for(Path<ListDigraph>::ArcIt arc(shortestPath); arc != INVALID; ++arc) { // #ifndef NDEBUG // std::cout << " After finding a path, " << "source: " << mfGraph.id(mfGraph.source(arc))<< ", target: " << mfGraph.id(mfGraph.target(arc)) << ", flow: " << flow_map[arc] << " ,arc - pathFlow: " << flow_map[arc] - path_flow << std::endl; // #endif ListDigraph::Node s = mfGraph.source(arc); ListDigraph::Node t = mfGraph.target(arc); MethylRead *read = read_map[t]; // don't print the source node or nodes connected to sink if (s != source && t != get_sink()) { region_list << idmap[s]; if (!childless[t]) { region_list << ","; } } flow_map[arc] -= path_flow; // delete arc if no residual flow if (flow_map[arc] < 1e-6) { mfGraph.erase(arc); } if (s == source) { continue; } if (!read || t == get_sink()) { continue; } if (s != source && t != get_sink()) { pattern->merge(read); end = read->end(); //std::cout << "new pattern = " << pattern->getMethString() << std::endl; } // if (childless[t]) { // end = read->end(); // } } #ifndef NDEBUG std::cout << "new pattern = " << pattern->getMethString() << std::endl; #endif //Note we add source one nucleotide before 
every read patt_stream << chr << "\t" << start << "\t" << end; patt_stream << "\t" << componentID << "\t" << flownum << "\t" << path_flow; patt_stream << "\t" << pattern->getMethString() << "\t" << region_list.str() << std::endl; delete pattern; // recompute residual flow total_flow -= path_flow; } // all done return flownum; }