Пример #1
0
/**
 * Reads the true-pattern file and keeps the patterns contained in [start, end].
 *
 * The first line of truePatternFile is skipped (presumably a column header).
 * Every following record is expected to hold, whitespace separated:
 *   chr  start  end  cid  pid  abundance  methylString
 * For each record with s >= start and e <= end a MethylRead is allocated,
 * filled via parseMethyl() and appended to trueMethylData; its abundance is
 * appended to trueAbundanceData. The allocated reads are stored as raw
 * pointers and are not freed here.
 *
 * After reading, every entry of trueAbundanceData is divided by the sum of
 * all entries so that the abundances add up to 1.
 *
 * @param start lower bound (inclusive) for a pattern's start position
 * @param end   upper bound (inclusive) for a pattern's end position
 */
void readTruePattern(int start, int end){
    std::string chr, methylString;

    // Discard the first line of the file.
    std::string headerLine;
    getline(truePatternFile, headerLine);

    int s = 0, e = 0, cid = 0, pid = 0;
    float abundance = 0;

    // The extraction itself is the loop condition: the loop stops at end of
    // file AND on a malformed record. Testing eof() instead would spin
    // forever once a bad token sets failbit without reaching end of file.
    while (truePatternFile >> chr >> s >> e >> cid >> pid >> abundance >> methylString) {
        if (s >= start && e <= end) {
            MethylRead* m = new MethylRead(s, e - s + 1);
            m->parseMethyl(methylString);

            trueMethylData.push_back(m);
            trueAbundanceData.push_back(abundance);
        }
    }

    // Normalise the abundances to relative frequencies.
    float sum = 0;
    for (unsigned int j = 0; j < trueAbundanceData.size(); j++) {
        sum += trueAbundanceData.at(j);
    }
    // A zero total would turn every entry into NaN/inf; leave them untouched.
    if (sum != 0) {
        for (unsigned int j = 0; j < trueAbundanceData.size(); j++) {
            trueAbundanceData.at(j) = trueAbundanceData.at(j) / sum;
        }
    }
}
Пример #2
0
/**
 * Cost between the reads attached to graph nodes u and v.
 *
 * The two reads are looked up in read_map and compared with
 * MethylRead::distance(), whose return value is used as the number of
 * matches and which also receives `shared` (presumably filled, as an
 * out-parameter, with the number of CpGs common to both reads - confirm
 * against MethylRead).
 *
 * @return mismatches / totalCpG, where
 *         totalCpG   = |cpgs(u)| + |cpgs(v)| - shared and
 *         mismatches = totalCpG - matches.
 *         No check is made for totalCpG == 0.
 */
float cost(Graph::Node u, Graph::Node v) {
    MethylRead* const lhs = read_map[u];
    MethylRead* const rhs = read_map[v];

    int shared = 0;
    const int matches = lhs->distance(rhs, shared);

    // Both quantities are evaluated in the container's size type and only
    // then narrowed to int.
    const int totalCpG = lhs->cpgs.size() + rhs->cpgs.size() - shared;
    const int mismatches = lhs->cpgs.size() + rhs->cpgs.size() - matches - shared;

    return static_cast<float>(mismatches) / totalCpG;
}
Пример #3
0
/**
 * Reads the estimated-pattern file (format with a trailing region column)
 * and keeps the sufficiently abundant patterns contained in the window.
 *
 * The first line of estimatedPatternFile is skipped (presumably a column
 * header). Every following record is expected to hold, whitespace separated:
 *   chr  start  end  cid  pid  abundance  methylString  regions
 *
 * A record is kept when
 *   - s >= start - 1 and e <= end,
 *   - its abundance exceeds 1% of the largest abundance read so far, and
 *   - its methylation string is not "*".
 * Because the threshold uses the running maximum, the result depends on the
 * record order; the file is assumed to be sorted by decreasing abundance.
 *
 * Kept patterns are allocated with new, filled via parseMethyl() and appended
 * to estimatedMethylData (raw pointers, not freed here); their abundances go
 * to estimatedAbundanceData, which is finally normalised to sum to 1.
 *
 * @param start lower bound of the window (records may start at start - 1)
 * @param end   upper bound (inclusive) for a pattern's end position
 */
void readEstimatedPattern(int start, int end){
    std::string chr, methylString, regions;

    // Discard the first line of the file.
    std::string headerLine;
    getline(estimatedPatternFile, headerLine);

    // Kept as float: storing the running maximum in an int truncated it, so
    // for abundances below 1 the threshold silently collapsed to "> 0".
    float maxAbd = 0;

    int s = 0, e = 0, cid = 0, pid = 0;
    float abundance = 0;

    // The extraction itself is the loop condition: the loop stops at end of
    // file AND on a malformed record. Testing eof() instead would spin
    // forever once a bad token sets failbit without reaching end of file.
    while (estimatedPatternFile >> chr >> s >> e >> cid >> pid >> abundance
                                >> methylString >> regions) {
        maxAbd = std::max(maxAbd, abundance);

        if (s >= start - 1 && e <= end && abundance > 0.01 * maxAbd && methylString != "*") {
            MethylRead* m = new MethylRead(s, e - s + 1);
            m->parseMethyl(methylString);

            estimatedMethylData.push_back(m);
            estimatedAbundanceData.push_back(abundance);
        }
    }

    // Normalise the abundances to relative frequencies.
    float sum = 0;
    for (unsigned int j = 0; j < estimatedAbundanceData.size(); j++) {
        sum += estimatedAbundanceData.at(j);
    }
    // A zero total would turn every entry into NaN/inf; leave them untouched.
    if (sum != 0) {
        for (unsigned int j = 0; j < estimatedAbundanceData.size(); j++) {
            estimatedAbundanceData.at(j) = estimatedAbundanceData.at(j) / sum;
        }
    }
}
Пример #4
0
/**
 * Reads the short-read file and keeps the reads contained in the window.
 *
 * Every record of shortReadFile is expected to hold, whitespace separated:
 *   readId  start  length  strand  methylString  etc
 * A read is kept when s >= start - 1 and s + l - 1 <= end. Kept reads are
 * allocated with new, filled via parseMethyl() and appended to
 * readMethylData (raw pointers, not freed here).
 *
 * @param start lower bound of the window (reads may start at start - 1)
 * @param end   upper bound (inclusive) for a read's last position
 */
void readShortRead(int start, int end){
    int s = 0, l = 0;
    std::string readId, methylString, strand, etc;

    // The extraction itself is the loop condition. With an eof() test the
    // final, failed extraction left s, l and methylString at the values of
    // the previous record, so the last read of the file was stored twice
    // (and an empty file evaluated uninitialised s and l).
    while (shortReadFile >> readId >> s >> l >> strand >> methylString >> etc) {
        if (s >= start - 1 && s + l - 1 <= end) {
            // NOTE(review): the pattern readers in this file pass a length
            // (e - s + 1) as the second constructor argument, whereas an end
            // coordinate is passed here - confirm against MethylRead's
            // constructor whether `l` is what was intended.
            MethylRead* m = new MethylRead(s, s + l - 1);
            m->parseMethyl(methylString);

            readMethylData.push_back(m);
        }
    }
}
Пример #5
0
/**
 * Reads the estimated-pattern file and keeps the patterns contained in the
 * window.
 *
 * The first line of estimatedPatternFile is skipped (presumably a column
 * header). Every following record is expected to hold, whitespace separated:
 *   chr  start  end  cid  pid  abundance  methylString
 * For each record with s >= start - 1 and e <= end a MethylRead is allocated,
 * filled via parseMethyl() and appended to estimatedMethylData (raw pointers,
 * not freed here); its abundance is appended to estimatedAbundanceData.
 *
 * After reading, every entry of estimatedAbundanceData is divided by the sum
 * of all entries so that the abundances add up to 1.
 *
 * @param start lower bound of the window (records may start at start - 1)
 * @param end   upper bound (inclusive) for a pattern's end position
 */
void readEstimatedPattern(int start, int end){
    std::string chr, methylString;

    // Discard the first line of the file.
    std::string headerLine;
    getline(estimatedPatternFile, headerLine);

    int s = 0, e = 0, cid = 0, pid = 0;
    float abundance = 0;

    // The extraction itself is the loop condition: the loop stops at end of
    // file AND on a malformed record. Testing eof() instead would spin
    // forever once a bad token sets failbit without reaching end of file.
    while (estimatedPatternFile >> chr >> s >> e >> cid >> pid >> abundance >> methylString) {
        if (s >= start - 1 && e <= end) {
            MethylRead* m = new MethylRead(s, e - s + 1);
            m->parseMethyl(methylString);

            estimatedMethylData.push_back(m);
            estimatedAbundanceData.push_back(abundance);
        }
    }

    // Normalise the abundances to relative frequencies.
    float sum = 0;
    for (unsigned int j = 0; j < estimatedAbundanceData.size(); j++) {
        sum += estimatedAbundanceData.at(j);
    }
    // A zero total would turn every entry into NaN/inf; leave them untouched.
    if (sum != 0) {
        for (unsigned int j = 0; j < estimatedAbundanceData.size(); j++) {
            estimatedAbundanceData.at(j) = estimatedAbundanceData.at(j) / sum;
        }
    }
}
Пример #6
0
    /**
     * Fills normalized_coverage_map for the nodes of mfGraph.
     *
     * Reads are grouped by start position. Within a group each node's
     * coverage is divided by the group's total coverage; afterwards every
     * node's value is multiplied by the median of the per-position totals.
     * Sets is_normalized on success.
     *
     * Reads are visited in the order produced by a priority queue using
     * CompareReadStarts. A read whose start is strictly smaller than the
     * current one opens a new group, an equal start joins the current group,
     * and a larger start is reported as "nodes out of order" and aborts the
     * normalisation (is_normalized is then left unchanged). Reads whose node
     * is no longer valid in the graph are skipped.
     */
    void MFGraph::normalize_coverage()
    {
        // put nodes in a priority queue by start position
        // to ensure we iterate in the proper order
        std::priority_queue<MethylRead*, std::vector<MethylRead*>, CompareReadStarts> position_queue;
        for (ListDigraph::NodeIt n(mfGraph); n != INVALID; ++n) {
            MethylRead *read = read_map[n];
            position_queue.push(read);
        }

        // total coverage of every start position seen so far
        std::vector<float> coverage_per_position;
        // nodes sharing the start position currently being accumulated
        std::vector<ListDigraph::Node> current_nodes;

        // -1 marks "no position opened yet"
        int current_startpos = -1;
        float current_coverage = 0.;

        while (! position_queue.empty() ) {
            MethylRead *read = position_queue.top();
            position_queue.pop();

            ListDigraph::Node n = read->node;
            // keep going if node is invalid
            if (!mfGraph.valid(n)) continue;

#ifndef NDEBUG
            std::cout << "current node:" << read->start() << " " << coverage_map[n] << std::endl;
#endif

            // first valid read: open the first group
            if (current_startpos == -1 ) {
                current_startpos = read->start();
                current_coverage = coverage_map[n];
                current_nodes.push_back(n);
                continue;
            }

            if (read->start() < current_startpos) {
                // the previous group is complete: record its total
                coverage_per_position.push_back(current_coverage);

#ifndef NDEBUG
                std::cout << "new position!" << std::endl;
                std::cout << current_startpos << " " << current_coverage << std::endl;
#endif

                // divide by position coverage
                for (std::vector<ListDigraph::Node>::iterator it = current_nodes.begin(); it != current_nodes.end(); ++it) {
                    ListDigraph::Node node = *it;
                    normalized_coverage_map[node] = (float) coverage_map[node] / current_coverage;
#ifndef NDEBUG
                    std::cout << coverage_map[node] << " " << normalized_coverage_map[node] << std::endl;
#endif
                }

#ifndef NDEBUG
                std::cout << std::endl;
#endif

                // open the next group with this read
                current_nodes.clear();
                current_nodes.push_back(n);
                current_coverage = (float) coverage_map[n];
                current_startpos = read->start();
            } else if (read->start() == current_startpos) {
                // This must be an `else if`: the branch above has just set
                // current_startpos to this read's start, so a plain `if`
                // would add the same node and its coverage a second time.
                current_coverage += (float) coverage_map[n];
                current_nodes.push_back(n);
            } else {
                std::cerr << "nodes out of order" << std::endl;
                return;
            }
        }

        // process last position
        coverage_per_position.push_back(current_coverage);
        for (std::vector<ListDigraph::Node>::iterator it = current_nodes.begin(); it != current_nodes.end(); ++it) {
            ListDigraph::Node node = *it;
            normalized_coverage_map[node] = (float) coverage_map[node] / current_coverage;
#ifndef NDEBUG
            std::cout << coverage_map[node] << " " << normalized_coverage_map[node] << std::endl;
#endif
        }

        float median_coverage = calculate_median(coverage_per_position);
#ifndef NDEBUG
        std::cout << "median coverage: " << median_coverage << std::endl;
#endif

        // rescale the per-position fractions by the median position coverage
        for (ListDigraph::NodeIt n(mfGraph); n != INVALID; ++n) {
            normalized_coverage_map[n] *= median_coverage;
        }
        is_normalized = true;
    }
Пример #7
0
    /**
     * Decomposes the flow stored in flow_map into source-to-sink paths and
     * writes one methylation pattern per path to patt_stream.
     *
     * While the remaining total flow exceeds 1e-5, a path from `source` to
     * `sink` is selected with a Dijkstra run that uses
     * DijkstraMinMaxOperationTraits on arc lengths (total_flow - flow).
     * The path's flow is subtracted from every arc on it, arcs left with
     * less than 1e-6 flow are erased from mfGraph, and the reads along the
     * path are merged into a single pattern.
     *
     * Each output line is tab separated:
     *   chr, start, end, componentID, pattern number, path flow,
     *   methylation string, comma separated list of node ids on the path.
     *
     * @param componentID identifier written into the 4th output column
     * @param patt_stream stream receiving one line per extracted pattern
     * @param chr         chromosome name written into the 1st output column
     * @return number of patterns (paths) written
     */
    int MFGraph::decompose(const int componentID, std::ostream & patt_stream, std::string chr)
    {
        // maps nodes to the ids printed in the region list
        IdMap<ListDigraph, ListDigraph::Node> idmap(mfGraph);
        
        // compute total flow
        float total_flow = this->total_flow();
#ifndef NDEBUG
        std::cout << "total flow:: " << total_flow << std::endl;
#endif
        
        // number of patterns emitted so far; also the pattern id in the output
        int flownum = 0;
        
        // iterate while residual flow
        while (total_flow > 0.00001) {
            
            // compute residual flow for each arc
            // (used as the arc length for the Dijkstra run below)
            ListDigraph::ArcMap<float> residual_flow(mfGraph);
            for (ListDigraph::ArcIt arc(mfGraph); arc != INVALID; ++arc) {
                residual_flow[arc] = (total_flow - flow_map[arc]);
                if(flow_map[arc] != 0){
// #ifndef NDEBUG
//                     std::cout << "source: " << mfGraph.id(mfGraph.source(arc))<< ", target: " << mfGraph.id(mfGraph.target(arc)) << ", flow: " << flow_map[arc] <<std::endl;
// #endif
                }
                
            }
#ifndef NDEBUG
            std::cout << "run the min-max dijkstra algorithm " << std::endl;
#endif
            // run the min-max dijkstra algorithm
            // NOTE(review): presumably the traits make a path's length the
            // maximum of its arc lengths, so the chosen path maximises its
            // smallest arc flow - confirm against DijkstraMinMaxOperationTraits.
            ListDigraph::NodeMap<float> dist(mfGraph);
            Dijkstra<ListDigraph, ListDigraph::ArcMap<float> >
            ::SetOperationTraits<DijkstraMinMaxOperationTraits<float> >
            ::Create dijkstra(mfGraph, residual_flow);
            dijkstra.distMap(dist);
            dijkstra.run(source, sink);
#ifndef NDEBUG
            std::cout << "get the resulting path and it's flow " << std::endl;
#endif
            // get the resulting path and its flow
            // (arc lengths were total_flow - flow, so subtracting the
            // distance from total_flow converts it back into a flow value)
            Path<ListDigraph> shortestPath = dijkstra.path(sink);
            float path_flow = total_flow - dijkstra.dist(sink);

            // break out if this is not a valid path
            // NOTE(review): exact floating-point comparison with 0.
            if (path_flow == 0) {
#ifndef NDEBUG
              std::cout << "Found invalid path" << std::endl;
#endif
              break;
            }

            // construct a meth fragment from path here
            // and remove path flow from each arc in path
#ifndef NDEBUG
            std::cout << "dijkstra.dist: " << dijkstra.dist(sink) << ", path_flow:" << path_flow << ", total flow: " << total_flow << " length: " << shortestPath.length() << std::endl;
#endif

            // it's a valid pattern, so increase number of paths found
            flownum++;

            // ids of the nodes on this path, for the last output column
            std::stringstream region_list;
            // merged pattern for this path; allocated below, deleted at the
            // end of this iteration
            MethylRead* pattern = NULL;
            // NOTE(review): start/end are only assigned when an arc leaving
            // `source` is found in the loop below.
            int start, end;
            //MethylRead pattern = MethylRead(*read_map[source]);
            //int start = read_map[source]->start();
            //int end = read_map[sink]->end();
            
            // seed the pattern with a copy of the read at the target of the
            // arc that leaves `source`
            for(Path<ListDigraph>::ArcIt arc(shortestPath); arc != INVALID; ++arc) {
                ListDigraph::Node s = mfGraph.source(arc);
                ListDigraph::Node t = mfGraph.target(arc);
                if (s == source) {
                    pattern = new MethylRead(*read_map[t]);
                    start = read_map[t]->start();
                    end = read_map[t]->end();
                    // end = read_map[sink]->end();

#ifndef NDEBUG
                    std::cout << "Original pattern = " << pattern->getMethString() << std::endl;
#endif
                    break;
                }
                
            }

#ifndef NDEBUG
            // debug aid: dump the path when no arc leaving `source` was found
            // NOTE(review): in that case `pattern` stays NULL and is still
            // dereferenced further down (merge / getMethString).
            if (!pattern) {
              std::cout << "We should not hit this" << std::endl;
              for (Path<ListDigraph>::ArcIt arc(shortestPath); arc != INVALID; ++arc) {
                ListDigraph::Node s = mfGraph.source(arc);
                ListDigraph::Node t = mfGraph.target(arc);
                std::cout << mfGraph.id(s) << " -> " << mfGraph.id(t) << std::endl;
              }
            }
#endif
             
            // second pass over the path: build the region list, subtract the
            // path flow from each arc and merge the reads into the pattern
            for(Path<ListDigraph>::ArcIt arc(shortestPath); arc != INVALID; ++arc) {
// #ifndef NDEBUG
//                 std::cout << " After finding a path, " << "source: " << mfGraph.id(mfGraph.source(arc))<< ", target: " << mfGraph.id(mfGraph.target(arc)) << ", flow: " << flow_map[arc] << " ,arc - pathFlow: " << flow_map[arc] - path_flow << std::endl;
// #endif
                
                // endpoints and read are fetched before the arc may be erased
                ListDigraph::Node s = mfGraph.source(arc);
                ListDigraph::Node t = mfGraph.target(arc);
                
                MethylRead *read = read_map[t];

                // don't print the source node or nodes connected to sink
                if (s != source && t != get_sink()) {
                    region_list << idmap[s];
                    // no separator after the entry whose target is childless
                    if (!childless[t]) {
                        region_list << ",";
                    }
                }
                
                flow_map[arc] -= path_flow;
                
                // delete arc if no residual flow
                if (flow_map[arc] < 1e-6) {
                    mfGraph.erase(arc);
                }
                
                // the read at the target of the source arc already seeded
                // the pattern
                if (s == source) {
                    continue;
                }
                
                
                // nothing to merge for a missing read or for the sink
                if (!read  || t == get_sink()) {
                    continue;
                }
                
             
                // extend the pattern with this read and move the end
                // coordinate to the read's end
                if (s != source && t != get_sink()) {
                    pattern->merge(read);
                    end = read->end();
                    //std::cout << "new pattern = " << pattern->getMethString() << std::endl;
                }
                
              
              //  if (childless[t]) {
              //      end = read->end();
              //  }
                
                
            }

#ifndef NDEBUG
            std::cout << "new pattern = " << pattern->getMethString() << std::endl;
#endif

            //Note we add source one nucleotide before every read
            // one tab separated output line per pattern
            patt_stream << chr << "\t" << start << "\t" << end;
            patt_stream << "\t" << componentID << "\t" << flownum << "\t" << path_flow;
            patt_stream << "\t" << pattern->getMethString() << "\t" << region_list.str() << std::endl;

            delete pattern;
            
            // recompute residual flow
            total_flow -= path_flow;
        }
        
        // all done
        return flownum;
    }