//~~~~~~~ //8/29/2014 //Recursive function //Keep Spliting until component gets smaller than let's say 300 void splitCmp (int cmpID, vector<string> trVec, map<int, vector<string>> &rcCounts, map<string, vector<string>> &rcID_reverse, map<vector<string>, int> &obsReadsClasses, map<vector<string>, string> &rcID, map<string, map<string, int>> >f, map <string, string> &fa) { //Note: Leave single transcript classes "as is" int tempCount=0; //cout<<"Component "<<cmpID<<" has size > 10"<<endl; //sort the current read class in another vector vector<string> sortedReadClass; //Adjust read classes in this component and resplit using DFS (or boost) //1. Get the read classes for this component and save them in a local variable //It's faster to search for matching read class in this small structure map<vector<string>, int> cmpObsReadsClasses; //All obs read class that belong to this component only //for each read class ID for (const auto &class_ID : rcCounts[cmpID]){ sortedReadClass.clear(); //for each transcript in this read class rcID_reverse[class_ID]-->print the class and the counts for (const auto &tr : rcID_reverse[class_ID]) { sortedReadClass.push_back(tr); //cout<<tr<<"\t"; } //cout<<"]"<<"\t"<<obsReadsClasses[rcID_reverse[class_ID]]<<endl; sort(sortedReadClass.begin(), sortedReadClass.end()); cmpObsReadsClasses[sortedReadClass]=obsReadsClasses[rcID_reverse[class_ID]]; } #if DEBUG cout<<"Print all initial read classes that belong to component"<<cmpID<<endl; print(cmpObsReadsClasses); //exit(7); #endif //2.Compress Read Classes //Delete the last transcript //for each read class ID for (const auto &class_ID : rcCounts[cmpID]){ //cout<<"Processing Read Class: "<<endl; //cout<<"[\t"; sortedReadClass.clear(); //for each transcript in this read class rcID_reverse[class_ID]-->print the class and the counts for (const auto &tr : rcID_reverse[class_ID]) { sortedReadClass.push_back(tr); //cout<<tr<<"\t"; } //cout<<"]"<<"\t"<<obsReadsClasses[rcID_reverse[class_ID]]<<endl; sort(sortedReadClass.begin(), sortedReadClass.end()); //if counts < 10 && readclass size is not 1 if(obsReadsClasses[rcID_reverse[class_ID]] < readCountThresh && rcID_reverse[class_ID].size()>1) { //cout<<" Count of read class: "<<endl; //print(rcID_reverse[class_ID]); //cout<<" is : "<<obsReadsClasses[rcID_reverse[class_ID]]<<endl; //keep the initial class before removing elements from the end vector<string> initialReadClass; for (const auto &tr : rcID_reverse[class_ID]) initialReadClass.push_back(tr); sort(initialReadClass.begin(), initialReadClass.end()); bool foundClass=false; while(!foundClass && !rcID_reverse[class_ID].empty()) { rcID_reverse[class_ID].pop_back(); if(!rcID_reverse[class_ID].empty()) { //if there is at least one more transcript in this class after removing //sort again sortedReadClass.clear(); for (const auto &tr : rcID_reverse[class_ID]) sortedReadClass.push_back(tr); sort(sortedReadClass.begin(), sortedReadClass.end()); //#if DEBUG // cout<<"New Read Class "<<endl; // cout<<" ["; // for (const auto tr : sortedReadClass) // cout<<tr<<" "; // cout<<" ]"<<endl; //#endif //no need to search inside all read class ... only those read classes that belong to this component //cmpObsReadsClasses have been created to address the issue above if ( cmpObsReadsClasses.find(sortedReadClass) != cmpObsReadsClasses.end() ){ //if this read class is found among the given classes #if DEBUG cout<<" ["; cout<<"Read class has been found"<<endl; for (const auto tr : sortedReadClass) cout<<tr<<" "; cout<<" ]"<<endl; #endif //increment the new founded class obsReadsClasses[sortedReadClass]+=obsReadsClasses[initialReadClass]; //global read classes cmpObsReadsClasses[sortedReadClass]+=obsReadsClasses[initialReadClass]; //local read classes //delete the old class obsReadsClasses.erase(initialReadClass); cmpObsReadsClasses.erase(initialReadClass); //Note: All structures that are related to obsReadsClasses MUST also be updated //This includes but not limited to: p_value_new2 foundClass=true; } } else { //else after poping we got to the end of the stack-->this means no read class was found //drop the read and write this read to file the read to which was initially mapped that was initally mapped to initialReadClass /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //unused_reads<<read_name<<"\t mapped to class: ["; cout<<"\t Initial class. (NOT Compatible with any other class): ["; for (const auto tr : initialReadClass) cout<<tr<<" "; cout<<" ]"<<endl; //discarded_reads++; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ //delete the old class obsReadsClasses.erase(initialReadClass); cmpObsReadsClasses.erase(initialReadClass); #if DEBUG //cout<<"Read "<<read_name<<" was NOT mapped to any class"<<endl; cout<<"Initial Class: ["; for (const auto tr : initialReadClass) cout<<tr<<" "; cout<<" ]"<<endl; #endif } } //end while class not found } }//end: for each read class ID //cout<<"Print adjusted read classes that belong to this component"<<endl; //print(cmpObsReadsClasses); //2. Maybe create again the graph and run boost? (some recursion needs to be done here) //!!!the entire p_value_new must be updated? struct Vertex { std::string vertexName; }; //typedef adjacency_list <vecS, vecS, undirectedS, Vertex> MyGraph; typedef boost::labeled_graph<adjacency_list<vecS, vecS, undirectedS, Vertex>, std::string> MyGraph; //The side effect of this is that one need to use graph() member function to make some algorithms work: typedef boost::graph_traits<MyGraph>::vertex_descriptor VertexID; MyGraph G; //We'll use d_value (p_value) to hold the adjacency matrix map< string, map< vector<string>, double> > p_value_new2; //this is d_value //local variable //for each OBS Read class for (const auto &read_class: cmpObsReadsClasses){ //for each transcript in the current class for (const auto &tr: read_class.first){ //if this value exists --> then we have an error if(p_value_new2[tr][read_class.first]){ cout<<"Error: p value new2 already exists!!!"<<endl; exit(1); } p_value_new2[tr][read_class.first]=read_class.second;//enter the counts for each class } } #if DEBUG cout<<"\nPrint p_value_new2 (local):"<<endl; //print(p_value_new2); //exit(7); #endif //Build the graph //for each transcript -- for (const auto &tr : p_value_new2){ // Create vertices in that graph VertexID u = boost::add_vertex(tr.first,G); G[tr.first].vertexName = tr.first; //for each class in the current transcript for (const auto &read_class : tr.second ){ //string rClassName = "["; //add [ in order to differentiate the classes that contains only one transcripts from the transcripts itself string rClassName=rcID[read_class.first]; //get just the read class ID VertexID v = boost::add_vertex(rClassName,G); G[rClassName].vertexName = rClassName; //in case vertex already exists then this should overwrite add_edge(u, v, G); //allCout<<"[Print 1] Edge btwn vertex "<<tr.first<<" and vertex "<<rClassName<<endl; } } std::vector<int> component(num_vertices(G)); int num = connected_components(G, &component[0]); //cout << "Total number of components: " << num << endl; //go thorugh components //cout<<"component size is: "<<component.size()<<endl; //CREATE MAP THAT WILL HOLD ALL THE COMPONENTS (transcript names): map<int, vector<string>> cc; //CREATE MAP THAT WILL HOLD ALL READ CLASSES AND COUNTS map<int, vector<string>> rcCountsLocal; vector<int>::size_type i=0; Vertex & vertex = G.graph()[i]; //cout<<"Vertex "<<i<<" has name: "<<vertex.vertexName<<endl; //Notes for each component (# of components == # of vertices but the component id changes only when we have a new component) for (i = 0; i < component.size(); i++){ //cout<<"Component iterations: "<<i<<endl; Vertex & vertex = G.graph()[i]; //cout<<"i="<<i<<" Element from component: "<<component[i]<<" is "<<vertex.vertexName<<endl; if(vertex.vertexName[0] != '[')//if the first character is not [ cc[component[i]].push_back(vertex.vertexName); else { //if the first character it is a "[" then this a read class rcCountsLocal[component[i]].push_back(vertex.vertexName); } }//end: for each component #if DEBUG cout<<"Print component map:"<<endl; for(const auto &cmp : cc) { cout<<"component "<<cmp.first<<" - size: "<<cmp.second.size()<<" containts: "<<endl; for(const auto &elem : cmp.second) cout<<"\t\t"<<elem<<endl; } #endif //Now print to file //cout<<"\nWrite observed transcripts names\n\t"<<endl; ofstream d_stream_new2; ofstream resultStream; string resultsFile="../singleTrGenes.txt"; resultStream.open(resultsFile.c_str(),ios::app);//open file in append mode (in order to avoid overwritting) if(!resultStream){ cout<<"Unable to open " <<resultsFile<<endl; exit(1); } //For each component for(const auto &cmp : cc) {//for each component there is a vector with transcript names //string glblCmpId=dirPrefix+"_"+cmp.first; //global component id subCmpId++; //Check the size of the component and if it is 1 then just write the results to file if( cmp.second.size()< 2 ) { resultStream<<subCmpId<<"\t"<<cmp.second.size()<<"\t"<<obsReadsClasses[cmp.second]; resultStream<<"\t"<<cmp.second.front()<<"\t1"<<endl; //cmpID---#tr.---#ObsReads---Transcript names continue; //continue to the next component } //~~~~~~~ //8/28/2014 if(cmp.second.size() > cmpSizeThresh) { //Prepare directory prefix //stringstream ss; //ss << cmp.first; //dirPrefix+="_"+ss.str(); //increment read count threshold (required for the next function call) readCountThresh*=2; //otherwise we'll get the same read classes (no new colapse will happen) //Recursive Function //cout<<"splitCmp: Call the recursive function"<<endl; splitCmp(cmp.first, cmp.second, rcCountsLocal, rcID_reverse, obsReadsClasses, rcID, gtf, fa); continue; } //~~~~~~~ //Else write to files write2Files(subCmpId, cmp.second, gtf, fa); d_stream_new2.clear(); //reuse the same stream (just clear the state flags) stringstream outFile; //outFile.str(""); //clear the stringstream //outFile<<"./"<<dirPrefix<<"_"<<cmp.first<<"/obsRCcounts.txt"; //directory already exists from previous outFile<<"./"<<subCmpId<<"/obsRCcounts.txt"; string d_file_new2=outFile.str(); d_stream_new2.open(d_file_new2.c_str()); if(!d_stream_new2){ cout<<"Unable to open" <<d_file_new2<<endl; exit(1); } //for each read class ID for (const auto &class_ID : rcCountsLocal[cmp.first]){ d_stream_new2<<"[\t"; //for each transcript in this read class rcID_reverse[class_ID]-->print the class and the counts for (const auto &tr : rcID_reverse[class_ID]) { d_stream_new2<<tr<<"\t"; } d_stream_new2<<"]"<<"\t"<<obsReadsClasses[rcID_reverse[class_ID]]<<endl; }//end: for each read class ID d_stream_new2.close(); //Write to file the number of transcripts in each readClass d_stream_new2.clear(); //reuse the same stream (just clear the state flags) outFile.str(""); //clear the stringstream //outFile<<"./"<<dirPrefix<<"_"<<cmp.first<<"/obsRCsize.txt"; //directory already exists from previous outFile<<"./"<<subCmpId<<"/obsRCsize.txt"; d_file_new2=outFile.str(); d_stream_new2.open(d_file_new2.c_str()); if(!d_stream_new2){ cout<<"Unable to open" <<d_file_new2<<endl; exit(1); } //Matrix for size of read class vector<vector<double>> rcSize; //initialize matrix rcSize with zero for(int i=0;i<rcCountsLocal[cmp.first].size();i++) { vector<double> row; // Create an empty row for(int j=0;j<rcCountsLocal[cmp.first].size();j++) { row.push_back(0.0); // Add an element (column) to the row } rcSize.push_back(row); // Add the row to the main vector } /*cout<<"print(rcSize)"<<endl; for(int i=0;i<rcSize.size();i++) { for(int j=0;j<rcSize.size();j++) { cout<<rcSize[i][j]; } cout<<endl; }*/ //fill the diagonal with read classes size. //for each read class ID int i=0; for (const auto &class_ID : rcCountsLocal[cmp.first]) { rcSize[i][i]=(double)1/rcID_reverse[class_ID].size(); i++; } //print Matrix to file for(int i=0;i<rcCountsLocal[cmp.first].size();i++) { for(int j=0;j<rcCountsLocal[cmp.first].size();j++) { d_stream_new2<<rcSize[i][j]<<"\t"; } d_stream_new2<<endl; } d_stream_new2.close(); }//end:for each cc resultStream.close(); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ }//end splitCmp
int main(int argc,char *argv[]){ //cout<<"\nRunning "<<argv[0]<<endl; if(argc<4){ cout<<HELPMESSAGE<<endl; exit(1); } double EPS = 0.01; int iterations=0; string obsRBowtie_file=argv[1]; //aligned Observed reads string gtf_file=argv[2]; string fa_file=argv[3]; clock_t begin = clock(); //used for measuring entire elapsed time for this function //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"["<<current_time()<<"] Load GTF "<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; map<string, map<string, int>> gtf; loadGTF(gtf_file, gtf); //Write to file transcript lengths //This file is used to replace the use of grep for computing transcript lenght (grep is slow) ofstream outGlobalTrLength; string globalTrLengthFile="../trLen.txt"; outGlobalTrLength.open(globalTrLengthFile.c_str()); if(!outGlobalTrLength){ cout<<"Unable to open " <<globalTrLengthFile<<endl; exit(1); } for (const auto &tr : gtf){ outGlobalTrLength<<tr.first<<"\t"<<tr.second.begin()->second<<endl; } outGlobalTrLength.close(); cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"["<<current_time()<<"] ~~~~~ Done! ~~~~~"<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl<<endl; cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"["<<current_time()<<"] Load FA File"<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; map<string, string> fa; loadFA(fa_file, fa); cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"["<<current_time()<<"] ~~~~~ Done! ~~~~~"<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl<<endl; cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"["<<current_time()<<"] Compute Observed Reads Classes "<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; //Extract OBS reads classes from Bowtie output map<vector<string>, int> obsReadsClasses; cout<<"\nMain: Parsing Bowtie file: "<<endl; extract_obsRC(obsRBowtie_file, obsReadsClasses); //~~~~~~~~~~~~~~~~~~~~~~~ //08/26/2014~~~~~~~~~~~~~ //delete classes with count 10 //for each read class //for (const auto &r_class : obsReadsClasses){ // if(r_class.second<10) // obsReadsClasses.erase(r_class.first); //} //~~~~~~~~~~~~~~~~~~~~~~~~~~~ //Create an ID for each Read Class --- this is just a current solution --- needs improvement --- not smart because will double the memory map<vector<string>, string> rcID; map<string, vector<string>> rcID_reverse; //in this way we have a bidirectional map (which is stupid anyway...but what to do?) int total_obs_reads=0; int id=0; //for each readClass in obsReadsClasses add an id and Compute Total number of Observed reads for (const auto &r_class : obsReadsClasses){ stringstream cID_ss; cID_ss<<"["<<id<<"]";//these square brackets were inserted in order to avoid modifying the code down when it checks for "[" rcID[r_class.first]=cID_ss.str(); rcID_reverse[cID_ss.str()]=r_class.first; id++; total_obs_reads+=r_class.second; } cout<<"\n~~~ Total number of Observed reads = "<<total_obs_reads<<endl; cout<<"Total number of Observed Reads classes is: "<<obsReadsClasses.size()<<endl; #if DEBUG cout<<"\nOBS Reads Classes \t Size:"<<endl; //print(obsReadsClasses); cout<<endl; //exit(7); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //This might be only for debugging //07/30/2014: Print all observed classes to file ofstream allRCout; // all observed read classes string allRCFile="../allObsReadClasses.txt"; allRCout.open(allRCFile.c_str(),ios::app);//open file in append mode (in order to avoid overwritting) if(!allRCout){ cout<<"Unable to open " <<allRCFile<<endl; exit(1); } for (const auto &read_class: obsReadsClasses){ //for each read class allRCout<<"[ "; for (const auto &tr : read_class.first)//for each transcript in this class allRCout<<tr<<" "; allRCout<<"] "<<read_class.second<<endl; } allRCout.close(); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #endif //cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; //cout<<"["<<current_time()<<"] ~~~~~ Done! ~~~~~"<<endl; //cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl<<endl; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cout<<"\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"["<<current_time()<<"] Computing adjacency matrix between reads (classes) and transcripts... "<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; //We'll use d_value (p_value) to hold the adjacency matrix map< string, map< vector<string>, double> > p_value_new2; //this is d_value //for each OBS Read class for (const auto &read_class: obsReadsClasses){ //for each transcript in the current class for (const auto &tr: read_class.first){ //if this value exists --> then we have an error if(p_value_new2[tr][read_class.first]){ cout<<"Error: p value new2 already exists!!!"<<endl; exit(1); } p_value_new2[tr][read_class.first]=read_class.second;//enter the counts for each class } } #if DEBUG cout<<"\nPrint p_value_new2"<<endl; //print(p_value_new2); //exit(7); #endif //cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; //cout<<"["<<current_time()<<"] ~~~~~ Done: adjacency matrix has been computed"<<endl; //cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl<<endl; /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ cout<<"\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"["<<current_time()<<"] Computing Connected Components ... "<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; //Both DFS and BFS complete on O(n) time, using O(n) memory, where n is matrix size. But BFS it doesn't suffer from stack overflow problem, and it doesn't spend time on recursive calls. //Anyway, let's first use boost library //*********************************************For debug only*******************************////////// //08/24/2014: Print all components and edges to file /* ofstream allCout; // all components string allCFile="../allComponents.txt"; allCout.open(allCFile.c_str(),ios::app);//open file in append mode (in order to avoid overwritting) if(!allCout){ cout<<"Unable to open " <<allCFile<<endl; exit(1); } */ //*****************************************//////////////////////////////////////////////////////////// struct Vertex { std::string vertexName; }; //typedef adjacency_list <vecS, vecS, undirectedS, Vertex> MyGraph; typedef boost::labeled_graph<adjacency_list<vecS, vecS, undirectedS, Vertex>, std::string> MyGraph; //The side effect of this is that one need to use graph() member function to make some algorithms work: typedef boost::graph_traits<MyGraph>::vertex_descriptor VertexID; MyGraph G; //for each transcript -- for (const auto &tr : p_value_new2){ // Create vertices in that graph VertexID u = boost::add_vertex(tr.first,G); G[tr.first].vertexName = tr.first; //for each class in the current transcript for (const auto &read_class : tr.second ){ //string rClassName = "["; //add [ in order to differentiate the classes that contains only one transcripts from the transcripts itself string rClassName=rcID[read_class.first]; //get just the read class ID //---Lines below have been replaced by line above //for each transcript in this class -- create a string with the name of the class //for (const auto &tr_class : read_class.first) // rClassName+=tr_class; //cout<<"Class name is: "<<rClassName<<endl; VertexID v = boost::add_vertex(rClassName,G); G[rClassName].vertexName = rClassName; //in case vertex already exists then this should overwrite add_edge(u, v, G); //allCout<<"[Print 1] Edge btwn vertex "<<tr.first<<" and vertex "<<rClassName<<endl; } } std::vector<int> component(num_vertices(G)); int num = connected_components(G, &component[0]); cout << "Total number of components: " << num << endl; /* std::vector<int>::size_type j; for (j = 0; j != component.size(); ++j) allCout << "Vertex " << j <<" is in component " << component[j]<<endl; allCout << endl; cout<<"\nPrint components: "<<endl; for (const auto &c : component) { cout<<"Component: "<<c<<endl; } */ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //08/22/2014: Print all components to file /* MyGraph::vertex_iterator vertexIt, vertexEnd; boost::tie(vertexIt, vertexEnd) = vertices(G); for (; vertexIt != vertexEnd; ++vertexIt){ VertexID vertexID = *vertexIt; // dereference vertexIt, get the ID Vertex & vertex = G.graph()[vertexID]; //The side effect of boost::labeled_graph is that one need to use graph() member function to make some algorithms work allCout<<"[Print 2] Vertex name is : "<<vertex.vertexName<<endl; } allCout.close(); */ //cout<<"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl; //cout<<"["<<current_time()<<"] ~~~~~ Done: Connected Components had been computed"<<endl; //cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl<<endl; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //~~~~~ Write Values to Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cout<<endl<<"~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"Prepare Values for SimReg"<<endl; cout<<"~~~~~~~~~~~~~~~~~~~~~~"<<endl; cout<<"Please wait, this may take few minutes"<<endl; cout<<"writing to files . . ."<<endl; /********************************* *********************************/ /* //Write OBS Read Classes Names and Frequencies to File string read_classes_file2="obsCC.txt"; cout<<"\nWrite read classes to: \n\t"<<read_classes_file2<<endl; ofstream read_classes_stream; read_classes_stream.open(read_classes_file2.c_str()); if(!read_classes_stream){ cout<<"Unable to open" <<read_classes_file2<<endl; exit(1); } //for each read class for (const auto &read_class : obsReadsClasses) { //print to file the class name read_classes_stream<<"[ "; for (const auto &cluster : read_class.first){ read_classes_stream<<cluster<<" "; } read_classes_stream<<"]\t"<<new_o[read_class.first]<<endl; } read_classes_stream.close(); /********************************************************************************* **********************************************************************************/ /**************************** * Write transcripts from each component to file * *****************************/ //go thorugh components //cout<<"component size is: "<<component.size()<<endl; //CREATE MAP THAT WILL HOLD ALL THE COMPONENTS (transcript names): map<int, vector<string>> cc; //CREATE MAP THAT WILL HOLD ALL READ CLASSES AND COUNTS // ### 9/2/2014: Not sure about the counts? The integer looks like the component ID? map<int, vector<string>> rcCounts; vector<int>::size_type i=0; Vertex & vertex = G.graph()[i]; //cout<<"Vertex "<<i<<" has name: "<<vertex.vertexName<<endl; //Notes for each component (# of components == # of vertices but the component id changes only when we have a new component) for (i = 0; i < component.size(); i++){ //cout<<"Component iterations: "<<i<<endl; Vertex & vertex = G.graph()[i]; //cout<<"i="<<i<<" Element from component: "<<component[i]<<" is "<<vertex.vertexName<<endl; if(vertex.vertexName[0] != '[')//if the first character is not [ cc[component[i]].push_back(vertex.vertexName); else { //if the first character it is a "[" then this a read class rcCounts[component[i]].push_back(vertex.vertexName); } }//end: for each component #if DEBUG cout<<"Print component map:"<<endl; for(const auto &cmp : cc) { cout<<"component "<<cmp.first<<" - size: "<<cmp.second.size()<<" containts: "<<endl; for(const auto &elem : cmp.second) cout<<"\t\t"<<elem<<endl; } #endif //Now print to file cout<<"\nWrite observed transcripts names\n\t"<<endl; ofstream resultStream; string resultsFile="../singleTrGenes.txt"; resultStream.open(resultsFile.c_str(),ios::app);//open file in append mode (in order to avoid overwritting) if(!resultStream){ cout<<"Unable to open " <<resultsFile<<endl; exit(1); } //For each component for(const auto &cmp : cc) {//for each component there is a vector with transcript names //Check the size of the component and if it is 1 then just write the results to file if( cmp.second.size()< 2 ) { resultStream<<cmp.first<<"\t"<<cmp.second.size()<<"\t"<<obsReadsClasses[cmp.second]; resultStream<<"\t"<<cmp.second.front()<<"\t1"<<endl; //cmpID---#tr.---#ObsReads---Transcript names continue; //continue to the next component } //~~~~~~~ //8/28/2014 if(cmp.second.size() > cmpSizeThresh) { //Prepare the prefix for sub-components //stringstream ss; //ss << cmp.first; //string dirPrefix=ss.str(); //Recursive Function cout<<"Call the recursive function"<<endl; //Reset the thereshold value everytime this function is called readCountThresh=1; splitCmp(cmp.first, cmp.second, rcCounts, rcID_reverse, obsReadsClasses, rcID, gtf, fa); continue; } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //######################################################################### write2Files(cmp.first, cmp.second, gtf, fa); //######################################################################### //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ofstream d_stream_new2; //d_stream_new2.clear(); //reuse the same stream (just clear the state flags) stringstream outFile; //outFile.str(""); //clear the stringstream outFile<<"./"<<cmp.first<<"/obsRCcounts.txt"; //directory already exists from previous string d_file_new2=outFile.str(); d_stream_new2.open(d_file_new2.c_str()); if(!d_stream_new2){ cout<<"Unable to open" <<d_file_new2<<endl; exit(1); } //for each read class ID for (const auto &class_ID : rcCounts[cmp.first]){ d_stream_new2<<"[\t"; //for each transcript in this read class rcID_reverse[class_ID]-->print the class and the counts for (const auto &tr : rcID_reverse[class_ID]) { d_stream_new2<<tr<<"\t"; } d_stream_new2<<"]"<<"\t"<<obsReadsClasses[rcID_reverse[class_ID]]<<endl; }//end: for each read class ID d_stream_new2.close(); //Write to file the number of transcripts in each readClass d_stream_new2.clear(); //reuse the same stream (just clear the state flags) outFile.str(""); //clear the stringstream outFile<<"./"<<cmp.first<<"/obsRCsize.txt"; //directory already exists from previous d_file_new2=outFile.str(); d_stream_new2.open(d_file_new2.c_str()); if(!d_stream_new2){ cout<<"Unable to open" <<d_file_new2<<endl; exit(1); } //Matrix for size of read class vector<vector<double>> rcSize; //initialize matrix rcSize with zero for(int i=0;i<rcCounts[cmp.first].size();i++) { vector<double> row; // Create an empty row for(int j=0;j<rcCounts[cmp.first].size();j++) { row.push_back(0.0); // Add an element (column) to the row } rcSize.push_back(row); // Add the row to the main vector } /* cout<<"print(rcSize)"<<endl; for(int i=0;i<rcSize.size();i++) { for(int j=0;j<rcSize.size();j++) { cout<<rcSize[i][j]; } cout<<endl; } exit(7);*/ //fill the diagonal with read classes size. //for each read class ID int i=0; for (const auto &class_ID : rcCounts[cmp.first]) { rcSize[i][i]=(double)1/rcID_reverse[class_ID].size(); i++; } //print Matrix to file for(int i=0;i<rcCounts[cmp.first].size();i++) { for(int j=0;j<rcCounts[cmp.first].size();j++) { d_stream_new2<<rcSize[i][j]<<"\t"; } d_stream_new2<<endl; } d_stream_new2.close(); }//end:for each cc resultStream.close(); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ clock_t end = clock(); double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; cout<<"\nDone - Elapsed time: "<<elapsed_secs<<endl; }//end main