void SeedingData::writeCheckpoints(){ /* write the Seeds checkpoint */ if(m_parameters->writeCheckpoints() && !m_parameters->hasCheckpoint("SimpleSeeds")){ ofstream f(m_parameters->getCheckpointFile("SimpleSeeds").c_str()); ostringstream buffer; cout<<"Rank "<<m_parameters->getRank()<<" is writing checkpoint SimpleSeeds"<<endl; vector<GraphPath> * seeds = & m_SEEDING_seeds; int count=(*seeds).size(); buffer.write((char*)&count, sizeof(int)); for(int i=0;i<(int)(*seeds).size();i++){ int length=(*seeds)[i].size(); buffer.write((char*)&length, sizeof(int)); for(int j=0;j<(int)(*seeds)[i].size();j++){ Kmer theKmer; (*seeds)[i].at(j,&theKmer); theKmer.write(&buffer); CoverageDepth coverageValue=0; coverageValue=(*seeds)[i].getCoverageAt(j); buffer.write((char*)&coverageValue, sizeof(CoverageDepth)); flushFileOperationBuffer(false, &buffer, &f, CONFIG_FILE_IO_BUFFER_SIZE); } } flushFileOperationBuffer(true, &buffer, &f, CONFIG_FILE_IO_BUFFER_SIZE); f.close(); } }
void GenomeNeighbourhood::processFinalList(){ /* we have a list of pairs * there are duplicates * and the list is still unfiltered. * * the first step is to select the best entry for * any ordered pair where (a,b) and (b,a) are * different */ /* now, select one entry for each */ /* now, select one entry for each */ /* cases: * * VALID 1. * * -----------> ------------> 2. * * -----------> <------------ 3. * * <------------ -------------> 4. * * <------------ <------------- all other cases are invalid. * */ ostringstream relations; relations<<m_parameters->getPrefix()<<"/NeighbourhoodRelations.txt"; string file=relations.str(); ofstream f(file.c_str()); ostringstream operationBuffer; operationBuffer<<"#LeftContigPath LengthInKmers DNAStrand PositionOnStrand"; operationBuffer<<" RightContigPath LengthInKmers DNAStrand PositionOnStrand"; operationBuffer<<" DistanceInKmers QualityControlStatus"<<endl; for(int i=0;i<(int)m_finalList.size();i++){ PathHandle contig1=m_finalList[i].getContig1(); PathHandle contig2=m_finalList[i].getContig2(); int length1=m_contigLengths->operator[](contig1); int length2=m_contigLengths->operator[](contig2); Strand strand1=m_finalList[i].getStrand1(); Strand strand2=m_finalList[i].getStrand2(); int progression1=m_finalList[i].getProgression1(); int progression2=m_finalList[i].getProgression2(); int depth=m_finalList[i].getDepth(); bool valid=true; int windows=(0x00000001 << 0x00000002); int width1=length1/windows; int width2=length2/windows; /*The pair is considered valid unless the similarity is after the first 1/4 and before the last 3/4*/ if((progression1>width1 && progression1<(length1-width1-1)) || (progression2>width2 && progression2<(length2-width2-1))){ valid=false; } operationBuffer<<"contig-"<<contig1<<" "<<length1<<" "<<strand1<<" "<<progression1<<""; operationBuffer<<" contig-"<<contig2<<" "<<length2<<" "<<strand2<<" "<<progression2<<""; operationBuffer<<" "<<depth<<" "; if(valid){ operationBuffer<<"PASS"; }else{ 
operationBuffer<<"FAIL"; } operationBuffer<<endl; flushFileOperationBuffer(false,&operationBuffer,&f,CONFIG_FILE_IO_BUFFER_SIZE); } flushFileOperationBuffer(true,&operationBuffer,&f,CONFIG_FILE_IO_BUFFER_SIZE); f.close(); }
void GeneOntology::writeOntologyFiles(){ ostringstream theFile; theFile<<m_parameters->getPrefix()<<"/BiologicalAbundances/_GeneOntology"; string directory=theFile.str(); createDirectory(directory.c_str()); theFile<<"/Terms"; ostringstream xmlFileStream; xmlFileStream<<theFile.str()<<".xml"; ostringstream tsvFileStream; tsvFileStream<<theFile.str()<<".tsv"; string xmlFile=xmlFileStream.str(); string tsvFile=tsvFileStream.str(); if(!m_gotGeneOntologyParameter){ return; } map<GeneOntologyIdentifier,int> modeCoverages; map<GeneOntologyIdentifier,double> meanCoverages; map<GeneOntologyIdentifier,double> estimatedProportions; ofstream xmlStream(xmlFile.c_str()); ostringstream operationBuffer; //------------- cout<<"TOTAL: "<<m_kmerObservationsWithGeneOntologies<<endl; operationBuffer<<"<?xml version=\"1.0\" encoding=\"UTF-8\"?>"<<endl; operationBuffer<<"<root>"<<endl; LargeCount totalForTheGraph=m_searcher->getTotalNumberOfColoredKmerObservationsForANameSpace(COLOR_NAMESPACE_EMBL_CDS); operationBuffer<<"<totalColoredKmerObservations>"; operationBuffer<<totalForTheGraph<<"</totalColoredKmerObservations>"<<endl; // declare tsv files map<string,FILE*> tsvFiles; map<string,ostringstream*> tsvBuffers; for(map<GeneOntologyIdentifier,map<CoverageDepth,int> >::iterator i= m_ontologyTermFrequencies.begin();i!=m_ontologyTermFrequencies.end();i++){ GeneOntologyIdentifier handle=i->first; int mode=0; int modeCount=0; int total=0; LargeCount totalObservations=0; for(map<CoverageDepth,int>::iterator j=i->second.begin();j!=i->second.end();j++){ CoverageDepth coverage=j->first; int frequency=j->second; if(frequency>modeCount){ mode=coverage; modeCount=frequency; } total+=frequency; totalObservations+=coverage*frequency; } double mean=totalObservations; if(total!=0){ mean/=total; } #ifdef ASSERT assert(modeCoverages.count(handle)==0); assert(meanCoverages.count(handle)==0); #endif /**/ modeCoverages[handle]=mode; meanCoverages[handle]=mean; /**/ GeneOntologyDomain 
domain=getDomain(handle); string domainName=getDomainName(domain); operationBuffer<<"<geneOntologyTerm>"<<endl; operationBuffer<<"<identifier>"; operationBuffer<<getGeneOntologyIdentifier(handle)<<"</identifier><name>"; operationBuffer<<getGeneOntologyName(handle)<<"</name>"<<endl; operationBuffer<<"<domain>"<<domainName<<"</domain>"<<endl; /* print paths to root */ printPathsFromRoot(handle,&operationBuffer); operationBuffer<<"<modeKmerCoverage>"<<mode<<"</modeKmerCoverage>"; operationBuffer<<"<meanKmerCoverage>"<<mean<<"</meanKmerCoverage>"<<endl; operationBuffer<<"<totalColoredKmerObservations>"<<totalObservations<<"</totalColoredKmerObservations>"<<endl; double estimatedProportion=(0.0+totalObservations); if(totalForTheGraph!=0){ estimatedProportion/=totalForTheGraph; } estimatedProportions[handle]=estimatedProportion; m_termCounts[handle]=totalObservations; operationBuffer<<"<proportion>"<<estimatedProportion<<"</proportion>"<<endl; operationBuffer<<"<distribution>"<<endl; operationBuffer<<"#Coverage Frequency"<<endl; for(map<CoverageDepth,int>::iterator j=i->second.begin();j!=i->second.end();j++){ CoverageDepth coverage=j->first; int frequency=j->second; operationBuffer<<coverage<<" "<<frequency<<endl; } operationBuffer<<"</distribution></geneOntologyTerm>"<<endl; flushFileOperationBuffer(false,&operationBuffer,&xmlStream,CONFIG_FILE_IO_BUFFER_SIZE); // also output beautiful tsv file too if(tsvFiles.count(domainName)==0){ ostringstream theFile; theFile<<m_parameters->getPrefix()<<"/BiologicalAbundances/"; theFile<<"0.Profile.GeneOntologyDomain="<<domainName<<".tsv"; string tsvFile=theFile.str(); tsvFiles[domainName]=fopen(tsvFile.c_str(),"a"); tsvBuffers[domainName]=new ostringstream(); *(tsvBuffers[domainName])<<"#TermIdentifier TermName TermDomain TermProportion"<<endl; } *(tsvBuffers[domainName])<<getGeneOntologyIdentifier(handle)<<" "; *(tsvBuffers[domainName])<<getGeneOntologyName(handle)<<" "; *(tsvBuffers[domainName])<<domainName; 
*(tsvBuffers[domainName])<<" "<<estimatedProportion<<endl; } operationBuffer<<"</root>"<<endl; flushFileOperationBuffer(true,&operationBuffer,&xmlStream,CONFIG_FILE_IO_BUFFER_SIZE); xmlStream.close(); ofstream tsvStream(tsvFile.c_str()); operationBuffer<<"#Identifier Name Mode k-mer coverage Mean k-mer coverage Proportion"<<endl; // close tsv files for(map<string,FILE*>::iterator i=tsvFiles.begin();i!=tsvFiles.end();i++){ string category=i->first; FILE*file=i->second; string text=tsvBuffers[category]->str(); fprintf(file,"%s",text.c_str()); delete tsvBuffers[category]; fclose(file); } tsvBuffers.clear(); tsvFiles.clear(); for(map<GeneOntologyIdentifier,map<CoverageDepth,int> >::iterator i= m_ontologyTermFrequencies.begin();i!=m_ontologyTermFrequencies.end();i++){ GeneOntologyIdentifier handle=i->first; operationBuffer<<getGeneOntologyIdentifier(handle)<<" "; operationBuffer<<getGeneOntologyName(handle)<<" "; operationBuffer<<modeCoverages[handle]<<" "; operationBuffer<<meanCoverages[handle]<<" "; operationBuffer<<estimatedProportions[handle]<<endl; flushFileOperationBuffer(false,&operationBuffer,&tsvStream,CONFIG_FILE_IO_BUFFER_SIZE); } flushFileOperationBuffer(true,&operationBuffer,&tsvStream,CONFIG_FILE_IO_BUFFER_SIZE); tsvStream.close(); }
/**
 * Write one abundance profile per depth level for a gene ontology domain.
 *
 * For each depth in [0, getDomainDepth(domain)), the terms of
 * m_recursiveCounts that have a non-zero count, a known depth equal to the
 * current depth, and the requested domain are written to
 * <prefix>/BiologicalAbundances/_GeneOntology/<domainName>.Depth=<depth>.tsv
 *
 * A file is created only when at least one term qualifies for that depth.
 *
 * @param domain the gene ontology domain to report
 */
void GeneOntology::writeOntologyProfile(GeneOntologyDomain domain){

	int depthLimit=getDomainDepth(domain);

	cout<<"[GeneOntology] maximum depth for GeneOntologyDomain "<<domain<<" is "<<depthLimit<<endl;

	/* translate the domain into its textual name; "NULL" means unknown */
	string domainLabel="NULL";

	if(domain==GENE_ONTOLOGY_DOMAIN_biological_process){
		domainLabel=GENE_ONTOLOGY_DOMAIN_biological_process_STRING;
	}else if(domain==GENE_ONTOLOGY_DOMAIN_cellular_component){
		domainLabel=GENE_ONTOLOGY_DOMAIN_cellular_component_STRING;
	}else if(domain==GENE_ONTOLOGY_DOMAIN_molecular_function){
		domainLabel=GENE_ONTOLOGY_DOMAIN_molecular_function_STRING;
	}

	#ifdef ASSERT
	assert(domainLabel!="NULL");
	#endif

	LargeCount graphTotal=m_searcher->getTotalNumberOfColoredKmerObservationsForANameSpace(COLOR_NAMESPACE_EMBL_CDS);

	for(int level=0;level<depthLimit;level++){

		ostringstream pending;

		ostringstream pathBuilder;
		pathBuilder<<m_parameters->getPrefix()<<"/BiologicalAbundances/_GeneOntology";
		pathBuilder<<"/"<<domainLabel<<".Depth="<<level<<".tsv";
		string profilePath=pathBuilder.str();

		/* opened lazily, on the first qualifying term */
		ofstream profile;

		map<GeneOntologyIdentifier,int>::iterator term=m_recursiveCounts.begin();

		for(;term!=m_recursiveCounts.end();term++){

			GeneOntologyIdentifier handle=term->first;
			int observations=term->second;

			/* the checks are evaluated in this order and stop at the first hit */
			bool rejected=observations==0
				|| !hasDepth(handle)
				|| getGeneOntologyDepth(handle)!=level
				|| getDomain(handle)!=domain;

			if(rejected){
				continue;
			}

			double share=observations;

			if(graphTotal!=0){
				share/=graphTotal;
			}

			if(!profile.is_open()){
				profile.open(profilePath.c_str());
				pending<<"#Identifier Name Proportion Observations Total"<<endl;
			}

			pending<<getGeneOntologyIdentifier(handle);
			pending<<" "<<getGeneOntologyName(handle)<<" ";
			pending<<share;
			pending<<" "<<observations<<" "<<graphTotal<<endl;

			flushFileOperationBuffer(false,&pending,&profile,CONFIG_FILE_IO_BUFFER_SIZE);
		}

		if(!profile.is_open()){
			continue;
		}

		flushFileOperationBuffer(true,&pending,&profile,CONFIG_FILE_IO_BUFFER_SIZE);
		profile.close();
	}
}
void PhylogenyViewer::showObservations_XML(ostream*stream){ ostringstream operationBuffer; /* build a mashup for the ranks * this will contain total at each level */ map<string,LargeCount> rankRecursiveObservations; map<string,LargeCount> rankSelfObservations; populateRanks(&rankSelfObservations,&rankRecursiveObservations); operationBuffer<<"<?xml version=\"1.0\" encoding=\"UTF-8\"?>"<<endl; operationBuffer<<"<root>"<<endl; /* add the sample name in the XML file */ operationBuffer<<"<sample>"; operationBuffer<<m_parameters->getSampleName(); operationBuffer<<"</sample>"<<endl; operationBuffer<<"<totalAssembledKmerObservations>"<<m_totalNumberOfKmerObservations<<"</totalAssembledKmerObservations>"<<endl; LargeCount totalColoredAssembledKmerObservations=m_totalNumberOfKmerObservations-m_unknown; operationBuffer<<"<totalColoredAssembledKmerObservations>"<<totalColoredAssembledKmerObservations<<"</totalColoredAssembledKmerObservations>"<<endl; operationBuffer<<"<ranks>"<<endl; for(map<string,LargeCount>::iterator i=rankSelfObservations.begin();i!=rankSelfObservations.end();i++){ string rank=i->first; #ifdef ASSERT assert(rankRecursiveObservations.count(rank)>0); assert(rankSelfObservations.count(rank)>0); #endif operationBuffer<<"<entry><rank>"<<rank<<"</rank><self><kmerObservations>"; operationBuffer<<rankSelfObservations[rank]<<"</kmerObservations></self>"; operationBuffer<<"<recursive><kmerObservations>"<<rankRecursiveObservations[rank]; operationBuffer<<"</kmerObservations></recursive></entry>"<<endl; } operationBuffer<<"</ranks>"<<endl; operationBuffer<<"<entry>"; operationBuffer<<"<taxon><identifier>unknown</identifier><name>unknown</name><rank>unknown</rank></taxon>"<<endl; operationBuffer<<"<path></path>"<<endl; operationBuffer<<"<self><kmerObservations>"<<m_unknown<<"</kmerObservations>"; double ratio=m_unknown; if(m_totalNumberOfKmerObservations!=0) ratio/=m_totalNumberOfKmerObservations; operationBuffer<<"<proportion>"<<ratio<<"</proportion>"; 
operationBuffer<<"<coloredProportion>0</coloredProportion>"; operationBuffer<<"<coloredProportionInRank>0</coloredProportionInRank></self></entry>"<<endl; // declare tsv files map<string,FILE*> tsvFiles; map<string,ostringstream*> tsvBuffers; for(map<TaxonIdentifier,string>::iterator i=m_taxonNames.begin(); i!=m_taxonNames.end();i++){ TaxonIdentifier taxon=i->first; LargeCount count=getSelfCount(taxon); string rank=getTaxonRank(taxon); #ifdef ASSERT assert(rankSelfObservations.count(rank)>0); assert(rankRecursiveObservations.count(rank)>0); #endif LargeCount rankRecursiveCount=rankRecursiveObservations[rank]; #ifdef ASSERT LargeCount rankSelfCount=rankSelfObservations[rank]; //- assert(rankSelfCount>=0); assert(rankRecursiveCount>=0); #endif LargeCount recursiveCount=getRecursiveCount(taxon); if(recursiveCount==0){ continue; } operationBuffer<<"<entry>"<<endl; printTaxon_XML(taxon,&operationBuffer); vector<TaxonIdentifier> path; getTaxonPathFromRoot(taxon,&path); printTaxonPath_XML(taxon,&path,&operationBuffer); operationBuffer<<"<self>"<<endl; operationBuffer<<"<kmerObservations>"<<count<<"</kmerObservations>"; double ratio=count; if(m_totalNumberOfKmerObservations!=0) ratio/=m_totalNumberOfKmerObservations; operationBuffer<<"<proportion>"<<ratio<<"</proportion>"; double coloredRatio=count; if(totalColoredAssembledKmerObservations!=0){ coloredRatio/=totalColoredAssembledKmerObservations; } operationBuffer<<"<coloredProportion>"<<coloredRatio<<"</coloredProportion>"; operationBuffer<<"</self>"<<endl; operationBuffer<<"<recursive>"; operationBuffer<<"<kmerObservations>"; operationBuffer<<recursiveCount; operationBuffer<<"</kmerObservations>"<<endl; double ratio2=recursiveCount; if(m_totalNumberOfKmerObservations!=0) ratio2/=m_totalNumberOfKmerObservations; operationBuffer<<"<proportion>"<<ratio2<<"</proportion>"; double coloredRatio2=recursiveCount; if(totalColoredAssembledKmerObservations!=0){ coloredRatio2/=totalColoredAssembledKmerObservations; } 
operationBuffer<<"<coloredProportion>"<<coloredRatio2<<"</coloredProportion>"; double coloredRatioInRank=recursiveCount; if(rankRecursiveCount!=0){ coloredRatioInRank/=rankRecursiveCount; } operationBuffer<<"<coloredProportionInRank>"<<coloredRatioInRank<<"</coloredProportionInRank>"; operationBuffer<<"</recursive>"<<endl; operationBuffer<<"</entry>"<<endl; flushFileOperationBuffer(false,&operationBuffer,stream,CONFIG_FILE_IO_BUFFER_SIZE); // add data to the tsv file if(tsvFiles.count(rank)==0){ ostringstream theFile; theFile<<m_parameters->getPrefix()<<"/BiologicalAbundances/"; theFile<<"0.Profile.TaxonomyRank="<<rank<<".tsv"; string tsvFile=theFile.str(); tsvFiles[rank]=fopen(tsvFile.c_str(),"a"); tsvBuffers[rank]=new ostringstream(); *(tsvBuffers[rank])<<"#TaxonIdentifier TaxonName TaxonRank TaxonProportion"<<endl; } string name=getTaxonName(taxon); *(tsvBuffers[rank])<<taxon<<" "<<name<<" "<<rank; *(tsvBuffers[rank])<<" "<<coloredRatioInRank<<endl; } // close XML files operationBuffer<<"</root>"<<endl; flushFileOperationBuffer(true,&operationBuffer,stream,CONFIG_FILE_IO_BUFFER_SIZE); // close tsv files for(map<string,FILE*>::iterator i=tsvFiles.begin();i!=tsvFiles.end();i++){ string rank=i->first; FILE*file=i->second; string text=tsvBuffers[rank]->str(); fprintf(file,"%s",text.c_str()); delete tsvBuffers[rank]; fclose(file); } tsvBuffers.clear(); tsvFiles.clear(); }