Beispiel #1
0
void SeedingData::writeCheckpoints(){

	/* write the Seeds checkpoint */
	if(m_parameters->writeCheckpoints() && !m_parameters->hasCheckpoint("SimpleSeeds")){

		ofstream f(m_parameters->getCheckpointFile("SimpleSeeds").c_str());
		ostringstream buffer;

		cout<<"Rank "<<m_parameters->getRank()<<" is writing checkpoint SimpleSeeds"<<endl;

		vector<GraphPath> * seeds = & m_SEEDING_seeds;

		int count=(*seeds).size();

		buffer.write((char*)&count, sizeof(int));

		for(int i=0;i<(int)(*seeds).size();i++){
			int length=(*seeds)[i].size();
			buffer.write((char*)&length, sizeof(int));

			for(int j=0;j<(int)(*seeds)[i].size();j++){
				Kmer theKmer;
				(*seeds)[i].at(j,&theKmer);
				theKmer.write(&buffer);

				CoverageDepth coverageValue=0;
				coverageValue=(*seeds)[i].getCoverageAt(j);
				buffer.write((char*)&coverageValue, sizeof(CoverageDepth));
				flushFileOperationBuffer(false, &buffer, &f, CONFIG_FILE_IO_BUFFER_SIZE);
			}
		}
                flushFileOperationBuffer(true, &buffer, &f, CONFIG_FILE_IO_BUFFER_SIZE);
		f.close();
	}
}
Beispiel #2
0
void GenomeNeighbourhood::processFinalList(){
/* we have a list of pairs
 * there are duplicates
 * and the list is still unfiltered.
 *
 * the first step is to select the best entry for
 * any ordered pair where (a,b) and (b,a) are
 * different
 */

/* now, select one entry for each */


/* now, select one entry for each */

/* cases:
 *
 *

VALID

1.
            *            *
 ----------->            ------------>

2.
            *            *
 ----------->            <------------

3.
            *            *
<------------            ------------->

4.
            *            *
<------------            <-------------

all other cases are invalid.

 *
 */

	ostringstream relations;

	relations<<m_parameters->getPrefix()<<"/NeighbourhoodRelations.txt";

	string file=relations.str();

	ofstream f(file.c_str());
	
	ostringstream operationBuffer;

	operationBuffer<<"#LeftContigPath	LengthInKmers	DNAStrand	PositionOnStrand";
	operationBuffer<<"	RightContigPath	LengthInKmers	DNAStrand	PositionOnStrand";
	operationBuffer<<"	DistanceInKmers	QualityControlStatus"<<endl;

	for(int i=0;i<(int)m_finalList.size();i++){
		PathHandle contig1=m_finalList[i].getContig1();
		PathHandle contig2=m_finalList[i].getContig2();
		int length1=m_contigLengths->operator[](contig1);
		int length2=m_contigLengths->operator[](contig2);
		Strand strand1=m_finalList[i].getStrand1();
		Strand strand2=m_finalList[i].getStrand2();
		int progression1=m_finalList[i].getProgression1();
		int progression2=m_finalList[i].getProgression2();

		int depth=m_finalList[i].getDepth();

		bool valid=true;

		int windows=(0x00000001 << 0x00000002);

		int width1=length1/windows;
		int width2=length2/windows;

		/*The pair is considered valid unless the similarity is after the first 1/4 and before the last 3/4*/

		if((progression1>width1 && progression1<(length1-width1-1)) || (progression2>width2 && progression2<(length2-width2-1))){
			valid=false;
		}


		operationBuffer<<"contig-"<<contig1<<"	"<<length1<<"	"<<strand1<<"	"<<progression1<<"";
		operationBuffer<<"	contig-"<<contig2<<"	"<<length2<<"	"<<strand2<<"	"<<progression2<<"";
		operationBuffer<<"	"<<depth<<"	";

		if(valid){
			operationBuffer<<"PASS";
		}else{
			operationBuffer<<"FAIL";
		}
		operationBuffer<<endl;

		flushFileOperationBuffer(false,&operationBuffer,&f,CONFIG_FILE_IO_BUFFER_SIZE);
	}

	flushFileOperationBuffer(true,&operationBuffer,&f,CONFIG_FILE_IO_BUFFER_SIZE);

	f.close();
}
Beispiel #3
0
/**
 * Write the gene ontology term reports.
 *
 * The directory <prefix>/BiologicalAbundances/_GeneOntology is always
 * created. If m_gotGeneOntologyParameter is false the method then returns
 * without writing any file.
 *
 * Otherwise, for every entry of m_ontologyTermFrequencies (a coverage ->
 * frequency distribution per term) it computes the mode coverage, the mean
 * coverage, the total number of k-mer observations and the proportion of
 * that total relative to the colored k-mer observations of the EMBL_CDS
 * namespace, and writes:
 *
 *  - _GeneOntology/Terms.xml : one <geneOntologyTerm> element per term,
 *    including its full coverage distribution;
 *  - _GeneOntology/Terms.tsv : identifier, name, mode, mean, proportion;
 *  - BiologicalAbundances/0.Profile.GeneOntologyDomain=<domain>.tsv :
 *    one file per domain name, opened in append mode, with identifier,
 *    name, domain and proportion.
 *
 * Side effect: m_termCounts[handle] is set to the total number of
 * observations of each term.
 */
void GeneOntology::writeOntologyFiles(){
	ostringstream theFile;
	theFile<<m_parameters->getPrefix()<<"/BiologicalAbundances/_GeneOntology";

	string directory=theFile.str();
	createDirectory(directory.c_str());

	theFile<<"/Terms";

	ostringstream xmlFileStream;
	xmlFileStream<<theFile.str()<<".xml";
	ostringstream tsvFileStream;
	tsvFileStream<<theFile.str()<<".tsv";

	string xmlFile=xmlFileStream.str();
	string tsvFile=tsvFileStream.str();

	if(!m_gotGeneOntologyParameter){
		return;
	}

	/* per-term statistics, kept for the Terms.tsv pass at the end */
	map<GeneOntologyIdentifier,int> modeCoverages;
	map<GeneOntologyIdentifier,double> meanCoverages;
	map<GeneOntologyIdentifier,double> estimatedProportions;

	ofstream xmlStream(xmlFile.c_str());
	ostringstream operationBuffer;

	cout<<"TOTAL: "<<m_kmerObservationsWithGeneOntologies<<endl;

	operationBuffer<<"<?xml version=\"1.0\" encoding=\"UTF-8\"?>"<<endl;
	operationBuffer<<"<root>"<<endl;

	LargeCount totalForTheGraph=m_searcher->getTotalNumberOfColoredKmerObservationsForANameSpace(COLOR_NAMESPACE_EMBL_CDS);
	operationBuffer<<"<totalColoredKmerObservations>";
	operationBuffer<<totalForTheGraph<<"</totalColoredKmerObservations>"<<endl;

	/* per-domain profile files and their in-memory content, keyed by domain name */
	map<string,FILE*> tsvFiles;
	map<string,ostringstream*> tsvBuffers;

	for(map<GeneOntologyIdentifier,map<CoverageDepth,int> >::iterator i=
		m_ontologyTermFrequencies.begin();i!=m_ontologyTermFrequencies.end();i++){

		GeneOntologyIdentifier handle=i->first;

		int mode=0;
		int modeCount=0;
		int total=0;

		LargeCount totalObservations=0;

		for(map<CoverageDepth,int>::iterator j=i->second.begin();j!=i->second.end();j++){

			CoverageDepth coverage=j->first;
			int frequency=j->second;

			/* strict comparison: on ties the first (lowest) coverage wins */
			if(frequency>modeCount){
				mode=coverage;
				modeCount=frequency;
			}

			total+=frequency;

			/* widen before multiplying so that the product is computed in
			 * LargeCount and not in the (presumably narrower) types of
			 * coverage and frequency, where it could overflow */
			totalObservations+=static_cast<LargeCount>(coverage)*static_cast<LargeCount>(frequency);
		}

		double mean=totalObservations;

		if(total!=0){
			mean/=total;
		}

		#ifdef ASSERT
		assert(modeCoverages.count(handle)==0);
		assert(meanCoverages.count(handle)==0);
		#endif

		modeCoverages[handle]=mode;
		meanCoverages[handle]=mean;

		GeneOntologyDomain domain=getDomain(handle);
		string domainName=getDomainName(domain);

		operationBuffer<<"<geneOntologyTerm>"<<endl;
		operationBuffer<<"<identifier>";
		operationBuffer<<getGeneOntologyIdentifier(handle)<<"</identifier><name>";
		operationBuffer<<getGeneOntologyName(handle)<<"</name>"<<endl;
		operationBuffer<<"<domain>"<<domainName<<"</domain>"<<endl;

		/* print paths to root */
		printPathsFromRoot(handle,&operationBuffer);

		operationBuffer<<"<modeKmerCoverage>"<<mode<<"</modeKmerCoverage>";
		operationBuffer<<"<meanKmerCoverage>"<<mean<<"</meanKmerCoverage>"<<endl;
		operationBuffer<<"<totalColoredKmerObservations>"<<totalObservations<<"</totalColoredKmerObservations>"<<endl;

		/* left as the raw count when the graph total is 0 */
		double estimatedProportion=(0.0+totalObservations);

		if(totalForTheGraph!=0){
			estimatedProportion/=totalForTheGraph;
		}

		estimatedProportions[handle]=estimatedProportion;

		m_termCounts[handle]=totalObservations;

		operationBuffer<<"<proportion>"<<estimatedProportion<<"</proportion>"<<endl;
		operationBuffer<<"<distribution>"<<endl;

		operationBuffer<<"#Coverage	Frequency"<<endl;

		for(map<CoverageDepth,int>::iterator j=i->second.begin();j!=i->second.end();j++){

			CoverageDepth coverage=j->first;
			int frequency=j->second;

			operationBuffer<<coverage<<"	"<<frequency<<endl;
		}

		operationBuffer<<"</distribution></geneOntologyTerm>"<<endl;

		flushFileOperationBuffer(false,&operationBuffer,&xmlStream,CONFIG_FILE_IO_BUFFER_SIZE);

		/* also add the term to the profile of its domain;
		 * the file and its buffer are created on first use */
		if(tsvFiles.count(domainName)==0){
			ostringstream profilePath;
			profilePath<<m_parameters->getPrefix()<<"/BiologicalAbundances/";
			profilePath<<"0.Profile.GeneOntologyDomain="<<domainName<<".tsv";

			string profileFile=profilePath.str();
			FILE*profile=fopen(profileFile.c_str(),"a");

			/* fopen returns NULL on failure; report it once here and
			 * skip the write when the files are closed below */
			if(profile==NULL){
				cout<<"Error: cannot open "<<profileFile<<" for appending"<<endl;
			}

			tsvFiles[domainName]=profile;

			tsvBuffers[domainName]=new ostringstream();

			*(tsvBuffers[domainName])<<"#TermIdentifier	TermName	TermDomain	TermProportion"<<endl;
		}

		ostringstream*profileBuffer=tsvBuffers[domainName];

		*profileBuffer<<getGeneOntologyIdentifier(handle)<<"	";
		*profileBuffer<<getGeneOntologyName(handle)<<"	";
		*profileBuffer<<domainName;
		*profileBuffer<<"	"<<estimatedProportion<<endl;
	}

	operationBuffer<<"</root>"<<endl;

	flushFileOperationBuffer(true,&operationBuffer,&xmlStream,CONFIG_FILE_IO_BUFFER_SIZE);

	xmlStream.close();

	/* operationBuffer is reused from here on for Terms.tsv */
	ofstream tsvStream(tsvFile.c_str());

	operationBuffer<<"#Identifier	Name	Mode k-mer coverage	Mean k-mer coverage	Proportion"<<endl;

	/* write and close the per-domain profiles, and release their buffers */
	for(map<string,FILE*>::iterator i=tsvFiles.begin();i!=tsvFiles.end();i++){

		string category=i->first;
		FILE*file=i->second;
		ostringstream*buffer=tsvBuffers[category];

		/* file is NULL when fopen failed; the buffer must be freed regardless */
		if(file!=NULL){
			string text=buffer->str();
			fprintf(file,"%s",text.c_str());
			fclose(file);
		}

		delete buffer;
	}

	tsvBuffers.clear();
	tsvFiles.clear();

	/* one row per term in Terms.tsv, from the statistics gathered above */
	for(map<GeneOntologyIdentifier,map<CoverageDepth,int> >::iterator i=
		m_ontologyTermFrequencies.begin();i!=m_ontologyTermFrequencies.end();i++){

		GeneOntologyIdentifier handle=i->first;

		operationBuffer<<getGeneOntologyIdentifier(handle)<<"	";
		operationBuffer<<getGeneOntologyName(handle)<<"	";
		operationBuffer<<modeCoverages[handle]<<"	";
		operationBuffer<<meanCoverages[handle]<<"	";
		operationBuffer<<estimatedProportions[handle]<<endl;

		flushFileOperationBuffer(false,&operationBuffer,&tsvStream,CONFIG_FILE_IO_BUFFER_SIZE);
	}

	flushFileOperationBuffer(true,&operationBuffer,&tsvStream,CONFIG_FILE_IO_BUFFER_SIZE);

	tsvStream.close();
}
Beispiel #4
0
/**
 * Write the per-depth abundance profiles of one gene ontology domain.
 *
 * For every depth in [0, getDomainDepth(domain)) the terms of
 * m_recursiveCounts are scanned and those that
 *   - have a non-zero count,
 *   - have a known depth equal to the current depth, and
 *   - belong to the requested domain
 * are written, one per row, to
 *   <prefix>/BiologicalAbundances/_GeneOntology/<domainName>.Depth=<depth>.tsv
 *
 * The file of a given depth is only created when the first matching term
 * is found, so depths without any term produce no file.
 *
 * The proportion column is count divided by the total number of colored
 * k-mer observations of the EMBL_CDS namespace; when that total is 0 the
 * raw count is written instead.
 *
 * @param domain the gene ontology domain to report
 */
void GeneOntology::writeOntologyProfile(GeneOntologyDomain domain){

	int deepestLevel=getDomainDepth(domain);

	cout<<"[GeneOntology] maximum depth for GeneOntologyDomain "<<domain<<" is "<<deepestLevel<<endl;

	/* "NULL" is the sentinel for an unrecognized domain */
	string label="NULL";

	if(domain==GENE_ONTOLOGY_DOMAIN_biological_process){
		label=GENE_ONTOLOGY_DOMAIN_biological_process_STRING;
	}else if(domain==GENE_ONTOLOGY_DOMAIN_cellular_component){
		label=GENE_ONTOLOGY_DOMAIN_cellular_component_STRING;
	}else if(domain==GENE_ONTOLOGY_DOMAIN_molecular_function){
		label=GENE_ONTOLOGY_DOMAIN_molecular_function_STRING;
	}

	#ifdef ASSERT
	assert(label!="NULL");
	#endif

	LargeCount graphTotal=m_searcher->getTotalNumberOfColoredKmerObservationsForANameSpace(COLOR_NAMESPACE_EMBL_CDS);

	for(int level=0;level<deepestLevel;level++){

		ostringstream rows;

		ostringstream pathBuilder;
		pathBuilder<<m_parameters->getPrefix()<<"/BiologicalAbundances/_GeneOntology"
			<<"/"<<label<<".Depth="<<level<<".tsv";
		string outputPath=pathBuilder.str();

		/* opened lazily, on the first term that qualifies */
		ofstream output;

		for(map<GeneOntologyIdentifier,int>::iterator term=m_recursiveCounts.begin();
			term!=m_recursiveCounts.end();term++){

			GeneOntologyIdentifier handle=term->first;
			int observations=term->second;

			/* filters are evaluated left to right, cheapest first */
			bool skip=observations==0
				|| !hasDepth(handle)
				|| getGeneOntologyDepth(handle)!=level
				|| getDomain(handle)!=domain;

			if(skip){
				continue;
			}

			double share=observations;

			if(graphTotal!=0){
				share/=graphTotal;
			}

			if(!output.is_open()){
				output.open(outputPath.c_str());
				rows<<"#Identifier	Name	Proportion	Observations	Total"<<endl;
			}

			rows<<getGeneOntologyIdentifier(handle)<<"	"<<getGeneOntologyName(handle)<<"	";
			rows<<share<<"	"<<observations<<"	"<<graphTotal<<endl;

			flushFileOperationBuffer(false,&rows,&output,CONFIG_FILE_IO_BUFFER_SIZE);
		}

		/* nothing to flush or close if no term matched this depth */
		if(output.is_open()){
			flushFileOperationBuffer(true,&rows,&output,CONFIG_FILE_IO_BUFFER_SIZE);
			output.close();
		}
	}
}
Beispiel #5
0
/**
 * Write the taxonomic profile as XML to the given stream, and append
 * per-rank profiles to tab-separated files.
 *
 * XML content, in order:
 *  - sample name and the assembled / colored-assembled k-mer totals
 *    (colored = m_totalNumberOfKmerObservations - m_unknown);
 *  - a <ranks> section with self and recursive totals per rank, as
 *    filled by populateRanks;
 *  - one <entry> for the "unknown" pseudo-taxon;
 *  - one <entry> per taxon of m_taxonNames whose recursive count is not 0.
 *
 * Every ratio is left as the raw count when its denominator is 0.
 *
 * For each rank encountered among the reported taxa, the file
 * <prefix>/BiologicalAbundances/0.Profile.TaxonomyRank=<rank>.tsv is
 * opened in append mode and receives a header line followed by one row
 * per taxon (identifier, name, rank, proportion within the rank).
 * If such a file cannot be opened, an error is printed and the rows for
 * that rank are dropped; the XML output is not affected.
 *
 * @param stream destination of the XML document
 */
void PhylogenyViewer::showObservations_XML(ostream*stream){

	ostringstream operationBuffer;

	/* build a mashup for the ranks
	 * this will contain total at each level */

	map<string,LargeCount> rankRecursiveObservations;
	map<string,LargeCount> rankSelfObservations;

	populateRanks(&rankSelfObservations,&rankRecursiveObservations);

	operationBuffer<<"<?xml version=\"1.0\" encoding=\"UTF-8\"?>"<<endl;
	operationBuffer<<"<root>"<<endl;

	/* add the sample name in the XML file */
	operationBuffer<<"<sample>";
	operationBuffer<<m_parameters->getSampleName();
	operationBuffer<<"</sample>"<<endl;

	operationBuffer<<"<totalAssembledKmerObservations>"<<m_totalNumberOfKmerObservations<<"</totalAssembledKmerObservations>"<<endl;

	LargeCount totalColoredAssembledKmerObservations=m_totalNumberOfKmerObservations-m_unknown;

	operationBuffer<<"<totalColoredAssembledKmerObservations>"<<totalColoredAssembledKmerObservations<<"</totalColoredAssembledKmerObservations>"<<endl;

	/* totals per rank */
	operationBuffer<<"<ranks>"<<endl;

	for(map<string,LargeCount>::iterator i=rankSelfObservations.begin();i!=rankSelfObservations.end();i++){
		string rank=i->first;

		#ifdef ASSERT
		assert(rankRecursiveObservations.count(rank)>0);
		assert(rankSelfObservations.count(rank)>0);
		#endif

		operationBuffer<<"<entry><rank>"<<rank<<"</rank><self><kmerObservations>";
		operationBuffer<<rankSelfObservations[rank]<<"</kmerObservations></self>";
		operationBuffer<<"<recursive><kmerObservations>"<<rankRecursiveObservations[rank];
		operationBuffer<<"</kmerObservations></recursive></entry>"<<endl;

	}

	operationBuffer<<"</ranks>"<<endl;

	/* entry for the observations that have no taxon */
	operationBuffer<<"<entry>";
	operationBuffer<<"<taxon><identifier>unknown</identifier><name>unknown</name><rank>unknown</rank></taxon>"<<endl;
	operationBuffer<<"<path></path>"<<endl;
	operationBuffer<<"<self><kmerObservations>"<<m_unknown<<"</kmerObservations>";

	double unknownRatio=m_unknown;

	if(m_totalNumberOfKmerObservations!=0)
		unknownRatio/=m_totalNumberOfKmerObservations;

	operationBuffer<<"<proportion>"<<unknownRatio<<"</proportion>";
	operationBuffer<<"<coloredProportion>0</coloredProportion>";
	operationBuffer<<"<coloredProportionInRank>0</coloredProportionInRank></self></entry>"<<endl;

	/* per-rank profile files and their in-memory content, keyed by rank */
	map<string,FILE*> tsvFiles;
	map<string,ostringstream*> tsvBuffers;

	for(map<TaxonIdentifier,string>::iterator i=m_taxonNames.begin();
		i!=m_taxonNames.end();i++){

		TaxonIdentifier taxon=i->first;

		LargeCount count=getSelfCount(taxon);

		string rank=getTaxonRank(taxon);

		#ifdef ASSERT
		assert(rankSelfObservations.count(rank)>0);
		assert(rankRecursiveObservations.count(rank)>0);
		#endif

		LargeCount rankRecursiveCount=rankRecursiveObservations[rank];

		#ifdef ASSERT
		LargeCount rankSelfCount=rankSelfObservations[rank];

		assert(rankSelfCount>=0);
		assert(rankRecursiveCount>=0);
		#endif

		LargeCount recursiveCount=getRecursiveCount(taxon);

		/* taxa without any observation in their subtree are not reported */
		if(recursiveCount==0){
			continue;
		}

		operationBuffer<<"<entry>"<<endl;

		printTaxon_XML(taxon,&operationBuffer);

		vector<TaxonIdentifier> path;

		getTaxonPathFromRoot(taxon,&path);
		printTaxonPath_XML(taxon,&path,&operationBuffer);

		/* observations assigned to the taxon itself */
		operationBuffer<<"<self>"<<endl;
		operationBuffer<<"<kmerObservations>"<<count<<"</kmerObservations>";

		double selfRatio=count;
		if(m_totalNumberOfKmerObservations!=0)
			selfRatio/=m_totalNumberOfKmerObservations;

		operationBuffer<<"<proportion>"<<selfRatio<<"</proportion>";

		double coloredRatio=count;

		if(totalColoredAssembledKmerObservations!=0){
			coloredRatio/=totalColoredAssembledKmerObservations;
		}

		operationBuffer<<"<coloredProportion>"<<coloredRatio<<"</coloredProportion>";

		operationBuffer<<"</self>"<<endl;

		/* observations as returned by getRecursiveCount */
		operationBuffer<<"<recursive>";
		operationBuffer<<"<kmerObservations>";
		operationBuffer<<recursiveCount;
		operationBuffer<<"</kmerObservations>"<<endl;

		double ratio2=recursiveCount;
		if(m_totalNumberOfKmerObservations!=0)
			ratio2/=m_totalNumberOfKmerObservations;

		operationBuffer<<"<proportion>"<<ratio2<<"</proportion>";

		double coloredRatio2=recursiveCount;

		if(totalColoredAssembledKmerObservations!=0){
			coloredRatio2/=totalColoredAssembledKmerObservations;
		}

		operationBuffer<<"<coloredProportion>"<<coloredRatio2<<"</coloredProportion>";

		double coloredRatioInRank=recursiveCount;

		if(rankRecursiveCount!=0){
			coloredRatioInRank/=rankRecursiveCount;
		}

		operationBuffer<<"<coloredProportionInRank>"<<coloredRatioInRank<<"</coloredProportionInRank>";

		operationBuffer<<"</recursive>"<<endl;

		operationBuffer<<"</entry>"<<endl;

		flushFileOperationBuffer(false,&operationBuffer,stream,CONFIG_FILE_IO_BUFFER_SIZE);

		/* add data to the tsv file of the rank;
		 * the file and its buffer are created on first use */
		if(tsvFiles.count(rank)==0){
			ostringstream theFile;
			theFile<<m_parameters->getPrefix()<<"/BiologicalAbundances/";
			theFile<<"0.Profile.TaxonomyRank="<<rank<<".tsv";

			string tsvFile=theFile.str();
			FILE*profile=fopen(tsvFile.c_str(),"a");

			/* fopen returns NULL on failure; report it once here and
			 * skip the write when the files are closed below */
			if(profile==NULL){
				cout<<"Error: cannot open "<<tsvFile<<" for appending"<<endl;
			}

			tsvFiles[rank]=profile;

			tsvBuffers[rank]=new ostringstream();

			*(tsvBuffers[rank])<<"#TaxonIdentifier	TaxonName	TaxonRank	TaxonProportion"<<endl;
		}

		string name=getTaxonName(taxon);

		*(tsvBuffers[rank])<<taxon<<"	"<<name<<"	"<<rank;
		*(tsvBuffers[rank])<<"	"<<coloredRatioInRank<<endl;
	}

	/* finish the XML document */
	operationBuffer<<"</root>"<<endl;
	flushFileOperationBuffer(true,&operationBuffer,stream,CONFIG_FILE_IO_BUFFER_SIZE);

	/* write and close the per-rank profiles, and release their buffers */
	for(map<string,FILE*>::iterator i=tsvFiles.begin();i!=tsvFiles.end();i++){

		string rank=i->first;
		FILE*file=i->second;
		ostringstream*buffer=tsvBuffers[rank];

		/* file is NULL when fopen failed; the buffer must be freed regardless */
		if(file!=NULL){
			string text=buffer->str();
			fprintf(file,"%s",text.c_str());
			fclose(file);
		}

		delete buffer;
	}

	tsvBuffers.clear();
	tsvFiles.clear();

}