Example no. 1
0
bool Model::readModel (bool ccwVerts){
  int vIndex = 0;
  int fIndex = 0;
  int vCount = 0;
  int fCount = 0;
  std::string line;
  std::string filename = "./models/";
  filename.append(m_fName);
  std::ifstream modelStream(filename.c_str());
  if (!modelStream.is_open()){
    std::cerr << "File: " << filename << " could not be opened." << std::endl;
    exit(EXIT_FAILURE);
  }
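  // Parse the PLY header one line at a time: lines such as "ply", "format",
  // "comment", "property" and "end_header" carry no counts and are skipped,
  // while the "element vertex"/"element face" lines provide vCount and fCount.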
  while(getline(modelStream,line)){
    if (line.find("ply") != std::string::npos ||
        line.find("format") != std::string::npos ||
        line.find("comment") != std::string::npos ||
        line.find("property") != std::string::npos ||
        line.find("end_header") != std::string::npos) {
      /* ignore: these header lines are not used */
    }
    else if (line.find("element vertex ") != std::string::npos){
      std::istringstream vcLineStream(line);
      std::string notNeeded;
      vcLineStream >> notNeeded >> notNeeded >> vCount;
      if (vCount == 0){
        std::cerr << "Number of vertices not provided" << std::endl;
        exit(EXIT_FAILURE);
      }
    }
    else if (line.find("element face ") != std::string::npos){
Example no. 2
0
int main(int argc, char* argv[])
{
    // Parse command-line arguments
    Parameters parameters;
    bool bParsed = parseCommandLine(argc, argv, parameters);

    if(!bParsed || parameters.bShowHelp || argc == 1)
    {
        help();
        return 0;
    }
    else if(parameters.bShowVersion)
    {
        std::cout << "Naive Bayes Classify v1.0.5 by Donovan Parks, Norm MacDonald, and Rob Beiko." << std::endl;
        return 0;
    }
    else if(parameters.bShowContactInfo)
    {
        std::cout << "Comments, suggestions, and bug reports can be sent to Donovan Parks ([email protected])." << std::endl;
        return 0;
    }
    else if(parameters.queryFile.empty() || parameters.modelFile.empty() || parameters.resultsFile.empty())
    {
        std::cout << "Must specify query (-q), model (-m), and result (-r) file." << std::endl << std::endl;
        help();
        return 0;
    }

    bool bRecordAllModels = false;
    if(parameters.topModels <= 0)
    {
        bRecordAllModels = true;
        parameters.topModels = 0;
    }
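    // A topModels value of zero or less means the log likelihood of every
    // model is recorded for each fragment instead of only the top-scoring ones.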

    // Get model k-mer length
    if(parameters.verbose >= 1)
        std::cout << "Determining n-mer length..." << std::endl;

    std::ifstream tempStream(parameters.modelFile.c_str(), std::ios::in);
    if(tempStream.fail())
    {
        std::cout << "Failed to open model file: " << parameters.modelFile << std::endl << std::endl;
        return -1;
    }
    std::string line;
    std::getline(tempStream, line);
    KmerModel tempModel(line);
    uint kmerLength = tempModel.kmerLength();
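    // The n-mer length is taken from the first model in the model file; the
    // remaining models are assumed to use the same length.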
    if(parameters.verbose >= 1)
        std::cout << "  n-mer length: " << kmerLength << std::endl << std::endl;

    // Read query fragments

    if(parameters.verbose >= 1)
        std::cout << "Reading query fragments..." << std::endl;

    char* buffer = NULL;
    std::vector<SeqInfo> querySeqs;
    FastaIO fastaIO;
    bool bSuccess = fastaIO.readSeqs(parameters.queryFile, querySeqs, buffer, parameters.verbose);
    if(!bSuccess)
    {
        std::cout << "Failed to open query fragment file: " << parameters.queryFile << std::endl;
        return -1;
    }
    if(parameters.verbose >= 1)
        std::cout << "  Number of query fragments: " << querySeqs.size() << std::endl << std::endl;

    // Classify query fragments in batches in order to keep memory requirements within reason (~ 1GB)
    if(parameters.verbose >= 1)
        std::cout << "Processing query fragments in batches of " << parameters.batchSize << "." << std::endl << std::endl;

    KmerCalculator kmerCalculator(kmerLength);
    for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum)
    {
        if(parameters.verbose >= 1)
            std::cout << "Batch #" << (batchNum+1) << std::endl;

        // get k-mers for each query fragment
        if(parameters.verbose >= 1)
            std::cout << "  Calculating n-mers in query fragment: " << std::endl;

        std::vector< std::vector<uint> > queryKmerProfiles;
        queryKmerProfiles.reserve(parameters.batchSize);
        for(uint seqIndex = batchNum*parameters.batchSize;
                seqIndex < std::min(ulong(querySeqs.size()), ulong(batchNum+1)*parameters.batchSize);
                ++seqIndex)
        {
            if(parameters.verbose >= 3)
                std::cout << querySeqs.at(seqIndex).seqId << std::endl;
            else if (seqIndex % 5000 == 0 && parameters.verbose >= 1)
                std::cout << "." << std::flush;

            std::vector<uint> profile;
            kmerCalculator.extractForwardKmers(querySeqs.at(seqIndex), profile);
            queryKmerProfiles.push_back(profile);
        }
        if(parameters.verbose >= 1)
            std::cout << std::endl;

        // apply each model to each query sequence
        if(parameters.verbose >= 1)
            std::cout << "  Applying models to query sequences: " << std::endl;

        std::ifstream modelStream(parameters.modelFile.c_str(), std::ios::in);

        uint modelNum = 0;

        std::vector<std::string> modelNames;
        std::vector< std::list<TopModel> > topModelsPerFragment(queryKmerProfiles.size());
        std::vector< std::vector<float> > modelLogLikelihoods;
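        // modelNames records the models in file order, topModelsPerFragment keeps the
        // best-scoring models for each fragment, and modelLogLikelihoods holds every
        // score when all models are being recorded.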
        while(!modelStream.eof())
        {
            std::string line;
            std::getline(modelStream, line);

            if(line.empty())
                break;

            if(modelNum % 200 == 0 && parameters.verbose >= 1)
                std::cout << " " << modelNum << std::flush;

            KmerModel kmerModel(line);
            modelNames.push_back(kmerModel.name());
            if(parameters.verbose >= 2)
            {
                kmerModel.printModelInfo(std::cout);
                std::cout << std::endl;
            }

            ulong size = 0;
            if(bRecordAllModels)
                size = queryKmerProfiles.size();
            std::vector<float> logLikelihoods(size);
            for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex)
            {
                SeqInfo querySeqInfo = querySeqs[seqIndex + batchNum*parameters.batchSize];
                float logLikelihood = kmerModel.classify(querySeqInfo, queryKmerProfiles[seqIndex]);

                // record models with highest log likelihood
                if(bRecordAllModels)
                {
                    logLikelihoods[seqIndex] = logLikelihood;
                }
                else
                {
                    std::list<TopModel> topModels = topModelsPerFragment.at(seqIndex);

                    if(topModels.size() == 0)
                        topModels.push_front(TopModel(modelNum, logLikelihood));

                    std::list<TopModel>::iterator it;
                    bool bInserted = false;
                    for(it = topModels.begin(); it != topModels.end(); it++)
                    {
                        if(logLikelihood > it->logLikelihood)
                        {
                            topModels.insert(it, TopModel(modelNum, logLikelihood));
                            bInserted = true;
                            break;
                        }
                    }

                    if((int)topModels.size() < parameters.topModels && !bInserted)
                        topModels.push_back(TopModel(modelNum, logLikelihood));
                    else if((int)topModels.size() > parameters.topModels)
                        topModels.pop_back();

                    topModelsPerFragment.at(seqIndex) = topModels;
                }
            }

            if(bRecordAllModels)
                modelLogLikelihoods.push_back(logLikelihoods);

            modelNum++;
        }
        if(parameters.verbose >= 1)
            std::cout << std::endl;

        // write out classification
        if(parameters.verbose >= 1)
            std::cout << "  Writing out classification results." << std::endl << std::endl;

        std::stringstream outputTempResults;
        outputTempResults << "./batch_" << batchNum << "." << parameters.tempExtension;
        std::ofstream fout(outputTempResults.str().c_str(), std::ios::out);
        if(fout.fail())
        {
            std::cout << "Failed to write temporary results file: " << outputTempResults.str() << std::endl;
            return -1;
        }

        // check if all model results are to be written out
        if(bRecordAllModels)
        {
            if(batchNum == 0)
            {
                fout << "Fragment Id" << "\t" << "Length" << "\t" << "Valid n-mers";
                for(uint modelIndex = 0; modelIndex < modelNames.size(); ++modelIndex)
                    fout << "\t" << modelNames[modelIndex];
                fout << std::endl;
            }

            for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex)
            {
                SeqInfo querySeqInfo = querySeqs.at(seqIndex + batchNum*parameters.batchSize);

                fout << querySeqInfo.seqId << "\t" << querySeqInfo.length << "\t" << querySeqInfo.validKmers;

                for(uint modelIndex = 0; modelIndex < modelNames.size(); ++modelIndex)
                    fout << "\t" << modelLogLikelihoods[modelIndex][seqIndex];
                fout << std::endl;
            }
        }
        else
        {
            for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex)
            {
                SeqInfo querySeqInfo = querySeqs.at(seqIndex + batchNum*parameters.batchSize);

                fout << querySeqInfo.seqId << "\t" << querySeqInfo.length << "\t" << querySeqInfo.validKmers;

                std::list<TopModel>::iterator it;
                for(it = topModelsPerFragment.at(seqIndex).begin(); it != topModelsPerFragment.at(seqIndex).end(); it++)
                    fout << "\t" << modelNames[it->modelNum] << "\t" << it->logLikelihood;

                fout << std::endl;
            }
        }

        fout.close();
    }

    // free memory allocated to hold query fragment data
    delete[] buffer;

    // Concatenate result files
    if(parameters.verbose >= 1)
        std::cout << "Building results file: ";

    std::ofstream resultsStream(parameters.resultsFile.c_str(), std::ios::out | std::ios::binary);
    for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum)
    {
        if(parameters.verbose >= 1)
            std::cout << "." << std::flush;

        std::stringstream tempResultFile;
        tempResultFile << "./batch_" << batchNum  << "." << parameters.tempExtension;
        std::ifstream tempStream(tempResultFile.str().c_str(), std::ios::binary);
        if(tempStream.fail() || tempStream.bad())
        {
            std::cout << "Failed to open file: " << tempResultFile.str() << std::endl;
            return -1;
        }

        // calculate size of file
        tempStream.seekg(0, std::ios::end);
        ulong fileSize = tempStream.tellg();
        tempStream.seekg(0, std::ios::beg);

        // write out data in reasonable sized chunks
        ulong chunkSize = 64*1024*1024;

        // allocate memory for reading file
        char* tempBuffer = new (std::nothrow) char[chunkSize];
        if(tempBuffer == NULL)
        {
            std::cout << std::endl << "Failed to allocate memory required by file: " << tempResultFile.str() << std::endl;
            return -1;
        }

        for(uint chunk = 0; chunk < ceil(float(fileSize) / chunkSize); ++chunk)
        {
            ulong currentChunkSize = std::min(chunkSize, fileSize - chunk*chunkSize);

            // read file into buffer
            tempStream.read(tempBuffer, currentChunkSize);
            if(tempStream.fail() || tempStream.bad())
            {
                std::cout << std::endl << "Failed to read data from " << tempResultFile.str() << std::endl;
                return -1;
            }

            resultsStream.write(tempBuffer, currentChunkSize);
            resultsStream.flush();
        }

        tempStream.close();
        delete[] tempBuffer;
    }
    resultsStream.close();

    if(parameters.verbose >= 1)
    {
        std::cout << std::endl;
        std::cout << "Done." << std::endl;
    }

    for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum)
    {
        std::stringstream filename;
        filename << "./batch_" << batchNum  << "." << parameters.tempExtension;
        std::remove(filename.str().c_str());
    }

    return 0;
}
int main(int argc, char* argv[])
{
	// Parse command-line arguments
	Parameters parameters;
	bool bParsed = parseCommandLine(argc, argv, parameters);

	if(!bParsed || parameters.bShowHelp || argc == 1)
	{
		help();
		return 0;
	}
	else if(parameters.bShowVersion)
	{
		std::cout << "Naive Bayes Train v1.0.7 by Donovan Parks, Norm MacDonald, and Rob Beiko." << std::endl;
		return 0;
	}
	else if(parameters.bShowContactInfo)
	{
		std::cout << "Comments, suggestions, and bug reports can be sent to Donovan Parks ([email protected])." << std::endl;
		return 0;
	}
	else if(parameters.outputDir.empty() || parameters.sequenceFile.empty())
	{
		std::cout << "Must specify sequence file (-s) and output directory (-m)." << std::endl << std::endl;
		help();
		return 0;
	}

	// train model for each sequence in the sequence file
	std::cout << "Training models..." << std::endl;
	uint numModels = 0;
	std::ifstream modelStream(parameters.sequenceFile.c_str(), std::ios::in);
	while(!modelStream.eof())
	{
		std::string line;
		getline(modelStream, line);

		line.erase(std::remove_if(line.begin(), line.end(), ::isspace), line.end());

		if(line.empty())
			continue;

		std::string modelName = line.substr(line.find_last_of('/')+1, std::string::npos);
		modelName = modelName.substr(0, modelName.find_last_of('.'));
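		// the model is named after the sequence file: strip the directory and extension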

		std::cout << "  Processing model " << modelName << std::endl;

		KmerModel kmerModel(parameters.kmerSize);
		kmerModel.name(modelName);

		numModels++;

		FastaIO fastaIO;

		bool bOK = fastaIO.open(line);
		if(!bOK)
		{
			std::cerr << "Error opening file: " << line << std::endl;
			return -1;
		}

		while(true)
		{
			SeqInfo seqInfo;
			bool bNextSeq = fastaIO.nextSeq(seqInfo);
			if(!bNextSeq)
				break;

			seqInfo.taxonomy.strain = modelName;

			bOK = kmerModel.constructModel(seqInfo);
			if(!bOK)
			{
				std::cerr << "Error building model." << std::endl;
				return -1;
			}
		}	

		kmerModel.calculateConditionalProbabilities();
		kmerModel.write(parameters.outputDir + modelName + ".txt");
	}

	std::cout << std::endl;
	std::cout << "Number of models: " << numModels << std::endl;

	return 0;
}