std::vector<TskCarveExtractScalpel::CarvedFile> TskCarveExtractScalpel::parseCarvingResultsFile(int unallocImgId, const std::string &resultsFilePath) const { try { std::vector<CarvedFile> carvedFiles; Poco::File resultsFile(resultsFilePath); if (!resultsFile.exists()) { std::stringstream msg; msg << "TskCarveExtractScalpel::parseCarvingResultsFile : could not find Scalpel carving results file for unalloc img id " << unallocImgId; throw TskException(msg.str()); } std::ifstream resultsStream(resultsFilePath.c_str()); if (!resultsStream) { std::stringstream msg; msg << "TskCarveExtractScalpel::parseCarvingResultsFile : unable to open Scalpel carving results file for unalloc img id " << unallocImgId; throw TskException(msg.str()); } // Discard all of the file up to and including the header for the carved files list. std::string line; while (std::getline(resultsStream, line) && line.find("Extracted From") == std::string::npos); // Parse the files list. const std::size_t numberOfFileFields = 5; while (std::getline(resultsStream, line)) { // Tokenize the next line of the results file and see if it is part of the files list by checking the number of tokens. Poco::StringTokenizer tokenizer(line, "\t ", Poco::StringTokenizer::TOK_IGNORE_EMPTY | Poco::StringTokenizer::TOK_TRIM); if (tokenizer.count() != numberOfFileFields) { // No more files in the files list. break; } carvedFiles.push_back(CarvedFile(unallocImgId, tokenizer[0], tokenizer[1], tokenizer[3])); } resultsStream.close(); return carvedFiles; } catch (Poco::Exception &ex) { std::stringstream msg; msg << "TskCarveExtractScalpel::parseCarvingResultsFile : Poco exception: " << ex.displayText(); throw TskException(msg.str()); } }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pHsgt" << endl; for(int n = 0 ; n < MAXNUMNTS ; n++) numTerm[n] = 0; Term::init( path ); readHeadInfo(path); int sentenceCount = 0; ECString s1lex("^^"); ECString s1nm("S1"); int s1Int = Term::get(s1nm)->toInt(); UnitRules ur; ur.init(); while(cin) { //if(sentenceCount > 4000) break; if(sentenceCount%10000 == 0) cerr << sentenceCount << endl; InputTree parse; cin >> parse; //cerr << parse << endl; if(!cin) break; if(parse.length() == 0) break; EcSPairs wtList; parse.make(wtList); InputTree* par; par = &parse; addWwData(par); incrWordData(s1Int, s1lex); ur.gatherData(par); sentenceCount++; } ECString resultsString(path); resultsString += "pSgT.txt"; ofstream resultsStream(resultsString.c_str()); assert(resultsStream); int numWords = 0; resultsStream << " \n"; //leave space for number of words; resultsStream.precision(3); ECString lastWord; int wordFreq = 0; WordMap::iterator wmi = wordMap.begin(); resultsStream << wordMap.size() << "\n\n"; for( ; wmi != wordMap.end() ; wmi++) { ECString w = (*wmi).first; resultsStream << w << "\t"; PosD& posd = (*wmi).second; PosD::iterator pdi = posd.begin(); int count = 0; for( ; pdi != posd.end(); pdi++) { int posInt = (*pdi).first; int c = (*pdi).second; count += c; float p = (float)c/(float)numTerm[posInt]; resultsStream << posInt << " " << p << " "; } resultsStream << "| " << count << "\n"; } ur.setData(path); return 1; }
int main(int argc, char* argv[]) { // Parse command-line arguments Parameters parameters; bool bParsed = parseCommandLine(argc, argv, parameters); if(!bParsed || parameters.bShowHelp || argc == 1) { help(); return 0; } else if(parameters.bShowVersion) { std::cout << "Naive Bayes Classify v1.0.5 by Donovan Parks, Norm MacDonald, and Rob Beiko." << std::endl; return 0; } else if(parameters.bShowContactInfo) { std::cout << "Comments, suggestions, and bug reports can be sent to Donovan Parks ([email protected])." << std::endl; return 0; } else if(parameters.queryFile.empty() || parameters.modelFile.empty() || parameters.resultsFile.empty()) { std::cout << "Must specify query (-q), model (-m), and result (-r) file." << std::endl << std::endl; help(); return 0; } bool bRecordAllModels = false; if(parameters.topModels <= 0) { bRecordAllModels = true; parameters.topModels = 0; } // Get model k-mer length if(parameters.verbose >= 1) std::cout << "Determining n-mer length..." << std::endl; std::ifstream tempStream(parameters.modelFile.c_str(), std::ios::in); if(tempStream.fail()) { std::cout << "Failed to open model file: " << parameters.modelFile << std::endl << std::endl; return -1; } std::string line; std::getline(tempStream, line); KmerModel tempModel(line); uint kmerLength = tempModel.kmerLength(); if(parameters.verbose >= 1) std::cout << " n-mer length: " << kmerLength << std::endl << std::endl; // Read query fragments if(parameters.verbose >= 1) std::cout << "Reading query fragments..." << std::endl; char* buffer = NULL; std::vector<SeqInfo> querySeqs; FastaIO fastaIO; bool bSuccess = fastaIO.readSeqs(parameters.queryFile, querySeqs, buffer, parameters.verbose); if(!bSuccess) { std::cout << "Failed to open query fragment file: " << parameters.queryFile << std::endl; return -1; } if(parameters.verbose >= 1) std::cout << " Number of query fragments: " << querySeqs.size() << std::endl << std::endl; // Classify query fragments in batches in order to keep memory requirements within reason (~ 1GB) if(parameters.verbose >= 1) std::cout << "Processing query fragments in batches of " << parameters.batchSize << "." << std::endl << std::endl; KmerCalculator kmerCalculator(kmerLength); for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum) { if(parameters.verbose >= 1) std::cout << "Batch #" << (batchNum+1) << std::endl; // get k-mers for each query fragment if(parameters.verbose >= 1) std::cout << " Calculating n-mers in query fragment: " << std::endl; std::vector< std::vector<uint> > queryKmerProfiles; queryKmerProfiles.reserve(parameters.batchSize); for(uint seqIndex = batchNum*parameters.batchSize; seqIndex < std::min(ulong(querySeqs.size()), ulong(batchNum+1)*parameters.batchSize); ++seqIndex) { if(parameters.verbose >= 3) std::cout << querySeqs.at(seqIndex).seqId << std::endl; else if (seqIndex % 5000 == 0 && parameters.verbose >= 1) std::cout << "." << std::flush; std::vector<uint> profile; kmerCalculator.extractForwardKmers(querySeqs.at(seqIndex), profile); queryKmerProfiles.push_back(profile); } if(parameters.verbose >= 1) std::cout << std::endl; // apply each model to each query sequence if(parameters.verbose >= 1) std::cout << " Applying models to query sequences: " << std::endl; std::ifstream modelStream(parameters.modelFile.c_str(), std::ios::in); uint modelNum = 0; std::vector<std::string> modelNames; std::vector< std::list<TopModel> > topModelsPerFragment(queryKmerProfiles.size()); std::vector< std::vector<float> > modelLogLikelihoods; while(!modelStream.eof()) { std::string line; std::getline(modelStream, line); if(line.empty()) break; if(modelNum % 200 == 0 && parameters.verbose >= 1) std::cout << " " << modelNum << std::flush; KmerModel kmerModel(line); modelNames.push_back(kmerModel.name()); if(parameters.verbose >= 2) { kmerModel.printModelInfo(std::cout); std::cout << std::endl; } ulong size = 0; if(bRecordAllModels) size = queryKmerProfiles.size(); std::vector<float> logLikelihoods(size); for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex) { SeqInfo querySeqInfo = querySeqs[seqIndex + batchNum*parameters.batchSize]; float logLikelihood = kmerModel.classify(querySeqInfo, queryKmerProfiles[seqIndex]); // record models with highest log likelihood if(bRecordAllModels) { logLikelihoods[seqIndex] = logLikelihood; } else { std::list<TopModel> topModels = topModelsPerFragment.at(seqIndex); if(topModels.size() == 0) topModels.push_front(TopModel(modelNum, logLikelihood)); std::list<TopModel>::iterator it; bool bInserted = false; for(it = topModels.begin(); it != topModels.end(); it++) { if(logLikelihood > it->logLikelihood) { topModels.insert(it, TopModel(modelNum, logLikelihood)); bInserted = true; break; } } if((int)topModels.size() < parameters.topModels && !bInserted) topModels.push_back(TopModel(modelNum, logLikelihood)); else if((int)topModels.size() > parameters.topModels) topModels.pop_back(); topModelsPerFragment.at(seqIndex) = topModels; } } if(bRecordAllModels) modelLogLikelihoods.push_back(logLikelihoods); modelNum++; } if(parameters.verbose >= 1) std::cout << std::endl; // write out classification if(parameters.verbose >= 1) std::cout << " Writing out classification results." << std::endl << std::endl; std::stringstream outputTempResults; outputTempResults << "./batch_" << batchNum << "." << parameters.tempExtension; std::ofstream fout(outputTempResults.str().c_str(), std::ios::out); if(fout.fail()) { std::cout << "Failed to write temporary results file: " << outputTempResults.str() << std::endl; return -1; } // check if all model results are to be written out if(bRecordAllModels) { if(batchNum == 0) { fout << "Fragment Id" << "\t" << "Length" << "\t" << "Valid n-mers"; for(uint modelIndex = 0; modelIndex < modelNames.size(); ++modelIndex) fout << "\t" << modelNames[modelIndex]; fout << std::endl; } for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex) { SeqInfo querySeqInfo = querySeqs.at(seqIndex + batchNum*parameters.batchSize); fout << querySeqInfo.seqId << "\t" << querySeqInfo.length << "\t" << querySeqInfo.validKmers; for(uint modelIndex = 0; modelIndex < modelNames.size(); ++modelIndex) fout << "\t" << modelLogLikelihoods[modelIndex][seqIndex]; fout << std::endl; } } else { for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex) { SeqInfo querySeqInfo = querySeqs.at(seqIndex + batchNum*parameters.batchSize); fout << querySeqInfo.seqId << "\t" << querySeqInfo.length << "\t" << querySeqInfo.validKmers; std::list<TopModel>::iterator it; for(it = topModelsPerFragment.at(seqIndex).begin(); it != topModelsPerFragment.at(seqIndex).end(); it++) fout << "\t" << modelNames[it->modelNum] << "\t" << it->logLikelihood; fout << std::endl; } } fout.close(); } // free memory allocated to hold query fragment data delete[] buffer; // Concatenate result files if(parameters.verbose >= 1) std::cout << "Building results file: "; std::ofstream resultsStream(parameters.resultsFile.c_str(), std::ios::out | std::ios::binary); for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum) { if(parameters.verbose >= 1) std::cout << "." << std::flush; std::stringstream tempResultFile; tempResultFile << "./batch_" << batchNum << "." << parameters.tempExtension; std::ifstream tempStream(tempResultFile.str().c_str(), std::ios::binary); if(tempStream.fail() || tempStream.bad()) { std::cout << "Failed to open file: " << tempResultFile.str() << std::endl; return -1; } // calculate size of file tempStream.seekg(0, std::ios::end); ulong fileSize = tempStream.tellg(); tempStream.seekg(0, std::ios::beg); // write out data in reasonable sized chunks ulong chunkSize = 64*1024*1024; // allocate memory for reading file char* tempBuffer = new char[chunkSize]; if(tempBuffer == NULL) { std::cout << std::endl << "Failed to allocate memory required by file: " << tempResultFile.str() << std::endl; return -1; } for(uint chunk = 0; chunk < ceil(float(fileSize) / chunkSize); ++chunk) { ulong currentChunkSize = std::min(chunkSize, fileSize - chunk*chunkSize); // read file into buffer tempStream.read(tempBuffer, currentChunkSize); if(tempStream.fail() || tempStream.bad()) { std::cout << std::endl << "Failed to read data from " << tempResultFile.str() << std::endl; return -1; } resultsStream.write(tempBuffer, currentChunkSize); resultsStream.flush(); } tempStream.close(); delete[] tempBuffer; } resultsStream.close(); if(parameters.verbose >= 1) { std::cout << std::endl; std::cout << "Done." << std::endl; } for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum) { std::stringstream filename; filename << "./batch_" << batchNum << "." << parameters.tempExtension; std::remove(filename.str().c_str()); } return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pSfgt" << endl; for(int n = 0 ; n < 140 ; n++) numTerm[n] = 0; ECString resultsString(path); resultsString += "endings.txt"; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); //???; int sentenceCount = 0; int wordCount = 0; int processedCount = 0; /*int i, j; for(i = 0 ; i < 60 ; i++) for(j = 0 ; j < 30 ; j++) data[i][j] = 0; */ int i = 0; while(cin) { if(i++%5000 == 1) cerr << i << endl; InputTree parse; cin >> parse; if(!cin) break; if(parse.length() == 0 && cin) continue; if(parse.length()==0 ||!cin) break; addWwData(&parse); processedCount++; wordCount += parse.length(); } ofstream resultsStream(resultsString.c_str()); assert(resultsStream); /*int totNt[30]; for(i = 0 ; i < 30 ; i++) totNt[i] = 0; for(i = 0 ; i <= Term::lastTagInt() ; i++) { for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++) totNt[j] += data[i][j]; } */ resultsStream << numEndings << "\n"; for(i = 0 ; i < 140 ; i++) { endMap::iterator emi = endData[i].begin(); for( ; emi != endData[i].end() ; emi++) { ECString ending = (*emi).first; int cnt = (*emi).second; resultsStream << i << "\t" << ending << "\t" << (float) cnt / (float) numTerm[i] << endl; //<< "\n"; } } cout<<"totol sentence:"<<processedCount<<endl; cout<<"total suffix:"<<numEndings<<endl; return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pTgNt" << endl; for(int n = 0 ; n < MAXNUMTS ; n++) numTerm[n] = 0; ECString resultsString(path); resultsString += "endings.txt"; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); int sentenceCount = 0; int wordCount = 0; int processedCount = 0; int i, j; for(i = 0 ; i < MAXNUMTS ; i++) for(j = 0 ; j < MAXNUMNTS ; j++) data[i][j] = 0; i = 0; while(cin) { if(i%10000 == 0) cerr << i << endl; //if(i > 1000) break; InputTree parse; cin >> parse; if(!cin) break; if(parse.length() == 0) break; const Term* resTerm = addWwData(&parse); processedCount++; wordCount += parse.length(); i++; } ofstream resultsStream(resultsString.c_str()); assert(resultsStream); int totNt[MAXNUMTS]; for(i = 0 ; i < MAXNUMTS ; i++) totNt[i] = 0; for(i = 0 ; i <= Term::lastTagInt() ; i++) { for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++) totNt[j] += data[i][j]; } resultsStream << numEndings << "\n"; for(i = 0 ; i < MAXNUMTS ; i++) { endMap::iterator emi = endData[i].begin(); for( ; emi != endData[i].end() ; emi++) { ECString ending = (*emi).first; int cnt = (*emi).second; resultsStream << i << "\t" << ending << "\t" << (float) cnt / (float) numTerm[i] << endl; //<< "\n"; } } return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); ECString path(args.arg(0)); cerr << "At start of pUgT" << endl; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); int sentenceCount = 0; int i, j; for(i = 0 ; i < MAXNUMTS ; i++) { posCounts[i] = 0; posCapCounts[i] = 0; posDenoms[i] = 0; posUCounts[i] = 0; posDashCounts[i] = 0; } for(i = 0 ; i < MAXNUMTS ; i++) totCounts[i] = 0; i = 0; for( ; ; ) { if(i++%10000 == 1) cerr << i << endl; //if(i > 1000) break; InputTree parse; cin >> parse; //cerr << parse << endl; if(parse.length() == 0) break; if(!cin) break; curSent = &parse; addWwData(&parse); sentenceCount++; } ECString resultsString(path); resultsString += "pUgT.txt"; ofstream resultsStream(resultsString.c_str()); assert(resultsStream); /* we print out p(unknown|tag) p(Capital|tag) p(hasDash|tag, unknown) note for Capital the denom is different because we ignore the first two words of the sentence */ int nm = Term::lastTagInt()+1; for(i = 0 ; i < nm ; i++) { resultsStream << i << "\t"; float pugt = 0; float pudenom = (float)posDenoms[i]; if(pudenom > 0) pugt = (float)posUCounts[i]/pudenom; resultsStream << pugt << "\t"; if(posCounts[i] == 0) resultsStream << 0 << "\t"; else resultsStream << (float) posCapCounts[i]/ (float)posCounts[i] << "\t"; if(posUCounts[i] == 0) resultsStream << 0; else resultsStream << (float)posDashCounts[i]/posUCounts[i] ; resultsStream << endl; } ECString resultsString2(path); resultsString2 += "nttCounts.txt"; ofstream resultsStream2(resultsString2.c_str()); assert(resultsStream2); for(i = 0 ; i <= Term::lastNTInt() ; i++) { resultsStream2 << i << "\t"; resultsStream2 << totCounts[i] << "\n"; } return 0; }