std::vector<TskCarveExtractScalpel::CarvedFile> TskCarveExtractScalpel::parseCarvingResultsFile(int unallocImgId, const std::string &resultsFilePath) const
{
    try
    {
        std::vector<CarvedFile> carvedFiles;

        Poco::File resultsFile(resultsFilePath);
        if (!resultsFile.exists())
        {
            std::stringstream msg;
            msg << "TskCarveExtractScalpel::parseCarvingResultsFile : could not find Scalpel carving results file for unalloc img id " << unallocImgId;
            throw TskException(msg.str());
        }
        
        std::ifstream resultsStream(resultsFilePath.c_str());
        if (!resultsStream)
        {
            std::stringstream msg;
            msg << "TskCarveExtractScalpel::parseCarvingResultsFile : unable to open Scalpel carving results file for unalloc img id " << unallocImgId;
            throw TskException(msg.str());
        }

        // Discard all of the file up to and including the header for the carved files list.
        std::string line;
        while (std::getline(resultsStream, line) && line.find("Extracted From") == std::string::npos) {} // empty body is intentional

        // Parse the files list.
        const std::size_t numberOfFileFields = 5;
        while (std::getline(resultsStream, line))
        {
            // Tokenize the next line of the results file and see if it is part of the files list by checking the number of tokens.
            Poco::StringTokenizer tokenizer(line, "\t ", Poco::StringTokenizer::TOK_IGNORE_EMPTY | Poco::StringTokenizer::TOK_TRIM); 
            if (tokenizer.count() != numberOfFileFields)
            {
                // No more files in the files list.
                break;
            }

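            // The five whitespace-separated fields appear to be file name, start offset, chop flag, length, and source image; only tokens 0, 1, and 3 are kept.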
            carvedFiles.push_back(CarvedFile(unallocImgId, tokenizer[0], tokenizer[1], tokenizer[3]));
        }

        resultsStream.close();

        return carvedFiles;
    }
    catch (Poco::Exception &ex)
    {
        std::stringstream msg;
        msg << "TskCarveExtractScalpel::parseCarvingResultsFile : Poco exception: " <<  ex.displayText();
        throw TskException(msg.str());
    }
}
Example #2
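This example appears to be a parser-training pass (pHsgt): it reads parse trees from standard input, tallies word/part-of-speech counts, and writes per-word tag statistics to a pSgT.txt results stream.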
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pHsgt" << endl;

  for(int n = 0 ; n < MAXNUMNTS ; n++)
    numTerm[n] = 0;

  Term::init( path );
  readHeadInfo(path);

  int sentenceCount = 0;

  ECString s1lex("^^");
  ECString s1nm("S1");
  int s1Int = Term::get(s1nm)->toInt();
	
  UnitRules ur;
  ur.init();
  while(cin)
    {
      //if(sentenceCount > 4000) break;
      if(sentenceCount%10000 == 0) cerr << sentenceCount << endl;
      InputTree  parse;
      cin >> parse;
      //cerr << parse << endl;
      if(!cin) break;
      if(parse.length() == 0) break;
      EcSPairs wtList;
      parse.make(wtList);
      InputTree* par = &parse;

      addWwData(par);
      incrWordData(s1Int, s1lex);
      ur.gatherData(par);
      sentenceCount++;
    }
  ECString resultsString(path);
  resultsString += "pSgT.txt";
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);

  int numWords = 0;
  resultsStream << "       \n";  //leave space for number of words;
  resultsStream.precision(3);
  ECString lastWord;
  int wordFreq = 0;
  WordMap::iterator wmi = wordMap.begin();
  resultsStream << wordMap.size() << "\n\n";
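  // For each word: the word itself, then (tag, count/tag-total) pairs, then "| total-count".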
  for( ; wmi != wordMap.end() ; wmi++)
    {
      ECString w = (*wmi).first;
      resultsStream << w << "\t";
      PosD& posd = (*wmi).second;
      PosD::iterator pdi = posd.begin();
      int count = 0;
      for( ; pdi != posd.end(); pdi++)
	{
	  int posInt = (*pdi).first;
	  int c = (*pdi).second;
	  count += c;
	  float p = (float)c/(float)numTerm[posInt];
	  resultsStream << posInt << " " << p << " ";
	}
      resultsStream << "| " << count << "\n";
    }
  ur.setData(path);
  return 0;
}
Example #3
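This example is a naive Bayes sequence classifier: it reads query fragments, scores them against k-mer models in memory-bounded batches, writes each batch's results to a temporary file, and finally concatenates the temporary files into the requested results stream.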
int main(int argc, char* argv[])
{
    // Parse command-line arguments
    Parameters parameters;
    bool bParsed = parseCommandLine(argc, argv, parameters);

    if(!bParsed || parameters.bShowHelp || argc == 1)
    {
        help();
        return 0;
    }
    else if(parameters.bShowVersion)
    {
        std::cout << "Naive Bayes Classify v1.0.5 by Donovan Parks, Norm MacDonald, and Rob Beiko." << std::endl;
        return 0;
    }
    else if(parameters.bShowContactInfo)
    {
        std::cout << "Comments, suggestions, and bug reports can be sent to Donovan Parks ([email protected])." << std::endl;
        return 0;
    }
    else if(parameters.queryFile.empty() || parameters.modelFile.empty() || parameters.resultsFile.empty())
    {
        std::cout << "Must specify query (-q), model (-m), and result (-r) file." << std::endl << std::endl;
        help();
        return 0;
    }

    bool bRecordAllModels = false;
    if(parameters.topModels <= 0)
    {
        bRecordAllModels = true;
        parameters.topModels = 0;
    }

    // Get model k-mer length
    if(parameters.verbose >= 1)
        std::cout << "Determining n-mer length..." << std::endl;

    std::ifstream tempStream(parameters.modelFile.c_str(), std::ios::in);
    if(tempStream.fail())
    {
        std::cout << "Failed to open model file: " << parameters.modelFile << std::endl << std::endl;
        return -1;
    }
    std::string line;
    std::getline(tempStream, line);
    KmerModel tempModel(line);
    uint kmerLength = tempModel.kmerLength();
    if(parameters.verbose >= 1)
        std::cout << "  n-mer length: " << kmerLength << std::endl << std::endl;

    // Read query fragments

    if(parameters.verbose >= 1)
        std::cout << "Reading query fragments..." << std::endl;

    char* buffer = NULL;
    std::vector<SeqInfo> querySeqs;
    FastaIO fastaIO;
    bool bSuccess = fastaIO.readSeqs(parameters.queryFile, querySeqs, buffer, parameters.verbose);
    if(!bSuccess)
    {
        std::cout << "Failed to open query fragment file: " << parameters.queryFile << std::endl;
        return -1;
    }
    if(parameters.verbose >= 1)
        std::cout << "  Number of query fragments: " << querySeqs.size() << std::endl << std::endl;

    // Classify query fragments in batches in order to keep memory requirements within reason (~ 1GB)
    if(parameters.verbose >= 1)
        std::cout << "Processing query fragments in batches of " << parameters.batchSize << "." << std::endl << std::endl;

    KmerCalculator kmerCalculator(kmerLength);
    for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum)
    {
        if(parameters.verbose >= 1)
            std::cout << "Batch #" << (batchNum+1) << std::endl;

        // get k-mers for each query fragment
        if(parameters.verbose >= 1)
            std::cout << "  Calculating n-mers in query fragment: " << std::endl;

        std::vector< std::vector<uint> > queryKmerProfiles;
        queryKmerProfiles.reserve(parameters.batchSize);
        for(uint seqIndex = batchNum*parameters.batchSize;
                seqIndex < std::min(ulong(querySeqs.size()), ulong(batchNum+1)*parameters.batchSize);
                ++seqIndex)
        {
            if(parameters.verbose >= 3)
                std::cout << querySeqs.at(seqIndex).seqId << std::endl;
            else if (seqIndex % 5000 == 0 && parameters.verbose >= 1)
                std::cout << "." << std::flush;

            std::vector<uint> profile;
            kmerCalculator.extractForwardKmers(querySeqs.at(seqIndex), profile);
            queryKmerProfiles.push_back(profile);
        }
        if(parameters.verbose >= 1)
            std::cout << std::endl;

        // apply each model to each query sequence
        if(parameters.verbose >= 1)
            std::cout << "  Applying models to query sequences: " << std::endl;

        std::ifstream modelStream(parameters.modelFile.c_str(), std::ios::in);

        uint modelNum = 0;

        std::vector<std::string> modelNames;
        std::vector< std::list<TopModel> > topModelsPerFragment(queryKmerProfiles.size());
        std::vector< std::vector<float> > modelLogLikelihoods;
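        // Either the best-scoring models per fragment, or (when recording all models) a full per-model log-likelihood table.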
        std::string line;
        while(std::getline(modelStream, line))
        {
            if(line.empty())
                break;

            if(modelNum % 200 == 0 && parameters.verbose >= 1)
                std::cout << " " << modelNum << std::flush;

            KmerModel kmerModel(line);
            modelNames.push_back(kmerModel.name());
            if(parameters.verbose >= 2)
            {
                kmerModel.printModelInfo(std::cout);
                std::cout << std::endl;
            }

            ulong size = 0;
            if(bRecordAllModels)
                size = queryKmerProfiles.size();
            std::vector<float> logLikelihoods(size);
            for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex)
            {
                SeqInfo querySeqInfo = querySeqs[seqIndex + batchNum*parameters.batchSize];
                float logLikelihood = kmerModel.classify(querySeqInfo, queryKmerProfiles[seqIndex]);

                // record models with highest log likelihood
                if(bRecordAllModels)
                {
                    logLikelihoods[seqIndex] = logLikelihood;
                }
                else
                {
                    std::list<TopModel> topModels = topModelsPerFragment.at(seqIndex);

                    if(topModels.size() == 0)
                        topModels.push_front(TopModel(modelNum, logLikelihood));

                    std::list<TopModel>::iterator it;
                    bool bInserted = false;
                    for(it = topModels.begin(); it != topModels.end(); it++)
                    {
                        if(logLikelihood > it->logLikelihood)
                        {
                            topModels.insert(it, TopModel(modelNum, logLikelihood));
                            bInserted = true;
                            break;
                        }
                    }

                    if((int)topModels.size() < parameters.topModels && !bInserted)
                        topModels.push_back(TopModel(modelNum, logLikelihood));
                    else if((int)topModels.size() > parameters.topModels)
                        topModels.pop_back();

                    topModelsPerFragment.at(seqIndex) = topModels;
                }
            }

            if(bRecordAllModels)
                modelLogLikelihoods.push_back(logLikelihoods);

            modelNum++;
        }
        if(parameters.verbose >= 1)
            std::cout << std::endl;

        // write out classification
        if(parameters.verbose >= 1)
            std::cout << "  Writing out classification results." << std::endl << std::endl;

        std::stringstream outputTempResults;
        outputTempResults << "./batch_" << batchNum << "." << parameters.tempExtension;
        std::ofstream fout(outputTempResults.str().c_str(), std::ios::out);
        if(fout.fail())
        {
            std::cout << "Failed to write temporary results file: " << outputTempResults.str() << std::endl;
            return -1;
        }

        // check if all model results are to be written out
        if(bRecordAllModels)
        {
            if(batchNum == 0)
            {
                fout << "Fragment Id" << "\t" << "Length" << "\t" << "Valid n-mers";
                for(uint modelIndex = 0; modelIndex < modelNames.size(); ++modelIndex)
                    fout << "\t" << modelNames[modelIndex];
                fout << std::endl;
            }

            for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex)
            {
                SeqInfo querySeqInfo = querySeqs.at(seqIndex + batchNum*parameters.batchSize);

                fout << querySeqInfo.seqId << "\t" << querySeqInfo.length << "\t" << querySeqInfo.validKmers;

                for(uint modelIndex = 0; modelIndex < modelNames.size(); ++modelIndex)
                    fout << "\t" << modelLogLikelihoods[modelIndex][seqIndex];
                fout << std::endl;
            }
        }
        else
        {
            for(uint seqIndex = 0; seqIndex < queryKmerProfiles.size(); ++seqIndex)
            {
                SeqInfo querySeqInfo = querySeqs.at(seqIndex + batchNum*parameters.batchSize);

                fout << querySeqInfo.seqId << "\t" << querySeqInfo.length << "\t" << querySeqInfo.validKmers;

                std::list<TopModel>::iterator it;
                for(it = topModelsPerFragment.at(seqIndex).begin(); it != topModelsPerFragment.at(seqIndex).end(); it++)
                    fout << "\t" << modelNames[it->modelNum] << "\t" << it->logLikelihood;

                fout << std::endl;
            }
        }

        fout.close();
    }

    // free memory allocated to hold query fragment data
    delete[] buffer;

    // Concatenate result files
    if(parameters.verbose >= 1)
        std::cout << "Building results file: ";

    std::ofstream resultsStream(parameters.resultsFile.c_str(), std::ios::out | std::ios::binary);
    for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum)
    {
        if(parameters.verbose >= 1)
            std::cout << "." << std::flush;

        std::stringstream tempResultFile;
        tempResultFile << "./batch_" << batchNum  << "." << parameters.tempExtension;
        std::ifstream tempStream(tempResultFile.str().c_str(), std::ios::binary);
        if(tempStream.fail() || tempStream.bad())
        {
            std::cout << "Failed to open file: " << tempResultFile.str() << std::endl;
            return -1;
        }

        // calculate size of file
        tempStream.seekg(0, std::ios::end);
        ulong fileSize = tempStream.tellg();
        tempStream.seekg(0, std::ios::beg);

        // write out data in reasonable sized chunks
        ulong chunkSize = 64*1024*1024;

        // allocate memory for reading file
        char* tempBuffer = new (std::nothrow) char[chunkSize];  // nothrow form returns NULL on failure rather than throwing
        if(tempBuffer == NULL)
        {
            std::cout << std::endl << "Failed to allocate memory required by file: " << tempResultFile.str() << std::endl;
            return -1;
        }

        for(uint chunk = 0; chunk < ceil(float(fileSize) / chunkSize); ++chunk)
        {
            ulong currentChunkSize = std::min(chunkSize, fileSize - chunk*chunkSize);

            // read file into buffer
            tempStream.read(tempBuffer, currentChunkSize);
            if(tempStream.fail() || tempStream.bad())
            {
                std::cout << std::endl << "Failed to read data from " << tempResultFile.str() << std::endl;
                return -1;
            }

            resultsStream.write(tempBuffer, currentChunkSize);
            resultsStream.flush();
        }

        tempStream.close();
        delete[] tempBuffer;
    }
    resultsStream.close();

    if(parameters.verbose >= 1)
    {
        std::cout << std::endl;
        std::cout << "Done." << std::endl;
    }

    for(uint batchNum = 0; batchNum < ceil(double(querySeqs.size()) / parameters.batchSize); ++batchNum)
    {
        std::stringstream filename;
        filename << "./batch_" << batchNum  << "." << parameters.tempExtension;
        std::remove(filename.str().c_str());
    }

    return 0;
}
Example #4
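Evidently a companion training pass (pSfgt): it counts word endings per part-of-speech tag over parsed input and writes each ending's relative frequency to an endings.txt results stream.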
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pSfgt" << endl;

  for(int n = 0 ; n < 140 ; n++)
    numTerm[n] = 0;

  ECString resultsString(path);
  resultsString += "endings.txt";

  Term::init( path );
  if(args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path); //???;

  int sentenceCount = 0;
  int wordCount = 0;
  int processedCount = 0;

  /*int i, j;
  for(i = 0 ; i < 60 ; i++)
    for(j = 0 ; j < 30 ; j++)
      data[i][j] = 0;
  */
  int i = 0;
  while(cin)
    {
      if(i++%5000 == 1) cerr << i << endl;
      InputTree  parse;
      cin >> parse;
      if(!cin) break;
      if(parse.length() == 0) continue; // empty parses are skipped (the !cin case broke out above)
      addWwData(&parse);
      processedCount++;
      wordCount += parse.length();
    }
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);
  /*int  totNt[30];
  for(i = 0 ; i < 30 ; i++) totNt[i] = 0;
  for(i = 0 ; i <= Term::lastTagInt() ; i++)
    {
      for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++)
	totNt[j] += data[i][j];
    }
    */
  resultsStream << numEndings << "\n";

  for(i = 0 ; i < 140 ; i++)
    {
      endMap::iterator emi = endData[i].begin();
      for( ; emi != endData[i].end() ; emi++)
	{
	  ECString ending = (*emi).first;
	  int cnt = (*emi).second;
	  resultsStream << i << "\t" << ending << "\t"
			<< (float) cnt / (float) numTerm[i]
			<< endl;
	    //<< "\n";

	}
    }
  cout<<"totol sentence:"<<processedCount<<endl;
  cout<<"total suffix:"<<numEndings<<endl;

  return 0;
}
Example #5
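A near-identical pass (pTgNt) that gathers the same word-ending statistics, along with tag/nonterminal co-occurrence counts, before writing endings.txt.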
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pTgNt" << endl;

  for(int n = 0 ; n < MAXNUMTS ; n++)
    numTerm[n] = 0;

  ECString resultsString(path);
  resultsString += "endings.txt";

  Term::init( path );  
  if(args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path);

  int sentenceCount = 0;
  int wordCount = 0;
  int processedCount = 0;

  int i, j;
  for(i = 0 ; i < MAXNUMTS ; i++)
    for(j = 0 ; j < MAXNUMNTS ; j++)
      data[i][j] = 0;

  i = 0;
  while(cin)
    {
      if(i%10000 == 0) cerr << i << endl;
      //if(i > 1000) break;
      InputTree  parse;
      cin >> parse;
      if(!cin) break;
      if(parse.length() == 0) break;
      const Term* resTerm = addWwData(&parse);
      processedCount++;
      wordCount += parse.length();
      i++;
    }
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);
  int  totNt[MAXNUMTS];
  for(i = 0 ; i < MAXNUMTS ; i++) totNt[i] = 0;
  for(i = 0 ; i <= Term::lastTagInt() ; i++)
    {
      for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++)
	totNt[j] += data[i][j];
    }
  resultsStream << numEndings << "\n";
  for(i = 0 ; i < MAXNUMTS ; i++)
    {
      endMap::iterator emi = endData[i].begin();
      for( ; emi != endData[i].end() ; emi++)
	{
	  ECString ending = (*emi).first;
	  int cnt = (*emi).second;
	  resultsStream << i << "\t" << ending << "\t"
			<< (float) cnt / (float) numTerm[i]
			<< endl;
	    //<< "\n";
	}
    }
  return 0;
}
Example #6
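The pUgT pass estimates p(unknown|tag), p(capitalized|tag), and p(has-dash|tag, unknown) for each tag, writing them to pUgT.txt and per-nonterminal totals to nttCounts.txt.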
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  ECString path(args.arg(0));
  cerr << "At start of pUgT" << endl;

  Term::init( path );  
  if(args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path);

  int sentenceCount = 0;

  int i, j;
  for(i = 0 ; i < MAXNUMTS ; i++)
    {
      posCounts[i] = 0;
      posCapCounts[i] = 0;
      posDenoms[i] = 0;
      posUCounts[i] = 0;
      posDashCounts[i] = 0;
    }
  for(i = 0 ; i < MAXNUMTS ; i++) totCounts[i] = 0;

  i = 0;
  for( ; ; )
    {
      if(i++%10000 == 1) cerr << i << endl;
      //if(i > 1000) break;
      InputTree  parse;
      cin >> parse;
      //cerr << parse << endl;
      if(parse.length() == 0) break;
      if(!cin) break;
      curSent = &parse;
      addWwData(&parse);
      sentenceCount++;
    }

  ECString resultsString(path);
  resultsString += "pUgT.txt";
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);
  /* we print out p(unknown|tag)    p(Capital|tag)   p(hasDash|tag, unknown)
     note for Capital the denom is different because we ignore the first
     two words of the sentence */
  int nm = Term::lastTagInt()+1;
  for(i = 0 ; i < nm ; i++)
    {
      resultsStream << i << "\t";
      float pugt = 0;
      float pudenom = (float)posDenoms[i];
      if(pudenom > 0) pugt = (float)posUCounts[i]/pudenom;
      resultsStream << pugt << "\t";
      if(posCounts[i] == 0) resultsStream << 0 << "\t";
      else
	resultsStream << (float) posCapCounts[i]/ (float)posCounts[i] << "\t";
      if(posUCounts[i] == 0) resultsStream << 0;
      else resultsStream << (float)posDashCounts[i]/posUCounts[i] ;
      resultsStream << endl;
    }
  ECString resultsString2(path);
  resultsString2 += "nttCounts.txt";
  ofstream     resultsStream2(resultsString2.c_str());
  assert(resultsStream2);
  for(i = 0 ; i <= Term::lastNTInt() ; i++)
    {
      resultsStream2 << i << "\t";
      resultsStream2 << totCounts[i] << "\n";
    }
  return 0;
}