C++ (Cpp) SentenceAlignmentWithSyntax Examples

Programming Language: C++ (Cpp)

Examples at hotexamples.com: 2

C++ (Cpp) SentenceAlignmentWithSyntax - 2 examples found. These are the top rated real world C++ (Cpp) examples of SentenceAlignmentWithSyntax extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

create(2)

Frequently Used Methods

create (2)

Example #1

Show file

File: extract-rules-main.cpp Project: 840462307cn/mosesdecoder

int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
  int sentenceOffset = 0;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["

         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource |  --NoNonTermFirstWord | --NoFractionalCounting"
         << " | --UnpairedExtractFormat"
         << " | --ConditionOnTargetLHS ]"
         << " | --BoundaryRules[" << options.boundaryRules << "]"
         << " | --FlexibilityScore\n";

    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;
    }
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(0);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(0);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      options.pcfgScore = true;
    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
      options.unpairedExtractFormat = true;
    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
      options.conditionOnTargetLhs = true;
    } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
      options.flexScoreFlag = true;
    } else if (strcmp(argv[i],"-threads") == 0 ||
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
#ifdef WITH_THREADS
      thread_count = atoi(argv[++i]);
#else
      cerr << "thread support not compiled in." << '\n';
      exit(1);
#endif
    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
      options.boundaryRules = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  Moses::OutputFileStream extractFileContext;
  Moses::OutputFileStream extractFileContextInv;
  extractFile.Open((fileNameExtract  + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());

  if (options.flexScoreFlag) {
    string fileNameExtractContext = fileNameExtract + ".context" + (options.gzOutput?".gz":"");
    extractFileContext.Open(fileNameExtractContext.c_str());
    if (!options.onlyDirectFlag) {
      string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.gzOutput?".gz":"");
      extractFileContextInv.Open(fileNameExtractContextInv.c_str());
    }
  }

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

  // loop through all sentence pairs
  size_t i=sentenceOffset;
  string targetString, sourceString, alignmentString;

  while(getline(*tFileP, targetString)) {
    i++;

    getline(*sFileP, sourceString);
    getline(*aFileP, alignmentString);

    if (i%1000 == 0) cerr << i << " " << flush;

    SentenceAlignmentWithSyntax sentence
    (targetLabelCollection, sourceLabelCollection,
     targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alingment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(sentence);
      }
      ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv, extractFileContext, extractFileContextInv);
      task->Run();
      delete task;
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  tFile.Close();
  sFile.Close();
  aFile.Close();
  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }

  if (options.flexScoreFlag) {
    extractFileContext.Close();
    if (!options.onlyDirectFlag) extractFileContextInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}

Example #2

Show file

File: extract-rules.cpp Project: filipg/mosesdecoder

int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
#ifdef WITH_THREADS
         << " --threads NUM |"
#endif
         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --OutputNTLengths"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource |  --NoNonTermFirstWord | --NoFractionalCounting ]\n";
    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    }
    else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;  
    } 
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(0);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(0);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
      options.outputNTLengths = true;
#ifdef WITH_THREADS
    } else if (strcmp(argv[i],"-threads") == 0 || 
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
      thread_count = atoi(argv[++i]);
#endif
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open((fileNameExtract  + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());

  // output into file
  Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
  Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

#ifdef WITH_THREADS
  // set up thread pool
  Moses::ThreadPool pool(thread_count);
  pool.SetQueueLimit(1000);
#endif

  // loop through all sentence pairs
  size_t i=0;
  while(true) {
    i++;
    if (i%1000 == 0) cerr << "." << flush;
    if (i%10000 == 0) cerr << ":" << flush;
    if (i%100000 == 0) cerr << "!" << flush;
    char targetString[LINE_MAX_LENGTH];
    char sourceString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (tFileP->eof()) break;
    SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);

    SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
      (targetLabelCollection, sourceLabelCollection, 
       targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alingment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence->create(targetString, sourceString, alignmentString, i)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(*sentence);
      }
      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
#ifdef WITH_THREADS
      if (thread_count == 1) {
        task->Run();
        delete task;
      }
      else {
        pool.Submit(task);
      }
#else
      task->Run();
      delete task;
#endif
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

#ifdef WITH_THREADS
  // wait for all threads to finish
  pool.Stop(true);
#endif

  tFile.Close();
  sFile.Close();
  aFile.Close();
  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}