Ejemplo n.º 1
0
bool RuleTableLoaderStandard::Load(FormatType format
                                , const std::vector<FactorType> &input
                                , const std::vector<FactorType> &output
                                , const std::string &inFile
                                , const std::vector<float> &weight
                                , size_t /* tableLimit */
                                , const LMList &languageModels
                                , const WordPenaltyProducer* wpProducer
                                , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) { break; }

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);
    StringPiece alignString(*++pipes);
    // TODO(bhaddow) efficiently handle default instead of parsing this string every time.  
    StringPiece ruleCountString = ++pipes ? *pipes : StringPiece("1 1");
    
    if (++pipes) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      char *err_ind;
      scoreVector.push_back(strtod(s->data(), &err_ind));
      UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count);
    }
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
    targetPhrase->SetSourcePhrase(sourcePhrase);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString, sourcePhrase);
    targetPhrase->SetTargetLHS(targetLHS);
    
    targetPhrase->SetRuleCount(ruleCountString, scoreVector[0]);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
    
    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer);

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
Ejemplo n.º 2
0
bool HyperTreeLoader::Load(AllOptions const& opts,
                           const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
                           HyperTree &trie,
                           boost::unordered_set<std::size_t> &sourceTermSet)
{
  PrintUserTime(std::string("Start loading HyperTree"));

  sourceTermSet.clear();

  std::size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  std::vector<float> scoreVector;
  StringPiece line;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  HyperPathLoader hyperPathLoader;

  Phrase dummySourcePhrase;
  {
    Word *lhs = NULL;
    dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs);
    delete lhs;
  }

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourceString(*pipes);
    StringPiece targetString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    ++pipes;  // counts

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const std::size_t numScoreComponents = ff.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // Source-side
    HyperPath sourceFragment;
    hyperPathLoader.Load(sourceString, sourceFragment);
    ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);

    // Target-side
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
    Word *targetLHS = NULL;
    targetPhrase->CreateFromString(Output, output, targetString, &targetLHS);
    targetPhrase->SetTargetLHS(targetLHS);
    targetPhrase->SetAlignmentInfo(alignString);

    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ff, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
    targetPhrase->EvaluateInIsolation(dummySourcePhrase,
                                      ff.GetFeaturesToApply());

    // Add rule to trie.
    TargetPhraseCollection::shared_ptr phraseColl
    = GetOrCreateTargetPhraseCollection(trie, sourceFragment);
    phraseColl->Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  if (ff.GetTableLimit()) {
    SortAndPrune(trie, ff.GetTableLimit());
  }

  return true;
}
Ejemplo n.º 3
0
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , size_t /* tableLimit */
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    if (++pipes) {
      StringPiece str(*pipes); //counts
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);

    // source
    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ruleTable, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
Ejemplo n.º 4
0
inline children launch_pipeline(const Entries &entries) 
{ 
    BOOST_ASSERT(entries.size() >= 2); 

    children cs; 
    detail::file_handle fhinvalid; 

    boost::scoped_array<detail::pipe> pipes(new detail::pipe[entries.size() - 1]); 

#if defined(BOOST_POSIX_API) 
    { 
        typename Entries::size_type i = 0; 
        const typename Entries::value_type::context_type &ctx = entries[i].context; 

        detail::info_map infoin, infoout; 

        if (ctx.stdin_behavior.get_type() != stream_behavior::close) 
        { 
            detail::stream_info si = detail::stream_info(ctx.stdin_behavior, false); 
            infoin.insert(detail::info_map::value_type(STDIN_FILENO, si)); 
        } 

        BOOST_ASSERT(ctx.stdout_behavior.get_type() == stream_behavior::close); 
        detail::stream_info si2(close_stream(), true); 
        si2.type_ = detail::stream_info::use_handle; 
        si2.handle_ = pipes[i].wend().release(); 
        infoout.insert(detail::info_map::value_type(STDOUT_FILENO, si2)); 

        if (ctx.stderr_behavior.get_type() != stream_behavior::close) 
        { 
            detail::stream_info si = detail::stream_info(ctx.stderr_behavior, true); 
            infoout.insert(detail::info_map::value_type(STDERR_FILENO, si)); 
        } 

        detail::posix_setup s; 
        s.work_directory = ctx.work_directory; 

        pid_t pid = detail::posix_start(entries[i].executable, entries[i].arguments, ctx.environment, infoin, infoout, s); 

        detail::file_handle fhstdin; 

        if (ctx.stdin_behavior.get_type() == stream_behavior::capture) 
        { 
            fhstdin = detail::posix_info_locate_pipe(infoin, STDIN_FILENO, false); 
            BOOST_ASSERT(fhstdin.valid()); 
        } 

        cs.push_back(child(pid, fhstdin, fhinvalid, fhinvalid)); 
    } 

    for (typename Entries::size_type i = 1; i < entries.size() - 1; ++i) 
    { 
        const typename Entries::value_type::context_type &ctx = entries[i].context; 
        detail::info_map infoin, infoout; 

        BOOST_ASSERT(ctx.stdin_behavior.get_type() == stream_behavior::close); 
        detail::stream_info si1(close_stream(), false); 
        si1.type_ = detail::stream_info::use_handle; 
        si1.handle_ = pipes[i - 1].rend().release(); 
        infoin.insert(detail::info_map::value_type(STDIN_FILENO, si1)); 

        BOOST_ASSERT(ctx.stdout_behavior.get_type() == stream_behavior::close); 
        detail::stream_info si2(close_stream(), true); 
        si2.type_ = detail::stream_info::use_handle; 
        si2.handle_ = pipes[i].wend().release(); 
        infoout.insert(detail::info_map::value_type(STDOUT_FILENO, si2)); 

        if (ctx.stderr_behavior.get_type() != stream_behavior::close) 
        { 
            detail::stream_info si = detail::stream_info(ctx.stderr_behavior, true); 
            infoout.insert(detail::info_map::value_type(STDERR_FILENO, si)); 
        } 

        detail::posix_setup s; 
        s.work_directory = ctx.work_directory; 

        pid_t pid = detail::posix_start(entries[i].executable, entries[i].arguments, ctx.environment, infoin, infoout, s); 

        cs.push_back(child(pid, fhinvalid, fhinvalid, fhinvalid)); 
    } 

    { 
        typename Entries::size_type i = entries.size() - 1; 
        const typename Entries::value_type::context_type &ctx = entries[i].context; 

        detail::info_map infoin, infoout; 

        BOOST_ASSERT(ctx.stdin_behavior.get_type() == stream_behavior::close); 
        detail::stream_info si1(close_stream(), false); 
        si1.type_ = detail::stream_info::use_handle; 
        si1.handle_ = pipes[i - 1].rend().release(); 
        infoin.insert(detail::info_map::value_type(STDIN_FILENO, si1)); 

        if (ctx.stdout_behavior.get_type() != stream_behavior::close) 
        { 
            detail::stream_info si = detail::stream_info(ctx.stdout_behavior, true); 
            infoout.insert(detail::info_map::value_type(STDOUT_FILENO, si)); 
        } 

        if (ctx.stderr_behavior.get_type() != stream_behavior::close) 
        { 
            detail::stream_info si = detail::stream_info(ctx.stderr_behavior, true); 
            infoout.insert(detail::info_map::value_type(STDERR_FILENO, si)); 
        } 

        detail::posix_setup s; 
        s.work_directory = ctx.work_directory; 

        pid_t pid = detail::posix_start(entries[i].executable, entries[i].arguments, ctx.environment, infoin, infoout, s); 

        detail::file_handle fhstdout, fhstderr; 

        if (ctx.stdout_behavior.get_type() == stream_behavior::capture) 
        { 
            fhstdout = detail::posix_info_locate_pipe(infoout, STDOUT_FILENO, true); 
            BOOST_ASSERT(fhstdout.valid()); 
        } 

        if (ctx.stderr_behavior.get_type() == stream_behavior::capture) 
        { 
            fhstderr = detail::posix_info_locate_pipe(infoout, STDERR_FILENO, true); 
            BOOST_ASSERT(fhstderr.valid()); 
        } 

        cs.push_back(child(pid, fhinvalid, fhstdout, fhstderr)); 
    } 
#elif defined(BOOST_WINDOWS_API) 
    STARTUPINFOA si; 
    detail::win32_setup s; 
    s.startupinfo = &si; 

    { 
        typename Entries::size_type i = 0; 
        const typename Entries::value_type::context_type &ctx = entries[i].context; 

        detail::stream_info sii = detail::stream_info(ctx.stdin_behavior, false); 
        detail::file_handle fhstdin; 
        if (sii.type_ == detail::stream_info::use_pipe) 
            fhstdin = sii.pipe_->wend(); 

        BOOST_ASSERT(ctx.stdout_behavior.get_type() == stream_behavior::close); 
        detail::stream_info sio(close_stream(), true); 
        sio.type_ = detail::stream_info::use_handle; 
        sio.handle_ = pipes[i].wend().release(); 

        detail::stream_info sie(ctx.stderr_behavior, true); 

        s.work_directory = ctx.work_directory; 

        ::ZeroMemory(&si, sizeof(si)); 
        si.cb = sizeof(si); 
        PROCESS_INFORMATION pi = detail::win32_start(entries[i].executable, entries[i].arguments, ctx.environment, sii, sio, sie, s); 

        if (!::CloseHandle(pi.hThread)) 
            boost::throw_exception(boost::system::system_error(boost::system::error_code(::GetLastError(), boost::system::get_system_category()), "boost::process::launch_pipeline: CloseHandle failed")); 

        cs.push_back(child(pi.dwProcessId, fhstdin, fhinvalid, fhinvalid, detail::file_handle(pi.hProcess))); 
    } 

    for (typename Entries::size_type i = 1; i < entries.size() - 1; ++i) 
    { 
        const typename Entries::value_type::context_type &ctx = entries[i].context; 

        BOOST_ASSERT(ctx.stdin_behavior.get_type() == stream_behavior::close); 
        detail::stream_info sii(close_stream(), false); 
        sii.type_ = detail::stream_info::use_handle; 
        sii.handle_ = pipes[i - 1].rend().release(); 

        detail::stream_info sio(close_stream(), true); 
        sio.type_ = detail::stream_info::use_handle; 
        sio.handle_ = pipes[i].wend().release(); 

        detail::stream_info sie(ctx.stderr_behavior, true); 

        s.work_directory = ctx.work_directory; 

        ::ZeroMemory(&si, sizeof(si)); 
        si.cb = sizeof(si); 
        PROCESS_INFORMATION pi = detail::win32_start(entries[i].executable, entries[i].arguments, ctx.environment, sii, sio, sie, s); 

        if (!::CloseHandle(pi.hThread)) 
            boost::throw_exception(boost::system::system_error(boost::system::error_code(::GetLastError(), boost::system::get_system_category()), "boost::process::launch_pipeline: CloseHandle failed")); 

        cs.push_back(child(pi.dwProcessId, fhinvalid, fhinvalid, fhinvalid, detail::file_handle(pi.hProcess))); 
    } 

    { 
        typename Entries::size_type i = entries.size() - 1; 
        const typename Entries::value_type::context_type &ctx = entries[i].context; 

        BOOST_ASSERT(ctx.stdin_behavior.get_type() == stream_behavior::close); 
        detail::stream_info sii(close_stream(), false); 
        sii.type_ = detail::stream_info::use_handle; 
        sii.handle_ = pipes[i - 1].rend().release(); 

        detail::file_handle fhstdout, fhstderr; 

        detail::stream_info sio(ctx.stdout_behavior, true); 
        if (sio.type_ == detail::stream_info::use_pipe) 
            fhstdout = sio.pipe_->rend(); 
        detail::stream_info sie(ctx.stderr_behavior, true); 
        if (sie.type_ == detail::stream_info::use_pipe) 
            fhstderr = sie.pipe_->rend(); 

        s.work_directory = ctx.work_directory; 

        ::ZeroMemory(&si, sizeof(si)); 
        si.cb = sizeof(si); 
        PROCESS_INFORMATION pi = detail::win32_start(entries[i].executable, entries[i].arguments, ctx.environment, sii, sio, sie, s); 

        if (!::CloseHandle(pi.hThread)) 
            boost::throw_exception(boost::system::system_error(boost::system::error_code(::GetLastError(), boost::system::get_system_category()), "boost::process::launch_pipeline: CloseHandle failed")); 

        cs.push_back(child(pi.dwProcessId, fhinvalid, fhstdout, fhstderr, detail::file_handle(pi.hProcess))); 
    } 
#endif 

    return cs; 
} 
bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
                          const std::vector<FactorType> &output,
                          const std::string &inFile,
                          const RuleTableFF &ff,
                          RuleTrie &trie)
{
    PrintUserTime(std::string("Start loading text phrase table. Moses format"));

    const StaticData &staticData = StaticData::Instance();
    // const std::string &factorDelimiter = staticData.GetFactorDelimiter();

    std::size_t count = 0;

    std::ostream *progress = NULL;
    IFVERBOSE(1) progress = &std::cerr;
    util::FilePiece in(inFile.c_str(), progress);

    // reused variables
    std::vector<float> scoreVector;
    StringPiece line;

    double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

    while(true) {
        try {
            line = in.ReadLine();
        } catch (const util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, "|||");
        StringPiece sourcePhraseString(*pipes);
        StringPiece targetPhraseString(*++pipes);
        StringPiece scoreString(*++pipes);

        StringPiece alignString;
        if (++pipes) {
            StringPiece temp(*pipes);
            alignString = temp;
        }

        if (++pipes) {
            StringPiece str(*pipes); //counts
        }

        bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
            continue;
        }

        scoreVector.clear();
        for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
            int processed;
            float score = converter.StringToFloat(s->data(), s->length(), &processed);
            UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
            scoreVector.push_back(FloorScore(TransformScore(score)));
        }
        const std::size_t numScoreComponents = ff.GetNumScoreComponents();
        if (scoreVector.size() != numScoreComponents) {
            UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                        << numScoreComponents << ") of score components on line " << count);
        }

        // parse source & find pt node

        // constituent labels
        Word *sourceLHS = NULL;
        Word *targetLHS;

        // create target phrase obj
        TargetPhrase *targetPhrase = new TargetPhrase(&ff);
        // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
        targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
        // source
        Phrase sourcePhrase;
        // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
        sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

        // rest of target phrase
        targetPhrase->SetAlignmentInfo(alignString);
        targetPhrase->SetTargetLHS(targetLHS);

        //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

        if (++pipes) {
            StringPiece sparseString(*pipes);
            targetPhrase->SetSparseScore(&ff, sparseString);
        }

        if (++pipes) {
            StringPiece propertiesString(*pipes);
            targetPhrase->SetProperties(propertiesString);
        }

        targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
        targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());

        TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
                trie, *sourceLHS, sourcePhrase);
        phraseColl.Add(targetPhrase);

        // not implemented correctly in memory pt. just delete it for now
        delete sourceLHS;

        count++;
    }

    // sort and prune each target phrase collection
    if (ff.GetTableLimit()) {
        SortAndPrune(trie, ff.GetTableLimit());
    }

    return true;
}
Ejemplo n.º 6
0
Variant HHVM_FUNCTION(proc_open,
                      const String& cmd,
                      const Array& descriptorspec,
                      VRefParam pipesParam,
                      const String& cwd /* = null_string */,
                      const Variant& env /* = null_variant */,
                      const Variant& other_options /* = null_variant */) {
  if (RuntimeOption::WhitelistExec && !check_cmd(cmd.data())) {
    return false;
  }
  if (cmd.size() != strlen(cmd.c_str())) {
    raise_warning("NULL byte detected. Possible attack");
    return false;
  }
  Variant pipes(pipesParam, Variant::WithRefBind{});

  std::vector<DescriptorItem> items;

  std::string scwd = "";
  if (!cwd.empty()) {
    scwd = cwd.c_str();
  } else if (!g_context->getCwd().empty()) {
    scwd = g_context->getCwd().c_str();
  }

  Array enva;

  if (env.isNull()) {
    // Build out an environment that conceptually matches what we'd
    // see if we were to iterate the environment and call getenv()
    // for each name.

    // Env vars defined in the hdf file go in first
    for (const auto& envvar : RuntimeOption::EnvVariables) {
      enva.set(String(envvar.first), String(envvar.second));
    }

    // global environment overrides the hdf
    for (char **env = environ; env && *env; env++) {
      char *p = strchr(*env, '=');
      if (p) {
        String name(*env, p - *env, CopyString);
        String val(p + 1, CopyString);
        enva.set(name, val);
      }
    }

    // and then any putenv() changes take precedence
    for (ArrayIter iter(g_context->getEnvs()); iter; ++iter) {
      enva.set(iter.first(), iter.second());
    }
  } else {
    enva = env.toArray();
  }

  pid_t child;

  if (LightProcess::Available()) {
    // light process available
    // there is no need to do any locking, because the forking is delegated
    // to the light process
    if (!pre_proc_open(descriptorspec, items)) return false;
    const int item_size = items.size();
    std::vector<int> created;
    created.reserve(item_size);
    std::vector<int> intended;
    intended.reserve(item_size);
    for (int i = 0; i < item_size; i++) {
      const auto& item = items[i];
      created.push_back(item.childend);
      intended.push_back(item.index);
    }

    std::vector<std::string> envs;
    for (ArrayIter iter(enva); iter; ++iter) {
      StringBuffer nvpair;
      nvpair.append(iter.first().toString());
      nvpair.append('=');
      nvpair.append(iter.second().toString());
      std::string tmp = nvpair.detach().c_str();
      if (tmp.find('\n') == std::string::npos) {
        envs.push_back(tmp);
      }
    }

    child = LightProcess::proc_open(cmd.c_str(), created, intended,
                                    scwd.c_str(), envs);
    assert(child);
    return post_proc_open(cmd, pipes, enva, items, child);
  } else {
    /* the unix way */
    Lock lock(DescriptorItem::s_mutex);
    if (!pre_proc_open(descriptorspec, items)) return false;
    child = fork();
    if (child) {
      // the parent process
      return post_proc_open(cmd, pipes, enva, items, child);
    }
  }

  assert(child == 0);
  /* this is the child process */

  /* close those descriptors that we just opened for the parent stuff,
   * dup new descriptors into required descriptors and close the original
   * cruft */
  for (auto& item : items) {
    item.dupChild();
  }
  if (scwd.length() > 0 && chdir(scwd.c_str())) {
    // chdir failed, the working directory remains unchanged
  }
  std::vector<String> senvs; // holding those char *
  char **envp = build_envp(enva, senvs);
  execle("/bin/sh", "sh", "-c", cmd.data(), NULL, envp);
  free(envp);
  _exit(127);
}
Ejemplo n.º 7
0
Variant HHVM_FUNCTION(proc_open,
                      const String& cmd,
                      const Array& descriptorspec,
                      VRefParam pipesParam,
                      const Variant& cwd /* = uninit_variant */,
                      const Variant& env /* = uninit_variant */,
                      const Variant& other_options /* = uninit_variant */) {
  if (RuntimeOption::WhitelistExec && !check_cmd(cmd.data())) {
    return false;
  }
  if (cmd.size() != strlen(cmd.c_str())) {
    raise_warning("NULL byte detected. Possible attack");
    return false;
  }
  Variant pipes(pipesParam, Variant::WithRefBind{});

  std::vector<DescriptorItem> items;

  std::string scwd = "";
  if (!cwd.isNull() && cwd.isString() && !cwd.asCStrRef().empty()) {
    scwd = cwd.asCStrRef().c_str();
  } else if (!g_context->getCwd().empty()) {
    scwd = g_context->getCwd().c_str();
  }

  Array enva;

  if (env.isNull()) {
    if (is_cli_mode()) {
      enva = cli_env();
    } else {
      // Build out an environment that conceptually matches what we'd
      // see if we were to iterate the environment and call getenv()
      // for each name.

      // Env vars defined in the hdf file go in first
      for (const auto& envvar : RuntimeOption::EnvVariables) {
        enva.set(String(envvar.first), String(envvar.second));
      }

      // global environment overrides the hdf
      for (char **env = environ; env && *env; env++) {
        char *p = strchr(*env, '=');
        if (p) {
          String name(*env, p - *env, CopyString);
          String val(p + 1, CopyString);
          enva.set(name, val);
        }
      }
    }

    // and then any putenv() changes take precedence
    for (ArrayIter iter(g_context->getEnvs()); iter; ++iter) {
      enva.set(iter.first(), iter.second());
    }
  } else {
    enva = env.toArray();
  }


#ifdef _WIN32
  PROCESS_INFORMATION pi;
  HANDLE childHandle;
  STARTUPINFO si;
  BOOL newprocok;
  SECURITY_ATTRIBUTES security;
  DWORD dwCreateFlags = 0;
  char *command_with_cmd;
  UINT old_error_mode;
  char cur_cwd[MAXPATHLEN];
  bool suppress_errors = false;
  bool bypass_shell = false;

  if (!other_options.isNull() && other_options.isArray()) {
    auto arr = other_options.asCArrRef();
    if (arr.exists(String("suppress_errors", CopyString), true)) {
      auto v = arr[String("suppress_errors", CopyString)];
      if ((v.isBoolean() && v.asBooleanVal()) ||
          (v.isInteger() && v.asInt64Val())) {
        suppress_errors = true;
      }
    }

    if (arr.exists(String("bypass_shell", CopyString), true)) {
      auto v = arr[String("bypass_shell", CopyString)];
      if ((v.isBoolean() && v.asBooleanVal()) ||
          (v.isInteger() && v.asInt64Val())) {
        bypass_shell = true;
      }
    }
  }

  /* we use this to allow the child to inherit handles */
  memset(&security, 0, sizeof(security));
  security.nLength = sizeof(security);
  security.bInheritHandle = true;
  security.lpSecurityDescriptor = nullptr;

  memset(&si, 0, sizeof(si));
  si.cb = sizeof(si);
  si.dwFlags = STARTF_USESTDHANDLES;

  si.hStdInput = GetStdHandle(STD_INPUT_HANDLE);
  si.hStdOutput = GetStdHandle(STD_OUTPUT_HANDLE);
  si.hStdError = GetStdHandle(STD_ERROR_HANDLE);

  if (!pre_proc_open(descriptorspec, items)) return false;
  /* redirect stdin/stdout/stderr if requested */
  for (size_t i = 0; i < items.size(); i++) {
    switch (items[i].index) {
      case 0:
        si.hStdInput = items[i].childend;
        break;
      case 1:
        si.hStdOutput = items[i].childend;
        break;
      case 2:
        si.hStdError = items[i].childend;
        break;
    }
  }


  memset(&pi, 0, sizeof(pi));

  if (suppress_errors) {
    old_error_mode = SetErrorMode(
      SEM_FAILCRITICALERRORS | SEM_NOGPFAULTERRORBOX);
  }

  dwCreateFlags = NORMAL_PRIORITY_CLASS;
  if (!RuntimeOption::ServerExecutionMode()) {
    dwCreateFlags |= CREATE_NO_WINDOW;
  }

  char *envp = build_envp(enva);
  if (bypass_shell) {
    newprocok = CreateProcess(
      nullptr,
      strdup(cmd.c_str()),
      &security,
      &security,
      TRUE,
      dwCreateFlags,
      envp,
      scwd.c_str(),
      &si,
      &pi);
  } else {
    std::string command_with = "cmd.exe /c ";
    command_with += cmd.toCppString();

    newprocok = CreateProcess(
      nullptr,
      strdup(command_with.c_str()),
      &security,
      &security,
      TRUE,
      dwCreateFlags,
      envp,
      scwd.c_str(),
      &si,
      &pi);
  }
  free(envp);

  if (suppress_errors) {
    SetErrorMode(old_error_mode);
  }

  if (newprocok == FALSE) {
    DWORD dw = GetLastError();
    char* msg;
    FormatMessageA(
      FORMAT_MESSAGE_ALLOCATE_BUFFER
        | FORMAT_MESSAGE_FROM_SYSTEM
        | FORMAT_MESSAGE_IGNORE_INSERTS,
      nullptr,
      dw,
      MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
      (LPSTR)&msg,
      0,
      nullptr);

    /* clean up all the descriptors */
    for (size_t i = 0; i < items.size(); i++) {
      CloseHandle(items[i].childend);
      if (items[i].parentend) {
        CloseHandle(items[i].parentend);
      }
    }
    raise_warning("CreateProcess failed, error code - %u: %s", dw, msg);
    LocalFree(msg);
    return false;
  }

  childHandle = pi.hProcess;
  DWORD child = pi.dwProcessId;
  CloseHandle(pi.hThread);
  return post_proc_open(cmd, pipes, enva, items, (pid_t)child, childHandle);
#else
  pid_t child;

  if (LightProcess::Available()) {
    // light process available
    // there is no need to do any locking, because the forking is delegated
    // to the light process
    if (!pre_proc_open(descriptorspec, items)) return false;
    const int item_size = items.size();
    std::vector<int> created;
    created.reserve(item_size);
    std::vector<int> intended;
    intended.reserve(item_size);
    for (int i = 0; i < item_size; i++) {
      const auto& item = items[i];
      created.push_back(item.childend);
      intended.push_back(item.index);
    }

    std::vector<std::string> envs;
    for (ArrayIter iter(enva); iter; ++iter) {
      StringBuffer nvpair;
      nvpair.append(iter.first().toString());
      nvpair.append('=');
      nvpair.append(iter.second().toString());
      std::string tmp = nvpair.detach().c_str();
      if (tmp.find('\n') == std::string::npos) {
        envs.push_back(tmp);
      }
    }

    child = LightProcess::proc_open(cmd.c_str(), created, intended,
                                    scwd.c_str(), envs);
    assert(child);
    return post_proc_open(cmd, pipes, enva, items, child);
  } else {
    /* the unix way */
    Lock lock(DescriptorItem::s_mutex);
    if (!pre_proc_open(descriptorspec, items)) return false;
    child = fork();
    if (child) {
      // the parent process
      return post_proc_open(cmd, pipes, enva, items, child);
    }
  }

  assert(child == 0);
  /* this is the child process */

  /* close those descriptors that we just opened for the parent stuff,
   * dup new descriptors into required descriptors and close the original
   * cruft */
  for (auto& item : items) {
    item.dupChild();
  }
  if (scwd.length() > 0 && chdir(scwd.c_str())) {
    // chdir failed, the working directory remains unchanged
  }
  std::vector<String> senvs; // holding those char *
  char **envp = build_envp(enva, senvs);
  execle("/bin/sh", "sh", "-c", cmd.data(), nullptr, envp);
  free(envp);
  _exit(127);
#endif
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
                                  , const std::vector<FactorType> &output
                                  , const string &filePath
                                  , const vector<float> &weight
                                  , size_t tableLimit
                                  , const LMList &languageModels
                                  , float weightWP)
{
    const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing();

    const StaticData &staticData = StaticData::Instance();

    m_tableLimit = tableLimit;

    util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);

    size_t line_num = 0;
    size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
    const std::string& factorDelimiter = staticData.GetFactorDelimiter();

    Phrase sourcePhrase(0);
    std::vector<float> scv;
    scv.reserve(m_numScoreComponent);

    TargetPhraseCollection *preSourceNode = NULL;
    std::string preSourceString;

    while(true) {
        ++line_num;
        StringPiece line;
        try {
            line = inFile.ReadLine();
        } catch (util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
        StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));

        bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
            continue;
        }

        //target
        std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase());
        targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);

        scv.clear();
        for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
            char *err_ind;
            // Token is always delimited by some form of space.  Also, apparently strtod is portable but strtof isn't.
            scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
            if (err_ind == token->data()) {
                stringstream strme;
                strme << "Bad number " << token << " on line " << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }
        if (scv.size() != m_numScoreComponent) {
            stringstream strme;
            strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
            UserMessage::Add(strme.str());
            abort();
        }



        size_t consumed = 3;
        if (pipes) {
            targetPhrase->SetAlignmentInfo(*pipes++);
            ++consumed;
        }

        ScoreComponentCollection sparse;
        if (pipes) pipes++; //counts
        if (pipes) {
            //sparse features
            SparsePhraseDictionaryFeature* spdf =
                GetFeature()->GetSparsePhraseDictionaryFeature();
            if (spdf) {
                sparse.Assign(spdf,(pipes++)->as_string());
            }
        }


        // scv good to go sir!
        targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels);

        // Check number of entries delimited by ||| agrees across all lines.
        for (; pipes; ++pipes, ++consumed) {}
        if (numElement != consumed) {
            if (numElement == NOT_FOUND) {
                numElement = consumed;
            } else {
                stringstream strme;
                strme << "Syntax error at " << filePath << ":" << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }

        //TODO: Would be better to reuse source phrases, but ownership has to be
        //consistent across phrase table implementations
        sourcePhrase.Clear();
        sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
        //Now that the source phrase is ready, we give the target phrase a copy
        targetPhrase->SetSourcePhrase(sourcePhrase);
        if (preSourceString == sourcePhraseString && preSourceNode) {
            preSourceNode->Add(targetPhrase.release());
        } else {
            preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
            preSourceNode->Add(targetPhrase.release());
            preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
        }
    }

    // sort each target phrase collection
    m_collection.Sort(m_tableLimit);

    /* // TODO ASK OLIVER WHY THIS IS NEEDED
    const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing();
    */

    return true;
}