// Evaluates the XPath expression `v` against this document and, on success,
// stores the (optionally trimmed) string value of the result in *r.
// Returns true iff a non-empty result string was produced.
// NOTE(review): v.data() is handed straight to libxml2, which expects a
// NUL-terminated string — StringPiece does not guarantee termination;
// confirm that all callers pass terminated data.
bool XmlDocument::XPath(StringPiece const &v, std::wstring *r) {
  WStringPiece str;
  xmlXPathObjectPtr xpath = xmlXPathEvalExpression((unsigned char const*)v.data(), libxml_stuff->xpath_context);
  if (xpath) {
    // Convert the XPath result object to its string value (libxml-allocated).
    xmlChar *s_ = xmlXPathCastToString(xpath);
    if (s_) {
      StringPiece s((char*)s_);
      // Convert UTF-8 -> UTF-16 into the reusable member buffer `conv_buf`.
      wchar_t *output = conv_buf.Alloc(s.length());
      int const conv_result = cpcl::TryConvertUTF8_UTF16(s, output, conv_buf.Size());
      if (conv_result == -1) {
        output = 0;  // conversion failed: leave `str` empty
        Trace(CPCL_TRACE_LEVEL_ERROR, "XmlDocument::XPath(%s): utf_to_uc fails", v.as_string().c_str());
      } else {
        // conv_result is the number of wchar_t written; point one past the end.
        output = conv_buf.Data() + conv_result;
        // Sanity check: the converter should never report more output than
        // the buffer can hold.
        if (conv_result > ((int)(conv_buf.Size()&INT_MAX)))
          Trace(CPCL_TRACE_LEVEL_WARNING, "XmlDocument::XPath(%s): TryConvertUTF8_UTF16 okashi desu ne...", v.as_string().c_str());
      }
      xmlFree(s_);
      if (output) {
        //*output = 0;
        // View over the converted text; valid only until conv_buf is reused.
        str = WStringPiece(conv_buf.Data(), output - conv_buf.Data());
      }
    }
    xmlXPathFreeObject(xpath);
  }
  // Optionally trim the configured character set from both ends.
  if (TrimResults)
    str = str.trim(TrimChars);
  // Copy out only when there is a result and the caller wants it.
  if ((!str.empty()) && (r))
    r->assign(str.data(), str.size());
  return (!str.empty());
}
// Score the current example against `label` and return VW's partial loss.
// Resets the per-example source/target bookkeeping flags as a side effect.
float VWPredictor::Predict(const StringPiece &label)
{
  // Reset per-example state for the next round of feature additions.
  m_isFirstSource = true;
  m_isFirstTarget = true;

  m_ex->set_label(label.as_string());
  float predictedLoss = m_ex->predict_partial();

  if (DEBUG)
    std::cerr << "VW :: Predicted loss: " << predictedLoss << "\n";

  m_ex->remns(); // remove target namespace
  return predictedLoss;
}
// Reads the next FEATURES_TXT block from the input stream into m_next.
// On end of file, resets the input stream to mark the iterator exhausted.
void FeatureDataIterator::readNext()
{
  m_next.clear();
  try {
    // Block must begin with the FEATURES_TXT_BEGIN marker.
    StringPiece marker = m_in->ReadDelimited();
    if (marker != StringPiece(FEATURES_TXT_BEGIN)) {
      throw FileFormatException(m_in->FileName(), marker.as_string());
    }
    size_t sentenceId = m_in->ReadULong();  // read but currently unused
    size_t count = m_in->ReadULong();       // number of n-best entries in this block
    size_t length = m_in->ReadULong();      // expected dense-feature count per entry
    m_in->ReadLine(); //discard rest of line
    for (size_t i = 0; i < count; ++i) {
      StringPiece line = m_in->ReadLine();
      m_next.push_back(FeatureDataItem());
      // Each whitespace-separated token is either "<float>" (dense feature)
      // or "<name>:<float>" (sparse feature).
      for (TokenIter<AnyCharacter, true> token(line, AnyCharacter(" \t")); token; ++token) {
        TokenIter<AnyCharacter,false> value(*token,AnyCharacter(":"));
        if (!value) throw FileFormatException(m_in->FileName(), line.as_string());
        StringPiece first = *value;
        ++value;
        if (!value) {
          //regular feature
          float floatValue = ParseFloat(first);
          m_next.back().dense.push_back(floatValue);
        } else {
          //sparse feature
          StringPiece second = *value;
          float floatValue = ParseFloat(second);
          m_next.back().sparse.set(first.as_string(),floatValue);
        }
      }
      // Dense feature count must match the declared length.
      if (length != m_next.back().dense.size()) {
        throw FileFormatException(m_in->FileName(), line.as_string());
      }
    }
    // Block must end with the FEATURES_TXT_END marker.
    StringPiece line = m_in->ReadLine();
    if (line != StringPiece(FEATURES_TXT_END)) {
      throw FileFormatException(m_in->FileName(), line.as_string());
    }
  } catch (EndOfFileException &e) {
    // No more blocks: release the input to signal exhaustion to callers.
    m_in.reset();
  }
}
//get the children of a node in a binarized tree; if a child is virtual, (transitively) replace it with its children void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const { for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) { const StringPiece label = (*itx)->GetLabel().GetString(0); if (!label.empty() && label.as_string()[0] == '^') { (*itx)->GetUnbinarizedChildren(ret); } else { ret.push_back(*itx); } } }
// Replace the first "old" pattern with the "new" pattern in a string std::string ReplaceFirst( const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub) { if (oldsub.empty()) return s.as_string(); std::string res; std::string::size_type pos = s.find(oldsub); if (pos == std::string::npos) return s.as_string(); else { res.append(s.data(), pos); res.append(newsub.data(), newsub.size()); res.append(s.data() + pos + oldsub.size(), s.length() - pos - oldsub.size()); } return res; }
// Map a Moses factor to its DALM vocabulary id, preferring the cached
// factor->id mapping and falling back to a DALM vocabulary lookup.
DALM::VocabId LanguageModelDALM::GetVocabId(const Factor *factor) const
{
  // Fast path: the factor has already been mapped.
  VocabMap::left_map::const_iterator found = m_vocabMap.left.find(factor);
  if (found != m_vocabMap.left.end())
    return found->second;

  // Slow path: ask the DALM vocabulary for the factor's surface string.
  return m_vocab->lookup(factor->GetString().as_string().c_str());
}
// Register every factor of every word of `textin` in the source vocabulary.
// Words are separated by spaces; factors within a word by '|'.
void add_to_map(StoreVocab<uint64_t> &sourceVocab, const StringPiece &textin)
{
  for (util::TokenIter<util::SingleCharacter> word(textin, util::SingleCharacter(' '));
       word; ++word) {
    for (util::TokenIter<util::SingleCharacter> factorIt(*word, util::SingleCharacter('|'));
         factorIt; ++factorIt) {
      StringPiece factor = *factorIt;
      sourceVocab.Insert(getHash(factor), factor.as_string());
    }
  }
}
// Split `full` on the (possibly multi-character) delimiter `delim`,
// appending each non-empty piece to the output iterator `result`.
// (The template header declaring StringType/ITR precedes this definition.)
static inline void SplitUsingStringDelimiterToIterator(const StringPiece& full,
    const char* delim,
    ITR& result) {
  if (full.empty()) {
    return;
  }
  // Empty delimiter: emit the whole input as a single piece.
  if (delim[0] == '\0') {
    *result++ = full.as_string();
    return;
  }

  // Optimize the common case where delim is a single character.
  if (delim[1] == '\0') {
    SplitStringToIteratorUsing<StringType>(full, delim, result);
    return;
  }

  size_t delim_length = strlen(delim);
  for (size_t begin_index = 0; begin_index < full.size();) {
    size_t end_index = full.find(delim, begin_index);
    // No further delimiter: the remainder is the last piece.
    if (end_index == std::string::npos) {
      *result++ = full.substr(begin_index).as_string();
      return;
    }
    // Skip empty pieces produced by consecutive delimiters.
    if (end_index > begin_index) {
      StringType value(full.data() + begin_index, end_index - begin_index);
      *result++ = value;
    }
    begin_index = end_index + delim_length;
  }
}
// Replace all the "old" pattern with the "new" pattern in a string std::string ReplaceAll(const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub) { if (oldsub.empty()) return s.as_string(); std::string res; std::string::size_type start_pos = 0; std::string::size_type pos; do { pos = s.find(oldsub, start_pos); if (pos == std::string::npos) { break; } res.append(s.data() + start_pos, pos - start_pos); res.append(newsub.data(), newsub.size()); start_pos = pos + oldsub.size(); } while (true); res.append(s.data() + start_pos, s.length() - start_pos); return res; }
// Parse a property string of the form "{{key1 value1}} {{key2 value2}} ..."
// and store each key/value pair via SetProperty().
// Throws (UTIL_THROW_IF2) if a segment does not split into exactly key+value.
void TargetPhrase::SetProperties(const StringPiece &str)
{
  if (str.size() == 0) {
    return;
  }

  vector<string> toks;
  TokenizeMultiCharSeparator(toks, str.as_string(), "{{");
  for (size_t i = 0; i < toks.size(); ++i) {
    string &tok = toks[i];
    if (tok.empty()) {
      // skip empty segments (e.g. the text before the first separator)
      continue;
    }
    // Strip the trailing "}}" by cutting just before the last '}'.
    // NOTE(review): if tok contains no '}', rfind returns npos and
    // npos - 1 makes substr keep (nearly) the whole string — this relies
    // on well-formed input; confirm upstream guarantees the format.
    size_t endPos = tok.rfind("}");
    tok = tok.substr(0, endPos - 1);

    vector<string> keyValue = TokenizeFirstOnly(tok, " ");
    UTIL_THROW_IF2(keyValue.size() != 2,
                   "Incorrect format of property: " << str);
    SetProperty(keyValue[0], keyValue[1]);
  }
}
// Split an input line of the form "<text> ||| <number>" into a line_text
// record holding the text field and the parsed integer value.
line_text split_line(StringPiece textin)
{
  line_text parsed;

  // Fields are separated by the literal " ||| " delimiter.
  util::TokenIter<util::MultiCharacter> field(textin, util::MultiCharacter(" ||| "));

  // First field: the text.
  parsed.text = *field;
  ++field;

  // Second field: the integer value.
  parsed.value = atoi((*field).as_string().c_str());

  return parsed;
}
// Constructs the rewrite-driver factory for this server instance.
// `hostname`/`port` identify the server; the factory owns two message
// handlers (general and HTML-parse), each with its own mutex.
LsiRewriteDriverFactory::LsiRewriteDriverFactory(
  const ProcessContext &process_context,
  SystemThreadSystem *system_thread_system, StringPiece hostname, int port)
  : SystemRewriteDriverFactory(process_context, system_thread_system,
                               NULL /* default shared memory runtime */, hostname,
                               port),
    m_mainConf(NULL),
    m_bThreadsStarted(false),
    m_pLsiMessageHandler(new LsiMessageHandler(timer(),
                         thread_system()->NewMutex())),
    m_pHtmlParseLsiMessageHandler(
      new LsiMessageHandler(timer(), thread_system()->NewMutex())),
    m_pSharedCircularBuffer(NULL),
    m_sHostname(hostname.as_string()),
    m_iPort(port)
{
  InitializeDefaultOptions();
  default_options()->set_beacon_url("/ls_pagespeed_beacon");

  // NOTE(review): the dynamic_cast result is used without a null check —
  // assumes default_options() always yields a SystemRewriteOptions.
  SystemRewriteOptions *system_options = dynamic_cast<SystemRewriteOptions *>(
        default_options());
  system_options->set_file_cache_clean_inode_limit(500000);
  system_options->set_avoid_renaming_introspective_javascript(true);

  set_message_handler(m_pLsiMessageHandler);
  set_html_parse_message_handler(m_pHtmlParseLsiMessageHandler);
}
std::string ReplaceAllChars(const StringPiece& s, const StringPiece& from, char to) { std::string result = s.as_string(); ReplaceAllChars(&result, from, to); return result; }
std::string StringTrim(const StringPiece& str, const StringPiece& trim_value) { std::string res = str.as_string(); StringTrim(&res, trim_value); return res; }
std::string StringTrim(const StringPiece& str) { std::string res = str.as_string(); StringTrim(&res); return res; }
// Add a single named feature with the given value to the current VW example.
// The feature name is escaped for VW's input format.
void VWPredictor::AddFeature(const StringPiece &name, float value)
{
  // Escape the name once; the previous code called
  // EscapeSpecialChars(name.as_string()) twice — once for the debug trace
  // and again for addf() — doing the escaping work and allocations twice.
  const std::string escapedName = EscapeSpecialChars(name.as_string());
  if (DEBUG)
    std::cerr << "VW :: Adding feature: " << escapedName << ":" << value << "\n";
  m_ex->addf(escapedName, value);
}
int main(int argc, char **argv) { util::FileStream out(1); util::FileStream err(2); size_t maxNBestSize; size_t iterationLimit; std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights; bool ignoreDecoderScore; float learningRate; float initialStepSize; float decreaseRate; float increaseRate; float minStepSize; float maxStepSize; float floorAbsScalingFactor; float regularizationParameter; bool printZeroWeights; bool miniBatches; std::string optimizerTypeStr; size_t optimizerType = 0; #define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1 #define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2 try { po::options_description descr("Usage"); descr.add_options() ("help,h", "produce help message") ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100), "limit of n-best list entries to be considered for training") ("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50), "number of training iterations") ("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(), "file containing sentence-level BLEU scores for all n-best list entries") ("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(), "input n-best list file, in prepared format for expected BLEU training") ("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(), "file containing mapping between feature names and indices") ("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""), "file containing start values for scaling factors (optional)") ("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0), "exclude decoder score from computation of posterior probability") ("regularization", boost::program_options::value<float>(®ularizationParameter)->default_value(0), // e.g. 
1e-5 "regularization parameter; suggested value range: [1e-8,1e-5]") ("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1), "learning rate for the SGD optimizer") ("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7 "set scaling factor to 0 if below this absolute value after update") ("initial-step-size", boost::program_options::value<float>(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1 "initial step size for the RPROP optimizer") ("decrease-rate", boost::program_options::value<float>(&decreaseRate)->default_value(0.5), "decrease rate for the RPROP optimizer") ("increase-rate", boost::program_options::value<float>(&increaseRate)->default_value(1.2), "increase rate for the RPROP optimizer") ("min-step-size", boost::program_options::value<float>(&minStepSize)->default_value(1e-7), "minimum step size for the RPROP optimizer") ("max-step-size", boost::program_options::value<float>(&maxStepSize)->default_value(1), "maximum step size for the RPROP optimizer") ("print-zero-weights", boost::program_options::value<bool>(&printZeroWeights)->default_value(0), "output scaling factors even if they are trained to 0") ("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"), "optimizer type used for training (known algorithms: RPROP, SGD)") ("mini-batches", boost::program_options::value<bool>(&miniBatches)->default_value(0), "update after every single sentence (SGD only)") ; po::variables_map vm; po::store(po::parse_command_line(argc, argv, descr), vm); if (vm.count("help")) { std::ostringstream os; os << descr; out << os.str() << '\n'; out.flush(); exit(0); } po::notify(vm); } catch(std::exception& e) { err << "Error: " << e.what() << '\n'; err.flush(); exit(1); } if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) { optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP; } else if ( !optimizerTypeStr.compare("sgd") || 
!optimizerTypeStr.compare("SGD") ) { optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD; } else { err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n'; err.flush(); exit(1); } util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str()); StringPiece lineFeatureName; if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) { err << "Error: flawed content in " << filenameFeatureNames << '\n'; err.flush(); exit(1); } size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() ); std::vector<std::string> featureNames(maxFeatureNamesIdx); boost::unordered_map<std::string, size_t> featureIndexes; for (size_t i=0; i<maxFeatureNamesIdx; ++i) { if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) { err << "Error: flawed content in " << filenameFeatureNames << '\n'; err.flush(); exit(1); } util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' '); size_t featureIndexCurrent = atol( token->as_string().c_str() ); token++; featureNames[featureIndexCurrent] = token->as_string(); featureIndexes[token->as_string()] = featureIndexCurrent; } std::vector<float> sparseScalingFactor(maxFeatureNamesIdx); std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize); // read initial weights, if any given if ( filenameInitialWeights.length() != 0 ) { util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str()); StringPiece lineInitialWeight; if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) { err << "Error: flawed content in " << filenameInitialWeights << '\n'; err.flush(); exit(1); } do { util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' '); boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string()); if ( found == featureIndexes.end() ) { err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n'; err.flush(); exit(1); } token++; 
sparseScalingFactor[found->second] = atof( token->as_string().c_str() ); } while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ); } // train ExpectedBleuOptimizer optimizer(err, learningRate, initialStepSize, decreaseRate, increaseRate, minStepSize, maxStepSize, floorAbsScalingFactor, regularizationParameter); if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP ) { optimizer.InitRPROP(sparseScalingFactor); } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) { optimizer.InitRPROP(sparseScalingFactor); } else { err << "Error: unknown optimizer type" << '\n'; err.flush(); exit(1); } for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration) { util::FilePiece ifsSBleu(filenameSBleu.c_str()); util::FilePiece ifsNBest(filenameNBestList.c_str()); out << "### ITERATION " << nIteration << '\n' << '\n'; size_t sentenceIndex = 0; size_t batchSize = 0; size_t nBestSizeCount = 0; size_t globalIndex = 0; StringPiece lineNBest; std::vector<double> overallScoreUntransformed; std::vector<float> sBleu; float xBleu = 0; // double expPrecisionCorrection = 0.0; while ( ifsNBest.ReadLineOrEOF(lineNBest) ) { util::TokenIter<util::SingleCharacter> token(lineNBest, ' '); if ( token == token.end() ) { err << "Error: flawed content in " << filenameNBestList << '\n'; err.flush(); exit(1); } size_t sentenceIndexCurrent = atol( token->as_string().c_str() ); token++; if ( sentenceIndex != sentenceIndexCurrent ) { if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP ) { optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) { optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); if ( miniBatches ) { xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 ); // out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n'; // for (size_t i=0; 
i<sparseScalingFactor.size(); ++i) // { // if ( (sparseScalingFactor[i] != 0) || printZeroWeights ) // { // out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n'; // } // } // out << '\n'; // out.flush(); } } else { err << "Error: unknown optimizer type" << '\n'; err.flush(); exit(1); } for (size_t i=0; i<nBestSizeCount; ++i) { sparseScore[i].clear(); } nBestSizeCount = 0; overallScoreUntransformed.clear(); sBleu.clear(); sentenceIndex = sentenceIndexCurrent; ++batchSize; } StringPiece lineSBleu; if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) ) { err << "Error: insufficient number of lines in " << filenameSBleu << '\n'; err.flush(); exit(1); } if ( nBestSizeCount < maxNBestSize ) { // retrieve sBLEU float sBleuCurrent = atof( lineSBleu.as_string().c_str() ); sBleu.push_back(sBleuCurrent); // process n-best list entry if ( token == token.end() ) { err << "Error: flawed content in " << filenameNBestList << '\n'; err.flush(); exit(1); } double scoreCurrent = 0; if ( !ignoreDecoderScore ) { scoreCurrent = atof( token->as_string().c_str() ); // decoder score } token++; // if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch) // { // expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best // } while (token != token.end()) { size_t featureNameCurrent = atol( token->as_string().c_str() ); token++; float featureValueCurrent = atof( token->as_string().c_str() ); sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent)); scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent; token++; } // overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) ); overallScoreUntransformed.push_back( std::exp(scoreCurrent) ); ++nBestSizeCount; } ++globalIndex; } if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP ) { optimizer.AddTrainingInstance( nBestSizeCount, 
sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize ); out << "xBLEU= " << xBleu << '\n'; } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) { optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus if ( miniBatches ) { xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 ); xBleu /= batchSize; } else { xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize ); } out << "xBLEU= " << xBleu << '\n'; } else { err << "Error: unknown optimizer type" << '\n'; err.flush(); exit(1); } for (size_t i=0; i<nBestSizeCount; ++i) { sparseScore[i].clear(); } nBestSizeCount = 0; overallScoreUntransformed.clear(); sBleu.clear(); out << '\n'; for (size_t i=0; i<sparseScalingFactor.size(); ++i) { if ( (sparseScalingFactor[i] != 0) || printZeroWeights ) { out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n'; } } out << '\n'; out.flush(); } }
// Adapter: materialize the StringPiece as an owned std::string and
// forward to the std::string overload of Add().
virtual void Add (WordIndex index, const StringPiece& str) {
  std::string word(str.data(), str.size());
  Add(index, word);
}
// Assign the sparse feature scores encoded in `sparseString` to this
// phrase's score breakdown, credited to the given feature function.
void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString)
{
  const std::string scores = sparseString.as_string();
  m_scoreBreakdown.Assign(translationScoreProducer, scores);
}
std::string StripString(const StringPiece& s, const char* remove, char replacewith) { std::string res = s.as_string(); StripString(&res, remove, replacewith); return res; }
std::string RemoveContinuousBlank(const StringPiece& str) { std::string res = str.as_string(); RemoveContinuousBlank(&res); return res; }
// Map a Moses factor to its DALM vocabulary id by looking up the
// factor's surface string in the DALM vocabulary.
DALM::VocabId LanguageModelDALM::GetVocabId(const Factor *factor) const
{
  const std::string word = factor->GetString().as_string();
  return m_vocab->lookup(word.c_str());
}