void GlobalLexicalModel::Load() { FactorCollection &factorCollection = FactorCollection::Instance(); const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl); m_inputFactors = FactorMask(m_inputFactorsVec); m_outputFactors = FactorMask(m_outputFactorsVec); InputFileStream inFile(m_filePath); // reading in data one line at a time size_t lineNum = 0; string line; while(getline(inFile, line)) { ++lineNum; vector<string> token = Tokenize<string>(line, " "); if (token.size() != 3) { // format checking stringstream errorMessage; errorMessage << "Syntax error at " << m_filePath << ":" << lineNum << endl << line << endl; UserMessage::Add(errorMessage.str()); abort(); } // create the output word Word *outWord = new Word(); vector<string> factorString = Tokenize( token[0], factorDelimiter ); for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) { const FactorDirection& direction = Output; const FactorType& factorType = m_outputFactorsVec[i]; const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] ); outWord->SetFactor( factorType, factor ); } // create the input word Word *inWord = new Word(); factorString = Tokenize( token[1], factorDelimiter ); for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) { const FactorDirection& direction = Input; const FactorType& factorType = m_inputFactorsVec[i]; const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] ); inWord->SetFactor( factorType, factor ); } // maximum entropy feature score float score = Scan<float>(token[2]); // std::cerr << "storing word " << *outWord << " " << *inWord << " " << score << endl; // store feature in hash DoubleHash::iterator keyOutWord = m_hash.find( outWord ); if( keyOutWord == m_hash.end() ) { m_hash[outWord][inWord] = score; } else { // already have hash for outword, delete the word to avoid leaks (keyOutWord->second)[inWord] = score; 
delete outWord; } } }
bool PhraseDictionaryTreeAdaptor::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const std::string &filePath , const std::vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP ) { FactorCollection &factorCollection = FactorCollection::Instance(); if(m_numScoreComponent!=weight.size()) { stringstream strme; strme << "ERROR: mismatch of number of scaling factors: "<<weight.size() <<" "<<m_numScoreComponent<<"\n"; UserMessage::Add(strme.str()); return false; } m_filePath = filePath; // set Dictionary members m_inputFactors = FactorMask(input); m_outputFactors = FactorMask(output); VERBOSE(2,"PhraseDictionaryTreeAdaptor: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl); // set PhraseDictionary members m_tableLimit=tableLimit; imp->Create(input,output,factorCollection,filePath, weight,languageModels,weightWP); return true; }
// Construct a decode feature over the given input/output factor sets.
// The factor vectors are stored as-is and also converted to bitmask
// form (m_inputFactors / m_outputFactors) for fast membership tests.
DecodeFeature::DecodeFeature(size_t numScoreComponents
                             , const std::vector<FactorType> &input
                             , const std::vector<FactorType> &output
                             , const std::string &line)
  : StatelessFeatureFunction(numScoreComponents, line)
  , m_input(input)
  , m_output(output)
{
  // Derive the bitmask representations from the factor lists.
  m_inputFactors = FactorMask(input);
  m_outputFactors = FactorMask(output);
  VERBOSE(2,"DecodeFeature: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl);
}
void DecodeFeature::SetParameter(const std::string& key, const std::string& value) { if (key == "input-factor") { m_input =Tokenize<FactorType>(value, ","); m_inputFactors = FactorMask(m_input); } else if (key == "output-factor") { m_output =Tokenize<FactorType>(value, ","); m_outputFactors = FactorMask(m_output); } else { StatelessFeatureFunction::SetParameter(key, value); } }
bool PhraseDictionaryNewFormat::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const string &filePath , const vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { m_filePath = filePath; m_tableLimit = tableLimit; //factors m_inputFactors = FactorMask(input); m_outputFactors = FactorMask(output); // data from file InputFileStream inFile(filePath); bool ret = Load(input, output, inFile, weight, tableLimit, languageModels, weightWP); return ret; }
// Load a factored language model via the SRILM factored-ngram (FLM)
// extension: read the FLM specification from filePath, build vocab and
// count statistics, estimate discounts, read the model itself, then
// record member state and build the Moses-side factor mapping.
// Always returns true; read failures terminate the process instead.
//
// NOTE(review): on a read error this calls exit(1) rather than
// returning false like the other Load() implementations in this file —
// confirm whether that is intentional.
// NOTE(review): widMatrix/m_srilmVocab/fnSpecs/factoredStats are
// heap-allocated here with no visible delete — presumably owned for the
// lifetime of the model; verify against the destructor.
bool LanguageModelParallelBackoff::Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder)
{
  cerr << "Loading Language Model Parallel Backoff!!!\n";
  widMatrix = new ::WidMatrix();
  m_factorTypes = FactorMask(factorTypes);
  m_srilmVocab = new ::FactoredVocab();
  //assert(m_srilmVocab != 0);
  fnSpecs = 0;
  // Parse the FLM specification file describing the factored model.
  File f(filePath.c_str(),"r");
  fnSpecs = new ::FNgramSpecs<FNgramCount>(f,*m_srilmVocab, 0/*debug*/);
  cerr << "Loaded fnSpecs!\n";
  // Vocabulary flags must be set before any counts/model are read:
  // treat <unk> and the null token as real words, keep original case.
  m_srilmVocab->unkIsWord() = true;
  m_srilmVocab->nullIsWord() = true;
  m_srilmVocab->toLower() = false;
  FNgramStats *factoredStats = new FNgramStats(*m_srilmVocab, *fnSpecs);
  factoredStats->debugme(2);
  cerr << "Factored stats\n";
  FNgram* fngramLM = new FNgram(*m_srilmVocab,*fnSpecs);
  assert(fngramLM != 0);
  cerr << "FNgram object created\n";
  // Do not silently skip out-of-vocabulary words when scoring.
  fngramLM->skipOOVs = false;
  // Read the count files named in the spec.
  if (!factoredStats->read()) {
    cerr << "error reading in counts in factor file\n";
    exit(1);
  }
  cerr << "Factored stats read!\n";
  // Discounts and cardinalities must be computed before the LM is read.
  factoredStats->estimateDiscounts();
  factoredStats->computeCardinalityFunctions();
  factoredStats->sumCounts();
  cerr << "Another three operations made!\n";
  // Read the language model parameters themselves.
  if (!fngramLM->read()) {
    cerr << "format error in lm file\n";
    exit(1);
  }
  cerr << "fngramLM reads!\n";
  // Record member state for later scoring calls.
  m_filePath = filePath;
  m_nGramOrder= nGramOrder;
  m_factorTypesOrdered= factorTypes;
  m_unknownId = m_srilmVocab->unkIndex();
  cerr << "m_unknowdId = " << m_unknownId << endl;
  m_srilmModel = fngramLM;
  cerr << "Create factors...\n";
  // Build the Moses factor <-> SRILM word-id mapping.
  CreateFactors();
  cerr << "Factors created! \n";
  //FactorCollection &factorCollection = FactorCollection::Instance();
  /*for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) {
    FactorType factorType = m_factorTypesOrdered[index];
    m_sentenceStartArray[factorType] = factorCollection.AddFactor(Output, factorType, BOS_);
    m_sentenceEndArray[factorType] = factorCollection.AddFactor(Output, factorType, EOS_);
    //factorIdStart = m_sentenceStartArray[factorType]->GetId();
    //factorIdEnd = m_sentenceEndArray[factorType]->GetId();
    for (size_t i = 0; i < 10; i++) {
      lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_);
      lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_);
    }
    //(*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
    //(*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);
  }*/
  return true;
}