void GlobalLexicalModel::Load()
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);

  m_inputFactors = FactorMask(m_inputFactorsVec);
  m_outputFactors = FactorMask(m_outputFactorsVec);
  InputFileStream inFile(m_filePath);

  // reading in data one line at a time
  size_t lineNum = 0;
  string line;
  while(getline(inFile, line)) {
    ++lineNum;
    vector<string> token = Tokenize<string>(line, " ");

    if (token.size() != 3) { // format checking
      stringstream errorMessage;
      errorMessage << "Syntax error at " << m_filePath << ":" << lineNum << endl << line << endl;
      UserMessage::Add(errorMessage.str());
      abort();
    }

    // create the output word
    Word *outWord = new Word();
    vector<string> factorString = Tokenize( token[0], factorDelimiter );
    for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Output;
      const FactorType& factorType = m_outputFactorsVec[i];
      const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
      outWord->SetFactor( factorType, factor );
    }

    // create the input word
    Word *inWord = new Word();
    factorString = Tokenize( token[1], factorDelimiter );
    for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Input;
      const FactorType& factorType = m_inputFactorsVec[i];
      const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
      inWord->SetFactor( factorType, factor );
    }

    // maximum entropy feature score
    float score = Scan<float>(token[2]);

    // std::cerr << "storing word " << *outWord << " " << *inWord << " " << score << endl;

    // store feature in hash
    DoubleHash::iterator keyOutWord = m_hash.find( outWord );
    if( keyOutWord == m_hash.end() ) {
      m_hash[outWord][inWord] = score;
    } else { // hash for this outWord exists already; reuse it and delete the duplicate to avoid a leak
      (keyOutWord->second)[inWord] = score;
      delete outWord;
    }
  }
}
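
The insert above keys the outer DoubleHash on freshly allocated Word pointers, which is why a duplicate outWord has to be deleted once its entry exists. A minimal sketch of the same two-level pattern with value keys instead of Moses' Word pointers (std::string stands in for Word; all names here are illustrative, not part of the Moses API):

#include <iostream>
#include <map>
#include <string>

// Hypothetical value-keyed analogue of the DoubleHash above.
typedef std::map<std::string, std::map<std::string, float> > DoubleMap;

int main() {
  DoubleMap table;
  // operator[] default-constructs the inner map on first access,
  // so no find/insert-or-delete dance is needed with value keys.
  table["house"]["haus"] = 0.8f;
  table["house"]["heim"] = 0.2f;
  std::cout << table["house"]["haus"] << std::endl;  // prints 0.8
  return 0;
}
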
bool PhraseDictionaryTreeAdaptor::Load(const std::vector<FactorType> &input
                                       , const std::vector<FactorType> &output
                                       , const std::string &filePath
                                       , const std::vector<float> &weight
                                       , size_t tableLimit
                                       , const LMList &languageModels
                                       , float weightWP
                                      )
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  if (m_numScoreComponent != weight.size()) {
    stringstream strme;
    strme << "ERROR: number of scaling factors mismatch: " << weight.size()
          << " given, " << m_numScoreComponent << " expected\n";
    UserMessage::Add(strme.str());
    return false;
  }
  m_filePath = filePath;

  // set Dictionary members
  m_inputFactors = FactorMask(input);
  m_outputFactors = FactorMask(output);
  VERBOSE(2,"PhraseDictionaryTreeAdaptor: input=" << m_inputFactors << "  output=" << m_outputFactors << std::endl);

  // set PhraseDictionary members
  m_tableLimit = tableLimit;

  imp->Create(input, output, factorCollection, filePath,
              weight, languageModels, weightWP);
  return true;
}
Example #3
DecodeFeature::DecodeFeature(size_t numScoreComponents
                             , const std::vector<FactorType> &input
                             , const std::vector<FactorType> &output
                             , const std::string &line)
  : StatelessFeatureFunction(numScoreComponents, line)
  , m_input(input), m_output(output)
{
  m_inputFactors = FactorMask(input);
  m_outputFactors = FactorMask(output);
  VERBOSE(2,"DecodeFeature: input=" << m_inputFactors << "  output=" << m_outputFactors << std::endl);
}
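
Both examples above reduce a vector of factor indices to a FactorMask. A minimal sketch of that idiom, assuming FactorMask is a fixed-size bitset indexed by FactorType (the typedefs and the size 4 are assumptions for illustration, not Moses' actual definitions):

#include <bitset>
#include <cstddef>
#include <vector>

typedef std::size_t FactorType;
typedef std::bitset<4> FactorMask;  // size 4 assumed for illustration

// Set one bit per requested factor, mirroring FactorMask(input).
FactorMask MakeFactorMask(const std::vector<FactorType> &factors) {
  FactorMask mask;
  for (std::size_t i = 0; i < factors.size(); ++i)
    mask.set(factors[i]);
  return mask;
}
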
Example #4
void DecodeFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "input-factor") {
    m_input = Tokenize<FactorType>(value, ",");
    m_inputFactors = FactorMask(m_input);
  } else if (key == "output-factor") {
    m_output = Tokenize<FactorType>(value, ",");
    m_outputFactors = FactorMask(m_output);
  } else {
    StatelessFeatureFunction::SetParameter(key, value);
  }
}
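
SetParameter depends on Tokenize<FactorType> to turn a value like "0,1" into a list of factor indices. A self-contained stand-in using only the standard library (Moses' real Tokenize lives in its utility headers; this version is an illustrative sketch, not the actual implementation):

#include <cstddef>
#include <sstream>
#include <string>
#include <vector>

typedef std::size_t FactorType;

// Split value on ',' and parse each piece as a factor index.
std::vector<FactorType> TokenizeFactors(const std::string &value) {
  std::vector<FactorType> out;
  std::istringstream in(value);
  std::string piece;
  while (std::getline(in, piece, ',')) {
    std::istringstream conv(piece);
    FactorType f = 0;
    conv >> f;
    out.push_back(f);
  }
  return out;
}
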
bool PhraseDictionaryNewFormat::Load(const std::vector<FactorType> &input
                                     , const std::vector<FactorType> &output
                                     , const string &filePath
                                     , const vector<float> &weight
                                     , size_t tableLimit
                                     , const LMList &languageModels
                                     , float weightWP)
{
  m_filePath = filePath;
  m_tableLimit = tableLimit;

  // factors
  m_inputFactors = FactorMask(input);
  m_outputFactors = FactorMask(output);

  // data from file
  InputFileStream inFile(filePath);

  bool ret = Load(input, output, inFile, weight, tableLimit, languageModels, weightWP);
  return ret;
}
bool LanguageModelParallelBackoff::Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder)
{

  cerr << "Loading Language Model Parallel Backoff!!!\n";
  widMatrix = new ::WidMatrix();
  m_factorTypes = FactorMask(factorTypes);
  m_srilmVocab = new ::FactoredVocab();
  //assert(m_srilmVocab != 0);

  fnSpecs = 0;
  File f(filePath.c_str(),"r");
  fnSpecs = new ::FNgramSpecs<FNgramCount>(f,*m_srilmVocab, 0/*debug*/);

  cerr << "Loaded fnSpecs!\n";

  m_srilmVocab->unkIsWord() = true;
  m_srilmVocab->nullIsWord() = true;
  m_srilmVocab->toLower() = false;

  FNgramStats *factoredStats = new FNgramStats(*m_srilmVocab, *fnSpecs);

  factoredStats->debugme(2);

  cerr << "Factored stats\n";

  FNgram* fngramLM = new FNgram(*m_srilmVocab,*fnSpecs);
  assert(fngramLM != 0);

  cerr << "FNgram object created\n";

  fngramLM->skipOOVs = false;

  if (!factoredStats->read()) {
    cerr << "error reading in counts in factor file\n";
    exit(1);
  }

  cerr << "Factored stats read!\n";

  factoredStats->estimateDiscounts();
  factoredStats->computeCardinalityFunctions();
  factoredStats->sumCounts();

  cerr << "Another three operations made!\n";

  if (!fngramLM->read()) {
    cerr << "format error in lm file\n";
    exit(1);
  }

  cerr << "fngramLM reads!\n";

  m_filePath = filePath;
  m_nGramOrder = nGramOrder;

  m_factorTypesOrdered = factorTypes;

  m_unknownId = m_srilmVocab->unkIndex();

  cerr << "m_unknowdId = " << m_unknownId << endl;

  m_srilmModel = fngramLM;

  cerr << "Create factors...\n";

  CreateFactors();

  cerr << "Factors created! \n";
  //FactorCollection &factorCollection = FactorCollection::Instance();

  /*for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index)
  {
  	FactorType factorType = m_factorTypesOrdered[index];
  	m_sentenceStartArray[factorType] 	= factorCollection.AddFactor(Output, factorType, BOS_);


  	m_sentenceEndArray[factorType] 		= factorCollection.AddFactor(Output, factorType, EOS_);

    //factorIdStart = m_sentenceStartArray[factorType]->GetId();
    //factorIdEnd = m_sentenceEndArray[factorType]->GetId();

    for (size_t i = 0; i < 10; i++)
    {
      lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_);
  		lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_);
    }

  	//(*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
  	//(*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);

  }*/
  return true;
}