コード例 #1
0
bool PhraseDictionaryNewFormat::Load(const std::vector<FactorType> &input
																			 , const std::vector<FactorType> &output
																			 , std::istream &inStream
																			 , const std::vector<float> &weight
																			 , size_t tableLimit
																			 , const LMList &languageModels
																			 , float weightWP)
{
	PrintUserTime("Start loading new format pt model");
	
	const StaticData &staticData = StaticData::Instance();
	const std::string& factorDelimiter = staticData.GetFactorDelimiter();
	
	VERBOSE(2,"PhraseDictionaryNewFormat: input=" << m_inputFactors << "  output=" << m_outputFactors << std::endl);
	
	string line;
	size_t count = 0;
	
	while(getline(inStream, line))
	{
		vector<string> tokens;
		vector<float> scoreVector;
		
		TokenizeMultiCharSeparator(tokens, line , "|||" );
					
		if (tokens.size() != 4 && tokens.size() != 5)
		{
			stringstream strme;
			strme << "Syntax error at " << m_filePath << ":" << count;
			UserMessage::Add(strme.str());
			abort();
		}
		
		const string &sourcePhraseString	= tokens[0]
								, &targetPhraseString	= tokens[1]
								, &alignString				= tokens[2]
								, &scoreString				= tokens[3];

		bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
		if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
			TRACE_ERR( m_filePath << ":" << count << ": pt entry contains empty target, skipping\n");
			continue;
		}
		
		Tokenize<float>(scoreVector, scoreString);
		if (scoreVector.size() != m_numScoreComponent)
		{
			stringstream strme;
			strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << count;
			UserMessage::Add(strme.str());
			abort();
		}
		assert(scoreVector.size() == m_numScoreComponent);
		
		// parse source & find pt node
		
		// head word
		Word sourceLHS, targetLHS;

		// source
		Phrase sourcePhrase(Input);
		sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);
		
		// create target phrase obj
		TargetPhrase *targetPhrase = new TargetPhrase(Output);
		targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
		
		// alignment
		list<pair<size_t,size_t> > alignmentInfo;
		CreateAlignmentInfo(alignmentInfo, alignString);

		// rest of target phrase
		targetPhrase->SetAlignmentInfo(alignmentInfo);
		targetPhrase->SetTargetLHS(targetLHS);
		//targetPhrase->SetDebugOutput(string("New Format pt ") + line);
		
		// component score, for n-best output
		std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
		std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
		
		targetPhrase->SetScoreChart(GetFeature(), scoreVector, weight, languageModels);
		
		// count info for backoff
		if (tokens.size() >= 6)
			targetPhrase->CreateCountInfo(tokens[5]);

		TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(sourcePhrase, *targetPhrase);
		AddEquivPhrase(phraseColl, targetPhrase);
		
		count++;
	}
	
	// cleanup cache
	
	// sort each target phrase collection
	m_collection.Sort(m_tableLimit);
	
	return true;
}