void LexicalTable::load( const string &filePath )
{
  cerr << "Loading lexical translation table from " << filePath;
  ifstream inFile;
  inFile.open(filePath.c_str());
  if (inFile.fail()) {
    cerr << " - ERROR: could not open file\n";
    exit(1);
  }
  istream *inFileP = &inFile;

  string line;

  int i=0;
  while(getline(*inFileP, line)) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;

    vector<string> token = tokenize( line.c_str() );
    if (token.size() != 3) {
      cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
           token.size() << " " << token[0] << " " << line << endl;
      continue;
    }

    double prob = atof( token[2].c_str() );
    WORD_ID wordE = vcbE.storeIfNew( token[0] );
    WORD_ID wordF = vcbF.storeIfNew( token[1] );
    ltable[ wordF ][ wordE ] = prob;
  }
  cerr << endl;
}
// Parse one phrase-pair line of the form
//   <foreign words> ||| <english words> ||| <f-e alignment points>
// and store it in this object.
//
// Words of field 1 go through vcbF, words of field 2 through vcbE. Each
// alignment token "f-e" in field 3 is recorded in alignedToE / alignedToF.
// The phrases themselves are registered in phraseTableF / phraseTableE
// (setting `foreign` / `english`) when the first valid alignment point is
// seen. Malformed or out-of-range alignment points are reported on stderr
// and ignored.
//
// lineID is used only in warning messages.
// Returns true if the line contained at least two "|||" separators, i.e. it
// is a real phrase pair and not just a foreign phrase.
bool PhraseAlignment::create(const char line[], int lineID )
{
  vector< string > token = tokenize( line );
  int item = 1;
  PHRASE phraseF, phraseE;
  for (size_t j=0; j<token.size(); j++) {
    if (token[j] == "|||") {
      item++;
      continue;
    }

    if (item == 1) {
      phraseF.push_back( vcbF.storeIfNew( token[j] ) );
    } else if (item == 2) {
      phraseE.push_back( vcbE.storeIfNew( token[j] ) );
    } else if (item == 3) {
      int e = 0, f = 0;
      // sscanf reports how many fields it converted; anything other than 2
      // means e and/or f were never written and must not be used as indices.
      if (sscanf(token[j].c_str(), "%d-%d", &f, &e) != 2) {
        cerr << "WARNING: sentence " << lineID << " has malformed alignment point '" << token[j] << "'\n";
        continue;
      }
      // Negative values wrap to huge size_t values and are rejected here too.
      if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) {
        cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n";
        continue;
      }

      // first valid alignment point: size the alignment vectors and
      // register both phrases
      if (alignedToE.size() == 0) {
        vector< size_t > dummy;
        for(size_t i=0; i<phraseE.size(); i++)
          alignedToE.push_back( dummy );
        for(size_t i=0; i<phraseF.size(); i++)
          alignedToF.push_back( dummy );
        foreign = phraseTableF.storeIfNew( phraseF );
        english = phraseTableE.storeIfNew( phraseE );
      }
      alignedToE[e].push_back( f );
      alignedToF[f].push_back( e );
    }
  }
  return (item>2); // real phrase pair, not just foreign phrase
}
// Example #3
// 0
void LexicalTable::load( char *fileName )
{
  cerr << "Loading lexical translation table from " << fileName;
  ifstream inFile;
  inFile.open(fileName);
  if (inFile.fail()) {
    cerr << " - ERROR: could not open file\n";
    exit(1);
  }
  istream *inFileP = &inFile;

  char line[LINE_MAX_LENGTH];

  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (inFileP->eof()) break;

    vector<string> token = tokenize( line );
    if (token.size() != 3) {
      cerr << "line " << i << " in " << fileName
           << " has wrong number of tokens, skipping:\n"
           << token.size() << " " << token[0] << " " << line << endl;
      continue;
    }

    double prob = atof( token[2].c_str() );
    WORD_ID wordT = vcbT.storeIfNew( token[0] );
    WORD_ID wordS = vcbS.storeIfNew( token[1] );
    ltable[ wordS ][ wordT ] = prob;
  }
  cerr << endl;
}
// read in a phrase pair and store it
void PhraseAlignment::create(const vector<string>& token, int lineID) {
	int item = 1;
	PHRASE phraseS, phraseT;
	for (size_t j=0; j<token.size(); ++j) {
		if (token[j] == "|||")
			item++;
		else if (item == 1) // source phrase
			phraseS.push_back( vcbS.storeIfNew( token[j] ) );
		else if (item == 2) // target phrase
			phraseT.push_back( vcbT.storeIfNew( token[j] ) );
		else if (item == 3) { // alignment
			int s = strtol(token[j].substr(0, token[j].find("-")).c_str(), NULL, 10);
			int t = strtol(token[j].substr(token[j].find("-") + 1).c_str(), NULL, 10);
			if (t >= phraseT.size() || s >= phraseS.size()) {
				cerr << "WARNING: phrase pair " << lineID 
						 << " has alignment point (" << s << ", " << t 
						 << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
			} else {
				// first alignment point? -> initialize
				if (alignedToT.size() == 0) {
          assert(alignedToS.size() == 0);
          size_t numTgtSymbols = (hierarchicalFlag ? phraseT.size()-1 : phraseT.size());
          alignedToT.resize(numTgtSymbols);
          size_t numSrcSymbols = (hierarchicalFlag ? phraseS.size()-1 : phraseS.size());
          alignedToS.resize(numSrcSymbols);
					source = phraseTableS.storeIfNew( phraseS );
					target = phraseTableT.storeIfNew( phraseT );
				}
				// add alignment point
				alignedToT[t].insert( s );
				alignedToS[s].insert( t );
			}
		} else if (item == 4) // count
			count = strtof(token[j].c_str(), NULL);
	}
	if (item == 3)
		count = 1.0;
	if (item < 3 || item > 4) {
		cerr << "ERROR: faulty line " << lineID << ": ";
		for(vector<string>::const_iterator i = token.begin(); i != token.end(); cerr << *(i++) << " ");
		cerr << endl;
	}
}
void LexicalTable::load(char *fileName) {
  cerr << "Loading lexical translation table from " << fileName;
	Bz2LineReader inFile(fileName, Bz2LineReader::UNCOMPRESSED);
	
	int i = 0;
	for (string line = inFile.readLine(); !line.empty(); line = inFile.readLine()) {
		if (line.empty())
			break;
    if (++i%100000 == 0) cerr << "." << flush;
		
    vector<string> token = tokenize(line.c_str());
    if (token.size() != 3) {
      cerr << "line " << i << " “" << line << "” in " << fileName 
			     << " has wrong number of tokens (" << token.size() << "), skipping:\n"
			     << token.size() << " " << token[0] << " " << line << endl;
      continue;
    }
  
    WORD_ID wordT = vcbT.storeIfNew( token[0] );
    WORD_ID wordS = vcbS.storeIfNew( token[1] );
    ltable[ wordS ][ wordT ] = strtod(token[2].c_str(), NULL);
  }
  cerr << endl;
}