void LexicalTable::load( const string &filePath ) { cerr << "Loading lexical translation table from " << filePath; ifstream inFile; inFile.open(filePath.c_str()); if (inFile.fail()) { cerr << " - ERROR: could not open file\n"; exit(1); } istream *inFileP = &inFile; string line; int i=0; while(getline(*inFileP, line)) { i++; if (i%100000 == 0) cerr << "." << flush; vector<string> token = tokenize( line.c_str() ); if (token.size() != 3) { cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; continue; } double prob = atof( token[2].c_str() ); WORD_ID wordE = vcbE.storeIfNew( token[0] ); WORD_ID wordF = vcbF.storeIfNew( token[1] ); ltable[ wordF ][ wordE ] = prob; } cerr << endl; }
bool PhraseAlignment::create(const char line[], int lineID ) { vector< string > token = tokenize( line ); int item = 1; PHRASE phraseF, phraseE; for (size_t j=0; j<token.size(); j++) { if (token[j] == "|||") item++; else { if (item == 1) phraseF.push_back( vcbF.storeIfNew( token[j] ) ); else if (item == 2) phraseE.push_back( vcbE.storeIfNew( token[j] ) ); else if (item == 3) { int e,f; sscanf(token[j].c_str(), "%d-%d", &f, &e); if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) { cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; } else { if (alignedToE.size() == 0) { vector< size_t > dummy; for(size_t i=0; i<phraseE.size(); i++) alignedToE.push_back( dummy ); for(size_t i=0; i<phraseF.size(); i++) alignedToF.push_back( dummy ); foreign = phraseTableF.storeIfNew( phraseF ); english = phraseTableE.storeIfNew( phraseE ); } alignedToE[e].push_back( f ); alignedToF[f].push_back( e ); } } } } return (item>2); // real phrase pair, not just foreign phrase }
void LexicalTable::load( char *fileName ) { cerr << "Loading lexical translation table from " << fileName; ifstream inFile; inFile.open(fileName); if (inFile.fail()) { cerr << " - ERROR: could not open file\n"; exit(1); } istream *inFileP = &inFile; char line[LINE_MAX_LENGTH]; int i=0; while(true) { i++; if (i%100000 == 0) cerr << "." << flush; SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (inFileP->eof()) break; vector<string> token = tokenize( line ); if (token.size() != 3) { cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; continue; } double prob = atof( token[2].c_str() ); WORD_ID wordT = vcbT.storeIfNew( token[0] ); WORD_ID wordS = vcbS.storeIfNew( token[1] ); ltable[ wordS ][ wordT ] = prob; } cerr << endl; }
// read in a phrase pair and store it void PhraseAlignment::create(const vector<string>& token, int lineID) { int item = 1; PHRASE phraseS, phraseT; for (size_t j=0; j<token.size(); ++j) { if (token[j] == "|||") item++; else if (item == 1) // source phrase phraseS.push_back( vcbS.storeIfNew( token[j] ) ); else if (item == 2) // target phrase phraseT.push_back( vcbT.storeIfNew( token[j] ) ); else if (item == 3) { // alignment int s = strtol(token[j].substr(0, token[j].find("-")).c_str(), NULL, 10); int t = strtol(token[j].substr(token[j].find("-") + 1).c_str(), NULL, 10); if (t >= phraseT.size() || s >= phraseS.size()) { cerr << "WARNING: phrase pair " << lineID << " has alignment point (" << s << ", " << t << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n"; } else { // first alignment point? -> initialize if (alignedToT.size() == 0) { assert(alignedToS.size() == 0); size_t numTgtSymbols = (hierarchicalFlag ? phraseT.size()-1 : phraseT.size()); alignedToT.resize(numTgtSymbols); size_t numSrcSymbols = (hierarchicalFlag ? phraseS.size()-1 : phraseS.size()); alignedToS.resize(numSrcSymbols); source = phraseTableS.storeIfNew( phraseS ); target = phraseTableT.storeIfNew( phraseT ); } // add alignment point alignedToT[t].insert( s ); alignedToS[s].insert( t ); } } else if (item == 4) // count count = strtof(token[j].c_str(), NULL); } if (item == 3) count = 1.0; if (item < 3 || item > 4) { cerr << "ERROR: faulty line " << lineID << ": "; for(vector<string>::const_iterator i = token.begin(); i != token.end(); cerr << *(i++) << " "); cerr << endl; } }
void LexicalTable::load(char *fileName) { cerr << "Loading lexical translation table from " << fileName; Bz2LineReader inFile(fileName, Bz2LineReader::UNCOMPRESSED); int i = 0; for (string line = inFile.readLine(); !line.empty(); line = inFile.readLine()) { if (line.empty()) break; if (++i%100000 == 0) cerr << "." << flush; vector<string> token = tokenize(line.c_str()); if (token.size() != 3) { cerr << "line " << i << " “" << line << "” in " << fileName << " has wrong number of tokens (" << token.size() << "), skipping:\n" << token.size() << " " << token[0] << " " << line << endl; continue; } WORD_ID wordT = vcbT.storeIfNew( token[0] ); WORD_ID wordS = vcbS.storeIfNew( token[1] ); ltable[ wordS ][ wordT ] = strtod(token[2].c_str(), NULL); } cerr << endl; }