void TargetCorpus::Create( string fileName )
{
  ifstream textFile;
  char line[LINE_MAX_LENGTH];

  // first pass: count the number of words
  textFile.open(fileName.c_str());
  istream *fileP = &textFile;
  m_size = 0;
  m_sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    m_size += words.size();
    m_sentenceCount++;
  }
  textFile.close();
  cerr << m_size << " words" << endl;

  // allocate memory
  m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
  m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );

  // second pass: fill the array
  int wordIndex = 0;
  int sentenceId = 0;
  textFile.open(fileName.c_str());
  fileP = &textFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    vector< WORD_ID >::const_iterator i;
    for( i=words.begin(); i!=words.end(); i++) {
      m_array[ wordIndex++ ] = *i;
    }
    // record the position of the last word of each sentence
    m_sentenceEnd[ sentenceId++ ] = wordIndex-1;
  }
  textFile.close();
  cerr << "done reading " << wordIndex << " words, "
       << sentenceId << " sentences." << endl;
}
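All of the loaders in this section read through a SAFE_GETLINE macro whose definition is not shown. The sketch below is a reconstruction from the call sites, not the project's actual code: a guarded istream::getline into a fixed buffer that aborts when a line would overflow it. It assumes the surrounding file's usual <iostream>/<cstdlib> includes and using declarations; the five-argument variant used further down additionally takes a file name for the error message.

// Hypothetical reconstruction of SAFE_GETLINE, inferred from its callers.
// Reads one delimited line into a fixed-size buffer; clears a recoverable
// fail state, and exits if the line filled the whole buffer.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {                     \
    _IS.getline(_LINE, _SIZE, _DELIM);                                \
    if (_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();          \
    if (_IS.gcount() == _SIZE - 1) {                                  \
      cerr << "Line too long! Buffer overflow. Raise LINE_MAX_LENGTH" \
           << endl;                                                   \
      exit(1);                                                        \
    }                                                                 \
  }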
void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable)
{
  cerr << "Loading lexical translation table from " << fileName;
  ifstream inFile;
  inFile.open(fileName.c_str());
  if (inFile.fail()) {
    cerr << " - ERROR: could not open file\n";
    exit(1);
  }
  istream *inFileP = &inFile;
  char line[LINE_MAX_LENGTH];

  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (inFileP->eof()) break;

    vector<string> token = tokenize( line );
    if (token.size() != 4) {
      cerr << "line " << i << " in " << fileName
           << " has wrong number of tokens, skipping:\n"
           << token.size() << " " << token[0] << " " << line << endl;
      continue;
    }

    double joint = atof( token[2].c_str() );
    double marginal = atof( token[3].c_str() );
    Word wordT, wordS;
    wordT.CreateFromString(Output, m_output, token[0], false);
    wordS.CreateFromString(Input, m_input, token[1], false);
    ltable->joint[ wordS ][ wordT ] = joint;
    ltable->marginal[ wordS ] = marginal;
  }
  cerr << endl;
}
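The table stores a joint value per (source, target) pair and a per-source marginal, so the natural query is a conditional lexical probability p(t|s) = joint(s,t) / marginal(s). The self-contained sketch below illustrates that lookup; SimpleLexicalTable and GetLexicalProb are hypothetical stand-ins (strings instead of the Word class, plain maps instead of the real lexicalTable), not project API.

#include <map>
#include <string>

// Simplified stand-in for lexicalTable: joint[s][t] and marginal[s].
struct SimpleLexicalTable {
  std::map<std::string, std::map<std::string, double> > joint;
  std::map<std::string, double> marginal;
};

// p(t|s) = joint(s,t) / marginal(s); 0 if the pair or source is unseen.
double GetLexicalProb(const SimpleLexicalTable &t,
                      const std::string &s, const std::string &tgt)
{
  std::map<std::string, std::map<std::string, double> >::const_iterator
      row = t.joint.find(s);
  if (row == t.joint.end()) return 0.0;
  std::map<std::string, double>::const_iterator cell = row->second.find(tgt);
  if (cell == row->second.end()) return 0.0;
  std::map<std::string, double>::const_iterator m = t.marginal.find(s);
  if (m == t.marginal.end() || m->second == 0.0) return 0.0;
  return cell->second / m->second;
}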
void DTable::load( string fileName )
{
  ifstream inFile;
  inFile.open(fileName.c_str());
  if (inFile.fail()) {
    cerr << "ERROR: could not open file " << fileName << "\n";
    exit(1);
  }
  istream *inFileP = &inFile;
  char line[TABLE_LINE_MAX_LENGTH];

  int i=0;
  while(true) {
    i++;
    SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n');
    if (inFileP->eof()) break;

    vector<string> token = tokenize( line );
    if (token.size() < 2) {
      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
      continue;
    }

    // first column: distortion distance; second column: probability,
    // stored internally as a log-probability
    int d = atoi( token[0].c_str() );
    double prob = log( atof( token[1].c_str() ) );
    dtable[ d ] = prob;
  }
}
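Since the probabilities are stored as logs keyed by distortion distance, a decoder-side lookup reduces to a map find with some floor for unseen distances. A minimal sketch, assuming dtable is a map<int,double> as the assignment above suggests; GetDistortionCost and the negative-infinity floor are illustrative choices, not taken from the project.

#include <map>
#include <limits>

// Hypothetical lookup against a dtable-style map of log-probabilities.
double GetDistortionCost(const std::map<int, double> &dtable, int d)
{
  std::map<int, double>::const_iterator it = dtable.find(d);
  if (it == dtable.end())
    return -std::numeric_limits<double>::infinity();  // unseen distance
  return it->second;  // already stored as log( p )
}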
SuffixArray::SuffixArray( string fileName )
{
  m_vcb.StoreIfNew( "<uNk>" );
  m_endOfSentence = m_vcb.StoreIfNew( "<s>" );

  ifstream extractFile;
  char line[LINE_MAX_LENGTH];

  // count the number of words first
  extractFile.open(fileName.c_str());
  istream *fileP = &extractFile;
  m_size = 0;
  size_t sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    m_size += words.size() + 1;
    sentenceCount++;
  }
  extractFile.close();
  cerr << m_size << " words (incl. sentence boundaries)" << endl;

  // allocate memory
  m_array          = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
  m_index          = (INDEX*)   calloc( sizeof( INDEX ), m_size );
  m_wordInSentence = (char*)    calloc( sizeof( char ), m_size );
  m_sentence       = (size_t*)  calloc( sizeof( size_t ), m_size );
  m_sentenceLength = (char*)    calloc( sizeof( char ), sentenceCount );

  // fill the array
  int wordIndex = 0;
  int sentenceId = 0;
  extractFile.open(fileName.c_str());
  fileP = &extractFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );

    // add to corpus vector
    corpus.push_back(words);

    // create SA
    vector< WORD_ID >::const_iterator i;
    for( i=words.begin(); i!=words.end(); i++) {
      m_index[ wordIndex ] = wordIndex;
      m_sentence[ wordIndex ] = sentenceId;
      m_wordInSentence[ wordIndex ] = i-words.begin();
      m_array[ wordIndex++ ] = *i;
    }
    m_index[ wordIndex ] = wordIndex;
    m_array[ wordIndex++ ] = m_endOfSentence;
    m_sentenceLength[ sentenceId++ ] = words.size();
  }
  extractFile.close();
  cerr << "done reading " << wordIndex << " words, "
       << sentenceId << " sentences." << endl;
  // List(0,9);

  // sort
  m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
  Sort( 0, m_size-1 );
  free( m_buffer );
  cerr << "done sorting" << endl;
}
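Once m_index is sorted, every occurrence of a word (or phrase) sits in one contiguous range of the index, so queries become binary searches. The standalone sketch below shows the idea for a single word; Count() is a hypothetical illustration, not the class's real query method, which would compare successive words of each suffix to extend the same scheme to phrases.

#include <cstddef>
#include <vector>

typedef unsigned int WORD_ID;
typedef unsigned int INDEX;

// Count occurrences of `word` in a corpus `array`, given `index`:
// suffix start positions sorted by the suffix they point at.
size_t Count(const std::vector<WORD_ID> &array,
             const std::vector<INDEX> &index,
             WORD_ID word)
{
  // lower bound: first suffix whose first word is >= word
  size_t lo = 0, hi = index.size();
  while (lo < hi) {
    size_t mid = (lo + hi) / 2;
    if (array[index[mid]] < word) lo = mid + 1; else hi = mid;
  }
  size_t first = lo;

  // upper bound: first suffix whose first word is > word
  hi = index.size();
  while (lo < hi) {
    size_t mid = (lo + hi) / 2;
    if (array[index[mid]] <= word) lo = mid + 1; else hi = mid;
  }
  return lo - first;
}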