Example 1
void TargetCorpus::Create( string fileName )
{
  ifstream textFile;
  char line[LINE_MAX_LENGTH];

  // count the number of words first
  textFile.open(fileName.c_str());
  if (textFile.fail()) {
    cerr << "ERROR: could not open " << fileName << endl;
    exit(1);
  }
  istream *fileP = &textFile;
  m_size = 0;
  m_sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    m_size += words.size();
    m_sentenceCount++;
  }
  textFile.close();
  cerr << m_size << " words" << endl;

  // allocate memory
  m_array = (WORD_ID*) calloc( m_size, sizeof( WORD_ID ) );
  m_sentenceEnd = (INDEX*) calloc( m_sentenceCount, sizeof( INDEX ) );

  // fill the array
  int wordIndex = 0;
  int sentenceId = 0;
  textFile.clear();  // reset eof/fail bits so the stream can be read again
  textFile.open(fileName.c_str());
  fileP = &textFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    vector< WORD_ID >::const_iterator i;

    for( i=words.begin(); i!=words.end(); i++) {
      m_array[ wordIndex++ ] = *i;
    }
    m_sentenceEnd[ sentenceId++ ] = wordIndex-1;
  }
  textFile.close();
  cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
}
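
All four examples read lines through a SAFE_GETLINE macro that is defined elsewhere in the Moses sources. The sketch below is an assumed reconstruction consistent with how the macro is called here (the five-argument variant in Example 2 only adds a file name to the diagnostic); it is not a verbatim copy of the Moses definition.

#include <cstdlib>
#include <iostream>

// Assumed reconstruction: read one line into a fixed buffer, recover from
// the "buffer filled before the delimiter" fail state, and abort with a
// diagnostic when a line is too long for the buffer.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {                      \
    (_IS).getline(_LINE, _SIZE, _DELIM);                               \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) (_IS).clear();   \
    if ((_IS).gcount() == (_SIZE) - 1) {                               \
      std::cerr << "line exceeds " << (_SIZE)                          \
                << " characters; raise LINE_MAX_LENGTH" << std::endl;  \
      exit(1);                                                         \
    }                                                                  \
  }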
Example 2
void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable)
{
  cerr << "Loading lexical translation table from " << fileName;
  ifstream inFile;
  inFile.open(fileName.c_str());
  if (inFile.fail()) {
    cerr << " - ERROR: could not open file\n";
    exit(1);
  }
  istream *inFileP = &inFile;

  char line[LINE_MAX_LENGTH];

  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (inFileP->eof()) break;

    vector<string> token = tokenize( line );
    if (token.size() != 4) {
      cerr << "line " << i << " in " << fileName
           << " has wrong number of tokens, skipping:\n"
           << token.size() << " " << token[0] << " " << line << endl;
      continue;
    }

    double joint = atof( token[2].c_str() );
    double marginal = atof( token[3].c_str() );
    Word wordT, wordS;
    wordT.CreateFromString(Output, m_output, token[0], false);
    wordS.CreateFromString(Input, m_input, token[1], false);
    ltable->joint[ wordS ][ wordT ] = joint;
    ltable->marginal[ wordS ] = marginal;
  }
  cerr << endl;
}
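
The loader above implies the shape of lexicalTable. Below is a minimal sketch of a compatible definition, assuming Word supports operator< so it can key a std::map; the actual Moses type may well differ (for instance, it may use hash maps).

#include <map>

// Hypothetical sketch, inferred from the loader above; not necessarily
// the actual Moses definition.
struct lexicalTable {
  std::map< Word, std::map< Word, double > > joint;  // joint[source][target]
  std::map< Word, double > marginal;                 // marginal[source]
};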
Example 3
void DTable::load( string fileName ) {
  ifstream inFile;
  inFile.open(fileName.c_str());
  if (inFile.fail()) {
    cerr << "ERROR: could not open " << fileName << endl;
    exit(1);
  }
  istream *inFileP = &inFile;

  char line[TABLE_LINE_MAX_LENGTH];
  int i=0;
  while(true) {
    i++;
    SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n');
    if (inFileP->eof()) break;

    vector<string> token = tokenize( line );
    if (token.size() < 2) {
      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
      continue;
    }

    int d = atoi( token[0].c_str() );
    double prob = log( atof( token[1].c_str() ) );
    dtable[ d ] = prob;
  }
}
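
DTable::load keys log-probabilities by distortion distance. The accessor below is a hypothetical sketch (not from the Moses source) of how such a table might be queried, with an assumed penalty for distances missing from the table.

#include <cmath>
#include <map>

// Hypothetical accessor, assuming dtable is a std::map<int, double> of
// log-probabilities. The fallback for unseen distances is an assumption.
double DTable::get( int distance ) const
{
  std::map< int, double >::const_iterator it = dtable.find( distance );
  if (it != dtable.end()) return it->second;
  return -std::abs( (double) distance );  // assumed linear distance penalty
}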
Example 4
SuffixArray::SuffixArray( string fileName )
{
  m_vcb.StoreIfNew( "<uNk>" );
  m_endOfSentence = m_vcb.StoreIfNew( "<s>" );

  ifstream extractFile;
  char line[LINE_MAX_LENGTH];

  // count the number of words first
  extractFile.open(fileName.c_str());
  if (extractFile.fail()) {
    cerr << "ERROR: could not open " << fileName << endl;
    exit(1);
  }
  istream *fileP = &extractFile;
  m_size = 0;
  size_t sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    m_size += words.size() + 1;
    sentenceCount++;
  }
  extractFile.close();
  cerr << m_size << " words (incl. sentence boundaries)" << endl;

  // allocate memory
  m_array = (WORD_ID*) calloc( m_size, sizeof( WORD_ID ) );
  m_index = (INDEX*) calloc( m_size, sizeof( INDEX ) );
  m_wordInSentence = (char*) calloc( m_size, sizeof( char ) );
  m_sentence = (size_t*) calloc( m_size, sizeof( size_t ) );
  m_sentenceLength = (char*) calloc( sentenceCount, sizeof( char ) );

  // fill the array
  int wordIndex = 0;
  int sentenceId = 0;
  extractFile.clear();  // reset eof/fail bits so the stream can be read again
  extractFile.open(fileName.c_str());
  fileP = &extractFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );

    // add to corpus vector
    corpus.push_back(words);

    // fill the per-word bookkeeping arrays the suffix array is built over
    vector< WORD_ID >::const_iterator i;
    for( i=words.begin(); i!=words.end(); i++) {
      m_index[ wordIndex ] = wordIndex;
      m_sentence[ wordIndex ] = sentenceId;
      m_wordInSentence[ wordIndex ] = i-words.begin();
      m_array[ wordIndex++ ] = *i;
    }
    m_index[ wordIndex ] = wordIndex;
    m_sentence[ wordIndex ] = sentenceId;          // boundary token belongs to this sentence
    m_wordInSentence[ wordIndex ] = words.size();  // position just past the last word
    m_array[ wordIndex++ ] = m_endOfSentence;
    m_sentenceLength[ sentenceId++ ] = words.size();
  }
  extractFile.close();
  cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
  // List(0,9);

  // sort
  m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
  Sort( 0, m_size-1 );
  free( m_buffer );
  cerr << "done sorting" << endl;
}
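
The constructor ends by calling Sort(0, m_size-1), which orders the suffix start positions in m_index using m_buffer as scratch space. The sketch below is a reconstruction consistent with those calls: a recursive merge sort that compares suffixes word by word. The helper CompareIndex is assumed here, and the real Moses implementation may differ.

#include <cstring>

// Assumed reconstruction: merge sort over suffix start positions.
void SuffixArray::Sort( INDEX start, INDEX end )
{
  if (start == end) return;             // a single suffix is already sorted
  INDEX mid = (start + end) / 2;
  Sort( start, mid );
  Sort( mid + 1, end );

  // merge the two sorted halves into m_buffer, then copy back
  INDEX i = start, j = mid + 1, k = 0;
  while (i <= mid && j <= end)
    m_buffer[ k++ ] = (CompareIndex( m_index[i], m_index[j] ) <= 0)
                      ? m_index[ i++ ] : m_index[ j++ ];
  while (i <= mid) m_buffer[ k++ ] = m_index[ i++ ];
  while (j <= end) m_buffer[ k++ ] = m_index[ j++ ];
  memcpy( m_index + start, m_buffer, k * sizeof( INDEX ) );
}

// Compare two suffixes lexicographically by WORD_ID; a suffix that is a
// proper prefix of the other sorts first. (Assumed helper.)
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
  for ( INDEX offset = 0; ; offset++ ) {
    if (a + offset >= m_size) return -1;
    if (b + offset >= m_size) return 1;
    WORD_ID wa = m_array[ a + offset ];
    WORD_ID wb = m_array[ b + offset ];
    if (wa < wb) return -1;
    if (wa > wb) return 1;
  }
}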