void computeCountOfCounts( char* fileNameExtract ) { cerr << "computing counts of counts"; for(int i=1;i<=GT_MAX;i++) countOfCounts[i] = 0; ifstream extractFile; extractFile.open( fileNameExtract ); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // loop through all extracted phrase translations int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->addToCount( line ); continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment *phrasePair = new PhraseAlignment(); phrasePair->create( line, i ); if (i == 1) { lastPhrasePair = phrasePair; continue; } // only differs in count? just add count if (lastPhrasePair->match( *phrasePair )) { lastPhrasePair->count += phrasePair->count; phrasePair->clear(); delete(phrasePair); continue; } // periodically house cleaning if (phrasePair->GetSource() != lastPhrasePair->GetSource()) { phraseTableT.clear(); // these would get too big phraseTableS.clear(); // these would get too big // process line again, since phrase tables flushed phrasePair->clear(); phrasePair->create( line, i ); } int count = lastPhrasePair->count + 0.99999; if(count <= GT_MAX) countOfCounts[ count ]++; lastPhrasePair->clear(); delete( lastPhrasePair ); lastPhrasePair = phrasePair; } delete lastPhrasePair; discountFactor[0] = 0.01; // floor cerr << "\n"; for(int i=1;i<GT_MAX;i++) { discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)); cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i]; // some smoothing... if (discountFactor[i]>1) discountFactor[i] = 1; if (discountFactor[i]<discountFactor[i-1]) discountFactor[i] = discountFactor[i-1]; cerr << " -> " << discountFactor[i]*i << endl; } }
void computeCountOfCounts(const string& fileNameExtract) { if (fileNameExtract == "-") { cerr << "The ‘GoodTuring Discounting’ option may not be used with piped input!" << endl; exit(9); } cerr << "computing counts of counts"; for (size_t i=1; i<=GT_MAX; countOfCounts[i++] = 0); Bz2LineReader extractFile(fileNameExtract); // loop through all extracted phrase translations int i=0; string lastLine; PhraseAlignment *lastPhrasePair = NULL; for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) { if (line.empty()) break; if ((++i)%10000000 == 0) cerr << "[" << i << "]" << endl; else if (i % 100000 == 0) cerr << "," << flush; // identical to last line? just add count if (line == lastLine) { lastPhrasePair->addToCount(line); continue; } lastLine = line; // create new phrase pair PhraseAlignment *phrasePair = new PhraseAlignment(); vector<string> lineVector = tokenize(line.c_str()); phrasePair->create(lineVector, i); if (i == 1) { lastPhrasePair = phrasePair; continue; } // only differs in count? just add count if (lastPhrasePair->match( *phrasePair )) { lastPhrasePair->count += phrasePair->count; phrasePair->clear(); delete(phrasePair); continue; } // periodically house cleaning if (phrasePair->source != lastPhrasePair->source) { phraseTableT.clear(); // these would get too big phraseTableS.clear(); // these would get too big // process line again, since phrase tables flushed phrasePair->clear(); phrasePair->create(lineVector, i); } int count = lastPhrasePair->count + 0.99999; if(count <= GT_MAX) ++countOfCounts[ count ]; lastPhrasePair->clear(); delete( lastPhrasePair ); lastPhrasePair = phrasePair; } discountFactor[0] = 0.01; // floor cerr << "\n"; for(int i=1;i<GT_MAX; ++i) { discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)); cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i]; // some smoothing... if (discountFactor[i]>1) discountFactor[i] = 1; if (discountFactor[i]<discountFactor[i-1]) discountFactor[i] = discountFactor[i-1]; cerr << " -> " << discountFactor[i]*i << endl; } }