int Clusters:: reclusterNeyEssen() { cerr << "Calculating MLE for prior probabilities" << endl; vector<double> prior(numberClasses,0.0l); double xxx = 1.0l/(double)numberTypes; for (int i = 0; i < numberTypes; i++){ int c = classVector[i]; prior[c] += xxx; } for (int i = 0; i < numberClasses;i++) cerr << i << " " << prior[i] << endl; if (numberStates > 0){ cerr << "Training all the HMMs" << endl; for (int c = 0; c < numberClasses; c++){ // cerr << "Training HMM " << c << endl; HMM* hmmPtr = hmms[c]; HMM* newHmmPtr = new HMM(numberStates, alphabetSize); for (int i = 0; i < numberTypes; i++){ if (classVector[i] == c){ // then this word is in the right class // so train it on word i const string & word = *(corpus.wordArray[i]); vector<int> v; hmmPtr->convertString(word,v); // FIXME double weight = 1.0l; if (USE_TRUE_WEIGHT){ weight = corpus.countArray[i]; } hmmPtr->emSingle(*newHmmPtr, weight, v); } } newHmmPtr->normalise(); hmms[c] = newHmmPtr; delete hmmPtr; } } int something = 0; for (int i = 0; i < numberTypes; i++){ // cerr << "Word " << i; int w = sortedWords[i]; //cerr << *(corpus.wordArray[w]) << endl; if (counts[w] > FREQ_CUTOFF){ //cerr << "Doing " << w << endl; if (bestCluster(w, prior)){ something++; } } } return something; }
// Builds the initial clustering for a corpus and indexes its token stream.
//
// Parameters:
//   numberClasses_  number of word classes to maintain.
//   corpus_         corpus supplying numberTypes, numberTokens and the token
//                   array `data`; a reference to it is kept in `corpus`.
//   numberStates_   HMM state count; when it is <= 0 no HMMs are created.
//   alphabetSize_   alphabet size passed to every HMM constructor.
//   randomised      true:  every type whose count exceeds FREQ_CUTOFF gets a
//                          class drawn with rand();
//                   false: the most frequent types each get a class of their
//                          own (classes 0 .. numberClasses-2).
//                   In both modes all remaining types stay in the last class,
//                   numberClasses-1.
//
// After construction:
//   counts[w]            number of occurrences of type w in `data`.
//   sortedWords          type ids ordered by decreasing count.
//   first[w] / next[i]   first[w] is the first position of w among positions
//                        0 .. numberTokens-2 (or -1), and next[i] is the next
//                        position holding the same type. Entries of next that
//                        are never linked keep the value numberTokens.
//   clusterBigrams(a,b)  number of adjacent token pairs whose classes are a, b.
//   clusterUnigrams[a]   number of tokens whose class is a.
//   hmms                 one randomised, normalised HMM per class when
//                        numberStates > 0.
//
// Note: `next` is allocated with new[]; releasing it is not done here.
Clusters::Clusters(int numberClasses_,
                   const SimpleCorpusOne & corpus_,
                   int numberStates_,
                   int alphabetSize_,
                   bool randomised)
  :
  numberClasses(numberClasses_),
  numberTypes(corpus_.numberTypes),
  numberTokens(corpus_.numberTokens),
  numberStates(numberStates_),
  alphabetSize(alphabetSize_),
  data(corpus_.data),
  corpus(corpus_),
  clusterBigrams(numberClasses_,numberClasses_)
{
  classVector.resize(numberTypes);
  counts.resize(numberTypes);
  sortedWords.resize(numberTypes);
  first.resize(numberTypes);
  clusterUnigrams.resize(numberClasses);

  // Every link starts out as numberTokens, a value no real position can take.
  next = new int[numberTokens];
  for (int i = 0; i < numberTokens; i++)
    next[i] = numberTokens;

  // Default assignment: everything lives in the last class until seeded below.
  for (int w = 0; w < numberTypes; w++){
    counts[w]=0;
    classVector[w] = numberClasses -1;
  }

  // counts are set
  for (int i = 0; i < numberTokens; i++)
    counts[data[i]]++;

  // now find the most frequent numberClasses -1 of them.
  // Pairs sort by count first and type id second, both ascending.
  vector< pair<int,int> > countsTable(numberTypes);
  for (int i = 0; i < numberTypes; i++){
    countsTable[i] = pair<int,int>(counts[i],i);
  }
  cerr << "Sorting words" << endl;
  sort(countsTable.begin(),countsTable.end());

  // Read the sorted table back to front so sortedWords is most-frequent first.
  for (int i = 0; i < numberTypes; i++){
    first[i] = -1;
    sortedWords[i] = countsTable[numberTypes - 1 - i].second;
  }

  if (randomised) {
    for (int i = 0; i < numberTypes; i++){
      if (counts[i] > FREQ_CUTOFF){
        // Scales rand() into the range [0, numberClasses).
        int rc = (int) (1.0 * numberClasses *rand()/(RAND_MAX+1.0));
        classVector[i] = rc;
      }
    }
  }
  else {
    // Stop at numberTypes as well: with more classes than word types the
    // unguarded loop would read past the end of sortedWords.
    for (int i = 0; i < numberClasses-1 && i < numberTypes; i++){
      classVector[sortedWords[i]]= i;
    }
  }

  // last[w] is the most recent position at which type w was seen.
  vector<int> last(numberTypes,0);
  cerr << "Indexing data" << endl;
  for (int i = 0; i < numberTokens-1; i++){
    int w = data[i];
    int w2 = data[i+1];
    // Report the offending position and id before the assertion fires.
    if (w2 < 0 || w2 > numberTypes -1){
      cerr << i+1 << " " << w2 << endl;
    }
    assert(w >= 0 && w < numberTypes);
    assert(w2 >= 0 && w2 < numberTypes);

    // Append position i to the occurrence chain of w.
    if (first[w] == -1){
      first[w] = i;
      last[w] = i;
    }
    else {
      next[last[w]] = i;
      last[w] = i;
    }

    int c1 = classVector[w];
    int c2 = classVector[w2];
    assert(c1 >= 0 && c1 < numberClasses);
    assert(c2 >= 0 && c2 < numberClasses);
    clusterBigrams(c1,c2)++;
    clusterUnigrams[c1]++;
  }
  cerr << "Finished indexing " << endl;

  // be careful: the loop above stops one token short, so the final token's
  // class is counted here. The guard keeps an empty corpus from reading
  // data[-1].
  if (numberTokens > 0){
    clusterUnigrams[classVector[data[numberTokens-1]]]++;
  }

  cerr << "Numberstates " << numberStates << endl;
  if (numberStates > 0){
    cerr << "Starting to do the HMMs" << endl;
    hmms.resize(numberClasses);
    for (int i = 0; i < numberClasses; i++){
      HMM* hmmPtr = new HMM(numberStates, alphabetSize);
      hmmPtr->randomise();
      hmmPtr->normalise();
      hmms[i] = hmmPtr;
    }
  }
}