/* Add one word to the dictionary. */ static int IndexWord(char *Word,int Follows) { WORD *thisWord,*lastWord; int wordIndex; if(Word != END_SENTENCE) { thisWord = FindWord(Word); if(!thisWord) { thisWord = AddWord(Word); if(!thisWord) Niall_Error("Out of memory."); } wordIndex = WordIndex(thisWord); } else wordIndex = -1; lastWord = GetWord(Follows); if(!lastWord) Niall_Error("Corrupted brain (Can't find last word)."); Associate(lastWord,wordIndex); return(wordIndex); }
// Returns the index of the next set bit after the given index. // Useful for quickly iterating through the set bits in a sparse vector. int BitVector::NextSetBit(int prev_bit) const { // Move on to the next bit. int next_bit = prev_bit + 1; if (next_bit >= bit_size_) return -1; // Check the remains of the word containing the next_bit first. int next_word = WordIndex(next_bit); int bit_index = next_word * kBitFactor; int word_end = bit_index + kBitFactor; uinT32 word = array_[next_word]; uinT8 byte = word & 0xff; while (bit_index < word_end) { if (bit_index + 8 > next_bit && byte != 0) { while (bit_index + lsb_index_[byte] < next_bit && byte != 0) byte = lsb_eroded_[byte]; if (byte != 0) return bit_index + lsb_index_[byte]; } word >>= 8; bit_index += 8; byte = word & 0xff; } // next_word didn't contain a 1, so find the next word with set bit. ++next_word; int wordlen = WordLength(); while (next_word < wordlen && (word = array_[next_word]) == 0) { ++next_word; bit_index += kBitFactor; } if (bit_index >= bit_size_) return -1; // Find the first non-zero byte within the word. while ((word & 0xff) == 0) { word >>= 8; bit_index += 8; } return bit_index + lsb_index_[word & 0xff]; }
/**
 * Train the averaged-perceptron parser model and write it to modelFile.
 *
 * Two passes over the event stream:
 *  1) stream all events, counting predicate (feature) occurrences and
 *     assigning a PID to each predicate seen at least config.featureCutoff
 *     times;
 *  2) convert each event into a training Case (vector of retained PIDs plus
 *     a class ID), dropping events with no retained features.
 * The model file receives, in order: the configuration header, the class
 * labels, the predicate labels, the trained parameters, and global info.
 *
 * @param sentenceReader  source of sentences turned into events
 * @param modelFile       path of the binary model file to (over)write
 */
void ApParser::train(SentenceReader* sentenceReader, char const* modelFile)
{
  WordIndex labelIndex;          // class name -> ClassID
  vector<string> labels;         // ClassID -> class name
  vector<string> predLabels;     // PID-1 -> predicate name (0 is bias)
  // collect events
  list<Tanl::Classifier::Event*> events;
  WordCounts predCount; // count predicate occurrences
  int evCount = 0;
  Tanl::Classifier::PID pID = 1; // leave 0 for bias
  // create inverted index of predicate names
  // used to create vector of pIDs
  EventStream eventStream(sentenceReader, &info);
  while (eventStream.hasNext()) {
    Tanl::Classifier::Event* ev = eventStream.next();
    events.push_back(ev);
    evCount++; // count them explicitly, since size() is costly
    // Progress ticker: '.' per 1000 events, '+' per 10000.
    if (config.verbose) {
      if (evCount % 10000 == 0)
        cerr << '+' << flush;
      else if (evCount % 1000 == 0)
        cerr << '.' << flush;
    }
    vector<string>& ec = ev->features; // ec = {p1, ... , pn}
    for (unsigned j = 0; j < ec.size(); j++) {
      string& pred = ec[j];
      // decide whether to retain it (# occurrences > cutoff)
      if (predIndex.find(pred.c_str()) == predIndex.end()) {
        // not yet among those retained
        WordCounts::iterator wcit = predCount.find(pred);
        // increment # of occurrences
        int count;
        if (wcit == predCount.end())
          count = predCount[pred] = 1;
        else
          count = ++wcit->second;
        if (count >= config.featureCutoff) {
          predLabels.push_back(pred); // accept it into predLabels
          predIndex[pred.c_str()] = pID++;
          // Once retained, the running count is no longer needed.
          predCount.erase(pred);
        }
      }
    }
  }
  if (config.verbose)
    cerr << endl;
  // build cases
  Cases cases;
  cases.reserve(evCount);
  int n = 0;
  Tanl::Classifier::ClassID oID = 0;
  while (!events.empty()) {
    Tanl::Classifier::Event* ev = events.front();
    events.pop_front();
    cases.push_back(Case());
    X& x = cases[n].first; // features
    // add features
    vector<string>& ec = ev->features; // ec = {p1, ... , pn}
    char const* c = ev->className.c_str();
    for (unsigned j = 0; j < ec.size(); j++) {
      string& pred = ec[j];
      // Keep only predicates that survived the frequency cutoff.
      WordIndex::const_iterator pit = predIndex.find(pred.c_str());
      if (pit != predIndex.end()) {
        x.push_back(pit->second);
      }
    }
    // Events with no retained features are dropped (n is not advanced,
    // so the placeholder Case pushed above is reused by the next event).
    if (x.size()) {
      if (labelIndex.find(c) == labelIndex.end()) {
        labelIndex[c] = oID++;
        labels.push_back(c);
      }
      cases[n].second = labelIndex[c];
      n++;
      if (config.verbose) {
        if (n % 10000 == 0)
          cerr << '+' << flush;
        else if (n % 1000 == 0)
          cerr << '.' << flush;
      }
      x.push_back(0); // bias
    }
    delete ev;
  }
  // Trim the trailing unused placeholder Cases left by dropped events.
  cases.resize(n);
  if (config.verbose)
    cerr << endl;
  int predSize = predLabels.size();
  predSize++; // bias
  APSV ap(labels.size(), predSize);
  ofstream ofs(modelFile, ios::binary | ios::trunc);
  // dump configuration settings
  config.writeHeader(ofs);
  // dump labels
  ofs << labels.size() << endl;
  FOR_EACH (vector<string>, labels, pit)
    ofs << *pit << endl;
  // dump predLabels
  ofs << predLabels.size() << endl;
  FOR_EACH (vector<string>, predLabels, pit)
    ofs << *pit << endl;
  // free memory
  predIndex.clear();
  WordIndex().swap(predIndex); // STL map do not deallocate. resize(0) has no effect
  labelIndex.clear();
  WordIndex().swap(labelIndex);
  // clear memory for unfrequent entities
  info.clearRareEntities();
  // perform training
  // NOTE(review): `iter` is member/outer state — presumably the configured
  // number of training iterations; confirm where it is set.
  ap.train(cases, iter);
  // dump parameters
  ap.save(ofs);
  // dump global info
  info.save(ofs);
}
/* Correct a spelling */ void Niall_CorrectSpelling(char *Original, char *Correct) { WORD *OrigWord,*CorrWord,*Word; int OrigIndex,CorrIndex; ASCN *Assoc,*nextAscn,*prevAscn; int i; /* Clean up the words */ StripPunctuation(Original); StripPunctuation(Correct); /* Check they are not empty */ if((strlen(Original)==0)||(strlen(Correct)==0)) { Niall_Warning("You must enter a word to be corrected and a corrected version."); return; } /* Check they aren't the same */ if(!strcmp(Original,Correct)) { Niall_Warning("The words are the same!"); return; } /* Find the original (mis-spelt) word */ OrigWord=FindWord(Original); if(OrigWord==NULL) { Niall_Warning("Can't find word '%s' in dictionary.",Original); return; } /* Check if the corrected version already exists */ CorrWord=FindWord(Correct); if(CorrWord==NULL) { /* This is the easy one. Just replace the word. */ free(OrigWord->Data); OrigWord->Data = calloc(sizeof(char),strlen(Correct)+1); strcpy(OrigWord->Data,Correct); } else { /* More complex. Any links to the incorrect word must be ** destroyed and re-made for the correct word. Links from ** the incorrect word must be applied to the correct word. ** The incorrect word must be removed from the dictionary, ** and all links updated to reflect the change of index. */ OrigIndex = WordIndex(OrigWord); CorrIndex = WordIndex(CorrWord); /* Recreate associations to the incorrect word. */ for(Word=WordList;Word;Word=Word->Next) { for(Assoc=Word->Associations;Assoc;Assoc=Assoc->Next) { if(Assoc->Word != OrigIndex) continue; /* Unlink the association. */ if(Assoc == Word->Associations) { Word->Associations = Assoc->Next; } else { for(prevAscn=Word->Associations;prevAscn;prevAscn=prevAscn->Next) { if(Assoc == prevAscn->Next) { prevAscn->Next = Assoc->Next; break; } } } /* Re-make the association on the correct word */ for(i=0;i<Assoc->Probability;i++) Associate(Word,CorrIndex); /* Free the association. */ free(Assoc); break; } } /* Copy old associations to the correct word. 
*/ for(Assoc=OrigWord->Associations;Assoc;Assoc=Assoc->Next) { for(i=0;i<Assoc->Probability;i++) Associate(CorrWord,Assoc->Word); } /* Delete the old associations. */ for(Assoc=OrigWord->Associations;Assoc;Assoc=nextAscn) { nextAscn=Assoc->Next; free(Assoc); } OrigWord->Associations=NULL; /* Unlink and free the old word. */ if(OrigWord == WordList) { WordList = OrigWord->Next; } else { for(Word=WordList;Word;Word=Word->Next) { if(OrigWord == Word->Next) { Word->Next = OrigWord->Next; break; } } } free(OrigWord->Data); free(OrigWord); /* Update the indexes in all associations. */ for(Word=WordList;Word;Word=Word->Next) { for(Assoc=Word->Associations;Assoc;Assoc=Assoc->Next) { if(Assoc->Word > OrigIndex) Assoc->Word--; } } } }