Example #1
0
/* Add one word to the dictionary.
*/
static int IndexWord(char *Word,int Follows)
{
	WORD *thisWord,*lastWord;
	int wordIndex;

	if(Word != END_SENTENCE)
	{
		thisWord = FindWord(Word);
		if(!thisWord)
		{
			thisWord = AddWord(Word);
			if(!thisWord) Niall_Error("Out of memory.");
		}
		wordIndex = WordIndex(thisWord);	
	}
	else wordIndex = -1;

	lastWord = GetWord(Follows);
	if(!lastWord) Niall_Error("Corrupted brain (Can't find last word).");
	Associate(lastWord,wordIndex);

	return(wordIndex);
}
// Returns the index of the next set bit after the given index.
// Useful for quickly iterating through the set bits in a sparse vector.
int BitVector::NextSetBit(int prev_bit) const {
  // Move on to the next bit.
  int next_bit = prev_bit + 1;
  if (next_bit >= bit_size_) return -1;
  // Check the remains of the word containing the next_bit first.
  int next_word = WordIndex(next_bit);
  int bit_index = next_word * kBitFactor;
  int word_end = bit_index + kBitFactor;
  uinT32 word = array_[next_word];
  uinT8 byte = word & 0xff;
  while (bit_index < word_end) {
    if (bit_index + 8 > next_bit && byte != 0) {
      while (bit_index + lsb_index_[byte] < next_bit && byte != 0)
        byte = lsb_eroded_[byte];
      if (byte != 0)
        return bit_index + lsb_index_[byte];
    }
    word >>= 8;
    bit_index += 8;
    byte = word & 0xff;
  }
  // next_word didn't contain a 1, so find the next word with set bit.
  ++next_word;
  int wordlen = WordLength();
  while (next_word < wordlen && (word = array_[next_word]) == 0) {
    ++next_word;
    bit_index += kBitFactor;
  }
  if (bit_index >= bit_size_) return -1;
  // Find the first non-zero byte within the word.
  while ((word & 0xff) == 0) {
    word >>= 8;
    bit_index += 8;
  }
  return bit_index + lsb_index_[word & 0xff];
}
Example #3
0
// Trains an APSV model on the events read from sentenceReader and writes the
// resulting model to modelFile.
//
// The work is done in three passes:
//  1. Every event is read from an EventStream and queued. A predicate is
//     retained (given the next PID and added to predIndex) once it has been
//     seen config.featureCutoff times; PID 0 is kept free for the bias.
//  2. Each queued event is turned into a case holding the PIDs of its retained
//     predicates plus the bias PID, labelled with the ID of its class name.
//     Events without any retained predicate are dropped. Events are deleted
//     as they are consumed.
//  3. The configuration header, the class labels and the predicate labels are
//     written to modelFile, the lookup indexes are released, the model is
//     trained, and its parameters and the global info are appended.
//
// With config.verbose set, progress marks are written to cerr: '.' every 1000
// items and '+' every 10000.
void ApParser::train(SentenceReader* sentenceReader, char const* modelFile)
{
  WordIndex                     classIds;    // class name -> class ID
  vector<string>                classNames;  // class names, in ID order

  vector<string>                keptPreds;   // retained predicates, in PID order

  // Pass 1: collect the events.
  list<Tanl::Classifier::Event*> pending;

  WordCounts                    tally;       // occurrences of predicates not yet retained
  int numEvents = 0;
  Tanl::Classifier::PID nextPID = 1;         // 0 is left for the bias
  EventStream stream(sentenceReader, &info);
  while (stream.hasNext()) {
    Tanl::Classifier::Event* event = stream.next();
    pending.push_back(event);
    ++numEvents;                // counted by hand, since size() is costly
    if (config.verbose) {
      if (numEvents % 10000 == 0)
        cerr << '+' << flush;
      else if (numEvents % 1000 == 0)
        cerr << '.' << flush;
    }
    vector<string>& feats = event->features;
    for (unsigned k = 0; k < feats.size(); k++) {
      string& name = feats[k];
      // Already retained: nothing left to count.
      if (predIndex.find(name.c_str()) != predIndex.end())
        continue;
      // Bump the number of occurrences seen so far.
      WordCounts::iterator hit = tally.find(name);
      int occurrences;
      if (hit != tally.end())
        occurrences = ++hit->second;
      else
        occurrences = tally[name] = 1;
      if (occurrences >= config.featureCutoff) {
        // Reached the cutoff: retain it and stop counting it.
        keptPreds.push_back(name);
        predIndex[name.c_str()] = nextPID++;
        tally.erase(name);
      }
    }
  }
  if (config.verbose)
    cerr << endl;

  // Pass 2: build the cases.
  Cases cases;
  cases.reserve(numEvents);
  int kept = 0;
  Tanl::Classifier::ClassID nextClassID = 0;
  while (!pending.empty()) {
    Tanl::Classifier::Event* event = pending.front();
    pending.pop_front();
    // One slot is appended per event, but the slot being filled is always
    // cases[kept]; surplus slots are trimmed by the resize() after the loop.
    cases.push_back(Case());
    X& active = cases[kept].first;
    vector<string>& feats = event->features;
    for (unsigned k = 0; k < feats.size(); k++) {
      WordIndex::const_iterator known = predIndex.find(feats[k].c_str());
      if (known != predIndex.end())
        active.push_back(known->second);
    }
    if (active.size()) {
      char const* className = event->className.c_str();
      if (classIds.find(className) == classIds.end()) {
        // First time this class is seen: give it the next ID.
        classIds[className] = nextClassID++;
        classNames.push_back(className);
      }
      cases[kept].second = classIds[className];
      kept++;
      if (config.verbose) {
        if (kept % 10000 == 0)
          cerr << '+' << flush;
        else if (kept % 1000 == 0)
          cerr << '.' << flush;
      }
      active.push_back(0);      // bias
    }
    delete event;
  }
  cases.resize(kept);
  if (config.verbose)
    cerr << endl;

  // Pass 3: write the model and train.
  int numPreds = keptPreds.size() + 1;  // one extra for the bias
  APSV ap(classNames.size(), numPreds);

  ofstream ofs(modelFile, ios::binary | ios::trunc);
  // configuration settings
  config.writeHeader(ofs);
  // class labels
  ofs << classNames.size() << endl;
  for (vector<string>::const_iterator it = classNames.begin();
       it != classNames.end(); ++it)
    ofs << *it << endl;
  // predicate labels
  ofs << keptPreds.size() << endl;
  for (vector<string>::const_iterator it = keptPreds.begin();
       it != keptPreds.end(); ++it)
    ofs << *it << endl;
  // Release the indexes; swapping with an empty temporary is used because
  // clear() alone is not relied upon to give the memory back.
  predIndex.clear();
  WordIndex().swap(predIndex);
  classIds.clear();
  WordIndex().swap(classIds);
  // drop infrequent entities
  info.clearRareEntities();
  // train
  ap.train(cases, iter);
  // parameters
  ap.save(ofs);
  // global info
  info.save(ofs);
}
Example #4
0
/* Correct a spelling
*/
void Niall_CorrectSpelling(char *Original, char *Correct)
{
	WORD *OrigWord,*CorrWord,*Word;
	int OrigIndex,CorrIndex;
	ASCN *Assoc,*nextAscn,*prevAscn;
	int i;

	/* Clean up the words
	*/
	StripPunctuation(Original);
	StripPunctuation(Correct);

	/* Check they are not empty
	*/
	if((strlen(Original)==0)||(strlen(Correct)==0))
	{
		Niall_Warning("You must enter a word to be corrected and a corrected version.");
		return;
	}

	/* Check they aren't the same
	*/
	if(!strcmp(Original,Correct))
	{
		Niall_Warning("The words are the same!");
		return;
	}

	/* Find the original (mis-spelt) word
	*/
	OrigWord=FindWord(Original);
	if(OrigWord==NULL)
	{
		Niall_Warning("Can't find word '%s' in dictionary.",Original);
		return;
	}

	/* Check if the corrected version already exists
	*/
	CorrWord=FindWord(Correct);
	if(CorrWord==NULL)
	{
		/* This is the easy one. Just replace the word.
		*/
		free(OrigWord->Data);
		OrigWord->Data = calloc(sizeof(char),strlen(Correct)+1);
		strcpy(OrigWord->Data,Correct);
	}
	else
	{
		/* More complex. Any links to the incorrect word must be
		** destroyed and re-made for the correct word. Links from
		** the incorrect word must be applied to the correct word.
		** The incorrect word must be removed from the dictionary,
		** and all links updated to reflect the change of index.
		*/
		OrigIndex = WordIndex(OrigWord);
		CorrIndex = WordIndex(CorrWord);

		/* Recreate associations to the incorrect word.
		*/
		for(Word=WordList;Word;Word=Word->Next)
		{
			for(Assoc=Word->Associations;Assoc;Assoc=Assoc->Next)
			{
				if(Assoc->Word != OrigIndex) continue;

				/* Unlink the association.
				*/
				if(Assoc == Word->Associations)
				{
					Word->Associations = Assoc->Next;
				}
				else
				{
					for(prevAscn=Word->Associations;prevAscn;prevAscn=prevAscn->Next)
					{
						if(Assoc == prevAscn->Next)
						{
							prevAscn->Next = Assoc->Next;
							break;
						}
					}
				}

				/* Re-make the association on the correct word
				*/
				for(i=0;i<Assoc->Probability;i++) Associate(Word,CorrIndex);

				/* Free the association.
				*/
				free(Assoc);
				break;
			}
		}


		/* Copy old associations to the correct word.
		*/
		for(Assoc=OrigWord->Associations;Assoc;Assoc=Assoc->Next)
		{
			for(i=0;i<Assoc->Probability;i++) Associate(CorrWord,Assoc->Word);
		}


		/* Delete the old associations.
		*/
		for(Assoc=OrigWord->Associations;Assoc;Assoc=nextAscn)
		{
			nextAscn=Assoc->Next;
			free(Assoc);
		}
		OrigWord->Associations=NULL;


		/* Unlink and free the old word.
		*/
		if(OrigWord == WordList)
		{
			WordList = OrigWord->Next;
		}
		else
		{
			for(Word=WordList;Word;Word=Word->Next)
			{
				if(OrigWord == Word->Next)
				{
					Word->Next = OrigWord->Next;
					break;
				}
			}
		}
		free(OrigWord->Data);
		free(OrigWord);


		/* Update the indexes in all associations.
		*/
		for(Word=WordList;Word;Word=Word->Next)
		{
			for(Assoc=Word->Associations;Assoc;Assoc=Assoc->Next)
			{
				if(Assoc->Word > OrigIndex) Assoc->Word--;
			}
		}
	}
}