Esempio n. 1
0
void bit_vector::resize(unsigned new_size, bool val) {
    if (new_size <= m_num_bits) {
        m_num_bits = new_size;
        return;
    }
 
    TRACE("bit_vector", tout << "expanding: " << new_size << " capacity: " << m_capacity << " num words: " 
          << num_words(new_size) << "\n";);
Esempio n. 2
0
		// Tells whether no bit is set: walks the num_words() words of
		// m_buf and stops at the first word that is not zero.
		bool none_set() const
		{
			int const word_count = num_words();
			int idx = 0;
			while (idx < word_count && m_buf[idx] == 0)
			{
				++idx;
			}
			// the scan only runs to the end when every word was zero
			return idx >= word_count;
		}
Esempio n. 3
0
// Sets up a manager for bit vectors of exactly num_bits bits: records the bit
// count, the number of words and bytes needed to store it, and the mask for
// the final word.
fixed_bit_vector_manager::fixed_bit_vector_manager(unsigned num_bits):
    m_alloc("fixed_bit_vector") {
    m_num_bits  = num_bits;
    m_num_words = num_words(num_bits);
    m_num_bytes = m_num_words * sizeof(unsigned);
    // m_mask keeps the low (num_bits % 32) bits; when that remainder is 0
    // every bit is kept. The 32 is hard-coded here.
    unsigned const tail_bits = m_num_bits % 32;
    m_mask = (tail_bits == 0) ? UINT_MAX : ((1U << tail_bits) - 1);
}
Esempio n. 4
0
febitvec::febitvec(size_t bits, bool val, bool padding) {
	risk_release_ownership();
	resize_no_init(bits);
	size_t nWords = num_words();
	std::fill_n(m_words, nWords, bm_uint_t(val ? -1 : 0));
	if (bits != m_capacity && padding != val) {
		assert(bits > 0);
		if (padding)
			m_words[nWords-1] = ~((bm_uint_t(-1) >> (m_capacity-bits)) - 1);
		else
		// Tells whether no bit is set: scans the num_words() words
		// returned by buf() and bails out on the first non-zero word.
		bool none_set() const
		{
			int const word_count = num_words();
			std::uint32_t const* cursor = buf();
			for (int left = word_count; left > 0; --left, ++cursor)
			{
				if (*cursor != 0) return false;
			}
			return true;
		}
Esempio n. 6
0
// Compares two bit vectors managed by this manager.
// Every word but the final one is compared in full; the final word is
// compared through last_word().
bool fixed_bit_vector_manager::equals(fixed_bit_vector const& a, fixed_bit_vector const& b) const {
    // The same object is trivially equal to itself.
    if (&a == &b)
        return true;

    unsigned const word_count = num_words();
    if (word_count == 0)
        return true;

    unsigned const full_words = word_count - 1;
    for (unsigned w = 0; w != full_words; ++w) {
        if (a.m_data[w] != b.m_data[w])
            return false;
    }
    return last_word(a) == last_word(b);
}
Esempio n. 7
0
// Returns true when every bit set in b is also set in a.
// Every word but the final one is checked in full; the final word is
// checked through last_word().
bool fixed_bit_vector_manager::contains(fixed_bit_vector const& a, fixed_bit_vector const& b) const {
    unsigned const word_count = num_words();
    if (word_count == 0)
        return true;

    unsigned const full_words = word_count - 1;
    for (unsigned w = 0; w != full_words; ++w) {
        // a bit of b that is missing from a disproves containment
        if ((b.m_data[w] & ~a.m_data[w]) != 0)
            return false;
    }
    unsigned const b_tail = last_word(b);
    unsigned const a_tail = last_word(a);
    return (b_tail & ~a_tail) == 0;
}
Esempio n. 8
0
    // Builds a document from a Concrete communication.
    //
    // The document id is taken from communication.id. Every sentence that has
    // a tokenization set (in every section that has a sentence list set) is
    // passed through word_pruner.prune(), and the returned tokens are appended
    // to words_ in traversal order. Sections without a sentence list and
    // sentences without a tokenization are skipped.
    Document< W >(const concrete::Communication& communication,
                  const WordPruner< W >& word_pruner) : id(communication.id) {
      // Bind by const reference: iterating by value deep-copies each Section
      // (with all of its sentences) and each Sentence just to read from them.
      for(const concrete::Section& section : communication.sectionList) {
        if(! section.__isset.sentenceList) continue;
        for(const concrete::Sentence& sentence : section.sentenceList) {
          if(! sentence.__isset.tokenization) continue;
          // NOTE(review): kept as a local copy because prune()'s parameter
          // type is not visible here; switch to a const reference if prune()
          // accepts one.
          concrete::Tokenization tokenization = sentence.tokenization;
          std::vector< W > toks_to_add = word_pruner.prune(tokenization);
          if(! toks_to_add.empty()) {
            words_.insert(words_.end(), toks_to_add.begin(), toks_to_add.end());
          }
        }
      }
      INFO << "Document " << this->id << " has " << num_words() << " words";
    }
Esempio n. 9
0
// Adds up num_words(i) over every index i of words_ and returns the sum.
int Synset::total_words() const {
  int total = 0;
  int index = 0;
  while (index < (int)words_.size()) {
    total += num_words(index);
    ++index;
  }
  return total;
}
Esempio n. 10
0
// Returns the final word of bv with m_mask applied, or 0 when the manager
// has no words at all.
unsigned fixed_bit_vector_manager::last_word(fixed_bit_vector const& bv) const {
    unsigned const word_count = num_words();
    return (word_count == 0) ? 0 : (bv.m_data[word_count - 1] & m_mask);
}
Esempio n. 11
0
/* Removes the trailing newline that fgets() leaves on a complete line.
   Text that does not end in '\n' (an unterminated last line, or a line longer
   than the read buffer) is left untouched so that no payload character is
   dropped. */
static void tagger_chomp(char *text)
{
	size_t length = strlen(text);

	if (length > 0 && text[length - 1] == '\n')
		text[length - 1] = '\0';
}

/* Cuts the next whitespace-delimited field out of *cursor, in place.
   Leading whitespace is skipped, the field is NUL-terminated inside the
   buffer, and *cursor is advanced past it.  Returns the field, or NULL when
   nothing but whitespace is left.  The delimiter set is the one sscanf's %s
   uses in the "C" locale. */
static char *tagger_next_field(char **cursor)
{
	static const char blanks[] = " \t\n\v\f\r";
	char *field = *cursor + strspn(*cursor, blanks);
	char *stop;

	if (*field == '\0') {
		*cursor = field;
		return NULL;
	}
	stop = field + strcspn(field, blanks);
	if (*stop != '\0')
		*stop++ = '\0';
	*cursor = stop;
	return field;
}

/* Loads all tagger resources into freshly created containers.

   Inputs (none of the FILE handles is closed here):
     lexicon    - read three times (rewound with fseek in between): once to
                  size the hashes, once for word/lemma/tag, once for the
                  "word tag" pairs.  Lines are expected as: word lemma tag ...
     bigrams    - every non-blank line is stored verbatim in *bigramArray.
     lRuleFile  - lexical rules; every non-blank line is stored in
                  *lRuleArray, and goodright/fgoodright/goodleft/fgoodleft
                  rules also feed *good_right_hash / *good_left_hash.
     cRuleFile  - contextual rules; every non-blank line is stored in
                  *cRuleArray.

   Outputs (all created by this function, previous contents are overwritten):
     *lexicon_hash  - word -> third field of its lexicon line
     *lemma_hash    - word -> second field of its lexicon line
     *good_right_hash, *good_left_hash - rule word -> 1
     *seenTagging   - "word field" -> 1 for every field after the word
     *bigramArray, *lRuleArray, *cRuleArray - copies of the input lines

   Lexicon lines with fewer than three fields are skipped for
   lexicon_hash/lemma_hash, and rule lines with too few fields are kept in
   *lRuleArray but add nothing to the good-left/right hashes. */
void Tagger(FILE * lexicon, FILE * bigrams, FILE * lRuleFile,
	    FILE * cRuleFile, Registry * lexicon_hash,
	    Registry * lemma_hash, Registry * good_right_hash,
	    Registry * good_left_hash, Registry * seenTagging,
	    Darray * bigramArray, Darray * lRuleArray, Darray * cRuleArray)
{
	char line[MAXLINELEN];
	/* Sized like the line buffer: a "word tag" pair built from two distinct
	   tokens of one line cannot be longer than the line itself (assuming
	   perl_split returns pieces of its input unchanged); snprintf below
	   guards the bound regardless. */
	char space[MAXLINELEN];
	char *word, *lemma, *tag, *cursor;
	char **fields, **alt;
	const char *second, *third;
	char *atempstr;
	char *tempruleptr;
	int numLexiconEntries;

/*Added by Golam Mortuza Hossain */
	*lemma_hash = Registry_create(Registry_strcmp, Registry_strhash);
/* g.m.h */

	/* Benjamin Han 100400: time for creativity! */
	*lexicon_hash = Registry_create(Registry_strcmp, Registry_strhash);
	*good_right_hash =
	    Registry_create(Registry_strcmp, Registry_strhash);
	*good_left_hash =
	    Registry_create(Registry_strcmp, Registry_strhash);
	*seenTagging = Registry_create(Registry_strcmp, Registry_strhash);
	*lRuleArray = Darray_create();
	*cRuleArray = Darray_create();
	*bigramArray = Darray_create();

	/* lexicon hash stores the most likely tag for all known words.
	   we can have a separate wordlist and lexicon file because unsupervised
	   learning can add to wordlist, while not adding to lexicon.  For
	   example, if a big untagged corpus is about to be tagged, the wordlist
	   can be extended to include words in that corpus, while the lexicon
	   remains static.  Lexicon is file of form:
	   word t1 t2 ... tn
	   where t1 is the most likely tag for the word, and t2...tn are alternate
	   tags, in no particular order. */
	/* read through once to get size */
	numLexiconEntries = 0;
	while (fgets(line, sizeof(line), lexicon) != NULL) {
		if (not_just_blank(line))
			tagger_chomp(line);
		numLexiconEntries += num_words(line);
	}

	fseek(lexicon, 0L, SEEK_SET);

	/* just need word and most likely tag from lexicon (first tag entry) */
	/* Benjamin Han 100400: originally it's hinted by the # of lines in lexicon
	   file */
	Registry_size_hint(*lexicon_hash, numLexiconEntries);
/*Added by Golam Mortuza Hossain */
	Registry_size_hint(*lemma_hash, numLexiconEntries);
/* g.m.h */

	while (fgets(line, sizeof(line), lexicon) != NULL) {
		if (!not_just_blank(line))
			continue;
		tagger_chomp(line);
/*Added by Golam Mortuza Hossain */
		/* The fields are cut out of the line buffer itself, so their
		   length is bounded by the line and no fixed-size word/lemma/tag
		   buffer can overflow. */
		cursor = line;
		word = tagger_next_field(&cursor);
		lemma = tagger_next_field(&cursor);
		tag = tagger_next_field(&cursor);
		/* A line without all three fields has no usable lemma/tag;
		   skip it instead of registering stale or uninitialized text. */
		if (word == NULL || lemma == NULL || tag == NULL)
			continue;
//      if ( strcmp ( word, lemma) != 0 ) 
		Registry_add(*lemma_hash, (char *) mystrdup(word),
			     (char *) mystrdup(lemma));
/* It would have been much better to just use
 * "struct" and put "lemma" in lexicon hash. But
 * it does not seem to be working by simple hacking*/
/* g.m.h */
		Registry_add(*lexicon_hash,
			     (char *) mystrdup(word),
			     (char *) mystrdup(tag));
	}

	/* read in lexical rule file */
	while (fgets(line, sizeof(line), lRuleFile) != NULL) {
		if (!not_just_blank(line))
			continue;
		tagger_chomp(line);
		Darray_addh(*lRuleArray, mystrdup(line));
		fields = perl_split(line);
		/* Only look at a field when every field before it exists, so a
		   short rule line never hands NULL to strcmp. */
		second = (fields[0] != NULL) ? fields[1] : NULL;
		third = (second != NULL) ? fields[2] : NULL;
		if (second != NULL && strcmp(second, "goodright") == 0) {
			tempruleptr = mystrdup(fields[0]);
			Registry_add(*good_right_hash, tempruleptr,
				     (char *) 1);
		} else if (third != NULL
			   && strcmp(third, "fgoodright") == 0) {
			tempruleptr = mystrdup(fields[1]);
			Registry_add(*good_right_hash, tempruleptr,
				     (char *) 1);
		} else if (second != NULL
			   && strcmp(second, "goodleft") == 0) {
			tempruleptr = mystrdup(fields[0]);
			Registry_add(*good_left_hash, tempruleptr,
				     (char *) 1);
		} else if (third != NULL
			   && strcmp(third, "fgoodleft") == 0) {
			tempruleptr = mystrdup(fields[1]);
			Registry_add(*good_left_hash, tempruleptr,
				     (char *) 1);
		}
		free(*fields);
		free(fields);
	}

	/* read in bigram file */
	/* Benjamin Han 100400: I store the contents in bigramArray so
	   we don't have to do file IO everytime the start-state-tagger is
	   invoked. */
	while (fgets(line, sizeof(line), bigrams) != NULL) {
		if (!not_just_blank(line))
			continue;
		tagger_chomp(line);
		/* NOTE(review): allocation failure is not handled here. */
		atempstr =
		    (char *) malloc(sizeof(char) * (strlen(line) + 1));
		strcpy(atempstr, line);
		Darray_addh(*bigramArray, atempstr);
	}

	fseek(lexicon, 0L, SEEK_SET);

	/* read in the lexicon for the final-state-tagger */
	Registry_size_hint(*seenTagging, numLexiconEntries);

	/* Benjamin Han 100500: MISSING RESTRICT_MOVE section?
	   Answer: Brill used registry WORDS while I use lexicon_hash to replace
	   his WORDS (see POST::Run) - the only difference is in WORDS 
	   every value is 1 while in lexicon_hash a values is the first 
	   tag following the word in the lexicon file. */
	while (fgets(line, sizeof(line), lexicon) != NULL) {
		if (!not_just_blank(line))
			continue;
		tagger_chomp(line);
		fields = perl_split(line);
		/* Guard the first field so the walk below never starts past
		   the array's NULL terminator. */
		if (fields[0] != NULL) {
			for (alt = fields + 1; *alt != NULL; ++alt) {
				snprintf(space, sizeof(space), "%s %s",
					 fields[0], *alt);
				Registry_add(*seenTagging, mystrdup(space),
					     (char *) 1);
			}
		}
		free(*fields);
		free(fields);
	}

	/* read in contextual rule */
	while (fgets(line, sizeof(line), cRuleFile) != NULL) {
		if (not_just_blank(line)) {
			tagger_chomp(line);
			Darray_addh(*cRuleArray, mystrdup(line));
		}
	}
}