// Resize the vector to hold new_size bits.
// Shrinking (new_size <= m_num_bits) only lowers the logical bit count;
// the underlying storage and capacity are left untouched.
// NOTE(review): fragment is truncated here — the expansion path (and the
// use of `val` to initialize the new bits) continues beyond the visible
// source.
void bit_vector::resize(unsigned new_size, bool val) {
    if (new_size <= m_num_bits) {
        // Shrink: drop the logical size, keep the buffer.
        m_num_bits = new_size;
        return;
    }
    TRACE("bit_vector", tout << "expanding: " << new_size << " capacity: " << m_capacity << " num words: " << num_words(new_size) << "\n";);
bool none_set() const { const int words = num_words(); for (int i = 0; i < words; ++i) { if (m_buf[i] != 0) return false; } return true; }
// Manager for fixed-size bit vectors of num_bits bits.
// Precomputes the word/byte footprint and m_mask, the mask selecting the
// valid bits of the final 32-bit word. When num_bits is a multiple of 32
// every bit of the last word is significant, so the mask is all-ones.
fixed_bit_vector_manager::fixed_bit_vector_manager(unsigned num_bits):
    m_alloc("fixed_bit_vector") {
    m_num_bits  = num_bits;
    m_num_words = num_words(num_bits);
    m_num_bytes = m_num_words * sizeof(unsigned);
    unsigned const rest = m_num_bits % 32;
    // (1U << rest) - 1 is zero exactly when rest == 0, in which case the
    // whole last word is valid.
    m_mask = (rest == 0) ? UINT_MAX : ((1U << rest) - 1);
}
// Construct a bit vector of `bits` bits, all initialized to `val`.
// Storage is filled whole words at a time, so when `bits` does not fill
// the capacity exactly, the trailing (padding) bits are then forced to
// `padding` when it differs from `val`.
// NOTE(review): fragment is truncated — the `else` branch (padding == false)
// continues beyond the visible source.
febitvec::febitvec(size_t bits, bool val, bool padding) {
    risk_release_ownership();
    resize_no_init(bits);
    size_t nWords = num_words();
    // val ? all-ones words : all-zero words.
    std::fill_n(m_words, nWords, bm_uint_t(val ? -1 : 0));
    if (bits != m_capacity && padding != val) {
        assert(bits > 0);
        if (padding)
            // Set the top (m_capacity - bits) bits of the last word.
            m_words[nWords-1] = ~((bm_uint_t(-1) >> (m_capacity-bits)) - 1);
        else
// True iff no bit is set: every 32-bit word of the buffer reads zero.
bool none_set() const {
    std::uint32_t const* cursor = buf();
    int remaining = num_words();
    while (remaining-- > 0) {
        if (*cursor++ != 0)
            return false;
    }
    return true;
}
// Two vectors under this manager are equal iff every full word matches
// and the final words match after masking; bits beyond m_num_bits are
// ignored via last_word().
bool fixed_bit_vector_manager::equals(fixed_bit_vector const& a, fixed_bit_vector const& b) const {
    if (&a == &b)
        return true;
    unsigned const n = num_words();
    if (n == 0)
        return true;
    unsigned i = 0;
    while (i + 1 < n) {
        if (a.m_data[i] != b.m_data[i])
            return false;
        ++i;
    }
    // Only the valid bits of the last word participate in equality.
    return last_word(a) == last_word(b);
}
// Return true iff every bit set in b is also set in a (bitwise subset),
// comparing full words directly and the final words through last_word()
// so that dead bits past m_num_bits cannot affect the answer.
bool fixed_bit_vector_manager::contains(fixed_bit_vector const& a, fixed_bit_vector const& b) const {
    unsigned const n = num_words();
    if (n == 0)
        return true;
    for (unsigned i = 0; i + 1 < n; ++i) {
        unsigned const sub = b.m_data[i];
        if ((a.m_data[i] & sub) != sub)
            return false;
    }
    unsigned const tail = last_word(b);
    return (last_word(a) & tail) == tail;
}
// Build a document from a Concrete communication: walk every section's
// sentences, run each tokenization through word_pruner, and append the
// surviving tokens to words_ in document order. Sections without a
// sentence list and sentences without a tokenization are skipped.
//
// @param communication  source communication; its id becomes this->id.
// @param word_pruner    filter mapping a tokenization to the tokens to keep.
Document< W >(const concrete::Communication& communication, const WordPruner< W >& word_pruner) : id(communication.id) {
    // Bind by const& — the generated Thrift structs hold strings/vectors,
    // and the original by-value loop variables deep-copied each element.
    for(const concrete::Section& section : communication.sectionList) {
        if(! section.__isset.sentenceList) continue;
        for(const concrete::Sentence& sentence : section.sentenceList) {
            if(! sentence.__isset.tokenization) continue;
            // NOTE(review): kept as a copy in case prune() requires a
            // mutable Tokenization; tighten to const& if its signature allows.
            concrete::Tokenization tokenization = sentence.tokenization;
            std::vector< W > toks_to_add = word_pruner.prune(tokenization);
            if(! toks_to_add.empty()) {
                words_.insert(words_.end(), toks_to_add.begin(), toks_to_add.end());
            }
        }
    }
    INFO << "Document " << this->id << " has " << num_words() << " words";
}
// Sum num_words(i) over every entry of words_.
int Synset::total_words() const {
    int total = 0;
    const int count = (int)words_.size();
    for (int idx = 0; idx != count; ++idx) {
        total += num_words(idx);
    }
    return total;
}
// Return bv's final data word with bits beyond m_num_bits masked off;
// returns 0 when the manager holds zero words.
unsigned fixed_bit_vector_manager::last_word(fixed_bit_vector const& bv) const {
    unsigned const n = num_words();
    return (n == 0) ? 0 : (bv.m_data[n-1] & m_mask);
}
/*
 * Tagger: load every model resource for the Brill-style tagger in one call.
 *
 * Reads four already-open files and fills the output parameters:
 *   lexicon   -> *lexicon_hash (word -> most-likely tag),
 *                *lemma_hash   (word -> lemma),
 *                *seenTagging  (every "word tag" pair seen, value 1)
 *   lRuleFile -> *lRuleArray (raw lexical rules) plus *good_right_hash /
 *                *good_left_hash for goodright/goodleft-family rules
 *   bigrams   -> *bigramArray (raw bigram lines, kept in memory so the
 *                start-state tagger need not re-read the file)
 *   cRuleFile -> *cRuleArray (raw contextual rules)
 *
 * All Registry/Darray outputs are created here; keys/values inserted into
 * the registries are heap copies made with mystrdup/malloc.
 *
 * NOTE(review): every "line[strlen(line) - 1] = '\0'" assumes the line ends
 * with a newline (i.e. fits in MAXLINELEN) — an overlong line would lose its
 * last character instead. Verify MAXLINELEN against the model files.
 */
void Tagger(FILE * lexicon, FILE * bigrams, FILE * lRuleFile, FILE * cRuleFile,
            Registry * lexicon_hash, Registry * lemma_hash,
            Registry * good_right_hash, Registry * good_left_hash,
            Registry * seenTagging, Darray * bigramArray,
            Darray * lRuleArray, Darray * cRuleArray)
{
    char line[MAXLINELEN];
    char space[500];
    char word[MAXWORDLEN], tag[MAXTAGLEN];
    char bigram1[MAXWORDLEN], bigram2[MAXWORDLEN];   /* NOTE(review): unused in this function */
    char **perl_split_ptr, **perl_split_ptr2, *atempstr, **temp_perl_split_ptr;
    char *tempruleptr;
    char bigram_space[MAXWORDLEN * 2];               /* NOTE(review): unused in this function */
    int numLexiconEntries;

    /*Added by Golam Mortuza Hossain */
    char lemma[MAXWORDLEN];

    *lemma_hash = Registry_create(Registry_strcmp, Registry_strhash); /* g.m.h */

    /* Benjamin Han 100400: time for creativity! */
    *lexicon_hash = Registry_create(Registry_strcmp, Registry_strhash);
    *good_right_hash = Registry_create(Registry_strcmp, Registry_strhash);
    *good_left_hash = Registry_create(Registry_strcmp, Registry_strhash);
    *seenTagging = Registry_create(Registry_strcmp, Registry_strhash);
    *lRuleArray = Darray_create();
    *cRuleArray = Darray_create();
    *bigramArray = Darray_create();

    /* lexicon hash stores the most likely tag for all known words. we can
       have a separate wordlist and lexicon file because unsupervised
       learning can add to wordlist, while not adding to lexicon. For
       example, if a big untagged corpus is about to be tagged, the wordlist
       can be extended to include words in that corpus, while the lexicon
       remains static. Lexicon is file of form: word t1 t2 ... tn
       where t1 is the most likely tag for the word, and t2...tn are
       alternate tags, in no particular order. */

    /* read through once to get size */
    /* First pass: count the total number of whitespace-separated tokens to
       pre-size the registries, then rewind for the real pass. */
    for (numLexiconEntries = 0;
         fgets(line, sizeof(line), lexicon) != NULL;
         numLexiconEntries += num_words(line))
        if (not_just_blank(line))
            line[strlen(line) - 1] = '\0';   /* strip trailing newline */
    fseek(lexicon, (long) 0, SEEK_SET);

    /* just need word and most likely tag from lexicon (first tag entry) */
    /* Benjamin Han 100400: originally it's hinted by the # of lines in lexicon file */
    Registry_size_hint(*lexicon_hash, numLexiconEntries);

    /*Added by Golam Mortuza Hossain */
    Registry_size_hint(*lemma_hash, numLexiconEntries); /* g.m.h */

    /* Second pass: word -> lemma and word -> most-likely tag (first tag). */
    while (fgets(line, sizeof(line), lexicon) != NULL) {
        if (not_just_blank(line)) {
            line[strlen(line) - 1] = '\0';
            /*Added by Golam Mortuza Hossain */
            sscanf(line, "%s%s%s", word, lemma, tag);
            // if ( strcmp ( word, lemma) != 0 )
            Registry_add(*lemma_hash, (char *) mystrdup(word), (char *) mystrdup(lemma));
            /* It would have been much better to just use
             * "struct" and put "lemma" in lexicon hash. But
             * it does not seem to be working by simple hacking*/
            /* g.m.h */
            Registry_add(*lexicon_hash, (char *) mystrdup(word), (char *) mystrdup(tag));
        }
    }

    /* read in lexical rule file */
    /* Rules whose operator is goodright/goodleft (or the f-prefixed forms,
       whose operand sits one token later) also register that operand as a
       known good-right / good-left context word. Value is the sentinel
       (char *) 1 — the registry is used as a set here. */
    while (fgets(line, sizeof(line), lRuleFile) != NULL) {
        if (not_just_blank(line)) {
            line[strlen(line) - 1] = '\0';
            Darray_addh(*lRuleArray, mystrdup(line));
            perl_split_ptr = perl_split(line);
            temp_perl_split_ptr = perl_split_ptr;
            if (strcmp(perl_split_ptr[1], "goodright") == 0) {
                tempruleptr = mystrdup(perl_split_ptr[0]);
                Registry_add(*good_right_hash, tempruleptr, (char *) 1);
            }
            else if (strcmp(perl_split_ptr[2], "fgoodright") == 0) {
                tempruleptr = mystrdup(perl_split_ptr[1]);
                Registry_add(*good_right_hash, tempruleptr, (char *) 1);
            }
            else if (strcmp(perl_split_ptr[1], "goodleft") == 0) {
                tempruleptr = mystrdup(perl_split_ptr[0]);
                Registry_add(*good_left_hash, tempruleptr, (char *) 1);
            }
            else if (strcmp(perl_split_ptr[2], "fgoodleft") == 0) {
                tempruleptr = mystrdup(perl_split_ptr[1]);
                Registry_add(*good_left_hash, tempruleptr, (char *) 1);
            }
            /* NOTE(review): frees the first token and the pointer array only;
               the remaining token strings appear to leak — presumably
               perl_split allocates them individually. Verify its contract. */
            free(*perl_split_ptr);
            free(perl_split_ptr);
        }
    }

    /* read in bigram file */
    /* Benjamin Han 100400: I store the contents in bigramArray so we
       don't have to do file IO everytime the start-state-tagger is
       invoked. */
    while (fgets(line, sizeof(line), bigrams) != NULL) {
        if (not_just_blank(line)) {
            line[strlen(line) - 1] = '\0';
            atempstr = (char *) malloc(sizeof(char) * (strlen(line) + 1));
            strcpy(atempstr, line);
            Darray_addh(*bigramArray, atempstr);
        }
    }

    fseek(lexicon, (long) 0, SEEK_SET);

    /* read in the lexicon for the final-state-tagger */
    Registry_size_hint(*seenTagging, numLexiconEntries);

    /* Benjamin Han 100500: MISSING RESTRICT_MOVE section? Answer: Brill
       used registry WORDS while I use lexicon_hash to replace his WORDS
       (see POST::Run) - the only difference is in WORDS every value is 1
       while in lexicon_hash a values is the first tag following the word
       in the lexicon file. */
    /* Third pass over the lexicon: record every "word tag" combination. */
    while (fgets(line, sizeof(line), lexicon) != NULL) {
        if (not_just_blank(line)) {
            line[strlen(line) - 1] = '\0';
            perl_split_ptr = perl_split(line);
            perl_split_ptr2 = perl_split_ptr;
            ++perl_split_ptr;            /* perl_split_ptr2[0] is the word; the rest are its tags */
            while (*perl_split_ptr != NULL) {
                sprintf(space, "%s %s", *perl_split_ptr2, *perl_split_ptr);
                Registry_add(*seenTagging, mystrdup(space), (char *) 1);
                ++perl_split_ptr;
            }
            /* NOTE(review): same partial free pattern as above. */
            free(*perl_split_ptr2);
            free(perl_split_ptr2);
        }
    }

    /* read in contextual rule */
    while (fgets(line, sizeof(line), cRuleFile) != NULL)
        if (not_just_blank(line)) {
            line[strlen(line) - 1] = '\0';
            Darray_addh(*cRuleArray, mystrdup(line));
        }
}