bool SpellChecker::checkWordSpelling(const QString &word) { QString tmp = word; tmp = tmp.remove(QString::fromUtf8("»")); tmp = tmp.remove(QString::fromUtf8("«")); QByteArray ba = tmp.toUtf8(); return (aspell_speller_check(spell_checker1, ba.data(), ba.size()) != 0) || (aspell_speller_check((spell_checker2 != NULL ? spell_checker2 : spell_checker1), ba.data(), ba.size()) != 0); }
void permute(char* a, int l, int r) { int i; if (l == r) { int correct = aspell_speller_check(spell_checker, a, -1); if (correct) { printf("%s\n", a); } else if (!correct) { } else { fprintf(stderr, "Error: %s\n", aspell_speller_error_message(spell_checker)); } } else { for (i = l; i <= r; i++) { swap((a + l), (a + i)); permute(a, l + 1, r); swap((a + l), (a + i)); //backtrack } } }
/*
 * Checks the spelling of a word with the spellers attached to a buffer.
 *
 * Words shorter than the configured minimum length and words that
 * weechat_aspell_string_is_simili_number() recognises are not checked.
 * Spellers are tried in array order; the array is NULL-terminated.
 *
 * Returns:
 *   1: word is accepted (or was not checked)
 *   0: word is misspelled
 */
int
weechat_aspell_check_word (struct t_aspell_speller_buffer *speller_buffer,
                           const char *word)
{
    int min_length, idx;

    /* too short to be worth checking */
    min_length = weechat_config_integer (weechat_aspell_config_check_word_min_length);
    if ((min_length > 0) && ((int)strlen (word) < min_length))
        return 1;

    /* number-like words are never checked */
    if (weechat_aspell_string_is_simili_number (word))
        return 1;

    /* no speller at all: the word counts as misspelled */
    if (!speller_buffer->spellers)
        return 0;

    /* first speller that knows the word wins (order is important) */
    idx = 0;
    while (speller_buffer->spellers[idx])
    {
#ifdef USE_ENCHANT
        if (enchant_dict_check (speller_buffer->spellers[idx], word, strlen (word)) == 0)
            return 1;
#else
        if (aspell_speller_check (speller_buffer->spellers[idx], word, -1) == 1)
            return 1;
#endif /* USE_ENCHANT */
        idx++;
    }

    /* misspelled word! */
    return 0;
}
/**
 * Checks whether a word is spelled correctly.
 *
 * The word is first converted with spelling_conv(); the converted copy is
 * released with free() before returning on every path. A non-empty word
 * consisting only of digits is accepted without consulting Aspell.
 *
 * @param word  the word to check
 * @param s     spelling state providing the converter and the Aspell speller
 * @return 1 for an all-digit word, otherwise the result of
 *         aspell_speller_check()
 */
int spelling_correct(char *word, struct spelling *s)
{
	int ret;
	char *conv_word;
	size_t digits;

	conv_word = spelling_conv(s->conv, word);

	/* Count leading digits; the cast keeps isdigit() defined for bytes >= 0x80. */
	digits = 0;
	while (isdigit((unsigned char)conv_word[digits]))
		digits++;

	if (digits > 0 && conv_word[digits] == '\0') {
		/* Pure number: nothing to look up. */
		ret = 1;
	} else {
		ret = aspell_speller_check(s->speller, conv_word, -1);
	}

	/* Single exit so the converted copy is never leaked. */
	free(conv_word);
	return ret;
}
bool KAspellChecker::checkWord(const QString &word) { if (!m_speller) return true; return aspell_speller_check(m_speller, word.toLocal8Bit().data(), -1); }
bool CheckerString::next_misspelling() { if (off_end(cur_line_)) return false; if (has_repl_) { has_repl_ = false; CharVector word; bool correct = false; // FIXME: This is a hack to avoid trying to check a word with a space // in it. The correct action is to reparse to string and // check each word individually. However doing so involves // an API enhancement in Checker. for (int i = 0; i != real_word_size_; ++i) { if (asc_isspace(*(real_word_begin_ + i))) correct = true; } if (!correct) correct = aspell_speller_check(speller_, &*real_word_begin_, real_word_size_); diff_ += real_word_size_ - tok_.len; tok_.len = real_word_size_; if (!correct) return true; } while ((tok_ = checker_->next_misspelling()).len == 0) { next_line(cur_line_); diff_ = 0; if (off_end(cur_line_)) return false; checker_->process(cur_line_->real.data(), cur_line_->real.size()); } real_word_begin_ = cur_line_->real.begin() + tok_.offset + diff_; real_word_size_ = tok_.len; fix_display_str(); return true; }
bool ASpellChecker::isCorrect(const QString& word) { if(speller_) { int correct = aspell_speller_check(speller_, word.toUtf8().constData(), -1); return (correct != 0); } return true; }
bool SpellCheck::ok(const QString &word){ if (!spell_checker || word.isEmpty()) return true; int correct = aspell_speller_check(spell_checker, word.toAscii().constData(), -1); return (correct != 0); }
//__________________________________________________________________________ bool Speller::Aspell::Suggest::checkWord(const std::string& word) { bool status = true; if( aspell_speller_check( fspeller, word.c_str(), -1 ) == 0 ) { status = false; } return status; }
/*
 * Spell-checks the text between two iterators and applies the highlight tag
 * to that range when Aspell does not know the word.
 */
static void
check_word(GtkSpell *spell, GtkTextBuffer *buffer,
           GtkTextIter *start, GtkTextIter *end) {
	char *word = gtk_text_buffer_get_text(buffer, start, end, FALSE);

	if (debug)
		g_print("checking: %s\n", word);

	/* A zero result means the word is not in the dictionary. */
	if (!aspell_speller_check(spell->speller, word, -1))
		gtk_text_buffer_apply_tag(buffer, spell->tag_highlight, start, end);

	g_free(word);
}
/**
 * Ask the spell-checker whether a word is spelled correctly.
 * Returns TRUE only when a speller is available and it reports the word
 * as correct; a missing speller or any other result yields FALSE.
 */
Boolean spellcheck_test(void * chk, const char * word)
{
	struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk;

	if (!aspell || !aspell->speller)
		return FALSE;

	/* this can return -1 on failure, which is not treated as correct */
	if (aspell_speller_check(aspell->speller, word, -1) == 1)
		return TRUE;

	return FALSE;
}
bool ASpellChecker::isCorrect(const QString& word) { if(spellers_.isEmpty()) return true; foreach(AspellSpeller* speller, spellers_) { if (aspell_speller_check(speller, word.toUtf8().constData(), -1) != 0) return true; } return false; }
/**
 * Check a given word for correctness.
 * @param word the word to check
 * @return Qtrue if Aspell knows the word, Qfalse if it does not.
 *         Any other Aspell result raises cAspellError with the speller's
 *         error message.
 */
static VALUE aspell_check(VALUE self, VALUE word) {
    AspellSpeller *speller = get_speller(self);

    switch (aspell_speller_check(speller, STR2CSTR(word), -1)) {
    case 1:
        return Qtrue;
    case 0:
        return Qfalse;
    default:
        rb_raise(cAspellError, "%s", aspell_speller_error_message(speller));
    }

    return Qfalse;
}
int weechat_aspell_check_word (struct t_gui_buffer *buffer, struct t_aspell_speller_buffer *speller_buffer, const char *word) { const char *buffer_type, *buffer_nick, *buffer_channel; int i; /* word too small? then do not check word */ if ((weechat_config_integer (weechat_aspell_config_check_word_min_length) > 0) && ((int)strlen (word) < weechat_config_integer (weechat_aspell_config_check_word_min_length))) return 1; /* word is a number? then do not check word */ if (weechat_aspell_string_is_simili_number (word)) return 1; /* word is a nick of nicklist on this buffer? then do not check word */ if (weechat_nicklist_search_nick (buffer, NULL, word)) return 1; /* for "private" buffers, ignore self and remote nicks */ buffer_type = weechat_buffer_get_string (buffer, "localvar_type"); if (buffer_type && (strcmp (buffer_type, "private") == 0)) { /* check self nick */ buffer_nick = weechat_buffer_get_string (buffer, "localvar_nick"); if (buffer_nick && (weechat_strcasecmp (buffer_nick, word) == 0)) return 1; /* check remote nick */ buffer_channel = weechat_buffer_get_string (buffer, "localvar_channel"); if (buffer_channel && (weechat_strcasecmp (buffer_channel, word) == 0)) return 1; } /* check word with all spellers for this buffer (order is important) */ if (speller_buffer->spellers) { for (i = 0; speller_buffer->spellers[i]; i++) { #ifdef USE_ENCHANT if (enchant_dict_check (speller_buffer->spellers[i], word, strlen (word)) == 0) #else if (aspell_speller_check (speller_buffer->spellers[i], word, -1) == 1) #endif return 1; } } /* misspelled word! */ return 0; }
/**
 * Returns up to ten spelling suggestions for the word under the text cursor.
 *
 * The list is empty when either speller is missing, when no word is selected,
 * when one of the spellers accepts the word, or when Aspell cannot produce a
 * suggestion list. Suggestions come from spell_checker1 only.
 *
 * @return suggestions in the order Aspell provides them (at most 10)
 */
QStringList SpellChecker::suggestions()
{
    QStringList sl;
    if ((spell_checker1 == 0) || (spell_checker2 == 0))
        return sl;

    QTextCursor cursor = m_textEdit->textCursor();
    cursor.select(QTextCursor::WordUnderCursor);
    const QByteArray ba = cursor.selectedText().toUtf8();

    // Nothing under the cursor: there is nothing to suggest for.
    if (ba.isEmpty())
        return sl;

    // A word accepted by either speller needs no suggestions.
    if ((aspell_speller_check(spell_checker2, ba.data(), ba.size()) != 0)
        || (aspell_speller_check(spell_checker1, ba.data(), ba.size()) != 0))
        return sl;

    // aspell_speller_suggest() returns NULL on error; do not dereference it.
    const struct AspellWordList *awl =
        aspell_speller_suggest(spell_checker1, ba.data(), ba.size());
    if (awl == 0)
        return sl;

    if (aspell_word_list_size(awl) > 0) {
        struct AspellStringEnumeration *ase = aspell_word_list_elements(awl);
        int i = 0;
        while ((!aspell_string_enumeration_at_end(ase)) && (i < 10)) {
            const char *text = aspell_string_enumeration_next(ase);
            sl << QString::fromUtf8(text);
            i++;
        }
        delete_aspell_string_enumeration(ase);
    }
    return sl;
}
/**
 * Checks a word with the English or the Russian speller.
 *
 * The speller is chosen from the first character: a letter in 'a'..'z'
 * (after lower-casing) selects m_spell_checker_en, anything else selects
 * m_spell_checker_ru. The word is encoded with m_codec before the lookup.
 *
 * @param word the word to check
 * @return true for an empty word, when the selected speller is missing, or
 *         when Aspell returns a non-zero result
 */
bool SpellChecker::check(QString word)
{
	if (word.isEmpty())
		return true;

	const QChar first = word.at(0).toLower();
	const bool latin = (first >= QChar('a')) && (first <= QChar('z'));

	AspellSpeller * checker = latin ? m_spell_checker_en : m_spell_checker_ru;
	if (!checker)
		return true;

	const QByteArray encoded = m_codec->fromUnicode(word);
	return aspell_speller_check(checker, encoded.data(), -1) != 0;
}
int weechat_aspell_check_word (struct t_gui_buffer *buffer, const char *word) { struct t_aspell_speller *ptr_speller; int rc; rc = 0; /* word too small? then do not check word */ if ((weechat_config_integer (weechat_aspell_config_check_word_min_length) > 0) && ((int)strlen (word) < weechat_config_integer (weechat_aspell_config_check_word_min_length))) rc = 1; else { /* word is URL? then do not check word */ if (weechat_aspell_string_is_url (word)) rc = 1; else { /* word is a number? then do not check word */ if (weechat_aspell_string_is_simili_number (word)) rc = 1; else { /* word is a nick of nicklist on this buffer? then do not check word */ if (weechat_nicklist_search_nick (buffer, NULL, word)) rc = 1; else { /* check word with all spellers for this buffer (order is important) */ for (ptr_speller = weechat_aspell_spellers; ptr_speller; ptr_speller = ptr_speller->next_speller) { if (aspell_speller_check (ptr_speller->speller, word, -1) == 1) { rc = 1; break; } } } } } } return rc; }
void checkTheWord(char* word,int checkDoc) { #if 1 int correct; AspellWordList* suggestions; AspellStringEnumeration* elements; const char* suggestedword; int wordcnt=0; char* wordlist[100]; char* labeltext[512]; correct=aspell_speller_check(spellChecker,word,-1); if(!correct) { badWord=word; cancelCheck=false; if(spellCheckWord==NULL) buildWordCheck(checkDoc); else { for(int j=0; j<numWords; j++) gtk_combo_box_text_remove((GtkComboBoxText*)wordListDropbox,0); sprintf((char*)&labeltext,"Change <i><b>%s</b></i> to: ",badWord); gtk_label_set_text((GtkLabel*)badWordLabel,(char*)&labeltext); gtk_label_set_use_markup((GtkLabel*)badWordLabel,true); } suggestions=(AspellWordList*)aspell_speller_suggest(spellChecker,word,-1); elements=aspell_word_list_elements(suggestions); while((suggestedword=aspell_string_enumeration_next(elements))!=NULL) { wordlist[wordcnt]=strdup(suggestedword); gtk_combo_box_text_append_text((GtkComboBoxText*)wordListDropbox,wordlist[wordcnt]); wordcnt++; } numWords=wordcnt; delete_aspell_string_enumeration(elements); gtk_combo_box_set_active((GtkComboBox*)wordListDropbox,0); gtk_widget_show_all(spellCheckWord); gtk_dialog_run((GtkDialog *)spellCheckWord); } #endif }
/**
 * Checks a word against all registered Aspell checkers.
 *
 * A word without any non-digit character (regular expression \D finds
 * nothing) is always valid. Otherwise the word is valid as soon as one
 * checker returns a non-zero result, or when no checker is registered.
 *
 * @param word the word to check (passed to Aspell as UTF-8)
 * @return true if the word is considered valid
 */
bool SpellChecker::checkWord(QString word)
{
	// Digits only (or empty): nothing to spell-check.
	if (word.indexOf(QRegExp("\\D")) == -1)
		return true;

	const QByteArray utf8 = word.toUtf8();
	for (Checkers::Iterator checker = checkers.begin(); checker != checkers.end(); ++checker)
	{
		if (aspell_speller_check(checker.value(), utf8.constData(), -1))
			return true;
	}

	// No checker accepted the word; it is only valid if there were none to ask.
	return checkers.size() == 0;
}
/*
 * Class:     calliope_AeseSpeller
 * Method:    hasWord
 * Signature: (Ljava/lang/String;Ljava/lang/String;)Z
 *
 * Looks a word up in the Aspell dictionary for the given language.
 *
 * The checker for the language is taken from the global `checkers` list; if
 * none exists yet, one is created with checker_create() and appended to the
 * list. When no checker can be created a message is written to stderr.
 *
 * Returns JNI_TRUE only when Aspell reports the word as present. A missing
 * dictionary, an unreadable argument string or an Aspell error all yield
 * JNI_FALSE, so the Java side only ever sees a well-formed boolean.
 */
JNIEXPORT jboolean JNICALL Java_calliope_AeseSpeller_hasWord
  (JNIEnv *env, jobject obj, jstring jword, jstring lang)
{
    int correct = 0;
    /* Initialised so the release checks below never read an indeterminate value. */
    jboolean copied1 = JNI_FALSE, copied2 = JNI_FALSE;
    const char *word = load_string( env, jword, &copied1 );
    const char *language = load_string( env, lang, &copied2 );

    if ( word != NULL && language != NULL )
    {
        /* find an existing checker for this language */
        checker *c = checkers;
        while ( c != NULL && strcmp(language,c->lang) != 0 )
            c = c->next;

        if ( c == NULL )
        {
            /* first use of this language: create a checker and append it */
            c = checker_create( language );
            if ( c != NULL )
            {
                if ( checkers == NULL )
                    checkers = c;
                else
                {
                    checker *temp = checkers;
                    while ( temp->next != NULL )
                        temp = temp->next;
                    temp->next = c;
                }
            }
        }

        if ( c != NULL )
            correct = aspell_speller_check( c->spell_checker, word, (int)strlen(word) );
        else
            fprintf(stderr,"checker: no dict for language %s\n",language);
    }

    /* NOTE(review): release stays conditional on the copy flag, as before —
       confirm that unload_string() need not be called for non-copied strings. */
    if ( word != NULL && copied1 )
        unload_string( env, jword, word, copied1 );
    if ( language != NULL && copied2 )
        unload_string( env, lang, language, copied2 );

    /* Aspell returns -1 on error; never hand that to the JVM as a jboolean. */
    return (correct == 1) ? JNI_TRUE : JNI_FALSE;
}
/* method:check ***************************************************************/
/*
 * Python method: check(word) -> int
 *
 * Returns 1 for an empty string or a word Aspell knows, 0 for a word Aspell
 * does not know. A non-string argument raises TypeError; any other Aspell
 * result raises the module's speller exception with Aspell's error message.
 *
 * NOTE(review): the "s#" format writes the length into an int here; builds
 * that define PY_SSIZE_T_CLEAN expect Py_ssize_t instead — confirm.
 */
static PyObject* m_check(PyObject* self, PyObject* args) {
	char* word;
	int length;
	int verdict;

	if (!PyArg_ParseTuple(args, "s#", &word, &length)) {
		PyErr_SetString(PyExc_TypeError, "a string is required");
		return NULL;
	}

	/* the empty string is always reported as correct */
	if (length == 0)
		return Py_BuildValue("i", 1);

	verdict = aspell_speller_check(Speller(self), word, length);
	if (verdict == 0 || verdict == 1)
		return Py_BuildValue("i", verdict);

	PyErr_SetString(_AspellSpellerException, aspell_speller_error_message(Speller(self)));
	return NULL;
}
int main(int argc, const char *argv[]) { AspellCanHaveError * ret; AspellSpeller * speller; int have; char word[81]; char * p; char * word_end; AspellConfig * config; if (argc < 2) { printf("Usage: %s <language> [<size>|- [[<jargon>|- [<encoding>]]]\n", argv[0]); return 1; } config = new_aspell_config(); aspell_config_replace(config, "lang", argv[1]); if (argc >= 3 && argv[2][0] != '-' && argv[2][1] != '\0') aspell_config_replace(config, "size", argv[2]); if (argc >= 4 && argv[3][0] != '-') aspell_config_replace(config, "jargon", argv[3]); if (argc >= 5 && argv[4][0] != '-') aspell_config_replace(config, "encoding", argv[4]); ret = new_aspell_speller(config); delete_aspell_config(config); if (aspell_error(ret) != 0) { printf("Error: %s\n",aspell_error_message(ret)); delete_aspell_can_have_error(ret); return 2; } speller = to_aspell_speller(ret); config = aspell_speller_config(speller); fputs("Using: ", stdout); fputs(aspell_config_retrieve(config, "lang"), stdout); fputs("-", stdout); fputs(aspell_config_retrieve(config, "jargon"), stdout); fputs("-", stdout); fputs(aspell_config_retrieve(config, "size"), stdout); fputs("-", stdout); fputs(aspell_config_retrieve(config, "module"), stdout); fputs("\n\n", stdout); puts("Type \"h\" for help.\n"); while (fgets(word, 80, stdin) != 0) { /* remove trailing spaces */ word_end = strchr(word, '\0') - 1; while (word_end != word && (*word_end == '\n' || *word_end == ' ')) --word_end; ++word_end; *word_end = '\0'; putchar('\n'); switch (word[0]) { case '\0': break; case 'h': puts( "Usage: \n" " h(elp) help\n" " c <word> check if a word is the correct spelling\n" " s <word> print out a list of suggestions for a word\n" " a <word> add a word to the personal word list\n" " i <word> ignore a word for the rest of the session\n" " d <file> spell checks a document\n" " p dumps the personal word list\n" " P dumps the session word list\n" " m dumps the main word list\n" " o <option> <value> sets a config option\n" " r <option> retrieves 
a config option\n" " l <option> retrieves a config option as a list\n" " S saves all word lists\n" " C clear the curent sesstion word list\n" " x quite\n" ); break; case 'p': print_word_list(speller, aspell_speller_personal_word_list(speller), '\n'); break; case 'P': print_word_list(speller, aspell_speller_session_word_list(speller), '\n'); break; case 'm': print_word_list(speller, aspell_speller_main_word_list(speller), '\n'); break; case 'S': aspell_speller_save_all_word_lists(speller); check_for_error(speller); break; case 'C': aspell_speller_clear_session(speller); check_for_error(speller); break; case 'x': goto END; case 'c': if (strlen(word) < 3) { printf("Usage: %c <word>\n", word[0]); } else { have = aspell_speller_check(speller, word + 2, -1); if (have == 1) puts("correct"); else if (have == 0) puts("incorrect"); else printf("Error: %s\n", aspell_speller_error_message(speller)); } break; case 's': if (strlen(word) < 3) { printf("Usage: %c <word>\n", word[0]); } else { print_word_list(speller, aspell_speller_suggest(speller, word + 2, -1), '\n'); } break; case 'a': if (strlen(word) < 3) { printf("Usage: %c <word>\n", word[0]); } else { aspell_speller_add_to_personal(speller, word + 2, -1); check_for_error(speller); } break; case 'i': if (strlen(word) < 3) { printf("Usage: %c <word>\n", word[0]); } else { aspell_speller_add_to_session(speller, word + 2, -1); check_for_error(speller); } break; case 'o': word[80] = '\0'; /* to make sure strchr doesn't run off end of string */ p = strchr(word + 3, ' '); if (strlen(word) < 3 || p == 0) { printf("Usage: %c <option> <value>\n", word[0]); } else { *p = '\0'; ++p; aspell_config_replace(config, word + 2, p); check_for_config_error(config); } break; case 'r': if (strlen(word) < 3) { printf("Usage: %c <option>\n", word[0]); } else { const char * val = aspell_config_retrieve(config, word + 2); check_for_config_error(config); if (val) printf("%s = \"%s\"\n", word + 2, val); } break; case 'l': if (strlen(word) < 3) { 
printf("Usage: %c <option>\n", word[0]); } else { AspellStringList * lst = new_aspell_string_list(); AspellMutableContainer * lst0 = aspell_string_list_to_mutable_container(lst); AspellStringEnumeration * els; const char * val; aspell_config_retrieve_list(config, word + 2, lst0); check_for_config_error(config); els = aspell_string_list_elements(lst); printf("%s:\n", word + 2); while ( (val = aspell_string_enumeration_next(els)) != 0) printf(" %s\n", val); delete_aspell_string_enumeration(els); delete_aspell_string_list(lst); } break; case 'd': if (strlen(word) < 3) { printf("Usage: %c <file>\n", word[0]); } else { check_document(speller, word + 2); printf("\n"); } break; default: printf("Unknown Command: %s\n", word); } putchar('\n'); } END: delete_aspell_speller(speller); return 0; }
/**
 * Does the current spell-checker have this word?
 * @param u the userdata object holding the Aspell speller
 * @param word the word to look up (NUL-terminated)
 * @return the result of aspell_speller_check for the word
 */
int userdata_has_word( userdata *u, XML_Char *word )
{
    size_t length = strlen( (char*)word );
    return aspell_speller_check( u->spell_checker, word, length );
}
/**
 * Asks the wrapped Aspell speller about a word.
 *
 * @param word the word to check
 * @return true if Aspell returns any non-zero result, false if it returns 0
 */
bool AspellAdapterImpl::isWordCorrect(const std::string & word)
{
    return aspell_speller_check(aspellSpeller_, word.c_str(), -1) != 0;
}
bool Speller::add(const char *word) { if (speller == NULL) return false; return aspell_speller_check(speller, word, strlen(word)) != 0; }
/**
 * Looks a word up with the Aspell speller.
 *
 * @param word NUL-terminated word to check
 * @return -1 when no speller is available, otherwise the result of
 *         aspell_speller_check()
 */
int Speller::check(const char *word)
{
    if (speller != NULL)
        return aspell_speller_check(speller, word, strlen(word));

    return -1;
}
// runs in O(t^2) time where t is the number of tokens in the input corpus
// We consider maxK to be fairly constant
//
// Builds the model from the strings in _corpora:
//   1. loads emoticon forms from "emoticons.txt" and emoji prefixes from
//      "emojis.txt" (both optional: a file that cannot be opened is skipped),
//   2. splits every corpus on spaces and newlines into tokens, mapping each
//      token to a word entry (hashtag, emoticon, or a spell-checked
//      canonical form) and recording its delimiters and suffix,
//   3. compiles the per-word form/terminator histograms and the token store,
//   4. counts, for every k-gram prefix with k < maxK, which token follows it,
//   5. turns those counts into cumulative tables stored in _stats.
// Progress percentages are written to std::cout. The process exits with
// status 1 if the Aspell speller cannot be created.
void rawr::compile(int maxK)
{
  _maxK = maxK;

  // One vector of token ids per corpus.
  std::vector<std::vector<token_id>> tokens;
  // Hashtags seen in the corpora; added to hashtags.forms after tokenizing.
  std::set<std::string> thashtags;
  // Emoticons read from emoticons.txt, matched against raw tokens below.
  std::set<std::string> fv_emoticons;

  std::ifstream fvefile("emoticons.txt");
  if (fvefile)
  {
    std::string line;
    while (getline(fvefile, line))
    {
      fv_emoticons.insert(line);
      emoticons.forms.add(line);
    }
  }

  fvefile.close();

  // Maps a canonical token to the key of its entry in `words` (the token
  // itself, or the first Aspell suggestion when the token is misspelled).
  std::map<std::string, std::string> canonical_form;

  AspellConfig* spell_config = new_aspell_config();
  AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
  if (aspell_error_number(possible_err) != 0)
  {
    std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
    exit(1);
  }

  AspellSpeller* spell_checker = to_aspell_speller(possible_err);

  std::cout << "Reading emojis..." << std::endl;

  prefix_search emojis;
  std::ifstream emoji_file("emojis.txt");
  if (emoji_file)
  {
    while (!emoji_file.eof())
    {
      std::string rawmojis;
      getline(emoji_file, rawmojis);
      // Strip a trailing carriage return (CRLF line endings).
      // NOTE(review): back() on an empty line is undefined behaviour; an
      // empty final line also gets added to `emojis` — confirm intended.
      if (rawmojis.back() == '\r')
      {
        rawmojis.pop_back();
      }

      emojis.add(rawmojis);
    }

    emoji_file.close();
  }

  std::cout << "Tokenizing corpus... 0%" << std::flush;
  // Total number of characters, used as the progress denominator.
  // NOTE(review): if every corpus is empty, len stays 0 and the progress
  // computation below divides by zero.
  int len = 0;
  for (auto c : _corpora)
  {
    len += c.length();
  }

  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
  for (int i = 0; i < _corpora.size(); i++)
  {
    size_t start = 0;
    // `end` is an int, so std::string::npos is stored as -1 and compares
    // equal to npos again after conversion back to size_t.
    int end = 0;
    std::vector<token_id> tkcor;

    while (end != std::string::npos)
    {
      perprime = (startper + end) * 100 / len;
      if (perprime != per)
      {
        per = perprime;

        // Overwrite the previous "NNN%" in place.
        std::cout << "\b\b\b\b" << std::right;
        std::cout.width(3);
        std::cout << per << "%" << std::flush;
      }

      end = _corpora[i].find_first_of(" \n", start);

      bool emoji = false;
      // The slice includes the delimiter itself (space or newline).
      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
      std::string t = "";

      // compare() returns 0 on equality: skip empty, "." and " " slices.
      if (te.compare("") && te.compare(".") && te.compare(" "))
      {
        if (te.back() == ' ')
        {
          te.pop_back();
        }

        // Extract strings of emojis into their own tokens even if they're not space delimited
        int m = emojis.match(te);
        emoji = m > 0;
        if (m == 0) m = 1;
        t = te.substr(0,m);
        te = te.substr(m);

        // Keep consuming while the next piece is of the same kind
        // (emoji / non-emoji) as the token started above.
        while (!te.empty())
        {
          m = emojis.match(te);
          if (emoji == (m > 0))
          {
            if (m == 0) m = 1;
            t += te.substr(0,m);
            te = te.substr(m);
          } else {
            // Kind changed: end the token here and rewind `end` so the
            // remainder is tokenized on the next iteration.
            end = start + t.length() - 1;
            break;
          }
        }

        // Lower-cased copy used to derive the canonical form.
        std::string tc(t);
        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);

        // Strip leading opening delimiters and trailing closing
        // delimiters / punctuation.
        int pst = tc.find_first_not_of("\"([*");
        int dst = tc.find_last_not_of("\")]*.,?!\n;:");
        std::string canonical("");
        if ((pst != std::string::npos) && (dst != std::string::npos))
        {
          canonical = std::string(tc, pst, dst - pst + 1);
        }

        // Pick the word entry this token belongs to.
        word& w = ([&] () -> word& {
          // Hashtag freevar
          if (canonical[0] == '#')
          {
            thashtags.insert(canonical);

            return hashtags;
          }

          // Emoticon freevar
          if (emoji)
          {
            emoticons.forms.add(canonical);

            return emoticons;
          }

          if ((pst != std::string::npos) && (dst != std::string::npos))
          {
            // Case-preserving form taken from the raw token, compared
            // against the emoticons loaded from emoticons.txt.
            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
            if (fv_emoticons.count(emoticon_canon) == 1)
            {
              emoticons.forms.add(emoticon_canon);

              return emoticons;
            }
          }

          // Basically any other word
          if (canonical_form.count(canonical) == 0)
          {
            if (
              // Legacy freevars should be distinct from tokens containing similar words
              (canonical.find("$name$") != std::string::npos)
              // Words with no letters will be mangled by the spell checker
              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
              )
            {
              canonical_form[canonical] = canonical;
              words.emplace(canonical, canonical);
            } else {
              // Any non-zero result keeps the token as its own word.
              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
              if (correct)
              {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
              } else {
                // Misspelled: file the token under Aspell's first
                // suggestion, or under itself when there is none.
                // NOTE(review): `suggestions` is not checked for NULL.
                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
                const char* replacement = aspell_string_enumeration_next(elements);
                if (replacement != NULL)
                {
                  std::string sugrep(replacement);
                  canonical_form[canonical] = sugrep;

                  if (words.count(sugrep) == 0)
                  {
                    words.emplace(sugrep, sugrep);
                  }
                } else {
                  words.emplace(canonical, canonical);
                  canonical_form[canonical] = canonical;
                }

                delete_aspell_string_enumeration(elements);
              }
            }
          }

          word& tw = words.at(canonical_form.at(canonical));
          tw.forms.add(canonical);

          return tw;
        })();

        token tk(w);
        tk.raw = t;

        // Count the opening delimiters at the very start of the raw token.
        for (char c : t)
        {
          if (c == '*')
          {
            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
          } else if (c == '[')
          {
            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
          } else if (c == '(')
          {
            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
          } else if (c == '"')
          {
            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
          } else {
            break;
          }
        }

        // Index just past the last character that is not trailing
        // punctuation or a closing delimiter.
        int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
        if (backtrack != t.length())
        {
          std::string ending = t.substr(backtrack);
          std::string suffix;
          bool newline = false;
          bool terminating = false;

          for (char c : ending)
          {
            if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
            {
              suffix += c;
              terminating = true;

              continue;
            } else if (c == '\n') {
              newline = true;
              terminating = true;

              continue;
            }

            // Only ] ) * " can reach this point, given the character set
            // used to compute `backtrack`; the switch has no default.
            parentype pt = ([&] {
              switch (c)
              {
                case ']': return parentype::square_bracket;
                case ')': return parentype::paren;
                case '*': return parentype::asterisk;
                case '"': return parentype::quote;
              }
            })();

            // A closer that matches an opener on the same token turns the
            // pair into a "both"; otherwise it is a lone closer.
            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
              tk.delimiters[{pt, doublestatus::opening}]--;
              tk.delimiters[{pt, doublestatus::both}]++;
            } else {
              tk.delimiters[{pt, doublestatus::closing}]++;
            }
          }

          if (terminating)
          {
            // A lone comma without a newline is a comma suffix; everything
            // else is recorded as a terminating suffix.
            if ((suffix == ",") && (!newline))
            {
              tk.suffix = suffixtype::comma;
            } else {
              tk.suffix = suffixtype::terminating;

              if (!newline)
              {
                w.terms.add({suffix, false});
              } else {
                // With a newline, "." is recorded in place of the
                // collected punctuation.
                w.terms.add({".", false});
              }
            }
          }
        }

        tkcor.push_back(_tokenstore.add(tk));
      }

      // Step past the delimiter; an `end` of npos (-1 as int) becomes npos.
      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
    }

    tokens.push_back(tkcor);

    startper += _corpora[i].length();
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  delete_aspell_speller(spell_checker);
  delete_aspell_config(spell_config);

  std::cout << canonical_form.size() << " distinct forms" << std::endl;
  std::cout << words.size() << " distinct words" << std::endl;

  // Time to condense the distribution stuff for the words
  std::cout << "Compiling token histograms..." << std::endl;
  for (auto& it : words)
  {
    it.second.forms.compile();
    it.second.terms.compile();
  }

  // Hashtag freevar is not frequency distributed
  for (auto& it : thashtags)
  {
    hashtags.forms.add(it);
  }

  hashtags.forms.compile();
  hashtags.terms.compile();

  // Compile other freevars
  emoticons.forms.compile();
  emoticons.terms.compile();

  // Compile the interned tokens.
  _tokenstore.compile();

  // kgram distribution
  std::cout << "Creating markov chain... 0%" << std::flush;
  // For each k-gram prefix: the tokens observed after it, with statistics.
  std::map<kgram, std::map<token_id, token_data> > tstats;

  // Progress denominator for this phase.
  len = 0;
  for (auto c : tokens)
  {
    len += (maxK-1) * c.size();
  }

  startper = 0;
  per = 0;
  perprime = 0;
  int corpid = 0;
  for (auto corpus : tokens)
  {
    // Record the first tokens of the corpus under a prefix that starts
    // with the wildcard query followed by the k tokens before them.
    for (int k=0; k<maxK && k<corpus.size(); k++)
    {
      // The zero'th token should be a terminator.
      token_id fid = corpus[k];
      const token& f = _tokenstore.get(fid);

      kgram term_prefix(corpus.begin(), corpus.begin()+k);
      term_prefix.push_front(wildcardQuery);

      if (tstats[term_prefix].count(fid) == 0)
      {
        tstats[term_prefix].emplace(fid, fid);
      }

      token_data& td2 = tstats[term_prefix].at(fid);
      td2.all++;
      td2.corpora.insert(corpid);

      // No lower-case character at all counts as upper case; otherwise a
      // leading capital counts as title case.
      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
      {
        td2.uppercase++;
      } else if (isupper(f.raw[0]))
      {
        td2.titlecase++;
      }
    }

    // For every prefix length k and start position i, record the token
    // that follows the k-gram corpus[i .. i+k).
    for (int k=1; k<maxK && k<corpus.size(); k++)
    {
      for (int i=0; i<(corpus.size() - k); i++)
      {
        perprime = (startper+i) * 100 / len;
        if (perprime != per)
        {
          per = perprime;

          std::cout << "\b\b\b\b" << std::right;
          std::cout.width(3);
          std::cout << per << "%" << std::flush;
        }

        kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
        token_id fid = corpus[i+k];
        const token& f = _tokenstore.get(fid);

        if (tstats[prefix].count(fid) == 0)
        {
          tstats[prefix].emplace(fid, fid);
        }

        token_data& td = tstats[prefix].at(fid);
        td.all++;
        td.corpora.insert(corpid);

        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
        {
          td.uppercase++;
        } else if (isupper(f.raw[0]))
        {
          td.titlecase++;
        }

        // When the prefix starts with a terminating token, also record the
        // follower under the same prefix with that first token replaced by
        // the wildcard query.
        const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);

          if (tstats[term_prefix].count(fid) == 0)
          {
            tstats[term_prefix].emplace(fid, fid);
          }

          token_data& td2 = tstats[term_prefix].at(fid);
          td2.all++;
          td2.corpora.insert(corpid);

          if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
          {
            td2.uppercase++;
          } else if (isupper(f.raw[0]))
          {
            td2.titlecase++;
          }
        }
      }

      startper += corpus.size();
    }

    corpid++;
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  // Condense the kgram distribution
  std::cout << "Compiling kgram distributions... 0%";
  len = tstats.size();
  per = 0;
  perprime = 0;
  int indicator = 0;
  for (auto& it : tstats)
  {
    indicator++;
    perprime = indicator * 100 / len;
    if (per != perprime)
    {
      per = perprime;

      std::cout << "\b\b\b\b" << std::right;
      std::cout.width(3);
      std::cout << per << "%" << std::flush;
    }

    kgram klist = it.first;
    auto& probtable = it.second;
    auto& distribution = _stats[klist];
    // Running total of occurrence counts: each follower is keyed by the
    // cumulative count up to and including itself.
    int max = 0;

    for (auto& kt : probtable)
    {
      max += kt.second.all;

      distribution.emplace(max, kt.second);
    }
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  _compiled = true;
}
static int get_aspell_hits(const char *word, int len) { if (len < 2) { log_message(DEBUG, " [-]Skip aspell checking (word is very short)"); return NO_LANGUAGE; } AspellConfig *spell_config = new_aspell_config(); for (int lang = 0; lang < xconfig->total_languages; lang++) { char *lang_word = (char *) malloc(1 * sizeof(char)); if (lang_word == NULL) continue; lang_word[0] = NULLSYM; for (int i = 0; i < len; i++) { KeyCode kc; int modifier; main_window->xkeymap->char_to_keycode(main_window->xkeymap, word[i], &kc, &modifier); char *symbol = keycode_to_symbol(kc, lang, modifier); if ((symbol == NULL) || (lang_word == NULL)) continue; lang_word = (char *) realloc(lang_word, (strlen(lang_word) + strlen(symbol) + 1) * sizeof(char)); if (lang_word != NULL) strcat(lang_word, symbol); free(symbol); } if (lang_word == NULL) continue; aspell_config_replace(spell_config, "lang", xconfig->languages[lang].dir); AspellCanHaveError *possible_err = new_aspell_speller(spell_config); AspellSpeller *spell_checker = 0; if (aspell_error_number(possible_err) == 0) { spell_checker = to_aspell_speller(possible_err); int correct = aspell_speller_check(spell_checker, lang_word, strlen(lang_word)); if (correct) { log_message(DEBUG, " [+]Found this word in %s aspell dictionary", xconfig->get_lang_name(xconfig, lang)); delete_aspell_speller(spell_checker); free(lang_word); return lang; } } else { log_message(DEBUG, " [!]Error aspell checking for %s aspell dictionary", xconfig->get_lang_name(xconfig, lang)); } delete_aspell_speller(spell_checker); free(lang_word); } log_message(DEBUG, " [-]This word has no hits for all aspell dictionaries"); return NO_LANGUAGE; }