Esempio n. 1
0
bool SpellChecker::checkWordSpelling(const QString &word)
{
    QString tmp = word;
    tmp = tmp.remove(QString::fromUtf8("»"));
    tmp = tmp.remove(QString::fromUtf8("«"));

    QByteArray ba = tmp.toUtf8();
    return (aspell_speller_check(spell_checker1, ba.data(), ba.size()) != 0) ||
           (aspell_speller_check((spell_checker2 != NULL ? spell_checker2 : spell_checker1), ba.data(), ba.size()) != 0);
}
Esempio n. 2
0
/*
 * Recursively generates the permutations of a[l..r] in place and prints
 * each complete permutation that the global speller accepts.
 *
 * a: NUL-terminated buffer that is permuted in place (restored on return)
 * l: index of the first position still to be permuted
 * r: index of the last position to be permuted
 */
void permute(char* a, int l, int r)
{
	int i;

	if (l == r) {
		int correct = aspell_speller_check(spell_checker, a, -1);

		if (correct == 1) {
			printf("%s\n", a);
		}
		else if (correct != 0) {
			/* Any result other than 0/1 is an Aspell failure. The old
			 * "if (correct) ... else if (!correct) ... else" chain could
			 * never reach this branch and printed the word instead. */
			fprintf(stderr, "Error: %s\n",
			        aspell_speller_error_message(spell_checker));
		}
	}
	else {
		for (i = l; i <= r; i++) {
			swap((a + l), (a + i));
			permute(a, l + 1, r);
			swap((a + l), (a + i));	/* backtrack */
		}
	}
}
Esempio n. 3
0
/*
 * Checks a word with the spellers attached to a buffer.
 *
 * Returns 1 if the word is accepted (too short, number-like, or accepted by
 * one of the spellers), 0 if it is considered misspelled.
 */
int
weechat_aspell_check_word (struct t_aspell_speller_buffer *speller_buffer,
                           const char *word)
{
    int min_length, idx;

    /* words shorter than the configured minimum are never checked */
    min_length = weechat_config_integer (weechat_aspell_config_check_word_min_length);
    if ((min_length > 0) && ((int)strlen (word) < min_length))
        return 1;

    /* number-like words are never checked */
    if (weechat_aspell_string_is_simili_number (word))
        return 1;

    /* no speller list: nothing can accept the word */
    if (!speller_buffer->spellers)
        return 0;

    /* first speller that accepts the word wins (order is important) */
    for (idx = 0; speller_buffer->spellers[idx]; idx++)
    {
#ifdef USE_ENCHANT
        if (enchant_dict_check (speller_buffer->spellers[idx], word, strlen (word)) == 0)
            return 1;
#else
        if (aspell_speller_check (speller_buffer->spellers[idx], word, -1) == 1)
            return 1;
#endif /* USE_ENCHANT */
    }

    /* misspelled word! */
    return 0;
}
Esempio n. 4
0
/*
 * Checks whether a word is spelled correctly.
 *
 * The word is first converted with spelling_conv(); the converted copy is
 * released before returning on every path.
 *
 * word: the word to check
 * s:    spelling context providing the converter and the Aspell speller
 *
 * Returns 1 for a word made only of digits, 1 when the conversion yields no
 * string (nothing can be checked), otherwise the aspell_speller_check()
 * result.
 */
int
spelling_correct(char *word, struct spelling *s)
{
	int ret;
	char *conv_word;
	size_t i;

	conv_word = spelling_conv(s->conv, word);
	if (conv_word == NULL)
		return 1;	/* nothing to check, do not flag the word */

	/* Count leading digits; isdigit() needs an unsigned char value. */
	for (i = 0; isdigit((unsigned char)conv_word[i]); i++)
		;

	if (i > 0 && conv_word[i] == '\0') {
		/* Pure number: accepted without a lookup. The converted copy
		 * used to leak on this path. */
		free(conv_word);
		return 1;
	}

	ret = aspell_speller_check(s->speller, conv_word, -1);

	free(conv_word);

	return ret;
}
bool KAspellChecker::checkWord(const QString &word)
{
    if (!m_speller) 
        return true;

    return aspell_speller_check(m_speller, word.toLocal8Bit().data(), -1);
}
Esempio n. 6
0
// Advances to the next misspelled token.
//
// Returns false once cur_line_ is off the end of the input; returns true
// when real_word_begin_/real_word_size_ describe a word to report.
bool CheckerString::next_misspelling()
{
  if (off_end(cur_line_)) return false;
  if (has_repl_) {
    // A replacement is pending: clear the flag and re-check the current
    // word range before moving on.
    has_repl_ = false;
    CharVector word; // declared but not used in this function
    bool correct = false;
    // FIXME: This is a hack to avoid trying to check a word with a space
    //        in it.  The correct action is to reparse to string and
    //        check each word individually.  However doing so involves
    //        an API enhancement in Checker.
    for (int i = 0; i != real_word_size_; ++i) {
      if (asc_isspace(*(real_word_begin_ + i)))
	correct = true;
    }
    if (!correct)
      correct = aspell_speller_check(speller_, &*real_word_begin_, real_word_size_);
    // Account for the length difference between the current word range and
    // the token, then make the token length match the word range.
    diff_ += real_word_size_ - tok_.len;
    tok_.len = real_word_size_;
    // The word is still rejected: return without fetching a new token.
    if (!correct)
      return true;
  }
  // Fetch the next token; a zero-length token means the current line is
  // exhausted, so move to the next line and hand it to the checker.
  while ((tok_ = checker_->next_misspelling()).len == 0) {
    next_line(cur_line_);
    diff_ = 0;
    if (off_end(cur_line_)) return false;
    checker_->process(cur_line_->real.data(), cur_line_->real.size());
  }
  // Locate the token inside the line, shifted by the accumulated diff_.
  real_word_begin_ = cur_line_->real.begin() + tok_.offset + diff_;
  real_word_size_  = tok_.len;
  fix_display_str();
  return true;
}
Esempio n. 7
0
bool ASpellChecker::isCorrect(const QString& word)
{
	if(speller_) {
		int correct = aspell_speller_check(speller_, word.toUtf8().constData(), -1);
		return (correct != 0);
	}
	return true;
}
Esempio n. 8
0
bool SpellCheck::ok(const QString &word){
    if (!spell_checker || word.isEmpty())
        return true;

    int correct = aspell_speller_check(spell_checker, word.toAscii().constData(), -1);

    return (correct != 0);
}
Esempio n. 9
0
//__________________________________________________________________________
// Returns false only when Aspell returns 0 for the word; any other result
// is reported as true.
bool Speller::Aspell::Suggest::checkWord(const std::string& word)
{
	const int verdict = aspell_speller_check( fspeller, word.c_str(), -1 );

	return verdict != 0;
}
Esempio n. 10
0
/* Checks the text between start and end and applies the highlight tag to
 * that range when the speller returns 0 for it. */
static void
check_word(GtkSpell *spell, GtkTextBuffer *buffer,
           GtkTextIter *start, GtkTextIter *end) {
	char *candidate = gtk_text_buffer_get_text(buffer, start, end, FALSE);
	int verdict;

	if (debug)
		g_print("checking: %s\n", candidate);

	verdict = aspell_speller_check(spell->speller, candidate, -1);
	if (verdict == FALSE) {
		gtk_text_buffer_apply_tag(buffer, spell->tag_highlight, start, end);
	}

	/* release the copy returned by gtk_text_buffer_get_text() */
	g_free(candidate);
}
Esempio n. 11
0
/**
 * Ask the spell-checker whether a word is spelled correctly.
 *
 * @param chk  opaque handle, interpreted as a struct linkgrammar_aspell
 * @param word the word to test
 * @return TRUE only when the speller returns exactly 1; FALSE when the
 *         handle or its speller is missing, or for any other result
 */
Boolean spellcheck_test(void * chk, const char * word)
{
    struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk;

    if (!aspell || !aspell->speller)
        return FALSE;

    /* the lookup can return -1 on failure, so compare against 1 */
    if (aspell_speller_check(aspell->speller, word, -1) == 1)
        return TRUE;

    return FALSE;
}
Esempio n. 12
0
bool ASpellChecker::isCorrect(const QString& word)
{
    if(spellers_.isEmpty())
        return true;

    foreach(AspellSpeller* speller, spellers_) {
        if (aspell_speller_check(speller, word.toUtf8().constData(), -1) != 0)
            return true;
    }
    return false;
}
Esempio n. 13
0
/**
 * Check a given word for correctness.
 * @param word the word to check
 * @return Qtrue if the speller returns 1, Qfalse if it returns 0;
 *         raises cAspellError with the speller's message otherwise.
 */
static VALUE aspell_check(VALUE self, VALUE word) {
    AspellSpeller *speller = get_speller(self);

    switch (aspell_speller_check(speller, STR2CSTR(word), -1)) {
    case 1:
        return Qtrue;
    case 0:
        return Qfalse;
    default:
        rb_raise( cAspellError, "%s", aspell_speller_error_message(speller));
    }

    return Qfalse;
}
Esempio n. 14
0
/*
 * Checks a word with the spellers attached to a buffer.
 *
 * Returns 1 if the word is accepted (too short, number-like, a nick of the
 * buffer, the self/remote nick of a private buffer, or accepted by one of
 * the spellers), 0 if it is considered misspelled.
 */
int
weechat_aspell_check_word (struct t_gui_buffer *buffer,
                           struct t_aspell_speller_buffer *speller_buffer,
                           const char *word)
{
    const char *type, *self_nick, *remote_nick;
    int min_length, idx;

    /* words shorter than the configured minimum are never checked */
    min_length = weechat_config_integer (weechat_aspell_config_check_word_min_length);
    if ((min_length > 0) && ((int)strlen (word) < min_length))
        return 1;

    /* number-like words are never checked */
    if (weechat_aspell_string_is_simili_number (word))
        return 1;

    /* nicks from this buffer's nicklist are never checked */
    if (weechat_nicklist_search_nick (buffer, NULL, word))
        return 1;

    /* in "private" buffers, also skip our own nick and the remote nick */
    type = weechat_buffer_get_string (buffer, "localvar_type");
    if (type && (strcmp (type, "private") == 0))
    {
        self_nick = weechat_buffer_get_string (buffer, "localvar_nick");
        if (self_nick && (weechat_strcasecmp (self_nick, word) == 0))
            return 1;

        remote_nick = weechat_buffer_get_string (buffer, "localvar_channel");
        if (remote_nick && (weechat_strcasecmp (remote_nick, word) == 0))
            return 1;
    }

    /* no speller list: nothing can accept the word */
    if (!speller_buffer->spellers)
        return 0;

    /* first speller that accepts the word wins (order is important) */
    for (idx = 0; speller_buffer->spellers[idx]; idx++)
    {
#ifdef USE_ENCHANT
        if (enchant_dict_check (speller_buffer->spellers[idx], word, strlen (word)) == 0)
            return 1;
#else
        if (aspell_speller_check (speller_buffer->spellers[idx], word, -1) == 1)
            return 1;
#endif
    }

    /* misspelled word! */
    return 0;
}
Esempio n. 15
0
/**
 * Returns up to ten spelling suggestions for the word under the text cursor.
 *
 * The list is empty when there is no primary speller, when any available
 * speller returns a non-zero result for the word, or when Aspell offers no
 * suggestions. Suggestions always come from spell_checker1.
 *
 * @return the suggestions, decoded from UTF-8
 */
QStringList SpellChecker::suggestions()
{
    QStringList sl;

    // Only the primary speller is mandatory. checkWordSpelling() treats a
    // missing spell_checker2 as valid, so this function must as well.
    if (spell_checker1 == 0)
        return sl;

    QTextCursor cursor = m_textEdit->textCursor();
    cursor.select(QTextCursor::WordUnderCursor);
    const QByteArray ba = cursor.selectedText().toUtf8();

    // A word accepted by either dictionary needs no suggestions.
    if ((spell_checker2 != 0)
            && (aspell_speller_check(spell_checker2, ba.constData(), ba.size()) != 0))
        return sl;
    if (aspell_speller_check(spell_checker1, ba.constData(), ba.size()) != 0)
        return sl;

    const struct AspellWordList *awl = aspell_speller_suggest(spell_checker1, ba.constData(), ba.size());
    // Guard against a NULL list before querying its size.
    if ((awl == 0) || !(aspell_word_list_size(awl) > 0))
        return sl;

    struct AspellStringEnumeration *ase = aspell_word_list_elements(awl);
    for (int i = 0; (i < 10) && !aspell_string_enumeration_at_end(ase); ++i) {
        const char *text = aspell_string_enumeration_next(ase);
        sl << QString::fromUtf8(text);
    }
    delete_aspell_string_enumeration(ase);

    return sl;
}
Esempio n. 16
0
/**
 * Checks a word with the English or the Russian speller.
 *
 * The English speller is used when the lower-cased first character lies in
 * 'a'..'z'; every other word goes to the Russian speller. The word is
 * encoded with m_codec before the lookup.
 *
 * @param word the word to check
 * @return true for an empty word, when the selected speller is missing, or
 *         when Aspell returns a non-zero result; false otherwise
 */
bool SpellChecker::check(QString word)
{
	if(word.isEmpty())
		return true;

	const QChar first = word[0].toLower();
	const bool latin = (first>=QChar('a')) && (first<=QChar('z'));

	AspellSpeller * checker = latin ? m_spell_checker_en : m_spell_checker_ru;
	if(!checker)
		return true;

	const QByteArray encoded = m_codec->fromUnicode(word);
	return aspell_speller_check(checker, encoded.data(), -1) != 0;
}
Esempio n. 17
0
/*
 * Checks a word with the global list of spellers.
 *
 * Returns 1 if the word is accepted (too short, a URL, number-like, a nick
 * of the buffer, or accepted by one of the spellers), 0 otherwise.
 */
int
weechat_aspell_check_word (struct t_gui_buffer *buffer, const char *word)
{
    struct t_aspell_speller *ptr_speller;
    int min_length;

    /* words shorter than the configured minimum are never checked */
    min_length = weechat_config_integer (weechat_aspell_config_check_word_min_length);
    if ((min_length > 0) && ((int)strlen (word) < min_length))
        return 1;

    /* URLs are never checked */
    if (weechat_aspell_string_is_url (word))
        return 1;

    /* number-like words are never checked */
    if (weechat_aspell_string_is_simili_number (word))
        return 1;

    /* nicks from this buffer's nicklist are never checked */
    if (weechat_nicklist_search_nick (buffer, NULL, word))
        return 1;

    /* first speller that accepts the word wins (order is important) */
    for (ptr_speller = weechat_aspell_spellers; ptr_speller;
         ptr_speller = ptr_speller->next_speller)
    {
        if (aspell_speller_check (ptr_speller->speller, word, -1) == 1)
            return 1;
    }

    return 0;
}
Esempio n. 18
0
void checkTheWord(char* word,int checkDoc)
{
#if 1
	int							correct;
	AspellWordList*				suggestions;
	AspellStringEnumeration*	elements;
	const char*					suggestedword;
	int							wordcnt=0;
	char*						wordlist[100];
	char*						labeltext[512];

	correct=aspell_speller_check(spellChecker,word,-1);
	if(!correct)
		{
			badWord=word;
			cancelCheck=false;
			if(spellCheckWord==NULL)
				buildWordCheck(checkDoc);
			else
				{
					for(int j=0; j<numWords; j++)
						gtk_combo_box_text_remove((GtkComboBoxText*)wordListDropbox,0);

					sprintf((char*)&labeltext,"Change <i><b>%s</b></i> to: ",badWord);
					gtk_label_set_text((GtkLabel*)badWordLabel,(char*)&labeltext);
					gtk_label_set_use_markup((GtkLabel*)badWordLabel,true);
				}

			suggestions=(AspellWordList*)aspell_speller_suggest(spellChecker,word,-1);
			elements=aspell_word_list_elements(suggestions);
			while((suggestedword=aspell_string_enumeration_next(elements))!=NULL)
				{
					wordlist[wordcnt]=strdup(suggestedword);
					gtk_combo_box_text_append_text((GtkComboBoxText*)wordListDropbox,wordlist[wordcnt]);
					wordcnt++;
				}
			numWords=wordcnt;
			delete_aspell_string_enumeration(elements);
			gtk_combo_box_set_active((GtkComboBox*)wordListDropbox,0);
			gtk_widget_show_all(spellCheckWord);
			gtk_dialog_run((GtkDialog *)spellCheckWord);
		}
#endif
}
Esempio n. 19
0
/**
 * Checks a word against all configured checkers.
 *
 * @param word the word to check
 * @return true when there are no checkers, when the word contains no
 *         non-digit character, or when at least one checker returns a
 *         non-zero result; false otherwise
 */
bool SpellChecker::checkWord(QString word)
{
	// Without any checker every word is considered valid.
	if (checkers.size() == 0)
		return true;

	// No non-digit character at all: accepted without a lookup.
	if (word.indexOf(QRegExp("\\D")) == -1)
		return true;

	// Convert once; the encoded word is the same for every checker.
	const QByteArray utf8 = word.toUtf8();

	for (Checkers::Iterator it = checkers.begin(); it != checkers.end(); ++it)
	{
		if (aspell_speller_check(it.value(), utf8.constData(), -1))
			return true;
	}

	return false;
}
/*
 * Class:     calliope_AeseSpeller
 * Method:    hasWord
 * Signature: (Ljava/lang/String;Ljava/lang/String;)Z
 *
 * Looks the word up in the checker for the given language. A checker is
 * created and appended to the global list when none exists yet for that
 * language. Returns JNI_TRUE only when the speller returns exactly 1;
 * a missing string, a missing dictionary or a speller failure yields
 * JNI_FALSE.
 */
JNIEXPORT jboolean JNICALL Java_calliope_AeseSpeller_hasWord
  (JNIEnv *env, jobject obj, jstring jword, jstring lang)
{
    int correct = 0;
    /* initialised so the cleanup below never reads an indeterminate flag */
    jboolean copied1 = JNI_FALSE, copied2 = JNI_FALSE;
    checker *c = NULL;
    const char *word = load_string( env, jword, &copied1 );
    const char *language = load_string( env, lang, &copied2 );

    /* strcmp()/strlen() below must never see a NULL string */
    if ( word != NULL && language != NULL )
    {
        /* find an existing checker for this language */
        c = checkers;
        while ( c != NULL && strcmp(language,c->lang)!=0 )
            c = c->next;

        if ( c == NULL )
        {
            c = checker_create( language );
            if ( c != NULL )
            {
                /* append the new checker to the global list */
                if ( checkers == NULL )
                    checkers = c;
                else
                {
                    checker *temp = checkers;
                    while ( temp->next != NULL )
                        temp = temp->next;
                    temp->next = c;
                }
            }
        }

        if ( c != NULL )
        {
            correct = aspell_speller_check(c->spell_checker, word,
                strlen(word));
        }
        else
            fprintf(stderr,"checker: no dict for language %s\n",language);
    }

    if ( word != NULL && copied1 )
        unload_string( env, jword, word, copied1 );
    if ( language != NULL && copied2 )
        unload_string( env, lang, language, copied2 );

    /* do not let a failure code leak out as a non-zero jboolean */
    return (correct == 1) ? JNI_TRUE : JNI_FALSE;
}
Esempio n. 21
0
/* method:check ***************************************************************/
/*
 * Checks one word given as a Python string argument.
 *
 * Returns the integer 1 for an empty string, the integer 0 or 1 as returned
 * by the speller, or NULL with a Python exception set (TypeError for a bad
 * argument, the speller exception for any other speller result).
 */
static PyObject* m_check(PyObject* self, PyObject* args) {
	char* word;
	int   length;
	int   verdict;

	if (!PyArg_ParseTuple(args, "s#", &word, &length)) {
		PyErr_SetString(PyExc_TypeError, "a string is required");
		return NULL;
	}

	/* the empty string is reported as 1 without a lookup */
	if (length == 0)
		return Py_BuildValue("i", 1);

	verdict = aspell_speller_check(Speller(self), word, length);
	if (verdict == 0 || verdict == 1)
		return Py_BuildValue("i", verdict);

	/* anything else is a speller failure */
	PyErr_SetString(_AspellSpellerException, aspell_speller_error_message(Speller(self)));
	return NULL;
}
Esempio n. 22
0
int main(int argc, const char *argv[]) 
{
  AspellCanHaveError * ret;
  AspellSpeller * speller;
  int have;
  char word[81];
  char * p;
  char * word_end;
  AspellConfig * config;

  if (argc < 2) {
    printf("Usage: %s <language> [<size>|- [[<jargon>|- [<encoding>]]]\n", argv[0]);
    return 1;
  }

  config = new_aspell_config();

  aspell_config_replace(config, "lang", argv[1]);

  if (argc >= 3 && argv[2][0] != '-' && argv[2][1] != '\0')
    aspell_config_replace(config, "size", argv[2]);

  if (argc >= 4 && argv[3][0] != '-')
    aspell_config_replace(config, "jargon", argv[3]);

  if (argc >= 5 && argv[4][0] != '-')
    aspell_config_replace(config, "encoding", argv[4]);

  ret = new_aspell_speller(config);

  delete_aspell_config(config);

  if (aspell_error(ret) != 0) {
    printf("Error: %s\n",aspell_error_message(ret));
    delete_aspell_can_have_error(ret);
    return 2;
  }
  speller = to_aspell_speller(ret);
  config = aspell_speller_config(speller);

  fputs("Using: ",                                      stdout);
  fputs(aspell_config_retrieve(config, "lang"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "jargon"),       stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "size"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "module"),       stdout);
  fputs("\n\n",                                         stdout);

  puts("Type \"h\" for help.\n");

  while (fgets(word, 80, stdin) != 0) {

    /* remove trailing spaces */

    word_end = strchr(word, '\0') - 1;
    while (word_end != word && (*word_end == '\n' || *word_end == ' ')) 
      --word_end;
    ++word_end;
    *word_end = '\0';
    
    putchar('\n');
    switch (word[0]) {
    case '\0':
      break;
    case 'h':
      puts(
	"Usage: \n"
	"  h(elp)      help\n"
	"  c <word>    check if a word is the correct spelling\n"
	"  s <word>    print out a list of suggestions for a word\n"
	"  a <word>    add a word to the personal word list\n"
	"  i <word>    ignore a word for the rest of the session\n"
        "  d <file>    spell checks a document\n"
	"  p           dumps the personal word list\n"
	"  P           dumps the session word list\n"
	"  m           dumps the main  word list\n"
        "  o <option> <value> sets a config option\n"
	"  r <option>         retrieves a config option\n"
        "  l <option>         retrieves a config option as a list\n"
	"  S           saves all word lists\n"
	"  C           clear the curent sesstion word list\n"
	"  x           quite\n"	);
      break;
    case 'p':
      print_word_list(speller, 
		      aspell_speller_personal_word_list(speller), '\n');
      break;
    case 'P':
      print_word_list(speller, 
		      aspell_speller_session_word_list(speller), '\n');
      break;
    case 'm':
      print_word_list(speller, 
		      aspell_speller_main_word_list(speller), '\n');
      break;
    case 'S':
      aspell_speller_save_all_word_lists(speller);
      check_for_error(speller);
      break;
    case 'C': 
      aspell_speller_clear_session(speller);
      check_for_error(speller);
      break;
    case 'x':
      goto END;
    case 'c':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	have = aspell_speller_check(speller, word + 2, -1);
	if (have == 1) 
	  puts("correct");
	else if (have == 0)
	  puts("incorrect");
	else
	  printf("Error: %s\n", aspell_speller_error_message(speller));
      }
      break;
    case 's':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	print_word_list(speller, 
			aspell_speller_suggest(speller, word + 2, -1), '\n');
      }
      break;
    case 'a':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_personal(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'i':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_session(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'o':
      word[80] = '\0'; /* to make sure strchr doesn't run off end of string */
      p = strchr(word + 3, ' ');
      if (strlen(word) < 3 || p == 0) {
	printf("Usage: %c <option> <value>\n", word[0]);
      } else {
	*p = '\0';
	++p;
	aspell_config_replace(config, word + 2, p);
	check_for_config_error(config);
      }
      break;
    case 'r':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	const char * val = aspell_config_retrieve(config, word + 2);
	check_for_config_error(config);
	if (val)
	  printf("%s = \"%s\"\n", word + 2, val);
      }
      break;
    case 'l':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	AspellStringList * lst = new_aspell_string_list();
	AspellMutableContainer * lst0 
	  = aspell_string_list_to_mutable_container(lst);
	AspellStringEnumeration * els;
	const char * val;
	aspell_config_retrieve_list(config, word + 2, lst0);
	check_for_config_error(config);
	els = aspell_string_list_elements(lst);
	printf("%s:\n", word + 2);
	while ( (val = aspell_string_enumeration_next(els)) != 0)
	  printf("  %s\n", val);
	delete_aspell_string_enumeration(els);
	delete_aspell_string_list(lst);
      }
      break;
    case 'd':
      if (strlen(word) < 3) {
	printf("Usage: %c <file>\n", word[0]);
      } else {
	check_document(speller, word + 2);
	printf("\n");
      }
      break;
    default:
      printf("Unknown Command: %s\n", word);
    }
    putchar('\n');
  }
 END:
  delete_aspell_speller(speller);
  return 0;
}
/**
 * Ask the userdata's spell-checker about a word.
 * @param u the userdata object holding the speller
 * @param word the NUL-terminated word to look up
 * @return the aspell_speller_check result, passed through unchanged
 */
int userdata_has_word( userdata *u, XML_Char *word )
{
    return aspell_speller_check(u->spell_checker, word,
        strlen((char*)word));
}
// Returns false only when Aspell returns 0 for the word; any other result
// is reported as true.
bool AspellAdapterImpl::isWordCorrect(const std::string & word) {
    return aspell_speller_check(aspellSpeller_, word.c_str(), -1) != 0;
}
Esempio n. 25
0
bool Speller::add(const char *word)
{
    if (speller == NULL)
        return false;
    return aspell_speller_check(speller, word, strlen(word)) != 0;
}
Esempio n. 26
0
/**
 * Looks a word up in the underlying Aspell speller.
 *
 * @param word NUL-terminated word to check
 * @return -1 when no speller is available, otherwise the
 *         aspell_speller_check result
 */
int Speller::check(const char *word)
{
    if (speller != NULL)
        return aspell_speller_check(speller, word, strlen(word));

    return -1;
}
Esempio n. 27
0
// runs in O(t^2) time where t is the number of tokens in the input corpus
// We consider maxK to be fairly constant
//
// Builds the model from _corpora in these steps:
//   1. read "emoticons.txt" and "emojis.txt" (both optional),
//   2. split every corpus into tokens, mapping each word to a canonical
//      form with the help of Aspell,
//   3. compile the per-word form/terminator histograms and the token store,
//   4. count which token follows each k-gram (k < maxK),
//   5. turn those counts into the cumulative tables stored in _stats.
// Progress is written to std::cout. The process exits with status 1 when
// the Aspell speller cannot be created.
void rawr::compile(int maxK)
{
  _maxK = maxK;

  std::vector<std::vector<token_id>> tokens;
  std::set<std::string> thashtags;
  std::set<std::string> fv_emoticons;

  std::ifstream fvefile("emoticons.txt");
  if (fvefile)
  {
    std::string line;
    while (getline(fvefile, line))
    {
      fv_emoticons.insert(line);
      emoticons.forms.add(line);
    }
  }

  fvefile.close();

  std::map<std::string, std::string> canonical_form;

  AspellConfig* spell_config = new_aspell_config();
  AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
  if (aspell_error_number(possible_err) != 0)
  {
    std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
    exit(1);
  }

  AspellSpeller* spell_checker = to_aspell_speller(possible_err);

  // Character classification helpers. The <cctype> functions require a
  // value representable as unsigned char, which a plain (possibly negative)
  // char from UTF-8 text is not.
  auto is_lower = [] (unsigned char ch) { return ::islower(ch) != 0; };
  auto is_upper = [] (unsigned char ch) { return ::isupper(ch) != 0; };

  std::cout << "Reading emojis..." << std::endl;
  prefix_search emojis;
  std::ifstream emoji_file("emojis.txt");
  if (emoji_file)
  {
    // Loop on the read itself: looping on eof() processed one phantom empty
    // line at the end and then called back() on an empty string.
    std::string rawmojis;
    while (getline(emoji_file, rawmojis))
    {
      if (!rawmojis.empty() && rawmojis.back() == '\r')
      {
        rawmojis.pop_back();
      }

      // Blank lines carry no emoji.
      if (!rawmojis.empty())
      {
        emojis.add(rawmojis);
      }
    }

    emoji_file.close();
  }

  std::cout << "Tokenizing corpus...   0%" << std::flush;
  int len = 0;
  for (const auto& c : _corpora)
  {
    len += c.length();
  }

  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
  for (int i = 0; i < _corpora.size(); i++)
  {
    size_t start = 0;
    int end = 0;
    std::vector<token_id> tkcor;

    while (end != std::string::npos)
    {
      // len is 0 when every corpus is empty; skip the percentage then
      // instead of dividing by zero.
      perprime = (len > 0) ? ((startper + end) * 100 / len) : per;
      if (perprime != per)
      {
        per = perprime;

        std::cout << "\b\b\b\b" << std::right;
        std::cout.width(3);
        std::cout << per << "%" << std::flush;
      }

      end = _corpora[i].find_first_of(" \n", start);

      bool emoji = false;
      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
      std::string t = "";

      if (te.compare("") && te.compare(".") && te.compare(" "))
      {
        if (te.back() == ' ')
        {
          te.pop_back();
        }

        // Extract strings of emojis into their own tokens even if they're not space delimited
        int m = emojis.match(te);
        emoji = m > 0;
        if (m == 0) m = 1;
        t = te.substr(0,m);
        te = te.substr(m);

        while (!te.empty())
        {
          m = emojis.match(te);
          if (emoji == (m > 0))
          {
            if (m == 0) m = 1;
            t += te.substr(0,m);
            te = te.substr(m);
          } else {
            // The run changed between emoji and non-emoji: end the token
            // here and let the next iteration resume after it.
            end = start + t.length() - 1;
            break;
          }
        }

        std::string tc(t);
        std::transform(tc.begin(), tc.end(), tc.begin(),
          [] (unsigned char ch) { return static_cast<char>(::tolower(ch)); });

        // Strip leading and trailing delimiters to get the canonical form
        int pst = tc.find_first_not_of("\"([*");
        int dst = tc.find_last_not_of("\")]*.,?!\n;:");
        std::string canonical("");
        if ((pst != std::string::npos) && (dst != std::string::npos))
        {
          canonical = std::string(tc, pst, dst - pst + 1);
        }

        // Pick (or create) the word entry this token belongs to
        word& w = ([&] () -> word& {
          // Hashtag freevar
          if (canonical[0] == '#')
          {
            thashtags.insert(canonical);

            return hashtags;
          }

          // Emoticon freevar
          if (emoji)
          {
            emoticons.forms.add(canonical);

            return emoticons;
          }

          if ((pst != std::string::npos) && (dst != std::string::npos))
          {
            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
            if (fv_emoticons.count(emoticon_canon) == 1)
            {
              emoticons.forms.add(emoticon_canon);

              return emoticons;
            }
          }

          // Basically any other word
          if (canonical_form.count(canonical) == 0)
          {
            if (
              // Legacy freevars should be distinct from tokens containing similar words
              (canonical.find("$name$") != std::string::npos)
              // Words with no letters will be mangled by the spell checker
              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
              )
            {
              canonical_form[canonical] = canonical;
              words.emplace(canonical, canonical);
            } else {
              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
              if (correct)
              {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
              } else {
                // Misspelled: fold the form into Aspell's first suggestion,
                // if there is one
                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
                const char* replacement = aspell_string_enumeration_next(elements);
                if (replacement != NULL)
                {
                  std::string sugrep(replacement);
                  canonical_form[canonical] = sugrep;

                  if (words.count(sugrep) == 0)
                  {
                    words.emplace(sugrep, sugrep);
                  }
                } else {
                  words.emplace(canonical, canonical);
                  canonical_form[canonical] = canonical;
                }

                delete_aspell_string_enumeration(elements);
              }
            }
          }

          word& tw = words.at(canonical_form.at(canonical));
          tw.forms.add(canonical);

          return tw;
        })();

        token tk(w);
        tk.raw = t;

        // Count the opening delimiters at the front of the raw token
        for (char c : t)
        {
          if (c == '*')
          {
            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
          } else if (c == '[')
          {
            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
          } else if (c == '(')
          {
            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
          } else if (c == '"')
          {
            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
          } else {
            break;
          }
        }

        // Examine the trailing punctuation and closing delimiters
        int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
        if (backtrack != t.length())
        {
          std::string ending = t.substr(backtrack);
          std::string suffix;
          bool newline = false;
          bool terminating = false;

          for (char c : ending)
          {
            if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
            {
              suffix += c;
              terminating = true;

              continue;
            } else if (c == '\n')
            {
              newline = true;
              terminating = true;

              continue;
            }

            // Every other character in "ending" is one of the four closing
            // delimiters handled below.
            parentype pt = ([&] {
              switch (c)
              {
                case ']': return parentype::square_bracket;
                case ')': return parentype::paren;
                case '*': return parentype::asterisk;
                case '"': return parentype::quote;
              }
            })();

            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
              tk.delimiters[{pt, doublestatus::opening}]--;
              tk.delimiters[{pt, doublestatus::both}]++;
            } else {
              tk.delimiters[{pt, doublestatus::closing}]++;
            }
          }

          if (terminating)
          {
            if ((suffix == ",") && (!newline))
            {
              tk.suffix = suffixtype::comma;
            } else {
              tk.suffix = suffixtype::terminating;

              if (!newline)
              {
                w.terms.add({suffix, false});
              } else {
                w.terms.add({".", false});
              }
            }
          }
        }

        tkcor.push_back(_tokenstore.add(tk));
      }

      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
    }

    tokens.push_back(tkcor);

    startper += _corpora[i].length();
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  delete_aspell_speller(spell_checker);
  delete_aspell_config(spell_config);

  std::cout << canonical_form.size() << " distinct forms" << std::endl;
  std::cout << words.size() << " distinct words" << std::endl;

  // Time to condense the distribution stuff for the words
  std::cout << "Compiling token histograms..." << std::endl;
  for (auto& it : words)
  {
    it.second.forms.compile();
    it.second.terms.compile();
  }

  // Hashtag freevar is not frequency distributed
  for (auto& it : thashtags)
  {
    hashtags.forms.add(it);
  }

  hashtags.forms.compile();
  hashtags.terms.compile();

  // Compile other freevars
  emoticons.forms.compile();
  emoticons.terms.compile();

  // Compile the interned tokens.
  _tokenstore.compile();

  // kgram distribution
  std::cout << "Creating markov chain...   0%" << std::flush;
  std::map<kgram, std::map<token_id, token_data> > tstats;

  len = 0;
  for (const auto& c : tokens)
  {
    len += (maxK-1) * c.size();
  }

  startper = 0;
  per = 0;
  perprime = 0;
  int corpid = 0;
  for (const auto& corpus : tokens)
  {
    // Prefixes anchored at the start of the corpus
    for (int k=0; k<maxK && k<corpus.size(); k++)
    {
      // The zero'th token should be a terminator.
      token_id fid = corpus[k];
      const token& f = _tokenstore.get(fid);

      kgram term_prefix(corpus.begin(), corpus.begin()+k);
      term_prefix.push_front(wildcardQuery);

      if (tstats[term_prefix].count(fid) == 0)
      {
        tstats[term_prefix].emplace(fid, fid);
      }

      token_data& td2 = tstats[term_prefix].at(fid);
      td2.all++;
      td2.corpora.insert(corpid);

      if (std::find_if(f.raw.begin(), f.raw.end(), is_lower) == f.raw.end())
      {
        td2.uppercase++;
      } else if (is_upper(f.raw[0]))
      {
        td2.titlecase++;
      }
    }

    // Every k-gram of length k and the token that follows it
    for (int k=1; k<maxK && k<corpus.size(); k++)
    {
      for (int i=0; i<(corpus.size() - k); i++)
      {
        perprime = (startper+i) * 100 / len;
        if (perprime != per)
        {
          per = perprime;

          std::cout << "\b\b\b\b" << std::right;
          std::cout.width(3);
          std::cout << per << "%" << std::flush;
        }

        kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
        token_id fid = corpus[i+k];
        const token& f = _tokenstore.get(fid);

        if (tstats[prefix].count(fid) == 0)
        {
          tstats[prefix].emplace(fid, fid);
        }

        token_data& td = tstats[prefix].at(fid);
        td.all++;
        td.corpora.insert(corpid);

        if (std::find_if(f.raw.begin(), f.raw.end(), is_lower) == f.raw.end())
        {
          td.uppercase++;
        } else if (is_upper(f.raw[0]))
        {
          td.titlecase++;
        }

        // A prefix that starts with a terminating token is also recorded
        // with that first token replaced by the wildcard.
        const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);

          if (tstats[term_prefix].count(fid) == 0)
          {
            tstats[term_prefix].emplace(fid, fid);
          }

          token_data& td2 = tstats[term_prefix].at(fid);
          td2.all++;
          td2.corpora.insert(corpid);

          if (std::find_if(f.raw.begin(), f.raw.end(), is_lower) == f.raw.end())
          {
            td2.uppercase++;
          } else if (is_upper(f.raw[0]))
          {
            td2.titlecase++;
          }
        }
      }

      startper += corpus.size();
    }

    corpid++;
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  // Condense the kgram distribution
  std::cout << "Compiling kgram distributions...   0%";
  len = tstats.size();
  per = 0;
  perprime = 0;
  int indicator = 0;
  for (auto& it : tstats)
  {
    indicator++;
    perprime = indicator * 100 / len;
    if (per != perprime)
    {
      per = perprime;

      std::cout << "\b\b\b\b" << std::right;
      std::cout.width(3);
      std::cout << per << "%" << std::flush;
    }

    const kgram& klist = it.first;
    auto& probtable = it.second;
    auto& distribution = _stats[klist];
    int max = 0;

    // Key each entry by the running total of occurrence counts
    for (auto& kt : probtable)
    {
      max += kt.second.all;

      distribution.emplace(max, kt.second);
    }
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  _compiled = true;
}
Esempio n. 28
0
/*
 * Finds the language whose Aspell dictionary contains the typed word.
 *
 * For each configured language the characters of `word` are mapped to
 * keycodes and back to that language's symbols, and the resulting string is
 * looked up in the language's Aspell dictionary.
 *
 * word: typed characters
 * len:  number of characters in word
 *
 * Returns the index of the first language whose speller returns 1, or
 * NO_LANGUAGE when the word is shorter than two characters or no
 * dictionary contains it.
 */
static int get_aspell_hits(const char *word, int len)
{
	if (len < 2)
	{
		log_message(DEBUG, "   [-]Skip aspell checking (word is very short)");
		return NO_LANGUAGE;
	}

	AspellConfig *spell_config = new_aspell_config();

	for (int lang = 0; lang < xconfig->total_languages; lang++)
	{
		char *lang_word = (char *) malloc(1 * sizeof(char));
		if (lang_word == NULL)
			continue;
		lang_word[0] = NULLSYM;
		size_t lang_word_len = 0;

		/* Rebuild the word as it would read in this language's layout. */
		for (int i = 0; i < len; i++)
		{
			KeyCode kc;
			int modifier;
			main_window->xkeymap->char_to_keycode(main_window->xkeymap, word[i], &kc, &modifier);

			char *symbol = keycode_to_symbol(kc, lang, modifier);
			if (symbol == NULL)
				continue;

			size_t symbol_len = strlen(symbol);
			char *grown = (char *) realloc(lang_word, (lang_word_len + symbol_len + 1) * sizeof(char));
			if (grown == NULL)
			{
				/* Out of memory: drop this language without leaking the
				 * partial word or the current symbol. */
				free(symbol);
				free(lang_word);
				lang_word = NULL;
				break;
			}
			lang_word = grown;
			memcpy(lang_word + lang_word_len, symbol, symbol_len + 1);
			lang_word_len += symbol_len;

			free(symbol);
		}

		if (lang_word == NULL)
			continue;

		aspell_config_replace(spell_config, "lang", xconfig->languages[lang].dir);
		AspellCanHaveError *possible_err = new_aspell_speller(spell_config);

		if (aspell_error_number(possible_err) != 0)
		{
			log_message(DEBUG, "   [!]Error aspell checking for %s aspell dictionary", xconfig->get_lang_name(xconfig, lang));
			/* The failed result object was never released before. */
			delete_aspell_can_have_error(possible_err);
			free(lang_word);
			continue;
		}

		AspellSpeller *spell_checker = to_aspell_speller(possible_err);
		int correct = aspell_speller_check(spell_checker, lang_word, (int) lang_word_len);

		delete_aspell_speller(spell_checker);
		free(lang_word);

		/* Only an explicit 1 is a hit; a failure code must not count. */
		if (correct == 1)
		{
			log_message(DEBUG, "   [+]Found this word in %s aspell dictionary", xconfig->get_lang_name(xconfig, lang));
			delete_aspell_config(spell_config);
			return lang;
		}
	}

	/* The config object was leaked on every call before. */
	delete_aspell_config(spell_config);

	log_message(DEBUG, "   [-]This word has no hits for all aspell dictionaries");
	return NO_LANGUAGE;
}