/**
 * Appends Aspell's spelling suggestions for `word` to `suggestionsList`.
 *
 * Suggestions are appended in the order Aspell yields them. When
 * limitCandidates_ is non-zero, appending stops as soon as the list holds
 * at least that many entries (entries already in the list count too).
 * If Aspell yields no word list (it returns NULL on error), the list is
 * left untouched.
 *
 * @param suggestionsList output list the suggestions are appended to.
 * @param word            the word to look up, passed NUL-terminated.
 */
void AspellAdapterImpl::getSuggestionsForLastWord(
                       SuggestionsList & suggestionsList,
                       const std::string & word
                       ) {

    const AspellWordList * suggestions = aspell_speller_suggest(
                        aspellSpeller_,
                        word.c_str(), -1);
    if (suggestions == NULL) {
        // Nothing to enumerate; dereferencing a null list would crash.
        return;
    }

    AspellStringEnumeration * elements = aspell_word_list_elements(suggestions);
    if (elements == NULL) {
        return;
    }

    const char * currentWordSuggestion;
    while ( (currentWordSuggestion = aspell_string_enumeration_next(elements)) != NULL ) {
        if (limitCandidates_ &&
            (limitCandidates_ <= suggestionsList.size())) {
            break;
        }

        suggestionsList.push_back(std::string(currentWordSuggestion));
    }

    // The enumeration is owned by the caller; the word list itself is not.
    delete_aspell_string_enumeration(elements);
}
Example #2
0
/**
 * Returns spelling suggestions for `word`.
 *
 * The speller is chosen from the first character: a-z (case-insensitive)
 * selects m_spell_checker_en, anything else selects m_spell_checker_ru.
 * An empty list is returned for an empty word, a missing speller, or when
 * Aspell yields no word list.
 */
QStringList SpellChecker::suggestions(QString word)
{
	QStringList ret;

	if(word.isEmpty()) return ret;

	const QChar first = word[0].toLower();
	AspellSpeller * checker;
	if(first>=QChar('a') && first<=QChar('z')){
		checker = m_spell_checker_en;
	}
	else{
		checker = m_spell_checker_ru;
	}
	if(!checker) return ret;

	// Keep the encoded bytes alive for the duration of the Aspell call.
	const QByteArray encoded = m_codec->fromUnicode(word);
	const AspellWordList * suggestions = aspell_speller_suggest(checker, encoded.constData(), -1);
	if(!suggestions) return ret; // Aspell returns NULL on error

	AspellStringEnumeration * elements = aspell_word_list_elements(suggestions);
	const char * suggestion;
	while ( (suggestion = aspell_string_enumeration_next(elements)) != 0 ) {
		ret << m_codec->toUnicode(suggestion);
	}
	delete_aspell_string_enumeration(elements);

	return ret;
}
Example #3
0
/**
 * Retrieve the value of a specific option as list.
 * @param key the option as string.
 * @return a ruby array holding each list entry as a ruby string.
 * @raise cAspellError (with aspell's error message) if the config reports an error.
 */
static VALUE aspell_conf_retrieve_list(VALUE self, VALUE key) {
    AspellSpeller *speller = get_speller(self);
    AspellConfig *config = aspell_speller_config(speller);
    AspellStringList * list = new_aspell_string_list();
    AspellMutableContainer * container  = aspell_string_list_to_mutable_container(list);
    AspellStringEnumeration * els;
    VALUE result = rb_ary_new();
    const char *option_value;

    //retrieve list
    aspell_config_retrieve_list(config, STR2CSTR(key), container);
    //check for error
    if (aspell_config_error(config) != 0) {
        // Build the exception object first so the message lives in a
        // GC-managed ruby string: raising never returns, so any C heap
        // copy of the message made here could not be freed afterwards.
        VALUE error = rb_exc_new2(cAspellError, aspell_config_error_message(config));
        delete_aspell_string_list(list);
        rb_exc_raise(error);
    }

    //iterate over list
    els = aspell_string_list_elements(list);
    while ( (option_value = aspell_string_enumeration_next(els)) != 0) {
        //push the option value to result
        rb_ary_push(result, rb_str_new2(option_value));
    }
    //free list
    delete_aspell_string_enumeration(els);
    delete_aspell_string_list(list);

    return result;
}
Example #4
0
/**
 * Fetch aspell's suggestions for `word`.
 *
 * On success *sug receives a malloc'ed array of strdup'ed strings and the
 * number of entries stored in it is returned; the caller owns both the
 * array and the strings. 0 is returned (and *sug is left untouched) when
 * `sug` is NULL, the checker is unusable, aspell yields no list, or memory
 * runs out.
 */
int spellcheck_suggest(void * chk, char ***sug, const char * word)
{
    struct linkgrammar_aspell *aspell = (struct linkgrammar_aspell *)chk;
    const AspellWordList *list = NULL;
    AspellStringEnumeration *elem = NULL;
    const char *aword = NULL;
    unsigned int size, i;
    char **array = NULL;

    if (!sug) {
        prt_error("Error: Aspell. Corrupt pointer.\n");
        return 0;
    }
    if (!aspell || !aspell->speller || !word) {
        return 0;
    }

    list = aspell_speller_suggest(aspell->speller, word, -1);
    if (!list) {
        /* aspell reports errors with a NULL list */
        return 0;
    }
    size = aspell_word_list_size(list);

    /* allocate an array of char* for returning back to link-parser;
     * never request 0 bytes, since malloc(0) may legitimately return NULL
     * and would be mistaken for an out-of-memory condition.
     */
    array = (char **)malloc(sizeof(char *) * (size ? size : 1));
    if (!array) {
        prt_error("Error: Aspell. Out of memory.\n");
        return 0;
    }

    elem = aspell_word_list_elements(list);
    i = 0;
    /* bound the loop by the allocated size so the array cannot overflow */
    while (i < size && (aword = aspell_string_enumeration_next(elem)) != NULL) {
        char *copy = strdup(aword);
        if (!copy) {
            prt_error("Error: Aspell. Out of memory.\n");
            while (i > 0) {
                free(array[--i]);
            }
            free(array);
            delete_aspell_string_enumeration(elem);
            return 0;
        }
        array[i++] = copy;
    }
    delete_aspell_string_enumeration(elem);

    *sug = array;
    /* report what was actually stored, so no slot is left uninitialised */
    return (int)i;
}
Example #5
0
/*
 * Prints a config option (default "extra-dicts", or argv[1]) to stderr,
 * first as a single value and then entry by entry as a list.
 */
int
main(int argc, char *argv[])
{
  AspellConfig *conf;
  const char *tmp;
  const char *option = "extra-dicts";

  AspellStringList * lst = new_aspell_string_list();
  AspellMutableContainer * lst0  = aspell_string_list_to_mutable_container(lst);
  AspellStringEnumeration *els;

  if(argc > 1)
      option  = argv[1];

  conf = new_aspell_config();
  tmp = aspell_config_retrieve(conf, option);
  /* passing NULL to %s is undefined behaviour, so substitute a marker */
  fprintf(stderr, "%s = %s\n", option, tmp ? tmp : "(null)");

  aspell_config_retrieve_list(conf, option, lst0);
  els = aspell_string_list_elements(lst);
  while( (tmp = aspell_string_enumeration_next(els)) != 0 ) {
      fprintf(stderr, "%s\n", tmp);
  }

  /* release everything this program created */
  delete_aspell_string_enumeration(els);
  delete_aspell_string_list(lst);
  delete_aspell_config(conf);

  return(0);
}
Example #6
0
/**
 * Utility function that wraps a list of words as ruby array of ruby strings.
 * @param list an aspell wordlist; may be null.
 * @return an ruby array, containing all words as ruby strings
 *         (empty when list is null).
 */
static VALUE get_list(const AspellWordList *list) {
    VALUE result;
    AspellStringEnumeration * els;
    const char * word;

    /* Check for null before touching the list: asking a null list for
     * its size would dereference it. */
    if (list == 0) {
        return rb_ary_new();
    }

    result = rb_ary_new2(aspell_word_list_size(list));
    els = aspell_word_list_elements(list);
    while ( (word = aspell_string_enumeration_next(els)) != 0) {
        rb_ary_push(result, rb_str_new2(word));
    }
    delete_aspell_string_enumeration(els);

    return result;
}
Example #7
0
/**
 * Returns aspell's suggestions for the NUL-terminated `word`, decoded as
 * UTF-8. The result is empty when aspell yields no word list.
 */
QStringList Speller::suggestions(const char *word)
{
    QStringList res;
    const AspellWordList *wl = aspell_speller_suggest(speller, word, -1);
    if (wl){
        AspellStringEnumeration *els = aspell_word_list_elements(wl);
        const char *candidate;
        while ((candidate = aspell_string_enumeration_next(els)) != NULL) {
            res.append(QString::fromUtf8(candidate));
        }
        // The enumeration is allocated per call and must be released here.
        delete_aspell_string_enumeration(els);
    }
    return res;
}
Example #8
0
void SpellCheck::suggestions(const QString &word, QStringList &list){
    if (!spell_checker || word.isEmpty())
        return;

    const AspellWordList *suggestions = aspell_speller_suggest(spell_checker, word.toUtf8().constData(), word.length());
    AspellStringEnumeration *elements = aspell_word_list_elements(suggestions);

    const char * sugg;
    while ((sugg = aspell_string_enumeration_next(elements)) != NULL ){
        list.append(QString::fromUtf8(sugg, strlen(sugg)));
    }

    delete_aspell_string_enumeration(elements);
}
Example #9
0
/**
 * Returns aspell's suggestions for `word` (UTF-8 in, UTF-8 out).
 * The result is empty when there is no speller or aspell yields no list.
 */
QList<QString> ASpellChecker::suggestions(const QString& word)
{
	QList<QString> words;
	if (!speller_) {
		return words;
	}

	// Hold the encoded bytes in a named object so the pointer stays valid.
	const QByteArray utf8 = word.toUtf8();
	const AspellWordList* list = aspell_speller_suggest(speller_, utf8.constData(), -1);
	if (!list) {
		// aspell signals an error with a null list
		return words;
	}

	AspellStringEnumeration* elements = aspell_word_list_elements(list);
	const char *c_word;
	while ((c_word = aspell_string_enumeration_next(elements)) != NULL) {
		words += QString::fromUtf8(c_word);
	}
	delete_aspell_string_enumeration(elements);

	return words;
}
Example #10
0
/*
 * Writes every word of `wl` to stdout, each followed by `delem`.
 * A null list is reported as an error using the speller's error message.
 */
static void print_word_list(AspellSpeller * speller, 
			    const AspellWordList *wl,
			    char delem) 
{
  if (wl == 0) {
    printf("Error: %s\n", aspell_speller_error_message(speller));
  } else {
    AspellStringEnumeration * els = aspell_word_list_elements(wl);
    const char * word;
    while ( (word = aspell_string_enumeration_next(els)) != 0) {
      fputs(word, stdout);
      putc(delem, stdout);
    }
    /* the enumeration is allocated per call and must be released */
    delete_aspell_string_enumeration(els);
  }
}
Example #11
0
/*
 * Replaces every misspelled word of in_line by aspell's first suggestion.
 *
 * The line is converted with sd->spelling->conv, corrected in a private
 * buffer, and converted back with sd->spelling->conv_out.
 * Misspellings without any suggestion are left as they are.
 *
 * Returns the converted result, or NULL if a conversion or an allocation
 * fails. Remember to free the returned string.
 */
char *
spelling_document_line(struct spelling_document *sd, char *in_line)
{
	char *newline;
	char *line;
	long diff;
	size_t line_len, line_size;
	struct AspellToken token;

	line = spelling_conv(sd->spelling->conv, in_line);
	if (line == NULL)
		return NULL;

	line_len = strlen(line);
	/* some slack for longer replacements, plus room for the NUL */
	line_size = line_len + (line_len / 10) + 1;
	if ((newline = malloc(line_size)) == NULL) {
		free(line);
		return NULL;
	}

	memcpy(newline, line, line_len + 1);
	free(line);
	aspell_document_checker_process(sd->checker, newline, (int)line_len);

	/* token offsets refer to the text as processed; diff tracks how far
	 * earlier replacements have shifted the remainder of the buffer */
	diff = 0;
	while (token = aspell_document_checker_next_misspelling(sd->checker),
	       token.len != 0) {
		char *word_begin;
		const char *word;
		size_t word_len, tail_len, begin_off, needed;
		const AspellWordList *wl;
		AspellStringEnumeration *els;

		word_begin = newline + token.offset + diff;
		wl = aspell_speller_suggest(sd->spelling->speller, word_begin, token.len);
		if (wl == NULL)
			continue;
		els = aspell_word_list_elements(wl);
		if ((word = aspell_string_enumeration_next(els)) == NULL) {
			delete_aspell_string_enumeration(els);
			continue;
		}

		word_len = strlen(word);
		tail_len = strlen(word_begin + token.len);
		begin_off = (size_t)(word_begin - newline);
		needed = begin_off + word_len + tail_len + 1;
		if (needed > line_size) {
			/* the replacement does not fit: grow the buffer first */
			size_t new_size = needed + needed / 10;
			char *grown = realloc(newline, new_size);

			if (grown == NULL) {
				delete_aspell_string_enumeration(els);
				free(newline);
				return NULL;
			}
			newline = grown;
			line_size = new_size;
			word_begin = newline + begin_off;
		}

		memmove(word_begin + word_len, word_begin + token.len, tail_len + 1);
		memcpy(word_begin, word, word_len);
		/* release the enumeration only after the suggestion was copied */
		delete_aspell_string_enumeration(els);

		diff += (long)word_len - (long)token.len;
	}

	line = spelling_conv(sd->spelling->conv_out, newline);
	free(newline);

	return line;
}
Example #12
0
/**
 * Collects suggestions for `word` from every speller in spellers_.
 *
 * Only suggestions longer than two characters are kept. Spellers for
 * which aspell yields no word list are skipped.
 */
QList<QString> ASpellChecker::suggestions(const QString& word)
{
    QList<QString> words;

    // Encode once; the bytes are the same for every speller.
    const QByteArray utf8 = word.toUtf8();

    foreach(AspellSpeller* speller, spellers_) {
        const AspellWordList* list = aspell_speller_suggest(speller, utf8.constData(), -1);
        if (!list) {
            // aspell signals an error with a null list
            continue;
        }

        AspellStringEnumeration* elements = aspell_word_list_elements(list);
        const char *c_word;
        while ((c_word = aspell_string_enumeration_next(elements)) != NULL) {
            QString suggestion = QString::fromUtf8(c_word);
            if(suggestion.size() > 2)
                words.append(suggestion);
        }
        delete_aspell_string_enumeration(elements);
    }
    return words;
}
Example #13
0
void checkTheWord(char* word,int checkDoc)
{
#if 1
	int							correct;
	AspellWordList*				suggestions;
	AspellStringEnumeration*	elements;
	const char*					suggestedword;
	int							wordcnt=0;
	char*						wordlist[100];
	char*						labeltext[512];

	correct=aspell_speller_check(spellChecker,word,-1);
	if(!correct)
		{
			badWord=word;
			cancelCheck=false;
			if(spellCheckWord==NULL)
				buildWordCheck(checkDoc);
			else
				{
					for(int j=0; j<numWords; j++)
						gtk_combo_box_text_remove((GtkComboBoxText*)wordListDropbox,0);

					sprintf((char*)&labeltext,"Change <i><b>%s</b></i> to: ",badWord);
					gtk_label_set_text((GtkLabel*)badWordLabel,(char*)&labeltext);
					gtk_label_set_use_markup((GtkLabel*)badWordLabel,true);
				}

			suggestions=(AspellWordList*)aspell_speller_suggest(spellChecker,word,-1);
			elements=aspell_word_list_elements(suggestions);
			while((suggestedword=aspell_string_enumeration_next(elements))!=NULL)
				{
					wordlist[wordcnt]=strdup(suggestedword);
					gtk_combo_box_text_append_text((GtkComboBoxText*)wordListDropbox,wordlist[wordcnt]);
					wordcnt++;
				}
			numWords=wordcnt;
			delete_aspell_string_enumeration(elements);
			gtk_combo_box_set_active((GtkComboBox*)wordListDropbox,0);
			gtk_widget_show_all(spellCheckWord);
			gtk_dialog_run((GtkDialog *)spellCheckWord);
		}
#endif
}
Example #14
0
//__________________________________________________________________________
void Speller::Aspell::Suggest::getConfigOpt(const std::string& opt,
					    std::vector<std::string>& vals)
{
	// Stores current setting of configuration option, 'opt', which
	// has a value of list type, in 'vals'.
	AspellStringList* list = new_aspell_string_list();
	AspellMutableContainer* lst0 =
		aspell_string_list_to_mutable_container( list );
	aspell_config_retrieve_list( fconfig, opt.c_str(), lst0 );
	AspellStringEnumeration* enum_list =
		aspell_string_list_elements( list );
	const char* next;
	while( (next = aspell_string_enumeration_next( enum_list )) )
	{
		vals.push_back( next );
	}
	delete_aspell_string_enumeration( enum_list );
	delete_aspell_string_list( list );
}
/**
 * Returns aspell's suggestions for `word`; empty when there is no speller
 * or aspell yields no word list.
 *
 * NOTE(review): the word is sent in the local 8-bit encoding but the
 * suggestions are decoded as Latin-1 — confirm both match the speller's
 * configured encoding.
 */
QStringList KAspellChecker::suggestions(const QString &word)
{
    if (!m_speller) 
        return QStringList();

    QStringList suggs;

    const AspellWordList *wordList = aspell_speller_suggest(m_speller, word.toLocal8Bit().data(), -1);
	
    if (wordList == 0)
        return suggs;

    AspellStringEnumeration *els = aspell_word_list_elements(wordList);
    const char *ws;
    while ((ws = aspell_string_enumeration_next(els)) != 0) 
           suggs << QString::fromLatin1(ws);

    // The enumeration is allocated per call and must be released here.
    delete_aspell_string_enumeration(els);

    return suggs;
}
Example #16
0
//__________________________________________________________________________
void
Speller::Aspell::Suggest::storeWordList(const AspellWordList* wlist,
					std::vector<std::string>& replacement)
	throw( std::invalid_argument )
{
	if( ! wlist )
	{
		throw std::invalid_argument( "(Aspell.Speller.Suggest.store"
					     "WordList): word list pointer "
					     "is null." );
	}

	AspellStringEnumeration* enum_list =
		aspell_word_list_elements( wlist );
	const char* next;
	while( (next = aspell_string_enumeration_next( enum_list )) )
	{
		replacement.push_back( next );
	}
	delete_aspell_string_enumeration( enum_list );
}
Example #17
0
//__________________________________________________________________________
void
Speller::Aspell::Suggest::printWordList(const AspellWordList* wlist,
					char delim)
	throw( std::invalid_argument )
{
	if( ! wlist )
	{
		throw std::invalid_argument( "(Aspell.Speller.Suggest.print"
					     "WordList): word list pointer "
					     "is null." );
	}

	AspellStringEnumeration* enum_list =
		aspell_word_list_elements( wlist );
	const char* next;
	while( (next = aspell_string_enumeration_next( enum_list )) )
	{
		std::cout << next << delim;
	}
	delete_aspell_string_enumeration( enum_list );
}
Example #18
0
/* helper function: converts an aspell string list into python list
 *
 * Returns a new reference to a list of python strings, or NULL with a
 * python exception set if the list or one of its items cannot be built.
 */
static PyObject* AspellStringList2PythonList(const AspellStringList* wordlist) {
	PyObject* list;
	PyObject* item;
	AspellStringEnumeration* elements;
	const char* word;

	list = PyList_New(0);
	if (!list) {
		PyErr_SetString(PyExc_Exception, "can't create new list");
		return NULL;
	}

	elements = aspell_string_list_elements(wordlist);
	while ( (word=aspell_string_enumeration_next(elements)) != 0) {
		item = Py_BuildValue("s", word);
		if (!item) {
			/* Py_BuildValue has already set the exception */
			delete_aspell_string_enumeration(elements);
			Py_DECREF(list);
			return NULL;
		}
		if (PyList_Append(list, item) == -1) {
			Py_DECREF(item);
			PyErr_SetString(PyExc_Exception, "It is almost impossible, but happend! Can't append element to the list.");
			delete_aspell_string_enumeration(elements);
			Py_DECREF(list);
			return NULL;
		}
		/* PyList_Append takes its own reference; drop ours so the item
		 * is not leaked */
		Py_DECREF(item);
	}
	delete_aspell_string_enumeration(elements);
	return list;
}
Example #19
0
/**
 * Returns up to 10 suggestions (from spell_checker1) for the word under
 * the text cursor.
 *
 * The result is empty when either speller is missing, no word is selected,
 * either speller's check returns non-zero for the word, or aspell yields
 * no word list.
 */
QStringList SpellChecker::suggestions()
{
    QStringList sl;
    if ((spell_checker1 == 0) || (spell_checker2 == 0))
        return sl;
    QTextCursor cursor = m_textEdit->textCursor();
    cursor.select(QTextCursor::WordUnderCursor);
    QString word = cursor.selectedText();
    if (word.isEmpty())
        return sl;
    QByteArray ba = word.toUtf8();
    if ((aspell_speller_check(spell_checker2, ba.data(), ba.size()) != 0)||(aspell_speller_check(spell_checker1, ba.data(), ba.size()) != 0))
        return sl;
    const struct AspellWordList *awl = aspell_speller_suggest(spell_checker1, ba.data(), ba.size());
    // aspell signals an error with a null list; asking it for its size
    // would dereference the null pointer.
    if (awl == 0)
        return sl;
    if (aspell_word_list_size(awl) > 0) {
        struct AspellStringEnumeration *ase = aspell_word_list_elements(awl);
        int i  = 0;
        while ((!aspell_string_enumeration_at_end(ase))&&(i < 10)) {
            const char *text = aspell_string_enumeration_next(ase);
            if (text == 0)
                break;
            sl << QString::fromUtf8(text);
            i++;
        }
        delete_aspell_string_enumeration(ase);
    }
    return sl;
}
Example #20
0
/*
 * Builds the context menu for a misspelled `word`.
 *
 * The top menu starts with an "Add ... to Dictionary" item and a
 * separator, followed by aspell's suggestions. After every 10 suggestions
 * the rest continue in a nested "More..." submenu. When there is nothing
 * to suggest (or aspell yields no word list) a "(no suggestions)"
 * placeholder is prepended instead.
 *
 * `buffer` is not used by this function.
 * Returns the top-level menu widget.
 */
GtkWidget*
build_suggestion_menu(GtkSpell *spell, GtkTextBuffer *buffer,
                      const char *word) {
	const char *suggestion;
	GtkWidget *topmenu, *menu;
	GtkWidget *mi;
	int count = 0;
	const AspellWordList *suggestions;
	AspellStringEnumeration *elements;
	char *label;
	
	topmenu = menu = gtk_menu_new();

	/* + Add to Dictionary */
	label = g_strdup_printf("Add \"%s\" to Dictionary", word);
	mi = gtk_image_menu_item_new_with_label(label);
	g_free(label);
	gtk_image_menu_item_set_image(GTK_IMAGE_MENU_ITEM(mi), 
			gtk_image_new_from_stock(GTK_STOCK_ADD, GTK_ICON_SIZE_MENU));
	g_signal_connect(G_OBJECT(mi), "activate",
			G_CALLBACK(add_to_dictionary), spell);
	gtk_widget_show_all(mi);
	gtk_menu_shell_append(GTK_MENU_SHELL(topmenu), mi);

	/* Separator */
	mi = gtk_menu_item_new();
	gtk_widget_show(mi);
	gtk_menu_shell_append(GTK_MENU_SHELL(topmenu), mi);

	/* aspell signals an error with a NULL list; treat that like
	 * "no suggestions" instead of dereferencing it. */
	suggestions = aspell_speller_suggest(spell->speller, word, -1);
	elements = (suggestions != NULL)
		? aspell_word_list_elements(suggestions) : NULL;
	suggestion = (elements != NULL)
		? aspell_string_enumeration_next(elements) : NULL;

	if (suggestion == NULL) {
		/* no suggestions.  put something in the menu anyway... */
		GtkWidget *placeholder;
		placeholder = gtk_label_new("");
		gtk_label_set_markup(GTK_LABEL(placeholder), "<i>(no suggestions)</i>");

		mi = gtk_menu_item_new();
		gtk_container_add(GTK_CONTAINER(mi), placeholder);
		gtk_widget_show_all(mi);
		gtk_menu_shell_prepend(GTK_MENU_SHELL(menu), mi);
	} else {
		/* build a set of menus with suggestions. */
		while (suggestion != NULL) {
			if (count == 10) {
				/* current menu is full: separator, then continue
				 * in a "More..." submenu */
				mi = gtk_menu_item_new();
				gtk_widget_show(mi);
				gtk_menu_shell_append(GTK_MENU_SHELL(menu), mi);

				mi = gtk_menu_item_new_with_label("More...");
				gtk_widget_show(mi);
				gtk_menu_shell_append(GTK_MENU_SHELL(menu), mi);

				menu = gtk_menu_new();
				gtk_menu_item_set_submenu(GTK_MENU_ITEM(mi), menu);
				count = 0;
			}
			mi = gtk_menu_item_new_with_label(suggestion);
			g_signal_connect(G_OBJECT(mi), "activate",
					G_CALLBACK(replace_word), spell);
			gtk_widget_show(mi);
			gtk_menu_shell_append(GTK_MENU_SHELL(menu), mi);
			count++;
			suggestion = aspell_string_enumeration_next(elements);
		}
	}

	if (elements != NULL)
		delete_aspell_string_enumeration(elements);

	return topmenu;
}
Example #21
0
int main(int argc, const char *argv[]) 
{
  AspellCanHaveError * ret;
  AspellSpeller * speller;
  int have;
  char word[81];
  char * p;
  char * word_end;
  AspellConfig * config;

  if (argc < 2) {
    printf("Usage: %s <language> [<size>|- [[<jargon>|- [<encoding>]]]\n", argv[0]);
    return 1;
  }

  config = new_aspell_config();

  aspell_config_replace(config, "lang", argv[1]);

  if (argc >= 3 && argv[2][0] != '-' && argv[2][1] != '\0')
    aspell_config_replace(config, "size", argv[2]);

  if (argc >= 4 && argv[3][0] != '-')
    aspell_config_replace(config, "jargon", argv[3]);

  if (argc >= 5 && argv[4][0] != '-')
    aspell_config_replace(config, "encoding", argv[4]);

  ret = new_aspell_speller(config);

  delete_aspell_config(config);

  if (aspell_error(ret) != 0) {
    printf("Error: %s\n",aspell_error_message(ret));
    delete_aspell_can_have_error(ret);
    return 2;
  }
  speller = to_aspell_speller(ret);
  config = aspell_speller_config(speller);

  fputs("Using: ",                                      stdout);
  fputs(aspell_config_retrieve(config, "lang"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "jargon"),       stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "size"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "module"),       stdout);
  fputs("\n\n",                                         stdout);

  puts("Type \"h\" for help.\n");

  while (fgets(word, 80, stdin) != 0) {

    /* remove trailing spaces */

    word_end = strchr(word, '\0') - 1;
    while (word_end != word && (*word_end == '\n' || *word_end == ' ')) 
      --word_end;
    ++word_end;
    *word_end = '\0';
    
    putchar('\n');
    switch (word[0]) {
    case '\0':
      break;
    case 'h':
      puts(
	"Usage: \n"
	"  h(elp)      help\n"
	"  c <word>    check if a word is the correct spelling\n"
	"  s <word>    print out a list of suggestions for a word\n"
	"  a <word>    add a word to the personal word list\n"
	"  i <word>    ignore a word for the rest of the session\n"
        "  d <file>    spell checks a document\n"
	"  p           dumps the personal word list\n"
	"  P           dumps the session word list\n"
	"  m           dumps the main  word list\n"
        "  o <option> <value> sets a config option\n"
	"  r <option>         retrieves a config option\n"
        "  l <option>         retrieves a config option as a list\n"
	"  S           saves all word lists\n"
	"  C           clear the curent sesstion word list\n"
	"  x           quite\n"	);
      break;
    case 'p':
      print_word_list(speller, 
		      aspell_speller_personal_word_list(speller), '\n');
      break;
    case 'P':
      print_word_list(speller, 
		      aspell_speller_session_word_list(speller), '\n');
      break;
    case 'm':
      print_word_list(speller, 
		      aspell_speller_main_word_list(speller), '\n');
      break;
    case 'S':
      aspell_speller_save_all_word_lists(speller);
      check_for_error(speller);
      break;
    case 'C': 
      aspell_speller_clear_session(speller);
      check_for_error(speller);
      break;
    case 'x':
      goto END;
    case 'c':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	have = aspell_speller_check(speller, word + 2, -1);
	if (have == 1) 
	  puts("correct");
	else if (have == 0)
	  puts("incorrect");
	else
	  printf("Error: %s\n", aspell_speller_error_message(speller));
      }
      break;
    case 's':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	print_word_list(speller, 
			aspell_speller_suggest(speller, word + 2, -1), '\n');
      }
      break;
    case 'a':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_personal(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'i':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_session(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'o':
      word[80] = '\0'; /* to make sure strchr doesn't run off end of string */
      p = strchr(word + 3, ' ');
      if (strlen(word) < 3 || p == 0) {
	printf("Usage: %c <option> <value>\n", word[0]);
      } else {
	*p = '\0';
	++p;
	aspell_config_replace(config, word + 2, p);
	check_for_config_error(config);
      }
      break;
    case 'r':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	const char * val = aspell_config_retrieve(config, word + 2);
	check_for_config_error(config);
	if (val)
	  printf("%s = \"%s\"\n", word + 2, val);
      }
      break;
    case 'l':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	AspellStringList * lst = new_aspell_string_list();
	AspellMutableContainer * lst0 
	  = aspell_string_list_to_mutable_container(lst);
	AspellStringEnumeration * els;
	const char * val;
	aspell_config_retrieve_list(config, word + 2, lst0);
	check_for_config_error(config);
	els = aspell_string_list_elements(lst);
	printf("%s:\n", word + 2);
	while ( (val = aspell_string_enumeration_next(els)) != 0)
	  printf("  %s\n", val);
	delete_aspell_string_enumeration(els);
	delete_aspell_string_list(lst);
      }
      break;
    case 'd':
      if (strlen(word) < 3) {
	printf("Usage: %c <file>\n", word[0]);
      } else {
	check_document(speller, word + 2);
	printf("\n");
      }
      break;
    default:
      printf("Unknown Command: %s\n", word);
    }
    putchar('\n');
  }
 END:
  delete_aspell_speller(speller);
  return 0;
}
Example #22
0
// runs in O(t^2) time where t is the number of tokens in the input corpus
// We consider maxK to be fairly constant
//
// Compiles the loaded corpora (_corpora) into the generator's tables:
//   1. reads "emoticons.txt" and "emojis.txt" from the working directory,
//   2. splits every corpus on spaces/newlines into tokens, mapping each
//      token to a word entry (hashtags, emoticons, or a spell-checked
//      canonical form) and recording its delimiters and suffix,
//   3. counts, for every prefix of 1..maxK-1 tokens, which token follows,
//   4. turns those counts into cumulative distributions stored in _stats.
// Progress is printed to std::cout. Sets _maxK and, at the end, _compiled.
void rawr::compile(int maxK)
{
  _maxK = maxK;

  std::vector<std::vector<token_id>> tokens;
  std::set<std::string> thashtags;
  std::set<std::string> fv_emoticons;
  
  // Every line of emoticons.txt is a known emoticon form.
  std::ifstream fvefile("emoticons.txt");
  if (fvefile)
  {
    std::string line;
    while (getline(fvefile, line))
    {
      fv_emoticons.insert(line);
      emoticons.forms.add(line);
    }
  }
  
  fvefile.close();
  
  // Maps a canonical token to the key of its entry in `words`.
  std::map<std::string, std::string> canonical_form;
  
  // NOTE(review): on the error path below the process exits without
  // releasing spell_config / possible_err.
  AspellConfig* spell_config = new_aspell_config();
  AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
  if (aspell_error_number(possible_err) != 0)
  {
    std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
    exit(1);
  }
  
  AspellSpeller* spell_checker = to_aspell_speller(possible_err);
  
  std::cout << "Reading emojis..." << std::endl;
  prefix_search emojis;
  std::ifstream emoji_file("emojis.txt");
  if (emoji_file)
  {
    // NOTE(review): looping on eof() processes one extra, empty line after
    // the last read, and back() on an empty string is undefined — confirm
    // the file can never contain or end with an empty line.
    while (!emoji_file.eof())
    {
      std::string rawmojis;
      getline(emoji_file, rawmojis);
      if (rawmojis.back() == '\r')
      {
        rawmojis.pop_back();
      }
    
      emojis.add(rawmojis);
    }
    
    emoji_file.close();
  }

  std::cout << "Tokenizing corpus...   0%" << std::flush;
  // Total corpus length, used as the denominator of the progress display.
  int len = 0;
  for (auto c : _corpora)
  {
    len += c.length();
  }
  
  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
  for (int i = 0; i < _corpora.size(); i++)
  {
    size_t start = 0;
    // NOTE(review): `end` is an int compared against std::string::npos;
    // this relies on -1 converting to npos.
    int end = 0;
    std::vector<token_id> tkcor;

    while (end != std::string::npos)
    {
      // NOTE(review): divides by len, which is 0 when every corpus is empty.
      perprime = (startper + end) * 100 / len;
      if (perprime != per)
      {
        per = perprime;
      
        std::cout << "\b\b\b\b" << std::right;
        std::cout.width(3);
        std::cout << per << "%" << std::flush;
      }
    
      end = _corpora[i].find_first_of(" \n", start);

      bool emoji = false;
      // te is the raw text up to and including the delimiter that was found.
      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
      std::string t = "";
    
      if (te.compare("") && te.compare(".") && te.compare(" "))
      {
        // Drop a trailing space delimiter; a trailing newline is kept.
        if (te.back() == ' ')
        {
          te.pop_back();
        }
        
        // Extract strings of emojis into their own tokens even if they're not space delimited
        int m = emojis.match(te);
        emoji = m > 0;
        if (m == 0) m = 1;
        t = te.substr(0,m);
        te = te.substr(m);
      
        while (!te.empty())
        {
          m = emojis.match(te);
          if (emoji == (m > 0))
          {
            if (m == 0) m = 1;
            t += te.substr(0,m);
            te = te.substr(m);
          } else {
            // Emoji/non-emoji boundary: end the token here so the rest is
            // picked up as the next token.
            end = start + t.length() - 1;
            break;
          }
        }
      
        // Lowercased copy, stripped of leading/trailing punctuation, is
        // the canonical form used for lookups.
        std::string tc(t);
        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);

        int pst = tc.find_first_not_of("\"([*");
        int dst = tc.find_last_not_of("\")]*.,?!\n;:");
        std::string canonical("");
        if ((pst != std::string::npos) && (dst != std::string::npos))
        {
          canonical = std::string(tc, pst, dst - pst + 1);
        }
      
        // Pick (creating it if necessary) the word entry this token belongs to.
        word& w = ([&] () -> word& {
          // Hashtag freevar
          if (canonical[0] == '#')
          {
            thashtags.insert(canonical);
          
            return hashtags;
          }
        
          // Emoticon freevar
          if (emoji)
          {
            emoticons.forms.add(canonical);
          
            return emoticons;
          }
        
          if ((pst != std::string::npos) && (dst != std::string::npos))
          {
            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
            if (fv_emoticons.count(emoticon_canon) == 1)
            {
              emoticons.forms.add(emoticon_canon);
          
              return emoticons;
            }
          }
        
          // Basically any other word
          if (canonical_form.count(canonical) == 0)
          {
            if (
              // Legacy freevars should be distinct from tokens containing similar words
              (canonical.find("$name$") != std::string::npos)
              // Words with no letters will be mangled by the spell checker
              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
              )
            {
              canonical_form[canonical] = canonical;
              words.emplace(canonical, canonical);
            } else {
              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
              if (correct)
              {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
              } else {
                // Misspelled: file the token under aspell's first
                // suggestion, if there is one.
                // NOTE(review): `suggestions` is not checked for null
                // before it is enumerated.
                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
                const char* replacement = aspell_string_enumeration_next(elements);
                if (replacement != NULL)
                {
                  std::string sugrep(replacement);
                  canonical_form[canonical] = sugrep;
          
                  if (words.count(sugrep) == 0)
                  {
                    words.emplace(sugrep, sugrep);
                  }
                } else {
                  words.emplace(canonical, canonical);
                  canonical_form[canonical] = canonical;
                }
          
                delete_aspell_string_enumeration(elements);
              }
            }
          }
        
          word& tw = words.at(canonical_form.at(canonical));
          tw.forms.add(canonical);
        
          return tw;
        })();
      
        token tk(w);
        tk.raw = t;
      
        // Count the opening delimiters at the front of the raw token.
        for (char c : t)
        {
          if (c == '*')
          {
            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
          } else if (c == '[')
          {
            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
          } else if (c == '(')
          {
            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
          } else if (c == '"')
          {
            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
          } else {
            break;
          }
        }
      
        // Everything from `backtrack` on is trailing punctuation/delimiters.
        int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
        if (backtrack != t.length())
        {
          std::string ending = t.substr(backtrack);
          std::string suffix;
          bool newline = false;
          bool terminating = false;
        
          for (char c : ending)
          {
            if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
            {
              suffix += c;
              terminating = true;
            
              continue;
            } else if (c == '\n')
            {
              newline = true;
              terminating = true;
              
              continue;
            }
          
            // The lambda has no return for other characters; the remaining
            // characters of `ending` can only be ] ) * " at this point.
            parentype pt = ([&] {
              switch (c)
              {
                case ']': return parentype::square_bracket;
                case ')': return parentype::paren;
                case '*': return parentype::asterisk;
                case '"': return parentype::quote;
              }
            })();
          
            // A closer matching an opener on the same token counts as "both".
            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
              tk.delimiters[{pt, doublestatus::opening}]--;
              tk.delimiters[{pt, doublestatus::both}]++;
            } else {
              tk.delimiters[{pt, doublestatus::closing}]++;
            }
          }
        
          if (terminating)
          {
            if ((suffix == ",") && (!newline))
            {
              tk.suffix = suffixtype::comma;
            } else {
              tk.suffix = suffixtype::terminating;
              
              if (!newline)
              {
                w.terms.add({suffix, false});
              } else {
                w.terms.add({".", false});
              }
            }
          }
        }

        tkcor.push_back(_tokenstore.add(tk));
      }

      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
    }
    
    tokens.push_back(tkcor);
    
    startper += _corpora[i].length();
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
  
  delete_aspell_speller(spell_checker);
  delete_aspell_config(spell_config);
  
  std::cout << canonical_form.size() << " distinct forms" << std::endl;
  std::cout << words.size() << " distinct words" << std::endl;
  
  // Time to condense the distribution stuff for the words
  std::cout << "Compiling token histograms..." << std::endl;
  for (auto& it : words)
  {
    it.second.forms.compile();
    it.second.terms.compile();
  }
  
  // Hashtag freevar is not frequency distributed
  for (auto& it : thashtags)
  {
    hashtags.forms.add(it);
  }
  
  hashtags.forms.compile();
  hashtags.terms.compile();
  
  // Compile other freevars
  emoticons.forms.compile();
  emoticons.terms.compile();

  // Compile the interned tokens.
  _tokenstore.compile();

  // kgram distribution
  std::cout << "Creating markov chain...   0%" << std::flush;
  // For each prefix: the tokens seen after it, with their statistics.
  std::map<kgram, std::map<token_id, token_data> > tstats;

  len = 0;
  for (auto c : tokens)
  {
    len += (maxK-1) * c.size();
  }
  
  startper = 0;
  per = 0;
  perprime = 0;
  int corpid = 0;
  for (auto corpus : tokens)
  {
    // Record the first tokens of the corpus behind a wildcard-led prefix.
    for (int k=0; k<maxK && k<corpus.size(); k++)
    {
      // The zero'th token should be a terminator.
      token_id fid = corpus[k];
      const token& f = _tokenstore.get(fid);

      kgram term_prefix(corpus.begin(), corpus.begin()+k);
      term_prefix.push_front(wildcardQuery);

      if (tstats[term_prefix].count(fid) == 0)
      {
        tstats[term_prefix].emplace(fid, fid);
      }

      token_data& td2 = tstats[term_prefix].at(fid);
      td2.all++;
      td2.corpora.insert(corpid);

      // No lowercase letter at all counts as uppercase; otherwise a
      // leading capital counts as titlecase.
      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
      {
        td2.uppercase++;
      } else if (isupper(f.raw[0]))
      {
        td2.titlecase++;
      }
    }

    // For every prefix length k, slide over the corpus and count the
    // token that follows each k-token prefix.
    for (int k=1; k<maxK && k<corpus.size(); k++)
    {
      for (int i=0; i<(corpus.size() - k); i++)
      {
        perprime = (startper+i) * 100 / len;
        if (perprime != per)
        {
          per = perprime;
      
          std::cout << "\b\b\b\b" << std::right;
          std::cout.width(3);
          std::cout << per << "%" << std::flush;
        }
      
        kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
        token_id fid = corpus[i+k];
        const token& f = _tokenstore.get(fid);

        if (tstats[prefix].count(fid) == 0)
        {
          tstats[prefix].emplace(fid, fid);
        }

        token_data& td = tstats[prefix].at(fid);
        td.all++;
        td.corpora.insert(corpid);

        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
        {
          td.uppercase++;
        } else if (isupper(f.raw[0]))
        {
          td.titlecase++;
        }

        // If the prefix starts with a terminating token, also count the
        // follower under the same prefix with that token replaced by the
        // wildcard.
        const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);

          if (tstats[term_prefix].count(fid) == 0)
          {
            tstats[term_prefix].emplace(fid, fid);
          }

          token_data& td2 = tstats[term_prefix].at(fid);
          td2.all++;
          td2.corpora.insert(corpid);

          if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
          {
            td2.uppercase++;
          } else if (isupper(f.raw[0]))
          {
            td2.titlecase++;
          }
        }
      }
      
      startper += corpus.size();
    }
    
    corpid++;
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;

  // Condense the kgram distribution
  std::cout << "Compiling kgram distributions...   0%";
  len = tstats.size();
  per = 0;
  perprime = 0;
  int indicator = 0;
  for (auto& it : tstats)
  {
    indicator++;
    perprime = indicator * 100 / len;
    if (per != perprime)
    {
      per = perprime;
    
      std::cout << "\b\b\b\b" << std::right;
      std::cout.width(3);
      std::cout << per << "%" << std::flush;
    }
    
    kgram klist = it.first;
    auto& probtable = it.second;
    auto& distribution = _stats[klist];
    // Running total of occurrences: each follower is keyed by the
    // cumulative count up to and including itself.
    int max = 0;
		
    for (auto& kt : probtable)
    {
      max += kt.second.all;
			
      distribution.emplace(max, kt.second);
    }
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
  
  _compiled = true;
}
Example #23
0
/*
 * Collects spelling suggestions for a word from every speller attached to
 * the given speller buffer and joins them into a single string.
 *
 * Suggestions produced by the same speller are separated by ","; the first
 * suggestion of a speller is separated from what is already in the string
 * by "/" (nothing is inserted while the string is still empty).
 *
 * The maximum number of suggestions is read from the option
 * weechat_aspell_config_check_suggestions and is applied per speller:
 *   - negative value: no suggestions are computed, NULL is returned
 *   - 0: the counter is incremented before it is compared with the limit,
 *        so the limit is never hit and all suggestions are kept
 *
 * Returns a newly allocated string, or NULL if the option is negative, if
 * no suggestion was found or if a memory allocation failed.
 *
 * Note: result must be freed after use.
 */

char *
weechat_aspell_get_suggestions (struct t_aspell_speller_buffer *speller_buffer,
                                const char *word)
{
    int idx, length, limit, count;
    char *result, *resized;
    const char *candidate;
#ifdef USE_ENCHANT
    char **words;
    size_t num_words;
#else
    const AspellWordList *word_list;
    AspellStringEnumeration *enumeration;
#endif

    limit = weechat_config_integer (weechat_aspell_config_check_suggestions);
    if (limit < 0)
        return NULL;

    /* start with an empty string (only the final '\0') */
    length = 1;
    result = malloc (length);
    if (!result)
        return NULL;
    result[0] = '\0';

    for (idx = 0;
         speller_buffer->spellers && speller_buffer->spellers[idx];
         idx++)
    {
#ifdef USE_ENCHANT
        words = enchant_dict_suggest (speller_buffer->spellers[idx], word,
                                      -1, &num_words);
        if (!words)
            continue;

        if (num_words > 0)
        {
            for (count = 0; (candidate = words[count]) != NULL; )
            {
                /* room for the word, plus one separator if not first */
                length += strlen (candidate);
                if (result[0])
                    length++;

                resized = realloc (result, length);
                if (!resized)
                {
                    free (result);
                    enchant_dict_free_string_list (speller_buffer->spellers[idx],
                                                   words);
                    return NULL;
                }
                result = resized;

                /* "/" before the first word of a speller, "," otherwise */
                if (result[0])
                    strcat (result, (count == 0) ? "/" : ",");
                strcat (result, candidate);

                count++;
                /* limit is known to be >= 0 here (checked on entry) */
                if (count == limit)
                    break;
            }
        }
        enchant_dict_free_string_list (speller_buffer->spellers[idx], words);
#else
        word_list = aspell_speller_suggest (speller_buffer->spellers[idx],
                                            word, -1);
        if (!word_list)
            continue;

        enumeration = aspell_word_list_elements (word_list);
        count = 0;
        while ((candidate = aspell_string_enumeration_next (enumeration)) != NULL)
        {
            /* room for the word, plus one separator if not first */
            length += strlen (candidate);
            if (result[0])
                length++;

            resized = realloc (result, length);
            if (!resized)
            {
                free (result);
                delete_aspell_string_enumeration (enumeration);
                return NULL;
            }
            result = resized;

            /* "/" before the first word of a speller, "," otherwise */
            if (result[0])
                strcat (result, (count == 0) ? "/" : ",");
            strcat (result, candidate);

            count++;
            /* limit is known to be >= 0 here (checked on entry) */
            if (count == limit)
                break;
        }
        delete_aspell_string_enumeration (enumeration);
#endif
    }

    if (result[0])
        return result;

    /* no suggestions found */
    free (result);
    return NULL;
}