コード例 #1
0
ファイル: weechat-aspell.c プロジェクト: FauxFaux/weechat_old
/*
 * Collects spelling suggestions for a word from every speller attached to
 * the given speller buffer.
 *
 * The result is one string: the suggestions of a single speller are joined
 * with "," and the first suggestion of each following speller is introduced
 * by "/".
 *
 * The per-speller limit is read from the "check_suggestions" config option:
 * a negative value makes the function return NULL immediately; a value of 0
 * never matches the running count, so every suggestion is kept.
 *
 * Returns a string allocated with malloc/realloc, or NULL if the option is
 * negative, an allocation fails, or no suggestion was found.
 */
char *
weechat_aspell_get_suggestions (struct t_aspell_speller_buffer *speller_buffer,
                                const char *word)
{
    int idx, length, limit, count;
    char *result, *grown;
    const char *candidate;
#ifdef USE_ENCHANT
    char **words;
    size_t words_count;
#else
    const AspellWordList *word_list;
    AspellStringEnumeration *words;
#endif

    limit = weechat_config_integer (weechat_aspell_config_check_suggestions);
    if (limit < 0)
        return NULL;

    /* start with an empty string (just the terminating NUL) */
    length = 1;
    result = malloc (length);
    if (!result)
        return NULL;
    result[0] = '\0';

    for (idx = 0;
         speller_buffer->spellers && speller_buffer->spellers[idx];
         idx++)
    {
#ifdef USE_ENCHANT
        words = enchant_dict_suggest (speller_buffer->spellers[idx], word,
                                      -1, &words_count);
        if (!words)
            continue;

        if (words_count > 0)
        {
            count = 0;
            for (;;)
            {
                candidate = words[count];
                if (candidate == NULL)
                    break;

                /* room for the word plus a separator if needed */
                length += strlen (candidate);
                if (result[0])
                    length++;
                grown = realloc (result, length);
                if (!grown)
                {
                    free (result);
                    enchant_dict_free_string_list (speller_buffer->spellers[idx],
                                                   words);
                    return NULL;
                }
                result = grown;

                /* "/" starts a new speller, "," continues the current one */
                if (result[0])
                {
                    if (count == 0)
                        strcat (result, "/");
                    else
                        strcat (result, ",");
                }
                strcat (result, candidate);

                count++;
                if (count == limit)
                    break;
            }
        }
        enchant_dict_free_string_list (speller_buffer->spellers[idx], words);
#else
        word_list = aspell_speller_suggest (speller_buffer->spellers[idx],
                                            word, -1);
        if (!word_list)
            continue;

        words = aspell_word_list_elements (word_list);
        count = 0;
        for (;;)
        {
            candidate = aspell_string_enumeration_next (words);
            if (candidate == NULL)
                break;

            /* room for the word plus a separator if needed */
            length += strlen (candidate);
            if (result[0])
                length++;
            grown = realloc (result, length);
            if (!grown)
            {
                free (result);
                delete_aspell_string_enumeration (words);
                return NULL;
            }
            result = grown;

            /* "/" starts a new speller, "," continues the current one */
            if (result[0])
            {
                if (count == 0)
                    strcat (result, "/");
                else
                    strcat (result, ",");
            }
            strcat (result, candidate);

            count++;
            if (count == limit)
                break;
        }
        delete_aspell_string_enumeration (words);
#endif
    }

    /* nothing was appended: report "no suggestions" as NULL */
    if (result[0] == '\0')
    {
        free (result);
        return NULL;
    }

    return result;
}
コード例 #2
0
ファイル: kgramstats.cpp プロジェクト: hatkirby/rawr-ebooks
// Compiles the loaded corpora into the statistics stored on this object.
//
// The work happens in phases, each reporting progress on std::cout:
//   1. Read the known emoticons ("emoticons.txt") and emojis ("emojis.txt").
//   2. Split every corpus in _corpora into tokens at spaces and newlines,
//      resolve each token to a word (hashtag, emoticon, or spell-checked
//      canonical form) and intern the token in _tokenstore.
//   3. Compile the per-word form and terminator histograms.
//   4. Count which token follows every prefix of 1..maxK-1 tokens (plus
//      wildcard-prefixed entries for corpus starts and terminating tokens).
//   5. Turn those counts into cumulative distributions stored in _stats.
//
// Exits the process with status 1 if the aspell speller cannot be created.
//
// runs in O(t^2) time where t is the number of tokens in the input corpus
// We consider maxK to be fairly constant
void rawr::compile(int maxK)
{
  _maxK = maxK;

  std::vector<std::vector<token_id>> tokens;
  std::set<std::string> thashtags;
  std::set<std::string> fv_emoticons;

  // Known emoticons, one per line; a missing file simply leaves the set empty.
  std::ifstream fvefile("emoticons.txt");
  if (fvefile)
  {
    std::string line;
    while (getline(fvefile, line))
    {
      fv_emoticons.insert(line);
      emoticons.forms.add(line);
    }
  }

  fvefile.close();

  std::map<std::string, std::string> canonical_form;

  AspellConfig* spell_config = new_aspell_config();
  AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
  if (aspell_error_number(possible_err) != 0)
  {
    std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
    exit(1);
  }

  AspellSpeller* spell_checker = to_aspell_speller(possible_err);

  std::cout << "Reading emojis..." << std::endl;
  prefix_search emojis;
  std::ifstream emoji_file("emojis.txt");
  if (emoji_file)
  {
    // Loop on the read itself: testing eof() before reading runs one extra
    // iteration after the last line, with an empty string.
    std::string rawmojis;
    while (getline(emoji_file, rawmojis))
    {
      // Strip the carriage return left by CRLF line endings. The emptiness
      // check matters because back() on an empty string is undefined.
      if (!rawmojis.empty() && rawmojis.back() == '\r')
      {
        rawmojis.pop_back();
      }

      emojis.add(rawmojis);
    }

    emoji_file.close();
  }

  // Percentage most recently written to the console by report_progress.
  int per = 0;

  // Redraws the trailing "NNN%" indicator, but only when the integer
  // percentage changed. The arithmetic is done in 64 bits so that large
  // corpora do not overflow, and a zero total reports 0 instead of dividing
  // by zero.
  auto report_progress = [&per] (unsigned long long done, unsigned long long total) {
    const int perprime = (total == 0) ? 0 : static_cast<int>(done * 100 / total);
    if (perprime != per)
    {
      per = perprime;

      std::cout << "\b\b\b\b" << std::right;
      std::cout.width(3);
      std::cout << per << "%" << std::flush;
    }
  };

  std::cout << "Tokenizing corpus...   0%" << std::flush;
  std::size_t len = 0;
  for (const auto& c : _corpora)
  {
    len += c.length();
  }

  std::size_t startper = 0;
  std::cout.fill(' ');
  for (std::size_t i = 0; i < _corpora.size(); i++)
  {
    const auto& text = _corpora[i];
    std::size_t start = 0;
    std::size_t end = 0;
    std::vector<token_id> tkcor;

    while (end != std::string::npos)
    {
      report_progress(startper + end, len);

      end = text.find_first_of(" \n", start);

      bool emoji = false;
      // The delimiter itself is included in te so that a trailing newline
      // can be recognised further down.
      std::string te = text.substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
      std::string t = "";

      if (!te.empty() && (te != ".") && (te != " "))
      {
        if (te.back() == ' ')
        {
          te.pop_back();
        }

        // Extract strings of emojis into their own tokens even if they're not space delimited
        int m = emojis.match(te);
        emoji = m > 0;
        if (m == 0) m = 1;
        t = te.substr(0,m);
        te = te.substr(m);

        while (!te.empty())
        {
          m = emojis.match(te);
          if (emoji == (m > 0))
          {
            if (m == 0) m = 1;
            t += te.substr(0,m);
            te = te.substr(m);
          } else {
            // The token switches between emoji and non-emoji here: cut it
            // short and let the next iteration pick up the remainder.
            end = start + t.length() - 1;
            break;
          }
        }

        // Lower-case byte by byte. The unsigned char parameter keeps bytes
        // above 0x7f (e.g. UTF-8 emoji) from reaching tolower as negative
        // values, which would be undefined behaviour.
        std::string tc(t);
        std::transform(tc.begin(), tc.end(), tc.begin(),
          [] (unsigned char c) { return static_cast<char>(::tolower(c)); });

        // Canonical form: the lower-cased token without leading opening
        // delimiters and trailing closing delimiters / punctuation.
        const std::size_t pst = tc.find_first_not_of("\"([*");
        const std::size_t dst = tc.find_last_not_of("\")]*.,?!\n;:");
        std::string canonical("");
        if ((pst != std::string::npos) && (dst != std::string::npos))
        {
          canonical = std::string(tc, pst, dst - pst + 1);
        }

        word& w = ([&] () -> word& {
          // Hashtag freevar
          if (canonical[0] == '#')
          {
            thashtags.insert(canonical);

            return hashtags;
          }

          // Emoticon freevar
          if (emoji)
          {
            emoticons.forms.add(canonical);

            return emoticons;
          }

          if ((pst != std::string::npos) && (dst != std::string::npos))
          {
            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
            if (fv_emoticons.count(emoticon_canon) == 1)
            {
              emoticons.forms.add(emoticon_canon);

              return emoticons;
            }
          }

          // Basically any other word
          if (canonical_form.count(canonical) == 0)
          {
            if (
              // Legacy freevars should be distinct from tokens containing similar words
              (canonical.find("$name$") != std::string::npos)
              // Words with no letters will be mangled by the spell checker
              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
              )
            {
              canonical_form[canonical] = canonical;
              words.emplace(canonical, canonical);
            } else {
              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
              if (correct)
              {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
              } else {
                // Misspelled: fold the token into the first suggestion, if any.
                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
                const char* replacement = aspell_string_enumeration_next(elements);
                if (replacement != NULL)
                {
                  std::string sugrep(replacement);
                  canonical_form[canonical] = sugrep;

                  if (words.count(sugrep) == 0)
                  {
                    words.emplace(sugrep, sugrep);
                  }
                } else {
                  words.emplace(canonical, canonical);
                  canonical_form[canonical] = canonical;
                }

                delete_aspell_string_enumeration(elements);
              }
            }
          }

          word& tw = words.at(canonical_form.at(canonical));
          tw.forms.add(canonical);

          return tw;
        })();

        token tk(w);
        tk.raw = t;

        // Count the run of opening delimiters at the front of the token.
        for (char c : t)
        {
          if (c == '*')
          {
            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
          } else if (c == '[')
          {
            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
          } else if (c == '(')
          {
            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
          } else if (c == '"')
          {
            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
          } else {
            break;
          }
        }

        // npos + 1 wraps to 0, so a token made only of trailing characters
        // is treated as all ending.
        const std::size_t backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
        if (backtrack != t.length())
        {
          std::string ending = t.substr(backtrack);
          std::string suffix;
          bool newline = false;
          bool terminating = false;

          for (char c : ending)
          {
            if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
            {
              suffix += c;
              terminating = true;

              continue;
            } else if (c == '\n')
            {
              newline = true;
              terminating = true;

              continue;
            }

            // ending only holds characters from the find_last_not_of set
            // above; punctuation and newline were handled, so c is one of
            // the four closing delimiters below.
            parentype pt = ([&] {
              switch (c)
              {
                case ']': return parentype::square_bracket;
                case ')': return parentype::paren;
                case '*': return parentype::asterisk;
                case '"': return parentype::quote;
              }
            })();

            // A closer matching an opener on the same token makes a pair.
            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
              tk.delimiters[{pt, doublestatus::opening}]--;
              tk.delimiters[{pt, doublestatus::both}]++;
            } else {
              tk.delimiters[{pt, doublestatus::closing}]++;
            }
          }

          if (terminating)
          {
            if ((suffix == ",") && (!newline))
            {
              tk.suffix = suffixtype::comma;
            } else {
              tk.suffix = suffixtype::terminating;

              if (!newline)
              {
                w.terms.add({suffix, false});
              } else {
                w.terms.add({".", false});
              }
            }
          }
        }

        tkcor.push_back(_tokenstore.add(tk));
      }

      start = (end == std::string::npos) ? std::string::npos : end + 1;
    }

    tokens.push_back(std::move(tkcor));

    startper += text.length();
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  delete_aspell_speller(spell_checker);
  delete_aspell_config(spell_config);

  std::cout << canonical_form.size() << " distinct forms" << std::endl;
  std::cout << words.size() << " distinct words" << std::endl;

  // Time to condense the distribution stuff for the words
  std::cout << "Compiling token histograms..." << std::endl;
  for (auto& it : words)
  {
    it.second.forms.compile();
    it.second.terms.compile();
  }

  // Hashtag freevar is not frequency distributed
  for (auto& it : thashtags)
  {
    hashtags.forms.add(it);
  }

  hashtags.forms.compile();
  hashtags.terms.compile();

  // Compile other freevars
  emoticons.forms.compile();
  emoticons.terms.compile();

  // Compile the interned tokens.
  _tokenstore.compile();

  // kgram distribution
  std::cout << "Creating markov chain...   0%" << std::flush;
  std::map<kgram, std::map<token_id, token_data> > tstats;

  len = 0;
  for (const auto& c : tokens)
  {
    len += (maxK-1) * c.size();
  }

  startper = 0;
  per = 0;
  int corpid = 0;

  // Records one occurrence of token `fid` following `prefix` in the current
  // corpus, creating the table entry on first sight and updating its case
  // counters from the token's raw text.
  auto record = [&] (const kgram& prefix, token_id fid, const token& f) {
    auto& table = tstats[prefix];
    auto entry = table.find(fid);
    if (entry == table.end())
    {
      entry = table.emplace(fid, fid).first;
    }

    token_data& td = entry->second;
    td.all++;
    td.corpora.insert(corpid);

    // The unsigned char conversions keep non-ASCII bytes from reaching the
    // <cctype> functions as negative values (undefined behaviour).
    const bool no_lowercase = std::none_of(f.raw.begin(), f.raw.end(),
      [] (unsigned char c) { return ::islower(c) != 0; });
    if (no_lowercase)
    {
      td.uppercase++;
    } else if (isupper(static_cast<unsigned char>(f.raw[0])))
    {
      td.titlecase++;
    }
  };

  for (const auto& corpus : tokens)
  {
    // Tokens at the very start of a corpus are recorded behind a wildcard,
    // the same way tokens following a terminating token are below.
    for (int k=0; k<maxK && static_cast<std::size_t>(k)<corpus.size(); k++)
    {
      // The zero'th token should be a terminator.
      token_id fid = corpus[k];
      const token& f = _tokenstore.get(fid);

      kgram term_prefix(corpus.begin(), corpus.begin()+k);
      term_prefix.push_front(wildcardQuery);

      record(term_prefix, fid, f);
    }

    for (int k=1; k<maxK && static_cast<std::size_t>(k)<corpus.size(); k++)
    {
      for (std::size_t i=0; i<(corpus.size() - k); i++)
      {
        report_progress(startper + i, len);

        kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
        token_id fid = corpus[i+k];
        const token& f = _tokenstore.get(fid);

        record(prefix, fid, f);

        // If the prefix starts with a terminating token, also record the
        // occurrence with that first token replaced by the wildcard.
        const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);

          record(term_prefix, fid, f);
        }
      }

      startper += corpus.size();
    }

    corpid++;
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  // Condense the kgram distribution
  std::cout << "Compiling kgram distributions...   0%";
  len = tstats.size();
  per = 0;
  int indicator = 0;
  for (auto& it : tstats)
  {
    indicator++;
    report_progress(indicator, len);

    const kgram& klist = it.first;
    auto& probtable = it.second;
    auto& distribution = _stats[klist];
    int max = 0;

    // Key each follower by the running total of occurrences so far.
    for (auto& kt : probtable)
    {
      max += kt.second.all;

      distribution.emplace(max, kt.second);
    }
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  _compiled = true;
}
コード例 #3
0
ファイル: gtkspell.c プロジェクト: jmissig/gabber
/*
 * Builds the menu offered for a word: an "Add ... to Dictionary" entry, a
 * separator, and then the aspell suggestions for the word.
 *
 * Suggestions are appended ten at a time; after every ten, a separator and a
 * "More..." item are appended and the following suggestions go into that
 * item's submenu. When aspell yields no suggestion at all, an italic
 * "(no suggestions)" entry is prepended to the menu instead.
 *
 * The buffer argument is not used here.
 *
 * Returns the top-level menu widget.
 */
GtkWidget*
build_suggestion_menu(GtkSpell *spell, GtkTextBuffer *buffer,
                      const char *word) {
	GtkWidget *root_menu, *current_menu;
	GtkWidget *item;
	GtkWidget *add_icon;
	GtkWidget *placeholder;
	const AspellWordList *word_list;
	AspellStringEnumeration *words;
	const char *candidate;
	char *add_text;
	int shown = 0;

	root_menu = gtk_menu_new();
	current_menu = root_menu;

	/* "Add to Dictionary" entry with the stock add icon */
	add_text = g_strdup_printf("Add \"%s\" to Dictionary", word);
	item = gtk_image_menu_item_new_with_label(add_text);
	g_free(add_text);
	add_icon = gtk_image_new_from_stock(GTK_STOCK_ADD, GTK_ICON_SIZE_MENU);
	gtk_image_menu_item_set_image(GTK_IMAGE_MENU_ITEM(item), add_icon);
	g_signal_connect(G_OBJECT(item), "activate",
			G_CALLBACK(add_to_dictionary), spell);
	gtk_widget_show_all(item);
	gtk_menu_shell_append(GTK_MENU_SHELL(root_menu), item);

	/* an empty menu item serves as the separator */
	item = gtk_menu_item_new();
	gtk_widget_show(item);
	gtk_menu_shell_append(GTK_MENU_SHELL(root_menu), item);

	word_list = aspell_speller_suggest(spell->speller, word, -1);
	words = aspell_word_list_elements(word_list);

	candidate = aspell_string_enumeration_next(words);
	if (candidate != NULL) {
		/* one menu entry per suggestion, chained through "More..." submenus */
		for (; candidate != NULL;
		     candidate = aspell_string_enumeration_next(words)) {
			if (shown == 10) {
				/* current menu is full: separator, then a "More..." submenu */
				item = gtk_menu_item_new();
				gtk_widget_show(item);
				gtk_menu_shell_append(GTK_MENU_SHELL(current_menu), item);

				item = gtk_menu_item_new_with_label("More...");
				gtk_widget_show(item);
				gtk_menu_shell_append(GTK_MENU_SHELL(current_menu), item);

				current_menu = gtk_menu_new();
				gtk_menu_item_set_submenu(GTK_MENU_ITEM(item), current_menu);
				shown = 0;
			}

			item = gtk_menu_item_new_with_label(candidate);
			g_signal_connect(G_OBJECT(item), "activate",
					G_CALLBACK(replace_word), spell);
			gtk_widget_show(item);
			gtk_menu_shell_append(GTK_MENU_SHELL(current_menu), item);
			shown++;
		}
	} else {
		/* nothing to suggest: show a placeholder entry at the top */
		placeholder = gtk_label_new("");
		gtk_label_set_markup(GTK_LABEL(placeholder), "<i>(no suggestions)</i>");

		item = gtk_menu_item_new();
		gtk_container_add(GTK_CONTAINER(item), placeholder);
		gtk_widget_show_all(item);
		gtk_menu_shell_prepend(GTK_MENU_SHELL(current_menu), item);
	}

	delete_aspell_string_enumeration(words);

	return root_menu;
}
コード例 #4
0
ファイル: example-c.c プロジェクト: Wilbeibi/DotConfig
int main(int argc, const char *argv[]) 
{
  AspellCanHaveError * ret;
  AspellSpeller * speller;
  int have;
  char word[81];
  char * p;
  char * word_end;
  AspellConfig * config;

  if (argc < 2) {
    printf("Usage: %s <language> [<size>|- [[<jargon>|- [<encoding>]]]\n", argv[0]);
    return 1;
  }

  config = new_aspell_config();

  aspell_config_replace(config, "lang", argv[1]);

  if (argc >= 3 && argv[2][0] != '-' && argv[2][1] != '\0')
    aspell_config_replace(config, "size", argv[2]);

  if (argc >= 4 && argv[3][0] != '-')
    aspell_config_replace(config, "jargon", argv[3]);

  if (argc >= 5 && argv[4][0] != '-')
    aspell_config_replace(config, "encoding", argv[4]);

  ret = new_aspell_speller(config);

  delete_aspell_config(config);

  if (aspell_error(ret) != 0) {
    printf("Error: %s\n",aspell_error_message(ret));
    delete_aspell_can_have_error(ret);
    return 2;
  }
  speller = to_aspell_speller(ret);
  config = aspell_speller_config(speller);

  fputs("Using: ",                                      stdout);
  fputs(aspell_config_retrieve(config, "lang"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "jargon"),       stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "size"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "module"),       stdout);
  fputs("\n\n",                                         stdout);

  puts("Type \"h\" for help.\n");

  while (fgets(word, 80, stdin) != 0) {

    /* remove trailing spaces */

    word_end = strchr(word, '\0') - 1;
    while (word_end != word && (*word_end == '\n' || *word_end == ' ')) 
      --word_end;
    ++word_end;
    *word_end = '\0';
    
    putchar('\n');
    switch (word[0]) {
    case '\0':
      break;
    case 'h':
      puts(
	"Usage: \n"
	"  h(elp)      help\n"
	"  c <word>    check if a word is the correct spelling\n"
	"  s <word>    print out a list of suggestions for a word\n"
	"  a <word>    add a word to the personal word list\n"
	"  i <word>    ignore a word for the rest of the session\n"
        "  d <file>    spell checks a document\n"
	"  p           dumps the personal word list\n"
	"  P           dumps the session word list\n"
	"  m           dumps the main  word list\n"
        "  o <option> <value> sets a config option\n"
	"  r <option>         retrieves a config option\n"
        "  l <option>         retrieves a config option as a list\n"
	"  S           saves all word lists\n"
	"  C           clear the curent sesstion word list\n"
	"  x           quite\n"	);
      break;
    case 'p':
      print_word_list(speller, 
		      aspell_speller_personal_word_list(speller), '\n');
      break;
    case 'P':
      print_word_list(speller, 
		      aspell_speller_session_word_list(speller), '\n');
      break;
    case 'm':
      print_word_list(speller, 
		      aspell_speller_main_word_list(speller), '\n');
      break;
    case 'S':
      aspell_speller_save_all_word_lists(speller);
      check_for_error(speller);
      break;
    case 'C': 
      aspell_speller_clear_session(speller);
      check_for_error(speller);
      break;
    case 'x':
      goto END;
    case 'c':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	have = aspell_speller_check(speller, word + 2, -1);
	if (have == 1) 
	  puts("correct");
	else if (have == 0)
	  puts("incorrect");
	else
	  printf("Error: %s\n", aspell_speller_error_message(speller));
      }
      break;
    case 's':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	print_word_list(speller, 
			aspell_speller_suggest(speller, word + 2, -1), '\n');
      }
      break;
    case 'a':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_personal(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'i':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_session(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'o':
      word[80] = '\0'; /* to make sure strchr doesn't run off end of string */
      p = strchr(word + 3, ' ');
      if (strlen(word) < 3 || p == 0) {
	printf("Usage: %c <option> <value>\n", word[0]);
      } else {
	*p = '\0';
	++p;
	aspell_config_replace(config, word + 2, p);
	check_for_config_error(config);
      }
      break;
    case 'r':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	const char * val = aspell_config_retrieve(config, word + 2);
	check_for_config_error(config);
	if (val)
	  printf("%s = \"%s\"\n", word + 2, val);
      }
      break;
    case 'l':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	AspellStringList * lst = new_aspell_string_list();
	AspellMutableContainer * lst0 
	  = aspell_string_list_to_mutable_container(lst);
	AspellStringEnumeration * els;
	const char * val;
	aspell_config_retrieve_list(config, word + 2, lst0);
	check_for_config_error(config);
	els = aspell_string_list_elements(lst);
	printf("%s:\n", word + 2);
	while ( (val = aspell_string_enumeration_next(els)) != 0)
	  printf("  %s\n", val);
	delete_aspell_string_enumeration(els);
	delete_aspell_string_list(lst);
      }
      break;
    case 'd':
      if (strlen(word) < 3) {
	printf("Usage: %c <file>\n", word[0]);
      } else {
	check_document(speller, word + 2);
	printf("\n");
      }
      break;
    default:
      printf("Unknown Command: %s\n", word);
    }
    putchar('\n');
  }
 END:
  delete_aspell_speller(speller);
  return 0;
}