예제 #1
0
bool AspellAdapterImpl::isDictionaryAvailable(const std::string & langCode) {
    AspellConfig * tempAspellConfig = new_aspell_config();
    aspell_config_replace(tempAspellConfig, "lang", langCode.c_str());
    AspellCanHaveError * possibleError = new_aspell_speller(tempAspellConfig);

    bool result = true;
    if (aspell_error_number(possibleError) != 0) {
        result = false;
    } else {
        AspellSpeller * tempSpeller = to_aspell_speller(possibleError);
        delete_aspell_speller(tempSpeller);
    }

    delete_aspell_config(tempAspellConfig);
    
    return result;
}
예제 #2
0
/**
 * Creates the Aspell configuration (UTF-8 encoding, personal dictionary
 * under the user's ".eiskaltdc++" directory) and tries to build a speller
 * from it.
 *
 * On failure the Aspell error message is printed to stdout, the config is
 * released and both `config` and `spell_checker` are left NULL.
 *
 * @param parent QObject parent forwarded to the base class
 */
SpellCheck::SpellCheck(QObject *parent) :
    QObject(parent),
    config(NULL),
    spell_checker(NULL)
{
    config = new_aspell_config();

    // Bail out before touching the config: the original only tested it
    // after it had already been passed to aspell_config_replace().
    if (!config)
        return;

    aspell_config_replace(config, "encoding", "utf-8");
    aspell_config_replace(config, "personal", (QDir::homePath()+QDir::separator()+".eiskaltdc++"+QDir::separator()+"dict").toAscii().constData());

    /*const AspellDictInfoList *dicts = get_aspell_dict_info_list(config);
    AspellDictInfoEnumeration *enumer = aspell_dict_info_list_elements(dicts);
    const AspellDictInfo *info = NULL;

    QStringList all;

    while ((info = aspell_dict_info_enumeration_next(enumer)) != NULL)
        all.append(QString::fromUtf8(info->code, strlen(info->code)));

    if (WSGET(WS_APP_ASPELL_LANG).isEmpty()){
        QString lc_prefix = QLocale::system().name();

        if (all.contains(lc_prefix))//Loading dictionary from system locale
            aspell_config_replace(config, "lang", lc_prefix.toAscii().constData());
        else if (all.contains(lc_prefix.left(lc_prefix.indexOf("_")))) {
            aspell_config_replace(config, "lang", lc_prefix.left(lc_prefix.indexOf("_")).toAscii().constData());
        }
    }
    else
        aspell_config_replace(config, "lang", WSGET(WS_APP_ASPELL_LANG).toAscii().constData());*/
    AspellCanHaveError *error = new_aspell_speller(config);

    if (aspell_error(error) != 0){
        printf("%s\n", aspell_error_message(error));

        // The message has been printed; release the error object (it was
        // leaked before) and the now useless config.
        delete_aspell_can_have_error(error);
        delete_aspell_config(config);

        config = NULL;
    }
    else
        spell_checker = to_aspell_speller(error);
}
예제 #3
0
void ASpellChecker::setActiveLanguages(const QSet<LanguageManager::LangId>& langs)
{
    clearSpellers();

    for(auto const &lang: langs)
    {
        AspellConfig* conf = aspell_config_clone(config_);
        aspell_config_replace(conf, "lang", LanguageManager::toString(lang)
                              .replace(QLatin1Char('-'),QLatin1Char('_')).toUtf8().constData());
        AspellCanHaveError* ret = new_aspell_speller(conf);
        if (aspell_error_number(ret) == 0) {
            spellers_.append(to_aspell_speller(ret));
        }
        else {
            qDebug() << QString("Aspell error: %1").arg(aspell_error_message(ret));
        }
        delete_aspell_config(conf);
    }
}
예제 #4
0
/**
 * Creates the Aspell configuration (UTF-8 encoding; on Q_WS_WIN also the
 * conf/data/dict directories) and tries to create the speller from it.
 *
 * If the speller cannot be created a warning is emitted and speller_ stays
 * NULL; config_ is kept either way.
 */
ASpellChecker::ASpellChecker()
{
	config_ = NULL;
	speller_ = NULL;
	config_ = new_aspell_config();
	aspell_config_replace(config_, "encoding", "utf-8");
#ifdef Q_WS_WIN
	aspell_config_replace(config_, "conf-dir", QDir::homeDirPath());
	aspell_config_replace(config_, "data-dir", QString("%1/aspell/data").arg(QCoreApplication::applicationDirPath()));
	aspell_config_replace(config_, "dict-dir", QString("%1/aspell/dict").arg(QCoreApplication::applicationDirPath()));
#endif
	AspellCanHaveError* ret = new_aspell_speller(config_);
	if (aspell_error_number(ret) == 0) {
		speller_ = to_aspell_speller(ret);
	}
	else {
		// Pass the Aspell text as an argument, not as the format string:
		// a '%' inside the message must not be interpreted by qWarning().
		qWarning("Aspell error: %s", aspell_error_message(ret));
		// The error object is not turned into a speller; free it here.
		delete_aspell_can_have_error(ret);
	}
}
/**
 * Create a spell checker for a language
 * @param language the language code e.g. en_GB or it
 * @return a fully initialised checker, or NULL on any failure (a message
 * describing the failure is written to stderr)
 */
static checker *checker_create( const char *language )
{
    checker *c;
    AspellCanHaveError *possible_err;

    if ( language == NULL )
    {
        fprintf(stderr,"checker: no language specified\n");
        return NULL;
    }
    c = calloc( 1, sizeof(checker) );
    if ( c == NULL )
    {
        fprintf(stderr,"checker: failed to create object\n");
        return NULL;
    }
    /* calloc zeroed the struct, so leaving the last byte untouched keeps
       lang NUL-terminated even for over-long codes (assumes lang is a
       fixed-size char array member - confirm against the struct) */
    strncpy( c->lang, language, sizeof(c->lang)-1 );
    c->spell_config = new_aspell_config();
    if ( c->spell_config == NULL )
    {
        fprintf(stderr,"checker: failed to create speller\n");
        /* nothing but the object itself exists yet; previously the
           half-built checker was handed back to the caller */
        free( c );
        return NULL;
    }
    aspell_config_replace( c->spell_config, "lang", language );
    possible_err = new_aspell_speller(c->spell_config);
    c->spell_checker = 0;
    if (aspell_error_number(possible_err) != 0)
    {
        fprintf(stderr,"%s\n",aspell_error_message(possible_err));
        /* the failed result never becomes a speller: release it here */
        delete_aspell_can_have_error( possible_err );
        checker_dispose( c );
        return NULL;
    }
    c->spell_checker = to_aspell_speller(possible_err);
    if ( c->spell_checker == NULL )
    {
        fprintf(stderr,"checker: failed to initialise speller\n");
        checker_dispose( c );
        return NULL;
    }
    return c;
}
예제 #6
0
/**
 * create a neew spell-checker for the language 'lang'
 */
void * spellcheck_create(const char * lang)
{
    struct linkgrammar_aspell *aspell = NULL;
    size_t i = 0;
    AspellCanHaveError *spell_err = NULL;

    for (i = 0; i < sizeof(spellcheck_lang_mapping)/sizeof(char *); i += 2)
    {
        if (0 != strcmp(lang, spellcheck_lang_mapping[i])) continue;
        aspell = (struct linkgrammar_aspell *)malloc(sizeof(struct linkgrammar_aspell));
        if (!aspell) {
            prt_error("Error: out of memory. Aspell not used.\n");
            aspell = NULL;
            break;
        }
        aspell->config = NULL;
        aspell->speller = NULL;
        aspell->config = new_aspell_config();
        if (aspell_config_replace(aspell->config, ASPELL_LANG_KEY,
                                  spellcheck_lang_mapping[i]) == 0) {
            prt_error("Error: failed to set language in aspell: %s\n", lang);
            delete_aspell_config(aspell->config);
            free(aspell);
            aspell = NULL;
            break;
        }
        spell_err = new_aspell_speller(aspell->config);
        if (aspell_error_number(spell_err) != 0) {
            prt_error("Error: Aspell: %s\n", aspell_error_message(spell_err));
            delete_aspell_can_have_error(spell_err);
            delete_aspell_config(aspell->config);
            free(aspell);
            aspell = NULL;
            break;
        }
        aspell->speller = to_aspell_speller(spell_err);
        break;
    }
    return aspell;
}
예제 #7
0
파일: gtkspell.c 프로젝트: jmissig/gabber
/*
 * Switch the speller of 'spell' to the language 'lang'.
 *
 * When 'lang' is NULL the LANG environment variable is consulted; an empty
 * value or the "C" locale (optionally with a ".codeset" suffix) means "no
 * explicit language", in which case the "language-tag" option is not set.
 *
 * On success the previous speller (if any) is destroyed, the new one is
 * stored in spell->speller and TRUE is returned. On failure spell->speller
 * is left untouched, 'error' is filled in (when built with USING_ASPELL or
 * USING_PSPELL) and FALSE is returned.
 */
static gboolean
gtkspell_set_language_internal(GtkSpell *spell, const gchar *lang, GError **error) {
	AspellConfig *config;
	AspellCanHaveError *err;

	if (lang == NULL) {
		lang = g_getenv("LANG");
		if (lang) {
			if (lang[0] == 0)
				lang = NULL;
			/* Only the real "C" locale ("C", "c", "C.UTF-8", ...) counts.
			 * The former one-character prefix test also swallowed every
			 * locale that merely starts with 'c', e.g. cs_CZ or ca_ES. */
			else if ((lang[0] == 'C' || lang[0] == 'c') &&
					(lang[1] == 0 || lang[1] == '.'))
				lang = NULL;
		}
	}

	config = new_aspell_config();
	if (lang)
		aspell_config_replace(config, "language-tag", lang);
	aspell_config_replace(config, "encoding", "utf-8");
	err = new_aspell_speller(config);
	delete_aspell_config(config);

	if (aspell_error_number(err) != 0) {
#ifdef USING_ASPELL
		g_set_error(error, GTKSPELL_ERROR, GTKSPELL_ERROR_BACKEND,
				"aspell: %s", aspell_error_message(err));
#elif defined USING_PSPELL
		g_set_error(error, GTKSPELL_ERROR, GTKSPELL_ERROR_BACKEND,
				"pspell: %s", aspell_error_message(err));
#endif
		/* the message has been copied into the GError; the failed result
		 * object itself was leaked before */
		delete_aspell_can_have_error(err);
		return FALSE;
	}
	if (spell->speller)
		delete_aspell_speller(spell->speller);
	spell->speller = to_aspell_speller(err);

	return TRUE;
}
예제 #8
0
파일: raspell.c 프로젝트: stuart/raspell
/**
 * Ctor for aspell objects:
 * Aspell.new(language, jargon, size, encoding)
 * Please note: All parameters are optional. If a parameter is omitted, a default value is assumed from
 *              the environment (eg lang from $LANG). To retain default values, you can use nil
 *              as value: to set only size: Aspell.new(nil, nil, "80")
 * @param language ISO639 language code plus optional ISO 3166 country code as string (eg: "de" or "en_US")
 * @param jargon a special jargon of the selected language
 * @param size the size of the dictionary to choose (if there are options)
 * @param encoding the encoding to use
 * @exception Exception if the specified dictionary is not found.
 */
static VALUE aspell_s_new(int argc, VALUE *argv, VALUE klass) {
    VALUE vlang, vjargon, vsize, vencoding;
    VALUE verrmsg;
    //aspell values
    AspellCanHaveError * ret;
    AspellSpeller * speller;
    AspellConfig * config;

    //extract values first: rb_scan_args raises on a wrong argument count,
    //and a raise does not return, so no aspell object may exist yet
    rb_scan_args(argc, argv, "04", &vlang, &vjargon, &vsize, &vencoding);

    //create new config
    config = new_aspell_config();

    //language:
    if (RTEST(vlang)) set_option(config, "lang", STR2CSTR(vlang));
    //jargon:
    if (RTEST(vjargon)) set_option(config, "jargon", STR2CSTR(vjargon));
    //size:
    if (RTEST(vsize)) set_option(config, "size", STR2CSTR(vsize));
    //encoding:
    if (RTEST(vencoding)) set_option(config, "encoding", STR2CSTR(vencoding));

    //create speller:
    ret = new_aspell_speller(config);
    delete_aspell_config(config);
    if (aspell_error(ret) != 0) {
        //copy the message into a Ruby string (owned by the GC) before the
        //error object is freed; the former strdup() copy was never released
        //because raising does not return
        verrmsg = rb_str_new2(aspell_error_message(ret));
        delete_aspell_can_have_error(ret);
        rb_exc_raise(rb_exc_new3(cAspellError, verrmsg));
    }

    speller = to_aspell_speller(ret);

    //wrap pointer
    return Data_Wrap_Struct(klass, 0, aspell_free, speller);
}
예제 #9
0
/* Create a new speller *******************************************************/
/* Accepted call forms (taken from 'args'):
 *   - no arguments: speller with the default configuration
 *   - two strings: a single key/value configuration pair
 *   - any number of (key, value) string tuples
 * Returns a new aspell_AspellObject wrapping the speller, or NULL with a
 * Python exception set when an argument is malformed, a configuration key
 * is rejected, the speller cannot be created or the object cannot be
 * allocated. */
static PyObject* new_speller(PyObject* self, PyObject* args) {
	aspell_AspellObject* newobj;

	AspellSpeller* speller = 0;
	AspellConfig*  config;
	AspellCanHaveError* possible_error;

	int i;
	int n; /* arg count */
	char *key, *value;

	config = new_aspell_config();
	if (config == NULL) {
		PyErr_SetString(_AspellModuleException, "can't create config");
		return NULL;
	}

	/* check constructor arguments */
	n = PyTuple_Size(args);
	switch (n) {
		case 0: /* no arguments passed */
			break;

		case 2: /* constructor is called with single pair: key & value */
			if (PyArg_ParseTuple(args, "ss", &key, &value)) {
				if (!aspell_config_replace(config, key, value)) {
					PyErr_SetString(_AspellConfigException, aspell_config_error_message(config));
					goto arg_error;
				}
				break;
			}
			/* not two plain strings: retry as two (key, value) tuples */
			PyErr_Clear();
			/* fall through */
		default: /* list of tuples key&value */
			for (i=0; i<n; i++) {
				if (!PyArg_ParseTuple(PyTuple_GetItem(args, i), "ss", &key, &value)) {
					PyErr_Format(PyExc_TypeError, "argument %d: tuple of two strings (key, value) expeced", i);
					goto arg_error;
				}
				if (!aspell_config_replace(config, key, value)) {
					PyErr_SetString(_AspellConfigException, aspell_config_error_message(config));
					goto arg_error;
				}
			}
			/* 'args' is a borrowed reference owned by the caller; it must
			   not be released here (the former Py_DECREF(args) dropped a
			   reference this function never owned) */
			break;
	}

	/* try to create a new speller */
	possible_error = new_aspell_speller(config);
	delete_aspell_config(config);

	if (aspell_error_number(possible_error) == 0)
		/* save a speller */
		speller = to_aspell_speller(possible_error);
	else {
		/* or raise an exception */
		PyErr_SetString(_AspellSpellerException, aspell_error_message(possible_error));
		delete_aspell_can_have_error(possible_error);
		return NULL;
	}

	/* create a new py-object */
	newobj = (aspell_AspellObject*)PyObject_New(aspell_AspellObject, &aspell_AspellType);
	if (newobj == NULL) {
		/* allocation failed (exception already set): do not dereference
		   the NULL object and do not leak the speller */
		delete_aspell_speller(speller);
		return NULL;
	}
	newobj->speller = speller;

	return (PyObject*)newobj;

/* argument error: before return NULL we need to
   delete speller's config we've created */
arg_error:
	delete_aspell_config(config);
	return NULL;
}
예제 #10
0
/*
 * Find the first configured language whose aspell dictionary accepts 'word'.
 *
 * For every language the typed characters are mapped to key codes and back
 * to the symbols those keys produce in that language; the resulting string
 * is then checked with an aspell speller created for that language.
 *
 * word - the typed characters
 * len  - number of characters of 'word' to use; words shorter than two
 *        characters are not checked
 *
 * Returns the index of the first matching language, or NO_LANGUAGE.
 */
static int get_aspell_hits(const char *word, int len)
{
	if (len < 2) 
	{
		log_message(DEBUG, "   [-]Skip aspell checking (word is very short)");
		return NO_LANGUAGE;	
	}

	AspellConfig *spell_config = new_aspell_config();
	
	for (int lang = 0; lang < xconfig->total_languages; lang++)
	{
		char *lang_word = (char *) malloc(1 * sizeof(char));
		if (lang_word == NULL)
			continue;
		lang_word[0] = NULLSYM;

		for (int i = 0; i < len; i++)
		{
			KeyCode kc;
			int modifier;
			main_window->xkeymap->char_to_keycode(main_window->xkeymap, word[i], &kc, &modifier);

			char *symbol = keycode_to_symbol(kc, lang, modifier);
			if (symbol == NULL)
				continue;

			/* Grow through a temporary so the old buffer is not lost
			   (and leaked) when realloc fails. */
			char *grown = (char *) realloc(lang_word, (strlen(lang_word) + strlen(symbol) + 1) * sizeof(char));
			if (grown == NULL)
			{
				free(symbol);
				free(lang_word);
				lang_word = NULL;
				break;
			}
			lang_word = grown;
			strcat(lang_word, symbol);

			free(symbol);
		}
		
		/* Out of memory while building the word: skip this language. */
		if (lang_word == NULL)
			continue;

		aspell_config_replace(spell_config, "lang", xconfig->languages[lang].dir);
		AspellCanHaveError *possible_err = new_aspell_speller(spell_config);

		if (aspell_error_number(possible_err) == 0)
		{
			AspellSpeller *spell_checker = to_aspell_speller(possible_err);
			int correct = aspell_speller_check(spell_checker, lang_word, strlen(lang_word));
			delete_aspell_speller(spell_checker);
			if (correct)
			{
				log_message(DEBUG, "   [+]Found this word in %s aspell dictionary", xconfig->get_lang_name(xconfig, lang));
				free(lang_word);
				/* the shared config used to leak on every return path */
				delete_aspell_config(spell_config);
				return lang;
			}
		}
		else
		{
			log_message(DEBUG, "   [!]Error aspell checking for %s aspell dictionary", xconfig->get_lang_name(xconfig, lang));
			/* a failed result never becomes a speller: free it as an error */
			delete_aspell_can_have_error(possible_err);
		}
	
		free(lang_word);
	}
	
	delete_aspell_config(spell_config);

	log_message(DEBUG, "   [-]This word has no hits for all aspell dictionaries");
	return NO_LANGUAGE;
}
예제 #11
0
/*
 * Creates a speller for a language and stores it in the
 * weechat_aspell_spellers hashtable under the key "lang".
 *
 * Returns the new speller, or NULL if "lang" is NULL or the speller could
 * not be created (an error message is printed in that case).
 *
 * NOTE(review): the return type depends on USE_ENCHANT; the opening part of
 * that conditional lies above this chunk - confirm against the full file.
 */
AspellSpeller *
#endif /* USE_ENCHANT */
weechat_aspell_speller_new (const char *lang)
{
#ifdef USE_ENCHANT
    EnchantDict *new_speller;
#else
    AspellConfig *config;
    AspellCanHaveError *ret;
    AspellSpeller *new_speller;
#endif /* USE_ENCHANT */
    struct t_infolist *infolist;

    /* nothing to create without a language */
    if (!lang)
        return NULL;

    if (weechat_aspell_plugin->debug)
    {
        weechat_printf (NULL,
                        "%s: creating new speller for lang \"%s\"",
                        ASPELL_PLUGIN_NAME, lang);
    }

#ifdef USE_ENCHANT
    /* enchant build: the dictionary is requested from the broker */
    new_speller = enchant_broker_request_dict (broker, lang);
    if (!new_speller)
    {
        weechat_printf (NULL,
                        _("%s%s: error: unable to create speller for lang \"%s\""),
                        weechat_prefix ("error"), ASPELL_PLUGIN_NAME,
                        lang);
        return NULL;
    }
#else
    /* create a speller instance for the newly created cell */
    config = new_aspell_config ();
    aspell_config_replace (config, "lang", lang);
#endif /* USE_ENCHANT */

    /* apply all options */
    infolist = weechat_infolist_get ("option", NULL, "aspell.option.*");
    if (infolist)
    {
        while (weechat_infolist_next (infolist))
        {
#ifdef USE_ENCHANT
            /* TODO: set option with enchant */
#else
            /* copy each listed option (name/value) into the aspell config */
            aspell_config_replace (config,
                                   weechat_infolist_string (infolist, "option_name"),
                                   weechat_infolist_string (infolist, "value"));
#endif /* USE_ENCHANT */
        }
        weechat_infolist_free (infolist);
    }

#ifndef USE_ENCHANT
    ret = new_aspell_speller (config);

    if (aspell_error (ret) != 0)
    {
        /* report the aspell error, then free both the config and the
           failed result before giving up */
        weechat_printf (NULL,
                        "%s%s: error: %s",
                        weechat_prefix ("error"), ASPELL_PLUGIN_NAME,
                        aspell_error_message (ret));
        delete_aspell_config (config);
        delete_aspell_can_have_error (ret);
        return NULL;
    }

    new_speller = to_aspell_speller (ret);
#endif /* USE_ENCHANT */

    /* register the speller under its language */
    weechat_hashtable_set (weechat_aspell_spellers, lang, new_speller);

#ifndef USE_ENCHANT
    /* free configuration */
    delete_aspell_config (config);
#endif /* USE_ENCHANT */

    return new_speller;
}
예제 #12
0
void SpellChecker::setLanguage(const QString &lang)
{
    if (lang.isEmpty()) return;
    delete_aspell_speller(spell_checker1);
    delete_aspell_speller(spell_checker2);
    bad_language.clear();

    m_lang2 = "en";
    m_lang1 = m_map->value(lang, QString("en"));
    if (lang.contains("+")) {
        QStringList sl = lang.split("+");
        m_lang1 = m_map->value(sl[0], QString("en"));
        m_lang2 = m_map->value(sl[1], QString("en"));
        if ((m_lang1 == "deu")||(lang == "ger")) {
            m_lang1 = "de_DE";
        }
        if ((m_lang2 == "deu")||(lang == "ger")) {
            m_lang2 = "de_DE";
        }
    }
    if (lang == "rus_fra") {
        m_lang1 = "ru";
        m_lang2 = "fr";
    } else if (lang == "rus_ger") {
        m_lang1 = "ru";
        m_lang2 = "de";
    } else if (lang == "rus_spa") {
        m_lang1 = "ru";
        m_lang2 = "es";
    }
    if ((lang == "deu")||(lang == "ger")) {
        m_lang1 = "de_DE";
        m_lang2 = "de_AT";

    }
    if (lang == "ruseng") {
        m_lang1 = "ru";
        m_lang2 = "en";

    }
    aspell_config_replace(spell_config1, "lang", m_lang1.toAscii());
    aspell_config_replace(spell_config2, "lang", m_lang2.toAscii());
    AspellCanHaveError *possible_err = new_aspell_speller(spell_config1);
    spell_checker1 = 0;
    if (aspell_error_number(possible_err) == 0)
        spell_checker1 = to_aspell_speller(possible_err);
    else
        delete_aspell_can_have_error(possible_err);
    possible_err = new_aspell_speller(spell_config2);
    spell_checker2 = 0;
    if (aspell_error_number(possible_err) == 0)
        spell_checker2 = to_aspell_speller(possible_err);
    else
        delete_aspell_can_have_error(possible_err);

    // Check absent dictionary
    if (spell_checker1 == 0)
        bad_language = m_lang1;
    if (spell_checker2 == 0)
        bad_language = m_lang2;

}
/**
 * Create a userdata object
 * @param language the language e.g. "en_GB", used for the aspell speller
 * @param barefile passed on to open_dest_files together with fmt
 * @param rules the recipe, stored in the new object
 * @param fmt the format object, passed on to open_dest_files
 * @param hhe optional exceptions object, stored in the new object if not NULL
 * @return a complete userdata object or NULL
 */
userdata *userdata_create( const char *language, char *barefile, recipe *rules, 
    format *fmt, hh_exceptions *hhe )
{
    int err = 0;
    userdata *u = calloc( 1, sizeof(userdata) );
    if ( u != NULL )
    {
        u->rules = rules;
        if ( hhe != NULL )
            u->hhe = hhe;
        u->spell_config = new_aspell_config();
        if ( u->spell_config != NULL )
        {
            aspell_config_replace( u->spell_config, "lang", language );
            AspellCanHaveError *possible_err 
                = new_aspell_speller(u->spell_config);
            u->spell_checker = 0;
            if (aspell_error_number(possible_err) != 0)
            {
                fprintf(stderr,"%s\n",aspell_error_message(possible_err));
                /* the failed result never becomes a speller, so it is not
                   reachable from u and has to be released here */
                delete_aspell_can_have_error( possible_err );
                err = 1;
            }
            else
            {
                u->spell_checker = to_aspell_speller(possible_err);
                if ( u->spell_checker == NULL )
                {
                    fprintf(stderr,"userdata: failed to initialise speller\n");
                    err = 1;
                }
            }
            u->range_stack = stack_create();
            if ( u->range_stack == NULL )
            {
                err = 1;
                /* newline added so the next message starts on its own line */
                fprintf(stderr, 
                    "stripper: failed to allocate store for range stack\n" );
            }
            u->ignoring = stack_create();
            if ( u->ignoring == NULL )
            {
                err = 1;
                fprintf(stderr, 
                    "stripper: failed to allocate store for ignore stack\n" );
            }
            if ( !open_dest_files(u,barefile,fmt) )
            {
                err = 1;
                fprintf(stderr,"stripper: couldn't open dest files\n");
            }
        }
        else
        {
            fprintf(stderr, "userdata: failed to initialise speller\n");
            err = 1;
        }
    }
    else
        fprintf(stderr, "userdata:failed to allocate object\n");
    /* any failure above invalidates the whole object */
    if ( err )
    {
        userdata_dispose( u );
        u = NULL;
    }
    return u;
}
예제 #14
0
// Tokenizes every corpus in _corpora, canonicalizes the words with the help
// of aspell, and builds the k-gram statistics in _stats for prefix lengths
// below maxK. Reads "emoticons.txt" and "emojis.txt" from the working
// directory if present, reports progress on std::cout and exits the process
// if the aspell speller cannot be created.
//
// runs in O(t^2) time where t is the number of tokens in the input corpus
// We consider maxK to be fairly constant
void rawr::compile(int maxK)
{
  _maxK = maxK;

  std::vector<std::vector<token_id>> tokens;
  std::set<std::string> thashtags;
  std::set<std::string> fv_emoticons;
  
  std::ifstream fvefile("emoticons.txt");
  if (fvefile)
  {
    std::string line;
    while (getline(fvefile, line))
    {
      fv_emoticons.insert(line);
      emoticons.forms.add(line);
    }
  }
  
  fvefile.close();
  
  std::map<std::string, std::string> canonical_form;
  
  AspellConfig* spell_config = new_aspell_config();
  AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
  if (aspell_error_number(possible_err) != 0)
  {
    std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
    exit(1);
  }
  
  AspellSpeller* spell_checker = to_aspell_speller(possible_err);
  
  std::cout << "Reading emojis..." << std::endl;
  prefix_search emojis;
  std::ifstream emoji_file("emojis.txt");
  if (emoji_file)
  {
    // Loop on the result of getline rather than on eof(): the eof() form
    // ran one extra iteration with an empty string, on which back() is
    // undefined behavior, and registered an empty emoji.
    std::string rawmojis;
    while (getline(emoji_file, rawmojis))
    {
      // Strip the carriage return of a Windows line ending
      if (!rawmojis.empty() && rawmojis.back() == '\r')
      {
        rawmojis.pop_back();
      }
    
      // Blank lines carry no emoji
      if (!rawmojis.empty())
      {
        emojis.add(rawmojis);
      }
    }
    
    emoji_file.close();
  }

  std::cout << "Tokenizing corpus...   0%" << std::flush;
  int len = 0;
  for (const auto& c : _corpora)
  {
    len += c.length();
  }
  
  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
  for (int i = 0; i < _corpora.size(); i++)
  {
    size_t start = 0;
    int end = 0;
    std::vector<token_id> tkcor;

    while (end != std::string::npos)
    {
      // len is zero when every corpus is empty; skip the progress update
      // instead of dividing by zero
      if (len > 0)
      {
        perprime = (startper + end) * 100 / len;
      }
      
      if (perprime != per)
      {
        per = perprime;
      
        std::cout << "\b\b\b\b" << std::right;
        std::cout.width(3);
        std::cout << per << "%" << std::flush;
      }
    
      end = _corpora[i].find_first_of(" \n", start);

      bool emoji = false;
      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
      std::string t = "";
    
      if (te.compare("") && te.compare(".") && te.compare(" "))
      {
        if (te.back() == ' ')
        {
          te.pop_back();
        }
        
        // Extract strings of emojis into their own tokens even if they're not space delimited
        int m = emojis.match(te);
        emoji = m > 0;
        if (m == 0) m = 1;
        t = te.substr(0,m);
        te = te.substr(m);
      
        while (!te.empty())
        {
          m = emojis.match(te);
          if (emoji == (m > 0))
          {
            if (m == 0) m = 1;
            t += te.substr(0,m);
            te = te.substr(m);
          } else {
            // The token ends where emoji and non-emoji text meet; move the
            // end marker back so the remainder is picked up as the next token
            end = start + t.length() - 1;
            break;
          }
        }
      
        std::string tc(t);
        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);

        int pst = tc.find_first_not_of("\"([*");
        int dst = tc.find_last_not_of("\")]*.,?!\n;:");
        std::string canonical("");
        if ((pst != std::string::npos) && (dst != std::string::npos))
        {
          canonical = std::string(tc, pst, dst - pst + 1);
        }
      
        word& w = ([&] () -> word& {
          // Hashtag freevar
          if (canonical[0] == '#')
          {
            thashtags.insert(canonical);
          
            return hashtags;
          }
        
          // Emoticon freevar
          if (emoji)
          {
            emoticons.forms.add(canonical);
          
            return emoticons;
          }
        
          if ((pst != std::string::npos) && (dst != std::string::npos))
          {
            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
            if (fv_emoticons.count(emoticon_canon) == 1)
            {
              emoticons.forms.add(emoticon_canon);
          
              return emoticons;
            }
          }
        
          // Basically any other word
          if (canonical_form.count(canonical) == 0)
          {
            if (
              // Legacy freevars should be distinct from tokens containing similar words
              (canonical.find("$name$") != std::string::npos)
              // Words with no letters will be mangled by the spell checker
              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
              )
            {
              canonical_form[canonical] = canonical;
              words.emplace(canonical, canonical);
            } else {
              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
              if (correct)
              {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
              } else {
                // Misspelled: fold the word into the first suggestion, if any
                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
                const char* replacement = aspell_string_enumeration_next(elements);
                if (replacement != NULL)
                {
                  std::string sugrep(replacement);
                  canonical_form[canonical] = sugrep;
          
                  if (words.count(sugrep) == 0)
                  {
                    words.emplace(sugrep, sugrep);
                  }
                } else {
                  words.emplace(canonical, canonical);
                  canonical_form[canonical] = canonical;
                }
          
                delete_aspell_string_enumeration(elements);
              }
            }
          }
        
          word& tw = words.at(canonical_form.at(canonical));
          tw.forms.add(canonical);
        
          return tw;
        })();
      
        token tk(w);
        tk.raw = t;
      
        // Count the opening delimiters at the front of the raw token
        for (char c : t)
        {
          if (c == '*')
          {
            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
          } else if (c == '[')
          {
            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
          } else if (c == '(')
          {
            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
          } else if (c == '"')
          {
            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
          } else {
            break;
          }
        }
      
        int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
        if (backtrack != t.length())
        {
          std::string ending = t.substr(backtrack);
          std::string suffix;
          bool newline = false;
          bool terminating = false;
        
          for (char c : ending)
          {
            if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
            {
              suffix += c;
              terminating = true;
            
              continue;
            } else if (c == '\n')
            {
              newline = true;
              terminating = true;
              
              continue;
            }
          
            // Every character of `ending` comes from the set searched above
            // and the punctuation/newline cases were handled already, so
            // only the four closing delimiters can reach this switch
            parentype pt = ([&] {
              switch (c)
              {
                case ']': return parentype::square_bracket;
                case ')': return parentype::paren;
                case '*': return parentype::asterisk;
                case '"': return parentype::quote;
              }
            })();
          
            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
              tk.delimiters[{pt, doublestatus::opening}]--;
              tk.delimiters[{pt, doublestatus::both}]++;
            } else {
              tk.delimiters[{pt, doublestatus::closing}]++;
            }
          }
        
          if (terminating)
          {
            if ((suffix == ",") && (!newline))
            {
              tk.suffix = suffixtype::comma;
            } else {
              tk.suffix = suffixtype::terminating;
              
              if (!newline)
              {
                w.terms.add({suffix, false});
              } else {
                w.terms.add({".", false});
              }
            }
          }
        }

        tkcor.push_back(_tokenstore.add(tk));
      }

      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
    }
    
    tokens.push_back(tkcor);
    
    startper += _corpora[i].length();
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
  
  delete_aspell_speller(spell_checker);
  delete_aspell_config(spell_config);
  
  std::cout << canonical_form.size() << " distinct forms" << std::endl;
  std::cout << words.size() << " distinct words" << std::endl;
  
  // Time to condense the distribution stuff for the words
  std::cout << "Compiling token histograms..." << std::endl;
  for (auto& it : words)
  {
    it.second.forms.compile();
    it.second.terms.compile();
  }
  
  // Hashtag freevar is not frequency distributed
  for (auto& it : thashtags)
  {
    hashtags.forms.add(it);
  }
  
  hashtags.forms.compile();
  hashtags.terms.compile();
  
  // Compile other freevars
  emoticons.forms.compile();
  emoticons.terms.compile();

  // Compile the interned tokens.
  _tokenstore.compile();

  // kgram distribution
  std::cout << "Creating markov chain...   0%" << std::flush;
  std::map<kgram, std::map<token_id, token_data> > tstats;

  len = 0;
  for (const auto& c : tokens)
  {
    len += (maxK-1) * c.size();
  }
  
  startper = 0;
  per = 0;
  perprime = 0;
  int corpid = 0;
  // Iterate by reference: copying every tokenized corpus is pure overhead
  for (const auto& corpus : tokens)
  {
    for (int k=0; k<maxK && k<corpus.size(); k++)
    {
      // The zero'th token should be a terminator.
      token_id fid = corpus[k];
      const token& f = _tokenstore.get(fid);

      kgram term_prefix(corpus.begin(), corpus.begin()+k);
      term_prefix.push_front(wildcardQuery);

      if (tstats[term_prefix].count(fid) == 0)
      {
        tstats[term_prefix].emplace(fid, fid);
      }

      token_data& td2 = tstats[term_prefix].at(fid);
      td2.all++;
      td2.corpora.insert(corpid);

      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
      {
        td2.uppercase++;
      } else if (isupper(f.raw[0]))
      {
        td2.titlecase++;
      }
    }

    for (int k=1; k<maxK && k<corpus.size(); k++)
    {
      for (int i=0; i<(corpus.size() - k); i++)
      {
        perprime = (startper+i) * 100 / len;
        if (perprime != per)
        {
          per = perprime;
      
          std::cout << "\b\b\b\b" << std::right;
          std::cout.width(3);
          std::cout << per << "%" << std::flush;
        }
      
        kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
        token_id fid = corpus[i+k];
        const token& f = _tokenstore.get(fid);

        if (tstats[prefix].count(fid) == 0)
        {
          tstats[prefix].emplace(fid, fid);
        }

        token_data& td = tstats[prefix].at(fid);
        td.all++;
        td.corpora.insert(corpid);

        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
        {
          td.uppercase++;
        } else if (isupper(f.raw[0]))
        {
          td.titlecase++;
        }

        // A prefix that starts right after a terminating token is also
        // recorded with its first element replaced by the wildcard
        const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);

          if (tstats[term_prefix].count(fid) == 0)
          {
            tstats[term_prefix].emplace(fid, fid);
          }

          token_data& td2 = tstats[term_prefix].at(fid);
          td2.all++;
          td2.corpora.insert(corpid);

          if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
          {
            td2.uppercase++;
          } else if (isupper(f.raw[0]))
          {
            td2.titlecase++;
          }
        }
      }
      
      startper += corpus.size();
    }
    
    corpid++;
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;

  // Condense the kgram distribution
  std::cout << "Compiling kgram distributions...   0%";
  len = tstats.size();
  per = 0;
  perprime = 0;
  int indicator = 0;
  for (auto& it : tstats)
  {
    indicator++;
    perprime = indicator * 100 / len;
    if (per != perprime)
    {
      per = perprime;
    
      std::cout << "\b\b\b\b" << std::right;
      std::cout.width(3);
      std::cout << per << "%" << std::flush;
    }
    
    const kgram& klist = it.first;
    auto& probtable = it.second;
    auto& distribution = _stats[klist];
    int max = 0;
		
    // Key each entry by the running total of occurrences
    for (auto& kt : probtable)
    {
      max += kt.second.all;
			
      distribution.emplace(max, kt.second);
    }
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
  
  _compiled = true;
}
예제 #15
0
int main(int argc, const char *argv[]) 
{
  AspellCanHaveError * ret;
  AspellSpeller * speller;
  int have;
  char word[81];
  char * p;
  char * word_end;
  AspellConfig * config;

  if (argc < 2) {
    printf("Usage: %s <language> [<size>|- [[<jargon>|- [<encoding>]]]\n", argv[0]);
    return 1;
  }

  config = new_aspell_config();

  aspell_config_replace(config, "lang", argv[1]);

  if (argc >= 3 && argv[2][0] != '-' && argv[2][1] != '\0')
    aspell_config_replace(config, "size", argv[2]);

  if (argc >= 4 && argv[3][0] != '-')
    aspell_config_replace(config, "jargon", argv[3]);

  if (argc >= 5 && argv[4][0] != '-')
    aspell_config_replace(config, "encoding", argv[4]);

  ret = new_aspell_speller(config);

  delete_aspell_config(config);

  if (aspell_error(ret) != 0) {
    printf("Error: %s\n",aspell_error_message(ret));
    delete_aspell_can_have_error(ret);
    return 2;
  }
  speller = to_aspell_speller(ret);
  config = aspell_speller_config(speller);

  fputs("Using: ",                                      stdout);
  fputs(aspell_config_retrieve(config, "lang"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "jargon"),       stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "size"),         stdout);
  fputs("-",                                            stdout);
  fputs(aspell_config_retrieve(config, "module"),       stdout);
  fputs("\n\n",                                         stdout);

  puts("Type \"h\" for help.\n");

  while (fgets(word, 80, stdin) != 0) {

    /* remove trailing spaces */

    word_end = strchr(word, '\0') - 1;
    while (word_end != word && (*word_end == '\n' || *word_end == ' ')) 
      --word_end;
    ++word_end;
    *word_end = '\0';
    
    putchar('\n');
    switch (word[0]) {
    case '\0':
      break;
    case 'h':
      puts(
	"Usage: \n"
	"  h(elp)      help\n"
	"  c <word>    check if a word is the correct spelling\n"
	"  s <word>    print out a list of suggestions for a word\n"
	"  a <word>    add a word to the personal word list\n"
	"  i <word>    ignore a word for the rest of the session\n"
        "  d <file>    spell checks a document\n"
	"  p           dumps the personal word list\n"
	"  P           dumps the session word list\n"
	"  m           dumps the main  word list\n"
        "  o <option> <value> sets a config option\n"
	"  r <option>         retrieves a config option\n"
        "  l <option>         retrieves a config option as a list\n"
	"  S           saves all word lists\n"
	"  C           clear the curent sesstion word list\n"
	"  x           quite\n"	);
      break;
    case 'p':
      print_word_list(speller, 
		      aspell_speller_personal_word_list(speller), '\n');
      break;
    case 'P':
      print_word_list(speller, 
		      aspell_speller_session_word_list(speller), '\n');
      break;
    case 'm':
      print_word_list(speller, 
		      aspell_speller_main_word_list(speller), '\n');
      break;
    case 'S':
      aspell_speller_save_all_word_lists(speller);
      check_for_error(speller);
      break;
    case 'C': 
      aspell_speller_clear_session(speller);
      check_for_error(speller);
      break;
    case 'x':
      goto END;
    case 'c':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	have = aspell_speller_check(speller, word + 2, -1);
	if (have == 1) 
	  puts("correct");
	else if (have == 0)
	  puts("incorrect");
	else
	  printf("Error: %s\n", aspell_speller_error_message(speller));
      }
      break;
    case 's':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	print_word_list(speller, 
			aspell_speller_suggest(speller, word + 2, -1), '\n');
      }
      break;
    case 'a':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_personal(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'i':
      if (strlen(word) < 3) {
	printf("Usage: %c <word>\n", word[0]);
      } else {
	aspell_speller_add_to_session(speller, word + 2, -1);
	check_for_error(speller);
      }
      break;
    case 'o':
      word[80] = '\0'; /* to make sure strchr doesn't run off end of string */
      p = strchr(word + 3, ' ');
      if (strlen(word) < 3 || p == 0) {
	printf("Usage: %c <option> <value>\n", word[0]);
      } else {
	*p = '\0';
	++p;
	aspell_config_replace(config, word + 2, p);
	check_for_config_error(config);
      }
      break;
    case 'r':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	const char * val = aspell_config_retrieve(config, word + 2);
	check_for_config_error(config);
	if (val)
	  printf("%s = \"%s\"\n", word + 2, val);
      }
      break;
    case 'l':
      if (strlen(word) < 3) {
	printf("Usage: %c <option>\n", word[0]);
      } else {
	AspellStringList * lst = new_aspell_string_list();
	AspellMutableContainer * lst0 
	  = aspell_string_list_to_mutable_container(lst);
	AspellStringEnumeration * els;
	const char * val;
	aspell_config_retrieve_list(config, word + 2, lst0);
	check_for_config_error(config);
	els = aspell_string_list_elements(lst);
	printf("%s:\n", word + 2);
	while ( (val = aspell_string_enumeration_next(els)) != 0)
	  printf("  %s\n", val);
	delete_aspell_string_enumeration(els);
	delete_aspell_string_list(lst);
      }
      break;
    case 'd':
      if (strlen(word) < 3) {
	printf("Usage: %c <file>\n", word[0]);
      } else {
	check_document(speller, word + 2);
	printf("\n");
      }
      break;
    default:
      printf("Unknown Command: %s\n", word);
    }
    putchar('\n');
  }
 END:
  delete_aspell_speller(speller);
  return 0;
}