Example #1
0
SpellChecker::SpellChecker( QObject * parent ) 
	: QObject(parent)
{
	m_spell_config_ru = new_aspell_config();
	aspell_config_replace(m_spell_config_ru, "dict-dir", "./dict");
	aspell_config_replace(m_spell_config_ru, "encoding", "utf-8");
	aspell_config_replace(m_spell_config_ru, "lang", "ru");
	AspellCanHaveError * possible_err = new_aspell_speller(m_spell_config_ru);
	m_spell_checker_ru = 0;
	if (aspell_error_number(possible_err) != 0){
		puts(aspell_error_message(possible_err));
	}
	else{
		m_spell_checker_ru = to_aspell_speller(possible_err); 
	}


	m_spell_config_en = new_aspell_config();
	aspell_config_replace(m_spell_config_en, "dict-dir", "./dict");
	aspell_config_replace(m_spell_config_en, "encoding", "utf-8");
	aspell_config_replace(m_spell_config_en, "lang", "en");
	possible_err = new_aspell_speller(m_spell_config_en);
	m_spell_checker_en = 0;
	if (aspell_error_number(possible_err) != 0){
		puts(aspell_error_message(possible_err));
	}
	else{
		m_spell_checker_en = to_aspell_speller(possible_err); 
	}

	m_codec = QTextCodec::codecForName("UTF-8");
}
Example #2
0
int main(int argc,char **argv)
{
#ifdef _USEQT5_
	QApplication	app(argc,argv);
#endif
	AspellCanHaveError*	possible_err;

	aspellConfig=new_aspell_config();
	possible_err=new_aspell_speller(aspellConfig);

	if(aspell_error_number(possible_err)!= 0)
		puts(aspell_error_message(possible_err));
	else
		spellChecker=to_aspell_speller(possible_err);

#ifndef _USEQT5_
	gtk_init(&argc,&argv);

	buildMainGuiGtk();
	gtk_window_stick(GTK_WINDOW(window));
	gtk_window_set_keep_above((GtkWindow*)window,true);
	gtk_widget_show_all(window);
	gtk_main();
#else
	holdapp=&app;
	buildMainGuiQt();
	window->show();
	app.exec();
#endif
}
Example #3
0
/**
 * Enables spell checking for one more language.
 *
 * @param name aspell language code; also the key under which the speller is
 *             stored in `checkers`
 * @return true when a speller for `name` exists after the call, false when
 *         aspell could not create one (the aspell message is shown through
 *         MessageBox::msg)
 */
bool SpellChecker::addCheckedLang(QString &name)
{
	// Already enabled: nothing to do.
	if (checkers.find(name) != checkers.end())
		return true;

	aspell_config_replace(spellConfig, "lang", name.toAscii().constData());

	// create spell checker using prepared configuration
	AspellCanHaveError* possibleErr = new_aspell_speller(spellConfig);
	if (aspell_error_number(possibleErr) != 0)
	{
		MessageBox::msg(aspell_error_message(possibleErr));
		// The failed result is never converted into a speller, so release
		// it here instead of leaking it on every failed attempt.
		delete_aspell_can_have_error(possibleErr);
		return false;
	}

	checkers[name] = to_aspell_speller(possibleErr);

	// The first enabled language: run chatCreated() for every chat that
	// ChatWidgetManager currently reports.
	if (checkers.size() == 1)
	{
		foreach(ChatWidget *chat, ChatWidgetManager::instance()->chats())
			chatCreated(chat);
	}

	return true;
}
Example #4
0
//__________________________________________________________________________
/**
 * Stores the requested settings and creates the aspell speller.
 *
 * On success `fspeller` holds the new speller and `fconfig` points at the
 * configuration returned by aspell_speller_config() for it.
 *
 * @param lang     language value saved in flang
 * @param jargon   jargon value saved in fjargon
 * @param encoding encoding value saved in fencoding
 * @throws std::invalid_argument propagated unchanged from setConfig()
 * @throws std::runtime_error    when aspell cannot create the speller
 */
void Speller::Aspell::Suggest::init(const std::string& lang,
				    const std::string& jargon,
				    const std::string& encoding)
	throw( std::invalid_argument, std::runtime_error )
{
	// Save aspell configuration values
	flang = lang;
	fjargon = jargon;
	fencoding = encoding;

	fconfig = new_aspell_config();
	try
	{
		setConfig();
	}
	catch( const std::invalid_argument& )
	{
		// Rethrow the original exception object; `throw err;` would throw
		// a sliced copy of it.
		throw;
	}

	AspellCanHaveError* ret = new_aspell_speller( fconfig );
	delete_aspell_config( fconfig );
	// Never leave the member pointing at the configuration just deleted:
	// a later resetConfig() or cleanup would free it a second time.
	fconfig = 0;
	if( aspell_error_number( ret ) != 0 )
	{
		delete_aspell_can_have_error( ret );
		throw std::runtime_error( "(Aspell::Speller::Suggest::init"
					  "): Error in creating speller." );
	}

	fspeller = to_aspell_speller( ret );
	fconfig = aspell_speller_config( fspeller );
}
Example #5
0
//__________________________________________________________________________
void Speller::Aspell::Suggest::resetConfig()
	throw( std::invalid_argument, std::runtime_error )
{
	delete_aspell_config( fconfig );
	fconfig = new_aspell_config();
	try
	{
		setConfig();
	}
	catch( const std::invalid_argument& err )
	{
		throw err;
	}

	AspellCanHaveError* ret = new_aspell_speller( fconfig );
	if( aspell_error_number( ret ) != 0 )
	{
		delete_aspell_can_have_error( ret );
		throw std::runtime_error( "(Aspell::Speller::Suggest::Reset"
					  "Config): Error in creating "
					  "speller." );
	}
	else
	{
		// Following statement causes a crash, hence commented out
		//delete_aspell_speller( fspeller );
		fspeller = to_aspell_speller( ret );
		delete_aspell_config( fconfig );
		fconfig = aspell_speller_config( fspeller );
	}
}
void AspellAdapterImpl::createAspellInstance() {
    AspellCanHaveError * possibleError = new_aspell_speller(aspellConfig_);
    if (aspell_error_number(possibleError) != 0) {
        ERROR("ASPELL CREATION ERROR: " << aspell_error_message(possibleError));
        // @todo
        //        throw PsiException(aspell_error_message(possibleError));
    } else {
        aspellSpeller_ = to_aspell_speller(possibleError);
    }
}
Example #7
0
/*
 * Set up the global aspell state for the en_US dictionary.
 *
 * spell_checker is 0 after the call when aspell reported an error; the
 * error text is then printed instead of the "Unscrambled words:" header.
 */
void initAspell()
{
	spell_checker = 0;

	spell_config = new_aspell_config();
	aspell_config_replace(spell_config, "lang", "en_US");	/* set language */
	possible_err = new_aspell_speller(spell_config);

	if (aspell_error_number(possible_err) == 0) {
		printf("Unscrambled words:\n");
		spell_checker = to_aspell_speller(possible_err);
		return;
	}

	printf("%s ", aspell_error_message(possible_err));
}
Example #8
0
void init(void)
{
	char*				filename;
#ifdef _ASPELL_
	AspellCanHaveError*	possible_err;
#endif
	lineWrap=true;
	highLight=true;
	useUnderline=true;

	tabWidth=4;
	fontAndSize=strdup("mono 10");
	terminalCommand=strdup("xterm -e");
	windowWidth=800;
	windowHeight=400;
	windowX=-1;
	windowY=-1;
	wrapSearch=true;
	insensitiveSearch=true;
	replaceAll=false;
	showLiveSearch=true;
	gzipPages=false;

	asprintf(&filename,"%s/.ManPageEditor",getenv("HOME"));
	g_mkdir_with_parents(filename,493);
	g_free(filename);

	readConfig();

	tmpGzipPages=gzipPages;
	tmpHighLight=highLight;
	tmpLineWrap=lineWrap;
	tmpTabWidth=tabWidth;
	tmpUseUnderline=useUnderline;
	tmpShowLiveSearch=showLiveSearch;

#ifdef _ASPELL_
	aspellConfig=new_aspell_config();
	possible_err=new_aspell_speller(aspellConfig);

	if(aspell_error_number(possible_err)!= 0)
		puts(aspell_error_message(possible_err));
	else
		spellChecker=to_aspell_speller(possible_err);
#endif

}
bool AspellAdapterImpl::isDictionaryAvailable(const std::string & langCode) {
    AspellConfig * tempAspellConfig = new_aspell_config();
    aspell_config_replace(tempAspellConfig, "lang", langCode.c_str());
    AspellCanHaveError * possibleError = new_aspell_speller(tempAspellConfig);

    bool result = true;
    if (aspell_error_number(possibleError) != 0) {
        result = false;
    } else {
        AspellSpeller * tempSpeller = to_aspell_speller(possibleError);
        delete_aspell_speller(tempSpeller);
    }

    delete_aspell_config(tempAspellConfig);
    
    return result;
}
/**
 * Replaces the active spellers with one speller per requested language.
 *
 * Existing spellers are dropped through clearSpellers(); languages for which
 * aspell reports an error are logged with qDebug() and skipped.
 *
 * @param langs values used for the aspell "lang" option
 */
void ASpellChecker::setActiveLanguages(const QList<QString>& langs)
{
	clearSpellers();

	foreach(const QString& lang, langs)
	{
		// Work on a clone so config_ itself is not modified.
		AspellConfig* conf = aspell_config_clone(config_);
		aspell_config_replace(conf, "lang", lang.toUtf8().constData());
		AspellCanHaveError* ret = new_aspell_speller(conf);
		if (aspell_error_number(ret) == 0) {
			spellers_.append(to_aspell_speller(ret));
		}
		else {
			// NOTE(review): `ret` is not released on this path; it looks like
			// delete_aspell_can_have_error(ret) is missing - confirm.
			qDebug() << QString("Aspell error: %1").arg(aspell_error_message(ret));
		}
		// The per-language clone is released once the speller attempt is done.
		delete_aspell_config(conf);
	}
Example #11
0
void ASpellChecker::setActiveLanguages(const QSet<LanguageManager::LangId>& langs)
{
    clearSpellers();

    for(auto const &lang: langs)
    {
        AspellConfig* conf = aspell_config_clone(config_);
        aspell_config_replace(conf, "lang", LanguageManager::toString(lang)
                              .replace(QLatin1Char('-'),QLatin1Char('_')).toUtf8().constData());
        AspellCanHaveError* ret = new_aspell_speller(conf);
        if (aspell_error_number(ret) == 0) {
            spellers_.append(to_aspell_speller(ret));
        }
        else {
            qDebug() << QString("Aspell error: %1").arg(aspell_error_message(ret));
        }
        delete_aspell_config(conf);
    }
}
Example #12
0
/**
 * Creates the aspell configuration (UTF-8 encoding) and a speller for it.
 *
 * Under Q_WS_WIN the configuration, data and dictionary directories are
 * set explicitly. speller_ stays NULL when aspell reports an error; the
 * error is then emitted through qWarning().
 */
ASpellChecker::ASpellChecker()
{
	config_ = NULL;
	speller_ = NULL;
	config_ = new_aspell_config();
	aspell_config_replace(config_, "encoding", "utf-8");
#ifdef Q_WS_WIN
	aspell_config_replace(config_, "conf-dir", QDir::homeDirPath());
	aspell_config_replace(config_, "data-dir", QString("%1/aspell/data").arg(QCoreApplication::applicationDirPath()));
	aspell_config_replace(config_, "dict-dir", QString("%1/aspell/dict").arg(QCoreApplication::applicationDirPath()));
#endif
	AspellCanHaveError* ret = new_aspell_speller(config_);
	if (aspell_error_number(ret) == 0) {
		speller_ = to_aspell_speller(ret);
	}
	else {
		// Pass the aspell text as an argument, never as the format string:
		// a '%' inside the message would otherwise be read as a conversion.
		qWarning("Aspell error: %s", aspell_error_message(ret));
		// The failed result is not converted into a speller; release it.
		delete_aspell_can_have_error(ret);
	}
}
/**
 * Create a spell checker for a language
 * @param language the language code e.g. en_GB or it
 * @return a checker or NULL; diagnostics are written to stderr on failure
 */
static checker *checker_create( const char *language )
{
    int err = 0;
    AspellCanHaveError *possible_err;
    checker *c = calloc( 1, sizeof(checker) );
    if ( c == NULL )
    {
        fprintf(stderr,"checker: failed to create object\n");
        return NULL;
    }
    /* NOTE(review): at most 24 bytes are copied; whether the result is
       always NUL-terminated depends on the size of c->lang - confirm. */
    strncpy(c->lang,language,24);
    c->spell_config = new_aspell_config();
    if ( c->spell_config == NULL )
    {
        fprintf(stderr,"checker: failed to create speller\n");
        /* Nothing but the zeroed object exists yet. Returning it, as the
           code used to, handed the caller a checker without a speller
           although NULL is the documented failure result. */
        free( c );
        return NULL;
    }
    aspell_config_replace( c->spell_config, "lang", language );
    possible_err = new_aspell_speller(c->spell_config);
    c->spell_checker = 0;
    if (aspell_error_number(possible_err) != 0)
    {
        fprintf(stderr,"%s\n",aspell_error_message(possible_err));
        /* a failed result is not converted into a speller: release it */
        delete_aspell_can_have_error(possible_err);
        err = 1;
    }
    else
    {
        c->spell_checker = to_aspell_speller(possible_err);
        if ( c->spell_checker == NULL )
        {
            fprintf(stderr,"checker: failed to initialise speller\n");
            err = 1;
        }
    }
    if ( err )
    {
        checker_dispose( c );
        c = NULL;
    }
    return c;
}
Example #14
0
/**
 * create a neew spell-checker for the language 'lang'
 */
void * spellcheck_create(const char * lang)
{
    struct linkgrammar_aspell *aspell = NULL;
    size_t i = 0;
    AspellCanHaveError *spell_err = NULL;

    for (i = 0; i < sizeof(spellcheck_lang_mapping)/sizeof(char *); i += 2)
    {
        if (0 != strcmp(lang, spellcheck_lang_mapping[i])) continue;
        aspell = (struct linkgrammar_aspell *)malloc(sizeof(struct linkgrammar_aspell));
        if (!aspell) {
            prt_error("Error: out of memory. Aspell not used.\n");
            aspell = NULL;
            break;
        }
        aspell->config = NULL;
        aspell->speller = NULL;
        aspell->config = new_aspell_config();
        if (aspell_config_replace(aspell->config, ASPELL_LANG_KEY,
                                  spellcheck_lang_mapping[i]) == 0) {
            prt_error("Error: failed to set language in aspell: %s\n", lang);
            delete_aspell_config(aspell->config);
            free(aspell);
            aspell = NULL;
            break;
        }
        spell_err = new_aspell_speller(aspell->config);
        if (aspell_error_number(spell_err) != 0) {
            prt_error("Error: Aspell: %s\n", aspell_error_message(spell_err));
            delete_aspell_can_have_error(spell_err);
            delete_aspell_config(aspell->config);
            free(aspell);
            aspell = NULL;
            break;
        }
        aspell->speller = to_aspell_speller(spell_err);
        break;
    }
    return aspell;
}
Example #15
0
/*
 * Switch `spell` to a speller for `lang`.
 *
 * When `lang` is NULL the LANG environment variable is consulted; an empty
 * value or the "C" locale means that no language tag is set on the aspell
 * configuration. On failure `error` is set (when USING_ASPELL or
 * USING_PSPELL is defined), the current speller is kept and FALSE is
 * returned. On success the previous speller is deleted and TRUE is returned.
 */
static gboolean
gtkspell_set_language_internal(GtkSpell *spell, const gchar *lang, GError **error) {
	AspellConfig *config;
	AspellCanHaveError *err;

	if (lang == NULL) {
		lang = g_getenv("LANG");
		if (lang) {
			/* Only "C" itself or "C.<codeset>" denotes the C locale.
			 * Comparing just the first character, as before, also threw
			 * away every language whose code starts with 'c', such as
			 * cs_CZ or ca_ES. */
			if ((lang[0] == 'C' || lang[0] == 'c') &&
			    (lang[1] == '\0' || lang[1] == '.'))
				lang = NULL;
			else if (lang[0] == 0)
				lang = NULL;
		}
	}

	config = new_aspell_config();
	if (lang)
		aspell_config_replace(config, "language-tag", lang);
	aspell_config_replace(config, "encoding", "utf-8");
	err = new_aspell_speller(config);
	delete_aspell_config(config);

	if (aspell_error_number(err) != 0) {
#ifdef USING_ASPELL
		g_set_error(error, GTKSPELL_ERROR, GTKSPELL_ERROR_BACKEND,
				"aspell: %s", aspell_error_message(err));
#elif defined USING_PSPELL
		g_set_error(error, GTKSPELL_ERROR, GTKSPELL_ERROR_BACKEND,
				"pspell: %s", aspell_error_message(err));
#endif
		/* g_set_error() has formatted the message already, so the
		 * failed result can be released instead of leaked. */
		delete_aspell_can_have_error(err);
		return FALSE;
	} 
	if (spell->speller)
		delete_aspell_speller(spell->speller);
	spell->speller = to_aspell_speller(err);

	return TRUE;
}
Example #16
0
void SpellChecker::setLanguage(const QString &lang)
{
    if (lang.isEmpty()) return;
    delete_aspell_speller(spell_checker1);
    delete_aspell_speller(spell_checker2);
    bad_language.clear();

    m_lang2 = "en";
    m_lang1 = m_map->value(lang, QString("en"));
    if (lang.contains("+")) {
        QStringList sl = lang.split("+");
        m_lang1 = m_map->value(sl[0], QString("en"));
        m_lang2 = m_map->value(sl[1], QString("en"));
        if ((m_lang1 == "deu")||(lang == "ger")) {
            m_lang1 = "de_DE";
        }
        if ((m_lang2 == "deu")||(lang == "ger")) {
            m_lang2 = "de_DE";
        }
    }
    if (lang == "rus_fra") {
        m_lang1 = "ru";
        m_lang2 = "fr";
    } else if (lang == "rus_ger") {
        m_lang1 = "ru";
        m_lang2 = "de";
    } else if (lang == "rus_spa") {
        m_lang1 = "ru";
        m_lang2 = "es";
    }
    if ((lang == "deu")||(lang == "ger")) {
        m_lang1 = "de_DE";
        m_lang2 = "de_AT";

    }
    if (lang == "ruseng") {
        m_lang1 = "ru";
        m_lang2 = "en";

    }
    aspell_config_replace(spell_config1, "lang", m_lang1.toAscii());
    aspell_config_replace(spell_config2, "lang", m_lang2.toAscii());
    AspellCanHaveError *possible_err = new_aspell_speller(spell_config1);
    spell_checker1 = 0;
    if (aspell_error_number(possible_err) == 0)
        spell_checker1 = to_aspell_speller(possible_err);
    else
        delete_aspell_can_have_error(possible_err);
    possible_err = new_aspell_speller(spell_config2);
    spell_checker2 = 0;
    if (aspell_error_number(possible_err) == 0)
        spell_checker2 = to_aspell_speller(possible_err);
    else
        delete_aspell_can_have_error(possible_err);

    // Check absent dictionary
    if (spell_checker1 == 0)
        bad_language = m_lang1;
    if (spell_checker2 == 0)
        bad_language = m_lang2;

}
/**
 * Create a userdata object
 * @param language the language e.g. "en_GB", used as the aspell "lang" option
 * @param barefile passed on to open_dest_files
 * @param rules stored in the object's rules field
 * @param fmt the format object, passed on to open_dest_files
 * @param hhe stored in the object's hhe field when not NULL
 * @return a complete userdata object or NULL
 */
userdata *userdata_create( const char *language, char *barefile, recipe *rules, 
    format *fmt, hh_exceptions *hhe )
{
    int err = 0;
    userdata *u = calloc( 1, sizeof(userdata) );
    if ( u != NULL )
    {
        u->rules = rules;
        if ( hhe != NULL )
            u->hhe = hhe;
        u->spell_config = new_aspell_config();
        if ( u->spell_config != NULL )
        {
            aspell_config_replace( u->spell_config, "lang", language );
            AspellCanHaveError *possible_err 
                = new_aspell_speller(u->spell_config);
            u->spell_checker = 0;
            if (aspell_error_number(possible_err) != 0)
            {
                fprintf(stderr,"%s\n",aspell_error_message(possible_err));
                /* a failed result is not converted into a speller:
                   release it instead of leaking it */
                delete_aspell_can_have_error(possible_err);
                err = 1;
            }
            else
            {
                u->spell_checker = to_aspell_speller(possible_err);
                if ( u->spell_checker == NULL )
                {
                    fprintf(stderr,"userdata: failed to initialise speller\n");
                    err = 1;
                }
            }
            /* The remaining members are set up even after an earlier
               failure; userdata_dispose below releases whatever exists. */
            u->range_stack = stack_create();
            if ( u->range_stack == NULL )
            {
                err = 1;
                /* newline added so consecutive messages do not run together */
                fprintf(stderr, 
                    "stripper: failed to allocate store for range stack\n" );
            }
            u->ignoring = stack_create();
            if ( u->ignoring == NULL )
            {
                err = 1;
                fprintf(stderr, 
                    "stripper: failed to allocate store for ignore stack\n" );
            }
            if ( !open_dest_files(u,barefile,fmt) )
            {
                err = 1;
                fprintf(stderr,"stripper: couldn't open dest files\n");
            }
        }
        else
        {
            fprintf(stderr, "userdata: failed to initialise speller\n");
            err = 1;
        }
    }
    else
        fprintf(stderr, "userdata:failed to allocate object\n");
    if ( err )
    {
        userdata_dispose( u );
        u = NULL;
    }
    return u;
}
Example #18
0
/*
 * Look the typed word up in the aspell dictionary of every configured
 * language.
 *
 * For each language the characters of `word` are mapped to key codes and
 * back to that language's symbols; the resulting string is then checked
 * with aspell.
 *
 * word - characters to examine
 * len  - number of characters in `word`; words shorter than 2 are skipped
 *
 * Returns the index of the first language whose dictionary accepts the
 * word, or NO_LANGUAGE.
 */
static int get_aspell_hits(const char *word, int len)
{
	if (len < 2) 
	{
		log_message(DEBUG, "   [-]Skip aspell checking (word is very short)");
		return NO_LANGUAGE;	
	}

	int found = 0;
	int found_lang = NO_LANGUAGE;
	AspellConfig *spell_config = new_aspell_config();
	
	for (int lang = 0; lang < xconfig->total_languages; lang++)
	{
		char *lang_word = (char *) malloc(1 * sizeof(char));
		if (lang_word == NULL)
			continue;
		lang_word[0] = NULLSYM;

		for (int i = 0; i < len; i++)
		{
			KeyCode kc;
			int modifier;
			main_window->xkeymap->char_to_keycode(main_window->xkeymap, word[i], &kc, &modifier);

			char *symbol = keycode_to_symbol(kc, lang, modifier);
			if (symbol == NULL)
				continue;

			/* Grow through a temporary so the old buffer is not lost when
			   realloc fails; the symbol is released on every path. */
			char *grown = (char *) realloc(lang_word, (strlen(lang_word) + strlen(symbol) + 1) * sizeof(char));
			if (grown == NULL)
			{
				free(symbol);
				free(lang_word);
				lang_word = NULL;
				break;
			}
			lang_word = grown;
			strcat(lang_word, symbol);

			free(symbol);
		}
		
		/* Out of memory while building the word: skip this language. */
		if (lang_word == NULL)
			continue;

		aspell_config_replace(spell_config, "lang", xconfig->languages[lang].dir);
		AspellCanHaveError *possible_err = new_aspell_speller(spell_config);

		if (aspell_error_number(possible_err) != 0)
		{
			log_message(DEBUG, "   [!]Error aspell checking for %s aspell dictionary", xconfig->get_lang_name(xconfig, lang));
			/* a failed result is not converted into a speller: release it */
			delete_aspell_can_have_error(possible_err);
			free(lang_word);
			continue;
		}

		AspellSpeller *spell_checker = to_aspell_speller(possible_err);
		int correct = aspell_speller_check(spell_checker, lang_word, strlen(lang_word));
		delete_aspell_speller(spell_checker);
		free(lang_word);

		if (correct)
		{
			log_message(DEBUG, "   [+]Found this word in %s aspell dictionary", xconfig->get_lang_name(xconfig, lang));
			found = 1;
			found_lang = lang;
			break;
		}
	}

	/* The configuration used to leak on every call, hit or miss. */
	delete_aspell_config(spell_config);

	if (!found)
		log_message(DEBUG, "   [-]This word has no hits for all aspell dictionaries");
	return found_lang;
}
Example #19
0
/* Create a new speller *******************************************************/
/*
 * Accepted call forms:
 *   - no arguments: default aspell configuration;
 *   - two strings: a single (key, value) configuration pair;
 *   - any number of (key, value) string tuples.
 *
 * Returns a new aspell_AspellObject, or NULL with a Python exception set.
 */
static PyObject* new_speller(PyObject* self, PyObject* args) {
	aspell_AspellObject* newobj;

	AspellSpeller* speller = 0;
	AspellConfig*  config;
	AspellCanHaveError* possible_error;

	int i;
	int n; /* arg count */
	char *key, *value;

	config = new_aspell_config();
	if (config == NULL) {
		PyErr_SetString(_AspellModuleException, "can't create config");
		return NULL;
	}

	/* check constructor arguments */
	n = PyTuple_Size(args);
	switch (n) {
		case 0: /* no arguments passed */
			break;

		case 2: /* constructor is called with single pair: key & value */
			if (PyArg_ParseTuple(args, "ss", &key, &value)) {
				if (!aspell_config_replace(config, key, value)) {
					PyErr_SetString(_AspellConfigException, aspell_config_error_message(config));
					goto arg_error;
				}
				break;
			}
			PyErr_Clear();
			/* fall through: two arguments that are not plain strings are
			   handled as two (key, value) tuples */
		default: /* list of tuples key&value */
			for (i=0; i<n; i++) {
				if (!PyArg_ParseTuple(PyTuple_GetItem(args, i), "ss", &key, &value)) {
					PyErr_Format(PyExc_TypeError, "argument %d: tuple of two strings (key, value) expeced", i);
					goto arg_error;
				}
				if (!aspell_config_replace(config, key, value)) {
					PyErr_SetString(_AspellConfigException, aspell_config_error_message(config));
					goto arg_error;
				}
			}
			/* `args` is a borrowed reference owned by the caller, so it
			   must not be released here; the former Py_DECREF(args)
			   dropped a reference this function never held. */
			break;
	}

	/* try to create a new speller */
	possible_error = new_aspell_speller(config);
	delete_aspell_config(config);

	if (aspell_error_number(possible_error) == 0)
		/* save a speller */
		speller = to_aspell_speller(possible_error);
	else {
		/* or raise an exception */
		PyErr_SetString(_AspellSpellerException, aspell_error_message(possible_error));
		delete_aspell_can_have_error(possible_error);
		return NULL;
	}

	/* create a new py-object */
	newobj = (aspell_AspellObject*)PyObject_New(aspell_AspellObject, &aspell_AspellType);
	if (newobj == NULL) {
		/* allocation failed (exception already set): do not leak the
		   speller and do not dereference the NULL object */
		delete_aspell_speller(speller);
		return NULL;
	}
	newobj->speller = speller;

	return (PyObject*)newobj;

/* argument error: before return NULL we need to
   delete speller's config we've created */
arg_error:
	delete_aspell_config(config);
	return NULL;
}
Example #20
0
// runs in O(t^2) time where t is the number of tokens in the input corpus
// We consider maxK to be fairly constant
//
// Builds the model from _corpora in these steps:
//   1. read "emoticons.txt" and "emojis.txt" from the working directory;
//   2. split every corpus into tokens, mapping each word to a canonical form
//      (misspelled words are replaced by aspell's first suggestion);
//   3. compile the per-word form/terminator histograms and the token store;
//   4. count, for every kgram shorter than maxK, the tokens that follow it;
//   5. turn those counts into cumulative distributions stored in _stats.
// Progress is printed to std::cout. The process exits with status 1 when the
// aspell speller cannot be created. Sets _compiled on completion.
void rawr::compile(int maxK)
{
  _maxK = maxK;

  std::vector<std::vector<token_id>> tokens;
  std::set<std::string> thashtags;
  std::set<std::string> fv_emoticons;
  
  std::ifstream fvefile("emoticons.txt");
  if (fvefile)
  {
    std::string line;
    while (getline(fvefile, line))
    {
      fv_emoticons.insert(line);
      emoticons.forms.add(line);
    }
  }
  
  fvefile.close();
  
  std::map<std::string, std::string> canonical_form;
  
  AspellConfig* spell_config = new_aspell_config();
  AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
  if (aspell_error_number(possible_err) != 0)
  {
    std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
    exit(1);
  }
  
  AspellSpeller* spell_checker = to_aspell_speller(possible_err);
  
  std::cout << "Reading emojis..." << std::endl;
  prefix_search emojis;
  std::ifstream emoji_file("emojis.txt");
  if (emoji_file)
  {
    // Drive the loop with the result of getline: testing eof() first
    // processed one extra, empty line after the last successful read.
    std::string rawmojis;
    while (getline(emoji_file, rawmojis))
    {
      // back() on an empty string is undefined, so check for content first.
      if (!rawmojis.empty() && rawmojis.back() == '\r')
      {
        rawmojis.pop_back();
      }
    
      emojis.add(rawmojis);
    }
    
    emoji_file.close();
  }

  std::cout << "Tokenizing corpus...   0%" << std::flush;
  int len = 0;
  for (const auto& c : _corpora)
  {
    len += c.length();
  }
  
  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
  for (int i = 0; i < _corpora.size(); i++)
  {
    size_t start = 0;
    int end = 0;
    std::vector<token_id> tkcor;

    while (end != std::string::npos)
    {
      // len is 0 when every corpus is empty; keep the previous percentage
      // instead of dividing by zero.
      perprime = (len > 0) ? (startper + end) * 100 / len : per;
      if (perprime != per)
      {
        per = perprime;
      
        std::cout << "\b\b\b\b" << std::right;
        std::cout.width(3);
        std::cout << per << "%" << std::flush;
      }
    
      end = _corpora[i].find_first_of(" \n", start);

      bool emoji = false;
      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
      std::string t = "";
    
      if (te.compare("") && te.compare(".") && te.compare(" "))
      {
        if (te.back() == ' ')
        {
          te.pop_back();
        }
        
        // Extract strings of emojis into their own tokens even if they're not space delimited
        int m = emojis.match(te);
        emoji = m > 0;
        if (m == 0) m = 1;
        t = te.substr(0,m);
        te = te.substr(m);
      
        while (!te.empty())
        {
          m = emojis.match(te);
          if (emoji == (m > 0))
          {
            if (m == 0) m = 1;
            t += te.substr(0,m);
            te = te.substr(m);
          } else {
            // The emoji/non-emoji run ends here: move `end` back so the
            // rest of this chunk is scanned as the next token.
            end = start + t.length() - 1;
            break;
          }
        }
      
        std::string tc(t);
        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);

        // Strip leading and trailing delimiters/punctuation to obtain the
        // canonical (lower-case) form.
        int pst = tc.find_first_not_of("\"([*");
        int dst = tc.find_last_not_of("\")]*.,?!\n;:");
        std::string canonical("");
        if ((pst != std::string::npos) && (dst != std::string::npos))
        {
          canonical = std::string(tc, pst, dst - pst + 1);
        }
      
        word& w = ([&] () -> word& {
          // Hashtag freevar
          if (canonical[0] == '#')
          {
            thashtags.insert(canonical);
          
            return hashtags;
          }
        
          // Emoticon freevar
          if (emoji)
          {
            emoticons.forms.add(canonical);
          
            return emoticons;
          }
        
          if ((pst != std::string::npos) && (dst != std::string::npos))
          {
            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
            if (fv_emoticons.count(emoticon_canon) == 1)
            {
              emoticons.forms.add(emoticon_canon);
          
              return emoticons;
            }
          }
        
          // Basically any other word
          if (canonical_form.count(canonical) == 0)
          {
            if (
              // Legacy freevars should be distinct from tokens containing similar words
              (canonical.find("$name$") != std::string::npos)
              // Words with no letters will be mangled by the spell checker
              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
              )
            {
              canonical_form[canonical] = canonical;
              words.emplace(canonical, canonical);
            } else {
              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
              if (correct)
              {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
              } else {
                // Misspelled: fold the word into aspell's first suggestion,
                // if there is one.
                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
                const char* replacement = aspell_string_enumeration_next(elements);
                if (replacement != NULL)
                {
                  std::string sugrep(replacement);
                  canonical_form[canonical] = sugrep;
          
                  if (words.count(sugrep) == 0)
                  {
                    words.emplace(sugrep, sugrep);
                  }
                } else {
                  words.emplace(canonical, canonical);
                  canonical_form[canonical] = canonical;
                }
          
                delete_aspell_string_enumeration(elements);
              }
            }
          }
        
          word& tw = words.at(canonical_form.at(canonical));
          tw.forms.add(canonical);
        
          return tw;
        })();
      
        token tk(w);
        tk.raw = t;
      
        // Count the opening delimiters at the front of the raw token.
        for (char c : t)
        {
          if (c == '*')
          {
            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
          } else if (c == '[')
          {
            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
          } else if (c == '(')
          {
            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
          } else if (c == '"')
          {
            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
          } else {
            break;
          }
        }
      
        // Everything after the last "content" character is trailing
        // punctuation and closing delimiters.
        int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
        if (backtrack != t.length())
        {
          std::string ending = t.substr(backtrack);
          std::string suffix;
          bool newline = false;
          bool terminating = false;
        
          for (char c : ending)
          {
            if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
            {
              suffix += c;
              terminating = true;
            
              continue;
            } else if (c == '\n')
            {
              newline = true;
              terminating = true;
              
              continue;
            }
          
            // Only ] ) * " can reach this point: `ending` contains nothing
            // but the characters searched for above, and the others were
            // handled by the two branches before.
            parentype pt = ([&] {
              switch (c)
              {
                case ']': return parentype::square_bracket;
                case ')': return parentype::paren;
                case '*': return parentype::asterisk;
                case '"': return parentype::quote;
              }
            })();
          
            // A closer matching an opener on the same token makes the
            // token delimited on both sides.
            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
              tk.delimiters[{pt, doublestatus::opening}]--;
              tk.delimiters[{pt, doublestatus::both}]++;
            } else {
              tk.delimiters[{pt, doublestatus::closing}]++;
            }
          }
        
          if (terminating)
          {
            if ((suffix == ",") && (!newline))
            {
              tk.suffix = suffixtype::comma;
            } else {
              tk.suffix = suffixtype::terminating;
              
              if (!newline)
              {
                w.terms.add({suffix, false});
              } else {
                w.terms.add({".", false});
              }
            }
          }
        }

        tkcor.push_back(_tokenstore.add(tk));
      }

      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
    }
    
    tokens.push_back(tkcor);
    
    startper += _corpora[i].length();
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
  
  delete_aspell_speller(spell_checker);
  delete_aspell_config(spell_config);
  
  std::cout << canonical_form.size() << " distinct forms" << std::endl;
  std::cout << words.size() << " distinct words" << std::endl;
  
  // Time to condense the distribution stuff for the words
  std::cout << "Compiling token histograms..." << std::endl;
  for (auto& it : words)
  {
    it.second.forms.compile();
    it.second.terms.compile();
  }
  
  // Hashtag freevar is not frequency distributed
  for (auto& it : thashtags)
  {
    hashtags.forms.add(it);
  }
  
  hashtags.forms.compile();
  hashtags.terms.compile();
  
  // Compile other freevars
  emoticons.forms.compile();
  emoticons.terms.compile();

  // Compile the interned tokens.
  _tokenstore.compile();

  // kgram distribution
  std::cout << "Creating markov chain...   0%" << std::flush;
  std::map<kgram, std::map<token_id, token_data> > tstats;

  len = 0;
  for (const auto& c : tokens)
  {
    len += (maxK-1) * c.size();
  }
  
  startper = 0;
  per = 0;
  perprime = 0;
  int corpid = 0;
  // Iterate by reference: each corpus is only read, so copying the whole
  // token vector per iteration is unnecessary.
  for (const auto& corpus : tokens)
  {
    for (int k=0; k<maxK && k<corpus.size(); k++)
    {
      // The zero'th token should be a terminator.
      token_id fid = corpus[k];
      const token& f = _tokenstore.get(fid);

      kgram term_prefix(corpus.begin(), corpus.begin()+k);
      term_prefix.push_front(wildcardQuery);

      if (tstats[term_prefix].count(fid) == 0)
      {
        tstats[term_prefix].emplace(fid, fid);
      }

      token_data& td2 = tstats[term_prefix].at(fid);
      td2.all++;
      td2.corpora.insert(corpid);

      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
      {
        td2.uppercase++;
      } else if (isupper(f.raw[0]))
      {
        td2.titlecase++;
      }
    }

    for (int k=1; k<maxK && k<corpus.size(); k++)
    {
      for (int i=0; i<(corpus.size() - k); i++)
      {
        perprime = (startper+i) * 100 / len;
        if (perprime != per)
        {
          per = perprime;
      
          std::cout << "\b\b\b\b" << std::right;
          std::cout.width(3);
          std::cout << per << "%" << std::flush;
        }
      
        kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
        token_id fid = corpus[i+k];
        const token& f = _tokenstore.get(fid);

        if (tstats[prefix].count(fid) == 0)
        {
          tstats[prefix].emplace(fid, fid);
        }

        token_data& td = tstats[prefix].at(fid);
        td.all++;
        td.corpora.insert(corpid);

        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
        {
          td.uppercase++;
        } else if (isupper(f.raw[0]))
        {
          td.titlecase++;
        }

        // A prefix that starts with a terminating token is also recorded
        // with that first token replaced by the wildcard query.
        const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);

          if (tstats[term_prefix].count(fid) == 0)
          {
            tstats[term_prefix].emplace(fid, fid);
          }

          token_data& td2 = tstats[term_prefix].at(fid);
          td2.all++;
          td2.corpora.insert(corpid);

          if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
          {
            td2.uppercase++;
          } else if (isupper(f.raw[0]))
          {
            td2.titlecase++;
          }
        }
      }
      
      startper += corpus.size();
    }
    
    corpid++;
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;

  // Condense the kgram distribution
  std::cout << "Compiling kgram distributions...   0%";
  len = tstats.size();
  per = 0;
  perprime = 0;
  int indicator = 0;
  for (auto& it : tstats)
  {
    indicator++;
    perprime = indicator * 100 / len;
    if (per != perprime)
    {
      per = perprime;
    
      std::cout << "\b\b\b\b" << std::right;
      std::cout.width(3);
      std::cout << per << "%" << std::flush;
    }
    
    kgram klist = it.first;
    auto& probtable = it.second;
    auto& distribution = _stats[klist];
    int max = 0;
    
    // Keys are running totals of the counts, so each entry covers the
    // interval between the previous key and its own key.
    for (auto& kt : probtable)
    {
      max += kt.second.all;
      
      distribution.emplace(max, kt.second);
    }
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
  
  _compiled = true;
}