Beispiel #1
0
/*
  Build the vocabulary from the terms exposed by the search engine's
  b-tree iterator.

  We are not interested in words that contain non-alpha chars, so every
  term for which contains_non_alpha() is true is skipped.

  With USE_STR_LIST defined the accepted terms are duplicated into the
  `strings` array (`string_count` entries); otherwise each accepted term is
  added to `trie` together with its collection frequency.

  search_engine: engine whose term dictionary is enumerated.
 */
Vocab::Vocab(ANT_search_engine *search_engine) {
    char *term;
    ANT_btree_iterator iterator(search_engine);

    this->strings = NULL;
    this->string_count = 0;

#ifdef USE_STR_LIST
    int i;

    /*
      Pass 1: count the accepted terms.  The walk must visit exactly the same
      terms as pass 2 (starting with the one returned by first()), otherwise
      the array below is sized differently from what is written into it.
     */
    for (term = iterator.first(NULL); term != NULL; term = iterator.next())
        if (!contains_non_alpha(term))
            string_count++;

    printf("str_count:%d\n", string_count);

    strings = (char **) malloc(sizeof(strings[0]) * string_count);
    if (strings == NULL) {
        /* Nothing can be stored (allocation failed or there were no terms). */
        if (string_count != 0)
            fprintf(stderr, "Vocab: out of memory allocating string list\n");
        string_count = 0;
        return;
    }

    /* Pass 2: copy the accepted terms, echoing every 100th one as progress. */
    for (i = 0, term = iterator.first(NULL); term != NULL; term = iterator.next()) {
        if (!contains_non_alpha(term)) {
            if (i % 100 == 0)
                printf("%s\n", term);
            strings[i] = strdup(term);
            i++;
        }
    }
#else
    trie = new trie_node();

    /* Start from the term returned by first() so the first term is not lost. */
    for (term = iterator.first(NULL); term != NULL; term = iterator.next())
        if (!contains_non_alpha(term)) {
            ANT_search_engine_btree_leaf td;
            search_engine->get_postings_details(term, &td);
            trie->add(term, td.collection_frequency);
        }
#endif
}
Beispiel #2
0
Phoneme *
dictionary_get_decomp (const char *phrase_in)
{
    char *phrase;
    char **phrasev = NULL;
    int i;
    GSList *decomps = NULL;
    Phoneme *decomp = NULL;

    if (! (phrase_in && *phrase_in))
        return NULL;

    phrase = g_strdup (phrase_in);
    remove_excess_whitespace (phrase);
    phrasev = g_strsplit (phrase, " ", -1);

    for (i = 0; phrasev[i]; ++i) {
        DictionaryWord *dword;
        Phoneme *this_decomp = NULL;

        dword = dictionary_get_word (phrasev[i]);
        if (dword != NULL) {
            this_decomp = dword->decomp;
        } else if (contains_non_alpha (phrasev[i])) {
            non_alpha_to_whitespace (phrasev[i]);
            this_decomp = dictionary_get_decomp (phrasev[i]);
        }
    
        if (this_decomp == NULL)
            goto finished;
    
        decomps = g_slist_prepend (decomps, this_decomp);
    }

    if (g_slist_length (decomps) == 1) {

        decomp = (Phoneme *) decomps->data;

    } else if (g_slist_length (decomps) > 1) {
        int len = 0, i = 0, j = 0;
        GSList *iter;

        for (iter = decomps; iter != NULL; iter = iter->next)
            len += phoneme_decomp_length (iter->data);

        decomp = g_new0 (Phoneme, len+1);
        i = 0;

        for (iter = decomps; iter != NULL; iter = iter->next) {
            Phoneme *d = iter->data;
            for (j = 0; d[j]; ++j) {
                decomp[i] = d[j];
                ++i;
            }
        }
    }

 finished:
    g_free (phrase);
    g_strfreev (phrasev);
    g_slist_free (decomps);

    return decomp;
}