/*
	VOCAB::VOCAB()
	--------------
	Build the vocabulary from the terms produced by an ANT_btree_iterator over
	search_engine.  We are not interested in words that contain non-alpha chars,
	so every term for which contains_non_alpha() is true is skipped.

	With USE_STR_LIST defined, the remaining terms are strdup()'d into the
	malloc()'d array `strings`, and `string_count` holds the number of entries.
	Otherwise the remaining terms are added to a newly allocated trie together
	with the collection frequency reported by get_postings_details().
*/
Vocab::Vocab(ANT_search_engine *search_engine)
{
	char *term;
	ANT_btree_iterator iterator(search_engine);

	this->strings = NULL;
	this->string_count = 0;

#ifdef USE_STR_LIST
	int i;

	/*
		Pass 1: count the purely alphabetic terms.  This has to visit exactly
		the same terms as the fill pass below -- including the one handed back
		by first() -- otherwise the array is allocated one slot short and the
		fill pass writes past its end.
	*/
	for (term = iterator.first(NULL); term != NULL; term = iterator.next())
		if (!contains_non_alpha(term))
			string_count++;

	printf("str_count:%d\n", string_count);

	strings = (char **) malloc(sizeof(strings[0]) * string_count);
	if (strings == NULL)
		{
		/* Nothing to store into (allocation failed or there are no terms). */
		string_count = 0;
		return;
		}

	/* Pass 2: copy the terms, echoing every 100th stored term as progress output. */
	for (i = 0, term = iterator.first(NULL); term != NULL; term = iterator.next())
		{
		if (!contains_non_alpha(term))
			{
			if (i % 100 == 0)
				printf("%s\n", term);
			strings[i] = strdup(term);
			i++;
			}
		}
#else
	trie = new trie_node();

	/*
		NOTE(review): the term returned by first() is discarded here, so the
		first term of the iteration never reaches the trie -- confirm whether
		that is intended.
	*/
	for (iterator.first(NULL); (term = iterator.next()) != NULL;)
		if (!contains_non_alpha(term))
			{
			ANT_search_engine_btree_leaf td;

			search_engine->get_postings_details(term, &td);
			trie->add(term, td.collection_frequency);
			}
#endif
}
Phoneme * dictionary_get_decomp (const char *phrase_in) { char *phrase; char **phrasev = NULL; int i; GSList *decomps = NULL; Phoneme *decomp = NULL; if (! (phrase_in && *phrase_in)) return NULL; phrase = g_strdup (phrase_in); remove_excess_whitespace (phrase); phrasev = g_strsplit (phrase, " ", -1); for (i = 0; phrasev[i]; ++i) { DictionaryWord *dword; Phoneme *this_decomp = NULL; dword = dictionary_get_word (phrasev[i]); if (dword != NULL) { this_decomp = dword->decomp; } else if (contains_non_alpha (phrasev[i])) { non_alpha_to_whitespace (phrasev[i]); this_decomp = dictionary_get_decomp (phrasev[i]); } if (this_decomp == NULL) goto finished; decomps = g_slist_prepend (decomps, this_decomp); } if (g_slist_length (decomps) == 1) { decomp = (Phoneme *) decomps->data; } else if (g_slist_length (decomps) > 1) { int len = 0, i = 0, j = 0; GSList *iter; for (iter = decomps; iter != NULL; iter = iter->next) len += phoneme_decomp_length (iter->data); decomp = g_new0 (Phoneme, len+1); i = 0; for (iter = decomps; iter != NULL; iter = iter->next) { Phoneme *d = iter->data; for (j = 0; d[j]; ++j) { decomp[i] = d[j]; ++i; } } } finished: g_free (phrase); g_strfreev (phrasev); g_slist_free (decomps); return decomp; }