END_TEST

/*
 * Initialises a handle from the language id "ml" and checks the two paths
 * the handle reports afterwards:
 *   - the scheme file must be "<symbols dir>/ml.vst"
 *   - the suggestions file must be
 *     "$HOME/.local/share/varnam/suggestions/ml.vst.learnings"
 */
START_TEST (initialize_using_lang_code)
{
    char *init_error = NULL;
    int status;
    varnam *handle;
    strbuf *tmp;

    status = varnam_init_from_id ("ml", &handle, &init_error);
    if (init_error != NULL) {
        /* Surface the library's message before the assertion below fires */
        printf ("init_from_lang failed: %s\n", init_error);
    }
    assert_success (status);

    /* Scheme file is expected inside the symbols directory */
    tmp = strbuf_init (20);
    strbuf_addf (tmp, "%s/ml.vst", strbuf_to_s (varnam_get_symbols_dir()));
    ck_assert_str_eq (strbuf_to_s(tmp), varnam_get_scheme_file (handle));
    strbuf_destroy (tmp);

    /* Suggestions file is expected below the user's home directory */
    tmp = strbuf_init (10);
    strbuf_addf (tmp, "%s/.local/share/varnam/suggestions/ml.vst.learnings", getenv ("HOME"));
    ck_assert_str_eq (strbuf_to_s (tmp), varnam_get_suggestions_file (handle));
    strbuf_destroy (tmp);

    varnam_destroy (handle);
}
/*
 * Runs a query that is expected to yield at least one row and returns the
 * first column of the first row as an int.
 *
 * db  - open sqlite connection
 * sql - query text
 *
 * The running test is aborted through ck_abort_msg when the statement can't
 * be prepared or when stepping it does not produce a row.
 */
int
execute_query_int (sqlite3* db, const char* sql)
{
    int rc, result;
    sqlite3_stmt* stmt = NULL;

    rc = sqlite3_prepare_v2 (db, sql, -1, &stmt, NULL);
    if (rc != SQLITE_OK) {
        /* The query text is passed as an argument, never as the format
         * string, so a '%' inside the SQL can't be misinterpreted. */
        ck_abort_msg ("Failed to prepare query: %s. Return code was: %d\n", sql, rc);
    }

    rc = sqlite3_step (stmt);
    if (rc != SQLITE_ROW) {
        /* Release the statement before aborting so it is not left dangling
         * on the connection. */
        sqlite3_finalize (stmt);
        ck_abort_msg ("Failed to execute query: %s. Return code was: %d\n", sql, rc);
    }

    result = sqlite3_column_int (stmt, 0);
    sqlite3_finalize (stmt);

    return result;
}
/*
 * Learns a single word with the given confidence.
 *
 * The word is sanitized, tokenized, the token set is reduced and the
 * remaining possibilities are persisted.
 *
 * Returns
 *   VARNAM_ARGS_ERROR - handle or word is NULL
 *   VARNAM_ERROR      - the words store is not available, the word is not
 *                       UTF-8 (last error is set in that case), or
 *                       can_learn_from_tokens() rejected the tokens
 *   otherwise whatever vst_tokenize() (on failure) or
 *   vwt_persist_possibilities() returned.
 */
static int
varnam_learn_internal(varnam *handle, const char *word, int confidence)
{
    int status;
    varray *tokens;
    strbuf *sanitized_word;

    if (handle == NULL)
        return VARNAM_ARGS_ERROR;

    if (word == NULL)
        return VARNAM_ARGS_ERROR;

    if (!is_words_store_available(handle))
        return VARNAM_ERROR;

    if (!is_utf8 (word)) {
        set_last_error (handle, "Incorrect encoding. Expected UTF-8 string");
        return VARNAM_ERROR;
    }

    tokens = get_pooled_array (handle);

    /* Leading and trailing special characters are stripped from the word */
    sanitized_word = sanitize_word (handle, word);

    status = vst_tokenize (handle,
                           strbuf_to_s (sanitized_word),
                           VARNAM_TOKENIZER_VALUE,
                           VARNAM_MATCH_ALL,
                           tokens);
    if (status != 0)
        return status;

#ifdef _VARNAM_VERBOSE
    printf ("%s\n", "Tokens before reducing noice");
    print_tokens_array (tokens);
#endif

    /* The tokenizer may hand back more than can be handled; trim the noise
     * so that only the most relevant combinations are learned */
    reduce_noise_in_tokens (tokens);

#ifdef _VARNAM_VERBOSE
    printf ("%s\n", "Tokens after reducing noice");
    print_tokens_array (tokens);
#endif

    if (can_learn_from_tokens (handle, tokens, strbuf_to_s (sanitized_word))) {
        return vwt_persist_possibilities (handle,
                                          tokens,
                                          strbuf_to_s (sanitized_word),
                                          confidence);
    }

    return VARNAM_ERROR;
}
/*
 * Returns a pooled copy of word with the leading and trailing runs of
 * special characters removed, language specific sanitization applied, and
 * a leading/trailing ZWNJ and a leading ZWJ stripped.
 *
 * Both strings used here come from get_pooled_string(), so the caller does
 * not destroy the returned buffer itself.
 */
static strbuf*
sanitize_word (varnam *handle, const char *word)
{
    size_t i, end;
    strbuf *string, *to_remove;

    string = get_pooled_string (handle);
    to_remove = get_pooled_string (handle);

    strbuf_add (string, word);

    /* Collect the leading run of special characters */
    for (i = 0; i < string->length; i++) {
        if (!is_special_character (string->buffer[i]))
            break;
        strbuf_addc (to_remove, string->buffer[i]);
    }
    strbuf_remove_from_first (string, strbuf_to_s (to_remove));

    /* Find where the trailing run of special characters starts. The index
     * is unsigned, so walk down with "end > 0" and look at end - 1: a
     * countdown written as "i >= 0" never terminates and reads before the
     * buffer when the string is empty or made only of special characters. */
    strbuf_clear (to_remove);
    end = string->length;
    while (end > 0 && is_special_character (string->buffer[end - 1]))
        end--;

    /* Copy the run in string order so to_remove spells the actual suffix.
     * NOTE(review): presumably strbuf_remove_from_last() matches a suffix;
     * gathering the characters back-to-front would then only match when the
     * run reads the same in both directions - confirm against strbuf. */
    for (i = end; i < string->length; i++)
        strbuf_addc (to_remove, string->buffer[i]);
    strbuf_remove_from_last (string, strbuf_to_s (to_remove));

    language_specific_sanitization (string);

    /* Remove leading/trailing ZWNJ and leading ZWJ */
    strbuf_remove_from_first (string, ZWNJ());
    strbuf_remove_from_last (string, ZWNJ());
    strbuf_remove_from_first (string, ZWJ());

    return string;
}
END_TEST

/*
 * Learning a word made only of digits must fail:
 *   - "01" is rejected with a "Can't process" message
 *   - "१०१" is rejected with a "Nothing to learn" message
 */
START_TEST (numbers_will_be_ignored_while_learning)
{
    strbuf *string = strbuf_init (50);
    int status;

    /* First digit sequence */
    status = varnam_learn (varnam_instance, "01");
    assert_error (status);
    strbuf_add (string, "Can't process '0'. One or more characters in '01' are not known");
    ck_assert_str_eq (varnam_get_last_error (varnam_instance), strbuf_to_s (string));

    /* Second digit sequence; the expectation buffer is reused */
    status = varnam_learn (varnam_instance, "१०१");
    assert_error (status);
    strbuf_clear (string);
    strbuf_add (string, "Nothing to learn from '१०१'");
    ck_assert_str_eq (varnam_get_last_error (varnam_instance), strbuf_to_s (string));

    strbuf_destroy (string);
}
/*
 * Aborts the running test unless value equals VARNAM_ERROR.
 *
 * The failure message carries the value that was received and the last
 * error recorded on varnam_instance.
 */
void
assert_error (int value)
{
    if (value != VARNAM_ERROR) {
        /* The last error text is passed as an argument rather than being
         * used as the format string, so a '%' in it is printed verbatim. */
        ck_abort_msg ("Expected VARNAM_ERROR, but got %d. %s",
                      value,
                      varnam_get_last_error (varnam_instance));
    }
}
END_TEST

/*
 * A word that is just one character repeated must be refused, and the last
 * error must say that the word looks incorrect.
 */
START_TEST (words_with_repeating_characters_will_not_be_learned)
{
    const char *word_to_learn = "കകകകകകക";
    strbuf *string;
    int status;

    status = varnam_learn (varnam_instance, word_to_learn);
    assert_error (status);

    /* Build the message the library is expected to have recorded */
    string = strbuf_init (50);
    strbuf_addf (string, "'%s' looks incorrect. Not learning anything", word_to_learn);
    ck_assert_str_eq (varnam_get_last_error (varnam_instance), strbuf_to_s (string));

    strbuf_destroy (string);
}
/*
 * Returns the contents of buffer, filling it through vst_get_metadata()
 * with the value for key when the buffer is still blank. A buffer that is
 * not blank is returned as it is.
 *
 * Returns NULL when handle is NULL.
 */
static const char*
get_scheme_details(varnam *handle, const char* key, struct strbuf *buffer)
{
    if (handle == NULL)
        return NULL;

    /* Metadata is looked up only while nothing has been stored yet */
    if (strbuf_is_blank (buffer))
        vst_get_metadata (handle, key, buffer);

    return strbuf_to_s (buffer);
}
/*
 * Aborts the running test unless one of the vword entries in words has a
 * text equal to word.
 *
 * words - array of vword*
 * word  - text to look for (compared with strcmp)
 */
void
ensure_word_list_contains(varray *words, const char *word)
{
    int i, found = 0;
    vword *w;

    for (i = 0; i < varray_length (words); i++) {
        w = varray_get (words, i);
        if (strcmp (w->text, word) == 0) {
            found = 1;
            break;
        }
    }

    if (!found) {
        /* word is supplied as an argument, not spliced into the format
         * string, so a '%' in the expected word is reported verbatim. */
        ck_abort_msg ("Expected word list to contain '%s'", word);
    }
}
/*
 * Destroys the current varnam_instance (when there is one) and replaces it
 * with a handle freshly initialised from filename.
 *
 * The running test is aborted when varnam_init() does not return
 * VARNAM_SUCCESS.
 */
void
reinitialize_varnam_instance(const char *filename)
{
    int rc;
    /* Start from NULL so the failure message never formats an
     * indeterminate pointer if varnam_init() leaves msg untouched. */
    char *msg = NULL;
    varnam *handle = NULL;

    if (varnam_instance != NULL) {
        varnam_destroy (varnam_instance);
        varnam_instance = NULL;
    }

    rc = varnam_init (filename, &handle, &msg);
    if (rc != VARNAM_SUCCESS) {
        /* filename and msg are arguments, not part of the format string */
        ck_abort_msg ("Varnam initialization failed. %s. %s",
                      filename,
                      msg != NULL ? msg : "(no error message)");
    }

    varnam_instance = handle;
}
END_TEST START_TEST (varnam_export_full) { int rc, pcnt, wcnt, i; float filecnt; strbuf* f; strbuf* error; f = strbuf_init (20); pcnt = execute_query_int (varnam_instance->internal->known_words, "select count(*) from patterns_content;"); wcnt = execute_query_int (varnam_instance->internal->known_words, "select count(*) from words;"); rc = varnam_export_words (varnam_instance, 2, "output/", VARNAM_EXPORT_FULL, NULL); assert_success (rc); filecnt = pcnt / 2; for (i = 0; i < (int) ceil (filecnt); i++) { strbuf_clear (f); strbuf_addf (f, "output/%d.patterns.txt", i); if (!file_exist (strbuf_to_s (f))) { error = strbuf_init (10); strbuf_addf (error, "Failed to find file: %s\n", strbuf_to_s (f)); ck_abort_msg (strbuf_to_s (error)); } } filecnt = wcnt / 2; for (i = 0; i < (int) ceil (filecnt); i++) { strbuf_clear (f); strbuf_addf (f, "output/%d.words.txt", i); if (!file_exist (strbuf_to_s (f))) { error = strbuf_init (10); strbuf_addf (error, "Failed to find file: %s\n", strbuf_to_s (f)); ck_abort_msg (strbuf_to_s (error)); } } strbuf_destroy (f); }
/*
 * Learns every word listed in the file at filepath.
 *
 * Each line is trimmed and split on a single space. A line must have one or
 * two parts: the word, optionally followed by a confidence which is parsed
 * with atoi(). The confidence is 1 when only the word is given. Lines with
 * any other number of parts are counted as failures.
 *
 * handle   - varnam handle; used without a NULL check
 * filepath - file to read; passed to fopen() without a NULL check
 * status   - optional; total_words and failed are reset and then updated
 *            for every line read
 * callback - optional; invoked once per line with the word (or the whole
 *            line when it could not be split as expected), the status code
 *            for that line and object
 * object   - passed through to callback untouched
 *
 * Returns VARNAM_ERROR when the file can't be opened, the error code of
 * vwt_optimize_for_huge_transaction() or vwt_start_changes() when either
 * fails, and otherwise the result of vwt_compact_file(). Failures of
 * vwt_end_changes() and vwt_turn_off_optimization_for_huge_transaction()
 * after the read loop are only logged.
 */
int
varnam_learn_from_file(varnam *handle,
                       const char *filepath,
                       vlearn_status *status,
                       void (*callback)(varnam *handle, const char *word, int status_code, void *object),
                       void *object)
{
    int rc;
    FILE *infile;
    char line_buffer[10000];
    strbuf *word;
    varray *word_parts;
    int confidence;
    int parts;

    infile = fopen(filepath, "r");
    if (!infile) {
        set_last_error (handle, "Couldn't open file '%s' for reading.\n", filepath);
        return VARNAM_ERROR;
    }

    if (status != NULL)
    {
        status->total_words = 0;
        status->failed = 0;
    }

    rc = vwt_optimize_for_huge_transaction(handle);
    if (rc) {
        fclose (infile);
        return rc;
    }

    /* Learning from file will be mostly new words. Optimizing for that */
    v_->_config_mostly_learning_new_words = 1;

    varnam_log (handle, "Starting to learn from %s", filepath);
    rc = vwt_start_changes (handle);
    if (rc) {
        vwt_turn_off_optimization_for_huge_transaction(handle);
        fclose (infile);
        return rc;
    }

    while (fgets(line_buffer, sizeof(line_buffer), infile))
    {
        /* Pooled strings and arrays from the previous line are recycled */
        reset_pool (handle);

        word = get_pooled_string (handle);
        strbuf_add (word, trimwhitespace (line_buffer));
        word_parts = strbuf_split (word, handle, ' ');
        parts = varray_length (word_parts);
        if (parts > 0 && parts <= 2)
        {
            confidence = 1;
            if (parts == 2) {
                word = varray_get (word_parts, 1);
                confidence = atoi (strbuf_to_s (word));
            }

            word = varray_get (word_parts, 0);
            rc = varnam_learn_internal (handle, strbuf_to_s (word), confidence);
            if (rc) {
                if (status != NULL) status->failed++;
            }
        }
        else {
            /* Unexpected line shape; word still holds the whole line */
            rc = VARNAM_ERROR;
            if (status != NULL) status->failed++;
        }

        if (status != NULL) status->total_words++;
        if (callback != NULL) callback (handle, strbuf_to_s (word), rc, object);
    }

    /* Reading is finished. Closing here guarantees the stream is released
     * on every remaining path, including a failing vwt_compact_file(). */
    fclose (infile);

    varnam_log (handle, "Writing changes to disk");
    rc = vwt_end_changes (handle);
    if (rc) {
        varnam_log (handle, "Writing changes to disk failed");
    }

    varnam_log (handle, "Ensuring file integrity");
    rc = vwt_turn_off_optimization_for_huge_transaction(handle);
    if (rc) {
        varnam_log (handle, "Failed to check file integrity");
    }

    varnam_log (handle, "Compacting file");
    rc = vwt_compact_file (handle);

    return rc;
}