Пример #1
0
END_TEST

START_TEST (initialize_using_lang_code)
{
  int rc;
  char *errMsg = NULL;
  varnam *handle;
  strbuf *tmp;

  rc = varnam_init_from_id ("ml", &handle, &errMsg);
  if (errMsg != NULL) {
    printf ("init_from_lang failed: %s\n", errMsg);
  }
  assert_success (rc);

  tmp = strbuf_init (20);
  strbuf_addf (tmp, "%s/ml.vst", strbuf_to_s (varnam_get_symbols_dir()));
  ck_assert_str_eq (strbuf_to_s(tmp), varnam_get_scheme_file (handle));
  strbuf_destroy (tmp);

  tmp = strbuf_init (10);
  strbuf_addf (tmp, "%s/.local/share/varnam/suggestions/ml.vst.learnings", getenv ("HOME"));
  ck_assert_str_eq (strbuf_to_s (tmp), varnam_get_suggestions_file (handle));

  strbuf_destroy (tmp);
  varnam_destroy (handle);
}
Пример #2
0
int
execute_query_int (sqlite3* db, const char* sql)
{
    int rc, result;
    sqlite3_stmt* stmt;
    strbuf* error;

    rc = sqlite3_prepare_v2 (db, sql, -1, &stmt, NULL);
    if (rc != SQLITE_OK) {
        error = strbuf_init (50);
        strbuf_addf (error, "Failed to prepare query: %s. Return code was: %d\n", sql, rc);
        ck_abort_msg (strbuf_to_s (error));
    }

    rc = sqlite3_step (stmt);
    if (rc != SQLITE_ROW) {
        error = strbuf_init (50);
        strbuf_addf (error, "Failed to execute query: %s. Return code was: %d\n", sql, rc);
        ck_abort_msg (strbuf_to_s (error));
    }

    result = sqlite3_column_int (stmt, 0);
    sqlite3_finalize (stmt);
    return result;
}
Пример #3
0
static int
varnam_learn_internal(varnam *handle, const char *word, int confidence)
{
    int rc;
    varray *tokens;
    strbuf *sanitized_word;

    if (handle == NULL || word == NULL)
        return VARNAM_ARGS_ERROR;

    if (!is_words_store_available(handle)) {
        return VARNAM_ERROR;
    }

    if (!is_utf8 (word)) {
        set_last_error (handle, "Incorrect encoding. Expected UTF-8 string");
        return VARNAM_ERROR;
    }

    tokens = get_pooled_array (handle);

    /* This removes all starting and trailing special characters from the word */
    sanitized_word = sanitize_word (handle, word);

    rc = vst_tokenize (handle, strbuf_to_s (sanitized_word), VARNAM_TOKENIZER_VALUE, VARNAM_MATCH_ALL, tokens);
    if (rc) return rc;

#ifdef _VARNAM_VERBOSE
    printf ("%s\n", "Tokens before reducing noice");
    print_tokens_array (tokens);
#endif

    /* Tokens may contain more data that we can handle. Reducing noice so that we learn most relevant combinations */
    reduce_noise_in_tokens (tokens);

#ifdef _VARNAM_VERBOSE
    printf ("%s\n", "Tokens after reducing noice");
    print_tokens_array (tokens);
#endif

    if (!can_learn_from_tokens (handle, tokens, strbuf_to_s (sanitized_word)))
        return VARNAM_ERROR;

    return vwt_persist_possibilities (handle,
                                      tokens,
                                      strbuf_to_s (sanitized_word),
                                      confidence);
}
Пример #4
0
static strbuf*
sanitize_word (varnam *handle, const char *word)
{
    size_t i;
    bool is_special = false;
    strbuf *string, *to_remove;

    string = get_pooled_string (handle);
    to_remove = get_pooled_string (handle);

    strbuf_add (string, word);

    for (i = 0; i < string->length; i++)
    {
        is_special = is_special_character (string->buffer[i]);
        if (is_special)
            strbuf_addc (to_remove, string->buffer[i]);
        else
            break;
    }

    strbuf_remove_from_first (string, strbuf_to_s (to_remove));

    strbuf_clear (to_remove);
    for (i = string->length - 1; i >= 0; i--)
    {
        is_special = is_special_character (string->buffer[i]);
        if (is_special)
            strbuf_addc (to_remove, string->buffer[i]);
        else
            break;
    }

    strbuf_remove_from_last (string, strbuf_to_s (to_remove));
    language_specific_sanitization (string);

    /* Remove trailing ZWNJ and leading ZWJ */
    strbuf_remove_from_first (string, ZWNJ());
    strbuf_remove_from_last (string, ZWNJ());
    strbuf_remove_from_first (string, ZWJ());

    return string;
}
Пример #5
0
END_TEST

START_TEST (numbers_will_be_ignored_while_learning)
{
    int rc;
    strbuf *string;

    rc = varnam_learn (varnam_instance, "01");
    assert_error (rc);
    string = strbuf_init (50);
    strbuf_add (string, "Can't process '0'. One or more characters in '01' are not known");
    ck_assert_str_eq (varnam_get_last_error (varnam_instance), strbuf_to_s (string));

    rc = varnam_learn (varnam_instance, "१०१");
    assert_error (rc);
    strbuf_clear (string);
    strbuf_add (string, "Nothing to learn from '१०१'");
    ck_assert_str_eq (varnam_get_last_error (varnam_instance), strbuf_to_s (string));
    strbuf_destroy (string);
}
Пример #6
0
void 
assert_error (int value)
{
    strbuf *string = NULL;
    if (value != VARNAM_ERROR) {
        string = strbuf_init (50);
        strbuf_addf (string, "Expected VARNAM_ERROR, but got %d. %s", value, 
                varnam_get_last_error (varnam_instance));
        ck_abort_msg (strbuf_to_s (string));
    }
}
Пример #7
0
END_TEST

START_TEST (words_with_repeating_characters_will_not_be_learned)
{
    int rc;
    strbuf *string;
    const char *word_to_learn = "കകകകകകക";

    rc = varnam_learn (varnam_instance, word_to_learn);
    assert_error (rc);
    string = strbuf_init (50);
    strbuf_addf (string, "'%s' looks incorrect. Not learning anything", word_to_learn);
    ck_assert_str_eq (varnam_get_last_error (varnam_instance), strbuf_to_s (string));
    strbuf_destroy (string);
}
Пример #8
0
static const char*
get_scheme_details(varnam *handle, const char* key, struct strbuf *buffer)
{
    struct strbuf *value = buffer;

    if (handle == NULL)
        return NULL;

    if (strbuf_is_blank (value))
    {
        vst_get_metadata (handle, key, value);
    }

    return strbuf_to_s (value);
}
Пример #9
0
void
ensure_word_list_contains(varray *words, const char *word)
{
    int i = 0, found = 0;
    vword *w;
    strbuf *error;

    for (i = 0; i < varray_length (words); i++) {
        w = varray_get (words, i);
        if (strcmp (w->text, word) == 0) {
            found = 1;
            break;
        }
    }

    if (!found) {
        error = strbuf_init (50);
        strbuf_addf (error, "Expected word list to contain '%s'", word);
        ck_abort_msg (strbuf_to_s (error));
    }
}
Пример #10
0
void
reinitialize_varnam_instance(const char *filename)
{
    int rc;
    char *msg;
    varnam *handle;
    strbuf *error;


    if (varnam_instance != NULL) {
        varnam_destroy (varnam_instance);
        varnam_instance = NULL;
    }

    rc = varnam_init (filename, &handle, &msg);
    if(rc != VARNAM_SUCCESS) {
        error = strbuf_init (50);
        strbuf_addf (error, "Varnam initialization failed. %s. %s", filename, msg);
        ck_abort_msg (strbuf_to_s (error));
    }

    varnam_instance = handle;
}
Пример #11
0
END_TEST

START_TEST (varnam_export_full)
{
    int rc, pcnt, wcnt, i;
    float filecnt;
    strbuf* f; strbuf* error;

    f = strbuf_init (20);
    pcnt = execute_query_int (varnam_instance->internal->known_words, "select count(*) from patterns_content;");
    wcnt = execute_query_int (varnam_instance->internal->known_words, "select count(*) from words;");

    rc = varnam_export_words (varnam_instance, 2, "output/", VARNAM_EXPORT_FULL, NULL);
    assert_success (rc);

    filecnt = pcnt / 2;
    for (i = 0; i < (int) ceil (filecnt); i++) {
        strbuf_clear (f);
        strbuf_addf (f, "output/%d.patterns.txt", i);
        if (!file_exist (strbuf_to_s (f))) {
            error = strbuf_init (10);
            strbuf_addf (error, "Failed to find file: %s\n", strbuf_to_s (f));
            ck_abort_msg (strbuf_to_s (error));
        }
    }

    filecnt = wcnt / 2;
    for (i = 0; i < (int) ceil (filecnt); i++) {
        strbuf_clear (f);
        strbuf_addf (f, "output/%d.words.txt", i);
        if (!file_exist (strbuf_to_s (f))) {
            error = strbuf_init (10);
            strbuf_addf (error, "Failed to find file: %s\n", strbuf_to_s (f));
            ck_abort_msg (strbuf_to_s (error));
        }
    }

    strbuf_destroy (f);
}
Пример #12
0
int
varnam_learn_from_file(varnam *handle,
                       const char *filepath,
                       vlearn_status *status,
                       void (*callback)(varnam *handle, const char *word, int status_code, void *object),
                       void *object)
{
    int rc;
    FILE *infile;
    char line_buffer[10000];
    strbuf *word;
    varray *word_parts;
    int confidence;
    int parts;

    infile = fopen(filepath, "r");
    if (!infile) {
        set_last_error (handle, "Couldn't open file '%s' for reading.\n", filepath);
        return VARNAM_ERROR;
    }

    if (status != NULL)
    {
        status->total_words = 0;
        status->failed = 0;
    }

    rc = vwt_optimize_for_huge_transaction(handle);
    if (rc) {
        fclose (infile);
        return rc;
    }

    /* Learning from file will be mostly new words. Optimizing for that */
    v_->_config_mostly_learning_new_words = 1;

    varnam_log (handle, "Starting to learn from %s", filepath);
    rc = vwt_start_changes (handle);
    if (rc) {
        vwt_turn_off_optimization_for_huge_transaction(handle);
        fclose (infile);
        return rc;
    }

    while (fgets(line_buffer, sizeof(line_buffer), infile))
    {
        reset_pool (handle);

        word = get_pooled_string (handle);
        strbuf_add (word, trimwhitespace (line_buffer));
        word_parts = strbuf_split (word, handle, ' ');
        parts = varray_length (word_parts);
        if (parts > 0 && parts <= 2)
        {
            confidence = 1;
            if (parts == 2) {
                word = varray_get (word_parts, 1);
                confidence = atoi (strbuf_to_s (word));
            }

            word = varray_get (word_parts, 0);
            rc = varnam_learn_internal (handle, strbuf_to_s (word), confidence);
            if (rc) {
                if (status != NULL) status->failed++;
            }
        }
        else {
            rc = VARNAM_ERROR;
            if (status != NULL) status->failed++;
        }

        if (status   != NULL) status->total_words++;
        if (callback != NULL) callback (handle, strbuf_to_s (word), rc, object);
    }

    varnam_log (handle, "Writing changes to disk");
    rc = vwt_end_changes (handle);
    if (rc) {
        varnam_log (handle, "Writing changes to disk failed");
    }

    varnam_log (handle, "Ensuring file integrity");
    rc = vwt_turn_off_optimization_for_huge_transaction(handle);
    if (rc) {
        varnam_log (handle, "Failed to check file integrity");
    }

    varnam_log (handle, "Compacting file");
    rc = vwt_compact_file (handle);
    if (rc) return rc;

    fclose (infile);
    return rc;
}