Beispiel #1
0
static strbuf*
sanitize_word (varnam *handle, const char *word)
{
    size_t i;
    bool is_special = false;
    strbuf *string, *to_remove;

    string = get_pooled_string (handle);
    to_remove = get_pooled_string (handle);

    strbuf_add (string, word);

    for (i = 0; i < string->length; i++)
    {
        is_special = is_special_character (string->buffer[i]);
        if (is_special)
            strbuf_addc (to_remove, string->buffer[i]);
        else
            break;
    }

    strbuf_remove_from_first (string, strbuf_to_s (to_remove));

    strbuf_clear (to_remove);
    for (i = string->length - 1; i >= 0; i--)
    {
        is_special = is_special_character (string->buffer[i]);
        if (is_special)
            strbuf_addc (to_remove, string->buffer[i]);
        else
            break;
    }

    strbuf_remove_from_last (string, strbuf_to_s (to_remove));
    language_specific_sanitization (string);

    /* Remove trailing ZWNJ and leading ZWJ */
    strbuf_remove_from_first (string, ZWNJ());
    strbuf_remove_from_last (string, ZWNJ());
    strbuf_remove_from_first (string, ZWJ());

    return string;
}
Beispiel #2
0
int
varnam_detect_lang(varnam *handle, const char *input)
{
    strbuf *word;
    utf8_decoder decoder;
    int codepoint, language = VARNAM_LANG_CODE_UNKNOWN, prev_language = 0;
    
    if (handle == NULL || input == NULL) {
        return VARNAM_LANG_CODE_UNKNOWN;
    }

    word = get_pooled_string (handle);
    strbuf_add (word, input);

    if (strbuf_is_blank (word)) {
        return VARNAM_LANG_CODE_UNKNOWN;
    }

    utf8_decode_init (word->buffer, (int) word->length, &decoder);

    for (;;)
    {
        codepoint = utf8_decode_next (&decoder);
        if (codepoint == UTF8_END || codepoint == UTF8_ERROR)
            break;

        if (should_skip(codepoint))
            continue;

        language = get_language (codepoint);

        if (language == VARNAM_LANG_CODE_UNKNOWN)
            return VARNAM_LANG_CODE_UNKNOWN;
        
        if (prev_language != 0 && language != prev_language) {
            /* Looks like characters from multiple languages are mixed */
            return VARNAM_LANG_CODE_UNKNOWN;
        }
        prev_language = language;
    }

    return language;
}
Beispiel #3
0
int
varnam_learn_from_file(varnam *handle,
                       const char *filepath,
                       vlearn_status *status,
                       void (*callback)(varnam *handle, const char *word, int status_code, void *object),
                       void *object)
{
    int rc;
    FILE *infile;
    char line_buffer[10000];
    strbuf *word;
    varray *word_parts;
    int confidence;
    int parts;

    infile = fopen(filepath, "r");
    if (!infile) {
        set_last_error (handle, "Couldn't open file '%s' for reading.\n", filepath);
        return VARNAM_ERROR;
    }

    if (status != NULL)
    {
        status->total_words = 0;
        status->failed = 0;
    }

    rc = vwt_optimize_for_huge_transaction(handle);
    if (rc) {
        fclose (infile);
        return rc;
    }

    /* Learning from file will be mostly new words. Optimizing for that */
    v_->_config_mostly_learning_new_words = 1;

    varnam_log (handle, "Starting to learn from %s", filepath);
    rc = vwt_start_changes (handle);
    if (rc) {
        vwt_turn_off_optimization_for_huge_transaction(handle);
        fclose (infile);
        return rc;
    }

    while (fgets(line_buffer, sizeof(line_buffer), infile))
    {
        reset_pool (handle);

        word = get_pooled_string (handle);
        strbuf_add (word, trimwhitespace (line_buffer));
        word_parts = strbuf_split (word, handle, ' ');
        parts = varray_length (word_parts);
        if (parts > 0 && parts <= 2)
        {
            confidence = 1;
            if (parts == 2) {
                word = varray_get (word_parts, 1);
                confidence = atoi (strbuf_to_s (word));
            }

            word = varray_get (word_parts, 0);
            rc = varnam_learn_internal (handle, strbuf_to_s (word), confidence);
            if (rc) {
                if (status != NULL) status->failed++;
            }
        }
        else {
            rc = VARNAM_ERROR;
            if (status != NULL) status->failed++;
        }

        if (status   != NULL) status->total_words++;
        if (callback != NULL) callback (handle, strbuf_to_s (word), rc, object);
    }

    varnam_log (handle, "Writing changes to disk");
    rc = vwt_end_changes (handle);
    if (rc) {
        varnam_log (handle, "Writing changes to disk failed");
    }

    varnam_log (handle, "Ensuring file integrity");
    rc = vwt_turn_off_optimization_for_huge_transaction(handle);
    if (rc) {
        varnam_log (handle, "Failed to check file integrity");
    }

    varnam_log (handle, "Compacting file");
    rc = vwt_compact_file (handle);
    if (rc) return rc;

    fclose (infile);
    return rc;
}