/*
 * Builds a sanitized copy of `word` in a pooled string buffer.
 *
 * Steps, in order:
 *   1. The leading run of characters for which is_special_character() is
 *      true is collected and passed to strbuf_remove_from_first().
 *   2. The trailing run of such characters is collected and passed to
 *      strbuf_remove_from_last().
 *   3. language_specific_sanitization() is applied.
 *   4. ZWNJ is removed from both ends and ZWJ from the start.
 *
 * Returns the buffer obtained from get_pooled_string(); it is not freed
 * here (presumably owned by the handle's pool - confirm against the pool
 * implementation).
 */
static strbuf*
sanitize_word (varnam *handle, const char *word)
{
    size_t i;
    strbuf *string, *to_remove;

    string = get_pooled_string (handle);
    to_remove = get_pooled_string (handle);

    strbuf_add (string, word);

    /* Collect the leading run of special characters. */
    for (i = 0; i < string->length; i++) {
        if (!is_special_character (string->buffer[i]))
            break;
        strbuf_addc (to_remove, string->buffer[i]);
    }
    strbuf_remove_from_first (string, strbuf_to_s (to_remove));

    strbuf_clear (to_remove);

    /*
     * Collect the trailing run of special characters, scanning backwards.
     * `i` is unsigned, so a "i >= 0" condition can never become false and
     * "length - 1" wraps around when the string is empty.  Count from
     * length down to 1 and index with i - 1 so the loop always terminates
     * without reading outside the buffer.
     *
     * NOTE(review): characters are appended in reverse order (last one
     * first).  Whether strbuf_remove_from_last() matches a mixed run such
     * as ".," collected as ",." depends on its implementation - confirm.
     */
    for (i = string->length; i > 0; i--) {
        if (!is_special_character (string->buffer[i - 1]))
            break;
        strbuf_addc (to_remove, string->buffer[i - 1]);
    }
    strbuf_remove_from_last (string, strbuf_to_s (to_remove));

    language_specific_sanitization (string);

    /* Remove leading and trailing ZWNJ, and leading ZWJ */
    strbuf_remove_from_first (string, ZWNJ());
    strbuf_remove_from_last (string, ZWNJ());
    strbuf_remove_from_first (string, ZWJ());

    return string;
}
/*
 * Detects the single language that the characters of `input` belong to.
 *
 * Every decoded codepoint that is not skipped by should_skip() is mapped
 * through get_language().  The result is VARNAM_LANG_CODE_UNKNOWN when:
 *   - handle or input is NULL,
 *   - the input is blank,
 *   - any codepoint maps to VARNAM_LANG_CODE_UNKNOWN, or
 *   - two codepoints map to different languages.
 * Otherwise the language of the last examined codepoint is returned.
 * Decoding stops at UTF8_END or UTF8_ERROR; whatever was detected up to
 * that point is returned.
 */
int
varnam_detect_lang(varnam *handle, const char *input)
{
    strbuf *text;
    utf8_decoder dec;
    int cp;
    int detected = VARNAM_LANG_CODE_UNKNOWN;
    int previous = 0;   /* 0 means no codepoint has been classified yet */

    if (handle == NULL || input == NULL)
        return VARNAM_LANG_CODE_UNKNOWN;

    text = get_pooled_string (handle);
    strbuf_add (text, input);
    if (strbuf_is_blank (text))
        return VARNAM_LANG_CODE_UNKNOWN;

    utf8_decode_init (text->buffer, (int) text->length, &dec);

    while ((cp = utf8_decode_next (&dec)) != UTF8_END && cp != UTF8_ERROR) {
        if (should_skip(cp))
            continue;

        detected = get_language (cp);

        /* Reject unknown scripts and input mixing several languages. */
        if (detected == VARNAM_LANG_CODE_UNKNOWN
            || (previous != 0 && detected != previous))
            return VARNAM_LANG_CODE_UNKNOWN;

        previous = detected;
    }

    return detected;
}
/*
 * Learns every word listed in the text file at `filepath`.
 *
 * Each line is trimmed and split on a single space.  A line must contain
 * either "word" or "word confidence"; the confidence is parsed with atoi()
 * and defaults to 1.  Lines with zero or more than two parts are counted
 * as failures.  Lines are read through a 10000 byte buffer.
 *
 * handle   - varnam handle used for learning, logging and error reporting.
 * filepath - file to read, opened in text mode.
 * status   - optional; when non-NULL, total_words and failed are reset and
 *            then updated for every line processed.
 * callback - optional; invoked once per line with the word (the first
 *            part, or the whole trimmed line when the part count is
 *            invalid), the status code for that line and `object`.
 * object   - passed through to `callback` untouched.
 *
 * Returns VARNAM_ERROR if the file cannot be opened, the error from
 * vwt_optimize_for_huge_transaction() or vwt_start_changes() if either
 * fails, and otherwise the result of vwt_compact_file().  Failures of
 * vwt_end_changes() and vwt_turn_off_optimization_for_huge_transaction()
 * after the read loop are only logged.
 */
int
varnam_learn_from_file(varnam *handle,
                       const char *filepath,
                       vlearn_status *status,
                       void (*callback)(varnam *handle, const char *word, int status_code, void *object),
                       void *object)
{
    int rc;
    FILE *infile;
    char line_buffer[10000];
    strbuf *word;
    varray *word_parts;
    int confidence;
    int parts;

    infile = fopen(filepath, "r");
    if (!infile) {
        set_last_error (handle, "Couldn't open file '%s' for reading.\n", filepath);
        return VARNAM_ERROR;
    }

    if (status != NULL) {
        status->total_words = 0;
        status->failed = 0;
    }

    rc = vwt_optimize_for_huge_transaction(handle);
    if (rc) {
        fclose (infile);
        return rc;
    }

    /* Learning from file will be mostly new words. Optimizing for that */
    v_->_config_mostly_learning_new_words = 1;

    varnam_log (handle, "Starting to learn from %s", filepath);
    rc = vwt_start_changes (handle);
    if (rc) {
        vwt_turn_off_optimization_for_huge_transaction(handle);
        fclose (infile);
        return rc;
    }

    while (fgets(line_buffer, sizeof(line_buffer), infile)) {
        /* Pooled strings from the previous line are no longer needed. */
        reset_pool (handle);

        word = get_pooled_string (handle);
        strbuf_add (word, trimwhitespace (line_buffer));
        word_parts = strbuf_split (word, handle, ' ');
        parts = varray_length (word_parts);

        if (parts > 0 && parts <= 2) {
            confidence = 1;
            if (parts == 2) {
                word = varray_get (word_parts, 1);
                confidence = atoi (strbuf_to_s (word));
            }

            word = varray_get (word_parts, 0);
            rc = varnam_learn_internal (handle, strbuf_to_s (word), confidence);
            if (rc) {
                if (status != NULL) status->failed++;
            }
        }
        else {
            /* Malformed line: `word` still holds the whole trimmed line. */
            rc = VARNAM_ERROR;
            if (status != NULL) status->failed++;
        }

        if (status != NULL) status->total_words++;
        if (callback != NULL) callback (handle, strbuf_to_s (word), rc, object);
    }

    varnam_log (handle, "Writing changes to disk");
    rc = vwt_end_changes (handle);
    if (rc) {
        varnam_log (handle, "Writing changes to disk failed");
    }

    varnam_log (handle, "Ensuring file integrity");
    rc = vwt_turn_off_optimization_for_huge_transaction(handle);
    if (rc) {
        varnam_log (handle, "Failed to check file integrity");
    }

    varnam_log (handle, "Compacting file");
    rc = vwt_compact_file (handle);

    /*
     * Close the input on every path, including a failed compaction, so
     * the FILE handle is never leaked.
     */
    fclose (infile);

    return rc;
}