/*
 * Appends one token to `array` for the language classifier.
 *
 * The NUL byte is first stripped from `array` via char_array_strip_nul_byte().
 * Word tokens are then appended through add_normalized_token() using
 * LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS; every other token type is
 * represented by a single " " string.
 */
inline void language_classifier_normalize_token(char_array *array, char *str, token_t token) {
    char_array_strip_nul_byte(array);

    /* Anything that is not a word contributes only a separator. */
    if (!is_word_token(token.type)) {
        char_array_add(array, " ");
        return;
    }

    add_normalized_token(array, str, token, LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS);
}
/*
 * Emits the hyphen-handling variants of a hyphen-containing token into
 * `strings`: first the token normalized with `base_options` unchanged, then,
 * when requested, once with NORMALIZE_TOKEN_REPLACE_HYPHENS added and once
 * with NORMALIZE_TOKEN_DELETE_HYPHENS added.
 *
 * `base_options` is passed by value and only ever OR-ed per call, so no flag
 * leaks into (or is stripped from) later calls.
 */
static void add_hyphenated_token_variants(cstring_array *strings, char *str, token_t token,
                                          uint64_t base_options, bool replace_hyphens, bool delete_hyphens) {
    normalize_token(strings, str, token, base_options);

    if (replace_hyphens) {
        normalize_token(strings, str, token, base_options | NORMALIZE_TOKEN_REPLACE_HYPHENS);
    }

    if (delete_hyphens) {
        normalize_token(strings, str, token, base_options | NORMALIZE_TOKEN_DELETE_HYPHENS);
    }
}

/*
 * Adds one or more normalized forms of `token` (a span of `str`) to `strings`.
 *
 * - WHITESPACE tokens add a single " " string and nothing else.
 * - Tokens without a hyphen, and HYPHEN tokens themselves, are normalized once
 *   with the base options derived from `options`.
 * - Word / numeric tokens that contain a hyphen are normalized with the base
 *   options and then, depending on the replace_*_hyphens / delete_*_hyphens
 *   fields of `options`, again with the hyphens replaced and/or deleted.
 * - Numeric tokens additionally get a NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
 *   form when options.split_alpha_from_numeric is set.
 *
 * The base option mask is never modified: each extra flag is OR-ed in for a
 * single call only. (Toggling flags in place with `|=` followed by `^=` would
 * clear a flag that was already part of the base mask.)
 */
void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, normalize_options_t options) {
    uint64_t normalize_token_options = get_normalize_token_options(options);

    if (token.type == WHITESPACE) {
        cstring_array_add_string(strings, " ");
        return;
    }

    bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len);

    if (!contains_hyphen || token.type == HYPHEN) {
        normalize_token(strings, str, token, normalize_token_options);
    } else if (is_word_token(token.type)) {
        add_hyphenated_token_variants(strings, str, token, normalize_token_options,
                                      options.replace_word_hyphens, options.delete_word_hyphens);
    } else if (is_numeric_token(token.type)) {
        add_hyphenated_token_variants(strings, str, token, normalize_token_options,
                                      options.replace_numeric_hyphens, options.delete_numeric_hyphens);
    }

    /* Independent of hyphen handling: numeric tokens may also be split. */
    if (is_numeric_token(token.type) && options.split_alpha_from_numeric) {
        normalize_token(strings, str, token,
                        normalize_token_options | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC);
    }
}