/*
 * Idempotence check for utf8proc normalization: normalize random strings
 * with STABLE | COMPOSE | COMPAT | CASEFOLD, normalize the result a second
 * time with the same flags, and require both outputs to compare equal.
 *
 * The test is currently skipped up front, so everything after the first
 * `return` is unreachable and is kept only for when the skip is lifted.
 */
static void
test_utf8proc_normalization(TestBatchRunner *runner) {
    SKIP(runner, 1,
         "utf8proc can't handle control chars or Unicode non-chars");
    return;

    for (int32_t round = 100; round > 0; round--) {
        String *input = TestUtils_random_string(rand() % 40);

        /* First pass. */
        uint8_t *first_pass;
        int32_t  first_status
            = utf8proc_map((const uint8_t*)Str_Get_Ptr8(input),
                           Str_Get_Size(input), &first_pass,
                           UTF8PROC_STABLE | UTF8PROC_COMPOSE
                           | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
        if (first_status < 0) {
            /* Report the offending input as JSON so it is printable. */
            lucy_Json_set_tolerant(1);
            String *dump = lucy_Json_to_json((Obj*)input);
            if (dump == NULL) {
                dump = Str_newf("[failed to encode]");
            }
            FAIL(runner, "Failed to normalize: %s", Str_Get_Ptr8(dump));
            DECREF(dump);
            DECREF(input);
            return;
        }

        /* Second pass, fed with the output of the first. */
        size_t   first_len = strlen((char*)first_pass);
        uint8_t *second_pass;
        int32_t  second_status
            = utf8proc_map(first_pass, first_len, &second_pass,
                           UTF8PROC_STABLE | UTF8PROC_COMPOSE
                           | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
        if (second_status < 0) {
            THROW(ERR, "Unexpected normalization error: %i32",
                  second_status);
        }

        /* Compare before releasing the buffers, decide afterwards. */
        int differs = strcmp((char*)first_pass, (char*)second_pass);
        free(second_pass);
        free(first_pass);
        DECREF(input);
        if (differs != 0) {
            FAIL(runner, "Not fully normalized");
            return;
        }
    }

    PASS(runner, "Normalization successful.");
}
/*
 * Smoke test for utf8proc_map(): decomposing a string made only of plain
 * ASCII characters must yield a byte-for-byte identical string of the same
 * length.
 */
static void test_utf8proc_map ( void )
{
  char          string_simple[] = "The quick brown.fox";
  uint8_t      *input           = (uint8_t*)(&string_simple[0]);
  uint8_t      *mapped          = NULL;
  ssize_t       mapped_len;
  unsigned int  pos             = 0;

  mapped_len = utf8proc_map (
    input,
    sizeof ( string_simple ),
    &mapped,
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE );

  rtems_test_assert ( mapped_len == strlen ( string_simple ) );
  rtems_test_assert ( mapped != NULL);

  /* The input contains nothing that decomposes, so every output byte has
   * to match the corresponding input byte. */
  while ( pos < mapped_len ) {
    rtems_test_assert ( input[pos] == mapped[pos] );
    ++pos;
  }

  free ( mapped );
}
/*
 * Case-insensitive comparison of two NUL-terminated UTF-8 strings.
 *
 * Both inputs are case-folded with utf8proc_map() (UTF8PROC_NULLTERM |
 * UTF8PROC_CASEFOLD) and the folded byte strings are compared with strcmp().
 *
 * Returns a value <0, 0 or >0 with the same meaning as strcmp().
 *
 * A mapping result of 0 is a valid outcome (empty string) and must not be
 * treated as a failure; otherwise "" would compare equal to every string.
 * If either string cannot be folded (utf8proc_map() returns a negative
 * error code, e.g. for invalid UTF-8 or allocation failure) the function
 * falls back to a plain byte-wise strcmp() of the original strings instead
 * of reporting them as equal.
 */
int utf8_stricmp(const char *str1, const char *str2)
{
    int result;
    char *str1_utf8 = NULL;
    char *str2_utf8 = NULL;

    if (utf8proc_map((unsigned char *) str1, 0, (unsigned char **) &str1_utf8,
                     UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD) >= 0 &&
        utf8proc_map((unsigned char *) str2, 0, (unsigned char **) &str2_utf8,
                     UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD) >= 0 &&
        str1_utf8 != NULL && str2_utf8 != NULL) {
        result = strcmp(str1_utf8, str2_utf8);
    } else {
        /* Folding failed: compare the raw bytes rather than claim equality. */
        result = strcmp(str1, str2);
    }

    /* free(NULL) is a no-op, so this is safe on every path. */
    free(str2_utf8);
    free(str1_utf8);

    return result;
}
int main(int argc, char **argv) { int i; int options = 0; for (i = 1; i < argc; ++i) { if (!strcmp(argv[i], "-nfkc")) { options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT; continue; } if (!strcmp(argv[i], "-nfkd")) { options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE|UTF8PROC_COMPAT; continue; } if (!strcmp(argv[i], "-nfc")) { options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE; continue; } if (!strcmp(argv[i], "-nfd")) { options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE; continue; } if (!strcmp(argv[i], "-casefold")) { options |= UTF8PROC_CASEFOLD; continue; } if (argv[i][0] == '-') { fprintf(stderr, "unrecognized option: %s\n", argv[i]); return EXIT_FAILURE; } size_t len; uint8_t *src = readfile(argv[i], &len); if (!src) { fprintf(stderr, "error reading %s\n", argv[i]); return EXIT_FAILURE; } uint8_t *dest; mytime start = gettime(); for (int i = 0; i < 100; ++i) { utf8proc_map(src, len, &dest, options); free(dest); } printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); free(src); } return EXIT_SUCCESS; }
static void issue102(void) /* #128 */ { utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */ utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */ utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */ utf8proc_uint8_t *output; utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA); printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna); check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length"); check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data"); free(output); output = utf8proc_NFKC_Casefold(input); printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct); check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length"); check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data"); }
char *normalize_string_utf8(char *str, uint64_t options) { int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC; uint8_t *utf8proc_normalized = NULL; bool have_utf8proc_options = false; if (options & NORMALIZE_STRING_TRIM) { string_trim(str); } if (options & NORMALIZE_STRING_DECOMPOSE) { have_utf8proc_options = true; utf8proc_options |= UTF8PROC_OPTIONS_NFD; } if (options & NORMALIZE_STRING_STRIP_ACCENTS) { have_utf8proc_options = true; utf8proc_options |= UTF8PROC_OPTIONS_STRIP_ACCENTS; } if (options & NORMALIZE_STRING_LOWERCASE) { have_utf8proc_options = true; utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE; } char *normalized = NULL; if (have_utf8proc_options) { utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options); normalized = (char *)utf8proc_normalized; str = normalized; } if (options & NORMALIZE_STRING_REPLACE_HYPHENS) { string_replace(str, '-', ' '); normalized = str; } return normalized; }
/* Menu Action Functions */ void action_search(bool (*match)(std::string &searchString, Json::Value &gameData, int &outScore)) { HB_Keyboard sHBKB; bool bKBCancelled = false; consoleClear(); printf("Please enter text to search for:\n"); std::string searchString = getInput(&sHBKB, bKBCancelled); if (bKBCancelled) { return; } // User has entered their input, so let's scrap the keyboard clear_screen(GFX_BOTTOM); std::vector<game_item> display_output; int outScore; for (unsigned int i = 0; i < sourceData.size(); i++) { // Check the region filter std::string regionFilter = config.GetRegionFilter(); if(regionFilter != "off" && sourceData[i]["region"].asString() != regionFilter) { continue; } // Check that the encTitleKey isn't null if (sourceData[i]["encTitleKey"].isNull()) { continue; } // Create an ASCII version of the name if one doesn't exist yet if (sourceData[i]["ascii_name"].isNull()) { // Normalize the name down to ASCII utf8proc_option_t options = (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT | UTF8PROC_STRIPMARK | UTF8PROC_STRIPCC); utf8proc_uint8_t* szName; utf8proc_uint8_t *str = (utf8proc_uint8_t*)sourceData[i]["name"].asCString(); utf8proc_map(str, 0, &szName, options); sourceData[i]["ascii_name"] = (const char*)szName; free(szName); } if (match(searchString, sourceData[i], outScore)) { game_item item; item.score = outScore; item.index = i; switch(sourceDataType) { case JSON_TYPE_WINGS: item.titleid = sourceData[i]["titleid"].asString(); item.titlekey = sourceData[i]["enckey"].asString(); item.name = sourceData[i]["ascii_name"].asString(); item.region = sourceData[i]["region"].asString(); item.code = sourceData[i]["code"].asString(); break; case JSON_TYPE_ONLINE: item.titleid = sourceData[i]["titleID"].asString(); item.titlekey = sourceData[i]["encTitleKey"].asString(); item.name = sourceData[i]["ascii_name"].asString(); item.region = sourceData[i]["region"].asString(); item.code = 
sourceData[i]["serial"].asString(); break; } std::string typeCheck = item.titleid.substr(4,4); //if title id belongs to gameapp/dlc/update/dsiware, use it. if not, ignore. case sensitve of course if(typeCheck == "0000" || typeCheck == "008c" || typeCheck == "000e" || typeCheck == "8004"){ display_output.push_back(item); } } } unsigned int display_amount = display_output.size(); // We technically have 30 rows to work with, minus 2 for header/footer. But stick with 20 entries for now if (display_amount == 0) { printf("No matching titles found.\n"); wait_key_specific("\nPress A to return.\n", KEY_A); return; } // sort similar names by fuzzy score if(display_amount>1) { std::sort(display_output.begin(), display_output.end(), compareByScore); } std::string mode_text; switch (config.GetMode()) { case CConfig::Mode::DOWNLOAD_CIA: mode_text = "Create CIA"; break; case CConfig::Mode::INSTALL_CIA: mode_text = "Install CIA"; break; case CConfig::Mode::INSTALL_TICKET: mode_text = "Create Ticket"; break; } char footer[51]; char header[51]; sprintf(header, "Select a Title (found %i results)", display_amount); sprintf(footer, "Press A to %s. Press X to queue.", mode_text.c_str()); titles_multkey_draw(header, footer, 1, &display_output, &display_output, menu_search_keypress); }
int main(int argc, char **argv) { char *buf = NULL; size_t bufsize = 0; FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; utf8proc_uint8_t src[1024]; check(f != NULL, "error opening GraphemeBreakTest.txt"); while (getline(&buf, &bufsize, f) > 0) { size_t bi = 0, si = 0; lineno += 1; if (lineno % 100 == 0) printf("checking line %zd...\n", lineno); if (buf[0] == '#') continue; while (buf[bi]) { bi = skipspaces(buf, bi); if (buf[bi] == '/') { /* grapheme break */ src[si++] = '/'; bi++; } else if (buf[bi] == '+') { /* no break */ bi++; } else if (buf[bi] == '#') { /* start of comments */ break; } else { /* hex-encoded codepoint */ bi += encode((char*) (src + si), buf + bi) - 1; while (src[si]) ++si; /* advance to NUL termination */ } } if (si && src[si-1] == '/') --si; /* no break after final grapheme */ src[si] = 0; /* NUL-terminate */ if (si) { utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ size_t i = 0, j = 0; utf8proc_ssize_t glen; utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ while (i < si) { if (src[i] != '/') utf8[j++] = src[i++]; else i++; } glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND); if (glen == UTF8PROC_ERROR_INVALIDUTF8) { /* the test file contains surrogate codepoints, which are only for UTF-16 */ printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); } else { check(glen >= 0, "utf8proc_map error = %s", utf8proc_errmsg(glen)); for (i = 0; i <= glen; ++i) if (g[i] == 0xff) g[i] = '/'; /* easier-to-read output (/ is not in test strings) */ check(!strcmp((char*)g, (char*)src), "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); } free(g); } } fclose(f); printf("Passed tests after %zd lines!\n", lineno); return 0; }