C++ (Cpp) utf8proc_map 예제들

예제 #1

0

파일 보기

파일: TestNormalizer.c 프로젝트: lazycrazyowl/lucy

/*
 * Idempotence check for utf8proc normalization on random strings: a string
 * that has been normalized once must come back unchanged when normalized a
 * second time with the same options.
 *
 * The test is currently disabled. It records a single skipped test and
 * returns immediately, so everything after the early return is unreachable
 * and is kept only for the day the skip is lifted.
 */
static void
test_utf8proc_normalization(TestBatchRunner *runner) {
    SKIP(runner, 1,
         "utf8proc can't handle control chars or Unicode non-chars");
    return;

    for (int32_t round = 0; round < 100; round++) {
        String *random_str = TestUtils_random_string(rand() % 40);

        // First pass: normalize the raw random text.
        uint8_t *first_pass;
        int32_t first_status
            = utf8proc_map((const uint8_t*)Str_Get_Ptr8(random_str),
                           Str_Get_Size(random_str),
                           &first_pass,
                           UTF8PROC_STABLE | UTF8PROC_COMPOSE
                           | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
        if (first_status < 0) {
            // Report the offending input as (tolerant) JSON and give up.
            lucy_Json_set_tolerant(1);
            String *dump = lucy_Json_to_json((Obj*)random_str);
            if (dump == NULL) {
                dump = Str_newf("[failed to encode]");
            }
            FAIL(runner, "Failed to normalize: %s", Str_Get_Ptr8(dump));
            DECREF(dump);
            DECREF(random_str);
            return;
        }

        // Second pass: feed the first result back through the same mapping.
        uint8_t *second_pass;
        size_t first_len = strlen((char*)first_pass);
        int32_t second_status
            = utf8proc_map(first_pass, first_len, &second_pass,
                           UTF8PROC_STABLE | UTF8PROC_COMPOSE
                           | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
        if (second_status < 0) {
            THROW(ERR, "Unexpected normalization error: %i32", second_status);
        }

        // Compare before releasing the buffers, then report.
        int differs = (strcmp((char*)first_pass, (char*)second_pass) != 0);
        free(second_pass);
        free(first_pass);
        DECREF(random_str);
        if (differs) {
            FAIL(runner, "Not fully normalized");
            return;
        }
    }
    PASS(runner, "Normalization successful.");
}

예제 #2

0

파일 보기

파일: init.c 프로젝트: AlexShiLucky/rtems

/*
 * Smoke test for utf8proc_map(): decomposing a plain ASCII string must
 * produce a non-NULL result whose bytes equal the input.
 */
static void
test_utf8proc_map ( void )
{
  char          string_simple[]    = "The quick brown.fox";
  uint8_t      *string_simple_utf8 = (uint8_t*) string_simple;
  uint8_t      *dest               = NULL;
  unsigned int  index              = 0;
  ssize_t       chars_written      = utf8proc_map(
                                       string_simple_utf8,
                                       sizeof ( string_simple ),
                                       &dest,
                                       UTF8PROC_NULLTERM
                                         | UTF8PROC_STABLE
                                         | UTF8PROC_DECOMPOSE );

  rtems_test_assert ( chars_written == strlen ( string_simple ) );
  rtems_test_assert ( dest != NULL);

  /* The input holds nothing that can be decomposed, so the mapped output
   * has to match the source byte for byte.
   */
  while ( index < chars_written ) {
    rtems_test_assert ( string_simple_utf8[index] == dest[index] );
    ++index;
  }

  free ( dest );
}

예제 #3

0

파일 보기

파일: tools.cpp 프로젝트: NatLibFi/usemarcon

/**
 * Case-insensitive comparison of two NUL-terminated UTF-8 strings.
 *
 * Both inputs are case-folded with utf8proc_map() (UTF8PROC_CASEFOLD) and the
 * folded byte strings are compared with strcmp().
 *
 * @param str1  First NUL-terminated string.
 * @param str2  Second NUL-terminated string.
 * @return      Negative, zero or positive, with the same meaning as strcmp().
 *              If either string cannot be folded (utf8proc_map() reports an
 *              error), the unfolded strings are compared byte-wise instead
 *              of being reported as equal.
 */
int utf8_stricmp(const char *str1, const char *str2)
{
    int result;
    char *str1_utf8 = NULL;
    char *str2_utf8 = NULL;

    // utf8proc_map() returns the length of the mapped string, which is 0 for
    // an empty input; only negative values signal an error. Testing "> 0"
    // would treat "" as a failure and make it compare equal to anything.
    const bool folded =
        utf8proc_map((const unsigned char *) str1, 0, (unsigned char **) &str1_utf8, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD) >= 0 &&
        utf8proc_map((const unsigned char *) str2, 0, (unsigned char **) &str2_utf8, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD) >= 0;

    if (folded && str1_utf8 != NULL && str2_utf8 != NULL)
    {
        result = strcmp(str1_utf8, str2_utf8);
    }
    else
    {
        // Folding failed (e.g. invalid UTF-8): fall back to an exact
        // comparison so that different strings are never reported as equal.
        result = strcmp(str1, str2);
    }

    free(str2_utf8);
    free(str1_utf8);
    return result;
}

예제 #4

0

파일 보기

파일: bench.c 프로젝트: 0x09/hfsfuse

/*
 * Benchmark driver: every non-option argument is a file that is read into
 * memory and passed through utf8proc_map() 100 times; the average elapsed
 * time per call is printed to stdout.
 *
 * Options (-nfkc, -nfkd, -nfc, -nfd, -casefold) are OR-ed into the option
 * set as they are encountered and apply to all files that follow them.
 *
 * Returns EXIT_FAILURE for an unrecognized option or an unreadable file,
 * EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv)
{
	int i;
	int options = 0;

	for (i = 1; i < argc; ++i) {
		if (!strcmp(argv[i], "-nfkc")) {
			options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT;
			continue;
		}
		if (!strcmp(argv[i], "-nfkd")) {
			options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE|UTF8PROC_COMPAT;
			continue;
		}
		if (!strcmp(argv[i], "-nfc")) {
			options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE;
			continue;
		}
		if (!strcmp(argv[i], "-nfd")) {
			options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE;
			continue;
		}
		if (!strcmp(argv[i], "-casefold")) {
			options |= UTF8PROC_CASEFOLD;
			continue;
		}
		if (argv[i][0] == '-') {
			fprintf(stderr, "unrecognized option: %s\n", argv[i]);
			return EXIT_FAILURE;
		}

		size_t len;
		uint8_t *src = readfile(argv[i], &len);
		if (!src) {
			fprintf(stderr, "error reading %s\n", argv[i]);
			return EXIT_FAILURE;
		}

		uint8_t *dest = NULL;
		int failures = 0;
		mytime start = gettime();
		/* Separate counter name: the outer `i` still indexes argv below. */
		for (int iter = 0; iter < 100; ++iter) {
			/* A negative return means the mapping failed (invalid input
			 * or conflicting options); count it so the timing below is
			 * not mistaken for a valid measurement. */
			if (utf8proc_map(src, len, &dest, options) < 0) {
				++failures;
			}
			free(dest);
			dest = NULL;
		}
		printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100);
		if (failures > 0) {
			fprintf(stderr,
			        "warning: utf8proc_map failed on %s (%d of 100 runs); "
			        "timing is not meaningful\n", argv[i], failures);
		}
		free(src);
	}

	return EXIT_SUCCESS;
}

예제 #5

0

파일 보기

파일: misc.c 프로젝트: ScottPJones/utf8proc

/*
 * Regression test for issue #102 (see also #128): NFKC_Casefold of a string
 * containing an unassigned code point (U+2065).
 *
 * Two variants are checked against hard-coded expected byte sequences:
 *   1. utf8proc_map() with UTF8PROC_STRIPNA added to the NFKC_Casefold
 *      options, expected to yield `stripna`;
 *   2. utf8proc_NFKC_Casefold(), expected to yield `correct`.
 */
static void issue102(void) /* #128 */
{
    utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
    utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
    utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
    utf8proc_uint8_t *output = NULL;

    utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
        UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA);
    /* Fail with a message instead of passing NULL to printf("%s")/strlen. */
    check(output != NULL, "utf8proc_map failed for NFKC_Casefold+stripna");
    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
    check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
    check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
    free(output);

    output = utf8proc_NFKC_Casefold(input);
    check(output != NULL, "utf8proc_NFKC_Casefold failed");
    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
    check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
    check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
    /* The second result is heap-allocated as well; release it. */
    free(output);
}

예제 #6

0

파일 보기

파일: normalize.c 프로젝트: SigTill/libpostal

/*
 * Normalize a UTF-8 string according to the NORMALIZE_STRING_* bits set in
 * `options`.
 *
 * - NORMALIZE_STRING_TRIM trims `str` in place via string_trim().
 * - NORMALIZE_STRING_DECOMPOSE / _STRIP_ACCENTS / _LOWERCASE are translated
 *   into utf8proc options and applied with a single utf8proc_map() call,
 *   which produces a newly allocated string.
 * - NORMALIZE_STRING_REPLACE_HYPHENS replaces '-' with ' ' in place, on the
 *   utf8proc result if there is one, otherwise on `str` itself.
 *
 * Returns:
 *   - the newly allocated string when any utf8proc-backed option was applied
 *     (the caller is responsible for free()ing it);
 *   - `str` itself when only NORMALIZE_STRING_REPLACE_HYPHENS applied;
 *   - NULL when no option produced a result, when `str` is NULL, or when
 *     utf8proc_map() failed.
 *
 * NOTE(review): callers cannot tell from the pointer alone whether the
 * result is owned; compare it against `str` before freeing.
 */
char *normalize_string_utf8(char *str, uint64_t options) {
    int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
    uint8_t *utf8proc_normalized = NULL;

    bool have_utf8proc_options = false;

    if (str == NULL) {
        return NULL;
    }

    if (options & NORMALIZE_STRING_TRIM) {
        string_trim(str);
    }

    if (options & NORMALIZE_STRING_DECOMPOSE) {
        have_utf8proc_options = true;
        utf8proc_options |= UTF8PROC_OPTIONS_NFD;
    }

    if (options & NORMALIZE_STRING_STRIP_ACCENTS) {
        have_utf8proc_options = true;
        utf8proc_options |= UTF8PROC_OPTIONS_STRIP_ACCENTS;
    }

    if (options & NORMALIZE_STRING_LOWERCASE) {
        have_utf8proc_options = true;
        utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
    }

    char *normalized = NULL;

    if (have_utf8proc_options) {
        /* A negative return means the mapping failed (e.g. invalid UTF-8).
         * Bail out here so the hyphen replacement below never receives a
         * NULL pointer. */
        if (utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options) < 0
            || utf8proc_normalized == NULL) {
            return NULL;
        }

        normalized = (char *)utf8proc_normalized;
        str = normalized;
    }

    if (options & NORMALIZE_STRING_REPLACE_HYPHENS) {
        string_replace(str, '-', ' ');
        normalized = str;
    }

    return normalized;
}

예제 #7

0

파일 보기

파일: main.cpp 프로젝트: lavanoid/CIAngel

/* Menu Action Functions */
/**
 * Prompt the user for a search string, collect every entry of `sourceData`
 * that `match` accepts, and show the results in the title selection menu.
 *
 * For each entry the function:
 *   - skips it when the configured region filter is not "off" and differs
 *     from the entry's "region";
 *   - skips it when "encTitleKey" is null;
 *   - lazily creates an "ascii_name" field by normalizing "name" with
 *     utf8proc (falling back to the unmodified name if that fails);
 *   - calls `match`, and on success records the entry together with the
 *     score it reported, provided characters 4..7 of the title id are one
 *     of "0000", "008c", "000e" or "8004".
 *
 * @param match  Predicate deciding whether an entry matches the search
 *               string; it writes the match score to `outScore`.
 */
void action_search(bool (*match)(std::string &searchString, Json::Value &gameData, int &outScore))
{
    HB_Keyboard sHBKB;
    bool bKBCancelled = false;

    consoleClear();

    printf("Please enter text to search for:\n");
    std::string searchString = getInput(&sHBKB, bKBCancelled);
    if (bKBCancelled)
    {
        return;
    }

    // User has entered their input, so let's scrap the keyboard
    clear_screen(GFX_BOTTOM);

    std::vector<game_item> display_output;
    int outScore;

    // The region filter is read once up front instead of on every iteration.
    const std::string regionFilter = config.GetRegionFilter();
    const bool filterByRegion = (regionFilter != "off");

    for (unsigned int i = 0; i < sourceData.size(); i++) {
        // Check the region filter
        if (filterByRegion && sourceData[i]["region"].asString() != regionFilter) {
            continue;
        }

        // Check that the encTitleKey isn't null
        if (sourceData[i]["encTitleKey"].isNull())
        {
            continue;
        }

        // Create an ASCII version of the name if one doesn't exist yet
        if (sourceData[i]["ascii_name"].isNull())
        {
            // Normalize the name down to ASCII
            utf8proc_option_t options = (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT | UTF8PROC_STRIPMARK | UTF8PROC_STRIPCC);
            utf8proc_uint8_t *szName = nullptr;
            utf8proc_uint8_t *str = (utf8proc_uint8_t*)sourceData[i]["name"].asCString();

            if (utf8proc_map(str, 0, &szName, options) >= 0 && szName != nullptr)
            {
                sourceData[i]["ascii_name"] = (const char*)szName;
            }
            else
            {
                // Normalization failed (e.g. invalid UTF-8): keep the raw
                // name rather than handing a null pointer to Json::Value.
                const std::string rawName = sourceData[i]["name"].asString();
                sourceData[i]["ascii_name"] = rawName;
            }

            free(szName);
        }

        if (!match(searchString, sourceData[i], outScore))
        {
            continue;
        }

        game_item item;
        item.score = outScore;
        item.index = i;

        switch(sourceDataType) {
        case JSON_TYPE_WINGS:
          item.titleid = sourceData[i]["titleid"].asString();
          item.titlekey = sourceData[i]["enckey"].asString();
          item.name = sourceData[i]["ascii_name"].asString();
          item.region = sourceData[i]["region"].asString();
          item.code = sourceData[i]["code"].asString();
          break;
        case JSON_TYPE_ONLINE:
          item.titleid = sourceData[i]["titleID"].asString();
          item.titlekey = sourceData[i]["encTitleKey"].asString();
          item.name = sourceData[i]["ascii_name"].asString();
          item.region = sourceData[i]["region"].asString();
          item.code = sourceData[i]["serial"].asString();
          break;
        default:
          // Unknown source type: item fields stay empty and the entry is
          // rejected by the length check below.
          break;
        }

        // substr(4,4) throws std::out_of_range for ids shorter than 4
        // characters, and ids shorter than 8 can never equal one of the
        // 4-character type codes, so reject them up front.
        if (item.titleid.size() < 8)
        {
            continue;
        }

        std::string typeCheck = item.titleid.substr(4,4);
        //if title id belongs to gameapp/dlc/update/dsiware, use it. if not, ignore. case sensitve of course
        if(typeCheck == "0000" || typeCheck == "008c" || typeCheck == "000e" || typeCheck == "8004"){
            display_output.push_back(item);
        }
    }

    unsigned int display_amount = display_output.size();

    // We technically have 30 rows to work with, minus 2 for header/footer. But stick with 20 entries for now

    if (display_amount == 0)
    {
        printf("No matching titles found.\n");
        wait_key_specific("\nPress A to return.\n", KEY_A);
        return;
    }

    // sort similar names by fuzzy score
    if(display_amount>1) {
        std::sort(display_output.begin(), display_output.end(), compareByScore);
    }

    std::string mode_text;
    switch (config.GetMode())
    {
        case CConfig::Mode::DOWNLOAD_CIA:
            mode_text = "Create CIA";
        break;
        case CConfig::Mode::INSTALL_CIA:
            mode_text = "Install CIA";
        break;
        case CConfig::Mode::INSTALL_TICKET:
            mode_text = "Create Ticket";
        break;
    }

    // Bounded formatting: the text can never overrun the fixed-size buffers,
    // and %u matches the unsigned result count.
    char footer[51];
    char header[51];
    snprintf(header, sizeof(header), "Select a Title (found %u results)", display_amount);
    snprintf(footer, sizeof(footer), "Press A to %s. Press X to queue.", mode_text.c_str());
    titles_multkey_draw(header, footer, 1, &display_output, &display_output, menu_search_keypress);
}

예제 #8

0

파일 보기

파일: graphemetest.c 프로젝트: jlec/utf8proc

/*
 * Grapheme-break test driver.
 *
 * Reads the test file named by argv[1] line by line. Each non-comment line
 * is converted into a UTF-8 string `src` in which '/' marks an expected
 * grapheme break. The same text without the '/' markers is then passed
 * through utf8proc_map() with UTF8PROC_CHARBOUND; the 0xff boundary bytes
 * in the result are rewritten to '/' and the result must equal `src`.
 *
 * Returns 0 once the whole file has been read; failed checks are reported
 * through check().
 */
int main(int argc, char **argv)
{
    char *buf = NULL;
    size_t bufsize = 0;
    FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
    utf8proc_uint8_t src[1024];
    
    check(f != NULL, "error opening GraphemeBreakTest.txt");
    while (getline(&buf, &bufsize, f) > 0) {
        size_t bi = 0, si = 0;
        lineno += 1;
        
        if (lineno % 100 == 0)
            printf("checking line %zd...\n", lineno);
        
        if (buf[0] == '#') continue;
        
        /* Translate the line into `src`: '/' is kept as a break marker,
           '+' (no break) is dropped, everything else is a codepoint. */
        while (buf[bi]) {
            bi = skipspaces(buf, bi);
            if (buf[bi] == '/') { /* grapheme break */
                src[si++] = '/';
                bi++;
            }
            else if (buf[bi] == '+') { /* no break */
                bi++;
            }
            else if (buf[bi] == '#') { /* start of comments */
                break;
            }
            else { /* hex-encoded codepoint */
                /* NOTE(review): encode() presumably writes the NUL-terminated
                   UTF-8 form to src+si and returns how far to advance in buf
                   (plus one) -- confirm against its definition. */
                bi += encode((char*) (src + si), buf + bi) - 1;
                while (src[si]) ++si; /* advance to NUL termination */
            }
        }
        if (si && src[si-1] == '/')
            --si; /* no break after final grapheme */
        src[si] = 0; /* NUL-terminate */
        
        if (si) {
            utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
            size_t i = 0, j = 0;
            utf8proc_ssize_t glen;
            utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
            while (i < si) {
                if (src[i] != '/')
                    utf8[j++] = src[i++];
                else
                    i++;
            }
            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
                 /* the test file contains surrogate codepoints, which are only for UTF-16 */
                 printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
            }
            else {
                 check(glen >= 0, "utf8proc_map error = %s",
                       utf8proc_errmsg(glen));
                 for (i = 0; i <= glen; ++i)
                      if (g[i] == 0xff)
                           g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
                 check(!strcmp((char*)g, (char*)src),
                       "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
            }
            free(g);
        }
    }
    fclose(f);
    free(buf); /* getline() allocated/resized this buffer; release it */
    printf("Passed tests after %zd lines!\n", lineno);
    return 0;
}