/*
 * Tests charset conversion functions:
 *   string_iconv
 *   string_iconv_to_internal
 *   string_iconv_from_internal
 *   string_iconv_fprintf
 */

TEST(String, Iconv)
{
    const char *noel_utf8 = "no\xc3\xabl";  /* noël, encoded as UTF-8 */
    const char *noel_iso = "no\xebl";       /* same word, single-byte form */
    /*
     * NOTE(review): "str" is not referenced directly below; presumably it
     * is the scratch variable assigned (and released) by WEE_TEST_STR --
     * confirm against the macro definition before removing it.
     */
    char *str;
    FILE *f;

    /* string_iconv: NULL / empty / plain ASCII input, then both directions */
    WEE_TEST_STR(NULL, string_iconv (0, NULL, NULL, NULL));
    WEE_TEST_STR("", string_iconv (0, NULL, NULL, ""));
    WEE_TEST_STR("abc", string_iconv (0, NULL, NULL, "abc"));
    WEE_TEST_STR("abc", string_iconv (1, "UTF-8", "ISO-8859-15", "abc"));
    WEE_TEST_STR(noel_iso,
                 string_iconv (1, "UTF-8", "ISO-8859-15", noel_utf8));
    WEE_TEST_STR(noel_utf8,
                 string_iconv (0, "ISO-8859-15", "UTF-8", noel_iso));

    /* string_iconv_to_internal */
    WEE_TEST_STR(NULL, string_iconv_to_internal (NULL, NULL));
    WEE_TEST_STR("", string_iconv_to_internal (NULL, ""));
    WEE_TEST_STR("abc", string_iconv_to_internal (NULL, "abc"));
    WEE_TEST_STR(noel_utf8,
                 string_iconv_to_internal ("ISO-8859-15", noel_iso));

    /* string_iconv_from_internal */
    WEE_TEST_STR(NULL, string_iconv_from_internal (NULL, NULL));
    WEE_TEST_STR("", string_iconv_from_internal (NULL, ""));
    WEE_TEST_STR("abc", string_iconv_from_internal (NULL, "abc"));
    WEE_TEST_STR(noel_iso,
                 string_iconv_from_internal ("ISO-8859-15", noel_utf8));

    /* string_iconv_fprintf: output is discarded, only return codes matter */
    f = fopen ("/dev/null", "w");
    /*
     * Fail the test right here if the sink could not be opened, instead of
     * handing a NULL stream to the function under test.
     */
    LONGS_EQUAL(1, (f != NULL));
    LONGS_EQUAL(0, string_iconv_fprintf (f, NULL));
    LONGS_EQUAL(1, string_iconv_fprintf (f, "abc"));
    LONGS_EQUAL(1, string_iconv_fprintf (f, noel_utf8));
    LONGS_EQUAL(1, string_iconv_fprintf (f, noel_iso));
    fclose (f);
}
/* Appends one code point to the buffer as four little-endian UTF-32 bytes. */
static void lyrics_push_utf32le(string_t *out, uint32_t cp)
{
    string_push_back(out, (char)(cp & 0xff));
    string_push_back(out, (char)((cp >> 8) & 0xff));
    string_push_back(out, (char)((cp >> 16) & 0xff));
    string_push_back(out, (char)((cp >> 24) & 0xff));
}

/**
 * LyricsWiki parsing strategy:
 *  - start from the first hit of "<div class='lyricbox'>"
 *  - search until the first numeric HTML entity ("&#NNN;")
 *  - convert entities to UTF-32 and "<br />"s to newlines
 *  - if more than 48 characters pass without an HTML entity (allows up to
 *    8 "<br />"s, for instance), stop
 *
 * The collected UTF-32 data is then converted with string_iconv().
 *
 * @param page  NUL-terminated HTML of the lyrics page; NULL is tolerated.
 * @return      The result of string_release() on the converted text, or
 *              NULL when page is NULL, the lyric box marker is missing, or
 *              the conversion yields NULL.
 *              NOTE(review): ownership of the returned buffer presumably
 *              passes to the caller -- confirm against string_release().
 */
static char *parse_lyrics_page(char *page)
{
    /* Longest run of characters allowed between two entities. */
    enum { MAX_ENTITY_GAP = 48 };

    char *p;
    int gap = 0;
    string_t *string, *result;

    if (!page) {
        return NULL;
    }

    p = strstr(page, "<div class='lyricbox'>");
    if (!p) {
        return NULL;
    }

    string = string_new();
    for (; *p != '\0'; ++p) {
        /* The gap limit only applies once something has been collected. */
        if (string_size(string) > 0 && gap > MAX_ENTITY_GAP) {
            break;
        }
        ++gap;

        if (!strncmp(p, "&#", 2)) {
            unsigned int code;

            /*
             * Parse into a plain unsigned int so the conversion specifier
             * matches the argument type; the field width keeps the value
             * well inside the range of a 32-bit unsigned.
             */
            if (sscanf(p + 2, "%9u;", &code) < 1) {
                continue;
            }
            /*
             * Skip values that are not usable Unicode scalar values: NUL
             * would terminate the returned C string early, and surrogates
             * or out-of-range values are not valid UTF-32.
             */
            if (code == 0 || code > 0x10FFFF
                || (code >= 0xD800 && code <= 0xDFFF)) {
                continue;
            }
            gap = 0;
            lyrics_push_utf32le(string, (uint32_t)code);
        } else if (!strncmp(p, "<br />", 6)) {
            /* Line breaks deliberately do not reset the gap counter. */
            lyrics_push_utf32le(string, (uint32_t)'\n');
        }
    }

    /*
     * The buffer holds BOM-less little-endian code units, so name the byte
     * order explicitly instead of leaving it to the iconv implementation.
     * NOTE(review): argument order (buffer, to, from) is kept from the
     * original call -- confirm against string_iconv()'s declaration.
     */
    result = string_iconv(string, "UTF-8", "UTF-32LE");
    string_free(string);
    if (!result) {
        /* NOTE(review): assumes string_iconv() signals failure with NULL. */
        return NULL;
    }

    return string_release(result);
}