Beispiel #1
0
/// \threadsafe
QList<QByteArray> QIcuCodec::availableCodecs()
{
    QList<QByteArray> codecs;
    int n = ucnv_countAvailable();
    for (int i = 0; i < n; ++i) {
        const char *name = ucnv_getAvailableName(i);

        UErrorCode error = U_ZERO_ERROR;
        const char *standardName = ucnv_getStandardName(name, "MIME", &error);
        if (U_FAILURE(error) || !standardName) {
            error = U_ZERO_ERROR;
            standardName = ucnv_getStandardName(name, "IANA", &error);
        }
        if (U_FAILURE(error))
            continue;

        error = U_ZERO_ERROR;
        int ac = ucnv_countAliases(standardName, &error);
        if (U_FAILURE(error))
            continue;
        for (int j = 0; j < ac; ++j) {
            error = U_ZERO_ERROR;
            const char *alias = ucnv_getAlias(standardName, j, &error);
            if (!U_SUCCESS(error))
                continue;
            codecs += alias;
        }
    }

    // handled by Qt and not in ICU:
    codecs += "TSCII";

    return codecs;
}
// If a charset listed in the IANA Charset Registry is supported by an implementation
// of the Java platform then its canonical name must be the name listed in the registry.
// Many charsets are given more than one name in the registry, in which case the registry
// identifies one of the names as MIME-preferred. If a charset has more than one registry
// name then its canonical name must be the MIME-preferred name and the other names in
// the registry must be valid aliases. If a supported charset is not listed in the IANA
// registry then its canonical name must begin with one of the strings "X-" or "x-".
static jstring getJavaCanonicalName(JNIEnv* env, const char* icuCanonicalName) {
  UErrorCode status = U_ZERO_ERROR;

  // Check to see if this is a well-known MIME or IANA name.
  const char* cName = NULL;
  if ((cName = ucnv_getStandardName(icuCanonicalName, "MIME", &status)) != NULL) {
    return env->NewStringUTF(cName);
  } else if ((cName = ucnv_getStandardName(icuCanonicalName, "IANA", &status)) != NULL) {
    return env->NewStringUTF(cName);
  }

  // Check to see if an alias already exists with "x-" prefix, if yes then
  // make that the canonical name.
  int32_t aliasCount = ucnv_countAliases(icuCanonicalName, &status);
  for (int i = 0; i < aliasCount; ++i) {
    const char* name = ucnv_getAlias(icuCanonicalName, i, &status);
    if (name != NULL && name[0] == 'x' && name[1] == '-') {
      return env->NewStringUTF(name);
    }
  }

  // As a last resort, prepend "x-" to any alias and make that the canonical name.
  status = U_ZERO_ERROR;
  const char* name = ucnv_getStandardName(icuCanonicalName, "UTR22", &status);
  if (name == NULL && strchr(icuCanonicalName, ',') != NULL) {
    name = ucnv_getAlias(icuCanonicalName, 1, &status);
  }
  // If there is no UTR22 canonical name then just return the original name.
  if (name == NULL) {
    name = icuCanonicalName;
  }
  std::unique_ptr<char[]> result(new char[2 + strlen(name) + 1]);
  strcpy(&result[0], "x-");
  strcat(&result[0], name);
  return env->NewStringUTF(&result[0]);
}
Beispiel #3
0
QList<QByteArray> QIcuCodec::aliases() const
{
    UErrorCode error = U_ZERO_ERROR;

    int n = ucnv_countAliases(m_name, &error);

    QList<QByteArray> aliases;
    for (int i = 0; i < n; ++i) {
        const char *a = ucnv_getAlias(m_name, i, &error);
        // skip the canonical name
        if (!a || !qstrcmp(a, m_name))
            continue;
        aliases += a;
    }

    return aliases;
}
/**
 * Get all available ICU charsets and their aliases (elems 2,3,...)
 *
 * @return R list object; element name == ICU charset canonical name;
 * elements are character vectors (aliases)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_list()
{
   R_len_t c = (R_len_t)ucnv_countAvailable();

   STRI__ERROR_HANDLER_BEGIN(0)
   SEXP ret;
   SEXP names;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, c));
   STRI__PROTECT(names = Rf_allocVector(STRSXP, c));

   for (R_len_t i=0; i<c; ++i) {
      const char* canonical_name = ucnv_getAvailableName(i);
      if (!canonical_name) {
         SET_STRING_ELT(names, i, NA_STRING);
         continue;
      }

      SET_STRING_ELT(names, i, Rf_mkChar(canonical_name));

      UErrorCode status = U_ZERO_ERROR;
      R_len_t ci = (R_len_t)ucnv_countAliases(canonical_name, &status);
      if (U_FAILURE(status) || ci <= 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(NA_STRING));
      else {
         SEXP aliases;
         STRI__PROTECT(aliases = Rf_allocVector(STRSXP, ci));
         for (R_len_t j=0; j<ci; ++j) {
            status = U_ZERO_ERROR;
            const char* alias = ucnv_getAlias(canonical_name, j, &status);
            if (U_FAILURE(status) || !alias)
               SET_STRING_ELT(aliases, j, NA_STRING);
            else
               SET_STRING_ELT(aliases, j, Rf_mkChar(alias));
         }
         SET_VECTOR_ELT(ret, i, aliases);
         STRI__UNPROTECT(1);
      }
   }

   Rf_setAttrib(ret, R_NamesSymbol, names);
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* no special action on error */})
}
static const char* getICUCanonicalName(const char* name) {
  UErrorCode error = U_ZERO_ERROR;
  const char* canonicalName = NULL;
  if ((canonicalName = ucnv_getCanonicalName(name, "MIME", &error)) != NULL) {
    return canonicalName;
  } else if ((canonicalName = ucnv_getCanonicalName(name, "IANA", &error)) != NULL) {
    return canonicalName;
  } else if ((canonicalName = ucnv_getCanonicalName(name, "", &error)) != NULL) {
    return canonicalName;
  } else if ((canonicalName = ucnv_getAlias(name, 0, &error)) != NULL) {
    // We have some aliases in the form x-blah .. match those first.
    return canonicalName;
  } else if (strstr(name, "x-") == name) {
    // Check if the converter can be opened with the name given.
    error = U_ZERO_ERROR;
    icu::LocalUConverterPointer cnv(ucnv_open(name + 2, &error));
    if (U_SUCCESS(error)) {
      return name + 2;
    }
  }
  return NULL;
}
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
#if !defined(USING_SYSTEM_ICU)
        const char* primaryStandard = "HTML";
        const char* secondaryStandard = "MIME";
#else
        const char* primaryStandard = "MIME";
        const char* secondaryStandard = "IANA";
#endif
        const char* standardName = ucnv_getStandardName(name, primaryStandard, &error);
        if (U_FAILURE(error) || !standardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used.
            standardName = ucnv_getStandardName(name, secondaryStandard, &error);
            if (U_FAILURE(error) || !standardName)
                continue;
        }

        // A number of these aliases are handled in Chrome's copy of ICU, but
        // Chromium can be compiled with the system ICU.

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
#if defined(USING_SYSTEM_ICU)
        if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80"))
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version, but
        // per HTML5, the canonical name still should be EUC-KR.
        else if (!strcmp(standardName, "EUC-KR") || !strcmp(standardName, "KSC_5601") || !strcmp(standardName, "cp1363"))
            standardName = "EUC-KR";
        // And so on.
        else if (!strcasecmp(standardName, "iso-8859-9")) // This name is returned in different case by ICU 3.2 and 3.6.
            standardName = "windows-1254";
        else if (!strcmp(standardName, "TIS-620"))
            standardName = "windows-874";
#endif

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(name, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != standardName)
                    registrar(alias, standardName);
            }
    }

    // These two entries have to be added here because ICU's converter table
    // cannot have both ISO-8859-8-I and ISO-8859-8.
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("logical", "ISO-8859-8-I");

#if defined(USING_SYSTEM_ICU)
    // Additional alias for MacCyrillic not present in ICU.
    registrar("maccyrillic", "x-mac-cyrillic");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("x-mac-roman", "macintosh");
    registrar("x-mac-ukrainian", "x-mac-cyrillic");
    registrar("cn-big5", "Big5");
    registrar("x-x-big5", "Big5");
    registrar("cn-gb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("x-euc-cn", "GBK");
    registrar("x-gbk", "GBK");
    registrar("koi", "KOI8-R");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso-8859-11", "windows-874");
    registrar("iso8859-11", "windows-874");
    registrar("dos-874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("x-cp1250", "windows-1250");
    registrar("x-cp1251", "windows-1251");
    registrar("x-euc", "EUC-JP");
    registrar("x-windows-949", "EUC-KR");
    registrar("KSC5601", "EUC-KR");
    registrar("x-uhc", "EUC-KR");
    registrar("shift-jis", "Shift_JIS");

    // Alternative spelling of ISO encoding names.
    registrar("ISO8859-1", "ISO-8859-1");
    registrar("ISO8859-2", "ISO-8859-2");
    registrar("ISO8859-3", "ISO-8859-3");
    registrar("ISO8859-4", "ISO-8859-4");
    registrar("ISO8859-5", "ISO-8859-5");
    registrar("ISO8859-6", "ISO-8859-6");
    registrar("ISO8859-7", "ISO-8859-7");
    registrar("ISO8859-8", "ISO-8859-8");
    registrar("ISO8859-8-I", "ISO-8859-8-I");
    registrar("ISO8859-9", "ISO-8859-9");
    registrar("ISO8859-10", "ISO-8859-10");
    registrar("ISO8859-13", "ISO-8859-13");
    registrar("ISO8859-14", "ISO-8859-14");
    registrar("ISO8859-15", "ISO-8859-15");
    // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label
    // listed in WHATWG Encoding Living Standard (http://encoding.spec.whatwg.org/ ).

    // Additional aliases present in the WHATWG Encoding Standard
    // and Firefox (as of Oct 2014), but not in the upstream ICU.
    // Three entries for windows-1252 need not be listed here because
    // TextCodecLatin1 registers them.
    registrar("csiso58gb231280", "GBK");
    registrar("csiso88596e", "ISO-8859-6");
    registrar("csiso88596i", "ISO-8859-6");
    registrar("csiso88598e", "ISO-8859-8");
    registrar("gb_2312", "GBK");
    registrar("iso88592", "ISO-8859-2");
    registrar("iso88593", "ISO-8859-3");
    registrar("iso88594", "ISO-8859-4");
    registrar("iso88595", "ISO-8859-5");
    registrar("iso88596", "ISO-8859-6");
    registrar("iso88597", "ISO-8859-7");
    registrar("iso88598", "ISO-8859-8");
    registrar("iso88599", "windows-1254");
    registrar("iso885910", "ISO-8859-10");
    registrar("iso885911", "windows-874");
    registrar("iso885913", "ISO-8859-13");
    registrar("iso885914", "ISO-8859-14");
    registrar("iso885915", "ISO-8859-15");
    registrar("iso_8859-2", "ISO-8859-2");
    registrar("iso_8859-3", "ISO-8859-3");
    registrar("iso_8859-4", "ISO-8859-4");
    registrar("iso_8859-5", "ISO-8859-5");
    registrar("iso_8859-6", "ISO-8859-6");
    registrar("iso_8859-7", "ISO-8859-7");
    registrar("iso_8859-8", "ISO-8859-8");
    registrar("iso_8859-9", "windows-1254");
    registrar("iso_8859-15", "ISO-8859-15");
    registrar("koi8_r", "KOI8-R");
    registrar("x-cp1253", "windows-1253");
    registrar("x-cp1254", "windows-1254");
    registrar("x-cp1255", "windows-1255");
    registrar("x-cp1256", "windows-1256");
    registrar("x-cp1257", "windows-1257");
    registrar("x-cp1258", "windows-1258");
#endif
}
Beispiel #7
0
int main(int argc, char **argv)
{
    FILE *outfile;
    int ret = 0;
    int seenf = 0;

    size_t bufsz = DEFAULT_BUFSZ;

    const char *fromcpage = 0;
    const char *tocpage = 0;
    const char *translit = 0;
    const char *outfilestr = 0;
    int fallback = 0;

    UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
    const void *fromuctxt = 0;
    UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
    const void *touctxt = 0;

    char **iter;
    char **end = argv + argc;

    const char *pname;

    int printConvs = 0, printCanon = 0;
    const char *printName = 0;
    int printTranslits = 0;

    int verbose = 0;

    // Get and prettify pname.
    pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
#ifdef WIN32
    if (!pname) {
        pname = uprv_strrchr(*argv, '/');
    }
#endif
    if (!pname) {
        pname = *argv;
    } else {
        ++pname;
    }

    // First, get the arguments from command-line
    // to know the codepages to convert between

    // XXX When you add to this loop, you need to add to the similar loop
    // below.

    for (iter = argv + 1; iter != end; iter++) {
        // Check for from charset
        if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
            iter++;
            if (iter != end)
                fromcpage = *iter;
            else
                usage(pname, 1);
        } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
            iter++;
            if (iter != end)
                tocpage = *iter;
            else
                usage(pname, 1);
        } else if (strcmp("-x", *iter) == 0) {
            iter++;
            if (iter != end)
                translit = *iter;
            else
                usage(pname, 1);
        } else if (!strcmp("--fallback", *iter)) {
            fallback = 1;
        } else if (!strcmp("--no-fallback", *iter)) {
            fallback = 0;
        } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
            iter++;
            if (iter != end) {
                bufsz = atoi(*iter);
                if ((int) bufsz <= 0) {
                    initMsg(pname);
                    UnicodeString str(*iter);
                    initMsg(pname);
                    u_wmsg(stderr, "badBlockSize", str.getBuffer());
                    return 3;
                }
            } else {
                usage(pname, 1);
            }
        } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
            if (printTranslits) {
                usage(pname, 1);
            }
            printConvs = 1;
        } else if (strcmp("--default-code", *iter) == 0) {
            if (printTranslits) {
                usage(pname, 1);
            }
            printName = ucnv_getDefaultName();
        } else if (strcmp("--list-code", *iter) == 0) {
            if (printTranslits) {
                usage(pname, 1);
            }

            iter++;
            if (iter != end) {
                UErrorCode e = U_ZERO_ERROR;
                printName = ucnv_getAlias(*iter, 0, &e);
                if (U_FAILURE(e) || !printName) {
                    UnicodeString str(*iter);
                    initMsg(pname);
                    u_wmsg(stderr, "noSuchCodeset", str.getBuffer());
                    return 2;
                }
            } else
                usage(pname, 1);
        } else if (strcmp("--canon", *iter) == 0) {
            printCanon = 1;
        } else if (strcmp("-L", *iter) == 0
            || !strcmp("--list-transliterators", *iter)) {
            if (printConvs) {
                usage(pname, 1);
            }
            printTranslits = 1;
        } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
            || !strcmp("--help", *iter)) {
            usage(pname, 0);
        } else if (!strcmp("-c", *iter)) {
            fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
        } else if (!strcmp("--to-callback", *iter)) {
            iter++;
            if (iter != end) {
                const struct callback_ent *cbe = findCallback(*iter);
                if (cbe) {
                    fromucallback = cbe->fromu;
                    fromuctxt = cbe->fromuctxt;
                } else {
                    UnicodeString str(*iter);
                    initMsg(pname);
                    u_wmsg(stderr, "unknownCallback", str.getBuffer());
                    return 4;
                }
            } else {
                usage(pname, 1);
            }
        } else if (!strcmp("--from-callback", *iter)) {
            iter++;
            if (iter != end) {
                const struct callback_ent *cbe = findCallback(*iter);
                if (cbe) {
                    toucallback = cbe->tou;
                    touctxt = cbe->touctxt;
                } else {
                    UnicodeString str(*iter);
                    initMsg(pname);
                    u_wmsg(stderr, "unknownCallback", str.getBuffer());
                    return 4;
                }
            } else {
                usage(pname, 1);
            }
        } else if (!strcmp("-i", *iter)) {
            toucallback = UCNV_TO_U_CALLBACK_SKIP;
        } else if (!strcmp("--callback", *iter)) {
            iter++;
            if (iter != end) {
                const struct callback_ent *cbe = findCallback(*iter);
                if (cbe) {
                    fromucallback = cbe->fromu;
                    fromuctxt = cbe->fromuctxt;
                    toucallback = cbe->tou;
                    touctxt = cbe->touctxt;
                } else {
                    UnicodeString str(*iter);
                    initMsg(pname);
                    u_wmsg(stderr, "unknownCallback", str.getBuffer());
                    return 4;
                }
            } else {
                usage(pname, 1);
            }
        } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
            verbose = 0;
        } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
            verbose = 1;
        } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
            printf("%s v2.0\n", pname);
            return 0;
        } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
            ++iter;
            if (iter != end && !outfilestr) {
                outfilestr = *iter;
            } else {
                usage(pname, 1);
            }
        } else if (**iter == '-' && (*iter)[1]) {
            usage(pname, 1);
        }
    }

    if (printConvs || printName) {
        return printConverters(pname, printName, printCanon) ? 2 : 0;
    } else if (printTranslits) {
        return printTransliterators(printCanon) ? 3 : 0;
    }

    if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
        fromcpage = ucnv_getDefaultName();
    }
    if (!tocpage || !uprv_strcmp(tocpage, "-")) {
        tocpage = ucnv_getDefaultName();
    }

    // Open the correct output file or connect to stdout for reading input
    if (outfilestr != 0 && strcmp(outfilestr, "-")) {
        outfile = fopen(outfilestr, "wb");
        if (outfile == 0) {
            UnicodeString str1(outfilestr, "");
            UnicodeString str2(strerror(errno), "");
            initMsg(pname);
            u_wmsg(stderr, "cantCreateOutputF",
                str1.getBuffer(), str2.getBuffer());
            return 1;
        }
    } else {
        outfilestr = "-";
        outfile = stdout;
#ifdef WIN32
        if (setmode(fileno(outfile), O_BINARY) == -1) {
            u_wmsg(stderr, "cantSetOutBinMode");
            exit(-1);
        }
#endif
    }

    /* Loop again on the arguments to find all the input files, and
    convert them. XXX Cheap and sloppy. */

    for (iter = argv + 1; iter != end; iter++) {
        if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
            iter++;
        } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
            iter++;
        } else if (strcmp("-x", *iter) == 0) {
            iter++;
        } else if (!strcmp("--fallback", *iter)) {
            ;
        } else if (!strcmp("--no-fallback", *iter)) {
            ;
        } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
            iter++;
        } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
            ;
        } else if (strcmp("--default-code", *iter) == 0) {
            ;
        } else if (strcmp("--list-code", *iter) == 0) {
            ;
        } else if (strcmp("--canon", *iter) == 0) {
            ;
        } else if (strcmp("-L", *iter) == 0
            || !strcmp("--list-transliterators", *iter)) {
            ;
        } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
            || !strcmp("--help", *iter)) {
            ;
        } else if (!strcmp("-c", *iter)) {
            ;
        } else if (!strcmp("--to-callback", *iter)) {
            iter++;
        } else if (!strcmp("--from-callback", *iter)) {
            iter++;
        } else if (!strcmp("-i", *iter)) {
            ;
        } else if (!strcmp("--callback", *iter)) {
            iter++;
        } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
            ;
        } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
            ;
        } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
            ;
        } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
            ++iter;
        } else {
            seenf = 1;
            if (!convertFile
                (pname, fromcpage, toucallback, touctxt, tocpage,
                fromucallback, fromuctxt, fallback, bufsz, translit, *iter,
                outfile, verbose)) {
                goto error_exit;
            }
        }
    }

    if (!seenf) {
        if (!convertFile
            (pname, fromcpage, toucallback, touctxt, tocpage,
            fromucallback, fromuctxt, fallback, bufsz, translit, 0, outfile,
            verbose)) {
            goto error_exit;
        }
    }

    goto normal_exit;
error_exit:
    ret = 1;
normal_exit:

    if (outfile != stdout)
        fclose(outfile);

    return ret;
}
Beispiel #8
0
static int printConverters(const char *pname, const char *lookfor,
    int canon)
{
    UErrorCode err = U_ZERO_ERROR;
    int32_t num;
    uint16_t num_stds;
    const char **stds;

    /* If there is a specified name, just handle that now. */

    if (lookfor) {
        if (!canon) {
            printf("%s\n", lookfor);
            return 0;
        } else {
        /*  Because we are printing a canonical name, we need the
            true converter name. We've done that already except for
            the default name (because we want to print the exact
            name one would get when calling ucnv_getDefaultName()
            in non-canon mode). But since we do not know at this
            point if we have the default name or something else, we
            need to normalize again to the canonical converter
            name. */

            const char *truename = ucnv_getAlias(lookfor, 0, &err);
            if (U_SUCCESS(err)) {
                lookfor = truename;
            } else {
                err = U_ZERO_ERROR;
            }
        }
    }

    /* Print converter names. We come here for one of two reasons: we
       are printing all the names (lookfor was null), or we have a
       single converter to print but in canon mode, hence we need to
       get to it in order to print everything. */

    num = ucnv_countAvailable();
    if (num <= 0) {
        initMsg(pname);
        u_wmsg(stderr, "cantGetNames");
        return -1;
    }
    if (lookfor) {
        num = 1;                /* We know where we want to be. */
    }

    num_stds = ucnv_countStandards();
    stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
    if (!stds) {
        u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
        return -1;
    } else {
        uint16_t s;

        for (s = 0; s < num_stds; ++s) {
            stds[s] = ucnv_getStandard(s, &err);
            if (U_FAILURE(err)) {
                u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
                return -1;
            }
        }
    }

    for (int32_t i = 0; i < num; i++) {
        const char *name;
        uint16_t num_aliases;

        /* Set the name either to what we are looking for, or
        to the current converter name. */

        if (lookfor) {
            name = lookfor;
        } else {
            name = ucnv_getAvailableName(i);
        }

        /* Get all the aliases associated to the name. */

        err = U_ZERO_ERROR;
        num_aliases = ucnv_countAliases(name, &err);
        if (U_FAILURE(err)) {
            printf("%s", name);

            UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
            putchar('\t');
            u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
                u_wmsg_errorName(err));
            return -1;
        } else {
            uint16_t a, s, t;

            /* Write all the aliases and their tags. */

            for (a = 0; a < num_aliases; ++a) {
                const char *alias = ucnv_getAlias(name, a, &err);

                if (U_FAILURE(err)) {
                    UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
                    putchar('\t');
                    u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
                        u_wmsg_errorName(err));
                    return -1;
                }

                printf("%s", alias);

                /* Look (slowly, linear searching) for a tag. */

                if (canon) {
                    for (s = t = 0; s < num_stds; ++s) {
                        const char *standard =
                            ucnv_getStandardName(name, stds[s], &err);
                        if (U_SUCCESS(err) && standard) {
                            if (!strcmp(standard, alias)) {
                                if (!t) {
                                    printf(" {");
                                    t = 1;
                                }
                                printf(" %s", stds[s]);
                            }
                        }
                    }
                    if (t) {
                        printf(" }");
                    }
                }

                /* Move on. */

                if (a < num_aliases - 1) {
                    putchar(a || !canon ? ' ' : '\t');
                }
            }
        }

        /* Terminate this entry. */

        if (canon) {
            putchar('\n');
        } else if (i < num - 1) {
            putchar(' ');
        }
    }

    /* Free temporary data. */

    uprv_free(stds);

    /* Success. */

    return 0;
}
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // Try MIME before trying IANA to pick up commonly used names like
        // 'EUC-JP' instead of horrendeously long names like 
        // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. 
        const char* standardName = ucnv_getStandardName(name, "MIME", &error);
        if (!U_SUCCESS(error) || !standardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used. 
            standardName = ucnv_getStandardName(name, "IANA", &error);
            if (!U_SUCCESS(error) || !standardName)
                continue;
        }

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(standardName, "KSC_5601") == 0 || strcmp(standardName, "EUC-KR") == 0 || strcmp(standardName, "cp1363") == 0)
            standardName = "windows-949";
        // And so on.
        else if (strcasecmp(standardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6.
            standardName = "windows-1254";
        else if (strcmp(standardName, "TIS-620") == 0)
            standardName = "windows-874";

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(name, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != standardName)
                    registrar(alias, standardName);
            }
    }

    // Additional aliases.
    // These are present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4).
    registrar("macroman", "macintosh");
    registrar("maccyrillic", "x-mac-cyrillic");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("xmacroman", "macintosh");
    registrar("xmacukrainian", "x-mac-cyrillic");
    registrar("cnbig5", "Big5");
    registrar("xxbig5", "Big5");
    registrar("cngb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("xeuccn", "GBK");
    registrar("xgbk", "GBK");
    registrar("csISO88598I", "ISO_8859-8-I");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("unicode11utf8", "UTF-8");
    registrar("unicode20utf8", "UTF-8");
    registrar("xunicode20utf8", "UTF-8");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso885911", "windows-874");
    registrar("dos874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("xcp1250", "windows-1250");
    registrar("xcp1251", "windows-1251");
    registrar("xeuc", "EUC-JP");
    registrar("xwindows949", "windows-949");
    registrar("xuhc", "windows-949");

    // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
    // They are not present in ICU 3.2.
    registrar("dos720", "cp864");
    registrar("jis7", "ISO-2022-JP");
}
CF_PRIVATE const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
#define STACK_BUFFER_SIZE (60)
    char buffer[STACK_BUFFER_SIZE];
    const char *result = NULL;
    UErrorCode errorCode = U_ZERO_ERROR;
    uint32_t codepage = 0;

    if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";

    if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows

    if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;

    if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);

    return result;
#undef STACK_BUFFER_SIZE
}
Beispiel #11
0
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU works with either name.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // FIXME: Should we use the "MIME" standard instead of "IANA"?
        const char* standardName = ucnv_getStandardName(name, "IANA", &error);
        if (!U_SUCCESS(error) || !standardName)
            continue;

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(standardName, "KS_C_5601-1987") == 0 || strcmp(standardName, "EUC-KR") == 0)
            standardName = "windows-949-2000";
        // And so on.
        else if (strcmp(standardName, "ISO_8859-9:1989") == 0)
            standardName = "windows-1254";
        else if (strcmp(standardName, "TIS-620") == 0)
            standardName = "windows-874-2000";

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(name, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != standardName)
                    registrar(alias, standardName);
            }
    }

    // Additional aliases.
    // Perhaps we can get these added to ICU.
    registrar("macroman", "macintosh");
    registrar("xmacroman", "macintosh");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("cnbig5", "Big5");
    registrar("cngb", "EUC-CN");
    registrar("csISO88598I", "ISO_8859-8-I");
    registrar("csgb231280", "EUC-CN");
    registrar("dos720", "cp864");
    registrar("dos874", "cp874");
    registrar("jis7", "ISO-2022-JP");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("unicode11utf8", "UTF-8");
    registrar("unicode20utf8", "UTF-8");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("windows874", "windows874-2000");
    registrar("iso885911", "windows874-2000");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("xcp1250", "windows-1250");
    registrar("xcp1251", "windows-1251");
    registrar("xeuc", "EUC-JP");
    registrar("xeuccn", "EUC-CN");
    registrar("xgbk", "EUC-CN");
    registrar("xunicode20utf8", "UTF-8");
    registrar("xwindows949", "windows-949-2000");
    registrar("xxbig5", "Big5");
}
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numConverters = ucnv_countAvailable();
    for (int32_t i = 0; i < numConverters; ++i) {
        const char* canonicalConverterName = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // Try MIME before trying IANA to pick up commonly used names like
        // 'EUC-JP' instead of horrendously long names like 
        // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. 
        const char* webStandardName = ucnv_getStandardName(canonicalConverterName, "MIME", &error);
        if (!U_SUCCESS(error) || !webStandardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used. 
            webStandardName = ucnv_getStandardName(canonicalConverterName, "IANA", &error);
            if (!U_SUCCESS(error) || !webStandardName)
                continue;
        }

        // Any standard encoding overrides should match checks in registerCodecs() below.

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(webStandardName, "GB2312") == 0 || strcmp(webStandardName, "GB_2312-80") == 0)
            webStandardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(webStandardName, "KSC_5601") == 0 || strcmp(webStandardName, "EUC-KR") == 0 || strcmp(webStandardName, "cp1363") == 0)
            webStandardName = "windows-949";
        // And so on.
        // FIXME: strcasecmp is locale sensitive, we should not be using it.
        else if (strcasecmp(webStandardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6.
            webStandardName = "windows-1254";
        else if (strcmp(webStandardName, "TIS-620") == 0)
            webStandardName = "windows-874";

        registrar(webStandardName, webStandardName);

        uint16_t numAliases = ucnv_countAliases(canonicalConverterName, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(canonicalConverterName, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != webStandardName)
                    registrar(alias, webStandardName);
            }
    }

    // Additional aliases.
    // macroman is present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4).
    // FIXME: Do any ports still use such old versions?
    registrar("macroman", "macintosh");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("x-mac-roman", "macintosh");
    registrar("maccyrillic", "x-mac-cyrillic");
    registrar("x-mac-ukrainian", "x-mac-cyrillic");
    registrar("cn-big5", "Big5");
    registrar("x-x-big5", "Big5");
    registrar("cn-gb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("x-euc-cn", "GBK");
    registrar("x-gbk", "GBK");
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso-8859-11", "windows-874");
    registrar("iso8859-11", "windows-874");
    registrar("dos-874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("x-cp1250", "windows-1250");
    registrar("x-cp1251", "windows-1251");
    registrar("x-euc", "EUC-JP");
    registrar("x-windows-949", "windows-949");
    registrar("KSC5601", "windows-949");
    registrar("x-uhc", "windows-949");
    registrar("shift-jis", "Shift_JIS");

    // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
    // They are not present in ICU 3.2.
    registrar("dos-720", "cp864");
    registrar("jis7", "ISO-2022-JP");

    // Alternative spelling of ISO encoding names.
    registrar("ISO8859-1", "ISO-8859-1");
    registrar("ISO8859-2", "ISO-8859-2");
    registrar("ISO8859-3", "ISO-8859-3");
    registrar("ISO8859-4", "ISO-8859-4");
    registrar("ISO8859-5", "ISO-8859-5");
    registrar("ISO8859-6", "ISO-8859-6");
    registrar("ISO8859-7", "ISO-8859-7");
    registrar("ISO8859-8", "ISO-8859-8");
    registrar("ISO8859-8-I", "ISO-8859-8-I");
    registrar("ISO8859-9", "windows-1254");
    registrar("ISO8859-10", "ISO-8859-10");
    registrar("ISO8859-13", "ISO-8859-13");
    registrar("ISO8859-14", "ISO-8859-14");
    registrar("ISO8859-15", "ISO-8859-15");
    // Not registering ISO8859-16, because Firefox (as of version 3.6.6) doesn't know this particular alias,
    // and because older versions of ICU don't support ISO-8859-16 encoding at all.
}
Beispiel #13
0
const char *__hs_ucnv_getAlias(const char *alias, uint16_t n,
			       UErrorCode *pErrorCode)
{
    return ucnv_getAlias(alias, n, pErrorCode);
}