Exemplo n.º 1
0
/* Maps an ICU converter name to a CFStringEncoding.
 *
 * Names of the form "windows-<number>" (matched case-insensitively) are
 * resolved directly through the Windows code page table. Otherwise, when ICU
 * knows the name, its WINDOWS, JAVA and MIME standard names are tried in that
 * order; a standard name is only looked up as a canonical name when it differs
 * from icuName. Returns kCFStringEncodingInvalidId when nothing matches.
 */
CF_PRIVATE CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
    static const char *const fallbackStandards[] = { "JAVA", "MIME" };
    const size_t prefixLength = strlen("windows-");
    UErrorCode errorCode = U_ZERO_ERROR;
    CFStringEncoding encoding;
    const char *name;
    uint32_t codepage;
    char *endPtr;
    int standardIndex;

    // "windows-NNNN" given directly: the whole suffix must be a non-zero decimal number
    if (0 == strncasecmp_l(icuName, "windows-", prefixLength, NULL)) {
        codepage = strtol(icuName + prefixLength, &endPtr, 10);
        if ((0 != codepage) && ('\0' == *endPtr)) return __CFStringEncodingGetFromWindowsCodePage(codepage);
    }

    // Names ICU has no aliases for cannot be resolved any further
    if (0 == ucnv_countAliases(icuName, &errorCode)) return kCFStringEncodingInvalidId;

    // Try WINDOWS platform
    name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
    if (NULL != name) {
        if (0 == strncasecmp_l(name, "windows-", prefixLength, NULL)) {
            codepage = strtol(name + prefixLength, &endPtr, 10);
            if ((0 != codepage) && ('\0' == *endPtr)) return __CFStringEncodingGetFromWindowsCodePage(codepage);
        }

        if (0 != strncasecmp_l(icuName, name, strlen(name), NULL)) {
            encoding = __CFStringEncodingGetFromCanonicalName(name);
            if (kCFStringEncodingInvalidId != encoding) return encoding;
        }
    }

    // Try JAVA platform, then MIME platform
    for (standardIndex = 0; standardIndex < 2; standardIndex++) {
        name = ucnv_getStandardName(icuName, fallbackStandards[standardIndex], &errorCode);
        if (NULL == name) continue;
        if (0 == strncasecmp_l(icuName, name, strlen(name), NULL)) continue;

        encoding = __CFStringEncodingGetFromCanonicalName(name);
        if (kCFStringEncodingInvalidId != encoding) return encoding;
    }

    return kCFStringEncodingInvalidId;
}
Exemplo n.º 2
0
/// Returns the names this backend offers: every ICU alias of each available
/// converter that has a MIME (preferred) or IANA standard name, plus "TSCII".
/// \threadsafe
QList<QByteArray> QIcuCodec::availableCodecs()
{
    QList<QByteArray> codecs;
    const int n = ucnv_countAvailable();
    for (int i = 0; i < n; ++i) {
        const char *name = ucnv_getAvailableName(i);

        UErrorCode error = U_ZERO_ERROR;
        const char *standardName = ucnv_getStandardName(name, "MIME", &error);
        if (U_FAILURE(error) || !standardName) {
            error = U_ZERO_ERROR;
            standardName = ucnv_getStandardName(name, "IANA", &error);
        }
        // The lookup can yield no name without reporting a failure, so test
        // the pointer as well instead of handing NULL to ucnv_countAliases().
        if (U_FAILURE(error) || !standardName)
            continue;

        error = U_ZERO_ERROR;
        const int ac = ucnv_countAliases(standardName, &error);
        if (U_FAILURE(error))
            continue;
        for (int j = 0; j < ac; ++j) {
            error = U_ZERO_ERROR;
            const char *alias = ucnv_getAlias(standardName, j, &error);
            if (!U_SUCCESS(error))
                continue;
            codecs += alias;
        }
    }

    // handled by Qt and not in ICU:
    codecs += "TSCII";

    return codecs;
}
Exemplo n.º 3
0
/* Returns the MIME standard name for `encoding`, falling back to the IANA
 * name; the result is NULL when ICU provides neither. */
static const char *getEncodingName(const char *encoding) {
    UErrorCode status = U_ZERO_ERROR;
    const char *standardName = ucnv_getStandardName(encoding, "MIME", &status);

    if (!standardName) {
        /* Start the fallback lookup from a clean error state. */
        status = U_ZERO_ERROR;
        standardName = ucnv_getStandardName(encoding, "IANA", &status);
    }

    return standardName;
}
Exemplo n.º 4
0
/* Returns the lazily built list of encodings for which ICU has a converter
 * with a MIME standard name. The list is terminated by
 * kCFStringEncodingInvalidId. Returns NULL only if the list could not be
 * allocated (a later call will try again).
 */
const CFStringEncoding *
CFStringGetListOfAvailableEncodings (void)
{
  if (_kCFStringEncodingList == NULL)
    {
      GSMutexLock (&_kCFStringEncodingLock);
      if (_kCFStringEncodingList == NULL)
        {
          int32_t count;
          int32_t idx;
          int32_t used;
          CFStringEncoding *list;

          count = ucnv_countAvailable ();

          /* Build into a local and publish it only once it is complete, so
             the unlocked check above never sees a half-filled list. */
          list = (CFStringEncoding *) CFAllocatorAllocate (NULL,
            sizeof(CFStringEncoding) * (count + 1), 0);

          if (list != NULL)
            {
              used = 0;
              for (idx = 0; idx < count; ++idx)
                {
                  const char *name;
                  CFStringEncoding encoding;
                  /* Fresh error code per converter: ICU functions return
                     immediately when handed a failure code. */
                  UErrorCode err = U_ZERO_ERROR;

                  name = ucnv_getStandardName (ucnv_getAvailableName (idx),
                    "MIME", &err);
                  if (U_FAILURE (err) || name == NULL)
                    continue;

                  encoding = CFStringConvertStandardNameToEncoding (name, -1);
                  /* The invalid id doubles as the terminator, so it must not
                     appear in the middle of the list. */
                  if (encoding == kCFStringEncodingInvalidId)
                    continue;

                  list[used++] = encoding;
                }
              list[used] = kCFStringEncodingInvalidId;
              _kCFStringEncodingList = list;
            }
        }
      GSMutexUnlock (&_kCFStringEncodingLock);
    }
  
  return _kCFStringEncodingList;
}
Exemplo n.º 5
0
// Registers the `create` factory for "ISO-8859-8-I" and for every available
// ICU converter that has a MIME (preferred) or IANA standard name.
void TextCodecICU::registerCodecs(TextCodecRegistrar registrar) {
  // See comment above in registerEncodingNames.
  registrar("ISO-8859-8-I", create, 0);

  const int32_t converterCount = ucnv_countAvailable();
  for (int32_t index = 0; index < converterCount; ++index) {
    const char* converterName = ucnv_getAvailableName(index);

    UErrorCode status = U_ZERO_ERROR;
    const char* registeredName =
        ucnv_getStandardName(converterName, "MIME", &status);
    if (U_FAILURE(status) || !registeredName) {
      // No usable MIME name; retry with the IANA standard.
      status = U_ZERO_ERROR;
      registeredName = ucnv_getStandardName(converterName, "IANA", &status);
    }

    // Converters without a name in either standard are not registered.
    if (U_SUCCESS(status) && registeredName)
      registrar(registeredName, create, 0);
  }
}
Exemplo n.º 6
0
/* Returns the cached system encoding, computing it on first use from ICU's
 * default converter name (MIME standard name first, then IANA). Windows
 * builds always report kCFStringEncodingASCII.
 */
CFStringEncoding
CFStringGetSystemEncoding (void)
{
#if defined(_WIN32)
  return kCFStringEncodingASCII;
#else
  if (_kCFStringSystemEncoding == kCFStringEncodingInvalidId)
    {
      GSMutexLock (&_kCFStringEncodingLock);
      /* Re-test under the lock before computing the value. */
      if (_kCFStringSystemEncoding == kCFStringEncodingInvalidId)
        {
          UErrorCode status = U_ZERO_ERROR;
          const char *converterName = ucnv_getDefaultName ();
          const char *standardName =
            ucnv_getStandardName (converterName, "MIME", &status);

          if (standardName == NULL)
            standardName =
              ucnv_getStandardName (converterName, "IANA", &status);

          /* Stays invalid when neither standard names the converter. */
          _kCFStringSystemEncoding = (standardName != NULL)
            ? CFStringConvertStandardNameToEncoding (standardName, -1)
            : kCFStringEncodingInvalidId;
        }
      GSMutexUnlock (&_kCFStringEncodingLock);
    }
  return _kCFStringSystemEncoding;
#endif
}
// Java's rules for canonical charset names:
//  - A charset listed in the IANA Charset Registry must use the registry name.
//  - When the registry lists several names, the MIME-preferred one is the
//    canonical name and the others must be valid aliases.
//  - A charset that is not in the registry must have a canonical name that
//    begins with "X-" or "x-".
//
// Returns a new Java string holding the canonical name chosen for the given
// ICU converter name according to those rules.
static jstring getJavaCanonicalName(JNIEnv* env, const char* icuCanonicalName) {
  UErrorCode status = U_ZERO_ERROR;

  // A well-known MIME name wins; otherwise use the IANA name if there is one.
  const char* registryName = ucnv_getStandardName(icuCanonicalName, "MIME", &status);
  if (registryName == NULL) {
    registryName = ucnv_getStandardName(icuCanonicalName, "IANA", &status);
  }
  if (registryName != NULL) {
    return env->NewStringUTF(registryName);
  }

  // Not a registry charset: prefer an existing alias that already carries the
  // "x-" prefix.
  const int32_t aliasCount = ucnv_countAliases(icuCanonicalName, &status);
  for (int32_t aliasIndex = 0; aliasIndex < aliasCount; ++aliasIndex) {
    const char* alias = ucnv_getAlias(icuCanonicalName, aliasIndex, &status);
    if (alias == NULL) {
      continue;
    }
    if (alias[0] == 'x' && alias[1] == '-') {
      return env->NewStringUTF(alias);
    }
  }

  // Last resort: build "x-" + <name>, where <name> is the UTR22 name, else
  // (for names containing a comma) the alias at index 1, else the ICU name.
  status = U_ZERO_ERROR;
  const char* baseName = ucnv_getStandardName(icuCanonicalName, "UTR22", &status);
  if (baseName == NULL && strchr(icuCanonicalName, ',') != NULL) {
    baseName = ucnv_getAlias(icuCanonicalName, 1, &status);
  }
  if (baseName == NULL) {
    baseName = icuCanonicalName;
  }

  const size_t baseLength = strlen(baseName);
  std::unique_ptr<char[]> prefixed(new char[2 + baseLength + 1]);
  prefixed[0] = 'x';
  prefixed[1] = '-';
  memcpy(&prefixed[2], baseName, baseLength + 1);  // copies the terminating NUL too
  return env->NewStringUTF(&prefixed[0]);
}
Exemplo n.º 8
0
/* Returns the IANA charset name of `encoding` as a constant string, or NULL
 * when ICU reports a failure or has no IANA name for the converter.
 */
CFStringRef
CFStringConvertEncodingToIANACharSetName (CFStringEncoding encoding)
{
  const char *name;
  const char *cnvName;
  UErrorCode err = U_ZERO_ERROR;

  cnvName = CFStringICUConverterName (encoding);
  name = ucnv_getStandardName (cnvName, "IANA", &err);
  /* A missing IANA name comes back as NULL without a failure code, so the
     pointer has to be checked as well as the error. */
  if (U_FAILURE (err) || name == NULL)
    return NULL;
  /* Using this function here because we don't want to make multiple copies
     of this string. */
  return __CFStringMakeConstantString (name);
}
// Registers the `create` factory for "ISO-8859-8-I", for every available ICU
// converter that has a MIME (preferred) or IANA standard name and is not one
// of the overridden encodings, and for two Windows code pages registered by
// hand. The third registrar argument is the ICU converter name.
void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)
{
    // See comment above in registerEncodingNames.
    UErrorCode status = U_ZERO_ERROR;
    const char* converterName = ucnv_getCanonicalName("ISO-8859-8-I", "IANA", &status);
    ASSERT(U_SUCCESS(status));
    registrar("ISO-8859-8-I", create, converterName);

    const int32_t converterCount = ucnv_countAvailable();
    for (int32_t index = 0; index < converterCount; ++index) {
        converterName = ucnv_getAvailableName(index);

        status = U_ZERO_ERROR;
        const char* standardName = ucnv_getStandardName(converterName, "MIME", &status);
        if (U_FAILURE(status) || !standardName) {
            // No usable MIME name; retry with the IANA standard.
            status = U_ZERO_ERROR;
            standardName = ucnv_getStandardName(converterName, "IANA", &status);
        }
        if (U_FAILURE(status) || !standardName)
            continue;

        // Don't register codecs for overridden encodings.
        const bool isOverridden = !strcmp(standardName, "GB2312")
            || !strcmp(standardName, "GB_2312-80")
            || !strcmp(standardName, "KSC_5601")
            || !strcmp(standardName, "EUC-KR")
            || !strcmp(standardName, "cp1363")
            || !strcasecmp(standardName, "iso-8859-9")
            || !strcmp(standardName, "TIS-620");
        if (isOverridden)
            continue;

        registrar(standardName, create, fastStrDup(converterName));
    }

    // These encodings currently don't have standard names, so we need to register encoders manually.
    // FIXME: Is there a good way to determine the most up to date variant programmatically?
    registrar("windows-874", create, "windows-874-2000");
    registrar("windows-949", create, "windows-949-2000");
}
Exemplo n.º 10
0
// Registers the `newTextCodecICU` factory for "ISO-8859-8-I" and for every
// available ICU converter that has an IANA standard name.
void TextCodecICU::registerExtendedCodecs(TextCodecRegistrar registrar)
{
    // See comment above in registerEncodingNames.
    registrar("ISO-8859-8-I", newTextCodecICU, 0);

    const int32_t converterCount = ucnv_countAvailable();
    for (int32_t index = 0; index < converterCount; ++index) {
        const char* converterName = ucnv_getAvailableName(index);
        UErrorCode status = U_ZERO_ERROR;
        // FIXME: Should we use the "MIME" standard instead of "IANA"?
        const char* ianaName = ucnv_getStandardName(converterName, "IANA", &status);

        // Converters without an IANA name are left unregistered.
        if (U_SUCCESS(status) && ianaName)
            registrar(ianaName, newTextCodecICU, 0);
    }
}
Exemplo n.º 11
0
// Stores in *pbstrName the name of the ICU converter at index iconv: its IANA
// standard name when it has one, otherwise ICU's own converter name. When
// neither is available the method returns S_OK without assigning a name.
STDMETHODIMP LgIcuConverterEnumerator::get_ConverterName(int iconv, BSTR * pbstrName)
{
	BEGIN_COM_METHOD
	ChkComOutPtr(pbstrName);
	StrUtil::InitIcuDataDir();
	UErrorCode status = U_ZERO_ERROR;
	const char * icuName = ucnv_getAvailableName(iconv);
	const char * displayName = ucnv_getStandardName(icuName, "IANA", &status);
	if (!displayName)
	{
		// No IANA name: fall back to the ICU converter name, if there is one.
		if (!icuName)
			return S_OK; // no useable name obtainable.
		displayName = icuName;
	}
	// These names are guaranteed to be 7-bit ASCII (common chars in ASCII and EBCDIC)
	*pbstrName = AsciiToBstr(displayName);
	END_COM_METHOD(g_fact, IID_ILgIcuConverterEnumerator);
}
Exemplo n.º 12
0
/* Checks that ucnv_getStandardName(name, standard) yields `expected`.
 * Returns 1 when the check passes (or when no expectation was given) and 0
 * after logging a failure.
 */
static int dotestname(const char *name, const char *standard, const char *expected) {
    UErrorCode status = U_ZERO_ERROR;
    const char *actual = ucnv_getStandardName(name, standard, &status);

    /* Without an expected value there is nothing to verify. */
    if (!expected) {
        return 1;
    }

    if (!actual) {
        log_err_status(status, "FAIL: could not find %s standard name for %s\n", standard, name);
        return 0;
    }

    /* Getting the caller's own pointer back also counts as a failure. */
    if (name == actual || uprv_strcmp(expected, actual)) {
        log_err("FAIL: expected %s for %s standard name for %s, got %s\n", expected, standard, name, actual);
        return 0;
    }

    return 1;
}
		/// Builds the list of charset names: the MIME standard name of every
		/// available ICU converter that has one, converted to a wide string.
		/// Names that fail the UTF-16 conversion are left out.
		vector<wstring> StringCharsetConverter::makeCharsetList()
		{
			// Capacity of the conversion buffer, in wchar_t units.
			const int kNameBufferLength = 48;

			vector<wstring> list;
			// Keep ICU's own int32_t count instead of truncating it to 16 bits.
			const int32_t n = ucnv_countAvailable();
			wchar_t buf[kNameBufferLength];
			StringCharsetConverter cv;
			for(int32_t i = 0; i < n; ++ i)
			{
				// Fresh error code per converter so one failure cannot
				// affect the lookups that follow.
				UErrorCode err = U_ZERO_ERROR;
				const char *name = ucnv_getStandardName(ucnv_getAvailableName(i), "MIME", &err);
				if(U_FAILURE(err) || name == nullptr)
					continue;
				if(cv.convertToUTF16(buf, kNameBufferLength, name) != static_cast<size_t>(-1))
					list.push_back(buf);
			}

			// Return the local by name: wrapping it in move() would block copy elision.
			return list;
		}
Exemplo n.º 14
0
// Registers encoding names with `registrar`, which is called as
// registrar(alias, canonicalName).
//
// For every available ICU converter that has a name in the primary standard
// (or, failing that, the secondary one), the standard name is registered for
// itself and each ICU alias of the converter is registered as pointing to it.
// The standards are HTML then MIME, or MIME then IANA when USING_SYSTEM_ICU is
// defined. With USING_SYSTEM_ICU a few standard names are remapped first and a
// fixed list of extra aliases is registered at the end.
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
#if !defined(USING_SYSTEM_ICU)
        const char* primaryStandard = "HTML";
        const char* secondaryStandard = "MIME";
#else
        const char* primaryStandard = "MIME";
        const char* secondaryStandard = "IANA";
#endif
        const char* standardName = ucnv_getStandardName(name, primaryStandard, &error);
        if (U_FAILURE(error) || !standardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used.
            standardName = ucnv_getStandardName(name, secondaryStandard, &error);
            // Converters with no name in either standard are skipped entirely.
            if (U_FAILURE(error) || !standardName)
                continue;
        }

        // A number of these aliases are handled in Chrome's copy of ICU, but
        // Chromium can be compiled with the system ICU.

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
#if defined(USING_SYSTEM_ICU)
        if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80"))
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version, but
        // per HTML5, the canonical name still should be EUC-KR.
        else if (!strcmp(standardName, "EUC-KR") || !strcmp(standardName, "KSC_5601") || !strcmp(standardName, "cp1363"))
            standardName = "EUC-KR";
        // And so on.
        else if (!strcasecmp(standardName, "iso-8859-9")) // This name is returned in different case by ICU 3.2 and 3.6.
            standardName = "windows-1254";
        else if (!strcmp(standardName, "TIS-620"))
            standardName = "windows-874";
#endif

        // The canonical name maps to itself.
        registrar(standardName, standardName);

        // Register every ICU alias of this converter under the canonical name.
        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(name, j, &error);
                ASSERT(U_SUCCESS(error));
                // NOTE(review): this compares pointers, not string contents, so
                // after standardName is replaced by a literal above an alias
                // with the same text is presumably registered again - confirm
                // that the registrar tolerates the duplicate.
                if (U_SUCCESS(error) && alias != standardName)
                    registrar(alias, standardName);
            }
    }

    // These two entries have to be added here because ICU's converter table
    // cannot have both ISO-8859-8-I and ISO-8859-8.
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("logical", "ISO-8859-8-I");

#if defined(USING_SYSTEM_ICU)
    // Additional alias for MacCyrillic not present in ICU.
    registrar("maccyrillic", "x-mac-cyrillic");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("x-mac-roman", "macintosh");
    registrar("x-mac-ukrainian", "x-mac-cyrillic");
    registrar("cn-big5", "Big5");
    registrar("x-x-big5", "Big5");
    registrar("cn-gb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("x-euc-cn", "GBK");
    registrar("x-gbk", "GBK");
    registrar("koi", "KOI8-R");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso-8859-11", "windows-874");
    registrar("iso8859-11", "windows-874");
    registrar("dos-874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("x-cp1250", "windows-1250");
    registrar("x-cp1251", "windows-1251");
    registrar("x-euc", "EUC-JP");
    registrar("x-windows-949", "EUC-KR");
    registrar("KSC5601", "EUC-KR");
    registrar("x-uhc", "EUC-KR");
    registrar("shift-jis", "Shift_JIS");

    // Alternative spelling of ISO encoding names.
    registrar("ISO8859-1", "ISO-8859-1");
    registrar("ISO8859-2", "ISO-8859-2");
    registrar("ISO8859-3", "ISO-8859-3");
    registrar("ISO8859-4", "ISO-8859-4");
    registrar("ISO8859-5", "ISO-8859-5");
    registrar("ISO8859-6", "ISO-8859-6");
    registrar("ISO8859-7", "ISO-8859-7");
    registrar("ISO8859-8", "ISO-8859-8");
    registrar("ISO8859-8-I", "ISO-8859-8-I");
    registrar("ISO8859-9", "ISO-8859-9");
    registrar("ISO8859-10", "ISO-8859-10");
    registrar("ISO8859-13", "ISO-8859-13");
    registrar("ISO8859-14", "ISO-8859-14");
    registrar("ISO8859-15", "ISO-8859-15");
    // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label
    // listed in WHATWG Encoding Living Standard (http://encoding.spec.whatwg.org/ ).

    // Additional aliases present in the WHATWG Encoding Standard
    // and Firefox (as of Oct 2014), but not in the upstream ICU.
    // Three entries for windows-1252 need not be listed here because
    // TextCodecLatin1 registers them.
    registrar("csiso58gb231280", "GBK");
    registrar("csiso88596e", "ISO-8859-6");
    registrar("csiso88596i", "ISO-8859-6");
    registrar("csiso88598e", "ISO-8859-8");
    registrar("gb_2312", "GBK");
    registrar("iso88592", "ISO-8859-2");
    registrar("iso88593", "ISO-8859-3");
    registrar("iso88594", "ISO-8859-4");
    registrar("iso88595", "ISO-8859-5");
    registrar("iso88596", "ISO-8859-6");
    registrar("iso88597", "ISO-8859-7");
    registrar("iso88598", "ISO-8859-8");
    registrar("iso88599", "windows-1254");
    registrar("iso885910", "ISO-8859-10");
    registrar("iso885911", "windows-874");
    registrar("iso885913", "ISO-8859-13");
    registrar("iso885914", "ISO-8859-14");
    registrar("iso885915", "ISO-8859-15");
    registrar("iso_8859-2", "ISO-8859-2");
    registrar("iso_8859-3", "ISO-8859-3");
    registrar("iso_8859-4", "ISO-8859-4");
    registrar("iso_8859-5", "ISO-8859-5");
    registrar("iso_8859-6", "ISO-8859-6");
    registrar("iso_8859-7", "ISO-8859-7");
    registrar("iso_8859-8", "ISO-8859-8");
    registrar("iso_8859-9", "windows-1254");
    registrar("iso_8859-15", "ISO-8859-15");
    registrar("koi8_r", "KOI8-R");
    registrar("x-cp1253", "windows-1253");
    registrar("x-cp1254", "windows-1254");
    registrar("x-cp1255", "windows-1255");
    registrar("x-cp1256", "windows-1256");
    registrar("x-cp1257", "windows-1257");
    registrar("x-cp1258", "windows-1258");
#endif
}
Exemplo n.º 15
0
// Looks up (and creates on demand) the codec for the given name.
//
// The name is first normalised to ICU's MIME or IANA standard name. The codec
// cache, the already registered codecs (by name and alias) and the Qt built-in
// codecs are consulted in that order before a new ICU-backed codec is created.
// Names unknown to ICU can only be satisfied by the first three sources;
// otherwise 0 is returned.
QTextCodec *QIcuCodec::codecForNameUnlocked(const char *name)
{
    // these are broken data in ICU 4.4, and can't be resolved even though they are aliases to tis-620
    static const char *const brokenThaiNames[] = {
        "windows-874-2000", "windows-874", "MS874", "x-windows-874", "ISO 8859-11"
    };
    const int brokenThaiNameCount = int(sizeof(brokenThaiNames) / sizeof(brokenThaiNames[0]));

    // backwards compatibility with Qt 4.x
    if (qstrcmp(name, "CP949") == 0)
        name = "windows-949";

    for (int n = 0; n < brokenThaiNameCount; ++n) {
        if (qstrcmp(name, brokenThaiNames[n]) == 0) {
            name = "TIS-620";
            break;
        }
    }

    UErrorCode status = U_ZERO_ERROR;
    // MIME gives better default names
    const char *lookupName = ucnv_getStandardName(name, "MIME", &status);
    if (U_FAILURE(status) || !lookupName) {
        status = U_ZERO_ERROR;
        lookupName = ucnv_getStandardName(name, "IANA", &status);
    }

    const bool unknownToIcu = U_FAILURE(status) || !lookupName;
    if (unknownToIcu) {
        // ICU has no standard name; only Qt's own codecs can match.
        lookupName = name;
    } else if (qstrcmp(lookupName, "GB2312") == 0 || qstrcmp(lookupName, "GB_2312-80") == 0) {
        // correct some issues where the ICU data set contains duplicated entries.
        // Where this happens it's because one data set is a subset of another. We
        // always use the larger data set.
        lookupName = "GBK";
    } else if (qstrcmp(lookupName, "KSC_5601") == 0 || qstrcmp(lookupName, "EUC-KR") == 0
               || qstrcmp(lookupName, "cp1363") == 0) {
        lookupName = "windows-949";
    }

    QCoreGlobalData *globalData = QCoreGlobalData::instance();
    QTextCodecCache *cache = &globalData->codecCache;

    // 1. Cached result from an earlier lookup.
    if (cache) {
        QTextCodec *cached = cache->value(lookupName);
        if (cached)
            return cached;
    }

    // 2. Codecs that already exist, matched by name first and then by alias.
    for (int i = 0; i < globalData->allCodecs.size(); ++i) {
        QTextCodec *candidate = globalData->allCodecs.at(i);

        bool matches = qTextCodecNameMatch(candidate->name(), lookupName);
        if (!matches) {
            QList<QByteArray> aliases = candidate->aliases();
            for (int a = 0; a < aliases.size(); ++a) {
                if (qTextCodecNameMatch(aliases.at(a), lookupName)) {
                    matches = true;
                    break;
                }
            }
        }

        if (matches) {
            if (cache)
                cache->insert(lookupName, candidate);
            return candidate;
        }
    }

    // 3. Codecs implemented by Qt itself.
    QTextCodec *result = loadQtCodec(lookupName);
    if (result)
        return result;

    if (unknownToIcu)
        return 0;

    // 4. check whether there is really a converter for the name available.
    UConverter *probe = ucnv_open(lookupName, &status);
    if (!probe) {
        qDebug() << "codecForName: ucnv_open failed" << lookupName << u_errorName(status);
        return 0;
    }
    //qDebug() << "QIcuCodec: Standard name for " << name << "is" << standardName;
    ucnv_close(probe);

    result = new QIcuCodec(lookupName);
    if (cache)
        cache->insert(lookupName, result);
    return result;
}
Exemplo n.º 16
0
/**
 * Prints converter names to stdout.
 *
 * @param pname   Program name, passed to initMsg() when the converter list
 *                cannot be obtained.
 * @param lookfor Name of the single converter to print, or NULL to print all
 *                available converters.
 * @param canon   When zero, names are separated by spaces on one line (a
 *                given lookfor is simply echoed). When non-zero, each
 *                converter gets its own line listing its aliases, each
 *                followed by the standards for which it is the standard name.
 * @return 0 on success, -1 on failure (a message is written to stderr).
 */
static int printConverters(const char *pname, const char *lookfor,
    int canon)
{
    UErrorCode err = U_ZERO_ERROR;
    int32_t num;
    uint16_t num_stds;
    const char **stds;

    /* If there is a specified name, just handle that now. */

    if (lookfor) {
        if (!canon) {
            printf("%s\n", lookfor);
            return 0;
        } else {
        /*  Because we are printing a canonical name, we need the
            true converter name. We've done that already except for
            the default name (because we want to print the exact
            name one would get when calling ucnv_getDefaultName()
            in non-canon mode). But since we do not know at this
            point if we have the default name or something else, we
            need to normalize again to the canonical converter
            name. */

            const char *truename = ucnv_getAlias(lookfor, 0, &err);
            if (U_SUCCESS(err)) {
                lookfor = truename;
            } else {
                err = U_ZERO_ERROR;
            }
        }
    }

    /* Print converter names. We come here for one of two reasons: we
       are printing all the names (lookfor was null), or we have a
       single converter to print but in canon mode, hence we need to
       get to it in order to print everything. */

    num = ucnv_countAvailable();
    if (num <= 0) {
        initMsg(pname);
        u_wmsg(stderr, "cantGetNames");
        return -1;
    }
    if (lookfor) {
        num = 1;                /* We know where we want to be. */
    }

    /* Collect the names of all standards. From here on every early return
       must release stds. */

    num_stds = ucnv_countStandards();
    stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
    if (!stds) {
        u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
        return -1;
    } else {
        uint16_t s;

        for (s = 0; s < num_stds; ++s) {
            stds[s] = ucnv_getStandard(s, &err);
            if (U_FAILURE(err)) {
                u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
                uprv_free(stds);
                return -1;
            }
        }
    }

    for (int32_t i = 0; i < num; i++) {
        const char *name;
        uint16_t num_aliases;

        /* Set the name either to what we are looking for, or
        to the current converter name. */

        if (lookfor) {
            name = lookfor;
        } else {
            name = ucnv_getAvailableName(i);
        }

        /* Get all the aliases associated to the name. */

        err = U_ZERO_ERROR;
        num_aliases = ucnv_countAliases(name, &err);
        if (U_FAILURE(err)) {
            printf("%s", name);

            UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
            putchar('\t');
            u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
                u_wmsg_errorName(err));
            uprv_free(stds);
            return -1;
        } else {
            uint16_t a, s, t;

            /* Write all the aliases and their tags. */

            for (a = 0; a < num_aliases; ++a) {
                const char *alias = ucnv_getAlias(name, a, &err);

                if (U_FAILURE(err)) {
                    UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
                    putchar('\t');
                    u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
                        u_wmsg_errorName(err));
                    uprv_free(stds);
                    return -1;
                }

                printf("%s", alias);

                /* Look (slowly, linear searching) for a tag. */

                if (canon) {
                    /* t records whether the opening " {" has been printed. */
                    for (s = t = 0; s < num_stds; ++s) {
                        const char *standard =
                            ucnv_getStandardName(name, stds[s], &err);
                        if (U_SUCCESS(err) && standard) {
                            if (!strcmp(standard, alias)) {
                                if (!t) {
                                    printf(" {");
                                    t = 1;
                                }
                                printf(" %s", stds[s]);
                            }
                        }
                    }
                    if (t) {
                        printf(" }");
                    }
                }

                /* Move on. */

                if (a < num_aliases - 1) {
                    putchar(a || !canon ? ' ' : '\t');
                }
            }
        }

        /* Terminate this entry. */

        if (canon) {
            putchar('\n');
        } else if (i < num - 1) {
            putchar(' ');
        }
    }

    /* Free temporary data. */

    uprv_free(stds);

    /* Success. */

    return 0;
}
Exemplo n.º 17
0
// Registers every encoding name known to ICU, plus a table of legacy aliases,
// through the supplied callback.
//
// registrar is invoked as registrar(alias, name): the first argument is the
// spelling being registered and the second is the name it should resolve to.
// For each ICU converter the preferred web name (MIME first, then IANA) is
// registered under itself, and every ICU alias of that converter is then
// registered as pointing at it. Converters that have neither a MIME nor an
// IANA name are skipped.
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // Try MIME before trying IANA to pick up commonly used names like
        // 'EUC-JP' instead of horrendously long names like
        // 'Extended_UNIX_Code_Packed_Format_for_Japanese'.
        const char* standardName = ucnv_getStandardName(name, "MIME", &error);
        if (!U_SUCCESS(error) || !standardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used.
            standardName = ucnv_getStandardName(name, "IANA", &error);
            if (!U_SUCCESS(error) || !standardName)
                continue;
        }

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(standardName, "KSC_5601") == 0 || strcmp(standardName, "EUC-KR") == 0 || strcmp(standardName, "cp1363") == 0)
            standardName = "windows-949";
        // And so on.
        else if (strcasecmp(standardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6.
            standardName = "windows-1254";
        else if (strcmp(standardName, "TIS-620") == 0)
            standardName = "windows-874";

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (!U_SUCCESS(error))
            continue;

        for (uint16_t j = 0; j < numAliases; ++j) {
            error = U_ZERO_ERROR;
            const char* alias = ucnv_getAlias(name, j, &error);
            ASSERT(U_SUCCESS(error));
            // Compare by content, not by address: once standardName has been
            // replaced by one of the literals above, a pointer comparison can
            // never detect that the alias is the name we already registered.
            if (U_SUCCESS(error) && alias && strcmp(alias, standardName) != 0)
                registrar(alias, standardName);
        }
    }

    // Additional aliases.
    // These are present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4).
    registrar("macroman", "macintosh");
    registrar("maccyrillic", "x-mac-cyrillic");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("xmacroman", "macintosh");
    registrar("xmacukrainian", "x-mac-cyrillic");
    registrar("cnbig5", "Big5");
    registrar("xxbig5", "Big5");
    registrar("cngb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("xeuccn", "GBK");
    registrar("xgbk", "GBK");
    // Target spelled exactly like the name registered at the top of this function.
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("unicode11utf8", "UTF-8");
    registrar("unicode20utf8", "UTF-8");
    registrar("xunicode20utf8", "UTF-8");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso885911", "windows-874");
    registrar("dos874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("xcp1250", "windows-1250");
    registrar("xcp1251", "windows-1251");
    registrar("xeuc", "EUC-JP");
    registrar("xwindows949", "windows-949");
    registrar("xuhc", "windows-949");

    // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
    // They are not present in ICU 3.2.
    registrar("dos720", "cp864");
    registrar("jis7", "ISO-2022-JP");
}
/** Fetch information on an encoding
 *
 * The result is a named list laid out as: friendly name, ICU canonical
 * name, one entry per standard reported by StriUcnv::getStandards(), and
 * finally five converter properties (ASCII.subset, Unicode.1to1,
 * CharSize.8bit, CharSize.min, CharSize.max). If the canonical name cannot
 * be determined, only the ICU name slot is filled (with NA) and a warning
 * is issued.
 *
 * @param enc either NULL or "" for default encoding,
 *        or one string with encoding name
 * @return R list object with many components (see R doc for details)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_info(SEXP enc)
{
   /* the returned string is R_alloc'ed */
   const char* requested = stri__prepare_arg_enc(enc, "enc", true/*default ok*/);

   STRI__ERROR_HANDLER_BEGIN(0)
   StriUcnv converter(requested);
   // default callbacks are left in place (no setCallBackSubstitute() call)
   UConverter* ucnv = converter.getConverter(false);

   // names of all the standards known to the converter alias table
   vector<const char*> stds = StriUcnv::getStandards();
   const R_len_t nstds = (R_len_t)stds.size();

   // slot layout of the output list
   const int nval      = nstds+2+5;
   const int pos_ascii = nval-5;
   const int pos_1to1  = nval-4;
   const int pos_8bit  = nval-3;
   const int pos_min   = nval-2;
   const int pos_max   = nval-1;

   // element names
   SEXP names;
   STRI__PROTECT(names = Rf_allocVector(STRSXP, nval));
   SET_STRING_ELT(names, 0, Rf_mkChar("Name.friendly"));
   SET_STRING_ELT(names, 1, Rf_mkChar("Name.ICU"));
   for (R_len_t k=0; k<nstds; ++k) {
      if (!stds[k]) continue;
      string label("Name.");
      label += stds[k];
      SET_STRING_ELT(names, k+2, Rf_mkChar(label.c_str()));
   }
   SET_STRING_ELT(names, pos_ascii, Rf_mkChar("ASCII.subset"));
   SET_STRING_ELT(names, pos_1to1,  Rf_mkChar("Unicode.1to1"));
   SET_STRING_ELT(names, pos_8bit,  Rf_mkChar("CharSize.8bit"));
   SET_STRING_ELT(names, pos_min,   Rf_mkChar("CharSize.min"));
   SET_STRING_ELT(names, pos_max,   Rf_mkChar("CharSize.max"));

   // element values
   SEXP vals;
   STRI__PROTECT(vals = Rf_allocVector(VECSXP, nval));

   // canonical (ICU) name of the opened converter
   UErrorCode status = U_ZERO_ERROR;
   const char* icuname = ucnv_getName(ucnv, &status);
   if (U_SUCCESS(status) && icuname) {
      SET_VECTOR_ELT(vals, 1, stri__make_character_vector_char_ptr(1, icuname));

      // friendly name (NA if there is none)
      const char* friendly = StriUcnv::getFriendlyName(icuname);
      SET_VECTOR_ELT(vals, 0, friendly
         ? stri__make_character_vector_char_ptr(1, friendly)
         : Rf_ScalarString(NA_STRING));

      // is ASCII a subset of this encoding?
      SET_VECTOR_ELT(vals, pos_ascii, Rf_ScalarLogical((int)converter.hasASCIIsubset()));

      // character size bounds; "8bit" means exactly one byte per character
      const int minsize = (int)ucnv_getMinCharSize(ucnv);
      const int maxsize = (int)ucnv_getMaxCharSize(ucnv);
      const int onebyte = (minsize==1 && maxsize == 1);
      SET_VECTOR_ELT(vals, pos_8bit, Rf_ScalarLogical(onebyte));
      SET_VECTOR_ELT(vals, pos_min,  Rf_ScalarInteger(minsize));
      SET_VECTOR_ELT(vals, pos_max,  Rf_ScalarInteger(maxsize));

      // one-to-one correspondence with Unicode is only checked for 8-bit encodings
      if (onebyte)
         SET_VECTOR_ELT(vals, pos_1to1, Rf_ScalarLogical((int)converter.is1to1Unicode()));
      else
         SET_VECTOR_ELT(vals, pos_1to1, Rf_ScalarLogical(NA_LOGICAL));

      // name of this encoding under each of the standards (NA if unavailable)
      for (R_len_t k=0; k<nstds; ++k) {
         if (!stds[k]) continue;

         status = U_ZERO_ERROR;
         const char* stdname = ucnv_getStandardName(icuname, stds[k], &status);
         if (U_SUCCESS(status) && stdname)
            SET_VECTOR_ELT(vals, k+2, stri__make_character_vector_char_ptr(1, stdname));
         else
            SET_VECTOR_ELT(vals, k+2, Rf_ScalarString(NA_STRING));
      }
   }
   else {
      SET_VECTOR_ELT(vals, 1, Rf_ScalarString(NA_STRING));
      Rf_warning(MSG__ENC_ERROR_GETNAME);
   }

   Rf_setAttrib(vals, R_NamesSymbol, names);
   STRI__UNPROTECT_ALL
   return vals;

   STRI__ERROR_HANDLER_END({/* no special action on error */})
}
Exemplo n.º 19
0
// Registers every encoding name known to ICU, plus a table of legacy aliases,
// through the supplied callback.
//
// registrar is invoked as registrar(alias, name): the first argument is the
// spelling being registered and the second is the name it should resolve to.
// For each ICU converter that has an IANA name, that name (after the
// overrides below) is registered under itself, and every ICU alias of the
// converter is then registered as pointing at it. Converters without an
// IANA name are skipped.
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU works with either name.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // FIXME: Should we use the "MIME" standard instead of "IANA"?
        const char* standardName = ucnv_getStandardName(name, "IANA", &error);
        if (!U_SUCCESS(error) || !standardName)
            continue;

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(standardName, "KS_C_5601-1987") == 0 || strcmp(standardName, "EUC-KR") == 0)
            standardName = "windows-949-2000";
        // And so on.
        else if (strcmp(standardName, "ISO_8859-9:1989") == 0)
            standardName = "windows-1254";
        else if (strcmp(standardName, "TIS-620") == 0)
            standardName = "windows-874-2000";

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (!U_SUCCESS(error))
            continue;

        for (uint16_t j = 0; j < numAliases; ++j) {
            error = U_ZERO_ERROR;
            const char* alias = ucnv_getAlias(name, j, &error);
            ASSERT(U_SUCCESS(error));
            // Compare by content, not by address: once standardName has been
            // replaced by one of the literals above, a pointer comparison can
            // never detect that the alias is the name we already registered.
            if (U_SUCCESS(error) && alias && strcmp(alias, standardName) != 0)
                registrar(alias, standardName);
        }
    }

    // Additional aliases.
    // Perhaps we can get these added to ICU.
    registrar("macroman", "macintosh");
    registrar("xmacroman", "macintosh");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    //
    // Target names below are spelled exactly like the names registered
    // earlier in this function, so they resolve regardless of how the
    // registrar treats punctuation.
    registrar("cnbig5", "Big5");
    registrar("cngb", "EUC-CN");
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("csgb231280", "EUC-CN");
    registrar("dos720", "cp864");
    registrar("dos874", "cp874");
    registrar("jis7", "ISO-2022-JP");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("unicode11utf8", "UTF-8");
    registrar("unicode20utf8", "UTF-8");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("windows874", "windows-874-2000");
    registrar("iso885911", "windows-874-2000");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("xcp1250", "windows-1250");
    registrar("xcp1251", "windows-1251");
    registrar("xeuc", "EUC-JP");
    registrar("xeuccn", "EUC-CN");
    registrar("xgbk", "EUC-CN");
    registrar("xunicode20utf8", "UTF-8");
    registrar("xwindows949", "windows-949-2000");
    registrar("xxbig5", "Big5");
}
// Registers every encoding name known to ICU, plus a table of legacy and
// alternatively spelled aliases, through the supplied callback.
//
// registrar is invoked as registrar(alias, name): the first argument is the
// spelling being registered and the second is the name it should resolve to.
// For each ICU converter the preferred web name (MIME first, then IANA) is
// registered under itself, and every ICU alias of that converter is then
// registered as pointing at it. Converters that have neither a MIME nor an
// IANA name are skipped.
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numConverters = ucnv_countAvailable();
    for (int32_t i = 0; i < numConverters; ++i) {
        const char* canonicalConverterName = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // Try MIME before trying IANA to pick up commonly used names like
        // 'EUC-JP' instead of horrendously long names like
        // 'Extended_UNIX_Code_Packed_Format_for_Japanese'.
        const char* webStandardName = ucnv_getStandardName(canonicalConverterName, "MIME", &error);
        if (!U_SUCCESS(error) || !webStandardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used.
            webStandardName = ucnv_getStandardName(canonicalConverterName, "IANA", &error);
            if (!U_SUCCESS(error) || !webStandardName)
                continue;
        }

        // Any standard encoding overrides should match checks in registerCodecs() below.

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(webStandardName, "GB2312") == 0 || strcmp(webStandardName, "GB_2312-80") == 0)
            webStandardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(webStandardName, "KSC_5601") == 0 || strcmp(webStandardName, "EUC-KR") == 0 || strcmp(webStandardName, "cp1363") == 0)
            webStandardName = "windows-949";
        // And so on.
        // FIXME: strcasecmp is locale sensitive, we should not be using it.
        else if (strcasecmp(webStandardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6.
            webStandardName = "windows-1254";
        else if (strcmp(webStandardName, "TIS-620") == 0)
            webStandardName = "windows-874";

        registrar(webStandardName, webStandardName);

        uint16_t numAliases = ucnv_countAliases(canonicalConverterName, &error);
        ASSERT(U_SUCCESS(error));
        if (!U_SUCCESS(error))
            continue;

        for (uint16_t j = 0; j < numAliases; ++j) {
            error = U_ZERO_ERROR;
            const char* alias = ucnv_getAlias(canonicalConverterName, j, &error);
            ASSERT(U_SUCCESS(error));
            // Compare by content, not by address: once webStandardName has been
            // replaced by one of the literals above, a pointer comparison can
            // never detect that the alias is the name we already registered.
            if (U_SUCCESS(error) && alias && strcmp(alias, webStandardName) != 0)
                registrar(alias, webStandardName);
        }
    }

    // Additional aliases.
    // macroman is present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4).
    // FIXME: Do any ports still use such old versions?
    registrar("macroman", "macintosh");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("x-mac-roman", "macintosh");
    registrar("maccyrillic", "x-mac-cyrillic");
    registrar("x-mac-ukrainian", "x-mac-cyrillic");
    registrar("cn-big5", "Big5");
    registrar("x-x-big5", "Big5");
    registrar("cn-gb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("x-euc-cn", "GBK");
    registrar("x-gbk", "GBK");
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso-8859-11", "windows-874");
    registrar("iso8859-11", "windows-874");
    registrar("dos-874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("x-cp1250", "windows-1250");
    registrar("x-cp1251", "windows-1251");
    registrar("x-euc", "EUC-JP");
    registrar("x-windows-949", "windows-949");
    registrar("KSC5601", "windows-949");
    registrar("x-uhc", "windows-949");
    registrar("shift-jis", "Shift_JIS");

    // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
    // They are not present in ICU 3.2.
    registrar("dos-720", "cp864");
    registrar("jis7", "ISO-2022-JP");

    // Alternative spelling of ISO encoding names.
    registrar("ISO8859-1", "ISO-8859-1");
    registrar("ISO8859-2", "ISO-8859-2");
    registrar("ISO8859-3", "ISO-8859-3");
    registrar("ISO8859-4", "ISO-8859-4");
    registrar("ISO8859-5", "ISO-8859-5");
    registrar("ISO8859-6", "ISO-8859-6");
    registrar("ISO8859-7", "ISO-8859-7");
    registrar("ISO8859-8", "ISO-8859-8");
    registrar("ISO8859-8-I", "ISO-8859-8-I");
    // ISO-8859-9 is overridden to windows-1254 in the loop above, so its
    // alternative spelling points there as well.
    registrar("ISO8859-9", "windows-1254");
    registrar("ISO8859-10", "ISO-8859-10");
    registrar("ISO8859-13", "ISO-8859-13");
    registrar("ISO8859-14", "ISO-8859-14");
    registrar("ISO8859-15", "ISO-8859-15");
    // Not registering ISO8859-16, because Firefox (as of version 3.6.6) doesn't know this particular alias,
    // and because older versions of ICU don't support ISO-8859-16 encoding at all.
}