static jobjectArray NativeConverter_getAvailableCharsetNames(JNIEnv* env, jclass) {
    int32_t num = ucnv_countAvailable();
    jobjectArray result = env->NewObjectArray(num, JniConstants::stringClass, NULL);
    if (result == NULL) {
        return NULL;
    }
    for (int i = 0; i < num; ++i) {
        const char* name = ucnv_getAvailableName(i);
        ScopedLocalRef<jstring> javaCanonicalName(env, getJavaCanonicalName(env, name));
        if (javaCanonicalName.get() == NULL) {
            return NULL;
        }
        env->SetObjectArrayElement(result, i, javaCanonicalName.get());
        if (env->ExceptionCheck()) {
            return NULL;
        }
    }
    return result;
}
Example #2
0
void TextCodecICU::registerExtendedCodecs(TextCodecRegistrar registrar)
{
    // See comment above in registerEncodingNames.
    registrar("ISO-8859-8-I", newTextCodecICU, 0);

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        const char* standardName = ucnv_getStandardName(name, "MIME", &error);
        if (!U_SUCCESS(error) || !standardName) {
            error = U_ZERO_ERROR;
            standardName = ucnv_getStandardName(name, "IANA", &error);
            if (!U_SUCCESS(error) || !standardName)
                continue;
        }
        registrar(standardName, newTextCodecICU, 0);
    }
}
const CFStringEncoding *
CFStringGetListOfAvailableEncodings (void)
{
  if (_kCFStringEncodingList == NULL)
    {
      GSMutexLock (&_kCFStringEncodingLock);
      if (_kCFStringEncodingList == NULL)
        {
          int32_t count;
          int32_t idx, pos;
          const char *name;
          UErrorCode err = U_ZERO_ERROR;

          count = ucnv_countAvailable ();

          _kCFStringEncodingList = CFAllocatorAllocate (NULL,
                                                        sizeof
                                                        (CFStringEncoding) *
                                                        (count + 1), 0);

          idx = pos = 0;
          while (idx < count)
            {
              name = ucnv_getStandardName (ucnv_getAvailableName (idx),
                                           "MIME", &err);
              if (U_SUCCESS (err) && name != NULL)
              {
                  _kCFStringEncodingList[pos] =
                    CFStringConvertStandardNameToEncoding (name, -1);
                  ++pos;
              }
              ++idx;
            }
          _kCFStringEncodingList[pos] = kCFStringEncodingInvalidId;
        }
      GSMutexUnlock (&_kCFStringEncodingLock);
    }

  return _kCFStringEncodingList;
}
Example #4
0
static UBool
getAvailableNames() {
  int32_t i;
  if (gAvailableNames != NULL) {
    return TRUE;
  }
  gCountAvailable = ucnv_countAvailable();
  if (gCountAvailable == 0) {
    log_data_err("No converters available.\n");
    return FALSE;
  }
  gAvailableNames = (const char **)uprv_malloc(gCountAvailable * sizeof(const char *));
  if (gAvailableNames == NULL) {
    log_err("unable to allocate memory for %ld available converter names\n",
            (long)gCountAvailable);
    return FALSE;
  }
  for (i = 0; i < gCountAvailable; ++i) {
    gAvailableNames[i] = ucnv_getAvailableName(i);
  }
  return TRUE;
}
UErrorCode convsample_03()
{
  printf("\n\n==============================================\n"
         "Sample 03: C: print out all converters\n");

  int32_t count;
  int32_t i;

  // **************************** START SAMPLE *******************
  count = ucnv_countAvailable();
  printf("Available converters: %d\n", count);
  
  for(i=0;i<count;i++) 
  {
    printf("%s ", ucnv_getAvailableName(i));
  }

  // ***************************** END SAMPLE ********************
  
  printf("\n");

  return U_ZERO_ERROR;
}
Example #6
0
	//-----------------------------------------------------------------------------------
	// encodings and converters
	//-----------------------------------------------------------------------------------
	void ICUUnicodeSupport::buildListEncodings()
	{
		int32_t count = ucnv_countAvailable();
		for(int32_t i = 0; i != count; ++i)
		{
			const char* canonicalName = ucnv_getAvailableName(i);
			
			UErrorCode errorCode = U_ZERO_ERROR;
			int16_t countAliases = ucnv_countAliases(canonicalName, &errorCode);
			CHECK_ICU_ERROR_CODE(errorCode);

			if(countAliases)
			{
				vector<const char*>::type aliases;
				aliases.resize(countAliases);
				ucnv_getAliases(canonicalName, &aliases[0], &errorCode);
				CHECK_ICU_ERROR_CODE(errorCode);

				for(size_t j = 0; j != countAliases; ++j)
					_addEncoding<sizeof(String::value_type)> (aliases[j]);
			}
		}
	}
		vector<wstring> StringCharsetConverter::makeCharsetList()
		{
			vector<wstring> list;
			uint16_t n = static_cast<uint16_t>(ucnv_countAvailable());
			UErrorCode err = U_ZERO_ERROR;
			const char *name;
			wchar_t buf[48];
			StringCharsetConverter cv;
			for(uint16_t i = 0; i < n; ++ i)
			{
				name = ucnv_getAvailableName(i);
				name = ucnv_getStandardName(name, "MIME", &err);
				if(U_FAILURE(err) || name == nullptr)
				{
					err = U_ZERO_ERROR;
					continue;
				}
				if(cv.convertToUTF16(buf, 48, name) != static_cast<size_t>(-1))
					list.push_back(buf);
			}

			return move(list);
		}
void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)
{
    // See comment above in registerEncodingNames.
    UErrorCode error = U_ZERO_ERROR;
    const char* canonicalConverterName = ucnv_getCanonicalName("ISO-8859-8-I", "IANA", &error);
    ASSERT(U_SUCCESS(error));
    registrar("ISO-8859-8-I", create, canonicalConverterName);

    int32_t numConverters = ucnv_countAvailable();
    for (int32_t i = 0; i < numConverters; ++i) {
        canonicalConverterName = ucnv_getAvailableName(i);
        error = U_ZERO_ERROR;
        const char* webStandardName = ucnv_getStandardName(canonicalConverterName, "MIME", &error);
        if (!U_SUCCESS(error) || !webStandardName) {
            error = U_ZERO_ERROR;
            webStandardName = ucnv_getStandardName(canonicalConverterName, "IANA", &error);
            if (!U_SUCCESS(error) || !webStandardName)
                continue;
        }

        // Don't register codecs for overridden encodings.
        if (strcmp(webStandardName, "GB2312") == 0 || strcmp(webStandardName, "GB_2312-80") == 0
            || strcmp(webStandardName, "KSC_5601") == 0 || strcmp(webStandardName, "EUC-KR") == 0
            || strcmp(webStandardName, "cp1363") == 0
            || strcasecmp(webStandardName, "iso-8859-9") == 0
            || strcmp(webStandardName, "TIS-620") == 0)
            continue;

        registrar(webStandardName, create, fastStrDup(canonicalConverterName));
    }

    // These encodings currently don't have standard names, so we need to register encoders manually.
    // FIXME: Is there a good way to determine the most up to date variant programmatically?
    registrar("windows-874", create, "windows-874-2000");
    registrar("windows-949", create, "windows-949-2000");
}
Example #9
0
/* open a selector. If converterListSize is 0, build for all converters.
   If excludedCodePoints is NULL, don't exclude any codepoints */
U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_open(const char* const*  converterList, int32_t converterListSize,
             const USet* excludedCodePoints,
             const UConverterUnicodeSet whichSet, UErrorCode* status) {
  // check if already failed
  if (U_FAILURE(*status)) {
    return NULL;
  }
  // ensure args make sense!
  if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) {
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
  }

  // allocate a new converter
  LocalUConverterSelectorPointer newSelector(
    (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)));
  if (newSelector.isNull()) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }
  uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector));

  if (converterListSize == 0) {
    converterList = NULL;
    converterListSize = ucnv_countAvailable();
  }
  newSelector->encodings =
    (char**)uprv_malloc(converterListSize * sizeof(char*));
  if (!newSelector->encodings) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }
  newSelector->encodings[0] = NULL;  // now we can call ucnvsel_close()

  // make a backup copy of the list of converters
  int32_t totalSize = 0;
  int32_t i;
  for (i = 0; i < converterListSize; i++) {
    totalSize +=
      (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1;
  }
  // 4-align the totalSize to 4-align the size of the serialized form
  int32_t encodingStrPadding = totalSize & 3;
  if (encodingStrPadding != 0) {
    encodingStrPadding = 4 - encodingStrPadding;
  }
  newSelector->encodingStrLength = totalSize += encodingStrPadding;
  char* allStrings = (char*) uprv_malloc(totalSize);
  if (!allStrings) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }

  for (i = 0; i < converterListSize; i++) {
    newSelector->encodings[i] = allStrings;
    uprv_strcpy(newSelector->encodings[i],
                converterList != NULL ? converterList[i] : ucnv_getAvailableName(i));
    allStrings += uprv_strlen(newSelector->encodings[i]) + 1;
  }
  while (encodingStrPadding > 0) {
    *allStrings++ = 0;
    --encodingStrPadding;
  }

  newSelector->ownEncodingStrings = TRUE;
  newSelector->encodingsCount = converterListSize;
  UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status);
  generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status);
  upvec_close(upvec);

  if (U_FAILURE(*status)) {
    return NULL;
  }

  return newSelector.orphan();
}
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
#if !defined(USING_SYSTEM_ICU)
        const char* primaryStandard = "HTML";
        const char* secondaryStandard = "MIME";
#else
        const char* primaryStandard = "MIME";
        const char* secondaryStandard = "IANA";
#endif
        const char* standardName = ucnv_getStandardName(name, primaryStandard, &error);
        if (U_FAILURE(error) || !standardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used.
            standardName = ucnv_getStandardName(name, secondaryStandard, &error);
            if (U_FAILURE(error) || !standardName)
                continue;
        }

        // A number of these aliases are handled in Chrome's copy of ICU, but
        // Chromium can be compiled with the system ICU.

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
#if defined(USING_SYSTEM_ICU)
        if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80"))
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version, but
        // per HTML5, the canonical name still should be EUC-KR.
        else if (!strcmp(standardName, "EUC-KR") || !strcmp(standardName, "KSC_5601") || !strcmp(standardName, "cp1363"))
            standardName = "EUC-KR";
        // And so on.
        else if (!strcasecmp(standardName, "iso-8859-9")) // This name is returned in different case by ICU 3.2 and 3.6.
            standardName = "windows-1254";
        else if (!strcmp(standardName, "TIS-620"))
            standardName = "windows-874";
#endif

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(name, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != standardName)
                    registrar(alias, standardName);
            }
    }

    // These two entries have to be added here because ICU's converter table
    // cannot have both ISO-8859-8-I and ISO-8859-8.
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("logical", "ISO-8859-8-I");

#if defined(USING_SYSTEM_ICU)
    // Additional alias for MacCyrillic not present in ICU.
    registrar("maccyrillic", "x-mac-cyrillic");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("x-mac-roman", "macintosh");
    registrar("x-mac-ukrainian", "x-mac-cyrillic");
    registrar("cn-big5", "Big5");
    registrar("x-x-big5", "Big5");
    registrar("cn-gb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("x-euc-cn", "GBK");
    registrar("x-gbk", "GBK");
    registrar("koi", "KOI8-R");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso-8859-11", "windows-874");
    registrar("iso8859-11", "windows-874");
    registrar("dos-874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("x-cp1250", "windows-1250");
    registrar("x-cp1251", "windows-1251");
    registrar("x-euc", "EUC-JP");
    registrar("x-windows-949", "EUC-KR");
    registrar("KSC5601", "EUC-KR");
    registrar("x-uhc", "EUC-KR");
    registrar("shift-jis", "Shift_JIS");

    // Alternative spelling of ISO encoding names.
    registrar("ISO8859-1", "ISO-8859-1");
    registrar("ISO8859-2", "ISO-8859-2");
    registrar("ISO8859-3", "ISO-8859-3");
    registrar("ISO8859-4", "ISO-8859-4");
    registrar("ISO8859-5", "ISO-8859-5");
    registrar("ISO8859-6", "ISO-8859-6");
    registrar("ISO8859-7", "ISO-8859-7");
    registrar("ISO8859-8", "ISO-8859-8");
    registrar("ISO8859-8-I", "ISO-8859-8-I");
    registrar("ISO8859-9", "ISO-8859-9");
    registrar("ISO8859-10", "ISO-8859-10");
    registrar("ISO8859-13", "ISO-8859-13");
    registrar("ISO8859-14", "ISO-8859-14");
    registrar("ISO8859-15", "ISO-8859-15");
    // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label
    // listed in WHATWG Encoding Living Standard (http://encoding.spec.whatwg.org/ ).

    // Additional aliases present in the WHATWG Encoding Standard
    // and Firefox (as of Oct 2014), but not in the upstream ICU.
    // Three entries for windows-1252 need not be listed here because
    // TextCodecLatin1 registers them.
    registrar("csiso58gb231280", "GBK");
    registrar("csiso88596e", "ISO-8859-6");
    registrar("csiso88596i", "ISO-8859-6");
    registrar("csiso88598e", "ISO-8859-8");
    registrar("gb_2312", "GBK");
    registrar("iso88592", "ISO-8859-2");
    registrar("iso88593", "ISO-8859-3");
    registrar("iso88594", "ISO-8859-4");
    registrar("iso88595", "ISO-8859-5");
    registrar("iso88596", "ISO-8859-6");
    registrar("iso88597", "ISO-8859-7");
    registrar("iso88598", "ISO-8859-8");
    registrar("iso88599", "windows-1254");
    registrar("iso885910", "ISO-8859-10");
    registrar("iso885911", "windows-874");
    registrar("iso885913", "ISO-8859-13");
    registrar("iso885914", "ISO-8859-14");
    registrar("iso885915", "ISO-8859-15");
    registrar("iso_8859-2", "ISO-8859-2");
    registrar("iso_8859-3", "ISO-8859-3");
    registrar("iso_8859-4", "ISO-8859-4");
    registrar("iso_8859-5", "ISO-8859-5");
    registrar("iso_8859-6", "ISO-8859-6");
    registrar("iso_8859-7", "ISO-8859-7");
    registrar("iso_8859-8", "ISO-8859-8");
    registrar("iso_8859-9", "windows-1254");
    registrar("iso_8859-15", "ISO-8859-15");
    registrar("koi8_r", "KOI8-R");
    registrar("x-cp1253", "windows-1253");
    registrar("x-cp1254", "windows-1254");
    registrar("x-cp1255", "windows-1255");
    registrar("x-cp1256", "windows-1256");
    registrar("x-cp1257", "windows-1257");
    registrar("x-cp1258", "windows-1258");
#endif
}
Example #11
0
static int printConverters(const char *pname, const char *lookfor,
    int canon)
{
    UErrorCode err = U_ZERO_ERROR;
    int32_t num;
    uint16_t num_stds;
    const char **stds;

    /* If there is a specified name, just handle that now. */

    if (lookfor) {
        if (!canon) {
            printf("%s\n", lookfor);
            return 0;
        } else {
        /*  Because we are printing a canonical name, we need the
            true converter name. We've done that already except for
            the default name (because we want to print the exact
            name one would get when calling ucnv_getDefaultName()
            in non-canon mode). But since we do not know at this
            point if we have the default name or something else, we
            need to normalize again to the canonical converter
            name. */

            const char *truename = ucnv_getAlias(lookfor, 0, &err);
            if (U_SUCCESS(err)) {
                lookfor = truename;
            } else {
                err = U_ZERO_ERROR;
            }
        }
    }

    /* Print converter names. We come here for one of two reasons: we
       are printing all the names (lookfor was null), or we have a
       single converter to print but in canon mode, hence we need to
       get to it in order to print everything. */

    num = ucnv_countAvailable();
    if (num <= 0) {
        initMsg(pname);
        u_wmsg(stderr, "cantGetNames");
        return -1;
    }
    if (lookfor) {
        num = 1;                /* We know where we want to be. */
    }

    num_stds = ucnv_countStandards();
    stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
    if (!stds) {
        u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
        return -1;
    } else {
        uint16_t s;

        for (s = 0; s < num_stds; ++s) {
            stds[s] = ucnv_getStandard(s, &err);
            if (U_FAILURE(err)) {
                u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
                return -1;
            }
        }
    }

    for (int32_t i = 0; i < num; i++) {
        const char *name;
        uint16_t num_aliases;

        /* Set the name either to what we are looking for, or
        to the current converter name. */

        if (lookfor) {
            name = lookfor;
        } else {
            name = ucnv_getAvailableName(i);
        }

        /* Get all the aliases associated to the name. */

        err = U_ZERO_ERROR;
        num_aliases = ucnv_countAliases(name, &err);
        if (U_FAILURE(err)) {
            printf("%s", name);

            UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
            putchar('\t');
            u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
                u_wmsg_errorName(err));
            return -1;
        } else {
            uint16_t a, s, t;

            /* Write all the aliases and their tags. */

            for (a = 0; a < num_aliases; ++a) {
                const char *alias = ucnv_getAlias(name, a, &err);

                if (U_FAILURE(err)) {
                    UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
                    putchar('\t');
                    u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
                        u_wmsg_errorName(err));
                    return -1;
                }

                printf("%s", alias);

                /* Look (slowly, linear searching) for a tag. */

                if (canon) {
                    for (s = t = 0; s < num_stds; ++s) {
                        const char *standard =
                            ucnv_getStandardName(name, stds[s], &err);
                        if (U_SUCCESS(err) && standard) {
                            if (!strcmp(standard, alias)) {
                                if (!t) {
                                    printf(" {");
                                    t = 1;
                                }
                                printf(" %s", stds[s]);
                            }
                        }
                    }
                    if (t) {
                        printf(" }");
                    }
                }

                /* Move on. */

                if (a < num_aliases - 1) {
                    putchar(a || !canon ? ' ' : '\t');
                }
            }
        }

        /* Terminate this entry. */

        if (canon) {
            putchar('\n');
        } else if (i < num - 1) {
            putchar(' ');
        }
    }

    /* Free temporary data. */

    uprv_free(stds);

    /* Success. */

    return 0;
}
Example #12
0
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // Try MIME before trying IANA to pick up commonly used names like
        // 'EUC-JP' instead of horrendeously long names like 
        // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. 
        const char* standardName = ucnv_getStandardName(name, "MIME", &error);
        if (!U_SUCCESS(error) || !standardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used. 
            standardName = ucnv_getStandardName(name, "IANA", &error);
            if (!U_SUCCESS(error) || !standardName)
                continue;
        }

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(standardName, "KSC_5601") == 0 || strcmp(standardName, "EUC-KR") == 0 || strcmp(standardName, "cp1363") == 0)
            standardName = "windows-949";
        // And so on.
        else if (strcasecmp(standardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6.
            standardName = "windows-1254";
        else if (strcmp(standardName, "TIS-620") == 0)
            standardName = "windows-874";

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(name, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != standardName)
                    registrar(alias, standardName);
            }
    }

    // Additional aliases.
    // These are present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4).
    registrar("macroman", "macintosh");
    registrar("maccyrillic", "x-mac-cyrillic");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("xmacroman", "macintosh");
    registrar("xmacukrainian", "x-mac-cyrillic");
    registrar("cnbig5", "Big5");
    registrar("xxbig5", "Big5");
    registrar("cngb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("xeuccn", "GBK");
    registrar("xgbk", "GBK");
    registrar("csISO88598I", "ISO_8859-8-I");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("unicode11utf8", "UTF-8");
    registrar("unicode20utf8", "UTF-8");
    registrar("xunicode20utf8", "UTF-8");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso885911", "windows-874");
    registrar("dos874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("xcp1250", "windows-1250");
    registrar("xcp1251", "windows-1251");
    registrar("xeuc", "EUC-JP");
    registrar("xwindows949", "windows-949");
    registrar("xuhc", "windows-949");

    // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
    // They are not present in ICU 3.2.
    registrar("dos720", "cp864");
    registrar("jis7", "ISO-2022-JP");
}
Example #13
0
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU works with either name.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // FIXME: Should we use the "MIME" standard instead of "IANA"?
        const char* standardName = ucnv_getStandardName(name, "IANA", &error);
        if (!U_SUCCESS(error) || !standardName)
            continue;

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(standardName, "KS_C_5601-1987") == 0 || strcmp(standardName, "EUC-KR") == 0)
            standardName = "windows-949-2000";
        // And so on.
        else if (strcmp(standardName, "ISO_8859-9:1989") == 0)
            standardName = "windows-1254";
        else if (strcmp(standardName, "TIS-620") == 0)
            standardName = "windows-874-2000";

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(name, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != standardName)
                    registrar(alias, standardName);
            }
    }

    // Additional aliases.
    // Perhaps we can get these added to ICU.
    registrar("macroman", "macintosh");
    registrar("xmacroman", "macintosh");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("cnbig5", "Big5");
    registrar("cngb", "EUC-CN");
    registrar("csISO88598I", "ISO_8859-8-I");
    registrar("csgb231280", "EUC-CN");
    registrar("dos720", "cp864");
    registrar("dos874", "cp874");
    registrar("jis7", "ISO-2022-JP");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("unicode11utf8", "UTF-8");
    registrar("unicode20utf8", "UTF-8");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("windows874", "windows874-2000");
    registrar("iso885911", "windows874-2000");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("xcp1250", "windows-1250");
    registrar("xcp1251", "windows-1251");
    registrar("xeuc", "EUC-JP");
    registrar("xeuccn", "EUC-CN");
    registrar("xgbk", "EUC-CN");
    registrar("xunicode20utf8", "UTF-8");
    registrar("xwindows949", "windows-949-2000");
    registrar("xxbig5", "Big5");
}
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU treats these names as synonyms.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numConverters = ucnv_countAvailable();
    for (int32_t i = 0; i < numConverters; ++i) {
        const char* canonicalConverterName = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // Try MIME before trying IANA to pick up commonly used names like
        // 'EUC-JP' instead of horrendously long names like 
        // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. 
        const char* webStandardName = ucnv_getStandardName(canonicalConverterName, "MIME", &error);
        if (!U_SUCCESS(error) || !webStandardName) {
            error = U_ZERO_ERROR;
            // Try IANA to pick up 'windows-12xx' and other names
            // which are not preferred MIME names but are widely used. 
            webStandardName = ucnv_getStandardName(canonicalConverterName, "IANA", &error);
            if (!U_SUCCESS(error) || !webStandardName)
                continue;
        }

        // Any standard encoding overrides should match checks in registerCodecs() below.

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(webStandardName, "GB2312") == 0 || strcmp(webStandardName, "GB_2312-80") == 0)
            webStandardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(webStandardName, "KSC_5601") == 0 || strcmp(webStandardName, "EUC-KR") == 0 || strcmp(webStandardName, "cp1363") == 0)
            webStandardName = "windows-949";
        // And so on.
        // FIXME: strcasecmp is locale sensitive, we should not be using it.
        else if (strcasecmp(webStandardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6.
            webStandardName = "windows-1254";
        else if (strcmp(webStandardName, "TIS-620") == 0)
            webStandardName = "windows-874";

        registrar(webStandardName, webStandardName);

        uint16_t numAliases = ucnv_countAliases(canonicalConverterName, &error);
        ASSERT(U_SUCCESS(error));
        if (U_SUCCESS(error))
            for (uint16_t j = 0; j < numAliases; ++j) {
                error = U_ZERO_ERROR;
                const char* alias = ucnv_getAlias(canonicalConverterName, j, &error);
                ASSERT(U_SUCCESS(error));
                if (U_SUCCESS(error) && alias != webStandardName)
                    registrar(alias, webStandardName);
            }
    }

    // Additional aliases.
    // macroman is present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4).
    // FIXME: Do any ports still use such old versions?
    registrar("macroman", "macintosh");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("x-mac-roman", "macintosh");
    registrar("maccyrillic", "x-mac-cyrillic");
    registrar("x-mac-ukrainian", "x-mac-cyrillic");
    registrar("cn-big5", "Big5");
    registrar("x-x-big5", "Big5");
    registrar("cn-gb", "GBK");
    registrar("csgb231280", "GBK");
    registrar("x-euc-cn", "GBK");
    registrar("x-gbk", "GBK");
    registrar("csISO88598I", "ISO-8859-8-I");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("iso-8859-11", "windows-874");
    registrar("iso8859-11", "windows-874");
    registrar("dos-874", "windows-874");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("x-cp1250", "windows-1250");
    registrar("x-cp1251", "windows-1251");
    registrar("x-euc", "EUC-JP");
    registrar("x-windows-949", "windows-949");
    registrar("KSC5601", "windows-949");
    registrar("x-uhc", "windows-949");
    registrar("shift-jis", "Shift_JIS");

    // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
    // They are not present in ICU 3.2.
    registrar("dos-720", "cp864");
    registrar("jis7", "ISO-2022-JP");

    // Alternative spelling of ISO encoding names.
    registrar("ISO8859-1", "ISO-8859-1");
    registrar("ISO8859-2", "ISO-8859-2");
    registrar("ISO8859-3", "ISO-8859-3");
    registrar("ISO8859-4", "ISO-8859-4");
    registrar("ISO8859-5", "ISO-8859-5");
    registrar("ISO8859-6", "ISO-8859-6");
    registrar("ISO8859-7", "ISO-8859-7");
    registrar("ISO8859-8", "ISO-8859-8");
    registrar("ISO8859-8-I", "ISO-8859-8-I");
    registrar("ISO8859-9", "windows-1254");
    registrar("ISO8859-10", "ISO-8859-10");
    registrar("ISO8859-13", "ISO-8859-13");
    registrar("ISO8859-14", "ISO-8859-14");
    registrar("ISO8859-15", "ISO-8859-15");
    // Not registering ISO8859-16, because Firefox (as of version 3.6.6) doesn't know this particular alias,
    // and because older versions of ICU don't support ISO-8859-16 encoding at all.
}
Example #15
0
const char* __hs_ucnv_getAvailableName(int32_t n)
{
    return ucnv_getAvailableName(n);
}