Ejemplo n.º 1
0
/*
 * Return a new Python string in which every lowercase code point of `input`
 * is replaced by u_toupper() of it and every uppercase code point by
 * u_tolower() of it; all other code points are copied unchanged.
 *
 * The text is converted UTF-16 -> UTF-32 so that each code point (including
 * those stored as surrogate pairs) is classified and mapped as a whole, and
 * then converted back to UTF-16 for icu_to_python().
 *
 * self:  unused.
 * input: object handed to python_to_icu() to obtain the UTF-16 text.
 *
 * Returns the result of icu_to_python(), or NULL on failure. This function
 * sets MemoryError when a buffer cannot be allocated (or the input is too
 * long for the int32_t capacities used by ICU) and ValueError carrying the
 * ICU error name when a conversion fails.
 */
static PyObject* icu_swap_case(PyObject *self, PyObject *input) {
    PyObject *result = NULL;
    UErrorCode status = U_ZERO_ERROR;
    UChar *input_buf = NULL, *output_buf = NULL;
    UChar32 *buf = NULL;
    int32_t sz = 0, sz32 = 0, i = 0;

    input_buf = python_to_icu(input, &sz);
    if (input_buf == NULL) goto end;

    /* The buffer capacities passed to ICU are int32_t; refuse lengths for
     * which 3 * sz would overflow instead of wrapping around. */
    if (sz > INT32_MAX / 3) { PyErr_NoMemory(); goto end; }

    /* Allocate one element more than the capacity handed to ICU so that an
     * empty input never results in calloc(0, ...), which is permitted to
     * return NULL and would then be misreported as an out-of-memory error. */
    output_buf = (UChar*) calloc((size_t)(3 * sz) + 1, sizeof(UChar));
    buf = (UChar32*) calloc((size_t)(2 * sz) + 1, sizeof(UChar32));
    if (output_buf == NULL || buf == NULL) { PyErr_NoMemory(); goto end; }

    u_strToUTF32(buf, 2 * sz, &sz32, input_buf, sz, &status);
    /* Stop here on failure rather than iterating over a buffer whose
     * contents and reported length come from a failed conversion. */
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }

    for (i = 0; i < sz32; i++) {
        if (u_islower(buf[i])) buf[i] = u_toupper(buf[i]);
        else if (u_isupper(buf[i])) buf[i] = u_tolower(buf[i]);
    }
    u_strFromUTF32(output_buf, 3*sz, &sz, buf, sz32, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
    result = icu_to_python(output_buf, sz);

end:
    /* free(NULL) is a no-op, so no NULL checks are needed here. */
    free(input_buf);
    free(output_buf);
    free(buf);
    return result;

} // }}}
Ejemplo n.º 2
0
    /// Flags a mismatch between the letter case of the first character of
    /// the source text and that of the translation.
    ///
    /// @param item         Catalog item that receives the warning, if any.
    /// @param source       Source string.
    /// @param translation  Translated string.
    /// @return true if a warning was attached to @a item, false otherwise.
    bool CheckString(CatalogItemPtr item, const wxString& source, const wxString& translation) override
    {
        // An empty string has no first character; indexing [0] on it would be
        // out of range, and there is nothing to compare anyway.
        if (source.empty() || translation.empty())
            return false;

        if (u_isupper(source[0]) && u_islower(translation[0]))
        {
            item->SetIssue(CatalogItem::Issue::Warning, _("The translation should start as a sentence."));
            return true;
        }

        if (u_islower(source[0]) && u_isupper(translation[0]))
        {
            if (m_lang != "de")
            {
                item->SetIssue(CatalogItem::Issue::Warning, _("The translation should start with a lowercase character."));
                return true;
            }
            // else: German nouns start uppercased, this would cause too many false positives
        }

        return false;
    }
Ejemplo n.º 3
0
// Returns the web view client's autocorrect suggestion for |misspelledWord|.
//
// A default-constructed String is returned instead when continuous spell
// checking is disabled, when the web view has no client, or when any
// character after the first one is uppercase.
String EditorClientImpl::getAutoCorrectSuggestionForMisspelledWord(const String& misspelledWord)
{
    const bool canAskClient = isContinuousSpellCheckingEnabled() && m_webView->client();
    if (!canAskClient)
        return String();

    // Words that contain a capital letter anywhere but at the start are left
    // alone, so that e.g. "IMB" is never rewritten to "IBM".
    const size_t wordLength = misspelledWord.length();
    size_t position = 1;
    while (position < wordLength) {
        const UChar32 character = static_cast<UChar32>(misspelledWord[position]);
        if (u_isupper(character))
            return String();
        ++position;
    }

    return m_webView->client()->autoCorrectWord(WebString(misspelledWord));
}
Ejemplo n.º 4
0
/*
 * Tests whether `codepoint` belongs to at least one of the character classes
 * selected by the enum_cclass_* bits in `flags`. Returns 1 if so, 0 if not.
 *
 * With ICU, each selected class is delegated to the matching ICU predicate.
 * Without ICU, code points below 256 are looked up in
 * Parrot_iso_8859_1_typetable and only a few classes are approximated for
 * higher code points; a request involving any other class throws
 * EXCEPTION_LIBRARY_ERROR.
 */
static int
u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
{
    ASSERT_ARGS(u_iscclass)
#if PARROT_HAS_ICU
    UNUSED(interp);
    /* XXX open question kept from the original author: should this use
     *     return u_charDigitValue(codepoint);
     * instead? */

    /* NOTE(review): UCHAR_LINE_BREAK is an enumerated property rather than a
     * binary one, so u_hasBinaryProperty() presumably always reports false
     * for it, leaving only the two explicit code points -- confirm against
     * the ICU documentation. */
    if (((flags & enum_cclass_uppercase)    && u_isupper(codepoint))
    ||  ((flags & enum_cclass_lowercase)    && u_islower(codepoint))
    ||  ((flags & enum_cclass_alphabetic)   && u_isalpha(codepoint))
    ||  ((flags & enum_cclass_numeric)      && u_isdigit(codepoint))
    ||  ((flags & enum_cclass_hexadecimal)  && u_isxdigit(codepoint))
    ||  ((flags & enum_cclass_whitespace)   && u_isspace(codepoint))
    ||  ((flags & enum_cclass_printing)     && u_isprint(codepoint))
    ||  ((flags & enum_cclass_graphical)    && u_isgraph(codepoint))
    ||  ((flags & enum_cclass_blank)        && u_isblank(codepoint))
    ||  ((flags & enum_cclass_control)      && u_iscntrl(codepoint))
    ||  ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint))
    ||  ((flags & enum_cclass_word)
            && (u_isalnum(codepoint) || codepoint == '_'))
    ||  ((flags & enum_cclass_newline)
            && (codepoint == 0x2028 || codepoint == 0x2029
                || u_hasBinaryProperty(codepoint, UCHAR_LINE_BREAK))))
        return 1;

    return 0;
#else
    /* The Latin-1 range is fully described by the type table. */
    if (codepoint < 256) {
        if (Parrot_iso_8859_1_typetable[codepoint] & flags)
            return 1;
        return 0;
    }

    if (flags == enum_cclass_any)
        return 1;

    /* Every code point from U+0100 through U+02AF is alphabetic, so WORD and
     * ALPHABETIC are faked by accepting exactly that range (and wrongly
     * rejecting everything above it). Stopgap until ICU is available
     * everywhere or non-ICU Unicode support improves. */
    if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
        return codepoint < 0x2b0 ? 1 : 0;

    if (flags & enum_cclass_whitespace) {
        /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
        if (codepoint == 0x1680 || codepoint == 0x180e
        || (codepoint >= 0x2000 && codepoint <= 0x200a)
        ||  codepoint == 0x2028 || codepoint == 0x2029
        ||  codepoint == 0x202f || codepoint == 0x205f
        ||  codepoint == 0x3000)
            return 1;
    }

    if (flags & enum_cclass_numeric) {
        /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
         * First code point of each recognized run of ten decimal digits. */
        static const UINTVAL digit_run_starts[] = {
            0x0660, 0x06f0, 0x07c0, 0x0966, 0x09e6, 0x0a66,
            0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6, 0x0d66,
            0x0e50, 0x0ed0, 0x0f20, 0x1040, 0x17e0, 0x1810,
            0x1946, 0x19d0, 0x1b50, 0xff10
        };
        const unsigned int run_count =
            sizeof (digit_run_starts) / sizeof (digit_run_starts[0]);
        unsigned int run;

        for (run = 0; run < run_count; run++) {
            const UINTVAL first = digit_run_starts[run];

            if (codepoint >= first && codepoint <= first + 9)
                return 1;
        }
    }

    if (flags & enum_cclass_newline) {
        /* from http://www.unicode.org/Public/UNIDATA/extracted/DerivedLineBreak.txt
         * Line_Break=Mandatory_Break*/
        if (codepoint == 0x2028 || codepoint == 0x2029)
            return 1;
    }

    /* Any class other than the three approximated above cannot be answered
     * without ICU. */
    if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline))
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_LIBRARY_ERROR,
            "no ICU lib loaded");

    return 0;
#endif
}
// Native helper: reports whether ICU's u_isupper() classifies |codePoint| as
// an uppercase character. The JNIEnv and jclass arguments are unused.
static jboolean Character_isUpperCaseImpl(JNIEnv*, jclass, jint codePoint) {
    const jboolean isUpper = u_isupper(codePoint);
    return isUpper;
}
Ejemplo n.º 6
0
//static jboolean Character_isUpperCaseImpl(JNIEnv*, jclass, jint codePoint) {
// Exported JNI entry point: returns the result of ICU's u_isupper() for
// |codePoint|. The JNIEnv and jclass arguments are unused.
JNIEXPORT jboolean JNICALL
Java_java_lang_Character_isUpperCaseImpl(JNIEnv*, jclass, jint codePoint) {
    const jboolean upperCase = u_isupper(codePoint);
    return upperCase;
}
Ejemplo n.º 7
0
// Returns the result of ICU's u_isupper() for |codePoint|, converted to
// jboolean.
jboolean fastiva_vm_Character_C$__isUpperCaseImpl(jint codePoint) {
    const jboolean result = u_isupper(codePoint);
    return result;
}
// Helper sets the character attribute properties and sets up the script table.
// Does not set tops and bottoms.
void SetupBasicProperties(bool report_errors, bool decompose,
                          UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }

    // Convert the unichar to UTF32 representation
    std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);

    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;

    for (char32 u_ch : uni_vector) {
      if (u_isalpha(u_ch)) unichar_isalpha = true;
      if (u_islower(u_ch)) unichar_islower = true;
      if (u_isupper(u_ch)) unichar_isupper = true;
      if (u_isdigit(u_ch)) unichar_isdigit = true;
      if (u_ispunct(u_ch)) unichar_ispunct = true;
    }

    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);

    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));

    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      std::vector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }

    // Set RTL property and obtain mirror unichar ID from ICU.
    std::vector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else if (report_errors) {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }

    // Record normalized version of this unichar.
    std::string normed_str;
    if (unichar_id != 0 &&
        tesseract::NormalizeUTF8String(
            decompose ? tesseract::UnicodeNormMode::kNFKD
                      : tesseract::UnicodeNormMode::kNFKC,
            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
            unichar_str, &normed_str) &&
        !normed_str.empty()) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
  }
  unicharset->post_load_setup();
}