Beispiel #1
0
    bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
        bool all_one_case = true;
        bool capitalized;
        bool prev_upper;
        bool prev_lower;
        bool first_upper;
        bool first_lower;
        bool cur_upper;
        bool cur_lower;

        string str8;
        if (!char_set) {
            // If cube char_set is missing, use C-locale-dependent functions
            // on UTF8 characters to determine case properties.
            first_upper = isupper(str32[0]);
            first_lower = islower(str32[0]);
            if (first_upper)
                capitalized = true;
            prev_upper = first_upper;
            prev_lower = islower(str32[0]);
            for (int c = 1; str32[c] != 0; ++c) {
                cur_upper = isupper(str32[c]);
                cur_lower = islower(str32[c]);
                if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
                    all_one_case = false;
                if (cur_upper)
                    capitalized = false;
                prev_upper = cur_upper;
                prev_lower = cur_lower;
            }
        } else {
            UNICHARSET *unicharset = char_set->InternalUnicharset();
            // Use UNICHARSET functions to determine case properties
            first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
            first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
            if (first_upper)
                capitalized = true;
            prev_upper = first_upper;
            prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));

            for (int c = 1; c < StrLen(str32); ++c) {
                cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
                cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
                if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
                    all_one_case = false;
                if (cur_upper)
                    capitalized = false;
                prev_upper = cur_upper;
                prev_lower = cur_lower;
            }
        }
        return all_one_case || capitalized;
    }
Beispiel #2
0
 // Builds a lower-cased copy of the null-terminated string str32 using the
 // case information of char_set's UNICHARSET.
 //
 // Returns a newly allocated, null-terminated array (allocated with new[],
 // so the caller releases it with delete[]), or NULL when
 //   - char_set is NULL,
 //   - the allocation yields NULL,
 //   - a character compares equal to INVALID_UNICHAR_ID, or
 //   - the other-case form of an upper-case character is missing or is not
 //     exactly one character long.
 // Characters that are not upper-case are copied unchanged.
 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
     if (char_set == NULL)
         return NULL;

     UNICHARSET *unicharset = char_set->InternalUnicharset();
     const int length = StrLen(str32);
     char_32 *result = new char_32[length + 1];
     if (result == NULL)
         return NULL;

     // Cleared on the first character that cannot be converted; the loop
     // stops there and the partial buffer is released below.
     bool converted = true;
     for (int pos = 0; converted && pos < length; ++pos) {
         const char_32 src = str32[pos];
         if (src == INVALID_UNICHAR_ID) {
             converted = false;
         } else if (!unicharset->get_isupper(char_set->ClassID(src))) {
             // Not upper-case: keep the character as it is.
             result[pos] = src;
         } else {
             // Upper-case: look up the other-case form, which is expected
             // to be a single character.
             UNICHAR_ID other_id =
                 unicharset->get_other_case(char_set->ClassID(src));
             const char_32 *other_str = char_set->ClassString(other_id);
             if (other_str != NULL && StrLen(other_str) == 1) {
                 result[pos] = other_str[0];
             } else {
                 converted = false;
             }
         }
     }

     if (!converted) {
         delete[] result;
         return NULL;
     }
     result[length] = 0;
     return result;
 }
Beispiel #3
0
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
  int state = 0;
  int x;
  for (x = 0; x < word.length(); ++x) {
    UNICHAR_ID ch_id = word.unichar_id(x);
    if (unicharset.get_isupper(ch_id))
      state = case_state_table[state][1];
    else if (unicharset.get_islower(ch_id))
      state = case_state_table[state][2];
    else if (unicharset.get_isdigit(ch_id))
      state = case_state_table[state][3];
    else
      state = case_state_table[state][0];
    if (state == -1) return false;
  }
  return state != 5; // single lower is bad
}
Beispiel #4
0
// Returns true when unichar_id is an upper-case character in ch_set other
// than the one ch_set considers equal to "O".
BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
  // Not upper-case at all: no need to compare against "O".
  if (!ch_set.get_isupper(unichar_id)) {
    return false;
  }
  return !ch_set.eq(unichar_id, "O");
}