// Return a STRING containing debug information on the unichar, including // the id_to_unichar, its hex unicodes and the properties. STRING UNICHARSET::debug_str(UNICHAR_ID id) const { if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id)); const CHAR_FRAGMENT *fragment = this->get_fragment(id); if (fragment) { return fragment->to_string(); } const char* str = id_to_unichar(id); STRING result = debug_utf8_str(str); // Append a for lower alpha, A for upper alpha, and x if alpha but neither. if (get_isalpha(id)) { if (get_islower(id)) result += "a"; else if (get_isupper(id)) result += "A"; else result += "x"; } // Append 0 if a digit. if (get_isdigit(id)) { result += "0"; } // Append p is a punctuation symbol. if (get_ispunctuation(id)) { result += "p"; } return result; }
// Sets all the properties for this unicharset given a src unicharset with // everything set. The unicharsets don't have to be the same, and graphemes // are correctly accounted for. void UNICHARSET::PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src) { for (int ch = start_index; ch < size_used; ++ch) { const char* utf8 = id_to_unichar(ch); UNICHAR_PROPERTIES properties; if (src.GetStrProperties(utf8, &properties)) { // Setup the script_id, other_case, and mirror properly. const char* script = src.get_script_from_script_id(properties.script_id); properties.script_id = add_script(script); const char* other_case = src.id_to_unichar(properties.other_case); if (contains_unichar(other_case)) { properties.other_case = unichar_to_id(other_case); } else { properties.other_case = ch; } const char* mirror_str = src.id_to_unichar(properties.mirror); if (contains_unichar(mirror_str)) { properties.mirror = unichar_to_id(mirror_str); } else { properties.mirror = ch; } unichars[ch].properties.CopyFrom(properties); set_normed_ids(ch); } else { tprintf("Failed to get properties for index %d = %s\n", ch, utf8); } } }
// Expands the tops and bottoms and widths for this unicharset given a // src unicharset with ranges in it. The unicharsets don't have to be the // same, and graphemes are correctly accounted for. void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) { for (int ch = 0; ch < size_used; ++ch) { const char* utf8 = id_to_unichar(ch); UNICHAR_PROPERTIES properties; if (src.GetStrProperties(utf8, &properties)) { // Expand just the ranges from properties. unichars[ch].properties.ExpandRangesFrom(properties); } } }
// Sets the normed_ids vector from the normed string. normed_ids is not // stored in the file, and needs to be set when the UNICHARSET is loaded. void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) { unichars[unichar_id].properties.normed_ids.truncate(0); if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') { unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE); } else if (!encode_string(unichars[unichar_id].properties.normed.string(), true, &unichars[unichar_id].properties.normed_ids, NULL, NULL)) { unichars[unichar_id].properties.normed_ids.truncate(0); unichars[unichar_id].properties.normed_ids.push_back(unichar_id); } }
const char* const UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const { if (id == INVALID_UNICHAR_ID) { return INVALID_UNICHAR; } ASSERT_HOST(id < this->size()); // Resolve from the kCustomLigatures table if this is a private encoding. if (get_isprivate(id)) { const char* ch = id_to_unichar(id); for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) { if (!strcmp(ch, kCustomLigatures[i][1])) { return kCustomLigatures[i][0]; } } } // Otherwise return the stored representation. return unichars[id].representation; }
// Returns whether the unichar id represents a unicode value in the private use // area. We use this range only internally to represent uncommon ligatures // (eg. 'ct') that do not have regular unicode values. bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const { UNICHAR uc(id_to_unichar(unichar_id), -1); int uni = uc.first_uni(); return (uni >= 0xE000 && uni <= 0xF8FF); }