// Parse a string of the form [~]<lang>[+[~]<lang>]*. // Langs with no prefix get appended to to_load, provided they // are not in there already. // Langs with ~ prefix get appended to not_to_load, provided they are not in // there already. void Tesseract::ParseLanguageString(const char* lang_str, GenericVector<STRING>* to_load, GenericVector<STRING>* not_to_load) { STRING remains(lang_str); while (remains.length() > 0) { // Find the start of the lang code and which vector to add to. const char* start = remains.string(); while (*start == '+') ++start; GenericVector<STRING>* target = to_load; if (*start == '~') { target = not_to_load; ++start; } // Find the index of the end of the lang code in string start. int end = strlen(start); const char* plus = strchr(start, '+'); if (plus != NULL && plus - start < end) end = plus - start; STRING lang_code(start); lang_code.truncate_at(end); STRING next(start + end); remains = next; // Check whether lang_code is already in the target vector and add. if (!IsStrInList(lang_code, *target)) { if (tessdata_manager_debug_level) tprintf("Adding language '%s' to list\n", lang_code.string()); target->push_back(lang_code); } } }
std::string getOSXSystemLang() { // Get the user's language list (in order of preference) CFArrayRef langs = CFLocaleCopyPreferredLanguages(); if( CFArrayGetCount( langs ) == 0 ) { return "en_US"; } const char *lang_code_raw = CFStringGetCStringPtr( ( CFStringRef )CFArrayGetValueAtIndex( langs, 0 ), kCFStringEncodingUTF8 ); if( !lang_code_raw ) { return "en_US"; } // Convert to the underscore format expected by gettext std::string lang_code( lang_code_raw ); std::replace( lang_code.begin(), lang_code.end(), '-', '_' ); /** * Handle special case for simplified/traditional Chinese. Simplified/Traditional * is actually denoted by the region code in older iterations of the * language codes, whereas now (at least on OS X) region is distinct. * That is, CDDA expects 'zh_CN' but OS X might give 'zh-Hans-CN'. */ if( string_starts_with( lang_code, "zh_Hans" ) ) { return "zh_CN"; } else if( string_starts_with( lang_code, "zh_Hant" ) ) { return "zh_TW"; } return isValidLanguage( lang_code ) ? lang_code : "en_US"; }
int main(int argc, char **argv) { const glist *seg = NULL; u32 key = 0; Pvoid_t entries = NULL; uint dc = 0; uint ic = 0; const char *stats = NULL; dub_init(); stats = pparm_common_name("istats"); open_serializer(); while ((seg = pull_head(&key))){ uint i; u32 lang = lang_code(seg); do{ for (i = 0; i < seg->len; i++){ Word_t *e = NULL; Word_t idx = seg->lst[i]; struct istat_entry *ent = NULL; JLI(e, entries, idx); if (!*e){ *e = (Word_t)xmalloc( sizeof(struct istat_entry)); ent = (struct istat_entry*)*e; memset(ent, 0, sizeof(struct istat_entry)); ent->xid = idx; } ent = (struct istat_entry*)*e; ++ent->freq; if (!ent->lang_code) ent->lang_code = lang; } }while ((seg = pull_segment())); ++dc; } ic = write_istats(stats, entries); dub_msg("Number of documents: %u\n", dc); dub_msg("Number of ixemes: %u\n", ic); return 0; }