Example #1
0
// Parse a string of the form [~]<lang>[+[~]<lang>]*.
// Langs with no prefix get appended to to_load, provided they
// are not in there already.
// Langs with ~ prefix get appended to not_to_load, provided they are not in
// there already.
// Empty lang codes (as produced by a trailing '+', or by a '~' that is not
// followed by a code) are ignored rather than being added to either list.
void Tesseract::ParseLanguageString(const char* lang_str,
                                    GenericVector<STRING>* to_load,
                                    GenericVector<STRING>* not_to_load) {
  STRING remains(lang_str);
  while (remains.length() > 0) {
    // Skip any '+' separators in front of the next lang code.
    const char* start = remains.string();
    while (*start == '+')
      ++start;
    // A '~' prefix selects the not_to_load vector instead of to_load.
    GenericVector<STRING>* target = to_load;
    if (*start == '~') {
      target = not_to_load;
      ++start;
    }
    // The lang code runs up to the next '+', or to the end of the string
    // when there is no further separator.
    const char* plus = strchr(start, '+');
    int end = plus != NULL ? static_cast<int>(plus - start)
                           : static_cast<int>(strlen(start));
    STRING lang_code(start);
    lang_code.truncate_at(end);
    // start was obtained from remains.string(), so take a copy of the tail
    // before overwriting remains.
    STRING next(start + end);
    remains = next;
    // Nothing between the separators: there is no language to record.
    if (lang_code.length() == 0)
      continue;
    // Check whether lang_code is already in the target vector and add.
    if (!IsStrInList(lang_code, *target)) {
      if (tessdata_manager_debug_level)
        tprintf("Adding language '%s' to list\n", lang_code.string());
      target->push_back(lang_code);
    }
  }
}
Example #2
0
std::string getOSXSystemLang()
{
    // Get the user's language list (in order of preference)
    CFArrayRef langs = CFLocaleCopyPreferredLanguages();
    if( CFArrayGetCount( langs ) == 0 ) {
        return "en_US";
    }

    const char *lang_code_raw = CFStringGetCStringPtr(
                                    ( CFStringRef )CFArrayGetValueAtIndex( langs, 0 ),
                                    kCFStringEncodingUTF8 );
    if( !lang_code_raw ) {
        return "en_US";
    }

    // Convert to the underscore format expected by gettext
    std::string lang_code( lang_code_raw );
    std::replace( lang_code.begin(), lang_code.end(), '-', '_' );

    /**
     * Handle special case for simplified/traditional Chinese. Simplified/Traditional
     * is actually denoted by the region code in older iterations of the
     * language codes, whereas now (at least on OS X) region is distinct.
     * That is, CDDA expects 'zh_CN' but OS X might give 'zh-Hans-CN'.
     */
    if( string_starts_with( lang_code, "zh_Hans" ) ) {
        return "zh_CN";
    } else if( string_starts_with( lang_code, "zh_Hant" ) ) {
        return "zh_TW";
    }

    return isValidLanguage( lang_code ) ? lang_code : "en_US";
}
Example #3
0
/*
 * Builds one istat_entry per distinct id seen in the pulled segments,
 * counting how often each id occurs and recording the first non-zero
 * language code it was seen with, then hands the table to write_istats()
 * and reports the totals.
 */
int main(int argc, char **argv)
{
        Pvoid_t     entries     = NULL;  /* Judy array: id -> istat_entry* */
        const char  *stats_name = NULL;
        const glist *seg        = NULL;
        u32         key         = 0;
        uint        num_docs    = 0;
        uint        num_ixemes  = 0;

        dub_init();

        stats_name = pparm_common_name("istats");

        open_serializer();

        /* Each successful pull_head() is counted as one document. */
        for (seg = pull_head(&key); seg; seg = pull_head(&key)){

                /* Language is taken from the head segment and applied to
                 * every segment pulled before the next head. */
                u32 lang = lang_code(seg);

                /* Head segment first, then whatever pull_segment() yields
                 * until it returns NULL. */
                for (; seg; seg = pull_segment()){

                        uint i;

                        for (i = 0; i < seg->len; i++){

                                Word_t             idx   = seg->lst[i];
                                Word_t             *slot = NULL;
                                struct istat_entry *ent  = NULL;

                                /* Look up, or insert, the slot for idx. */
                                JLI(slot, entries, idx);

                                if (*slot)
                                        ent = (struct istat_entry*)*slot;
                                else{
                                        /* First sighting: create a zeroed
                                         * entry and store it in the slot. */
                                        ent = (struct istat_entry*)xmalloc(
                                                sizeof(struct istat_entry));
                                        memset(ent, 0,
                                                sizeof(struct istat_entry));
                                        ent->xid = idx;
                                        *slot    = (Word_t)ent;
                                }

                                ++ent->freq;

                                /* Keep the first non-zero language code. */
                                if (!ent->lang_code)
                                        ent->lang_code = lang;
                        }
                }

                ++num_docs;
        }

        num_ixemes = write_istats(stats_name, entries);

        dub_msg("Number of documents: %u\n", num_docs);
        dub_msg("Number of ixemes: %u\n", num_ixemes);

        return 0;
}