/**
 * enca_guess_destroy:
 * @analyser: Analyser state whose guessing arrays should be released.
 *
 * Releases the counts, ratings and order members of @analyser.
 * The state structure itself is not released here.
 **/
void
enca_guess_destroy(EncaAnalyserState *analyser)
{
  enca_free(analyser->counts);
  enca_free(analyser->ratings);
  enca_free(analyser->order);
}
/**
 * enca_pair_destroy:
 * @analyser: Analyser state whose pair statistics should be released.
 *
 * Releases the pair2bits, bitcounts and pairratings members of @analyser.
 * The state structure itself is not released here.
 **/
void
enca_pair_destroy(EncaAnalyserState *analyser)
{
  enca_free(analyser->pair2bits);
  enca_free(analyser->bitcounts);
  enca_free(analyser->pairratings);
}
/**
 * enca_language_destroy:
 * @analyser: Analyser state whose language part should be released.
 *
 * Releases the charsets, lcbits and ucbits members of @analyser, then
 * resets the charset count to zero and clears the language pointer so the
 * state no longer refers to any language.
 **/
void
enca_language_destroy(EncaAnalyserState *analyser)
{
  /* Per-language tables. */
  enca_free(analyser->charsets);
  enca_free(analyser->lcbits);
  enca_free(analyser->ucbits);

  /* Mark the state as having no language attached. */
  analyser->ncharsets = 0;
  analyser->lang = NULL;
}
/* process options and do some other initializations, then go through the file list and process files one by one at the end, exit and return 0 on succes, 1 on failure, 2 on troubles */ int main(int argc, char *argv[]) { char **pp_file, **flist; /* filename list pointer */ long int err; /* nonzero if process_file() ever returned nonzero */ EncaAnalyser an; /* Process command line arguments. */ pp_file = flist = process_opt(argc, argv); /* Initialization. */ if (options.verbosity_level > 2) fprintf(stderr, "Initializing language %s\n", options.language); an = enca_analyser_alloc(options.language); if (!an) { fprintf(stderr, "%s: Language `%s' is unknown or not supported.\n" "Run `%s --list languages' to get list " "of supported languages.\n" "Run `%s -L none' to test only language independent, " "multibyte encodings.\n", program_name, options.language, program_name, program_name); exit(EXIT_TROUBLE); } enca_set_threshold(an, 1.38); enca_set_multibyte(an, 1); enca_set_ambiguity(an, 1); enca_set_garbage_test(an, 1); /* Any files specified on command line? */ if (pp_file == NULL) { /* No => read stdin. */ err = process_file(an, NULL); } else { /* Process file list, cumultate the worst error in err. */ err = 0; while (*pp_file != NULL) { err |= process_file(an, *pp_file); enca_free(*pp_file); pp_file++; } } process_file(NULL, NULL); enca_analyser_free(an); enca_free(options.language); enca_free(options.target_enc_str); enca_free(flist); if (err & EXIT_TROUBLE) err = EXIT_TROUBLE; return err; }
/**
 * analyse:
 * @analyser: An analyser initialized for some language.
 * @buffer: Data to examine; must not be NULL when @size is nonzero.
 * @size: Number of bytes in @buffer.
 *
 * Runs make_guess() on @buffer and records the outcome in @analyser.
 *
 * A zero @size sets the analyser error to ENCA_EEMPTY without looking at
 * @buffer.  When make_guess() reports an error, the result is reset to
 * ENCODING_UNKNOWN.
 *
 * Returns: The result stored in @analyser.
 **/
static EncaEncoding
analyse(EncaAnalyserState *analyser,
        unsigned char *buffer,
        size_t size)
{
  analyser->result = ENCODING_UNKNOWN;

  /* Nothing to analyse. */
  if (size == 0) {
    analyser->gerrno = ENCA_EEMPTY;
    return analyser->result;
  }
  assert(buffer != NULL);

  /* Reset per-run state before guessing. */
  analyser->gerrno = 0;
  analyser->buffer = buffer;
  analyser->size = size;
  analyser->buffer2 = NULL;
  analyser->size2 = 0;

  analyser->gerrno = make_guess(analyser);
  if (analyser->gerrno != 0)
    analyser->result = ENCODING_UNKNOWN;

  /* A non-NULL buffer2 means it holds the caller's original buffer, so
   * buffer is a copy and is the one that has to be released here. */
  if (analyser->buffer2 != NULL) {
    enca_free(analyser->buffer);
  }

  return analyser->result;
}
/**
 * Reports doubly-encoded UTF-8.
 *
 * Does nothing unless the output type is OTYPE_DETAILS or OTYPE_HUMAN.
 * Otherwise runs enca_double_utf8_check() on @sample and, when it returns
 * a nonzero count, prints one line listing that many candidate charsets
 * obtained from enca_double_utf8_get_candidates().
 **/
static void
double_utf8_chk(EncaAnalyser an,
                const unsigned char *sample,
                size_t size)
{
  size_t ncand, k;
  int *candidates;

  if (!(options.output_type == OTYPE_DETAILS
        || options.output_type == OTYPE_HUMAN))
    return;

  ncand = enca_double_utf8_check(an, sample, size);
  if (ncand == 0)
    return;

  candidates = enca_double_utf8_get_candidates(an);

  /* Singular or plural heading, then the candidate names on the same line. */
  fputs(ncand == 1
        ? " Doubly-encoded to UTF-8 from"
        : " Doubly-encoded to UTF-8 from one of:",
        stdout);
  for (k = 0; k < ncand; k++)
    printf(" %s", enca_charset_name(candidates[k], ENCA_NAME_STYLE_ENCA));
  putchar('\n');

  enca_free(candidates);
}
/**
 * enca_analyser_free:
 * @analyser: An analyser to be destroyed; must not be NULL.
 *
 * Tears down the pair, double-UTF-8, guess and language parts of
 * @analyser, in that order, and then releases @analyser itself.
 **/
void
enca_analyser_free(EncaAnalyser analyser)
{
  assert(analyser != NULL);

  /* Sub-parts first... */
  enca_pair_destroy(analyser);
  enca_double_utf8_destroy(analyser);
  enca_guess_destroy(analyser);
  enca_language_destroy(analyser);

  /* ...then the state structure. */
  enca_free(analyser);
}
/**
 * Extracts the two-letter language component of locale name @locname.
 *
 * Accepted forms are a bare two-character name ("cs"), and names that
 * start with "ll_TT" followed by end of string, '.' or '+' ("cs_CZ",
 * "cs_CZ.UTF-8").  Locales listed in the dialect table are mapped to the
 * language they really denote (e.g. "cs_SK" gives "sk").
 *
 * Returns: A newly allocated two-character string to be freed by the
 * caller, or NULL when @locname is NULL or cannot be parsed.
 **/
static char*
strip_locale_name(const char *locname)
{
  /* Some supported languages can also appear as dialects of some other
   * language. */
  static const struct {
    const char *dialect;
    const char *iso639;
  }
  DIALECTS[] = {
    { "cs_SK", "sk" },
    { "ru_UA", "uk" },
  };
  size_t n, i;
  char *s;

  if (!locname)
    return NULL;

  s = enca_strdup(locname);
  n = strlen(s);

  /* Just language: en, de, fr, cs, sk, ru, etc. */
  if (n == 2)
    return s;

  /* Anything that is not a long specification (either X/Open or CEN) is
   * garbage or an unresolved locale alias.  s[5] is at worst the
   * terminating NUL because n >= 5 is checked first. */
  if (n < 5
      || s[2] != '_'
      || (s[5] != '\0' && s[5] != '.' && s[5] != '+')) {
    enca_free(s);
    /* Return NULL explicitly instead of relying on enca_free() to have
     * cleared the local pointer. */
    return NULL;
  }

  /* Convert dialects. */
  for (i = 0; i < ELEMENTS(DIALECTS); i++) {
    if (strncmp(DIALECTS[i].dialect, s, 5) == 0) {
      s[0] = DIALECTS[i].iso639[0];
      s[1] = DIALECTS[i].iso639[1];
      break;
    }
  }

  /* Cut off the territory and everything after it. */
  s[2] = '\0';

  return s;
}
/*
 * Determines the two-character language code to use.
 *
 * When lang is given, it is run through locale_alias_convert() and
 * strip_locale_name().  When lang is NULL and setlocale() support is
 * compiled in, the language is taken from detect_user_language() instead.
 *
 * With setlocale() support the global codeset is set as a side effect from
 * detect_target_charset(); codeset_free() is registered with atexit().
 *
 * Returns the strip_locale_name() result: a string to be freed by the
 * caller, or NULL when nothing could be detected or converted.
 *
 * NOTE(review): atexit() is called on every invocation, and codeset is
 * overwritten without being released first -- fine if this function runs
 * once per process; confirm against callers.
 */
char*
detect_lang(const char *lang)
{
  char *locname, *result, *cvt;

  atexit(codeset_free);

#ifdef HAVE_SETLOCALE
  if (lang != NULL) {
    /* Explicit language: try it untransformed for the codeset first, then
     * fall back to the alias-converted name. */
    codeset = detect_target_charset(lang);
    locname = locale_alias_convert(lang);
    if (codeset == NULL)
      codeset = detect_target_charset(locname);

    result = strip_locale_name(locname);
    enca_free(locname);

    return result;
  }

  /* No language given: take the locale from the environment.  locname is
   * either newly allocated or NULL; the same holds for codeset, which is a
   * separate allocation. */
  locname = detect_user_language();
  codeset = detect_target_charset(locname);

  cvt = locale_alias_convert(locname);
  result = strip_locale_name(cvt);
  enca_free(cvt);
  enca_free(locname);

  return result;
#else /* HAVE_SETLOCALE */
  /* Without setlocale() only the alias conversion is available. */
  cvt = locale_alias_convert(lang);
  result = strip_locale_name(cvt);
  enca_free(cvt);

  return result;
#endif /* HAVE_SETLOCALE */
}
/**
 * Checks whether @progpath names a regular file that we may execute.
 *
 * The path is first resolved with canonicalize_file_name().  The file is
 * considered executable when any of these holds:
 * - it is executable by others,
 * - it is executable by its owner and we are the owner,
 * - it is executable by its group and our group is the file's group.
 *
 * The real user and group IDs are queried once and cached in static
 * storage.
 *
 * Returns: 1 when the file looks executable, 0 otherwise (including when
 * the path cannot be resolved or examined).
 **/
static int
check_executability_one(const char *progpath)
{
  static uid_t uid;
  static gid_t gid;
  static int check_executability_one_initialized = 0;
  struct stat st;
  char *fname;
  int stat_failed;

  /* An unresolvable path cannot be executed; do not hand NULL to stat(). */
  fname = canonicalize_file_name(progpath);
  if (fname == NULL)
    return 0;

  stat_failed = (stat(fname, &st) != 0);
  enca_free(fname);

  /* Is it a regular file at all?  S_ISREG() compares the whole file type
   * field; testing the S_IFREG bit alone also accepts other file types
   * whose type value contains that bit. */
  if (stat_failed || !S_ISREG(st.st_mode))
    return 0;

  /* Check executability by anyone, user, group. */
  if (st.st_mode & S_IXOTH)
    return 1;

  if (!check_executability_one_initialized) {
    uid = getuid();
    gid = getgid();
    check_executability_one_initialized = 1;
  }

  /* The owner bit counts only for the owning user... */
  if ((st.st_mode & S_IXUSR) && st.st_uid == uid)
    return 1;

  /* ...and the group bit only for the owning group. */
  if ((st.st_mode & S_IXGRP) && st.st_gid == gid)
    return 1;

  return 0;
}
/*
 * Sets the external convertor to extc, replacing any previous setting.
 *
 * A name containing a slash is stored as given.  A name without a slash is
 * looked up in EXTCONV_DIR; a leading `b-' is stripped from such a name
 * with a deprecation warning printed to stderr.
 */
void
set_external_convertor(const char *extc)
{
  enca_free(extern_convertor);

  /* Explicit path: take it verbatim. */
  if (strchr(extc, '/') != NULL) {
    extern_convertor = enca_strdup(extc);
    return;
  }

  /* Bare name: drop the obsolete prefix, if any. */
  if (extc[0] == 'b' && extc[1] == '-') {
    extc += 2;
    fprintf(stderr, "%s: The `b-' prefix for standard external convertors "
                    "is deprecated.\n"
                    "I'll pretend you said `%s'.\n",
                    program_name, extc);
  }

  extern_convertor = enca_strconcat(EXTCONV_DIR, "/", extc, NULL);
}
/** * enca_analyser_alloc: * @langname: Language for which the analyser should be initialized. * * Allocates an analyser and initializes it for language @language. * * The analyser, once crerated, can be used only for language for which it * was initialized. If you need to detect encodings of texts in more than one * language, you must allocate an analyser for each one. Note however, an * analyser may occupy a considerable amount of memory (a few hundreds of kB), * so it's generally not a good idea to have several hundreds of them floating * around. * * @langname is two-letter ISO 639:1989 language code. Locale names in form * language_territory and ISO-639 English language names also may be accepted * in the future. To be on the safe side, use only names returned by * enca_get_languages(). * * Returns: The newly created #EncaAnalyser on success, #NULL on failure * (namely when @langname is unknown or otherwise invalid). **/ EncaAnalyser enca_analyser_alloc(const char *langname) { EncaAnalyserState *analyser; if (langname == NULL) return NULL; analyser = NEW(EncaAnalyserState, 1); if (!enca_language_init(analyser, langname)) { enca_free(analyser); return NULL; } enca_guess_init(analyser); enca_double_utf8_init(analyser); enca_pair_init(analyser); return analyser; }
/** * detect_user_language: * * Detect user's locale by querying several LC categories. * * NB: this is conceptually wrong, the string returned by setlocale should * be taken as opaque -- but then we would be in deep shit^Wtrouble. This * seems to actually happen on Win32. * * Returns: A string (to be freed) with locale name or NULL on failure. **/ static char* detect_user_language(void) { static const int test_categories[] = { LC_CTYPE, LC_COLLATE, #if HAVE_LC_MESSAGES LC_MESSAGES, #endif }; char *s = NULL; size_t i; for (i = 0; i < ELEMENTS(test_categories); i++) { enca_free(s); if ((s = setlocale(test_categories[i], "")) == NULL) continue; s = enca_strdup(s); if (setlocale(test_categories[i], "C") == NULL) { fprintf(stderr, "%s: Cannot set locale to the portable \"C\" locale\n", program_name); exit(EXIT_TROUBLE); } if (strcmp(s, "") == 0 || strcmp(s, "C") == 0 || strcmp(s, "POSIX") == 0 || (strncmp(s, "en", 2) == 0 && !isalpha(s[2]))) continue; if (options.verbosity_level > 2) fprintf(stderr, "Locale inherited from environment: %s\n", s); return s; } return NULL; }
/**
 * Releases the global codeset string.
 *
 * Takes no arguments and returns nothing so that it can be registered
 * with atexit().
 **/
static void
codeset_free(void)
{
  enca_free(codeset);
}
/**
 * Prints the detection result for one file to standard output.
 *
 * @fname: File name, printed as a prefix when options.prefix_filename is
 *         set.
 * @an: Analyser that produced @result; used only to format @gerrno.
 * @result: Detected encoding.
 * @gerrno: Analyser error code; a nonzero value is explained on an extra
 *          line when the output type is OTYPE_DETAILS.
 *
 * The form of the output is selected by options.output_type.  Surfaces that
 * are natural for the detected charset are not reported.  Aborts on an
 * unknown output type.
 **/
static void
print_results(const char *fname,
              EncaAnalyser an,
              EncaEncoding result,
              int gerrno)
{
  const char *name;
  char *surface_name;
  EncaSurface surf;

  /* Only surfaces beyond the charset's natural one are worth mentioning. */
  surf = result.surface & ~enca_charset_natural_surface(result.charset);

  if (options.prefix_filename)
    printf("%s: ", ffname_r(fname));

  switch (options.output_type) {
    case OTYPE_ALIASES:
    print_aliases(result.charset);
    break;

    case OTYPE_CANON:
    if (!surf) {
      puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_ENCA));
      break;
    }
    /* Charset name immediately followed by the surface name. */
    surface_name = enca_get_surface_name(surf, ENCA_NAME_STYLE_ENCA);
    fputs(enca_charset_name(result.charset, ENCA_NAME_STYLE_ENCA), stdout);
    puts(surface_name);
    enca_free(surface_name);
    break;

    case OTYPE_HUMAN:
    case OTYPE_DETAILS:
    if (!surf) {
      puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_HUMAN));
      break;
    }
    /* Charset on its own line, surfaces passed to indent_surface(). */
    surface_name = enca_get_surface_name(surf, ENCA_NAME_STYLE_HUMAN);
    puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_HUMAN));
    indent_surface(surface_name);
    enca_free(surface_name);
    break;

    case OTYPE_RFC1345:
    puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_RFC1345));
    break;

    /* The following naming styles may have no name for a charset; the name
     * of the unknown charset is printed instead. */
    case OTYPE_CS2CS:
    name = enca_charset_name(result.charset, ENCA_NAME_STYLE_CSTOCS);
    if (name == NULL)
      name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_CSTOCS);
    puts(name);
    break;

    case OTYPE_ICONV:
    name = enca_charset_name(result.charset, ENCA_NAME_STYLE_ICONV);
    if (name == NULL)
      name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_ICONV);
    puts(name);
    break;

    case OTYPE_MIME:
    name = enca_charset_name(result.charset, ENCA_NAME_STYLE_MIME);
    if (name == NULL)
      name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_MIME);
    puts(name);
    break;

    default:
    abort();
    break;
  }

  if (gerrno != 0 && options.output_type == OTYPE_DETAILS)
    printf(" Failure reason: %s.\n", enca_strerror(an, gerrno));
}
/*
 * Converts file with the external convertor program.
 *
 * A child process executes extern_convertor with the source encoding name,
 * the target encoding name and the file name as arguments.  When file has
 * no name (standard input), its contents are first copied to a temporary
 * file; that file is then passed to the convertor together with the extra
 * argument "-", which instructs it to send the result to stdout.
 *
 * from_enc is the detected encoding of the file; the target encoding is
 * taken from options.
 *
 * Returns ERR_OK on success, ERR_CANNOT when no convertor is defined or the
 * convertor failed, ERR_IOFAIL when the temporary file cannot be created or
 * filled, and ERR_EXEC when the child reports that it could not execute
 * the convertor.  On critical failure (cannot fork, cannot wait, child
 * killed) the whole program exits with EXIT_TROUBLE.
 */
int
convert_external(File *file,
                 const EncaEncoding from_enc)
{
  /* special fourth parameter passed to external convertor to instruct it to
     send result to stdout */
  static const char *STDOUT_CONV = "-";

  pid_t pid;
  int status;
  File *tempfile = NULL;
  char *from_name, *target_name, *surface_name;

  if (extern_convertor == NULL || *extern_convertor == '\0') {
    fprintf(stderr, "%s: No external convertor defined!\n", program_name);
    return ERR_CANNOT;
  }

  if (options.verbosity_level > 2)
    fprintf(stderr, " launching `%s' to convert `%s'\n",
                    extern_convertor, ffname_r(file->name));

  /* Is conversion of stdin requested? */
  if (file->name == NULL) {
    /* Then we have to copy it to a temporary file. */
    tempfile = file_temporary(file->buffer, 0);
    if (tempfile == NULL)
      return ERR_IOFAIL;

    if (copy_and_convert(file, tempfile, NULL) != 0) {
      file_unlink(tempfile->name);
      file_free(tempfile);
      return ERR_IOFAIL;
    }
  }

  /* Construct the charset names before fork().  The surface name is an
   * allocated string owned by us, so it is released once it has been
   * concatenated. */
  surface_name = enca_get_surface_name(from_enc.surface,
                                       ENCA_NAME_STYLE_ENCA);
  from_name = enca_strconcat(enca_charset_name(from_enc.charset,
                                               ENCA_NAME_STYLE_ENCA),
                             surface_name,
                             NULL);
  enca_free(surface_name);

  if (enca_charset_is_known(options.target_enc.charset)
      && (options.target_enc.surface & ENCA_SURFACE_UNKNOWN) == 0) {
    surface_name = enca_get_surface_name(options.target_enc.surface,
                                         ENCA_NAME_STYLE_ENCA);
    target_name = enca_strconcat(enca_charset_name(options.target_enc.charset,
                                                   ENCA_NAME_STYLE_ENCA),
                                 surface_name,
                                 NULL);
    enca_free(surface_name);
  }
  else
    target_name = enca_strdup(options.target_enc_str);

  /* Fork. */
  pid = vfork();
  if (pid == 0) {
    /* Child. */
    if (tempfile)
      execlp(extern_convertor, extern_convertor,
             from_name, target_name, tempfile->name,
             STDOUT_CONV, NULL);
    else
      execlp(extern_convertor, extern_convertor,
             from_name, target_name, file->name, NULL);

    /* Only reached when execlp() failed.  A vfork() child must leave with
     * _exit(): exit() would run atexit handlers and flush stdio buffers
     * that are shared with the parent. */
    _exit(ERR_EXEC);
  }

  /* Parent. */
  if (pid == -1) {
    fprintf(stderr, "%s: Cannot fork() to execute convertor: %s\n",
                    program_name,
                    strerror(errno));
    exit(EXIT_TROUBLE);
  }

  /* Wait until the child returns. */
  if (waitpid(pid, &status, 0) == -1) {
    /* Error. */
    fprintf(stderr, "%s: wait_pid() error while waiting for convertor: %s\n",
                    program_name,
                    strerror(errno));
    exit(EXIT_TROUBLE);
  }
  if (!WIFEXITED(status)) {
    /* Child exited abnormally. */
    fprintf(stderr, "%s: Child convertor process has been murdered.\n",
                    program_name);
    exit(EXIT_TROUBLE);
  }

  enca_free(from_name);
  enca_free(target_name);

  /* Remove the temporary copy of stdin in case the convertor left it
   * behind. */
  if (tempfile) {
    unlink(tempfile->name);
    file_free(tempfile);
  }

  /* Child exited normally, test exit status. */
  if (WEXITSTATUS(status) != EXIT_SUCCESS) {
    /* This means child was unable to execute convertor or convertor
     * failed. */
    fprintf(stderr, "%s: External convertor failed (error code %d)\n",
                    program_name,
                    WEXITSTATUS(status));
    if (WEXITSTATUS(status) == ERR_EXEC)
      return ERR_EXEC;
    else
      return ERR_CANNOT;
  }

  /* Success! */
  return ERR_OK;
}