Exemplo n.º 1
0
Arquivo: guess.c Projeto: MrBadge/enca
/**
 * enca_guess_destroy:
 * @analyser: Analyser whose guessing buffers should be released.
 *
 * Releases the counts, ratings and order members of @analyser.
 * The analyser structure itself is not freed here.
 **/
void
enca_guess_destroy(EncaAnalyserState *analyser)
{
  /* The three buffers are independent, so the release order is irrelevant. */
  enca_free(analyser->order);
  enca_free(analyser->ratings);
  enca_free(analyser->counts);
}
Exemplo n.º 2
0
Arquivo: pair.c Projeto: 0MasteR0/xbmc
/**
 * enca_pair_destroy:
 * @analyser: Analyser state whose pair statistics members should be released.
 *
 * Releases the pair2bits, bitcounts and pairratings members of @analyser.
 * The analyser structure itself is left allocated.
 **/
void
enca_pair_destroy(EncaAnalyserState *analyser)
{
  /* Independent buffers; the release order does not matter. */
  enca_free(analyser->pairratings);
  enca_free(analyser->bitcounts);
  enca_free(analyser->pair2bits);
}
Exemplo n.º 3
0
Arquivo: lang.c Projeto: MrBadge/enca
/**
 * enca_language_destroy:
 * @analyser: Analyser state whose language members should be released.
 *
 * Releases the charsets, lcbits and ucbits members of @analyser and resets
 * its charset count and language pointer.
 **/
void
enca_language_destroy(EncaAnalyserState *analyser)
{
  /* Forget the language binding. */
  analyser->lang = NULL;
  analyser->ncharsets = 0;

  /* Release the per-language tables. */
  enca_free(analyser->ucbits);
  enca_free(analyser->lcbits);
  enca_free(analyser->charsets);
}
Exemplo n.º 4
0
/* Program entry point: processes options, creates an analyser for the
   selected language, then runs process_file() on every file in the list
   (or once on stdin when the list is absent).
   At the end, returns 0 on success, 1 on failure, 2 on troubles. */
int
main(int argc, char *argv[])
{
  char **file_list; /* start of the file name list, kept so it can be freed */
  char **cursor; /* walks through file_list */
  long int err = 0; /* accumulated (OR-ed) process_file() results */
  EncaAnalyser an;

  /* Command line arguments first. */
  file_list = process_opt(argc, argv);

  /* Set up the analyser. */
  if (options.verbosity_level > 2)
    fprintf(stderr, "Initializing language %s\n", options.language);
  an = enca_analyser_alloc(options.language);
  if (an == NULL) {
    fprintf(stderr, "%s: Language `%s' is unknown or not supported.\n"
                    "Run `%s --list languages' to get list "
                    "of supported languages.\n"
                    "Run `%s -L none' to test only language independent, "
                    "multibyte encodings.\n",
                    program_name, options.language,
                    program_name,
                    program_name);
    exit(EXIT_TROUBLE);
  }

  enca_set_threshold(an, 1.38);
  enca_set_multibyte(an, 1);
  enca_set_ambiguity(an, 1);
  enca_set_garbage_test(an, 1);

  if (file_list == NULL) {
    /* No file names given: read stdin. */
    err = process_file(an, NULL);
  }
  else {
    /* Walk the NULL-terminated list, accumulating the results in err and
     * releasing each name once it has been processed. */
    for (cursor = file_list; *cursor != NULL; cursor++) {
      err |= process_file(an, *cursor);
      enca_free(*cursor);
    }
  }

  /* NOTE(review): the (NULL, NULL) call presumably lets process_file()
   * release its internal state -- confirm against its definition. */
  process_file(NULL, NULL);
  enca_analyser_free(an);
  enca_free(options.language);
  enca_free(options.target_enc_str);
  enca_free(file_list);

  /* Trouble takes precedence over any other accumulated bits. */
  if (err & EXIT_TROUBLE)
    err = EXIT_TROUBLE;

  return err;
}
Exemplo n.º 5
0
Arquivo: guess.c Projeto: MrBadge/enca
/**
 * analyse:
 * @analyser: An analyser initialized for some language.
 * @buffer: Buffer to be analysed.
 * @size: Size of @buffer.
 *
 * Runs make_guess() on @buffer and records the outcome in @analyser.
 * An empty buffer is not analysed at all; the error code is set to
 * ENCA_EEMPTY instead.
 *
 * Returns: The result stored in @analyser, which is ENCODING_UNKNOWN for an
 *          empty buffer or when make_guess() reports an error.
 **/
static EncaEncoding
analyse(EncaAnalyserState *analyser,
        unsigned char *buffer,
        size_t size)
{
  analyser->result = ENCODING_UNKNOWN;

  /* Nothing to look at. */
  if (size == 0) {
    analyser->gerrno = ENCA_EEMPTY;
    return analyser->result;
  }
  assert(buffer != NULL);

  /* Reset per-run state. */
  analyser->gerrno = 0;
  analyser->buffer2 = NULL;
  analyser->size2 = 0;
  analyser->buffer = buffer;
  analyser->size = size;

  /* Guess; on any error the result is forced back to unknown. */
  analyser->gerrno = make_guess(analyser);
  if (analyser->gerrno != 0)
    analyser->result = ENCODING_UNKNOWN;

  /* A non-NULL buffer2 holds the original buffer, which means buffer is
   * a copy -- and it is the copy (buffer, not buffer2!) that must be freed. */
  if (analyser->buffer2 != NULL)
    enca_free(analyser->buffer);

  return analyser->result;
}
Exemplo n.º 6
0
/**
 * Checks @sample for doubly-encoded UTF-8 and, when the check is positive,
 * prints one line listing the candidate source charsets.
 *
 * Does nothing unless the output type is OTYPE_DETAILS or OTYPE_HUMAN.
 **/
static void
double_utf8_chk(EncaAnalyser an,
                const unsigned char *sample,
                size_t size)
{
  size_t ncand, k;
  int *cand;

  if (!(options.output_type == OTYPE_DETAILS
        || options.output_type == OTYPE_HUMAN))
    return;

  ncand = enca_double_utf8_check(an, sample, size);
  if (ncand == 0)
    return;

  /* The candidate array is released at the end of this function. */
  cand = enca_double_utf8_get_candidates(an);

  if (ncand == 1)
    fputs("  Doubly-encoded to UTF-8 from", stdout);
  else
    fputs("  Doubly-encoded to UTF-8 from one of:", stdout);

  for (k = 0; k < ncand; k++)
    printf(" %s", enca_charset_name(cand[k], ENCA_NAME_STYLE_ENCA));

  putchar('\n');
  enca_free(cand);
}
Exemplo n.º 7
0
/**
 * enca_analyser_free:
 * @analyser: The analyser to dispose of; must not be #NULL.
 *
 * Tears down every part of @analyser (pair statistics, double-UTF-8 data,
 * guessing buffers, language data) and then frees the analyser itself.
 **/
void
enca_analyser_free(EncaAnalyser analyser)
{
  assert(analyser != NULL);

  /* Part destructors first ... */
  enca_pair_destroy(analyser);
  enca_double_utf8_destroy(analyser);
  enca_guess_destroy(analyser);
  enca_language_destroy(analyser);

  /* ... then the container. */
  enca_free(analyser);
}
Exemplo n.º 8
0
/**
 * Returns the `language' component of locale name @locname.
 *
 * Accepted forms are a bare two-character name ("cs") and names of the form
 * "ll_TT" optionally followed by '.' or '+' and anything else
 * ("cs_CZ", "cs_CZ.UTF-8").  A few known dialects are mapped to the language
 * enca actually supports for them.
 *
 * Returns a newly allocated two-character string that the caller must free,
 * or NULL when @locname is NULL or cannot be parsed.
 **/
static char*
strip_locale_name(const char *locname)
{
  /* Some supported languages can also appear as dialects of some other
   * language */
  struct {
    const char *dialect;
    const char *iso639;
  }
  const DIALECTS[] = {
    { "cs_SK", "sk" },
    { "ru_UA", "uk" },
  };

  size_t n, i;
  char *s;

  if (!locname)
    return NULL;

  n = strlen(locname);
  /* Just language: en, de, fr, cs, sk, ru, etc. */
  if (n == 2)
    return enca_strdup(locname);

  /* Anything that is not a long specification (either X/Open or CEN) is
   * garbage or some unresolved locale alias.  Reject it before allocating,
   * so NULL is returned explicitly rather than relying on a freed pointer
   * having been reset.  With n >= 5, index 5 is at worst the terminator. */
  if (n < 5 || locname[2] != '_'
      || (locname[5] != '\0' && locname[5] != '.' && locname[5] != '+'))
    return NULL;

  s = enca_strdup(locname);

  /* Convert dialects. */
  for (i = 0; i < ELEMENTS(DIALECTS); i++) {
    if (strncmp(DIALECTS[i].dialect, s, 5) == 0) {
      s[0] = DIALECTS[i].iso639[0];
      s[1] = DIALECTS[i].iso639[1];
      break;
    }
  }

  /* Cut everything after the language code. */
  s[2] = '\0';

  return s;
}
Exemplo n.º 9
0
/*
 * When lang is not NULL, converts it to a two-character language code;
 * otherwise tries to guess what language the user wants from the locale
 * settings.  When setlocale support is compiled in, the global codeset is
 * set as a side effect, and codeset_free() is registered with atexit().
 *
 * Returns a string of length 2 containing the language code (to be freed
 * by the caller), or NULL if not detected or unable to convert.
 */
char*
detect_lang(const char *lang)
{
  char *locname, *result, *cvt;

  atexit(codeset_free);
#ifdef HAVE_SETLOCALE
  if (lang) {
    /* We have lang: try it untransformed for the codeset first, and fall
     * back to the alias-converted name only when that yields nothing. */
    codeset = detect_target_charset(lang);
    locname = locale_alias_convert(lang);
    if (!codeset)
      codeset = detect_target_charset(locname);
    result = strip_locale_name(locname);
    enca_free(locname);
  }
  else {
    /* No lang: detect the locale, then the codeset, then try to transform
     * the locale name. */
    locname = detect_user_language();
    /* HERE: locname is (a) newly allocated (b) NULL */
    codeset = detect_target_charset(locname);
    /* HERE: codeset is (a) newly allocated, different from locname (b) NULL */
    cvt = locale_alias_convert(locname);
    result = strip_locale_name(cvt);
    enca_free(cvt);
    enca_free(locname);
  }
#else /* HAVE_SETLOCALE */
  /* Without setlocale only the alias conversion is available. */
  cvt = locale_alias_convert(lang);
  result = strip_locale_name(cvt);
  enca_free(cvt);
#endif /* HAVE_SETLOCALE */

  return result;
}
Exemplo n.º 10
0
/**
 * Checks whether @progpath is an executable we can execute.
 * Relative paths and symlinks are resolved with canonicalize_file_name().
 *
 * The file must be a regular file and must be executable either by anyone,
 * by its owner when we are the owner, or by its group when it is our
 * (primary) group.
 *
 * Returns 1 when the file looks executable by us, 0 otherwise (including
 * when the path cannot be resolved or examined).
 **/
static int
check_executability_one(const char *progpath)
{
  /* Our own ids, looked up once on first need. */
  static uid_t uid;
  static gid_t gid;
  static int check_executability_one_initialized = 0;

  struct stat st;
  int stat_ok;
  char *fname = canonicalize_file_name(progpath);

  /* Unresolvable path: nothing to execute (and nothing to free). */
  if (fname == NULL)
    return 0;

  stat_ok = (stat(fname, &st) == 0);
  enca_free(fname);

  /* Is it a regular file at all?  S_ISREG compares the whole file type
   * field; a plain bit test against S_IFREG would also accept other types
   * that share that bit (e.g. sockets). */
  if (!stat_ok || !S_ISREG(st.st_mode))
    return 0;

  /* Check executability by anyone, user, group. */
  if (st.st_mode & S_IXOTH)
    return 1;

  if (!check_executability_one_initialized) {
    uid = getuid();
    gid = getgid();
    check_executability_one_initialized = 1;
  }

  /* The owner bit applies when we own the file ... */
  if ((st.st_mode & S_IXUSR) && st.st_uid == uid)
    return 1;

  /* ... and the group bit when the file belongs to our group. */
  if ((st.st_mode & S_IXGRP) && st.st_gid == gid)
    return 1;

  return 0;
}
Exemplo n.º 11
0
/* Sets the external convertor to extc, replacing any previous setting.
 *
 * A name containing a slash is stored as given.  A name without one is
 * looked up in EXTCONV_DIR; a leading `b-' on such a name is deprecated and
 * is stripped with a warning on stderr. */
void
set_external_convertor(const char *extc)
{
  enca_free(extern_convertor);

  /* Explicit path: take it verbatim. */
  if (strchr(extc, '/') != NULL) {
    extern_convertor = enca_strdup(extc);
    return;
  }

  /* Bare name: drop the deprecated prefix, if any ... */
  if (strncmp(extc, "b-", 2) == 0) {
    extc += 2;
    fprintf(stderr, "%s: The `b-' prefix for standard external convertors "
                    "is deprecated.\n"
                    "I'll pretend you said `%s'.\n",
                    program_name,
                    extc);
  }

  /* ... and place it in the standard convertor directory. */
  extern_convertor = enca_strconcat(EXTCONV_DIR, "/", extc, NULL);
}
Exemplo n.º 12
0
/**
 * enca_analyser_alloc:
 * @langname: Language for which the analyser should be initialized.
 *
 * Allocates an analyser and initializes it for language @langname.
 *
 * An analyser, once created, can be used only for the language it was
 * initialized for.  To detect encodings of texts in more than one language,
 * allocate one analyser per language.  Note however that an analyser may
 * occupy a considerable amount of memory (a few hundreds of kB), so it is
 * generally not a good idea to keep several hundreds of them around.
 *
 * @langname is a two-letter ISO 639:1989 language code.  Locale names in the
 * form language_territory and ISO-639 English language names may also be
 * accepted in the future.  To be on the safe side, use only names returned
 * by enca_get_languages().
 *
 * Returns: The newly created #EncaAnalyser on success, #NULL on failure
 *          (namely when @langname is #NULL, unknown or otherwise invalid).
 **/
EncaAnalyser
enca_analyser_alloc(const char *langname)
{
  EncaAnalyserState *state;

  if (langname == NULL)
    return NULL;

  state = NEW(EncaAnalyserState, 1);

  /* The language part decides whether the analyser is usable at all; the
   * remaining parts are set up only after it succeeds. */
  if (enca_language_init(state, langname)) {
    enca_guess_init(state);
    enca_double_utf8_init(state);
    enca_pair_init(state);
    return state;
  }

  enca_free(state);
  return NULL;
}
Exemplo n.º 13
0
/**
 * detect_user_language:
 *
 * Detects the user's locale by querying several LC categories in turn and
 * taking the first one that is set to something other than "", "C", "POSIX"
 * or an "en" locale.  Each queried category is switched back to the "C"
 * locale afterwards; failure to do so terminates the program with
 * EXIT_TROUBLE.
 *
 * NB: this is conceptually wrong, the string returned by setlocale should
 * be taken as opaque -- but then we would be in deep trouble.  This
 * seems to actually happen on Win32.
 *
 * Returns: A string (to be freed) with locale name or NULL on failure.
 **/
static char*
detect_user_language(void)
{
  static const int test_categories[] = {
    LC_CTYPE, LC_COLLATE,
#if HAVE_LC_MESSAGES
    LC_MESSAGES,
#endif
  };
  size_t i;

  for (i = 0; i < ELEMENTS(test_categories); i++) {
    const char *current = setlocale(test_categories[i], "");
    char *s;

    if (current == NULL)
      continue;

    /* Take a private copy: the string returned by setlocale() may be
     * overwritten by the next setlocale() call. */
    s = enca_strdup(current);
    if (setlocale(test_categories[i], "C") == NULL) {
      fprintf(stderr, "%s: Cannot set locale to the portable \"C\" locale\n",
                      program_name);
      exit(EXIT_TROUBLE);
    }

    /* Uninformative locale: release the copy right away so that it does not
     * leak when this was the last category.  The character is passed to
     * isalpha() as unsigned char since negative values other than EOF are
     * not valid arguments. */
    if (strcmp(s, "") == 0
        || strcmp(s, "C") == 0
        || strcmp(s, "POSIX") == 0
        || (strncmp(s, "en", 2) == 0 && !isalpha((unsigned char)s[2]))) {
      enca_free(s);
      continue;
    }

    if (options.verbosity_level > 2)
      fprintf(stderr, "Locale inherited from environment: %s\n", s);

    return s;
  }

  return NULL;
}
Exemplo n.º 14
0
/* Releases the global codeset string.  Takes no arguments and returns
 * nothing, so it can be registered as an atexit() handler. */
static void
codeset_free(void)
{
  enca_free(codeset);
}
Exemplo n.º 15
0
/**
 * Prints the detection result for one file to stdout, in the format selected
 * by options.output_type.
 *
 * @fname is printed as a prefix when options.prefix_filename is set.
 * The surface is printed only in the canonical and human-readable formats,
 * and only the part that is not natural for the charset.
 * When @gerrno is nonzero and details were requested, the failure reason is
 * printed too.  An unknown output type aborts the program.
 **/
static void
print_results(const char *fname,
              EncaAnalyser an,
              EncaEncoding result,
              int gerrno)
{
  const char *name;
  char *surf_name;
  EncaSurface surf;

  /* Keep only the surface bits that are not implied by the charset. */
  surf = result.surface & ~enca_charset_natural_surface(result.charset);

  if (options.prefix_filename)
    printf("%s: ", ffname_r(fname));

  switch (options.output_type) {
    case OTYPE_ALIASES:
      print_aliases(result.charset);
      break;

    case OTYPE_CANON:
      if (!surf) {
        puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_ENCA));
        break;
      }
      /* Charset name immediately followed by the surface name. */
      surf_name = enca_get_surface_name(surf, ENCA_NAME_STYLE_ENCA);
      fputs(enca_charset_name(result.charset, ENCA_NAME_STYLE_ENCA), stdout);
      puts(surf_name);
      enca_free(surf_name);
      break;

    case OTYPE_HUMAN:
    case OTYPE_DETAILS:
      if (!surf) {
        puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_HUMAN));
        break;
      }
      /* Charset name on its own line, surface description after it. */
      surf_name = enca_get_surface_name(surf, ENCA_NAME_STYLE_HUMAN);
      puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_HUMAN));
      indent_surface(surf_name);
      enca_free(surf_name);
      break;

    case OTYPE_RFC1345:
      puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_RFC1345));
      break;

    /* The following styles may have no name for a charset; the name of the
     * unknown charset in the same style is printed in that case. */
    case OTYPE_CS2CS:
      name = enca_charset_name(result.charset, ENCA_NAME_STYLE_CSTOCS);
      if (name == NULL)
        name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_CSTOCS);
      puts(name);
      break;

    case OTYPE_ICONV:
      name = enca_charset_name(result.charset, ENCA_NAME_STYLE_ICONV);
      if (name == NULL)
        name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_ICONV);
      puts(name);
      break;

    case OTYPE_MIME:
      name = enca_charset_name(result.charset, ENCA_NAME_STYLE_MIME);
      if (name == NULL)
        name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_MIME);
      puts(name);
      break;

    default:
      abort();
      break;
  }

  if (gerrno && options.output_type == OTYPE_DETAILS) {
    printf("  Failure reason: %s.\n", enca_strerror(an, gerrno));
  }
}
Exemplo n.º 16
0
/* Forks, and the child executes the external convertor on file.
 *
 * When file->name is NULL (stdin), the data is first copied to a temporary
 * file, and the convertor is run on that file with an extra "-" argument
 * telling it to send the result to stdout.
 *
 * The convertor is invoked as
 *   convertor FROM_ENCODING TO_ENCODING FILE_NAME [-]
 * where the encoding names are built from from_enc and from the target
 * encoding in options.
 *
 * Returns ERR_OK on success and a nonzero ERR_* code on failure:
 * ERR_CANNOT when no convertor is defined or the convertor reported failure,
 * ERR_IOFAIL when the temporary copy could not be made, ERR_EXEC when the
 * convertor could not be executed.
 * On critical failure (cannot fork, cannot wait, child killed) the program
 * exits with EXIT_TROUBLE. */
int
convert_external(File *file,
                 const EncaEncoding from_enc)
{
  /* special fourth parameter passed to external convertor to instruct it to
  send result to stdout */
  static const char *STDOUT_CONV = "-";

  pid_t pid;
  int status;
  File *tempfile = NULL;
  char *from_name, *target_name;
  char *surface_name;

  if (extern_convertor == NULL || *extern_convertor == '\0') {
    fprintf(stderr, "%s: No external convertor defined!\n", program_name);
    return ERR_CANNOT;
  }

  if (options.verbosity_level > 2)
    fprintf(stderr, "    launching `%s' to convert `%s'\n",
                    extern_convertor, ffname_r(file->name));

  /* Is conversion of stdin requested? */
  if (file->name == NULL) {
    /* Then we have to copy it to a temporary file. */
    tempfile = file_temporary(file->buffer, 0);
    if (tempfile == NULL)
      return ERR_IOFAIL;

    if (copy_and_convert(file, tempfile, NULL) != 0) {
      file_unlink(tempfile->name);
      file_free(tempfile);
      return ERR_IOFAIL;
    }
  }

  /* Construct the charset names before fork().  The surface name is a
   * separately allocated string, so it is kept in a temporary and released
   * once it has been concatenated. */
  surface_name = enca_get_surface_name(from_enc.surface,
                                       ENCA_NAME_STYLE_ENCA);
  from_name = enca_strconcat(enca_charset_name(from_enc.charset,
                                               ENCA_NAME_STYLE_ENCA),
                             surface_name,
                             NULL);
  enca_free(surface_name);

  if (enca_charset_is_known(options.target_enc.charset)
      && (options.target_enc.surface & ENCA_SURFACE_UNKNOWN) == 0) {
    surface_name = enca_get_surface_name(options.target_enc.surface,
                                         ENCA_NAME_STYLE_ENCA);
    target_name
      = enca_strconcat(enca_charset_name(options.target_enc.charset,
                                         ENCA_NAME_STYLE_ENCA),
                       surface_name,
                       NULL);
    enca_free(surface_name);
  }
  else
    target_name = enca_strdup(options.target_enc_str);

  /* Fork. */
  pid = vfork();
  if (pid == 0) {
    /* Child. */
    if (tempfile)
      execlp(extern_convertor, extern_convertor,
             from_name, target_name, tempfile->name,
             STDOUT_CONV, NULL);
    else
      execlp(extern_convertor, extern_convertor,
             from_name, target_name, file->name, NULL);

    /* exec failed.  A vfork() child must leave through _exit(): exit()
     * would run atexit handlers and flush stdio buffers that belong to the
     * parent. */
    _exit(ERR_EXEC);
  }

  /* Parent. */
  if (pid == -1) {
    fprintf(stderr, "%s: Cannot fork() to execute convertor: %s\n",
                    program_name,
                    strerror(errno));
    exit(EXIT_TROUBLE);
  }
  /* Wait until the child returns. */
  if (waitpid(pid, &status, 0) == -1) {
    /* Error. */
    fprintf(stderr, "%s: wait_pid() error while waiting for convertor: %s\n",
                    program_name,
                    strerror(errno));
    exit(EXIT_TROUBLE);
  }
  if (!WIFEXITED(status)) {
    /* Child exited abnormally. */
    fprintf(stderr, "%s: Child convertor process has been murdered.\n",
                    program_name);
    exit(EXIT_TROUBLE);
  }

  enca_free(from_name);
  enca_free(target_name);

  /* The convertor is expected to remove the temporary file itself; the
   * unlink() result is deliberately ignored. */
  if (tempfile) {
    unlink(tempfile->name);
    file_free(tempfile);
  }

  /* Child exited normally, test exit status. */
  if (WEXITSTATUS(status) != EXIT_SUCCESS) {
    /* This means child was unable to execute convertor or convertor failed. */
    fprintf(stderr, "%s: External convertor failed (error code %d)\n",
                    program_name,
                    WEXITSTATUS(status));
    if (WEXITSTATUS(status) == ERR_EXEC)
      return ERR_EXEC;
    else
      return ERR_CANNOT;
  }
  /* Success!  Wow! */
  return ERR_OK;
}