Example #1
0
/**
 * find_charset_map:
 * @charset: Identifier of an 8bit charset.
 *
 * Looks up the charset -> UCS-2 map belonging to @charset.
 *
 * The charset identifiers of all UNICODE_MAP entries are resolved from the
 * entry names on the first call that gets past the negative-id check and are
 * kept in a static table for all later calls.
 *
 * Returns: Pointer to the matching UNICODE_MAP entry; #NULL when @charset is
 *          negative or no entry matches it.
 **/
static const EncaUnicodeMap*
find_charset_map(int charset)
{
  static int map_ids[ELEMENTS(UNICODE_MAP)];
  static int map_ids_ready = 0;
  size_t idx;

  /* Negative identifiers are rejected up front. */
  if (charset < 0)
    return NULL;

  /* Lazily translate the map names to charset identifiers. */
  if (!map_ids_ready) {
    idx = 0;
    while (idx < ELEMENTS(UNICODE_MAP)) {
      map_ids[idx] = enca_name_to_charset(UNICODE_MAP[idx].name);
      assert(map_ids[idx] != ENCA_CS_UNKNOWN);
      idx++;
    }
    map_ids_ready = 1;
  }

  /* Linear search; the first entry with a matching id wins. */
  idx = 0;
  while (idx < ELEMENTS(UNICODE_MAP)) {
    if (map_ids[idx] == charset)
      return &UNICODE_MAP[idx];
    idx++;
  }

  return NULL;
}
Example #2
0
/* Convert a file using the UNIX98 iconv functions.

   file      the file to convert; when file->name is NULL it is treated as
             stdin and gets closed and reopened in "wb" mode for the output
   from_enc  the encoding the file content is currently in; the target
             encoding is taken from options.target_enc

   Returns ERR_CANNOT when the conversion is refused or iconv cannot be
   opened for the encoding pair, ERR_IOFAIL when preparing the temporary
   copy or the output fails, otherwise whatever iconv_one_step() returns
   (0 on success, nonzero error code otherwise).

   NOTE(review): the original comment said that when the iconv
   implementation is not transitive (ICONV_TRANSITIVE is not defined) the
   conversion is also tried via Unicode; nothing in this function does that
   directly, presumably the helpers called here take care of it -- confirm. */
int
convert_iconv(File *file,
              EncaEncoding from_enc)
{
  static int ascii = ENCA_CS_UNKNOWN;
  File *tempfile = NULL;
  int err = ERR_IOFAIL;
  iconv_t icd;

  /* Resolve the ASCII charset id once. */
  if (!enca_charset_is_known(ascii)) {
    ascii = enca_name_to_charset("ascii");
    assert(enca_charset_is_known(ascii));
  }

  /* iconv cannot convert from a charset it has no name for. */
  if (!enca_charset_name(from_enc.charset, ENCA_NAME_STYLE_ICONV))
    return ERR_CANNOT;

  /* The same holds for a known target charset without an iconv name. */
  if (enca_charset_is_known(options.target_enc.charset)
      && !enca_charset_name(options.target_enc.charset,
                            ENCA_NAME_STYLE_ICONV))
    return ERR_CANNOT;

  /* Conversion to ASCII is never attempted, it can only damage the files
   * and upset users, nothing else. */
  if (options.target_enc.charset == ascii)
    return ERR_CANNOT;

  /* Fail early on really silly surfaces. */
  if (!acceptable_surface(from_enc)
      || !acceptable_surface(options.target_enc))
    return ERR_CANNOT;

  /* Is the conversion possible at all? */
  if (do_iconv_open(from_enc, options.target_enc, &icd) != 0)
    return ERR_CANNOT;

  /* iconv does not recode files in place, so the content is first copied
     to a temporary file: the part already sitting in the buffer is written
     first, then the rest of the file is copied.  Every step below must
     succeed, the first failure leaves err at ERR_IOFAIL. */
  tempfile = file_temporary(file->buffer, 1);
  if (tempfile == NULL)
    goto cleanup;
  if (file_write(tempfile) == -1)
    goto cleanup;
  if (copy_and_convert(file, tempfile, NULL) != 0)
    goto cleanup;

  /* Rewind a named file, rewind the temporary copy, then empty the named
     file so that it can receive the converted data. */
  if (file->name && file_seek(file, 0, SEEK_SET) != 0)
    goto cleanup;
  if (file_seek(tempfile, 0, SEEK_SET) != 0)
    goto cleanup;
  if (file->name && file_truncate(file, 0) != 0)
    goto cleanup;

  /* When the file is stdin, fake-reopen it to stdout. */
  if (!file->name
      && (file_close(file) != 0 || file_open(file, "wb") != 0))
    goto cleanup;

  /* Create the second buffer when we don't have any yet, but don't make it
     unnecessarily large, the system default suffices. */
  if (!buffer_iconv)
    buffer_iconv = buffer_new(0);
  tempfile->buffer = buffer_iconv;

  err = iconv_one_step(tempfile, file, icd);

cleanup:
  file_free(tempfile);
  do_iconv_close(icd);
  return err;
}
Example #3
0
File: lang.c Project: MrBadge/enca
/**
 * language_charsets_ids:
 * @lang: A language; must not be #NULL.
 *
 * Builds a table holding the charset identifier of every charset name
 * listed in @lang->csnames.
 *
 * The table has @lang->ncharsets items and is allocated with NEW().
 * Each name is expected to resolve to a known charset (checked by an
 * assertion).
 *
 * Returns: The charset id table; #NULL when @lang has no charsets.
 **/
static int*
language_charsets_ids(const EncaLanguageInfo *lang)
{
  int *ids;
  size_t count, k;

  assert(lang != NULL);

  count = lang->ncharsets;
  if (count == 0)
    return NULL;

  ids = NEW(int, count);
  k = 0;
  while (k < count) {
    ids[k] = enca_name_to_charset(lang->csnames[k]);
    assert(ids[k] != ENCA_CS_UNKNOWN);
    k++;
  }

  return ids;
}
Example #4
0
File: guess.c Project: MrBadge/enca
/**
 * make_guess:
 * @analyser: An analyser whose buffer is to be analysed.
 *
 * Finds encoding of @analyser->buffer and stores it in @analyser->result.
 *
 * The detection runs in stages, each of which may finish the analysis:
 * plain ASCII (and ASCII-only multibyte tests), binary multibyte tests,
 * strict 8bit multibyte tests, filtering of binary data and box-drawing
 * characters, tolerant 8bit multibyte tests, pair analysis and finally
 * rating of the language dependent 8bit charsets.
 *
 * When @analyser->options.const_buffer and filtering are both set, the
 * original buffer is moved to @analyser->buffer2 and a copy is filtered
 * instead.
 *
 * Returns: Zero on success, nonzero error code when the encoding was not
 * determined (%ENCA_EGARBAGE, %ENCA_EFILTERED, %ENCA_ENOCS8, %ENCA_ESIGNIF,
 * %ENCA_EWINNER, or whatever ambiguous_hook() returns in ambiguous mode).
 **/
static EncaErrno
make_guess(EncaAnalyserState *analyser)
{
  /* Local aliases of analyser fields.  Note that buffer and size are plain
   * copies and have to be refreshed whenever the analyser's own buffer or
   * size changes. */
  const unsigned short int *const *const weights = analyser->lang->weights;
  const unsigned short int *const significant = analyser->lang->significant;
  size_t *const counts = analyser->counts;
  size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  const EncaAnalyserOptions *const options = &(analyser->options);
  unsigned char *buffer = analyser->buffer;
  size_t size = analyser->size;

  static int ascii = ENCA_CS_UNKNOWN; /* ASCII charset id */

  size_t fchars; /* characters filtered out */
  size_t i, cs;

  /* Initialize when we are called the first time. */
  if (ascii == ENCA_CS_UNKNOWN) {
    ascii = enca_name_to_charset("ascii");
    assert(ascii != ENCA_CS_UNKNOWN);
  }

  /* Count characters. */
  count_characters(analyser);

  /* Pure ascii file (but may be qp-encoded!). */
  if (!analyser->bin && !analyser->up) {
    if (options->multibyte_enabled) {
      if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_ASCII))
        return 0;
    }

    if (options->interpreted_surfaces && looks_like_qp(analyser)) {
      /* Quoted printables => recompute aliases and recount characters. */
      buffer = analyser->buffer;
      size = analyser->size;
      count_characters(analyser);
    }

    /* The recount above may have changed bin/up, so test them again. */
    if (!analyser->bin && !analyser->up) {
      /* Plain ascii. */
      analyser->result.charset = ascii;
      analyser->result.surface |= enca_eol_surface(buffer, size,
                                                   analyser->counts);
      return 0;
    }
  }

  /* Binary encodings (binary noise is handled later). */
  if (analyser->bin && options->multibyte_enabled) {
    if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_BINARY))
      return 0;
  }
  /* When interpreted surfaces are not allowed and sample contains binary data,
   * we can give it up right here. */
  if (!options->interpreted_surfaces && analyser->bin)
    return ENCA_EGARBAGE;

  /* Multibyte 8bit sample (utf-8), this has to be tested before
   * filtering too -- no language independent multibyte encoding can be
   * assumed to survive it. */
  if (!analyser->bin && analyser->up && options->multibyte_enabled) {
    if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_8BIT))
      return 0;
  }

  /* Now it can still be a regular 8bit charset (w/ or w/o noise), language
   * dependent MBCS (w/ or w/o noise), ascii w/ noise or just garbage. */

  /* When the buffer must be treated as const and filters are enabled
   * (and we didn't create a copy earlier), create a copy and store
   * the original into buffer2. */
  if (options->const_buffer
      && options->filtering
      && analyser->buffer2 == NULL) {
    analyser->buffer2 = buffer;
    analyser->size2 = size;
    analyser->buffer = memcpy(enca_malloc(size), buffer, size);
    buffer = analyser->buffer;
  }

  /* Filter out blocks of binary data and box-drawing characters.
   * fchars accumulates the number of characters both filters replaced. */
  fchars = 0;
  if (options->filtering) {
    if (analyser->bin) {
      fchars = filter_binary(buffer, size, FILL_CHARACTER);
      if (fchars)
        analyser->result.surface |= ENCA_SURFACE_EOL_BIN;
    }
    fchars += enca_filter_boxdraw(analyser, FILL_CHARACTER);
  }

  /* At least something should remain after filtering. */
  if (size - fchars < sqrt((double)size))
    return ENCA_EFILTERED;

  /* Detect surface. */
  analyser->result.surface |= enca_eol_surface(buffer, size, counts);

  /* When sample has been damaged by filters, recount characters. */
  if (fchars) {
    count_characters(analyser);

    /* No 8bit characters survived the filters: report ASCII with the
     * binary surface flag set. */
    if (!analyser->up) {
      analyser->result.charset = ascii;
      /* FIXME: What if it's ASCII + box-drawing characters? */
      analyser->result.surface |= ENCA_SURFACE_EOL_BIN;
      return 0;
    }
  }

  /* Check multibyte 8bit sample (utf-8) again.
   * Chances are filtering helped it, even if it most probably destroyed it. */
  if (analyser->up && options->multibyte_enabled) {
    if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT))
      return 0;
  }

  /* When no regular charsets are present (i.e. language is `none')
   * nothing of the following procedure makes sense so just quit. */
  if (analyser->ncharsets == 0)
    return ENCA_ENOCS8;

  /* How many significant characters we caught? */
  if (!check_significant(analyser))
      return ENCA_ESIGNIF;

  /* Try pair analysis first. */
  if (enca_pair_analyse(analyser))
    return 0;

  /* Regular, language dependent 8bit charsets.
   *
   * When w_rs is relative occurrence of character s in charset r we multiply
   * count[s] with factor (the sum in denominator is so-called significancy)
   *
   *          w
   *           rs
   *   ----------------
   *           ___
   *          \
   *    eps +  >   w
   *          /___  rs
   *            r
   *
   * Languages without a weights table must supply a ratinghook that
   * computes the ratings instead.
   */
  if (weights) {
    for (cs = 0; cs < analyser->ncharsets; cs++) {
      ratings[cs] = 0.0;
      for (i = 0; i < 0x100; i++) {
        ratings[cs] += weights[cs][i]/(significant[i] + EPSILON)*counts[i];
      }
    }
  } else {
    assert(analyser->lang->ratinghook);
    analyser->lang->ratinghook(analyser);
  }

  /* Find winner and second best. */
  enca_find_max_sec(analyser);

  /* Run language specific hooks. */
  if (analyser->ncharsets > 1 && analyser->lang->hook)
    analyser->lang->hook(analyser);

  /* Now we have found charset with the best relative ratings
     but we need an absolute test to detect total garbage. */
  if (options->test_garbageness && weights
      && test_garbage(analyser))
      return ENCA_EGARBAGE;

  /* Do we have a winner?  A single candidate wins automatically. */
  if (analyser->ncharsets == 1) {
    analyser->result.charset = analyser->charsets[order[0]];
    return 0;
  }

  /* The best rating must exceed the second best by the threshold factor. */
  if (ratings[order[0]]/(ratings[order[1]] + EPSILON)
      < options->threshold + EPSILON) {
    /* Unfortunately no, but in ambiguous mode have the last chance. */
    if (options->ambiguous_mode && weights)
      return ambiguous_hook(analyser);

    return ENCA_EWINNER;
  }
  analyser->result.charset = analyser->charsets[order[0]];

  return 0;
}
Example #5
0
/* Process the file named fname; this is the `boss' function.

   an     the analyser to use; passing NULL requests cleanup only: the
          persistent i/o buffer is released and 0 is returned
   fname  name of the file to process, handed to file_new()
          (presumably NULL stands for stdin -- the code below treats a
          file without a name that way)

   The file is read, its encoding is guessed and then, depending on
   options.output_type, it is either converted or the result is printed.

   Returns 0 on success, 1 on failure (encoding not recognized or the
   conversion not possible), EXIT_TROUBLE on troubles (i/o errors and
   the like). */
static int
process_file(EncaAnalyser an,
             const char *fname)
{
  static int utf8 = ENCA_CS_UNKNOWN;
  static Buffer *buffer = NULL; /* persistent i/o buffer */
  int ot_is_convert = (options.output_type == OTYPE_CONVERT);

  EncaEncoding result; /* the guessed encoding */
  File *file; /* the processed file */

  if (!an) {
    buffer_free(buffer);
    /* Forget the released buffer so that a later call allocates a fresh
       one instead of using (or freeing again) the dangling pointer. */
    buffer = NULL;
    return 0;
  }

  /* Initialize when we are called the first time. */
  if (buffer == NULL)
    buffer = buffer_new(buffer_size);

  if (!enca_charset_is_known(utf8)) {
    utf8 = enca_name_to_charset("utf8");
    assert(enca_charset_is_known(utf8));
  }

  /* Read sample.  Conversion needs the file opened for update too. */
  file = file_new(fname, buffer);
  if (file_open(file, ot_is_convert ? "r+b" : "rb") != 0) {
    file_free(file);
    return EXIT_TROUBLE;
  }
  if (file_read(file) == -1) {
    file_free(file);
    return EXIT_TROUBLE;
  }
  /* Only the sample is needed when nothing is going to be converted. */
  if (!ot_is_convert)
    file_close(file);

  /* Guess encoding.  The const variant is used for conversion since the
     buffer content is still needed afterwards. */
  dwim_libenca_options(an, file);
  if (ot_is_convert)
    result = enca_analyse_const(an, buffer->data, buffer->pos);
  else
    result = enca_analyse(an, buffer->data, buffer->pos);

  /* Is conversion required? */
  if (ot_is_convert) {
    int err = 0;

    if (enca_charset_is_known(result.charset))
      err = convert(file, result);
    else {
      /* An empty sample is not worth a complaint. */
      if (enca_errno(an) != ENCA_EEMPTY) {
        fprintf(stderr, "%s: Cannot convert `%s' from unknown encoding\n",
                        program_name,
                        ffname_r(file->name));
      }
      /* Copy stdin to stdout unchanged. */
      if (file->name == NULL)
        err = copy_and_convert(file, file, NULL);
    }

    file_free(file);
    /* Failure: the encoding was not recognized (and the sample was not
       just empty), or the conversion is not possible. */
    if ((err == ERR_OK && !enca_charset_is_known(result.charset)
         && enca_errno(an) != ENCA_EEMPTY)
        || err == ERR_CANNOT)
      return 1;

    return (err == ERR_OK) ? EXIT_SUCCESS : EXIT_TROUBLE;
  }

  /* Print results. */
  print_results(file->name, an, result, enca_errno(an));
  if (result.charset == utf8)
    double_utf8_chk(an, buffer->data, buffer->pos);

  file_free(file);

  return enca_charset_is_known(result.charset) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Example #6
0
/**
 * enca_language_hook_eol:
 * @analyser: Analyser whose charset ratings are to be modified.
 * @ncs: The number of charsets (items of @hookdata).
 * @hookdata: What characters of which charsets should be decided with based
 *            on the EOL type.
 *
 * Decide between charsets differing only in EOL type or other surface.
 *
 * Nothing happens unless the @ncs best rated charsets all have the same
 * rating (within EPSILON) and are exactly the charsets listed in @hookdata.
 *
 * The (surface mask, charset) pairs are then scanned in order.  For the
 * first pair whose mask matches the detected surface, ratings of all other
 * charsets in the list are zeroed.  So you can place a surface mask of all
 * 1s at the end to match when nothing else matches.
 *
 * Charset indices in @hookdata that are still unset ((size_t)-1) are
 * resolved from the charset names and stored back into @hookdata.
 *
 * It also recomputes @order when something changes.
 *
 * Returns: Nonzero when @ratings were actually modified, zero otherwise.
 **/
int
enca_language_hook_eol(EncaAnalyserState *analyser,
                       size_t ncs,
                       EncaLanguageHookDataEOL *hookdata)
{
  const int *const charset_ids = analyser->charsets;
  const size_t total = analyser->ncharsets;
  const size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  size_t idx, pos, other;
  int modified;

  assert(total > 0);
  assert(ncs <= total);
  if (ncs < 2)
    return 0;

  /* The ncs top ranked charsets must be rated equally. */
  for (idx = 0; idx + 1 < ncs; idx++) {
    if (fabs(ratings[order[idx]] - ratings[order[idx + 1]]) > EPSILON)
      return 0;
  }

  /* Resolve charset indices and verify every listed charset is among the
   * first ncs ones. */
  for (idx = 0; idx < ncs; idx++) {
    EncaLanguageHookDataEOL *entry = &hookdata[idx];

    /* Look the charset up by name when its index is not known yet. */
    if (entry->cs == (size_t)-1) {
      const int id = enca_name_to_charset(entry->name);

      assert(id != ENCA_CS_UNKNOWN);
      for (pos = 0; pos < total; pos++) {
        if (charset_ids[pos] == id)
          break;
      }
      assert(pos < total);
      entry->cs = pos;
    }

    /* A charset outside the first ncs positions means there is nothing
     * to decide here. */
    for (pos = 0; pos < ncs; pos++) {
      if (order[pos] == entry->cs)
        break;
    }
    if (pos == ncs)
      return 0;
  }

  /* The first entry whose surface mask matches decides. */
  for (idx = 0; idx < ncs; idx++) {
    if (!(hookdata[idx].eol & analyser->result.surface))
      continue;

    /* Zero the ratings of all the other listed charsets. */
    modified = 0;
    for (other = 0; other < ncs; other++) {
      if (other == idx)
        continue;
      if (ratings[hookdata[other].cs] > 0.0) {
        ratings[hookdata[other].cs] = 0.0;
        modified = 1;
      }
    }
    if (modified)
      enca_find_max_sec(analyser);

    return modified;
  }

  return 0;
}
Example #7
0
/**
 * enca_language_hook_ncs:
 * @analyser: Analyser whose charset ratings are to be modified.
 * @ncs: The number of charsets (items of @hookdata).
 * @hookdata: What characters of which charsets should be given the extra
 *            weight.
 *
 * Decide between charsets differing only in a few characters.
 *
 * If the @ncs most probable charsets correspond to @hookdata charsets,
 * give the characters they differ in half the weight of all other characters
 * together, thus allowing to decide between the very similar charsets.
 *
 * Charset indices in @hookdata that are still unset ((size_t)-1) are
 * resolved from the charset names and stored back into @hookdata.
 *
 * It also recomputes @order when something changes.
 *
 * Returns: Nonzero when @ratings were actually modified, zero otherwise.
 **/
int
enca_language_hook_ncs(EncaAnalyserState *analyser,
                       size_t ncs,
                       EncaLanguageHookData1CS *hookdata)
{
  const int *const charset_ids = analyser->charsets;
  const size_t total = analyser->ncharsets;
  const size_t *counts = analyser->counts;
  const size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  size_t best, idx, pos, sum, deficit;
  double unit;

  assert(total > 0);
  assert(ncs <= total);
  if (ncs < 2)
    return 0;

  /* Resolve charset indices and verify every listed charset is among the
   * first ncs ones. */
  for (idx = 0; idx < ncs; idx++) {
    EncaLanguageHookData1CS *entry = &hookdata[idx];

    /* Look the charset up by name when its index is not known yet. */
    if (entry->cs == (size_t)-1) {
      const int id = enca_name_to_charset(entry->name);

      assert(id != ENCA_CS_UNKNOWN);
      for (pos = 0; pos < total; pos++) {
        if (charset_ids[pos] == id)
          break;
      }
      assert(pos < total);
      entry->cs = pos;
    }

    /* A charset outside the first ncs positions means there is nothing
     * to decide here. */
    for (pos = 0; pos < ncs; pos++) {
      if (order[pos] == entry->cs)
        break;
    }
    if (pos == ncs)
      return 0;
  }

  /* Sum the extra-important characters of each charset, remember the
   * largest sum. */
  best = 0;
  for (idx = 0; idx < ncs; idx++) {
    const EncaLanguageHookData1CS *entry = &hookdata[idx];

    sum = 0;
    for (pos = 0; pos < entry->size; pos++)
      sum += counts[entry->list[pos]];
    if (sum > best)
      best = sum;
  }
  /* None of the distinguishing characters occurs in the sample. */
  if (best == 0)
    return 0;

  /* Subtract something from charsets that have less than the maximum,
   * proportionally to how far below the maximum they are. */
  unit = 0.5 * ratings[order[0]]/(best + EPSILON);
  for (idx = 0; idx < ncs; idx++) {
    const EncaLanguageHookData1CS *entry = &hookdata[idx];

    sum = 0;
    for (pos = 0; pos < entry->size; pos++)
      sum += counts[entry->list[pos]];
    deficit = best - sum;
    ratings[entry->cs] -= unit*deficit;
  }

  enca_find_max_sec(analyser);

  return 1;
}
Example #8
0
/**
 * filter_boxdraw_out:
 * @charset: Charset whose associated filter should be applied.
 * @buffer: Buffer to be filtered.
 * @size: Size of @buffer.
 * @fill_char: Replacement character for filtered bytes.
 *
 * Replaces box-drawing characters in @buffer with @fill_char.
 *
 * Not all possibly box-drawing characters are replaced, only those meeting
 * certain conditions to reduce false filtering: horizontal line characters
 * must form a run of at least two identical bytes, vertical/mixed ones must
 * be surrounded by whitespace (or by the buffer boundary on one side).
 * It's assumed isspace(@fill_char) is true (it aborts when it isn't).
 *
 * It's OK to call with @charset which has no filter associated, it just
 * returns zero then.  Buffers shorter than two bytes are never modified.
 *
 * Returns: The number of characters filtered.
 **/
static size_t
filter_boxdraw_out(int charset,
                   unsigned char *buffer,
                   size_t size,
                   unsigned char fill_char)
{
  static int charset_id[ELEMENTS(BOXDRAW)];
  static int charset_id_initialized = 0;
  const EncaBoxDraw *bd;
  size_t i, n, xout;

  assert(enca_isspace(fill_char));

  /* Resolve the charset ids of all filters on the first call. */
  if (!charset_id_initialized) {
    for (i = 0; i < ELEMENTS(BOXDRAW); i++) {
      charset_id[i] = enca_name_to_charset(BOXDRAW[i].csname);
      assert(charset_id[i] != ENCA_CS_UNKNOWN);
    }
    charset_id_initialized = 1;
  }

  /* Find whether we have any filter associated with this charset. */
  bd = NULL;
  for (i = 0; i < ELEMENTS(BOXDRAW); i++) {
    if (charset_id[i] == charset) {
      bd = BOXDRAW + i;
      break;
    }
  }
  if (bd == NULL)
    return 0;

  /* Both stages need at least two bytes.  Bailing out here also keeps
   * the unsigned `size-1' below from wrapping around for an empty buffer. */
  if (size < 2)
    return 0;

  xout = 0;
  /* First stage:
   * Horizontal lines, they must occur at least two in a row. */
  i = 0;
  while (i < size-1) {
    if (buffer[i] == bd->h1 || buffer[i] == bd->h2) {
      /* Find the end of the run; the bound is tested first so that
       * buffer[size] is never read. */
      for (n = i+1; n < size && buffer[n] == buffer[i]; n++)
        ;

      if (n > i+1) {
        memset(buffer + i, fill_char, n - i);
        xout += n - i;
      }
      i = n;
    }
    else i++;
  }

  /* Second stage:
   * Vertical/mixed, they must occur separated by whitespace.
   * We assume isspace(fill_char) is true. */

  /* The first byte has only a right neighbour. */
  if (bd->isvbox[buffer[0]]
      && enca_isspace(buffer[1])) {
    buffer[0] = fill_char;
    xout++;
  }

  for (i = 1; i < size-1; i++) {
    if (bd->isvbox[buffer[i]]
        && enca_isspace(buffer[i-1])
        && enca_isspace(buffer[i+1])) {
      buffer[i] = fill_char;
      xout++;
    }
  }

  /* The last byte has only a left neighbour. */
  if (bd->isvbox[buffer[size-1]]
      && enca_isspace(buffer[size-2])) {
    buffer[size-1] = fill_char;
    xout++;
  }

  return xout;
}