/**
 * find_charset_map:
 * @charset: An 8bit charset.
 *
 * Find charset -> UCS-2 map.
 *
 * Returns: Pointer to the map, #NULL when not found.
 **/
static const EncaUnicodeMap*
find_charset_map(int charset)
{
  /* Charset ids matching UNICODE_MAP entries, resolved lazily on first
   * call and cached for the lifetime of the process. */
  static int map_ids[ELEMENTS(UNICODE_MAP)];
  static int map_ids_ready = 0;
  size_t k;

  if (charset < 0)
    return NULL;

  /* One-time resolution of charset names to numeric ids. */
  if (!map_ids_ready) {
    for (k = 0; k < ELEMENTS(UNICODE_MAP); k++) {
      map_ids[k] = enca_name_to_charset(UNICODE_MAP[k].name);
      assert(map_ids[k] != ENCA_CS_UNKNOWN);
    }
    map_ids_ready = 1;
  }

  /* Linear scan is fine here; the table is small. */
  for (k = 0; k < ELEMENTS(UNICODE_MAP); k++) {
    if (map_ids[k] == charset)
      return &UNICODE_MAP[k];
  }

  return NULL;
}
/* convert file using UNIX98 iconv functions
   returns 0 on success, nonzero error code otherwise

   when iconv implementation is not transitive (ICONV_TRANSITIVE is not
   defined), it may help to perform conversion via Unicode, so we try it
   too (probably UCS-2/ISO-10646, but maybe UTF-8---whatever has been
   detected at configure time) */
int
convert_iconv(File *file, EncaEncoding from_enc)
{
  static int ascii = ENCA_CS_UNKNOWN;  /* cached ascii charset id */
  File *tempfile = NULL;
  int err;
  iconv_t icd;

  /* Resolve the ascii charset id once, on first call. */
  if (!enca_charset_is_known(ascii)) {
    ascii = enca_name_to_charset("ascii");
    assert(enca_charset_is_known(ascii));
  }

  /* When iconv doesn't know the encodings, it can't convert between them.
   * We also don't try conversion to ASCII, it can only damage the files and
   * upset users, nothing else.
   * And fail early on really silly surfaces. */
  if (!enca_charset_name(from_enc.charset, ENCA_NAME_STYLE_ICONV)
      || (enca_charset_is_known(options.target_enc.charset)
          && !enca_charset_name(options.target_enc.charset,
                                ENCA_NAME_STYLE_ICONV))
      || options.target_enc.charset == ascii
      || !acceptable_surface(from_enc)
      || !acceptable_surface(options.target_enc))
    return ERR_CANNOT;

  /* Is the conversion possible? */
  if (do_iconv_open(from_enc, options.target_enc, &icd) != 0)
    return ERR_CANNOT;

  /* Since iconv doesn't recode files in place, we make a temporary file
     and copy contents of file fname to it.  save the current content
     first, then copy the rest.  When the file is stdin, fake-reopen it
     to stdout.

     NOTE(review): the order of the &&-chained steps below is significant:
     the temporary gets the already-buffered sample written first, then the
     remainder is copied, then both files are rewound; a regular file
     (file->name != NULL) is truncated in place, while stdin (file->name
     == NULL) is closed and "reopened" as the output stream. */
  err = ERR_IOFAIL;
  if ((tempfile = file_temporary(file->buffer, 1))
      && file_write(tempfile) != -1
      && copy_and_convert(file, tempfile, NULL) == 0
      && (!file->name || file_seek(file, 0, SEEK_SET) == 0)
      && file_seek(tempfile, 0, SEEK_SET) == 0
      && (!file->name || file_truncate(file, 0) == 0)
      && (file->name
          || (file_close(file) == 0 && file_open(file, "wb") == 0))) {
    /* Create the second buffer when we don't have any yet but don't make
       it unnecessarily large, system default suffices */
    if (!buffer_iconv)
      buffer_iconv = buffer_new(0);
    tempfile->buffer = buffer_iconv;
    err = iconv_one_step(tempfile, file, icd);
  }

  file_free(tempfile);
  do_iconv_close(icd);

  return err;
}
/** * language_charsets_ids: * @lang: A language. * * Creates and fills table of charset identifiers of charsets supported for * language @lang. * * The size of the table is determined by @lang->ncharsets. * * Returns: The charsets id table; #NULL when @lang has no charsets. **/ static int* language_charsets_ids(const EncaLanguageInfo *lang) { int *charsets; size_t i; assert(lang != NULL); if (lang->ncharsets == 0) return NULL; charsets = NEW(int, lang->ncharsets); for (i = 0; i < lang->ncharsets; i++) { charsets[i] = enca_name_to_charset(lang->csnames[i]); assert(charsets[i] != ENCA_CS_UNKNOWN); } return charsets; }
/**
 * make_guess:
 * @analyser: An analyser whose buffer is to be analysed.
 *
 * Finds encoding of @buffer and stores it in @analyser->result.
 *
 * Returns: Zero on success, nonzero error code when the encoding was not
 * determined.
 **/
static EncaErrno
make_guess(EncaAnalyserState *analyser)
{
  const unsigned short int *const *const weights = analyser->lang->weights;
  const unsigned short int *const significant = analyser->lang->significant;
  size_t *const counts = analyser->counts;
  size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  const EncaAnalyserOptions *const options = &(analyser->options);
  unsigned char *buffer = analyser->buffer;
  size_t size = analyser->size;
  static int ascii = ENCA_CS_UNKNOWN;  /* ASCII charset id */
  size_t fchars;  /* characters filtered out */
  size_t i, cs;

  /* Initialize when we are called the first time. */
  if (ascii == ENCA_CS_UNKNOWN) {
    ascii = enca_name_to_charset("ascii");
    assert(ascii != ENCA_CS_UNKNOWN);
  }

  /* Count characters. */
  count_characters(analyser);

  /* Pure ascii file (but may be qp-encoded!). */
  if (!analyser->bin && !analyser->up) {
    if (options->multibyte_enabled) {
      if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_ASCII))
        return 0;
    }
    if (options->interpreted_surfaces && looks_like_qp(analyser)) {
      /* Quoted printables => recompute aliases and recount characters.
       * looks_like_qp() may have replaced analyser->buffer, so refresh
       * the local aliases before recounting. */
      buffer = analyser->buffer;
      size = analyser->size;
      count_characters(analyser);
    }
    if (!analyser->bin && !analyser->up) {
      /* Plain ascii. */
      analyser->result.charset = ascii;
      analyser->result.surface |= enca_eol_surface(buffer, size,
                                                   analyser->counts);
      return 0;
    }
  }

  /* Binary encodings (binary noise is handled later). */
  if (analyser->bin && options->multibyte_enabled) {
    if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_BINARY))
      return 0;
  }

  /* When interpreted surfaces are not allowed and sample contains binary
   * data, we can give it up right here. */
  if (!options->interpreted_surfaces && analyser->bin)
    return ENCA_EGARBAGE;

  /* Multibyte 8bit sample (utf-8), this has to be tested before
   * filtering too -- no language independent multibyte encoding can be
   * assumed to survive it. */
  if (!analyser->bin && analyser->up && options->multibyte_enabled) {
    if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_8BIT))
      return 0;
  }

  /* Now it can still be a regular 8bit charset (w/ or w/o noise), language
   * dependent MBCS (w/ or w/o noise), ascii w/ noise or just garbage. */

  /* When the buffer must be treated as const and filters are enabled
   * (and we didn't created a copy earlier), create a copy and store
   * original into buffer2 */
  if (options->const_buffer && options->filtering
      && analyser->buffer2 == NULL) {
    analyser->buffer2 = buffer;
    analyser->size2 = size;
    analyser->buffer = memcpy(enca_malloc(size), buffer, size);
    buffer = analyser->buffer;
  }

  /* Filter out blocks of binary data and box-drawing characters. */
  fchars = 0;
  if (options->filtering) {
    if (analyser->bin) {
      fchars = filter_binary(buffer, size, FILL_CHARACTER);
      if (fchars)
        analyser->result.surface |= ENCA_SURFACE_EOL_BIN;
    }
    fchars += enca_filter_boxdraw(analyser, FILL_CHARACTER);
  }

  /* At least something should remain after filtering.
   * sqrt(size) is the heuristic threshold for "something". */
  if (size - fchars < sqrt((double)size))
    return ENCA_EFILTERED;

  /* Detect surface. */
  analyser->result.surface |= enca_eol_surface(buffer, size, counts);

  /* When sample has been damaged by filters, recount characters. */
  if (fchars) {
    count_characters(analyser);
    if (!analyser->up) {
      analyser->result.charset = ascii;
      /* FIXME: What if it's ASCII + box-drawing characters? */
      analyser->result.surface |= ENCA_SURFACE_EOL_BIN;
      return 0;
    }
  }

  /* Check multibyte 8bit sample (utf-8) again.
   * Chances are filtering helped it, even if it most probably destroyed
   * it. */
  if (analyser->up && options->multibyte_enabled) {
    if (try_test_list(analyser, ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT))
      return 0;
  }

  /* When no regular charsets are present (i.e. language is `none')
   * nothing of the following procedure has sense so just quit. */
  if (analyser->ncharsets == 0)
    return ENCA_ENOCS8;

  /* How many significant characters we caught? */
  if (!check_significant(analyser))
    return ENCA_ESIGNIF;

  /* Try pair analysis first. */
  if (enca_pair_analyse(analyser))
    return 0;

  /* Regular, language dependent 8bit charsets.
   *
   * When w_rs is relative occurence of character s in charset r we multiply
   * count[s] with factor (the sum in denominator is so-called significancy)
   *
   *             w
   *              rs
   *     ----------------
   *            ___
   *            \
   *      eps +  >    w
   *            /___   rs
   *              r
   */
  if (weights) {
    for (cs = 0; cs < analyser->ncharsets; cs++) {
      ratings[cs] = 0.0;
      for (i = 0; i < 0x100; i++) {
        ratings[cs] += weights[cs][i]/(significant[i] + EPSILON)*counts[i];
      }
    }
  }
  else {
    /* Languages without weight tables must supply their own rater. */
    assert(analyser->lang->ratinghook);
    analyser->lang->ratinghook(analyser);
  }

  /* Find winner and second best. */
  enca_find_max_sec(analyser);

  /* Run language specific hooks. */
  if (analyser->ncharsets > 1 && analyser->lang->hook)
    analyser->lang->hook(analyser);

  /* Now we have found charset with the best relative ratings but we need
     an absolute test to detect total garbage. */
  if (options->test_garbageness && weights && test_garbage(analyser))
    return ENCA_EGARBAGE;

  /* Do we have a winner?  A single candidate wins by default. */
  if (analyser->ncharsets == 1) {
    analyser->result.charset = analyser->charsets[order[0]];
    return 0;
  }
  if (ratings[order[0]]/(ratings[order[1]] + EPSILON)
      < options->threshold + EPSILON) {
    /* Unfortunately no, but in ambiguous mode have the last chance. */
    if (options->ambiguous_mode && weights)
      return ambiguous_hook(analyser);
    return ENCA_EWINNER;
  }

  analyser->result.charset = analyser->charsets[order[0]];
  return 0;
}
/* process file named fname
   this is the `boss' function
   returns 0 on success, 1 on failure, 2 on troubles

   Call with an == NULL to release the persistent i/o buffer (cleanup
   mode); that call always returns 0. */
static int
process_file(EncaAnalyser an, const char *fname)
{
  static int utf8 = ENCA_CS_UNKNOWN;    /* cached utf8 charset id */
  static Buffer *buffer = NULL;         /* persistent i/o buffer */
  int ot_is_convert = (options.output_type == OTYPE_CONVERT);
  EncaEncoding result;                  /* the guessed encoding */
  File *file;                           /* the processed file */

  /* NULL analyser means: free the persistent buffer and quit. */
  if (!an) {
    buffer_free(buffer);
    return 0;
  }

  /* Initialize when we are called the first time. */
  if (buffer == NULL)
    buffer = buffer_new(buffer_size);
  if (!enca_charset_is_known(utf8)) {
    utf8 = enca_name_to_charset("utf8");
    assert(enca_charset_is_known(utf8));
  }

  /* Read sample.  In convert mode the file is opened read-write since it
   * may be recoded in place. */
  file = file_new(fname, buffer);
  if (file_open(file, ot_is_convert ? "r+b" : "rb") != 0) {
    file_free(file);
    return EXIT_TROUBLE;
  }
  if (file_read(file) == -1) {
    file_free(file);
    return EXIT_TROUBLE;
  }
  if (!ot_is_convert)
    file_close(file);

  /* Guess encoding.  The const variant is used in convert mode so the
   * sample buffer survives for the subsequent conversion. */
  dwim_libenca_options(an, file);
  if (ot_is_convert)
    result = enca_analyse_const(an, buffer->data, buffer->pos);
  else
    result = enca_analyse(an, buffer->data, buffer->pos);

  /* Is conversion required? */
  if (ot_is_convert) {
    int err = 0;

    if (enca_charset_is_known(result.charset))
      err = convert(file, result);
    else {
      /* An empty file is not an error, anything else unrecognized is. */
      if (enca_errno(an) != ENCA_EEMPTY) {
        fprintf(stderr, "%s: Cannot convert `%s' from unknown encoding\n",
                program_name, ffname_r(file->name));
      }
      /* Copy stdin to stdout unchanged. */
      if (file->name == NULL)
        err = copy_and_convert(file, file, NULL);
    }
    file_free(file);
    /* Unknown (nonempty) encoding or impossible conversion => failure;
     * i/o trouble => EXIT_TROUBLE. */
    if ((err == ERR_OK
         && !enca_charset_is_known(result.charset)
         && enca_errno(an) != ENCA_EEMPTY)
        || err == ERR_CANNOT)
      return 1;
    return (err == ERR_OK) ? EXIT_SUCCESS : EXIT_TROUBLE;
  }

  /* Print results. */
  print_results(file->name, an, result, enca_errno(an));
  if (result.charset == utf8)
    double_utf8_chk(an, buffer->data, buffer->pos);
  file_free(file);

  return enca_charset_is_known(result.charset) ? EXIT_SUCCESS : EXIT_FAILURE;
}
/**
 * enca_language_hook_eol:
 * @analyser: Analyser whose charset ratings are to be modified.
 * @ncs: The number of charsets.
 * @hookdata: What characters of which charsets should be decided with based
 *            on the EOL type.
 *
 * Decide between two charsets differing only in EOL type or other surface.
 *
 * The (surface mask, charset) pairs are scanned in order.  If a matching
 * surface is found, ratings of all other charsets in the list are zeroed.
 * So you can place a surface mask of all 1s at the end to match when nothing
 * else matches.
 *
 * All the charsets have to have the same rating, or nothing happens.
 *
 * It also recomputes @order when something changes.
 *
 * Returns: Nonzero when @ratings were actually modified, zero otherwise.
 **/
int
enca_language_hook_eol(EncaAnalyserState *analyser,
                       size_t ncs,
                       EncaLanguageHookDataEOL *hookdata)
{
  const int *const ids = analyser->charsets;
  const size_t ncharsets = analyser->ncharsets;
  const size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  size_t j, k;

  assert(ncharsets > 0);
  assert(ncs <= ncharsets);
  if (ncs < 2)
    return 0;

  /* Rating equality check: the hook only applies when all ncs candidates
   * are rated (approximately) the same. */
  for (j = 1; j < ncs; j++) {
    if (fabs(ratings[order[j-1]] - ratings[order[j]]) > EPSILON)
      return 0;
  }

  /* Find id's and check whether they are the first */
  for (j = 0; j < ncs; j++) {
    EncaLanguageHookDataEOL *h = hookdata + j;

    /* Find charset if unknown.  h->cs == (size_t)-1 marks an unresolved
     * entry; the resolved index is cached back into @hookdata. */
    if (h->cs == (size_t)-1) {
      int id;

      id = enca_name_to_charset(h->name);
      assert(id != ENCA_CS_UNKNOWN);
      k = 0;
      while (k < ncharsets && id != ids[k])
        k++;
      assert(k < ncharsets);
      h->cs = k;
    }

    /* If any charset is not between the first ncs ones, do nothing. */
    k = 0;
    while (k < ncs && order[k] != h->cs)
      k++;
    if (k == ncs)
      return 0;
  }

  /* Find first matching EOL type. */
  for (j = 0; j < ncs; j++) {
    EncaLanguageHookDataEOL const *h = hookdata + j;

    if (h->eol & analyser->result.surface) {
      int chg = 0;

      /* Zero the ratings of every other listed charset; the winner j
       * keeps its rating. */
      for (k = 0; k < ncs; k++) {
        h = hookdata + k;
        if (k != j && ratings[h->cs] > 0.0) {
          ratings[h->cs] = 0.0;
          chg = 1;
        }
      }
      /* Re-rank only when something actually changed. */
      if (chg)
        enca_find_max_sec(analyser);
      return chg;
    }
  }

  return 0;
}
/**
 * enca_language_hook_ncs:
 * @analyser: Analyser whose charset ratings are to be modified.
 * @ncs: The number of charsets.
 * @hookdata: What characters of which charsets should be given the extra
 *            weight.
 *
 * Decide between two charsets differing only in a few characters.
 *
 * If the two most probable charsets correspond to @hookdata charsets,
 * give the characters they differ half the weight of all other characters
 * together, thus allowing to decide between the two very similar charsets.
 *
 * It also recomputes @order when something changes.
 *
 * Returns: Nonzero when @ratings were actually modified, zero otherwise.
 **/
int
enca_language_hook_ncs(EncaAnalyserState *analyser,
                       size_t ncs,
                       EncaLanguageHookData1CS *hookdata)
{
  const int *const ids = analyser->charsets;
  const size_t ncharsets = analyser->ncharsets;
  const size_t *counts = analyser->counts;
  const size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  size_t maxcnt, j, k, m;
  double q;

  assert(ncharsets > 0);
  assert(ncs <= ncharsets);
  if (ncs < 2)
    return 0;

  /*
  for (j = 0; j < ncharsets; j++) {
    fprintf(stderr, "%s:\t%g\n",
            enca_csname(ids[order[j]]), ratings[order[j]]);
  }
  */

  /* Find id's and check whether they are the first */
  for (j = 0; j < ncs; j++) {
    EncaLanguageHookData1CS *h = hookdata + j;

    /* Find charset if unknown.  h->cs == (size_t)-1 marks an unresolved
     * entry; the resolved index is cached back into @hookdata. */
    if (h->cs == (size_t)-1) {
      int id;

      id = enca_name_to_charset(h->name);
      assert(id != ENCA_CS_UNKNOWN);
      k = 0;
      while (k < ncharsets && id != ids[k])
        k++;
      assert(k < ncharsets);
      h->cs = k;
    }

    /* If any charset is not between the first ncs ones, do nothing. */
    k = 0;
    while (k < ncs && order[k] != h->cs)
      k++;
    if (k == ncs)
      return 0;
  }

  /* Sum the extra-important characters and find maximum. */
  maxcnt = 0;
  for (j = 0; j < ncs; j++) {
    EncaLanguageHookData1CS const *h = hookdata + j;

    for (m = k = 0; k < h->size; k++)
      m += counts[h->list[k]];
    if (m > maxcnt)
      maxcnt = m;
  }
  /* None of the distinguishing characters occurred: nothing to decide. */
  if (maxcnt == 0)
    return 0;

  /* Subtract something from charsets that have less than maximum.
   * q scales the penalty to half the winner's rating per missing
   * occurrence. */
  q = 0.5 * ratings[order[0]]/(maxcnt + EPSILON);
  for (j = 0; j < ncs; j++) {
    EncaLanguageHookData1CS const *h = hookdata + j;

    m = maxcnt;
    for (k = 0; k < h->size; k++)
      m -= counts[h->list[k]];
    ratings[h->cs] -= q*m;
  }
  enca_find_max_sec(analyser);

  return 1;
}
/** * filter_boxdraw_out: * @charset: Charset whose associated filter should be applied. * @buffer: Buffer to be filtered. * @size: Size of @buffer. * @fill_char: Replacement character for filtered bytes. * * Replaces box-drawing characters in @buffer with @fill_char. * * Not all possibly box-drawing characters are replaced, only those meeting * certain conditions to reduce false filtering. It's assumed * isspace(@fill_char) is true (it aborts when it isn't). * * It's OK to call with @charset which has no filter associated, it just * returns zero then. * * Returns: The number of characters filtered. **/ static size_t filter_boxdraw_out(int charset, unsigned char *buffer, size_t size, unsigned char fill_char) { static int charset_id[ELEMENTS(BOXDRAW)]; static int charset_id_initialized = 0; const EncaBoxDraw *bd; size_t i, n, xout; assert(enca_isspace(fill_char)); if (!charset_id_initialized) { for (i = 0; i < ELEMENTS(BOXDRAW); i++) { charset_id[i] = enca_name_to_charset(BOXDRAW[i].csname); assert(charset_id[i] != ENCA_CS_UNKNOWN); } charset_id_initialized = 1; } /* Find whether we have any filter associated with this charset. */ bd = NULL; for (i = 0; i < ELEMENTS(BOXDRAW); i++) { if (charset_id[i] == charset) { bd = BOXDRAW + i; break; } } if (bd == NULL) return 0; xout = 0; /* First stage: * Horizontal lines, they must occur at least two in a row. */ i = 0; while (i < size-1) { if (buffer[i] == bd->h1 || buffer[i] == bd->h2) { for (n = i+1; buffer[n] == buffer[i] && n < size; n++) ; if (n > i+1) { memset(buffer + i, fill_char, n - i); xout += n - i; } i = n; } else i++; } /* Second stage: * Vertical/mixed, they must occur separated by whitespace. * We assume isspace(fill_char) is true. 
*/ if (size > 1 && bd->isvbox[buffer[0]] && enca_isspace(buffer[1])) { buffer[0] = fill_char; xout++; } for (i = 1; i < size-1; i++) { if (bd->isvbox[buffer[i]] && enca_isspace(buffer[i-1]) && enca_isspace(buffer[i+1])) { buffer[i] = fill_char; xout++; } } if (size > 1 && bd->isvbox[buffer[size-1]] && enca_isspace(buffer[size-2])) { buffer[size-1] = fill_char; xout++; } return xout; }