/* Convert Unicode/PdfDocEncoding string into utf-8 */ char * pdf_to_utf8(fz_context *ctx, pdf_obj *src) { unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src); char *dstptr, *dst; int srclen = pdf_to_str_len(src); int dstlen = 0; int ucs; int i; if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) { for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] << 8 | srcptr[i+1]; dstlen += fz_runelen(ucs); } dstptr = dst = fz_malloc(ctx, dstlen + 1); for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] << 8 | srcptr[i+1]; dstptr += fz_runetochar(dstptr, ucs); } } else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) { for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] | srcptr[i+1] << 8; dstlen += fz_runelen(ucs); } dstptr = dst = fz_malloc(ctx, dstlen + 1); for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] | srcptr[i+1] << 8; dstptr += fz_runetochar(dstptr, ucs); } } else { for (i = 0; i < srclen; i++) dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]); dstptr = dst = fz_malloc(ctx, dstlen + 1); for (i = 0; i < srclen; i++) { ucs = pdf_doc_encoding[srcptr[i]]; dstptr += fz_runetochar(dstptr, ucs); } } *dstptr = '\0'; return dst; }
char * fz_utf8_from_wchar(const wchar_t *s) { const wchar_t *src = s; char *d; char *dst; int len = 1; while (*src) { len += fz_runelen(*src++); } d = malloc(len); if (d != NULL) { dst = d; src = s; while (*src) { dst += fz_runetochar(dst, *src++); } *dst = 0; } return d; }
/* Convert Unicode/PdfDocEncoding string into utf-8 */ char * pdf_to_utf8(pdf_document *doc, pdf_obj *src) { fz_context *ctx = doc->ctx; fz_buffer *strmbuf = NULL; unsigned char *srcptr; char *dstptr, *dst; int srclen; int dstlen = 0; int ucs; int i; fz_var(strmbuf); fz_try(ctx) { if (pdf_is_string(src)) { srcptr = (unsigned char *) pdf_to_str_buf(src); srclen = pdf_to_str_len(src); } else if (pdf_is_stream(doc, pdf_to_num(src), pdf_to_gen(src))) { strmbuf = pdf_load_stream(doc, pdf_to_num(src), pdf_to_gen(src)); srclen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&srcptr); } else { srclen = 0; } if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) { for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] << 8 | srcptr[i+1]; dstlen += fz_runelen(ucs); } dstptr = dst = fz_malloc(ctx, dstlen + 1); for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] << 8 | srcptr[i+1]; dstptr += fz_runetochar(dstptr, ucs); } } else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) { for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] | srcptr[i+1] << 8; dstlen += fz_runelen(ucs); } dstptr = dst = fz_malloc(ctx, dstlen + 1); for (i = 2; i + 1 < srclen; i += 2) { ucs = srcptr[i] | srcptr[i+1] << 8; dstptr += fz_runetochar(dstptr, ucs); } } else { for (i = 0; i < srclen; i++) dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]); dstptr = dst = fz_malloc(ctx, dstlen + 1); for (i = 0; i < srclen; i++) { ucs = pdf_replace_undefined(pdf_doc_encoding[srcptr[i]]); dstptr += fz_runetochar(dstptr, ucs); } } } fz_always(ctx) { fz_drop_buffer(ctx, strmbuf); } fz_catch(ctx) { fz_rethrow(ctx); } *dstptr = '\0'; return dst; }
/* Convert Unicode/PdfDocEncoding string into utf-8 */ char * pdf_to_utf8(fz_context *ctx, pdf_document *doc, pdf_obj *src) { fz_buffer *stmbuf = NULL; unsigned char *srcptr; char *dstptr, *dst; size_t srclen; size_t dstlen = 0; int ucs; size_t i; fz_var(stmbuf); fz_try(ctx) { if (pdf_is_string(ctx, src)) { srcptr = (unsigned char *) pdf_to_str_buf(ctx, src); srclen = pdf_to_str_len(ctx, src); } else if (pdf_is_stream(ctx, src)) { stmbuf = pdf_load_stream(ctx, doc, pdf_to_num(ctx, src), pdf_to_gen(ctx, src)); srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr); } else { srclen = 0; } /* UTF-16BE */ if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) { i = 2; while (i + 2 <= srclen) { /* skip language escape codes */ if (i + 6 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+4] == 0 && srcptr[i+5] == 27) { i += 6; } else if (i + 8 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+6] == 0 && srcptr[i+7] == 27) { i += 8; } else { i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); dstlen += fz_runelen(ucs); } } dstptr = dst = fz_malloc(ctx, dstlen + 1); i = 2; while (i + 2 <= srclen) { /* skip language escape codes */ if (i + 6 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+4] == 0 && srcptr[i+5] == 27) { i += 6; } else if (i + 8 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+6] == 0 && srcptr[i+7] == 27) { i += 8; } else { i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); dstptr += fz_runetochar(dstptr, ucs); } } } /* PDFDocEncoding */ else { for (i = 0; i < srclen; i++) dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]); dstptr = dst = fz_malloc(ctx, dstlen + 1); for (i = 0; i < srclen; i++) { ucs = pdf_doc_encoding[srcptr[i]]; dstptr += fz_runetochar(dstptr, ucs); } } } fz_always(ctx) { fz_drop_buffer(ctx, stmbuf); } fz_catch(ctx) { fz_rethrow(ctx); } *dstptr = '\0'; return dst; }