/*
 * Round-trip test: every string in iso_8859_1_a is converted from
 * ISO-8859-1 to tmpcode and back again, and the result is compared with
 * the original string via compare_buffers().
 *
 * mandatory: when non-zero, failing to open a converter for tmpcode is
 *            reported as a test failure; when zero it is tolerated.
 * tmpcode:   name of the intermediate character set.
 *
 * The function stops at the first converter that cannot be opened and at
 * the first forward conversion whose flush fails.
 */
static void dconvert(int mandatory, const char *tmpcode)
{
    int i;

    for (i = 0; iso_8859_1_a[i]; i++)
    {
        char encoded[24];               /* sample expressed in tmpcode */
        char decoded[10];               /* encoded, back in ISO-8859-1 */
        char *src = (char*) iso_8859_1_a[i];
        char *dst = encoded;
        size_t src_left = strlen(src);
        size_t dst_left = sizeof(encoded);
        size_t r;
        yaz_iconv_t cd;

        /* forward: ISO-8859-1 -> tmpcode */
        cd = yaz_iconv_open(tmpcode, "ISO-8859-1");
        YAZ_CHECK(cd || !mandatory);
        if (!cd)
            return;
        r = yaz_iconv(cd, &src, &src_left, &dst, &dst_left);
        YAZ_CHECK(r != (size_t) (-1));
        /* flush call: no input, only pending output is written */
        r = yaz_iconv(cd, 0, 0, &dst, &dst_left);
        YAZ_CHECK(r != (size_t) (-1));
        yaz_iconv_close(cd);
        if (r == (size_t) (-1))
            return;

        /* backward: tmpcode -> ISO-8859-1 */
        cd = yaz_iconv_open("ISO-8859-1", tmpcode);
        YAZ_CHECK(cd || !mandatory);
        if (!cd)
            return;
        src = encoded;
        src_left = sizeof(encoded) - dst_left;  /* bytes produced above */
        dst = decoded;
        dst_left = sizeof(decoded);
        r = yaz_iconv(cd, &src, &src_left, &dst, &dst_left);
        YAZ_CHECK(r != (size_t) (-1));
        r = yaz_iconv(cd, 0, 0, &dst, &dst_left);
        if (r == (size_t) (-1))
            fprintf(stderr, "failed\n");
        YAZ_CHECK(r != (size_t) (-1));
        if (r != (size_t) (-1))
        {
            int ret = compare_buffers("dconvert", i,
                                      strlen(iso_8859_1_a[i]),
                                      iso_8859_1_a[i],
                                      sizeof(decoded) - dst_left, decoded);
            YAZ_CHECK(ret);
        }
        yaz_iconv_close(cd);
    }
}
yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode) { yaz_iconv_t cd = (yaz_iconv_t) xmalloc(sizeof(*cd)); cd->encoder.data = 0; cd->encoder.write_handle = 0; cd->encoder.flush_handle = 0; cd->encoder.init_handle = 0; cd->encoder.destroy_handle = 0; cd->decoder.data = 0; cd->decoder.read_handle = 0; cd->decoder.init_handle = 0; cd->decoder.destroy_handle = 0; cd->my_errno = YAZ_ICONV_UNKNOWN; /* a useful hack: if fromcode has leading @, the library not use YAZ's own conversions .. */ if (fromcode[0] == '@') fromcode++; else { prepare_encoders(cd, tocode); prepare_decoders(cd, fromcode); } if (cd->decoder.read_handle && cd->encoder.write_handle) { #if HAVE_ICONV_H cd->iconv_cd = (iconv_t) (-1); #endif ; } else { #if HAVE_ICONV_H cd->iconv_cd = iconv_open(tocode, fromcode); if (cd->iconv_cd == (iconv_t) (-1)) { yaz_iconv_close(cd); return 0; } #else yaz_iconv_close(cd); return 0; #endif } cd->init_flag = 1; return cd; }
// Sets the target character set name.
//
// The comparison with the current name is case-insensitive; when the name
// is unchanged nothing happens. Otherwise any open conversion handle is
// closed, because it was created for the previous target character set.
//
// The handle is reset to 0 after closing. Without the reset, m_hIconv
// would keep pointing at a closed handle and the destructor would pass it
// to yaz_iconv_close() a second time.
void CIconv::SetTargetCharset(CString value)
{
    if (value.CompareNoCase(m_TargetCharset) == 0)
        return;

    if (m_hIconv)
    {
        yaz_iconv_close(m_hIconv);
        m_hIconv = 0;
    }
    m_TargetCharset = value;
}
/*
 * Checks that the value c survives a UCS4LE -> UTF-8 -> UCS4LE round
 * trip through yaz_iconv. The check only runs when unsigned is at least
 * 4 bytes wide.
 *
 * Returns 0 as soon as a converter cannot be opened, a conversion fails,
 * or the round-tripped bytes differ from the original ones.
 *
 * Each converter is closed before its conversion result is inspected, so
 * no handle is left open on a failure path.
 */
int utf8_check(unsigned c)
{
    if (sizeof(c) >= 4)
    {
        size_t r;
        char src[4];
        char dst[4];
        char utf8buf[6];
        char *inbuf = src;
        size_t inbytesleft = 4;
        char *outbuf = utf8buf;
        size_t outbytesleft = sizeof(utf8buf);
        int i;
        yaz_iconv_t cd = yaz_iconv_open("UTF-8", "UCS4LE");

        if (!cd)
            return 0;
        /* store c as four little-endian bytes */
        for (i = 0; i < 4; i++)
            src[i] = c >> (i*8);

        r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
        yaz_iconv_close(cd);
        if (r == (size_t)(-1))
            return 0;

        cd = yaz_iconv_open("UCS4LE", "UTF-8");
        if (!cd)
            return 0;
        inbytesleft = sizeof(utf8buf) - outbytesleft; /* UTF-8 bytes made */
        inbuf = utf8buf;
        outbuf = dst;
        outbytesleft = 4;

        r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
        /* close first: returning before the close leaked the handle */
        yaz_iconv_close(cd);
        if (r == (size_t)(-1))
            return 0;

        if (memcmp(src, dst, 4))
            return 0;
    }
/*
 * Converts an OPAC record using the rule chain of p.
 *
 * The first rule of p must be a MARC rule (its constructor must be
 * construct_marc); its settings drive the decoding of the OPAC record.
 * The decoded text is then passed through the remaining rules
 * (r->next onwards) and the final result is stored in output_record.
 *
 * Returns -1 when the first rule is missing or is not a MARC rule (an
 * explanation is appended to p->wr_error); otherwise returns whatever
 * yaz_record_conv_record_rule() returns.
 */
int yaz_record_conv_opac_record(yaz_record_conv_t p,
                                Z_OPACRecord *input_record,
                                WRBUF output_record)
{
    struct yaz_record_conv_rule *r = p->rules;
    struct marc_info *mi;
    const char *input_charset;
    yaz_iconv_t cd;
    WRBUF res;
    yaz_marc_t mt;
    int ret;

    if (!r || r->type->construct != construct_marc)
    {
        /* no marc rule so we can't do OPAC */
        wrbuf_puts(p->wr_error, "Expecting MARC rule as first rule for OPAC");
        return -1;
    }

    mi = r->info;
    input_charset = mi->input_charset;
    res = wrbuf_alloc();
    mt = yaz_marc_create();

    /* the record content may override the configured input charset */
    if (yaz_opac_check_marc21_coding(input_charset, input_record))
        input_charset = "utf-8";
    cd = yaz_iconv_open(mi->output_charset, input_charset);

    wrbuf_rewind(p->wr_error);
    yaz_marc_xml(mt, mi->output_format_mode);
    yaz_marc_iconv(mt, cd);
    yaz_opac_decode_wrbuf(mt, input_record, res);

    /* feed the decoded record to the rules that follow the MARC rule */
    ret = yaz_record_conv_record_rule(p, r->next,
                                      wrbuf_buf(res), wrbuf_len(res),
                                      output_record);

    yaz_marc_destroy(mt);
    if (cd)
        yaz_iconv_close(cd);
    wrbuf_destroy(res);
    return ret;
}
/*
 * Checks conversion from "UCS4" to "UTF8" for two inputs of two
 * characters each. Nothing further is tested when the converter cannot
 * be opened.
 */
static void tst_ucs4b_to_utf8(void)
{
    yaz_iconv_t cd = yaz_iconv_open("UTF8", "UCS4");

    YAZ_CHECK(cd);
    if (cd)
    {
        /* 0000FF1F 0000006F  ->  EF BC 9F, 6F */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      8, "\x00\x00\xFF\x1F\x00\x00\x00o",
                      4, "\xEF\xBC\x9F\x6F"));

        /* 0000AE0E 0000C0F4  ->  EA B8 8E, EC 83 B4 */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      8, "\x00\x00\xAE\x0E\x00\x00\xC0\xF4",
                      6, "\xEA\xB8\x8E\xEC\x83\xB4"));

        yaz_iconv_close(cd);
    }
}
/*
 * Applies a MARC conversion rule to a record, in place.
 *
 * info:     the struct marc_info describing the rule (formats, character
 *           sets, optional leader spec).
 * record:   on entry the input record; on success it is rewound and
 *           rewritten in the rule's output format.
 * wr_error: receives a short message for every failure path.
 *
 * Supported input formats are ISO2709, MARCXML and TurboMARC; any other
 * input format mode is rejected.
 *
 * Returns 0 on success and non-zero on failure.
 */
static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
{
    struct marc_info *mi = info;
    const char *input_charset = mi->input_charset;
    int ret = 0;
    yaz_marc_t mt = yaz_marc_create();

    yaz_marc_xml(mt, mi->output_format_mode);
    if (mi->leader_spec)
        yaz_marc_leader_spec(mt, mi->leader_spec);

    if (mi->input_format_mode == YAZ_MARC_ISO2709)
    {
        int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
                                       wrbuf_len(record));
        if (sz > 0)
        {
            /* the record itself may indicate that it is UTF-8 encoded */
            if (yaz_marc_check_marc21_coding(input_charset, wrbuf_buf(record),
                                             wrbuf_len(record)))
                input_charset = "utf-8";
            ret = 0;
        }
        else
        {
            /* report the failure like every other branch does, instead
               of returning -1 with an empty error buffer */
            wrbuf_printf(wr_error, "yaz_marc_read_iso2709 failed");
            ret = -1;
        }
    }
    else if (mi->input_format_mode == YAZ_MARC_MARCXML ||
             mi->input_format_mode == YAZ_MARC_TURBOMARC)
    {
        xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
                                       wrbuf_len(record));
        if (!doc)
        {
            wrbuf_printf(wr_error, "xmlParseMemory failed");
            ret = -1;
        }
        else
        {
            ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
            if (ret)
                wrbuf_printf(wr_error, "yaz_marc_read_xml failed");
            xmlFreeDoc(doc);
        }
    }
    else
    {
        wrbuf_printf(wr_error, "unsupported input format");
        ret = -1;
    }

    if (ret == 0)
    {
        /* character set conversion is attached only when a converter
           for the charset pair could be opened */
        yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, input_charset);

        if (cd)
            yaz_marc_iconv(mt, cd);

        wrbuf_rewind(record);
        ret = yaz_marc_write_mode(mt, record);
        if (ret)
            wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
        if (cd)
            yaz_iconv_close(cd);
    }
    yaz_marc_destroy(mt);
    return ret;
}
/*
 * Builds the rule data (struct marc_info) for a <marc> configuration
 * element.
 *
 * ptr:      the XML element; anything not named "marc" is declined
 *           silently (0 is returned and wr_error is left untouched).
 * path:     not used by this function.
 * wr_error: receives a description of the first configuration error.
 *
 * Recognised attributes: inputcharset, outputcharset, inputformat,
 * outputformat and leaderspec. Any other attribute, or an attribute
 * without a text value, is an error.
 *
 * Returns the new marc_info (which owns its NMEM) or 0 on failure; on
 * failure the NMEM is destroyed before returning.
 */
static void *construct_marc(const xmlNode *ptr, const char *path,
                            WRBUF wr_error)
{
    NMEM nmem = nmem_create();
    struct marc_info *info = nmem_malloc(nmem, sizeof(*info));
    struct _xmlAttr *attr;
    const char *input_format = 0;
    const char *output_format = 0;
    int utf8_is_default_output = 0;

    if (strcmp((const char *) ptr->name, "marc"))
        goto fail;

    info->nmem = nmem;
    info->input_charset = 0;
    info->output_charset = 0;
    info->input_format_mode = 0;
    info->output_format_mode = 0;
    info->leader_spec = 0;

    for (attr = ptr->properties; attr; attr = attr->next)
    {
        /* an attribute is accepted only if it carries a text node */
        const int has_text =
            attr->children && attr->children->type == XML_TEXT_NODE;
        int recognised = has_text;

        if (has_text)
        {
            const char *value = (const char *) attr->children->content;

            if (!xmlStrcmp(attr->name, BAD_CAST "inputcharset"))
                info->input_charset = value;
            else if (!xmlStrcmp(attr->name, BAD_CAST "outputcharset"))
                info->output_charset = value;
            else if (!xmlStrcmp(attr->name, BAD_CAST "inputformat"))
                input_format = value;
            else if (!xmlStrcmp(attr->name, BAD_CAST "outputformat"))
                output_format = value;
            else if (!xmlStrcmp(attr->name, BAD_CAST "leaderspec"))
                info->leader_spec = nmem_strdup(info->nmem, value);
            else
                recognised = 0;
        }
        if (!recognised)
        {
            wrbuf_printf(wr_error, "Element <marc>: expected attributes"
                         "'inputformat', 'inputcharset', 'outputformat' or"
                         " 'outputcharset', got attribute '%s'",
                         attr->name);
            goto fail;
        }
    }

    /* ---- input format ---- */
    if (!input_format)
    {
        wrbuf_printf(wr_error, "Element <marc>: "
                     "attribute 'inputformat' required");
        goto fail;
    }
    if (!strcmp(input_format, "marc"))
        info->input_format_mode = YAZ_MARC_ISO2709;
    else if (!strcmp(input_format, "xml"))
    {
        info->input_format_mode = YAZ_MARC_MARCXML;
        /* Libxml2 generates UTF-8 by default, so when only an output
           charset is given the conversion starts from UTF-8. */
        if (!info->input_charset && info->output_charset)
            info->input_charset = "utf-8";
    }
    else if (!strcmp(input_format, "json"))
        info->input_format_mode = YAZ_MARC_JSON;
    else
    {
        wrbuf_printf(wr_error, "Element <marc inputformat='%s'>: "
                     " Unsupported input format"
                     " defined by attribute value",
                     input_format);
        goto fail;
    }

    /* ---- output format ---- */
    if (!output_format)
    {
        wrbuf_printf(wr_error,
                     "Element <marc>: attribute 'outputformat' required");
        goto fail;
    }
    if (!strcmp(output_format, "line"))
        info->output_format_mode = YAZ_MARC_LINE;
    else if (!strcmp(output_format, "marcxml"))
    {
        info->output_format_mode = YAZ_MARC_MARCXML;
        utf8_is_default_output = 1;
    }
    else if (!strcmp(output_format, "turbomarc"))
    {
        info->output_format_mode = YAZ_MARC_TURBOMARC;
        utf8_is_default_output = 1;
    }
    else if (!strcmp(output_format, "marc"))
        info->output_format_mode = YAZ_MARC_ISO2709;
    else if (!strcmp(output_format, "marcxchange"))
    {
        info->output_format_mode = YAZ_MARC_XCHANGE;
        utf8_is_default_output = 1;
    }
    else if (!strcmp(output_format, "json"))
    {
        info->output_format_mode = YAZ_MARC_JSON;
        utf8_is_default_output = 1;
    }
    else
    {
        wrbuf_printf(wr_error, "Element <marc outputformat='%s'>: "
                     " Unsupported output format"
                     " defined by attribute value",
                     output_format);
        goto fail;
    }
    /* for these output formats a missing output charset defaults to
       UTF-8, provided an input charset is known */
    if (utf8_is_default_output
        && info->input_charset && !info->output_charset)
        info->output_charset = "utf-8";

    /* ---- character sets: both are required and must be convertible ---- */
    if (!info->output_charset)
    {
        wrbuf_printf(wr_error, "Element <marc>: "
                     "attribute 'outputcharset' missing");
        goto fail;
    }
    if (!info->input_charset)
    {
        wrbuf_printf(wr_error, "Element <marc>: "
                     "attribute 'inputcharset' missing");
        goto fail;
    }
    {
        /* trial open, only to verify that the mapping is supported */
        yaz_iconv_t cd = yaz_iconv_open(info->output_charset,
                                        info->input_charset);
        if (!cd)
        {
            wrbuf_printf(wr_error,
                         "Element <marc inputcharset='%s' outputcharset='%s'>:"
                         " Unsupported character set mapping"
                         " defined by attribute values",
                         info->input_charset, info->output_charset);
            goto fail;
        }
        yaz_iconv_close(cd);
    }

    /* take private copies of the charset names in the rule's own NMEM
       instead of keeping the pointers collected above */
    info->input_charset = nmem_strdup(info->nmem, info->input_charset);
    info->output_charset = nmem_strdup(info->nmem, info->output_charset);
    return info;

fail:
    nmem_destroy(nmem);
    return 0;
}
// Rewrites the query of a Z39.50 search request before passing the
// package on.
//
// Two optional steps are applied, in this order:
//   1. when m_stylesheet is set, the query is turned into XML, run
//      through the stylesheet and parsed back into a query;
//   2. when both charset_from and charset_to are non-empty and the query
//      is of type 1 or 101, its terms are converted between the two
//      character sets.
// If either step reports an error, a search response carrying the
// diagnostic is placed in the package and the package is NOT moved on.
// Packages that are not Z39.50 search requests are moved on untouched.
void yf::QueryRewrite::Rep::process(mp::Package &package) const
{
    Z_GDU *gdu = package.request().get();
    const bool is_search = gdu && gdu->which == Z_GDU_Z3950
        && gdu->u.z3950->which == Z_APDU_searchRequest;

    if (!is_search)
    {
        package.move();
        return;
    }

    Z_APDU *apdu_req = gdu->u.z3950;
    int error_code = 0;
    const char *addinfo = 0;
    mp::odr odr;
    Z_SearchRequest *req = apdu_req->u.searchRequest;

    // Step 1: XSLT rewrite of the query
    if (m_stylesheet)
    {
        xmlDocPtr doc_input = 0;
        yaz_query2xml(req->query, &doc_input);
        if (doc_input)
        {
            xmlDocPtr doc_res = xsltApplyStylesheet(m_stylesheet,
                                                    doc_input, 0);
            if (doc_res)
            {
                const xmlNode *root_element = xmlDocGetRootElement(doc_res);
                yaz_xml2query(root_element, &req->query, odr,
                              &error_code, &addinfo);
                xmlFreeDoc(doc_res);
            }
            else
            {
                error_code = YAZ_BIB1_MALFORMED_QUERY;
                addinfo = "XSLT transform failed for query";
            }
            xmlFreeDoc(doc_input);
        }
    }

    // Step 2: character set conversion of the query terms
    const bool convert_charset = !error_code
        && charset_to.length() && charset_from.length()
        && (req->query->which == Z_Query_type_1
            || req->query->which == Z_Query_type_101);
    if (convert_charset)
    {
        yaz_iconv_t cd = yaz_iconv_open(charset_to.c_str(),
                                        charset_from.c_str());
        if (cd)
        {
            int r = yaz_query_charset_convert_rpnquery_check(
                req->query->u.type_1, odr, cd);
            yaz_iconv_close(cd);
            if (r)
            {
                // query could not be char converted
                error_code = YAZ_BIB1_MALFORMED_QUERY;
                addinfo = "could not convert query to target charset";
            }
        }
    }

    if (error_code)
    {
        Z_APDU *f_apdu =
            odr.create_searchResponse(apdu_req, error_code, addinfo);
        package.response() = f_apdu;
        return;
    }
    package.request() = gdu;
    package.move();
}
static void dump(const char *fname, const char *from, const char *to, int input_format, int output_format, int write_using_libxml2, int print_offset, const char *split_fname, int split_chunk, int verbose, FILE *cfile, const char *leader_spec) { yaz_marc_t mt = yaz_marc_create(); yaz_iconv_t cd = 0; if (yaz_marc_leader_spec(mt, leader_spec)) { fprintf(stderr, "bad leader spec: %s\n", leader_spec); yaz_marc_destroy(mt); exit(2); } if (from && to) { cd = yaz_iconv_open(to, from); if (!cd) { fprintf(stderr, "conversion from %s to %s " "unsupported\n", from, to); yaz_marc_destroy(mt); exit(2); } yaz_marc_iconv(mt, cd); } yaz_marc_enable_collection(mt); yaz_marc_xml(mt, output_format); yaz_marc_write_using_libxml2(mt, write_using_libxml2); yaz_marc_debug(mt, verbose); if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_TURBOMARC || input_format == YAZ_MARC_XCHANGE) { #if YAZ_HAVE_XML2 marcdump_read_xml(mt, fname); #endif } else if (input_format == YAZ_MARC_LINE) { marcdump_read_line(mt, fname); } else if (input_format == YAZ_MARC_ISO2709) { FILE *inf = fopen(fname, "rb"); int num = 1; int marc_no = 0; int split_file_no = -1; if (!inf) { fprintf(stderr, "%s: cannot open %s:%s\n", prog, fname, strerror(errno)); exit(1); } if (cfile) fprintf(cfile, "char *marc_records[] = {\n"); for(;; marc_no++) { const char *result = 0; size_t len; size_t rlen; size_t len_result; size_t r; char buf[100001]; r = fread(buf, 1, 5, inf); if (r < 5) { if (r == 0) /* normal EOF, all good */ break; if (print_offset && verbose) { printf("<!-- Extra %ld bytes at end of file -->\n", (long) r); } break; } while (*buf < '0' || *buf > '9') { int i; long off = ftell(inf) - 5; printf("<!-- Skipping bad byte %d (0x%02X) at offset " "%ld (0x%lx) -->\n", *buf & 0xff, *buf & 0xff, off, off); for (i = 0; i<4; i++) buf[i] = buf[i+1]; r = fread(buf+4, 1, 1, inf); no_errors++; if (r < 1) break; } if (r < 1) { if (verbose || print_offset) printf("<!-- End of file with data -->\n"); break; } if 
(print_offset) { long off = ftell(inf) - 5; printf("<!-- Record %d offset %ld (0x%lx) -->\n", num, off, off); } len = atoi_n(buf, 5); if (len < 25 || len > 100000) { long off = ftell(inf) - 5; printf("<!-- Bad Length %ld read at offset %ld (%lx) -->\n", (long)len, (long) off, (long) off); no_errors++; break; } rlen = len - 5; r = fread(buf + 5, 1, rlen, inf); if (r < rlen) { long off = ftell(inf); printf("<!-- Premature EOF at offset %ld (%lx) -->\n", (long) off, (long) off); no_errors++; break; } while (buf[len-1] != ISO2709_RS) { if (len > sizeof(buf)-2) { r = 0; break; } r = fread(buf + len, 1, 1, inf); if (r != 1) break; len++; } if (r < 1) { printf("<!-- EOF while searching for RS -->\n"); no_errors++; break; } if (split_fname) { char fname[256]; const char *mode = 0; FILE *sf; if ((marc_no % split_chunk) == 0) { mode = "wb"; split_file_no++; } else mode = "ab"; sprintf(fname, "%.200s%07d", split_fname, split_file_no); sf = fopen(fname, mode); if (!sf) { fprintf(stderr, "Could not open %s\n", fname); split_fname = 0; } else { if (fwrite(buf, 1, len, sf) != len) { fprintf(stderr, "Could write content to %s\n", fname); split_fname = 0; no_errors++; } fclose(sf); } } len_result = rlen; r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result); if (r == -1) no_errors++; if (r > 0 && result && len_result) { if (fwrite(result, len_result, 1, stdout) != 1) { fprintf(stderr, "Write to stdout failed\n"); no_errors++; break; } } if (r > 0 && cfile) { char *p = buf; size_t i; if (marc_no) fprintf(cfile, ","); fprintf(cfile, "\n"); for (i = 0; i < r; i++) { if ((i & 15) == 0) fprintf(cfile, " \""); fprintf(cfile, "\\x%02X", p[i] & 255); if (i < r - 1 && (i & 15) == 15) fprintf(cfile, "\"\n"); } fprintf(cfile, "\"\n"); } num++; if (verbose) printf("\n"); } if (cfile) fprintf(cfile, "};\n"); fclose(inf); } { WRBUF wrbuf = wrbuf_alloc(); yaz_marc_write_trailer(mt, wrbuf); fputs(wrbuf_cstr(wrbuf), stdout); wrbuf_destroy(wrbuf); } if (cd) yaz_iconv_close(cd); 
yaz_marc_destroy(mt); }
// Releases the conversion handle, if one is currently held.
CIconv::~CIconv()
{
    if (!m_hIconv)
        return;
    yaz_iconv_close(m_hIconv);
}
/*
 * Checks conversion from "MARC8" to "UCS4" on a series of fixed inputs;
 * each expected result is given as 4-byte big-endian code units.
 * Nothing further is tested when the converter cannot be opened.
 */
static void tst_marc8_to_ucs4b(void)
{
    yaz_iconv_t cd = yaz_iconv_open("UCS4", "MARC8");

    YAZ_CHECK(cd);
    if (cd)
    {
        /* inputs that switch character set with escape sequences */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o",
                      8, "\x00\x00\xFF\x1F" "\x00\x00\x00o"));
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\033$1" "\x6F\x77\x29" /* AE0E */
                      "\x6F\x52\x7C" /* c0F4 */ "\033(B",
                      8, "\x00\x00\xAE\x0E" "\x00\x00\xC0\xF4"));
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\033$1"
                      "\x21\x50\x6E"  /* UCS 7CFB */
                      "\x21\x51\x31"  /* UCS 7D71 */
                      "\x21\x3A\x67"  /* UCS 5B89 */
                      "\x21\x33\x22"  /* UCS 5168 */
                      "\x21\x33\x53"  /* UCS 5206 */
                      "\x21\x44\x2B"  /* UCS 6790 */
                      "\033(B",
                      24, "\x00\x00\x7C\xFB"
                      "\x00\x00\x7D\x71"
                      "\x00\x00\x5B\x89"
                      "\x00\x00\x51\x68"
                      "\x00\x00\x52\x06"
                      "\x00\x00\x67\x90"));

        /* single-byte special characters */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\xB0\xB2",     /* AYN and oSLASH */
                      8, "\x00\x00\x02\xBB"  "\x00\x00\x00\xF8"));
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\xF6\x61",     /* a underscore */
                      8, "\x00\x00\x00\x61"  "\x00\x00\x03\x32"));
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\x61\xC2",     /* a, phonorecord mark */
                      8, "\x00\x00\x00\x61"  "\x00\x00\x21\x17"));

        /* regression cases; in each expected result the mark bytes
           (03xx) come after the letter they precede in the input */

        /* bug #258 */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "el" "\xe8" "am\xe8" "an", /* elaman where a is a" */
                      32,
                      "\x00\x00\x00" "e"
                      "\x00\x00\x00" "l"
                      "\x00\x00\x00" "a"
                      "\x00\x00\x03\x08"
                      "\x00\x00\x00" "m"
                      "\x00\x00\x00" "a"
                      "\x00\x00\x03\x08"
                      "\x00\x00\x00" "n"));
        /* bug #260 */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\xe5\xe8\x41",
                      12, "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08"));
        /* bug #416 */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\xEB\x74\xEC\x73",
                      12, "\x00\x00\x00\x74" "\x00\x00\x03\x61" "\x00\x00\x00\x73"));
        /* bug #416 */
        YAZ_CHECK(tst_convert_l(
                      cd,
                      0, "\xFA\x74\xFB\x73",
                      12, "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73"));

        yaz_iconv_close(cd);
    }
}