/*
 * Work out which character encoding libxml2 should use for a response body.
 *
 * Sources are consulted in this order:
 *   1. a "charset=" parameter in the request's Content-Type;
 *   2. xmlDetectCharEncoding() on the first bytes of the body;
 *   3. a charset found by the seek_meta_ctype / seek_charset regexes;
 *   4. the configured default, or ISO-8859-1 when none is configured.
 *
 * Steps 2 and 3 are only attempted when step 1 produced no charset string
 * at all; a charset that was found but is not usable by libxml2 goes
 * straight to step 4.
 *
 * ctx   - filter context providing the request (ctx->f->r) and config
 * cbuf  - start of the body data to sniff
 * bytes - number of bytes available at cbuf
 *
 * NOTE(review): ap_regexec() is handed buf without a length, so this
 * presumably relies on the caller NUL-terminating the data - confirm.
 */
static xmlCharEncoding sniff_encoding(saxctxt* ctx, const char* cbuf, size_t bytes)
{
  request_rec* r = ctx->f->r;
  cdn_conf* cfg = ctx->cfg;
  char* buf = (char*)cbuf;
  const char* encoding = NULL;
  xmlCharEncoding ret;
  ap_regmatch_t match[2];
  char* p = NULL;

  /* Step 1: the HTTP headers win if they name a usable charset */
  if(r->content_type != NULL)
    p = ap_strcasestr(r->content_type, "charset=");

  if(p != NULL) {
    /* skip past the "charset=" token itself */
    p += sizeof("charset=") - 1;
    encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;"));
    if(encoding != NULL) {
      ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
                   "sniff_encoding: found charset %s in Content-Type", encoding);
      ret = xmlParseCharEncoding(encoding);
      if(!(ret == XML_CHAR_ENCODING_ERROR || ret == XML_CHAR_ENCODING_NONE))
        return ret;
    }
  }

  if(encoding == NULL) {
    /* Step 2: let libxml2 inspect the leading bytes */
    ret = xmlDetectCharEncoding((const xmlChar*)buf, bytes);
    if(ret != XML_CHAR_ENCODING_NONE) {
      ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
                   "sniff_encoding: got charset from XML rules");
      return ret;
    }

    /* Step 3: find the META element, then the charset inside it */
    if(ap_regexec(seek_meta_ctype, buf, 1, match, 0) == 0) {
      p = apr_pstrndup(r->pool, buf + match[0].rm_so,
                       match[0].rm_eo - match[0].rm_so);
      if(ap_regexec(seek_charset, p, 2, match, 0) == 0) {
        encoding = apr_pstrndup(r->pool, p + match[1].rm_so,
                                match[1].rm_eo - match[1].rm_so);
      }
    }
  }

  /* A charset name was found somewhere: see whether libxml2 knows it */
  if(encoding != NULL) {
    ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
                 "sniff_encoding: got charset %s from HTML META", encoding);
    ret = xmlParseCharEncoding(encoding);
    if(!(ret == XML_CHAR_ENCODING_ERROR || ret == XML_CHAR_ENCODING_NONE))
      return ret;
    ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
                 "sniff_encoding: charset %s not supported", encoding);
  }

  /* Step 4: configuration default, with ISO-8859-1 as the final fallback */
  ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
               "sniff_encoding: no suitable charset information");
  if(cfg->default_encoding == XML_CHAR_ENCODING_NONE)
    return XML_CHAR_ENCODING_8859_1;
  return cfg->default_encoding;
}
/*
 * call-seq:
 *  initialize_native(xml_sax, filename, encoding)
 *
 * Initialize the push parser with +xml_sax+ using +filename+.
 * +filename+ and +encoding+ may each be nil. A non-nil +encoding+ that
 * libxml2 does not recognise raises ArgumentError; failure to create the
 * parser context raises RuntimeError.
 */
static VALUE initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
                               VALUE encoding)
{
  htmlSAXHandlerPtr sax;
  htmlParserCtxtPtr ctx;
  const char *filename = NULL;
  xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;

  Data_Get_Struct(_xml_sax, xmlSAXHandler, sax);

  if (!NIL_P(_filename)) {
    filename = StringValuePtr(_filename);
  }

  if (encoding != Qnil) {
    enc = xmlParseCharEncoding(StringValuePtr(encoding));
    if (enc == XML_CHAR_ENCODING_ERROR) {
      rb_raise(rb_eArgError, "Unsupported Encoding");
    }
  }

  /* No initial chunk is supplied; data is pushed later */
  ctx = htmlCreatePushParserCtxt(sax, NULL, NULL, 0, filename, enc);
  if (!ctx) {
    rb_raise(rb_eRuntimeError, "Could not create a parser context");
  }

  ctx->userData = NOKOGIRI_SAX_TUPLE_NEW(ctx, self);
  ctx->sax2 = 1;
  DATA_PTR(self) = ctx;

  return self;
}
/*
 * Copy a libxml2 string into a new Perl scalar.
 *
 * string   - the bytes to copy; when NULL, &PL_sv_undef is returned
 * encoding - encoding name of the string; NULL, or a name that parses
 *            to 0, is treated as UTF-8
 *
 * When the effective encoding is UTF-8 and HAVE_UTF8 is defined, the
 * scalar's UTF-8 flag is switched on.
 */
SV*
C2Sv( const xmlChar *string, const xmlChar *encoding )
{
    SV *sv;
    STRLEN len;
    xmlCharEncoding enc;

    if ( string == NULL ) {
        return &PL_sv_undef;
    }

    enc = ( encoding != NULL )
        ? xmlParseCharEncoding( (const char*)encoding )
        : 0;
    if ( enc == 0 ) {
        /* encoding was "" or NULL: default to UTF-8 */
        enc = XML_CHAR_ENCODING_UTF8;
    }

    len = xmlStrlen( string );
    sv  = newSVpvn( (const char *)string, len );

    if ( enc == XML_CHAR_ENCODING_UTF8 ) {
        /* mark the scalar as holding UTF-8 text */
#ifdef HAVE_UTF8
        xs_warn("C2Sv: set UTF8-SV-flag\n");
        SvUTF8_on(sv);
#endif
    }

    return sv;
}
/* @node: the node that should be wrapped into a SV
 * @owner: perl instance of the owner node (may be NULL)
 *
 * This function creates a real perl instance of a given node. It is
 * called directly by the XS layer to generate a perl instance of the
 * node. All node reference counts are updated within this function, so
 * the returned value can be used directly as output.
 *
 * If @owner is NULL or undefined, the node is meant to be the root node
 * of the tree. That node will later be used as the owner of other nodes.
 *
 * Returns &PL_sv_undef when @node is NULL or when no proxy could be
 * obtained for it. Previously a failed proxy creation fell through to the
 * reference-count and encoding updates and dereferenced a NULL proxy.
 */
SV*
PmmNodeToSv( xmlNodePtr node, ProxyNodePtr owner )
{
    ProxyNodePtr dfProxy = NULL;
    SV * retval = &PL_sv_undef;
    const char * CLASS = "XML::LibXML::Node";
    int had_proxy;

    if ( node == NULL ) {
        xs_warn( "PmmNodeToSv: no node found!\n" );
        return retval;
    }

    /* find out about the class */
    CLASS = PmmNodeTypeName( node );
    xs_warn("PmmNodeToSv: return new perl node of class:\n");
    xs_warn( CLASS );

    /* Sample this before PmmNewNode(), which is defined elsewhere and may
     * attach a proxy to node->_private. */
    had_proxy = ( node->_private != NULL );

    dfProxy = PmmNewNode(node);
    if ( dfProxy == NULL ) {
        /* Nothing to bless and nothing to reference-count: bail out
         * instead of dereferencing a NULL proxy below. */
        xs_warn("PmmNodeToSv: proxy creation failed!\n");
        return retval;
    }

    /* Only a node that had no proxy before gets its owner recorded */
    if ( !had_proxy ) {
        if ( owner != NULL ) {
            dfProxy->owner = PmmNODE( owner );
            PmmREFCNT_inc( owner );
        }
        else {
            xs_warn("PmmNodeToSv: node contains itself (owner==NULL)\n");
        }
    }

    retval = NEWSV(0,0);
    sv_setref_pv( retval, CLASS, (void*)dfProxy );
    PmmREFCNT_inc(dfProxy);

    /* Document nodes remember the parsed encoding of the document */
    switch ( node->type ) {
    case XML_DOCUMENT_NODE:
    case XML_HTML_DOCUMENT_NODE:
    case XML_DOCB_DOCUMENT_NODE:
        if ( ((xmlDocPtr)node)->encoding != NULL ) {
            dfProxy->encoding = (int)xmlParseCharEncoding(
                (const char*)((xmlDocPtr)node)->encoding );
        }
        break;
    default:
        break;
    }

    return retval;
}
/*
 * call-seq:
 *    reader.encoding -> XML::Encoding::UTF_8
 *
 * Returns the encoding of the document being read, as the integer value
 * of the corresponding libxml2 encoding constant. Data has to be read
 * from the reader before an encoding is available:
 *
 *   reader = XML::Reader.file(XML_FILE)
 *   reader.read
 *   assert_equal(XML::Encoding::UTF_8, reader.encoding)
 *
 * While the reader reports no encoding, the value of
 * XML_CHAR_ENCODING_NONE is returned. According to the original note,
 * libxml appears to report no encoding when parsing strings.
 */
static VALUE rxml_reader_encoding(VALUE self)
{
  xmlTextReaderPtr xreader = rxml_text_reader_get(self);
  const xmlChar *name = xmlTextReaderConstEncoding(xreader);
  xmlCharEncoding result = XML_CHAR_ENCODING_NONE;

  if (name != NULL)
  {
    result = xmlParseCharEncoding((const char*)name);
  }

  return INT2NUM(result);
}
/*
 * call-seq:
 *    document.encoding -> XML::Encoding::UTF_8
 *
 * Returns the encoding recorded on this document: the document's
 * encoding name is run through xmlParseCharEncoding and the resulting
 * constant is returned as an Integer.
 */
static VALUE rxml_document_encoding_get(VALUE self)
{
  xmlDocPtr xdoc;
  xmlCharEncoding parsed;

  Data_Get_Struct(self, xmlDoc, xdoc);
  parsed = xmlParseCharEncoding((const char*)xdoc->encoding);

  return INT2NUM(parsed);
}
/*
 * call-seq:
 *    Input.s_to_encoding("UTF_8") -> XML::Encoding::UTF_8
 *
 * Converts an encoding string to an encoding constant
 * defined on the XML::Encoding class.
 *
 * Returns nil when +encoding+ is nil; otherwise the result of
 * xmlParseCharEncoding on the string, as a Ruby Integer.
 *
 * NOTE(review): the Ruby-visible method name in the call-seq above is
 * taken from the original comment; confirm it against the method
 * registration.
 */
static VALUE rxml_encoding_from_s(VALUE klass, VALUE encoding)
{
  xmlCharEncoding xencoding;

  if (encoding == Qnil)
    return Qnil;

  xencoding = xmlParseCharEncoding(StringValuePtr(encoding));

  /* Box the C enum as a Ruby Integer. The previous NUM2INT went the wrong
   * way: it treated the enum as a VALUE and returned a raw C int. */
  return INT2NUM(xencoding);
}
/*
 * Returns the Ruby encoding object that corresponds to this document's
 * encoding: the document's encoding name is parsed into a libxml2
 * constant, mapped to an rb_encoding and wrapped as a VALUE.
 */
static VALUE rxml_document_rb_encoding_get(VALUE self)
{
  xmlDocPtr xdoc;
  xmlCharEncoding xmlEncoding;
  rb_encoding* mapped;

  Data_Get_Struct(self, xmlDoc, xdoc);

  xmlEncoding = xmlParseCharEncoding((const char*)xdoc->encoding);
  mapped = rxml_xml_encoding_to_rb_encoding(mXMLEncoding, xmlEncoding);

  return rb_enc_from_encoding(mapped);
}
/*
 * Configuration directive handler: store the default charset.
 *
 * cmd     - directive parameters (unused here)
 * CFG     - the cdn_conf being configured
 * charset - charset name given in the configuration
 *
 * The parsed encoding is stored in cfg->default_encoding unconditionally.
 * Returns NULL on success, or an error string when the name parses to
 * XML_CHAR_ENCODING_NONE or XML_CHAR_ENCODING_ERROR.
 */
static const char* set_charset_default(cmd_parms* cmd, void* CFG, const char* charset)
{
  cdn_conf* cfg = (cdn_conf *)CFG;

  cfg->default_encoding = xmlParseCharEncoding(charset);

  if(cfg->default_encoding == XML_CHAR_ENCODING_NONE)
    return "Default charset not found";

  if(cfg->default_encoding == XML_CHAR_ENCODING_ERROR)
    return "Invalid or unsupported default charset";

  return NULL;
}
/*
 * Build a Ruby String from a NUL-terminated C string.
 *
 * xstr      - the text to copy
 * xencoding - name of the encoding the text is in, or NULL
 *
 * With HAVE_RUBY_ENCODING_H defined and a non-NULL xencoding, the string
 * is created through rb_external_str_new_with_enc using the Ruby encoding
 * mapped from the libxml2 one. Otherwise a plain rb_str_new2 is used.
 *
 * NOTE(review): this treats the result of
 * rxml_xml_encoding_to_rb_encoding as a VALUE wrapping an rb_encoding*;
 * confirm that matches the helper's declared return type.
 */
VALUE rxml_str_new2(const char* xstr, const char* xencoding)
{
#ifdef HAVE_RUBY_ENCODING_H
  if (xencoding != NULL)
  {
    xmlCharEncoding parsed = xmlParseCharEncoding(xencoding);
    VALUE wrapped = rxml_xml_encoding_to_rb_encoding(mXMLEncoding, parsed);
    rb_encoding* target = (rb_encoding*) RDATA(wrapped)->data;

    return rb_external_str_new_with_enc(xstr, strlen(xstr), target);
  }
#endif
  return rb_str_new2(xstr);
}
rb_encoding* rxml_figure_encoding(const char* xencoding) { rb_encoding* result; if (xencoding) { xmlCharEncoding xmlEncoding = xmlParseCharEncoding(xencoding); result = rxml_xml_encoding_to_rb_encoding(mXMLEncoding, xmlEncoding); } else { result = rb_utf8_encoding(); } return result; }
/**
 * PmmDecodeString converts a UTF-8 string into the named encoding.
 *
 * encoding - name of the target encoding, or NULL
 * string   - the UTF-8 input, or NULL
 *
 * Returns NULL when string is NULL. With a NULL encoding the input is
 * simply duplicated with xmlStrdup. Otherwise the result of
 * PmmFastDecodeString is returned.
 **/
char*
PmmDecodeString( const char *encoding, const xmlChar *string )
{
    xmlCharEncoding enc;
    char *decoded;

    if ( string == NULL ) {
        return NULL;
    }

    xs_warn( "PmmDecodeString called\n" );

    if ( encoding == NULL ) {
        /* no target encoding given: hand back a plain copy */
        return (char*)xmlStrdup(string);
    }

    enc     = xmlParseCharEncoding( encoding );
    decoded = (char*)PmmFastDecodeString( enc, string, (const xmlChar*)encoding );
    xs_warn( "PmmDecodeString done\n" );

    return decoded;
}
// Creates a libxml2 output buffer that routes writes and close through
// OutputBuffer::write_cb / OutputBuffer::close_cb with `this` as context.
//
// encoding - name of the output encoding. An empty name, or one that
//            parses to UTF-8, uses no encoder; any other name must have
//            an encoding handler.
//
// Throws InternalError when no handler exists for a non-UTF-8 encoding
// or when the output buffer cannot be created.
OutputBuffer::OutputBuffer(const std::string & encoding)
{
    xmlCharEncodingHandlerPtr encoder = nullptr;

    const bool wants_encoder =
        !encoding.empty() &&
        xmlParseCharEncoding(encoding.c_str()) != XML_CHAR_ENCODING_UTF8;

    if ( wants_encoder )
    {
        encoder = xmlFindCharEncodingHandler(encoding.c_str());
        if ( !encoder )
        {
            throw InternalError("Unsupported output encoding: " + encoding);
        }
    }

    _buf = xmlOutputBufferCreateIO(OutputBuffer::write_cb,
                                   OutputBuffer::close_cb,
                                   this,
                                   encoder);
    if ( !_buf )
    {
        throw InternalError("Failed to create xml output buffer");
    }
}
/**
 * PmmEncodeString converts a string from the named encoding to UTF-8.
 *
 * encoding - name of the encoding the input is in, or NULL
 * string   - the input, or NULL
 *
 * Returns NULL when string is NULL. With a NULL encoding the input is
 * duplicated unchanged with xmlStrdup. Otherwise the result of
 * PmmFastEncodeString is returned.
 **/
xmlChar*
PmmEncodeString( const char *encoding, const xmlChar *string )
{
    xmlCharEncoding enc;

    if ( string == NULL ) {
        return NULL;
    }

    if ( encoding == NULL ) {
        /* no source encoding named: nothing to convert, just copy */
        return xmlStrdup( string );
    }

    xs_warn("PmmEncodeString: encoding to UTF-8 from:\n");
    xs_warn( encoding );

    enc = xmlParseCharEncoding( encoding );
    return PmmFastEncodeString( enc, string, (const xmlChar *)encoding );
}
/* Small helper that (re)sets the encoding attribute of a proxy, e.g.
 * after broken transformations.
 *
 * For document-type nodes the proxy's encoding is refreshed from the
 * document's encoding name, if one is set. For every other node type it
 * is set to 1.
 *
 * PP: This function is not used!
 */
void
PmmFixProxyEncoding( ProxyNodePtr dfProxy )
{
    xmlNodePtr node = PmmNODE( dfProxy );
    const xmlChar *docenc;

    if ( node == NULL ) {
        return;
    }

    if ( node->type == XML_DOCUMENT_NODE
         || node->type == XML_HTML_DOCUMENT_NODE
         || node->type == XML_DOCB_DOCUMENT_NODE ) {
        docenc = ((xmlDocPtr)node)->encoding;
        if ( docenc != NULL ) {
            dfProxy->encoding = (int)xmlParseCharEncoding( (const char*)docenc );
        }
    }
    else {
        /* NOTE(review): 1 is presumably XML_CHAR_ENCODING_UTF8 - confirm */
        dfProxy->encoding = 1;
    }
}
/**
 * htmlNodeDumpFileFormat:
 * @out:  the FILE pointer
 * @doc:  the document
 * @cur:  the current node
 * @encoding: the document encoding
 * @format:  should formatting spaces be added
 *
 * Dump an HTML node to @out; the dump is recursive, so children are
 * printed too.
 *
 * A non-UTF-8 @encoding must have an encoding handler. When no handler
 * has been selected, the "HTML" handler and then the "ascii" handler are
 * tried as fallbacks.
 *
 * TODO: if encoding == NULL try to save in the doc encoding
 *
 * returns: the value of xmlOutputBufferClose() for the output buffer,
 * -1 when @encoding has no handler, or 0 when the output buffer cannot
 * be created.
 */
int
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
                       xmlNodePtr cur, const char *encoding, int format) {
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlOutputBufferPtr buf;

    xmlInitParser();

    /* UTF-8 needs no converter; anything else must have one */
    if ((encoding != NULL) &&
        (xmlParseCharEncoding(encoding) != XML_CHAR_ENCODING_UTF8)) {
        handler = xmlFindCharEncodingHandler(encoding);
        if (handler == NULL)
            return(-1);
    }

    /*
     * Fallback to HTML or ASCII when the encoding is unspecified
     */
    if (handler == NULL)
        handler = xmlFindCharEncodingHandler("HTML");
    if (handler == NULL)
        handler = xmlFindCharEncodingHandler("ascii");

    /*
     * write through an output buffer bound to the FILE
     */
    buf = xmlOutputBufferCreateFile(out, handler);
    if (buf == NULL)
        return(0);

    htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);

    return(xmlOutputBufferClose(buf));
}
/*
 * Wrap a libxml2 node into a blessed Perl scalar.
 *
 * node  - the node to wrap; when NULL a warning is emitted and
 *         &PL_sv_undef is returned
 * owner - proxy of the owning node, or NULL
 *
 * A proxy is obtained through x_PmmNewNode(). If the node had no proxy
 * attached before the call and owner is non-NULL, the owner is recorded
 * on the proxy and its reference count is incremented. The proxy's own
 * reference count is incremented, and for document nodes the parsed
 * document encoding is stored on the proxy.
 *
 * With XML_LIBXML_THREADS and x_PmmUSEREGISTRY, the registry mutex is
 * held for the whole operation and released on every path.
 *
 * Returns &PL_sv_undef when no proxy could be obtained. Previously a
 * failed proxy creation fell through to the reference-count and encoding
 * updates and dereferenced a NULL proxy.
 */
SV *
x_PmmNodeToSv(xmlNodePtr node, ProxyNodePtr owner)
{
    ProxyNodePtr dfProxy = NULL;
    SV * retval = &PL_sv_undef;
    const char * CLASS = "XML::LibXML::Node";
    int had_proxy;

    if ( node == NULL ) {
        warn( "x_PmmNodeToSv: no node found!\n" );
        return retval;
    }

#ifdef XML_LIBXML_THREADS
    if( x_PmmUSEREGISTRY )
        SvLOCK(x_PROXY_NODE_REGISTRY_MUTEX);
#endif

    /* find out about the class */
    CLASS = x_PmmNodeTypeName( node );

    /* Sample this before x_PmmNewNode(), which is defined elsewhere and
     * may attach a proxy to node->_private. */
    had_proxy = ( node->_private != NULL );

    dfProxy = x_PmmNewNode(node);

    if ( dfProxy != NULL ) {
        /* Only a node that had no proxy before gets its owner recorded */
        if ( !had_proxy && owner != NULL ) {
            dfProxy->owner = x_PmmNODE( owner );
            x_PmmREFCNT_inc( owner );
        }

        retval = NEWSV(0,0);
        sv_setref_pv( retval, CLASS, (void*)dfProxy );
#ifdef XML_LIBXML_THREADS
        if( x_PmmUSEREGISTRY )
            x_PmmRegistryREFCNT_inc(dfProxy);
#endif
        x_PmmREFCNT_inc(dfProxy);

        /* Document nodes remember the parsed encoding of the document */
        switch ( node->type ) {
        case XML_DOCUMENT_NODE:
        case XML_HTML_DOCUMENT_NODE:
        case XML_DOCB_DOCUMENT_NODE:
            if ( ((xmlDocPtr)node)->encoding != NULL ) {
                x_SetPmmENCODING(dfProxy,
                    (int)xmlParseCharEncoding(
                        (const char*)((xmlDocPtr)node)->encoding ));
            }
            break;
        default:
            break;
        }
    }
    else {
        /* Leave retval as undef rather than touching a NULL proxy */
        warn("x_PmmNodeToSv: proxy creation failed!\n");
    }

#ifdef XML_LIBXML_THREADS
    if( x_PmmUSEREGISTRY )
        SvUNLOCK(x_PROXY_NODE_REGISTRY_MUTEX);
#endif

    return retval;
}
/*
 * call-seq:
 *    context.encoding -> XML::Encoding::UTF_8
 *
 * Returns the character encoding identifier used in this context: the
 * context's encoding name is parsed with xmlParseCharEncoding and the
 * resulting constant is returned as an Integer.
 */
static VALUE rxml_parser_context_encoding_get(VALUE self)
{
  xmlParserCtxtPtr ctxt;
  xmlCharEncoding parsed;

  Data_Get_Struct(self, xmlParserCtxt, ctxt);
  parsed = xmlParseCharEncoding((const char*)ctxt->encoding);

  return INT2NUM(parsed);
}
/**
 * htmlDocDumpMemory:
 * @cur:  the document
 * @mem:  OUT: the memory pointer
 * @size:  OUT: the memory length
 *
 * Dump an HTML document in memory and return the xmlChar * and its size.
 * It's up to the caller to free the memory.
 *
 * Nothing is written when @mem or @size is NULL. On every other failure
 * (NULL document, unsupported charset combination, missing encoding
 * handler, buffer allocation failure) *@mem is set to NULL and *@size
 * to 0.
 */
void
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlOutputBufferPtr buf;
    xmlBufferPtr result;
    const char *encoding;

    xmlInitParser();

    if ((mem == NULL) || (size == NULL))
        return;
    if (cur == NULL)
        goto no_output;

    encoding = (const char *) htmlGetMetaEncoding(cur);
    if (encoding != NULL) {
        xmlCharEncoding enc = xmlParseCharEncoding(encoding);
        int differs = (enc != cur->charset);

        /*
         * Converting from a non-UTF-8 in-memory charset is not
         * supported yet
         */
        if (differs && (cur->charset != XML_CHAR_ENCODING_UTF8))
            goto no_output;

        handler = xmlFindCharEncodingHandler(encoding);

        /* a handler is mandatory only when a conversion is needed */
        if (differs && (handler == NULL))
            goto no_output;
    }

    /*
     * Fallback to HTML or ASCII when the encoding is unspecified
     */
    if (handler == NULL)
        handler = xmlFindCharEncodingHandler("HTML");
    if (handler == NULL)
        handler = xmlFindCharEncodingHandler("ascii");

    buf = xmlAllocOutputBuffer(handler);
    if (buf == NULL)
        goto no_output;

    htmlDocContentDumpOutput(buf, cur, NULL);
    xmlOutputBufferFlush(buf);

    /* prefer the converted output when a converter was active */
    result = (buf->conv != NULL) ? buf->conv : buf->buffer;
    *size = result->use;
    *mem = xmlStrndup(result->content, *size);

    (void)xmlOutputBufferClose(buf);
    return;

no_output:
    *mem = NULL;
    *size = 0;
}
int main(int argc, char **argv) { int ch; char *encoding = "utf8"; char *filename; int quiet = 0; int exitcode = EXIT_OK; xmlDocPtr doc = NULL; xmlDtdPtr dtd = NULL; xmlCharEncoding enc; xmlCharEncodingHandlerPtr enchandler; xmlOutputBufferPtr out; /* LibXML version checking */ LIBXML_TEST_VERSION; /* Who am I? */ if ((progname = rindex(argv[0], '/'))) progname++; else progname = argv[0]; /* Parsing CLI */ while ((ch = getopt(argc, argv, "hqe:")) != -1) switch (ch) { case 'q': quiet = 1; break; case 'e': encoding = optarg; break; case 'h': /* FALL THROUGH */ case '?': usage(EXIT_OK); break; default: usage(EXIT_USAGE); break; } argc -= optind; argv += optind; switch (argc) { case 1: filename = argv[0]; break; case 0: filename = "-"; break; default: usage(EXIT_USAGE); } /* Get user encoding */ if ((enc = xmlParseCharEncoding(encoding)) == XML_CHAR_ENCODING_ERROR) { fprintf(stderr, "Unknown encoding: %s\n", encoding); usage(EXIT_USAGE); } enchandler = xmlGetCharEncodingHandler(enc); /* Shut up */ xmlSetGenericErrorFunc(NULL, &noerror_handler); xmlGetWarningsDefaultValue = 0; xmlPedanticParserDefault(0); /* File exists? 
(race condition below) */ if (strcmp(filename, "-") && (open(filename, O_RDONLY) < 0)) { if (!quiet) fprintf(stderr, "ERROR: unable to open %s\n", filename); exitcode = EXIT_NOFILE; goto exit; } /* Parse document */ if ((doc = xmlParseFile(filename)) == NULL) { if (!quiet) fprintf(stderr, "ERROR: badly formed document\n"); exitcode = EXIT_BADXML; goto exit; } /* Extract DTD (if any) */ dtd = xmlGetIntSubset(doc); /* Create output buffer */ if ((out = xmlOutputBufferCreateFd(1, enchandler)) == NULL) { if (!quiet) fprintf(stderr, "ERROR: unable to open output channel\n"); exitcode = EXIT_INTERNAL; goto exit; } /* Dump information */ if (doc->version) { xmlOutputBufferWriteString(out, "Version : "); xmlOutputBufferWriteString(out, doc->version); xmlOutputBufferWriteString(out, "\n"); } if (doc->encoding) { xmlOutputBufferWriteString(out, "Encoding : "); xmlOutputBufferWriteString(out, doc->encoding); xmlOutputBufferWriteString(out, "\n"); } if (dtd && dtd->name) { xmlOutputBufferWriteString(out, "Name : "); xmlOutputBufferWriteString(out, dtd->name); xmlOutputBufferWriteString(out, "\n"); } if (dtd && dtd->ExternalID) { xmlOutputBufferWriteString(out, "Identifier: "); xmlOutputBufferWriteString(out, dtd->ExternalID); xmlOutputBufferWriteString(out, "\n"); } if (dtd && dtd->SystemID) { xmlOutputBufferWriteString(out, "URI : "); xmlOutputBufferWriteString(out, dtd->SystemID); xmlOutputBufferWriteString(out, "\n"); xmlOutputBufferFlush(out); } /* Free resources */ if (doc) xmlFreeDoc(doc); xmlCleanupParser(); /* OK */ exit: exit(exitcode); }