// ---------------------------------------------------------------------- void SAXInterruptibleReader:: parse() throw(runtime_error) { char buf[16384]; int len; stop_flag_ = false; if(!initialized_) open_file(); while(!cache_.empty() && !stop_flag_) { struct CacheData* cd = cache_.front(); switch(cd->tag_type) { case SIR_OPENTAG: start_element(cd->name, cd->atts); break; case SIR_TEXTTAG: text_element(cd->name); break; case SIR_CLOSETAG: end_element(cd->name); break; } cache_.pop_front(); delete cd; } //Read the file until the end of file while( !is_->eof() && !stop_flag_ ) { is_->read( buf, sizeof(buf) ); len = is_->gcount(); if (XML_Parse(parser, buf, len, is_->eof()) == XML_STATUS_ERROR) { std::cerr << XML_ErrorString(XML_GetErrorCode(parser)) << "at line " << XML_GetCurrentLineNumber(parser) << std::endl; reset(); throw std::runtime_error("Error in parsing XML input"); } } //Done -> Close the file and free all associated memory if( !stop_flag_ ) { parsing_done(); reset(); } }
// ---------------------------------------------------------------------- void SAXInterruptibleReader:: handle_start_element(string name, AttList& atts) throw(runtime_error) { string _name = string(name); if( stop_flag_ ) { CacheData* cd = new CacheData; cd->tag_type = SIR_OPENTAG; cd->name = _name ; cd->atts = atts; cache_.push_back(cd); } else { start_element(_name, atts); } }
PRIVATE int SGML_write (HTStream * context, const char * b, int l) { const SGML_dtd *dtd = context->dtd; HTChunk *string = context->string; const char *text = b; int count = 0; while (l-- > 0) { char c = *b++; switch(context->state) { got_element_open: /* ** The label is jumped when the '>' of a the element ** start tag has been detected. This DOES NOT FALL TO ** THE CODE S_after_open, only processes the tag and ** sets the state (c should still contain the ** terminating character of the tag ('>')) */ if (context->current_tag && context->current_tag->name) start_element(context); context->state = S_after_open; break; case S_after_open: /* ** State S_after_open is entered only for single ** character after the element opening tag to test ** against newline. Strip one trainling newline only ** after opening nonempty element. - SGML: Ugh! */ text = b; count = 0; if (c == '\n' && (context->contents != SGML_EMPTY)) { context->state = S_text; break; } --text; goto S_text; S_text: context->state = S_text; case S_text: #ifdef ISO_2022_JP if (c == '\033') { context->state = S_esc; ++count; break; } #endif /* ISO_2022_JP */ if (c == '&') { if (count > 0) PUTB(text, count); count = 0; HTChunk_clear(string); context->state = S_ero; } else if (c == '<') { if (count > 0) PUTB(text, count); count = 0; HTChunk_clear(string); /* should scrap LITERAL, and use CDATA and RCDATA -- msa */ context->state = (context->contents == SGML_LITERAL) ? S_literal : S_tag; } else if (c == '\n') /* Newline - ignore if before end tag! */ context->state = S_nl; else ++count; break; case S_nl: if (c == '<') { if (count > 0) PUTB(text, count); count = 0; HTChunk_clear(string); context->state = (context->contents == SGML_LITERAL) ? S_literal : S_nl_tago; } else { ++count; goto S_text; } break; case S_nl_tago: /* Had newline and tag opener */ if (c != '/') PUTC('\n'); /* Only ignore newline before </ */ context->state = S_tag; goto handle_S_tag; #ifdef ISO_2022_JP case S_esc: if (c=='$') context->state = S_dollar; else if (c=='(') context->state = S_paren; else context->state = S_text; ++count; break; case S_dollar: if (c=='@' || c=='B') context->state = S_nonascii_text; else context->state = S_text; ++count; break; case S_paren: if (c=='B' || c=='J') context->state = S_text; else context->state = S_text; ++count; break; case S_nonascii_text: if (c == '\033') context->state = S_esc; ++count; break; #endif /* ISO_2022_JP */ /* In literal mode, waits only for specific end tag! ** Only foir compatibility with old servers. */ case S_literal: HTChunk_putc(string, c); if ( TOUPPER(c) != ((HTChunk_size(string) == 1) ? '/' : context->current_tag->name[HTChunk_size(string)-2])) { /* If complete match, end literal */ if ((c == '>') && (!context->current_tag->name[HTChunk_size(string)-2])) { end_element (context,context->current_tag); /* ...setting SGML_MIXED below is a bit of kludge, but a good guess that currently works, anything other than SGML_LITERAL would work... -- msa */ context->contents = SGML_MIXED; } else { /* If Mismatch: recover string. */ PUTC( '<'); PUTB(HTChunk_data(string), HTChunk_size(string)); } context->state = S_text; text = b; count = 0; } break; /* ** Character reference or Entity */ case S_ero: if (c == '#') { /* &# is Char Ref Open */ context->state = S_cro; break; } context->state = S_entity; /** FALL THROUGH TO S_entity !! ***/ /* ** Handle Entities */ case S_entity: if (isalnum((int) c)) HTChunk_putc(string, c); else { HTChunk_terminate(string); handle_entity(context); text = b; count = 0; if (c != ';') { --text; goto S_text; } context->state = S_text; } break; /* Character reference */ case S_cro: if (isalnum((int)c)) /* accumulate a character NUMBER */ HTChunk_putc(string, c); else { int value; HTChunk_terminate(string); if (sscanf(HTChunk_data(string), "%d", &value)==1) PUTC((char)value); else { PUTB("&#", 2); PUTB(HTChunk_data(string), HTChunk_size(string)-1); } text = b; count = 0; if (c != ';') { --text; goto S_text; } context->state = S_text; } break; case S_tag: /* new tag */ handle_S_tag: if (isalnum((int)c)) HTChunk_putc(string, c); else { /* End of tag name */ int i; if (c == '/') { if (HTChunk_size(string) > 0) HTTRACE(SGML_TRACE, "`<%s/' found!\n" _ HTChunk_data(string)); context->state = S_end; break; } else if (c == '!') { if (HTChunk_size(string) > 0) HTTRACE(SGML_TRACE, " `<%s!' found!\n" _ HTChunk_data(string)); context->state = S_md; break; } HTChunk_terminate(string); context->current_tag = SGMLFindTag(dtd, HTChunk_data(string)); if (context->current_tag == NULL) { HTTRACE(SGML_TRACE, "*** Unknown element %s\n" _ HTChunk_data(string)); (*context->actions->unparsed_begin_element) (context->target, HTChunk_data(string), HTChunk_size(string)); } else { for (i=0; i<context->current_tag->number_of_attributes; i++) { context->present[i] = NO; context->value[i] = -1; } } context->token = 0; HTChunk_clear(string); context->current_attribute_number = INVALID; goto S_tag_gap; } break; S_tag_gap: context->state = S_tag_gap; case S_tag_gap: /* Expecting attribute or > */ if (isspace((int) c)) break; /* Gap between attributes */ if (c == '>') goto got_element_open; else goto S_attr; S_attr: /* ** Start collecting the attribute name and collect ** it in S_attr. */ context->state = S_attr; HTChunk_truncate(string, context->token); case S_attr: if (isspace((int) c) || c == '>' || c == '=') goto got_attribute_name; else HTChunk_putc(string, c); break; got_attribute_name: /* ** This label is entered when attribute name has been ** collected. Process it and enter S_attr_gap for ** potential value or start of the next attribute. */ HTChunk_terminate(string) ; handle_attribute_name (context, HTChunk_data(string) + context->token); HTChunk_truncate(string, context->token); context->state = S_attr_gap; case S_attr_gap: /* Expecting attribute or = or > */ if (isspace((int) c)) break; /* Gap after attribute */ if (c == '>') goto got_element_open; else if (c == '=') context->state = S_equals; else goto S_attr; /* Get next attribute */ break; case S_equals: /* After attr = */ if (isspace((int) c)) break; /* Before attribute value */ if (c == '>') { /* End of tag */ HTTRACE(SGML_TRACE, "found = but no value\n"); goto got_element_open; } else if (c == '\'') context->state = S_squoted; else if (c == '"') context->state = S_dquoted; else goto S_value; break; S_value: context->state = S_value; HTChunk_truncate(string, context->token); case S_value: if (isspace((int) c) || c == '>') { HTChunk_terminate(string); handle_attribute_value(context); context->token = HTChunk_size(string); goto S_tag_gap; } else HTChunk_putc(string, c); break; case S_squoted: /* Quoted attribute value */ if (c == '\'') { HTChunk_terminate(string); handle_attribute_value(context); context->token = HTChunk_size(string); context->state = S_tag_gap; } else if (c && c != '\n' && c != '\r') HTChunk_putc(string, c); break; case S_dquoted: /* Quoted attribute value */ if (c == '"') { HTChunk_terminate(string); handle_attribute_value(context); context->token = HTChunk_size(string); context->state = S_tag_gap; } else if (c && c != '\n' && c != '\r') HTChunk_putc(string, c); break; case S_end: /* </ */ if (isalnum((int) c)) HTChunk_putc(string, c); else { /* End of end tag name */ HTTag *t; char * first; HTChunk_terminate(string); if ((first=HTChunk_data(string))!=NULL && *first != '\0') t = SGMLFindTag(dtd, HTChunk_data(string)); else /* Empty end tag */ /* Original code popped here one from the stack. If this feature is required, I have to put the stack back... -- msa */ t = NULL; if (!t) { HTTRACE(SGML_TRACE, "Unknown end tag </%s>\n" _ HTChunk_data(string)); (*context->actions->unparsed_end_element) (context->target, HTChunk_data(string), HTChunk_size(string)); } else { context->current_tag = NULL; end_element(context, t); } HTChunk_clear(string); context->current_attribute_number = INVALID; if (c != '>') { if (!isspace((int) c)) HTTRACE(SGML_TRACE, "`</%s%c' found!\n" _ HTChunk_data(string) _ c); context->state = S_junk_tag; } else { text = b; count = 0; context->state = S_text; } } break; case S_junk_tag: if (c == '>') { text = b; count = 0; context->state = S_text; } break; /* ** Scanning (actually skipping) declarations */ case S_md: if (c == '-') context->state = S_com_1; else if (c == '"') context->state = S_md_dqs; else if (c == '\'') context->state = S_md_sqs; else if (c == '>') { text = b; count = 0; context->state = S_text; } break; case S_md_dqs: /* Skip double quoted string */ if (c == '"') context->state = S_md; else if (c == '>') { text = b; count = 0; context->state = S_text; } break; case S_md_sqs: /* Skip single quoted string */ if (c == '\'') context->state = S_md; else if (c == '>') { text = b; count = 0; context->state = S_text; } break; case S_com_1: /* Starting a comment? */ context->state = (c == '-') ? S_com : S_md; if (c == '>') { text = b; count = 0; context->state = S_text; } break; case S_com: /* ..within comment */ if (c == '-') context->state = S_com_2; break; case S_com_2: /* Ending a comment ? */ context->state = (c == '-') ? S_com_2a : S_com; break; case S_com_2a: if (c == '>') { text = b; count = 0; context->state = S_text; } else if (c == '-') { context->state = S_com_2a; } else context->state = S_com; break; } } if (count > 0) PUTB(text, count); return HT_OK; }
static void raptor_rdfa_start_element(void *user_data, raptor_xml_element *xml_element) { raptor_qname* qname = raptor_xml_element_get_name(xml_element); int nb_attributes = raptor_xml_element_get_attributes_count(xml_element); raptor_qname** attrs = raptor_xml_element_get_attributes(xml_element); unsigned char* localname = raptor_qname_to_counted_name(qname, NULL); const raptor_namespace* qname_ns = raptor_qname_get_namespace(qname); int nb_namespaces = 0; const char** namespaces = NULL; int nb_defaulted = 0; char** attr = NULL; int i; const char* ns_name = NULL; const char* ns_uri = NULL; if(nb_attributes > 0) { /* Everything written into 'attr' is a shared pointer into * xml_element or contained objects - qnames, namespaces, uris * and values */ attr = (char**)malloc(sizeof(char*) * (1 + (nb_attributes * 5))); for(i = 0; i < nb_attributes; i++) { const raptor_namespace* attr_ns = attrs[i]->nspace; char** attri = &attr[5 * i]; /* 5 tuple: (localname, prefix, URI, value, end) */ attri[0] = (char*)attrs[i]->local_name; attri[1] = attr_ns ? (char*)attr_ns->prefix : NULL; attri[2] = attr_ns ? (char*)raptor_uri_as_string(attr_ns->uri) : NULL; attri[3] = (char*)attrs[i]->value; attri[4] = attri[3] + attrs[i]->value_length; } attr[5 * i] = NULL; } /* * @ctx: the user data (XML parser context) * @localname: the local name of the element * @prefix: the element namespace prefix if available * @URI: the element namespace name if available * @nb_namespaces: number of namespace definitions on that node * @namespaces: pointer to the array of prefix/URI pairs namespace definitions * @nb_attributes: the number of attributes on that node * @nb_defaulted: the number of defaulted attributes. The defaulted * ones are at the end of the array * @attributes: pointer to the array of (localname/prefix/URI/value/end) * attribute values. */ if(qname_ns) { ns_name = (const char*)raptor_namespace_get_prefix(qname_ns); ns_uri = (const char*)raptor_uri_as_string(qname_ns->uri); } start_element(user_data, (const char*)localname, ns_name, ns_uri, nb_namespaces, (const char**)namespaces, nb_attributes, nb_defaulted, (const char**)attr); if(attr) free(attr); raptor_free_memory(localname); }