Beispiel #1
0
		// ----------------------------------------------------------------------
		void SAXInterruptibleReader::
			parse()
			throw(runtime_error)
		{
			char buf[16384];
			int len;

			stop_flag_ = false;

			if(!initialized_)
				open_file();

			while(!cache_.empty() && !stop_flag_)
			{
				struct CacheData* cd = cache_.front();

				switch(cd->tag_type)
				{
				case SIR_OPENTAG:
				   start_element(cd->name, cd->atts);
				   break;
				case SIR_TEXTTAG:
				   text_element(cd->name);
				   break;
				case SIR_CLOSETAG:
				   end_element(cd->name);
				   break;
				}

				cache_.pop_front();
				delete cd;
			}

			//Read the file until the end of file
            while( !is_->eof() && !stop_flag_ )
            {
            	is_->read( buf, sizeof(buf) );
				len = is_->gcount();

				if (XML_Parse(parser, buf, len, is_->eof()) == XML_STATUS_ERROR)
				{
					std::cerr << XML_ErrorString(XML_GetErrorCode(parser)) << "at line " << XML_GetCurrentLineNumber(parser) << std::endl;
				    reset();
				    throw std::runtime_error("Error in parsing XML input");
				}
            }

			//Done -> Close the file and free all associated memory
			if( !stop_flag_ )
			{
				parsing_done();
				reset();
			}
		}
Beispiel #2
0
		// ----------------------------------------------------------------------
		void SAXInterruptibleReader::
			handle_start_element(string name, AttList& atts)
			throw(runtime_error)
		{
			string _name = string(name);

			if( stop_flag_ )
			{
				CacheData* cd = new CacheData;
				cd->tag_type = SIR_OPENTAG;
				cd->name = _name ;
				cd->atts = atts;
				cache_.push_back(cd);
			}
			else
			{
				start_element(_name, atts);
			}
		}
Beispiel #3
0
PRIVATE int SGML_write (HTStream * context, const char * b, int l)
    {
	const SGML_dtd	*dtd = context->dtd;
	HTChunk	*string = context->string;
	const char *text = b;
	int count = 0;
	
	while (l-- > 0)
	    {
		char c = *b++;
		switch(context->state)
		    {
		    got_element_open:
			/*
			** The label is jumped when the '>' of a the element
			** start tag has been detected. This DOES NOT FALL TO
			** THE CODE S_after_open, only processes the tag and
			** sets the state (c should still contain the
			** terminating character of the tag ('>'))
			*/
			if (context->current_tag && context->current_tag->name)
				start_element(context);
			context->state = S_after_open;
			break;

		    case S_after_open:
			/*
			** State S_after_open is entered only for single
			** character after the element opening tag to test
			** against newline. Strip one trainling newline only
			** after opening nonempty element.  - SGML: Ugh!
			*/
			text = b;
			count = 0;
			if (c == '\n' && (context->contents != SGML_EMPTY))
			    {
				context->state = S_text;
				break;
			    }
			--text;
			goto S_text;

		    S_text:
			context->state = S_text;
		    case S_text:
#ifdef ISO_2022_JP
			if (c == '\033')
			    {
				context->state = S_esc;
				++count;
				break;
			    }
#endif /* ISO_2022_JP */
			if (c == '&')
			    {
				if (count > 0)
					PUTB(text, count);
				count = 0;
				HTChunk_clear(string);
				context->state = S_ero;
			    }
			else if (c == '<')
			    {
				if (count > 0)
					PUTB(text, count);
				count = 0;
				HTChunk_clear(string);
				/* should scrap LITERAL, and use CDATA and
				   RCDATA -- msa */
				context->state =
					(context->contents == SGML_LITERAL) ?
						S_literal : S_tag;
			    }
			else if (c == '\n')
			    	/* Newline - ignore if before end tag! */
				context->state = S_nl;
			else
				++count;
			break;

		    case S_nl:
			if (c == '<')
			    {
				if (count > 0)
					PUTB(text, count);
				count = 0;
				HTChunk_clear(string);
				context->state =
					(context->contents == SGML_LITERAL) ?
						S_literal : S_nl_tago;
			    }
			else
			    {
				++count;
				goto S_text;
			    }
			break;

		    case S_nl_tago:	/* Had newline and tag opener */
			if (c != '/')
				PUTC('\n'); /* Only ignore newline before </ */
			context->state = S_tag;
			goto handle_S_tag;

#ifdef ISO_2022_JP
		    case S_esc:
			if (c=='$')
				context->state = S_dollar;
			else if (c=='(')
				context->state = S_paren;
			else
				context->state = S_text;
			++count;
			break;

		    case S_dollar:
			if (c=='@' || c=='B')
				context->state = S_nonascii_text;
			else
				context->state = S_text;
			++count;
			break;

		    case S_paren:
			if (c=='B' || c=='J')
				context->state = S_text;
			else
				context->state = S_text;
			++count;
			break;

		    case S_nonascii_text:
			if (c == '\033')
				context->state = S_esc;
			++count;
			break;
#endif /* ISO_2022_JP */

			/* In literal mode, waits only for specific end tag!
			** Only foir compatibility with old servers.
			*/
		    case S_literal:
			HTChunk_putc(string, c);
			if ( TOUPPER(c) !=
			    ((HTChunk_size(string) == 1) ? '/'
			     : context->current_tag->name[HTChunk_size(string)-2]))
			    {

				/* If complete match, end literal */
				if ((c == '>') &&
				    (!context->current_tag->name[HTChunk_size(string)-2]))
				    {
					end_element
						(context,context->current_tag);
					/*
					  ...setting SGML_MIXED below is a
					  bit of kludge, but a good guess that
					  currently works, anything other than
					  SGML_LITERAL would work... -- msa */
					context->contents = SGML_MIXED;
				    }
				else
				    {
					/* If Mismatch: recover string. */
					PUTC( '<');
					PUTB(HTChunk_data(string), HTChunk_size(string));
				    }
				context->state = S_text;
				text = b;
				count = 0;
			    }
			break;

			/*
			** Character reference or Entity
			*/
		    case S_ero:
			if (c == '#')
			    {
				/*   &# is Char Ref Open */ 
				context->state = S_cro;
				break;
			    }
			context->state = S_entity;

			/** FALL THROUGH TO S_entity !! ***/

			/*
			** Handle Entities
			*/
		    case S_entity:
			if (isalnum((int) c))
				HTChunk_putc(string, c);
			else
			    {
				HTChunk_terminate(string);
				handle_entity(context);
				text = b;
				count = 0;
				if (c != ';')
				    {
					--text;
					goto S_text;
				    }
				context->state = S_text;
			    }
			break;

			/*	Character reference
			 */
		    case S_cro:
			if (isalnum((int)c))
				/* accumulate a character NUMBER */
				HTChunk_putc(string, c);
			else
			    {
				int value;
				HTChunk_terminate(string);
				if (sscanf(HTChunk_data(string), "%d", &value)==1)
					PUTC((char)value);
				else
				    {
					PUTB("&#", 2);
					PUTB(HTChunk_data(string), HTChunk_size(string)-1);
				    }
				text = b;
				count = 0;
				if (c != ';')
				    {
					--text;
					goto S_text;
				    }
				context->state = S_text;
			    }
			break;

		    case S_tag:		/* new tag */
		    handle_S_tag:
			if (isalnum((int)c))
				HTChunk_putc(string, c);
			else { /* End of tag name */
			    int i;
			    if (c == '/') {
				if (HTChunk_size(string) > 0)
				    HTTRACE(SGML_TRACE, "`<%s/' found!\n" _ HTChunk_data(string));
				context->state = S_end;
				break;
			    } else if (c == '!') {
				if (HTChunk_size(string) > 0)
				    HTTRACE(SGML_TRACE, " `<%s!' found!\n" _ HTChunk_data(string));
				context->state = S_md;
				break;
			    }
			    HTChunk_terminate(string);
			    context->current_tag  = SGMLFindTag(dtd, HTChunk_data(string));
			    if (context->current_tag == NULL) {
				HTTRACE(SGML_TRACE, "*** Unknown element %s\n" _ HTChunk_data(string));
				(*context->actions->unparsed_begin_element)
				    (context->target, HTChunk_data(string), HTChunk_size(string));
			    } else {
				for (i=0; i<context->current_tag->number_of_attributes; i++) {
				    context->present[i] = NO;
				    context->value[i] = -1;
				}
			    }
			    context->token = 0;
			    HTChunk_clear(string);
			    context->current_attribute_number = INVALID;
			    goto S_tag_gap;
			}
			break;

		    S_tag_gap:
			context->state = S_tag_gap;
		    case S_tag_gap:		/* Expecting attribute or > */
			if (isspace((int) c))
				break;	/* Gap between attributes */

			if (c == '>')
				goto got_element_open;
			else
				goto S_attr;

		    S_attr:
			/*
			** Start collecting the attribute name and collect
			** it in S_attr.
			*/
			context->state = S_attr;
			HTChunk_truncate(string, context->token);
		    case S_attr:
			if (isspace((int) c) || c == '>' || c == '=')
				goto got_attribute_name;
			else
				HTChunk_putc(string, c);
			break;

		    got_attribute_name:
			/*
			** This label is entered when attribute name has been
			** collected. Process it and enter S_attr_gap for
			** potential value or start of the next attribute.
			*/
			HTChunk_terminate(string) ;
			handle_attribute_name
				(context, HTChunk_data(string) + context->token);
			HTChunk_truncate(string, context->token);
			context->state = S_attr_gap;
		    case S_attr_gap:	/* Expecting attribute or = or > */
			if (isspace((int) c))
				break;	/* Gap after attribute */

			if (c == '>')
				goto got_element_open;
			else if (c == '=')
				context->state = S_equals;
			else
				goto S_attr; /* Get next attribute */
			break;

		    case S_equals:	/* After attr = */ 
			if (isspace((int) c))
				break;	/* Before attribute value */

			if (c == '>')
			    {		/* End of tag */
				HTTRACE(SGML_TRACE, "found = but no value\n");
				goto got_element_open;
			    }
			else if (c == '\'')
				context->state = S_squoted;
			else if (c == '"')
				context->state = S_dquoted;
			else
				goto S_value;
			break;

		    S_value:
			context->state = S_value;
			HTChunk_truncate(string, context->token);
		    case S_value:
			if (isspace((int) c) || c == '>')
			    {
				HTChunk_terminate(string);
				handle_attribute_value(context);
				context->token = HTChunk_size(string);
				goto S_tag_gap;
			    }
			else
				HTChunk_putc(string, c);
			break;
		
		    case S_squoted:	/* Quoted attribute value */
			if (c == '\'')
			    {
				HTChunk_terminate(string);
				handle_attribute_value(context);
				context->token = HTChunk_size(string);
				context->state = S_tag_gap;
			    }
			else if (c && c != '\n' && c != '\r')
				HTChunk_putc(string, c);
			break;
	
		    case S_dquoted:	/* Quoted attribute value */
			if (c == '"')
			    {
				HTChunk_terminate(string);
				handle_attribute_value(context);
				context->token = HTChunk_size(string);
				context->state = S_tag_gap;
			    }
			else if (c && c != '\n' && c != '\r')
				HTChunk_putc(string, c);
			break;

		    case S_end:	/* </ */
			if (isalnum((int) c))
				HTChunk_putc(string, c);
			else
			    {		/* End of end tag name */
				HTTag *t;
				char * first;
				HTChunk_terminate(string);
				if ((first=HTChunk_data(string))!=NULL && *first != '\0')
				        t = SGMLFindTag(dtd, HTChunk_data(string));
				else
				    	/* Empty end tag */
					/* Original code popped here one
					   from the stack. If this feature
					   is required, I have to put the
					   stack back... -- msa */
					t = NULL;
				if (!t) {
				    HTTRACE(SGML_TRACE, "Unknown end tag </%s>\n" _ HTChunk_data(string));
				    (*context->actions->unparsed_end_element)
					(context->target, HTChunk_data(string), HTChunk_size(string));
				} else {
				    context->current_tag = NULL;
				    end_element(context, t);
				}
				HTChunk_clear(string);
				context->current_attribute_number = INVALID;
				if (c != '>')
				    {
					if (!isspace((int) c))
					    HTTRACE(SGML_TRACE, "`</%s%c' found!\n" _ HTChunk_data(string) _ c);
					context->state = S_junk_tag;
				    }
				else
				    {
					text = b;
					count = 0;
					context->state = S_text;
				    }
			    }
			break;

		    case S_junk_tag:
			if (c == '>')
			    {
				text = b;
				count = 0;
				context->state = S_text;
			    }
			break;

			/*
			** Scanning (actually skipping) declarations
			*/
		    case S_md:
			if (c == '-')
				context->state = S_com_1;
			else if (c == '"')
				context->state = S_md_dqs;
			else if (c == '\'')
				context->state = S_md_sqs;
			else if (c == '>')
			    {
				text = b;
				count = 0;
				context->state = S_text;
			    }
			break;

		    case S_md_dqs: /* Skip double quoted string */
			if (c == '"')
				context->state = S_md;
			else if (c == '>')
			    {
				text = b;
				count = 0;
				context->state = S_text;
			    }
			break;

		    case S_md_sqs: /* Skip single quoted string */
			if (c == '\'')
				context->state = S_md;
			else if (c == '>')
			    {
				text = b;
				count = 0;
				context->state = S_text;
			    }
			break;

		    case S_com_1: /* Starting a comment? */
			context->state = (c == '-') ? S_com : S_md;
			if (c == '>')
			    {
				text = b;
				count = 0;
				context->state = S_text;
			    }
			break;

		    case S_com: /* ..within comment */
			if (c == '-')
				context->state = S_com_2;
			break;

		    case S_com_2: /* Ending a comment ? */
			context->state = (c == '-') ? S_com_2a : S_com;
			break;
		    
		    case S_com_2a:
			if (c == '>') {
			    text = b;
			    count = 0;
			    context->state = S_text;
			} else if (c == '-') {
			    context->state = S_com_2a;
			} else
			    context->state = S_com;
			break;
		    }
	    }
	if (count > 0)
		PUTB(text, count);
	return HT_OK;
    }
Beispiel #4
0
static void raptor_rdfa_start_element(void *user_data,
                                      raptor_xml_element *xml_element)
{
  raptor_qname* qname = raptor_xml_element_get_name(xml_element);
  int nb_attributes = raptor_xml_element_get_attributes_count(xml_element);
  raptor_qname** attrs = raptor_xml_element_get_attributes(xml_element);
  unsigned char* localname = raptor_qname_to_counted_name(qname, NULL);
  const raptor_namespace* qname_ns = raptor_qname_get_namespace(qname);
  int nb_namespaces = 0;
  const char** namespaces = NULL;
  int nb_defaulted = 0;
  char** attr = NULL;
  int i;
  const char* ns_name = NULL;
  const char* ns_uri = NULL;

  if(nb_attributes > 0) {
    /* Everything written into 'attr' is a shared pointer into
     * xml_element or contained objects - qnames, namespaces, uris
     * and values
     */
    attr = (char**)malloc(sizeof(char*) * (1 + (nb_attributes * 5)));
    for(i = 0; i < nb_attributes; i++) {
      const raptor_namespace* attr_ns = attrs[i]->nspace;
      char** attri = &attr[5 * i];
      /* 5 tuple: (localname, prefix, URI, value, end) */
      attri[0] = (char*)attrs[i]->local_name;
      attri[1] = attr_ns ? (char*)attr_ns->prefix : NULL;
      attri[2] = attr_ns ? (char*)raptor_uri_as_string(attr_ns->uri) : NULL;
      attri[3] = (char*)attrs[i]->value;
      attri[4] = attri[3] + attrs[i]->value_length;
    }
    attr[5 * i] = NULL;
  }

/*
 * @ctx:  the user data (XML parser context)
 * @localname:  the local name of the element
 * @prefix:  the element namespace prefix if available
 * @URI:  the element namespace name if available
 * @nb_namespaces:  number of namespace definitions on that node
 * @namespaces:  pointer to the array of prefix/URI pairs namespace definitions
 * @nb_attributes:  the number of attributes on that node
 * @nb_defaulted:  the number of defaulted attributes. The defaulted
 *                  ones are at the end of the array
 * @attributes:  pointer to the array of (localname/prefix/URI/value/end)
 *               attribute values.
 */
  if(qname_ns) {
    ns_name = (const char*)raptor_namespace_get_prefix(qname_ns);
    ns_uri = (const char*)raptor_uri_as_string(qname_ns->uri);
  }

  start_element(user_data, (const char*)localname,
                ns_name,
                ns_uri,
                nb_namespaces,
                (const char**)namespaces,
                nb_attributes,
                nb_defaulted,
                (const char**)attr);
  if(attr)
    free(attr);
  raptor_free_memory(localname);
}