Ejemplo n.º 1
0
/*
 * Tells whether transition s (leaving state a) and transition t (leaving
 * state b) may be treated as the same transition.
 *
 * Returns 1 when they are identical, or identical up to acceptance
 * conditions in a situation where those conditions may be ignored;
 * returns 0 otherwise.
 *
 * use_scc enables the relaxed comparison that relies on the `incoming`
 * fields and on the bad_scc set (the original comments speak of these as
 * SCC information).
 */
int same_gtrans(GState *a, GTrans *s, GState *b, GTrans *t, int use_scc) 
{
  /* target state and both label sets must agree in every case */
  if((s->to != t->to) ||
     ! same_sets(s->pos, t->pos, 1) ||
     ! same_sets(s->neg, t->neg, 1))
    return 0; /* transitions differ */

  /* identical acceptance sets: the transitions are exactly the same */
  if(same_sets(s->final, t->final, 0))
    return 1;

  /* acceptance sets differ: without the scc test there is nothing left to try */
  if(!use_scc)
    return 0;

  /* acceptance conditions may be ignored when one of the source states is
     in bad_scc, or when a transition changes the `incoming` value between
     its source and its target */
  if( in_set(bad_scc, a->incoming) ||
      in_set(bad_scc, b->incoming) ||
      (a->incoming != s->to->incoming) ||
      (b->incoming != t->to->incoming) )
    return 1; /* same transitions up to acceptance conditions */

  return 0;
}
Ejemplo n.º 2
0
std::string GumboInterface::serialize_contents(GumboNode* node, enum UpdateTypes doupdates) {
    // Serializes every child of `node` in order and returns the concatenation.
    // Element children are rendered through serialize(); text-like children
    // are copied, with entity substitution unless the parent tag forbids it.
    std::string out;
    std::string tagname   = get_tag_name(node);
    bool raw_text         = in_set(no_entity_sub, tagname);
    bool preserve_ws      = in_set(preserve_whitespace, tagname);
    bool parent_is_inline = in_set(nonbreaking_inline, tagname);

    // True only right after this function appended a "\n" behind a child
    // element; the next whitespace node is then passed through newlinetrim().
    bool newline_added = false;

    GumboVector* kids = &node->v.element.children;
    for (unsigned int idx = 0; idx < kids->length; ++idx) {
        GumboNode* child = static_cast<GumboNode*>(kids->data[idx]);

        switch (child->type) {
        case GUMBO_NODE_TEXT:
            newline_added = false;
            if (raw_text) {
                out.append(child->v.text.text);
            } else {
                out.append(substitute_xml_entities_into_text(std::string(child->v.text.text)));
            }
            break;

        case GUMBO_NODE_ELEMENT:
        case GUMBO_NODE_TEMPLATE: {
            out.append(serialize(child, doupdates));
            std::string childname = get_tag_name(child);
            // a newline follows the child unless parent or child is inline,
            // or the parent preserves whitespace
            newline_added = !(parent_is_inline || preserve_ws || in_set(nonbreaking_inline, childname));
            if (newline_added) {
                out.append("\n");
            }
            break;
        }

        case GUMBO_NODE_WHITESPACE: {
            // keep the whitespace as close to the original as possible
            std::string ws(child->v.text.text);
            if (newline_added) {
                newlinetrim(ws);
            }
            out.append(ws);
            newline_added = false;
            break;
        }

        case GUMBO_NODE_CDATA:
            out.append("<![CDATA[");
            out.append(child->v.text.text);
            out.append("]]>");
            newline_added = false;
            break;

        case GUMBO_NODE_COMMENT:
            // note: a comment leaves the newline flag untouched
            out.append("<!--");
            out.append(child->v.text.text);
            out.append("-->");
            break;

        default:
            fprintf(stderr, "unknown element of type: %d\n", child->type); 
            newline_added = false;
            break;
        }
    }

    return out;
}
/*
 * Runs tc_mangle_cmdline() for `option` on the given command line and checks
 * the outcome. hasval selects whether a value pointer is handed to the
 * mangler. Returns 0 when every check passes, 1 on the first failed check.
 */
static int test_mangle_option(int argc, char **argv, const char *option, int hasval)
{
    int before = argc, k = 0, status;
    char *snapshot[MAX_OPTS];
    const char *value = NULL;

    assert(argc <= MAX_OPTS);

    /* remember the argument pointers as they were before mangling */
    for (k = 0; k < argc; k++) {
        snapshot[k] = argv[k];
    }

    status = tc_mangle_cmdline(&argc, &argv, option, (hasval) ?(&value) :NULL);
    tc_info("mangling: %i", status);
    DUMP_OPTS(argc, argv);

    if (status == 0) {
        /* zero result: the option (and its value, if any) must be gone */
        int expected = before - ((hasval) ?2 :1);

        if (expected != argc) {
            tc_warn("argument number mismatch (expected %i|got %i)", expected, argc);
            return 1;
        }
        if (in_set(option, (const char **)argv, argc)) {
            tc_warn("option still present");
            return 1;
        }
        /* NOTE(review): this walks `before` entries although argc shrank -- confirm intended */
        for (k = 0; k < before; k++) {
            if (!in_set(argv[k], (const char **)snapshot, before)) {
                tc_warn("missing argument: %s", argv[k]);
                return 1;
            }
        }
        return 0;
    }

    /* non-zero result: the command line must be untouched */
    if (before != argc) {
        tc_warn("missing argument (argc not changed)");
        return 1;
    }
    for (k = 0; k < argc; k++) {
        if (snapshot[k] != argv[k]
         || strcmp(snapshot[k], argv[k]) != 0) {
            tc_warn("argument diversion (%s VS %s @ %i)", snapshot[k], argv[k], k);
            return 1;
        }
    }
    if (!in_set(option, (const char **)argv, argc)) {
        tc_warn("option still present");
        return 1;
    }
    return 0;
}
Ejemplo n.º 4
0
/* Tests whether character c belongs to the parser's `splitters` set. */
static bool is_splitters(const char c, const parser_type * parser)
{
  const bool member = in_set(c , parser->splitters);
  return member;
}
Ejemplo n.º 5
0
/* Tests whether character c belongs to the parser's `specials` set. */
static bool is_special(const char c, const parser_type * parser)
{
  const bool member = in_set(c , parser->specials);
  return member;
}
Ejemplo n.º 6
0
/* Tests whether character c belongs to the parser's `quoters` set. */
static bool is_in_quoters(const char c, const parser_type * parser)
{
  const bool member = in_set(c , parser->quoters);
  return member;
}
Ejemplo n.º 7
0
/*
 * Walks every state's transition list and drops transitions made redundant
 * by another transition of the same state. `changed` counts the removals.
 */
int simplify_gtrans() /* simplifies the transitions */
{
  int changed = 0;
  GState *s;
  GTrans *t, *t1;

  /* when statistics are enabled, record resource usage into tr_debut */
  if(tl_stats) getrusage(RUSAGE_SELF, &tr_debut);

  /* gstates acts as list head: the walk ends when it is reached again */
  for(s = gstates->nxt; s != gstates; s = s->nxt) {
    t = s->trans->nxt;
    while(t != s->trans) { /* tries to remove t */
      /* s->trans is the end marker of the list; presumably this copies t
         into it so that the search below always stops there -- confirm */
      copy_gtrans(t, s->trans);
      t1 = s->trans->nxt;
      /* advance t1 until it is a different node than t, with the same
         target, that passes the included_set tests on pos/neg and the
         acceptance-condition test */
      while ( !((t != t1) 
          && (t1->to == t->to) 
          && included_set(t1->pos, t->pos, 1) 
          && included_set(t1->neg, t->neg, 1) 
          && (included_set(t->final, t1->final, 0)  /* acceptance conditions of t are also in t1 or may be ignored */
              || (tl_simp_scc && ((s->incoming != t->to->incoming) || in_set(bad_scc, s->incoming))))) )
        t1 = t1->nxt;
      /* stopping before the marker means a real transition matched */
      if(t1 != s->trans) { /* remove transition t */
        /* t is not unlinked: the successor's contents are moved into t and
           the successor node is freed instead */
        GTrans *free = t->nxt;
        t->to = free->to;
        copy_set(free->pos, t->pos, 1);
        copy_set(free->neg, t->neg, 1);
        copy_set(free->final, t->final, 0);
        t->nxt = free->nxt;
        /* if the freed node was the marker, t takes over that role */
        if(free == s->trans) s->trans = t;
        free_gtrans(free, 0, 0);
        changed++;
      }
      else
        t = t->nxt;
    }
Ejemplo n.º 8
0
/*
 * Frees the states of the alternating automaton that are not accessible and
 * simplifies the transitions of the ones that are. Accessible states are
 * counted in astate_count.
 */
void simplify_astates() /* simplifies the alternating automaton */
{
    ATrans *t;
    int i, *acc = make_set(-1, 0); /* no state is accessible initially */

    /* targets of the transitions stored at index 0 are accessible */
    for (t = transition[0]; t; t = t->nxt)
    {
        merge_sets(acc, t->to, 0);    /* all initial states are accessible */
    }

    /* states are visited from the highest index down to 1; each accessible
       state adds the targets of its own transitions to acc */
    for (i = node_id - 1; i > 0; i--)
    {
        if (!in_set(acc, i))   /* frees unaccessible states */
        {
            label[i] = ZN;
            free_atrans(transition[i], 1);
            transition[i] = (ATrans *)0;
            continue;
        }

        astate_count++;
        simplify_atrans(&transition[i]);
        for (t = transition[i]; t; t = t->nxt)
        {
            merge_sets(acc, t->to, 0);
        }
    }

    tfree(acc);
}
Ejemplo n.º 9
0
/*
 * Recursively splits `number` into values recorded in `set`, trying
 * current_value first and continuing with current_value + 2 on the
 * remainder.
 *
 * Returns EXIT_SUCCESS when number is 0 or a value could be recorded, and
 * EXIT_FAILURE when number < current_value, when the remainder could not be
 * split (even number), or when the chosen value is already in the set.
 * in_set() is expected to return EXIT_FAILURE for "not present", which is
 * how this code tests it.
 */
int break_to_odd_set(
        unsigned int number, 
        unsigned int current_value, 
        unsigned int* set)
{
    unsigned int chosen;

    if (number == 0){
        return EXIT_SUCCESS;
    }

    if (number < current_value){
        return EXIT_FAILURE;
    }

    if (break_to_odd_set(number - current_value, current_value + 2, set) == EXIT_FAILURE){
        /* the remainder cannot be split: an odd number may still be
           recorded as a whole, an even one cannot */
        if (number % 2 == 0){
            return EXIT_FAILURE;
        }
        chosen = number;
    }
    else{
        chosen = current_value;
    }

    /* every value may be recorded only once */
    if (in_set(set, chosen) != EXIT_FAILURE){
        return EXIT_FAILURE;
    }
    add_to_set(set, chosen);

    return EXIT_SUCCESS;
}
Ejemplo n.º 10
0
/**********************************************************************
 *
 *	followpos
 *
 * Creates followpos array using depth-first traversal.
 * Followpos(i) is the set of positions j such that there is some input 
 * string ...cd... such that i corresponds to this occurence of c and 
 * j to this occurence of d. So for example for a subexpression cd each 
 * lastpos of c is followed by firstpos of d, or for c* each lastpos of
 * c is followed by firstpos in c.
 */
static bool followpos(void)
{
	REG1 int	i; 
	REG2 int	j;
	REG3 node_t*	inode;
	REG4 node_t*	jnode;
	REG5 set_t**	followptr;
	
	if ((followptr = calloc((rbuf.root+1) * sizeof(set_t *), 1)) == NULL)
		return FALSE;
	rbuf.follow_array = followptr;
	inode = rbuf.tree;
	for (i = 0; i <= rbuf.root; i++, inode++, followptr++) {
		if (inode->type != ID && inode->type != CLASS_ID ) {
			*followptr = NULL;
			continue;
		}
		if ((*followptr = calloc(rbuf.setsize, 1)) == NULL)
			return FALSE;
		jnode = &rbuf.tree[i+1];
		for (j = i+1; j <= rbuf.root; j++, jnode++) {
			switch (jnode->type) {
			    case CAT:
				if (in_set(rbuf.tree[jnode->val.next.left].
				    lastpos, i))
				    set_union(
				      *followptr,
				      *followptr,
				      rbuf.tree[jnode->val.next.right].firstpos,
				      rbuf.setsize);
				break;
			    case CLOSURE:
				if (in_set(jnode->lastpos, i))
				    set_union(
				      *followptr,
				      *followptr,
				      jnode->firstpos,
				      rbuf.setsize);
				break;
			    default:
				break;
			}
		}
	}
	return TRUE;
}
Ejemplo n.º 11
0
/**********************************************************************
 *
 *	print_tree
 */
static void print_tree(void)
{
	REG1 int	i,j;
	char*		char_image(int c);
	
	for (i = 0; i <= rbuf.root; i++) {
		printf("%2d ",i);
		switch (rbuf.tree[i].type) {
			case CAT:
				printf("CAT %2d %2d", 
					rbuf.tree[i].val.next.left,
					rbuf.tree[i].val.next.right);
				break;
			case OR:
				printf("OR  %2d %2d",
					rbuf.tree[i].val.next.left,
					rbuf.tree[i].val.next.right);
				break;
			case CLOSURE:
				printf("CLO %2d   ",
					rbuf.tree[i].val.next.left);
				break;
			case ID:
				printf("ID %3s   ", 
					char_image(rbuf.tree[i].val.item));
				break;
			case EMPTY_ID:
				printf("EMPTY_ID ");
				break;
			case CLASS_ID:
				printf("CLASS ID ");
				break;
		}
		printf(" f");
		for (j = 0; j <= rbuf.root; j++)
			if (in_set(rbuf.tree[i].firstpos, j))
				printf(" %2d",j);
		printf("\tl");
		for (j = 0; j <= rbuf.root; j++)
			if (in_set(rbuf.tree[i].lastpos, j))
				printf(" %2d",j);
		printf("\n");
	}
	fflush(stdout);
}
Ejemplo n.º 12
0
/*-------------------------------------------------------------------
  dec  returns the index of a point admissible for the set `set`, lying
       between the ie-th and the i-th point and not to the left of x.
       *cutp is stepped backwards together with i. Returns -128 when
       i<=ie or when the scan stops without finding such a point.
-------------------------------------------------------------------*/
static int16_t dec(struct cut_elm **cutp, int16_t i, int16_t ie, int16_t set, int16_t x)
{
  if (i<=ie) return -128;

  /* step back one point at a time while the point is still right of x */
  for (--(*cutp), --i; (*cutp)->x > x; --(*cutp), --i)
  {
    char var=(*cutp)->var & 0x7F;   /* top bit of var is masked off */
    if (i==ie)          return i;   /* the boundary point is always accepted */
    if (in_set(var,set)) return i;
  }
  return -128;
}
Ejemplo n.º 13
0
/*-------------------------------------------------------------------
  inc  returns the index of a point admissible for the set `set`, lying
       between the i-th and the ie-th point and not to the right of x.
       *cutp is stepped forwards together with i. Returns -128 when
       i>=ie or when the scan stops without finding such a point.
-------------------------------------------------------------------*/
static int16_t inc(struct cut_elm **cutp, int16_t i, int16_t ie, int16_t set, int16_t x)
{
  if (i>=ie) return -128;

  /* step forward one point at a time while the point is still left of x */
  for (++(*cutp), ++i; (*cutp)->x < x; ++(*cutp), ++i)
  {
    char var=(*cutp)->var & 0x7F;   /* top bit of var is masked off */
    if (i==ie)          return i;   /* the boundary point is always accepted */
    if (in_set(var,set)) return i;
  }
  return -128;
}
Ejemplo n.º 14
0
/**********************************************************************
 *
 *	print_followpos
 */
static void print_followpos(void)
{
	REG1 int	i, j;
	
	printf("Followpositions:\n");
	for (i = 0; i <= rbuf.root; i++) {
		if (rbuf.follow_array[i] == NULL)
			continue;
		printf("%2d:", i);
		for (j = 0; j <= rbuf.root; j++)
			if (in_set(rbuf.follow_array[i], j))
				printf(" %d", j);
		printf("\n");
	}
	fflush(stdout);
}
Ejemplo n.º 15
0
// Collects, depth first, the tag names found under `node` (including node
// itself) that are members of manifest_properties. Non-element nodes
// contribute nothing.
QStringList GumboInterface::get_properties(GumboNode* node)
{
    QStringList found;
    if (node->type == GUMBO_NODE_ELEMENT) {
        std::string tagname = get_tag_name(node);
        if (in_set(manifest_properties, tagname)) {
            found.append(QString::fromStdString(tagname));
        }
        // descend into every child and append whatever it reports
        const GumboVector* kids = &node->v.element.children;
        for (unsigned int k = 0; k != kids->length; ++k) {
            GumboNode* kid = static_cast<GumboNode*>(kids->data[k]);
            found.append(get_properties(kid));
        }
    }
    return found;
}
Ejemplo n.º 16
0
/*
 * Appends el to the set s unless it is already a member.
 *
 * Returns TRUE when the element was added; FALSE when s is NULL, el is
 * already present, or the storage could not be grown (the set is left
 * unchanged in that case).
 */
bool_t set_add_element(set_t *s, u64 el) {
	u64 *grown = NULL;

	if (s == NULL || in_set(s, el))
		return FALSE;

	/* realloc(NULL, n) behaves like malloc(n), so one call covers both the
	 * empty and the non-empty set, and the allocation is always sized from
	 * s->size -- the same value used as write index below */
	grown = (u64*) realloc(s->set, (s->size + 1) * sizeof(u64));
	if (grown == NULL)
		return FALSE;

	grown[s->size] = el;
	s->set = grown;
	s->size++;

	return TRUE;
}
Ejemplo n.º 17
0
/*        the workhorse. note that it calls itself.
           args:

              int *set  == an array of ints, indexes into the permutable item list
                           (only handed on to the recursive calls here)
              int len  == the number of items to permute.
              char **vals  == an array of the strings that are being permuted.
              int lev  == which slot we are filling. Each level of recursion
                          selects the next permuted item; the deeper we go,
                          the more items have been selected.
              int *currset  == the working list of permuted items. When lev
                           equals len it holds a complete permutation.
              int (*func)(int *, int, char **) == called once per complete
                           permutation with currset, len and vals. Its int
                           result is ignored.
*/
void permute( int *set, int len, char **vals, int lev, int *currset, int (*func)(int *, int, char **))
{
	int cand;

	if( lev != len )
	{
		/* try every item that in_set() does not report for the slots before lev */
		for(cand=0;cand<len;cand++)
		{
			if( in_set(currset, lev, cand) )
				continue;
			currset[lev] = cand;
			permute( set, len, vals, lev+1, currset, func );
		}
		return;
	}

	/* all slots filled: hand the finished permutation to the callback */
	func(currset, len, vals);
}
Ejemplo n.º 18
0
// Collects, depth first, the manifest properties implied by `node` and its
// descendants: tag names that are members of manifest_properties, plus
// "remote-resources" for every element whose src attribute is not a
// relative URL. Non-element nodes contribute nothing.
QStringList GumboInterface::get_properties(GumboNode* node)
{
    QStringList found;
    if (node->type == GUMBO_NODE_ELEMENT) {
        std::string tagname = get_tag_name(node);
        if (in_set(manifest_properties, tagname)) {
            found.append(QString::fromStdString(tagname));
        }

        GumboAttribute* src = gumbo_get_attribute(&node->v.element.attributes, "src");
        if (src) {
            if (!QUrl(QString::fromUtf8(src->value)).isRelative()) {
                found.append(QString("remote-resources"));
            }
        }

        // descend into every child and append whatever it reports
        const GumboVector* kids = &node->v.element.children;
        for (unsigned int k = 0; k != kids->length; ++k) {
            GumboNode* kid = static_cast<GumboNode*>(kids->data[k]);
            found.append(get_properties(kid));
        }
    }
    return found;
}
Ejemplo n.º 19
0
/*
 * Recursively enumerates the permutations of the len characters in vals.
 *
 *   set      - only handed on to the recursive calls
 *   len      - number of characters to permute
 *   vals     - the characters being permuted
 *   lev      - slot being filled; the outermost call passes 0
 *   currset  - working array of indexes into vals; complete when lev == len
 *   print_it - non-zero: print each permutation; zero: look each one up in
 *              dict and report/count the hits in match_count
 *
 * match_count is cleared only by the outermost call (lev == 0), so that it
 * accumulates over the whole enumeration instead of being wiped on every
 * recursive entry.
 */
void permute( int *set, int len, char *vals, int lev, int *currset, int print_it)
{
	int i;

	if( lev == 0 )
		match_count=0;

	if( lev == len )
	{
		if(print_it)
		{
			/* print out the set */
			for(i=0;i<len;i++)
			{
				printf("%c", vals[currset[i]]);
			}
			printf("\n");
		}
		else
		{
			char buf1[20];

			/* the word plus its terminator must fit into buf1 */
			if( len < 0 || len >= (int)sizeof(buf1) )
				return;

			for(i=0;i<len;i++)
			{
				buf1[i] = vals[currset[i]];
			}
			buf1[i] = 0;
			
			if( ast_hashtab_lookup(dict, buf1) )
			{
				match_count++;
				printf("   Found:   %s in the dictionary!\n", buf1);
			}
		}
		return;
	}

	/* try every index that in_set() does not report for the slots before lev */
	for(i=0;i<len;i++)
	{
		if( !in_set(currset, lev, i) )
		{
			currset[lev] = i;
			permute( set, len, vals, lev+1, currset, print_it );
		}
	}
}
Ejemplo n.º 20
0
// Renders `node` and everything below it as indented markup.
//   lvl          - nesting level; the indent is (lvl-1) * indent_chars.length()
//                  copies of the first character of indent_chars
//   indent_chars - indentation unit (must not be empty: at(0) is read)
// Structural tags get their own lines, inline tags are emitted in place,
// void tags are written self-closed.
std::string GumboInterface::prettyprint(GumboNode* node, int lvl, const std::string indent_chars)
{
    // the document node has no tag of its own: doctype, then its children
    if (node->type == GUMBO_NODE_DOCUMENT) {
        std::string doc = build_doctype(node);
        doc += prettyprint_contents(node, lvl + 1, indent_chars);
        return doc;
    }

    std::string tagname    = get_tag_name(node);
    std::string parentname = get_tag_name(node->parent);
    bool in_head           = (parentname == "head");
    bool structural        = in_set(structural_tags, tagname);
    bool inline_tag        = in_set(nonbreaking_inline, tagname);

    // attribute string
    std::string atts;
    bool raw_text = in_set(no_entity_sub, tagname);
    const GumboVector* attribs = &node->v.element.attributes;
    for (unsigned int k = 0; k < attribs->length; ++k) {
        GumboAttribute* attr = static_cast<GumboAttribute*>(attribs->data[k]);
        atts += build_attributes(attr, raw_text);
    }

    bool void_tag = in_set(void_tags, tagname);

    // children of a structural tag (other than html) go one level deeper
    std::string contents;
    if (!void_tag) {
        int child_lvl = (structural && tagname != "html") ? lvl + 1 : lvl;
        contents = prettyprint_contents(node, child_lvl, indent_chars);
    }

    bool keep_ws = in_set(preserve_whitespace, tagname);
    if (!(keep_ws || inline_tag)) {
        rtrim(contents);
    }

    char fill  = indent_chars.at(0);
    int  width = indent_chars.length(); 
    std::string indent_space((lvl - 1) * width, fill);

    // an extra blank line follows block output except inside head / for html
    bool blank_after = !in_head && (tagname != "html");

    // Only void tags are written self-closed. (XHTML output could also
    // self-close non-void tags whose contents are empty; that is not done.)
    if (void_tag) {
        std::string self_closed = "<" + tagname + atts + "/>";
        if (inline_tag) {
            // always add newline after br tags when they are children of structural tags
            if ((tagname == "br") && in_set(structural_tags, parentname)) {
                self_closed += "\n";
                if (blank_after) self_closed += "\n";
            }
            return self_closed;
        }
        if (blank_after) self_closed += "\n";
        return indent_space + self_closed + "\n";
    }

    // general case: start tag, contents, end tag
    std::string open_tag  = "<" + tagname + atts + ">";
    std::string close_tag = "</" + tagname + ">";
    std::string out;

    if (structural) {
        out = indent_space + open_tag;
        if (!contents.empty()) {
            out += "\n" + contents + "\n" + indent_space;
        }
        out += close_tag + "\n";
        if (blank_after) out += "\n";
        return out;
    }

    if (inline_tag) {
        out = open_tag;
        out += contents;
        out += close_tag;
        return out;
    }

    // all other tags
    if (!keep_ws) {
        ltrim(contents);
    }
    out = indent_space + open_tag;
    out += contents;
    out += close_tag + "\n";
    if (blank_after) out += "\n";
    return out;
}
Ejemplo n.º 21
0
// Renders the children of `node` for pretty printing and returns the result.
// Child elements go through prettyprint(); text is entity-substituted and
// whitespace-condensed where the parent tag allows it. An empty <title> is
// injected when node is a head element without one.
std::string GumboInterface::prettyprint_contents(GumboNode* node, int lvl, const std::string indent_chars) 
{
    std::string out;
    std::string tagname    = get_tag_name(node);
    bool raw_text          = in_set(no_entity_sub, tagname);
    bool keep_ws           = in_set(preserve_whitespace, tagname);
    bool parent_inline     = in_set(nonbreaking_inline, tagname);
    bool parent_structural = in_set(structural_tags, tagname);
    char fill              = indent_chars.at(0);
    int  width             = indent_chars.length(); 
    std::string indent((lvl - 1) * width, fill);
    const std::string blanks(" \t\v\f\r\n");
    bool saw_block_child   = false;

    // structural parents and the document start their contents on a fresh line
    char last_char = (parent_structural || (tagname == "#document")) ? '\n' : 'x';
    bool title_missing = (tagname == "head");

    GumboVector* kids = &node->v.element.children;
    for (unsigned int idx = 0; idx < kids->length; ++idx) {

        GumboNode* child = static_cast<GumboNode*>(kids->data[idx]);

        switch (child->type) {
        case GUMBO_NODE_TEXT: {
            std::string text = raw_text
                ? std::string(child->v.text.text)
                : substitute_xml_entities_into_text(std::string(child->v.text.text));

            // text in a structural element that follows a newline gets indented
            if (parent_structural && last_char == '\n') {
                out.append(indent);
                ltrim(text);
            }
            // okay to condense whitespace
            if (!(keep_ws || parent_structural)) {
                condense_whitespace(text);
            }
            out.append(text);
            break;
        }

        case GUMBO_NODE_ELEMENT:
        case GUMBO_NODE_TEMPLATE: {
            std::string rendered  = prettyprint(child, lvl, indent_chars);
            std::string childname = get_tag_name(child);
            if (childname == "title") title_missing = false;

            bool child_inline = in_set(nonbreaking_inline, childname);
            if (!child_inline) {
                // block children always start on a new line
                saw_block_child = true;
                if (last_char != '\n') {
                    out.append("\n");
                    if (tagname != "head" && tagname != "html") out.append("\n");
                    last_char = '\n';
                }
            } else if (parent_structural && (last_char == '\n')) {
                // inline child of a structural element after a newline: indent it
                out.append(indent);
                ltrim(rendered);
            }
            out.append(rendered);
            break;
        }

        case GUMBO_NODE_WHITESPACE:
            if (keep_ws) {
                out.append(child->v.text.text);
            } else if ((parent_inline || in_set(other_text_holders, tagname))
                       && blanks.find(last_char) == std::string::npos) {
                // collapse the run to one space unless output already ends in whitespace
                out.append(" ");
            }
            break;

        case GUMBO_NODE_CDATA:
            out.append("<![CDATA[");
            out.append(child->v.text.text);
            out.append("]]>");
            break;

        case GUMBO_NODE_COMMENT:
            out.append("<!--");
            out.append(child->v.text.text);
            out.append("-->");
            break;

        default:
            fprintf(stderr, "unknown element of type: %d\n", child->type); 
            break;
        }

        // remember the last character written so far
        if (!out.empty()) {
            last_char = *out.rbegin();
        }
    }

    // inject an empty title into head if one is missing
    if (title_missing) {
        if (last_char != '\n') out.append("\n");
        out.append(indent + "<title></title>\n");
        last_char = '\n';
    }

    // treat inline tags containing block tags like a block tag
    if (parent_inline && saw_block_child) {
        if (last_char != '\n') out.append("\n\n");
        out.append(indent);
    }

    return out;
}
Ejemplo n.º 22
0
// Serializes `node` and its subtree to markup. The bits set in `doupdates`
// select the rewrites applied on the way: attribute source/style updates,
// replacing the body contents with m_newbody, rewriting urls in head-level
// style elements, dropping head-level link elements and appending
// m_newcsslinks to head.
std::string GumboInterface::serialize(GumboNode* node, enum UpdateTypes doupdates) {
    // the document node has no tag of its own: doctype, then its children
    if (node->type == GUMBO_NODE_DOCUMENT) {
        std::string doc = build_doctype(node);
        doc += serialize_contents(node, doupdates);
        return doc;
    }

    std::string tagname = get_tag_name(node);
    bool special    = in_set(special_handling, tagname);
    bool void_tag   = in_set(void_tags, tagname);
    bool raw_text   = in_set(no_entity_sub, tagname);
    bool has_links  = in_set(href_src_tags, tagname);

    // attribute string
    std::string atts;
    const GumboVector* attribs = &node->v.element.attributes;
    for (unsigned int k = 0; k < attribs->length; ++k) {
        GumboAttribute* attr = static_cast<GumboAttribute*>(attribs->data[k]);
        atts += build_attributes(attr, raw_text, ((doupdates & SourceUpdates) && has_links), (doupdates & StyleUpdates));
    }

    // the html tag must carry an xmlns attribute
    if ((tagname == "html") && (atts.find("xmlns=") == std::string::npos)) {
        atts += " xmlns=\"http://www.w3.org/1999/xhtml\"";
    }

    // contents: either the replacement body or the serialized children
    std::string contents;
    if ((tagname == "body") && (doupdates & BodyUpdates)) {
        contents = m_newbody;
    } else {
        contents = serialize_contents(node, doupdates);
    }

    if ((doupdates & StyleUpdates) && (tagname == "style")) {
        if ((node->parent->type == GUMBO_NODE_ELEMENT) && 
            (node->parent->v.element.tag == GUMBO_TAG_HEAD)) {
            contents = update_style_urls(contents);
        }
    }

    if (special) {
        ltrimnewlines(contents);
        rtrim(contents);
        contents.append("\n");
    }

    // link elements directly under head are dropped when links are updated
    if ((doupdates & LinkUpdates) && (tagname == "link")) {
        if ((node->parent->type == GUMBO_NODE_ELEMENT) && 
            (node->parent->v.element.tag == GUMBO_TAG_HEAD)) {
            return "";
        }
    }

    // assemble: start tag (self-closed for void tags), contents, end tag
    std::string results;
    results.append("<");
    results.append(tagname);
    results.append(atts);
    if (void_tag) results.append("/");
    results.append(">");
    if (special) results.append("\n");
    results.append(contents);

    if ((doupdates & LinkUpdates) && (tagname == "head")) {
        results.append(m_newcsslinks);
    }

    if (!void_tag) {
        results.append("</" + tagname + ">");
    }
    if (special) results.append("\n");
    return results;
}
Ejemplo n.º 23
0
// Renders `node` and everything below it as indented markup.
//   lvl          - nesting level; the indent is (lvl-1) * indent_chars.length()
//                  copies of the first character of indent_chars
//   indent_chars - indentation unit (must not be empty: at(0) is read)
std::string GumboInterface::prettyprint(GumboNode* node, int lvl, const std::string indent_chars)
{
    // the document node has no tag of its own: doctype, then its children
    if (node->type == GUMBO_NODE_DOCUMENT) {
        std::string doc = build_doctype(node);
        doc += prettyprint_contents(node, lvl + 1, indent_chars);
        return doc;
    }

    std::string atts;
    std::string tagname    = get_tag_name(node);
    std::string parentname = get_tag_name(node->parent);
    bool in_head      = (parentname == "head");
    bool empty_tag    = in_set(empty_tags, tagname);
    bool raw_text     = in_set(no_entity_sub, tagname);
    bool keep_ws      = in_set(preserve_whitespace, tagname);
    // an inline tag directly under body is handled like a block
    bool inline_tag   = in_set(nonbreaking_inline, tagname) && (parentname != "body");
    bool structural   = in_set(structural_tags, tagname);
    bool may_reformat = !(inline_tag || keep_ws);
    char fill         = indent_chars.at(0);
    int  width        = indent_chars.length(); 

    // attribute string
    const GumboVector* attribs = &node->v.element.attributes;
    for (unsigned int k = 0; k < attribs->length; ++k) {
        GumboAttribute* attr = static_cast<GumboAttribute*>(attribs->data[k]);
        atts += build_attributes(attr, raw_text);
    }

    // empty tags are self-closed and get no end tag
    std::string slash   = empty_tag ? "/" : "";
    std::string end_tag = empty_tag ? std::string() : "</" + tagname + ">";

    std::string indent_space((lvl - 1) * width, fill);

    // children of a structural tag (other than html) go one level deeper
    int child_lvl = (structural && tagname != "html") ? lvl + 1 : lvl;
    std::string contents = prettyprint_contents(node, child_lvl, indent_chars);

    if (structural) {
        rtrim(contents);
        if (!contents.empty()) contents.append("\n");
    }

    // remove any leading or trailing whitespace from within paragraphs
    if (tagname == "p") {
        ltrim(contents);
        rtrim(contents);
    }

    bool has_contents = !contents.empty();
    char last_char = has_contents ? *contents.rbegin() : ' ';
    bool parent_inline = in_set(nonbreaking_inline, parentname);
    bool structural_block = may_reformat && structural && has_contents;

    // assemble the result
    std::string out;
    if (!inline_tag && !parent_inline) {
        out.append(indent_space);
    }
    out.append("<" + tagname + atts + slash + ">");

    if (structural_block) {
        out.append("\n");
    }
    out.append(contents);
    if (structural_block && (last_char != '\n')) {
        out.append("\n");
    }

    // indent before a structural end tag
    if (!inline_tag && structural && !end_tag.empty() && has_contents) {
        out.append(indent_space);
    }
    out.append(end_tag);

    // trailing newline(s): two outside head (and not for html), else one
    if ((may_reformat || tagname == "br") && !parent_inline) {
        out.append((!in_head && tagname != "html") ? "\n\n" : "\n");
    }

    return out;
}
Ejemplo n.º 24
0
// Renders the children of `node` for pretty printing and returns the result.
// Child elements go through prettyprint(); text is entity-substituted and,
// depending on the parent tag, indented or whitespace-condensed.
std::string GumboInterface::prettyprint_contents(GumboNode* node, int lvl, const std::string indent_chars) 
{
    std::string out;
    std::string tagname    = get_tag_name(node);
    bool raw_text          = in_set(no_entity_sub, tagname);
    bool keep_ws           = in_set(preserve_whitespace, tagname);
    bool parent_inline     = in_set(nonbreaking_inline, tagname);
    bool parent_structural = in_set(structural_tags, tagname);
    char fill              = indent_chars.at(0);
    int  width             = indent_chars.length(); 
    const std::string blanks(" \t\v\f\r\n");

    GumboVector* kids = &node->v.element.children;
    for (unsigned int idx = 0; idx < kids->length; ++idx) {

        GumboNode* child = static_cast<GumboNode*>(kids->data[idx]);

        switch (child->type) {
        case GUMBO_NODE_TEXT: {
            std::string text = raw_text
                ? std::string(child->v.text.text)
                : substitute_xml_entities_into_text(std::string(child->v.text.text));

            if (parent_structural) {
                // text directly inside a structural element is indented
                out.append(std::string((lvl - 1) * width, fill));
                ltrim(text);
            } else if (!keep_ws) {
                // okay to condense whitespace
                condense_whitespace(text);
            }
            out.append(text);
            break;
        }

        case GUMBO_NODE_ELEMENT:
        case GUMBO_NODE_TEMPLATE:
            out.append(prettyprint(child, lvl, indent_chars));
            break;

        case GUMBO_NODE_WHITESPACE:
            if (keep_ws) {
                out.append(child->v.text.text);
            } else if (parent_inline || in_set(other_text_holders, tagname)) {
                // collapse the run to one space unless output already ends in whitespace
                char last_char = out.empty() ? 'x' : *out.rbegin();
                if (blanks.find(last_char) == std::string::npos) {
                    out.append(" ");
                }
            }
            break;

        case GUMBO_NODE_CDATA:
            out.append("<![CDATA[");
            out.append(child->v.text.text);
            out.append("]]>");
            break;

        case GUMBO_NODE_COMMENT:
            out.append("<!--");
            out.append(child->v.text.text);
            out.append("-->");
            break;

        default:
            fprintf(stderr, "unknown element of type: %d\n", child->type); 
            break;
        }
    }

    return out;
}
Ejemplo n.º 25
0
/* Reports whether character c is a member of the parser's delete set,
   as decided by in_set(). */
static bool is_in_delete_set(const char c , const parser_type * parser) {
  const bool is_member = in_set(c , parser->delete_set);
  return is_member;
}
Ejemplo n.º 26
0
/*------------------------------------------------------------------
  rcut_out  cuts one letter off the right side of segment (ib,ie)
            (the piece being wider than segment (ii,ie)): the first
            one found that is better than trs2, or, if all are worse,
            the best one;
            set selects the set of admissible cut points (ncut in
            total); if set<0 the search runs over all points
            (sets 1..4 are tried in turn);
            returns 1 if better than tol, and 0 if worse;
            imax and pmax - the found right boundary of the letter
            and its probability

  Notes on the code as written:
    - the early-exit threshold actually compared against is tol;
      trs2 is not referenced in this function
      (NOTE(review): header wording vs. code - confirm which is meant);
    - 1 is also returned, with only *imax set, when the candidate is
      not let_or_bad() and its first version's letter is '-';
    - *pmax / *imax are updated only when a candidate's probability
      exceeds the value already stored in *pmax, so the caller is
      expected to initialise *pmax - TODO confirm at call sites;
    - wmin / wmax bound the candidate cut positions: only cuts with
      xe-wmax <= x < xe-wmin are considered (xe = x of cut ie).
------------------------------------------------------------------*/
static char rcut_out(int16_t ib, int16_t ii, int16_t ie, int16_t wmin, int16_t wmax,
                     int16_t set, int16_t tol, int16_t *imax, int16_t *pmax, int16_t mode)
{
  struct cut_elm  *cut; int16_t i;
  struct cut_elm *cutb=cut_list+ib;
  int16_t xe=cut_list[ie].x;
  int16_t x0=xe-sym_width;      //initial cut position
  int16_t xb=xe-wmax;           //left bound
  struct cut_elm  *cutl,*cutr; int16_t il,ir;   //cuts to the left and right of x0
  uchar p;
  int16_t sete=set;
  uchar cc;
  seg_vers *cur_vers;

  //from here on xe is the exclusive upper x limit for candidate cuts
  xe -= wmin;
  //set<0 means "no restriction": iterate over sets 1..4; otherwise the loop runs once
  if (set<0)  { set=1; sete=4; }
  for ( ; set<=sete; set++)
  {
  //first cut at a distance of about sym_width, then move in both directions
    //scan leftwards from ii-1; -128 marks "no candidate on this side"
    i=ii-1; cut=cut_list+i; il=ir=-128;
    while (cut>=cutb && cut->x >= xb)
    {
      //only the low 7 bits of var are passed to in_set
      char var=cut->var & 0x7F;
      //the else below pairs with the inner if: cuts at or right of x0 keep
      //overwriting ir (ending with the one nearest x0); the first admissible
      //cut left of x0 becomes il and ends the scan
      if (cut->x < xe && in_set(var,set))
        if(cut->x >= x0)  { ir=i;  cutr=cut; }
        else              { il=i;  cutl=cut; break; }
      cut--; i--;
    }

    //alternate between the two sides, always trying the candidate nearer to x0
    //(a tie goes to the left one), until both sides are exhausted
    while (il>=0 || ir>=0)
    {
      if (il<0)    goto right;
      else
        if (ir<0)  goto left;
        else
          if (cutr->x-x0 < x0-cutl->x)  goto right;
left:
      //evaluate the piece (il,ie); cc is a bit mask returned by addij
      cc=addij(LC,r,cut_list,vers_list,ncut,il,ie,(char)mode);
      if (cc & 8)                     //wide
        il=-128;                      //give up on the left side: pieces only get wider
      else
      {
        //NOTE(review): bits 4 and 8 are labelled narrow/wide in this function;
        //the meaning of bit 2 is not visible here - confirm against addij
        if ((cc & (2+4+8)) == 0)
          //assignment is intentional: continue only if a versions entry exists
          if (cur_vers=find_vers(il,ie,vers_list))
          {
            p=0;
            if (let_or_bad(&cur_vers->vers))
            {
              p=cur_vers->vers.vers[0].prob;
              //remember the best candidate seen so far
              if (p>*pmax) { *pmax=p; *imax=il; }
              //good enough: stop at the first candidate above tol
              if (p>tol)  return 1;
            }
            else
              if (cur_vers->vers.vers[0].let=='-')
              {
                *imax=il; return 1;
              }
          }
        //presumably steps to the next admissible cut further left, yielding a
        //negative index when none remains - TODO confirm against dec()
        il=dec(&cutl,il,ib,set,xb);
      }
      continue;
right:
      //evaluate the piece (ir,ie)
      cc=addij(LC,r,cut_list,vers_list,ncut,ir,ie,(char)mode);
      if (cc & 4)                     //narrow
        ir=-128;                      //give up on the right side: pieces only get narrower
      else
      {
        if ((cc & (2+4+8)) == 0)
          //assignment is intentional: continue only if a versions entry exists
          if (cur_vers=find_vers(ir,ie,vers_list))
          {
            p=0;
            if (let_or_bad(&cur_vers->vers))
            {
              p=cur_vers->vers.vers[0].prob;
              //remember the best candidate seen so far
              if (p>*pmax) { *pmax=p; *imax=ir; }
              //good enough: stop at the first candidate above tol
              if (p>tol)  return 1;
            }
            else
              if (cur_vers->vers.vers[0].let=='-')
              {
                *imax=ir; return 1;
              }
          }
        //presumably steps to the next admissible cut further right -
        //TODO confirm against inc()
        ir=inc(&cutr,ir,ii,set,xe);
        //reaching ii ends the right-hand search
        if (ir==ii) ir=-128;
      }
    }
  }
  //no candidate exceeded tol; *imax / *pmax hold the best one recorded, if any
  return 0;
}
Ejemplo n.º 27
0
/*-------------------------------------------------------------------
  one_cut  finds a cut of segment (i1,i2) within (ib,ie) such that
           at least one piece is good;
           returns 1 if it was split into two good parts, otherwise 0

  Notes on the code as written:
    - "good" here means let_or_bad() holds and the first version's
      probability is greater than trs2;
    - the search stops at the first cut for which at least one piece
      is good: 1 if both are, 0 if exactly one is;
    - 0 is also returned at once when the x span between cuts i1 and
      i2 is not greater than sym_width;
    - for set 1 candidates are limited to a window of sym_width/4 on
      each side of the midpoint x0; for sets 2..4 the window is the
      whole x range between cuts ib and ie.
-------------------------------------------------------------------*/
static char one_cut(int16_t i1, int16_t ib, int16_t ie, int16_t i2)
{
  struct cut_elm  *cut; int16_t i;
  struct cut_elm *cutb=cut_list+ib;
  struct cut_elm *cute=cut_list+ie;
  //these two initial values are overwritten below before their first use
  int16_t xb=cutb->x;
  int16_t xe=cute->x;
  int16_t x0=(cutb->x+cute->x)>>1;  //initial cut position
  struct cut_elm  *cutl,*cutr; int16_t il,ir;   //cuts to the left and right of x0
  char fll,flr;
  SVERS *vers;
  int16_t set;

  //debug snapshot; unless MY_DEBUG is defined it additionally requires det_trace
  if (debug_on)
#ifndef MY_DEBUG
    if (det_trace)
#endif
    {
      sprintf(snap_text,"one cut: %d-%d",ib,ie);
      cg_show_rast(LC,r,snap_text,cut_list); //raster - to the screen
    }

  //segment not wider than sym_width: do not try to split it
  if (cut_list[i2].x-cut_list[i1].x<=sym_width)  return 0;
  //initial search window: sym_width/4 on each side of x0 (bounds are exclusive)
  xb=x0-(sym_width>>2); xe=x0+(sym_width>>2);
  for (set=1; set<=4; set++)
  {
  //first cut roughly in the middle, then move in both directions
    //scan rightwards from ib+1; -128 marks "no candidate on this side"
    i=ib+1; cut=cutb+1; il=ir=-128;
    while (cut<=cute && cut->x < xe)
    {
      //only the low 7 bits of var are passed to in_set
      char var=cut->var & 0x7F;
      //the else below pairs with the inner if: cuts at or left of x0 keep
      //overwriting il (ending with the one nearest x0); the first admissible
      //cut right of x0 becomes ir and ends the scan
      if (cut->x > xb && in_set(var,set))
        if(cut->x <= x0)  { il=i;  cutl=cut; }
        else              { ir=i;  cutr=cut; break; }
      cut++; i++;
    }

    //alternate between the two sides, always trying the candidate nearer to x0
    //(a tie goes to the left one), until both sides are exhausted
    while (il>=0 || ir>=0)
    {
      if (il<0)    goto right;
      else
        if (ir<0)  goto left;
        else
          if (cutr->x-x0 < x0-cutl->x)  goto right;
left:
      //take the current left candidate as (i,cut) and advance il for the next round;
      //presumably dec() yields the next admissible cut further left - TODO confirm
      i=il; cut=cutl; il=dec(&cutl,il,ib,set,xb);
      //note: "goto rec" is unconditional although it shares the line with the if
      if (il==i1) il=-128;  goto rec;
right:
      //take the current right candidate as (i,cut) and advance ir for the next round;
      //presumably inc() yields the next admissible cut further right - TODO confirm
      i=ir; cut=cutr; ir=inc(&cutr,ir,ie,set,xe);
      if (ir==i2) ir=-128;
rec:
      fll=flr=0;
      //evaluate both pieces produced by cutting at i: (i1,i) and (i,i2)
      addij(LC,r,cut_list,vers_list,ncut,i1,i,0);
      addij(LC,r,cut_list,vers_list,ncut,i,i2,0);
      //NOTE(review): the flags are read from cut->versm (index i) and
      //cute->versm (index ie, not i2) - presumably where addij leaves the
      //results for the two pieces; confirm, especially when ie != i2
      if (let_or_bad(vers=&cut->versm))   fll=vers->vers[0].prob>trs2;
      if (let_or_bad(vers=&cute->versm))  flr=vers->vers[0].prob>trs2;
      if (fll && flr)  return 1;
      //exactly one good piece: stop searching, but report 0
      if (fll || flr)  return 0;
    }
    //after the first pass widen the window to the whole (ib,ie) x range
    xb=cutb->x; xe=cute->x;
  }
  return 0;
}