/// Switches this node between the self-closing and the regular form.
///
/// Nothing happens when the node is already in the requested state.
/// Otherwise the flag is overwritten (SelfClosing or None) and the child
/// list is emptied.
///
/// @param theSelfClosing  true to mark the node self-closing, false to
///                        reset the flag to None.
void Node::setSelfClosing( bool theSelfClosing )
{
    // Guard: requested state equals current state, leave the node untouched.
    if ( theSelfClosing == isSelfClosing() )
    {
        return;
    }

    if ( theSelfClosing )
    {
        flag = SelfClosing;
    }
    else
    {
        flag = None;
    }

    // Any change of state discards the current children.
    children.clear();
}
std::string Node::getString() const { std::string toReturn; if ( isTextNode() ) { toReturn = text; } else if ( isCharacterData() ) { toReturn = "<![CDATA["; toReturn += text; toReturn += "]]>"; } else { toReturn = "<"; toReturn += name; for ( auto it = attributes.begin(); it != attributes.end(); ++it ) { toReturn += " "; toReturn += it->getString(); } if ( isSelfClosing() ) { toReturn += " />"; } else { toReturn += ">"; for ( auto it = children.begin(); it != children.end(); ++it ) { toReturn += "\r\n"; toReturn += ( * it )->getString(); } if ( getChildren().size() != 0 ) { toReturn += "\r\n"; } toReturn += "</"; toReturn += name; toReturn += ">"; } } return toReturn; }
wdocument* html_document(FILE* infile) { char *file; struct lengthed from = fileIntoString(infile); file = from.place; for (int i = 0; i < from.size; i++) { char c = file[i]; if (c == '\t' || c == '\n' || c == '\r') { file[i] = ' '; } } wdocument *document = malloc(sizeof(wdocument)); document -> text = file; document -> length = from.size; wtag* root = malloc(sizeof(wtag)); root -> parent = NULL; root -> nextSibling = NULL; root -> firstChild = NULL; root -> previousSibling = NULL; docstring rootname; rootname.slen = 9; rootname.str = &("#document"); rootname.document = document; rootname.start = 0; rootname.length = 0; root -> tagname = rootname; root -> tag = rootname; root -> data = &rootname; document -> root = root; const int WAIT_FOR_OPEN_TAG = 0; //Waits for a tag to start (waiting for <) const int WAIT_FOR_NAME_END = 1; //Waits for a tag's name to end either at a space or > const int WAIT_FOR_ATTRIBUTE = 8; const int WAIT_FOR_ATTRIBUTE_END = 2; //either =, space, or > to mark end of attribute name. const int WAIT_FOR_TAG_END = 3; //Waits for either > or a letter. const int WAIT_FOR_SCRIPT_CLOSE = 7; //Waits for the string </script> to declare a script tag closed. const int WAIT_FOR_ATTRIBUTE_VALUE = 9; const int WAIT_FOR_ATTRIBUTE_QUOTING = 10; const int WAIT_FOR_ATTRIBUTE_VALUE_END_SPACE = 11; const int WAIT_FOR_ATTRIBUTE_VALUE_END_DOUBLE = 12; const int WAIT_FOR_ATTRIBUTE_VALUE_END_SINGLE = 13; const int CLOSE_PARENT = -1; //Close the parent tag const int OPEN_NEW = -2; //Close the previous tag const int DO_NOTHING = -3; //Do nothing (e.g., </p> if i was being //really compliant and liked to do this for some reason) const int BE_SIBLING = -4; // E.g., <br><br><br>.. docstring TEXTNODE; TEXTNODE.document = document; TEXTNODE.slen = 5; TEXTNODE.str = &("#text"); int within[20]; for (int i = 0; i < 20; i++) { within[i] = 0; //Debugging only } wtag* previous = NULL; //The previous sibling. 
wtag* parent = root; wtag* now = NULL; wattribute* attr = NULL; wattribute* pattr = NULL; //The previous attribute on the current node. //To be reset upon creating a new `now` int nmode = 0; for (int i = 0; i < document -> length; i++) { char c = file[i]; int mode = nmode; within[mode]++; if (mode == WAIT_FOR_OPEN_TAG) { //Waits for a tag to start (waiting for <) if (c == '<') { if (now != NULL) { //A text node that we have to append in. if (previous == NULL) { parent -> firstChild = now; previous = now; } else { previous -> nextSibling = now; previous = now; } } now = malloc(sizeof(wtag)); now -> parent = NULL; now -> firstChild = NULL; now -> previousSibling = NULL; now -> nextSibling = NULL; now -> data = NULL; now -> tag.slen = 0; now -> tag.document = document; now -> tag.start = i; now -> tag.length = 0; now -> tagname.slen = 0; now -> tagname.document = document; now -> tagname.start = i + 1; now -> tagname.length = 0; nmode = WAIT_FOR_NAME_END; } else { if (now != NULL) { //We don't really have anything to do, except perhaps update the length. now -> data -> length = i + 1 - now -> data -> start;//(*(*now).data).start; } else { now = malloc(sizeof(wtag)); now -> parent = parent; now -> previousSibling = previous; now -> nextSibling = NULL; now -> firstChild = NULL; now -> tag = TEXTNODE; now -> tagname = TEXTNODE; now -> data = malloc(sizeof(docstring)); now -> data -> slen = 0; now -> data -> document = document; now -> data -> start = i; pattr = NULL; attr = NULL; } } } if (mode == WAIT_FOR_ATTRIBUTE_END) { if (c == ' ') { //The attribute's name is over. //We now have to wait for: //a) = sign (write attribute value) //b) letter (start new tag, set up this one to copy value from name) //c) > (finish tag) attr -> name.length = i - attr -> name.start; nmode = WAIT_FOR_ATTRIBUTE_VALUE; } if (c == '=') { //Wait for attribute value. 
attr -> name.length = i - attr -> name.start; mode = WAIT_FOR_ATTRIBUTE_VALUE; } if (c == '>') { mode = WAIT_FOR_NAME_END; } } if (mode == WAIT_FOR_ATTRIBUTE_VALUE_END_DOUBLE || mode == WAIT_FOR_ATTRIBUTE_VALUE_END_SINGLE) { char match = '"'; if (mode == WAIT_FOR_ATTRIBUTE_VALUE_END_SINGLE) { match = '\''; } if (file[i-1] != '\\' && c == match) { //The attribute is over. attr -> value.length = i - (attr -> value.start); pattr = attr; nmode = WAIT_FOR_ATTRIBUTE; } } if (mode == WAIT_FOR_ATTRIBUTE_VALUE_END_SPACE) { if (c == ' ' || c == '>') { attr -> value.length = i - (attr -> value.start); pattr = attr; mode = WAIT_FOR_ATTRIBUTE; nmode = WAIT_FOR_ATTRIBUTE; } } if (mode == WAIT_FOR_ATTRIBUTE_QUOTING) { if (c == '"') { attr -> value.start = i + 1; nmode = WAIT_FOR_ATTRIBUTE_VALUE_END_DOUBLE; } if (c == '\'') { attr -> value.start = i + 1; nmode = WAIT_FOR_ATTRIBUTE_VALUE_END_SINGLE; } bool lower = c >= 'a' && c <= 'z'; bool upper = c >= 'A' && c <= 'Z'; bool number = c >= '0' && c <= '9'; if (lower || upper || number) { attr -> value.start = i; nmode = WAIT_FOR_ATTRIBUTE_VALUE_END_SPACE; } } if (mode == WAIT_FOR_ATTRIBUTE_VALUE) { if (c == '=') { //We have a specified value nmode = WAIT_FOR_ATTRIBUTE_QUOTING; } if (c >= 'a' && c <= 'z') { //We have not hit an =, so this is a new attribute. //Specify value to be the same as the name attr -> value = attr -> name; mode = WAIT_FOR_ATTRIBUTE; //Process accordingly pattr = attr; } } if (mode == WAIT_FOR_ATTRIBUTE) { if (c >= 'a' && c <= 'z') { attr = malloc(sizeof(wattribute)); if (pattr != NULL) { pattr -> next = attr; } attr -> name.document = document; attr -> value.document = document; attr -> next = NULL; if (now -> firstattribute == NULL) { now -> firstattribute = attr; } nmode = WAIT_FOR_ATTRIBUTE_END; } if (c == '>') { mode = WAIT_FOR_NAME_END; //Finish up the tag. 
} } if (mode == WAIT_FOR_NAME_END) { //Waits for a tag's name to end either at a space or > if (c == ' ') { //Attributes TODO: now -> tagname.length = i - now -> tagname.start; nmode = WAIT_FOR_ATTRIBUTE; } if (c == '>') { //The tag is over. now -> tag.length = i - now -> tag.start + 1; //Includes < and > if (now -> tagname.length == 0) { now -> tagname.length = i - now -> tagname.start; //Includes < and > } char firstLetter = document -> text[ (now -> tag).start + 1 ]; int action = OPEN_NEW; if (firstLetter == '/') { action = CLOSE_PARENT; } if (isSelfClosing(*now)) { action = BE_SIBLING; } if (action == BE_SIBLING || action == OPEN_NEW) { //Self closing + open tags //Extra closey exemptions: bool closeParent = false; if (tagIs(*parent,"head",4)) { if (tagIs(*now,"body",4)) { closeParent = true; } } if (tagIs(*parent,"p",1)) { if (tagIs(*now,"p",1)) { closeParent = true; } //address,article,aside,blockquote //div,dl,fieldset,footer,form,h1 //h2,h3,h4,h5,h6,header,hr,menu //nav,ol,pre,section,table,ul,p if (tagIs(*now,"address",7) || tagIs(*now,"article",7) || tagIs(*now,"aside",5) || tagIs(*now,"blockquote",10)) { closeParent = true; } if (tagIs(*now,"div",3) || tagIs(*now,"dl",2) || tagIs(*now,"fieldset",8) || tagIs(*now,"footer",6) || tagIs(*now,"form",4) || tagIs(*now,"h1",2)) { closeParent = true; } if (tagIs(*now,"h2",2) || tagIs(*now,"h3",2) || tagIs(*now,"h4",2) || tagIs(*now,"h5",2) || tagIs(*now,"header",6) || tagIs(*now,"hr",2) || tagIs(*now,"menu",4)) { closeParent = true; } if (tagIs(*now,"nav",3) || tagIs(*now,"ol",2) || tagIs(*now,"pre",3) || tagIs(*now,"section",7) || tagIs(*now,"table",5) || tagIs(*now,"ul",2)) { closeParent = true; } } if (tagIs(*parent,"li",2)) { //The parent is an LI tag. The following will close it: //LI if (tagIs(*now,"li",2)) { closeParent = true; } } if (closeParent) { //We don't actually want the parent. //We make these swaps: // previous <-- parent // parent <-- parent.parent //Then we attach like normal. 
previous = parent; parent = previous -> parent; } if (parent != NULL && parent -> firstChild == NULL) { parent -> firstChild = now; } else { if (previous == NULL ) { } previous -> nextSibling = now; now -> previousSibling = previous; } now -> parent = parent; previous = now; } if (action == OPEN_NEW) { //Only open tags previous = NULL; parent = now; } if (action == CLOSE_PARENT) { //Closing tags free(now); now = NULL; previous = parent; parent = parent -> parent; } now = NULL; nmode = WAIT_FOR_OPEN_TAG; } } //showDocString(*parent -> tag); } if (now != NULL && previous != NULL) { //Dangling text node previous -> nextSibling = now; } return document; }