// Parse an <i> (integer), <r> (real), or <s> (string) element into a
// Literal expression tree.  tag_id selects which of the three tag types
// the caller expects; the start tag has already been peeked, so it is
// consumed with an assert.  Returns NULL if a string value contains an
// invalid escape sequence (an embedded \0).
ExprTree *ClassAdXMLParser::
ParseNumberOrString(XMLLexer::TagID tag_id)
{
	bool              have_token;
	ExprTree          *tree;
	XMLLexer::Token   token;

	// Get start tag
	tree = NULL;
	have_token = lexer.ConsumeToken(&token);
	assert(have_token && token.tag_id == tag_id);

	// Get text of number or string
	have_token = lexer.PeekToken(&token);
	if (have_token && token.token_type == XMLLexer::tokenType_Text) {
		lexer.ConsumeToken(&token);

		Value value;
		if (tag_id == XMLLexer::tagID_Integer) {
			// Use strtoll with an end-pointer check rather than sscanf:
			// sscanf left `number` uninitialized when the text was not
			// a valid integer, and the garbage value was then stored.
			long long number;
			char *pend;
			const char *pnum = token.text.c_str();
			number = strtoll(pnum, &pend, 10);
			if (pend == pnum) {
				// no digits were consumed: malformed integer
				value.SetErrorValue();
			} else {
				value.SetIntegerValue(number);
			}
		} else if (tag_id == XMLLexer::tagID_Real) {
			double real;
			char *pend;
			const char *pnum = token.text.c_str();
			real = strtod(pnum, &pend);
			if (pend == pnum) {
				// no characters were consumed: malformed real
				value.SetErrorValue();
			} else {
				value.SetRealValue(real);
			}
		} else { // its a string
			bool validStr = true;
			// NOTE: the old code appended " " to token.text here, which
			// silently added a trailing space to every string literal.
			convert_escapes(token.text, validStr);
			if (!validStr) {
				// invalid string because it had \0 escape sequence
				return NULL;
			} else {
				value.SetStringValue(token.text);
			}
		}
		tree = Literal::MakeLiteral(value);
	} else if (tag_id == XMLLexer::tagID_String) {
		// We were expecting text and got none, so we had
		// the empty string, which was skipped by the lexer.
		Value value;
		value.SetStringValue("");
		tree = Literal::MakeLiteral(value);
	}

	SwallowEndTag(tag_id);
	return tree;
}
// Build a Literal expression from an <i>, <r>, or <s> element.
// tag_id says which literal kind to expect; the start tag must already
// have been peeked by the caller.  A string whose escapes are invalid
// (contains a \0 escape) yields NULL.
ExprTree *ClassAdXMLParser::
ParseNumberOrString(XMLLexer::TagID tag_id)
{
	XMLLexer::Token token;

	// The start tag is guaranteed to be next -- consume and verify it.
	bool have_token = lexer.ConsumeToken(&token);
	assert(have_token && token.tag_id == tag_id);

	ExprTree *tree = NULL;

	// The element body, when present, is a single text token.
	have_token = lexer.PeekToken(&token);
	if (have_token && token.token_type == XMLLexer::tokenType_Text) {
		lexer.ConsumeToken(&token);

		Value value;
		switch (tag_id) {
		case XMLLexer::tagID_Integer: {
			char *stop = NULL;
			const char *text = token.text.c_str();
			long long num = strtoll(text, &stop, 10);
			if (!num && stop == text) {
				value.SetErrorValue();   // no digits consumed
			} else {
				value.SetIntegerValue(num);
			}
			break;
		}
		case XMLLexer::tagID_Real: {
			char *stop = NULL;
			const char *text = token.text.c_str();
			double num = strtod(text, &stop);
			if (stop == text) {
				value.SetErrorValue();   // nothing parsed
			} else {
				value.SetRealValue(num);
			}
			break;
		}
		default: {
			// A string literal: decode escape sequences in place.
			bool escapes_ok = true;
			convert_escapes(token.text, escapes_ok);
			if (!escapes_ok) {
				// the text held a \0 escape sequence -- reject it
				return NULL;
			}
			value.SetStringValue(token.text);
			break;
		}
		}
		tree = Literal::MakeLiteral(value);
	} else if (tag_id == XMLLexer::tagID_String) {
		// No text token at all: the lexer skipped over an empty
		// string, so synthesize "".
		Value value;
		value.SetStringValue("");
		tree = Literal::MakeLiteral(value);
	}

	SwallowEndTag(tag_id);
	return tree;
}
void XMLLexer:: BreakdownTag(const char *complete_tag) { int length, i; int start, count; length = strlen(complete_tag); // Skip whitespace for (i = 0; i < length && isspace(complete_tag[i]); i++) { ; } // Is it a begin or end tag? if (complete_tag[i] == '/') { current_token.tag_type = tagType_End; i++; } else if (complete_tag[length-1] == '/') { current_token.tag_type = tagType_Empty; length--; // skip the / in the processing that follows. } else { current_token.tag_type = tagType_Start; } // Now pull out the tag name current_token.text = ""; start = i; count = 0; while (i < length && i != '>' && !isspace(complete_tag[i])) { //current_token.text += complete_tag[i]; i++; count++; } // With gcc-2.x's STL, this is faster than using a bunch of += statements. current_token.text.assign(complete_tag+start, count); // Figure out which tag it is current_token.tag_id = tagID_NoTag; for (unsigned int x = 0; x < NUMBER_OF_TAG_MAPPINGS; x++) { if (!strcmp(current_token.text.c_str(), tag_mappings[x].tag_name)) { current_token.tag_id = tag_mappings[x].id; break; } } // If we're not at the end, we probably have attributes, so let's // pull them out. (We might just have whitespace though.) while (i < length) { string name, value; name = ""; value = ""; // Skip whitespace while (i < length && isspace(complete_tag[i])) { i++; } // Now take text up to a whitespace or equal sign. This is the name start = i; count = 0; while (i < length && !isspace(complete_tag[i]) && complete_tag[i] != '=') { //name += complete_tag[i]; i++; count++; } // With gcc-2.x's STL, this is faster than using a bunch of += statements. name.assign(complete_tag+start, count); // Now skip whitespace and equal signs // Note that this allows some technically illegal things // like " == = = =", but who really cares? 
while (i < length && (isspace(complete_tag[i]) || complete_tag[i] == '=')) { i++; } i++; // go past 1st \" // Now pick out the value char oldCh = 0; // consume the string literal; read upto " ignoring \" while ( (i<length) && ( complete_tag[i] != '\"' || ( complete_tag[i] == '\"' && oldCh == '\\' ) ) ) { oldCh = complete_tag[i]; value += complete_tag[i]; i++; } // scan string for &...; & replace them with their corresponding entities for (unsigned int k=0; k< value.length(); k++) { if (value[k] == '&') { // create substring int index = k-1; string str; do { index++; str += value[index]; } while(value[index] != ';'); for (unsigned int j = 0; j < NUMBER_OF_ENTITIES; j++){ if (!strcmp(str.c_str(), entities[j].name)) { value.replace(k, str.length(), entities[j].replacement_text); } } } } bool validStr = true; //value.push_back('\0'); // force an explicit null terminator (because that's that the normal lexer does.) convert_escapes(value, validStr); if(!validStr) { // contains a \0 escape char current_token.tag_type = tagType_Invalid; } else if (name.size() > 0 && value.size() > 0) { current_token.attributes[name] = value; } } return; }
// tokenizeStringLiteral: Scans strings of the form " ... " or '...' // based on whether the argument passed was '\"' or '\'' int Lexer:: tokenizeString(char delim) { bool stringComplete = false; // need to mark() after the quote inString = true; wind (); mark (); while (!stringComplete) { bool oddBackWhacks = false; int oldCh = 0; // consume the string literal; read upto " ignoring \" while( ( ch > 0 ) && ( ch != delim || ( ch == delim && oldCh == '\\' && oddBackWhacks ) ) ) { if( !oddBackWhacks && ch == '\\' ) { oddBackWhacks = true; } else { oddBackWhacks = false; } oldCh = ch; wind( ); } if( ch == delim ) { int tempch = ' '; // read past the whitespace characters while (isspace(tempch)) { tempch = lexSource->ReadCharacter(); } if (tempch != delim) { // a new token exists after the string if (tempch != -1) { lexSource->UnreadCharacter(); } stringComplete = true; } else { // the adjacent string is to be concatenated to the existing string lexBuffer.erase(lexBufferCount--); // erase the lagging '\"' wind(); } } else { // loop quit due to ch == 0 or ch == EOF tokenType = LEX_TOKEN_ERROR; return tokenType; } } cut( ); wind( ); // skip over the close quote bool validStr = true; // to check if string is valid after converting escape convert_escapes(lexBuffer, validStr); yylval.SetStringValue( lexBuffer.c_str( ) ); if (validStr) { if(delim == '\"') { tokenType = LEX_STRING_VALUE; } else { tokenType = LEX_IDENTIFIER; } } else { tokenType = LEX_TOKEN_ERROR; // string conatins a '\0' character inbetween } return tokenType; }