// This function is modelled after sscanf, and supports a subset of its features. See sscanf documentation // for information about the syntax it accepts. static void parse(const char *line, const char *fmt, int field_count, va_list va) { char *str = strdup(line); const int len = strlen(str); for (int i = 0; i < len; ++i) { if (str[i] == '\t') str[i] = ' '; } char *format = strdup(fmt); const int formatlen = strlen(format); for (int i = 0; i < formatlen; ++i) { if (format[i] == '\t') format[i] = ' '; } int count = 0; const char *src = str; const char *end = str + len; for (int i = 0; i < formatlen; ++i) { if (format[i] == '%') { char code[10]; char width[10]; int j = 0; int jw = 0; bool inBrackets = false; while (++i < formatlen && !isCodeSeparator(format[i])) { char c = format[i]; if (c == '[') { inBrackets = true; } else if (inBrackets && c == ']') { inBrackets = false; } if (!inBrackets && isNum(c)) { width[jw++] = c; } else { code[j++] = c; } } code[j] = '\0'; width[jw] = '\0'; void *var = va_arg(va, void *); if (strcmp(code, "n") == 0) { *(int*)var = src - str; continue; } char s[2000]; unsigned int fieldWidth = 1; if (width[0] != '\0') { fieldWidth = atoi(width); } j = 0; if (code[0] == 'c') { for (unsigned int n = 0; n < fieldWidth; ++n) { s[j++] = src[0]; ++src; } } else if (code[0] == '[') { bool isNegated; char *allowed = parseCharacterClass(code, &isNegated); while (src != end) { bool inSet = strchr(allowed, src[0]) != NULL; if ((isNegated && inSet) || (!isNegated && !inSet)) break; s[j++] = src[0]; ++src; } delete[] allowed; } else { char nextChar = format[i]; while (src[0] == ' ') { //skip initial whitespace ++src; } while (src != end && src[0] != nextChar && !isSeparator(src[0])) { s[j++] = src[0]; ++src; } } s[j] = '\0'; --i; if (width[0] == '\0') { fieldWidth = strlen(s); } if (strcmp(code, "d") == 0) { *(int*)var = atoi(s); } else if (strcmp(code, "x") == 0) { *(int*)var = strtol(s, (char **) NULL, 16); } else if (strcmp(code, "f") == 0) { 
*(float*)var = str2float(s); } else if (strcmp(code, "c") == 0) { *(char*)var = s[0]; } else if (strcmp(code, "s") == 0) { char *string = (char*)var; strncpy(string, s, fieldWidth); if (fieldWidth <= strlen(s)) { // add terminating \0 string[fieldWidth] = '\0'; } } else if (code[0] == '[') { char *string = (char*)var; strncpy(string, s, fieldWidth); string[fieldWidth-1] = '\0'; } else { error("Code not handled: \"%s\" \"%s\"\n\"%s\" \"%s\"", code, s, line, fmt); } ++count; continue; } while (src[0] == ' ') { ++src; } if (src == end) break; if (src[0] != format[i] && format[i] != ' ') { error("Expected line of format '%s', got '%s'", fmt, line); } if (src == end) break; if (format[i] != ' ') { ++src; if (src == end) break; } }
/**
 * Parses a character class and returns the RangeToken that describes it.
 *
 * On entry the parse context is switched to regexParserStateInBrackets and
 * the parser is advanced once; on normal exit the context is restored to
 * regexParserStateNormal and the parser is advanced once more.
 *
 * @param useNRange  Only relevant when the class starts with a caret
 *                   (negated class). If true, the resulting token is tagged
 *                   as Token::T_NRANGE; if false, the token is replaced by
 *                   its complement (RangeToken::complementRanges).
 * @return The range token after sortRanges() and compactRanges() have been
 *         applied. When the IGNORE_CASE option is set, its case-insensitive
 *         companion token is requested as well.
 *
 * Errors are raised as ParseException through the ThrowXMLwithMemMgr macros:
 *   - Parser_Atom5: processBacksolidus_pP() returned a null token.
 *   - Parser_CC5:   a subtraction was not followed by a closing ']'.
 *   - Parser_CC6:   an unescaped '[', ']' or '-' appeared where it is not
 *                   allowed.
 *   - Parser_CC2:   end of input was reached inside the class.
 *   - Parser_Ope3:  a range's start is greater than its end.
 */
RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {

    setParseContext(regexParserStateInBrackets);
    processNext();

    RangeToken* tok = 0;
    bool isNRange = false;

    // A caret as the first token marks the class as negated.
    if (getState() == REGX_T_CHAR && getCharData() == chCaret) {
        isNRange = true;
        processNext();
    }
    tok = fTokenFactory->createRange();

    parserState type;
    bool firstLoop = true;
    bool wasDecoded;

    while ( (type = getState()) != REGX_T_EOF) {

        wasDecoded = false;

        // single range | from-to-range | subtraction
        // A ']' closes the class, except on the very first iteration.
        if (type == REGX_T_CHAR && getCharData() == chCloseSquare && !firstLoop)
            break;

        XMLInt32 ch = getCharData();
        bool     end = false;   // true once this item has been fully merged into tok

        if (type == REGX_T_BACKSOLIDUS) {

            switch(ch) {
            case chLatin_d:
            case chLatin_D:
            case chLatin_w:
            case chLatin_W:
            case chLatin_s:
            case chLatin_S:
            case chLatin_i:
            case chLatin_I:
            case chLatin_c:
            case chLatin_C:
                {
                    // Shorthand escape: merge its predefined range set.
                    tok->mergeRanges(getTokenForShorthand(ch));
                    end = true;
                }
                break;
            case chLatin_p:
            case chLatin_P:
                {
                    RangeToken* tok2 = processBacksolidus_pP(ch);
                    if (tok2 == 0) {
                        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, getMemoryManager());
                    }
                    tok->mergeRanges(tok2);
                    end = true;
                }
                break;
            case chDash:
                // Remember that this dash came from an escape so the
                // "double dash" check below does not reject it.
                wasDecoded = true;
                // fall thru to default.
            default:
                ch = decodeEscaped();
            }
        } // end if REGX_T_BACKSOLIDUS
        else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION && !firstLoop) {

            // Resolve a pending negation before subtracting, so the
            // subtraction is applied to the complemented set.
            if (isNRange)
            {
                tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
                isNRange=false;
            }
            RangeToken* rangeTok = parseCharacterClass(false);
            tok->subtractRanges(rangeTok);

            if (getState() != REGX_T_CHAR || getCharData() != chCloseSquare) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC5, getMemoryManager());
            }
            break;
        } // end if REGX_T_XMLSCHEMA...

        // From here on getState()/getCharData() describe the token that
        // FOLLOWS the one held in 'type'/'ch'.
        processNext();

        if (!end) {

            if (type == REGX_T_CHAR
                && (ch == chOpenSquare
                    || ch == chCloseSquare
                    || (ch == chDash && getCharData() == chCloseSquare && firstLoop))) {
                // if regex = [-] then invalid...
                // '[', ']', '-' not allowed and should be escaped
                // Explicit cast: brace-initialising an XMLCh from a
                // non-constant XMLInt32 is a narrowing conversion.
                XMLCh chStr[] = { static_cast<XMLCh>(ch), chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }
            if (ch == chDash && getCharData() == chDash && getState() != REGX_T_BACKSOLIDUS && !wasDecoded) {
                XMLCh chStr[] = { static_cast<XMLCh>(ch), chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }

            if (getState() != REGX_T_CHAR || getCharData() != chDash) {
                // Not followed by a dash: a single character.
                tok->addRange(ch, ch);
            }
            else {

                // Followed by a dash: look at what comes after the dash.
                processNext();
                if ((type = getState()) == REGX_T_EOF)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

                if (type == REGX_T_CHAR && getCharData() == chCloseSquare) {
                    // Trailing dash right before ']': both the character
                    // and the dash itself are added as single characters.
                    tok->addRange(ch, ch);
                    tok->addRange(chDash, chDash);
                }
                else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION) {
                    static const XMLCh dashStr[] = { chDash, chNull};
                    ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, dashStr, dashStr, getMemoryManager());
                }
                else {

                    XMLInt32 rangeEnd = getCharData();
                    // Built from the character data as read, before any
                    // escape decoding below.
                    XMLCh rangeEndStr[] = { static_cast<XMLCh>(rangeEnd), chNull };

                    if (type == REGX_T_CHAR) {
                        if (rangeEnd == chOpenSquare
                            || rangeEnd == chCloseSquare
                            || rangeEnd == chDash)
                            // '[', ']', '-' not allowed and should be escaped
                            ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, rangeEndStr, rangeEndStr, getMemoryManager());
                    }
                    else if (type == REGX_T_BACKSOLIDUS) {
                        rangeEnd = decodeEscaped();
                    }

                    processNext();

                    if (ch > rangeEnd) {
                        XMLCh chStr[] = { static_cast<XMLCh>(ch), chNull };
                        ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Ope3, rangeEndStr, chStr, getMemoryManager());
                    }

                    tok->addRange(ch, rangeEnd);
                }
            }
        }
        firstLoop = false;
    }

    // The loop may only be left through a 'break'; running into EOF means
    // the class was never closed.
    if (getState() == REGX_T_EOF)
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

    if (isNRange)
    {
        if(useNRange)
            tok->setTokenType(Token::T_NRANGE);
        else
            tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
    }

    tok->sortRanges();
    tok->compactRanges();

    // If the case-insensitive option is enabled, we need to
    // have the new RangeToken instance build its internal
    // case-insensitive RangeToken.
    if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
    {
        tok->getCaseInsensitiveToken(fTokenFactory);
    }

    setParseContext(regexParserStateNormal);
    processNext();

    return tok;
}
// Parses one alternative of the pattern, i.e. everything up to the next
// '|', ')' or end of pattern. Runs of plain pattern characters are
// accumulated in a PatternCharacterSequence and flushed before any other
// construct is handed to m_generator or to one of the parse* helpers.
// Returns early, without flushing, as soon as a helper reports failure, a
// quantifier is malformed, or a quantifier has no atom to apply to.
void Parser::parseAlternative(JumpList& failures)
{
    PatternCharacterSequence sequence(m_generator, failures);

    for (;;) {
        switch (peek()) {
        // Terminators of the alternative: emit what is pending and stop.
        case EndOfPattern:
        case '|':
        case ')':
            sequence.flush();
            return;

        // Characters that may start a quantifier.
        case '*':
        case '+':
        case '?':
        case '{': {
            Quantifier quantifier = consumeQuantifier();
            if (quantifier.type == Quantifier::None) {
                // No quantifier was recognised; the character is appended
                // as an ordinary pattern character.
                sequence.append(consume());
                break;
            }
            if (quantifier.type == Quantifier::Error)
                return;
            if (!sequence.size()) {
                setError(QuantifierWithoutAtom);
                return;
            }
            sequence.flush(quantifier);
            break;
        }

        case '^':
            consume();
            sequence.flush();
            m_generator.generateAssertionBOL(failures);
            break;

        case '$':
            consume();
            sequence.flush();
            m_generator.generateAssertionEOL(failures);
            break;

        case '.':
            consume();
            sequence.flush();
            if (parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
                break;
            return;

        case '[':
            consume();
            sequence.flush();
            if (parseCharacterClass(failures))
                break;
            return;

        case '(':
            consume();
            sequence.flush();
            if (parseParentheses(failures))
                break;
            return;

        case '\\': {
            consume();
            Escape escape = consumeEscape(false);
            if (escape.type() == Escape::PatternCharacter) {
                // An escaped literal joins the pending character run.
                sequence.append(PatternCharacterEscape::cast(escape).character());
            } else {
                sequence.flush();
                if (!parseNonCharacterEscape(failures, escape))
                    return;
            }
            break;
        }

        // Anything else is a plain pattern character.
        default:
            sequence.append(consume());
            break;
        }
    }
}
/**
 * Parses a single atom, dispatching on the current parser state (fState).
 *
 * Parentheses, caret, dollar and bracket states are delegated entirely to
 * their dedicated routines. For the remaining states the token is built
 * here and the parser is advanced with processNext().
 *
 * @return The token for the atom.
 *
 * Raises ParseException through ThrowXMLwithMemMgr:
 *   - Parser_Atom4 for an unescaped '{', '}' or ']' character, or for a
 *     state that cannot start an atom;
 *   - Parser_Atom5 when processBacksolidus_pP() returns a null token.
 */
Token* RegxParser::parseAtom() {

    // States handled completely by another routine.
    if (fState == REGX_T_LPAREN)
        return processParen();
    if (fState == REGX_T_CARET)
        return processCaret();
    if (fState == REGX_T_DOLLAR)
        return processDollar();
    if (fState == REGX_T_LBRACKET)
        return parseCharacterClass(true);

    if (fState == REGX_T_DOT) {
        processNext();
        return fTokenFactory->getDot();
    }

    if (fState == REGX_T_CHAR) {
        const bool needsEscaping = fCharData == chOpenCurly
                                || fCharData == chCloseCurly
                                || fCharData == chCloseSquare;
        if (needsEscaping)
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);

        // Build the token from the current character before advancing.
        Token* literalTok = fTokenFactory->createChar(fCharData);
        processNext();
        return literalTok;
    }

    if (fState != REGX_T_BACKSOLIDUS) {
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);
        return 0;
    }

    // Backslash escape: the kind of escape is selected by fCharData.
    Token* escapeTok = 0;

    switch (fCharData) {
    case chLatin_d:
    case chLatin_D:
    case chLatin_w:
    case chLatin_W:
    case chLatin_s:
    case chLatin_S:
    case chLatin_c:
    case chLatin_C:
    case chLatin_i:
    case chLatin_I:
        escapeTok = getTokenForShorthand(fCharData);
        processNext();
        return escapeTok;

    case chDigit_0:
    case chDigit_1:
    case chDigit_2:
    case chDigit_3:
    case chDigit_4:
    case chDigit_5:
    case chDigit_6:
    case chDigit_7:
    case chDigit_8:
    case chDigit_9:
        return processBackReference();

    case chLatin_p:
    case chLatin_P:
        escapeTok = processBacksolidus_pP(fCharData);
        if (escapeTok == 0) {
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager);
        }
        break;

    default:
        {
            XMLInt32 decoded = decodeEscaped();
            if (decoded >= 0x10000) {
                // Values of 0x10000 and above are decomposed into a
                // surrogate string; the janitor takes charge of that
                // buffer for the rest of this scope.
                XMLCh* surrogateStr = RegxUtil::decomposeToSurrogates(decoded, fMemoryManager);
                ArrayJanitor<XMLCh> janSurrogate(surrogateStr, fMemoryManager);
                escapeTok = fTokenFactory->createString(surrogateStr);
            }
            else {
                escapeTok = fTokenFactory->createChar(decoded);
            }
        }
        break;
    }

    // Reached only by the \p/\P and generic-escape branches.
    processNext();
    return escapeTok;
}