Exemplo n.º 1
0
// This function is modelled after sscanf, and supports a subset of its features. See sscanf documentation
// for information about the syntax it accepts.
//
// Parameters:
//   line        - the text to parse. It is copied, and tabs in the copy are turned into spaces.
//   fmt         - the format string. It is copied and normalised the same way.
//   field_count - not referenced in the portion of the function visible here.
//   va          - one pointer per conversion in fmt, fetched as void* and cast according to the
//                 conversion code.
//
// Conversion codes handled below: n, d, x, f, c, s and [...] character classes. Any other code is
// reported through error().
//
// NOTE(review): the tail of this function is not visible in this excerpt, so how str, format and
// count are finally used or released cannot be stated here.
static void parse(const char *line, const char *fmt, int field_count, va_list va) {
	// Work on a private copy of the line in which every tab has become a space.
	char *str = strdup(line);
	const int len = strlen(str);
	for (int i = 0; i < len; ++i) {
		if (str[i] == '\t')
			str[i] = ' ';
	}

	// Same normalisation for the format string.
	char *format = strdup(fmt);
	const int formatlen = strlen(format);
	for (int i = 0; i < formatlen; ++i) {
		if (format[i] == '\t')
			format[i] = ' ';
	}

	// count: number of conversions that stored a value (%n is not counted).
	// src:   read cursor into the normalised line; end is one past its last character.
	int count = 0;
	const char *src = str;
	const char *end = str + len;
	for (int i = 0; i < formatlen; ++i) {
		if (format[i] == '%') {
			// Split the conversion specification into its numeric width and its code.
			// Digits found outside a [...] class go to width, everything else goes to code.
			// NOTE(review): neither 10-byte buffer is bounds-checked while being filled.
			char code[10];
			char width[10];
			int j = 0;
			int jw = 0;
			bool inBrackets = false;
			while (++i < formatlen && !isCodeSeparator(format[i])) {
				char c = format[i];
				if (c == '[') {
					inBrackets = true;
				} else if (inBrackets && c == ']') {
					inBrackets = false;
				}
				if (!inBrackets && isNum(c)) {
					width[jw++] = c;
				} else {
					code[j++] = c;
				}
			}
			code[j] = '\0';
			width[jw] = '\0';

			// Every conversion, %n included, consumes exactly one pointer argument.
			void *var = va_arg(va, void *);
			if (strcmp(code, "n") == 0) {
				// %n: store how many characters of the line have been consumed so far.
				// i is not stepped back here, so the loop increment moves past the character
				// that ended the specification.
				*(int*)var = src - str;
				continue;
			}

			// Raw text of the field, extracted before it is converted.
			// NOTE(review): none of the copy loops below check j against the size of s.
			char s[2000];

			// An explicit width overrides the default of 1; the default only matters for %c.
			unsigned int fieldWidth = 1;
			if (width[0] != '\0') {
				fieldWidth = atoi(width);
			}

			j = 0;
			if (code[0] == 'c') {
				// Take fieldWidth characters verbatim, whitespace included.
				// NOTE(review): src is not compared against end in this loop.
				for (unsigned int n = 0; n < fieldWidth; ++n) {
					s[j++] = src[0];
					++src;
				}
			} else if (code[0] == '[') {
				// Character class: consume characters for as long as they belong to the set,
				// or for as long as they do not belong to it when the class is negated.
				bool isNegated;
				char *allowed = parseCharacterClass(code, &isNegated);

				while (src != end) {
					bool inSet = strchr(allowed, src[0]) != NULL;
					if ((isNegated && inSet) || (!isNegated && !inSet))
						break;

					s[j++] = src[0];
					++src;
				}

				// The set returned by parseCharacterClass is released with delete[].
				delete[] allowed;
			} else {
				// Any other code: the field stops at the format character that follows the
				// specification, at a separator (per isSeparator), or at the end of the line.
				// If the specification ended the format, format[i] is the terminating '\0'.
				char nextChar = format[i];
				while (src[0] == ' ') { // skip leading spaces
					++src;
				}
				while (src != end && src[0] != nextChar && !isSeparator(src[0])) {
					s[j++] = src[0];
					++src;
				}
			}

			s[j] = '\0';
			// Step back so the loop increment lands on the character that ended the
			// specification, which is then matched against the line as literal text.
			--i;

			// Without an explicit width the whole extracted text is used.
			if (width[0] == '\0') {
				fieldWidth = strlen(s);
			}

			// Convert the extracted text and store it through the caller's pointer.
			if (strcmp(code, "d") == 0) {
				*(int*)var = atoi(s);
			} else if (strcmp(code, "x") == 0) {
				// Hexadecimal integer.
				*(int*)var = strtol(s, (char **) NULL, 16);
			} else if (strcmp(code, "f") == 0) {
				*(float*)var = str2float(s);
			} else if (strcmp(code, "c") == 0) {
				// Only the first extracted character is stored, whatever the width was.
				*(char*)var = s[0];
			} else if (strcmp(code, "s") == 0) {
				char *string = (char*)var;
				strncpy(string, s, fieldWidth);
				if (fieldWidth <= strlen(s)) {
					// strncpy wrote no terminator in this case, so add one.
					// This writes fieldWidth + 1 bytes in total to the destination.
					string[fieldWidth] = '\0';
				}
			} else if (code[0] == '[') {
				char *string = (char*)var;
				strncpy(string, s, fieldWidth);
				// NOTE(review): the terminator goes at fieldWidth - 1. When fieldWidth equals
				// strlen(s) this replaces the last copied character, and when fieldWidth is 0
				// the unsigned subtraction wraps around - confirm this is intended.
				string[fieldWidth-1] = '\0';
			} else {
				error("Code not handled: \"%s\" \"%s\"\n\"%s\" \"%s\"", code, s, line, fmt);
			}

			++count;
			continue;
		}

		// Literal format character: skip spaces in the line, then require a match.
		while (src[0] == ' ') {
			++src;
		}
		if (src == end)
			break;

		// A space in the format matches anything; any other character must match exactly.
		if (src[0] != format[i] && format[i] != ' ') {
			error("Expected line of format '%s', got '%s'", fmt, line);
		}

		// src has not moved since the previous end check, so this test repeats it.
		if (src == end)
			break;
		// Consume the matched literal. A space in the format consumes nothing beyond the
		// spaces already skipped above.
		if (format[i] != ' ') {
			++src;
			if (src == end)
				break;
		}
	}
Exemplo n.º 2
0
// Parses a bracketed character class and returns the RangeToken built from it.
//
// On entry the parse context is switched to regexParserStateInBrackets and the
// next token is fetched; before returning the context is restored to
// regexParserStateNormal and one more token is fetched.
//
// useNRange - only matters when the class starts with '^'. When true the
//             resulting token is tagged Token::T_NRANGE; when false the token
//             is replaced by the result of RangeToken::complementRanges().
//             The recursive call made for a class subtraction passes false.
//
// A ParseException is raised through the ThrowXMLwithMemMgr* macros when:
//   - the input ends before the class is closed (Parser_CC2),
//   - \p or \P yields no token (Parser_Atom5),
//   - a subtraction is not immediately followed by ']' (Parser_CC5),
//   - '[', ']' or '-' appears where the checks below reject it (Parser_CC6),
//   - a range start is greater than its end (Parser_Ope3).
RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {

    setParseContext(regexParserStateInBrackets);
    processNext();

    RangeToken* tok = 0;
    bool isNRange = false;

    // A leading '^' negates the class.
    if (getState() == REGX_T_CHAR && getCharData() == chCaret) {
        isNRange = true;
        processNext();
    }
    tok = fTokenFactory->createRange();

    parserState type;
    bool firstLoop = true;
    bool wasDecoded;

    while ( (type = getState()) != REGX_T_EOF) {

        wasDecoded = false;

        // single range | from-to-range | subtraction
        // A ']' closes the class, except as the very first item.
        if (type == REGX_T_CHAR && getCharData() == chCloseSquare && !firstLoop)
            break;

        XMLInt32 ch = getCharData();
        // Set when the current item was a shorthand or \p/\P block that has
        // already been merged into tok, so no single char or range follows.
        bool     end = false;

        if (type == REGX_T_BACKSOLIDUS) {

            switch(ch) {
            case chLatin_d:
            case chLatin_D:
            case chLatin_w:
            case chLatin_W:
            case chLatin_s:
            case chLatin_S:
            case chLatin_i:
            case chLatin_I:
            case chLatin_c:
            case chLatin_C:
                {
                    tok->mergeRanges(getTokenForShorthand(ch));
                    end = true;
                }
                break;
            case chLatin_p:
            case chLatin_P:
                {
                    RangeToken* tok2 = processBacksolidus_pP(ch);

                    if (tok2 == 0) {
                        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, getMemoryManager());
                    }

                    tok->mergeRanges(tok2);
                    end = true;
                }
                break;
            case chDash:
                // Remember that this '-' was escaped, so the "--" check
                // further down does not reject it.
                wasDecoded = true;
                // fall thru to default.
            default:
                ch = decodeEscaped();
            }
        } // end if REGX_T_BACKSOLIDUS
        else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION && !firstLoop) {

            // Apply a pending negation before subtracting, so the subtraction
            // operates on the complemented set.
            if (isNRange)
            {
                tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
                isNRange=false;
            }
            RangeToken* rangeTok = parseCharacterClass(false);
            tok->subtractRanges(rangeTok);

            // The subtracted class must be followed directly by the ']' that
            // closes this class.
            if (getState() != REGX_T_CHAR || getCharData() != chCloseSquare) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC5, getMemoryManager());
            }
            break;
        } // end if REGX_T_XMLSCHEMA...

        // Advance: from here on getState()/getCharData() describe the token
        // after the current item, while type/ch still describe the item itself.
        processNext();

        if (!end) {

            if (type == REGX_T_CHAR
                && (ch == chOpenSquare
                    || ch == chCloseSquare
                    || (ch == chDash && getCharData() == chCloseSquare && firstLoop))) {
                // if regex = [-] then invalid...
                // '[', ']', '-' not allowed and should be escaped
                // Explicit cast: brace-initialising an XMLCh from an XMLInt32
                // is a narrowing conversion, which C++11 and later reject.
                XMLCh chStr[] = { static_cast<XMLCh>(ch), chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }
            // An unescaped '-' directly followed by another '-' is rejected.
            if (ch == chDash && getCharData() == chDash && getState() != REGX_T_BACKSOLIDUS && !wasDecoded) {
                XMLCh chStr[] = { static_cast<XMLCh>(ch), chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }

            if (getState() != REGX_T_CHAR || getCharData() != chDash) {
                // Not followed by '-': a single character.
                tok->addRange(ch, ch);
            }
            else {

                // Followed by '-': look at what comes after the dash.
                processNext();
                if ((type = getState()) == REGX_T_EOF)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

                if (type == REGX_T_CHAR && getCharData() == chCloseSquare) {
                    // "x-]": the dash is a literal member at the end of the class.
                    tok->addRange(ch, ch);
                    tok->addRange(chDash, chDash);
                }
                else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION) {

                    static const XMLCh dashStr[] = { chDash, chNull};
                    ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, dashStr, dashStr, getMemoryManager());
                }
                else {

                    // A real from-to range.
                    XMLInt32 rangeEnd = getCharData();
                    // Captured before any escape decoding, so for an escaped
                    // range end the error text shows the raw escape character.
                    XMLCh rangeEndStr[] = { static_cast<XMLCh>(rangeEnd), chNull };

                    if (type == REGX_T_CHAR) {

                        if (rangeEnd == chOpenSquare
                            || rangeEnd == chCloseSquare
                            || rangeEnd == chDash)
                            // '[', ']', '-' not allowed and should be escaped
                            ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, rangeEndStr, rangeEndStr, getMemoryManager());
                    }
                    else if (type == REGX_T_BACKSOLIDUS) {
                        rangeEnd = decodeEscaped();
                    }

                    processNext();

                    // The range bounds must be in ascending order.
                    if (ch > rangeEnd) {
                        XMLCh chStr[] = { static_cast<XMLCh>(ch), chNull };
                        ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Ope3, rangeEndStr, chStr, getMemoryManager());
                    }

                    tok->addRange(ch, rangeEnd);
                }
            }
        }
        firstLoop = false;
    }

    // Leaving the loop at end of input means the class was never closed.
    if (getState() == REGX_T_EOF)
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

    if (isNRange)
    {
        if(useNRange)
            tok->setTokenType(Token::T_NRANGE);
        else
            tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
    }

    tok->sortRanges();
    tok->compactRanges();

    // If the case-insensitive option is enabled, we need to
    // have the new RangeToken instance build its internal
    // case-insensitive RangeToken.
    if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
    {
        tok->getCaseInsensitiveToken(fTokenFactory);
    }

    setParseContext(regexParserStateNormal);
    processNext();

    return tok;
}
Exemplo n.º 3
0
// Parses a single alternative of the pattern, emitting code through
// m_generator as it goes.
//
// Runs of literal pattern characters are buffered in a
// PatternCharacterSequence and flushed whenever a non-literal construct is
// met. Parsing stops, after flushing, when the next token is the end of the
// pattern, '|' or ')'; that terminator is only peeked at, not consumed.
// It also stops, without flushing, when a quantifier is in error, when a
// quantifier has nothing to apply to (QuantifierWithoutAtom is set), or when
// one of the sub-parsers returns false.
//
// failures - jump list handed to the character sequence, the generator's
//            assertions and every sub-parser.
void Parser::parseAlternative(JumpList& failures)
{
    PatternCharacterSequence literals(m_generator, failures);

    for (;;) {
        // Each case either returns or breaks out of the switch, which brings
        // control back here for the next token.
        switch (peek()) {
        case EndOfPattern:
        case '|':
        case ')':
            literals.flush();
            return;

        case '*':
        case '+':
        case '?':
        case '{': {
            Quantifier quantifier = consumeQuantifier();

            if (quantifier.type == Quantifier::Error)
                return;

            if (quantifier.type == Quantifier::None) {
                // Not a quantifier after all: treat the character as a literal.
                literals.append(consume());
                break;
            }

            if (!literals.size()) {
                setError(QuantifierWithoutAtom);
                return;
            }

            literals.flush(quantifier);
            break;
        }

        case '^':
            consume();

            literals.flush();
            m_generator.generateAssertionBOL(failures);
            break;

        case '$':
            consume();

            literals.flush();
            m_generator.generateAssertionEOL(failures);
            break;

        case '.':
            consume();

            literals.flush();
            if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
                return;
            break;

        case '[':
            consume();

            literals.flush();
            if (!parseCharacterClass(failures))
                return;
            break;

        case '(':
            consume();

            literals.flush();
            if (!parseParentheses(failures))
                return;
            break;

        case '\\': {
            consume();

            Escape escaped = consumeEscape(false);
            if (escaped.type() != Escape::PatternCharacter) {
                // Anything other than an escaped literal ends the current run.
                literals.flush();
                if (!parseNonCharacterEscape(failures, escaped))
                    return;
                break;
            }

            literals.append(PatternCharacterEscape::cast(escaped).character());
            break;
        }

        default:
            // Ordinary pattern character.
            literals.append(consume());
            break;
        }
    }
}
Exemplo n.º 4
0
// Parses one atom at the current parser position and returns its token.
//
// Parentheses, '^', '$', character classes and back references are delegated
// to their dedicated routines, whose result is returned as is. Every other
// case builds the token here and then calls processNext() itself.
//
// A ParseException is raised through ThrowXMLwithMemMgr when:
//   - \p or \P yields no token (Parser_Atom5),
//   - a plain '{', '}' or ']' is met (Parser_Atom4),
//   - the parser state is none of those handled below (Parser_Atom4).
Token* RegxParser::parseAtom() {

    Token* result = 0;

    switch(fState) {

    case REGX_T_LPAREN:
        return processParen();

    case REGX_T_CARET:
        return processCaret();

    case REGX_T_DOLLAR:
        return processDollar();

    case REGX_T_LBRACKET:
        return parseCharacterClass(true);

    case REGX_T_DOT:
        processNext();
        return fTokenFactory->getDot();

    case REGX_T_CHAR:
        // These characters are rejected as plain atoms.
        if (fCharData == chOpenCurly
            || fCharData == chCloseCurly
            || fCharData == chCloseSquare)
            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);

        result = fTokenFactory->createChar(fCharData);
        processNext();
        return result;

    case REGX_T_BACKSOLIDUS:
        switch(fCharData) {

        // Shorthand escapes.
        case chLatin_d:
        case chLatin_D:
        case chLatin_w:
        case chLatin_W:
        case chLatin_s:
        case chLatin_S:
        case chLatin_c:
        case chLatin_C:
        case chLatin_i:
        case chLatin_I:
            result = getTokenForShorthand(fCharData);
            processNext();
            return result;

        // Back references.
        case chDigit_0:
        case chDigit_1:
        case chDigit_2:
        case chDigit_3:
        case chDigit_4:
        case chDigit_5:
        case chDigit_6:
        case chDigit_7:
        case chDigit_8:
        case chDigit_9:
            return processBackReference();

        // \p and \P blocks.
        case chLatin_p:
        case chLatin_P:
            result = processBacksolidus_pP(fCharData);
            if (result == 0) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager);
            }
            processNext();
            return result;

        // Any other escape stands for a single code point.
        default:
            {
                XMLInt32 codePoint = decodeEscaped();
                if (codePoint >= 0x10000) {
                    // Values of 0x10000 and above are emitted as a string
                    // holding the decomposed surrogates.
                    XMLCh* surrogateStr = RegxUtil::decomposeToSurrogates(codePoint, fMemoryManager);
                    ArrayJanitor<XMLCh> janSurrogate(surrogateStr, fMemoryManager);
                    result = fTokenFactory->createString(surrogateStr);
                }
                else {
                    result = fTokenFactory->createChar(codePoint);
                }
            }
            processNext();
            return result;
        } // end switch on escaped character

    default:
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);
    } //end switch

    return result;
}