bool RegularExpression::matchRange(Context* const context, const Op* const op,
								   int& offset, const short direction,
								   const bool ignoreCase)
{
	int tmpOffset = direction > 0 ? offset : offset - 1;

	if (tmpOffset >= context->fLimit || tmpOffset < 0)
		return false;

	XMLInt32 strCh = 0;
	
	if (!context->nextCh(strCh, tmpOffset, direction))
		return false;

	RangeToken* tok = (RangeToken *) op->getToken();
	bool match = false;

	if (ignoreCase) {
		tok = tok->getCaseInsensitiveToken(fTokenFactory);
	}

    match = tok->match(strCh);

	if (!match)
		return false;

	offset = (direction > 0) ? ++tmpOffset : tmpOffset;

	return true;
}
/*
 * Prepares for matching. This method is called during construction.
 */
void RegularExpression::prepare() {

	compile(fTokenTree);

	fMinLength = fTokenTree->getMinLength();
	fFirstChar = 0;

	if (!isSet(fOptions, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) &&
		!isSet(fOptions, XMLSCHEMA_MODE))							{

		RangeToken* rangeTok = fTokenFactory->createRange();
		int result = fTokenTree->analyzeFirstCharacter(rangeTok, fOptions, fTokenFactory);

		if (result == Token::FC_TERMINAL) {

			rangeTok->compactRanges();
			fFirstChar = rangeTok;
		}

        rangeTok->createMap();

    	if (isSet(fOptions, IGNORE_CASE))
        {
            rangeTok->getCaseInsensitiveToken(fTokenFactory);
        }
    }

	if (fOperations != 0 && fOperations->getNextOp() == 0 &&
		(fOperations->getOpType() == Op::O_STRING ||
         fOperations->getOpType() == Op::O_CHAR) &&
         !isSet(fOptions, IGNORE_CASE) )                      {

		fFixedStringOnly = true;

		if (fOperations->getOpType() == Op::O_STRING) {
			fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
			fFixedString = XMLString::replicate(fOperations->getLiteral(), fMemoryManager);
		}
		else{
			
			XMLInt32 ch = fOperations->getData();

			if ( ch >= 0x10000) { // add as constant
				fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
				fFixedString = RegxUtil::decomposeToSurrogates(ch, fMemoryManager);
			}
			else {

				XMLCh* dummyStr = (XMLCh*) fMemoryManager->allocate(2 * sizeof(XMLCh));//new XMLCh[2];
				dummyStr[0] = (XMLCh) fOperations->getData();
				dummyStr[1] = chNull;
				fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
				fFixedString = dummyStr;
			}
		}

		fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,
								  isSet(fOptions, IGNORE_CASE), fMemoryManager);
	}
	else if (!isSet(fOptions, XMLSCHEMA_MODE) &&		
             !isSet(fOptions, PROHIBIT_FIXED_STRING_OPTIMIZATION) &&
             !isSet(fOptions, IGNORE_CASE)) {

		int fixedOpts = 0;
		Token* tok = fTokenTree->findFixedString(fOptions, fixedOpts);

		fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;

		fFixedString = (tok == 0) ? 0
			: XMLString::replicate(tok->getString(), fMemoryManager);

		if (fFixedString != 0 && XMLString::stringLen(fFixedString) < 2) {

			fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
			fFixedString = 0;
		}
		
		if (fFixedString != 0) {

			fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,
									   isSet(fixedOpts, IGNORE_CASE), fMemoryManager);
		}
	}
}
Exemple #3
0
RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {

    setParseContext(regexParserStateInBrackets);
    processNext();

    RangeToken* tok = 0;
    bool isNRange = false;

    if (getState() == REGX_T_CHAR && getCharData() == chCaret) {
        isNRange = true;
        processNext();
    }
    tok = fTokenFactory->createRange();

    parserState type;
    bool firstLoop = true;
    bool wasDecoded;

    while ( (type = getState()) != REGX_T_EOF) {

        wasDecoded = false;

        // single range | from-to-range | subtraction
        if (type == REGX_T_CHAR && getCharData() == chCloseSquare && !firstLoop)
            break;

        XMLInt32 ch = getCharData();
        bool     end = false;

        if (type == REGX_T_BACKSOLIDUS) {

            switch(ch) {
            case chLatin_d:
            case chLatin_D:
            case chLatin_w:
            case chLatin_W:
            case chLatin_s:
            case chLatin_S:
            case chLatin_i:
            case chLatin_I:
            case chLatin_c:
            case chLatin_C:
                {
                    tok->mergeRanges(getTokenForShorthand(ch));
                    end = true;
                }
                break;
            case chLatin_p:
            case chLatin_P:
                {                    
                    RangeToken* tok2 = processBacksolidus_pP(ch);

                    if (tok2 == 0) {
                        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, getMemoryManager());
                    }

                    tok->mergeRanges(tok2);
                    end = true;
                }
                break;
            case chDash:
                wasDecoded = true;
                // fall thru to default.
            default:
                ch = decodeEscaped();
            }
        } // end if REGX_T_BACKSOLIDUS
        else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION && !firstLoop) {

            if (isNRange)
            {
                tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
                isNRange=false;
            }
            RangeToken* rangeTok = parseCharacterClass(false);
            tok->subtractRanges(rangeTok);

            if (getState() != REGX_T_CHAR || getCharData() != chCloseSquare) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC5, getMemoryManager());
            }
            break;
        } // end if REGX_T_XMLSCHEMA...

        processNext();

        if (!end) {

            if (type == REGX_T_CHAR
                && (ch == chOpenSquare
                    || ch == chCloseSquare
                    || (ch == chDash && getCharData() == chCloseSquare && firstLoop))) {
                // if regex = [-] then invalid...
                // '[', ']', '-' not allowed and should be escaped
                XMLCh chStr[] = { ch, chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }
            if (ch == chDash && getCharData() == chDash && getState() != REGX_T_BACKSOLIDUS && !wasDecoded) {
                XMLCh chStr[] = { ch, chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }

            if (getState() != REGX_T_CHAR || getCharData() != chDash) {
                tok->addRange(ch, ch);
            }
            else {

                processNext();
                if ((type = getState()) == REGX_T_EOF)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

                if (type == REGX_T_CHAR && getCharData() == chCloseSquare) {
                    tok->addRange(ch, ch);
                    tok->addRange(chDash, chDash);
                }
                else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION) {

                    static const XMLCh dashStr[] = { chDash, chNull};
                    ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, dashStr, dashStr, getMemoryManager());
                }
                else {

                    XMLInt32 rangeEnd = getCharData();
                    XMLCh rangeEndStr[] = { rangeEnd, chNull };

                    if (type == REGX_T_CHAR) {

                        if (rangeEnd == chOpenSquare
                            || rangeEnd == chCloseSquare
                            || rangeEnd == chDash)
                            // '[', ']', '-' not allowed and should be escaped
                            ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, rangeEndStr, rangeEndStr, getMemoryManager());
                    }
                    else if (type == REGX_T_BACKSOLIDUS) {
                        rangeEnd = decodeEscaped();
                    }

                    processNext();

                    if (ch > rangeEnd) {
                        XMLCh chStr[] = { ch, chNull };
                        ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Ope3, rangeEndStr, chStr, getMemoryManager());
                    }

                    tok->addRange(ch, rangeEnd);
                }
            }
        }
        firstLoop = false;
    }

    if (getState() == REGX_T_EOF)
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

    if (isNRange)
    {
        if(useNRange)
            tok->setTokenType(Token::T_NRANGE);
        else
            tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
    }

    tok->sortRanges();
    tok->compactRanges();

    // If the case-insensitive option is enabled, we need to
    // have the new RangeToken instance build its internal
    // case-insensitive RangeToken.
    if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
    {
        tok->getCaseInsensitiveToken(fTokenFactory);
    }

    setParseContext(regexParserStateNormal);
    processNext();

    return tok;
}