Пример #1
0
// ---------------------------------------------------------------------------
//  XMLUTF8Transcoder: Implementation of the transcoder API
// ---------------------------------------------------------------------------
unsigned int
XMLUTF8Transcoder::transcodeFrom(const  XMLByte* const          srcData
                                , const unsigned int            srcCount
                                ,       XMLCh* const            toFill
                                , const unsigned int            maxChars
                                ,       unsigned int&           bytesEaten
                                ,       unsigned char* const    charSizes)
{
    // Watch for pathological scenario. Shouldn't happen, but...
    if (!srcCount || !maxChars)
        return 0;

    // If debugging, make sure that the block size is legal
    #if defined(XERCES_DEBUG)
    checkBlockSize(maxChars);
    #endif

    //
    //  Get pointers to our start and end points of the input and output
    //  buffers.
    //
    const XMLByte*  srcPtr = srcData;
    const XMLByte*  srcEnd = srcPtr + srcCount;
    XMLCh*          outPtr = toFill;
    XMLCh*          outEnd = outPtr + maxChars;
    unsigned char*  sizePtr = charSizes;



    //
    //  We now loop until we either run out of input data, or room to store
    //  output chars.
    //
    while ((srcPtr < srcEnd) && (outPtr < outEnd))
    {
        // Special-case ASCII, which is a leading byte value of <= 127
        if (*srcPtr <= 127)
        {
            *outPtr++ = XMLCh(*srcPtr++);
            *sizePtr++ = 1;
            continue;
        }

        // See how many trailing src bytes this sequence is going to require
        const unsigned int trailingBytes = gUTFBytes[*srcPtr];

        //
        //  If there are not enough source bytes to do this one, then we
        //  are done. Note that we done >= here because we are implicitly
        //  counting the 1 byte we get no matter what.
        //
        //  If we break out here, then there is nothing to undo since we
        //  haven't updated any pointers yet.
        //
        if (srcPtr + trailingBytes >= srcEnd)
            break;

        // Looks ok, so lets build up the value
        // or at least let's try to do so--remembering that
        // we cannot assume the encoding to be valid:

        // first, test first byte
        if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) {
            char pos[2] = {(char)0x31, 0}; 
            char len[2] = {(char)(trailingBytes+0x31), 0};
            char byte[2] = {*srcPtr,0};
            ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
        }

        /***
         * http://www.unicode.org/reports/tr27/
         *
         * Table 3.1B. lists all of the byte sequences that are legal in UTF-8. 
         * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive) 
         * is legal in that position. 
         * Any byte value outside of the ranges listed is illegal. 
         * For example, 
         * the byte sequence <C0 AF> is illegal  since C0 is not legal in the 1st Byte column. 
         * The byte sequence <E0 9F 80> is illegal since in the row 
         *    where E0 is legal as a first byte, 
         *    9F is not legal as a second byte. 
         * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches 
         * a byte range in a row of the table (the last row). 
         *
         *
         * Table 3.1B. Legal UTF-8 Byte Sequences  
         * Code Points              1st Byte    2nd Byte    3rd Byte    4th Byte 
         * =========================================================================
         * U+0000..U+007F            00..7F       
         * -------------------------------------------------------------------------
         * U+0080..U+07FF            C2..DF      80..BF      
         *
         * -------------------------------------------------------------------------
         * U+0800..U+0FFF            E0          A0..BF     80..BF   
         *                                       -- 
         *                          
         * U+1000..U+FFFF            E1..EF      80..BF     80..BF    
         *
         * --------------------------------------------------------------------------
         * U+10000..U+3FFFF          F0          90..BF     80..BF       80..BF 
         *                                       --
         * U+40000..U+FFFFF          F1..F3      80..BF     80..BF       80..BF 
         * U+100000..U+10FFFF        F4          80..8F     80..BF       80..BF 
         *                                           --
         * ==========================================================================
         *
         *  Cases where a trailing byte range is not 80..BF are underlined in the table to 
         *  draw attention to them. These occur only in the second byte of a sequence.
         *
         ***/

        XMLUInt32 tmpVal = 0;

        switch(trailingBytes)
        {
            case 1 :
                // UTF-8:   [110y yyyy] [10xx xxxx]
                // Unicode: [0000 0yyy] [yyxx xxxx]
                //
                // 0xC0, 0xC1 has been filtered out             
                checkTrailingBytes(*(srcPtr+1), 1, 1);

                tmpVal = *srcPtr++;
                tmpVal <<= 6;
                tmpVal += *srcPtr++;

                break;
            case 2 :
                // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
                // Unicode: [zzzz yyyy] [yyxx xxxx]
                //
                if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0)) 
                {
                    char byte0[2] = {*srcPtr    ,0};
                    char byte1[2] = {*(srcPtr+1),0};

                    ThrowXMLwithMemMgr2(UTFDataFormatException
                                      , XMLExcepts::UTF8_Invalid_3BytesSeq
                                      , byte0
                                      , byte1
                                      , getMemoryManager());
                }

                checkTrailingBytes(*(srcPtr+1), 2, 1);
                checkTrailingBytes(*(srcPtr+2), 2, 2);

                //
                // D36 (a) UTF-8 is the Unicode Transformation Format that serializes 
                //         a Unicode code point as a sequence of one to four bytes, 
                //         as specified in Table 3.1, UTF-8 Bit Distribution.
                //     (b) An illegal UTF-8 code unit sequence is any byte sequence that 
                //         does not match the patterns listed in Table 3.1B, Legal UTF-8 
                //         Byte Sequences.
                //     (c) An irregular UTF-8 code unit sequence is a six-byte sequence 
                //         where the first three bytes correspond to a high surrogate, 
                //         and the next three bytes correspond to a low surrogate. 
                //         As a consequence of C12, these irregular UTF-8 sequences shall 
                //         not be generated by a conformant process. 
                //
                //irregular three bytes sequence
                // that is zzzzyy matches leading surrogate tag 110110 or 
                //                       trailing surrogate tag 110111
                // *srcPtr=1110 1101 
                // *(srcPtr+1)=1010 yyyy or 
                // *(srcPtr+1)=1011 yyyy
                //
                // 0xED 1110 1101
                // 0xA0 1010 0000

                if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))
                {
                    char byte0[2] = {*srcPtr,    0};
                    char byte1[2] = {*(srcPtr+1),0};

                     ThrowXMLwithMemMgr2(UTFDataFormatException
                              , XMLExcepts::UTF8_Irregular_3BytesSeq
                              , byte0
                              , byte1
                              , getMemoryManager());
                }

                tmpVal = *srcPtr++;
                tmpVal <<= 6;
                tmpVal += *srcPtr++;
                tmpVal <<= 6;
                tmpVal += *srcPtr++;

                break;
            case 3 : 
                // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
                // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
                //          [1101 11yy] [yyxx xxxx] (low surrogate)
                //          * uuuuu = wwww + 1
                //
                if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||
                    ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F))  )
                {
                    char byte0[2] = {*srcPtr    ,0};
                    char byte1[2] = {*(srcPtr+1),0};

                    ThrowXMLwithMemMgr2(UTFDataFormatException
                                      , XMLExcepts::UTF8_Invalid_4BytesSeq
                                      , byte0
                                      , byte1
                                      , getMemoryManager());
                }

                checkTrailingBytes(*(srcPtr+1), 3, 1);
                checkTrailingBytes(*(srcPtr+2), 3, 2);
                checkTrailingBytes(*(srcPtr+3), 3, 3);
                
                tmpVal = *srcPtr++;
                tmpVal <<= 6;
                tmpVal += *srcPtr++;
                tmpVal <<= 6;
                tmpVal += *srcPtr++;
                tmpVal <<= 6;
                tmpVal += *srcPtr++;

                break;
            default: // trailingBytes > 3

                /***
                 * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows 
                 * for the use of five- and six-byte sequences to encode characters that 
                 * are outside the range of the Unicode character set; those five- and 
                 * six-byte sequences are illegal for the use of UTF-8 as a transformation 
                 * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired 
                 * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
                 ***/
                char len[2]  = {(char)(trailingBytes+0x31), 0};
                char byte[2] = {*srcPtr,0};

                ThrowXMLwithMemMgr2(UTFDataFormatException
                                  , XMLExcepts::UTF8_Exceede_BytesLimit
                                  , byte
                                  , len
                                  , getMemoryManager());

                break;
        }


        // since trailingBytes comes from an array, this logic is redundant
        //  default :
        //      ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
        //}
        tmpVal -= gUTFOffsets[trailingBytes];

        //
        //  If it will fit into a single char, then put it in. Otherwise
        //  encode it as a surrogate pair. If its not valid, use the
        //  replacement char.
        //
        if (!(tmpVal & 0xFFFF0000))
        {
            *sizePtr++ = trailingBytes + 1;
            *outPtr++ = XMLCh(tmpVal);
        }
         else if (tmpVal > 0x10FFFF)
        {
            //
            //  If we've gotten more than 32 chars so far, then just break
            //  out for now and lets process those. When we come back in
            //  here again, we'll get no chars and throw an exception. This
            //  way, the error will have a line and col number closer to
            //  the real problem area.
            //
            if ((outPtr - toFill) > 32)
                break;

            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
        }
         else
        {
            //
            //  If we have enough room to store the leading and trailing
            //  chars, then lets do it. Else, pretend this one never
            //  happened, and leave it for the next time. Since we don't
            //  update the bytes read until the bottom of the loop, by
            //  breaking out here its like it never happened.
            //
            if (outPtr + 1 >= outEnd)
                break;

            // Store the leading surrogate char
            tmpVal -= 0x10000;
            *sizePtr++ = trailingBytes + 1;
            *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800);

            //
            //  And then the trailing char. This one accounts for no
            //  bytes eaten from the source, so set the char size for this
            //  one to be zero.
            //
            *sizePtr++ = 0;
            *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00);
        }
    }

    // Update the bytes eaten
    bytesEaten = srcPtr - srcData;

    // Return the characters read
    return outPtr - toFill;
}
Пример #2
0
//
// here content is a list of items
//
void ListDatatypeValidator::checkContent(       BaseRefVectorOf<XMLCh>*       tokenVector
                                        , const XMLCh*                  const content
                                        ,       ValidationContext*      const context
                                        ,       bool                          asBase
                                        ,       MemoryManager*          const manager)
{
    DatatypeValidator* bv = getBaseValidator();

    if (bv->getType() == DatatypeValidator::List)
        ((ListDatatypeValidator*)bv)->checkContent(tokenVector, content, context, true, manager);
    else
    {   // the ultimate itemType DTV
        for (unsigned int i = 0; i < tokenVector->size(); i++)
            bv->validate(tokenVector->elementAt(i), context, manager);
    }

    int thisFacetsDefined = getFacetsDefined();

    // we check pattern first
    if ( (thisFacetsDefined & DatatypeValidator::FACET_PATTERN ) != 0 )
    {
        // lazy construction
        if (getRegex() == 0)
        {
            try {
                setRegex(new (fMemoryManager) RegularExpression(getPattern(), SchemaSymbols::fgRegEx_XOption, fMemoryManager));            	
            }
            catch (XMLException &e)
            {
                ThrowXMLwithMemMgr1(InvalidDatatypeValueException, XMLExcepts::RethrowError, e.getMessage(), manager);
            }
        }

        //check every item in the list as a whole
        if (getRegex()->matches(content, manager) == false)
        {
            ThrowXMLwithMemMgr2(InvalidDatatypeValueException
                    , XMLExcepts::VALUE_NotMatch_Pattern
                    , content
                    , getPattern()
                    , manager);
        }

    }

    // if this is a base validator, we only need to check pattern facet
    // all other facet were inherited by the derived type
    if (asBase)
        return;

    unsigned int tokenNumber = tokenVector->size();

    if (((thisFacetsDefined & DatatypeValidator::FACET_MAXLENGTH) != 0) &&
        (tokenNumber > getMaxLength()))
    {
        XMLCh value1[BUF_LEN+1];
        XMLCh value2[BUF_LEN+1];
        XMLString::binToText(tokenNumber, value1, BUF_LEN, 10, manager);
        XMLString::binToText(getMaxLength(), value2, BUF_LEN, 10, manager);

        ThrowXMLwithMemMgr3(InvalidDatatypeValueException
                , XMLExcepts::VALUE_GT_maxLen
                , getContent()
                , value1
                , value2
                , manager);
    }

    if (((thisFacetsDefined & DatatypeValidator::FACET_MINLENGTH) != 0) &&
        (tokenNumber < getMinLength()))
    {
        XMLCh value1[BUF_LEN+1];
        XMLCh value2[BUF_LEN+1];
        XMLString::binToText(tokenNumber, value1, BUF_LEN, 10, manager);
        XMLString::binToText(getMinLength(), value2, BUF_LEN, 10, manager);

        ThrowXMLwithMemMgr3(InvalidDatatypeValueException
                , XMLExcepts::VALUE_LT_minLen
                , getContent()
                , value1
                , value2
                , manager);
    }

    if (((thisFacetsDefined & DatatypeValidator::FACET_LENGTH) != 0) &&
        (tokenNumber != AbstractStringValidator::getLength()))
    {
        XMLCh value1[BUF_LEN+1];
        XMLCh value2[BUF_LEN+1];
        XMLString::binToText(tokenNumber, value1, BUF_LEN, 10, manager);
        XMLString::binToText(AbstractStringValidator::getLength(), value2, BUF_LEN, 10, manager);

        ThrowXMLwithMemMgr3(InvalidDatatypeValueException
                , XMLExcepts::VALUE_NE_Len
                , getContent()
                , value1
                , value2
                , manager);
    }

    if ((thisFacetsDefined & DatatypeValidator::FACET_ENUMERATION) != 0 &&
        (getEnumeration() != 0))
    {
        int i;
        int enumLength = getEnumeration()->size();

        for ( i = 0; i < enumLength; i++)
        {
            //optimization: we do a lexical comparision first
            // this may be faster for string and its derived
            if (XMLString::equals(getEnumeration()->elementAt(i), getContent()))
                break; // a match found

            // do a value space check
            // this is needed for decimal (and probably other types
            // such as datetime related)
            // eg.
            // tokenVector = "1 2 3.0 4" vs enumeration = "1 2 3 4.0"
            //
            if (valueSpaceCheck(tokenVector, getEnumeration()->elementAt(i), manager))
                break;
        }

        if (i == enumLength)
            ThrowXMLwithMemMgr1(InvalidDatatypeValueException, XMLExcepts::VALUE_NotIn_Enumeration, getContent(), manager);

    } // enumeration

}
Пример #3
0
// -----------------------------------------------------------------------
// Abstract interface from AbstractNumericValidator
// -----------------------------------------------------------------------
void DecimalDatatypeValidator::checkContent(const XMLCh*             const content
                                           ,      ValidationContext* const context
                                           ,      bool                     asBase
                                           ,      MemoryManager*     const manager)
{
    //validate against base validator if any
    DecimalDatatypeValidator *pBase = (DecimalDatatypeValidator*) this->getBaseValidator();
    if (pBase)
        pBase->checkContent(content, context, true, manager);

    int thisFacetsDefined = getFacetsDefined();

    // we check pattern first
    if ( (thisFacetsDefined & DatatypeValidator::FACET_PATTERN ) != 0 )
    {
        // lazy construction
        if (getRegex() ==0) {
            try {
                // REVISIT: cargillmem fMemoryManager vs manager
                setRegex(new (fMemoryManager) RegularExpression(getPattern(), SchemaSymbols::fgRegEx_XOption, fMemoryManager));                
            }
            catch (XMLException &e)
            {
                ThrowXMLwithMemMgr1(InvalidDatatypeValueException, XMLExcepts::RethrowError, e.getMessage(), manager);
            }
        }

        if (getRegex()->matches(content, manager) ==false)
        {
            ThrowXMLwithMemMgr2(InvalidDatatypeValueException
                    , XMLExcepts::VALUE_NotMatch_Pattern
                    , content
                    , getPattern()
                    , manager);
        }
    }

    // if this is a base validator, we only need to check pattern facet
    // all other facet were inherited by the derived type
    if (asBase)
        return;
    XMLCh *errorMsg = 0;
    try {
        XMLBigDecimal  compareDataValue(content, manager);
        XMLBigDecimal* compareData = &compareDataValue;        
        
        if (getEnumeration())
        {
            int i=0;
            int enumLength = getEnumeration()->size();
            for ( ; i < enumLength; i++)
            {
                if (compareValues(compareData, (XMLBigDecimal*) getEnumeration()->elementAt(i)) ==0 )
                    break;
            }

            if (i == enumLength)
                ThrowXMLwithMemMgr1(InvalidDatatypeValueException, XMLExcepts::VALUE_NotIn_Enumeration, content, manager);
        }

        boundsCheck(compareData, manager);

        if ( (thisFacetsDefined & DatatypeValidator::FACET_FRACTIONDIGITS) != 0 )
        {
            if ( compareData->getScale() > fFractionDigits )
            {                
                XMLCh value1[BUF_LEN+1];
                XMLCh value2[BUF_LEN+1];
                XMLString::binToText(compareData->getScale(), value1, BUF_LEN, 10, manager);
                XMLString::binToText(fFractionDigits, value2, BUF_LEN, 10, manager);
                ThrowXMLwithMemMgr3(InvalidDatatypeFacetException
                                 , XMLExcepts::VALUE_exceed_fractDigit
                                 , compareData->getRawData()
                                 , value1
                                 , value2
                                 , manager);
            }
        }

        if ( (thisFacetsDefined & DatatypeValidator::FACET_TOTALDIGITS) != 0 )
        {
            if ( compareData->getTotalDigit() > fTotalDigits )
            {                
                XMLCh value1[BUF_LEN+1];
                XMLCh value2[BUF_LEN+1];
                XMLString::binToText(compareData->getTotalDigit(), value1, BUF_LEN, 10, manager);
                XMLString::binToText(fTotalDigits, value2, BUF_LEN, 10, manager);
                ThrowXMLwithMemMgr3(InvalidDatatypeFacetException
                                 , XMLExcepts::VALUE_exceed_totalDigit
                                 , compareData->getRawData()
                                 , value1
                                 , value2
                                 , manager);
            }

            /***
             E2-44 totalDigits

             ... by restricting it to numbers that are expressible as i � 10^-n
             where i and n are integers such that |i| < 10^totalDigits and 0 <= n <= totalDigits.
            ***/

            if ( compareData->getScale() > fTotalDigits )  
            {                
                XMLCh value1[BUF_LEN+1];
                XMLCh value2[BUF_LEN+1];
                XMLString::binToText(compareData->getScale(), value1, BUF_LEN, 10, manager);
                XMLString::binToText(fTotalDigits, value2, BUF_LEN, 10, manager);
                ThrowXMLwithMemMgr3(InvalidDatatypeFacetException
                                 , XMLExcepts::VALUE_exceed_totalDigit
                                 , compareData->getRawData()
                                 , value1
                                 , value2
                                 , manager);
            }        
        }
    }
    catch (XMLException &e)
    {
       errorMsg = XMLString::replicate(e.getMessage(), manager);
    }
    if(errorMsg)
    {
       ArrayJanitor<XMLCh> jan(errorMsg, manager);
       ThrowXMLwithMemMgr1(InvalidDatatypeFacetException, XMLExcepts::RethrowError, errorMsg, manager);
    }
}