/* ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero, ** return the number of bytes up to (but not including), the first pair ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero, ** then return the number of bytes in the first nChar unicode characters ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). */ int sqlite3utf16ByteLen(const void *zIn, int nChar){ unsigned int c = 1; char const *z = zIn; int n = 0; if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here ** and in other parts of this file means that at one branch will ** not be covered by coverage testing on any single host. But coverage ** will be complete if the tests are run on both a little-endian and ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE ** macros are constant at compile time the compiler can determine ** which branch will be followed. It is therefore assumed that no runtime ** penalty is paid for this "if" statement. */ while( c && ((nChar<0) || n<nChar) ){ READ_UTF16BE(z, c); n++; } }else{ while( c && ((nChar<0) || n<nChar) ){ READ_UTF16LE(z, c); n++; } } return (z-(char const *)zIn)-((c==0)?2:0); }
/* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3UtfSelfTest(void){ unsigned int i, t; unsigned char zBuf[20]; unsigned char *z; int n; unsigned int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = (int)(z-zBuf); assert( n>0 && n<=4 ); z[0] = 0; z = zBuf; c = sqlite3Utf8Read((const u8**)&z); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16LE(z, i); n = (int)(z-zBuf); assert( n>0 && n<=4 ); z[0] = 0; z = zBuf; READ_UTF16LE(z, 1, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16BE(z, i); n = (int)(z-zBuf); assert( n>0 && n<=4 ); z[0] = 0; z = zBuf; READ_UTF16BE(z, 1, c); assert( c==i ); assert( (z-zBuf)==n ); } }
/* ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero, ** return the number of bytes up to (but not including), the first pair ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero, ** then return the number of bytes in the first nChar unicode characters ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). */ int sqlite3utf16ByteLen(const void *zIn, int nChar){ int c = 1; char const *z = (const char*)zIn; int n = 0; if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ while( c && ((nChar<0) || n<nChar) ){ READ_UTF16BE(z, c); n++; } }else{ while( c && ((nChar<0) || n<nChar) ){ READ_UTF16LE(z, c); n++; } } return (z-(char const *)zIn)-((c==0)?2:0); }
/* ** zIn is a UTF-16 encoded unicode string at least nChar characters long. ** Return the number of bytes in the first nChar unicode characters ** in pZ. nChar must be non-negative. */ int sqlite3Utf16ByteLen(const void *zIn, int nChar){ int c; unsigned char const *z = zIn; int n = 0; if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ while( n<nChar ){ READ_UTF16BE(z, 1, c); n++; } }else{ while( n<nChar ){ READ_UTF16LE(z, 1, c); n++; } } return (int)(z-(unsigned char const *)zIn); }
/* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3UtfSelfTest(){ unsigned int i, t; unsigned char zBuf[20]; unsigned char *z; int n; unsigned int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = z-zBuf; z[0] = 0; z = zBuf; SQLITE_READ_UTF8(z, c); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16LE(z, i); n = z-zBuf; z[0] = 0; z = zBuf; READ_UTF16LE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16BE(z, i); n = z-zBuf; z[0] = 0; z = zBuf; READ_UTF16BE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } }
/* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3utfSelfTest(){ int i; unsigned char zBuf[20]; unsigned char *z; int n; int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = z-zBuf; z = zBuf; READ_UTF8(z, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<=0xE000 ) continue; z = zBuf; WRITE_UTF16LE(z, i); n = z-zBuf; z = zBuf; READ_UTF16LE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<=0xE000 ) continue; z = zBuf; WRITE_UTF16BE(z, i); n = z-zBuf; z = zBuf; READ_UTF16BE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } }
/* ** This routine transforms the internal text encoding used by pMem to ** desiredEnc. It is an error if the string is already of the desired ** encoding, or if *pMem does not contain a string value. */ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ unsigned char zShort[NBFS]; /* Temporary short output buffer */ int len; /* Maximum length of output string in bytes */ unsigned char *zOut; /* Output buffer */ unsigned char *zIn; /* Input iterator */ unsigned char *zTerm; /* End of input */ unsigned char *z; /* Output iterator */ unsigned int c; assert( pMem->flags&MEM_Str ); assert( pMem->enc!=desiredEnc ); assert( pMem->enc!=0 ); assert( pMem->n>=0 ); #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "INPUT: %s\n", zBuf); } #endif /* If the translation is between UTF-16 little and big endian, then ** all that is required is to swap the byte order. This case is handled ** differently from the others. */ if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ u8 temp; int rc; rc = sqlite3VdbeMemMakeWriteable(pMem); if( rc!=SQLITE_OK ){ assert( rc==SQLITE_NOMEM ); return SQLITE_NOMEM; } zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; while( zIn<zTerm ){ temp = *zIn; *zIn = *(zIn+1); zIn++; *zIn++ = temp; } pMem->enc = desiredEnc; goto translate_out; } /* Set len to the maximum number of bytes required in the output buffer. */ if( desiredEnc==SQLITE_UTF8 ){ /* When converting from UTF-16, the maximum growth results from ** translating a 2-byte character to a 4-byte UTF-8 character. ** A single byte is required for the output string ** nul-terminator. */ len = pMem->n * 2 + 1; }else{ /* When converting from UTF-8 to UTF-16 the maximum growth is caused ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 ** character. Two bytes are required in the output buffer for the ** nul-terminator. */ len = pMem->n * 2 + 2; } /* Set zIn to point at the start of the input buffer and zTerm to point 1 ** byte past the end. ** ** Variable zOut is set to point at the output buffer. This may be space ** obtained from malloc(), or Mem.zShort, if it large enough and not in ** use, or the zShort array on the stack (see above). */ zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; if( len>NBFS ){ zOut = sqliteMallocRaw(len); if( !zOut ) return SQLITE_NOMEM; }else{ zOut = zShort; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, c); WRITE_UTF16LE(z, c); } }else{ assert( desiredEnc==SQLITE_UTF16BE ); /* UTF-8 -> UTF-16 Big-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, c); WRITE_UTF16BE(z, c); } } pMem->n = z - zOut; *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16LE(zIn, c); WRITE_UTF8(z, c); } }else{ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16BE(zIn, c); WRITE_UTF8(z, c); } } pMem->n = z - zOut; } *z = 0; assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); sqlite3VdbeMemRelease(pMem); pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); pMem->enc = desiredEnc; if( zOut==zShort ){ memcpy(pMem->zShort, zOut, len); zOut = (u8*)pMem->zShort; pMem->flags |= (MEM_Term|MEM_Short); }else{ pMem->flags |= (MEM_Term|MEM_Dyn); } pMem->z = (char*)zOut; translate_out: #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "OUTPUT: %s\n", zBuf); } #endif return SQLITE_OK; }
/* ** This routine transforms the internal text encoding used by pMem to ** desiredEnc. It is an error if the string is already of the desired ** encoding, or if *pMem does not contain a string value. */ SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ int len; /* Maximum length of output string in bytes */ unsigned char *zOut; /* Output buffer */ unsigned char *zIn; /* Input iterator */ unsigned char *zTerm; /* End of input */ unsigned char *z; /* Output iterator */ unsigned int c; assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) ); assert( pMem->flags&MEM_Str ); assert( pMem->enc!=desiredEnc ); assert( pMem->enc!=0 ); assert( pMem->n>=0 ); #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "INPUT: %s\n", zBuf); } #endif /* If the translation is between UTF-16 little and big endian, then ** all that is required is to swap the byte order. This case is handled ** differently from the others. */ if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ u8 temp; int rc; rc = sqlite3VdbeMemMakeWriteable(pMem); if( rc!=SQLITE_OK ){ assert( rc==SQLITE_NOMEM ); return SQLITE_NOMEM; } zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n&~1]; while( zIn<zTerm ){ temp = *zIn; *zIn = *(zIn+1); zIn++; *zIn++ = temp; } pMem->enc = desiredEnc; goto translate_out; } /* Set len to the maximum number of bytes required in the output buffer. */ if( desiredEnc==SQLITE_UTF8 ){ /* When converting from UTF-16, the maximum growth results from ** translating a 2-byte character to a 4-byte UTF-8 character. ** A single byte is required for the output string ** nul-terminator. */ pMem->n &= ~1; len = pMem->n * 2 + 1; }else{ /* When converting from UTF-8 to UTF-16 the maximum growth is caused ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 ** character. Two bytes are required in the output buffer for the ** nul-terminator. */ len = pMem->n * 2 + 2; } /* Set zIn to point at the start of the input buffer and zTerm to point 1 ** byte past the end. ** ** Variable zOut is set to point at the output buffer, space obtained ** from sqlite3_malloc(). */ zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; zOut = sqlite3DbMallocRaw(pMem->db, len); if( !zOut ){ return SQLITE_NOMEM; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, zTerm, c); WRITE_UTF16LE(z, c); } }else{ assert( desiredEnc==SQLITE_UTF16BE ); /* UTF-8 -> UTF-16 Big-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, zTerm, c); WRITE_UTF16BE(z, c); } } pMem->n = (int)(z - zOut); *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16LE(zIn, zIn<zTerm, c); WRITE_UTF8(z, c); } }else{ /* UTF-16 Big-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16BE(zIn, zIn<zTerm, c); WRITE_UTF8(z, c); } } pMem->n = (int)(z - zOut); } *z = 0; assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); c = pMem->flags; sqlite3VdbeMemRelease(pMem); pMem->flags = MEM_Str|MEM_Term|(c&MEM_AffMask); pMem->enc = desiredEnc; pMem->z = (char*)zOut; pMem->zMalloc = pMem->z; pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z); translate_out: #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "OUTPUT: %s\n", zBuf); } #endif return SQLITE_OK; }
/* ** This routine transforms the internal text encoding used by pMem to ** desiredEnc. It is an error if the string is already of the desired ** encoding, or if *pMem does not contain a string value. */ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ unsigned char zShort[NBFS]; /* Temporary short output buffer */ int len; /* Maximum length of output string in bytes */ unsigned char *zOut; /* Output buffer */ unsigned char *zIn; /* Input iterator */ unsigned char *zTerm; /* End of input */ unsigned char *z; /* Output iterator */ unsigned int c; assert( pMem->flags&MEM_Str ); assert( pMem->enc!=desiredEnc ); assert( pMem->enc!=0 ); assert( pMem->n>=0 ); #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "INPUT: %s\n", zBuf); } #endif /* If the translation is between UTF-16 little and big endian, then ** all that is required is to swap the byte order. This case is handled ** differently from the others. */ if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ u8 temp; int rc; rc = sqlite3VdbeMemMakeWriteable(pMem); if( rc!=SQLITE_OK ){ assert( rc==SQLITE_NOMEM ); return SQLITE_NOMEM; } zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; while( zIn<zTerm ){ temp = *zIn; *zIn = *(zIn+1); zIn++; *zIn++ = temp; } pMem->enc = desiredEnc; goto translate_out; } /* Set len to the maximum number of bytes required in the output buffer. */ if( desiredEnc==SQLITE_UTF8 ){ /* When converting from UTF-16, the maximum growth results from ** translating a 2-byte character to a 4-byte UTF-8 character. ** A single byte is required for the output string ** nul-terminator. */ len = pMem->n * 2 + 1; }else{ /* When converting from UTF-8 to UTF-16 the maximum growth is caused ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 ** character. Two bytes are required in the output buffer for the ** nul-terminator. */ len = pMem->n * 2 + 2; } /* Set zIn to point at the start of the input buffer and zTerm to point 1 ** byte past the end. ** ** Variable zOut is set to point at the output buffer. This may be space ** obtained from malloc(), or Mem.zShort, if it large enough and not in ** use, or the zShort array on the stack (see above). */ zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; if( len>NBFS ){ zOut = sqliteMallocRaw(len); if( !zOut ) return SQLITE_NOMEM; }else{ zOut = zShort; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ unsigned int iExtra = 0xD800; if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){ /* This UTF8 string is not nul-terminated, and the last byte is ** not a character in the ascii range (codpoints 0..127). This ** means the SQLITE_READ_UTF8() macro might read past the end ** of the allocated buffer. ** ** There are four possibilities: ** ** 1. The last byte is the first byte of a non-ASCII character, ** ** 2. The final N bytes of the input string are continuation bytes ** and immediately preceding them is the first byte of a ** non-ASCII character. ** ** 3. The final N bytes of the input string are continuation bytes ** and immediately preceding them is a byte that encodes a ** character in the ASCII range. ** ** 4. The entire string consists of continuation characters. ** ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8() ** macro will not overread the buffer in these cases. */ unsigned char *zExtra = &zTerm[-1]; while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){ zExtra--; } if( (zExtra[0]&0xC0)==0xC0 ){ /* Make a copy of the last character encoding in the input string. ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8() ** to decode the codepoint. Store the codepoint in variable iExtra, ** it will be appended to the output string later. */ unsigned char *zFree = 0; unsigned char zBuf[16]; int nExtra = (pMem->n+zIn-zExtra); zTerm = zExtra; if( nExtra>15 ){ zExtra = sqliteMallocRaw(nExtra+1); if( !zExtra ){ return SQLITE_NOMEM; } zFree = zExtra; }else{ zExtra = zBuf; } memcpy(zExtra, zTerm, nExtra); zExtra[nExtra] = '\0'; SQLITE_READ_UTF8(zExtra, iExtra); sqliteFree(zFree); } } if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn<zTerm ){ SQLITE_READ_UTF8(zIn, c); WRITE_UTF16LE(z, c); } if( iExtra!=0xD800 ){ WRITE_UTF16LE(z, iExtra); } }else{ assert( desiredEnc==SQLITE_UTF16BE ); /* UTF-8 -> UTF-16 Big-endian */ while( zIn<zTerm ){ SQLITE_READ_UTF8(zIn, c); WRITE_UTF16BE(z, c); } if( iExtra!=0xD800 ){ WRITE_UTF16BE(z, iExtra); } } pMem->n = z - zOut; *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16LE(zIn, c); WRITE_UTF8(z, c); } }else{ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16BE(zIn, c); WRITE_UTF8(z, c); } } pMem->n = z - zOut; } *z = 0; assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); sqlite3VdbeMemRelease(pMem); pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); pMem->enc = desiredEnc; if( zOut==zShort ){ memcpy(pMem->zShort, zOut, len); zOut = (u8*)pMem->zShort; pMem->flags |= (MEM_Term|MEM_Short); }else{ pMem->flags |= (MEM_Term|MEM_Dyn); } pMem->z = (char*)zOut; translate_out: #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "OUTPUT: %s\n", zBuf); } #endif return SQLITE_OK; }