static void reverse_string(int* buffer, char* result, sqlite_uint64 len) { sqlite3_uint64 i; for (i = 0; i < len; i++) { WRITE_UTF8(result, buffer[len - i - 1]); } *result = 0; }
/* ** Translate UTF-8 to UTF-8. ** ** This has the effect of making sure that the string is well-formed ** UTF-8. Miscoded characters are removed. ** ** The translation is done in-place and aborted if the output ** overruns the input. */ int sqlite3Utf8To8(unsigned char *zIn){ unsigned char *zOut = zIn; unsigned char *zStart = zIn; u32 c; while( zIn[0] && zOut<=zIn ){ c = sqlite3Utf8Read((const u8**)&zIn); if( c!=0xfffd ){ WRITE_UTF8(zOut, c); } } *zOut = 0; return (int)(zOut - zStart); }
/* ** Translate UTF-8 to UTF-8. ** ** This has the effect of making sure that the string is well-formed ** UTF-8. Miscoded characters are removed. ** ** The translation is done in-place (since it is impossible for the ** correct UTF-8 encoding to be longer than a malformed encoding). */ int sqlite3Utf8To8(unsigned char *zIn){ unsigned char *zOut = zIn; unsigned char *zStart = zIn; unsigned char *zTerm; u32 c; while( zIn[0] ){ c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); if( c!=0xfffd ){ WRITE_UTF8(zOut, c); } } *zOut = 0; return zOut - zStart; }
/* ** Translate UTF-8 to UTF-8. ** ** This has the effect of making sure that the string is well-formed ** UTF-8. Miscoded characters are removed. ** ** The translation is done in-place (since it is impossible for the ** correct UTF-8 encoding to be longer than a malformed encoding). */ int sqlite3Utf8To8(unsigned char *zIn){ unsigned char *zOut = zIn; unsigned char *zStart = zIn; int c; while(1){ SQLITE_READ_UTF8(zIn, c); if( c==0 ) break; if( c!=0xfffd ){ WRITE_UTF8(zOut, c); } } *zOut = 0; return zOut - zStart; }
/* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3UtfSelfTest(void){ unsigned int i, t; unsigned char zBuf[20]; unsigned char *z; int n; unsigned int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = (int)(z-zBuf); assert( n>0 && n<=4 ); z[0] = 0; z = zBuf; c = sqlite3Utf8Read((const u8**)&z); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16LE(z, i); n = (int)(z-zBuf); assert( n>0 && n<=4 ); z[0] = 0; z = zBuf; READ_UTF16LE(z, 1, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16BE(z, i); n = (int)(z-zBuf); assert( n>0 && n<=4 ); z[0] = 0; z = zBuf; READ_UTF16BE(z, 1, c); assert( c==i ); assert( (z-zBuf)==n ); } }
/* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3UtfSelfTest(){ unsigned int i, t; unsigned char zBuf[20]; unsigned char *z; int n; unsigned int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = z-zBuf; z[0] = 0; z = zBuf; SQLITE_READ_UTF8(z, c); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16LE(z, i); n = z-zBuf; z[0] = 0; z = zBuf; READ_UTF16LE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<0xE000 ) continue; z = zBuf; WRITE_UTF16BE(z, i); n = z-zBuf; z[0] = 0; z = zBuf; READ_UTF16BE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } }
/** * Normalizing but non-stemming term copying. * * The original function would take 10 bytes from the front and 10 bytes from * the back if there were no digits in the string and it was more than 20 * bytes long. If there were digits involved that would decrease to 3 bytes * from the front and 3 from the back. This would potentially corrupt utf-8 * encoded characters, which is fine from the perspective of the FTS3 logic. * * In our revised form we now operate on a unicode character basis rather than * a byte basis. Additionally we use the same length limit even if there are * digits involved because it's not clear digit token-space reduction is saving * us from anything and could be hurting. Specifically, if no one is ever * going to search on things with digits, then we should just remove them. * Right now, the space reduction is going to increase false positives when * people do search on them and increase the number of collisions sufficiently * to make it really expensive. The caveat is there will be some increase in * index size which could be meaningful if people are receiving lots of emails * full of distinct numbers. * * In order to do the copy-from-the-front and copy-from-the-back trick, once * we reach N characters in, we set zFrontEnd to the current value of zOut * (which represents the termination of the first part of the result string) * and set zBackStart to the value of zOutStart. We then advanced zBackStart * along a character at a time as we write more characters. Once we have * traversed the entire string, if zBackStart > zFrontEnd, then we know * the string should be shrunk using the characters in the two ranges. * * (It would be faster to scan from the back with specialized logic but that * particular logic seems easy to screw up and we don't have unit tests in here * to the extent required.) * * @param zIn Input string to normalize and potentially shrink. * @param nBytesIn The number of bytes in zIn, distinct from the number of * unicode characters encoded in zIn. * @param zOut The string to write our output into. This must have at least * nBytesIn * MAX_UTF8_GROWTH_FACTOR in order to compensate for * normalization that results in a larger utf-8 encoding. * @param pnBytesOut Integer to write the number of bytes in zOut into. */ static void copy_stemmer(const unsigned char *zIn, const int nBytesIn, unsigned char *zOut, int *pnBytesOut){ const unsigned char *zInTerm = zIn + nBytesIn; unsigned char *zOutStart = zOut; unsigned int c; unsigned int charCount = 0; unsigned char *zFrontEnd = NULL, *zBackStart = NULL; unsigned int trashC; /* copy normalized character */ while (zIn < zInTerm) { READ_UTF8(zIn, zInTerm, c); c = normalize_character(c); /* ignore voiced/semi-voiced sound mark */ if (!isVoicedSoundMark(c)) { /* advance one non-voiced sound mark character. */ if (zBackStart) READ_UTF8(zBackStart, zOut, trashC); WRITE_UTF8(zOut, c); charCount++; if (charCount == COPY_STEMMER_COPY_HALF_LEN) { zFrontEnd = zOut; zBackStart = zOutStart; } } } /* if we need to shrink the string, transplant the back bytes */ if (zBackStart > zFrontEnd) { /* this handles when both are null too */ size_t backBytes = zOut - zBackStart; memmove(zFrontEnd, zBackStart, backBytes); zOut = zFrontEnd + backBytes; } *zOut = 0; *pnBytesOut = (int) zOut - (int) zOutStart; }
/* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3utfSelfTest(){ int i; unsigned char zBuf[20]; unsigned char *z; int n; int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = z-zBuf; z = zBuf; READ_UTF8(z, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<=0xE000 ) continue; z = zBuf; WRITE_UTF16LE(z, i); n = z-zBuf; z = zBuf; READ_UTF16LE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<=0xE000 ) continue; z = zBuf; WRITE_UTF16BE(z, i); n = z-zBuf; z = zBuf; READ_UTF16BE(z, c); assert( c==i ); assert( (z-zBuf)==n ); } }
/* ** This routine transforms the internal text encoding used by pMem to ** desiredEnc. It is an error if the string is already of the desired ** encoding, or if *pMem does not contain a string value. */ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ unsigned char zShort[NBFS]; /* Temporary short output buffer */ int len; /* Maximum length of output string in bytes */ unsigned char *zOut; /* Output buffer */ unsigned char *zIn; /* Input iterator */ unsigned char *zTerm; /* End of input */ unsigned char *z; /* Output iterator */ unsigned int c; assert( pMem->flags&MEM_Str ); assert( pMem->enc!=desiredEnc ); assert( pMem->enc!=0 ); assert( pMem->n>=0 ); #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "INPUT: %s\n", zBuf); } #endif /* If the translation is between UTF-16 little and big endian, then ** all that is required is to swap the byte order. This case is handled ** differently from the others. */ if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ u8 temp; int rc; rc = sqlite3VdbeMemMakeWriteable(pMem); if( rc!=SQLITE_OK ){ assert( rc==SQLITE_NOMEM ); return SQLITE_NOMEM; } zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; while( zIn<zTerm ){ temp = *zIn; *zIn = *(zIn+1); zIn++; *zIn++ = temp; } pMem->enc = desiredEnc; goto translate_out; } /* Set len to the maximum number of bytes required in the output buffer. */ if( desiredEnc==SQLITE_UTF8 ){ /* When converting from UTF-16, the maximum growth results from ** translating a 2-byte character to a 4-byte UTF-8 character. ** A single byte is required for the output string ** nul-terminator. */ len = pMem->n * 2 + 1; }else{ /* When converting from UTF-8 to UTF-16 the maximum growth is caused ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 ** character. Two bytes are required in the output buffer for the ** nul-terminator. */ len = pMem->n * 2 + 2; } /* Set zIn to point at the start of the input buffer and zTerm to point 1 ** byte past the end. ** ** Variable zOut is set to point at the output buffer. This may be space ** obtained from malloc(), or Mem.zShort, if it large enough and not in ** use, or the zShort array on the stack (see above). */ zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; if( len>NBFS ){ zOut = sqliteMallocRaw(len); if( !zOut ) return SQLITE_NOMEM; }else{ zOut = zShort; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, c); WRITE_UTF16LE(z, c); } }else{ assert( desiredEnc==SQLITE_UTF16BE ); /* UTF-8 -> UTF-16 Big-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, c); WRITE_UTF16BE(z, c); } } pMem->n = z - zOut; *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16LE(zIn, c); WRITE_UTF8(z, c); } }else{ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16BE(zIn, c); WRITE_UTF8(z, c); } } pMem->n = z - zOut; } *z = 0; assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); sqlite3VdbeMemRelease(pMem); pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); pMem->enc = desiredEnc; if( zOut==zShort ){ memcpy(pMem->zShort, zOut, len); zOut = (u8*)pMem->zShort; pMem->flags |= (MEM_Term|MEM_Short); }else{ pMem->flags |= (MEM_Term|MEM_Dyn); } pMem->z = (char*)zOut; translate_out: #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "OUTPUT: %s\n", zBuf); } #endif return SQLITE_OK; }
/* ** Extract the next token from a tokenization cursor. The cursor must ** have been opened by a prior call to simpleOpen(). */ static int unicodeNext( sqlite3_tokenizer_cursor *pC, /* Cursor returned by simpleOpen */ const char **paToken, /* OUT: Token text */ int *pnToken, /* OUT: Number of bytes at *paToken */ int *piStart, /* OUT: Starting offset of token */ int *piEnd, /* OUT: Ending offset of token */ int *piPos /* OUT: Position integer of token */ ){ unicode_cursor *pCsr = (unicode_cursor *)pC; unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer); int iCode; char *zOut; const unsigned char *z = &pCsr->aInput[pCsr->iOff]; const unsigned char *zStart = z; const unsigned char *zEnd; const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput]; /* Scan past any delimiter characters before the start of the next token. ** Return SQLITE_DONE early if this takes us all the way to the end of ** the input. */ while( z<zTerm ){ READ_UTF8(z, zTerm, iCode); if( unicodeIsAlnum(p, iCode) ) break; zStart = z; } if( zStart>=zTerm ) return SQLITE_DONE; zOut = pCsr->zToken; do { int iOut; /* Grow the output buffer if required. */ if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){ char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64); if( !zNew ) return SQLITE_NOMEM; zOut = &zNew[zOut - pCsr->zToken]; pCsr->zToken = zNew; pCsr->nAlloc += 64; } /* Write the folded case of the last character read to the output */ zEnd = z; iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic); if( iOut ){ WRITE_UTF8(zOut, iOut); } /* If the cursor is not at EOF, read the next character */ if( z>=zTerm ) break; READ_UTF8(z, zTerm, iCode); }while( unicodeIsAlnum(p, iCode) || sqlite3FtsUnicodeIsdiacritic(iCode) ); if ( pCsr->pStemmer!=NULL ) { SN_set_current(pCsr->pStemmer, (int)(zOut - pCsr->zToken), (unsigned char *)pCsr->zToken); if ( p->stemmer.stem(pCsr->pStemmer)<0 ) { *paToken = pCsr->zToken; *pnToken = (int)(zOut - pCsr->zToken); }else { pCsr->pStemmer->p[pCsr->pStemmer->l] = '\0'; *paToken = (char *)pCsr->pStemmer->p; *pnToken = pCsr->pStemmer->l; } }else { *paToken = pCsr->zToken; *pnToken = (int)(zOut - pCsr->zToken); } /* Set the output variables and return. */ pCsr->iOff = (int)(z - pCsr->aInput); *piStart = (int)(zStart - pCsr->aInput); *piEnd = (int)(zEnd - pCsr->aInput); *piPos = pCsr->iToken++; return SQLITE_OK; }
/* ** This routine transforms the internal text encoding used by pMem to ** desiredEnc. It is an error if the string is already of the desired ** encoding, or if *pMem does not contain a string value. */ SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ int len; /* Maximum length of output string in bytes */ unsigned char *zOut; /* Output buffer */ unsigned char *zIn; /* Input iterator */ unsigned char *zTerm; /* End of input */ unsigned char *z; /* Output iterator */ unsigned int c; assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) ); assert( pMem->flags&MEM_Str ); assert( pMem->enc!=desiredEnc ); assert( pMem->enc!=0 ); assert( pMem->n>=0 ); #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "INPUT: %s\n", zBuf); } #endif /* If the translation is between UTF-16 little and big endian, then ** all that is required is to swap the byte order. This case is handled ** differently from the others. */ if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ u8 temp; int rc; rc = sqlite3VdbeMemMakeWriteable(pMem); if( rc!=SQLITE_OK ){ assert( rc==SQLITE_NOMEM ); return SQLITE_NOMEM; } zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n&~1]; while( zIn<zTerm ){ temp = *zIn; *zIn = *(zIn+1); zIn++; *zIn++ = temp; } pMem->enc = desiredEnc; goto translate_out; } /* Set len to the maximum number of bytes required in the output buffer. */ if( desiredEnc==SQLITE_UTF8 ){ /* When converting from UTF-16, the maximum growth results from ** translating a 2-byte character to a 4-byte UTF-8 character. ** A single byte is required for the output string ** nul-terminator. */ pMem->n &= ~1; len = pMem->n * 2 + 1; }else{ /* When converting from UTF-8 to UTF-16 the maximum growth is caused ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 ** character. Two bytes are required in the output buffer for the ** nul-terminator. */ len = pMem->n * 2 + 2; } /* Set zIn to point at the start of the input buffer and zTerm to point 1 ** byte past the end. ** ** Variable zOut is set to point at the output buffer, space obtained ** from sqlite3_malloc(). */ zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; zOut = sqlite3DbMallocRaw(pMem->db, len); if( !zOut ){ return SQLITE_NOMEM; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, zTerm, c); WRITE_UTF16LE(z, c); } }else{ assert( desiredEnc==SQLITE_UTF16BE ); /* UTF-8 -> UTF-16 Big-endian */ while( zIn<zTerm ){ READ_UTF8(zIn, zTerm, c); WRITE_UTF16BE(z, c); } } pMem->n = (int)(z - zOut); *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16LE(zIn, zIn<zTerm, c); WRITE_UTF8(z, c); } }else{ /* UTF-16 Big-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16BE(zIn, zIn<zTerm, c); WRITE_UTF8(z, c); } } pMem->n = (int)(z - zOut); } *z = 0; assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); c = pMem->flags; sqlite3VdbeMemRelease(pMem); pMem->flags = MEM_Str|MEM_Term|(c&MEM_AffMask); pMem->enc = desiredEnc; pMem->z = (char*)zOut; pMem->zMalloc = pMem->z; pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z); translate_out: #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "OUTPUT: %s\n", zBuf); } #endif return SQLITE_OK; }
static int fts5UnicodeTokenize( Fts5Tokenizer *pTokenizer, void *pCtx, const char *pText, int nText, int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd) ){ Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer; int rc = SQLITE_OK; unsigned char *a = p->aTokenChar; unsigned char *zTerm = (unsigned char*)&pText[nText]; unsigned char *zCsr = (unsigned char *)pText; /* Output buffer */ char *aFold = p->aFold; int nFold = p->nFold; const char *pEnd = &aFold[nFold-6]; /* Each iteration of this loop gobbles up a contiguous run of separators, ** then the next token. */ while( rc==SQLITE_OK ){ int iCode; /* non-ASCII codepoint read from input */ char *zOut = aFold; int is; int ie; /* Skip any separator characters. */ while( 1 ){ if( zCsr>=zTerm ) goto tokenize_done; if( *zCsr & 0x80 ) { /* A character outside of the ascii range. Skip past it if it is ** a separator character. Or break out of the loop if it is not. */ is = zCsr - (unsigned char*)pText; READ_UTF8(zCsr, zTerm, iCode); if( fts5UnicodeIsAlnum(p, iCode) ){ goto non_ascii_tokenchar; } }else{ if( a[*zCsr] ){ is = zCsr - (unsigned char*)pText; goto ascii_tokenchar; } zCsr++; } } /* Run through the tokenchars. Fold them into the output buffer along ** the way. */ while( zCsr<zTerm ){ /* Grow the output buffer so that there is sufficient space to fit the ** largest possible utf-8 character. */ if( zOut>pEnd ){ aFold = sqlite3_malloc(nFold*2); if( aFold==0 ){ rc = SQLITE_NOMEM; goto tokenize_done; } zOut = &aFold[zOut - p->aFold]; memcpy(aFold, p->aFold, nFold); sqlite3_free(p->aFold); p->aFold = aFold; p->nFold = nFold = nFold*2; pEnd = &aFold[nFold-6]; } if( *zCsr & 0x80 ){ /* An non-ascii-range character. Fold it into the output buffer if ** it is a token character, or break out of the loop if it is not. */ READ_UTF8(zCsr, zTerm, iCode); if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){ non_ascii_tokenchar: iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); if( iCode ) WRITE_UTF8(zOut, iCode); }else{ break; } }else if( a[*zCsr]==0 ){ /* An ascii-range separator character. End of token. */ break; }else{ ascii_tokenchar: if( *zCsr>='A' && *zCsr<='Z' ){ *zOut++ = *zCsr + 32; }else{ *zOut++ = *zCsr; } zCsr++; } ie = zCsr - (unsigned char*)pText; } /* Invoke the token callback */ rc = xToken(pCtx, aFold, zOut-aFold, is, ie); } tokenize_done: if( rc==SQLITE_DONE ) rc = SQLITE_OK; return rc; }
/* ** This routine transforms the internal text encoding used by pMem to ** desiredEnc. It is an error if the string is already of the desired ** encoding, or if *pMem does not contain a string value. */ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ unsigned char zShort[NBFS]; /* Temporary short output buffer */ int len; /* Maximum length of output string in bytes */ unsigned char *zOut; /* Output buffer */ unsigned char *zIn; /* Input iterator */ unsigned char *zTerm; /* End of input */ unsigned char *z; /* Output iterator */ unsigned int c; assert( pMem->flags&MEM_Str ); assert( pMem->enc!=desiredEnc ); assert( pMem->enc!=0 ); assert( pMem->n>=0 ); #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "INPUT: %s\n", zBuf); } #endif /* If the translation is between UTF-16 little and big endian, then ** all that is required is to swap the byte order. This case is handled ** differently from the others. */ if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ u8 temp; int rc; rc = sqlite3VdbeMemMakeWriteable(pMem); if( rc!=SQLITE_OK ){ assert( rc==SQLITE_NOMEM ); return SQLITE_NOMEM; } zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; while( zIn<zTerm ){ temp = *zIn; *zIn = *(zIn+1); zIn++; *zIn++ = temp; } pMem->enc = desiredEnc; goto translate_out; } /* Set len to the maximum number of bytes required in the output buffer. */ if( desiredEnc==SQLITE_UTF8 ){ /* When converting from UTF-16, the maximum growth results from ** translating a 2-byte character to a 4-byte UTF-8 character. ** A single byte is required for the output string ** nul-terminator. */ len = pMem->n * 2 + 1; }else{ /* When converting from UTF-8 to UTF-16 the maximum growth is caused ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 ** character. Two bytes are required in the output buffer for the ** nul-terminator. */ len = pMem->n * 2 + 2; } /* Set zIn to point at the start of the input buffer and zTerm to point 1 ** byte past the end. ** ** Variable zOut is set to point at the output buffer. This may be space ** obtained from malloc(), or Mem.zShort, if it large enough and not in ** use, or the zShort array on the stack (see above). */ zIn = (u8*)pMem->z; zTerm = &zIn[pMem->n]; if( len>NBFS ){ zOut = sqliteMallocRaw(len); if( !zOut ) return SQLITE_NOMEM; }else{ zOut = zShort; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ unsigned int iExtra = 0xD800; if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){ /* This UTF8 string is not nul-terminated, and the last byte is ** not a character in the ascii range (codpoints 0..127). This ** means the SQLITE_READ_UTF8() macro might read past the end ** of the allocated buffer. ** ** There are four possibilities: ** ** 1. The last byte is the first byte of a non-ASCII character, ** ** 2. The final N bytes of the input string are continuation bytes ** and immediately preceding them is the first byte of a ** non-ASCII character. ** ** 3. The final N bytes of the input string are continuation bytes ** and immediately preceding them is a byte that encodes a ** character in the ASCII range. ** ** 4. The entire string consists of continuation characters. ** ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8() ** macro will not overread the buffer in these cases. */ unsigned char *zExtra = &zTerm[-1]; while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){ zExtra--; } if( (zExtra[0]&0xC0)==0xC0 ){ /* Make a copy of the last character encoding in the input string. ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8() ** to decode the codepoint. Store the codepoint in variable iExtra, ** it will be appended to the output string later. */ unsigned char *zFree = 0; unsigned char zBuf[16]; int nExtra = (pMem->n+zIn-zExtra); zTerm = zExtra; if( nExtra>15 ){ zExtra = sqliteMallocRaw(nExtra+1); if( !zExtra ){ return SQLITE_NOMEM; } zFree = zExtra; }else{ zExtra = zBuf; } memcpy(zExtra, zTerm, nExtra); zExtra[nExtra] = '\0'; SQLITE_READ_UTF8(zExtra, iExtra); sqliteFree(zFree); } } if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn<zTerm ){ SQLITE_READ_UTF8(zIn, c); WRITE_UTF16LE(z, c); } if( iExtra!=0xD800 ){ WRITE_UTF16LE(z, iExtra); } }else{ assert( desiredEnc==SQLITE_UTF16BE ); /* UTF-8 -> UTF-16 Big-endian */ while( zIn<zTerm ){ SQLITE_READ_UTF8(zIn, c); WRITE_UTF16BE(z, c); } if( iExtra!=0xD800 ){ WRITE_UTF16BE(z, iExtra); } } pMem->n = z - zOut; *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16LE(zIn, c); WRITE_UTF8(z, c); } }else{ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ READ_UTF16BE(zIn, c); WRITE_UTF8(z, c); } } pMem->n = z - zOut; } *z = 0; assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); sqlite3VdbeMemRelease(pMem); pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); pMem->enc = desiredEnc; if( zOut==zShort ){ memcpy(pMem->zShort, zOut, len); zOut = (u8*)pMem->zShort; pMem->flags |= (MEM_Term|MEM_Short); }else{ pMem->flags |= (MEM_Term|MEM_Dyn); } pMem->z = (char*)zOut; translate_out: #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) { char zBuf[100]; sqlite3VdbeMemPrettyPrint(pMem, zBuf); fprintf(stderr, "OUTPUT: %s\n", zBuf); } #endif return SQLITE_OK; }