/*
** Handle a tokenchars= or separators= option from the CREATE VIRTUAL
** TABLE statement: every codepoint in the string zIn/nIn is to be treated
** as a token character (bAlnum==1) or as a separator (bAlnum==0).
**
** A codepoint for which sqlite3FtsUnicodeIsalnum() already returns bAlnum
** needs no action.  Any other codepoint is inserted, in ascending order,
** into the unicode_tokenizer.aiException[] array.  For tokenization
** purposes the result of sqlite3FtsUnicodeIsalnum() is inverted for every
** codepoint listed in aiException[].
**
** Codepoints that sqlite3FtsUnicodeIsdiacritic() reports as diacritics
** are skipped; the tokenizer's handling of those cannot be overridden.
**
** Returns SQLITE_OK on success, or SQLITE_NOMEM if the exception array
** cannot be enlarged (in which case the tokenizer is left unchanged).
*/
static int unicodeAddExceptions(
  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
  int bAlnum,                     /* Replace Isalnum() return value with this */
  const char *zIn,                /* Array of characters to make exceptions */
  int nIn                         /* Length of z in bytes */
){
/* True if codepoint X must be recorded as an exception */
#define UNICODE_NEEDS_EXCEPTION(X) \
    (sqlite3FtsUnicodeIsalnum(X)!=bAlnum && sqlite3FtsUnicodeIsdiacritic(X)==0)

  const unsigned char *zCsr = (const unsigned char *)zIn;
  const unsigned char *zTerm = &zCsr[nIn];
  int iCode;
  int nExtra = 0;                 /* Number of exceptions zIn will add */
  int *aList;                     /* Enlarged aiException[] array */
  int nList;                      /* Number of valid entries in aList[] */

  assert( bAlnum==0 || bAlnum==1 );

  /* Pass 1: count how many new entries are needed. */
  while( zCsr<zTerm ){
    READ_UTF8(zCsr, zTerm, iCode);
    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
    if( UNICODE_NEEDS_EXCEPTION(iCode) ){
      nExtra++;
    }
  }
  if( nExtra==0 ){
    return SQLITE_OK;
  }

  aList = sqlite3_realloc(p->aiException, (p->nException+nExtra)*sizeof(int));
  if( aList==0 ){
    return SQLITE_NOMEM;
  }
  nList = p->nException;

  /* Pass 2: insert each exception at its sorted position. */
  zCsr = (const unsigned char *)zIn;
  while( zCsr<zTerm ){
    int iPos;
    int iMove;

    READ_UTF8(zCsr, zTerm, iCode);
    if( !UNICODE_NEEDS_EXCEPTION(iCode) ){
      continue;
    }

    /* Locate the first entry that is not smaller than iCode */
    iPos = 0;
    while( iPos<nList && aList[iPos]<iCode ){
      iPos++;
    }
    /* Shift the tail up by one slot to open a gap at iPos */
    iMove = nList;
    while( iMove>iPos ){
      aList[iMove] = aList[iMove-1];
      iMove--;
    }
    aList[iPos] = iCode;
    nList++;
  }

  p->aiException = aList;
  p->nException = nList;
  return SQLITE_OK;

#undef UNICODE_NEEDS_EXCEPTION
}
/*
** Decode the UTF-8 text at str into an array of code points.
**
** Characters are read with READ_UTF8() (passing 0 as the terminator
** pointer) until a character that decodes to 0 is seen.  Each non-zero
** code point is appended to buffer[].  No terminating 0 is written to
** buffer[], and the caller must supply a buffer large enough for every
** decoded character.
*/
static void decode_utf8(const unsigned char* str, int* buffer) {
  int iCode;
  for (;;) {
    READ_UTF8(str, 0, iCode);
    if (iCode == 0) {
      break;
    }
    *buffer = iCode;
    buffer++;
  }
}
/*
** Decode the single UTF-8 character that starts at z and return its
** code point.  zTerm is handed to READ_UTF8() as the end-of-input
** marker.  On return *pzNext points at the first byte following the
** character that was consumed.
*/
int sqlite3Utf8Read(
  const unsigned char *z,         /* First byte of UTF-8 character */
  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
){
  int iCode;                      /* Decoded code point */

  READ_UTF8(z, zTerm, iCode);
  *pzNext = z;                    /* READ_UTF8() advanced z past the char */
  return iCode;
}
/*
** Return the number of characters (not bytes) in the UTF-8 text at str.
**
** Characters are decoded one at a time with READ_UTF8() (passing 0 as
** the terminator pointer); counting stops at the first character that
** decodes to 0, which is not itself counted.
*/
static sqlite_uint64 strlen_utf8(const unsigned char* str) {
  sqlite_uint64 nChar = 0;
  int iCode;
  for (;;) {
    READ_UTF8(str, 0, iCode);
    if (iCode == 0) {
      break;
    }
    nChar++;
  }
  return nChar;
}
/** * Normalizing but non-stemming term copying. * * The original function would take 10 bytes from the front and 10 bytes from * the back if there were no digits in the string and it was more than 20 * bytes long. If there were digits involved that would decrease to 3 bytes * from the front and 3 from the back. This would potentially corrupt utf-8 * encoded characters, which is fine from the perspective of the FTS3 logic. * * In our revised form we now operate on a unicode character basis rather than * a byte basis. Additionally we use the same length limit even if there are * digits involved because it's not clear digit token-space reduction is saving * us from anything and could be hurting. Specifically, if no one is ever * going to search on things with digits, then we should just remove them. * Right now, the space reduction is going to increase false positives when * people do search on them and increase the number of collisions sufficiently * to make it really expensive. The caveat is there will be some increase in * index size which could be meaningful if people are receiving lots of emails * full of distinct numbers. * * In order to do the copy-from-the-front and copy-from-the-back trick, once * we reach N characters in, we set zFrontEnd to the current value of zOut * (which represents the termination of the first part of the result string) * and set zBackStart to the value of zOutStart. We then advanced zBackStart * along a character at a time as we write more characters. Once we have * traversed the entire string, if zBackStart > zFrontEnd, then we know * the string should be shrunk using the characters in the two ranges. * * (It would be faster to scan from the back with specialized logic but that * particular logic seems easy to screw up and we don't have unit tests in here * to the extent required.) * * @param zIn Input string to normalize and potentially shrink. 
* @param nBytesIn The number of bytes in zIn, distinct from the number of * unicode characters encoded in zIn. * @param zOut The string to write our output into. This must have at least * nBytesIn * MAX_UTF8_GROWTH_FACTOR in order to compensate for * normalization that results in a larger utf-8 encoding. * @param pnBytesOut Integer to write the number of bytes in zOut into. */ static void copy_stemmer(const unsigned char *zIn, const int nBytesIn, unsigned char *zOut, int *pnBytesOut){ const unsigned char *zInTerm = zIn + nBytesIn; unsigned char *zOutStart = zOut; unsigned int c; unsigned int charCount = 0; unsigned char *zFrontEnd = NULL, *zBackStart = NULL; unsigned int trashC; /* copy normalized character */ while (zIn < zInTerm) { READ_UTF8(zIn, zInTerm, c); c = normalize_character(c); /* ignore voiced/semi-voiced sound mark */ if (!isVoicedSoundMark(c)) { /* advance one non-voiced sound mark character. */ if (zBackStart) READ_UTF8(zBackStart, zOut, trashC); WRITE_UTF8(zOut, c); charCount++; if (charCount == COPY_STEMMER_COPY_HALF_LEN) { zFrontEnd = zOut; zBackStart = zOutStart; } } } /* if we need to shrink the string, transplant the back bytes */ if (zBackStart > zFrontEnd) { /* this handles when both are null too */ size_t backBytes = zOut - zBackStart; memmove(zFrontEnd, zBackStart, backBytes); zOut = zFrontEnd + backBytes; } *zOut = 0; *pnBytesOut = (int) zOut - (int) zOutStart; }
/*
** Register the characters of the nul-terminated UTF-8 string z as
** exceptions for tokenizer p.
**
** ASCII-range codepoints (less than 128) are recorded directly in the
** p->aTokenChar[] table.  For any other codepoint, if the value returned
** by sqlite3Fts5UnicodeIsalnum() differs from bTokenChars and the
** codepoint is not a diacritic, it is inserted in ascending order into
** p->aiException[].
**
** Returns SQLITE_OK, or SQLITE_NOMEM if the exception array cannot be
** enlarged.  An empty string is a no-op.
*/
static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */
  const char *z,                  /* Characters to treat as exceptions */
  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
){
  int nByte = (int)strlen(z);
  int *aExc;                      /* Enlarged exception array */
  int nExc;                       /* Valid entries in aExc[] */
  const unsigned char *zIn;
  const unsigned char *zEnd;

  if( nByte<=0 ){
    return SQLITE_OK;
  }

  /* Each codepoint occupies at least one byte of z, so nByte extra slots
  ** is always enough. */
  aExc = (int*)sqlite3_realloc(p->aiException,
                               (nByte+p->nException)*sizeof(int));
  if( aExc==0 ){
    return SQLITE_NOMEM;
  }

  nExc = p->nException;
  zIn = (const unsigned char*)z;
  zEnd = (const unsigned char*)&z[nByte];
  while( zIn<zEnd ){
    int iCode;
    int bToken;
    int iPos;

    READ_UTF8(zIn, zEnd, iCode);
    if( iCode<128 ){
      p->aTokenChar[iCode] = bTokenChars;
      continue;
    }

    bToken = sqlite3Fts5UnicodeIsalnum(iCode);
    assert( (bToken==0 || bToken==1) );
    assert( (bTokenChars==0 || bTokenChars==1) );
    if( bToken==bTokenChars ){
      continue;                   /* Already classified as requested */
    }
    if( sqlite3Fts5UnicodeIsdiacritic(iCode)!=0 ){
      continue;                   /* Diacritics cannot be overridden */
    }

    /* Insert ahead of the first entry that is larger than iCode */
    iPos = 0;
    while( iPos<nExc && aExc[iPos]<=iCode ){
      iPos++;
    }
    memmove(&aExc[iPos+1], &aExc[iPos], (nExc-iPos)*sizeof(int));
    aExc[iPos] = iCode;
    nExc++;
  }

  p->aiException = aExc;
  p->nException = nExc;
  return SQLITE_OK;
}
/*
** Self-test invoked by the TCL test function "translate_selftest".
**
** For every value below 0x110000 the routine serializes the value with a
** WRITE_xxx macro, deserializes it again with the matching READ_xxx macro
** and asserts that (a) the expected code point comes back and (b) the
** reader consumed exactly as many bytes as the writer produced.
**
** In the UTF-8 pass, surrogates (0xD800..0xDFFF) and values of the form
** 0x..FFFE/0x..FFFF are expected to read back as 0xFFFD.  The UTF-16
** passes skip the range 0xD800..0xE000 entirely.
**
** NOTE(review): the UTF-16 passes also skip U+E000, which is not a
** surrogate - confirm whether that is intentional for the READ_UTF16xx
** macros in use before tightening the bound.
*/
void sqlite3utfSelfTest(){
  unsigned char zBuf[20];         /* Scratch space for one encoded char */
  unsigned char *zCsr;            /* Read/write cursor into zBuf[] */
  unsigned int cp;                /* Code point under test */
  unsigned int want;              /* Value expected from the reader */
  unsigned int got;               /* Value actually decoded */
  int nByte;                      /* Bytes produced by the writer */

  /* UTF-8 round trip */
  for(cp=0; cp<0x00110000; cp++){
    zCsr = zBuf;
    WRITE_UTF8(zCsr, cp);
    nByte = zCsr-zBuf;
    zCsr = zBuf;
    READ_UTF8(zCsr, got);
    if( (cp>=0xD800 && cp<=0xDFFF) || (cp&0xFFFFFFFE)==0xFFFE ){
      want = 0xFFFD;
    }else{
      want = cp;
    }
    assert( got==want );
    assert( (zCsr-zBuf)==nByte );
  }

  /* UTF-16 little-endian round trip */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>0xE000 ){
      zCsr = zBuf;
      WRITE_UTF16LE(zCsr, cp);
      nByte = zCsr-zBuf;
      zCsr = zBuf;
      READ_UTF16LE(zCsr, got);
      assert( got==cp );
      assert( (zCsr-zBuf)==nByte );
    }
  }

  /* UTF-16 big-endian round trip */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>0xE000 ){
      zCsr = zBuf;
      WRITE_UTF16BE(zCsr, cp);
      nByte = zCsr-zBuf;
      zCsr = zBuf;
      READ_UTF16BE(zCsr, got);
      assert( got==cp );
      assert( (zCsr-zBuf)==nByte );
    }
  }
}
static int isDelim( const unsigned char *zCur, /* IN: current pointer of token */ const unsigned char *zTerm, /* IN: one character beyond end of token */ int *len, /* OUT: analyzed bytes in this token */ int *state /* IN/OUT: analyze state */ ){ const unsigned char *zIn = zCur; unsigned int c; int delim; /* get the unicode character to analyze */ READ_UTF8(zIn, zTerm, c); c = normalize_character(c); *len = (int) zIn - (int) zCur; /* ASCII character range has rule */ if( c < 0x80 ){ // This is original porter stemmer isDelim logic. // 0x0 - 0x1f are all control characters, 0x20 is space, 0x21-0x2f are // punctuation. delim = (c < 0x30 || !porterIdChar[c - 0x30]); // cases: "&a", "&." if (*state == BIGRAM_USE || *state == BIGRAM_UNKNOWN ){ /* previous maybe CJK and current is ascii */ *state = BIGRAM_ALPHA; /*ascii*/ delim = 1; /* must break */ } else if (delim == 1) { // cases: "a.", ".." /* this is delimiter character */ *state = BIGRAM_RESET; /*reset*/ } else { // cases: "aa", ".a" *state = BIGRAM_ALPHA; /*ascii*/ } return delim; } // (at this point we must be a non-ASCII character) /* voiced/semi-voiced sound mark is ignore */ if (isVoicedSoundMark(c) && *state != BIGRAM_ALPHA) { /* ignore this because it is combined with previous char */ return 0; } /* this isn't CJK range, so return as no delim */ // Anything less than 0x2000 (except to U+0E00-U+0EFF and U+1780-U+17FF) // is the general scripts area and should not be bi-gram indexed. // 0xa000 - 0a4cf is the Yi area. It is apparently a phonetic language whose // usage does not appear to have simple delimeter rules, so we're leaving it // as bigram processed. This is a guess, if you know better, let us know. // (We previously bailed on this range too.) // Addition, U+0E00-U+0E7F is Thai, U+0E80-U+0EFF is Laos, // and U+1780-U+17FF is Khmer. It is no easy way to break each word. // So these should use bi-gram too. 
// cases: "aa", ".a", "&a" if (c < 0xe00 || (c >= 0xf00 && c < 0x1780) || (c >= 0x1800 && c < 0x2000)) { *state = BIGRAM_ALPHA; /* not really ASCII but same idea; tokenize it */ return 0; } // (at this point we must be a bi-grammable char or delimiter) /* this is space character or delim character */ // cases: "a.", "..", "&." if( IS_UNI_SPACE(c) || IS_JA_DELIM(c) ){ *state = BIGRAM_RESET; /* reset */ return 1; /* it actually is a delimiter; report as such */ } // (at this point we must be a bi-grammable char) // cases: "a&" if( *state==BIGRAM_ALPHA ){ /* Previous is ascii and current maybe CJK */ *state = BIGRAM_UNKNOWN; /* mark as unknown */ return 1; /* break to emit the ASCII token*/ } /* We have no rule for CJK!. use bi-gram */ // cases: "&&" if( *state==BIGRAM_UNKNOWN || *state==BIGRAM_USE ){ /* previous state is unknown. mark as bi-gram */ *state = BIGRAM_USE; return 1; /* break to emit the digram */ } // cases: ".&" (*state == BIGRAM_RESET) *state = BIGRAM_UNKNOWN; /* mark as unknown */ return 0; /* no need to break; nothing to emit */ }
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut and write
** the size of the output word (exclusive of the '\0' terminator) into
** *pnOut.
**
** Each input character is decoded with READ_UTF8() and passed through
** normalize_character() before it is examined.
**
** The Porter algorithm is only applied when the word is between 3 and 20
** bytes long (inclusive) and every normalized character is in [a-z].
** In all other cases (word too short, word too long, or any character
** outside [a-z] after normalization, which includes digits) the work is
** delegated to copy_stemmer() and this routine returns immediately.
**
** When stemming is applied the word is first stored reversed in the local
** zReverse[] buffer, the suffix-stripping steps operate on that reversed
** form (which is why the suffix literals below are spelled backwards),
** and the result is flipped back into forward order in zOut.
**
** NOTE(review): on the Porter path zOut needs room for the stemmed word
** plus a terminator.  On the fallback path the space requirement is
** whatever copy_stemmer() demands of its output buffer - verify that
** callers size zOut for that case.
*/
static void porter_stemmer(
  const unsigned char *zIn,
  unsigned int nIn,
  unsigned char *zOut,
  int *pnOut
){
  unsigned int i, j, c;
  char zReverse[28];   /* Reversed word; last 5 bytes are a zero pad */
  char *z, *z2;
  const unsigned char *zTerm = zIn + nIn;
  const unsigned char *zTmp = zIn;

  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Fill zReverse[] from index sizeof(zReverse)-6 downwards, one slot per
  ** decoded character, so the word ends up stored back to front. */
  for (j = sizeof(zReverse) - 6; zTmp < zTerm; j--) {
    READ_UTF8(zTmp, zTerm, c);
    c = normalize_character(c);
    if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* Zero the tail so the reversed word is terminated and the fixed-offset
  ** look-aheads below (z[1]..z[3]) read zeros rather than garbage. */
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  z = &zReverse[j+1];   /* z[0] is the last character of the input word */

  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2 */
  switch( z[1] ){
   case 'a':
     (void) (stem(&z, "lanoita", "ate", m_gt_0) ||
             stem(&z, "lanoit", "tion", m_gt_0));
     break;
   case 'c':
     (void) (stem(&z, "icne", "ence", m_gt_0) ||
             stem(&z, "icna", "ance", m_gt_0));
     break;
   case 'e':
     (void) (stem(&z, "rezi", "ize", m_gt_0));
     break;
   case 'g':
     (void) (stem(&z, "igol", "log", m_gt_0));
     break;
   case 'l':
     (void) (stem(&z, "ilb", "ble", m_gt_0) ||
             stem(&z, "illa", "al", m_gt_0) ||
             stem(&z, "iltne", "ent", m_gt_0) ||
             stem(&z, "ile", "e", m_gt_0) ||
             stem(&z, "ilsuo", "ous", m_gt_0));
     break;
   case 'o':
     (void) (stem(&z, "noitazi", "ize", m_gt_0) ||
             stem(&z, "noita", "ate", m_gt_0) ||
             stem(&z, "rota", "ate", m_gt_0));
     break;
   case 's':
     (void) (stem(&z, "msila", "al", m_gt_0) ||
             stem(&z, "ssenevi", "ive", m_gt_0) ||
             stem(&z, "ssenluf", "ful", m_gt_0) ||
             stem(&z, "ssensuo", "ous", m_gt_0));
     break;
   case 't':
     (void) (stem(&z, "itila", "al", m_gt_0) ||
             stem(&z, "itivi", "ive", m_gt_0) ||
             stem(&z, "itilib", "ble", m_gt_0));
     break;
  }

  /* Step 3 */
  switch( z[0] ){
   case 'e':
     (void) (stem(&z, "etaci", "ic", m_gt_0) ||
             stem(&z, "evita", "", m_gt_0)   ||
             stem(&z, "ezila", "al", m_gt_0));
     break;
   case 'i':
     (void) (stem(&z, "itici", "ic", m_gt_0));
     break;
   case 'l':
     (void) (stem(&z, "laci", "ic", m_gt_0) ||
             stem(&z, "luf", "", m_gt_0));
     break;
   case 's':
     (void) (stem(&z, "ssen", "", m_gt_0));
     break;
  }

  /* Step 4 */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         (void) (stem(&z, "tneme", "", m_gt_1) ||
                 stem(&z, "tnem", "", m_gt_1) ||
                 stem(&z, "tne", "", m_gt_1));
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       (void) (stem(&z, "noi", "", m_gt_1));
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     (void) (stem(&z, "eta", "", m_gt_1) ||
             stem(&z, "iti", "", m_gt_1));
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = (unsigned int) strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
/*
** This routine transforms the internal text encoding used by pMem to
** desiredEnc. It is an error if the string is already of the desired
** encoding, or if *pMem does not contain a string value.
**
** Three cases are handled:
**
**   * UTF-16LE <-> UTF-16BE: the value is made writeable and the bytes of
**     each 16-bit unit are swapped in place.
**   * UTF-8 -> UTF-16 and UTF-16 -> UTF-8: the text is converted into a
**     fresh buffer (the on-stack zShort[] array when it fits within NBFS
**     bytes, otherwise memory from sqliteMallocRaw()), the old value is
**     released and pMem is pointed at the new, nul-terminated text.
**
** A UTF-16 input whose byte count is odd has its trailing byte ignored,
** so that neither the byte-swap loop nor the UTF-16 readers touch memory
** beyond the end of the value.
**
** Returns SQLITE_OK on success or SQLITE_NOMEM if an allocation fails.
*/
int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
  unsigned char zShort[NBFS]; /* Temporary short output buffer */
  int len;                    /* Maximum length of output string in bytes */
  int nIn;                    /* Number of input bytes to convert */
  unsigned char *zOut;        /* Output buffer */
  unsigned char *zIn;         /* Input iterator */
  unsigned char *zTerm;       /* End of input */
  unsigned char *z;           /* Output iterator */
  unsigned int c;

  assert( pMem->flags&MEM_Str );
  assert( pMem->enc!=desiredEnc );
  assert( pMem->enc!=0 );
  assert( pMem->n>=0 );

#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "INPUT:  %s\n", zBuf);
  }
#endif

  /* If the translation is between UTF-16 little and big endian, then
  ** all that is required is to swap the byte order. This case is handled
  ** differently from the others.
  */
  if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
    u8 temp;
    int rc;
    rc = sqlite3VdbeMemMakeWriteable(pMem);
    if( rc!=SQLITE_OK ){
      assert( rc==SQLITE_NOMEM );
      return SQLITE_NOMEM;
    }
    zIn = (u8*)pMem->z;
    /* Round down to a whole number of 16-bit units: with an odd length
    ** the final swap would read and write one byte past the buffer. */
    zTerm = &zIn[pMem->n&~1];
    while( zIn<zTerm ){
      temp = *zIn;
      *zIn = *(zIn+1);
      zIn++;
      *zIn++ = temp;
    }
    pMem->enc = desiredEnc;
    goto translate_out;
  }

  /* Set len to the maximum number of bytes required in the output buffer. */
  nIn = pMem->n;
  if( desiredEnc==SQLITE_UTF8 ){
    /* When converting from UTF-16, the maximum growth results from
    ** translating a 2-byte character to a 4-byte UTF-8 character.
    ** A single byte is required for the output string
    ** nul-terminator.
    **
    ** The input is UTF-16 here, so drop any dangling odd byte.
    */
    nIn &= ~1;
    len = nIn * 2 + 1;
  }else{
    /* When converting from UTF-8 to UTF-16 the maximum growth is caused
    ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
    ** character. Two bytes are required in the output buffer for the
    ** nul-terminator.
    */
    len = nIn * 2 + 2;
  }

  /* Set zIn to point at the start of the input buffer and zTerm to point 1
  ** byte past the end.
  **
  ** Variable zOut is set to point at the output buffer. This may be space
  ** obtained from malloc(), or Mem.zShort, if it large enough and not in
  ** use, or the zShort array on the stack (see above).
  */
  zIn = (u8*)pMem->z;
  zTerm = &zIn[nIn];
  if( len>NBFS ){
    zOut = sqliteMallocRaw(len);
    if( !zOut ) return SQLITE_NOMEM;
  }else{
    zOut = zShort;
  }
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, c);
        WRITE_UTF16LE(z, c);
      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, c);
        WRITE_UTF16BE(z, c);
      }
    }
    pMem->n = z - zOut;
    *z++ = 0;   /* First byte of the two-byte UTF-16 terminator */
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16LE(zIn, c);
        WRITE_UTF8(z, c);
      }
    }else{
      /* UTF-16 Big-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16BE(zIn, c);
        WRITE_UTF8(z, c);
      }
    }
    pMem->n = z - zOut;
  }
  *z = 0;
  assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );

  sqlite3VdbeMemRelease(pMem);
  pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
  pMem->enc = desiredEnc;
  if( zOut==zShort ){
    /* The result lives on this function's stack: copy it into the Mem. */
    memcpy(pMem->zShort, zOut, len);
    zOut = (u8*)pMem->zShort;
    pMem->flags |= (MEM_Term|MEM_Short);
  }else{
    pMem->flags |= (MEM_Term|MEM_Dyn);
  }
  pMem->z = (char*)zOut;

translate_out:
#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "OUTPUT: %s\n", zBuf);
  }
#endif
  return SQLITE_OK;
}
/*
** Decode the UTF-8 character that begins at z using READ_UTF8() and
** return the resulting code point.  The caller's pointer is not
** advanced: z is a by-value copy.
*/
int sqlite3ReadUtf8(const unsigned char *z){
  int iCode;                      /* Decoded code point */

  READ_UTF8(z, iCode);
  return iCode;
}
/*
** Extract the next token from a tokenization cursor.
**
** Starting at the cursor's current offset, separator characters are
** skipped, then a run of token characters (plus any diacritics that
** follow them) is case-folded with sqlite3FtsUnicodeFold() and written
** to the cursor's zToken buffer, which is grown in 64-byte steps as
** required.  If the cursor has a stemmer attached, the folded token is
** handed to it and, unless the stemmer reports an error (negative
** return), the stemmer's output buffer is returned instead.
**
** *piStart and *piEnd receive the byte offsets of the token within the
** input; *piPos receives the token's ordinal position.
**
** Returns SQLITE_OK when a token is produced, SQLITE_DONE when the input
** is exhausted, or SQLITE_NOMEM if the token buffer cannot be enlarged.
*/
static int unicodeNext(
  sqlite3_tokenizer_cursor *pC,   /* Cursor returned by simpleOpen */
  const char **paToken,           /* OUT: Token text */
  int *pnToken,                   /* OUT: Number of bytes at *paToken */
  int *piStart,                   /* OUT: Starting offset of token */
  int *piEnd,                     /* OUT: Ending offset of token */
  int *piPos                      /* OUT: Position integer of token */
){
  unicode_cursor *pCsr = (unicode_cursor *)pC;
  unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
  const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
  const unsigned char *zStart = z;  /* First byte of the token */
  const unsigned char *zEnd;        /* One past last byte of the token */
  char *zOut;                       /* Write cursor into pCsr->zToken */
  int iCode;                        /* Most recently decoded codepoint */
  int nToken;                       /* Size of folded token in bytes */

  /* Scan past any delimiter characters before the start of the next token.
  ** Return SQLITE_DONE early if this takes us all the way to the end of
  ** the input.  */
  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    if( unicodeIsAlnum(p, iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ){
    return SQLITE_DONE;
  }

  zOut = pCsr->zToken;
  for(;;){
    int iOut;
    int nUsed = (int)(zOut - pCsr->zToken);

    /* Make sure there is room for one more (up to 4-byte) character. */
    if( nUsed>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( zNew==0 ){
        return SQLITE_NOMEM;
      }
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
      zOut = &zNew[nUsed];
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }

    /* Stop at EOF; otherwise read the next character and stop unless it
    ** continues the token. */
    if( z>=zTerm ){
      break;
    }
    READ_UTF8(z, zTerm, iCode);
    if( !unicodeIsAlnum(p, iCode) && !sqlite3FtsUnicodeIsdiacritic(iCode) ){
      break;
    }
  }
  nToken = (int)(zOut - pCsr->zToken);

  /* By default return the folded token; a successful stem replaces it. */
  *paToken = pCsr->zToken;
  *pnToken = nToken;
  if( pCsr->pStemmer!=NULL ){
    SN_set_current(pCsr->pStemmer, nToken, (unsigned char *)pCsr->zToken);
    if( p->stemmer.stem(pCsr->pStemmer)>=0 ){
      pCsr->pStemmer->p[pCsr->pStemmer->l] = '\0';
      *paToken = (char *)pCsr->pStemmer->p;
      *pnToken = pCsr->pStemmer->l;
    }
  }

  /* Set the remaining output variables and return. */
  pCsr->iOff = (int)(z - pCsr->aInput);
  *piStart = (int)(zStart - pCsr->aInput);
  *piEnd = (int)(zEnd - pCsr->aInput);
  *piPos = pCsr->iToken++;
  return SQLITE_OK;
}
/*
** This routine transforms the internal text encoding used by pMem to
** desiredEnc. It is an error if the string is already of the desired
** encoding, or if *pMem does not contain a string value.
**
** A UTF-16LE <-> UTF-16BE translation is done by swapping bytes in
** place.  Any translation to or from UTF-8 converts the text into a
** freshly allocated buffer, releases the old value and installs the new
** nul-terminated text in pMem.  A trailing odd byte of UTF-16 input is
** ignored.
**
** Returns SQLITE_OK on success or SQLITE_NOMEM if an allocation fails.
*/
SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
  sqlite3_int64 len;          /* Maximum length of output string in bytes */
  unsigned char *zOut;        /* Output buffer */
  unsigned char *zIn;         /* Input iterator */
  unsigned char *zTerm;       /* End of input */
  unsigned char *z;           /* Output iterator */
  unsigned int c;

  assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
  assert( pMem->flags&MEM_Str );
  assert( pMem->enc!=desiredEnc );
  assert( pMem->enc!=0 );
  assert( pMem->n>=0 );

#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "INPUT:  %s\n", zBuf);
  }
#endif

  /* If the translation is between UTF-16 little and big endian, then
  ** all that is required is to swap the byte order. This case is handled
  ** differently from the others.
  */
  if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
    u8 temp;
    int rc;
    rc = sqlite3VdbeMemMakeWriteable(pMem);
    if( rc!=SQLITE_OK ){
      assert( rc==SQLITE_NOMEM );
      return SQLITE_NOMEM;
    }
    zIn = (u8*)pMem->z;
    zTerm = &zIn[pMem->n&~1];   /* Whole 16-bit units only */
    while( zIn<zTerm ){
      temp = *zIn;
      *zIn = *(zIn+1);
      zIn++;
      *zIn++ = temp;
    }
    pMem->enc = desiredEnc;
    goto translate_out;
  }

  /* Set len to the maximum number of bytes required in the output buffer.
  ** The arithmetic is done in 64 bits: with pMem->n close to 2^30 the
  ** product would overflow a signed 32-bit int.
  */
  if( desiredEnc==SQLITE_UTF8 ){
    /* When converting from UTF-16, the maximum growth results from
    ** translating a 2-byte character to a 4-byte UTF-8 character.
    ** A single byte is required for the output string
    ** nul-terminator.
    */
    pMem->n &= ~1;
    len = 2 * (sqlite3_int64)pMem->n + 1;
  }else{
    /* When converting from UTF-8 to UTF-16 the maximum growth is caused
    ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
    ** character. Two bytes are required in the output buffer for the
    ** nul-terminator.
    */
    len = 2 * (sqlite3_int64)pMem->n + 2;
  }

  /* Set zIn to point at the start of the input buffer and zTerm to point 1
  ** byte past the end.
  **
  ** Variable zOut is set to point at the output buffer, space obtained
  ** from sqlite3_malloc().
  */
  zIn = (u8*)pMem->z;
  zTerm = &zIn[pMem->n];
  zOut = sqlite3DbMallocRaw(pMem->db, len);
  if( !zOut ){
    return SQLITE_NOMEM;
  }
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, zTerm, c);
        WRITE_UTF16LE(z, c);
      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, zTerm, c);
        WRITE_UTF16BE(z, c);
      }
    }
    pMem->n = (int)(z - zOut);
    *z++ = 0;   /* First byte of the two-byte UTF-16 terminator */
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16LE(zIn, zIn<zTerm, c);
        WRITE_UTF8(z, c);
      }
    }else{
      /* UTF-16 Big-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16BE(zIn, zIn<zTerm, c);
        WRITE_UTF8(z, c);
      }
    }
    pMem->n = (int)(z - zOut);
  }
  *z = 0;
  assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );

  /* Save the flags (c is reused as scratch) so the affinity bits survive
  ** the release of the old value. */
  c = pMem->flags;
  sqlite3VdbeMemRelease(pMem);
  pMem->flags = MEM_Str|MEM_Term|(c&MEM_AffMask);
  pMem->enc = desiredEnc;
  pMem->z = (char*)zOut;
  pMem->zMalloc = pMem->z;
  pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);

translate_out:
#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "OUTPUT: %s\n", zBuf);
  }
#endif
  return SQLITE_OK;
}
/*
** Tokenize the nText bytes of UTF-8 text at pText, invoking xToken() once
** for each token found.
**
** ASCII bytes are classified through the p->aTokenChar[] lookup table and
** upper-case A-Z is folded to lower case.  Non-ASCII characters are
** decoded with READ_UTF8(), classified with fts5UnicodeIsAlnum() and
** sqlite3Fts5UnicodeIsdiacritic(), and folded with
** sqlite3Fts5UnicodeFold().  The folded token text is accumulated in the
** p->aFold buffer, which is doubled in size whenever fewer than about six
** bytes remain.
**
** xToken() receives the folded token, its length in bytes, and the byte
** offsets of the start and end of the token within pText.
**
** Returns SQLITE_OK on success (a SQLITE_DONE result from xToken() is
** also reported as SQLITE_OK), SQLITE_NOMEM if the fold buffer cannot be
** grown, or whatever other error code xToken() returns.
**
** Control-flow note: the separator-skipping loop exits by jumping
** directly to the labels non_ascii_tokenchar / ascii_tokenchar, which sit
** inside the body of the token-gathering loop.  The first character of
** each token therefore bypasses that loop's buffer-growth check, and the
** assignment to ie at the bottom of the loop body always executes at
** least once before xToken() is called.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;   /* ASCII token-character table */

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  const char *pEnd = &aFold[nFold-6];  /* Grow once zOut passes this point */

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    int iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;
    int is;                       /* Byte offset of start of token */
    int ie;                       /* Byte offset one past end of token */

    /* Skip any separator characters. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character.  */
      if( zOut>pEnd ){
        aFold = sqlite3_malloc(nFold*2);
        if( aFold==0 ){
          /* p->aFold is left untouched; the partial token is not emitted */
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* An non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
          /* A fold result of 0 means nothing is written for this char */
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}