/* ** Tokenize some text using the ascii tokenizer. */ static int fts5AsciiTokenize( Fts5Tokenizer *pTokenizer, void *pCtx, int iUnused, const char *pText, int nText, int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) ){ AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer; int rc = SQLITE_OK; int ie; int is = 0; char aFold[64]; int nFold = sizeof(aFold); char *pFold = aFold; unsigned char *a = p->aTokenChar; UNUSED_PARAM(iUnused); while( is<nText && rc==SQLITE_OK ){ int nByte; /* Skip any leading divider characters. */ while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){ is++; } if( is==nText ) break; /* Count the token characters */ ie = is+1; while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){ ie++; } /* Fold to lower case */ nByte = ie-is; if( nByte>nFold ){ if( pFold!=aFold ) sqlite3_free(pFold); pFold = sqlite3_malloc(nByte*2); if( pFold==0 ){ rc = SQLITE_NOMEM; break; } nFold = nByte*2; } asciiFold(pFold, &pText[is], nByte); /* Invoke the token callback */ rc = xToken(pCtx, 0, pFold, nByte, is, ie); is = ie+1; } if( pFold!=aFold ) sqlite3_free(pFold); if( rc==SQLITE_DONE ) rc = SQLITE_OK; return rc; }
static int fts5UnicodeTokenize( Fts5Tokenizer *pTokenizer, void *pCtx, const char *pText, int nText, int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd) ){ Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer; int rc = SQLITE_OK; unsigned char *a = p->aTokenChar; unsigned char *zTerm = (unsigned char*)&pText[nText]; unsigned char *zCsr = (unsigned char *)pText; /* Output buffer */ char *aFold = p->aFold; int nFold = p->nFold; const char *pEnd = &aFold[nFold-6]; /* Each iteration of this loop gobbles up a contiguous run of separators, ** then the next token. */ while( rc==SQLITE_OK ){ int iCode; /* non-ASCII codepoint read from input */ char *zOut = aFold; int is; int ie; /* Skip any separator characters. */ while( 1 ){ if( zCsr>=zTerm ) goto tokenize_done; if( *zCsr & 0x80 ) { /* A character outside of the ascii range. Skip past it if it is ** a separator character. Or break out of the loop if it is not. */ is = zCsr - (unsigned char*)pText; READ_UTF8(zCsr, zTerm, iCode); if( fts5UnicodeIsAlnum(p, iCode) ){ goto non_ascii_tokenchar; } }else{ if( a[*zCsr] ){ is = zCsr - (unsigned char*)pText; goto ascii_tokenchar; } zCsr++; } } /* Run through the tokenchars. Fold them into the output buffer along ** the way. */ while( zCsr<zTerm ){ /* Grow the output buffer so that there is sufficient space to fit the ** largest possible utf-8 character. */ if( zOut>pEnd ){ aFold = sqlite3_malloc(nFold*2); if( aFold==0 ){ rc = SQLITE_NOMEM; goto tokenize_done; } zOut = &aFold[zOut - p->aFold]; memcpy(aFold, p->aFold, nFold); sqlite3_free(p->aFold); p->aFold = aFold; p->nFold = nFold = nFold*2; pEnd = &aFold[nFold-6]; } if( *zCsr & 0x80 ){ /* An non-ascii-range character. Fold it into the output buffer if ** it is a token character, or break out of the loop if it is not. */ READ_UTF8(zCsr, zTerm, iCode); if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){ non_ascii_tokenchar: iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); if( iCode ) WRITE_UTF8(zOut, iCode); }else{ break; } }else if( a[*zCsr]==0 ){ /* An ascii-range separator character. End of token. */ break; }else{ ascii_tokenchar: if( *zCsr>='A' && *zCsr<='Z' ){ *zOut++ = *zCsr + 32; }else{ *zOut++ = *zCsr; } zCsr++; } ie = zCsr - (unsigned char*)pText; } /* Invoke the token callback */ rc = xToken(pCtx, aFold, zOut-aFold, is, ie); } tokenize_done: if( rc==SQLITE_DONE ) rc = SQLITE_OK; return rc; }