/*
** Register tokenizer exceptions on behalf of a tokenchars= or separators=
** option of the CREATE VIRTUAL TABLE statement. Every codepoint encoded in
** the nIn-byte UTF-8 string zIn is to be treated as a separator (bAlnum==0)
** or as a token character (bAlnum==1).
**
** A codepoint for which sqlite3FtsUnicodeIsalnum() already returns bAlnum
** needs no exception and is skipped. A codepoint that
** sqlite3FtsUnicodeIsdiacritic() identifies as a diacritic is skipped as
** well, so this function never changes how the tokenizer treats those
** codepoints. Every other codepoint is inserted into the
** unicode_tokenizer.aiException[] array, which is kept in ascending order.
** For the purposes of tokenization, the return value of
** sqlite3FtsUnicodeIsalnum() is inverted for all codepoints in that array.
** No attempt is made to suppress duplicate entries.
**
** Returns SQLITE_OK on success. Returns SQLITE_NOMEM, leaving *p unchanged,
** if the array cannot be enlarged.
*/
static int unicodeAddExceptions(
  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
  int bAlnum,                     /* Replace Isalnum() return value with this */
  const char *zIn,                /* UTF-8 text listing the exception characters */
  int nIn                         /* Size of zIn in bytes */
){
  const unsigned char *zCsr = (const unsigned char *)zIn;
  const unsigned char *zTerm = &zCsr[nIn];
  int iCode;                      /* Codepoint most recently decoded */
  int nEntry = 0;                 /* Number of exceptions zIn contributes */
  int *aNew;                      /* Enlarged aiException[] array */
  int nNew;                       /* Number of valid entries in aNew[] */

  assert( bAlnum==0 || bAlnum==1 );

  /* Pass 1: count the codepoints that really need an exception entry so
  ** that the array only has to be resized once. */
  while( zCsr<zTerm ){
    READ_UTF8(zCsr, zTerm, iCode);
    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
    if( sqlite3FtsUnicodeIsalnum(iCode)==bAlnum ) continue;
    if( sqlite3FtsUnicodeIsdiacritic(iCode)!=0 ) continue;
    nEntry++;
  }
  if( nEntry==0 ) return SQLITE_OK;

  aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
  if( aNew==0 ) return SQLITE_NOMEM;
  nNew = p->nException;

  /* Pass 2: decode the string again, inserting each qualifying codepoint
  ** at its sorted position. */
  for(zCsr=(const unsigned char *)zIn; zCsr<zTerm; ){
    int iPos;                     /* Index at which iCode is inserted */
    int iShift;                   /* Used to slide larger entries up */

    READ_UTF8(zCsr, zTerm, iCode);
    if( sqlite3FtsUnicodeIsalnum(iCode)==bAlnum ) continue;
    if( sqlite3FtsUnicodeIsdiacritic(iCode)!=0 ) continue;

    iPos = 0;
    while( iPos<nNew && aNew[iPos]<iCode ){
      iPos++;
    }
    iShift = nNew;
    while( iShift>iPos ){
      aNew[iShift] = aNew[iShift-1];
      iShift--;
    }
    aNew[iPos] = iCode;
    nNew++;
  }

  p->aiException = aNew;
  p->nException = nNew;
  return SQLITE_OK;
}
/*
 * Decode the UTF-8 text at str into consecutive int codepoints in buffer.
 *
 * Decoding stops at the first codepoint that READ_UTF8() yields as 0; that
 * value is not stored, so the output is NOT terminated. No bound is passed
 * to READ_UTF8() (its terminator argument is 0) and no output capacity is
 * checked, so the caller must supply a zero-terminated input and a buffer
 * large enough for every decoded codepoint.
 */
static void decode_utf8(const unsigned char* str, int* buffer) {
    int *out = buffer;

    for (;;) {
        int codepoint;

        READ_UTF8(str, 0, codepoint);
        if (codepoint == 0) {
            break;
        }
        *out++ = codepoint;
    }
}
示例#3
0
/*
** Decode the single UTF-8 character that starts at z and return its
** codepoint. The decoding itself is delegated to the READ_UTF8() macro,
** which is handed zTerm as its terminator argument. The position the macro
** leaves its cursor at is stored in *pzNext.
*/
int sqlite3Utf8Read(
  const unsigned char *z,         /* First byte of UTF-8 character */
  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
){
  const unsigned char *zCsr = z;  /* Cursor advanced by READ_UTF8() */
  int iCodepoint;                 /* Decoded value */

  READ_UTF8(zCsr, zTerm, iCodepoint);
  *pzNext = zCsr;
  return iCodepoint;
}
/*
 * Count the characters (not bytes) in the UTF-8 string str.
 *
 * Characters are decoded one at a time with READ_UTF8(), with no end
 * bound (terminator argument 0). Counting stops at the first character
 * that decodes to 0, which is not included in the result.
 */
static sqlite_uint64 strlen_utf8(const unsigned char* str) {
    sqlite_uint64 count = 0;

    for (;;) {
        int codepoint;

        READ_UTF8(str, 0, codepoint);
        if (codepoint == 0) {
            return count;
        }
        count++;
    }
}
/**
 * Normalizing but non-stemming term copying.
 *
 * The original function would take 10 bytes from the front and 10 bytes from
 * the back if there were no digits in the string and it was more than 20
 * bytes long.  If there were digits involved that would decrease to 3 bytes
 * from the front and 3 from the back.  This would potentially corrupt utf-8
 * encoded characters, which is fine from the perspective of the FTS3 logic.
 *
 * In our revised form we now operate on a unicode character basis rather than
 * a byte basis.  Additionally we use the same length limit even if there are
 * digits involved because it's not clear digit token-space reduction is saving
 * us from anything and could be hurting.  Specifically, if no one is ever
 * going to search on things with digits, then we should just remove them.
 * Right now, the space reduction is going to increase false positives when
 * people do search on them and increase the number of collisions sufficiently
 * to make it really expensive.  The caveat is there will be some increase in
 * index size which could be meaningful if people are receiving lots of emails
 * full of distinct numbers.
 *
 * In order to do the copy-from-the-front and copy-from-the-back trick, once
 * we reach COPY_STEMMER_COPY_HALF_LEN characters in, we set zFrontEnd to the
 * current value of zOut (which represents the termination of the first part
 * of the result string) and set zBackStart to the value of zOutStart.  We
 * then advance zBackStart along a character at a time as we write more
 * characters.  Once we have traversed the entire string, if
 * zBackStart > zFrontEnd, then we know the string should be shrunk using the
 * characters in the two ranges.
 *
 * (It would be faster to scan from the back with specialized logic but that
 * particular logic seems easy to screw up and we don't have unit tests in here
 * to the extent required.)
 *
 * Characters for which isVoicedSoundMark() is true (after
 * normalize_character()) are dropped from the output and are not counted.
 *
 * @param zIn Input string to normalize and potentially shrink.
 * @param nBytesIn The number of bytes in zIn, distinct from the number of
 *     unicode characters encoded in zIn.
 * @param zOut The string to write our output into.  This must have at least
 *     nBytesIn * MAX_UTF8_GROWTH_FACTOR in order to compensate for
 *     normalization that results in a larger utf-8 encoding.  A terminating
 *     0 byte is written after the output.
 * @param pnBytesOut Integer to write the number of bytes in zOut into
 *     (not counting the terminating 0 byte).
 */
static void copy_stemmer(const unsigned char *zIn, const int nBytesIn,
                         unsigned char *zOut, int *pnBytesOut){
  const unsigned char *zInTerm = zIn + nBytesIn;
  unsigned char *zOutStart = zOut;
  unsigned int c;
  unsigned int charCount = 0;
  unsigned char *zFrontEnd = NULL, *zBackStart = NULL;
  unsigned int trashC;

  /* copy normalized character */
  while (zIn < zInTerm) {
    READ_UTF8(zIn, zInTerm, c);
    c = normalize_character(c);

    /* ignore voiced/semi-voiced sound mark */
    if (!isVoicedSoundMark(c)) {
      /* advance one non-voiced sound mark character.  The braces matter:
       * READ_UTF8 is a macro, and if it expands to more than one statement
       * an unbraced `if` would guard only the first of them. */
      if (zBackStart) {
        READ_UTF8(zBackStart, zOut, trashC);
      }

      WRITE_UTF8(zOut, c);
      charCount++;
      if (charCount == COPY_STEMMER_COPY_HALF_LEN) {
        zFrontEnd = zOut;
        zBackStart = zOutStart;
      }
    }
  }

  /* if we need to shrink the string, transplant the back bytes.  Both
   * markers are set together, so testing zBackStart for NULL avoids a
   * relational comparison between two null pointers. */
  if (zBackStart != NULL && zBackStart > zFrontEnd) {
    size_t backBytes = (size_t)(zOut - zBackStart);
    memmove(zFrontEnd, zBackStart, backBytes);
    zOut = zFrontEnd + backBytes;
  }
  *zOut = 0;
  /* Subtract the pointers first and narrow the result afterwards; casting
   * each pointer to int individually truncates addresses on 64-bit targets. */
  *pnBytesOut = (int)(zOut - zOutStart);
}
示例#6
0
/*
** Apply a 'tokenchars' (bTokenChars==1) or 'separators' (bTokenChars==0)
** option to tokenizer p. Argument z is a nul-terminated UTF-8 string; each
** character in it is processed as follows:
**
**   * Codepoints below 128 are recorded directly in p->aTokenChar[].
**
**   * For any other codepoint, if sqlite3Fts5UnicodeIsalnum() does not
**     already return bTokenChars and sqlite3Fts5UnicodeIsdiacritic() does
**     not identify it as a diacritic, the codepoint is inserted into the
**     sorted p->aiException[] array.
**
** The exception array is grown by strlen(z) entries up front, which is
** always at least the number of characters encoded in z.
**
** Returns SQLITE_OK, or SQLITE_NOMEM if the array could not be resized (in
** which case *p is not modified). An empty string is a no-op.
*/
static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */
  const char *z,                  /* Characters to treat as exceptions */
  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
){
  int nByte = strlen(z);          /* Size of z in bytes */
  const unsigned char *zCsr;      /* Read cursor into z */
  const unsigned char *zTerm;     /* One byte past the end of z */
  int *aNew;                      /* Resized exception array */
  int nNew;                       /* Entries currently valid in aNew[] */

  if( nByte<=0 ) return SQLITE_OK;

  aNew = (int*)sqlite3_realloc(
      p->aiException, (nByte+p->nException)*sizeof(int)
  );
  if( aNew==0 ) return SQLITE_NOMEM;

  nNew = p->nException;
  zCsr = (const unsigned char*)z;
  zTerm = (const unsigned char*)&z[nByte];

  while( zCsr<zTerm ){
    int iCode;                    /* Codepoint just decoded */
    int bToken;                   /* Default classification of iCode */
    int iPos;                     /* Insertion index within aNew[] */

    READ_UTF8(zCsr, zTerm, iCode);
    if( iCode<128 ){
      p->aTokenChar[iCode] = bTokenChars;
      continue;
    }

    bToken = sqlite3Fts5UnicodeIsalnum(iCode);
    assert( (bToken==0 || bToken==1) );
    assert( (bTokenChars==0 || bTokenChars==1) );
    if( bToken==bTokenChars || sqlite3Fts5UnicodeIsdiacritic(iCode)!=0 ){
      continue;
    }

    /* Find the first entry larger than iCode and open a gap in front of it */
    iPos = 0;
    while( iPos<nNew && aNew[iPos]<=iCode ){
      iPos++;
    }
    memmove(&aNew[iPos+1], &aNew[iPos], (nNew-iPos)*sizeof(int));
    aNew[iPos] = iCode;
    nNew++;
  }

  p->aiException = aNew;
  p->nException = nNew;
  return SQLITE_OK;
}
示例#7
0
文件: utf.c 项目: moodboom/Reusable
/*
** This routine is called from the TCL test function "translate_selftest".
** It verifies, for every value in the range 0..0x10FFFF, that writing the
** value with a WRITE_xxx() macro and reading it back with the matching
** READ_xxx() macro yields the expected codepoint and consumes exactly the
** bytes that were written.
**
** For UTF-8, surrogates (0xD800..0xDFFF) and the values 0xFFFE/0xFFFF are
** expected to read back as 0xFFFD. For both UTF-16 byte orders the values
** 0xD800..0xE000 are not tested.
*/
void sqlite3utfSelfTest(){
  unsigned int iCode;             /* Value being round-tripped */
  unsigned int expected;          /* What the UTF-8 reader should produce */
  unsigned int decoded;           /* What the reader actually produced */
  unsigned char zBuf[20];         /* Scratch space for the encoded form */
  unsigned char *zCsr;            /* Read/write cursor into zBuf[] */
  int nByte;                      /* Size of the encoded form in bytes */

  /* UTF-8 */
  for(iCode=0; iCode<0x00110000; iCode++){
    zCsr = zBuf;
    WRITE_UTF8(zCsr, iCode);
    nByte = zCsr-zBuf;
    zCsr = zBuf;
    READ_UTF8(zCsr, decoded);
    if( (iCode>=0xD800 && iCode<=0xDFFF) || (iCode&0xFFFFFFFE)==0xFFFE ){
      expected = 0xFFFD;
    }else{
      expected = iCode;
    }
    assert( decoded==expected );
    assert( (zCsr-zBuf)==nByte );
  }

  /* UTF-16 little-endian */
  for(iCode=0; iCode<0x00110000; iCode++){
    if( iCode<0xD800 || iCode>0xE000 ){
      zCsr = zBuf;
      WRITE_UTF16LE(zCsr, iCode);
      nByte = zCsr-zBuf;
      zCsr = zBuf;
      READ_UTF16LE(zCsr, decoded);
      assert( decoded==iCode );
      assert( (zCsr-zBuf)==nByte );
    }
  }

  /* UTF-16 big-endian */
  for(iCode=0; iCode<0x00110000; iCode++){
    if( iCode<0xD800 || iCode>0xE000 ){
      zCsr = zBuf;
      WRITE_UTF16BE(zCsr, iCode);
      nByte = zCsr-zBuf;
      zCsr = zBuf;
      READ_UTF16BE(zCsr, decoded);
      assert( decoded==iCode );
      assert( (zCsr-zBuf)==nByte );
    }
  }
}
/*
** Examine the single UTF-8 character at zCur and decide whether the
** tokenizer must break before/at it.
**
** The character is decoded with READ_UTF8() (bounded by zTerm) and passed
** through normalize_character(). *len receives the number of input bytes
** the character occupied. *state is the bi-gram state machine shared
** across calls (BIGRAM_ALPHA, BIGRAM_RESET, BIGRAM_UNKNOWN, BIGRAM_USE) and
** is updated according to the character class.
**
** Returns 1 if the caller should break (emit what it has collected so far),
** or 0 if the character simply extends the current run.
*/
static int isDelim(
  const unsigned char *zCur,    /* IN: current pointer of token */
  const unsigned char *zTerm,   /* IN: one character beyond end of token */
  int *len,                     /* OUT: analyzed bytes in this token */
  int *state                    /* IN/OUT: analyze state */
){
  const unsigned char *zIn = zCur;
  unsigned int c;
  int delim;

  /* get the unicode character to analyze */
  READ_UTF8(zIn, zTerm, c);
  c = normalize_character(c);
  /* Subtract the pointers first and narrow the result afterwards; casting
  ** each pointer to int individually truncates addresses on 64-bit targets. */
  *len = (int)(zIn - zCur);

  /* ASCII character range has rule */
  if( c < 0x80 ){
    // This is original porter stemmer isDelim logic.
    // 0x0 - 0x1f are all control characters, 0x20 is space, 0x21-0x2f are
    //  punctuation.
    delim = (c < 0x30 || !porterIdChar[c - 0x30]);
    // cases: "&a", "&."
    if (*state == BIGRAM_USE || *state == BIGRAM_UNKNOWN ){
      /* previous maybe CJK and current is ascii */
      *state = BIGRAM_ALPHA; /*ascii*/
      delim = 1; /* must break */
    } else if (delim == 1) {
      // cases: "a.", ".."
      /* this is delimiter character */
      *state = BIGRAM_RESET; /*reset*/
    } else {
      // cases: "aa", ".a"
      *state = BIGRAM_ALPHA; /*ascii*/
    }
    return delim;
  }

  // (at this point we must be a non-ASCII character)

  /* voiced/semi-voiced sound marks are ignored */
  if (isVoicedSoundMark(c) && *state != BIGRAM_ALPHA) {
    /* ignore this because it is combined with previous char */
   return 0;
  }

  /* this isn't CJK range, so return as no delim */
  // Anything less than 0x2000 (except for U+0E00-U+0EFF and U+1780-U+17FF)
  // is the general scripts area and should not be bi-gram indexed.
  // 0xa000 - 0xa4cf is the Yi area.  It is apparently a phonetic language whose
  //  usage does not appear to have simple delimiter rules, so we're leaving it
  //  as bigram processed.  This is a guess, if you know better, let us know.
  //  (We previously bailed on this range too.)
  // In addition, U+0E00-U+0E7F is Thai, U+0E80-U+0EFF is Lao,
  // and U+1780-U+17FF is Khmer.  There is no easy way to break each word,
  // so these should use bi-gram too.
  // cases: "aa", ".a", "&a"
  if (c < 0xe00 ||
     (c >= 0xf00 && c < 0x1780) ||
     (c >= 0x1800 && c < 0x2000)) {
    *state = BIGRAM_ALPHA; /* not really ASCII but same idea; tokenize it */
    return 0;
  }

  // (at this point we must be a bi-grammable char or delimiter)

  /* this is space character or delim character */
  // cases: "a.", "..", "&."
  if( IS_UNI_SPACE(c) || IS_JA_DELIM(c) ){
    *state = BIGRAM_RESET; /* reset */
    return 1; /* it actually is a delimiter; report as such */
  }

  // (at this point we must be a bi-grammable char)

  // cases: "a&"
  if( *state==BIGRAM_ALPHA ){
    /* Previous is ascii and current maybe CJK */
    *state = BIGRAM_UNKNOWN; /* mark as unknown */
    return 1; /* break to emit the ASCII token*/
  }

  /* We have no rule for CJK!. use bi-gram */
  // cases: "&&"
  if( *state==BIGRAM_UNKNOWN || *state==BIGRAM_USE ){
    /* previous state is unknown.  mark as bi-gram */
    *state = BIGRAM_USE;
    return 1; /* break to emit the digram */
  }

  // cases: ".&" (*state == BIGRAM_RESET)
  *state = BIGRAM_UNKNOWN; /* mark as unknown */
  return 0; /* no need to break; nothing to emit */
}
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut and write
** the actual size of the output word (exclusive of the '\0' terminator)
** into *pnOut.
**
** Each input character is decoded with READ_UTF8() and passed through
** normalize_character().  The Porter algorithm is only run when every
** normalized character is in [a-z].
**
** The word is handed to copy_stemmer() instead, and no Porter stemming is
** attempted, when either:
**
**   * nIn<3 or nIn>=21 (that is, sizeof(zReverse)-7) bytes, or
**   * any normalized character falls outside [a-z].
**
** On the Porter path the word is stored in reverse order in zReverse[],
** which is why every suffix literal passed to stem() below is spelled
** backwards (e.g. "sess" is written "sess", "ing" is written "gni").
**
** The original version of this comment stated that stemming never
** increases the length of the word, so a zOut able to hold nIn bytes
** cannot overflow.
** NOTE(review): the copy_stemmer() fallback documents its own, possibly
** larger, output buffer requirement -- confirm callers size zOut for it.
*/
static void porter_stemmer(
  const unsigned char *zIn,
  unsigned int nIn,
  unsigned char *zOut,
  int *pnOut
){
  unsigned int i, j, c;
  char zReverse[28];
  char *z, *z2;
  const unsigned char *zTerm = zIn + nIn;
  const unsigned char *zTmp = zIn;

  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Fill zReverse[] from index sizeof(zReverse)-6 downwards, so that the
  ** word ends up reversed and followed by spare zeroed bytes. */
  for (j = sizeof(zReverse) - 6; zTmp < zTerm; j--) {
    READ_UTF8(zTmp, zTerm, c);
    c = normalize_character(c);
    if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* Zero the last five bytes; these terminate the reversed word. */
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  /* j was decremented once past the last character stored. */
  z = &zReverse[j+1];


  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */  
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if( 
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2 */
  /* z[1] is the second-to-last letter of the word (z[] is reversed). */
  switch( z[1] ){
   case 'a':
     (void) (stem(&z, "lanoita", "ate", m_gt_0) ||
             stem(&z, "lanoit", "tion", m_gt_0));
     break;
   case 'c':
     (void) (stem(&z, "icne", "ence", m_gt_0) ||
             stem(&z, "icna", "ance", m_gt_0));
     break;
   case 'e':
     (void) (stem(&z, "rezi", "ize", m_gt_0));
     break;
   case 'g':
     (void) (stem(&z, "igol", "log", m_gt_0));
     break;
   case 'l':
     (void) (stem(&z, "ilb", "ble", m_gt_0) ||
             stem(&z, "illa", "al", m_gt_0) ||
             stem(&z, "iltne", "ent", m_gt_0) ||
             stem(&z, "ile", "e", m_gt_0) ||
             stem(&z, "ilsuo", "ous", m_gt_0));
     break;
   case 'o':
     (void) (stem(&z, "noitazi", "ize", m_gt_0) ||
             stem(&z, "noita", "ate", m_gt_0) ||
             stem(&z, "rota", "ate", m_gt_0));
     break;
   case 's':
     (void) (stem(&z, "msila", "al", m_gt_0) ||
             stem(&z, "ssenevi", "ive", m_gt_0) ||
             stem(&z, "ssenluf", "ful", m_gt_0) ||
             stem(&z, "ssensuo", "ous", m_gt_0));
     break;
   case 't':
     (void) (stem(&z, "itila", "al", m_gt_0) ||
             stem(&z, "itivi", "ive", m_gt_0) ||
             stem(&z, "itilib", "ble", m_gt_0));
     break;
  }

  /* Step 3 */
  /* z[0] is the last letter of the word. */
  switch( z[0] ){
   case 'e':
     (void) (stem(&z, "etaci", "ic", m_gt_0) ||
             stem(&z, "evita", "", m_gt_0)   ||
             stem(&z, "ezila", "al", m_gt_0));
     break;
   case 'i':
     (void) (stem(&z, "itici", "ic", m_gt_0));
     break;
   case 'l':
     (void) (stem(&z, "laci", "ic", m_gt_0) ||
             stem(&z, "luf", "", m_gt_0));
     break;
   case 's':
     (void) (stem(&z, "ssen", "", m_gt_0));
     break;
  }

  /* Step 4 */
  /* Suffixes are matched by hand here; advancing z drops that many
  ** trailing letters from the word. */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         (void) (stem(&z, "tneme", "", m_gt_1) ||
                 stem(&z, "tnem", "", m_gt_1) ||
                 stem(&z, "tne", "", m_gt_1));
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       (void) (stem(&z, "noi", "", m_gt_1));
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     (void) (stem(&z, "eta", "", m_gt_1) ||
             stem(&z, "iti", "", m_gt_1));
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = (unsigned int) strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
示例#10
0
文件: utf.c 项目: moodboom/Reusable
/*
** This routine transforms the internal text encoding used by pMem to
** desiredEnc. It is an error if the string is already of the desired
** encoding, or if *pMem does not contain a string value.
**
** UTF-16LE <-> UTF-16BE conversions are done in place by swapping the two
** bytes of each code unit. All other conversions decode the input one
** character at a time and re-encode it into a fresh buffer: either the
** on-stack zShort[] array (later copied into pMem->zShort) or space obtained
** from sqliteMallocRaw().
**
** A UTF-16 input whose byte count is odd has its trailing byte ignored, so
** that neither the byte-swap loop nor the UTF-16 readers ever touch a byte
** beyond the end of the input.
**
** Returns SQLITE_OK on success or SQLITE_NOMEM if memory cannot be obtained.
*/
int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
  unsigned char zShort[NBFS]; /* Temporary short output buffer */
  int len;                    /* Maximum length of output string in bytes */
  unsigned char *zOut;                  /* Output buffer */
  unsigned char *zIn;                   /* Input iterator */
  unsigned char *zTerm;                 /* End of input */
  unsigned char *z;                     /* Output iterator */
  unsigned int c;

  assert( pMem->flags&MEM_Str );
  assert( pMem->enc!=desiredEnc );
  assert( pMem->enc!=0 );
  assert( pMem->n>=0 );

#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "INPUT:  %s\n", zBuf);
  }
#endif

  /* If the translation is between UTF-16 little and big endian, then 
  ** all that is required is to swap the byte order. This case is handled
  ** differently from the others.
  */
  if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
    u8 temp;
    int rc;
    rc = sqlite3VdbeMemMakeWriteable(pMem);
    if( rc!=SQLITE_OK ){
      assert( rc==SQLITE_NOMEM );
      return SQLITE_NOMEM;
    }
    zIn = (u8*)pMem->z;
    /* Only whole 2-byte code units are swapped. Without the &~1 an odd
    ** pMem->n would make the final iteration read and write zIn[pMem->n]. */
    zTerm = &zIn[pMem->n&~1];
    while( zIn<zTerm ){
      temp = *zIn;
      *zIn = *(zIn+1);
      zIn++;
      *zIn++ = temp;
    }
    pMem->enc = desiredEnc;
    goto translate_out;
  }

  /* Set len to the maximum number of bytes required in the output buffer. */
  if( desiredEnc==SQLITE_UTF8 ){
    /* When converting from UTF-16, two output bytes are budgeted for every
    ** input byte, which covers the largest possible growth (a 2-byte UTF-16
    ** code unit never needs more than 3 bytes of UTF-8). A single byte is
    ** required for the output string nul-terminator.
    **
    ** Drop a dangling odd byte first so that the UTF-16 readers below are
    ** only ever started on a complete code unit.
    */
    pMem->n &= ~1;
    len = pMem->n * 2 + 1;
  }else{
    /* When converting from UTF-8 to UTF-16 the maximum growth is caused
    ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
    ** character. Two bytes are required in the output buffer for the
    ** nul-terminator.
    */
    len = pMem->n * 2 + 2;
  }

  /* Set zIn to point at the start of the input buffer and zTerm to point 1
  ** byte past the end.
  **
  ** Variable zOut is set to point at the output buffer. This may be space
  ** obtained from malloc(), or Mem.zShort, if it large enough and not in
  ** use, or the zShort array on the stack (see above).
  */
  zIn = (u8*)pMem->z;
  zTerm = &zIn[pMem->n];
  if( len>NBFS ){
    zOut = sqliteMallocRaw(len);
    if( !zOut ) return SQLITE_NOMEM;
  }else{
    zOut = zShort;
  }
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, c); 
        WRITE_UTF16LE(z, c);
      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, c); 
        WRITE_UTF16BE(z, c);
      }
    }
    pMem->n = z - zOut;
    /* First of the two terminator bytes; the second is written below. */
    *z++ = 0;
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16LE(zIn, c); 
        WRITE_UTF8(z, c);
      }
    }else{
      /* UTF-16 Big-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16BE(zIn, c); 
        WRITE_UTF8(z, c);
      }
    }
    pMem->n = z - zOut;
  }
  *z = 0;
  assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );

  /* Release the old value, then install the converted string. */
  sqlite3VdbeMemRelease(pMem);
  pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
  pMem->enc = desiredEnc;
  if( zOut==zShort ){
    /* The stack buffer does not outlive this call; copy into the Mem. */
    memcpy(pMem->zShort, zOut, len);
    zOut = (u8*)pMem->zShort;
    pMem->flags |= (MEM_Term|MEM_Short);
  }else{
    pMem->flags |= (MEM_Term|MEM_Dyn);
  }
  pMem->z = (char*)zOut;

translate_out:
#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "OUTPUT: %s\n", zBuf);
  }
#endif
  return SQLITE_OK;
}
示例#11
0
文件: utf.c 项目: moodboom/Reusable
/*
** Decode and return the UTF-8 character that starts at z, using the
** two-argument form of the READ_UTF8() macro. The position following the
** character is not reported to the caller.
*/
int sqlite3ReadUtf8(const unsigned char *z){
  const unsigned char *zCsr = z;  /* Cursor advanced by READ_UTF8() */
  int iCodepoint;                 /* Decoded value */

  READ_UTF8(zCsr, iCodepoint);
  return iCodepoint;
}
/*
** Extract the next token from tokenization cursor pC.
**
** Leading characters for which unicodeIsAlnum() is false are skipped. The
** token then consists of the following run of characters that are either
** alphanumeric according to unicodeIsAlnum() or diacritics according to
** sqlite3FtsUnicodeIsdiacritic(). Each character is passed through
** sqlite3FtsUnicodeFold() and, unless that returns 0, appended as UTF-8 to
** pCsr->zToken, which is grown in 64 byte steps as needed.
**
** If the cursor has a stemmer (pCsr->pStemmer!=NULL) the folded token is
** handed to it; when the stemmer's stem() call returns a non-negative value
** the stemmer's own buffer is nul-terminated and returned as the token,
** otherwise the folded token is returned unchanged.
**
** Returns SQLITE_OK with the output parameters populated, SQLITE_DONE when
** no further token exists, or SQLITE_NOMEM if the token buffer cannot be
** enlarged.
*/
static int unicodeNext(
  sqlite3_tokenizer_cursor *pC,   /* Cursor to read the next token from */
  const char **paToken,           /* OUT: Token text */
  int *pnToken,                   /* OUT: Number of bytes at *paToken */
  int *piStart,                   /* OUT: Starting offset of token */
  int *piEnd,                     /* OUT: Ending offset of token */
  int *piPos                      /* OUT: Position integer of token */
){
  unicode_cursor *pCsr = (unicode_cursor *)pC;
  unicode_tokenizer *pTok = ((unicode_tokenizer *)pCsr->base.pTokenizer);
  int iCode;                      /* Codepoint most recently read */
  char *zOut;                     /* Write position within pCsr->zToken */
  const char *zResult;            /* Token text to hand back */
  int nResult;                    /* Size of zResult in bytes */
  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
  const unsigned char *zStart = z;
  const unsigned char *zEnd;
  const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];

  /* Skip delimiters. zStart trails one character behind z, so it still
  ** points at the first token character when the loop breaks, and has
  ** reached zTerm if the input ran out without finding one. */
  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    if( unicodeIsAlnum(pTok, iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ) return SQLITE_DONE;

  zOut = pCsr->zToken;
  for(;;){
    int iFolded;                  /* Folded form of iCode, or 0 */

    /* Make sure at least 4 bytes are free for the next character */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Emit the folded form of the character read last. zEnd tracks the
    ** input offset just past that character. */
    zEnd = z;
    iFolded = sqlite3FtsUnicodeFold(iCode, pTok->bRemoveDiacritic);
    if( iFolded ){
      WRITE_UTF8(zOut, iFolded);
    }

    /* Stop at end of input, or at the first character that is neither a
    ** token character nor a diacritic. */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
    if( !unicodeIsAlnum(pTok, iCode)
     && !sqlite3FtsUnicodeIsdiacritic(iCode)
    ){
      break;
    }
  }

  /* By default the folded text is the token. A successful stemmer run
  ** replaces it with the contents of the stemmer's buffer. */
  zResult = pCsr->zToken;
  nResult = (int)(zOut - pCsr->zToken);
  if( pCsr->pStemmer!=NULL ){
    SN_set_current(pCsr->pStemmer, nResult, (unsigned char *)pCsr->zToken);
    if( pTok->stemmer.stem(pCsr->pStemmer)>=0 ){
      pCsr->pStemmer->p[pCsr->pStemmer->l] = '\0';
      zResult = (char *)pCsr->pStemmer->p;
      nResult = pCsr->pStemmer->l;
    }
  }
  *paToken = zResult;
  *pnToken = nResult;

  /* Set the remaining output variables and return. */
  pCsr->iOff = (int)(z - pCsr->aInput);
  *piStart = (int)(zStart - pCsr->aInput);
  *piEnd = (int)(zEnd - pCsr->aInput);
  *piPos = pCsr->iToken++;
  return SQLITE_OK;
}
示例#13
0
/*
** Convert the string held in pMem from its current text encoding to
** desiredEnc. It is an error if the string is already of the desired
** encoding, or if *pMem does not contain a string value.
**
** A UTF-16LE <-> UTF-16BE conversion is performed in place by swapping the
** bytes of each complete 2-byte unit. Any other conversion re-encodes the
** text into a buffer obtained from sqlite3DbMallocRaw(), which then replaces
** the previous contents of pMem.
**
** Returns SQLITE_OK on success or SQLITE_NOMEM if memory cannot be obtained.
*/
SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
  int len;                    /* Upper bound on the output size in bytes */
  unsigned char *zOut;        /* Start of the output buffer */
  unsigned char *zIn;         /* Read cursor */
  unsigned char *zTerm;       /* One byte past the end of the input */
  unsigned char *z;           /* Write cursor */
  unsigned int c;             /* Current character; later the saved flags */

  assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
  assert( pMem->flags&MEM_Str );
  assert( pMem->enc!=desiredEnc );
  assert( pMem->enc!=0 );
  assert( pMem->n>=0 );

#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "INPUT:  %s\n", zBuf);
  }
#endif

  /* Neither side is UTF-8, so this is a pure UTF-16 byte-order change:
  ** swap each pair of bytes in place. A trailing odd byte is left alone.
  */
  if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
    int rc = sqlite3VdbeMemMakeWriteable(pMem);
    if( rc!=SQLITE_OK ){
      assert( rc==SQLITE_NOMEM );
      return SQLITE_NOMEM;
    }
    zIn = (u8*)pMem->z;
    zTerm = &zIn[pMem->n&~1];
    for(; zIn<zTerm; zIn+=2){
      u8 temp = zIn[0];
      zIn[0] = zIn[1];
      zIn[1] = temp;
    }
    pMem->enc = desiredEnc;
    goto translate_out;
  }

  /* Work out the largest output that can be needed. Two output bytes are
  ** budgeted per input byte in either direction. The nul-terminator needs
  ** one extra byte for UTF-8 output and two for UTF-16 output. A UTF-16
  ** input is first truncated to a whole number of 2-byte units.
  */
  if( desiredEnc==SQLITE_UTF8 ){
    pMem->n &= ~1;
  }
  len = pMem->n * 2 + (desiredEnc==SQLITE_UTF8 ? 1 : 2);

  /* zIn..zTerm delimit the input; zOut is the freshly allocated output
  ** buffer and z the position being written.
  */
  zIn = (u8*)pMem->z;
  zTerm = &zIn[pMem->n];
  zOut = sqlite3DbMallocRaw(pMem->db, len);
  if( zOut==0 ){
    return SQLITE_NOMEM;
  }
  z = zOut;

  if( pMem->enc!=SQLITE_UTF8 ){
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16LE(zIn, zIn<zTerm, c); 
        WRITE_UTF8(z, c);
      }
    }else{
      /* UTF-16 Big-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16BE(zIn, zIn<zTerm, c); 
        WRITE_UTF8(z, c);
      }
    }
    pMem->n = (int)(z - zOut);
  }else{
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, zTerm, c);
        WRITE_UTF16LE(z, c);
      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        READ_UTF8(zIn, zTerm, c);
        WRITE_UTF16BE(z, c);
      }
    }
    pMem->n = (int)(z - zOut);
    /* First of the two UTF-16 terminator bytes */
    *z++ = 0;
  }
  *z = 0;
  assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );

  /* Remember the flags before releasing the old value so that the bits
  ** covered by MEM_AffMask can be carried over to the new value. */
  c = pMem->flags;
  sqlite3VdbeMemRelease(pMem);
  pMem->flags = MEM_Str|MEM_Term|(c&MEM_AffMask);
  pMem->enc = desiredEnc;
  pMem->z = (char*)zOut;
  pMem->zMalloc = pMem->z;
  pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);

translate_out:
#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  {
    char zBuf[100];
    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
    fprintf(stderr, "OUTPUT: %s\n", zBuf);
  }
#endif
  return SQLITE_OK;
}
示例#14
0
/*
** Tokenize the nText bytes of UTF-8 text at pText, invoking xToken() once
** for each token found.
**
** ASCII bytes are classified with the p->aTokenChar[] lookup table; other
** characters are decoded with READ_UTF8() and classified with
** fts5UnicodeIsAlnum(). Within a token, ASCII 'A'..'Z' are folded to lower
** case and non-ASCII characters are passed through sqlite3Fts5UnicodeFold().
** The folded token text is accumulated in p->aFold, which is doubled in
** size whenever it runs short of space.
**
** xToken() receives pCtx, the folded token, its size in bytes and the byte
** offsets (start, end) of the token within pText.
**
** Returns SQLITE_OK on success (a SQLITE_DONE result from xToken() is also
** reported as SQLITE_OK), SQLITE_NOMEM if the fold buffer cannot be grown,
** or any other non-SQLITE_OK value returned by xToken().
**
** Control flow note: the separator-skipping loop jumps directly INTO the
** body of the token loop (labels ascii_tokenchar / non_ascii_tokenchar) so
** that the first token character is not examined twice.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;     /* ASCII classification table */

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  /* Once zOut passes pEnd fewer than 6 bytes remain and the buffer is grown */
  const char *pEnd = &aFold[nFold-6];

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    int iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;
    int is;                       /* Byte offset of the start of the token */
    int ie;                       /* Byte offset just past the token */

    /* Skip any separator characters. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character.  */
      if( zOut>pEnd ){
        aFold = sqlite3_malloc(nFold*2);
        if( aFold==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        /* Re-base zOut onto the new buffer before the old one is freed */
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* An non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          /* Entry point used by the separator loop for the first character */
          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break; 
      }else{
 ascii_tokenchar:
        /* Entry point used by the separator loop for the first character.
        ** Upper case ASCII letters are folded to lower case (+32). */
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      /* Updated after every accepted character, so it ends up just past
      ** the last character of the token. */
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
  }
  
 tokenize_done:
  /* A callback result of SQLITE_DONE is not reported as an error */
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}