Ejemplo n.º 1
0
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
** 
** If the input word contains no digits but does contain characters
** not in [a-zA-Z] then no stemming is attempted and this routine
** just copies the input into the output with US-ASCII
** case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];   /* Word stored lower-cased and in REVERSE order */
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the input into zReverse[] backwards, folding [A-Z] to lower
  ** case.  j walks downward from near the end of the buffer so the
  ** reversed word ends up right-justified, leaving room for NUL
  ** padding after it. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* Zero-fill the tail of the buffer so that suffix probes which read
  ** a few bytes past the end of the word only ever see NUL bytes. */
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  /* z now points at the first byte of the REVERSED word, i.e. the last
  ** character of the original word.  Every "suffix" string passed to
  ** stem() below is therefore spelled backwards ("lanoita" is
  ** "ational" reversed, and so on).  Each stem() call both tests for
  ** the reversed suffix and performs the replacement as a side effect,
  ** so the ||-chains below rely on short-circuit evaluation to stop at
  ** the first matching rule. */
  z = &zReverse[j+1];


  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */  
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if( 
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2
  ** Dispatch on z[1]: in reversed order this is the second-to-last
  ** character of the current (forward) word. */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3
  ** Dispatch on z[0], the last character of the current word. */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4
  ** Suffix removal done by direct character comparison on the reversed
  ** word (advancing z past a suffix deletes it) instead of stem(). */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
/**
 * Generate a new token.  There are basically three types of token we can
 *  generate:
 * - A porter stemmed token.  This is a word entirely comprised of ASCII
 *    characters.  We run the porter stemmer algorithm against the word.
 *    Because we have no way to know what is and is not an English word
 *    (the only language for which the porter stemmer was designed), this
 *    could theoretically map multiple words that are not variations of the
 *    same word down to the same root, resulting in potentially unexpected
 *    result inclusions in the search results.  We accept this result because
 *    there's not a lot we can do about it and false positives are much
 *    better than false negatives.
 * - A copied token; case/accent-folded but not stemmed.  We call the porter
 *    stemmer for all non-CJK cases and it diverts to the copy stemmer if it
 *    sees any non-ASCII characters (after folding) or if the string is too
 *    long.  The copy stemmer will shrink the string if it is deemed too long.
 * - A bi-gram token; two CJK-ish characters.  For query reasons we generate a
 *    series of overlapping bi-grams.  (We can't require the user to start their
 *    search based on the arbitrary context of the indexed documents.)
 *
 * It may be useful to think of this function as operating at the points between
 *  characters.  While we are considering the 'current' character (the one after
 *  the 'point'), we are also interested in the 'previous' character (the one
 *  preceding the point).
 * At any 'point', there are a number of possible situations which I will
 *  illustrate with pairs of characters. 'a' means alphanumeric ASCII or a
 *  non-ASCII character that is not bi-grammable or a delimiter, '.'
 *  means a delimiter (space or punctuation), '&' means a bi-grammable
 *  character.
 * - aa: We are in the midst of a token.  State remains BIGRAM_ALPHA.
 * - a.: We will generate a porter stemmed or copied token.  State was
 *        BIGRAM_ALPHA, gets set to BIGRAM_RESET.
 * - a&: We will generate a porter stemmed or copied token; we will set our
 *        state to BIGRAM_UNKNOWN to indicate we have seen one bigram character
 *        but that it is not yet time to emit a bigram.
 * - .a: We are starting a token.  State was BIGRAM_RESET, gets set to
 *        BIGRAM_ALPHA.
 * - ..: We skip/eat the delimiters.  State stays BIGRAM_RESET.
 * - .&: State set to BIGRAM_UNKNOWN to indicate we have seen one bigram char.
 * - &a: If the state was BIGRAM_USE, we generate a bi-gram token.  If the state
 *        was BIGRAM_UNKNOWN we had only seen one CJK character and so don't do
 *        anything.  State is set to BIGRAM_ALPHA.
 * - &.: Same as the "&a" case, but state is set to BIGRAM_RESET.
 * - &&: We will generate a bi-gram token.  State was either BIGRAM_UNKNOWN or
 *        BIGRAM_USE, gets set to BIGRAM_USE.
 */
static int porterNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
  const char **pzToken,               /* OUT: *pzToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  /* Returns SQLITE_OK after emitting one token, SQLITE_DONE when the
  ** input is exhausted, or SQLITE_NOMEM if the token buffer cannot be
  ** (re)allocated. */
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  const unsigned char *z = (unsigned char *) c->zInput;
  int len = 0;
  int state;

  while( c->iOffset < c->nInput ){
    int iStartOffset, numChars;

    /*
     * This loop basically has two modes of operation:
     * - general processing (iPrevBigramOffset == 0 here)
     * - CJK processing (iPrevBigramOffset != 0 here)
     *
     * In an general processing pass we skip over all the delimiters, leaving us
     *  at a character that promises to produce a token.  This could be a CJK
     *  token (state == BIGRAM_USE) or an ALPHA token (state == BIGRAM_ALPHA).
     * If it was a CJK token, we transition into CJK state for the next loop.
     * If it was an alpha token, our current offset is pointing at a delimiter
     *  (which could be a CJK character), so it is good that our next pass
     *  through the function and loop will skip over any delimiters.  If the
     *  delimiter we hit was a CJK character, the next time through we will
     *  not treat it as a delimiter though; the entry state for that scan is
     *  BIGRAM_RESET so the transition is not treated as a delimiter!
     * 
     * The CJK pass always starts with the second character in a bi-gram emitted
     *  as a token in the previous step.  No delimiter skipping is required
     *  because we know that first character might produce a token for us.  It
     *  only 'might' produce a token because the previous pass performed no
     *  lookahead and cannot be sure it is followed by another CJK character.
     *  This is why a CJK pass may end up producing no token at all; the
     *  emission check near the bottom of the loop handles that case.
     */

    // General pass: there is no pending bigram second-character to resume
    //  from, so start by eating delimiters.
    if (c->iPrevBigramOffset == 0) {
      /* Scan past delimiter characters */
      state = BIGRAM_RESET; /* reset */
      while (c->iOffset < c->nInput &&
             isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
        c->iOffset += len;
      }

    } else {
      /* CJK pass: resume at the second character of the bi-gram emitted on
      ** the previous call so that bi-grams overlap. */
      c->iOffset = c->iPrevBigramOffset;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    numChars = 0;

    // Start from a reset state.  This means the first character we see
    //  (which will not be a delimiter) determines which of ALPHA or CJK modes
    //  we are operating in.  (It won't be a delimiter because in a 'general'
    //  pass as defined above, we will have eaten all the delimiters, and in
    //  a CJK pass we are guaranteed that the first character is CJK.)
    state = BIGRAM_RESET; /* state is reset */
    // Advance until it is time to emit a token.
    // For ALPHA characters, this means advancing until we encounter a delimiter
    //  or a CJK character.  iOffset will be pointing at the delimiter or CJK
    //  character, aka one beyond the last ALPHA character.
    // For CJK characters this means advancing until we encounter an ALPHA
    //  character, a delimiter, or we have seen two consecutive CJK
    //  characters.  iOffset points at the ALPHA/delimiter in the first 2 cases
    //  and the second of two CJK characters in the last case.
    // Because of the way this loop is structured, iOffset is only updated
    //  when we don't terminate.  However, if we terminate, len still contains
    //  the number of bytes in the character found at iOffset.  (This is useful
    //  in the CJK case.)
    while (c->iOffset < c->nInput &&
           !isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
      c->iOffset += len;
      numChars++;
    }

    if (state == BIGRAM_USE) {
      /* Split word by bigram */
      // Right now iOffset is pointing at the second character in a pair.
      //  Save this offset so next-time through we start with that as the
      //  first character.
      c->iPrevBigramOffset = c->iOffset;
      // And now advance so that iOffset is pointing at the character after
      //  the second character in the bi-gram pair.  Also count the char.
      c->iOffset += len;
      numChars++;
    } else {
      /* Reset bigram offset */
      c->iPrevBigramOffset = 0;
    }

    /* We emit a token if:
     *  - there are two ideograms together,
     *  - there are three chars or more,
     *  - we think this is a query and wildcard magic is desired.
     * We think is a wildcard query when we have a single character, it starts
     *  at the start of the buffer, it's CJK, our current offset is one shy of
     *  nInput and the character at iOffset is '*'.  Because the state gets
     *  clobbered by the incidence of '*' our requirement for CJK is that the
     *  implied character length is at least 3 given that it takes at least 3
     *  bytes to encode to 0x2000.
     */
    // It is possible we have no token to emit here if iPrevBigramOffset was not
    //  0 on entry and there was no second CJK character.  iPrevBigramOffset
    //  will now be 0 if that is the case (and c->iOffset == iStartOffset).
    if (// allow two-character words only if in bigram
        (numChars == 2 && state == BIGRAM_USE) ||
        // otherwise, drop two-letter words (considered stop-words)
        (numChars >=3) ||
        // wildcard case:
        (numChars == 1 && iStartOffset == 0 &&
         (c->iOffset >= 3) &&
         (c->iOffset == c->nInput - 1) &&
         (z[c->iOffset] == '*'))) {
      /* figure out the number of bytes to copy/stem */
      int n = c->iOffset - iStartOffset;
      /* make sure there is enough buffer space; stemming/folding can grow
      ** the byte length of non-ASCII text by up to MAX_UTF8_GROWTH_FACTOR */
      if (n * MAX_UTF8_GROWTH_FACTOR > c->nAllocated) {
        c->nAllocated = n * MAX_UTF8_GROWTH_FACTOR + 20;
        c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
        if (c->zToken == NULL)
          return SQLITE_NOMEM;
      }

      if (state == BIGRAM_USE) {
        /* This is by bigram. So it is unnecessary to convert word */
        copy_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      } else {
        porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      }
      *pzToken = (const char*) c->zToken;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;
      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}