/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
**
** If the input word contains no digits but does contain characters
** not in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word into zReverse in REVERSED byte order (and case-folded).
  ** All of the suffix patterns passed to stem() below are spelled
  ** backwards (e.g. "gni" for "ing"), so suffix matching becomes prefix
  ** matching on the reversed copy.  The word ends up occupying
  ** zReverse[j+1 .. sizeof(zReverse)-6]; the trailing bytes are zeroed
  ** below so stem() can safely read past the end of the word. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  /* z points at the first byte of the reversed, case-folded word (i.e.
  ** the last letter of the original word).  The steps below shorten the
  ** word by advancing z or by rewriting its prefix (= the word's suffix). */
  z = &zReverse[j+1];

  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0)  ||
         stem(&z, "lb", "ble", 0)  ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2
  ** Dispatch on the second character of the reversed word (the
  ** penultimate character of the original) to narrow the suffix tests. */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0) ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4
  ** Suffixes here are simply dropped (z advances past them) when the
  ** measure test m_gt_1 passes on the remainder. */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
/**
 * Generate a new token.  There are basically three types of token we can
 * generate:
 * - A porter stemmed token.  This is a word entirely comprised of ASCII
 *   characters.  We run the porter stemmer algorithm against the word.
 *   Because we have no way to know what is and is not an English word
 *   (the only language for which the porter stemmer was designed), this
 *   could theoretically map multiple words that are not variations of the
 *   same word down to the same root, resulting in potentially unexpected
 *   result inclusions in the search results.  We accept this result because
 *   there's not a lot we can do about it and false positives are much
 *   better than false negatives.
 * - A copied token; case/accent-folded but not stemmed.  We call the porter
 *   stemmer for all non-CJK cases and it diverts to the copy stemmer if it
 *   sees any non-ASCII characters (after folding) or if the string is too
 *   long.  The copy stemmer will shrink the string if it is deemed too long.
 * - A bi-gram token; two CJK-ish characters.  For query reasons we generate a
 *   series of overlapping bi-grams.  (We can't require the user to start their
 *   search based on the arbitrary context of the indexed documents.)
 *
 * It may be useful to think of this function as operating at the points
 * between characters.  While we are considering the 'current' character (the
 * one after the 'point'), we are also interested in the 'previous' character
 * (the one preceding the point).
 * At any 'point', there are a number of possible situations which I will
 * illustrate with pairs of characters.  'a' means alphanumeric ASCII or a
 * non-ASCII character that is not bi-grammable or a delimiter, '.'
 * means a delimiter (space or punctuation), '&' means a bi-grammable
 * character.
 * - aa: We are in the midst of a token.  State remains BIGRAM_ALPHA.
 * - a.: We will generate a porter stemmed or copied token.  State was
 *       BIGRAM_ALPHA, gets set to BIGRAM_RESET.
 * - a&: We will generate a porter stemmed or copied token; we will set our
 *       state to BIGRAM_UNKNOWN to indicate we have seen one bigram character
 *       but that it is not yet time to emit a bigram.
 * - .a: We are starting a token.  State was BIGRAM_RESET, gets set to
 *       BIGRAM_ALPHA.
 * - ..: We skip/eat the delimiters.  State stays BIGRAM_RESET.
 * - .&: State set to BIGRAM_UNKNOWN to indicate we have seen one bigram char.
 * - &a: If the state was BIGRAM_USE, we generate a bi-gram token.  If the
 *       state was BIGRAM_UNKNOWN we had only seen one CJK character and so
 *       don't do anything.  State is set to BIGRAM_ALPHA.
 * - &.: Same as the "&a" case, but state is set to BIGRAM_RESET.
 * - &&: We will generate a bi-gram token.  State was either BIGRAM_UNKNOWN or
 *       BIGRAM_USE, gets set to BIGRAM_USE.
 */
static int porterNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
  const char **pzToken,               /* OUT: *pzToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  const unsigned char *z = (unsigned char *) c->zInput;
  int len = 0;
  int state;

  while( c->iOffset < c->nInput ){
    int iStartOffset, numChars;

    /*
     * This loop basically has two modes of operation:
     * - general processing (iPrevBigramOffset == 0 here)
     * - CJK processing (iPrevBigramOffset != 0 here)
     *
     * In a general processing pass we skip over all the delimiters, leaving
     * us at a character that promises to produce a token.  This could be a
     * CJK token (state == BIGRAM_USE) or an ALPHA token
     * (state == BIGRAM_ALPHA).  If it was a CJK token, we transition into
     * CJK state for the next loop.
     * If it was an alpha token, our current offset is pointing at a delimiter
     * (which could be a CJK character), so it is good that our next pass
     * through the function and loop will skip over any delimiters.  If the
     * delimiter we hit was a CJK character, the next time through we will
     * not treat it as a delimiter though; the entry state for that scan is
     * BIGRAM_RESET so the transition is not treated as a delimiter!
     *
     * The CJK pass always starts with the second character in a bi-gram
     * emitted as a token in the previous step.  No delimiter skipping is
     * required because we know that first character might produce a token
     * for us.  It only 'might' produce a token because the previous pass
     * performed no lookahead and cannot be sure it is followed by another
     * CJK character.  This is why a CJK pass can end up emitting no token
     * at all (see the note above the emission test below).
     */
    if (c->iPrevBigramOffset == 0) {
      /* General pass: scan past delimiter characters. */
      state = BIGRAM_RESET; /* reset */
      while (c->iOffset < c->nInput &&
             isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
        c->iOffset += len;
      }
    } else {
      /* CJK pass: resume at the second character of the bi-gram emitted by
      ** the previous call. */
      c->iOffset = c->iPrevBigramOffset;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    numChars = 0;

    // Start from a reset state.  This means the first character we see
    // (which will not be a delimiter) determines which of ALPHA or CJK modes
    // we are operating in.  (It won't be a delimiter because in a 'general'
    // pass as defined above, we will have eaten all the delimiters, and in
    // a CJK pass we are guaranteed that the first character is CJK.)
    state = BIGRAM_RESET; /* state is reset */
    // Advance until it is time to emit a token.
    // For ALPHA characters, this means advancing until we encounter a
    // delimiter or a CJK character.  iOffset will be pointing at the
    // delimiter or CJK character, aka one beyond the last ALPHA character.
    // For CJK characters this means advancing until we encounter an ALPHA
    // character, a delimiter, or we have seen two consecutive CJK
    // characters.  iOffset points at the ALPHA/delimiter in the first 2
    // cases and the second of two CJK characters in the last case.
    // Because of the way this loop is structured, iOffset is only updated
    // when we don't terminate.  However, if we terminate, len still contains
    // the number of bytes in the character found at iOffset.  (This is
    // useful in the CJK case.)
    while (c->iOffset < c->nInput &&
           !isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
      c->iOffset += len;
      numChars++;
    }

    if (state == BIGRAM_USE) {
      /* Split word by bigram */
      // Right now iOffset is pointing at the second character in a pair.
      // Save this offset so next-time through we start with that as the
      // first character.
      c->iPrevBigramOffset = c->iOffset;
      // And now advance so that iOffset is pointing at the character after
      // the second character in the bi-gram pair.  Also count the char.
      c->iOffset += len;
      numChars++;
    } else {
      /* Reset bigram offset */
      c->iPrevBigramOffset = 0;
    }

    /* We emit a token if:
     * - there are two ideograms together,
     * - there are three chars or more,
     * - we think this is a query and wildcard magic is desired.
     * We think this is a wildcard query when we have a single character, it
     * starts at the start of the buffer, it's CJK, our current offset is one
     * shy of nInput and the character at iOffset is '*'.  Because the state
     * gets clobbered by the incidence of '*' our requirement for CJK is that
     * the implied character length is at least 3 given that it takes at
     * least 3 bytes to encode to 0x2000.
     */
    // It is possible we have no token to emit here if iPrevBigramOffset was
    // not 0 on entry and there was no second CJK character.
    // iPrevBigramOffset will now be 0 if that is the case (and
    // c->iOffset == iStartOffset).
    if (// allow two-character words only if in bigram
        (numChars == 2 && state == BIGRAM_USE) ||
        // otherwise, drop two-letter words (considered stop-words)
        (numChars >= 3) ||
        // wildcard case:
        (numChars == 1 && iStartOffset == 0 &&
         (c->iOffset >= 3) &&
         (c->iOffset == c->nInput - 1) &&
         (z[c->iOffset] == '*'))) {
      /* figure out the number of bytes to copy/stem */
      int n = c->iOffset - iStartOffset;
      /* make sure there is enough buffer space */
      if (n * MAX_UTF8_GROWTH_FACTOR > c->nAllocated) {
        /* Use a temporary for the realloc result: assigning the result
        ** directly to c->zToken would leak the old buffer on OOM (the
        ** cursor could no longer free it at close time). */
        void *zNew;
        c->nAllocated = n * MAX_UTF8_GROWTH_FACTOR + 20;
        zNew = sqlite3_realloc(c->zToken, c->nAllocated);
        if (zNew == NULL)
          return SQLITE_NOMEM;
        c->zToken = zNew;
      }

      if (state == BIGRAM_USE) {
        /* This is a bigram.  So it is unnecessary to stem the word. */
        copy_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      } else {
        porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      }

      *pzToken = (const char*) c->zToken;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;
      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}