/*
 * Locate the next word in 'in', where words are runs of characters for
 * which t_isspace() is false.
 *
 * Returns the address of the word's first byte and stores the address of
 * the byte just past the word in *end.  When only whitespace (or nothing)
 * remains, returns NULL and sets *end to NULL.
 */
static char *
findwrd(char *in, char **end)
{
	char	   *cursor = in;
	char	   *word;

	/* Step over whitespace in front of the word, one character at a time */
	for (; *cursor != '\0' && t_isspace(cursor); cursor += pg_mblen(cursor))
		;

	/* Blank line: report "no word" through both outputs */
	if (*cursor == '\0')
	{
		*end = NULL;
		return NULL;
	}

	/* Remember where the word begins, then run to its end */
	for (word = cursor; *cursor != '\0' && !t_isspace(cursor); cursor += pg_mblen(cursor))
		;

	*end = cursor;
	return word;
}
/*
 * Lexize routine: strips every leading '-' from the input token and returns
 * the remainder as a single lexeme.
 *
 * Argument 1 is a pointer to the token bytes, argument 2 its length in
 * bytes.  The result is a palloc'd, zero-initialized array of two TSLexeme
 * entries: entry 0 holds a NUL-terminated copy of the stripped token and
 * entry 1 keeps a NULL lexeme (presumably the list terminator expected by
 * the caller -- confirm against the dictionary API).
 *
 * The input is scanned in place, bounded by its length, so that only one
 * copy is made.  An earlier form duplicated the whole token first and then
 * lost the pointer to that temporary copy, leaking it.
 */
Datum
dabs_int_lexize(PG_FUNCTION_ARGS)
{
	char	   *in = (char *) PG_GETARG_POINTER(1);
	int			len = PG_GETARG_INT32(2);
	char	   *stop = in + len;
	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

	/* Skip leading minus signs without running past the token */
	while (in < stop && t_iseq(in, '-'))
		in += pg_mblen(in);

	/* Copy what is left; pnstrdup supplies the terminating NUL */
	res[0].lexeme = pnstrdup(in, stop - in);
	res[1].lexeme = NULL;

	PG_RETURN_POINTER(res);
}
/*
 * Report whether the (possibly multibyte) character found at 'c' occurs in
 * the NUL-terminated string 'str'.
 *
 * The string is walked one character at a time (lengths from pg_mblen); a
 * character matches when it has the same byte length as the one at 'c' and
 * all of its bytes are equal.
 */
static bool
mb_strchr(char *str, char *c)
{
	int			wanted = pg_mblen(c);
	char	   *cur;

	for (cur = str; *cur != '\0';)
	{
		int			step = pg_mblen(cur);

		if (step == wanted)
		{
			int			k = 0;

			/* count how many leading bytes agree */
			while (k < step && cur[k] == c[k])
				k++;
			if (k == step)
				return true;
		}
		cur += step;
	}

	return false;
}
/*
 * Write 'srclen' bytes of 'src' into 'dst' as a single-quoted literal and
 * return the number of bytes written.  No terminating NUL is added.
 *
 * Single quotes are doubled and backslashes are doubled.  The first time a
 * backslash is met, everything written so far is shifted right by one byte
 * and an 'E' is placed in front of the opening quote.  Multibyte characters
 * (pg_mblen != 1) are copied through untouched, never reading beyond the
 * end of the source.
 *
 * The caller must supply a destination large enough for the result.
 */
static int
pgq_quote_literal(char *dst, const uint8 *src, int srclen)
{
	const uint8 *in = src;
	const uint8 *in_end = src + srclen;
	char	   *out = dst;
	bool		have_prefix = false;

	*out++ = '\'';

	while (in < in_end)
	{
		int			chlen = pg_mblen((const char *) in);

		if (chlen != 1)
		{
			/* copy the whole multibyte character verbatim */
			for (; chlen > 0 && in < in_end; chlen--)
				*out++ = *in++;
			continue;
		}

		switch (*in)
		{
			case '\'':
				*out++ = '\'';
				break;

			case '\\':
				if (!have_prefix)
				{
					/* make room for 'E' in front of the opening quote */
					memmove(dst + 1, dst, out - dst);
					*dst = 'E';
					out++;
					have_prefix = true;
				}
				*out++ = '\\';
				break;

			default:
				break;
		}

		*out++ = *in++;
	}

	*out++ = '\'';

	return out - dst;
}
/* * returns the common prefix length of a node key */ static int32 gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) { GBT_VARKEY_R r = gbt_var_key_readable(node); int32 i = 0; int32 l = 0; int32 t1len = VARSIZE(r.lower) - VARHDRSZ; int32 t2len = VARSIZE(r.upper) - VARHDRSZ; int32 ml = Min(t1len, t2len); char *p1 = VARDATA(r.lower); char *p2 = VARDATA(r.upper); if (ml == 0) return 0; while (i < ml) { if (tinfo->eml > 1 && l == 0) { if ((l = pg_mblen(p1)) != pg_mblen(p2)) { return i; } } if (*p1 != *p2) { if (tinfo->eml > 1) { return (i - l + 1); } else { return i; } } p1++; p2++; l--; i++; } return (ml); /* lower == upper */ }
/*
 * Match 'str' against the compiled pattern 'r'.
 *
 * The string must contain at least r->nchar characters.  For a suffix
 * pattern (r->issuffix) matching starts r->nchar characters before the end
 * of the string, otherwise at its beginning.  Each pattern node is then
 * checked against one character: an RSF_ONEOF node requires the character
 * to be present in the node's data, an RSF_NONEOF node requires it to be
 * absent.  Any other node type raises an error.
 */
bool
RS_execute(Regis *r, char *str)
{
	RegisNode  *node;
	char	   *pos;
	int			nchars = 0;

	/* Count characters (not bytes) in the input */
	for (pos = str; *pos; pos += pg_mblen(pos))
		nchars++;

	if (nchars < r->nchar)
		return false;

	pos = str;
	if (r->issuffix)
	{
		int			skip;

		/* Position on the last r->nchar characters */
		for (skip = nchars - r->nchar; skip > 0; skip--)
			pos += pg_mblen(pos);
	}

	for (node = r->node; node != NULL; node = node->next)
	{
		if (node->type == RSF_ONEOF)
		{
			if (!mb_strchr((char *) node->data, pos))
				return false;
		}
		else if (node->type == RSF_NONEOF)
		{
			if (mb_strchr((char *) node->data, pos))
				return false;
		}
		else
			elog(ERROR, "unrecognized regis node type: %d", node->type);

		pos += pg_mblen(pos);
	}

	return true;
}
/*
 * Check whether 'prosrc' equals the contents of a single-quoted literal,
 * and translate a cursor position in prosrc into one in the literal.
 *
 * 'literal' points just past the opening quote; the closing quote must
 * directly follow the matched text.  Backslashes and doubled quotes in the
 * literal are understood; literals continued across lines are not.
 *
 * *newcursorpos is always assigned: it starts as 'cursorpos' and is bumped
 * by one for every escaping byte (backslash or first quote of a pair) found
 * before the cursor.  The comparison advances by whole characters so that
 * the cursor arithmetic is in characters, not bytes.
 *
 * Returns true on a full match, false otherwise.
 */
static bool
match_prosrc_to_literal(const char *prosrc, const char *literal,
						int cursorpos, int *newcursorpos)
{
	int			shifted = cursorpos;
	bool		ok = true;

	while (*prosrc != '\0')
	{
		bool		escaped = false;
		int			nbytes;

		cursorpos--;			/* characters left before cursor */

		if (*literal == '\\')
			escaped = true;
		else if (*literal == '\'')
		{
			/* a lone quote here would end the literal too early */
			if (literal[1] != '\'')
			{
				ok = false;
				break;
			}
			escaped = true;
		}

		if (escaped)
		{
			/* skip the escaping byte; it shifts the cursor if before it */
			literal++;
			if (cursorpos > 0)
				shifted++;
		}

		nbytes = pg_mblen(prosrc);
		if (strncmp(prosrc, literal, nbytes) != 0)
		{
			ok = false;
			break;
		}
		prosrc += nbytes;
		literal += nbytes;
	}

	/* The source is exhausted: the literal must end right here */
	if (ok)
		ok = (*literal == '\'' && literal[1] != '\'');

	/* Assigned on failure too, which keeps compilers quiet */
	*newcursorpos = shifted;
	return ok;
}
/*
 * Find the next '_'-delimited piece inside [start, end).
 *
 * Leading single-byte '_' characters are skipped.  If nothing remains,
 * NULL is returned and *len is left untouched.  Otherwise the address of
 * the piece is returned and *len receives its length in bytes; the piece
 * runs up to the next single-byte '_' or to 'end'.
 */
static char *
getlexeme(char *start, char *end, int *len)
{
	char	   *stop;
	int			chlen;

	/* Step over leading underscores */
	while (start < end)
	{
		chlen = pg_mblen(start);
		if (chlen != 1 || !t_iseq(start, '_'))
			break;
		start += chlen;
	}

	if (start >= end)
		return NULL;

	/* Advance to the next underscore or to the end of the range */
	for (stop = start; stop < end; stop += chlen)
	{
		chlen = pg_mblen(stop);
		if (chlen == 1 && t_iseq(stop, '_'))
			break;
	}

	*len = stop - start;
	return start;
}
/*
 * Append the trigrams of a (pre-padded) word to the array at 'tptr'.
 *
 * 'str' holds 'bytelen' bytes making up 'charlen' characters.  Words with
 * fewer than three characters produce nothing.  Returns the position just
 * past the last trigram written.
 */
static trgm *
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
{
	char	   *cur = str;

	if (charlen < 3)
		return tptr;

	if (bytelen <= charlen)
	{
		/* Only single-byte characters: every 3-byte window is a trigram */
		Assert(bytelen == charlen);

		for (; cur - str < bytelen - 2; cur++, tptr++)
		{
			CPTRGM(tptr, cur);
		}
		return tptr;
	}

	/*
	 * Multibyte data: slide a window of three characters over the word,
	 * tracking each character's byte length, and hand every window to
	 * compact_trigram().
	 */
	{
		int			len1 = pg_mblen(str);
		int			len2 = pg_mblen(str + len1);
		int			len3 = pg_mblen(str + len1 + len2);

		while ((cur - str) + len1 + len2 + len3 <= bytelen)
		{
			compact_trigram(tptr, cur, len1 + len2 + len3);
			tptr++;

			cur += len1;
			len1 = len2;
			len2 = len3;
			len3 = pg_mblen(cur + len1 + len2);
		}
	}

	return tptr;
}
/*
 * Append the trigrams of a (pre-padded) word to the array at 'tptr'.
 *
 * 'str' holds 'bytelen' bytes making up 'charlen' characters.  Words with
 * fewer than three characters produce nothing.  Returns the position just
 * past the last trigram written.
 */
static trgm *
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
{
	char	   *cur = str;

	if (charlen < 3)
		return tptr;

#ifdef USE_WIDE_UPPER_LOWER
	if (pg_database_encoding_max_length() > 1)
	{
		/*
		 * Multibyte encoding: slide a window of three characters over the
		 * word and pass each window to cnt_trigram().
		 */
		int			len1 = pg_mblen(str);
		int			len2 = pg_mblen(str + len1);
		int			len3 = pg_mblen(str + len1 + len2);

		while ((cur - str) + len1 + len2 + len3 <= bytelen)
		{
			cnt_trigram(tptr, cur, len1 + len2 + len3);
			tptr++;

			cur += len1;
			len1 = len2;
			len2 = len3;
			len3 = pg_mblen(cur + len1 + len2);
		}
		return tptr;
	}
#endif

	/* Single-byte path: number of trigrams = strlen - 2 */
	Assert(bytelen == charlen);

	for (; cur - str < bytelen - 2; cur++, tptr++)
	{
		CPTRGM(tptr, cur);
	}

	return tptr;
}
/*
 * Locate the first whitespace-delimited word in 'in'.
 *
 * Returns NULL, with *end set to NULL, when the line is empty, contains
 * only whitespace, or its first non-whitespace byte is '#'.  Otherwise
 * returns the start of the word and stores the address just past it in
 * *end.
 */
static char *
find_word(char *in, char **end)
{
	char	   *word;

	*end = NULL;

	/* Skip leading whitespace */
	for (; *in != '\0' && t_isspace(in); in += pg_mblen(in))
		;

	if (*in == '\0' || *in == '#')
		return NULL;

	/* Scan to the end of the word */
	for (word = in; *in != '\0' && !t_isspace(in); in += pg_mblen(in))
		;

	*end = in;
	return word;
}
/*
 * Find the first word (run of characters accepted by iswordchr) within the
 * first 'lenstr' bytes of 'str'.
 *
 * Returns a pointer to the start of the word, or NULL if there is none; in
 * the NULL case *endword and *charlen are not modified.  On success
 * *endword points just past the word and *charlen holds the number of
 * characters in it.
 */
static char *
find_word(char *str, int lenstr, char **endword, int *charlen)
{
	char	   *word = str;

	/* Look for the first word character, giving up at the end of input */
	for (;;)
	{
		if (word - str >= lenstr)
			return NULL;
		if (iswordchr(word))
			break;
		word += pg_mblen(word);
	}

	/* Extend over the word, counting characters as we go */
	*charlen = 0;
	for (*endword = word; *endword - str < lenstr && iswordchr(*endword);)
	{
		*endword += pg_mblen(*endword);
		(*charlen)++;
	}

	return word;
}
/*--------------------
 * Support routine for MatchText.  Compares the multibyte characters at p1
 * and p2.  Returns 1 if they have the same byte length and identical bytes,
 * otherwise 0.
 *--------------------
 */
static inline int
wchareq(char *p1, char *p2)
{
	int			nbytes;
	int			k;

	/* Cheap rejection: differing first bytes can never match */
	if (*p1 != *p2)
		return 0;

	nbytes = pg_mblen(p1);
	if (nbytes != pg_mblen(p2))
		return 0;

	/* Same length, so compare byte by byte */
	for (k = 0; k < nbytes; k++)
	{
		if (p1[k] != p2[k])
			return 0;
	}

	return 1;
}
/*
 * Locate the next whitespace-delimited word in 'in'.
 *
 * Returns the address of the word's first byte and stores the address of
 * the byte just past the word in *end.  On an empty or all-whitespace line
 * returns NULL, sets *end to NULL and leaves *flags alone.
 *
 * When 'flags' is not NULL, a single-byte '*' as the word's last character
 * is not treated as part of the word: *end then points at the '*' and
 * *flags is set to TSL_PREFIX.  In every other case with non-NULL 'flags',
 * *flags is set to 0.
 */
static char *
findwrd(char *in, char **end, uint16 *flags)
{
	char	   *word;
	char	   *tail;			/* start of the last character scanned */

	/* Skip leading whitespace */
	for (; *in != '\0' && t_isspace(in); in += pg_mblen(in))
		;

	/* Nothing on this line */
	if (*in == '\0')
	{
		*end = NULL;
		return NULL;
	}

	/* Walk over the word, remembering where its final character starts */
	word = tail = in;
	for (; *in != '\0' && !t_isspace(in); in += pg_mblen(in))
		tail = in;

	if (flags != NULL && in - tail == 1 && t_iseq(tail, '*'))
	{
		/* trailing '*' marks a prefix; exclude it from the word */
		*flags = TSL_PREFIX;
		*end = tail;
		return word;
	}

	if (flags != NULL)
		*flags = 0;
	*end = in;

	return word;
}
/*
 * Return a freshly palloc'd, NUL-terminated copy of the single (possibly
 * multibyte) character that starts at 's'.
 */
static char *
extract_mb_char(char *s)
{
	int			nbytes = pg_mblen(s);
	char	   *copy = palloc(nbytes + 1);

	memcpy(copy, s, nbytes);
	copy[nbytes] = '\0';

	return copy;
}
/*
 * Multibyte-aware isprint(): tests the character starting at 'ptr'.
 *
 * Single-byte characters, and any character when lc_ctype_is_c() is true,
 * are tested with isprint() on the first byte.  Otherwise the character is
 * converted with char2wchar() and tested with iswprint().
 */
int
t_isprint(const char *ptr)
{
	wchar_t		wide[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		char2wchar(wide, 2, ptr, nbytes);
		return iswprint((wint_t) wide[0]);
	}

	return isprint(TOUCHAR(ptr));
}
/*
 * Count the characters in a multibyte string that need not be
 * NUL-terminated.  Counting stops after 'limit' bytes have been consumed
 * or at the first NUL byte, whichever comes first.
 */
int
pg_mbstrlen_with_len(const unsigned char *mbstr, int limit)
{
	int			nchars;

	for (nchars = 0; limit > 0 && *mbstr; nchars++)
	{
		int			step = pg_mblen(mbstr);

		mbstr += step;
		limit -= step;
	}

	return nchars;
}
/*
 * Multibyte-aware isprint(): tests the character starting at 'ptr'.
 *
 * Single-byte characters, and any character when lc_ctype_is_c() reports
 * true for the collation, are tested with isprint() on the first byte.
 * Otherwise the character is converted with char2wchar() and tested with
 * iswprint().  Collation and locale are fixed placeholders for now.
 */
int
t_isprint(const char *ptr)
{
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */
	wchar_t		wide[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c(collation))
	{
		char2wchar(wide, 2, ptr, nbytes, mylocale);
		return iswprint((wint_t) wide[0]);
	}

	return isprint(TOUCHAR(ptr));
}
/*
 * Return the number of characters in a NUL-terminated multibyte string.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* With at most one byte per character, bytes and characters coincide */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	for (nchars = 0; *mbstr != '\0'; nchars++)
		mbstr += pg_mblen(mbstr);

	return nchars;
}
static char * percent_encode(unsigned char *s, int srclen) { unsigned char *end; StringInfoData buf; int len; initStringInfo(&buf); if (srclen < 0) srclen = strlen((char *) s); end = s + srclen; for (; s < end; s += len) { unsigned char *utf; int ulen; len = pg_mblen((const char *) s); if (len == 1) { if (('0' <= s[0] && s[0] <= '9') || ('A' <= s[0] && s[0] <= 'Z') || ('a' <= s[0] && s[0] <= 'z') || (s[0] == '-') || (s[0] == '.') || (s[0] == '_') || (s[0] == '~')) { appendStringInfoChar(&buf, s[0]); continue; } } utf = pg_do_encoding_conversion(s, len, GetDatabaseEncoding(), PG_UTF8); ulen = pg_encoding_mblen(PG_UTF8, (const char *) utf); while(ulen--) { appendStringInfo(&buf, "%%%2X", *utf); utf++; } } return buf.data; }
/*
 * Count the characters in a multibyte string that need not be
 * NUL-terminated.  Counting stops after 'limit' bytes have been consumed
 * or at the first NUL byte, whichever comes first.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;

	/* With at most one byte per character, the byte limit is the answer */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	for (nchars = 0; limit > 0 && *mbstr; nchars++)
	{
		int			step = pg_mblen(mbstr);

		mbstr += step;
		limit -= step;
	}

	return nchars;
}
/*
 * Return a palloc'd, NUL-terminated copy of 'str' wrapped in single quotes,
 * with every single-byte quote and backslash doubled.  Multibyte
 * characters are copied unchanged.  A NULL input yields NULL.
 */
static char *
slon_quote_literal(char *str)
{
	char	   *result;
	char	   *src;
	char	   *dst;
	int			remaining;

	if (str == NULL)
		return NULL;

	remaining = strlen(str);

	/* worst case: every byte doubled, plus two quotes and the NUL */
	result = palloc(remaining * 2 + 3);
	src = str;
	dst = result;

	*dst++ = '\'';
	while (remaining > 0)
	{
		int			chlen = pg_mblen((unsigned char *) src);

		if (chlen != 1)
		{
			/* multibyte character: copy all of its bytes as they are */
			remaining -= chlen;
			for (; chlen > 0; chlen--)
				*dst++ = *src++;
			continue;
		}

		/* a quote or backslash is emitted twice */
		if (*src == '\'' || *src == '\\')
			*dst++ = *src;
		*dst++ = *src++;
		remaining--;
	}
	*dst++ = '\'';
	*dst = '\0';

	return result;
}
/* * Test whether a regex is of the subset supported here. * Keep this in sync with RS_compile! */ bool RS_isRegis(const char *str) { int state = RS_IN_WAIT; const char *c = str; while (*c) { if (state == RS_IN_WAIT) { if (t_isalpha(c)) /* okay */ ; else if (t_iseq(c, '[')) state = RS_IN_ONEOF; else return false; } else if (state == RS_IN_ONEOF) { if (t_iseq(c, '^')) state = RS_IN_NONEOF; else if (t_isalpha(c)) state = RS_IN_ONEOF_IN; else return false; } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { if (t_isalpha(c)) /* okay */ ; else if (t_iseq(c, ']')) state = RS_IN_WAIT; else return false; } else elog(ERROR, "internal error in RS_isRegis: state %d", state); c += pg_mblen(c); } return (state == RS_IN_WAIT); }
/*
 * Return the byte length of the longest leading part of a multibyte string
 * (not necessarily NUL-terminated, 'len' bytes long) that fits in 'limit'
 * bytes without splitting a multibyte character.
 */
int
pg_mbcliplen(const unsigned char *mbstr, int len, int limit)
{
	int			kept = 0;

	/* Single-byte encodings are delegated to cliplen() */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	for (; len > 0 && *mbstr;)
	{
		int			step = pg_mblen(mbstr);

		/* the next character would overshoot the limit */
		if (kept + step > limit)
			break;

		kept += step;

		/* limit reached exactly */
		if (kept == limit)
			break;

		mbstr += step;
		len -= step;
	}

	return kept;
}
/*
 * Like pg_mbcliplen, except that 'limit' is a number of characters rather
 * than bytes: returns the byte length of at most 'limit' leading characters
 * of the string.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			kept = 0;
	int			seen = 0;

	/* Single-byte encodings are delegated to cliplen() */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	for (; len > 0 && *mbstr;)
	{
		int			step = pg_mblen(mbstr);

		/* stop once this character would exceed the character limit */
		if (++seen > limit)
			break;

		kept += step;
		mbstr += step;
		len -= step;
	}

	return kept;
}
/*
 * Return a palloc'd, NUL-terminated copy of 'lptr' wrapped in single
 * quotes, with a backslash placed in front of every single-byte quote and
 * backslash.  Multibyte characters are copied unchanged.
 */
static char *
do_quote_literal(char *lptr)
{
	int			remaining = strlen(lptr);
	/* worst case: every byte escaped, plus two quotes and the NUL */
	char	   *quoted = (char *) palloc(remaining * 2 + 3);
	char	   *out = quoted;

	*out++ = '\'';
	while (remaining > 0)
	{
		int			chlen = pg_mblen(lptr);

		if (chlen != 1)
		{
			/* multibyte character: copy all of its bytes as they are */
			remaining -= chlen;
			for (; chlen > 0; chlen--)
				*out++ = *lptr++;
			continue;
		}

		/* escape the quote or backslash; the byte itself follows below */
		if (*lptr == '\'' || *lptr == '\\')
			*out++ = '\\';
		*out++ = *lptr++;
		remaining--;
	}
	*out++ = '\'';
	*out = '\0';

	return quoted;
}
/*
 * Run the query given in 'txt' through SPI and accumulate statistics over
 * the tsvector values it returns.
 *
 * The query must produce exactly one column whose type is binary-coercible
 * to tsvector; otherwise an error is raised.  'ws', when not NULL, is a
 * text value whose single-byte characters A/B/C/D (either case) select
 * weight bits 3/2/1/0 in stat->weight; other characters are ignored.
 *
 * The statistics structure is allocated in 'persistentContext'.  Rows are
 * fetched from a cursor 100 at a time and passed, when not NULL, to
 * ts_accum().  Returns the accumulated statistics.
 */
static TSVectorStat *
ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
{
	char	   *query = text_to_cstring(txt);
	TSVectorStat *stat;
	Portal		portal;
	SPIPlanPtr	plan;
	bool		isnull;
	int			i;

	plan = SPI_prepare(query, 0, NULL);
	if (plan == NULL)
		/* internal error */
		elog(ERROR, "SPI_prepare(\"%s\") failed", query);

	portal = SPI_cursor_open(NULL, plan, NULL, NULL, true);
	if (portal == NULL)
		/* internal error */
		elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);

	SPI_cursor_fetch(portal, true, 100);

	if (SPI_tuptable == NULL ||
		SPI_tuptable->tupdesc->natts != 1 ||
		!IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1),
						   TSVECTOROID))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("ts_stat query must return one tsvector column")));

	stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
	stat->maxdepth = 1;

	if (ws)
	{
		char	   *wp;

		/* Translate each weight letter into its bit */
		for (wp = VARDATA(ws);
			 wp - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ;
			 wp += pg_mblen(wp))
		{
			char		ch;

			if (pg_mblen(wp) != 1)
				continue;

			ch = *wp;
			if (ch == 'A' || ch == 'a')
				stat->weight |= 1 << 3;
			else if (ch == 'B' || ch == 'b')
				stat->weight |= 1 << 2;
			else if (ch == 'C' || ch == 'c')
				stat->weight |= 1 << 1;
			else if (ch == 'D' || ch == 'd')
				stat->weight |= 1;
		}
	}

	/* Consume the cursor batch by batch until a fetch returns no rows */
	while (SPI_processed > 0)
	{
		for (i = 0; i < SPI_processed; i++)
		{
			Datum		data = SPI_getbinval(SPI_tuptable->vals[i],
											 SPI_tuptable->tupdesc,
											 1, &isnull);

			if (isnull)
				continue;

			stat = ts_accum(persistentContext, stat, data);
		}

		SPI_freetuptable(SPI_tuptable);
		SPI_cursor_fetch(portal, true, 100);
	}

	SPI_freetuptable(SPI_tuptable);
	SPI_cursor_close(portal);
	SPI_freeplan(plan);
	pfree(query);

	return stat;
}
/*
 * Extract the next non-wildcard part of a search string, ie, a word bounded
 * by '_' or '%' meta-characters, non-word characters or string end.
 *
 * str: source string, of length lenstr bytes (need not be null-terminated)
 * buf: where to return the substring (must be long enough)
 * *bytelen: receives byte length of the found substring
 * *charlen: receives character length of the found substring
 *
 * Returns pointer to end+1 of the found substring in the source string.
 * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 *
 * If the found word is bounded by non-word characters or string boundaries
 * then this function will include corresponding padding spaces into buf.
 * The amount of padding is governed by LPADDING and RPADDING (at most two
 * spaces on each side), and each padding space counts as one character in
 * *charlen.  Escape characters are not copied into buf.
 */
static const char *
get_wildcard_part(const char *str, int lenstr,
				  char *buf, int *bytelen, int *charlen)
{
	const char *beginword = str;
	const char *endword;
	char	   *s = buf;		/* next write position in buf */
	bool		in_leading_wildcard_meta = false;
	bool		in_trailing_wildcard_meta = false;
	bool		in_escape = false;
	int			clen;			/* byte length of the current character */

	/*
	 * Find the first word character, remembering whether preceding character
	 * was wildcard meta-character.  Note that the in_escape state persists
	 * from this loop to the next one, since we may exit at a word character
	 * that is in_escape.
	 */
	while (beginword - str < lenstr)
	{
		if (in_escape)
		{
			/* an escaped word character starts the word */
			if (iswordchr(beginword))
				break;
			/* an escaped non-word character is just a separator */
			in_escape = false;
			in_leading_wildcard_meta = false;
		}
		else
		{
			if (ISESCAPECHAR(beginword))
				in_escape = true;
			else if (ISWILDCARDCHAR(beginword))
				in_leading_wildcard_meta = true;
			else if (iswordchr(beginword))
				break;
			else
				in_leading_wildcard_meta = false;
		}
		beginword += pg_mblen(beginword);
	}

	/*
	 * Handle string end.
	 */
	if (beginword - str >= lenstr)
		return NULL;

	/*
	 * Add left padding spaces if preceding character wasn't wildcard
	 * meta-character.
	 */
	*charlen = 0;
	if (!in_leading_wildcard_meta)
	{
		if (LPADDING > 0)
		{
			*s++ = ' ';
			(*charlen)++;
			if (LPADDING > 1)
			{
				*s++ = ' ';
				(*charlen)++;
			}
		}
	}

	/*
	 * Copy data into buf until wildcard meta-character, non-word character or
	 * string boundary.  Strip escapes during copy.
	 */
	endword = beginword;
	while (endword - str < lenstr)
	{
		clen = pg_mblen(endword);
		if (in_escape)
		{
			if (iswordchr(endword))
			{
				/* escaped word character: copy it literally */
				memcpy(s, endword, clen);
				(*charlen)++;
				s += clen;
			}
			else
			{
				/*
				 * Back up endword to the escape character when stopping at
				 * an escaped char, so that subsequent get_wildcard_part will
				 * restart from the escape character.  We assume here that
				 * escape chars are single-byte.
				 */
				endword--;
				break;
			}
			in_escape = false;
		}
		else
		{
			if (ISESCAPECHAR(endword))
				in_escape = true;
			else if (ISWILDCARDCHAR(endword))
			{
				/* word ends at a wildcard: no right padding later */
				in_trailing_wildcard_meta = true;
				break;
			}
			else if (iswordchr(endword))
			{
				memcpy(s, endword, clen);
				(*charlen)++;
				s += clen;
			}
			else
				break;
		}
		endword += clen;
	}

	/*
	 * Add right padding spaces if next character isn't wildcard
	 * meta-character.
	 */
	if (!in_trailing_wildcard_meta)
	{
		if (RPADDING > 0)
		{
			*s++ = ' ';
			(*charlen)++;
			if (RPADDING > 1)
			{
				*s++ = ' ';
				(*charlen)++;
			}
		}
	}

	/* bytes written to buf, padding included */
	*bytelen = s - buf;
	return endword;
}
/*
 * Load a thesaurus file into 'd'.
 *
 * 'filename' is resolved with get_tsearch_config_filename() using the
 * "ths" extension; failure to open the file is an error.  Lines that are
 * empty, contain only whitespace, or start (after whitespace) with '#' are
 * skipped.
 *
 * Every other line has the form "lexemes : substitute words".  Words on
 * the left of the ':' are passed to newLexeme(), words on the right to
 * addWrd().  A substitute word introduced by '*' is added with useasis
 * set; one introduced by a backslash has that backslash dropped.  A line
 * must supply at least one lexeme and one substitute word, otherwise an
 * error is raised.  On return d->nsubst holds the number of lines
 * processed.
 */
static void
thesaurusRead(char *filename, DictThesaurus *d)
{
	tsearch_readline_state trst;
	uint16		idsubst = 0;
	bool		useasis = false;
	char	   *line;

	filename = get_tsearch_config_filename(filename, "ths");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open thesaurus file \"%s\": %m",
						filename)));

	for (line = tsearch_readline(&trst);
		 line != NULL;
		 line = tsearch_readline(&trst))
	{
		char	   *ptr = line;
		char	   *beginwrd = NULL;
		int			state = TR_WAITLEX;
		uint16		posinsubst = 0;
		uint16		nwrd = 0;

		/* is it a comment? */
		for (; *ptr && t_isspace(ptr); ptr += pg_mblen(ptr))
			;

		if (t_iseq(ptr, '#') || *ptr == '\0' ||
			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
		{
			pfree(line);
			continue;
		}

		for (; *ptr; ptr += pg_mblen(ptr))
		{
			switch (state)
			{
				case TR_WAITLEX:
					/* between lexemes on the left-hand side */
					if (t_iseq(ptr, ':'))
					{
						if (posinsubst == 0)
							ereport(ERROR,
									(errcode(ERRCODE_CONFIG_FILE_ERROR),
									 errmsg("unexpected delimiter")));
						state = TR_WAITSUBS;
					}
					else if (!t_isspace(ptr))
					{
						beginwrd = ptr;
						state = TR_INLEX;
					}
					break;

				case TR_INLEX:
					/* inside a lexeme: ':' or whitespace terminates it */
					if (t_iseq(ptr, ':'))
					{
						newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
						state = TR_WAITSUBS;
					}
					else if (t_isspace(ptr))
					{
						newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
						state = TR_WAITLEX;
					}
					break;

				case TR_WAITSUBS:
					/* between substitute words on the right-hand side */
					if (t_iseq(ptr, '*'))
					{
						useasis = true;
						state = TR_INSUBS;
						beginwrd = ptr + pg_mblen(ptr);
					}
					else if (t_iseq(ptr, '\\'))
					{
						useasis = false;
						state = TR_INSUBS;
						beginwrd = ptr + pg_mblen(ptr);
					}
					else if (!t_isspace(ptr))
					{
						useasis = false;
						beginwrd = ptr;
						state = TR_INSUBS;
					}
					break;

				case TR_INSUBS:
					/* inside a substitute word: whitespace terminates it */
					if (t_isspace(ptr))
					{
						if (ptr == beginwrd)
							ereport(ERROR,
									(errcode(ERRCODE_CONFIG_FILE_ERROR),
									 errmsg("unexpected end of line or lexeme")));
						addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
						state = TR_WAITSUBS;
					}
					break;

				default:
					elog(ERROR, "unrecognized thesaurus state: %d", state);
					break;
			}
		}

		/* A substitute word may run up to the end of the line */
		if (state == TR_INSUBS)
		{
			if (ptr == beginwrd)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("unexpected end of line or lexeme")));
			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
		}

		idsubst++;

		/* Both sides of the rule must have contributed something */
		if (!(nwrd && posinsubst))
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("unexpected end of line")));

		pfree(line);
	}

	d->nsubst = idsubst;

	tsearch_readline_end(&trst);
}
levenshtein_internal(text *s, text *t, int ins_c, int del_c, int sub_c) #endif { int m, n, s_bytes, t_bytes; int *prev; int *curr; int *s_char_len = NULL; int i, j; const char *s_data; const char *t_data; const char *y; /* * For levenshtein_less_equal_internal, we have real variables called * start_column and stop_column; otherwise it's just short-hand for 0 and * m. */ #ifdef LEVENSHTEIN_LESS_EQUAL int start_column, stop_column; #undef START_COLUMN #undef STOP_COLUMN #define START_COLUMN start_column #define STOP_COLUMN stop_column #else #undef START_COLUMN #undef STOP_COLUMN #define START_COLUMN 0 #define STOP_COLUMN m #endif /* Extract a pointer to the actual character data. */ s_data = VARDATA_ANY(s); t_data = VARDATA_ANY(t); /* Determine length of each string in bytes and characters. */ s_bytes = VARSIZE_ANY_EXHDR(s); t_bytes = VARSIZE_ANY_EXHDR(t); m = pg_mbstrlen_with_len(s_data, s_bytes); n = pg_mbstrlen_with_len(t_data, t_bytes); /* * We can transform an empty s into t with n insertions, or a non-empty t * into an empty s with m deletions. */ if (!m) return n * ins_c; if (!n) return m * del_c; /* * For security concerns, restrict excessive CPU+RAM usage. (This * implementation uses O(m) memory and has O(mn) complexity.) */ if (m > MAX_LEVENSHTEIN_STRLEN || n > MAX_LEVENSHTEIN_STRLEN) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument exceeds the maximum length of %d bytes", MAX_LEVENSHTEIN_STRLEN))); #ifdef LEVENSHTEIN_LESS_EQUAL /* Initialize start and stop columns. */ start_column = 0; stop_column = m + 1; /* * If max_d >= 0, determine whether the bound is impossibly tight. If so, * return max_d + 1 immediately. Otherwise, determine whether it's tight * enough to limit the computation we must perform. If so, figure out * initial stop column. */ if (max_d >= 0) { int min_theo_d; /* Theoretical minimum distance. */ int max_theo_d; /* Theoretical maximum distance. */ int net_inserts = n - m; min_theo_d = net_inserts < 0 ? 
-net_inserts * del_c : net_inserts * ins_c; if (min_theo_d > max_d) return max_d + 1; if (ins_c + del_c < sub_c) sub_c = ins_c + del_c; max_theo_d = min_theo_d + sub_c * Min(m, n); if (max_d >= max_theo_d) max_d = -1; else if (ins_c + del_c > 0) { /* * Figure out how much of the first row of the notional matrix we * need to fill in. If the string is growing, the theoretical * minimum distance already incorporates the cost of deleting the * number of characters necessary to make the two strings equal in * length. Each additional deletion forces another insertion, so * the best-case total cost increases by ins_c + del_c. If the * string is shrinking, the minimum theoretical cost assumes no * excess deletions; that is, we're starting no further right than * column n - m. If we do start further right, the best-case * total cost increases by ins_c + del_c for each move right. */ int slack_d = max_d - min_theo_d; int best_column = net_inserts < 0 ? -net_inserts : 0; stop_column = best_column + (slack_d / (ins_c + del_c)) + 1; if (stop_column > m) stop_column = m + 1; } } #endif /* * In order to avoid calling pg_mblen() repeatedly on each character in s, * we cache all the lengths before starting the main loop -- but if all * the characters in both strings are single byte, then we skip this and * use a fast-path in the main loop. If only one string contains * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. */ if (m != s_bytes || n != t_bytes) { int i; const char *cp = s_data; s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) { s_char_len[i] = pg_mblen(cp); cp += s_char_len[i]; } s_char_len[i] = 0; } /* One more cell for initialization column and row. */ ++m; ++n; /* Previous and current rows of notional array. 
*/ prev = (int *) palloc(2 * m * sizeof(int)); curr = prev + m; /* * To transform the first i characters of s into the first 0 characters of * t, we must perform i deletions. */ for (i = START_COLUMN; i < STOP_COLUMN; i++) prev[i] = i * del_c; /* Loop through rows of the notional array */ for (y = t_data, j = 1; j < n; j++) { int *temp; const char *x = s_data; int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1; #ifdef LEVENSHTEIN_LESS_EQUAL /* * In the best case, values percolate down the diagonal unchanged, so * we must increment stop_column unless it's already on the right end * of the array. The inner loop will read prev[stop_column], so we * have to initialize it even though it shouldn't affect the result. */ if (stop_column < m) { prev[stop_column] = max_d + 1; ++stop_column; } /* * The main loop fills in curr, but curr[0] needs a special case: to * transform the first 0 characters of s into the first j characters * of t, we must perform j insertions. However, if start_column > 0, * this special case does not apply. */ if (start_column == 0) { curr[0] = j * ins_c; i = 1; } else i = start_column; #else curr[0] = j * ins_c; i = 1; #endif /* * This inner loop is critical to performance, so we include a * fast-path to handle the (fairly common) case where no multibyte * characters are in the mix. The fast-path is entitled to assume * that if s_char_len is not initialized then BOTH strings contain * only single-byte characters. */ if (s_char_len != NULL) { for (; i < STOP_COLUMN; i++) { int ins; int del; int sub; int x_char_len = s_char_len[i - 1]; /* * Calculate costs for insertion, deletion, and substitution. * * When calculating cost for substitution, we compare the last * character of each possibly-multibyte character first, * because that's enough to rule out most mis-matches. If we * get past that test, then we compare the lengths and the * remaining bytes. 
*/ ins = prev[i] + ins_c; del = curr[i - 1] + del_c; if (x[x_char_len - 1] == y[y_char_len - 1] && x_char_len == y_char_len && (x_char_len == 1 || rest_of_char_same(x, y, x_char_len))) sub = prev[i - 1]; else sub = prev[i - 1] + sub_c; /* Take the one with minimum cost. */ curr[i] = Min(ins, del); curr[i] = Min(curr[i], sub); /* Point to next character. */ x += x_char_len; } } else { for (; i < STOP_COLUMN; i++) { int ins; int del; int sub; /* Calculate costs for insertion, deletion, and substitution. */ ins = prev[i] + ins_c; del = curr[i - 1] + del_c; sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c); /* Take the one with minimum cost. */ curr[i] = Min(ins, del); curr[i] = Min(curr[i], sub); /* Point to next character. */ x++; } } /* Swap current row with previous row. */ temp = curr; curr = prev; prev = temp; /* Point to next character. */ y += y_char_len; #ifdef LEVENSHTEIN_LESS_EQUAL /* * This chunk of code represents a significant performance hit if used * in the case where there is no max_d bound. This is probably not * because the max_d >= 0 test itself is expensive, but rather because * the possibility of needing to execute this code prevents tight * optimization of the loop as a whole. */ if (max_d >= 0) { /* * The "zero point" is the column of the current row where the * remaining portions of the strings are of equal length. There * are (n - 1) characters in the target string, of which j have * been transformed. There are (m - 1) characters in the source * string, so we want to find the value for zp where (n - 1) - j = * (m - 1) - zp. */ int zp = j - (n - m); /* Check whether the stop column can slide left. */ while (stop_column > 0) { int ii = stop_column - 1; int net_inserts = ii - zp; if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c : -net_inserts * del_c) <= max_d) break; stop_column--; } /* Check whether the start column can slide right. 
*/ while (start_column < stop_column) { int net_inserts = start_column - zp; if (prev[start_column] + (net_inserts > 0 ? net_inserts * ins_c : -net_inserts * del_c) <= max_d) break; /* * We'll never again update these values, so we must make sure * there's nothing here that could confuse any future * iteration of the outer loop. */ prev[start_column] = max_d + 1; curr[start_column] = max_d + 1; if (start_column != 0) s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; start_column++; } /* If they cross, we're going to exceed the bound. */ if (start_column >= stop_column) return max_d + 1; } #endif } /* * Because the final value was swapped from the previous row to the * current row, that's where we'll find it. */ return prev[m - 1]; }