/*
 * Compute the similarity of two trigram arrays.
 *
 * The two arrays are walked in parallel, advancing whichever side compares
 * lower under CMPTRGM and counting the entries that compare equal.
 * NOTE(review): this merge only finds every common entry if both arrays are
 * ordered consistently with CMPTRGM -- presumably guaranteed by whoever
 * builds the TRGM values; confirm against callers.
 *
 * trg1, trg2: trigram arrays to compare.
 * inexact: if true, the number of common trigrams is used in place of the
 *		length of trg2 in the similarity formula, i.e. the common count is
 *		taken as a lower bound for the unknown real length of the second
 *		string.
 *
 * Returns 0 if either array is empty, otherwise CALCSML() of the common
 * count and the two lengths.
 */
float4
cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
{
	trgm	   *cur1 = GETARR(trg1);
	trgm	   *cur2 = GETARR(trg2);
	trgm	   *end1;
	trgm	   *end2;
	int			nelem1 = ARRNELEM(trg1);
	int			nelem2 = ARRNELEM(trg2);
	int			ncommon = 0;
	int			effective_len2;

	/* Bail out early; two empty arrays would otherwise produce 0/0 */
	if (nelem1 <= 0 || nelem2 <= 0)
		return (float4) 0.0;

	end1 = cur1 + nelem1;
	end2 = cur2 + nelem2;

	while (cur1 < end1 && cur2 < end2)
	{
		int			cmp = CMPTRGM(cur1, cur2);

		if (cmp == 0)
		{
			/* Same trigram on both sides: count it and consume both */
			ncommon++;
			cur1++;
			cur2++;
		}
		else if (cmp < 0)
			cur1++;
		else
			cur2++;
	}

	/* For inexact search the real second length is unknown; see above */
	effective_len2 = inexact ? ncommon : nelem2;

	return CALCSML(ncommon, nelem1, effective_len2);
}
/* * Iterative search function which calculates maximum similarity with word in * the string. But maximum similarity is calculated only if check_only == false. * * trg2indexes: array which stores indexes of the array "found". * found: array which stores true of false values. * ulen1: count of unique trigrams of array "trg1". * len2: length of array "trg2" and array "trg2indexes". * len: length of the array "found". * check_only: if true then only check existaince of similar search pattern in * text. * * Returns word similarity. */ static float4 iterate_word_similarity(int *trg2indexes, bool *found, int ulen1, int len2, int len, bool check_only) { int *lastpos, i, ulen2 = 0, count = 0, upper = -1, lower = -1; float4 smlr_cur, smlr_max = 0.0f; /* Memorise last position of each trigram */ lastpos = (int *) palloc(sizeof(int) * len); memset(lastpos, -1, sizeof(int) * len); for (i = 0; i < len2; i++) { /* Get index of next trigram */ int trgindex = trg2indexes[i]; /* Update last position of this trigram */ if (lower >= 0 || found[trgindex]) { if (lastpos[trgindex] < 0) { ulen2++; if (found[trgindex]) count++; } lastpos[trgindex] = i; } /* Adjust lower bound if this trigram is present in required substing */ if (found[trgindex]) { int prev_lower, tmp_ulen2, tmp_lower, tmp_count; upper = i; if (lower == -1) { lower = i; ulen2 = 1; } smlr_cur = CALCSML(count, ulen1, ulen2); /* Also try to adjust upper bound for greater similarity */ tmp_count = count; tmp_ulen2 = ulen2; prev_lower = lower; for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++) { float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2); int tmp_trgindex; if (smlr_tmp > smlr_cur) { smlr_cur = smlr_tmp; ulen2 = tmp_ulen2; lower = tmp_lower; count = tmp_count; } /* * if we only check that word similarity is greater than * pg_trgm.word_similarity_threshold we do not need to calculate * a maximum similarity. 
*/ if (check_only && smlr_cur >= word_similarity_threshold) break; tmp_trgindex = trg2indexes[tmp_lower]; if (lastpos[tmp_trgindex] == tmp_lower) { tmp_ulen2--; if (found[tmp_trgindex]) tmp_count--; } } smlr_max = Max(smlr_max, smlr_cur); /* * if we only check that word similarity is greater than * pg_trgm.word_similarity_threshold we do not need to calculate a * maximum similarity */ if (check_only && smlr_max >= word_similarity_threshold) break; for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++) { int tmp_trgindex; tmp_trgindex = trg2indexes[tmp_lower]; if (lastpos[tmp_trgindex] == tmp_lower) lastpos[tmp_trgindex] = -1; } } } pfree(lastpos); return smlr_max; }
/*
 * Scan the trigram sequence of a string for the contiguous span that is
 * most similar to the search pattern, and return that similarity.
 *
 * The true maximum is only computed when WORD_SIMILARITY_CHECK_ONLY is not
 * set.  With that flag, the scan stops as soon as some span reaches the
 * applicable threshold.
 *
 * trg2indexes: for every position of "trg2", an index into "found".
 * found: array of booleans; a position whose entry is true counts as a
 *		match.
 * ulen1: count of unique trigrams of array "trg1".
 * len2: length of array "trg2" and of array "trg2indexes".
 * len: length of the array "found".
 * flags: set of boolean flags parametrizing the similarity calculation:
 *		WORD_SIMILARITY_STRICT restricts span ends to word bounds and
 *		selects strict_word_similarity_threshold instead of
 *		word_similarity_threshold; WORD_SIMILARITY_CHECK_ONLY enables the
 *		early exit described above.
 * bounds: for every position of "trg2", whether that trigram is the
 *		left/right bound of a word.  Only read (and asserted to be set) when
 *		WORD_SIMILARITY_STRICT is given.
 *
 * Returns word similarity.
 */
static float4
iterate_word_similarity(int *trg2indexes,
						bool *found,
						int ulen1,
						int len2,
						int len,
						uint8 flags,
						TrgmBound *bounds)
{
	const bool	strict = (flags & WORD_SIMILARITY_STRICT) != 0;
	const bool	check_only = (flags & WORD_SIMILARITY_CHECK_ONLY) != 0;
	int		   *last_seen;		/* last position of each trigram, or -1 */
	int			pos;
	int			k;
	int			span_ulen = 0;	/* distinct trigrams in the current span */
	int			span_hits = 0;	/* ... of which "found" is true */
	int			lo;				/* start of current span, -1 = none yet */
	float4		cur;
	float4		best = 0.0f;
	double		threshold;

	Assert(!strict || bounds);

	/* Select appropriate threshold */
	if (strict)
		threshold = strict_word_similarity_threshold;
	else
		threshold = word_similarity_threshold;

	/*
	 * Strict word similarity takes the first trigram as the initial lower
	 * bound; plain word similarity sets it later, at the first matching
	 * trigram.
	 */
	lo = strict ? 0 : -1;

	/* Initially no trigram has been seen anywhere */
	last_seen = (int *) palloc(sizeof(int) * len);
	for (k = 0; k < len; k++)
		last_seen[k] = -1;

	for (pos = 0; pos < len2; pos++)
	{
		int			idx = trg2indexes[pos];
		bool		closes_span;
		int			hi;
		int			start_lo;
		int			try_lo;
		int			try_ulen;
		int			try_hits;

		/*
		 * Account for this trigram once a span is open, or if it is a match
		 * (which, for plain word similarity, is about to open one).
		 */
		if (lo >= 0 || found[idx])
		{
			if (last_seen[idx] < 0)
			{
				span_ulen++;
				if (found[idx])
					span_hits++;
			}
			last_seen[idx] = pos;
		}

		/*
		 * The upper bound may be placed here only if the trigram is the
		 * right bound of a word (strict), or if it is a match (plain).
		 */
		if (strict)
			closes_span = (bounds[pos] & TRGM_BOUND_RIGHT) != 0;
		else
			closes_span = found[idx];

		if (!closes_span)
			continue;

		hi = pos;
		if (lo == -1)
		{
			/* First match: the span starts here with a single trigram */
			lo = pos;
			span_ulen = 1;
		}

		cur = CALCSML(span_hits, ulen1, span_ulen);

		/*
		 * Now try moving the lower bound forward for greater similarity.
		 * try_ulen/try_hits describe the span [try_lo, hi]; a trigram only
		 * leaves the span when the position being dropped is its last
		 * recorded occurrence.
		 */
		start_lo = lo;
		try_hits = span_hits;
		try_ulen = span_ulen;
		try_lo = lo;
		while (try_lo <= hi)
		{
			int			try_idx;

			/*
			 * A position is a candidate lower bound only if it is the left
			 * bound of a word (strict); for plain word similarity every
			 * position is a candidate.
			 */
			if (!strict || (bounds[try_lo] & TRGM_BOUND_LEFT))
			{
				float		try_sml = CALCSML(try_hits, ulen1, try_ulen);

				if (try_sml > cur)
				{
					cur = try_sml;
					span_ulen = try_ulen;
					lo = try_lo;
					span_hits = try_hits;
				}

				/*
				 * When only checking against the threshold there is no
				 * need to keep looking for the maximum.
				 */
				if (check_only && cur >= threshold)
					break;
			}

			try_idx = trg2indexes[try_lo];
			if (last_seen[try_idx] == try_lo)
			{
				try_ulen--;
				if (found[try_idx])
					try_hits--;
			}

			try_lo++;
		}

		best = Max(best, cur);

		/* Same shortcut at the outer level: threshold reached, we're done */
		if (check_only && best >= threshold)
			break;

		/*
		 * Positions dropped from the front of the span no longer belong to
		 * it; forget those that were the last occurrence of their trigram.
		 */
		for (try_lo = start_lo; try_lo < lo; try_lo++)
		{
			int			dropped = trg2indexes[try_lo];

			if (last_seen[dropped] == try_lo)
				last_seen[dropped] = -1;
		}
	}

	pfree(last_seen);

	return best;
}