/*
 * Report whether every trigram of trg1 also appears in trg2.
 *
 * The two arrays are consumed in one merge-style pass, so the answer is only
 * meaningful when both trigram arrays are sorted.
 */
bool
trgm_contained_by(TRGM *trg1, TRGM *trg2)
{
	trgm	   *arr1 = GETARR(trg1);
	trgm	   *arr2 = GETARR(trg2);
	int			n1 = ARRNELEM(trg1);
	int			n2 = ARRNELEM(trg2);
	int			i1 = 0;
	int			i2 = 0;

	while (i1 < n1 && i2 < n2)
	{
		int			cmp = CMPTRGM(arr1 + i1, arr2 + i2);

		/*
		 * The current trg1 trigram sorts before everything still unread in
		 * trg2, so it cannot be matched any more.
		 */
		if (cmp < 0)
			return false;

		/* On a match consume both sides; otherwise skip the trg2 entry only */
		if (cmp == 0)
			i1++;
		i2++;
	}

	/* Contained only if no trg1 trigram was left unmatched */
	return !(i1 < n1);
}
/*
 * Ordering function for positional trigrams: the trigram value is the primary
 * key, the "index" field breaks ties.
 */
static int
comp_ptrgm(const void *v1, const void *v2)
{
	const pos_trgm *lhs = (const pos_trgm *) v1;
	const pos_trgm *rhs = (const pos_trgm *) v2;
	int			trg_order = CMPTRGM(lhs->trg, rhs->trg);

	if (trg_order != 0)
		return trg_order;

	/* Same trigram: yields -1, 0 or 1 according to the index ordering */
	return (lhs->index > rhs->index) - (lhs->index < rhs->index);
}
/*
 * Collapse runs of equal adjacent trigrams in a[0..len-1] in place.
 *
 * Only neighbouring elements are compared, so duplicates are removed
 * completely only if equal trigrams are adjacent (e.g. the array is sorted).
 *
 * Returns the number of elements kept at the front of the array.  An empty
 * (or negative-length) input yields 0; without this guard the function would
 * report one element for an array that has none.
 */
static int
unique_array(trgm *a, int len)
{
	trgm	   *last_kept;
	trgm	   *scan;

	if (len <= 0)
		return 0;

	/* a[0] is always kept; start comparing from the second element */
	last_kept = a;
	for (scan = a + 1; scan - a < len; scan++)
	{
		if (CMPTRGM(scan, last_kept))
		{
			/* New value: append it right after the last kept element */
			last_kept++;
			CPTRGM(last_kept, scan);
		}
	}

	return last_kept + 1 - a;
}
/*
 * Compute a ratio describing how many trigrams trg1 and trg2 share.
 *
 * Matching trigrams are counted in one merge-style pass over both arrays
 * (which presumes the arrays are sorted).  The count is then divided by
 * either the size of the union (when DIVUNION is defined) or the larger of
 * the two array lengths.
 *
 * Returns 0 if either array is empty.
 */
float4
cnt_sml(TRGM *trg1, TRGM *trg2)
{
	trgm	   *arr1 = GETARR(trg1);
	trgm	   *arr2 = GETARR(trg2);
	int			n1 = ARRNELEM(trg1);
	int			n2 = ARRNELEM(trg2);
	int			i1 = 0;
	int			i2 = 0;
	int			shared = 0;
	int			denom;

	/* Bail out early so that two empty inputs never produce 0/0 */
	if (n1 <= 0 || n2 <= 0)
		return (float4) 0.0;

	while (i1 < n1 && i2 < n2)
	{
		int			cmp = CMPTRGM(arr1 + i1, arr2 + i2);

		if (cmp == 0)
		{
			shared++;
			i1++;
			i2++;
		}
		else if (cmp < 0)
			i1++;
		else
			i2++;
	}

#ifdef DIVUNION
	denom = n1 + n2 - shared;
#else
	denom = (n1 > n2) ? n1 : n2;
#endif

	return ((float4) shared) / ((float4) denom);
}
/*
 * Compute a ratio describing how many trigrams trg1 and trg2 share.
 *
 * Matching trigrams are counted in one merge-style pass over both arrays
 * (which presumes the arrays are sorted); the final value is produced by
 * CALCSML().
 *
 * inexact: when true, the number of matches is passed to CALCSML() in place
 * of the length of trg2.
 *
 * Returns 0 if either array is empty.
 */
float4
cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
{
	trgm	   *arr1 = GETARR(trg1);
	trgm	   *arr2 = GETARR(trg2);
	int			n1 = ARRNELEM(trg1);
	int			n2 = ARRNELEM(trg2);
	int			i1 = 0;
	int			i2 = 0;
	int			shared = 0;
	int			effective_n2;

	/* Bail out early so that two empty inputs never produce 0/0 */
	if (n1 <= 0 || n2 <= 0)
		return (float4) 0.0;

	while (i1 < n1 && i2 < n2)
	{
		int			cmp = CMPTRGM(arr1 + i1, arr2 + i2);

		if (cmp == 0)
		{
			shared++;
			i1++;
			i2++;
		}
		else if (cmp < 0)
			i1++;
		else
			i2++;
	}

	/*
	 * In an inexact search the real length of the second string is unknown.
	 * The match count is a lower bound for it, so use that instead of len2.
	 */
	effective_n2 = inexact ? shared : n2;

	return CALCSML(shared, n1, effective_n2);
}
/*
 * Compute a ratio describing how many trigrams trg1 and trg2 share.
 *
 * Matching trigrams are counted in one merge-style pass over both arrays
 * (which presumes the arrays are sorted).  The count is then divided by
 * either the size of the union (when DIVUNION is defined) or the larger of
 * the two array lengths.
 *
 * Returns 0 if either array is empty.
 */
float4
cnt_sml(TRGM *trg1, TRGM *trg2)
{
	trgm	   *ptr1,
			   *ptr2;
	int			count = 0;
	int			len1,
				len2;

	ptr1 = GETARR(trg1);
	ptr2 = GETARR(trg2);

	len1 = ARRNELEM(trg1);
	len2 = ARRNELEM(trg2);

	/*
	 * Explicit test is needed to avoid 0/0 division when both lengths are 0.
	 * When only one side is empty the result is 0 anyway, so returning early
	 * does not change any other outcome.
	 */
	if (len1 <= 0 || len2 <= 0)
		return (float4) 0.0;

	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
	{
		int			res = CMPTRGM(ptr1, ptr2);

		if (res < 0)
			ptr1++;
		else if (res > 0)
			ptr2++;
		else
		{
			/* Trigram present in both arrays */
			ptr1++;
			ptr2++;
			count++;
		}
	}

#ifdef DIVUNION
	return ((float4) count) / ((float4) (len1 + len2 - count));
#else
	return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
#endif
}
/*
 * Build a palloc'd array of booleans, one per trigram of "query", telling
 * whether that trigram occurs in the trigram array "key".
 *
 * Each lookup is a binary search, so "key" must be sorted; "query" may be in
 * any order.
 */
bool *
trgm_presence_map(TRGM *query, TRGM *key)
{
	trgm	   *qarr = GETARR(query);
	trgm	   *karr = GETARR(key);
	int			nquery = ARRNELEM(query);
	int			nkey = ARRNELEM(key);
	bool	   *present;
	int			qi;

	/* Zero-filled, so entries stay false unless a match is found below */
	present = (bool *) palloc0(nquery * sizeof(bool));

	for (qi = 0; qi < nquery; qi++)
	{
		trgm	   *probe = qarr + qi;
		int			low = 0;
		int			high = nkey;

		/* Binary search for the probe within key[low, high) */
		while (low < high)
		{
			int			middle = (low + high) / 2;
			int			cmp = CMPTRGM(probe, karr + middle);

			if (cmp == 0)
			{
				present[qi] = true;
				break;
			}

			if (cmp < 0)
				high = middle;
			else
				low = middle + 1;
		}
	}

	return present;
}
/*
 * Compare two trigrams via CMPTRGM.  The (const void *, const void *) -> int
 * signature is the shape expected of a qsort() comparator.
 */
static int
comp_trgm(const void *a, const void *b)
{
	int			order = CMPTRGM(a, b);

	return order;
}
/* * Calculate word similarity. * This function prepare two arrays: "trg2indexes" and "found". Then this arrays * are used to calculate word similarity using iterate_word_similarity(). * * "trg2indexes" is array which stores indexes of the array "found". * In other words: * trg2indexes[j] = i; * found[i] = true (or false); * If found[i] == true then there is trigram trg2[j] in array "trg1". * If found[i] == false then there is not trigram trg2[j] in array "trg1". * * str1: search pattern string, of length slen1 bytes. * str2: text in which we are looking for a word, of length slen2 bytes. * check_only: if true then only check existaince of similar search pattern in * text. * * Returns word similarity. */ static float4 calc_word_similarity(char *str1, int slen1, char *str2, int slen2, bool check_only) { bool *found; pos_trgm *ptrg; trgm *trg1; trgm *trg2; int len1, len2, len, i, j, ulen1; int *trg2indexes; float4 result; protect_out_of_mem(slen1 + slen2); /* Make positional trigrams */ trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3); trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3); len1 = generate_trgm_only(trg1, str1, slen1); len2 = generate_trgm_only(trg2, str2, slen2); ptrg = make_positional_trgm(trg1, len1, trg2, len2); len = len1 + len2; qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm); pfree(trg1); pfree(trg2); /* * Merge positional trigrams array: enumerate each trigram and find its * presence in required word. 
*/ trg2indexes = (int *) palloc(sizeof(int) * len2); found = (bool *) palloc0(sizeof(bool) * len); ulen1 = 0; j = 0; for (i = 0; i < len; i++) { if (i > 0) { int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg); if (cmp != 0) { if (found[j]) ulen1++; j++; } } if (ptrg[i].index >= 0) { trg2indexes[ptrg[i].index] = j; } else { found[j] = true; } } if (found[j]) ulen1++; /* Run iterative procedure to find maximum similarity with word */ result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len, check_only); pfree(trg2indexes); pfree(found); pfree(ptrg); return result; }