Datum cosine(PG_FUNCTION_ARGS) { char *a, *b; TokenList *s, *t; int atok, btok, comtok, alltok; float4 res; a = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0)))); b = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1)))); if (strlen(a) > PGS_MAX_STR_LEN || strlen(b) > PGS_MAX_STR_LEN) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument exceeds the maximum length of %d bytes", PGS_MAX_STR_LEN))); /* sets */ s = initTokenList(1); t = initTokenList(1); switch (pgs_cosine_tokenizer) { case PGS_UNIT_WORD: tokenizeBySpace(s, a); tokenizeBySpace(t, b); break; case PGS_UNIT_GRAM: tokenizeByGram(s, a); tokenizeByGram(t, b); break; case PGS_UNIT_CAMELCASE: tokenizeByCamelCase(s, a); tokenizeByCamelCase(t, b); break; case PGS_UNIT_ALNUM: /* default */ default: tokenizeByNonAlnum(s, a); tokenizeByNonAlnum(t, b); break; } elog(DEBUG3, "Token List A"); printToken(s); elog(DEBUG3, "Token List B"); printToken(t); atok = s->size; btok = t->size; /* combine the sets */ switch (pgs_cosine_tokenizer) { case PGS_UNIT_WORD: tokenizeBySpace(s, b); break; case PGS_UNIT_GRAM: tokenizeByGram(s, b); break; case PGS_UNIT_CAMELCASE: tokenizeByCamelCase(s, b); break; case PGS_UNIT_ALNUM: /* default */ default: tokenizeByNonAlnum(s, b); break; } elog(DEBUG3, "All Token List"); printToken(s); alltok = s->size; destroyTokenList(s); destroyTokenList(t); comtok = atok + btok - alltok; elog(DEBUG1, "is normalized: %d", pgs_cosine_is_normalized); elog(DEBUG1, "token list A size: %d", atok); elog(DEBUG1, "token list B size: %d", btok); elog(DEBUG1, "all tokens size: %d", alltok); elog(DEBUG1, "common tokens size: %d", comtok); /* normalized and unnormalized version are the same */ res = (float) comtok / (sqrt(atok) * sqrt(btok)); PG_RETURN_FLOAT4(res); }
Datum mongeelkan(PG_FUNCTION_ARGS) { char *a, *b; TokenList *s, *t; Token *p, *q; double summatches; double maxvalue; float8 res; a = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0)))); b = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1)))); if (strlen(a) > PGS_MAX_STR_LEN || strlen(b) > PGS_MAX_STR_LEN) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument exceeds the maximum length of %d bytes", PGS_MAX_STR_LEN))); /* lists */ s = initTokenList(0); t = initTokenList(0); switch (pgs_mongeelkan_tokenizer) { case PGS_UNIT_WORD: tokenizeBySpace(s, a); tokenizeBySpace(t, b); break; case PGS_UNIT_GRAM: tokenizeByGram(s, a); tokenizeByGram(t, b); break; case PGS_UNIT_CAMELCASE: tokenizeByCamelCase(s, a); tokenizeByCamelCase(t, b); break; case PGS_UNIT_ALNUM: default: tokenizeByNonAlnum(s, a); tokenizeByNonAlnum(t, b); break; } summatches = 0.0; p = s->head; while (p != NULL) { maxvalue = 0.0; q = t->head; while (q != NULL) { double val = _mongeelkan(p->data, q->data); elog(DEBUG3, "p: %s; q: %s", p->data, q->data); if (val > maxvalue) maxvalue = val; q = q->next; } summatches += maxvalue; p = p->next; } /* normalized and unnormalized version are the same */ res = summatches / s->size; elog(DEBUG1, "is normalized: %d", pgs_mongeelkan_is_normalized); elog(DEBUG1, "sum matches: %.3f", summatches); elog(DEBUG1, "s size: %d", s->size); elog(DEBUG1, "medistance(%s, %s) = %.3f", a, b, res); destroyTokenList(s); destroyTokenList(t); PG_RETURN_FLOAT8(res); }