示例#1
0
Datum
cosine(PG_FUNCTION_ARGS)
{
	char		*a, *b;
	TokenList	*s, *t;
	int			atok, btok, comtok, alltok;
	float4		res;

	a = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0))));
	b = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1))));

	if (strlen(a) > PGS_MAX_STR_LEN || strlen(b) > PGS_MAX_STR_LEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				errmsg("argument exceeds the maximum length of %d bytes",
					PGS_MAX_STR_LEN)));

	/* sets */
	s = initTokenList(1);
	t = initTokenList(1);

	switch (pgs_cosine_tokenizer)
	{
		case PGS_UNIT_WORD:
			tokenizeBySpace(s, a);
			tokenizeBySpace(t, b);
			break;
		case PGS_UNIT_GRAM:
			tokenizeByGram(s, a);
			tokenizeByGram(t, b);
			break;
		case PGS_UNIT_CAMELCASE:
			tokenizeByCamelCase(s, a);
			tokenizeByCamelCase(t, b);
			break;
		case PGS_UNIT_ALNUM:	/* default */
		default:
			tokenizeByNonAlnum(s, a);
			tokenizeByNonAlnum(t, b);
			break;
	}

	elog(DEBUG3, "Token List A");
	printToken(s);
	elog(DEBUG3, "Token List B");
	printToken(t);

	atok = s->size;
	btok = t->size;

	/* combine the sets */
	switch (pgs_cosine_tokenizer)
	{
		case PGS_UNIT_WORD:
			tokenizeBySpace(s, b);
			break;
		case PGS_UNIT_GRAM:
			tokenizeByGram(s, b);
			break;
		case PGS_UNIT_CAMELCASE:
			tokenizeByCamelCase(s, b);
			break;
		case PGS_UNIT_ALNUM:	/* default */
		default:
			tokenizeByNonAlnum(s, b);
			break;
	}

	elog(DEBUG3, "All Token List");
	printToken(s);

	alltok = s->size;

	destroyTokenList(s);
	destroyTokenList(t);

	comtok = atok + btok - alltok;

	elog(DEBUG1, "is normalized: %d", pgs_cosine_is_normalized);
	elog(DEBUG1, "token list A size: %d", atok);
	elog(DEBUG1, "token list B size: %d", btok);
	elog(DEBUG1, "all tokens size: %d", alltok);
	elog(DEBUG1, "common tokens size: %d", comtok);

	/* normalized and unnormalized version are the same */
	res = (float) comtok / (sqrt(atok) * sqrt(btok));

	PG_RETURN_FLOAT4(res);
}
示例#2
0
Datum
mongeelkan(PG_FUNCTION_ARGS)
{
	char		*a, *b;
	TokenList	*s, *t;
	Token		*p, *q;
	double		summatches;
	double		maxvalue;
	float8		res;

	a = DatumGetPointer(DirectFunctionCall1(textout,
											PointerGetDatum(PG_GETARG_TEXT_P(0))));
	b = DatumGetPointer(DirectFunctionCall1(textout,
											PointerGetDatum(PG_GETARG_TEXT_P(1))));

	if (strlen(a) > PGS_MAX_STR_LEN || strlen(b) > PGS_MAX_STR_LEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("argument exceeds the maximum length of %d bytes",
						PGS_MAX_STR_LEN)));

	/* lists */
	s = initTokenList(0);
	t = initTokenList(0);

	switch (pgs_mongeelkan_tokenizer)
	{
		case PGS_UNIT_WORD:
			tokenizeBySpace(s, a);
			tokenizeBySpace(t, b);
			break;
		case PGS_UNIT_GRAM:
			tokenizeByGram(s, a);
			tokenizeByGram(t, b);
			break;
		case PGS_UNIT_CAMELCASE:
			tokenizeByCamelCase(s, a);
			tokenizeByCamelCase(t, b);
			break;
		case PGS_UNIT_ALNUM:
		default:
			tokenizeByNonAlnum(s, a);
			tokenizeByNonAlnum(t, b);
			break;
	}

	summatches = 0.0;

	p = s->head;
	while (p != NULL)
	{
		maxvalue = 0.0;

		q = t->head;
		while (q != NULL)
		{
			double val = _mongeelkan(p->data, q->data);
			elog(DEBUG3, "p: %s; q: %s", p->data, q->data);
			if (val > maxvalue)
				maxvalue = val;
			q = q->next;
		}

		summatches += maxvalue;

		p = p->next;
	}

	/* normalized and unnormalized version are the same */
	res = summatches / s->size;

	elog(DEBUG1, "is normalized: %d", pgs_mongeelkan_is_normalized);
	elog(DEBUG1, "sum matches: %.3f", summatches);
	elog(DEBUG1, "s size: %d", s->size);
	elog(DEBUG1, "medistance(%s, %s) = %.3f", a, b, res);

	destroyTokenList(s);
	destroyTokenList(t);

	PG_RETURN_FLOAT8(res);
}