/* * Lexize one word by dictionary, mostly debug function */ Datum ts_lexize(PG_FUNCTION_ARGS) { Oid dictId = PG_GETARG_OID(0); text *in = PG_GETARG_TEXT_PP(1); ArrayType *a; TSDictionaryCacheEntry *dict; TSLexeme *res, *ptr; Datum *da; DictSubState dstate = {false, false, NULL}; dict = lookup_ts_dictionary_cache(dictId); res = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize, PointerGetDatum(dict->dictData), PointerGetDatum(VARDATA_ANY(in)), Int32GetDatum(VARSIZE_ANY_EXHDR(in)), PointerGetDatum(&dstate))); if (dstate.getnext) { dstate.isend = true; ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize, PointerGetDatum(dict->dictData), PointerGetDatum(VARDATA_ANY(in)), Int32GetDatum(VARSIZE_ANY_EXHDR(in)), PointerGetDatum(&dstate))); if (ptr != NULL) res = ptr; } if (!res) PG_RETURN_NULL(); ptr = res; while (ptr->lexeme) ptr++; da = (Datum *) palloc(sizeof(Datum) * (ptr - res)); ptr = res; while (ptr->lexeme) { da[ptr - res] = CStringGetTextDatum(ptr->lexeme); ptr++; } a = construct_array(da, ptr - res, TEXTOID, -1, false, 'i'); ptr = res; while (ptr->lexeme) { pfree(DatumGetPointer(da[ptr - res])); pfree(ptr->lexeme); ptr++; } pfree(res); pfree(da); PG_RETURN_POINTER(a); }
/*
 * LexizeExec - run the input lexeme stack through the configuration's
 * dictionary lists, returning the next batch of normalized lexemes.
 *
 * Two modes, selected by ld->curDictId:
 *	 - InvalidOid: normal single-word mode; each head-of-stack lexeme is
 *	   offered to the dictionaries mapped for its type, in order.
 *	 - valid Oid: a dictionary has requested following words (getnext),
 *	   so we keep feeding it lexemes from ld->curSub until it produces a
 *	   result or gives up, then drop back to normal mode.
 *
 * On a successful lexize, consumed input lexemes are recorded via
 * setCorrLex into *correspondLexem (if the caller wants them).
 * Returns NULL when the work stack is exhausted.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */
		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			map = ld->cfg->map + curVal->type;

			/* type 0 or unmapped token types have no dictionaries: drop them */
			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/*
			 * Try each mapped dictionary in order, starting from posDict
			 * (nonzero after returning from a failed multiword attempt).
			 */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
											 PointerGetDatum(dict->dictData),
												 PointerGetDatum(curValLemm),
												Int32GetDatum(curValLenLemm),
											 PointerGetDatum(&ld->dictState)
																 ));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */
					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/*
					 * filtering dictionary: replace the word and pass it on
					 * to the remaining dictionaries in the list
					 */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary recognized this lexeme; discard it */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */
		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that the current type of lexeme is
				 * recognized by our dictionary: just check whether it exists
				 * in the list of dictionaries for this type.
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* type 0 marks end of input for the multiword dictionary */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
											 PointerGetDatum(dict->dictData),
											   PointerGetDatum(curVal->lemm),
											  Int32GetDatum(curVal->lenlemm),
											 PointerGetDatum(&ld->dictState)
															 ));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					/* all lexemes up to curSub were consumed */
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the last partial match stored earlier */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict doesn't want a next lexeme and didn't recognize anything;
			 * redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* work stack exhausted */
	setCorrLex(ld, correspondLexem);
	return NULL;
}