/*
 * read_dictionary - load the synonym rules file into d->syn.
 *
 * 'filename' is resolved through get_tsearch_config_filename() with the
 * "rules" extension.  Every non-empty line is lower-cased; the first word
 * found by find_word() becomes the entry's key and the whole lower-cased
 * line becomes its value.  Lines in which find_word() finds no word are
 * ignored.
 *
 * While loading, d->len is used as the allocated capacity of d->syn; on
 * return it holds the number of entries actually stored, and the array is
 * sorted with compare_syn.
 *
 * Raises ERROR (ERRCODE_CONFIG_FILE_ERROR) if the file cannot be opened.
 */
static void
read_dictionary(DictSyn *d, char *filename)
{
	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
	tsearch_readline_state trst;
	char	   *line;
	int			cur = 0;

	if (!tsearch_readline_begin(&trst, real_filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						real_filename)));

	while ((line = tsearch_readline(&trst)) != NULL)
	{
		char	   *value;
		char	   *key;
		char	   *end = NULL;

		if (*line == '\0')
		{
			/* Nothing to parse, but the line buffer must still be released */
			pfree(line);
			continue;
		}

		/* Work on a lower-cased copy; the raw line is no longer needed */
		value = lowerstr(line);
		pfree(line);

		key = find_word(value, &end);
		if (!key)
		{
			pfree(value);
			continue;
		}

		/* Enlarge syn structure if full: start at 16 slots, then double */
		if (cur == d->len)
		{
			d->len = (d->len > 0) ? 2 * d->len : 16;
			if (d->syn)
				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
			else
				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
		}

		/* The entry takes ownership of 'value'; the key is a private copy */
		d->syn[cur].key = pnstrdup(key, end - key);
		d->syn[cur].value = value;

		cur++;
	}

	tsearch_readline_end(&trst);

	/* From here on d->len is the entry count, not the capacity */
	d->len = cur;
	if (cur > 1)
		qsort(d->syn, d->len, sizeof(Syn), compare_syn);

	pfree(real_filename);
}
/* * initSuffixTree - create suffix tree from file. Function converts * UTF8-encoded file into current encoding. */ static SuffixChar * initSuffixTree(char *filename) { SuffixChar *volatile rootSuffixTree = NULL; MemoryContext ccxt = CurrentMemoryContext; tsearch_readline_state trst; volatile bool skip; filename = get_tsearch_config_filename(filename, "rules"); if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename))); do { char src[4096]; char trg[4096]; int srclen; int trglen; char *line = NULL; skip = true; PG_TRY(); { /* * pg_do_encoding_conversion() (called by tsearch_readline()) will * emit exception if it finds untranslatable characters in current * locale. We just skip such characters. */ while ((line = tsearch_readline(&trst)) != NULL) { if (sscanf(line, "%s\t%s\n", src, trg) != 2) continue; srclen = strlen(src); trglen = strlen(trg); rootSuffixTree = placeChar(rootSuffixTree, (unsigned char *) src, srclen, trg, trglen); skip = false; pfree(line); } } PG_CATCH(); { ErrorData *errdata; MemoryContext ecxt; ecxt = MemoryContextSwitchTo(ccxt); errdata = CopyErrorData(); if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) { FlushErrorState(); } else { MemoryContextSwitchTo(ecxt); PG_RE_THROW(); } } PG_END_TRY(); } while (skip); tsearch_readline_end(&trst); return rootSuffixTree; }
/*
 * read_dictionary - load the synonym rules file into d->syn.
 *
 * 'filename' is resolved through get_tsearch_config_filename() with the
 * "rules" extension.  Every non-empty line is lower-cased and scanned word
 * by word with find_word().  For each word that is to be matched, an entry
 * is stored whose key is that word and whose value is a copy of the whole
 * lower-cased line:
 *
 *	- the first word of a line is stored only if d->matchorig is set;
 *	- the following words are visited only if d->matchsynonyms is set.
 *
 * While loading, d->len is used as the allocated capacity of d->syn; on
 * return it holds the number of entries actually stored, and the array is
 * sorted with compare_syn.
 *
 * Raises ERROR (ERRCODE_CONFIG_FILE_ERROR) if the file cannot be opened.
 */
static void
read_dictionary(DictSyn *d, char *filename)
{
	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
	tsearch_readline_state trst;
	char	   *line;
	int			cur = 0;

	if (!tsearch_readline_begin(&trst, real_filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						real_filename)));

	while ((line = tsearch_readline(&trst)) != NULL)
	{
		char	   *value;
		char	   *key;
		char	   *pos;
		char	   *end;

		if (*line == '\0')
		{
			/* Nothing to parse, but the line buffer must still be released */
			pfree(line);
			continue;
		}

		/* Work on a lower-cased copy; the raw line is no longer needed */
		value = lowerstr(line);
		pfree(line);

		pos = value;
		while ((key = find_word(pos, &end)) != NULL)
		{
			/* Enlarge syn structure if full */
			if (cur == d->len)
			{
				d->len = (d->len > 0) ? 2 * d->len : 16;
				if (d->syn)
					d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
				else
					d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
			}

			/* Save first word only if we will match it */
			if (pos != value || d->matchorig)
			{
				/* Each entry owns private copies of its key and value */
				d->syn[cur].key = pnstrdup(key, end - key);
				d->syn[cur].value = pstrdup(value);

				cur++;
			}

			/* Resume scanning right after the word just handled */
			pos = end;

			/* Don't bother scanning synonyms if we will not match them */
			if (!d->matchsynonyms)
				break;
		}

		/* Entries hold their own copies, so the working copy can go */
		pfree(value);
	}

	tsearch_readline_end(&trst);

	/* From here on d->len is the entry count, not the capacity */
	d->len = cur;
	if (cur > 1)
		qsort(d->syn, d->len, sizeof(Syn), compare_syn);

	pfree(real_filename);
}
/*
 * thesaurusRead - parse a thesaurus file into 'd'.
 *
 * 'filename' is resolved through get_tsearch_config_filename() with the
 * "ths" extension.  Blank lines and lines whose first non-space character
 * is '#' are skipped.  Every other line has the form
 *
 *		sample-word ... : substitute-word ...
 *
 * Words before the ':' are passed to newLexeme(); words after it are passed
 * to addWrd().  A substitute word written with a leading '*' is passed with
 * useasis = true; a leading '\' is dropped and the word is passed with
 * useasis = false.
 *
 * The parser is a small state machine:
 *	TR_WAITLEX	between sample words, waiting for a word or ':'
 *	TR_INLEX	inside a sample word
 *	TR_WAITSUBS	after ':', between substitute words
 *	TR_INSUBS	inside a substitute word
 *
 * Each parsed line gets the next substitution id; on return d->nsubst is the
 * number of lines parsed.
 *
 * Raises ERROR (ERRCODE_CONFIG_FILE_ERROR) if the file cannot be opened or
 * a line is malformed.
 */
static void
thesaurusRead(char *filename, DictThesaurus *d)
{
	tsearch_readline_state trst;
	uint16		idsubst = 0;
	bool		useasis = false;
	char	   *line;

	filename = get_tsearch_config_filename(filename, "ths");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open thesaurus file \"%s\": %m",
						filename)));

	while ((line = tsearch_readline(&trst)) != NULL)
	{
		char	   *ptr;
		int			state = TR_WAITLEX;
		char	   *beginwrd = NULL;
		uint16		posinsubst = 0; /* sample words seen on this line */
		uint16		nwrd = 0;		/* substitute words seen on this line */

		ptr = line;

		/* is it a comment? */
		while (*ptr && t_isspace(ptr))
			ptr += pg_mblen(ptr);

		if (t_iseq(ptr, '#') || *ptr == '\0' ||
			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
		{
			pfree(line);
			continue;
		}

		/* Walk the line one (possibly multibyte) character at a time */
		while (*ptr)
		{
			if (state == TR_WAITLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					/* A ':' before any sample word is an error */
					if (posinsubst == 0)
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected delimiter")));
					state = TR_WAITSUBS;
				}
				else if (!t_isspace(ptr))
				{
					beginwrd = ptr;
					state = TR_INLEX;
				}
			}
			else if (state == TR_INLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					/* ':' ends both the word and the sample part */
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITSUBS;
				}
				else if (t_isspace(ptr))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITLEX;
				}
			}
			else if (state == TR_WAITSUBS)
			{
				if (t_iseq(ptr, '*'))
				{
					/* Word starts after the '*' marker */
					useasis = true;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (t_iseq(ptr, '\\'))
				{
					/* Word starts after the backslash */
					useasis = false;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (!t_isspace(ptr))
				{
					useasis = false;
					beginwrd = ptr;
					state = TR_INSUBS;
				}
			}
			else if (state == TR_INSUBS)
			{
				if (t_isspace(ptr))
				{
					/* Empty word, i.e. a lone '*' or '\' marker */
					if (ptr == beginwrd)
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected end of line or lexeme")));
					addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
					state = TR_WAITSUBS;
				}
			}
			else
				elog(ERROR, "unrecognized thesaurus state: %d", state);

			ptr += pg_mblen(ptr);
		}

		/* A substitute word may run up to the end of the line */
		if (state == TR_INSUBS)
		{
			if (ptr == beginwrd)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("unexpected end of line or lexeme")));
			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
		}

		idsubst++;

		/* Both sides of the rule must contain at least one word */
		if (!(nwrd && posinsubst))
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("unexpected end of line")));

		pfree(line);
	}

	d->nsubst = idsubst;

	tsearch_readline_end(&trst);

	/* The reader no longer refers to the path, so it can be released */
	pfree(filename);
}
/*
 * Load a stop-word file into 's'.
 *
 * 'fname' is resolved through get_tsearch_config_filename() with the "stop"
 * extension; a NULL or empty 'fname' leaves the list empty (s->len = 0,
 * s->stop = NULL).  Each line is cut at its first whitespace character and
 * lines that are empty after the cut are dropped.
 *
 * If 'wordop' is given, each word is passed through it.  wordop may either
 * modify the input in-place, or palloc a new version; in the latter case the
 * original line is freed here.
 *
 * The resulting array is sorted with pg_qsort_strcmp so that it can be
 * searched with a binary search.
 */
void
readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
{
	char	  **words = NULL;

	s->len = 0;

	if (fname != NULL && fname[0] != '\0')
	{
		char	   *path = get_tsearch_config_filename(fname, "stop");
		tsearch_readline_state reader;
		char	   *raw;
		int			capacity = 0;

		if (!tsearch_readline_begin(&reader, path))
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("could not open stop-word file \"%s\": %m",
							path)));

		for (;;)
		{
			char	   *cut;

			raw = tsearch_readline(&reader);
			if (raw == NULL)
				break;

			/* Terminate the word at the first whitespace character */
			for (cut = raw; *cut && !t_isspace(cut); cut += pg_mblen(cut))
				;
			*cut = '\0';

			/* Nothing left: drop the line */
			if (raw[0] == '\0')
			{
				pfree(raw);
				continue;
			}

			/* Make room: 64 slots to begin with, doubling afterwards */
			if (s->len >= capacity)
			{
				if (capacity > 0)
				{
					capacity *= 2;
					words = (char **) repalloc((void *) words,
											   sizeof(char *) * capacity);
				}
				else
				{
					capacity = 64;
					words = (char **) palloc(sizeof(char *) * capacity);
				}
			}

			if (wordop == NULL)
				words[s->len] = raw;
			else
			{
				char	   *converted = wordop(raw);

				words[s->len] = converted;

				/* wordop returned a fresh string, so the line is unused */
				if (converted != raw)
					pfree(raw);
			}

			s->len++;
		}

		tsearch_readline_end(&reader);
		pfree(path);
	}

	s->stop = words;

	/* Keep the list ordered for binary search */
	if (s->stop && s->len > 0)
		qsort(s->stop, s->len, sizeof(char *), pg_qsort_strcmp);
}
/* * initTrie - create trie from file. * * Function converts UTF8-encoded file into current encoding. */ static TrieChar * initTrie(char *filename) { TrieChar *volatile rootTrie = NULL; MemoryContext ccxt = CurrentMemoryContext; tsearch_readline_state trst; volatile bool skip; filename = get_tsearch_config_filename(filename, "rules"); if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename))); do { /* * pg_do_encoding_conversion() (called by tsearch_readline()) will * emit exception if it finds untranslatable characters in current * locale. We just skip such lines, continuing with the next. */ skip = true; PG_TRY(); { char *line; while ((line = tsearch_readline(&trst)) != NULL) { /*---------- * The format of each line must be "src" or "src trg", where * src and trg are sequences of one or more non-whitespace * characters, separated by whitespace. Whitespace at start * or end of line is ignored. If trg is omitted, an empty * string is used as the replacement. 
* * We use a simple state machine, with states * 0 initial (before src) * 1 in src * 2 in whitespace after src * 3 in trg * 4 in whitespace after trg * -1 syntax error detected *---------- */ int state; char *ptr; char *src = NULL; char *trg = NULL; int ptrlen; int srclen = 0; int trglen = 0; state = 0; for (ptr = line; *ptr; ptr += ptrlen) { ptrlen = pg_mblen(ptr); /* ignore whitespace, but end src or trg */ if (t_isspace(ptr)) { if (state == 1) state = 2; else if (state == 3) state = 4; continue; } switch (state) { case 0: /* start of src */ src = ptr; srclen = ptrlen; state = 1; break; case 1: /* continue src */ srclen += ptrlen; break; case 2: /* start of trg */ trg = ptr; trglen = ptrlen; state = 3; break; case 3: /* continue trg */ trglen += ptrlen; break; default: /* bogus line format */ state = -1; break; } } if (state == 1 || state == 2) { /* trg was omitted, so use "" */ trg = ""; trglen = 0; } if (state > 0) rootTrie = placeChar(rootTrie, (unsigned char *) src, srclen, trg, trglen); else if (state < 0) ereport(WARNING, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("invalid syntax: more than two strings in unaccent rule"))); pfree(line); } skip = false; } PG_CATCH(); { ErrorData *errdata; MemoryContext ecxt; ecxt = MemoryContextSwitchTo(ccxt); errdata = CopyErrorData(); if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) { FlushErrorState(); } else { MemoryContextSwitchTo(ecxt); PG_RE_THROW(); } } PG_END_TRY(); } while (skip); tsearch_readline_end(&trst); return rootTrie; }
/* * initSuffixTree - create suffix tree from file. Function converts * UTF8-encoded file into current encoding. */ static SuffixChar * initSuffixTree(char *filename) { SuffixChar *volatile rootSuffixTree = NULL; MemoryContext ccxt = CurrentMemoryContext; tsearch_readline_state trst; volatile bool skip; filename = get_tsearch_config_filename(filename, "rules"); if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename))); do { /* * pg_do_encoding_conversion() (called by tsearch_readline()) will * emit exception if it finds untranslatable characters in current * locale. We just skip such lines, continuing with the next. */ skip = true; PG_TRY(); { char *line; while ((line = tsearch_readline(&trst)) != NULL) { /* * The format of each line must be "src trg" where src and trg * are sequences of one or more non-whitespace characters, * separated by whitespace. Whitespace at start or end of * line is ignored. 
*/ int state; char *ptr; char *src = NULL; char *trg = NULL; int ptrlen; int srclen = 0; int trglen = 0; state = 0; for (ptr = line; *ptr; ptr += ptrlen) { ptrlen = pg_mblen(ptr); /* ignore whitespace, but end src or trg */ if (t_isspace(ptr)) { if (state == 1) state = 2; else if (state == 3) state = 4; continue; } switch (state) { case 0: /* start of src */ src = ptr; srclen = ptrlen; state = 1; break; case 1: /* continue src */ srclen += ptrlen; break; case 2: /* start of trg */ trg = ptr; trglen = ptrlen; state = 3; break; case 3: /* continue trg */ trglen += ptrlen; break; default: /* bogus line format */ state = -1; break; } } if (state >= 3) rootSuffixTree = placeChar(rootSuffixTree, (unsigned char *) src, srclen, trg, trglen); pfree(line); } skip = false; } PG_CATCH(); { ErrorData *errdata; MemoryContext ecxt; ecxt = MemoryContextSwitchTo(ccxt); errdata = CopyErrorData(); if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) { FlushErrorState(); } else { MemoryContextSwitchTo(ecxt); PG_RE_THROW(); } } PG_END_TRY(); } while (skip); tsearch_readline_end(&trst); return rootSuffixTree; }