/* * placeChar - put str into tree's structure, byte by byte. */ static SuffixChar * placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen) { SuffixChar *curnode; if (!node) { node = palloc(sizeof(SuffixChar) * 256); memset(node, 0, sizeof(SuffixChar) * 256); } curnode = node + *str; if (lenstr == 1) { if (curnode->replaceTo) elog(WARNING, "duplicate TO argument, use first one"); else { curnode->replacelen = replacelen; curnode->replaceTo = palloc(replacelen); memcpy(curnode->replaceTo, replaceTo, replacelen); } } else { curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen); } return node; }
/* * placeChar - put str into trie's structure, byte by byte. * * If node is NULL, we need to make a new node, which will be returned; * otherwise the return value is the same as node. */ static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen) { TrieChar *curnode; if (!node) node = (TrieChar *) palloc0(sizeof(TrieChar) * 256); Assert(lenstr > 0); /* else str[0] doesn't exist */ curnode = node + *str; if (lenstr <= 1) { if (curnode->replaceTo) ereport(WARNING, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("duplicate source strings, first one will be used"))); else { curnode->replacelen = replacelen; curnode->replaceTo = (char *) palloc(replacelen); memcpy(curnode->replaceTo, replaceTo, replacelen); } } else { curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen); } return node; }
/* * initSuffixTree - create suffix tree from file. Function converts * UTF8-encoded file into current encoding. */ static SuffixChar * initSuffixTree(char *filename) { SuffixChar *volatile rootSuffixTree = NULL; MemoryContext ccxt = CurrentMemoryContext; tsearch_readline_state trst; volatile bool skip; filename = get_tsearch_config_filename(filename, "rules"); if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename))); do { char src[4096]; char trg[4096]; int srclen; int trglen; char *line = NULL; skip = true; PG_TRY(); { /* * pg_do_encoding_conversion() (called by tsearch_readline()) will * emit exception if it finds untranslatable characters in current * locale. We just skip such characters. */ while ((line = tsearch_readline(&trst)) != NULL) { if (sscanf(line, "%s\t%s\n", src, trg) != 2) continue; srclen = strlen(src); trglen = strlen(trg); rootSuffixTree = placeChar(rootSuffixTree, (unsigned char *) src, srclen, trg, trglen); skip = false; pfree(line); } } PG_CATCH(); { ErrorData *errdata; MemoryContext ecxt; ecxt = MemoryContextSwitchTo(ccxt); errdata = CopyErrorData(); if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) { FlushErrorState(); } else { MemoryContextSwitchTo(ecxt); PG_RE_THROW(); } } PG_END_TRY(); } while (skip); tsearch_readline_end(&trst); return rootSuffixTree; }
/* * initTrie - create trie from file. * * Function converts UTF8-encoded file into current encoding. */ static TrieChar * initTrie(char *filename) { TrieChar *volatile rootTrie = NULL; MemoryContext ccxt = CurrentMemoryContext; tsearch_readline_state trst; volatile bool skip; filename = get_tsearch_config_filename(filename, "rules"); if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename))); do { /* * pg_do_encoding_conversion() (called by tsearch_readline()) will * emit exception if it finds untranslatable characters in current * locale. We just skip such lines, continuing with the next. */ skip = true; PG_TRY(); { char *line; while ((line = tsearch_readline(&trst)) != NULL) { /*---------- * The format of each line must be "src" or "src trg", where * src and trg are sequences of one or more non-whitespace * characters, separated by whitespace. Whitespace at start * or end of line is ignored. If trg is omitted, an empty * string is used as the replacement. * * We use a simple state machine, with states * 0 initial (before src) * 1 in src * 2 in whitespace after src * 3 in trg * 4 in whitespace after trg * -1 syntax error detected *---------- */ int state; char *ptr; char *src = NULL; char *trg = NULL; int ptrlen; int srclen = 0; int trglen = 0; state = 0; for (ptr = line; *ptr; ptr += ptrlen) { ptrlen = pg_mblen(ptr); /* ignore whitespace, but end src or trg */ if (t_isspace(ptr)) { if (state == 1) state = 2; else if (state == 3) state = 4; continue; } switch (state) { case 0: /* start of src */ src = ptr; srclen = ptrlen; state = 1; break; case 1: /* continue src */ srclen += ptrlen; break; case 2: /* start of trg */ trg = ptr; trglen = ptrlen; state = 3; break; case 3: /* continue trg */ trglen += ptrlen; break; default: /* bogus line format */ state = -1; break; } } if (state == 1 || state == 2) { /* trg was omitted, so use "" */ trg = ""; trglen = 0; } if (state > 0) rootTrie = placeChar(rootTrie, (unsigned char *) src, srclen, trg, trglen); else if (state < 0) ereport(WARNING, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("invalid syntax: more than two strings in unaccent rule"))); pfree(line); } skip = false; } PG_CATCH(); { ErrorData *errdata; MemoryContext ecxt; ecxt = MemoryContextSwitchTo(ccxt); errdata = CopyErrorData(); if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) { FlushErrorState(); } else { MemoryContextSwitchTo(ecxt); PG_RE_THROW(); } } PG_END_TRY(); } while (skip); tsearch_readline_end(&trst); return rootTrie; }
/* * initSuffixTree - create suffix tree from file. Function converts * UTF8-encoded file into current encoding. */ static SuffixChar * initSuffixTree(char *filename) { SuffixChar *volatile rootSuffixTree = NULL; MemoryContext ccxt = CurrentMemoryContext; tsearch_readline_state trst; volatile bool skip; filename = get_tsearch_config_filename(filename, "rules"); if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename))); do { /* * pg_do_encoding_conversion() (called by tsearch_readline()) will * emit exception if it finds untranslatable characters in current * locale. We just skip such lines, continuing with the next. */ skip = true; PG_TRY(); { char *line; while ((line = tsearch_readline(&trst)) != NULL) { /* * The format of each line must be "src trg" where src and trg * are sequences of one or more non-whitespace characters, * separated by whitespace. Whitespace at start or end of * line is ignored. */ int state; char *ptr; char *src = NULL; char *trg = NULL; int ptrlen; int srclen = 0; int trglen = 0; state = 0; for (ptr = line; *ptr; ptr += ptrlen) { ptrlen = pg_mblen(ptr); /* ignore whitespace, but end src or trg */ if (t_isspace(ptr)) { if (state == 1) state = 2; else if (state == 3) state = 4; continue; } switch (state) { case 0: /* start of src */ src = ptr; srclen = ptrlen; state = 1; break; case 1: /* continue src */ srclen += ptrlen; break; case 2: /* start of trg */ trg = ptr; trglen = ptrlen; state = 3; break; case 3: /* continue trg */ trglen += ptrlen; break; default: /* bogus line format */ state = -1; break; } } if (state >= 3) rootSuffixTree = placeChar(rootSuffixTree, (unsigned char *) src, srclen, trg, trglen); pfree(line); } skip = false; } PG_CATCH(); { ErrorData *errdata; MemoryContext ecxt; ecxt = MemoryContextSwitchTo(ccxt); errdata = CopyErrorData(); if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) { FlushErrorState(); } else { MemoryContextSwitchTo(ecxt); PG_RE_THROW(); } } PG_END_TRY(); } while (skip); tsearch_readline_end(&trst); return rootSuffixTree; }