/*
 * Register `entry` with the inverter.
 *
 * When the entry has an analyzer, its value is run through that analyzer and
 * the resulting Inversion is inverted.  When it has no analyzer but is marked
 * indexed or highlightable, the whole value is wrapped in a single seed Token
 * and that one-token Inversion is inverted.  In every case the entry is then
 * pushed (with an extra reference) onto the inverter's entry list and the
 * list is flagged as unsorted.
 */
void
Inverter_Add_Field_IMP(Inverter *self, InverterEntry *entry) {
    InverterIVARS      *const ivars = Inverter_IVARS(self);
    InverterEntryIVARS *const eiv   = InvEntry_IVARS(entry);
    String             *const field_text = (String*)eiv->value;

    if (eiv->analyzer) {
        // Analyzed field: drop the stale Inversion, then tokenize the value.
        DECREF(eiv->inversion);
        eiv->inversion = Analyzer_Transform_Text(eiv->analyzer, field_text);
        Inversion_Invert(eiv->inversion);
    }
    else if (eiv->indexed || eiv->highlightable) {
        // Un-analyzed field: the entire value becomes one token.
        size_t byte_len = Str_Get_Size(field_text);
        Token *whole = Token_new(Str_Get_Ptr8(field_text), byte_len,
                                 0, byte_len, 1.0f, 1);
        DECREF(eiv->inversion);
        eiv->inversion = Inversion_new(whole);
        DECREF(whole);
        // With a single token there is very little for Invert to do.
        Inversion_Invert(eiv->inversion);
    }

    // Record the entry; the entry list needs re-sorting before iteration.
    VA_Push(ivars->entries, INCREF(entry));
    ivars->sorted = false;
}
/*
 * Run `text` through the PolyAnalyzer's chain of analyzers and return the
 * final Inversion.
 *
 * With an empty chain the whole text is wrapped in a single seed Token.
 * Otherwise the first analyzer consumes the text via Transform_Text and each
 * later analyzer consumes the previous analyzer's Inversion via Transform;
 * every intermediate Inversion is released once it has been consumed.
 */
Inversion*
PolyAnalyzer_Transform_Text_IMP(PolyAnalyzer *self, String *text) {
    VArray *const chain     = PolyAnalyzer_IVARS(self)->analyzers;
    const uint32_t chain_len = VA_Get_Size(chain);

    if (chain_len == 0) {
        // No analyzers: the result is a one-token Inversion over all of text.
        size_t      byte_len = Str_Get_Size(text);
        const char *bytes    = Str_Get_Ptr8(text);
        Token      *whole    = Token_new(bytes, byte_len, 0, byte_len, 1.0f, 1);
        Inversion  *single   = Inversion_new(whole);
        DECREF(whole);
        return single;
    }

    Analyzer  *head    = (Analyzer*)VA_Fetch(chain, 0);
    Inversion *current = Analyzer_Transform_Text(head, text);

    uint32_t tick = 1;
    while (tick < chain_len) {
        Analyzer  *stage = (Analyzer*)VA_Fetch(chain, tick);
        Inversion *next  = Analyzer_Transform(stage, current);
        DECREF(current);
        current = next;
        tick++;
    }

    return current;
}
/*
 * Scan one word whose first character is an ALetter, Numeric or Katakana
 * character, append it to `inversion` as a Token, and leave `iter` on the
 * character that ended the word.
 *
 * `state` is the word break property of the character under the iterator on
 * entry.  The return value is the last word break property examined (the
 * property of the character that stopped the scan), or -1 if the text ended
 * right after the first character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    int prop = -1;
    const lucy_StringIter word_start = *iter;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    lucy_StringIter word_end = *iter;

    int extends_word = 1;
    while (extends_word && iter->byte_pos < len) {
        prop = S_wb_lookup(text + iter->byte_pos);

        if (prop == WB_ALetter || prop == WB_Numeric) {
            // Letters and digits never continue a Katakana run.
            extends_word = (state != WB_Katakana);
        }
        else if (prop == WB_Katakana) {
            // Katakana never continues a letter or digit run.
            extends_word = (state != WB_ALetter && state != WB_Numeric);
        }
        else if (prop == WB_ExtendNumLet) {
            // Always part of the word.
        }
        else if (prop == WB_Extend_Format) {
            // Transparent: absorb the character but keep the current state.
            prop = state;
        }
        else if (prop == WB_MidNumLet
                 || prop == WB_MidLetter
                 || prop == WB_MidNum
                ) {
            // A "mid" character only joins the word when it sits between two
            // characters of the same kind, so look past it first.
            const int may_bridge
                = (state == WB_ALetter && prop != WB_MidNum)
                  || (state == WB_Numeric && prop != WB_MidLetter);
            extends_word = 0;
            if (may_bridge) {
                prop = S_skip_extend_format(text, len, iter);
                extends_word = (prop == state);
            }
        }
        else {
            extends_word = 0;
        }

        if (extends_word) {
            state = prop;
            S_iter_advance(text, iter);
            word_end = *iter;
        }
    }

    Token *word_token = Token_new(text + word_start.byte_pos,
                                  word_end.byte_pos - word_start.byte_pos,
                                  word_start.char_pos, word_end.char_pos,
                                  1.0f, 1);
    Inversion_Append(inversion, word_token);

    return prop;
}
/*
 * Emit a one-codepoint word: the character under the iterator plus whatever
 * S_skip_extend_format() consumes after it.  Intended for Alphabetic
 * characters lacking the ALetter word break property (ideographs, Hiragana,
 * "complex content").
 *
 * The Token spans from the iterator's position on entry to its position
 * after the skip.  Returns the value produced by S_skip_extend_format().
 */
static int
S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
               Inversion *inversion) {
    const lucy_StringIter origin = *iter;
    const int next_prop = S_skip_extend_format(text, len, iter);

    Token *single = Token_new(text + origin.byte_pos,
                              iter->byte_pos - origin.byte_pos,
                              origin.char_pos, iter->char_pos,
                              1.0f, 1);
    Inversion_Append(inversion, single);

    return next_prop;
}
/*
 * Split `text` (`len` bytes of UTF-8) on whitespace and append one Token per
 * run of non-whitespace characters to `inversion`.
 *
 * Each Token is created from the run's bytes, with the run's starting and
 * ending positions counted in code points as its offsets.
 *
 * Whitespace is decided by isspace(), which is only defined for values
 * representable as unsigned char (or EOF).  Decoded code points above 0xFF
 * are therefore never passed to it and are treated as non-whitespace.
 */
void
WhitespaceTokenizer_Tokenize_Str_IMP(WhitespaceTokenizer *self,
                                     const char *text, size_t len,
                                     Inversion *inversion) {
    size_t byte_pos       = 0;
    size_t char_pos       = 0;
    size_t start_byte_pos = 0;
    size_t start_char_pos = 0;
    int    prev_ws        = 1;   // Pretend the text is preceded by whitespace.

    (void)self;

    while (byte_pos < len) {
        uint32_t cp = StrHelp_decode_utf8_char(text + byte_pos);
        // Keep the isspace() argument inside its defined domain.
        int ws = (cp <= 0xFF) ? (isspace((int)cp) != 0) : 0;

        if (prev_ws && !ws) {
            // A new run of non-whitespace starts here.
            start_byte_pos = byte_pos;
            start_char_pos = char_pos;
        }
        else if (!prev_ws && ws) {
            // The current run just ended.
            Token *token = Token_new(text + start_byte_pos,
                                     byte_pos - start_byte_pos,
                                     start_char_pos, char_pos, 1.0f, 1);
            Inversion_Append(inversion, token);
        }

        prev_ws = ws;
        byte_pos += StrHelp_UTF8_COUNT[(uint8_t)text[byte_pos]];
        char_pos += 1;
    }

    if (!prev_ws) {
        // The text ended in the middle of a run; flush it.
        Token *token = Token_new(text + start_byte_pos,
                                 byte_pos - start_byte_pos,
                                 start_char_pos, char_pos, 1.0f, 1);
        Inversion_Append(inversion, token);
    }
}
void getToken (int fd, int sepBySpace) { i = 0; char s[1024]; gfd = fd; switch (ahead){ case A_NONE: c = getChar (gfd); break; case A_SPACE: ahead = A_NONE; Token_new(token, TOKEN_SPACE, 0); return; case A_CRLF: ahead = A_NONE; Token_new(token, TOKEN_CRLF, 0); return; default:{ char *info = "server bug"; write (1, info, strlen (info)); Http_print (gfd, http400); close (gfd); exit (0); return; } } while (1){ switch (c){ case ' ': if (sepBySpace){ if (i){ char *p; int kind; // remember the ' ' ahead = A_SPACE; s[i] = '\0'; p = malloc (strlen(s)+1); strcpy (p, s); kind = Token_getKeyWord (p); if (kind>=0){ Token_new (token, kind, 0); return; } Token_new (token, TOKEN_STR, p); return; } Token_new(token, TOKEN_SPACE, 0); return; } s[i++] = c; break; case '\r':{ char c2; c2 = getChar (gfd); if (c2=='\n'){ if (i){ char *p; int kind; // remember the ' ' ahead = A_CRLF; s[i] = '\0'; p = malloc (strlen(s)+1); strcpy (p, s); kind = Token_getKeyWord (p); if (kind>=0){ Token_new (token, kind, 0); return; } Token_new (token, TOKEN_STR, p); return; } Token_new(token, TOKEN_CRLF, 0); return; } s[i++] = c; s[i++] = c2; break; } default: s[i++] = c; break; } c = getChar (gfd); } return; }
// Scanner_scan: return the next token code from the scanner `s`.
//
// The embedded #line directives map this body back to scanner.re, and the
// yyNN labels / YYCURSOR / YYLIMIT / YYMARKER / YYFILL macros form a generated
// state machine.  NOTE(review): presumably re2c output -- confirm, and change
// scanner.re rather than hand-editing the states below.
//
// Layout (three sections, entered through the labels scan, code, comment):
//
//   scan:    records the token start (s->tchar, s->tline, s->tok) and then
//            dispatches on the next character:
//              - '{' not followed by a digit: depth = 1, continue at code
//              - '{' n '}', '{' n ',' '}', '{' n ',' m '}': fills
//                yylval.extop.minsize / maxsize and yields CLOSESIZE
//              - slash-star: depth = 1, continue at comment
//              - star-slash: s->tok = cursor, RETURN(0)
//              - "..." : yylval.regexp = strToRE(...), returns STRING
//              - '...' : yylval.regexp = strToCaseInsensitiveRE(...),
//                returns STRING
//              - [...] : ranToRE(...), [^...] : invToRE(...), returns RANGE
//              - '.'   : mkDot(), returns RANGE
//              - letter followed by letters/digits: Symbol_find(), returns ID
//              - '*' (not followed by '/'), '+', '?': yylval.op set, CLOSE
//              - '(' ')' ';' '=' '\' '|' and a lone '/': RETURN(*s->tok)
//              - spaces/tabs: skipped (goto scan)
//              - newline: RETURN(0) if cursor == s->eof, otherwise bumps
//                s->cline, resets s->pos and rescans
//              - unterminated string / range on a line: Scanner_fatal()
//              - anything else: prints "unexpected character" to stderr and
//                rescans
//   code:    consumes a brace-delimited code block, tracking nesting in
//            `depth` and skipping quoted strings; when depth reaches 0 it
//            stores s->cur, sets yylval.token = Token_new(Scanner_token(s),
//            s->tline) and returns CODE.  A newline at s->eof is fatal
//            ("missing '}'").
//   comment: consumes a (nestable) block comment, tracking nesting in
//            `depth`; returns to scan when depth reaches 0, RETURN(0) on a
//            newline at s->eof.
//
// NOTE(review): `cursor` is assumed to be what YYCURSOR expands to, and
// RETURN(x) is assumed to store the cursor back into `s` before returning x;
// both macros are defined outside this view -- confirm.
int Scanner_scan(Scanner *s) { unsigned char *cursor = s->cur; unsigned int depth; scan: s->tchar = cursor - s->pos; s->tline = s->cline; s->tok = cursor; #line 224 "scanner.c" { YYCTYPE yych; unsigned int yyaccept; goto yy29; ++YYCURSOR; yy29: if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; if(yych <= '/'){ if(yych <= '"'){ if(yych <= '\n'){ if(yych <= '\b') goto yy53; if(yych <= '\t') goto yy47; goto yy49; } else { if(yych == ' ') goto yy47; if(yych <= '!') goto yy53; goto yy37; } } else { if(yych <= '*'){ if(yych <= '&') goto yy53; if(yych <= '\'') goto yy39; if(yych <= ')') goto yy43; goto yy35; } else { if(yych <= '+') goto yy44; if(yych <= '-') goto yy53; if(yych <= '.') goto yy51; goto yy33; } } } else { if(yych <= '@'){ if(yych <= '<'){ if(yych == ';') goto yy43; goto yy53; } else { if(yych <= '=') goto yy43; if(yych == '?') goto yy44; goto yy53; } } else { if(yych <= '`'){ if(yych <= 'Z') goto yy45; if(yych <= '[') goto yy41; if(yych <= '\\') goto yy43; goto yy53; } else { if(yych <= 'z') goto yy45; if(yych <= '{') goto yy31; if(yych <= '|') goto yy43; goto yy53; } } } yy31: yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); if(yych <= '/') goto yy32; if(yych <= '9') goto yy84; goto yy32; yy32: #line 133 "scanner.re" { depth = 1; goto code; } #line 291 "scanner.c" yy33: yych = *++YYCURSOR; if(yych == '*') goto yy82; goto yy34; yy34: #line 163 "scanner.re" { RETURN(*s->tok); } #line 298 "scanner.c" yy35: yych = *++YYCURSOR; if(yych == '/') goto yy80; goto yy36; yy36: #line 165 "scanner.re" { yylval.op = *s->tok; RETURN(CLOSE); } #line 306 "scanner.c" yy37: yyaccept = 1; yych = *(YYMARKER = ++YYCURSOR); if(yych != '\n') goto yy76; goto yy38; yy38: #line 150 "scanner.re" { Scanner_fatal(s, "unterminated string constant (missing \")"); } #line 314 "scanner.c" yy39: yyaccept = 2; yych = *(YYMARKER = ++YYCURSOR); if(yych != '\n') goto yy71; goto yy40; yy40: #line 151 "scanner.re" { Scanner_fatal(s, "unterminated string constant (missing ')"); } #line 322 
"scanner.c" yy41: yyaccept = 3; yych = *(YYMARKER = ++YYCURSOR); if(yych == '\n') goto yy42; if(yych == '^') goto yy62; goto yy60; yy42: #line 161 "scanner.re" { Scanner_fatal(s, "unterminated range (missing ])"); } #line 331 "scanner.c" yy43: yych = *++YYCURSOR; goto yy34; yy44: yych = *++YYCURSOR; goto yy36; yy45: yych = *++YYCURSOR; goto yy58; yy46: #line 180 "scanner.re" { SubStr substr; s->cur = cursor; substr = Scanner_token(s); yylval.symbol = Symbol_find(&substr); return ID; } #line 345 "scanner.c" yy47: yych = *++YYCURSOR; goto yy56; yy48: #line 186 "scanner.re" { goto scan; } #line 351 "scanner.c" yy49: yych = *++YYCURSOR; goto yy50; yy50: #line 188 "scanner.re" { if(cursor == s->eof) RETURN(0); s->pos = cursor; s->cline++; goto scan; } #line 360 "scanner.c" yy51: yych = *++YYCURSOR; goto yy52; yy52: #line 193 "scanner.re" { s->cur = cursor; yylval.regexp = mkDot(); return RANGE; } #line 369 "scanner.c" yy53: yych = *++YYCURSOR; goto yy54; yy54: #line 198 "scanner.re" { fprintf(stderr, "unexpected character: '%c'\n", *s->tok); goto scan; } #line 377 "scanner.c" yy55: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy56; yy56: if(yych == '\t') goto yy55; if(yych == ' ') goto yy55; goto yy48; yy57: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy58; yy58: if(yych <= '@'){ if(yych <= '/') goto yy46; if(yych <= '9') goto yy57; goto yy46; } else { if(yych <= 'Z') goto yy57; if(yych <= '`') goto yy46; if(yych <= 'z') goto yy57; goto yy46; } yy59: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy60; yy60: if(yych <= '['){ if(yych != '\n') goto yy59; goto yy61; } else { if(yych <= '\\') goto yy64; if(yych <= ']') goto yy65; goto yy59; } yy61: YYCURSOR = YYMARKER; switch(yyaccept){ case 0: goto yy32; case 1: goto yy38; case 2: goto yy40; case 3: goto yy42; } yy62: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy63; yy63: if(yych <= '['){ if(yych == '\n') goto yy61; goto 
yy62; } else { if(yych <= '\\') goto yy67; if(yych <= ']') goto yy68; goto yy62; } yy64: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; if(yych == '\n') goto yy61; goto yy59; yy65: yych = *++YYCURSOR; goto yy66; yy66: #line 157 "scanner.re" { s->cur = cursor; yylval.regexp = ranToRE(Scanner_token(s)); return RANGE; } #line 442 "scanner.c" yy67: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; if(yych == '\n') goto yy61; goto yy62; yy68: yych = *++YYCURSOR; goto yy69; yy69: #line 153 "scanner.re" { s->cur = cursor; yylval.regexp = invToRE(Scanner_token(s)); return RANGE; } #line 455 "scanner.c" yy70: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy71; yy71: if(yych <= '&'){ if(yych == '\n') goto yy61; goto yy70; } else { if(yych <= '\'') goto yy73; if(yych != '\\') goto yy70; goto yy72; } yy72: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; if(yych == '\n') goto yy61; goto yy70; yy73: yych = *++YYCURSOR; goto yy74; yy74: #line 146 "scanner.re" { s->cur = cursor; yylval.regexp = strToCaseInsensitiveRE(Scanner_token(s)); return STRING; } #line 480 "scanner.c" yy75: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy76; yy76: if(yych <= '!'){ if(yych == '\n') goto yy61; goto yy75; } else { if(yych <= '"') goto yy78; if(yych != '\\') goto yy75; goto yy77; } yy77: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; if(yych == '\n') goto yy61; goto yy75; yy78: yych = *++YYCURSOR; goto yy79; yy79: #line 142 "scanner.re" { s->cur = cursor; yylval.regexp = strToRE(Scanner_token(s)); return STRING; } #line 505 "scanner.c" yy80: yych = *++YYCURSOR; goto yy81; yy81: #line 139 "scanner.re" { s->tok = cursor; RETURN(0); } #line 512 "scanner.c" yy82: yych = *++YYCURSOR; goto yy83; yy83: #line 136 "scanner.re" { depth = 1; goto comment; } #line 519 "scanner.c" yy84: ++YYCURSOR; if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; goto yy85; yy85: if(yych <= '/'){ 
if(yych == ',') goto yy88; goto yy61; } else { if(yych <= '9') goto yy84; if(yych != '}') goto yy61; goto yy86; } yy86: yych = *++YYCURSOR; goto yy87; yy87: #line 168 "scanner.re" { yylval.extop.minsize = atoi((char *)s->tok+1); yylval.extop.maxsize = atoi((char *)s->tok+1); RETURN(CLOSESIZE); } #line 539 "scanner.c" yy88: yych = *++YYCURSOR; if(yych != '}') goto yy92; goto yy89; yy89: yych = *++YYCURSOR; goto yy90; yy90: #line 176 "scanner.re" { yylval.extop.minsize = atoi((char *)s->tok+1); yylval.extop.maxsize = -1; RETURN(CLOSESIZE); } #line 550 "scanner.c" yy91: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy92; yy92: if(yych <= '/') goto yy61; if(yych <= '9') goto yy91; if(yych != '}') goto yy61; goto yy93; yy93: yych = *++YYCURSOR; goto yy94; yy94: #line 172 "scanner.re" { yylval.extop.minsize = atoi((char *)s->tok+1); yylval.extop.maxsize = MAX(yylval.extop.minsize,atoi(strchr((char *)s->tok, ',')+1)); RETURN(CLOSESIZE); } #line 566 "scanner.c" } #line 201 "scanner.re" code: #line 573 "scanner.c" { YYCTYPE yych; unsigned int yyaccept; goto yy95; ++YYCURSOR; yy95: if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; if(yych <= '&'){ if(yych <= '\n'){ if(yych <= '\t') goto yy103; goto yy101; } else { if(yych == '"') goto yy105; goto yy103; } } else { if(yych <= '{'){ if(yych <= '\'') goto yy106; if(yych <= 'z') goto yy103; goto yy99; } else { if(yych != '}') goto yy103; goto yy97; } } yy97: yych = *++YYCURSOR; goto yy98; yy98: #line 205 "scanner.re" { if(--depth == 0){ s->cur = cursor; yylval.token = Token_new(Scanner_token(s), s->tline); return CODE; } goto code; } #line 610 "scanner.c" yy99: yych = *++YYCURSOR; goto yy100; yy100: #line 211 "scanner.re" { ++depth; goto code; } #line 617 "scanner.c" yy101: yych = *++YYCURSOR; goto yy102; yy102: #line 213 "scanner.re" { if(cursor == s->eof) Scanner_fatal(s, "missing '}'"); s->pos = cursor; s->cline++; goto code; } #line 626 "scanner.c" yy103: yych = *++YYCURSOR; goto yy104; yy104: 
#line 217 "scanner.re" { goto code; } #line 632 "scanner.c" yy105: yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); if(yych == '\n') goto yy104; goto yy112; yy106: yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); if(yych == '\n') goto yy104; goto yy108; yy107: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy108; yy108: if(yych <= '&'){ if(yych != '\n') goto yy107; goto yy109; } else { if(yych <= '\'') goto yy103; if(yych == '\\') goto yy110; goto yy107; } yy109: YYCURSOR = YYMARKER; switch(yyaccept){ case 0: goto yy104; } yy110: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; if(yych == '\n') goto yy109; goto yy107; yy111: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; goto yy112; yy112: if(yych <= '!'){ if(yych == '\n') goto yy109; goto yy111; } else { if(yych <= '"') goto yy103; if(yych != '\\') goto yy111; goto yy113; } yy113: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; if(yych == '\n') goto yy109; goto yy111; } #line 218 "scanner.re" comment: #line 685 "scanner.c" { YYCTYPE yych; goto yy114; ++YYCURSOR; yy114: if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; if(yych <= ')'){ if(yych == '\n') goto yy119; goto yy121; } else { if(yych <= '*') goto yy116; if(yych == '/') goto yy118; goto yy121; } yy116: yych = *++YYCURSOR; if(yych == '/') goto yy124; goto yy117; yy117: #line 232 "scanner.re" { goto comment; } #line 707 "scanner.c" yy118: yych = *++YYCURSOR; if(yych == '*') goto yy122; goto yy117; yy119: yych = *++YYCURSOR; goto yy120; yy120: #line 228 "scanner.re" { if(cursor == s->eof) RETURN(0); s->tok = s->pos = cursor; s->cline++; goto comment; } #line 719 "scanner.c" yy121: yych = *++YYCURSOR; goto yy117; yy122: yych = *++YYCURSOR; goto yy123; yy123: #line 226 "scanner.re" { ++depth; goto comment; } #line 728 "scanner.c" yy124: yych = *++YYCURSOR; goto yy125; yy125: #line 222 "scanner.re" { if(--depth == 0) goto scan; else goto comment; } #line 737 "scanner.c" } #line 233 
"scanner.re" }
/*
 * Scan one word whose first character is an ALetter, Numeric, Katakana or
 * ExtendNumLet character, append it to `inversion` as a Token, and leave
 * `iter` on the character that ended the word.
 *
 * `state` is the word break property of the character under the iterator on
 * entry.  The return value is the last word break property examined (the
 * property of the character that stopped the scan), or -1 if the text ended
 * right after the first character.  Rule numbers in the comments are the
 * ones cited by the original implementation.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    int prop = -1;
    const lucy_StringIter word_start = *iter;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    lucy_StringIter word_end = *iter;

    int keep_going = 1;
    while (keep_going && iter->byte_pos < len) {
        prop = S_wb_lookup(text + iter->byte_pos);

        switch (prop) {
            case WB_ALetter:
            case WB_Hebrew_Letter:
            case WB_Numeric:
                // Rules WB5, WB8, WB9, WB10, and WB13b -- but never directly
                // after Katakana.
                keep_going = (state != WB_Katakana);
                break;

            case WB_Katakana:
                // Rules WB13 and WB13b.
                keep_going = (state == WB_Katakana
                              || state == WB_ExtendNumLet);
                break;

            case WB_ExtendNumLet:
                // Rule WB13a: always part of the word.
                break;

            case WB_Extend_Format:
                // Rule WB4: absorb the character, keep the current state.
                prop = state;
                break;

            case WB_Single_Quote:
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                // A "mid" character only joins the word when the character
                // after it continues the same kind of run.
                keep_going = 0;
                if (state == WB_ALetter || state == WB_Hebrew_Letter) {
                    if (prop != WB_MidNum) {
                        if (state == WB_Hebrew_Letter
                            && prop == WB_Single_Quote
                           ) {
                            // Rule WB7a: the quote stays attached to the
                            // Hebrew letter even if the word ends here.
                            ++word_end.byte_pos;
                            ++word_end.char_pos;
                        }
                        prop = S_skip_extend_format(text, len, iter);
                        // Rules WB6 and WB7.
                        keep_going = (prop == WB_ALetter
                                      || prop == WB_Hebrew_Letter);
                    }
                }
                else if (state == WB_Numeric) {
                    if (prop != WB_MidLetter) {
                        prop = S_skip_extend_format(text, len, iter);
                        // Rules WB11 and WB12.
                        keep_going = (prop == state);
                    }
                }
                break;

            case WB_Double_Quote:
                keep_going = 0;
                if (state == WB_Hebrew_Letter) {
                    prop = S_skip_extend_format(text, len, iter);
                    // Rules WB7b and WB7c.
                    keep_going = (prop == state);
                }
                break;

            default:
                keep_going = 0;
                break;
        }

        if (keep_going) {
            state = prop;
            S_iter_advance(text, iter);
            word_end = *iter;
        }
    }

    Token *word_token = Token_new(text + word_start.byte_pos,
                                  word_end.byte_pos - word_start.byte_pos,
                                  word_start.char_pos, word_end.char_pos,
                                  1.0f, 1);
    Inversion_Append(inversion, word_token);

    return prop;
}