Exemple #1
0
/*
 * Register `entry` with the inverter, first (re)building the entry's
 * Inversion when the field calls for one:
 *   - with an analyzer, the field value is run through the analyzer;
 *   - without one, an indexed or highlightable field gets an Inversion made
 *     from a single Token spanning the whole value;
 *   - otherwise the entry's existing inversion is left untouched.
 */
void
Inverter_Add_Field_IMP(Inverter *self, InverterEntry *entry) {
    InverterIVARS      *const inverter = Inverter_IVARS(self);
    InverterEntryIVARS *const field    = InvEntry_IVARS(entry);

    if (!field->analyzer) {
        if (field->indexed || field->highlightable) {
            // No analyzer: wrap the entire value in one seed Token.
            String *text  = (String*)field->value;
            size_t  size  = Str_Get_Size(text);
            Token  *whole = Token_new(Str_Get_Ptr8(text), size, 0, size,
                                      1.0f, 1);
            DECREF(field->inversion);
            field->inversion = Inversion_new(whole);
            DECREF(whole);
            // With a single token there is next to nothing to invert.
            Inversion_Invert(field->inversion);
        }
    }
    else {
        // Drop the previous Inversion, then let the analyzer build a new one.
        DECREF(field->inversion);
        field->inversion = Analyzer_Transform_Text(field->analyzer,
                                                   (String*)field->value);
        Inversion_Invert(field->inversion);
    }

    // Keep a reference to the entry and flag the entry list as unsorted.
    VA_Push(inverter->entries, INCREF(entry));
    inverter->sorted = false;
}
Exemple #2
0
/*
 * Run `text` through every analyzer held by this PolyAnalyzer, in order.
 *
 * The first analyzer consumes the text directly; each later one transforms
 * the Inversion produced by its predecessor, whose reference is released
 * once it has been consumed.  With no analyzers at all, the result is an
 * Inversion holding one Token that spans the whole text.
 *
 * Returns the final Inversion.
 */
Inversion*
PolyAnalyzer_Transform_Text_IMP(PolyAnalyzer *self, String *text) {
    VArray *const  chain = PolyAnalyzer_IVARS(self)->analyzers;
    const uint32_t count = VA_Get_Size(chain);

    if (count == 0) {
        // Empty chain: the untouched text becomes a single seed Token.
        size_t      size = Str_Get_Size(text);
        const char *ptr  = Str_Get_Ptr8(text);
        Token      *seed = Token_new(ptr, size, 0, size, 1.0f, 1);
        Inversion  *solo = Inversion_new(seed);
        DECREF(seed);
        return solo;
    }

    Analyzer  *head    = (Analyzer*)VA_Fetch(chain, 0);
    Inversion *current = Analyzer_Transform_Text(head, text);
    uint32_t   tick    = 1;

    while (tick < count) {
        Analyzer  *stage = (Analyzer*)VA_Fetch(chain, tick);
        Inversion *next  = Analyzer_Transform(stage, current);
        DECREF(current);
        current = next;
        tick++;
    }

    return current;
}
Exemple #3
0
/*
 * Consume one word whose first character is ALetter, Numeric or Katakana
 * and append it to `inversion` as a single Token.
 *
 * `state` is the word break property of the character `iter` points at on
 * entry.  The iterator is advanced as the word is scanned.  The return value
 * is the most recently determined word break property, or -1 when the text
 * ends right after the first character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    const lucy_StringIter word_start = *iter;
    lucy_StringIter       word_end;
    int                   prop     = -1;
    int                   at_break = 0;
    Token                *word;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    word_end = *iter;

    while (!at_break && iter->byte_pos < len) {
        prop = S_wb_lookup(text + iter->byte_pos);

        switch (prop) {
            case WB_ALetter:
            case WB_Numeric:
                // Letters and digits never continue a Katakana run.
                at_break = (state == WB_Katakana);
                break;
            case WB_Katakana:
                // Katakana never continues a letter or digit run.
                at_break = (state == WB_ALetter || state == WB_Numeric);
                break;
            case WB_ExtendNumLet:
                break;
            case WB_Extend_Format:
                // Transparent character: the state carries over unchanged.
                prop = state;
                break;
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum: {
                // A mid character only joins the word when it suits the
                // current state and the same kind of character follows it.
                const int bridgeable
                    = (state == WB_ALetter && prop != WB_MidNum)
                      || (state == WB_Numeric && prop != WB_MidLetter);
                at_break = 1;
                if (bridgeable) {
                    prop = S_skip_extend_format(text, len, iter);
                    if (prop == state) { at_break = 0; }
                }
                break;
            }
            default:
                at_break = 1;
                break;
        }

        if (!at_break) {
            state = prop;
            S_iter_advance(text, iter);
            word_end = *iter;
        }
    }

    word = Token_new(text + word_start.byte_pos,
                     word_end.byte_pos - word_start.byte_pos,
                     word_start.char_pos, word_end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, word);

    return prop;
}
Exemple #4
0
/*
 * Emit a one-codepoint word: the character at `iter` plus whatever
 * S_skip_extend_format() consumes after it.  Intended for Alphabetic
 * characters lacking the ALetter word break property (ideographs, Hiragana,
 * "complex content").
 *
 * The resulting Token is appended to `inversion`; `iter` ends up wherever
 * S_skip_extend_format() left it, and that function's result (a word break
 * property) is returned.
 */
static int
S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
               Inversion *inversion) {
    const lucy_StringIter origin  = *iter;
    const int             next_wb = S_skip_extend_format(text, len, iter);
    const char           *word    = text + origin.byte_pos;

    Token *single = Token_new(word, iter->byte_pos - origin.byte_pos,
                              origin.char_pos, iter->char_pos, 1.0f, 1);
    Inversion_Append(inversion, single);

    return next_wb;
}
/*
 * Split `text` (`len` bytes) into whitespace-delimited tokens and append one
 * Token per run of non-whitespace characters to `inversion`.
 *
 * Each Token is created from the run's bytes together with the character
 * positions at which the run starts and ends.  Whitespace is whatever
 * isspace() reports in the current locale for code points up to 0xFF; larger
 * code points are never treated as whitespace.
 *
 * NOTE(review): the loop presumably expects well-formed UTF-8.  It advances
 * by StrHelp_UTF8_COUNT[lead byte] without checking the result against
 * `len` -- confirm that callers validate their input.
 */
void
WhitespaceTokenizer_Tokenize_Str_IMP(WhitespaceTokenizer *self,
                                     const char *text, size_t len,
                                     Inversion *inversion) {
    size_t byte_pos       = 0;
    size_t char_pos       = 0;
    size_t start_byte_pos = 0;
    size_t start_char_pos = 0;
    int    prev_ws        = 1;  // Treat start-of-text as following whitespace.

    while (byte_pos < len) {
        uint32_t cp = StrHelp_decode_utf8_char(text + byte_pos);
        // isspace() has undefined behavior for arguments that are neither
        // EOF nor representable as unsigned char, so only consult it for
        // code points in that range.
        int ws = (cp <= 0xFF) ? isspace((int)cp) : 0;

        if (prev_ws && !ws) {
            // First character of a new token.
            start_byte_pos = byte_pos;
            start_char_pos = char_pos;
        }
        else if (!prev_ws && ws) {
            // Whitespace right after a token: emit the token.
            Token *token = Token_new(text + start_byte_pos,
                                     byte_pos - start_byte_pos,
                                     start_char_pos, char_pos, 1.0f, 1);
            Inversion_Append(inversion, token);
        }

        prev_ws = ws;
        byte_pos += StrHelp_UTF8_COUNT[(uint8_t)text[byte_pos]];
        char_pos += 1;
    }

    // Text ended in the middle of a token: emit the trailing token.
    if (!prev_ws) {
        Token *token = Token_new(text + start_byte_pos,
                                 byte_pos - start_byte_pos,
                                 start_char_pos, char_pos, 1.0f, 1);
        Inversion_Append(inversion, token);
    }
}
Exemple #6
0
void getToken (int fd, int sepBySpace)
{
  i = 0;
  char s[1024];
  gfd = fd;
  switch (ahead){
  case A_NONE:
    c = getChar (gfd);
    break;
  case A_SPACE:
    ahead = A_NONE;
    Token_new(token, TOKEN_SPACE, 0);
    return;
  case A_CRLF:
    ahead = A_NONE;
    Token_new(token, TOKEN_CRLF, 0);
    return;
  default:{
    char *info = "server bug";
    write (1, info, strlen (info));
    Http_print (gfd, http400);
    close (gfd);
    exit (0);
    return;
  }
  }

  while (1){
    switch (c){
    case ' ':
      if (sepBySpace){
	if (i){
	  char *p;
	  int kind;

	  // remember the ' '
	  ahead = A_SPACE;
	  s[i] = '\0';
	  p = malloc (strlen(s)+1);
	  strcpy (p, s);
	  kind = Token_getKeyWord (p);
	  if (kind>=0){

	    Token_new (token, kind, 0);
	    return;
	  }
	  Token_new (token, TOKEN_STR, p);
	  return;
	}
	Token_new(token, TOKEN_SPACE, 0);
	return;
      }
      s[i++] = c;
      break;
    case '\r':{
      char c2;

      c2 = getChar (gfd);
      if (c2=='\n'){
	if (i){
	  char *p;
	  int kind;
	  // remember the ' '
	  ahead = A_CRLF;
	  s[i] = '\0';
	  p = malloc (strlen(s)+1);
	  strcpy (p, s);
	  kind = Token_getKeyWord (p);
	  if (kind>=0){
	    Token_new (token, kind, 0);
	    return;
	  }
	  Token_new (token, TOKEN_STR, p);
	  return;
	}
	Token_new(token, TOKEN_CRLF, 0);
	return;
      }
      s[i++] = c;
      s[i++] = c2;
      break;
    }
    default:
      s[i++] = c;
      break;
    }
    c = getChar (gfd);
  }
  return;
}
Exemple #7
0
/*
 * Scan the next token from the scanner's input, starting at s->cur.
 *
 * The body looks like machine-generated state-machine code (the #line
 * directives point back at "scanner.re"), so the yyNN labels are best read
 * as opaque automaton states.  It is made of three sections:
 *
 *   scan:     Records where the token starts (s->tok, s->tchar, s->tline)
 *             and dispatches on the first character.  Identifiers return
 *             ID, quoted strings return STRING, bracketed ranges and '.'
 *             return RANGE; operators and "{n}", "{n,}", "{n,m}" repetition
 *             counts go through the RETURN() macro with the payload stored
 *             in yylval.  Blanks and newlines restart the scan.  A '{' not
 *             followed by a digit switches to the code section; a slash
 *             followed by a star switches to the comment section.
 *   code:     Skips over a brace-delimited block, tracking nesting in
 *             `depth` and stepping over quoted strings; when the matching
 *             '}' is found the whole block is returned as a CODE token.
 *   comment:  Skips a comment, tracking nested comment openers in `depth`,
 *             then resumes at scan.
 *
 * NOTE(review): YYCURSOR, YYLIMIT, YYMARKER, YYFILL and RETURN are macros
 * defined outside this function -- presumably YYCURSOR aliases the local
 * `cursor`; confirm against their definitions.
 */
int
Scanner_scan(Scanner *s)
{
    unsigned char *cursor = s->cur;
    unsigned int depth;  /* nesting level used by the code and comment sections */

/* Start of a new token: remember its column, line and first byte. */
scan:
    s->tchar = cursor - s->pos;
    s->tline = s->cline;
    s->tok = cursor;

#line 224 "scanner.c"
{
	YYCTYPE yych;
	unsigned int yyaccept;
	goto yy29;
	++YYCURSOR;
yy29:
	if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
	yych = *YYCURSOR;
	if(yych <= '/'){
		if(yych <= '"'){
			if(yych <= '\n'){
				if(yych <= '\b')	goto yy53;
				if(yych <= '\t')	goto yy47;
				goto yy49;
			} else {
				if(yych == ' ')	goto yy47;
				if(yych <= '!')	goto yy53;
				goto yy37;
			}
		} else {
			if(yych <= '*'){
				if(yych <= '&')	goto yy53;
				if(yych <= '\'')	goto yy39;
				if(yych <= ')')	goto yy43;
				goto yy35;
			} else {
				if(yych <= '+')	goto yy44;
				if(yych <= '-')	goto yy53;
				if(yych <= '.')	goto yy51;
				goto yy33;
			}
		}
	} else {
		if(yych <= '@'){
			if(yych <= '<'){
				if(yych == ';')	goto yy43;
				goto yy53;
			} else {
				if(yych <= '=')	goto yy43;
				if(yych == '?')	goto yy44;
				goto yy53;
			}
		} else {
			if(yych <= '`'){
				if(yych <= 'Z')	goto yy45;
				if(yych <= '[')	goto yy41;
				if(yych <= '\\')	goto yy43;
				goto yy53;
			} else {
				if(yych <= 'z')	goto yy45;
				if(yych <= '{')	goto yy31;
				if(yych <= '|')	goto yy43;
				goto yy53;
			}
		}
	}
yy31:	yyaccept = 0;
	yych = *(YYMARKER = ++YYCURSOR);
	if(yych <= '/')	goto yy32;
	if(yych <= '9')	goto yy84;
	goto yy32;
yy32:
#line 133 "scanner.re"
{ depth = 1;
				  goto code;
				}
#line 291 "scanner.c"
yy33:	yych = *++YYCURSOR;
	if(yych == '*')	goto yy82;
	goto yy34;
yy34:
#line 163 "scanner.re"
{ RETURN(*s->tok); }
#line 298 "scanner.c"
yy35:	yych = *++YYCURSOR;
	if(yych == '/')	goto yy80;
	goto yy36;
yy36:
#line 165 "scanner.re"
{ yylval.op = *s->tok;
				  RETURN(CLOSE); }
#line 306 "scanner.c"
yy37:	yyaccept = 1;
	yych = *(YYMARKER = ++YYCURSOR);
	if(yych != '\n')	goto yy76;
	goto yy38;
yy38:
#line 150 "scanner.re"
{ Scanner_fatal(s, "unterminated string constant (missing \")"); }
#line 314 "scanner.c"
yy39:	yyaccept = 2;
	yych = *(YYMARKER = ++YYCURSOR);
	if(yych != '\n')	goto yy71;
	goto yy40;
yy40:
#line 151 "scanner.re"
{ Scanner_fatal(s, "unterminated string constant (missing ')"); }
#line 322 "scanner.c"
yy41:	yyaccept = 3;
	yych = *(YYMARKER = ++YYCURSOR);
	if(yych == '\n')	goto yy42;
	if(yych == '^')	goto yy62;
	goto yy60;
yy42:
#line 161 "scanner.re"
{ Scanner_fatal(s, "unterminated range (missing ])"); }
#line 331 "scanner.c"
yy43:	yych = *++YYCURSOR;
	goto yy34;
yy44:	yych = *++YYCURSOR;
	goto yy36;
yy45:	yych = *++YYCURSOR;
	goto yy58;
yy46:
#line 180 "scanner.re"
{ SubStr substr;
				  s->cur = cursor;
				  substr = Scanner_token(s);
				  yylval.symbol = Symbol_find(&substr);
				  return ID; }
#line 345 "scanner.c"
yy47:	yych = *++YYCURSOR;
	goto yy56;
yy48:
#line 186 "scanner.re"
{ goto scan; }
#line 351 "scanner.c"
yy49:	yych = *++YYCURSOR;
	goto yy50;
yy50:
#line 188 "scanner.re"
{ if(cursor == s->eof) RETURN(0);
				  s->pos = cursor; s->cline++;
				  goto scan;
	    			}
#line 360 "scanner.c"
yy51:	yych = *++YYCURSOR;
	goto yy52;
yy52:
#line 193 "scanner.re"
{ s->cur = cursor;
				  yylval.regexp = mkDot();
				  return RANGE;
				}
#line 369 "scanner.c"
yy53:	yych = *++YYCURSOR;
	goto yy54;
yy54:
#line 198 "scanner.re"
{ fprintf(stderr, "unexpected character: '%c'\n", *s->tok);
				  goto scan;
				}
#line 377 "scanner.c"
yy55:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy56;
yy56:	if(yych == '\t')	goto yy55;
	if(yych == ' ')	goto yy55;
	goto yy48;
yy57:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy58;
yy58:	if(yych <= '@'){
		if(yych <= '/')	goto yy46;
		if(yych <= '9')	goto yy57;
		goto yy46;
	} else {
		if(yych <= 'Z')	goto yy57;
		if(yych <= '`')	goto yy46;
		if(yych <= 'z')	goto yy57;
		goto yy46;
	}
yy59:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy60;
yy60:	if(yych <= '['){
		if(yych != '\n')	goto yy59;
		goto yy61;
	} else {
		if(yych <= '\\')	goto yy64;
		if(yych <= ']')	goto yy65;
		goto yy59;
	}
yy61:	YYCURSOR = YYMARKER;
	switch(yyaccept){
	case 0:	goto yy32;
	case 1:	goto yy38;
	case 2:	goto yy40;
	case 3:	goto yy42;
	}
yy62:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy63;
yy63:	if(yych <= '['){
		if(yych == '\n')	goto yy61;
		goto yy62;
	} else {
		if(yych <= '\\')	goto yy67;
		if(yych <= ']')	goto yy68;
		goto yy62;
	}
yy64:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	if(yych == '\n')	goto yy61;
	goto yy59;
yy65:	yych = *++YYCURSOR;
	goto yy66;
yy66:
#line 157 "scanner.re"
{ s->cur = cursor;
				  yylval.regexp = ranToRE(Scanner_token(s));
				  return RANGE; }
#line 442 "scanner.c"
yy67:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	if(yych == '\n')	goto yy61;
	goto yy62;
yy68:	yych = *++YYCURSOR;
	goto yy69;
yy69:
#line 153 "scanner.re"
{ s->cur = cursor;
				  yylval.regexp = invToRE(Scanner_token(s));
				  return RANGE; }
#line 455 "scanner.c"
yy70:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy71;
yy71:	if(yych <= '&'){
		if(yych == '\n')	goto yy61;
		goto yy70;
	} else {
		if(yych <= '\'')	goto yy73;
		if(yych != '\\')	goto yy70;
		goto yy72;
	}
yy72:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	if(yych == '\n')	goto yy61;
	goto yy70;
yy73:	yych = *++YYCURSOR;
	goto yy74;
yy74:
#line 146 "scanner.re"
{ s->cur = cursor;
				  yylval.regexp = strToCaseInsensitiveRE(Scanner_token(s));
				  return STRING; }
#line 480 "scanner.c"
yy75:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy76;
yy76:	if(yych <= '!'){
		if(yych == '\n')	goto yy61;
		goto yy75;
	} else {
		if(yych <= '"')	goto yy78;
		if(yych != '\\')	goto yy75;
		goto yy77;
	}
yy77:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	if(yych == '\n')	goto yy61;
	goto yy75;
yy78:	yych = *++YYCURSOR;
	goto yy79;
yy79:
#line 142 "scanner.re"
{ s->cur = cursor;
				  yylval.regexp = strToRE(Scanner_token(s));
				  return STRING; }
#line 505 "scanner.c"
yy80:	yych = *++YYCURSOR;
	goto yy81;
yy81:
#line 139 "scanner.re"
{ s->tok = cursor;
				  RETURN(0); }
#line 512 "scanner.c"
yy82:	yych = *++YYCURSOR;
	goto yy83;
yy83:
#line 136 "scanner.re"
{ depth = 1;
				  goto comment; }
#line 519 "scanner.c"
yy84:	++YYCURSOR;
	if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
	yych = *YYCURSOR;
	goto yy85;
yy85:	if(yych <= '/'){
		if(yych == ',')	goto yy88;
		goto yy61;
	} else {
		if(yych <= '9')	goto yy84;
		if(yych != '}')	goto yy61;
		goto yy86;
	}
yy86:	yych = *++YYCURSOR;
	goto yy87;
yy87:
#line 168 "scanner.re"
{ yylval.extop.minsize = atoi((char *)s->tok+1);
				  yylval.extop.maxsize = atoi((char *)s->tok+1);
				  RETURN(CLOSESIZE); }
#line 539 "scanner.c"
yy88:	yych = *++YYCURSOR;
	if(yych != '}')	goto yy92;
	goto yy89;
yy89:	yych = *++YYCURSOR;
	goto yy90;
yy90:
#line 176 "scanner.re"
{ yylval.extop.minsize = atoi((char *)s->tok+1);
				  yylval.extop.maxsize = -1;
				  RETURN(CLOSESIZE); }
#line 550 "scanner.c"
yy91:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy92;
yy92:	if(yych <= '/')	goto yy61;
	if(yych <= '9')	goto yy91;
	if(yych != '}')	goto yy61;
	goto yy93;
yy93:	yych = *++YYCURSOR;
	goto yy94;
yy94:
#line 172 "scanner.re"
{ yylval.extop.minsize = atoi((char *)s->tok+1);
				  yylval.extop.maxsize = MAX(yylval.extop.minsize,atoi(strchr((char *)s->tok, ',')+1));
				  RETURN(CLOSESIZE); }
#line 566 "scanner.c"
}
#line 201 "scanner.re"


/* Inside a brace-delimited code block; `depth` counts the open braces. */
code:

#line 573 "scanner.c"
{
	YYCTYPE yych;
	unsigned int yyaccept;
	goto yy95;
	++YYCURSOR;
yy95:
	if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
	yych = *YYCURSOR;
	if(yych <= '&'){
		if(yych <= '\n'){
			if(yych <= '\t')	goto yy103;
			goto yy101;
		} else {
			if(yych == '"')	goto yy105;
			goto yy103;
		}
	} else {
		if(yych <= '{'){
			if(yych <= '\'')	goto yy106;
			if(yych <= 'z')	goto yy103;
			goto yy99;
		} else {
			if(yych != '}')	goto yy103;
			goto yy97;
		}
	}
yy97:	yych = *++YYCURSOR;
	goto yy98;
yy98:
#line 205 "scanner.re"
{ if(--depth == 0){
					s->cur = cursor;
					yylval.token = Token_new(Scanner_token(s), s->tline);
					return CODE;
				  }
				  goto code; }
#line 610 "scanner.c"
yy99:	yych = *++YYCURSOR;
	goto yy100;
yy100:
#line 211 "scanner.re"
{ ++depth;
				  goto code; }
#line 617 "scanner.c"
yy101:	yych = *++YYCURSOR;
	goto yy102;
yy102:
#line 213 "scanner.re"
{ if(cursor == s->eof) Scanner_fatal(s, "missing '}'");
				  s->pos = cursor; s->cline++;
				  goto code;
				}
#line 626 "scanner.c"
yy103:	yych = *++YYCURSOR;
	goto yy104;
yy104:
#line 217 "scanner.re"
{ goto code; }
#line 632 "scanner.c"
yy105:	yyaccept = 0;
	yych = *(YYMARKER = ++YYCURSOR);
	if(yych == '\n')	goto yy104;
	goto yy112;
yy106:	yyaccept = 0;
	yych = *(YYMARKER = ++YYCURSOR);
	if(yych == '\n')	goto yy104;
	goto yy108;
yy107:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy108;
yy108:	if(yych <= '&'){
		if(yych != '\n')	goto yy107;
		goto yy109;
	} else {
		if(yych <= '\'')	goto yy103;
		if(yych == '\\')	goto yy110;
		goto yy107;
	}
yy109:	YYCURSOR = YYMARKER;
	switch(yyaccept){
	case 0:	goto yy104;
	}
yy110:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	if(yych == '\n')	goto yy109;
	goto yy107;
yy111:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	goto yy112;
yy112:	if(yych <= '!'){
		if(yych == '\n')	goto yy109;
		goto yy111;
	} else {
		if(yych <= '"')	goto yy103;
		if(yych != '\\')	goto yy111;
		goto yy113;
	}
yy113:	++YYCURSOR;
	if(YYLIMIT == YYCURSOR) YYFILL(1);
	yych = *YYCURSOR;
	if(yych == '\n')	goto yy109;
	goto yy111;
}
#line 218 "scanner.re"


/* Inside a comment; `depth` counts nested comment openers. */
comment:

#line 685 "scanner.c"
{
	YYCTYPE yych;
	goto yy114;
	++YYCURSOR;
yy114:
	if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
	yych = *YYCURSOR;
	if(yych <= ')'){
		if(yych == '\n')	goto yy119;
		goto yy121;
	} else {
		if(yych <= '*')	goto yy116;
		if(yych == '/')	goto yy118;
		goto yy121;
	}
yy116:	yych = *++YYCURSOR;
	if(yych == '/')	goto yy124;
	goto yy117;
yy117:
#line 232 "scanner.re"
{ goto comment; }
#line 707 "scanner.c"
yy118:	yych = *++YYCURSOR;
	if(yych == '*')	goto yy122;
	goto yy117;
yy119:	yych = *++YYCURSOR;
	goto yy120;
yy120:
#line 228 "scanner.re"
{ if(cursor == s->eof) RETURN(0);
				  s->tok = s->pos = cursor; s->cline++;
				  goto comment;
				}
#line 719 "scanner.c"
yy121:	yych = *++YYCURSOR;
	goto yy117;
yy122:	yych = *++YYCURSOR;
	goto yy123;
yy123:
#line 226 "scanner.re"
{ ++depth;
				  goto comment; }
#line 728 "scanner.c"
yy124:	yych = *++YYCURSOR;
	goto yy125;
yy125:
#line 222 "scanner.re"
{ if(--depth == 0)
					goto scan;
				    else
					goto comment; }
#line 737 "scanner.c"
}
#line 233 "scanner.re"

}
Exemple #8
0
/*
 * Consume one word whose first character is ALetter, Numeric, Katakana or
 * ExtendNumLet and append it to `inversion` as a single Token.
 *
 * `state` is the word break property of the character `iter` points at on
 * entry.  The iterator is advanced as the word is scanned.  The return value
 * is the most recently determined word break property, or -1 when the text
 * ends right after the first character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    const lucy_StringIter word_start = *iter;
    lucy_StringIter       word_end;
    int                   prop     = -1;
    int                   at_break = 0;
    Token                *word;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    word_end = *iter;

    while (!at_break && iter->byte_pos < len) {
        prop = S_wb_lookup(text + iter->byte_pos);

        switch (prop) {
            case WB_ALetter:
            case WB_Hebrew_Letter:
            case WB_Numeric:
                // Rules WB5, WB8, WB9, WB10, and WB13b, except that
                // Katakana is never followed by a letter or digit.
                at_break = (state == WB_Katakana);
                break;
            case WB_Katakana:
                // Rules WB13 and WB13b.
                at_break = !(state == WB_Katakana
                             || state == WB_ExtendNumLet);
                break;
            case WB_ExtendNumLet:
                // Rule WB13a.
                break;
            case WB_Extend_Format:
                // Rule WB4: transparent character, the state carries over.
                prop = state;
                break;
            case WB_Single_Quote:
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                // Break unless one of the branches below finds that the
                // mid character bridges two compatible characters.
                at_break = 1;
                if (state == WB_ALetter) {
                    if (prop != WB_MidNum) {
                        prop = S_skip_extend_format(text, len, iter);
                        if (prop == WB_ALetter || prop == WB_Hebrew_Letter) {
                            // Rules WB6 and WB7.
                            at_break = 0;
                        }
                    }
                }
                else if (state == WB_Hebrew_Letter) {
                    if (prop != WB_MidNum) {
                        if (prop == WB_Single_Quote) {
                            // Rule WB7a: the quote stays attached to the
                            // word even if nothing joins after it.
                            ++word_end.byte_pos;
                            ++word_end.char_pos;
                        }
                        prop = S_skip_extend_format(text, len, iter);
                        if (prop == WB_ALetter || prop == WB_Hebrew_Letter) {
                            // Rules WB6 and WB7.
                            at_break = 0;
                        }
                    }
                }
                else if (state == WB_Numeric) {
                    if (prop != WB_MidLetter) {
                        prop = S_skip_extend_format(text, len, iter);
                        if (prop == state) {
                            // Rules WB11 and WB12.
                            at_break = 0;
                        }
                    }
                }
                break;
            case WB_Double_Quote:
                at_break = 1;
                if (state == WB_Hebrew_Letter) {
                    prop = S_skip_extend_format(text, len, iter);
                    if (prop == state) {
                        // Rules WB7b and WB7c.
                        at_break = 0;
                    }
                }
                break;
            default:
                at_break = 1;
                break;
        }

        if (!at_break) {
            state = prop;
            S_iter_advance(text, iter);
            word_end = *iter;
        }
    }

    word = Token_new(text + word_start.byte_pos,
                     word_end.byte_pos - word_start.byte_pos,
                     word_start.char_pos, word_end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, word);

    return prop;
}