static bool is_punct (unsigned int ch) { #if 0 return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P'); #else /* The traditional POSIX definition of punctuation is every graphic, non-alphanumeric character. */ return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch)); #endif }
//input: *c=='[' **pc==':' static u16 bracket_class(u8 *c,u8 **pc,u8 **sc,u8 not,u8 sc_folded) { u8 char_class[CHAR_CLASS_MAX+1];//don't forget the 0 terminating char u16 r=bracket_char_class_get(c,pc,not,sc_folded,&char_class[0]); if(r!=OK) return r; if((STREQ(char_class,"alnum")&&is_alnum(**sc)) ||(STREQ(char_class,"alpha")&&is_alpha(**sc)) ||(STREQ(char_class,"blank")&&is_blank(**sc)) ||(STREQ(char_class,"cntrl")&&is_cntrl(**sc)) ||(STREQ(char_class,"digit")&&is_digit(**sc)) ||(STREQ(char_class,"graph")&&is_graph(**sc)) ||(STREQ(char_class,"lower")&&is_lower(**sc)) ||(STREQ(char_class,"print")&&is_print(**sc)) ||(STREQ(char_class,"punct")&&is_punct(**sc)) ||(STREQ(char_class,"space")&&is_space(**sc)) ||(STREQ(char_class,"upper")&&is_upper(**sc)) ||(STREQ(char_class,"xdigit")&&is_xdigit(**sc))) return bracket_matched(c,pc,not); *c=*(*pc)++; return OK; }
int _xkbcommon_lex(YYSTYPE *yylval, struct scanner *s) { int tok; skip_more_whitespace_and_comments: /* Skip spaces. */ while (is_space(peek(s))) next(s); /* Skip comments. */ if (lit(s, "//") || chr(s, '#')) { while (!eof(s) && !eol(s)) next(s); goto skip_more_whitespace_and_comments; } /* See if we're done. */ if (eof(s)) return END_OF_FILE; /* New token. */ s->token_line = s->line; s->token_column = s->column; s->buf_pos = 0; /* String literal. */ if (chr(s, '\"')) { while (!eof(s) && !eol(s) && peek(s) != '\"') { if (chr(s, '\\')) { uint8_t o; if (chr(s, '\\')) buf_append(s, '\\'); else if (chr(s, 'n')) buf_append(s, '\n'); else if (chr(s, 't')) buf_append(s, '\t'); else if (chr(s, 'r')) buf_append(s, '\r'); else if (chr(s, 'b')) buf_append(s, '\b'); else if (chr(s, 'f')) buf_append(s, '\f'); else if (chr(s, 'v')) buf_append(s, '\v'); else if (chr(s, 'e')) buf_append(s, '\033'); else if (oct(s, &o)) buf_append(s, (char) o); else { scanner_warn(s, "unknown escape sequence in string literal"); /* Ignore. */ } } else { buf_append(s, next(s)); } } if (!buf_append(s, '\0') || !chr(s, '\"')) return scanner_error(s, "unterminated string literal"); yylval->str = strdup(s->buf); if (!yylval->str) return scanner_error(s, "scanner out of memory"); return STRING; } /* Key name literal. */ if (chr(s, '<')) { while (is_graph(peek(s)) && peek(s) != '>') buf_append(s, next(s)); if (!buf_append(s, '\0') || !chr(s, '>')) return scanner_error(s, "unterminated key name literal"); /* Empty key name literals are allowed. */ yylval->sval = xkb_atom_intern(s->ctx, s->buf, s->buf_pos - 1); return KEYNAME; } /* Operators and punctuation. */ if (chr(s, ';')) return SEMI; if (chr(s, '{')) return OBRACE; if (chr(s, '}')) return CBRACE; if (chr(s, '=')) return EQUALS; if (chr(s, '[')) return OBRACKET; if (chr(s, ']')) return CBRACKET; if (chr(s, '(')) return OPAREN; if (chr(s, ')')) return CPAREN; if (chr(s, '.')) return DOT; if (chr(s, ',')) return COMMA; if (chr(s, '+')) return PLUS; if (chr(s, '-')) return MINUS; if (chr(s, '*')) return TIMES; if (chr(s, '/')) return DIVIDE; if (chr(s, '!')) return EXCLAM; if (chr(s, '~')) return INVERT; /* Identifier. */ if (is_alpha(peek(s)) || peek(s) == '_') { s->buf_pos = 0; while (is_alnum(peek(s)) || peek(s) == '_') buf_append(s, next(s)); if (!buf_append(s, '\0')) return scanner_error(s, "identifier too long"); /* Keyword. */ tok = keyword_to_token(s->buf); if (tok != -1) return tok; yylval->str = strdup(s->buf); if (!yylval->str) return scanner_error(s, "scanner out of memory"); return IDENT; } /* Number literal (hexadecimal / decimal / float). */ if (number(s, &yylval->num, &tok)) { if (tok == ERROR_TOK) return scanner_error(s, "malformed number literal"); return tok; } return scanner_error(s, "unrecognized token"); }
static inline bool is_ident(char ch) { return is_graph(ch) && ch != '\\'; }
static void output_tables (const char *filename, const char *version) { FILE *stream; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "escape_char /\n"); fprintf (stream, "comment_char %%\n"); fprintf (stream, "\n"); fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n", version); fprintf (stream, "\n"); fprintf (stream, "LC_IDENTIFICATION\n"); fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version); fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n"); fprintf (stream, "address \"\"\n"); fprintf (stream, "contact \"\"\n"); fprintf (stream, "email \"[email protected]\"\n"); fprintf (stream, "tel \"\"\n"); fprintf (stream, "fax \"\"\n"); fprintf (stream, "language \"\"\n"); fprintf (stream, "territory \"Earth\"\n"); fprintf (stream, "revision \"%s\"\n", version); { time_t now; char date[11]; now = time (NULL); strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now)); fprintf (stream, "date \"%s\"\n", date); } fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n"); fprintf (stream, "END LC_IDENTIFICATION\n"); fprintf (stream, "\n"); /* Verifications. */ for (ch = 0; ch < 0x110000; ch++) { /* toupper restriction: "Only characters specified for the keywords lower and upper shall be specified. */ if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch))) fprintf (stderr, "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n", ucs_symbol (ch), ch, to_upper (ch)); /* tolower restriction: "Only characters specified for the keywords lower and upper shall be specified. */ if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch))) fprintf (stderr, "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n", ucs_symbol (ch), ch, to_lower (ch)); /* alpha restriction: "Characters classified as either upper or lower shall automatically belong to this class. */ if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch)) fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch)); /* alpha restriction: "No character specified for the keywords cntrl, digit, punct or space shall be specified." */ if (is_alpha (ch) && is_cntrl (ch)) fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch)); if (is_alpha (ch) && is_digit (ch)) fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch)); if (is_alpha (ch) && is_punct (ch)) fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch)); if (is_alpha (ch) && is_space (ch)) fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch)); /* space restriction: "No character specified for the keywords upper, lower, alpha, digit, graph or xdigit shall be specified." upper, lower, alpha already checked above. */ if (is_space (ch) && is_digit (ch)) fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch)); if (is_space (ch) && is_graph (ch)) fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch)); if (is_space (ch) && is_xdigit (ch)) fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch)); /* cntrl restriction: "No character specified for the keywords upper, lower, alpha, digit, punct, graph, print or xdigit shall be specified." upper, lower, alpha already checked above. */ if (is_cntrl (ch) && is_digit (ch)) fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_punct (ch)) fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_graph (ch)) fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_print (ch)) fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_xdigit (ch)) fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch)); /* punct restriction: "No character specified for the keywords upper, lower, alpha, digit, cntrl, xdigit or as the <space> character shall be specified." upper, lower, alpha, cntrl already checked above. */ if (is_punct (ch) && is_digit (ch)) fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch)); if (is_punct (ch) && is_xdigit (ch)) fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch)); if (is_punct (ch) && (ch == 0x0020)) fprintf (stderr, "%s is punct\n", ucs_symbol (ch)); /* graph restriction: "No character specified for the keyword cntrl shall be specified." Already checked above. */ /* print restriction: "No character specified for the keyword cntrl shall be specified." Already checked above. */ /* graph - print relation: differ only in the <space> character. How is this possible if there are more than one space character?! I think susv2/xbd/locale.html should speak of "space characters", not "space character". */ if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch))) fprintf (stderr, "%s is print but not graph|<space>\n", ucs_symbol (ch)); if (!is_print (ch) && (is_graph (ch) || ch == 0x0020)) fprintf (stderr, "%s is graph|<space> but not print\n", ucs_symbol (ch)); } fprintf (stream, "LC_CTYPE\n"); output_charclass (stream, "upper", is_upper); output_charclass (stream, "lower", is_lower); output_charclass (stream, "alpha", is_alpha); output_charclass (stream, "digit", is_digit); output_charclass (stream, "outdigit", is_outdigit); output_charclass (stream, "blank", is_blank); output_charclass (stream, "space", is_space); output_charclass (stream, "cntrl", is_cntrl); output_charclass (stream, "punct", is_punct); output_charclass (stream, "xdigit", is_xdigit); output_charclass (stream, "graph", is_graph); output_charclass (stream, "print", is_print); output_charclass (stream, "class \"combining\";", is_combining); output_charclass (stream, "class \"combining_level3\";", is_combining_level3); output_charmap (stream, "toupper", to_upper); output_charmap (stream, "tolower", to_lower); output_charmap (stream, "map \"totitle\";", to_title); output_widthmap (stream); fprintf (stream, "END LC_CTYPE\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } }
inline bool is_graph<unicode::char_t> (unicode::char_t c) { return c.value <= 127 && is_graph(static_cast<char>(c.value)); }