/*
 * Reads the file name of an #include directive and stores it in *name.
 * If the file name is quoted with <>, *std is set to true. If it is quoted
 * with double quotes, *std is set to false. We use expand_one() rather than
 * read_cpp_token(), because macros are allowed to be used in #include.
 * (C99 6.10.2 Source file inclusion)
 */
static void read_cpp_header_name(CppContext *ctx, String **name, bool *std) {
    /*
     * With no pushed-back tokens, let read_header_name() try first.
     * The result (*name) must be tested, not the out-parameter itself:
     * "name" is never NULL, so testing it would always return here and the
     * macro-expanding path below could never run.
     * NOTE(review): presumes read_header_name() yields NULL when it finds
     * no header name -- confirm against its definition.
     */
    if (LIST_IS_EMPTY(ctx->ungotten)) {
        *name = read_header_name(ctx, std);
        if (*name)
            return;
    }

    Token *tok = expand_one(ctx);
    if (!tok || tok->toktype == TOKTYPE_NEWLINE)
        error_token(tok, "expected file name, but got '%s'", token_to_string(tok));

    /* A string token is used as the file name directly: "file" form. */
    if (tok->toktype == TOKTYPE_STRING) {
        *name = tok->val.str;
        *std = false;
        return;
    }

    /* <file> form: collect every expanded token up to the closing '>'. */
    List *tokens = make_list();
    if (is_punct(tok, '<')) {
        for (;;) {
            Token *part = expand_one(ctx);
            if (!part || part->toktype == TOKTYPE_NEWLINE)
                error_token(part, "premature end of header name");
            if (is_punct(part, '>'))
                break;
            list_push(tokens, part);
        }
        *name = join_tokens(tokens, false);
        *std = true;
        return;
    }
    error_token(tok, "'<' expected, but got '%s'", token_to_string(tok));
}
bool str_is_slot_name(const char* s, fint len) { assert(len >= 0, "shouldn't be negative length"); if (len == 0) { return false; } char c = *s; if (!is_lower(c)) { if (!is_punct(c)) return false; switch (c) { case '^': case '|': case '\\': case '.': if (len == 1) return false; } for (int i = 0; i < len; ) { c = s[i++]; if (! is_punct(c)) return false; switch (c) { case '(': case ')': case '\'': case '\"': case ':': case '[': case ']': return false; } } return true; } for (int i = 1; i < len; ) { c = s[i++]; if (is_id_char(c)) continue; if (c != ':') return false; if (i == len) return true; // this was final ":" if (!is_upper(s[i])) return false; // after ":" must be uppercase if (s[len-1] != ':') return false; // one ":" -> last is ":" } return true; }
/*
 * Reads function-like macro parameters up to the closing ')' and records
 * each one in "param", mapped to a TOKTYPE_MACRO_PARAM token that carries
 * the parameter's position. Returns true if the parameter list ends with
 * "...". Otherwise false.
 *
 * Every token is checked for NULL before it is handed to is_punct(), in the
 * same way the other callers in this file guard their tokens.
 */
static bool read_funclike_define_args(CppContext *ctx, Dict *param) {
    for (;;) {
        Token *tok = read_cpp_token(ctx);
        if (!tok)
            error_token(tok, "missing ')' in macro parameter list");
        if (is_punct(tok, ')'))
            return false;

        /* Every parameter after the first must be preceded by a comma. */
        if (dict_size(param)) {
            if (!is_punct(tok, ','))
                error_token(tok, "',' expected, but got '%s'", token_to_string(tok));
            tok = read_cpp_token(ctx);
        }
        if (!tok || tok->toktype == TOKTYPE_NEWLINE)
            error_token(tok, "missing ')' in macro parameter list");

        /* "..." is registered as __VA_ARGS__ and must be the last parameter. */
        if (is_punct(tok, KEYWORD_THREEDOTS)) {
            Token *subst = make_token(ctx, TOKTYPE_MACRO_PARAM, (TokenValue)dict_size(param));
            dict_put(param, to_string("__VA_ARGS__"), subst);
            Token *tok1 = read_cpp_token(ctx);
            if (!tok1 || !is_punct(tok1, ')'))
                error_token(tok1, "')' expected, but got '%s'", token_to_string(tok1));
            return true;
        }

        if (tok->toktype != TOKTYPE_IDENT)
            error_token(tok, "identifier expected, but got '%s'", token_to_string(tok));
        Token *subst = make_token(ctx, TOKTYPE_MACRO_PARAM, (TokenValue)dict_size(param));
        dict_put(param, tok->val.str, subst);
    }
}
int main() { string line; string processed = ""; bool in_paren {false}; int parens = 0; char ch; while (cin.get(ch)) { if (ch != '\n') { if (ch == '\"') { if (parens % 2 == 0) in_paren = true; else in_paren = false; ++parens; processed += ch; } else if (is_punct(ch, in_paren) || isspace(ch)) { processed += ' '; } else { string s; s += ch; while (cin.get(ch) && !(ch == '\"' || isspace(ch) || is_punct(ch, in_paren))) s += ch; cin.putback(ch); processed += transform(s); } } else break; } cout << processed; }
/*
 * Skips the run of punctuation characters starting at src and returns a
 * pointer to the first character that is_punct() rejects. The caller must
 * pass a pointer to a punctuation character (checked by assert).
 */
static const char *scan_punct(const char *src)
{
    const char *end = src;

    assert(is_punct(*src));
    for (; is_punct(*end); ++end)
        ;
    return end;
}
/*
 * Handles the "defined" unary operator, written either as
 * "defined <identifier>" or as "defined(<identifier>)". The "defined" token
 * itself has already been consumed by the caller.
 *
 * Returns a copy of the operand token turned into a TOKTYPE_CPPNUM whose
 * value is the result of is_defined() for that operand.
 *
 * (C99 6.10.1 Conditional inclusion, paragraph 1)
 */
static Token *read_defined(CppContext *ctx) {
    Token *operand = read_cpp_token(ctx);
    bool parenthesized = is_punct(operand, '(');

    if (parenthesized) {
        /* The real operand follows the '(' and must be closed by ')'. */
        operand = read_cpp_token(ctx);
        Token *closer = read_cpp_token(ctx);
        if (!closer || !is_punct(closer, ')'))
            error_token(closer, "')' expected, but got '%s'", token_to_string(closer));
    }

    Token *result = copy_token(operand);
    result->toktype = TOKTYPE_CPPNUM;
    result->val.i = is_defined(ctx, operand);
    return result;
}
/*
 * Substitutes parameters in macro definition body with actual arguments.
 *
 * Walks macro->body one token (t0) at a time with one token of lookahead
 * (t1) and builds the replacement list "r":
 *   - '#' followed by a parameter pushes the stringize() result of the
 *     corresponding argument;
 *   - '##' followed by a parameter glues the first token of the argument
 *     onto "r" with glue_push() and appends the expand_all() result of the
 *     remaining argument tokens;
 *   - '##' followed by any other token glues that token onto "r";
 *   - a parameter followed by '##' appends the argument without expanding
 *     it;
 *   - any other parameter appends the expand_all() result of its argument;
 *   - every other token is copied as is.
 * The finished list is passed through add_hide_set() together with
 * "hideset" and returned.
 */
static List *subst(CppContext *ctx, Macro *macro, List *args, List *hideset) {
    List *r = make_list();
    for (int i = 0; i < LIST_LEN(macro->body); i++) {
        bool islast = (i == LIST_LEN(macro->body) - 1);
        Token *t0 = LIST_REF(macro->body, i);
        /* t1 is the lookahead token; NULL when t0 is the last body token. */
        Token *t1 = islast ? NULL : LIST_REF(macro->body, i + 1);
        bool t0_param = t0->toktype == TOKTYPE_MACRO_PARAM;
        bool t1_param = !islast && t1->toktype == TOKTYPE_MACRO_PARAM;

        /* "# param": stringize the argument and skip the parameter token. */
        if (is_punct(t0, '#') && t1_param) {
            list_push(r, stringize(t0, LIST_REF(args, t1->val.i)));
            i++;
            continue;
        }
        /*
         * "## param": only the first argument token is glued to what is
         * already in r; the rest of the argument is expanded and appended.
         * An empty argument contributes nothing. Either way the parameter
         * token is skipped.
         */
        if (is_punct(t0, KEYWORD_TWOSHARPS) && t1_param) {
            List *arg = LIST_REF(args, t1->val.i);
            if (!LIST_IS_EMPTY(arg)) {
                glue_push(r, (Token *)LIST_REF(arg, 0));
                List *tmp = make_list();
                /* This inner "i" shadows the body index of the outer loop. */
                for (int i = 1; i < LIST_LEN(arg); i++)
                    list_push(tmp, LIST_REF(arg, i));
                list_append(r, expand_all(ctx, tmp));
            }
            i++;
            continue;
        }
        /* "## token": glue the following token and skip it. */
        if (is_punct(t0, KEYWORD_TWOSHARPS) && !islast) {
            hideset = t1->hideset; // wrong? (original author's doubt; replaces the caller's hideset)
            glue_push(r, t1);
            i++;
            continue;
        }
        /*
         * "param ##": the argument is appended unexpanded. When the
         * argument is empty, the extra i++ also skips the "##" token.
         */
        if (t0_param && !islast && is_punct(t1, KEYWORD_TWOSHARPS)) {
            hideset = t1->hideset; // wrong? (original author's doubt; replaces the caller's hideset)
            List *arg = LIST_REF(args, t0->val.i);
            if (LIST_IS_EMPTY(arg))
                i++;
            else
                list_append(r, arg);
            continue;
        }
        /* Plain parameter: append the fully expanded argument. */
        if (t0_param) {
            List *arg = LIST_REF(args, t0->val.i);
            list_append(r, expand_all(ctx, arg));
            continue;
        }
        /* Anything else is copied to the output unchanged. */
        list_push(r, t0);
    }
    return add_hide_set(r, hideset);
}
/*
 * C99 6.10.9 Pragma operator.
 *
 * _Pragma("tokens ...") is equivalent to #pragma tokens ....
 *
 * Expects the token sequence '(' <string> ')'. The string contents are
 * wrapped in a string-backed file, pushed with do_include(), and then
 * handed to handle_pragma(). The "ignore" parameter is not used.
 */
static void handle_pragma_macro(CppContext *ctx, Token *ignore) {
    Token *lparen = read_cpp_token(ctx);
    if (!is_punct(lparen, '('))
        error_token(lparen, "'(' expected, but got '%s'", token_to_string(lparen));

    Token *body = read_cpp_token(ctx);
    if (body->toktype != TOKTYPE_STRING)
        error_token(body, "string expected, but got '%s'", token_to_string(body));

    Token *rparen = read_cpp_token(ctx);
    if (!is_punct(rparen, ')'))
        error_token(rparen, "')' expected, but got '%s'", token_to_string(rparen));

    /* Feed the string contents back in as if they were an included file. */
    do_include(ctx, make_string_file(body->val.str));
    handle_pragma(ctx);
}
static void test_case(const char *data) { AoS_Copy *tokens = aosc_create(20); printf("Test: [%s]\n", data); const char *word = data; while (*word != '\0') { const char *eow; eow = word = skip_space(word); if (is_alnum(*word) || (word[0] == '\'' && is_alnum(word[1]))) eow = scan_alnum(word); else if (is_punct(*word)) eow = scan_punct(word); if (eow == word) break; aosc_addbytes(tokens, word, eow); word = eow; } int num = 0; aosc_apply_ctxt(tokens, 0, aosc_length(tokens), printer, &num); putchar('\n'); aosc_destroy(tokens); }
/*
 * Tries to match the literal "word" at the current parse position.
 *
 * The match succeeds only when the whole word is present and is followed by
 * whitespace, punctuation or the end of the input. On success the parse
 * position is advanced past the word and 1 is returned; otherwise the result
 * of push_parse_error() with err_msg is returned and the position is left
 * untouched.
 */
static int parse_word(lua_State *L, parse_state *state, const char *word, const char *err_msg) {
    const char *cur = state->ptr;
    const char *want = word;

    /* Advance over the common prefix of the input and the expected word. */
    for (; *cur == *want && *cur != 0 && *want != 0; cur++, want++)
        ;

    /* The expected word was not fully consumed. */
    if (*want != 0)
        return push_parse_error(L, state, err_msg);

    /* The word must end here, not continue into a longer identifier. */
    if (!(is_whitespace(*cur) || is_punct(*cur) || *cur == 0))
        return push_parse_error(L, state, err_msg);

    state->ptr = cur;
    return 1;
}
/*
 * Reads comma-separated arguments of function-like macro invocation. Comma
 * characters in matching parentheses are not considered as separator.
 *
 * Returns NULL if the next token is not '(' (it is only peeked in that
 * case). Otherwise returns a list with one token list per argument; the
 * closing ')' is pushed back with unget_cpp_token() for the caller.
 * For a variadic macro, commas are kept inside the argument once the last
 * declared parameter has been reached.
 *
 * (C99 6.10.3 Macro replacement, sentence 10)
 */
static List *read_args_int(CppContext *ctx, Macro *macro) {
    List *r = make_list();
    List *arg = make_list();
    int depth = 0;  /* nesting level of parentheses inside the current argument */

    Token *tok = peek_cpp_token(ctx);
    if (!tok || !is_punct(tok, '('))
        return NULL;
    read_cpp_token(ctx);

    for (Token *tok1 = read_cpp_token(ctx); ; tok1 = read_cpp_token(ctx)) {
        if (!tok1)
            error_token(tok, "unterminated macro argument list");
        /* Newlines inside an argument list are ignored. */
        if (tok1->toktype == TOKTYPE_NEWLINE)
            continue;
        if (depth) {
            /*
             * Inside nested parentheses every token belongs to the current
             * argument. Both '(' and ')' must be tracked here; counting only
             * ')' would end the nesting too early for input such as
             * F((a,(b),c)) and wrongly split the argument at a later comma.
             */
            if (is_punct(tok1, '('))
                depth++;
            else if (is_punct(tok1, ')'))
                depth--;
            list_push(arg, tok1);
            continue;
        }
        if (is_punct(tok1, '('))
            depth++;
        if (is_punct(tok1, ')')) {
            /* End of the argument list: leave ')' for the caller. */
            unget_cpp_token(ctx, tok1);
            list_push(r, arg);
            return r;
        }
        bool in_threedots = macro->is_varg && LIST_LEN(r) + 1 == macro->nargs;
        if (is_punct(tok1, ',') && !in_threedots) {
            list_push(r, arg);
            arg = make_list();
            continue;
        }
        list_push(arg, tok1);
    }
}
/*
 * Parses a number at the current parse position with strtod().
 *
 * The number must be followed by whitespace, punctuation or the end of the
 * input; otherwise an "invalid number" parse error is pushed. On success the
 * parse position is advanced, the value is pushed on the Lua stack (as an
 * integer when it is integral and fits in an int, as a number otherwise)
 * and 1 is returned.
 */
static int parse_number(lua_State *L, parse_state *state) {
    char *end;
    double n = strtod(state->ptr, &end);
    if (state->ptr == end || (*end != 0 && !is_whitespace(*end) && !is_punct(*end))) {
        return push_parse_error(L, state, "invalid number");
    } else {
        /*
         * Converting a double to int is undefined when the value is NaN,
         * infinite or outside the range of int, so the range is checked
         * before the cast. The bounds are derived from unsigned int so that
         * no additional header is needed (assumes int has one value bit
         * fewer than unsigned int, which holds for two's complement).
         */
        const double int_max = (double)(int)(~0u >> 1);
        const double int_min = -int_max - 1.0;

        state->ptr = end;
        if (n >= int_min && n <= int_max && (double)((int)n) == n) {
            // in case we're using different representation for
            // integers in the lua vm
            lua_pushinteger(L, (int)n);
        } else {
            lua_pushnumber(L, n);
        }
        return 1;
    }
}
/*
 * Reads the next token for the caller, handling directives on the way.
 *
 * A '#' seen while ctx->at_bol is set starts a directive, which is consumed
 * by read_directive(). Newline tokens are returned as is and set at_bol
 * again. Any other token is pushed back and re-read through expand_one().
 * Returns NULL once read_cpp_token() returns NULL.
 */
static Token *read_token_int(CppContext *ctx) {
    Token *tok;

    while ((tok = read_cpp_token(ctx)) != NULL) {
        if (tok->toktype == TOKTYPE_NEWLINE) {
            ctx->at_bol = true;
            return tok;
        }
        if (!ctx->at_bol || !is_punct(tok, '#')) {
            /* Ordinary token: hand it back and let macro expansion run. */
            ctx->at_bol = false;
            unget_cpp_token(ctx, tok);
            return expand_one(ctx);
        }
        /* '#' at the beginning of a line introduces a directive. */
        read_directive(ctx);
        ctx->at_bol = true;
    }
    return NULL;
}
/*
 * #define
 * (C99 6.10.3 Macro replacement)
 *
 * Reads the macro name and then either delegates to read_funclike_define()
 * (when '(' follows the name with no space in between) or collects the
 * tokens up to the end of the line as the body of an object-like macro.
 */
static void read_define(CppContext *ctx) {
    Token *name = read_cpp_token(ctx);
    if (name->toktype != TOKTYPE_IDENT)
        error_cpp_ctx(ctx, "macro name must be an identifier, but got '%s'", token_to_string(name));

    /* The space check has to happen before the next token is consumed. */
    bool space_follows = is_next_space(ctx);
    Token *tok = read_cpp_token(ctx);

    bool funclike = !space_follows && tok && is_punct(tok, '(');
    if (funclike) {
        read_funclike_define(ctx, name->val.str);
        return;
    }

    /* Object-like macro: everything up to the newline is the body. */
    List *body = make_list();
    for (; tok && tok->toktype != TOKTYPE_NEWLINE; tok = read_cpp_token(ctx))
        list_push(body, tok);
    store_macro(ctx, name->val.str, make_obj_macro(body));
}
//input: *c=='[' **pc==':' static u16 bracket_class(u8 *c,u8 **pc,u8 **sc,u8 not,u8 sc_folded) { u8 char_class[CHAR_CLASS_MAX+1];//don't forget the 0 terminating char u16 r=bracket_char_class_get(c,pc,not,sc_folded,&char_class[0]); if(r!=OK) return r; if((STREQ(char_class,"alnum")&&is_alnum(**sc)) ||(STREQ(char_class,"alpha")&&is_alpha(**sc)) ||(STREQ(char_class,"blank")&&is_blank(**sc)) ||(STREQ(char_class,"cntrl")&&is_cntrl(**sc)) ||(STREQ(char_class,"digit")&&is_digit(**sc)) ||(STREQ(char_class,"graph")&&is_graph(**sc)) ||(STREQ(char_class,"lower")&&is_lower(**sc)) ||(STREQ(char_class,"print")&&is_print(**sc)) ||(STREQ(char_class,"punct")&&is_punct(**sc)) ||(STREQ(char_class,"space")&&is_space(**sc)) ||(STREQ(char_class,"upper")&&is_upper(**sc)) ||(STREQ(char_class,"xdigit")&&is_xdigit(**sc))) return bracket_matched(c,pc,not); *c=*(*pc)++; return OK; }
// Scans an operator whose first character c has already been consumed.
// All following punctuation characters are collected into buffer; the first
// non-punctuation character is pushed back. "<-", "=", "|" and "^" get their
// own token types, a lone backslash directly before a line end is treated
// as a line continuation, and anything else becomes an OPERATOR token that
// carries the operator text.
Token* Scanner::read_op(fint c) {
  // Position of the first operator character (already consumed).
  fint start_line = line;
  fint start_col = column - 1;
  const char* start_addr = sourceAddr() - 1;

  fint len = 0;
  buffer[len++] = char(c);
  while (is_punct(c = get_char())) {
    buffer[len++] = char(c);
  }
  push_char(c);
  buffer[len] = '\0';

  Token::TokenType kind;
  String* text = NULL;

  if (strcmp(buffer, "<-") == 0) {
    kind = Token::ARROW;
  } else if (strcmp(buffer, "=") == 0) {
    kind = as_TokenType('=');
  } else if (strcmp(buffer, "|") == 0) {
    kind = as_TokenType('|');
  } else if (strcmp(buffer, "^") == 0) {
    kind = as_TokenType('^');
  } else if (strcmp(buffer, "\\") == 0 && (c == '\n' || c == '\r')) {
    // Backslash at the end of a line: swallow the pushed-back line end
    // and continue with the next token.
    get_char();
    return get_token();
  } else {
    kind = Token::OPERATOR;
    text = new String(copy_string(buffer));
  }

  return new Token(kind, text, start_line, start_col, start_addr);
}
// Returns the next token of the input.
//
// If the "tokens" list is non-empty, its current entry is returned and the
// list is advanced to "prev". Otherwise characters are read with get_char()
// until one of them produces a token. Tokens built directly in this function
// record the position of the character just consumed (column - 1 and
// sourceAddr() - 1), except for EOF. When PrintTokens is set, the token is
// printed before it is returned.
Token* Scanner::get_token() {
  Token* t;
  if (tokens) {
    // Serve a previously stored token first.
    t = tokens->token;
    tokens = tokens->prev;
  } else {
    t = NULL;
    // Whitespace, ignored line ends and escaped newlines leave t == NULL,
    // so the loop simply continues with the next character.
    while (t == NULL) {
      fint c = get_char();
      switch (c) {
       case EOF:
        depth = 0;
        t = new Token(Token::ACCEPT, line, column, sourceAddr());
        break;
       case '\n':
       case '\r':
        // A line end produces ACCEPT only outside of any open bracket
        // (depth <= 0) and only when this is not a string scanner.
        if (depth <= 0 && !is_string_scanner()) {
          t = new Token(Token::ACCEPT, line, column - 1, sourceAddr() - 1);
          depth = 0;
        }
        break;
       case ' ':
       case '\t':
       case '\v':
       case '\b':
       case '\f':
        // Skipped without producing a token.
        break;
       case '"':
        // A double quote is handed to skip_comment().
        t = skip_comment();
        break;
       case '(':
        // "depth" counts currently open '(' and '['.
        depth ++;
        t = new Token(as_TokenType('('), line, column - 1, sourceAddr() - 1);
        break;
       case ')':
        depth --;
        t = new Token(as_TokenType(')'), line, column - 1, sourceAddr() - 1);
        break;
       case '[':
        depth ++;
        t = new Token(as_TokenType('['), line, column - 1, sourceAddr() - 1);
        break;
       case ']':
        depth --;
        t = new Token(as_TokenType(']'), line, column - 1, sourceAddr() - 1);
        break;
       case '.':
        t = read_dot();
        break;
       case '\'':
        t = read_string();
        break;
       case '\\':
        c = get_char();
        if (c == '\n' || c == '\r') {
          // an escaped newline; ignore
        } else {
          // Not a line continuation: push the lookahead back and scan the
          // backslash as the start of an operator.
          push_char(c);
          c = '\\';
          t = read_op(c);
        }
        break;
       case '{':
        t = new Token(Token::ANNOTATION_START, line, column - 1, sourceAddr() - 1);
        break;
       case '}':
        t = new Token(Token::ANNOTATION_END, line, column - 1, sourceAddr() - 1);
        break;
       default:
        // Dispatch on the character class; '-' is sent to read_number()
        // and ':' to read_name().
        if (is_digit(c) || c == '-') t = read_number(c);
        else if (is_id_alpha(c) || c == ':') t = read_name(c);
        else if (is_punct(c)) t = read_op(c);
        else t = TokenizingError("unknown character in input");
      }
    }
  }
  if (t && PrintTokens) t->print();
  return t;
}
// Scans a name-like token whose first character c has already been consumed.
//
// Depending on the text, the result is one of:
//   ARG          ":name"
//   ':'          a lone colon
//   NAME         "name"           PRIMNAME     "_name"
//   KEYWORD      "name:"          PRIMKEYWORD  "_name:"
//   CAPKEYWORD   "Name:"
//   DELEGATE     a name directly followed by '.' and then an identifier
//                start or punctuation character
//   SELF_TOKEN   the plain name "self"
//   RESEND_TOKEN "resend" used as a delegate
// Misuse of "self" or "resend" yields a TokenizingError.
Token* Scanner::read_name(fint c) {
  Token::TokenType t;
  // Position of the first character (already consumed).
  fint l = line;
  fint col = column - 1;
  const char* ss = sourceAddr() - 1;
  fint len;

  if (c == ':') {
    // The leading ':' is not stored in the buffer.
    t = Token::ARG;
    len = 0;
  } else {
    t = c == '_' ? Token::PRIMNAME : Token::NAME;
    len = 1;
    buffer[0] = char(c);
  }

  while (c = get_char(), is_id_char(c)) {
    buffer[len++] = char(c);
  }

  if (c == ':' && (t == Token::NAME || t == Token::PRIMNAME)) {
    // A trailing ':' turns the name into a keyword and is part of its text.
    buffer[len++] = char(c);
    if (is_upper((fint)*buffer)) {
      t = Token::CAPKEYWORD;
    } else {
      // c is ':' at this point, so the primitive case has to be taken from
      // the kind determined above (PRIMNAME means the name began with '_').
      t = t == Token::PRIMNAME ? Token::PRIMKEYWORD : Token::KEYWORD;
    }
  } else {
    push_char(c);
  }
  buffer[len] = '\0';

  if (t == Token::ARG && len == 0) {
    t = as_TokenType(':');
  } else if (t == Token::NAME || t == Token::PRIMNAME) {
    // Look for "name.": a delegate needs an identifier start or a
    // punctuation character right after the dot.
    c = get_char();
    if (c == '.') {
      c = get_char();
      push_char(c);
      if (is_id_alpha(c) || is_punct(c)) {
        t = Token::DELEGATE;
      } else {
        // Not a delegate: give the dot back as well.
        push_char('.');
      }
    } else {
      push_char(c);
    }
  }

  if (strcmp(buffer, "self") == 0) {
    if (t == Token::NAME) {
      t = Token::SELF_TOKEN;
    } else {
      return TokenizingError(
               "using \"self\" as a parent name for a directed resend");
    }
  } else if (strcmp(buffer, "resend") == 0) {
    if (t == Token::DELEGATE) {
      t = Token::RESEND_TOKEN;
    } else {
      return TokenizingError("not using \"resend\" in a resend");
    }
  }

  // Only these token kinds carry their text.
  String* s;
  if (t == Token::NAME || t == Token::PRIMNAME || t == Token::ARG ||
      t == Token::DELEGATE || t == Token::KEYWORD ||
      t == Token::PRIMKEYWORD || t == Token::CAPKEYWORD) {
    s = new String(copy_string(buffer));
  } else {
    s = NULL;
  }
  return new Token(t, s, l, col, ss);
}
// Punctuation test for unicode::char_t: values above 127 are never
// punctuation, everything else is decided by the char overload.
inline bool is_punct<unicode::char_t> (unicode::char_t c)
{
  if (c.value > 127)
    return false;
  return is_punct(static_cast<char>(c.value));
}
/* Writes a locale definition for the given Unicode version to 'filename'.

   The file consists of an LC_IDENTIFICATION section (including the current
   date in UTC) followed by an LC_CTYPE section with the character classes,
   case mappings and width map. Before LC_CTYPE is written, every code point
   from 0 to 0x10FFFF is checked against the consistency rules quoted in the
   comments below; violations are only reported on stderr and do not stop
   the output. The program exits with status 1 if the file cannot be opened
   or written.  */
static void
output_tables (const char *filename, const char *version)
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* File header. "%%" prints a single '%', the comment character.  */
  fprintf (stream, "escape_char /\n");
  fprintf (stream, "comment_char %%\n");
  fprintf (stream, "\n");
  fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n", version);
  fprintf (stream, "\n");

  fprintf (stream, "LC_IDENTIFICATION\n");
  fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
  fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
  fprintf (stream, "address \"\"\n");
  fprintf (stream, "contact \"\"\n");
  /* NOTE(review): the email value below looks like a placeholder left by
     address obfuscation -- confirm against the upstream source.  */
  fprintf (stream, "email \"[email protected]\"\n");
  fprintf (stream, "tel \"\"\n");
  fprintf (stream, "fax \"\"\n");
  fprintf (stream, "language \"\"\n");
  fprintf (stream, "territory \"Earth\"\n");
  fprintf (stream, "revision \"%s\"\n", version);
  {
    /* "YYYY-MM-DD" needs 10 characters plus the terminating NUL.  */
    time_t now;
    char date[11];
    now = time (NULL);
    strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
    fprintf (stream, "date \"%s\"\n", date);
  }
  fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
  fprintf (stream, "END LC_IDENTIFICATION\n");
  fprintf (stream, "\n");

  /* Verifications. */
  for (ch = 0; ch < 0x110000; ch++)
    {
      /* toupper restriction: "Only characters specified for the keywords
         lower and upper shall be specified."  */
      if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
        fprintf (stderr, "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n", ucs_symbol (ch), ch, to_upper (ch));

      /* tolower restriction: "Only characters specified for the keywords
         lower and upper shall be specified."  */
      if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
        fprintf (stderr, "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n", ucs_symbol (ch), ch, to_lower (ch));

      /* alpha restriction: "Characters classified as either upper or lower
         shall automatically belong to this class."  */
      if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
        fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));

      /* alpha restriction: "No character specified for the keywords cntrl,
         digit, punct or space shall be specified."  */
      if (is_alpha (ch) && is_cntrl (ch))
        fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_digit (ch))
        fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_punct (ch))
        fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_space (ch))
        fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));

      /* space restriction: "No character specified for the keywords upper,
         lower, alpha, digit, graph or xdigit shall be specified."
         upper, lower, alpha already checked above.  */
      if (is_space (ch) && is_digit (ch))
        fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
      if (is_space (ch) && is_graph (ch))
        fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
      if (is_space (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));

      /* cntrl restriction: "No character specified for the keywords upper,
         lower, alpha, digit, punct, graph, print or xdigit shall be
         specified."  upper, lower, alpha already checked above.  */
      if (is_cntrl (ch) && is_digit (ch))
        fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_punct (ch))
        fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_graph (ch))
        fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_print (ch))
        fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));

      /* punct restriction: "No character specified for the keywords upper,
         lower, alpha, digit, cntrl, xdigit or as the <space> character shall
         be specified."  upper, lower, alpha, cntrl already checked above.  */
      if (is_punct (ch) && is_digit (ch))
        fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
      if (is_punct (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
      if (is_punct (ch) && (ch == 0x0020))
        fprintf (stderr, "%s is punct\n", ucs_symbol (ch));

      /* graph restriction: "No character specified for the keyword cntrl
         shall be specified."  Already checked above.  */

      /* print restriction: "No character specified for the keyword cntrl
         shall be specified."  Already checked above.  */

      /* graph - print relation: differ only in the <space> character.
         How is this possible if there are more than one space character?!
         I think susv2/xbd/locale.html should speak of "space characters",
         not "space character".  */
      if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
        fprintf (stderr, "%s is print but not graph|<space>\n", ucs_symbol (ch));
      if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
        fprintf (stderr, "%s is graph|<space> but not print\n", ucs_symbol (ch));
    }

  /* The LC_CTYPE section: one entry per class, then the mappings.  */
  fprintf (stream, "LC_CTYPE\n");
  output_charclass (stream, "upper", is_upper);
  output_charclass (stream, "lower", is_lower);
  output_charclass (stream, "alpha", is_alpha);
  output_charclass (stream, "digit", is_digit);
  output_charclass (stream, "outdigit", is_outdigit);
  output_charclass (stream, "blank", is_blank);
  output_charclass (stream, "space", is_space);
  output_charclass (stream, "cntrl", is_cntrl);
  output_charclass (stream, "punct", is_punct);
  output_charclass (stream, "xdigit", is_xdigit);
  output_charclass (stream, "graph", is_graph);
  output_charclass (stream, "print", is_print);
  output_charclass (stream, "class \"combining\";", is_combining);
  output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
  output_charmap (stream, "toupper", to_upper);
  output_charmap (stream, "tolower", to_lower);
  output_charmap (stream, "map \"totitle\";", to_title);
  output_widthmap (stream);
  fprintf (stream, "END LC_CTYPE\n");

  /* Catch both earlier write errors and a failing close.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/*
 * Matches a single punctuation character at src.
 * Returns the position just past it, or 0 (a null pointer) when *src is not
 * a punctuation character.
 */
const char* punct(const char* src) {
  if (!is_punct(*src))
    return 0;
  return src + 1;
}