// Predicate: is this character alphabetic according to its encoding?
//
// The code value is obtained from codepoint(), which is also handed the
// address of a success flag. The answer is truthy only when that flag is
// set and ONIGENC_IS_CODE_ALPHA accepts the code for the encoding returned
// by encoding()->get_encoding(). The result is wrapped with RBOOL.
Object* Character::alphabetical_p(STATE) {
  bool has_code;
  const int code = codepoint(state, &has_code);
  OnigEncodingType* onig = encoding()->get_encoding();

  // Short-circuit: the classification macro is consulted only when the
  // code lookup reported success.
  const bool alphabetic = has_code && ONIGENC_IS_CODE_ALPHA(onig, code);
  return RBOOL(alphabetic);
}
// Predicate: is this character lowercase according to its encoding?
//
// The code value is obtained from codepoint(), which is also handed the
// address of a success flag. The answer is truthy only when that flag is
// set and ONIGENC_IS_CODE_LOWER accepts the code for the encoding returned
// by encoding()->get_encoding(). The result is wrapped with RBOOL.
Object* Character::lower_p(STATE) {
  bool has_code;
  const int code = codepoint(state, &has_code);
  OnigEncodingType* onig = encoding()->get_encoding();

  // Short-circuit: the classification macro is consulted only when the
  // code lookup reported success.
  const bool lowercase = has_code && ONIGENC_IS_CODE_LOWER(onig, code);
  return RBOOL(lowercase);
}
// Predicate: is this character punctuation according to its encoding?
//
// The code value is obtained from codepoint(), which is also handed the
// address of a success flag. The answer is truthy only when that flag is
// set and ONIGENC_IS_CODE_PUNCT accepts the code for the encoding returned
// by encoding()->get_encoding(). The result is wrapped with RBOOL.
Object* Character::punctuation_p(STATE) {
  bool has_code;
  const int code = codepoint(state, &has_code);
  OnigEncodingType* onig = encoding()->get_encoding();

  // Short-circuit: the classification macro is consulted only when the
  // code lookup reported success.
  const bool punctuation = has_code && ONIGENC_IS_CODE_PUNCT(onig, code);
  return RBOOL(punctuation);
}
/// Maps a single raw byte onto a codepoint value of the form 0xF800 | byte.
///
/// The byte is first reduced to its low eight bits, so the result does not
/// depend on whether `char` is signed.
///
/// @param byte  the raw byte to encode
/// @return      the low eight bits of `byte` OR-ed with 0xF800
/// @throws encoding_error  when the low eight bits of `byte` are all zero
inline codepoint encode_raw_byte(char byte) {
    codepoint const low_bits = static_cast<codepoint>(byte) & 0xFF;
    if (low_bits != 0) {
        return low_bits | 0xF800;
    }
    throw encoding_error("null or zero byte cannot be raw-data encoded");
}
// Reads one codepoint from `it` and advances past it.
//
// Participates in overload resolution only when the iterator's value_type
// has a bit_size of at least 21. The element under `it` is converted to
// `codepoint` and checked with is_valid(); on success the iterator is
// incremented and the value returned. On failure encoding_error is thrown
// and `it` is left where it was. `end` is accepted but not consulted.
//
// NOTE(review): `T` is presumably introduced by a template header that is
// not visible alongside this definition -- confirm.
typename boost::enable_if_c<bit_size<typename std::iterator_traits<T>::value_type>::value >= 21, codepoint>::type
next(T& it, T const& end) {
    codepoint const current = static_cast<codepoint>(*it);
    if (is_valid(current)) {
        ++it;
        return current;
    }
    throw encoding_error("invalid codepoint");
}
// Predicate: does this character's code satisfy ONIGENC_IS_CODE_ASCII?
//
// The code value is obtained from codepoint(), which is also handed the
// address of a success flag. The answer is truthy only when that flag is
// set and the macro accepts the code. The result is wrapped with RBOOL.
Object* Character::ascii_p(STATE) {
  bool has_code;
  const int code = codepoint(state, &has_code);

  // Short-circuit: the ASCII check is consulted only when the code lookup
  // reported success.
  const bool ascii = has_code && ONIGENC_IS_CODE_ASCII(code);
  return RBOOL(ascii);
}
std::string Scanner::escape(const char *str, int len, char quote_type) const { std::string output; output.reserve(len); if (quote_type == '\'') { for (int i = 0; i < len; i++) { unsigned char ch = str[i]; if (ch == '\\') { if (++i < len) { switch (str[i]) { case '\\': output += "\\"; break; case '\'': output += '\''; break; default: { output += ch; output += str[i]; break; } } } else { assert(false); output += ch; } } else { output += ch; } } } else { for (int i = 0; i < len; i++) { unsigned char ch = str[i]; if (ch == '\\') { if (++i < len) { switch (str[i]) { case 'n': output += '\n'; break; case 't': output += '\t'; break; case 'r': output += '\r'; break; case 'v': output += '\v'; break; case 'f': output += '\f'; break; case 'e': output += '\033'; break; case '\\': output += '\\'; break; case '$': output += '$'; break; case '"': case '`': if (str[i] != quote_type) { output += '\\'; } output += str[i]; break; case 'x': case 'X': { if (isxdigit(str[i+1])) { std::string shex; shex += str[++i]; // 0th hex digit if (isxdigit(str[i+1])) { shex += str[++i]; // 1st hex digit } output += strtol(shex.c_str(), nullptr, 16); } else { output += ch; output += str[i]; } break; } case 'u': { // Unicode escape sequence // "\u{123456}" if (str[i+1] != '{') { // BC for "\u1234" passthrough output += ch; output += str[i]; break; } bool valid = true; auto start = str + i + 2; auto closebrace = strchr(start, '}'); if (closebrace > start) { for (auto p = start; p < closebrace; ++p) { if (!isxdigit(*p)) { valid = false; break; } } } else { valid = false; } auto fatal = [this](const char *msg) { auto loc = getLocation(); return ParseTimeFatalException( loc->file, loc->r.line0, "%s", msg); }; if (!valid) { throw fatal("Invalid UTF-8 codepoint escape sequence"); } std::string codepoint(start, closebrace - start); char *end = nullptr; int32_t uchar = strtol(codepoint.c_str(), &end, 16); if ((end && *end) || (uchar > 0x10FFFF)) { throw fatal( "Invalid UTF-8 codepoint escape sequence: " 
"Codepoint too large"); } if (uchar <= 0x0007F) { output += (char)uchar; } else if (uchar <= 0x007FF) { output += (char)(0xC0 | ( uchar >> 6 )); output += (char)(0x80 | ( uchar & 0x3F)); } else if (uchar <= 0x00FFFF) { output += (char)(0xE0 | ( uchar >> 12 )); output += (char)(0x80 | ((uchar >> 6) & 0x3F)); output += (char)(0x80 | ( uchar & 0x3F)); } else if (uchar <= 0x10FFFF) { output += (char)(0xF0 | ( uchar >> 18 )); output += (char)(0x80 | ((uchar >> 12) & 0x3F)); output += (char)(0x80 | ((uchar >> 6) & 0x3F)); output += (char)(0x80 | ( uchar & 0x3F)); } else { not_reached(); assert(false); } i += codepoint.size() + 2 /* strlen("{}") */; break; }