static int to_ascii(OnigEncoding enc, UChar *s, UChar *end, UChar buf[], int buf_size, int *is_over) { int len; UChar *p; OnigCodePoint code; if (ONIGENC_MBC_MINLEN(enc) > 1) { p = s; len = 0; while (p < end) { code = ONIGENC_MBC_TO_CODE(enc, p, end); if (code >= 0x80) { if (code > 0xffff && len + 10 <= buf_size) { sprint_byte_with_x((char*)(&(buf[len])), (unsigned int)(code >> 24)); sprint_byte((char*)(&(buf[len+4])), (unsigned int)(code >> 16)); sprint_byte((char*)(&(buf[len+6])), (unsigned int)(code >> 8)); sprint_byte((char*)(&(buf[len+8])), (unsigned int)code); len += 10; } else if (len + 6 <= buf_size) { sprint_byte_with_x((char*)(&(buf[len])), (unsigned int)(code >> 8)); sprint_byte((char*)(&(buf[len+4])), (unsigned int)code); len += 6; }
/*
 * Resolve a Unicode property name to its ctype number.
 *
 * The name in [name, end) is normalized before lookup: spaces, hyphens and
 * underscores are dropped and every remaining character is mapped through
 * ONIGENC_ASCII_CODE_TO_LOWER_CASE.  The normalized, NUL-terminated key is
 * then handed to uniname2ctype().
 *
 * Returns the ctype on success, or ONIGERR_INVALID_CHAR_PROPERTY_NAME when
 * the name contains a code point >= 0x80, does not fit into
 * PROPERTY_NAME_MAX_SIZE bytes (terminator included), or uniname2ctype()
 * reports a negative value.
 */
extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
{
  UChar normalized[PROPERTY_NAME_MAX_SIZE];
  UChar *cursor = name;
  int used = 0;
  int result;

  while (cursor < end) {
    OnigCodePoint ch = ONIGENC_MBC_TO_CODE(enc, cursor, end);

    /* Separators are ignored entirely; anything else must be ASCII. */
    if (ch != ' ' && ch != '-' && ch != '_') {
      if (ch >= 0x80)
        return ONIGERR_INVALID_CHAR_PROPERTY_NAME;

      normalized[used] = ONIGENC_ASCII_CODE_TO_LOWER_CASE(ch);
      used++;

      /* Keep one slot free for the terminating NUL written below. */
      if (used >= PROPERTY_NAME_MAX_SIZE)
        return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
    }

    cursor += enclen(enc, cursor);
  }
  normalized[used] = 0;

  result = uniname2ctype(normalized, used);
  if (result < 0)
    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;

  return result;
}
/*
 * Case-fold the single character at *pp and write the encoded result to
 * `fold`.
 *
 * *pp is advanced past the source character before anything else happens.
 * When unicode_unfold_key() yields no entry for the decoded code point, the
 * source bytes are copied to `fold` unchanged.  Otherwise each code point of
 * the fold sequence is encoded with ONIGENC_CODE_TO_MBC and appended.
 *
 * Returns the number of bytes written to `fold` (for the single-code-point
 * paths, whatever ONIGENC_CODE_TO_MBC returns).
 */
extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
                              const UChar** pp, const UChar* end, UChar* fold)
{
  const UChar *src = *pp;
  const struct ByUnfoldKey* entry;
  OnigCodePoint* seq;
  OnigCodePoint cp;
  int src_len, piece_len, written, k;

  cp = ONIGENC_MBC_TO_CODE(enc, src, end);
  src_len = enclen(enc, src);
  *pp += src_len;

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    if (cp == 0x0130) {
      return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
    }
#if 0
    if (cp == 0x0049) {
      return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
    }
#endif
  }
#endif

  entry = unicode_unfold_key(cp);

  if (entry == 0) {
    /* No fold entry: pass the original bytes through untouched. */
    for (k = 0; k < src_len; k++) {
      fold[k] = src[k];
    }
    return src_len;
  }

  if (entry->fold_len == 1) {
    return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(entry->index), fold);
  }

  /* Multi-code-point fold: encode each code point back to back. */
  FOLDS_FOLD_ADDR_BUK(entry, seq);
  written = 0;
  for (k = 0; k < entry->fold_len; k++) {
    OnigCodePoint folded = seq[k];

    piece_len = ONIGENC_CODE_TO_MBC(enc, folded, fold);
    fold += piece_len;
    written += piece_len;
  }
  return written;
}
/**
 * Classifies a symbol by the spelling of its name.
 *
 * @param state  passed through to Encoding::from_index().
 * @param sym    symbol whose index selects the entries in `strings` and
 *               `encodings` to inspect.
 * @return eConstant for an A-Z initial followed only by characters that are
 *         alphanumeric in the symbol's encoding, '_' or non-ASCII;
 *         eCVar for a "@@" prefix; eIVar for "@" followed by a non-digit;
 *         eSystem for a "__" prefix on a name longer than two bytes;
 *         eNormal for everything else.
 */
SymbolTable::Kind SymbolTable::detect_kind(STATE, const Symbol* sym) {
  std::string str = strings[sym->index()];
  size_t size = str.size();
  uint8_t* p = reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str()));

  Encoding* e = Encoding::from_index(state, encodings[sym->index()]);
  OnigEncodingType* enc = e->encoding();

  // Constants start with A-Z, followed by alphanumeric characters or '_' or
  // non-ascii character.
  //
  // The first byte is tested with an explicit ASCII range instead of
  // isupper(): isupper() consults the current C locale, so in a single-byte
  // locale it may accept bytes >= 0x80 and misclassify the symbol. The other
  // checks in this function (ISASCII, ISDIGIT) are already locale-independent.
  if(*p >= 'A' && *p <= 'Z') {
    uint8_t* last = p + size;
    int n = 0, code = 0;

    for(++p; p < last; p += n) {
      n = Encoding::precise_mbclen(p, last, enc);
      // A byte sequence that does not form a complete character in this
      // encoding disqualifies the name from being a constant.
      if(!ONIGENC_MBCLEN_CHARFOUND_P(n)) {
        return SymbolTable::eNormal;
      }

      n = ONIGENC_MBCLEN_CHARFOUND_LEN(n);
      code = ONIGENC_MBC_TO_CODE(enc, p, p + n);
      if(!(ONIGENC_IS_CODE_ALNUM(enc, code) || *p == '_' || !ISASCII(*p))) {
        return SymbolTable::eNormal;
      }
    }

    return SymbolTable::eConstant;
  }

  if(p[0] == '@') {
    // A class variable begins with @@
    if(size > 1 && p[1] == '@') {
      return SymbolTable::eCVar;
    }

    // An instance variable can't start with a digit and can't be just @.
    if((size == 1) || (size > 1 && ISDIGIT(p[1]))) {
      return SymbolTable::eNormal;
    }

    // An instance variable begins with @
    return SymbolTable::eIVar;
  }

  // A system variable begins with __
  if(size > 2 && p[0] == '_' && p[1] == '_') {
    return SymbolTable::eSystem;
  }

  // Everything else is normal
  return SymbolTable::eNormal;
}
/*
 * Resolve a Unicode property name to its ctype number.
 *
 * The name in [name, end) is reduced to a lookup key by dropping spaces,
 * hyphens and underscores; no case conversion is applied here.  The key is
 * looked up first in UserDefinedPropertyTable (when that table exists) and
 * then through unicode_lookup_property_name().
 *
 * Returns the ctype on success, or ONIGERR_INVALID_CHAR_PROPERTY_NAME when
 * the name contains a code point >= 0x80, does not fit into
 * PROPERTY_NAME_MAX_SIZE bytes (terminator included), is unknown, or -- in
 * builds without USE_UNICODE_PROPERTIES -- maps to a ctype above
 * ONIGENC_MAX_STD_CTYPE.
 */
extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
{
  char key[PROPERTY_NAME_MAX_SIZE];
  const struct PropertyNameCtype* entry;
  UChar *cursor;
  int key_len = 0;

  for (cursor = name; cursor < end; cursor += enclen(enc, cursor)) {
    OnigCodePoint ch = ONIGENC_MBC_TO_CODE(enc, cursor, end);

    /* Non-ASCII is rejected before separators are considered. */
    if (ch >= 0x80)
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;

    if (ch == ' ' || ch == '-' || ch == '_')
      continue;

    key[key_len] = (char )ch;
    key_len++;

    /* Keep one slot free for the terminating NUL written below. */
    if (key_len >= PROPERTY_NAME_MAX_SIZE)
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
  }
  key[key_len] = 0;

  /* User-defined properties take precedence over the built-in table. */
  if (UserDefinedPropertyTable != 0) {
    UserDefinedPropertyValue* user_val = (UserDefinedPropertyValue* )NULL;

    onig_st_lookup_strend(UserDefinedPropertyTable,
                          (const UChar* )key, (const UChar* )key + key_len,
                          (hash_data_type* )((void* )(&user_val)));
    if (user_val != 0)
      return user_val->ctype;
  }

  entry = unicode_lookup_property_name(key, key_len);
  if (entry == 0)
    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;

#ifndef USE_UNICODE_PROPERTIES
  if (entry->ctype > ONIGENC_MAX_STD_CTYPE)
    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
#endif

  return entry->ctype;
}
/*
 * Collect case-fold alternatives for the text starting at p.
 *
 * enc    encoding used to decode code points and character byte lengths.
 * flag   case-fold option bits.  ONIGENC_CASE_FOLD_TURKISH_AZERI (only in
 *        builds with USE_UNICODE_CASE_FOLD_TURKISH_AZERI) and
 *        INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR are the bits tested here.
 * p,end  start and end of the input bytes.
 * items  output array.  Each entry receives byte_len (number of input bytes
 *        the alternative stands for), code_len and code[].  No capacity
 *        check is performed here -- NOTE(review): the required array size is
 *        defined by the caller's contract, confirm against the header.
 *
 * Returns the number of entries written to items.
 */
extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
{
  int n, m, i, j, k, len;
  OnigCodePoint code, codes[3];
  const struct ByUnfoldKey* buk;

  n = 0;

  code = ONIGENC_MBC_TO_CODE(enc, p, end);
  len = enclen(enc, p);

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  /* Turkish/Azeri mode: the four dotted/dotless I code points each map to
     exactly one partner and bypass the table lookup below. */
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    if (code == 0x0049) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0131;
      return 1;
    }
    else if (code == 0x0130) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0069;
      return 1;
    }
    else if (code == 0x0131) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0049;
      return 1;
    }
    else if (code == 0x0069) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0130;
      return 1;
    }
  }
#endif

  /* presumably yields the fold entry for `code`, or 0 when it has none --
     unicode_unfold_key() is defined outside this view */
  buk = unicode_unfold_key(code);
  if (buk != 0) {
    if (buk->fold_len == 1) {
      int un;

      /* First alternative: the single folded code point. */
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = *FOLDS1_FOLD(buk->index);
      n++;

      /* Then every other unfold listed for that fold, skipping the input
         code point itself. */
      un = FOLDS1_UNFOLDS_NUM(buk->index);
      for (i = 0; i < un; i++) {
        OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];
        if (unfold != code) {
          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0] = unfold;
          n++;
        }
      }
      /* Continue with the folded value so the multi-code-point search at
         the end of the function starts from it. */
      code = items[0].code[0]; // for multi-code to unfold search.
    }
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      /* cs[fn] holds the fn-th folded code point followed by its unfolds;
         ncs[fn] is how many slots of cs[fn] are filled. */
      OnigCodePoint cs[3][4];
      int fn, ncs[3];

      if (buk->fold_len == 2) {
        /* Single-code-point alternatives sharing this two-code fold. */
        m = FOLDS2_UNFOLDS_NUM(buk->index);
        for (i = 0; i < m; i++) {
          OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];
          if (unfold == code) continue;

          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0] = unfold;
          n++;
        }

        for (fn = 0; fn < 2; fn++) {
          int index;
          cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];
          index = unicode_fold1_key(&cs[fn][0]);
          if (index >= 0) {
            /* this m shadows the function-level m */
            int m = FOLDS1_UNFOLDS_NUM(index);
            for (i = 0; i < m; i++) {
              cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
            }
            ncs[fn] = m + 1;
          }
          else
            ncs[fn] = 1;
        }

        /* Emit every combination of the per-position variants. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            items[n].byte_len = len;
            items[n].code_len = 2;
            items[n].code[0] = cs[0][i];
            items[n].code[1] = cs[1][j];
            n++;
          }
        }
      }
      else { /* fold_len == 3 */
        /* Single-code-point alternatives sharing this three-code fold. */
        m = FOLDS3_UNFOLDS_NUM(buk->index);
        for (i = 0; i < m; i++) {
          OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];
          if (unfold == code) continue;

          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0] = unfold;
          n++;
        }

        for (fn = 0; fn < 3; fn++) {
          int index;
          cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];
          index = unicode_fold1_key(&cs[fn][0]);
          if (index >= 0) {
            /* this m shadows the function-level m */
            int m = FOLDS1_UNFOLDS_NUM(index);
            for (i = 0; i < m; i++) {
              cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
            }
            ncs[fn] = m + 1;
          }
          else
            ncs[fn] = 1;
        }

        /* Emit every combination of the per-position variants. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            for (k = 0; k < ncs[2]; k++) {
              items[n].byte_len = len;
              items[n].code_len = 3;
              items[n].code[0] = cs[0][i];
              items[n].code[1] = cs[1][j];
              items[n].code[2] = cs[2][k];
              n++;
            }
          }
        }
      }

      /* multi char folded code is not head of another folded multi char */
      return n;
    }
  }
  else {
    /* `code` has no fold entry of its own; list the unfolds recorded for it
       when unicode_fold1_key() finds it as a fold. */
    int index = unicode_fold1_key(&code);
    if (index >= 0) {
      int m = FOLDS1_UNFOLDS_NUM(index);
      for (i = 0; i < m; i++) {
        items[n].byte_len = len;
        items[n].code_len = 1;
        items[n].code[0] = FOLDS1_UNFOLDS(index)[i];
        n++;
      }
    }
  }

  /* Without the multi-char flag only the first character is considered. */
  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
    return n;

  /* Look ahead: does the (folded) first character plus the next one or two
     (folded) characters form a known two- or three-code fold? */
  p += len;
  if (p < end) {
    int clen;
    int index;

    codes[0] = code;
    code = ONIGENC_MBC_TO_CODE(enc, p, end);

    buk = unicode_unfold_key(code);
    if (buk != 0 && buk->fold_len == 1) {
      codes[1] = *FOLDS1_FOLD(buk->index);
    }
    else
      codes[1] = code;

    /* byte_len of the entries below covers both input characters */
    clen = enclen(enc, p);
    len += clen;

    index = unicode_fold2_key(codes);
    if (index >= 0) {
      m = FOLDS2_UNFOLDS_NUM(index);
      for (i = 0; i < m; i++) {
        items[n].byte_len = len;
        items[n].code_len = 1;
        items[n].code[0] = FOLDS2_UNFOLDS(index)[i];
        n++;
      }
    }

    p += clen;
    if (p < end) {
      code = ONIGENC_MBC_TO_CODE(enc, p, end);

      buk = unicode_unfold_key(code);
      if (buk != 0 && buk->fold_len == 1) {
        codes[2] = *FOLDS1_FOLD(buk->index);
      }
      else
        codes[2] = code;

      /* byte_len of the entries below covers all three input characters */
      clen = enclen(enc, p);
      len += clen;

      index = unicode_fold3_key(codes);
      if (index >= 0) {
        m = FOLDS3_UNFOLDS_NUM(index);
        for (i = 0; i < m; i++) {
          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0] = FOLDS3_UNFOLDS(index)[i];
          n++;
        }
      }
    }
  }

  return n;
}
/*
 * Case-fold the single character at *pp and write the encoded result to
 * `fold`.
 *
 * The fold tables are initialized on first use (CaseFoldInited).  *pp is
 * advanced past the source character before any lookup.  When FoldTable has
 * no entry for the code point, the source bytes are copied unchanged.
 *
 * Returns the number of bytes written to `fold` (for the single-code-point
 * paths, whatever ONIGENC_CODE_TO_MBC returns).
 */
extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, UChar* fold)
{
  CodePointList3 *to;
  OnigCodePoint code;
  int i, len, rlen;
  const UChar *p = *pp;

  if (CaseFoldInited == 0) init_case_fold_table();

  code = ONIGENC_MBC_TO_CODE(enc, p, end);
  len = enclen(enc, p);
  *pp += len;

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  /* Turkish/Azeri mode: U+0049 folds to U+0131 and U+0130 folds to U+0069,
     bypassing the table. */
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    if (code == 0x0049) {
      return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
    }
    else if (code == 0x0130) {
      return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
    }
  }
#endif

  if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) {
    if (to->n == 1) {
      return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold);
    }
#if 0
    /* disabled variant: the multi-char flag does not need to be checked */
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
#else
    else {
#endif
      /* Multi-code-point fold: encode each code point back to back and
         return the total byte count. */
      rlen = 0;
      for (i = 0; i < to->n; i++) {
        len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold);
        fold += len;
        rlen += len;
      }
      return rlen;
    }
  }

  /* No fold entry: copy the original bytes through. */
  for (i = 0; i < len; i++) {
    *fold++ = *p++;
  }
  return len;
}

/*
 * Invoke callback f for the case-fold relations stored in the static
 * CaseUnfold_* tables.
 *
 * flag  case-fold option bits; INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR enables
 *       the two- and three-code tables, ONIGENC_CASE_FOLD_TURKISH_AZERI
 *       (when compiled in) replaces the *_Locale tables with the Turkish
 *       dotted/dotless I pairs.
 * f     callback, called as f(code, codes, number_of_codes, arg).
 * arg   opaque pointer forwarded to f.
 *
 * Returns 0 when every call returned 0; otherwise stops at the first
 * non-zero result from f and returns it.
 */
extern int
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg)
{
  const CaseUnfold_11_Type* p11;
  OnigCodePoint code;
  int i, j, k, r;

  /* table initialization is intentionally not triggered here:
     if (CaseFoldInited == 0) init_case_fold_table(); */

  /* One-to-one table: report from<->to in both directions, then every pair
     of `to` codes with each other in both directions. */
  for (i = 0; i < numberof(CaseUnfold_11); i++) {
    p11 = &CaseUnfold_11[i];
    for (j = 0; j < p11->to.n; j++) {
      code = p11->from;
      r = (*f)(p11->to.code[j], &code, 1, arg);
      if (r != 0) return r;

      code = p11->to.code[j];
      r = (*f)(p11->from, &code, 1, arg);
      if (r != 0) return r;

      for (k = 0; k < j; k++) {
        r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), 1, arg);
        if (r != 0) return r;

        r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), 1, arg);
        if (r != 0) return r;
      }
    }
  }

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  /* Turkish/Azeri mode: report the I pairs (U+0049/U+0131 and
     U+0069/U+0130) instead of walking CaseUnfold_11_Locale. */
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    code = 0x0131;
    r = (*f)(0x0049, &code, 1, arg);
    if (r != 0) return r;
    code = 0x0049;
    r = (*f)(0x0131, &code, 1, arg);
    if (r != 0) return r;

    code = 0x0130;
    r = (*f)(0x0069, &code, 1, arg);
    if (r != 0) return r;
    code = 0x0069;
    r = (*f)(0x0130, &code, 1, arg);
    if (r != 0) return r;
  }
  else {
#endif
    /* Same reporting scheme as for CaseUnfold_11 above. */
    for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) {
      p11 = &CaseUnfold_11_Locale[i];
      for (j = 0; j < p11->to.n; j++) {
        code = p11->from;
        r = (*f)(p11->to.code[j], &code, 1, arg);
        if (r != 0) return r;

        code = p11->to.code[j];
        r = (*f)(p11->from, &code, 1, arg);
        if (r != 0) return r;

        for (k = 0; k < j; k++) {
          r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), 1, arg);
          if (r != 0) return r;

          r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), 1, arg);
          if (r != 0) return r;
        }
      }
    }
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  }
#endif

  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    /* Two-code table: report each `to` code against the 2-code `from`
       sequence, then against every other `to` code of the same entry. */
    for (i = 0; i < numberof(CaseUnfold_12); i++) {
      for (j = 0; j < CaseUnfold_12[i].to.n; j++) {
        r = (*f)(CaseUnfold_12[i].to.code[j], (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg);
        if (r != 0) return r;

        for (k = 0; k < CaseUnfold_12[i].to.n; k++) {
          if (k == j) continue;

          r = (*f)(CaseUnfold_12[i].to.code[j], (OnigCodePoint* )(&CaseUnfold_12[i].to.code[k]), 1, arg);
          if (r != 0) return r;
        }
      }
    }

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
    /* The locale-specific two-code table is skipped in Turkish/Azeri mode. */
    if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
#endif
      for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) {
        for (j = 0; j < CaseUnfold_12_Locale[i].to.n; j++) {
          r = (*f)(CaseUnfold_12_Locale[i].to.code[j], (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg);
          if (r != 0) return r;

          for (k = 0; k < CaseUnfold_12_Locale[i].to.n; k++) {
            if (k == j) continue;

            r = (*f)(CaseUnfold_12_Locale[i].to.code[j], (OnigCodePoint* )(&CaseUnfold_12_Locale[i].to.code[k]), 1, arg);
            if (r != 0) return r;
          }
        }
      }
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
    }
#endif

    /* Three-code table: same scheme with a 3-code `from` sequence. */
    for (i = 0; i < numberof(CaseUnfold_13); i++) {
      for (j = 0; j < CaseUnfold_13[i].to.n; j++) {
        r = (*f)(CaseUnfold_13[i].to.code[j], (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg);
        if (r != 0) return r;

        for (k = 0; k < CaseUnfold_13[i].to.n; k++) {
          if (k == j) continue;

          r = (*f)(CaseUnfold_13[i].to.code[j], (OnigCodePoint* )(&CaseUnfold_13[i].to.code[k]), 1, arg);
          if (r != 0) return r;
        }
      }
    }
  }

  return 0;
}

/*
 * Collect case-fold alternatives for the text starting at p, using the
 * FoldTable / Unfold1Table / Unfold2Table / Unfold3Table hash tables.
 *
 * enc    encoding used to decode code points and character byte lengths.
 * flag   case-fold option bits.  ONIGENC_CASE_FOLD_TURKISH_AZERI (only in
 *        builds with USE_UNICODE_CASE_FOLD_TURKISH_AZERI) and
 *        INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR are the bits tested here.
 * p,end  start and end of the input bytes.
 * items  output array.  Each entry receives byte_len (number of input bytes
 *        the alternative stands for), code_len and code[].  No capacity
 *        check is performed here -- NOTE(review): the required array size is
 *        defined by the caller's contract, confirm against the header.
 *
 * Returns the number of entries written to items.
 */
extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
{
  int n, i, j, k, len;
  OnigCodePoint code, codes[3];
  CodePointList3 *to, *z3;
  CodePointList2 *z2;

  if (CaseFoldInited == 0) init_case_fold_table();

  n = 0;

  code = ONIGENC_MBC_TO_CODE(enc, p, end);
  len = enclen(enc, p);

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  /* Turkish/Azeri mode: the four dotted/dotless I code points each map to
     exactly one partner and bypass the table lookups below. */
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    if (code == 0x0049) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0131;
      return 1;
    }
    else if (code == 0x0130) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0069;
      return 1;
    }
    else if (code == 0x0131) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0049;
      return 1;
    }
    else if (code == 0x0069) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0130;
      return 1;
    }
  }
#endif

  if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) {
    if (to->n == 1) {
      OnigCodePoint orig_code = code;

      /* First alternative: the single folded code point. */
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = to->code[0];
      n++;

      /* Continue with the folded value: list its other unfolds (skipping
         the input itself); it also seeds the multi-code search below. */
      code = to->code[0];
      if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) {
        for (i = 0; i < to->n; i++) {
          if (to->code[i] != orig_code) {
            items[n].byte_len = len;
            items[n].code_len = 1;
            items[n].code[0] = to->code[i];
            n++;
          }
        }
      }
    }
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      /* cs[fn] holds the fn-th folded code point followed by its unfolds;
         ncs[fn] is how many slots of cs[fn] are filled. */
      OnigCodePoint cs[3][4];
      int fn, ncs[3];

      for (fn = 0; fn < to->n; fn++) {
        cs[fn][0] = to->code[fn];
        if (onig_st_lookup(Unfold1Table, (st_data_t )cs[fn][0], (void* )&z3) != 0) {
          for (i = 0; i < z3->n; i++) {
            cs[fn][i+1] = z3->code[i];
          }
          ncs[fn] = z3->n + 1;
        }
        else
          ncs[fn] = 1;
      }

      /* After the loop fn equals to->n, i.e. the length of the fold. */
      if (fn == 2) {
        /* Emit every combination of the per-position variants. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            items[n].byte_len = len;
            items[n].code_len = 2;
            items[n].code[0] = cs[0][i];
            items[n].code[1] = cs[1][j];
            n++;
          }
        }

        /* Single-code-point alternatives sharing this two-code fold. */
        if (onig_st_lookup(Unfold2Table, (st_data_t )to->code, (void* )&z2) != 0) {
          for (i = 0; i < z2->n; i++) {
            if (z2->code[i] == code) continue;

            items[n].byte_len = len;
            items[n].code_len = 1;
            items[n].code[0] = z2->code[i];
            n++;
          }
        }
      }
      else {
        /* Emit every combination of the per-position variants. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            for (k = 0; k < ncs[2]; k++) {
              items[n].byte_len = len;
              items[n].code_len = 3;
              items[n].code[0] = cs[0][i];
              items[n].code[1] = cs[1][j];
              items[n].code[2] = cs[2][k];
              n++;
            }
          }
        }

        /* Single-code-point alternatives sharing this three-code fold. */
        if (onig_st_lookup(Unfold3Table, (st_data_t )to->code, (void* )&z2) != 0) {
          for (i = 0; i < z2->n; i++) {
            if (z2->code[i] == code) continue;

            items[n].byte_len = len;
            items[n].code_len = 1;
            items[n].code[0] = z2->code[i];
            n++;
          }
        }
      }

      /* multi char folded code is not head of another folded multi char */
      /* Clearing the local flag copy skips the look-ahead block below. */
      flag = 0; /* DISABLE_CASE_FOLD_MULTI_CHAR(flag); */
    }
  }
  else {
    /* `code` has no fold entry of its own; list the unfolds recorded for
       it in Unfold1Table, if any. */
    if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) {
      for (i = 0; i < to->n; i++) {
        items[n].byte_len = len;
        items[n].code_len = 1;
        items[n].code[0] = to->code[i];
        n++;
      }
    }
  }

  /* Look ahead: does the (folded) first character plus the next one or two
     (folded) characters form a known two- or three-code fold? */
  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    p += len;
    if (p < end) {
      int clen;

      codes[0] = code;
      code = ONIGENC_MBC_TO_CODE(enc, p, end);
      if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 && to->n == 1) {
        codes[1] = to->code[0];
      }
      else
        codes[1] = code;

      /* byte_len of the entries below covers both input characters */
      clen = enclen(enc, p);
      len += clen;
      if (onig_st_lookup(Unfold2Table, (st_data_t )codes, (void* )&z2) != 0) {
        for (i = 0; i < z2->n; i++) {
          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0] = z2->code[i];
          n++;
        }
      }

      p += clen;
      if (p < end) {
        code = ONIGENC_MBC_TO_CODE(enc, p, end);
        if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 && to->n == 1) {
          codes[2] = to->code[0];
        }
        else
          codes[2] = code;

        /* byte_len of the entries below covers all three input characters */
        clen = enclen(enc, p);
        len += clen;
        if (onig_st_lookup(Unfold3Table, (st_data_t )codes, (void* )&z2) != 0) {
          for (i = 0; i < z2->n; i++) {
            items[n].byte_len = len;
            items[n].code_len = 1;
            items[n].code[0] = z2->code[i];
            n++;
          }
        }
      }
    }
  }

  return n;
}