PCRE_EXPORT BOOL _pcre_xclass(int c, const uschar *data) { int t; BOOL negated = (*data & XCL_NOT) != 0; /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ if ((*data++ & XCL_MAP) != 0) data += 32; while ((t = *data++) != XCL_END) { int x, y; if (t == XCL_SINGLE) { GETCHARINC(x, data); if (c == x) return !negated; } else if (t == XCL_RANGE) { GETCHARINC(x, data); GETCHARINC(y, data); if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { int chartype, othercase; int rqdtype = *data++; int category = _pcre_ucp_findchar(c, &chartype, &othercase); if (rqdtype >= 128) { if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated; } else { if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated; } } #endif /* SUPPORT_UCP */ } return negated; /* char did not match */ }
static const uschar * set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless, compile_data *cd, BOOL utf8) { unsigned int c = *p; SET_BIT(c); #ifdef SUPPORT_UTF8 if (utf8 && c > 127) { GETCHARINC(c, p); #ifdef SUPPORT_UCP if (caseless) { uschar buff[8]; c = UCD_OTHERCASE(c); (void)_pcre_ord2utf8(c, buff); SET_BIT(buff[0]); } #endif return p; } #endif /* Not UTF-8 mode, or character is less than 127. */ if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; }
BOOL _pcre_xclass(int c, const uschar *data) { int t; BOOL negated = (*data & XCL_NOT) != 0; /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ if ((*data++ & XCL_MAP) != 0) data += 32; while ((t = *data++) != XCL_END) { int x, y; if (t == XCL_SINGLE) { GETCHARINC(x, data); if (c == x) return !negated; } else if (t == XCL_RANGE) { GETCHARINC(x, data); GETCHARINC(y, data); if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { const ucd_record *prop = GET_UCD(c); switch(*data) { case PT_ANY: if (t == XCL_PROP) return !negated; break; case PT_LAMP: if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) return !negated; break; case PT_PC: if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated; break; case PT_SC: if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; break; case PT_ALNUM: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || _pcre_ucp_gentype[prop->chartype] == ucp_N) == (t == XCL_PROP)) return !negated; break; case PT_SPACE: /* Perl space */ if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_PXSPACE: /* POSIX space */ if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_WORD: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || _pcre_ucp_gentype[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) == (t == XCL_PROP)) return !negated; break; /* This should never occur, but compilers may mutter if there is no default. */ default: return FALSE; } data += 2; } #endif /* SUPPORT_UCP */ } return negated; /* char did not match */ }
static const pcre_uchar * set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, compile_data *cd, BOOL utf) { pcre_uint32 c = *p; #ifdef COMPILE_PCRE8 SET_BIT(c); #ifdef SUPPORT_UTF if (utf && c > 127) { GETCHARINC(c, p); #ifdef SUPPORT_UCP if (caseless) { pcre_uchar buff[6]; c = UCD_OTHERCASE(c); (void)PRIV(ord2utf)(c, buff); SET_BIT(buff[0]); } #endif /* Not SUPPORT_UCP */ return p; } #else /* Not SUPPORT_UTF */ (void)(utf); /* Stops warning for unused parameter */ #endif /* SUPPORT_UTF */ /* Not UTF-8 mode, or character is less than 127. */ if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; #endif /* COMPILE_PCRE8 */ #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 if (c > 0xff) { c = 0xff; caseless = FALSE; } SET_BIT(c); #ifdef SUPPORT_UTF if (utf && c > 127) { GETCHARINC(c, p); #ifdef SUPPORT_UCP if (caseless) { c = UCD_OTHERCASE(c); if (c > 0xff) c = 0xff; SET_BIT(c); } #endif /* SUPPORT_UCP */ return p; } #else /* Not SUPPORT_UTF */ (void)(utf); /* Stops warning for unused parameter */ #endif /* SUPPORT_UTF */ if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; #endif }
BOOL PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf) { pcre_uchar t; BOOL negated = (*data & XCL_NOT) != 0; (void)utf; #ifdef COMPILE_PCRE8 /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */ utf = TRUE; #endif /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { if ((*data & XCL_MAP) != 0 && (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0) return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar); while ((t = *data++) != XCL_END) { pcre_uint32 x, y; if (t == XCL_SINGLE) { #ifdef SUPPORT_UTF if (utf) { GETCHARINC(x, data); /* macro generates multiple statements */ } else #endif x = *data++; if (c == x) return !negated; } else if (t == XCL_RANGE) { #ifdef SUPPORT_UTF if (utf) { GETCHARINC(x, data); /* macro generates multiple statements */ GETCHARINC(y, data); /* macro generates multiple statements */ } else #endif { x = *data++; y = *data++; } if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { const ucd_record *prop = GET_UCD(c); switch(*data) { case PT_ANY: if (t == XCL_PROP) return !negated; break; case PT_LAMP: if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP)) return !negated; break; case PT_PC: if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated; break; case PT_SC: if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; break; case PT_ALNUM: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP)) return !negated; break; case PT_SPACE: /* Perl space */ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_PXSPACE: /* POSIX space */ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_WORD: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) == (t == XCL_PROP)) return !negated; break; case PT_UCNC: if (c < 0xa0) { if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || c == CHAR_GRAVE_ACCENT) == (t == XCL_PROP)) return !negated; } else { if ((c < 0xd800 || c > 0xdfff) == (t == XCL_PROP)) return !negated; } break; /* This should never occur, but compilers may mutter if there is no default. */ default: return FALSE; } data += 2; } #endif /* SUPPORT_UCP */ } return negated; /* char did not match */ }
BOOL _pcre_xclass(int c, const uschar *data) { int t; BOOL negated = (*data & XCL_NOT) != 0; /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ if ((*data++ & XCL_MAP) != 0) data += 32; while ((t = *data++) != XCL_END) { int x, y; if (t == XCL_SINGLE) { GETCHARINC(x, data); if (c == x) return !negated; } else if (t == XCL_RANGE) { GETCHARINC(x, data); GETCHARINC(y, data); if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { int chartype, script; int category = _pcre_ucp_findprop(c, &chartype, &script); switch(*data) { case PT_ANY: if (t == XCL_PROP) return !negated; break; case PT_LAMP: if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: if ((data[1] == category) == (t == XCL_PROP)) return !negated; break; case PT_PC: if ((data[1] == chartype) == (t == XCL_PROP)) return !negated; break; case PT_SC: if ((data[1] == script) == (t == XCL_PROP)) return !negated; break; /* This should never occur, but compilers may mutter if there is no default. */ default: return FALSE; } data += 2; } #endif /* SUPPORT_UCP */ } return negated; /* char did not match */ }
BOOL PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf) { pcre_uchar t; BOOL negated = (*data & XCL_NOT) != 0; (void)utf; #ifdef COMPILE_PCRE8 /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */ utf = TRUE; #endif /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { if ((*data & XCL_HASPROP) == 0) { if ((*data & XCL_MAP) == 0) return negated; return (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0; } if ((*data & XCL_MAP) != 0 && (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0) return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar); while ((t = *data++) != XCL_END) { pcre_uint32 x, y; if (t == XCL_SINGLE) { #ifdef SUPPORT_UTF if (utf) { GETCHARINC(x, data); /* macro generates multiple statements */ } else #endif x = *data++; if (c == x) return !negated; } else if (t == XCL_RANGE) { #ifdef SUPPORT_UTF if (utf) { GETCHARINC(x, data); /* macro generates multiple statements */ GETCHARINC(y, data); /* macro generates multiple statements */ } else #endif { x = *data++; y = *data++; } if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { const ucd_record *prop = GET_UCD(c); BOOL isprop = t == XCL_PROP; switch(*data) { case PT_ANY: if (isprop) return !negated; break; case PT_LAMP: if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) == isprop) return !negated; break; case PT_GC: if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop) return !negated; break; case PT_PC: if ((data[1] == prop->chartype) == isprop) return !negated; break; case PT_SC: if ((data[1] == prop->script) == isprop) return !negated; break; case PT_ALNUM: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop) return !negated; break; /* Perl space used to exclude VT, but from Perl 5.18 it is included, which means that Perl space and POSIX space are now identical. PCRE was changed at release 8.34. */ case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ switch(c) { HSPACE_CASES: VSPACE_CASES: if (isprop) return !negated; break; default: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop) return !negated; break; } break; case PT_WORD: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) == isprop) return !negated; break; case PT_UCNC: if (c < 0xa0) { if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || c == CHAR_GRAVE_ACCENT) == isprop) return !negated; } else { if ((c < 0xd800 || c > 0xdfff) == isprop) return !negated; } break; /* The following three properties can occur only in an XCLASS, as there is no \p or \P coding for them. */ /* Graphic character. Implement this as not Z (space or separator) and not C (other), except for Cf (format) with a few exceptions. This seems to be what Perl does. The exceptional characters are: U+061C Arabic Letter Mark U+180E Mongolian Vowel Separator U+2066 - U+2069 Various "isolate"s */ case PT_PXGRAPH: if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z && (PRIV(ucp_gentype)[prop->chartype] != ucp_C || (prop->chartype == ucp_Cf && c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069)) )) == isprop) return !negated; break; /* Printable character: same as graphic, with the addition of Zs, i.e. not Zl and not Zp, and U+180E. */ case PT_PXPRINT: if ((prop->chartype != ucp_Zl && prop->chartype != ucp_Zp && (PRIV(ucp_gentype)[prop->chartype] != ucp_C || (prop->chartype == ucp_Cf && c != 0x061c && (c < 0x2066 || c > 0x2069)) )) == isprop) return !negated; break; /* Punctuation: all Unicode punctuation, plus ASCII characters that Unicode treats as symbols rather than punctuation, for Perl compatibility (these are $+<=>^`|~). */ case PT_PXPUNCT: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P || (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) return !negated; break; /* This should never occur, but compilers may mutter if there is no default. */ default: return FALSE; } data += 2; } #endif /* SUPPORT_UCP */ } return negated; /* char did not match */ }