Beispiel #1
0
/* Decide whether a character is a member of an extended class (XCLASS).

Arguments:
  c           the character to test
  data        points to the XCLASS flag byte; the optional bitmap and the
              list of items follow it

Returns:      TRUE if c matches the class, FALSE if it does not
*/

PCRE_EXPORT BOOL
_pcre_xclass(int c, const uschar *data)
{
const int flags = *data++;                   /* data now points past the flags */
const BOOL on_hit = (flags & XCL_NOT) == 0;  /* result when an item matches */
int item;

/* A bitmap, when present, is 32 bytes covering the characters below 256. A
miss in the bitmap is not final, because the item list may still contain
ranges that start below 256. */

if ((flags & XCL_MAP) != 0)
  {
  if (c < 256 && (data[c/8] & (1 << (c&7))) != 0) return on_hit;
  data += 32;
  }

/* Walk the item list up to XCL_END. XCL_PROP and XCL_NOTPROP items are only
handled when UCP support is compiled in. */

for (item = *data++; item != XCL_END; item = *data++)
  {
  int first, last;

  if (item == XCL_SINGLE)
    {
    GETCHARINC(first, data);
    if (first == c) return on_hit;
    }
  else if (item == XCL_RANGE)
    {
    GETCHARINC(first, data);
    GETCHARINC(last, data);
    if (first <= c && c <= last) return on_hit;
    }

#ifdef SUPPORT_UCP
  else  /* XCL_PROP or XCL_NOTPROP: one byte of required type follows */
    {
    int chartype, othercase;
    const int wanted = *data++;
    const int category = _pcre_ucp_findchar(c, &chartype, &othercase);

    /* Values of 128 or more are compared, less 128, with the category
    result; smaller values are compared with the character type. */

    const BOOL has_prop = (wanted >= 128)?
      (wanted - 128 == category) : (wanted == chartype);

    if (has_prop == (item == XCL_PROP)) return on_hit;
    }
#endif  /* SUPPORT_UCP */
  }

return !on_hit;   /* nothing in the class matched */
}
Beispiel #2
0
/* Record the character at p in the starting-byte table, together with its
other case when matching is caseless, and step over the character. The table
is updated through the SET_BIT macro, which is defined elsewhere (presumably
it sets a bit in start_bits - confirm against the macro).

Arguments:
  start_bits  the table being built
  p           points to the character
  caseless    TRUE if the other case of the character must be recorded too
  cd          compile data; its ctypes[] and fcc[] tables are consulted
  utf8        TRUE when in UTF-8 mode

Returns:      pointer to whatever follows the character
*/

static const uschar *
set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
  compile_data *cd, BOOL utf8)
{
unsigned int ch = *p;

/* The first byte is always recorded, whatever the mode. */

SET_BIT(ch);

#ifdef SUPPORT_UTF8
if (utf8 && ch >= 128)
  {
  GETCHARINC(ch, p);     /* fetch the whole character, advancing p */
#ifdef SUPPORT_UCP
  if (caseless)
    {
    /* Only the first byte of the other case's encoding is recorded. */
    uschar other_utf8[8];
    ch = UCD_OTHERCASE(ch);
    (void)_pcre_ord2utf8(ch, other_utf8);
    SET_BIT(other_utf8[0]);
    }
#endif
  return p;
  }
#endif

/* Either not in UTF-8 mode, or the character is less than 128. */

if (caseless)
  {
  if ((cd->ctypes[ch] & ctype_letter) != 0) SET_BIT(cd->fcc[ch]);
  }
return p + 1;
}
Beispiel #3
0
/* Decide whether a character is a member of an extended class (XCLASS).

Arguments:
  c           the character to test
  data        points to the XCLASS flag byte; the optional bitmap and the
              list of items follow it

Returns:      TRUE if c matches the class, FALSE if it does not; FALSE is
              also returned if an unrecognized property type is found
*/

BOOL
_pcre_xclass(int c, const uschar *data)
{
const int flags = *data++;                   /* data now points past the flags */
const BOOL on_hit = (flags & XCL_NOT) == 0;  /* result when an item matches */
int item;

/* A bitmap, when present, is 32 bytes covering the characters below 256. A
miss in the bitmap is not final, because the item list may still contain
ranges that start below 256. */

if ((flags & XCL_MAP) != 0)
  {
  if (c < 256 && (data[c/8] & (1 << (c&7))) != 0) return on_hit;
  data += 32;
  }

/* Walk the item list up to XCL_END. XCL_PROP and XCL_NOTPROP items are only
handled when UCP support is compiled in. */

for (item = *data++; item != XCL_END; item = *data++)
  {
  int first, last;

  if (item == XCL_SINGLE)
    {
    GETCHARINC(first, data);
    if (first == c) return on_hit;
    }
  else if (item == XCL_RANGE)
    {
    GETCHARINC(first, data);
    GETCHARINC(last, data);
    if (first <= c && c <= last) return on_hit;
    }

#ifdef SUPPORT_UCP
  else  /* XCL_PROP or XCL_NOTPROP: a property type and a value follow */
    {
    const ucd_record *prop = GET_UCD(c);
    const int chartype = prop->chartype;
    const int gentype = _pcre_ucp_gentype[chartype];
    const BOOL want = (item == XCL_PROP);   /* FALSE for XCL_NOTPROP */

    switch(*data)
      {
      case PT_ANY:
      if (want) return on_hit;
      break;

      case PT_LAMP:
      if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt)
            == want)
        return on_hit;
      break;

      case PT_GC:
      if ((gentype == data[1]) == want) return on_hit;
      break;

      case PT_PC:
      if ((chartype == data[1]) == want) return on_hit;
      break;

      case PT_SC:
      if ((prop->script == data[1]) == want) return on_hit;
      break;

      case PT_ALNUM:
      if ((gentype == ucp_L || gentype == ucp_N) == want) return on_hit;
      break;

      case PT_SPACE:    /* Perl space: VT is not included */
      if ((gentype == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
           c == CHAR_FF || c == CHAR_CR) == want)
        return on_hit;
      break;

      case PT_PXSPACE:  /* POSIX space: as Perl space, plus VT */
      if ((gentype == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
           c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == want)
        return on_hit;
      break;

      case PT_WORD:
      if ((gentype == ucp_L || gentype == ucp_N || c == CHAR_UNDERSCORE)
            == want)
        return on_hit;
      break;

      /* An unknown property type should never be seen; the result is FALSE
      whether or not the class is negated. */

      default:
      return FALSE;
      }

    data += 2;   /* step over the property type and value */
    }
#endif  /* SUPPORT_UCP */
  }

return !on_hit;   /* nothing in the class matched */
}
Beispiel #4
0
/* Record the character at p in the starting-code-unit table, together with
its other case when matching is caseless, and step over the character. The
table is updated through the SET_BIT macro, which is defined elsewhere
(presumably it sets a bit in start_bits - confirm against the macro).

Exactly one of the two bodies below is compiled: the first for the 8-bit
library, the second for the 16-bit and 32-bit libraries.

Arguments:
  start_bits  the table being built
  p           points to the character
  caseless    TRUE if the other case of the character must be recorded too
  cd          compile data; its ctypes[] and fcc[] tables are consulted
  utf         TRUE when in UTF mode (unused if SUPPORT_UTF is not defined)

Returns:      pointer to whatever follows the character
*/

static const pcre_uchar *
set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
  compile_data *cd, BOOL utf)
{
pcre_uint32 c = *p;

#ifdef COMPILE_PCRE8
/* The first byte is always recorded, whatever the mode. */
SET_BIT(c);

#ifdef SUPPORT_UTF
if (utf && c > 127)
  {
  GETCHARINC(c, p);
#ifdef SUPPORT_UCP
  if (caseless)
    {
    /* Only the first byte of the other case's encoding is recorded. */
    pcre_uchar buff[6];
    c = UCD_OTHERCASE(c);
    (void)PRIV(ord2utf)(c, buff);
    SET_BIT(buff[0]);
    }
#endif  /* SUPPORT_UCP */
  return p;
  }
#else   /* Not SUPPORT_UTF */
(void)(utf);   /* Stops warning for unused parameter */
#endif  /* SUPPORT_UTF */

/* Not UTF-8 mode, or character is less than 128. */

if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
#endif  /* COMPILE_PCRE8 */

#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
/* Every value above 0xff is recorded as 0xff, and for such a value the
caseless handling below is switched off. */
if (c > 0xff)
  {
  c = 0xff;
  caseless = FALSE;
  }
SET_BIT(c);

#ifdef SUPPORT_UTF
if (utf && c > 127)
  {
  /* The character is fetched again here, so c no longer holds the value
  that was limited to 0xff above. */
  GETCHARINC(c, p);
#ifdef SUPPORT_UCP
  if (caseless)
    {
    /* The other case is limited to 0xff in the same way before it is
    recorded. */
    c = UCD_OTHERCASE(c);
    if (c > 0xff)
      c = 0xff;
    SET_BIT(c);
    }
#endif  /* SUPPORT_UCP */
  return p;
  }
#else   /* Not SUPPORT_UTF */
(void)(utf);   /* Stops warning for unused parameter */
#endif  /* SUPPORT_UTF */

/* Not UTF mode, or character is less than 128. */

if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
#endif  /* COMPILE_PCRE16 || COMPILE_PCRE32 */
}
Beispiel #5
0
/* Decide whether a character is a member of an extended class (XCLASS).

Arguments:
  c           the character to test
  data        points to the XCLASS flag unit; the optional bitmap and the
              list of items follow it
  utf         TRUE if the item values must be read with GETCHARINC; it is
              forced to TRUE in the 8-bit library

Returns:      TRUE if c matches the class, FALSE if it does not; FALSE is
              also returned if an unrecognized property type is found
*/

BOOL
PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
{
    pcre_uchar t;
    BOOL negated = (*data & XCL_NOT) != 0;

    /* Presumably present to stop an unused-parameter warning when SUPPORT_UTF
    is not defined, because utf is otherwise read only under that symbol. */
    (void)utf;
#ifdef COMPILE_PCRE8
    /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
    utf = TRUE;
#endif

    /* Character values < 256 are matched against a bitmap, if one is present. If
    not, we still carry on, because there may be ranges that start below 256 in the
    additional data. */

    if (c < 256)
    {
        /* The bitmap is addressed as bytes whatever the code unit width. */
        if ((*data & XCL_MAP) != 0 &&
                (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
            return !negated; /* char found */
    }

    /* First skip the bit map if present. Then match against the list of Unicode
    properties or large chars or ranges that end with a large char. We won't ever
    encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */

    /* The bitmap is 32 bytes long, so the skip is scaled to code units. */
    if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);

    while ((t = *data++) != XCL_END)
    {
        pcre_uint32 x, y;
        if (t == XCL_SINGLE)
        {
            /* When SUPPORT_UTF is defined, the "else" below pairs with the
            unbraced assignment that follows the #endif. */
#ifdef SUPPORT_UTF
            if (utf)
            {
                GETCHARINC(x, data); /* macro generates multiple statements */
            }
            else
#endif
                x = *data++;
            if (c == x) return !negated;
        }
        else if (t == XCL_RANGE)
        {
#ifdef SUPPORT_UTF
            if (utf)
            {
                GETCHARINC(x, data); /* macro generates multiple statements */
                GETCHARINC(y, data); /* macro generates multiple statements */
            }
            else
#endif
            {
                x = *data++;
                y = *data++;
            }
            /* Both ends of the range are inclusive. */
            if (c >= x && c <= y) return !negated;
        }

#ifdef SUPPORT_UCP
        else  /* XCL_PROP & XCL_NOTPROP */
        {
            /* data[0] is the property type and data[1] the value to compare
            with, for those types that need one. Every test is compared with
            (t == XCL_PROP) so that XCL_NOTPROP inverts its sense. */
            const ucd_record *prop = GET_UCD(c);

            switch(*data)
            {
            case PT_ANY:
                if (t == XCL_PROP) return !negated;
                break;

            case PT_LAMP:
                if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
                        prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
                break;

            case PT_GC:
                if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP))
                    return !negated;
                break;

            case PT_PC:
                if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated;
                break;

            case PT_SC:
                if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated;
                break;

            case PT_ALNUM:
                if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
                        PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP))
                    return !negated;
                break;

            case PT_SPACE:    /* Perl space */
                if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
                        c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
                        == (t == XCL_PROP))
                    return !negated;
                break;

            case PT_PXSPACE:  /* POSIX space */
                if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
                        c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
                        c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
                    return !negated;
                break;

            case PT_WORD:
                if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
                        PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
                        == (t == XCL_PROP))
                    return !negated;
                break;

            /* Below 0xa0 only dollar, commercial at and grave accent have
            the property; from 0xa0 upwards everything has it except the
            values 0xd800 to 0xdfff. */

            case PT_UCNC:
                if (c < 0xa0)
                {
                    if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
                            c == CHAR_GRAVE_ACCENT) == (t == XCL_PROP))
                        return !negated;
                }
                else
                {
                    if ((c < 0xd800 || c > 0xdfff) == (t == XCL_PROP))
                        return !negated;
                }
                break;

            /* This should never occur, but compilers may mutter if there is no
            default. */

            default:
                return FALSE;
            }

            /* Step over the property type and value. */
            data += 2;
        }
#endif  /* SUPPORT_UCP */
    }

    return negated;   /* char did not match */
}
/* Decide whether a character is a member of an extended class (XCLASS).

Arguments:
  c           the character to test
  data        points to the XCLASS flag byte; the optional bitmap and the
              list of items follow it

Returns:      TRUE if c matches the class, FALSE if it does not; FALSE is
              also returned if an unrecognized property type is found
*/

BOOL
_pcre_xclass(int c, const uschar *data)
{
const int flags = *data++;                   /* data now points past the flags */
const BOOL on_hit = (flags & XCL_NOT) == 0;  /* result when an item matches */
int item;

/* A bitmap, when present, is 32 bytes covering the characters below 256. A
miss in the bitmap is not final, because the item list may still contain
ranges that start below 256. */

if ((flags & XCL_MAP) != 0)
  {
  if (c < 256 && (data[c/8] & (1 << (c&7))) != 0) return on_hit;
  data += 32;
  }

/* Walk the item list up to XCL_END. XCL_PROP and XCL_NOTPROP items are only
handled when UCP support is compiled in. */

for (item = *data++; item != XCL_END; item = *data++)
  {
  int first, last;

  if (item == XCL_SINGLE)
    {
    GETCHARINC(first, data);
    if (first == c) return on_hit;
    }
  else if (item == XCL_RANGE)
    {
    GETCHARINC(first, data);
    GETCHARINC(last, data);
    if (first <= c && c <= last) return on_hit;
    }

#ifdef SUPPORT_UCP
  else  /* XCL_PROP or XCL_NOTPROP: a property type and a value follow */
    {
    int chartype, script;
    const int category = _pcre_ucp_findprop(c, &chartype, &script);
    const BOOL want = (item == XCL_PROP);   /* FALSE for XCL_NOTPROP */

    switch(*data)
      {
      case PT_ANY:
      if (want) return on_hit;
      break;

      case PT_LAMP:
      if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt)
            == want)
        return on_hit;
      break;

      case PT_GC:
      if ((category == data[1]) == want) return on_hit;
      break;

      case PT_PC:
      if ((chartype == data[1]) == want) return on_hit;
      break;

      case PT_SC:
      if ((script == data[1]) == want) return on_hit;
      break;

      /* An unknown property type should never be seen; the result is FALSE
      whether or not the class is negated. */

      default:
      return FALSE;
      }

    data += 2;   /* step over the property type and value */
    }
#endif  /* SUPPORT_UCP */
  }

return !on_hit;   /* nothing in the class matched */
}
Beispiel #7
0
/* Decide whether a character is a member of an extended class (XCLASS).

Arguments:
  c           the character to test
  data        points to the XCLASS flag unit; the optional bitmap and the
              list of items follow it
  utf         TRUE if the item values must be read with GETCHARINC; it is
              forced to TRUE in the 8-bit library

Returns:      TRUE if c matches the class, FALSE if it does not; FALSE is
              also returned if an unrecognized property type is found
*/

BOOL
PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
{
pcre_uchar t;
BOOL negated = (*data & XCL_NOT) != 0;

(void)utf;
#ifdef COMPILE_PCRE8
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
utf = TRUE;
#endif

/* Character values < 256 are matched against a bitmap, if one is present. If
not, we still carry on, because there may be ranges that start below 256 in the
additional data. */

if (c < 256)
  {
  /* When XCL_HASPROP is not set, the answer for a character below 256 is
  taken directly from the bitmap, without applying "negated"; with no bitmap
  the character cannot be in the list at all. */

  if ((*data & XCL_HASPROP) == 0)
    {
    if ((*data & XCL_MAP) == 0) return negated;
    return (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0;
    }
  if ((*data & XCL_MAP) != 0 &&
    (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
    return !negated; /* char found */
  }

/* First skip the bit map if present. Then match against the list of Unicode
properties or large chars or ranges that end with a large char. We won't ever
encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */

/* The bitmap is 32 bytes long, so the skip is scaled to code units. */

if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);

while ((t = *data++) != XCL_END)
  {
  pcre_uint32 x, y;
  if (t == XCL_SINGLE)
    {
#ifdef SUPPORT_UTF
    if (utf)
      {
      GETCHARINC(x, data); /* macro generates multiple statements */
      }
    else
#endif
      x = *data++;
    if (c == x) return !negated;
    }
  else if (t == XCL_RANGE)
    {
#ifdef SUPPORT_UTF
    if (utf)
      {
      GETCHARINC(x, data); /* macro generates multiple statements */
      GETCHARINC(y, data); /* macro generates multiple statements */
      }
    else
#endif
      {
      x = *data++;
      y = *data++;
      }
    if (c >= x && c <= y) return !negated;
    }

#ifdef SUPPORT_UCP
  else  /* XCL_PROP & XCL_NOTPROP */
    {
    /* data[0] is the property type and data[1] the value to compare with,
    for those types that need one. Every test is compared with isprop so that
    XCL_NOTPROP inverts its sense. */

    const ucd_record *prop = GET_UCD(c);
    BOOL isprop = t == XCL_PROP;

    switch(*data)
      {
      case PT_ANY:
      if (isprop) return !negated;
      break;

      case PT_LAMP:
      if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
           prop->chartype == ucp_Lt) == isprop) return !negated;
      break;

      case PT_GC:
      if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
        return !negated;
      break;

      case PT_PC:
      if ((data[1] == prop->chartype) == isprop) return !negated;
      break;

      case PT_SC:
      if ((data[1] == prop->script) == isprop) return !negated;
      break;

      case PT_ALNUM:
      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
        return !negated;
      break;

      /* Perl space used to exclude VT, but from Perl 5.18 it is included,
      which means that Perl space and POSIX space are now identical. PCRE
      was changed at release 8.34. */

      case PT_SPACE:    /* Perl space */
      case PT_PXSPACE:  /* POSIX space */
      switch(c)
        {
        HSPACE_CASES:
        VSPACE_CASES:
        if (isprop) return !negated;
        break;

        default:
        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
          return !negated;
        break;
        }
      break;

      case PT_WORD:
      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
           PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
             == isprop)
        return !negated;
      break;

      /* Below 0xa0 only dollar, commercial at and grave accent have the
      property; from 0xa0 upwards everything has it except the values 0xd800
      to 0xdfff. */

      case PT_UCNC:
      if (c < 0xa0)
        {
        if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
             c == CHAR_GRAVE_ACCENT) == isprop)
          return !negated;
        }
      else
        {
        if ((c < 0xd800 || c > 0xdfff) == isprop)
          return !negated;
        }
      break;

      /* The following three properties can occur only in an XCLASS, as there
      is no \p or \P coding for them. */

      /* Graphic character. Implement this as not Z (space or separator) and
      not C (other), except for Cf (format) with a few exceptions. This seems
      to be what Perl does. The exceptional characters are:

      U+061C           Arabic Letter Mark
      U+180E           Mongolian Vowel Separator
      U+2066 - U+2069  Various "isolate"s
      */

      case PT_PXGRAPH:
      if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
            (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
              (prop->chartype == ucp_Cf &&
                c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
         )) == isprop)
        return !negated;
      break;

      /* Printable character: same as graphic, with the addition of Zs, i.e.
      not Zl and not Zp, and U+180E. */

      case PT_PXPRINT:
      if ((prop->chartype != ucp_Zl &&
           prop->chartype != ucp_Zp &&
            (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
              (prop->chartype == ucp_Cf &&
                c != 0x061c && (c < 0x2066 || c > 0x2069))
         )) == isprop)
        return !negated;
      break;

      /* Punctuation: all Unicode punctuation, plus ASCII characters that
      Unicode treats as symbols rather than punctuation, for Perl
      compatibility (these are $+<=>^`|~). The symbol test is therefore
      restricted to code points below 128. A limit of 256 would also accept
      symbols in the range 128-255, such as U+00A2 (cent sign) and U+00D7
      (multiplication sign), which are not punctuation. */

      case PT_PXPUNCT:
      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
            (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
        return !negated;
      break;

      /* This should never occur, but compilers may mutter if there is no
      default. */

      default:
      return FALSE;
      }

    data += 2;   /* step over the property type and value */
    }
#endif  /* SUPPORT_UCP */
  }

return negated;   /* char did not match */
}