示例#1
0
/* Print the Unicode properties of one code point as a single line on stdout.

Argument:   c   the code point to describe
Returns:    nothing

The line has the form

  <c in hex> <general category>: <particular type>, <script>, <grapheme break>

If UCD_OTHERCASE(c) differs from c, the other case is appended as ", <hex>",
and, when UCD_CASESET(c) is non-zero, so are the remaining members of the
caseless set that it selects. Any property value that has no entry in the
tables below is shown as "??". */

static void
print_prop(int c)
{
int type = UCD_CATEGORY(c);
int fulltype = UCD_CHARTYPE(c);
int script = UCD_SCRIPT(c);
int gbprop = UCD_GRAPHBREAK(c);
int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c);

unsigned char *fulltypename = US"??";
unsigned char *typename = US"??";
unsigned char *scriptname = US"??";
unsigned char *graphbreak = US"??";

/* General category */

switch (type)
  {
  case ucp_C: typename = US"Control"; break;
  case ucp_L: typename = US"Letter"; break;
  case ucp_M: typename = US"Mark"; break;
  case ucp_N: typename = US"Number"; break;
  case ucp_P: typename = US"Punctuation"; break;
  case ucp_S: typename = US"Symbol"; break;
  case ucp_Z: typename = US"Separator"; break;
  }

/* Particular type within the general category */

switch (fulltype)
  {
  case ucp_Cc: fulltypename = US"Control"; break;
  case ucp_Cf: fulltypename = US"Format"; break;
  case ucp_Cn: fulltypename = US"Unassigned"; break;
  case ucp_Co: fulltypename = US"Private use"; break;
  case ucp_Cs: fulltypename = US"Surrogate"; break;
  case ucp_Ll: fulltypename = US"Lower case letter"; break;
  case ucp_Lm: fulltypename = US"Modifier letter"; break;
  case ucp_Lo: fulltypename = US"Other letter"; break;
  case ucp_Lt: fulltypename = US"Title case letter"; break;
  case ucp_Lu: fulltypename = US"Upper case letter"; break;
  case ucp_Mc: fulltypename = US"Spacing mark"; break;
  case ucp_Me: fulltypename = US"Enclosing mark"; break;
  case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
  case ucp_Nd: fulltypename = US"Decimal number"; break;
  case ucp_Nl: fulltypename = US"Letter number"; break;
  case ucp_No: fulltypename = US"Other number"; break;
  case ucp_Pc: fulltypename = US"Connector punctuation"; break;
  case ucp_Pd: fulltypename = US"Dash punctuation"; break;
  case ucp_Pe: fulltypename = US"Close punctuation"; break;
  case ucp_Pf: fulltypename = US"Final punctuation"; break;
  case ucp_Pi: fulltypename = US"Initial punctuation"; break;
  case ucp_Po: fulltypename = US"Other punctuation"; break;
  case ucp_Ps: fulltypename = US"Open punctuation"; break;
  case ucp_Sc: fulltypename = US"Currency symbol"; break;
  case ucp_Sk: fulltypename = US"Modifier symbol"; break;
  case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
  case ucp_So: fulltypename = US"Other symbol"; break;
  case ucp_Zl: fulltypename = US"Line separator"; break;
  case ucp_Zp: fulltypename = US"Paragraph separator"; break;
  case ucp_Zs: fulltypename = US"Space separator"; break;
  }

/* Grapheme break property */

switch(gbprop)
  {
  case ucp_gbCR:           graphbreak = US"CR"; break;
  case ucp_gbLF:           graphbreak = US"LF"; break;
  case ucp_gbControl:      graphbreak = US"Control"; break;
  case ucp_gbExtend:       graphbreak = US"Extend"; break;
  case ucp_gbPrepend:      graphbreak = US"Prepend"; break;
  case ucp_gbSpacingMark:  graphbreak = US"SpacingMark"; break;
  case ucp_gbL:            graphbreak = US"Hangul syllable type L"; break;
  case ucp_gbV:            graphbreak = US"Hangul syllable type V"; break;
  case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
  case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
  case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
  case ucp_gbOther:        graphbreak = US"Other"; break;
  }

/* Script. Each script is shown using its underscore-separated property
name. */

switch(script)
  {
  case ucp_Arabic:      scriptname = US"Arabic"; break;
  case ucp_Armenian:    scriptname = US"Armenian"; break;
  case ucp_Balinese:    scriptname = US"Balinese"; break;
  case ucp_Bengali:     scriptname = US"Bengali"; break;
  case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;
  case ucp_Braille:     scriptname = US"Braille"; break;
  case ucp_Buginese:    scriptname = US"Buginese"; break;
  case ucp_Buhid:       scriptname = US"Buhid"; break;
  case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
  case ucp_Cherokee:    scriptname = US"Cherokee"; break;
  case ucp_Common:      scriptname = US"Common"; break;
  case ucp_Coptic:      scriptname = US"Coptic"; break;
  case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;
  case ucp_Cypriot:     scriptname = US"Cypriot"; break;
  case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;
  case ucp_Deseret:     scriptname = US"Deseret"; break;
  case ucp_Devanagari:  scriptname = US"Devanagari"; break;
  case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;
  case ucp_Georgian:    scriptname = US"Georgian"; break;
  case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;
  case ucp_Gothic:      scriptname = US"Gothic"; break;
  case ucp_Greek:       scriptname = US"Greek"; break;
  case ucp_Gujarati:    scriptname = US"Gujarati"; break;
  case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;
  case ucp_Han:         scriptname = US"Han"; break;
  case ucp_Hangul:      scriptname = US"Hangul"; break;
  case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;
  case ucp_Hebrew:      scriptname = US"Hebrew"; break;
  case ucp_Hiragana:    scriptname = US"Hiragana"; break;
  case ucp_Inherited:   scriptname = US"Inherited"; break;
  case ucp_Kannada:     scriptname = US"Kannada"; break;
  case ucp_Katakana:    scriptname = US"Katakana"; break;
  case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;
  case ucp_Khmer:       scriptname = US"Khmer"; break;
  case ucp_Lao:         scriptname = US"Lao"; break;
  case ucp_Latin:       scriptname = US"Latin"; break;
  case ucp_Limbu:       scriptname = US"Limbu"; break;
  case ucp_Linear_B:    scriptname = US"Linear_B"; break;
  case ucp_Malayalam:   scriptname = US"Malayalam"; break;
  case ucp_Mongolian:   scriptname = US"Mongolian"; break;
  case ucp_Myanmar:     scriptname = US"Myanmar"; break;
  case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
  case ucp_Nko:         scriptname = US"Nko"; break;
  case ucp_Ogham:       scriptname = US"Ogham"; break;
  case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;
  case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
  case ucp_Oriya:       scriptname = US"Oriya"; break;
  case ucp_Osmanya:     scriptname = US"Osmanya"; break;
  case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;
  case ucp_Phoenician:  scriptname = US"Phoenician"; break;
  case ucp_Runic:       scriptname = US"Runic"; break;
  case ucp_Shavian:     scriptname = US"Shavian"; break;
  case ucp_Sinhala:     scriptname = US"Sinhala"; break;
  case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
  case ucp_Syriac:      scriptname = US"Syriac"; break;
  case ucp_Tagalog:     scriptname = US"Tagalog"; break;
  case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;
  case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;
  case ucp_Tamil:       scriptname = US"Tamil"; break;
  case ucp_Telugu:      scriptname = US"Telugu"; break;
  case ucp_Thaana:      scriptname = US"Thaana"; break;
  case ucp_Thai:        scriptname = US"Thai"; break;
  case ucp_Tibetan:     scriptname = US"Tibetan"; break;
  case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;
  case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;
  case ucp_Yi:          scriptname = US"Yi"; break;
  /* New for Unicode 5.1: */
  case ucp_Carian:      scriptname = US"Carian"; break;
  case ucp_Cham:        scriptname = US"Cham"; break;
  case ucp_Kayah_Li:    scriptname = US"Kayah_Li"; break;
  case ucp_Lepcha:      scriptname = US"Lepcha"; break;
  case ucp_Lycian:      scriptname = US"Lycian"; break;
  case ucp_Lydian:      scriptname = US"Lydian"; break;
  case ucp_Ol_Chiki:    scriptname = US"Ol_Chiki"; break;
  case ucp_Rejang:      scriptname = US"Rejang"; break;
  case ucp_Saurashtra:  scriptname = US"Saurashtra"; break;
  case ucp_Sundanese:   scriptname = US"Sundanese"; break;
  case ucp_Vai:         scriptname = US"Vai"; break;
  /* New for Unicode 5.2: */
  case ucp_Avestan:     scriptname = US"Avestan"; break;
  case ucp_Bamum:       scriptname = US"Bamum"; break;
  case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
  case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
  case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
  case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
  case ucp_Javanese:    scriptname = US"Javanese"; break;
  case ucp_Kaithi:      scriptname = US"Kaithi"; break;
  case ucp_Lisu:        scriptname = US"Lisu"; break;
  case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
  case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
  case ucp_Old_Turkic:  scriptname = US"Old_Turkic"; break;
  case ucp_Samaritan:   scriptname = US"Samaritan"; break;
  case ucp_Tai_Tham:    scriptname = US"Tai_Tham"; break;
  case ucp_Tai_Viet:    scriptname = US"Tai_Viet"; break;
  /* New for Unicode 6.0.0 */
  case ucp_Batak:       scriptname = US"Batak"; break;
  case ucp_Brahmi:      scriptname = US"Brahmi"; break;
  case ucp_Mandaic:     scriptname = US"Mandaic"; break;

  /* New for Unicode 6.1.0 */
  case ucp_Chakma:               scriptname = US"Chakma"; break;
  case ucp_Meroitic_Cursive:     scriptname = US"Meroitic_Cursive"; break;
  case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
  case ucp_Miao:                 scriptname = US"Miao"; break;
  case ucp_Sharada:              scriptname = US"Sharada"; break;
  case ucp_Sora_Sompeng:         scriptname = US"Sora_Sompeng"; break;
  case ucp_Takri:                scriptname = US"Takri"; break;

  /* New for Unicode 7.0.0 */
  case ucp_Bassa_Vah:          scriptname = US"Bassa_Vah"; break;
  case ucp_Caucasian_Albanian: scriptname = US"Caucasian_Albanian"; break;
  case ucp_Duployan:           scriptname = US"Duployan"; break;
  case ucp_Elbasan:            scriptname = US"Elbasan"; break;
  case ucp_Grantha:            scriptname = US"Grantha"; break;
  case ucp_Khojki:             scriptname = US"Khojki"; break;
  case ucp_Khudawadi:          scriptname = US"Khudawadi"; break;
  case ucp_Linear_A:           scriptname = US"Linear_A"; break;
  case ucp_Mahajani:           scriptname = US"Mahajani"; break;
  case ucp_Manichaean:         scriptname = US"Manichaean"; break;
  case ucp_Mende_Kikakui:      scriptname = US"Mende_Kikakui"; break;
  case ucp_Modi:               scriptname = US"Modi"; break;
  case ucp_Mro:                scriptname = US"Mro"; break;
  case ucp_Nabataean:          scriptname = US"Nabataean"; break;
  case ucp_Old_North_Arabian:  scriptname = US"Old_North_Arabian"; break;
  case ucp_Old_Permic:         scriptname = US"Old_Permic"; break;
  case ucp_Pahawh_Hmong:       scriptname = US"Pahawh_Hmong"; break;
  case ucp_Palmyrene:          scriptname = US"Palmyrene"; break;
  case ucp_Psalter_Pahlavi:    scriptname = US"Psalter_Pahlavi"; break;
  case ucp_Pau_Cin_Hau:        scriptname = US"Pau_Cin_Hau"; break;
  case ucp_Siddham:            scriptname = US"Siddham"; break;
  case ucp_Tirhuta:            scriptname = US"Tirhuta"; break;
  case ucp_Warang_Citi:        scriptname = US"Warang_Citi"; break;

  /* New for Unicode 8.0.0 */
  case ucp_Ahom:                  scriptname = US"Ahom"; break;
  case ucp_Anatolian_Hieroglyphs: scriptname = US"Anatolian_Hieroglyphs"; break;
  case ucp_Hatran:                scriptname = US"Hatran"; break;
  case ucp_Multani:               scriptname = US"Multani"; break;
  case ucp_Old_Hungarian:         scriptname = US"Old_Hungarian"; break;
  case ucp_SignWriting:           scriptname = US"SignWriting"; break;
  }

printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
if (othercase != c)
  {
  printf(", %04x", othercase);
  if (caseset != 0)
    {
    /* Start one entry before the set because the loop pre-increments, so the
    first value examined is at offset caseset. The list ends at the first value
    that is not less than NOTACHAR. c and othercase are skipped, as they have
    already been printed. */

    const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
    while (*(++p) < NOTACHAR)
      if (*p != othercase && *p != c) printf(", %04x", *p);
    }
  }
printf("\n");
}
示例#2
0
/* Match a compiled pattern against a subject and build, in a caller-supplied
buffer, a copy of the subject in which the matched text is replaced by the
processed replacement string. With PCRE2_SUBSTITUTE_GLOBAL the match/replace
step is repeated along the subject.

Arguments:
  code          the compiled pattern
  subject       the subject string
  length        length of the subject, or PCRE2_ZERO_TERMINATED
  start_offset  offset in the subject at which matching starts; the text
                  before it is copied to the output unchanged
  options       match options plus substitute options; the latter are split
                  off into suboptions before pcre2_match() is called.
                  PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are rejected.
  match_data    match data block, or NULL to have one created (and freed
                  again on exit) by this function
  mcontext      match context, or NULL
  replacement   the replacement string
  rlength       length of the replacement, or PCRE2_ZERO_TERMINATED
  buffer        where the result is written
  blength       on entry, the size of buffer; on exit:
                  - after success, the length used, excluding the trailing
                    zero;
                  - when overflowed is set, buff_length + extra_needed;
                  - after an exit via PTREXIT (replacement string errors),
                    the offset of ptr within the replacement;
                  - otherwise PCRE2_UNSET

Returns:        the number of substitutions made, or a PCRE2_ERROR_xxx code.
                  When match_data was supplied by the caller, the same value
                  is also stored in match_data->rc.

NOTE(review): CHECKMEMCPY is a macro defined outside this view. It appears to
append to the output while maintaining buff_offset, lengthleft, overflowed and
extra_needed, and to be the only user of the NOROOM label - confirm against
its definition. */

PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
  pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
  PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
{
int rc;
int subs;
int forcecase = 0;
int forcecasereset = 0;
uint32_t ovector_count;
uint32_t goptions = 0;
uint32_t suboptions;
BOOL match_data_created = FALSE;
BOOL literal = FALSE;
BOOL overflowed = FALSE;
#ifdef SUPPORT_UNICODE
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
#endif
PCRE2_UCHAR temp[6];
PCRE2_SPTR ptr;
PCRE2_SPTR repend;
PCRE2_SIZE extra_needed = 0;
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
PCRE2_SIZE *ovector;

/* Remember the buffer size, then preset the returned length to PCRE2_UNSET so
that error exits which do not set it explicitly leave a defined value. */

buff_offset = 0;
lengthleft = buff_length = *blength;
*blength = PCRE2_UNSET;

/* Partial matching is not valid. */

if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
  return PCRE2_ERROR_BADOPTION;

/* If no match data block is provided, create one. NOTE(review): the casts
below pass the match context, or failing that the compiled code, as a general
context; presumably both start with the same memory-control fields - confirm
against the structure definitions. */

if (match_data == NULL)
  {
  pcre2_general_context *gcontext = (mcontext == NULL)?
    (pcre2_general_context *)code :
    (pcre2_general_context *)mcontext;
  match_data = pcre2_match_data_create_from_pattern(code, gcontext);
  if (match_data == NULL) return PCRE2_ERROR_NOMEMORY;
  match_data_created = TRUE;
  }
ovector = pcre2_get_ovector_pointer(match_data);
ovector_count = pcre2_get_ovector_count(match_data);

/* Find lengths of zero-terminated strings and the end of the replacement. */

if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
repend = replacement + rlength;

/* Check UTF replacement string if necessary. */

#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
  {
  rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar));
  if (rc != 0)
    {
    match_data->leftchar = 0;
    goto EXIT;
    }
  }
#endif  /* SUPPORT_UNICODE */

/* Save the substitute options and remove them from the match options. */

suboptions = options & SUBSTITUTE_OPTIONS;
options &= ~SUBSTITUTE_OPTIONS;

/* Copy up to the start offset */

if (start_offset > length)
  {
  match_data->leftchar = 0;
  rc = PCRE2_ERROR_BADOFFSET;
  goto EXIT;
  }
CHECKMEMCPY(subject, start_offset);

/* Loop for global substituting. */

subs = 0;
do
  {
  /* The stack holds (ptr, repend) pairs that are saved while a nested
  substring of the replacement is being reprocessed. */

  PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
  uint32_t ptrstackptr = 0;

  rc = pcre2_match(code, subject, length, start_offset, options|goptions,
    match_data, mcontext);

#ifdef SUPPORT_UNICODE
  if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
#endif

  /* Any error other than no match returns the error code. No match when not
  doing the special after-empty-match global rematch, or when at the end of the
  subject, breaks the global loop. Otherwise, advance the starting point by one
  character, copying it to the output, and try again. */

  if (rc < 0)
    {
    PCRE2_SIZE save_start;

    if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
    if (goptions == 0 || start_offset >= length) break;

    /* Advance by one code point. Then, if CRLF is a valid newline sequence and
    we have advanced into the middle of it, advance one more code point. In
    other words, do not start in the middle of CRLF, even if CR and LF on their
    own are valid newlines. */

    save_start = start_offset++;
    if (subject[start_offset-1] == CHAR_CR &&
        code->newline_convention != PCRE2_NEWLINE_CR &&
        code->newline_convention != PCRE2_NEWLINE_LF &&
        start_offset < length &&
        subject[start_offset] == CHAR_LF)
      start_offset++;

    /* Otherwise, in UTF mode, advance past any secondary code points. */

    else if ((code->overall_options & PCRE2_UTF) != 0)
      {
#if PCRE2_CODE_UNIT_WIDTH == 8
      while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
        start_offset++;
#elif PCRE2_CODE_UNIT_WIDTH == 16
      while (start_offset < length &&
            (subject[start_offset] & 0xfc00) == 0xdc00)
        start_offset++;
#endif
      }

    /* Copy what we have advanced past, reset the special global options, and
    continue to the next match. */

    fraglength = start_offset - save_start;
    CHECKMEMCPY(subject + save_start, fraglength);
    goptions = 0;
    continue;
    }

  /* Handle a successful match. Matches that use \K to end before they start
  are not supported. */

  if (ovector[1] < ovector[0])
    {
    rc = PCRE2_ERROR_BADSUBSPATTERN;
    goto EXIT;
    }

  /* Count substitutions with a paranoid check for integer overflow; surely no
  real call to this function would ever hit this! */

  if (subs == INT_MAX)
    {
    rc = PCRE2_ERROR_TOOMANYREPLACE;
    goto EXIT;
    }
  subs++;

  /* Copy the text leading up to the match. A zero return from pcre2_match() is
  replaced by the ovector count. */

  if (rc == 0) rc = ovector_count;
  fraglength = ovector[0] - start_offset;
  CHECKMEMCPY(subject + start_offset, fraglength);

  /* Process the replacement string. Literal mode is set by \Q, but only in
  extended mode when backslashes are being interpreted. In extended mode we
  must handle nested substrings that are to be reprocessed. */

  ptr = replacement;
  for (;;)
    {
    uint32_t ch;
    unsigned int chlen;

    /* If at the end of a nested substring, pop the stack. */

    if (ptr >= repend)
      {
      if (ptrstackptr <= 0) break;       /* End of replacement string */
      repend = ptrstack[--ptrstackptr];
      ptr = ptrstack[--ptrstackptr];
      continue;
      }

    /* Handle the next character */

    if (literal)
      {
      if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
        {
        literal = FALSE;
        ptr += 2;
        continue;
        }
      goto LOADLITERAL;
      }

    /* Not in literal mode. */

    if (*ptr == CHAR_DOLLAR_SIGN)
      {
      int group, n;
      uint32_t special = 0;
      BOOL inparens;
      BOOL star;
      PCRE2_SIZE sublength;
      PCRE2_SPTR text1_start = NULL;
      PCRE2_SPTR text1_end = NULL;
      PCRE2_SPTR text2_start = NULL;
      PCRE2_SPTR text2_end = NULL;
      PCRE2_UCHAR next;
      PCRE2_UCHAR name[33];

      /* A dollar at the very end is an error; a doubled dollar is loaded as a
      literal starting at the second dollar. */

      if (++ptr >= repend) goto BAD;
      if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;

      group = -1;
      n = 0;
      inparens = FALSE;
      star = FALSE;

      if (next == CHAR_LEFT_CURLY_BRACKET)
        {
        if (++ptr >= repend) goto BAD;
        next = *ptr;
        inparens = TRUE;
        }

      if (next == CHAR_ASTERISK)
        {
        if (++ptr >= repend) goto BAD;
        next = *ptr;
        star = TRUE;
        }

      if (!star && next >= CHAR_0 && next <= CHAR_9)
        {
        group = next - CHAR_0;
        while (++ptr < repend)
          {
          next = *ptr;
          if (next < CHAR_0 || next > CHAR_9) break;
          group = group * 10 + next - CHAR_0;

          /* A check for a number greater than the highest captured group
          is sufficient here; no need for a separate overflow check. If unknown
          groups are to be treated as unset, just skip over any remaining
          digits and carry on. */

          if (group > code->top_bracket)
            {
            if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
              {
              while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
              break;
              }
            else
              {
              rc = PCRE2_ERROR_NOSUBSTRING;
              goto PTREXIT;
              }
            }
          }
        }
      else
        {
        /* Collect a name made of word characters. The name buffer holds at
        most 32 code units plus a terminating zero. */

        const uint8_t *ctypes = code->tables + ctypes_offset;
        while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
          {
          name[n++] = next;
          if (n > 32) goto BAD;
          if (++ptr >= repend) break;
          next = *ptr;
          }
        if (n == 0) goto BAD;
        name[n] = 0;
        }

      /* In extended mode we recognize ${name:+set text:unset text} and
      ${name:-default text}. */

      if (inparens)
        {
        if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
             !star && ptr < repend - 2 && next == CHAR_COLON)
          {
          special = *(++ptr);
          if (special != CHAR_PLUS && special != CHAR_MINUS)
            {
            rc = PCRE2_ERROR_BADSUBSTITUTION;
            goto PTREXIT;
            }

          text1_start = ++ptr;
          rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
          if (rc != 0) goto PTREXIT;
          text1_end = ptr;

          if (special == CHAR_PLUS && *ptr == CHAR_COLON)
            {
            text2_start = ++ptr;
            rc = find_text_end(code, &ptr, repend, TRUE);
            if (rc != 0) goto PTREXIT;
            text2_end = ptr;
            }
          }

        else
          {
          if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
            {
            rc = PCRE2_ERROR_REPMISSINGBRACE;
            goto PTREXIT;
            }
          }

        ptr++;
        }

      /* Have found a syntactically correct group number or name, or *name.
      Only *MARK is currently recognized. */

      if (star)
        {
        if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
          {
          PCRE2_SPTR mark = pcre2_get_mark(match_data);
          if (mark != NULL)
            {
            PCRE2_SPTR mark_start = mark;
            while (*mark != 0) mark++;
            fraglength = mark - mark_start;
            CHECKMEMCPY(mark_start, fraglength);
            }
          }
        else goto BAD;
        }

      /* Substitute the contents of a group. We don't use substring_copy
      functions any more, in order to support case forcing. */

      else
        {
        PCRE2_SPTR subptr, subptrend;

        /* Find a number for a named group. In case there are duplicate names,
        search for the first one that is set. If the name is not found when
        PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a
        non-existent group. */

        if (group < 0)
          {
          PCRE2_SPTR first, last, entry;
          rc = pcre2_substring_nametable_scan(code, name, &first, &last);
          if (rc == PCRE2_ERROR_NOSUBSTRING &&
              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
            {
            group = code->top_bracket + 1;
            }
          else
            {
            if (rc < 0) goto PTREXIT;
            for (entry = first; entry <= last; entry += rc)
              {
              uint32_t ng = GET2(entry, 0);
              if (ng < ovector_count)
                {
                if (group < 0) group = ng;          /* First in ovector */
                if (ovector[ng*2] != PCRE2_UNSET)
                  {
                  group = ng;                       /* First that is set */
                  break;
                  }
                }
              }

            /* If group is still negative, it means we did not find a group
            that is in the ovector. Just set the first group. */

            if (group < 0) group = GET2(first, 0);
            }
          }

        /* We now have a group that is identified by number. Find the length of
        the captured string. If a group in a non-special substitution is unset
        when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */

        rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
        if (rc < 0)
          {
          if (rc == PCRE2_ERROR_NOSUBSTRING &&
              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
            {
            rc = PCRE2_ERROR_UNSET;
            }
          if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
          if (special == 0)                           /* Plain substitution */
            {
            if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
            goto PTREXIT;                             /* Else error */
            }
          }

        /* If special is '+' we have a 'set' and possibly an 'unset' text,
        both of which are reprocessed when used. If special is '-' we have a
        default text for when the group is unset; it must be reprocessed. */

        if (special != 0)
          {
          if (special == CHAR_MINUS)
            {
            if (rc == 0) goto LITERAL_SUBSTITUTE;
            text2_start = text1_start;
            text2_end = text1_end;
            }

          /* Save the current position and end of the replacement, then switch
          to scanning the chosen text; the saved pair is popped at the top of
          the scanning loop when the nested text has been used up. */

          if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
          ptrstack[ptrstackptr++] = ptr;
          ptrstack[ptrstackptr++] = repend;

          if (rc == 0)
            {
            ptr = text1_start;
            repend = text1_end;
            }
          else
            {
            ptr = text2_start;
            repend = text2_end;
            }
          continue;
          }

        /* Otherwise we have a literal substitution of a group's contents. */

        LITERAL_SUBSTITUTE:
        subptr = subject + ovector[group*2];
        subptrend = subject + ovector[group*2 + 1];

        /* Substitute a literal string, possibly forcing alphabetic case. A
        positive forcecase forces upper case and a negative one lower case;
        after each character it reverts to forcecasereset. */

        while (subptr < subptrend)
          {
          GETCHARINCTEST(ch, subptr);
          if (forcecase != 0)
            {
#ifdef SUPPORT_UNICODE
            if (utf)
              {
              uint32_t type = UCD_CHARTYPE(ch);
              if (PRIV(ucp_gentype)[type] == ucp_L &&
                  type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
                ch = UCD_OTHERCASE(ch);
              }
            else
#endif
              {
              if (((code->tables + cbits_offset +
                  ((forcecase > 0)? cbit_upper:cbit_lower)
                  )[ch/8] & (1 << (ch%8))) == 0)
                ch = (code->tables + fcc_offset)[ch];
              }
            forcecase = forcecasereset;
            }

#ifdef SUPPORT_UNICODE
          if (utf) chlen = PRIV(ord2utf)(ch, temp); else
#endif
            {
            temp[0] = ch;
            chlen = 1;
            }
          CHECKMEMCPY(temp, chlen);
          }
        }
      }

    /* Handle an escape sequence in extended mode. We can use check_escape()
    to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
    the case-forcing escapes are not supported in pcre2_compile() so must be
    recognized here. */

    else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
              *ptr == CHAR_BACKSLASH)
      {
      int errorcode;

      /* \L and \U set both the current and the reset value, so they stay in
      force; \l and \u set a zero reset value, so they affect only the next
      character that is output. */

      if (ptr < repend - 1) switch (ptr[1])
        {
        case CHAR_L:
        forcecase = forcecasereset = -1;
        ptr += 2;
        continue;

        case CHAR_l:
        forcecase = -1;
        forcecasereset = 0;
        ptr += 2;
        continue;

        case CHAR_U:
        forcecase = forcecasereset = 1;
        ptr += 2;
        continue;

        case CHAR_u:
        forcecase = 1;
        forcecasereset = 0;
        ptr += 2;
        continue;

        default:
        break;
        }

      ptr++;  /* Point after \ */
      rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
        code->overall_options, FALSE, NULL);
      if (errorcode != 0) goto BADESCAPE;

      switch(rc)
        {
        case ESC_E:
        forcecase = forcecasereset = 0;
        continue;

        case ESC_Q:
        literal = TRUE;
        continue;

        case 0:      /* Data character */
        goto LITERAL;

        default:
        goto BADESCAPE;
        }
      }

    /* Handle a literal code unit */

    else
      {
      LOADLITERAL:
      GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */

      LITERAL:
      if (forcecase != 0)
        {
#ifdef SUPPORT_UNICODE
        if (utf)
          {
          uint32_t type = UCD_CHARTYPE(ch);
          if (PRIV(ucp_gentype)[type] == ucp_L &&
              type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
            ch = UCD_OTHERCASE(ch);
          }
        else
#endif
          {
          if (((code->tables + cbits_offset +
              ((forcecase > 0)? cbit_upper:cbit_lower)
              )[ch/8] & (1 << (ch%8))) == 0)
            ch = (code->tables + fcc_offset)[ch];
          }
        forcecase = forcecasereset;
        }

#ifdef SUPPORT_UNICODE
      if (utf) chlen = PRIV(ord2utf)(ch, temp); else
#endif
        {
        temp[0] = ch;
        chlen = 1;
        }
      CHECKMEMCPY(temp, chlen);
      } /* End handling a literal code unit */
    }   /* End of loop for scanning the replacement. */

  /* The replacement has been copied to the output. Update the start offset to
  point to the rest of the subject string. If we matched an empty string,
  do the magic for global matches. */

  start_offset = ovector[1];
  goptions = (ovector[0] != ovector[1])? 0 :
    PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
  } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */

/* Copy the rest of the subject, followed by a terminating zero. */

fraglength = length - start_offset;
CHECKMEMCPY(subject + start_offset, fraglength);
temp[0] = 0;
CHECKMEMCPY(temp , 1);

/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
and matching has carried on after a full buffer, in order to compute the length
needed. Otherwise, an overflow generates an immediate error return. */

if (overflowed)
  {
  rc = PCRE2_ERROR_NOMEMORY;
  *blength = buff_length + extra_needed;
  }

/* After a successful execution, return the number of substitutions and set the
length of buffer used, excluding the trailing zero. */

else
  {
  rc = subs;
  *blength = buff_offset - 1;
  }

/* Common exit: free a match data block that was created here, or else record
the return code in the caller's block. */

EXIT:
if (match_data_created) pcre2_match_data_free(match_data);
  else match_data->rc = rc;
return rc;

/* Not referenced by any goto in this function's visible text; presumably the
target used by CHECKMEMCPY when the buffer is full - confirm. */

NOROOM:
rc = PCRE2_ERROR_NOMEMORY;
goto EXIT;

/* Errors in the replacement string. BAD and BADESCAPE set the error code and
then join PTREXIT, which reports the offset reached in the replacement via
*blength. */

BAD:
rc = PCRE2_ERROR_BADREPLACEMENT;
goto PTREXIT;

BADESCAPE:
rc = PCRE2_ERROR_BADREPESCAPE;

PTREXIT:
*blength = (PCRE2_SIZE)(ptr - replacement);
goto EXIT;
}
示例#3
0
/* Test whether a character is matched by an extended character class.

Arguments:
  c           the character to test
  data        the class data: a flags byte, then a 32-byte bit map if the
                XCL_MAP flag is set, then a list of items ending with XCL_END

Returns:      TRUE if the character matches, else FALSE. The sense of the
                test is inverted when the XCL_NOT flag is set.
*/

BOOL
_pcre_xclass(int c, const uschar *data)
{
const BOOL inverted = (*data & XCL_NOT) != 0;
const BOOL has_map = (*data & XCL_MAP) != 0;
int item;

/* A character below 256 is looked up in the bit map when there is one. A miss
is not final, because the item list may hold ranges that start below 256. */

if (c < 256 && has_map && (data[1 + c/8] & (1 << (c&7))) != 0)
  return !inverted;   /* char found */

/* Move past the flags byte and, if present, the bit map. */

data++;
if (has_map) data += 32;

/* Scan the item list: single characters, ranges, and Unicode properties.
XCL_PROP and XCL_NOTPROP are never encountered when UCP support is not
compiled. */

for (;;)
  {
  int lo, hi;

  item = *data++;
  if (item == XCL_END) break;

  if (item == XCL_SINGLE)
    {
    GETCHARINC(lo, data);
    if (c == lo) return !inverted;
    continue;
    }

  if (item == XCL_RANGE)
    {
    GETCHARINC(lo, data);
    GETCHARINC(hi, data);
    if (lo <= c && c <= hi) return !inverted;
    continue;
    }

#ifdef SUPPORT_UCP
  /* XCL_PROP & XCL_NOTPROP: work out whether the character has the property,
  then compare that with the sense the item asks for. */

    {
    const BOOL want = (item == XCL_PROP);
    const int chartype = UCD_CHARTYPE(c);
    BOOL has;

    switch(*data)
      {
      case PT_ANY:
      has = TRUE;
      break;

      case PT_LAMP:
      has = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
      break;

      case PT_GC:
      has = (data[1] == _pcre_ucp_gentype[chartype]);
      break;

      case PT_PC:
      has = (data[1] == chartype);
      break;

      case PT_SC:
      has = (data[1] == UCD_SCRIPT(c));
      break;

      /* This should never occur, but compilers may mutter if there is no
      default. */

      default:
      return FALSE;
      }

    if (has == want) return !inverted;
    data += 2;   /* Skip the property type and value */
    }
#endif  /* SUPPORT_UCP */
  }

return inverted;   /* char did not match */
}