Example #1
0
	int ICUUnicodeSupport::_compareNoCase<1>(ConstStringHolder<1> _first, ConstStringHolder<1> _second)
	{
		// Compares two UTF-8 strings ignoring case: both are decoded one
		// code point at a time, each code point is mapped with u_tolower(),
		// and the first differing pair decides the result (-1 or 1).
		int32_t firstLen = _first.length();
		int32_t secondLen = _second.length();
		int32_t firstPos = 0;
		int32_t secondPos = 0;

		// Any value other than 2 from checkStringEnd() is returned as the
		// final result (2 presumably means both strings still have input).
		int status = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
		if(status != 2)
			return status;

		const uint8_t* firstBytes = _first.c_str();
		const uint8_t* secondBytes = _second.c_str();
		do
		{
			UChar32 lhs, rhs;
			U8_NEXT(firstBytes, firstPos, firstLen, lhs);
			U8_NEXT(secondBytes, secondPos, secondLen, rhs);

			lhs = u_tolower(lhs);
			rhs = u_tolower(rhs);
			if(lhs < rhs)
				return -1;
			if(lhs > rhs)
				return 1;

			status = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
		}
		while(status == 2);

		return status;
	}
Example #2
0
/*
 * Compares two UTF-8 buffers code point by code point.
 *
 * Returns 0 when the buffers compare equal, otherwise a non-zero value
 * whose sign tells which side is smaller (negative: buf1 < buf2).
 *
 * caseinsensitive:   when non-zero, two differing ASCII letters are
 *                    compared after converting both to upper case.
 * genericwhitespace: when non-zero, two differing code points that are
 *                    both whitespace (per utf8_codepoint_is_whitespace)
 *                    are treated as equal.
 *
 * When one buffer is a prefix of the other, the first code point of the
 * remaining tail is returned (negated when the tail belongs to buf2).
 */
static int utf8_comparison(const unsigned char *buf1, size_t buf1size,
        const unsigned char *buf2, size_t buf2size,
        int caseinsensitive, int genericwhitespace) {
    UChar32 left, right;
    int pos1 = 0;
    int pos2 = 0;
    while (pos1 < (int)buf1size && pos2 < (int)buf2size) {
        U8_NEXT(buf1, pos1, buf1size, left);
        assert(left >= 0);
        U8_NEXT(buf2, pos2, buf2size, right);
        assert(right >= 0);
        if (left == right) {
            continue;  // identical code points, nothing more to check
        }
        if (caseinsensitive && left < 127 && right < 127) {
            // manual ASCII-only case folding
            char a = (char)left;
            char b = (char)right;
            int a_is_lower = (a >= 'a' && a <= 'z');
            int a_is_upper = (a >= 'A' && a <= 'Z');
            int b_is_lower = (b >= 'a' && b <= 'z');
            int b_is_upper = (b >= 'A' && b <= 'Z');
            if ((a_is_lower || a_is_upper) && (b_is_lower || b_is_upper)) {
                // bring both letters to upper case before comparing
                if (a_is_lower) {
                    a = a - ('a' - 'A');
                }
                if (b_is_lower) {
                    b = b - ('a' - 'A');
                }
                if (a != b) {
                    return ((int)a) - ((int)b);
                }
                continue;
            }
        }
        if (genericwhitespace &&
                utf8_codepoint_is_whitespace(left) &&
                utf8_codepoint_is_whitespace(right)) {
            continue;  // any whitespace matches any other whitespace
        }
        // genuinely different: report the code point difference
        return ((int)left) - ((int)right);
    }
    if (pos1 < (int)buf1size) {
        // buf1 has leftover content
        U8_NEXT(buf1, pos1, buf1size, left);
        return left;
    }
    if (pos2 < (int)buf2size) {
        // buf2 has leftover content
        U8_NEXT(buf2, pos2, buf2size, right);
        return -((int)right);
    }
    return 0;
}
Example #3
0
/*
 * Case-maps the UTF-8 bytes in [srcStart..srcLimit[ and appends the
 * result to sink. csc is updated with the bounds of the code point
 * currently being mapped so that the mapping function can inspect
 * surrounding context through utf8_caseContextIterator.
 * The loop stops early as soon as errorCode reports a failure.
 */
static void
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcStart, int32_t srcLimit,
         icu::ByteSink &sink, icu::Edits *edits,
         UErrorCode &errorCode) {
    for (int32_t pos = srcStart; U_SUCCESS(errorCode) && pos < srcLimit;) {
        const int32_t cpStart = pos;
        csc->cpStart = cpStart;
        UChar32 c;
        U8_NEXT(src, pos, srcLimit, c);
        csc->cpLimit = pos;
        if (c >= 0) {
            // Well-formed code point: map it and append the mapping result.
            const UChar *s;
            c = map(c, utf8_caseContextIterator, csc, &s, caseLocale);
            appendResult(pos - cpStart, c, s, sink, options, edits, errorCode);
        } else {
            // Malformed UTF-8: pass the consumed bytes through unchanged.
            ByteSinkUtil::appendUnchanged(src + cpStart, pos - cpStart,
                                          sink, options, edits, errorCode);
        }
    }
}
Example #4
0
/*
 * Converts a PHP value into a single Unicode code point.
 *
 * Accepts either an integer or a string holding exactly one UTF-8 encoded
 * code point. On success stores the code point in *pcp and returns SUCCESS;
 * otherwise records U_ILLEGAL_ARGUMENT_ERROR plus a message via the intl
 * error API and returns FAILURE.
 */
static inline int convert_cp(UChar32* pcp, zval *zcp) {
	zend_long cp = -1;

	switch (Z_TYPE_P(zcp)) {
		case IS_LONG:
			cp = Z_LVAL_P(zcp);
			break;

		case IS_STRING: {
			int32_t i = 0;
			size_t zcp_len = Z_STRLEN_P(zcp);

			if (ZEND_SIZE_T_INT_OVFL(zcp_len)) {
				intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
				intl_error_set_custom_msg(NULL, "Input string is too long.", 0);
				return FAILURE;
			}

			/* Decode the first code point; it must consume the whole string. */
			U8_NEXT(Z_STRVAL_P(zcp), i, zcp_len, cp);
			if ((size_t)i != zcp_len) {
				intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
				intl_error_set_custom_msg(NULL, "Passing a UTF-8 character for codepoint requires a string which is exactly one UTF-8 codepoint long.", 0);
				return FAILURE;
			}
			break;
		}

		default:
			intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
			intl_error_set_custom_msg(NULL, "Invalid parameter for unicode point.  Must be either integer or UTF-8 sequence.", 0);
			return FAILURE;
	}

	/* Reject anything outside [UCHAR_MIN_VALUE, UCHAR_MAX_VALUE]. */
	if ((cp < UCHAR_MIN_VALUE) || (cp > UCHAR_MAX_VALUE)) {
		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
		intl_error_set_custom_msg(NULL, "Codepoint out of range", 0);
		return FAILURE;
	}

	*pcp = (UChar32)cp;
	return SUCCESS;
}
Example #5
0
/*
 * Checks that U8_NEXT and U8_PREV decode every noncharacter in the table
 * below to a value for which U_IS_UNICODE_NONCHAR holds; each failure is
 * reported through log_err.
 */
static void TestNextPrevNonCharacters() {
    /* UTF-8 encodings of several noncharacters */
    static const uint8_t nonChars[]={
        0xef, 0xb7, 0x90,       /* U+fdd0 */
        0xef, 0xbf, 0xbf,       /* U+ffff */
        0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
        0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
        0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
    };

    UChar32 ch;
    int32_t idx;

    /* forward pass over the whole table */
    idx=0;
    while(idx<(int32_t)sizeof(nonChars)) {
        U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
        if(!U_IS_UNICODE_NONCHAR(ch)) {
            log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
        }
    }

    /* backward pass, starting from the end of the table */
    idx=(int32_t)sizeof(nonChars);
    while(idx>0) {
        U8_PREV(nonChars, 0, idx, ch);
        if(!U_IS_UNICODE_NONCHAR(ch)) {
            log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
        }
    }
}
Example #6
0
/*
 * Case-maps the UTF-8 bytes in [srcStart..srcLimit[ into dest.
 * csc receives the bounds of the code point being mapped so that the
 * mapping function can look at neighbouring text through
 * utf8_caseContextIterator.
 *
 * Returns the index reached in dest after the last appendResult() call;
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when that index exceeds
 * destCapacity.
 */
static int32_t
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
         uint8_t *dest, int32_t destCapacity,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcStart, int32_t srcLimit,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t srcIndex;
    int32_t destIndex=0;
    int32_t locCache=csm->locCache;

    /* map one code point per iteration */
    for(srcIndex=srcStart; srcIndex<srcLimit;) {
        csc->cpStart=srcIndex;
        U8_NEXT(src, srcIndex, srcLimit, c);
        csc->cpLimit=srcIndex;
        c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
        destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
Example #7
0
/*
 * Context iterator callback over UTF-8 text described by a UCaseContext.
 *
 * dir<0 restarts iteration backward from csc->cpStart, dir>0 restarts it
 * forward from csc->cpLimit, and dir==0 continues in the direction stored
 * by the previous call. Returns the next code point in that direction, or
 * U_SENTINEL once the [csc->start, csc->limit[ bounds are reached.
 */
static UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
    UCaseContext *csc=(UCaseContext *)context;
    UChar32 c;

    if(dir==0) {
        /* keep going the way we went last time */
        dir=csc->dir;
    } else {
        /* (re)position the iterator and remember the new direction */
        csc->index= dir<0 ? csc->cpStart : csc->cpLimit;
        csc->dir=dir;
    }

    if(dir<0 && csc->start<csc->index) {
        U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
        return c;
    }
    if(dir>=0 && csc->index<csc->limit) {
        U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
        return c;
    }
    return U_SENTINEL;
}
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param omit_na single logical value
 * @return logical vector
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *                using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *    FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   // One selection flag per element. A random-access vector is needed
   // because, with a recycled pattern, i is not visited as 0,1,2,...,n
   // but as 0,pat_len,2*pat_len,1,pat_len+1 and so on.
   std::vector<int> which(vectorize_length);
   int result_counter = 0; // number of elements that will be passed on

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         // a missing value is either dropped or kept as NA
         if (!omit_na1) {
            which[i] = NA_LOGICAL;
            result_counter++;
         }
         else
            which[i] = FALSE;
         continue;
      }

      const UnicodeSet* cur_set = &pattern_cont.get(i);
      R_len_t     cur_len = str_cont.get(i).length();
      const char* cur_str = str_cont.get(i).c_str();

      // scan the string until the first code point contained in the set
      bool matched = false;
      UChar32 chr = 0;
      R_len_t j = 0;
      while (!matched && j < cur_len) {
         U8_NEXT(cur_str, j, cur_len, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (cur_set->contains(chr))
            matched = true;
      }

      which[i] = (matched != negate_1) ? 1 : 0;
      if (which[i]) result_counter++;
   }

   SEXP ret;
   STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter));
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to UTF-32
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          use vector<UChar32> buf instead of R_alloc;
 *          warn and set NULL on improper UTF-8 byte sequences
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
 *          Use UChar32* instead of vector<UChar32> as ::data is C++11
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf32(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   // Determine the longest input (in bytes); the scratch buffer is sized
   // to hold that many code points. It starts at 1 to avoid allocating
   // an empty buffer.
   R_len_t bufsize = 1;
   for (R_len_t i=0; i<n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t cur_len = str_cont.get(i).length();
         if (bufsize < cur_len) bufsize = cur_len;
      }
   }

   // a single over-sized buffer shared by all the strings
   // (deque<UChar32> was slower than this)
   UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32));
   if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, n));

   for (R_len_t i=0; i<n; ++i) {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();

      // decode until the end of the string or the first invalid sequence
      UChar32 c = (UChar32)0;
      R_len_t k = 0;
      for (R_len_t j = 0; j < sn; ) {
         U8_NEXT(s, j, sn, c);
         buf[k++] = (int)c;
         if (c < 0) break;
      }

      if (c < 0) {
         // improper UTF-8: warn and give NULL for this element
         Rf_warning(MSG__INVALID_UTF8);
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      SEXP conv;
      STRI__PROTECT(conv = Rf_allocVector(INTSXP, k));
      memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k);
      SET_VECTOR_ELT(ret, i, conv);
      STRI__UNPROTECT(1);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
}
/**
 * Substitutes vector elements if a pattern occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param value character vector
 * @return character vector
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *   FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));

   int vectorize_length = LENGTH(str);
   int value_length = LENGTH(value);
   if (value_length == 0)
      Rf_error(MSG__REPLACEMENT_ZERO);

   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerUTF8 value_cont(value, value_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   // number of replacements made so far; value is cycled through with it
   R_len_t used_values = 0;
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet* cur_set = &pattern_cont.get(i);
      R_len_t     cur_len = str_cont.get(i).length();
      const char* cur_str = str_cont.get(i).c_str();

      // look for the first code point that belongs to the set
      bool found = false;
      UChar32 chr = 0;
      R_len_t j = 0;
      while (!found && j < cur_len) {
         U8_NEXT(cur_str, j, cur_len, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (cur_set->contains(chr))
            found = true;
      }

      if (found == negate_1) {
         // not selected: the original element is kept
         SET_STRING_ELT(ret, i, str_cont.toR(i));
      }
      else {
         // selected: substitute the next replacement value
         SET_STRING_ELT(ret, i, value_cont.toR((used_values++)%value_length));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #11
0
/*
 * Computes, one converter at a time, whether every well-formed code point
 * of the UTF-8 text is contained in that converter's Unicode set (with
 * excludedCodePoints, if given, added to the set first).
 *
 * Returns a uprv_malloc()ed array of gCountAvailable flags; the slot for
 * each encoding is the one reported by findIndex().
 */
static UBool *
getResultsManually(const char** encodings, int32_t num_encodings,
                   const char *utf8, int32_t length,
                   const USet* excludedCodePoints, const UConverterUnicodeSet whichSet) {
  UBool* resultsManually;
  int32_t i;

  resultsManually = (UBool*) uprv_malloc(gCountAvailable);
  uprv_memset(resultsManually, 0, gCountAvailable);

  for(i = 0 ; i < num_encodings ; i++) {
    UErrorCode status = U_ZERO_ERROR;
    /* get unicode set for that converter */
    USet* set = uset_openEmpty();
    UConverter* test_converter = ucnv_open(encodings[i], &status);
    UChar32 cp = 0;
    int32_t offset = 0;
    int32_t encIndex;
    UBool allConvertible = TRUE;

    ucnv_getUnicodeSet(test_converter, set,
                       whichSet, &status);
    if (excludedCodePoints != NULL) {
      uset_addAll(set, excludedCodePoints);
    }
    uset_freeze(set);

    encIndex = findIndex(encodings[i]);
    /*
     * The following is almost, but not entirely, the same as
     * resultsManually[encIndex] =
     *   (UBool)(uset_spanUTF8(set, utf8, length, USET_SPAN_SIMPLE) == length);
     * They might be different if the set contains strings,
     * or if the utf8 string contains an illegal sequence.
     *
     * The UConverterSelector does not currently handle strings that can be
     * converted, and it treats an illegal sequence as convertible
     * while uset_spanUTF8() treats it like U+FFFD which may not be convertible.
     */
    while(allConvertible && offset<length) {
      U8_NEXT(utf8, offset, length, cp);
      if (cp >= 0 && !uset_contains(set, cp)) {
        allConvertible = FALSE;
      }
    }
    resultsManually[encIndex] = allConvertible;

    uset_close(set);
    ucnv_close(test_converter);
  }
  return resultsManually;
}
Example #12
0
/*
 * Counts the code points in a UTF-8 buffer of bufsize bytes.
 * Every decoded code point is asserted to be non-negative.
 */
static int utf8_length(const unsigned char *buf, size_t bufsize) {
    size_t count = 0;
    int offset;
    for (offset = 0; offset < (int)bufsize; count++) {
        UChar32 cp;
        U8_NEXT(buf, offset, bufsize, cp);
        assert(cp >= 0);
    }
    return count;
}
/**
 * Extract first or last occurrences of a character class in each string
 *
 * @param str character vector
 * @param pattern character vector
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-08)
 * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri__extract_firstlast_charclass(SEXP str, SEXP pattern, bool first)
{
   // The prepared arguments may be freshly allocated objects, so they are
   // PROTECTed from the garbage collector for as long as this function
   // allocates (Rf_allocVector, Rf_mkCharLenCE below).
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      // NA is the result unless a match is found below
      SET_STRING_ELT(ret, i, NA_STRING);

      if (str_cont.isNA(i) || pattern_cont.isNA(i))
         continue;

      CharClass pattern_cur = pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t j, jlast;
      UChar32 chr;

      if (first) {
         // forward scan; [jlast, j) are the bytes of the code point just read
         for (jlast=j=0; j<str_cur_n; ) {
            U8_NEXT(str_cur_s, j, str_cur_n, chr);
            if (pattern_cur.test(chr)) {
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast, j-jlast, CE_UTF8));
               break; // that's enough for first
            }
            jlast = j;
         }
      }
      else {
         // backward scan; [j, jlast) are the bytes of the code point just read
         for (jlast=j=str_cur_n; j>0; ) {
            U8_PREV(str_cur_s, 0, j, chr); // go backwards
            if (pattern_cur.test(chr)) {
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+j, jlast-j, CE_UTF8));
               break; // that's enough for last
            }
            jlast = j;
         }
      }
   }

   UNPROTECT(3); // str, pattern, ret
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A)
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toascii(SEXP str)
{
   // The prepared argument may be a freshly allocated object; keep it
   // PROTECTed while the result strings are being allocated.
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN
   SEXP ret;
   PROTECT(ret = Rf_allocVector(STRSXP, n));
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }
      else if (IS_ASCII(curs)) {
         // already ASCII: reuse the element as is
         SET_STRING_ELT(ret, i, curs);
      }
      else if (IS_UTF8(curs)) {
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         // TODO: buffer reuse....
         // one output byte per decoded code point, so at most curn bytes
         String8 buf(curn+1); // this may be 4 times too much
         R_len_t k = 0;
         UChar32 c;
         for (int j=0; j<curn; ) {
            U8_NEXT(curs_tab, j, curn, c);
            // c < 0 marks an invalid UTF-8 sequence; it must be substituted
            // too, otherwise (char)c would emit a non-ASCII byte
            if (c < 0 || c > ASCII_MAXCHARCODE)
               buf.data()[k++] = (char)ASCII_SUBSTITUTE;
            else
               buf.data()[k++] = (char)c;
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE
      }
      else { // some 8-bit encoding
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         // TODO: buffer reuse....
         String8 buf(curn+1);
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               buf.data()[k++] = curs_tab[j];
            else {
               buf.data()[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE
      }
   }
   UNPROTECT(2); // str, ret
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #15
0
/*
 * Calls do_something for each code point decoded from a UTF-8 buffer of
 * bufsize bytes. Stops and returns 0 as soon as the callback returns 0;
 * returns 1 once the whole buffer has been visited.
 * Every decoded code point is asserted to be non-negative.
 */
static int utf8_iterate_codepoints(const unsigned char *buf, size_t bufsize,
        int (*do_something)(UChar32 c)) {
    int offset;
    for (offset = 0; offset < (int)bufsize; ) {
        UChar32 cp;
        U8_NEXT(buf, offset, bufsize, cp);
        assert(cp >= 0);
        if (do_something(cp) == 0) {
            return 0;
        }
    }
    return 1;
}
Example #16
0
File: utf.cpp Project: MGKhKhD/meta
uint64_t length(const std::string& str)
{
    const char* s = str.c_str();
    auto length = static_cast<int32_t>(str.length());
    uint64_t count = 0;
    for (int32_t i = 0; i < length;)
    {
        UChar32 c;
        U8_NEXT(s, i, length, c);
        ++count;
    }
    return count;
}
Example #17
0
/*
 * Returns an integer vector with the length, in characters, of every
 * element of the character vector vec.
 *
 * - NA strings give NA_INTEGER.
 * - Latin-1 and single-byte native strings: the byte count.
 * - UTF-8 strings: the number of code points; an invalid sequence gives
 *   a warning and NA_INTEGER.
 * - "bytes"-encoded strings raise an R error.
 * - Other native encodings: characters are counted with an ICU converter
 *   for the default encoding; a conversion failure gives a warning and
 *   NA_INTEGER.
 */
SEXP R_stri_length(SEXP vec) {
  int vec_len = LENGTH(vec);
  SEXP ret = PROTECT(allocVector(INTSXP, vec_len));
  int* retint = INTEGER(ret);

  for (int i = 0; i < vec_len; i++) {
    SEXP str = STRING_ELT(vec, i);
    if (str == NA_STRING) {
      retint[i] = NA_INTEGER;
      continue;
    }

    int str_len = LENGTH(str);
    if (getCharCE(str) == CE_LATIN1 || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_LATIN1)) {
      retint[i] = str_len;
    } else if (getCharCE(str) == CE_BYTES) {
      UNPROTECT(1);
      error("Invalid encoding: bytes.");
    } else if (getCharCE(str) == CE_UTF8 || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_UTF8)) {
      UChar32 out = 0;
      const char* source = CHAR(str);
      R_len_t j = 0;
      int count;
      for (count = 0; out >= 0 && j < str_len; count++) {
        U8_NEXT(source, j, str_len, out); // faster than U8_FWD_1 & gives bad UChar32s
      }
      if (out < 0) {
        warning("Invalid UTF8 string: %s", source);
        retint[i] = NA_INTEGER;
      } else {
        retint[i] = count;
      }
    } else if (native_is_singlebyte()) { // native-8bit
      retint[i] = str_len;
    } else {
      // native encoding, not 8 bit
      UErrorCode status = U_ZERO_ERROR;
      UConverter* conv = ucnv_open(NULL, &status);
      if (U_FAILURE(status)) {
        UNPROTECT(1);
        error("Could not open a converter for the native encoding.");
      }
      const char* source = CHAR(str);
      const char* sourceLimit = source + str_len;
      int j;
      for (j = 0; source != sourceLimit; j++) {
        ucnv_getNextUChar(conv, &source, sourceLimit, &status);
        // once status is a failure the converter stops consuming input,
        // so continuing would never reach sourceLimit
        if (U_FAILURE(status)) break;
      }
      ucnv_close(conv); // release the converter opened for this element
      if (U_FAILURE(status)) {
        warning("Invalid string in native encoding.");
        retint[i] = NA_INTEGER;
      } else {
        retint[i] = j; // all right, we got it!
      }
    }
  }

  UNPROTECT(1);
  return ret;
}
Example #18
0
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single bool
 * @param max_count single int
 * @return logical vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-02)
 *          Use StrContainerUTF8 and CharClass classes
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-15)
 *          Use StrContainerCharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern,
    SEXP negate, SEXP max_count)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      // NA for missing inputs and for everything visited once the
      // max_count budget has dropped to zero
      if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      const UnicodeSet* cur_set = &pattern_cont.get(i);
      R_len_t     cur_len = str_cont.get(i).length();
      const char* cur_str = str_cont.get(i).c_str();

      // stop at the first code point that is a member of the set
      bool found = false;
      UChar32 chr = 0;
      R_len_t j = 0;
      while (!found && j < cur_len) {
         U8_NEXT(cur_str, j, cur_len, chr);
         if (chr < 0) // invalid UTF-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (cur_set->contains(chr))
            found = true;
      }

      ret_tab[i] = (found != negate_1) ? 1 : 0;

      // a positive max_count is consumed by each TRUE result
      if (ret_tab[i] && max_count_1 > 0) --max_count_1;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #19
0
/* {{{ php_converter_append_toUnicode_target */
/*
 * Appends the value held in val to the UTF-16 output at args->target.
 * NULL is skipped, an integer is written as one code point (as a surrogate
 * pair above U+FFFF), a string is decoded with U8_NEXT and written unit by
 * unit, and an array is handled by recursing into each of its values.
 * Every write is guarded by TARGET_CHECK.
 * (The remainder of this function lies outside the visible excerpt.)
 */
static void php_converter_append_toUnicode_target(zval *val, UConverterToUnicodeArgs *args, php_converter_object *objval) {
	switch (Z_TYPE_P(val)) {
		case IS_NULL:
			/* Code unit is being skipped */
			return;
		case IS_LONG:
		{
			zend_long lval = Z_LVAL_P(val);
			/* Only 0..0x10FFFF is accepted as a code point */
			if ((lval < 0) || (lval > 0x10FFFF)) {
				php_converter_throw_failure(objval, U_ILLEGAL_ARGUMENT_ERROR, "Invalid codepoint U+%04lx", lval);
				return;
			}
			if (lval > 0xFFFF) {
				/* Supplemental planes U+010000 - U+10FFFF */
				if (TARGET_CHECK(args, 2)) {
					/* TODO: Find the ICU call which does this properly */
					/* high 10 bits go into the lead unit, low 10 bits into the trail unit */
					*(args->target++) = (UChar)(((lval - 0x10000) >> 10)   | 0xD800);
					*(args->target++) = (UChar)(((lval - 0x10000) & 0x3FF) | 0xDC00);
				}
				return;
			}
			/* Non-surrogate BMP codepoint */
			if (TARGET_CHECK(args, 1)) {
				*(args->target++) = (UChar)lval;
			}
			return;
		}
		case IS_STRING:
		{
			const char *strval = Z_STRVAL_P(val);
			int i = 0, strlen = Z_STRLEN_P(val);

			/* One target unit is written per decoded code point */
			while((i != strlen) && TARGET_CHECK(args, 1)) {
				/* NOTE(review): c is a UChar (presumably 16 bits wide), so a code
				 * point above U+FFFF or the negative error value produced by
				 * U8_NEXT would not be representable here - confirm. */
				UChar c;
				U8_NEXT(strval, i, strlen, c);
				*(args->target++) = c;
			}
			return;
		}
		case IS_ARRAY:
		{
			HashTable *ht = Z_ARRVAL_P(val);
			zval *tmpzval;

			/* Append every element of the array in turn */
			ZEND_HASH_FOREACH_VAL(ht, tmpzval) {
				php_converter_append_toUnicode_target(tmpzval, args, objval);
			} ZEND_HASH_FOREACH_END();
			return;
		}
Example #20
0
// Reports whether the text starting at offset i (up to length) reaches a
// cased letter before any code point that is neither cased nor
// case-ignorable. Case-ignorable code points are skipped.
UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    for (; i < length;) {
        UChar32 c;
        U8_NEXT(s, i, length, c);
        const int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) == 0) {
            // Not ignorable: this code point decides the answer.
            // Cased letter -> TRUE; uncased -> FALSE.
            return (type != UCASE_NONE) ? TRUE : FALSE;
        }
        // Case-ignorable, keep looking.
    }
    return FALSE;  // Ran out of text without meeting a cased letter.
}
/**
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
void StriContainerByteSearch::upgradePatternCaseInsensitive()
{
   UChar32 c = 0;
   R_len_t j = 0;
   patternLenCaseInsensitive = 0;
   while (j < patternLen) {
      U8_NEXT(patternStr, j, patternLen, c);
#ifndef NDEBUG
      if (patternLenCaseInsensitive >= this->kmpMaxSize)
         throw StriException("!NDEBUG: StriContainerByteSearch::upgradePatternCaseInsensitive()");
#endif
      patternStrCaseInsensitive[patternLenCaseInsensitive++] = u_toupper(c);
   }
   patternStrCaseInsensitive[patternLenCaseInsensitive] = 0;
}
Example #22
0
	// Lower-cases a UTF-8 string in place: each code point is decoded with
	// U8_NEXT at the read offset (ofs), mapped with u_tolower(), and
	// re-encoded with U8_APPEND_UNSAFE at the write offset (ofs2) of the
	// same buffer. An empty string is left untouched.
	//
	// NOTE(review): reading and writing share one buffer. If a lowercase
	// form ever needs more UTF-8 bytes than the original, the write offset
	// would overtake the read offset and overwrite unread input; if it needs
	// fewer, the bytes between ofs2 and len are left as they were, since the
	// string is not shortened here. Confirm that callers can tolerate this.
	// NOTE(review): U8_NEXT's result is not checked for an error value
	// before it is passed on to u_tolower()/U8_APPEND_UNSAFE - confirm the
	// input is guaranteed to be well-formed.
	void ICUUnicodeSupport::_toLowerCase<1>(StringHolder<1> _str)
	{
		if(!_str.empty())
		{
			uint8_t* buf = &_str[0];
			int32_t len = _str.length();
			// ofs: read position, ofs2: write position
			int32_t ofs = 0, ofs2 = 0;
			while(ofs != len)
			{
				UChar32 c;
				U8_NEXT(buf, ofs, len, c);
				c = u_tolower(c);
				U8_APPEND_UNSAFE( buf, ofs2, c);
			}
		}
	}
Example #23
0
// Converts UTF-8 text to a sequence of UTF-16 code units.
//
// Code points up to U+FFFF yield one unit, higher ones a lead/trail
// surrogate pair. An ill-formed UTF-8 sequence (reported by U8_NEXT as a
// negative value) is replaced with U+FFFD instead of being turned into
// meaningless surrogates.
std::vector<uint16_t> utf8ToUtf16(const std::string& text) {
  std::vector<uint16_t> result;
  // A UTF-16 encoding never needs more units than the UTF-8 one has bytes.
  result.reserve(text.size());

  // Fetch the pointer once: U8_NEXT is a macro and would otherwise
  // re-evaluate text.c_str() on every access.
  const char* bytes = text.data();
  const int32_t textLength = static_cast<int32_t>(text.size());
  int32_t i = 0;
  while (i < textLength) {
    // Must be signed: U8_NEXT signals an error with a negative value.
    int32_t c = 0;
    U8_NEXT(bytes, i, textLength, c);
    if (c < 0) {
      c = 0xFFFD;  // REPLACEMENT CHARACTER
    }
    if (U16_LENGTH(c) == 1) {
      result.push_back(static_cast<uint16_t>(c));
    } else {
      result.push_back(U16_LEAD(c));
      result.push_back(U16_TRAIL(c));
    }
  }
  return result;
}
Example #24
0
// Decodes one UTF-8 character from str starting at *begin (bounded by
// length). On return *begin points at the last byte consumed rather than
// one past it. Returns true and stores the code point when it is a valid
// Unicode character; otherwise stores kUnicodeReplacementCharacter and
// returns false.
bool readUTFChar(const char* str, int* begin, int length, unsigned* codePointOut)
{
    int decoded; // Signed on purpose: U8_NEXT may write -1 into it.
    U8_NEXT(str, *begin, length, decoded);

    // U8_NEXT leaves the offset after the character; step back onto the
    // final byte that was read.
    --(*begin);

    if (!U_IS_UNICODE_CHAR(decoded)) {
        *codePointOut = kUnicodeReplacementCharacter;
        return false;
    }
    *codePointOut = static_cast<unsigned>(decoded);
    return true;
}
Example #25
0
// Measures the run of leading whitespace in a UTF-8 string.
// pointerOffset receives its length in bytes and characterOffset its
// length in characters. Counting stops at the first character that fails
// to decode or that u_isUWhiteSpace() rejects.
static void countLeadingSpaces(const CString& utf8String, int32_t& pointerOffset, int32_t& characterOffset)
{
    pointerOffset = 0;
    characterOffset = 0;
    const char* stringData = utf8String.data();
    UChar32 character = 0;
    for (;;) {
        if (static_cast<unsigned>(pointerOffset) >= utf8String.length())
            return;

        // Decode into a scratch offset so that pointerOffset only advances
        // past characters that were accepted.
        int32_t scratchOffset = pointerOffset;
        U8_NEXT(stringData, scratchOffset, static_cast<int32_t>(utf8String.length()), character);

        const bool isSpace = character >= 0 && u_isUWhiteSpace(character);
        if (!isSpace)
            return;

        pointerOffset = scratchOffset;
        characterOffset++;
    }
}
/**
 * Test whether the current pattern occurs in the search string
 * starting exactly at a given byte index.
 *
 * In case-insensitive mode, code points decoded from the search string
 * are mapped with u_toupper() and compared one by one with
 * patternStrCaseInsensitive; otherwise the comparison is byte-wise
 * against patternStr.
 *
 * If the search string ends before the whole pattern has been compared,
 * the result is false; no bytes at or beyond searchLen are read.
 *
 * @param byteindex byte offset in the search string where the match
 *    should start
 * @return true if the pattern matches at byteindex
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
bool StriContainerByteSearch::startsWith(R_len_t byteindex)
{
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      for (R_len_t k = 0; k < patternLenCaseInsensitive; ++k) {
         // U8_NEXT reads its first byte unconditionally, so make sure
         // there is still something left to decode
         if (byteindex >= searchLen)
            return false;

         UChar32 c;
         U8_NEXT(searchStr, byteindex, searchLen, c);
         c = u_toupper(c);
         if (patternStrCaseInsensitive[k] != c)
            return false;
      }
   }
   else {
      // the remaining text is too short to hold the pattern
      if (patternLen > searchLen - byteindex)
         return false;

      for (R_len_t k=0; k < patternLen; ++k)
         if (searchStr[byteindex+k] != patternStr[k])
            return false;
   }

   return true; // found
}
/** Locate the first match at or after a given position - KMP
 *
 * Scans the search string forwards, using kmpNext to fall back in the
 * pattern on a mismatch. On success, searchPos and searchEnd are set to
 * the byte offsets where the match starts and ends (exclusive); on
 * failure both are set to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          KMP - first approach
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgraded; separate method
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosFwd_KMP(R_len_t startPos)
{
   int cursor = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // byte-wise comparison
      for (; cursor < searchLen; ) {
         while (patternPos >= 0 && patternStr[patternPos] != searchStr[cursor])
            patternPos = kmpNext[patternPos];
         ++patternPos;
         ++cursor;
         if (patternPos != patternLen)
            continue;

         searchEnd = cursor;
         searchPos = cursor-patternLen;
         return searchPos;
      }
   }
   else {
      // code point-wise comparison on upper-cased code points
      UChar32 cp = 0;
      for (; cursor < searchLen; ) {
         U8_NEXT(searchStr, cursor, searchLen, cp);
         cp = u_toupper(cp);
         while (patternPos >= 0 && patternStrCaseInsensitive[patternPos] != cp)
            patternPos = kmpNext[patternPos];
         ++patternPos;
         if (patternPos != patternLenCaseInsensitive)
            continue;

         searchEnd = cursor;

         // the match start lies patternLenCaseInsensitive code points
         // (not bytes) before the current position
         searchPos = cursor;
         for (R_len_t remaining = patternLenCaseInsensitive; remaining > 0; --remaining) {
            U8_BACK_1((const uint8_t*)searchStr, 0, searchPos);
         }
         return searchPos;
      }
   }

   // nothing found
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
/**
 * Detect if a character class occurs in a string
 *
 * For each (recycled) pair of a string and a pattern, the result is
 * TRUE if at least one code point of the string passes the pattern's
 * CharClass test, FALSE otherwise, and NA if either element is NA.
 *
 * @param str character vector
 * @param pattern character vector
 * @return logical vector
 *
 * @version 0.1 (Bartek Tartanus)
 * @version 0.2 (Marek Gagolewski, 2013-06-02) Use StrContainerUTF8 and CharClass classes
 * @version 0.3 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.4 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern)
{
   // The prepared arguments may be freshly allocated objects; they must be
   // protected so that a later allocation cannot trigger their collection.
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      CharClass pattern_cur = pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      ret_tab[i] = FALSE;
      R_len_t j;
      UChar32 chr;

      // stop at the first code point that belongs to the class
      for (j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (pattern_cur.test(chr)) {
            ret_tab[i] = TRUE;
            break;
         }
      }
   }

   UNPROTECT(3); // str, pattern, ret
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #29
0
/*
 * Case-maps [srcStart..srcLimit[ but takes
 * context [0..srcLength[ into account.
 *
 * Decodes src one code point at a time, passes each code point to `map`
 * and writes the outcome as bytes into dest. Byte sequences that U8_NEXT
 * reports as ill-formed are copied to dest unchanged.
 *
 * Returns the final destIndex. If that value exceeds destCapacity,
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 */
static int32_t
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
         uint8_t *dest, int32_t destCapacity,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcStart, int32_t srcLimit,
         UErrorCode *pErrorCode) {
    const UChar *s;             /* filled in by map(); forwarded to appendResult() */
    UChar32 c, c2 = 0;
    int32_t srcIndex, destIndex;
    int32_t locCache;

    /* work on a local copy; map() receives its address and may update it */
    locCache=csm->locCache;

    /* case mapping loop */
    srcIndex=srcStart;
    destIndex=0;
    while(srcIndex<srcLimit) {
        /* publish the byte range of the current code point through the context */
        csc->cpStart=srcIndex;
        U8_NEXT(src, srcIndex, srcLimit, c);
        csc->cpLimit=srcIndex;
        if(c<0) {
            /*
             * Ill-formed sequence: copy the consumed bytes verbatim.
             * Copying stops once dest is full, so destIndex does not grow
             * beyond destCapacity on this path.
             */
            int32_t i=csc->cpStart;
            while(destIndex<destCapacity && i<srcIndex) {
                dest[destIndex++]=src[i++];
            }
            continue;
        }
        c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
        /*
         * Fast path: there is room in dest and the result is one ASCII byte.
         * A negative c is bitwise-complemented to obtain c2; a c greater than
         * UCASE_MAX_STRING_LENGTH is taken as c2 directly. In both cases the
         * byte is stored only if c2<=0x7f.
         * NOTE(review): presumably this follows the ICU full-case-mapping
         * return convention (negative = ~code point, small non-negative =
         * length of string in s) -- confirm against the UCaseMapFull docs.
         */
        if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
            /* fast path version of appendResult() for ASCII results */
            dest[destIndex++]=(uint8_t)c2;
        } else {
            /* everything else is delegated; appendResult() returns the new destIndex */
            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
        }
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
Example #30
0
// Case-folds the UTF-8 text src[0..srcLength) into `sink`.
//
// Each well-formed code point is passed through ucase_toFullFolding() and the
// outcome is handed to appendResult(); ill-formed byte sequences are forwarded
// via ByteSinkUtil::appendUnchanged(). Processing stops as soon as errorCode
// holds a failure.
static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
                          const uint8_t *src, int32_t srcLength,
                          icu::ByteSink &sink, icu::Edits *edits,
                          UErrorCode &errorCode) {
    for (int32_t pos = 0; U_SUCCESS(errorCode) && pos < srcLength; ) {
        const int32_t start = pos;
        UChar32 cp;
        U8_NEXT(src, pos, srcLength, cp);
        const int32_t consumed = pos - start;

        if (cp >= 0) {
            const UChar *replacement;
            const int32_t folded = ucase_toFullFolding(cp, &replacement, options);
            appendResult(consumed, folded, replacement, sink, options, edits, errorCode);
            continue;
        }

        // Malformed UTF-8: pass the bytes through as they are.
        ByteSinkUtil::appendUnchanged(src + start, consumed,
                                      sink, options, edits, errorCode);
    }
}