/*
 * Compares two UTF-8 strings code point by code point after lower-casing each
 * code point with u_tolower().
 *
 * Returns -1 or 1 at the first differing (lower-cased) code point. Before the
 * first and after every compared pair, checkStringEnd() is consulted; any
 * result other than 2 is returned as the final answer.
 */
int ICUUnicodeSupport::_compareNoCase<1>(ConstStringHolder<1> _first, ConstStringHolder<1> _second)
{
    int32_t firstLen = _first.length();
    int32_t secondLen = _second.length();
    int32_t firstPos = 0;
    int32_t secondPos = 0;

    int endState = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
    if(endState != 2)
        return endState;

    const uint8_t* firstBytes = _first.c_str();
    const uint8_t* secondBytes = _second.c_str();

    for(;;)
    {
        UChar32 lhs, rhs;
        U8_NEXT(firstBytes, firstPos, firstLen, lhs);
        U8_NEXT(secondBytes, secondPos, secondLen, rhs);

        lhs = u_tolower(lhs);
        rhs = u_tolower(rhs);
        if(lhs < rhs)
            return -1;
        if(lhs > rhs)
            return 1;

        endState = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
        if(endState != 2)
            return endState;
    }
}
/*
 * Compares two UTF-8 buffers code point by code point.
 *
 * caseinsensitive:   when non-zero, two differing ASCII letters are compared
 *                    after converting both to upper case.
 * genericwhitespace: when non-zero, two differing code points that are both
 *                    whitespace (per utf8_codepoint_is_whitespace) count as equal.
 *
 * Returns 0 when no difference is found, otherwise a non-zero difference of
 * the first mismatching pair. When one buffer is a prefix of the other, the
 * next code point of the longer buffer is returned (negated for buf2).
 */
static int utf8_comparison(const unsigned char *buf1, size_t buf1size,
                           const unsigned char *buf2, size_t buf2size,
                           int caseinsensitive, int genericwhitespace) {
    UChar32 c1, c2;
    int pos1 = 0;
    int pos2 = 0;

    while (pos1 < (int)buf1size && pos2 < (int)buf2size) {
        U8_NEXT(buf1, pos1, buf1size, c1);
        assert(c1 >= 0);
        U8_NEXT(buf2, pos2, buf2size, c2);
        assert(c2 >= 0);

        if (c1 == c2) {
            continue;
        }

        if (caseinsensitive && c1 < 127 && c2 < 127) {
            /* manual ASCII-only case folding */
            char a = (char)c1;
            char b = (char)c2;
            int a_is_lower = (a >= 'a' && a <= 'z');
            int b_is_lower = (b >= 'a' && b <= 'z');
            int a_is_letter = a_is_lower || (a >= 'A' && a <= 'Z');
            int b_is_letter = b_is_lower || (b >= 'A' && b <= 'Z');

            if (a_is_letter && b_is_letter) {
                if (a_is_lower) {
                    a = a - ('a' - 'A');
                }
                if (b_is_lower) {
                    b = b - ('a' - 'A');
                }
                if (a != b) {
                    return ((int)a) - ((int)b);
                }
                continue;
            }
        }

        if (genericwhitespace
                && utf8_codepoint_is_whitespace(c1)
                && utf8_codepoint_is_whitespace(c2)) {
            continue;
        }

        /* genuinely different code points */
        return ((int)c1) - ((int)c2);
    }

    /* one buffer is exhausted: report what is left in the other, if anything */
    if (pos1 < (int)buf1size) {
        U8_NEXT(buf1, pos1, buf1size, c1);
        return c1;
    }
    if (pos2 < (int)buf2size) {
        U8_NEXT(buf2, pos2, buf2size, c2);
        return -((int)c2);
    }
    return 0;
}
/* * Case-maps [srcStart..srcLimit[ but takes * context [0..srcLength[ into account. */ static void _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { /* case mapping loop */ int32_t srcIndex=srcStart; while (U_SUCCESS(errorCode) && srcIndex<srcLimit) { int32_t cpStart; csc->cpStart=cpStart=srcIndex; UChar32 c; U8_NEXT(src, srcIndex, srcLimit, c); csc->cpLimit=srcIndex; if(c<0) { // Malformed UTF-8. ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart, sink, options, edits, errorCode); } else { const UChar *s; c=map(c, utf8_caseContextIterator, csc, &s, caseLocale); appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); } } }
/*
 * Converts a zval into a single code point.
 *
 * Accepts either an integer or a string holding exactly one UTF-8 encoded
 * code point. On success stores the value in *pcp and returns SUCCESS;
 * otherwise records U_ILLEGAL_ARGUMENT_ERROR with a message and returns FAILURE.
 */
static inline int convert_cp(UChar32* pcp, zval *zcp) {
	zend_long cp = -1;

	switch (Z_TYPE_P(zcp)) {
		case IS_LONG:
			cp = Z_LVAL_P(zcp);
			break;

		case IS_STRING: {
			int32_t consumed = 0;
			size_t zcp_len = Z_STRLEN_P(zcp);

			if (ZEND_SIZE_T_INT_OVFL(zcp_len)) {
				intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
				intl_error_set_custom_msg(NULL, "Input string is too long.", 0);
				return FAILURE;
			}

			U8_NEXT(Z_STRVAL_P(zcp), consumed, zcp_len, cp);
			/* the single decoded code point must span the whole string */
			if ((size_t)consumed != zcp_len) {
				intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
				intl_error_set_custom_msg(NULL, "Passing a UTF-8 character for codepoint requires a string which is exactly one UTF-8 codepoint long.", 0);
				return FAILURE;
			}
			break;
		}

		default:
			intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
			intl_error_set_custom_msg(NULL, "Invalid parameter for unicode point.  Must be either integer or UTF-8 sequence.", 0);
			return FAILURE;
	}

	if ((cp < UCHAR_MIN_VALUE) || (cp > UCHAR_MAX_VALUE)) {
		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
		intl_error_set_custom_msg(NULL, "Codepoint out of range", 0);
		return FAILURE;
	}

	*pcp = (UChar32)cp;
	return SUCCESS;
}
/*
 * Verifies that U8_NEXT (forward) and U8_PREV (backward) decode every entry
 * of a table of noncharacter code points to a value for which
 * U_IS_UNICODE_NONCHAR() holds.
 */
static void TestNextPrevNonCharacters() {
    /* UTF-8 byte sequences of several noncharacters */
    static const uint8_t nonChars[]={
        0xef, 0xb7, 0x90,       /* U+fdd0 */
        0xef, 0xbf, 0xbf,       /* U+ffff */
        0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
        0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
        0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
    };

    UChar32 ch;
    int32_t idx;

    /* forward pass */
    idx=0;
    while(idx<(int32_t)sizeof(nonChars)) {
        U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
        if(!U_IS_UNICODE_NONCHAR(ch)) {
            log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
        }
    }

    /* backward pass */
    idx=(int32_t)sizeof(nonChars);
    while(idx>0) {
        U8_PREV(nonChars, 0, idx, ch);
        if(!U_IS_UNICODE_NONCHAR(ch)) {
            log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
        }
    }
}
/*
 * Case-maps [srcStart..srcLimit[ but takes
 * context [0..srcLength[ into account.
 *
 * Returns the number of bytes the full result needs. When that exceeds
 * destCapacity, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * U8_NEXT reports a malformed sequence as a negative value. Such a value is
 * not a code point and must not be handed to the mapping callback; the
 * offending bytes are copied to the output unchanged instead.
 */
static int32_t
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
         uint8_t *dest, int32_t destCapacity,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcStart, int32_t srcLimit,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t srcIndex, destIndex;
    int32_t locCache;

    locCache=csm->locCache;

    /* case mapping loop */
    srcIndex=srcStart;
    destIndex=0;
    while(srcIndex<srcLimit) {
        csc->cpStart=srcIndex;
        U8_NEXT(src, srcIndex, srcLimit, c);
        csc->cpLimit=srcIndex;
        if(c<0) {
            /*
             * Malformed UTF-8: pass the bytes through as they are.
             * destIndex keeps counting past destCapacity so that the
             * overflow check below still sees the required length.
             */
            int32_t i;
            for(i=csc->cpStart; i<srcIndex; ++i) {
                if(destIndex<destCapacity) {
                    dest[destIndex]=src[i];
                }
                ++destIndex;
            }
            continue;
        }
        c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
        destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
/*
 * Context iterator over UTF-8 text described by a UCaseContext.
 *
 * dir<0 restarts a backward iteration at cpStart, dir>0 restarts a forward
 * iteration at cpLimit, and dir==0 continues in the direction stored in the
 * context. Returns the next code point in that direction, or U_SENTINEL when
 * the start/limit of the text has been reached.
 */
static UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
    UCaseContext *csc=(UCaseContext *)context;
    UChar32 c;

    if(dir!=0) {
        /* reset: backward starts before the current code point, forward after it */
        csc->index=(dir<0) ? csc->cpStart : csc->cpLimit;
        csc->dir=dir;
    } else {
        /* continue current iteration direction */
        dir=csc->dir;
    }

    if(dir<0) {
        if(csc->start<csc->index) {
            U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
            return c;
        }
    } else if(csc->index<csc->limit) {
        U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
        return c;
    }
    return U_SENTINEL;
}
/**
 * Detect if a character class occurs in a string
 * and return the subset of `str` selected that way
 *
 * @param str character vector
 * @param pattern character vector
 * @param omit_na single logical value
 * @param negate single logical value
 * @return result of stri__subset_by_logical() for the computed selection
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *                using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *    FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   // BT: this cannot be done with deque, because pattern is reused so i does not
   // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on
   // MG: agreed
   std::vector<int> which(vectorize_length);
   int result_counter = 0;

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         if (omit_na1) {
            which[i] = FALSE;
         }
         else {
            which[i] = NA_LOGICAL;
            result_counter++;
         }
         continue;
      }

      const UnicodeSet* cur_set = &pattern_cont.get(i);
      R_len_t cur_n = str_cont.get(i).length();
      const char* cur_s = str_cont.get(i).c_str();

      // scan until the first code point that belongs to the set
      bool selected = false;
      R_len_t j = 0;
      while (j < cur_n) {
         UChar32 chr = 0;
         U8_NEXT(cur_s, j, cur_n, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (cur_set->contains(chr)) {
            selected = true;
            break;
         }
      }

      if (negate_1)
         selected = !selected;

      which[i] = selected ? TRUE : FALSE;
      if (selected)
         result_counter++;
   }

   SEXP ret;
   STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter));
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to UTF-32
 *
 * NA inputs and inputs holding an invalid UTF-8 byte sequence yield NULL
 * list elements; the latter also triggers a warning.
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          use vector<UChar32> buf instead of R_alloc;
 *          warn and set NULL on improper UTF-8 byte sequences
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
 *          Use UChar32* instead of vector<UChar32> as ::data is C++11
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf32(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   // One shared scratch buffer, sized for the longest input in bytes
   // (every decoded code point consumes at least one byte).
   R_len_t bufsize = 1; // to avoid allocating an empty buffer
   for (R_len_t i=0; i<n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t ni = str_cont.get(i).length();
         if (bufsize < ni)
            bufsize = ni;
      }
   }

   UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32));
   if (!buf)
      throw StriException(MSG__MEM_ALLOC_ERROR);
   // deque<UChar32> was slower than using a common, over-sized buf

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, n));

   for (R_len_t i=0; i<n; ++i) {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();

      // decode until the end of the string or the first malformed sequence
      UChar32 c = (UChar32)0;
      R_len_t j = 0;
      R_len_t k = 0;
      for (; c >= 0 && j < sn; ++k) {
         U8_NEXT(s, j, sn, c);
         buf[k] = (int)c;
      }

      if (c < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      SEXP conv;
      STRI__PROTECT(conv = Rf_allocVector(INTSXP, k));
      memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k);
      SET_VECTOR_ELT(ret, i, conv);
      STRI__UNPROTECT(1);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
}
/**
 * Substitutes vector elements if a pattern occurs in a string
 *
 * Elements of `value` are consumed in order and recycled
 * (modulo its length) for every substituted element.
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single logical value
 * @param value character vector
 * @return character vector
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));
   int vectorize_length = LENGTH(str);
   int value_length = LENGTH(value);
   if (value_length == 0)
      Rf_error(MSG__REPLACEMENT_ZERO);

   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerUTF8 value_cont(value, value_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   R_len_t next_value = 0; // number of replacement values used so far
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet* cur_set = &pattern_cont.get(i);
      R_len_t cur_n = str_cont.get(i).length();
      const char* cur_s = str_cont.get(i).c_str();

      bool found = false;
      R_len_t j = 0;
      while (j < cur_n) {
         UChar32 chr = 0;
         U8_NEXT(cur_s, j, cur_n, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (cur_set->contains(chr)) {
            found = true;
            break;
         }
      }

      // replace when the detection result, after optional negation, is true
      if (found != negate_1)
         SET_STRING_ELT(ret, i, value_cont.toR((next_value++)%value_length));
      else
         SET_STRING_ELT(ret, i, str_cont.toR(i));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
static UBool * getResultsManually(const char** encodings, int32_t num_encodings, const char *utf8, int32_t length, const USet* excludedCodePoints, const UConverterUnicodeSet whichSet) { UBool* resultsManually; int32_t i; resultsManually = (UBool*) uprv_malloc(gCountAvailable); uprv_memset(resultsManually, 0, gCountAvailable); for(i = 0 ; i < num_encodings ; i++) { UErrorCode status = U_ZERO_ERROR; /* get unicode set for that converter */ USet* set; UConverter* test_converter; UChar32 cp; int32_t encIndex, offset; set = uset_openEmpty(); test_converter = ucnv_open(encodings[i], &status); ucnv_getUnicodeSet(test_converter, set, whichSet, &status); if (excludedCodePoints != NULL) { uset_addAll(set, excludedCodePoints); } uset_freeze(set); offset = 0; cp = 0; encIndex = findIndex(encodings[i]); /* * The following is almost, but not entirely, the same as * resultsManually[encIndex] = * (UBool)(uset_spanUTF8(set, utf8, length, USET_SPAN_SIMPLE) == length); * They might be different if the set contains strings, * or if the utf8 string contains an illegal sequence. * * The UConverterSelector does not currently handle strings that can be * converted, and it treats an illegal sequence as convertible * while uset_spanUTF8() treats it like U+FFFD which may not be convertible. */ resultsManually[encIndex] = TRUE; while(offset<length) { U8_NEXT(utf8, offset, length, cp); if (cp >= 0 && !uset_contains(set, cp)) { resultsManually[encIndex] = FALSE; break; } } uset_close(set); ucnv_close(test_converter); } return resultsManually; }
/*
 * Counts the code points decoded by U8_NEXT from the first bufsize bytes of
 * buf. Each decoded value is asserted to be non-negative.
 */
static int utf8_length(const unsigned char *buf, size_t bufsize) {
    size_t count = 0;
    int pos = 0;

    for (; pos < (int)bufsize; count++) {
        UChar32 cp;
        U8_NEXT(buf, pos, bufsize, cp);
        assert(cp >= 0);
    }
    return count;
}
/**
 * Extract first or last occurrences of a character class in each string
 *
 * The result element is NA when either input is NA or no code point matches.
 *
 * @param str character vector
 * @param pattern character vector
 * @param first scan from the beginning (true) or from the end (false)
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-08)
 * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri__extract_firstlast_charclass(SEXP str, SEXP pattern, bool first)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      // NA unless a match is found below
      SET_STRING_ELT(ret, i, NA_STRING);
      if (str_cont.isNA(i) || pattern_cont.isNA(i))
         continue;

      CharClass pattern_cur = pattern_cont.get(i);
      R_len_t cur_n = str_cont.get(i).length();
      const char* cur_s = str_cont.get(i).c_str();
      UChar32 chr;

      if (first) {
         R_len_t pos = 0;
         while (pos < cur_n) {
            R_len_t cp_begin = pos;
            U8_NEXT(cur_s, pos, cur_n, chr);
            if (pattern_cur.test(chr)) {
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(cur_s+cp_begin, pos-cp_begin, CE_UTF8));
               break; // that's enough for first
            }
         }
      }
      else {
         R_len_t pos = cur_n;
         while (pos > 0) {
            R_len_t cp_end = pos;
            U8_PREV(cur_s, 0, pos, chr); // go backwards
            if (pattern_cur.test(chr)) {
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(cur_s+pos, cp_end-pos, CE_UTF8));
               break; // that's enough for last
            }
         }
      }
   }

   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A).
 * In UTF-8 input, malformed byte sequences are replaced with the subst char
 * as well: U8_NEXT reports them as a negative value, which previously passed
 * the `> ASCII_MAXCHARCODE` test and was written out as a non-ASCII byte.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toascii(SEXP str)
{
   str = stri_prepare_arg_string(str, "str");
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN
   SEXP ret;
   PROTECT(ret = Rf_allocVector(STRSXP, n));
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }
      else if (IS_ASCII(curs)) {
         // already ASCII: reuse the input element
         SET_STRING_ELT(ret, i, curs);
      }
      else if (IS_UTF8(curs)) {
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         // TODO: buffer reuse....
         String8 buf(curn+1); // this may be 4 times too much
         R_len_t k = 0;
         UChar32 c;
         for (int j=0; j<curn; ) {
            U8_NEXT(curs_tab, j, curn, c);
            // c < 0 marks a malformed sequence; it must not leak into the output
            if (c < 0 || c > ASCII_MAXCHARCODE)
               buf.data()[k++] = ASCII_SUBSTITUTE;
            else
               buf.data()[k++] = (char)c;
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8));
         // will be marked as ASCII anyway by mkCharLenCE
      }
      else { // some 8-bit encoding
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         // TODO: buffer reuse....
         String8 buf(curn+1);
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               buf.data()[k++] = curs_tab[j];
            else {
               buf.data()[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8));
         // will be marked as ASCII anyway by mkCharLenCE
      }
   }
   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/*
 * Calls do_something for every code point decoded from the first bufsize
 * bytes of buf. Stops and returns 0 as soon as the callback returns 0;
 * returns 1 when the whole buffer has been visited.
 */
static int utf8_iterate_codepoints(const unsigned char *buf, size_t bufsize,
                                   int (*do_something)(UChar32 c)) {
    int pos = 0;

    for (;;) {
        UChar32 cp;

        if (pos >= (int)bufsize) {
            return 1;
        }
        U8_NEXT(buf, pos, bufsize, cp);
        assert(cp >= 0);
        if (!do_something(cp)) {
            return 0;
        }
    }
}
uint64_t length(const std::string& str) { const char* s = str.c_str(); auto length = static_cast<int32_t>(str.length()); uint64_t count = 0; for (int32_t i = 0; i < length;) { UChar32 c; U8_NEXT(s, i, length, c); ++count; } return count; }
/*
 * Returns an integer vector with the length of each string in vec.
 *
 * - NA strings give NA_INTEGER.
 * - Latin-1 and single-byte native strings: the byte count.
 * - "bytes"-encoded strings: an R error is raised.
 * - UTF-8 strings: the number of code points; an invalid sequence gives
 *   a warning and NA_INTEGER.
 * - Other (multi-byte) native encodings: the number of characters read by
 *   an ICU converter for the default encoding; a conversion failure gives
 *   a warning and NA_INTEGER.
 */
SEXP R_stri_length(SEXP vec)
{
    int vec_len = LENGTH(vec);
    SEXP ret = PROTECT(allocVector(INTSXP, vec_len));
    int* retint = INTEGER(ret);

    for (int i = 0; i < vec_len; i++) {
        SEXP str = STRING_ELT(vec, i);
        if (str == NA_STRING) {
            retint[i] = NA_INTEGER;
            continue;
        }

        int str_len = LENGTH(str);
        if (getCharCE(str) == CE_LATIN1 || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_LATIN1)) {
            retint[i] = str_len;
        } else if (getCharCE(str) == CE_BYTES) {
            UNPROTECT(1);
            error("Invalid encoding: bytes.");
        } else if (getCharCE(str) == CE_UTF8 || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_UTF8)) {
            UChar32 out = 0;
            const char* source = CHAR(str);
            R_len_t j = 0;
            int count;
            for (count = 0; out >= 0 && j < str_len; count++) {
                U8_NEXT(source, j, str_len, out); // faster that U8_FWD_1 & gives bad UChar32s
            }
            if (out < 0) {
                warning("Invalid UTF8 string: %s", source);
                retint[i] = NA_INTEGER;
            } else {
                retint[i] = count;
            }
        } else if (native_is_singlebyte()) { // native-8bit
            retint[i] = str_len;
        } else { // native encoding, not 8 bit
            UErrorCode status = U_ZERO_ERROR;
            UConverter* conv = ucnv_open(NULL, &status);
            if (U_FAILURE(status)) {
                /* without a converter the loop below could never advance */
                UNPROTECT(1);
                error("Cannot open a converter for the native encoding.");
            }

            const char* source = CHAR(str);
            const char* sourceLimit = source + str_len;
            int j = 0;
            while (source != sourceLimit) {
                ucnv_getNextUChar(conv, &source, sourceLimit, &status);
                /* once status is a failure, further calls do nothing: stop */
                if (U_FAILURE(status)) {
                    break;
                }
                j++;
            }
            ucnv_close(conv); /* release the converter on every path */

            if (U_FAILURE(status)) {
                warning("Invalid string in native encoding: %s", CHAR(str));
                retint[i] = NA_INTEGER;
            } else {
                retint[i] = j; // all right, we got it!
            }
        }
    }
    UNPROTECT(1);
    return ret;
}
/**
 * Detect if a character class occurs in a string
 *
 * Once `max_count` TRUE results have been produced (i.e. the counter has
 * dropped to 0), all remaining elements are set to NA.
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single bool
 * @param max_count single int
 * @return logical vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-02)
 *          Use StrContainerUTF8 and CharClass classes
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-15)
 *          Use StrContainerCharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern, SEXP negate, SEXP max_count)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      const UnicodeSet* cur_set = &pattern_cont.get(i);
      R_len_t cur_n = str_cont.get(i).length();
      const char* cur_s = str_cont.get(i).c_str();

      bool detected = false;
      R_len_t j = 0;
      while (j < cur_n) {
         UChar32 chr = 0;
         U8_NEXT(cur_s, j, cur_n, chr);
         if (chr < 0) // invalid UTF-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (cur_set->contains(chr)) {
            detected = true;
            break;
         }
      }

      if (negate_1)
         detected = !detected;

      ret_tab[i] = detected ? TRUE : FALSE;
      // a positive counter is decremented on each TRUE; negative means "no limit"
      if (max_count_1 > 0 && detected)
         --max_count_1;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/* {{{ php_converter_append_toUnicode_target */ static void php_converter_append_toUnicode_target(zval *val, UConverterToUnicodeArgs *args, php_converter_object *objval) { switch (Z_TYPE_P(val)) { case IS_NULL: /* Code unit is being skipped */ return; case IS_LONG: { zend_long lval = Z_LVAL_P(val); if ((lval < 0) || (lval > 0x10FFFF)) { php_converter_throw_failure(objval, U_ILLEGAL_ARGUMENT_ERROR, "Invalid codepoint U+%04lx", lval); return; } if (lval > 0xFFFF) { /* Supplemental planes U+010000 - U+10FFFF */ if (TARGET_CHECK(args, 2)) { /* TODO: Find the ICU call which does this properly */ *(args->target++) = (UChar)(((lval - 0x10000) >> 10) | 0xD800); *(args->target++) = (UChar)(((lval - 0x10000) & 0x3FF) | 0xDC00); } return; } /* Non-suggogate BMP codepoint */ if (TARGET_CHECK(args, 1)) { *(args->target++) = (UChar)lval; } return; } case IS_STRING: { const char *strval = Z_STRVAL_P(val); int i = 0, strlen = Z_STRLEN_P(val); while((i != strlen) && TARGET_CHECK(args, 1)) { UChar c; U8_NEXT(strval, i, strlen, c); *(args->target++) = c; } return; } case IS_ARRAY: { HashTable *ht = Z_ARRVAL_P(val); zval *tmpzval; ZEND_HASH_FOREACH_VAL(ht, tmpzval) { php_converter_append_toUnicode_target(tmpzval, args, objval); } ZEND_HASH_FOREACH_END(); return; }
// Scans s[i..length) and reports whether the first code point that is not
// case-ignorable is a cased letter. Returns FALSE when only case-ignorable
// code points (or nothing) remain.
UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    while (i < length) {
        UChar32 c;
        U8_NEXT(s, i, length, c);
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) == 0) {
            // First non-ignorable code point decides the answer.
            if (type != UCASE_NONE) {
                return TRUE;   // Followed by cased letter.
            }
            return FALSE;      // Uncased and not case-ignorable.
        }
        // Case-ignorable: keep looking.
    }
    return FALSE;  // Not followed by cased letter.
}
/** * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) */ void StriContainerByteSearch::upgradePatternCaseInsensitive() { UChar32 c = 0; R_len_t j = 0; patternLenCaseInsensitive = 0; while (j < patternLen) { U8_NEXT(patternStr, j, patternLen, c); #ifndef NDEBUG if (patternLenCaseInsensitive >= this->kmpMaxSize) throw StriException("!NDEBUG: StriContainerByteSearch::upgradePatternCaseInsensitive()"); #endif patternStrCaseInsensitive[patternLenCaseInsensitive++] = u_toupper(c); } patternStrCaseInsensitive[patternLenCaseInsensitive] = 0; }
/*
 * Lower-cases a UTF-8 string in place, one code point at a time.
 *
 * The buffer cannot grow or shrink here, so a code point is only replaced
 * when its lower-case form occupies exactly the same number of bytes.
 * Code points whose mapping has a different encoded length, and malformed
 * byte sequences, are left untouched. (Writing them anyway would either
 * overwrite bytes that have not been read yet or leave stale bytes at the
 * end of the string, and U8_APPEND_UNSAFE must not be given a negative
 * value.)
 */
void ICUUnicodeSupport::_toLowerCase<1>(StringHolder<1> _str)
{
    if(_str.empty())
        return;

    uint8_t* buf = &_str[0];
    int32_t len = _str.length();
    int32_t ofs = 0;

    while(ofs < len)
    {
        const int32_t start = ofs;
        UChar32 c;
        U8_NEXT(buf, ofs, len, c);
        if(c < 0)
            continue; // malformed sequence: keep the original bytes

        const UChar32 lower = u_tolower(c);
        if(lower == c)
            continue; // nothing to change

        // U8_NEXT only accepts shortest-form sequences, so ofs - start is
        // the encoded length of c.
        if(U8_LENGTH(lower) != ofs - start)
            continue; // cannot be replaced in place

        int32_t out = start;
        U8_APPEND_UNSAFE(buf, out, lower);
    }
}
std::vector<uint16_t> utf8ToUtf16(const std::string& text) { std::vector<uint16_t> result; int32_t i = 0; const int32_t textLength = static_cast<int32_t>(text.size()); uint32_t c = 0; while (i < textLength) { U8_NEXT(text.c_str(), i, textLength, c); if (U16_LENGTH(c) == 1) { result.push_back(c); } else { result.push_back(U16_LEAD(c)); result.push_back(U16_TRAIL(c)); } } return result; }
bool readUTFChar(const char* str, int* begin, int length, unsigned* codePointOut) { int codePoint; // Avoids warning when U8_NEXT writes -1 to it. U8_NEXT(str, *begin, length, codePoint); *codePointOut = static_cast<unsigned>(codePoint); // The ICU macro above moves to the next char, we want to point to the last // char consumed. (*begin)--; // Validate the decoded value. if (U_IS_UNICODE_CHAR(codePoint)) return true; *codePointOut = kUnicodeReplacementCharacter; return false; }
static void countLeadingSpaces(const CString& utf8String, int32_t& pointerOffset, int32_t& characterOffset) { pointerOffset = 0; characterOffset = 0; const char* stringData = utf8String.data(); UChar32 character = 0; while (static_cast<unsigned>(pointerOffset) < utf8String.length()) { int32_t nextPointerOffset = pointerOffset; U8_NEXT(stringData, nextPointerOffset, static_cast<int32_t>(utf8String.length()), character); if (character < 0 || !u_isUWhiteSpace(character)) return; pointerOffset = nextPointerOffset; characterOffset++; } }
/**
 * Checks whether the pattern occurs in the search string at the given
 * byte index.
 *
 * In case-insensitive mode the text is decoded code point by code point and
 * compared, after u_toupper(), with patternStrCaseInsensitive; otherwise the
 * bytes are compared directly. A pattern that does not fit into the rest of
 * the search string never matches; this is checked explicitly so that
 * nothing is read at or beyond searchLen.
 *
 * @param byteindex byte offset in searchStr
 * @return true if the pattern matches at byteindex
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
bool StriContainerByteSearch::startsWith(R_len_t byteindex)
{
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      for (R_len_t k = 0; k < patternLenCaseInsensitive; ++k) {
         // U8_NEXT requires an offset below the length
         if (byteindex >= searchLen)
            return false;
         UChar32 c;
         U8_NEXT(searchStr, byteindex, searchLen, c);
         c = u_toupper(c);
         if (patternStrCaseInsensitive[k] != c)
            return false;
      }
   }
   else {
      // not enough bytes left for the whole pattern
      if (patternLen > searchLen - byteindex)
         return false;
      for (R_len_t k=0; k < patternLen; ++k)
         if (searchStr[byteindex+k] != patternStr[k])
            return false;
   }

   return true; // found
}
/** find first match - KMP
 *
 * On a match, searchPos and searchEnd are set to the byte range of the
 * match; otherwise both are set to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          KMP - first approach
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgraded; separate method
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosFwd_KMP(R_len_t startPos)
{
   int pos = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // byte-wise search
      while (pos < searchLen) {
         while (patternPos >= 0 && patternStr[patternPos] != searchStr[pos])
            patternPos = kmpNext[patternPos];
         patternPos++;
         pos++;
         if (patternPos == patternLen) {
            searchEnd = pos;
            searchPos = pos-patternLen;
            return searchPos;
         }
      }
   }
   else {
      // code-point-wise search on upper-cased code points
      UChar32 cp = 0;
      while (pos < searchLen) {
         U8_NEXT(searchStr, pos, searchLen, cp);
         cp = u_toupper(cp);
         while (patternPos >= 0 && patternStrCaseInsensitive[patternPos] != cp)
            patternPos = kmpNext[patternPos];
         patternPos++;
         if (patternPos == patternLenCaseInsensitive) {
            searchEnd = pos;

            // we need to go back by patternLenCaseInsensitive code points
            searchPos = pos;
            for (R_len_t left = patternLenCaseInsensitive; left > 0; left--) {
               U8_BACK_1((const uint8_t*)searchStr, 0, searchPos);
            }
            return searchPos;
         }
      }
   }

   // else not found
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @return logical vector
 *
 * @version 0.1 (Bartek Tartanus)
 * @version 0.2 (Marek Gagolewski, 2013-06-02) Use StrContainerUTF8 and CharClass classes
 * @version 0.3 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.4 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      CharClass pattern_cur = pattern_cont.get(i);
      R_len_t cur_n = str_cont.get(i).length();
      const char* cur_s = str_cont.get(i).c_str();

      // FALSE unless some code point passes the class test
      ret_tab[i] = FALSE;
      R_len_t pos = 0;
      UChar32 chr;
      while (pos < cur_n) {
         U8_NEXT(cur_s, pos, cur_n, chr);
         if (pattern_cur.test(chr)) {
            ret_tab[i] = TRUE;
            break;
         }
      }
   }

   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/*
 * Case-maps [srcStart..srcLimit[ but takes
 * context [0..srcLength[ into account.
 *
 * Returns the number of bytes the full result needs. When that exceeds
 * destCapacity, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * Malformed UTF-8 (reported by U8_NEXT as a negative value) is copied to
 * the output unchanged. Bytes that no longer fit are still counted, so that
 * a too-small buffer is reported as an overflow instead of silently
 * producing truncated output.
 */
static int32_t
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
         uint8_t *dest, int32_t destCapacity,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcStart, int32_t srcLimit,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c, c2 = 0;
    int32_t srcIndex, destIndex;
    int32_t locCache;

    locCache=csm->locCache;

    /* case mapping loop */
    srcIndex=srcStart;
    destIndex=0;
    while(srcIndex<srcLimit) {
        csc->cpStart=srcIndex;
        U8_NEXT(src, srcIndex, srcLimit, c);
        csc->cpLimit=srcIndex;
        if(c<0) {
            /* malformed sequence: pass through, keep counting past capacity */
            int32_t i;
            for(i=csc->cpStart; i<srcIndex; ++i) {
                if(destIndex<destCapacity) {
                    dest[destIndex]=src[i];
                }
                ++destIndex;
            }
            continue;
        }
        c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
        if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
            /* fast path version of appendResult() for ASCII results */
            dest[destIndex++]=(uint8_t)c2;
        } else {
            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
        }
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
/*
 * Case-folds UTF-8 text into a ByteSink.
 *
 * Each well-formed code point goes through ucase_toFullFolding() and
 * appendResult(); malformed UTF-8 is appended unchanged. The loop stops as
 * soon as errorCode reports a failure.
 */
static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options,
                          UCASEMAP_BREAK_ITERATOR_UNUSED
                          const uint8_t *src, int32_t srcLength,
                          icu::ByteSink &sink, icu::Edits *edits,
                          UErrorCode &errorCode) {
    for (int32_t pos = 0; U_SUCCESS(errorCode) && pos < srcLength;) {
        const int32_t begin = pos;
        UChar32 cp;
        U8_NEXT(src, pos, srcLength, cp);

        if (cp >= 0) {
            const UChar *full;
            cp = ucase_toFullFolding(cp, &full, options);
            appendResult(pos - begin, cp, full, sink, options, edits, errorCode);
        } else {
            // Malformed UTF-8: pass the bytes through as they are.
            ByteSinkUtil::appendUnchanged(src + begin, pos - begin,
                                          sink, options, edits, errorCode);
        }
    }
}