/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES
 *
 * Steps through the boundaries reported by bi and returns the byte offset in
 * pstr of the last boundary that does not exceed bsize bytes.
 *
 *   bi       break iterator, advanced with ubrk_next()
 *   bsize    maximum number of UTF-8 bytes to cover
 *   pstr     UTF-8 text
 *   str_len  length of pstr in bytes
 *
 * Returns the byte offset reached (0 if not even the first grapheme fits).
 *
 * ubrk_next() reports positions as UChar (UTF-16 code unit) indices, whereas
 * U8_FWD_N() counts code points. A supplementary code point occupies two
 * UChars but only one code point, so the distance between two boundaries must
 * be consumed in UTF-16 units, not passed to U8_FWD_N() directly - otherwise
 * the UTF-8 offset runs ahead by one code point per supplementary character.
 * NOTE(review): assumes bi iterates over the UTF-16 conversion of pstr - confirm at the call site.
 */
static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
{
	int pos = 0, prev_pos = 0;
	int ret_pos = 0, prev_ret_pos = 0;

	while ( 1 ) {
		int units_left;

		pos = ubrk_next(bi);

		if ( UBRK_DONE == pos ) {
			break;
		}

		prev_ret_pos = ret_pos;

		/* consume (pos - prev_pos) UTF-16 code units, one code point at a time */
		units_left = pos - prev_pos;
		while ( units_left > 0 ) {
			int cp_start = ret_pos;

			U8_FWD_N(pstr, ret_pos, str_len, 1);

			if ( ret_pos == cp_start ) {
				/* end of the UTF-8 buffer */
				break;
			}

			/* a 4-byte UTF-8 sequence is a supplementary code point: 2 UTF-16 units */
			units_left -= ( ret_pos - cp_start == 4 ) ? 2 : 1;
		}

		if ( ret_pos > bsize ) {
			/* this grapheme does not fit - fall back to the previous boundary */
			ret_pos = prev_ret_pos;
			break;
		}

		if ( prev_ret_pos == ret_pos ) {
			/* something wrong - malformed utf8? */
			break;
		}

		prev_pos = pos;
	}

	return ret_pos;
}
/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS
 *
 * Steps through the boundaries reported by bi and returns the byte offset in
 * pstr of the last boundary that covers at most csize code points.
 *
 *   bi       break iterator, advanced with ubrk_next()
 *   csize    maximum number of code points to cover
 *   pstr     UTF-8 text
 *   str_len  length of pstr in bytes
 *
 * Returns the byte offset reached (0 if not even the first grapheme fits).
 *
 * ubrk_next() reports positions as UChar (UTF-16 code unit) indices. Those are
 * neither code point counts nor UTF-8 advances once a supplementary code point
 * (two UChars, one code point) is involved, so the code points are counted
 * explicitly while the UTF-8 offset is moved forward.
 * NOTE(review): assumes bi iterates over the UTF-16 conversion of pstr - confirm at the call site.
 */
static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
{
	int pos = 0, prev_pos = 0;
	int ret_pos = 0;
	int char_count = 0;

	while ( 1 ) {
		int units_left;
		int next_ret_pos = ret_pos;
		int next_count = char_count;

		pos = ubrk_next(bi);

		if ( UBRK_DONE == pos ) {
			break;
		}

		/* tentatively walk over this grapheme: consume (pos - prev_pos)
		   UTF-16 code units one code point at a time, counting code points */
		units_left = pos - prev_pos;
		while ( units_left > 0 ) {
			int cp_start = next_ret_pos;

			U8_FWD_N(pstr, next_ret_pos, str_len, 1);

			if ( next_ret_pos == cp_start ) {
				/* end of the UTF-8 buffer */
				break;
			}

			/* a 4-byte UTF-8 sequence is a supplementary code point: 2 UTF-16 units */
			units_left -= ( next_ret_pos - cp_start == 4 ) ? 2 : 1;
			next_count++;
		}

		/* if we are beyond our limit, then the loop is done */
		if ( next_count > csize ) {
			break;
		}

		if ( next_ret_pos == ret_pos ) {
			/* something wrong - malformed utf8? */
			break;
		}

		/* the grapheme fits - commit it */
		ret_pos = next_ret_pos;
		char_count = next_count;
		prev_pos = pos;
	}

	return ret_pos;
}
/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT
 *
 * Advances bi by up to size boundaries and returns the byte offset in pstr
 * that corresponds to the last boundary reached.
 *
 *   bi       break iterator, advanced with ubrk_next()
 *   size     number of boundaries (graphemes) to step over; iteration also
 *            stops when ubrk_next() returns UBRK_DONE
 *   pstr     UTF-8 text
 *   str_len  length of pstr in bytes
 *
 * ubrk_next() reports positions as UChar (UTF-16 code unit) indices, whereas
 * U8_FWD_N() counts code points. A supplementary code point is two UChars but
 * one code point, so the index is consumed in UTF-16 units while walking the
 * UTF-8 buffer instead of being handed to U8_FWD_N() as a code point count.
 * NOTE(review): assumes bi iterates over the UTF-16 conversion of pstr - confirm at the call site.
 */
static inline int32_t
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
{
	int pos = 0, next_pos = 0;
	int ret_pos = 0;

	while ( size ) {
		next_pos = ubrk_next(bi);

		if ( UBRK_DONE == next_pos ) {
			break;
		}
		pos = next_pos;
		size--;
	}

	/* pos is one past the last UChar: translate that many UTF-16 code units
	   into a UTF-8 byte offset, one code point at a time */
	while ( pos > 0 ) {
		int cp_start = ret_pos;

		U8_FWD_N(pstr, ret_pos, str_len, 1);

		if ( ret_pos == cp_start ) {
			/* end of the UTF-8 buffer */
			break;
		}

		/* a 4-byte UTF-8 sequence is a supplementary code point: 2 UTF-16 units */
		pos -= ( ret_pos - cp_start == 4 ) ? 2 : 1;
	}

	return ret_pos;
}
/* {{{ strstr_common_handler
 *
 * Shared implementation of the "find needle in haystack and return a piece of
 * the haystack" functions. Parses (string haystack, string needle
 * [, bool part = false]) from the PHP call.
 *
 *   f_ignore_case  non-zero for a case-insensitive search; this also skips the
 *                  byte-wise fast path below
 *
 * Result (via return_value):
 *   - FALSE if parameter parsing fails, the needle is empty, or the needle is
 *     not found;
 *   - if part is true, the portion of haystack before the match;
 *   - otherwise the portion of haystack from the match to the end.
 */
static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
{
	char *haystack, *needle;
	const char *found;
	size_t haystack_len, needle_len;
	int32_t ret_pos, uchar_pos;
	zend_bool part = 0;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {

		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
			 "grapheme_strstr: unable to parse input param", 0 );

		RETURN_FALSE;
	}

	if (needle_len == 0) {

		/* message prefix names this handler's function family, matching the parse error above */
		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strstr: Empty delimiter", 1 );

		RETURN_FALSE;
	}

	if ( !f_ignore_case ) {

		/* ASCII optimization: a byte-wise search tells us quickly whether the needle can be there at all */
		found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);

		/* if it isn't there then we are done */
		if ( !found ) {
			RETURN_FALSE;
		}

		/* if it is there, and if the haystack is ascii, we are all done */
		if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
			size_t found_offset = found - haystack;

			if (part) {
				RETURN_STRINGL(haystack, found_offset);
			} else {
				RETURN_STRINGL(found, haystack_len - found_offset);
			}
		}

	}

	/* need to work in utf16 */
	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );

	if ( ret_pos < 0 ) {
		RETURN_FALSE;
	}

	/* uchar_pos is the 'nth' Unicode character position of the needle.
	 * NOTE(review): U8_FWD_N counts code points; if grapheme_strpos_utf16 reports uchar_pos
	 * in UTF-16 code units, haystacks with supplementary characters before the match would
	 * be advanced too far - confirm against grapheme_strpos_utf16. */

	ret_pos = 0;
	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);

	if (part) {
		RETURN_STRINGL(haystack, ret_pos);
	} else {
		RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
	}

}
/*
 * Exercises the single-step and N-step forward/backward UTF-8 offset macros
 * (old UTF8_* names and their U8_* counterparts) over a fixed byte sequence
 * that mixes well-formed and ill-formed UTF-8, and compares every resulting
 * offset against the expected tables below. Mismatches are reported through
 * log_err(); nothing is returned.
 */
static void TestFwdBack(){
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};

    /* expected offsets after each single step */
    static const uint16_t fwd_unsafe[] ={1, 5, 6, 7, 9, 10, 11, 13, 14, 15, 16, 20, };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    /* step sizes for the *_N macros and the expected offsets after each step */
    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};

    uint32_t offunsafe=0, offsafe=0;

    uint32_t i=0;
    while(offunsafe < sizeof(input)){
        UTF8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    /* NOTE(review): offunsafe is not reset to 0 here, so this loop body never runs.
     * Left unchanged on purpose: enabling it changes what the test exercises - confirm
     * the expected table against U8_FWD_1_UNSAFE before resetting the offset. */
    i=0;
    while(offunsafe < sizeof(input)){
        U8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    /* NOTE(review): offsafe is not reset to 0 here either, so this loop body never runs. */
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        UTF8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        U8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            /* report the value that was actually compared (back_safe); back_unsafe is
               shorter than back_safe, so indexing it with i could read out of bounds */
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            /* see above: the expected value comes from back_safe */
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    /* the unsafe N-step loops skip the last two Nvalue entries */
    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* offsafe is a uint32_t: use the same %d conversion as the other messages, not %ld */
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* offsafe is a uint32_t: use the same %d conversion as the other messages, not %ld */
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }
}
/*
 * Moves the iterator's position forward by num code points by handing
 * u8, pos, length and num to U8_FWD_N. The UErrorCode argument is accepted
 * but never read or written here.
 * NOTE(review): u8, pos and length are not declared in this view - presumably
 * the UTF-8 text, current byte offset and text length held by the iterator; confirm.
 */
void UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { U8_FWD_N(u8, pos, length, num); }
/* keep this in sync with utf16tst.c's TestNulTerminated() */
/*
 * Runs the U8_* macros over a NUL-terminated byte string (length passed as -1)
 * and cross-checks them against each other:
 *   pass 1: U8_NEXT against the expected code points, and U8_NEXT_OR_FFFD,
 *           U8_FWD_1 and U8_FWD_N against the offsets U8_NEXT produced;
 *   pass 2: U8_GET, U8_GET_OR_FFFD and U8_SET_CP_LIMIT at every byte offset
 *           inside each sequence that U8_NEXT stepped over.
 * Mismatches are reported through log_err().
 */
static void TestNulTerminated() {
    static const uint8_t input[]={
        /*  0 */  0x61,
        /*  1 */  0xf0, 0x90, 0x90, 0x81,
        /*  5 */  0xc0, 0x80,
        /*  7 */  0xdf, 0x80,
        /*  9 */  0xc2,
        /* 10 */  0x62,
        /* 11 */  0xfd, 0xbe,
        /* 13 */  0xe0, 0xa0, 0x80,
        /* 16 */  0xe2, 0x82, 0xac,
        /* 19 */  0xf0, 0x90, 0x90,
        /* 22 */  0x00
        /* 23 */
    };
    static const UChar32 result[]={
        0x61,
        0x10401,
        U_SENTINEL,
        0x7c0,
        U_SENTINEL,
        0x62,
        U_SENTINEL,
        0x800,
        0x20ac,
        U_SENTINEL,
        0
    };

    UChar32 cp, cpAt, want;
    int32_t start, next=0, probe, limit, wantIndex;
    int32_t cpCount=0;

    /* pass 1: walk the string once, comparing the forward-moving macros */
    for(;;) {
        start=next;
        U8_NEXT(input, next, -1, cp);
        want=result[cpCount];
        if(cp!=want) {
            log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", start, cp, want);
        }

        probe=start;
        U8_NEXT_OR_FFFD(input, probe, -1, cp);
        if(want<0) {
            /* the _OR_FFFD variant is expected to yield U+FFFD where U8_NEXT yields a negative value */
            want=0xfffd;
        }
        if(cp!=want) {
            log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", start, cp, want);
        }
        if(probe!=next) {
            log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", probe, next);
        }

        probe=start;
        U8_FWD_1(input, probe, -1);
        if(probe!=next) {
            log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", probe, next);
        }

        ++cpCount;
        /*
         * Move by this many code points from the start.
         * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
         */
        wantIndex=next;
        if(cp==0) {
            wantIndex=next-1;
        }
        limit=0;
        U8_FWD_N(input, limit, -1, cpCount);
        if(limit!=wantIndex) {
            log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", limit, wantIndex);
        }

        if(cp==0) {
            break;
        }
    }

    /* pass 2: for every byte inside each sequence, check the random-access macros */
    next=0;
    for(;;) {
        start=next;
        probe=start;
        U8_NEXT(input, next, -1, cp);

        for(;;) {
            U8_GET(input, 0, probe, -1, cpAt);
            if(cpAt!=cp) {
                log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", start, cp, cpAt, probe);
            }

            U8_GET_OR_FFFD(input, 0, probe, -1, cpAt);
            if(cp>=0) {
                want=cp;
            } else {
                want=0xfffd;
            }
            if(cpAt!=want) {
                log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", start, want, cpAt, probe);
            }

            /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
            limit=probe+1;
            U8_SET_CP_LIMIT(input, 0, limit, -1);
            if(limit!=next) {
                log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", next, probe+1, limit);
            }

            ++probe;
            if(probe>=next) {
                break;
            }
        }

        if(cp==0) {
            break;
        }
    }
}