static jint groupCount(JNIEnv* env, jclass clazz, RegExData* data) { UErrorCode status = U_ZERO_ERROR; jint result = uregex_groupCount(data->regex, &status); if (!U_SUCCESS(status)) { throwRuntimeException(env, status); } return result; }
/** * Tests whether a given input tag matches a given tag's stored regular expression. * * @param[in] test The tag to be tested * @param[in] tag The tag to test against; only uses the hash and regexp members */ uint32_t GrammarApplicator::doesTagMatchRegexp(uint32_t test, const Tag& tag, bool bypass_index) { uint32_t match = 0; uint32_t ih = hash_value(tag.hash, test); if (!bypass_index && index_matches(index_regexp_no, ih)) { match = 0; } else if (!bypass_index && index_matches(index_regexp_yes, ih)) { match = test; } else { const Tag& itag = *(single_tags.find(test)->second); UErrorCode status = U_ZERO_ERROR; uregex_setText(tag.regexp, itag.tag.c_str(), static_cast<int32_t>(itag.tag.size()), &status); if (status != U_ZERO_ERROR) { u_fprintf(ux_stderr, "Error: uregex_setText(MatchSet) returned %s for tag %S before input line %u - cannot continue!\n", u_errorName(status), tag.tag.c_str(), numLines); CG3Quit(1); } status = U_ZERO_ERROR; if (uregex_find(tag.regexp, -1, &status)) { match = itag.hash; } if (status != U_ZERO_ERROR) { u_fprintf(ux_stderr, "Error: uregex_find(MatchSet) returned %s for tag %S before input line %u - cannot continue!\n", u_errorName(status), tag.tag.c_str(), numLines); CG3Quit(1); } if (match) { int32_t gc = uregex_groupCount(tag.regexp, &status); if (gc > 0 && regexgrps.second != 0) { UChar tmp[1024]; for (int i = 1; i <= gc; ++i) { tmp[0] = 0; int32_t len = uregex_group(tag.regexp, i, tmp, 1024, &status); regexgrps.second->resize(std::max(static_cast<size_t>(regexgrps.first) + 1, regexgrps.second->size())); UnicodeString& ucstr = (*regexgrps.second)[regexgrps.first]; ucstr.remove(); ucstr.append(tmp, len); ++regexgrps.first; } } else { index_regexp_yes.insert(ih); } } else { index_regexp_no.insert(ih); } } return match; }
static void startEnd(JNIEnv* env, jclass clazz, RegExData* data, jintArray offsets) { UErrorCode status = U_ZERO_ERROR; jint * offsetsRaw = env->GetIntArrayElements(offsets, NULL); int groupCount = uregex_groupCount(data->regex, &status); for (int i = 0; i <= groupCount && U_SUCCESS(status); i++) { offsetsRaw[2 * i + 0] = uregex_start(data->regex, i, &status); offsetsRaw[2 * i + 1] = uregex_end(data->regex, i, &status); } env->ReleaseIntArrayElements(offsets, offsetsRaw, 0); if (!U_SUCCESS(status)) { throwRuntimeException(env, status); } }
/** * Tests whether a given reading matches a given tag. * * In the https://visl.sdu.dk/cg3_performance.html test data, this function is executed 1058428 times, * of which 827259 are treated as raw tags. * * @param[in] reading The reading to test * @param[in] tag The tag to test against * @param[in] unif_mode Used to signal that a parent set was a $$unified set */ uint32_t GrammarApplicator::doesTagMatchReading(const Reading& reading, const Tag& tag, bool unif_mode, bool bypass_index) { uint32_t retval = 0; uint32_t match = 0; if (!(tag.type & T_SPECIAL) || tag.type & T_FAILFAST) { uint32SortedVector::const_iterator itf, ite = reading.tags_plain.end(); bool raw_in = reading.tags_plain_bloom.matches(tag.hash); if (tag.type & T_FAILFAST) { itf = reading.tags_plain.find(tag.plain_hash); raw_in = (itf != ite); } else if (raw_in) { itf = reading.tags_plain.find(tag.hash); raw_in = (itf != ite); } if (raw_in) { match = tag.hash; } } else if (tag.type & T_SET) { uint32_t sh = hash_value(tag.tag); sh = grammar->sets_by_name.find(sh)->second; match = doesSetMatchReading(reading, sh, bypass_index, unif_mode); } else if (tag.type & T_VARSTRING) { const Tag* nt = generateVarstringTag(&tag); match = doesTagMatchReading(reading, *nt, unif_mode, bypass_index); } else if (tag.type & T_META) { if (tag.regexp && !reading.parent->text.empty()) { UErrorCode status = U_ZERO_ERROR; uregex_setText(tag.regexp, reading.parent->text.c_str(), static_cast<int32_t>(reading.parent->text.size()), &status); if (status != U_ZERO_ERROR) { u_fprintf(ux_stderr, "Error: uregex_setText(MatchSet) returned %s for tag %S before input line %u - cannot continue!\n", u_errorName(status), tag.tag.c_str(), numLines); CG3Quit(1); } status = U_ZERO_ERROR; if (uregex_find(tag.regexp, -1, &status)) { match = tag.hash; } if (status != U_ZERO_ERROR) { u_fprintf(ux_stderr, "Error: uregex_find(MatchSet) returned %s for tag %S before input line %u - cannot continue!\n", u_errorName(status), tag.tag.c_str(), numLines); CG3Quit(1); } if (match) { int32_t gc = uregex_groupCount(tag.regexp, &status); if (gc > 0 && regexgrps.second != 0) { UChar tmp[1024]; for (int i = 1; i <= gc; ++i) { tmp[0] = 0; int32_t len = uregex_group(tag.regexp, i, tmp, 1024, &status); regexgrps.second->resize(std::max(static_cast<size_t>(regexgrps.first) + 1, regexgrps.second->size())); UnicodeString& ucstr = (*regexgrps.second)[regexgrps.first]; ucstr.remove(); ucstr.append(tmp, len); ++regexgrps.first; } } } } } else if (tag.regexp) { match = doesRegexpMatchReading(reading, tag, bypass_index); } else if (tag.type & T_CASE_INSENSITIVE) { for (auto mter : reading.tags_textual) { match = doesTagMatchIcase(mter, tag, bypass_index); if (match) { break; } } } else if (tag.type & T_REGEXP_ANY) { if (tag.type & T_BASEFORM) { match = reading.baseform; if (unif_mode) { if (unif_last_baseform) { if (unif_last_baseform != reading.baseform) { match = 0; } } else { unif_last_baseform = reading.baseform; } } } else if (tag.type & T_WORDFORM) { match = reading.parent->wordform->hash; if (unif_mode) { if (unif_last_wordform) { if (unif_last_wordform != reading.parent->wordform->hash) { match = 0; } } else { unif_last_wordform = reading.parent->wordform->hash; } } } else { for (auto mter : reading.tags_textual) { const Tag& itag = *(single_tags.find(mter)->second); if (!(itag.type & (T_BASEFORM | T_WORDFORM))) { match = itag.hash; if (unif_mode) { if (unif_last_textual) { if (unif_last_textual != mter) { match = 0; } } else { unif_last_textual = mter; } } } if (match) { break; } } } } else if (tag.type & T_NUMERICAL) { for (auto mter : reading.tags_numerical) { const Tag& itag = *(mter.second); double compval = tag.comparison_val; if (compval <= NUMERIC_MIN) { compval = reading.parent->getMin(tag.comparison_hash); } else if (compval >= NUMERIC_MAX) { compval = reading.parent->getMax(tag.comparison_hash); } if (tag.comparison_hash == itag.comparison_hash) { if (tag.comparison_op == OP_EQUALS && itag.comparison_op == OP_EQUALS && compval == itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_NOTEQUALS && itag.comparison_op == OP_EQUALS && compval != itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_EQUALS && itag.comparison_op == OP_NOTEQUALS && compval != itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_NOTEQUALS && itag.comparison_op == OP_NOTEQUALS && compval == itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_EQUALS && itag.comparison_op == OP_LESSTHAN && compval < itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_EQUALS && itag.comparison_op == OP_LESSEQUALS && compval <= itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_EQUALS && itag.comparison_op == OP_GREATERTHAN && compval > itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_EQUALS && itag.comparison_op == OP_GREATEREQUALS && compval >= itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_NOTEQUALS && itag.comparison_op == OP_LESSTHAN) { match = itag.hash; } else if (tag.comparison_op == OP_NOTEQUALS && itag.comparison_op == OP_LESSEQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_NOTEQUALS && itag.comparison_op == OP_GREATERTHAN) { match = itag.hash; } else if (tag.comparison_op == OP_NOTEQUALS && itag.comparison_op == OP_GREATEREQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_LESSTHAN && itag.comparison_op == OP_NOTEQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_LESSEQUALS && itag.comparison_op == OP_NOTEQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_GREATERTHAN && itag.comparison_op == OP_NOTEQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_GREATEREQUALS && itag.comparison_op == OP_NOTEQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_LESSTHAN && itag.comparison_op == OP_EQUALS && compval > itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_LESSEQUALS && itag.comparison_op == OP_EQUALS && compval >= itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_LESSTHAN && itag.comparison_op == OP_LESSTHAN) { match = itag.hash; } else if (tag.comparison_op == OP_LESSEQUALS && itag.comparison_op == OP_LESSEQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_LESSEQUALS && itag.comparison_op == OP_LESSTHAN) { match = itag.hash; } else if (tag.comparison_op == OP_LESSTHAN && itag.comparison_op == OP_LESSEQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_LESSTHAN && itag.comparison_op == OP_GREATERTHAN && compval > itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_LESSTHAN && itag.comparison_op == OP_GREATEREQUALS && compval > itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_LESSEQUALS && itag.comparison_op == OP_GREATERTHAN && compval > itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_LESSEQUALS && itag.comparison_op == OP_GREATEREQUALS && compval >= itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_GREATERTHAN && itag.comparison_op == OP_EQUALS && compval < itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_GREATEREQUALS && itag.comparison_op == OP_EQUALS && compval <= itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_GREATERTHAN && itag.comparison_op == OP_GREATERTHAN) { match = itag.hash; } else if (tag.comparison_op == OP_GREATEREQUALS && itag.comparison_op == OP_GREATEREQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_GREATEREQUALS && itag.comparison_op == OP_GREATERTHAN) { match = itag.hash; } else if (tag.comparison_op == OP_GREATERTHAN && itag.comparison_op == OP_GREATEREQUALS) { match = itag.hash; } else if (tag.comparison_op == OP_GREATERTHAN && itag.comparison_op == OP_LESSTHAN && compval < itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_GREATERTHAN && itag.comparison_op == OP_LESSEQUALS && compval < itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_GREATEREQUALS && itag.comparison_op == OP_LESSTHAN && compval < itag.comparison_val) { match = itag.hash; } else if (tag.comparison_op == OP_GREATEREQUALS && itag.comparison_op == OP_LESSEQUALS && compval <= itag.comparison_val) { match = itag.hash; } if (match) { break; } } } } else if (tag.type & T_VARIABLE) { if (variables.find(tag.comparison_hash) == variables.end()) { //u_fprintf(ux_stderr, "Info: %S failed.\n", tag.tag.c_str()); match = 0; } else { //u_fprintf(ux_stderr, "Info: %S matched.\n", tag.tag.c_str()); match = tag.hash; } } else if (tag.type & T_PAR_LEFT) { if (par_left_tag && reading.parent->local_number == par_left_pos && reading.tags.find(par_left_tag) != reading.tags.end()) { match = grammar->tag_any; } } else if (tag.type & T_PAR_RIGHT) { if (par_right_tag && reading.parent->local_number == par_right_pos && reading.tags.find(par_right_tag) != reading.tags.end()) { match = grammar->tag_any; } } else if (tag.type & T_ENCL) { if (!reading.parent->enclosed.empty()) { match = true; } } else if (tag.type & T_TARGET) { if (target && reading.parent == target) { match = grammar->tag_any; } } else if (tag.type & T_MARK) { if (mark && reading.parent == mark) { match = grammar->tag_any; } } else if (tag.type & T_ATTACHTO) { if (attach_to && reading.parent == attach_to) { match = grammar->tag_any; } } else if (tag.type & T_SAME_BASIC) { if (reading.hash_plain == same_basic) { match = grammar->tag_any; } } if (match) { ++match_single; retval = match; } return retval; }
int prString_FindRegexp(struct VMGlobals *g, int numArgsPushed) { int err; PyrSlot *a = g->sp - 2; // source string PyrSlot *b = g->sp - 1; // pattern PyrSlot *c = g->sp; // offset if (!isKindOfSlot(b, class_string) || (NotInt(c))) return errWrongType; // post("prString_FindRegexp\n"); int maxfind = MAXREGEXFIND; int offset = slotRawInt(c); int stringsize = slotRawObject(a)->size + 1; int patternsize = slotRawObject(b)->size + 1; char *string = (char*)malloc(slotRawObject(a)->size + 1); err = slotStrVal(a, string, slotRawObject(a)->size + 1); if (err){ free(string); return err; } char *pattern = (char*)malloc(slotRawObject(b)->size + 1); err = slotStrVal(b, pattern, slotRawObject(b)->size + 1); if (err) return err; UParseError uerr; UErrorCode status = (UErrorCode)0; UChar *regexStr; UChar *ustring; regexStr = (UChar*)malloc((patternsize)*sizeof(UChar)); u_charsToUChars (pattern, regexStr, patternsize); ustring = (UChar*)malloc((stringsize)*sizeof(UChar)); u_charsToUChars (string+offset, ustring, stringsize-offset); unsigned flags = UREGEX_MULTILINE; int groupNumber = 0; SCRegExRegion * what; int indx = 0; int size = 0; URegularExpression *expression = uregex_open(regexStr, -1, flags, &uerr, &status); if(U_FAILURE(status)) goto nilout; if(!U_FAILURE(status)) { uregex_setText(expression, ustring, -1, &status); what = (SCRegExRegion*)malloc((maxfind)*sizeof(SCRegExRegion)); for(int i=0; i< maxfind; i++) { SCRegExRegion range; range.matched = false; what[i] = range; } int32_t groups = uregex_groupCount(expression, &status) + 1; if(U_FAILURE(status)) goto nilout; // post("groups: %i\n", groups); while (uregex_findNext(expression, &status) && size<maxfind) { if(U_FAILURE(status)) return errNone; for(int i=0; i< groups; ++i){ what[size].group = i; what[size].start = sc_clip(uregex_start(expression, i, &status), 0, stringsize) ; if(U_FAILURE(status)) goto nilout; what[size].end = sc_clip(uregex_end(expression, i, &status), 0, stringsize); what[size].matched = true; // post("index:%i, size:%i, start %i, end %i\n", i, size, what[i].start, what[i].end); size = indx++ + 1; if(U_FAILURE(status)) goto nilout; } } PyrObject *result_array = newPyrArray(g->gc, size, 0, true); result_array->size = 0; if (size>0) //(matched) { for (int i = 0; i < size; i++) { if (what[0].matched == false) { result_array->size++; SetNil(result_array->slots+i); } else { result_array->size++; int match_start = what[i].start; int match_length = what[i].end - what[i].start; // post("for i:%i, start %i, end %i\n", i, what[i].start, what[i].end); // char *match = (char*)malloc(match_length); char match[match_length]; strncpy(match, string + offset + match_start, match_length); match[match_length] = 0; PyrObject *array = newPyrArray(g->gc, 2, 0, true); array->size = 2; SetInt(array->slots, match_start + offset); PyrObject *matched_string = (PyrObject*)newPyrString(g->gc, match, 0, true); SetObject(array->slots+1, matched_string); g->gc->GCWrite(matched_string, array->slots + 1); SetObject(result_array->slots + i, array); g->gc->GCWrite(array, result_array->slots + i); } } } else { SetNil(a); } free(what); free(pattern); free(regexStr); free(ustring); free(string); SetObject(a, result_array); g->gc->GCWrite(result_array,a); //uregex_close(expression); return errNone; } nilout: free(string); free(what); free(pattern); free(regexStr); free(ustring); SetNil(a); return errNone; }
static void TestRegexCAPI(void) { UErrorCode status = U_ZERO_ERROR; URegularExpression *re; UChar pat[200]; UChar *minus1; memset(&minus1, -1, sizeof(minus1)); /* Mimimalist open/close */ u_uastrncpy(pat, "abc*", sizeof(pat)/2); re = uregex_open(pat, -1, 0, 0, &status); TEST_ASSERT_SUCCESS(status); uregex_close(re); /* Open with all flag values set */ status = U_ZERO_ERROR; re = uregex_open(pat, -1, UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status); TEST_ASSERT_SUCCESS(status); uregex_close(re); /* Open with an invalid flag */ status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0x40000000, 0, &status); TEST_ASSERT(status == U_REGEX_INVALID_FLAG); uregex_close(re); /* openC with an invalid parameter */ status = U_ZERO_ERROR; re = uregex_openC(NULL, UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status); TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR && re == NULL); /* openC with an invalid parameter */ status = U_USELESS_COLLATOR_ERROR; re = uregex_openC(NULL, UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status); TEST_ASSERT(status == U_USELESS_COLLATOR_ERROR && re == NULL); /* openC open from a C string */ { const UChar *p; int32_t len; status = U_ZERO_ERROR; re = uregex_openC("abc*", 0, 0, &status); TEST_ASSERT_SUCCESS(status); p = uregex_pattern(re, &len, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS above should change too... */ if(U_SUCCESS(status)) { u_uastrncpy(pat, "abc*", sizeof(pat)/2); TEST_ASSERT(u_strcmp(pat, p) == 0); TEST_ASSERT(len==(int32_t)strlen("abc*")); } uregex_close(re); /* TODO: Open with ParseError parameter */ } /* * clone */ { URegularExpression *clone1; URegularExpression *clone2; URegularExpression *clone3; UChar testString1[30]; UChar testString2[30]; UBool result; status = U_ZERO_ERROR; re = uregex_openC("abc*", 0, 0, &status); TEST_ASSERT_SUCCESS(status); clone1 = uregex_clone(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(clone1 != NULL); status = U_ZERO_ERROR; clone2 = uregex_clone(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(clone2 != NULL); uregex_close(re); status = U_ZERO_ERROR; clone3 = uregex_clone(clone2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(clone3 != NULL); u_uastrncpy(testString1, "abcccd", sizeof(pat)/2); u_uastrncpy(testString2, "xxxabcccd", sizeof(pat)/2); status = U_ZERO_ERROR; uregex_setText(clone1, testString1, -1, &status); TEST_ASSERT_SUCCESS(status); result = uregex_lookingAt(clone1, 0, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result==TRUE); status = U_ZERO_ERROR; uregex_setText(clone2, testString2, -1, &status); TEST_ASSERT_SUCCESS(status); result = uregex_lookingAt(clone2, 0, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result==FALSE); result = uregex_find(clone2, 0, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result==TRUE); uregex_close(clone1); uregex_close(clone2); uregex_close(clone3); } /* * pattern() */ { const UChar *resultPat; int32_t resultLen; u_uastrncpy(pat, "hello", sizeof(pat)/2); status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0, NULL, &status); resultPat = uregex_pattern(re, &resultLen, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS above should change too... */ if (U_SUCCESS(status)) { TEST_ASSERT(resultLen == -1); TEST_ASSERT(u_strcmp(resultPat, pat) == 0); } uregex_close(re); status = U_ZERO_ERROR; re = uregex_open(pat, 3, 0, NULL, &status); resultPat = uregex_pattern(re, &resultLen, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS above should change too... */ if (U_SUCCESS(status)) { TEST_ASSERT(resultLen == 3); TEST_ASSERT(u_strncmp(resultPat, pat, 3) == 0); TEST_ASSERT(u_strlen(resultPat) == 3); } uregex_close(re); } /* * flags() */ { int32_t t; status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0, NULL, &status); t = uregex_flags(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t == 0); uregex_close(re); status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0, NULL, &status); t = uregex_flags(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t == 0); uregex_close(re); status = U_ZERO_ERROR; re = uregex_open(pat, -1, UREGEX_CASE_INSENSITIVE | UREGEX_DOTALL, NULL, &status); t = uregex_flags(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t == (UREGEX_CASE_INSENSITIVE | UREGEX_DOTALL)); uregex_close(re); } /* * setText() and lookingAt() */ { UChar text1[50]; UChar text2[50]; UBool result; u_uastrncpy(text1, "abcccd", sizeof(text1)/2); u_uastrncpy(text2, "abcccxd", sizeof(text2)/2); status = U_ZERO_ERROR; u_uastrncpy(pat, "abc*d", sizeof(pat)/2); re = uregex_open(pat, -1, 0, NULL, &status); TEST_ASSERT_SUCCESS(status); /* Operation before doing a setText should fail... */ status = U_ZERO_ERROR; uregex_lookingAt(re, 0, &status); TEST_ASSERT( status== U_REGEX_INVALID_STATE); status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text2, -1, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 5, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 6, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * getText() */ { UChar text1[50]; UChar text2[50]; const UChar *result; int32_t textLength; u_uastrncpy(text1, "abcccd", sizeof(text1)/2); u_uastrncpy(text2, "abcccxd", sizeof(text2)/2); status = U_ZERO_ERROR; u_uastrncpy(pat, "abc*d", sizeof(pat)/2); re = uregex_open(pat, -1, 0, NULL, &status); uregex_setText(re, text1, -1, &status); result = uregex_getText(re, &textLength, &status); TEST_ASSERT(result == text1); TEST_ASSERT(textLength == -1); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text2, 7, &status); result = uregex_getText(re, &textLength, &status); TEST_ASSERT(result == text2); TEST_ASSERT(textLength == 7); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text2, 4, &status); result = uregex_getText(re, &textLength, &status); TEST_ASSERT(result == text2); TEST_ASSERT(textLength == 4); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * matches() */ { UChar text1[50]; UBool result; int len; UChar nullString[] = {0,0,0}; u_uastrncpy(text1, "abcccde", sizeof(text1)/2); status = U_ZERO_ERROR; u_uastrncpy(pat, "abc*d", sizeof(pat)/2); re = uregex_open(pat, -1, 0, NULL, &status); uregex_setText(re, text1, -1, &status); result = uregex_matches(re, 0, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 6, &status); result = uregex_matches(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 6, &status); result = uregex_matches(re, 1, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); uregex_close(re); status = U_ZERO_ERROR; re = uregex_openC(".?", 0, NULL, &status); uregex_setText(re, text1, -1, &status); len = u_strlen(text1); result = uregex_matches(re, len, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, nullString, -1, &status); TEST_ASSERT_SUCCESS(status); result = uregex_matches(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * lookingAt() Used in setText test. */ /* * find(), findNext, start, end, reset */ { UChar text1[50]; UBool result; u_uastrncpy(text1, "012rx5rx890rxrx...", sizeof(text1)/2); status = U_ZERO_ERROR; re = uregex_openC("rx", 0, NULL, &status); uregex_setText(re, text1, -1, &status); result = uregex_find(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 3); TEST_ASSERT(uregex_end(re, 0, &status) == 5); TEST_ASSERT_SUCCESS(status); result = uregex_find(re, 9, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 11); TEST_ASSERT(uregex_end(re, 0, &status) == 13); TEST_ASSERT_SUCCESS(status); result = uregex_find(re, 14, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_reset(re, 0, &status); result = uregex_findNext(re, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 3); TEST_ASSERT(uregex_end(re, 0, &status) == 5); TEST_ASSERT_SUCCESS(status); result = uregex_findNext(re, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 6); TEST_ASSERT(uregex_end(re, 0, &status) == 8); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_reset(re, 12, &status); result = uregex_findNext(re, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 13); TEST_ASSERT(uregex_end(re, 0, &status) == 15); TEST_ASSERT_SUCCESS(status); result = uregex_findNext(re, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * groupCount */ { int32_t result; status = U_ZERO_ERROR; re = uregex_openC("abc", 0, NULL, &status); result = uregex_groupCount(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result == 0); uregex_close(re); status = U_ZERO_ERROR; re = uregex_openC("abc(def)(ghi(j))", 0, NULL, &status); result = uregex_groupCount(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result == 3); uregex_close(re); } /* * group() */ { UChar text1[80]; UChar buf[80]; UBool result; int32_t resultSz; u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); status = U_ZERO_ERROR; re = uregex_openC("abc(.*?)def", 0, NULL, &status); TEST_ASSERT_SUCCESS(status); uregex_setText(re, text1, -1, &status); result = uregex_find(re, 0, &status); TEST_ASSERT(result==TRUE); /* Capture Group 0, the full match. Should succeed. */ status = U_ZERO_ERROR; resultSz = uregex_group(re, 0, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("abc interior def", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); /* Capture group #1. Should succeed. */ status = U_ZERO_ERROR; resultSz = uregex_group(re, 1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING(" interior ", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen(" interior ")); /* Capture group out of range. Error. */ status = U_ZERO_ERROR; uregex_group(re, 2, buf, sizeof(buf)/2, &status); TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); /* NULL buffer, pure pre-flight */ status = U_ZERO_ERROR; resultSz = uregex_group(re, 0, NULL, 0, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); /* Too small buffer, truncated string */ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_group(re, 0, buf, 5, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT_STRING("abc i", buf, FALSE); TEST_ASSERT(buf[5] == (UChar)0xffff); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); /* Output string just fits buffer, no NUL term. */ status = U_ZERO_ERROR; resultSz = uregex_group(re, 0, buf, (int32_t)strlen("abc interior def"), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("abc interior def", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); TEST_ASSERT(buf[strlen("abc interior def")] == (UChar)0xffff); uregex_close(re); } /* * Regions */ /* SetRegion(), getRegion() do something */ TEST_SETUP(".*", "0123456789ABCDEF", 0) UChar resultString[40]; TEST_ASSERT(uregex_regionStart(re, &status) == 0); TEST_ASSERT(uregex_regionEnd(re, &status) == 16); uregex_setRegion(re, 3, 6, &status); TEST_ASSERT(uregex_regionStart(re, &status) == 3); TEST_ASSERT(uregex_regionEnd(re, &status) == 6); TEST_ASSERT(uregex_findNext(re, &status)); TEST_ASSERT(uregex_group(re, 0, resultString, sizeof(resultString)/2, &status) == 3) TEST_ASSERT_STRING("345", resultString, TRUE); TEST_TEARDOWN; /* find(start=-1) uses regions */ TEST_SETUP(".*", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_end(re, 0, &status) == 6); TEST_TEARDOWN; /* find (start >=0) does not use regions */ TEST_SETUP(".*", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 0); TEST_ASSERT(uregex_end(re, 0, &status) == 16); TEST_TEARDOWN; /* findNext() obeys regions */ TEST_SETUP(".", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_findNext(re,&status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_findNext(re, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 5); TEST_ASSERT(uregex_findNext(re, &status) == FALSE); TEST_TEARDOWN; /* matches(start=-1) uses regions */ /* Also, verify that non-greedy *? succeeds in finding the full match. */ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_matches(re, -1, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_end(re, 0, &status) == 6); TEST_TEARDOWN; /* matches (start >=0) does not use regions */ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_matches(re, 0, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 0); TEST_ASSERT(uregex_end(re, 0, &status) == 16); TEST_TEARDOWN; /* lookingAt(start=-1) uses regions */ /* Also, verify that non-greedy *? finds the first (shortest) match. */ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_lookingAt(re, -1, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_end(re, 0, &status) == 4); TEST_TEARDOWN; /* lookingAt (start >=0) does not use regions */ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_lookingAt(re, 0, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 0); TEST_ASSERT(uregex_end(re, 0, &status) == 0); TEST_TEARDOWN; /* hitEnd() */ TEST_SETUP("[a-f]*", "abcdefghij", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_hitEnd(re, &status) == FALSE); TEST_TEARDOWN; TEST_SETUP("[a-f]*", "abcdef", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_hitEnd(re, &status) == TRUE); TEST_TEARDOWN; /* requireEnd */ TEST_SETUP("abcd", "abcd", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_requireEnd(re, &status) == FALSE); TEST_TEARDOWN; TEST_SETUP("abcd$", "abcd", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_requireEnd(re, &status) == TRUE); TEST_TEARDOWN; /* anchoringBounds */ TEST_SETUP("abc$", "abcdef", 0); TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == TRUE); uregex_useAnchoringBounds(re, FALSE, &status); TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == FALSE); TEST_ASSERT(uregex_find(re, -1, &status) == FALSE); uregex_useAnchoringBounds(re, TRUE, &status); uregex_setRegion(re, 0, 3, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); TEST_ASSERT(uregex_end(re, 0, &status) == 3); TEST_TEARDOWN; /* Transparent Bounds */ TEST_SETUP("abc(?=def)", "abcdef", 0); TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == FALSE); uregex_useTransparentBounds(re, TRUE, &status); TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == TRUE); uregex_useTransparentBounds(re, FALSE, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* No Region */ uregex_setRegion(re, 0, 3, &status); TEST_ASSERT(uregex_find(re, -1, &status) == FALSE); /* with region, opaque bounds */ uregex_useTransparentBounds(re, TRUE, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* with region, transparent bounds */ TEST_ASSERT(uregex_end(re, 0, &status) == 3); TEST_TEARDOWN; /* * replaceFirst() */ { UChar text1[80]; UChar text2[80]; UChar replText[80]; UChar buf[80]; int32_t resultSz; u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); u_uastrncpy(text2, "No match here.", sizeof(text2)/2); u_uastrncpy(replText, "<$1>", sizeof(replText)/2); status = U_ZERO_ERROR; re = uregex_openC("x(.*?)x", 0, NULL, &status); TEST_ASSERT_SUCCESS(status); /* Normal case, with match */ uregex_setText(re, text1, -1, &status); resultSz = uregex_replaceFirst(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("Replace <aa> x1x x...x.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); /* No match. Text should copy to output with no changes. */ status = U_ZERO_ERROR; uregex_setText(re, text2, -1, &status); resultSz = uregex_replaceFirst(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("No match here.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("No match here.")); /* Match, output just fills buffer, no termination warning. */ status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceFirst(re, replText, -1, buf, strlen("Replace <aa> x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> x1x x...x.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* Do the replaceFirst again, without first resetting anything. * Should give the same results. */ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceFirst(re, replText, -1, buf, strlen("Replace <aa> x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> x1x x...x.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* NULL buffer, zero buffer length */ status = U_ZERO_ERROR; resultSz = uregex_replaceFirst(re, replText, -1, NULL, 0, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); /* Buffer too small by one */ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceFirst(re, replText, -1, buf, strlen("Replace <aa> x1x x...x.")-1, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT_STRING("Replace <aa> x1x x...x", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); uregex_close(re); } /* * replaceAll() */ { UChar text1[80]; UChar text2[80]; UChar replText[80]; UChar buf[80]; int32_t resultSz; int32_t expectedResultSize; int32_t i; u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); u_uastrncpy(text2, "No match here.", sizeof(text2)/2); u_uastrncpy(replText, "<$1>", sizeof(replText)/2); expectedResultSize = u_strlen(text1); status = U_ZERO_ERROR; re = uregex_openC("x(.*?)x", 0, NULL, &status); TEST_ASSERT_SUCCESS(status); /* Normal case, with match */ uregex_setText(re, text1, -1, &status); resultSz = uregex_replaceAll(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("Replace <aa> <1> <...>.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); /* No match. Text should copy to output with no changes. */ status = U_ZERO_ERROR; uregex_setText(re, text2, -1, &status); resultSz = uregex_replaceAll(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("No match here.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("No match here.")); /* Match, output just fills buffer, no termination warning. */ status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceAll(re, replText, -1, buf, strlen("Replace xaax x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> <1> <...>.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace <aa> <1> <...>.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* Do the replaceFirst again, without first resetting anything. * Should give the same results. */ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceAll(re, replText, -1, buf, strlen("Replace xaax x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> <1> <...>.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace <aa> <1> <...>.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* NULL buffer, zero buffer length */ status = U_ZERO_ERROR; resultSz = uregex_replaceAll(re, replText, -1, NULL, 0, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(resultSz == (int32_t)strlen("Replace <aa> <1> <...>.")); /* Buffer too small. Try every size, which will tickle edge cases * in uregex_appendReplacement (used by replaceAll) */ for (i=0; i<expectedResultSize; i++) { char expected[80]; status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceAll(re, replText, -1, buf, i, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); strcpy(expected, "Replace <aa> <1> <...>."); expected[i] = 0; TEST_ASSERT_STRING(expected, buf, FALSE); TEST_ASSERT(resultSz == expectedResultSize); TEST_ASSERT(buf[i] == (UChar)0xffff); } uregex_close(re); } /* * appendReplacement() */ { UChar text[100]; UChar repl[100]; UChar buf[100]; UChar *bufPtr; int32_t bufCap; status = U_ZERO_ERROR; re = uregex_openC(".*", 0, 0, &status); TEST_ASSERT_SUCCESS(status); u_uastrncpy(text, "whatever", sizeof(text)/2); u_uastrncpy(repl, "some other", sizeof(repl)/2); uregex_setText(re, text, -1, &status); /* match covers whole target string */ uregex_find(re, 0, &status); TEST_ASSERT_SUCCESS(status); bufPtr = buf; bufCap = sizeof(buf) / 2; uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("some other", buf, TRUE); /* Match has \u \U escapes */ uregex_find(re, 0, &status); TEST_ASSERT_SUCCESS(status); bufPtr = buf; bufCap = sizeof(buf) / 2; u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2); uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE); uregex_close(re); } /* * appendTail(). Checked in ReplaceFirst(), replaceAll(). */ /* * split() */ { UChar textToSplit[80]; UChar text2[80]; UChar buf[200]; UChar *fields[10]; int32_t numFields; int32_t requiredCapacity; int32_t spaceNeeded; int32_t sz; u_uastrncpy(textToSplit, "first : second: third", sizeof(textToSplit)/2); u_uastrncpy(text2, "No match here.", sizeof(text2)/2); status = U_ZERO_ERROR; re = uregex_openC(":", 0, NULL, &status); /* Simple split */ uregex_setText(re, textToSplit, -1, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if (U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 3); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second", fields[1], TRUE); TEST_ASSERT_STRING(" third", fields[2], TRUE); TEST_ASSERT(fields[3] == NULL); spaceNeeded = u_strlen(textToSplit) - (numFields - 1) + /* Field delimiters do not appear in output */ numFields; /* Each field gets a NUL terminator */ TEST_ASSERT(spaceNeeded == requiredCapacity); } } uregex_close(re); /* Split with too few output strings available */ status = U_ZERO_ERROR; re = uregex_openC(":", 0, NULL, &status); uregex_setText(re, textToSplit, -1, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 2); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second: third", fields[1], TRUE); TEST_ASSERT(!memcmp(&fields[2],&minus1,sizeof(UChar*))); spaceNeeded = u_strlen(textToSplit) - (numFields - 1) + /* Field delimiters do not appear in output */ numFields; /* Each field gets a NUL terminator */ TEST_ASSERT(spaceNeeded == requiredCapacity); /* Split with a range of output buffer sizes. */ spaceNeeded = u_strlen(textToSplit) - (numFields - 1) + /* Field delimiters do not appear in output */ numFields; /* Each field gets a NUL terminator */ for (sz=0; sz < spaceNeeded+1; sz++) { memset(fields, -1, sizeof(fields)); status = U_ZERO_ERROR; numFields = uregex_split(re, buf, sz, &requiredCapacity, fields, 10, &status); if (sz >= spaceNeeded) { TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second", fields[1], TRUE); TEST_ASSERT_STRING(" third", fields[2], TRUE); } else { TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); } TEST_ASSERT(numFields == 3); TEST_ASSERT(fields[3] == NULL); TEST_ASSERT(spaceNeeded == requiredCapacity); } } } uregex_close(re); } /* Split(), part 2. Patterns with capture groups. The capture group text * comes out as additional fields. */ { UChar textToSplit[80]; UChar buf[200]; UChar *fields[10]; int32_t numFields; int32_t requiredCapacity; int32_t spaceNeeded; int32_t sz; u_uastrncpy(textToSplit, "first <tag-a> second<tag-b> third", sizeof(textToSplit)/2); status = U_ZERO_ERROR; re = uregex_openC("<(.*?)>", 0, NULL, &status); uregex_setText(re, textToSplit, -1, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 5); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second", fields[2], TRUE); TEST_ASSERT_STRING("tag-b", fields[3], TRUE); TEST_ASSERT_STRING(" third", fields[4], TRUE); TEST_ASSERT(fields[5] == NULL); spaceNeeded = strlen("first .tag-a. second.tag-b. third."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } } /* Split with too few output strings available (2) */ status = U_ZERO_ERROR; memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 2); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second<tag-b> third", fields[1], TRUE); TEST_ASSERT(!memcmp(&fields[2],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first . second<tag-b> third."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } /* Split with too few output strings available (3) */ status = U_ZERO_ERROR; memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 3, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 3); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second<tag-b> third", fields[2], TRUE); TEST_ASSERT(!memcmp(&fields[3],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first .tag-a. second<tag-b> third."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } /* Split with just enough output strings available (5) */ status = U_ZERO_ERROR; memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 5, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 5); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second", fields[2], TRUE); TEST_ASSERT_STRING("tag-b", fields[3], TRUE); TEST_ASSERT_STRING(" third", fields[4], TRUE); TEST_ASSERT(!memcmp(&fields[5],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first .tag-a. second.tag-b. third."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } /* Split, end of text is a field delimiter. */ status = U_ZERO_ERROR; sz = strlen("first <tag-a> second<tag-b>"); uregex_setText(re, textToSplit, sz, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 9, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 4); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second", fields[2], TRUE); TEST_ASSERT_STRING("tag-b", fields[3], TRUE); TEST_ASSERT(fields[4] == NULL); TEST_ASSERT(fields[8] == NULL); TEST_ASSERT(!memcmp(&fields[9],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first .tag-a. second.tag-b."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } } uregex_close(re); } }
int32_t __hs_uregex_groupCount(URegularExpression *regexp, UErrorCode *status) { return uregex_groupCount(regexp, status); }
int main(void) { int ret; int32_t l, u; UErrorCode status; UFILE *ustdout, *ustderr; URegularExpression *uregex; UBool case_insensitive = FALSE; UParseError pe = {-1, -1, {0}, {0}}; UChar pattern[] = { 0x0028, // 28, ( 0x005C, // 5C, backslash 0x0070, // 70, p 0x007B, // 7B, { 0x004C, // 4C, L 0x007D, // 7D, } 0x0029, // 29, ) 0x0028, // 28, ( 0x005C, // 5C, backslash 0x0070, // 70, p 0x007B, // 7B, { 0x004E, // 4E, N 0x0064, // 64, d 0x007D, // 7D, } 0x0029, // 29, ) 0 }; UChar string[] = { 0xD835, 0xDE3C, // A 0xD835, 0xDE3C, // A 0xD835, 0xDFE2, // 0 0xD835, 0xDE3D, // B 0xD835, 0xDE3D, // B 0xD835, 0xDFE3, // 1 0xD835, 0xDE3E, // C 0xD835, 0xDE3E, // C 0xD835, 0xDFE4, // 2 0 }; uregex = NULL; ret = EXIT_SUCCESS; status = U_ZERO_ERROR; ustdout = u_finit(stdout, NULL, NULL); ustderr = u_finit(stderr, NULL, NULL); uregex = uregex_open(pattern, -1, case_insensitive ? UREGEX_CASE_INSENSITIVE : 0, &pe, &status); if (U_FAILURE(status)) { if (U_REGEX_RULE_SYNTAX == status) { u_fprintf(ustderr, "Invalid pattern: error at offset %d\n\t%S\n\t%*c\n", pe.offset, pattern, pe.offset, '^'); } goto end; } uregex_setText(uregex, string, -1, &status); if (U_FAILURE(status)) { goto end; } while (uregex_findNext(uregex, &status)) { l = uregex_start(uregex, 0, &status); if (U_FAILURE(status)) { goto end; } u = uregex_end(uregex, 0, &status); if (U_FAILURE(status)) { goto end; } // $0 u_fprintf(ustdout, "Match found at %d position (to %d): %.*S\n", l, u, u - l, string + l); // $1 à $2 (on pourrait aussi utiliser uregex_group avec un groupNum à valeur 0 pour récupérer $0) { UChar buffer[1024]; int32_t i, l, g; l = uregex_groupCount(uregex, &status); if (U_FAILURE(status)) { icu_error(status, "uregex_groupCount"); } for (i = 1; i <= l; i++) { g = uregex_group(uregex, i, buffer, USTRING_SIZE(buffer), &status); if (U_FAILURE(status)) { icu_error(status, "uregex_group"); } u_fprintf(ustdout, "$%d : %S\n", i, buffer); } } } if (U_FAILURE(status)) { goto end; } if (FALSE) { end: ret = EXIT_FAILURE; } if (NULL != ustderr) { u_fclose(ustderr); } if (NULL != ustdout) { u_fclose(ustdout); } if (NULL != uregex) { uregex_close(uregex); } u_cleanup(); return ret; }