/*
 * Compiles the Java string `pattern` into an ICU regular expression and
 * wraps it in a freshly allocated RegExData.
 *
 * UREGEX_ERROR_ON_UNKNOWN_ESCAPES is always OR-ed into `flags`.
 *
 * Returns the new RegExData on success. Returns NULL with a Java exception
 * pending when:
 *   - the RegExData cannot be allocated (OutOfMemoryError is thrown here),
 *   - the string characters cannot be obtained from the VM, or
 *   - ICU rejects the pattern (reported via throwPatternSyntaxException).
 */
static RegExData* open(JNIEnv* env, jclass clazz, jstring pattern, jint flags) {
    flags = flags | UREGEX_ERROR_ON_UNKNOWN_ESCAPES;

    RegExData* data = (RegExData*)calloc(1, sizeof(RegExData));
    if (data == NULL) {
        // Do not dereference a failed allocation; surface it to Java instead.
        jclass oomClass = env->FindClass("java/lang/OutOfMemoryError");
        if (oomClass != NULL) {
            env->ThrowNew(oomClass, "unable to allocate RegExData");
        }
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    UParseError error;
    error.offset = -1;

    int patternLen = env->GetStringLength(pattern);
    if (patternLen == 0) {
        // An empty Java string is compiled from the shared empty constant
        // (length -1: NUL-terminated) instead of pinning the string chars.
        data->regex = uregex_open(&EMPTY_STRING, -1, flags, &error, &status);
    } else {
        jchar const * patternRaw = env->GetStringChars(pattern, NULL);
        if (patternRaw == NULL) {
            // NOTE(review): the VM is expected to have raised an exception
            // already when GetStringChars fails - confirm for the target VM.
            // _close() is given the same state as on the syntax-error path
            // below (regex still NULL).
            _close(env, clazz, data);
            return NULL;
        }
        data->regex = uregex_open(patternRaw, patternLen, flags, &error, &status);
        env->ReleaseStringChars(pattern, patternRaw);
    }

    if (!U_SUCCESS(status)) {
        _close(env, clazz, data);
        throwPatternSyntaxException(env, status, pattern, error);
        data = NULL;
    }

    return data;
}
/*
** Implementation of SQLite REGEXP operator. This scalar function takes
** two arguments. The first is a regular expression pattern to compile
** the second is a string to match against that pattern. If either
** argument is an SQL NULL, then NULL is returned. Otherwise, the result
** is 1 if the string matches the pattern, or 0 otherwise.
**
** SQLite maps the regexp() function to the regexp() operator such
** that the following two are equivalent:
**
**     zString REGEXP zPattern
**     regexp(zPattern, zString)
**
** The compiled pattern is cached as auxiliary data on argument 0 so that
** it is only compiled once while the pattern argument stays the same.
**
** Uses the following ICU regexp APIs:
**
**     uregex_open()
**     uregex_setText()
**     uregex_matches()
**     uregex_close()
*/
static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
  UErrorCode status = U_ZERO_ERROR;
  URegularExpression *pExpr;
  UBool res;
  const UChar *zString = sqlite3_value_text16(apArg[1]);

  (void)nArg;  /* Unused parameter */

  /* If the left hand side of the regexp operator is NULL,
  ** then the result is also NULL.
  */
  if( !zString ){
    return;
  }

  pExpr = sqlite3_get_auxdata(p, 0);
  if( !pExpr ){
    const UChar *zPattern = sqlite3_value_text16(apArg[0]);
    if( !zPattern ){
      return;
    }
    pExpr = uregex_open(zPattern, -1, 0, 0, &status);

    if( U_SUCCESS(status) ){
      sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
      /* sqlite3_set_auxdata() is allowed to discard the auxiliary data
      ** (invoking the destructor) before it even returns, for example on
      ** an allocation failure. Re-read the pointer so that a freed
      ** regular expression is never used below.
      */
      pExpr = sqlite3_get_auxdata(p, 0);
    }
    if( !pExpr ){
      icuFunctionError(p, "uregex_open", status);
      return;
    }
  }

  /* Configure the text that the regular expression operates on. */
  uregex_setText(pExpr, zString, -1, &status);
  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "uregex_setText", status);
    return;
  }

  /* Attempt the match */
  res = uregex_matches(pExpr, 0, &status);
  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "uregex_matches", status);
    return;
  }

  /* Set the text that the regular expression operates on to a NULL
  ** pointer. This is not really necessary, but it is tidier than
  ** leaving the regular expression object configured with an invalid
  ** pointer after this function returns.
  */
  uregex_setText(pExpr, 0, 0, &status);

  /* Return 1 or 0. */
  sqlite3_result_int(p, res ? 1 : 0);
}
/*
 * uregex_openC: compile a regular expression given as a NUL-terminated
 * `const char *` pattern.
 *
 * Nothing is done (NULL is returned) when *status already holds a failure.
 * A NULL pattern sets *status to U_ILLEGAL_ARGUMENT_ERROR and returns NULL.
 * Otherwise the pattern is converted to a UnicodeString and handed on to
 * uregex_open() together with flags, pe and status.
 */
U_CAPI URegularExpression * U_EXPORT2
uregex_openC(const char   *pattern,
             uint32_t      flags,
             UParseError  *pe,
             UErrorCode   *status) {
    URegularExpression *compiled = NULL;

    if (!U_FAILURE(*status)) {
        if (pattern != NULL) {
            UnicodeString patString(pattern);
            compiled = uregex_open(patString.getBuffer(), patString.length(),
                                   flags, pe, status);
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
    }

    return compiled;
}
static void regexp_ctor(INTERNAL_FUNCTION_PARAMETERS) { zval *object; Regexp_object *ro; char *pattern; int32_t pattern_len; UChar *upattern = NULL; int32_t upattern_len = 0; zval *zflags = NULL; uint32_t flags = 0; UParseError pe = { -1, -1, {0}, {0} }; intl_error_reset(NULL TSRMLS_CC); object = return_value; if (FAILURE == zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|z", &pattern, &pattern_len, &zflags)) { intl_error_set(NULL, U_ILLEGAL_ARGUMENT_ERROR, "bad arguments", 0 TSRMLS_CC); zval_dtor(object); RETURN_NULL(); } if (NULL != zflags) { switch (Z_TYPE_P(zflags)) { case IS_LONG: flags = (uint32_t) Z_LVAL_P(zflags); break; case IS_STRING: { const char *p; for (p = Z_STRVAL_P(zflags); '\0' != *p; p++) { switch (*p) { case 'i': flags |= UREGEX_CASE_INSENSITIVE; break; case 'm': flags |= UREGEX_MULTILINE; break; case 's': flags |= UREGEX_DOTALL; break; case 'x': flags |= UREGEX_COMMENTS; break; case 'w': flags |= UREGEX_UWORD; break; default: intl_error_set(NULL, U_ILLEGAL_ARGUMENT_ERROR, "invalid modifier", 0 TSRMLS_CC); zval_dtor(object); RETURN_NULL(); } } break; } default: intl_error_set(NULL, U_ILLEGAL_ARGUMENT_ERROR, "bad arguments", 0 TSRMLS_CC); zval_dtor(object); RETURN_NULL(); } } ro = (Regexp_object *) zend_object_store_get_object(object TSRMLS_CC); intl_convert_utf8_to_utf16(&upattern, &upattern_len, pattern, pattern_len, REGEXP_ERROR_CODE_P(ro)); INTL_CTOR_CHECK_STATUS(ro, "string conversion of pattern to UTF-16 failed"); ro->uregex = uregex_open(upattern, upattern_len, flags, &pe, REGEXP_ERROR_CODE_P(ro)); efree(upattern); if (U_FAILURE(REGEXP_ERROR_CODE(ro))) { intl_error_set_code(NULL, REGEXP_ERROR_CODE(ro) TSRMLS_CC); if (-1 != pe.line) { regexp_parse_error_to_string(pe, pattern, pattern_len); } else { intl_error_set_custom_msg(NULL, "unable to compile ICU regular expression", 0 TSRMLS_CC); } zval_dtor(object); RETURN_NULL(); } }
// Try LocalXyzPointer types with NULL pointers. void LocalPointerTest::TestLocalXyzPointerNull() { { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointerNull/LocalUConverterSelectorPointer"); static const char *const encoding="ISO-8859-1"; LocalUConverterSelectorPointer null; LocalUConverterSelectorPointer sel( ucnvsel_open(&encoding, 1, NULL, UCNV_ROUNDTRIP_SET, errorCode)); sel.adoptInstead(NULL); } #if !UCONFIG_NO_FORMATTING { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointerNull/LocalUCalendarPointer"); LocalUCalendarPointer null; LocalUCalendarPointer cal(ucal_open(NULL, 0, "root", UCAL_GREGORIAN, errorCode)); if(!errorCode.logDataIfFailureAndReset("ucal_open()")) { cal.adoptInstead(NULL); } } { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointerNull/LocalUDateTimePatternGeneratorPointer"); LocalUDateTimePatternGeneratorPointer null; LocalUDateTimePatternGeneratorPointer patgen(udatpg_open("root", errorCode)); patgen.adoptInstead(NULL); } { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointerNull/LocalUMessageFormatPointer"); UnicodeString hello=UNICODE_STRING_SIMPLE("Hello {0}!"); LocalUMessageFormatPointer null; LocalUMessageFormatPointer msg( umsg_open(hello.getBuffer(), hello.length(), "root", NULL, errorCode)); msg.adoptInstead(NULL); } #endif /* !UCONFIG_NO_FORMATTING */ #if !UCONFIG_NO_REGULAR_EXPRESSIONS { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointerNull/LocalURegularExpressionPointer"); UnicodeString pattern=UNICODE_STRING_SIMPLE("abc|xy+z"); LocalURegularExpressionPointer null; LocalURegularExpressionPointer regex( uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, errorCode)); if(!errorCode.logDataIfFailureAndReset("urege_open()")) { regex.adoptInstead(NULL); } } #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ #if !UCONFIG_NO_TRANSLITERATION { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointerNull/LocalUTransliteratorPointer"); UnicodeString id=UNICODE_STRING_SIMPLE("Grek-Latn"); LocalUTransliteratorPointer null; 
LocalUTransliteratorPointer trans( utrans_openU(id.getBuffer(), id.length(), UTRANS_FORWARD, NULL, 0, NULL, errorCode)); if(!errorCode.logDataIfFailureAndReset("utrans_openU()")) { trans.adoptInstead(NULL); } } #endif /* !UCONFIG_NO_TRANSLITERATION */ }
// Use LocalXyzPointer types that are not covered elsewhere in the intltest suite. void LocalPointerTest::TestLocalXyzPointer() { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointer"); static const char *const encoding="ISO-8859-1"; LocalUConverterSelectorPointer sel( ucnvsel_open(&encoding, 1, NULL, UCNV_ROUNDTRIP_SET, errorCode)); if(errorCode.logIfFailureAndReset("ucnvsel_open()")) { return; } if(sel.isNull()) { errln("LocalUConverterSelectorPointer failure"); return; } #if !UCONFIG_NO_FORMATTING LocalUCalendarPointer cal(ucal_open(NULL, 0, "root", UCAL_GREGORIAN, errorCode)); if(errorCode.logDataIfFailureAndReset("ucal_open()")) { return; } if(cal.isNull()) { errln("LocalUCalendarPointer failure"); return; } LocalUDateTimePatternGeneratorPointer patgen(udatpg_open("root", errorCode)); if(errorCode.logDataIfFailureAndReset("udatpg_open()")) { return; } if(patgen.isNull()) { errln("LocalUDateTimePatternGeneratorPointer failure"); return; } LocalULocaleDisplayNamesPointer ldn(uldn_open("de-CH", ULDN_STANDARD_NAMES, errorCode)); if(errorCode.logIfFailureAndReset("uldn_open()")) { return; } if(ldn.isNull()) { errln("LocalULocaleDisplayNamesPointer failure"); return; } UnicodeString hello=UNICODE_STRING_SIMPLE("Hello {0}!"); LocalUMessageFormatPointer msg( umsg_open(hello.getBuffer(), hello.length(), "root", NULL, errorCode)); if(errorCode.logIfFailureAndReset("umsg_open()")) { return; } if(msg.isNull()) { errln("LocalUMessageFormatPointer failure"); return; } #endif /* UCONFIG_NO_FORMATTING */ #if !UCONFIG_NO_NORMALIZATION const UNormalizer2 *nfc=unorm2_getNFCInstance(errorCode); UnicodeSet emptySet; LocalUNormalizer2Pointer fn2(unorm2_openFiltered(nfc, emptySet.toUSet(), errorCode)); if(errorCode.logIfFailureAndReset("unorm2_openFiltered()")) { return; } if(fn2.isNull()) { errln("LocalUNormalizer2Pointer failure"); return; } #endif /* !UCONFIG_NO_NORMALIZATION */ #if !UCONFIG_NO_IDNA LocalUIDNAPointer idna(uidna_openUTS46(0, errorCode)); 
if(errorCode.logIfFailureAndReset("uidna_openUTS46()")) { return; } if(idna.isNull()) { errln("LocalUIDNAPointer failure"); return; } #endif /* !UCONFIG_NO_IDNA */ #if !UCONFIG_NO_REGULAR_EXPRESSIONS UnicodeString pattern=UNICODE_STRING_SIMPLE("abc|xy+z"); LocalURegularExpressionPointer regex( uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, errorCode)); if(errorCode.logIfFailureAndReset("uregex_open()")) { return; } if(regex.isNull()) { errln("LocalURegularExpressionPointer failure"); return; } #endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ #if !UCONFIG_NO_TRANSLITERATION UnicodeString id=UNICODE_STRING_SIMPLE("Grek-Latn"); LocalUTransliteratorPointer trans( utrans_openU(id.getBuffer(), id.length(), UTRANS_FORWARD, NULL, 0, NULL, errorCode)); if(errorCode.logIfFailureAndReset("utrans_open()")) { return; } if(trans.isNull()) { errln("LocalUTransliteratorPointer failure"); return; } #endif /* !UCONFIG_NO_TRANSLITERATION */ // destructors }
void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, UErrorCode &status) { // Convert the user input data from UTF-8 to UChar (UTF-16) int32_t inputLen = 0; if (U_FAILURE(status)) { return; } u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { return; } status = U_ZERO_ERROR; fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); if (fInput == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status); // Regular Expression to parse a line from Confusables.txt. The expression will match // any line. What was matched is determined by examining which capture groups have a match. // Capture Group 1: the source char // Capture Group 2: the replacement chars // Capture Group 3-6 the table type, SL, SA, ML, or MA // Capture Group 7: A blank or comment only line. // Capture Group 8: A syntactically invalid line. Anything that didn't match before. // Example Line from the confusables.txt source file: // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " UnicodeString pattern( "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type "[ \\t]*(?:#.*?)?$" // Match any trailing #comment "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines. // TODO: Why are we using the regex C API here? C++ would just take UnicodeString... fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); // Regular expression for parsing a hex number out of a space-separated list of them. // Capture group 1 gets the number, with spaces removed. 
pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)"); fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); // Zap any Byte Order Mark at the start of input. Changing it to a space is benign // given the syntax of the input. if (*fInput == 0xfeff) { *fInput = 0x20; } // Parse the input, one line per iteration of this loop. uregex_setText(fParseLine, fInput, inputLen, &status); while (uregex_findNext(fParseLine, &status)) { fLineNum++; if (uregex_start(fParseLine, 7, &status) >= 0) { // this was a blank or comment line. continue; } if (uregex_start(fParseLine, 8, &status) >= 0) { // input file syntax error. status = U_PARSE_ERROR; return; } // We have a good input line. Extract the key character and mapping string, and // put them into the appropriate mapping table. UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), uregex_end(fParseLine, 1, &status), status); int32_t mapStringStart = uregex_start(fParseLine, 2, &status); int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); UnicodeString *mapString = new UnicodeString(); if (mapString == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } while (uregex_findNext(fParseHexNum, &status)) { UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), uregex_end(fParseHexNum, 1, &status), status); mapString->append(c); } U_ASSERT(mapString->length() >= 1); // Put the map (value) string into the string pool // This a little like a Java intern() - any duplicates will be eliminated. SPUString *smapString = stringPool->addString(mapString, status); // Add the UChar32 -> string mapping to the appropriate table. UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable : uregex_start(fParseLine, 4, &status) >= 0 ? fSATable : uregex_start(fParseLine, 5, &status) >= 0 ? 
fMLTable : uregex_start(fParseLine, 6, &status) >= 0 ? fMATable : NULL; if (U_SUCCESS(status) && table == NULL) { status = U_PARSE_ERROR; } if (U_FAILURE(status)) { return; } // For Unicode 8, the SL, SA and ML tables have been discontinued. // All input data from confusables.txt is tagged MA. // ICU spoof check functions should ignore the specified table and always // use this MA Data. // For now, implement by populating the MA data into all four tables, and // keep the multiple table implementation in place, in case it comes back // at some time in the future. // There is no run time size penalty to keeping the four table implementation - // the data is shared when it's the same betweeen tables. if (table != fMATable) { status = U_PARSE_ERROR; return; }; // uhash_iput(table, keyChar, smapString, &status); uhash_iput(fSLTable, keyChar, smapString, &status); uhash_iput(fSATable, keyChar, smapString, &status); uhash_iput(fMLTable, keyChar, smapString, &status); uhash_iput(fMATable, keyChar, smapString, &status); fKeySet->add(keyChar); if (U_FAILURE(status)) { return; } } // Input data is now all parsed and collected. // Now create the run-time binary form of the data. // // This is done in two steps. First the data is assembled into vectors and strings, // for ease of construction, then the contents of these collections are dumped // into the actual raw-bytes data storage. // Build up the string array, and record the index of each string therein // in the (build time only) string pool. // Strings of length one are not entered into the strings array. // At the same time, build up the string lengths table, which records the // position in the string table of the first string of each length >= 4. 
// (Strings in the table are sorted by length) stringPool->sort(status); fStringTable = new UnicodeString(); fStringLengthsTable = new UVector(status); int32_t previousStringLength = 0; int32_t previousStringIndex = 0; int32_t poolSize = stringPool->size(); int32_t i; for (i=0; i<poolSize; i++) { SPUString *s = stringPool->getByIndex(i); int32_t strLen = s->fStr->length(); int32_t strIndex = fStringTable->length(); U_ASSERT(strLen >= previousStringLength); if (strLen == 1) { // strings of length one do not get an entry in the string table. // Keep the single string character itself here, which is the same // convention that is used in the final run-time string table index. s->fStrTableIndex = s->fStr->charAt(0); } else { if ((strLen > previousStringLength) && (previousStringLength >= 4)) { fStringLengthsTable->addElement(previousStringIndex, status); fStringLengthsTable->addElement(previousStringLength, status); } s->fStrTableIndex = strIndex; fStringTable->append(*(s->fStr)); } previousStringLength = strLen; previousStringIndex = strIndex; } // Make the final entry to the string lengths table. // (it holds an entry for the _last_ string of each length, so adding the // final one doesn't happen in the main loop because no longer string was encountered.) if (previousStringLength >= 4) { fStringLengthsTable->addElement(previousStringIndex, status); fStringLengthsTable->addElement(previousStringLength, status); } // Construct the compile-time Key and Value tables // // For each key code point, check which mapping tables it applies to, // and create the final data for the key & value structures. // // The four logical mapping tables are conflated into one combined table. // If multiple logical tables have the same mapping for some key, they // share a single entry in the combined table. 
// If more than one mapping exists for the same key code point, multiple // entries will be created in the table for (int32_t range=0; range<fKeySet->getRangeCount(); range++) { // It is an oddity of the UnicodeSet API that simply enumerating the contained // code points requires a nested loop. for (UChar32 keyChar=fKeySet->getRangeStart(range); keyChar <= fKeySet->getRangeEnd(range); keyChar++) { addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status); addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status); addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status); addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status); } } // Put the assembled data into the flat runtime array outputData(status); // All of the intermediate allocated data belongs to the ConfusabledataBuilder // object (this), and is deleted in the destructor. return; }
int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { if (!input) { u_fprintf(ux_stderr, "Error: Input is null - cannot read from nothing!\n"); CG3Quit(1); } if (!grammar) { u_fprintf(ux_stderr, "Error: No grammar provided - cannot continue!\n"); CG3Quit(1); } uint32_t fields = 0; uint32_t u32tmp = 0; int32_t i32tmp = 0; uint8_t u8tmp = 0; UErrorCode err = U_ZERO_ERROR; UConverter* conv = ucnv_open("UTF-8", &err); if (fread_throw(&cbuffers[0][0], 1, 4, input) != 4) { std::cerr << "Error: Error reading first 4 bytes from grammar!" << std::endl; CG3Quit(1); } if (cbuffers[0][0] != 'C' || cbuffers[0][1] != 'G' || cbuffers[0][2] != '3' || cbuffers[0][3] != 'B') { u_fprintf(ux_stderr, "Error: Grammar does not begin with magic bytes - cannot load as binary!\n"); CG3Quit(1); } fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp < 10043) { u_fprintf(ux_stderr, "Error: Grammar revision is %u, but this loader requires %u or later!\n", u32tmp, 10043); CG3Quit(1); } grammar->is_binary = true; fread_throw(&u32tmp, sizeof(uint32_t), 1, input); fields = (uint32_t)ntohl(u32tmp); grammar->has_dep = (fields & (1 << 0)) != 0; grammar->sub_readings_ltr = (fields & (1 << 2)) != 0; grammar->has_relations = (fields & (1 << 13)) != 0; if (fields & (1 << 1)) { ucnv_reset(conv); fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); fread_throw(&cbuffers[0][0], 1, u32tmp, input); i32tmp = ucnv_toUChars(conv, &grammar->mapping_prefix, 1, &cbuffers[0][0], u32tmp, &err); } // Keep track of which sets that the varstring tags used; we can't just assign them as sets are not loaded yet typedef std::map<uint32_t, uint32Vector> tag_varsets_t; tag_varsets_t tag_varsets; u32tmp = 0; if (fields & (1 << 3)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); } uint32_t num_single_tags = u32tmp; grammar->single_tags_list.resize(num_single_tags); for (uint32_t i = 0; i < num_single_tags; 
i++) { Tag* t = grammar->allocateTag(); t->type |= T_GRAMMAR; uint32_t fields = 0; fread_throw(&u32tmp, sizeof(uint32_t), 1, input); fields = (uint32_t)ntohl(u32tmp); if (fields & (1 << 0)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); t->number = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 1)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); t->hash = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 2)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); t->plain_hash = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 3)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); t->seed = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 4)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); t->type = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 5)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); t->comparison_hash = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 6)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); t->comparison_op = (C_OPS)ntohl(u32tmp); } if (fields & (1 << 7)) { fread_throw(&i32tmp, sizeof(int32_t), 1, input); t->comparison_val = (int32_t)ntohl(i32tmp); if (t->comparison_val <= std::numeric_limits<int32_t>::min()) { t->comparison_val = NUMERIC_MIN; } if (t->comparison_val >= std::numeric_limits<int32_t>::max()) { t->comparison_val = NUMERIC_MAX; } } if (fields & (1 << 8)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { ucnv_reset(conv); fread_throw(&cbuffers[0][0], 1, u32tmp, input); i32tmp = ucnv_toUChars(conv, &gbuffers[0][0], CG3_BUFFER_SIZE - 1, &cbuffers[0][0], u32tmp, &err); t->tag = &gbuffers[0][0]; } } if (fields & (1 << 9)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { ucnv_reset(conv); fread_throw(&cbuffers[0][0], 1, u32tmp, input); i32tmp = ucnv_toUChars(conv, &gbuffers[0][0], CG3_BUFFER_SIZE - 1, &cbuffers[0][0], u32tmp, &err); UParseError pe; UErrorCode status = U_ZERO_ERROR; if (t->type & T_CASE_INSENSITIVE) { t->regexp = 
uregex_open(&gbuffers[0][0], i32tmp, UREGEX_CASE_INSENSITIVE, &pe, &status); } else { t->regexp = uregex_open(&gbuffers[0][0], i32tmp, 0, &pe, &status); } if (status != U_ZERO_ERROR) { u_fprintf(ux_stderr, "Error: uregex_open returned %s trying to parse tag %S - cannot continue!\n", u_errorName(status), t->tag.c_str()); CG3Quit(1); } } } if (fields & (1 << 10)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); uint32_t num = (uint32_t)ntohl(u32tmp); t->allocateVsSets(); t->vs_sets->reserve(num); tag_varsets[t->number].reserve(num); for (size_t i = 0; i < num; ++i) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); tag_varsets[t->number].push_back(u32tmp); } } if (fields & (1 << 11)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); uint32_t num = (uint32_t)ntohl(u32tmp); t->allocateVsNames(); t->vs_names->reserve(num); for (size_t i = 0; i < num; ++i) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { ucnv_reset(conv); fread_throw(&cbuffers[0][0], 1, u32tmp, input); i32tmp = ucnv_toUChars(conv, &gbuffers[0][0], CG3_BUFFER_SIZE - 1, &cbuffers[0][0], u32tmp, &err); t->vs_names->push_back(&gbuffers[0][0]); } } } grammar->single_tags[t->hash] = t; grammar->single_tags_list[t->number] = t; if (t->tag.size() == 1 && t->tag[0] == '*') { grammar->tag_any = t->hash; } } u32tmp = 0; if (fields & (1 << 5)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); } uint32_t num_pref_targets = u32tmp; for (uint32_t i = 0; i < num_pref_targets; i++) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); grammar->preferred_targets.push_back(u32tmp); } u32tmp = 0; if (fields & (1 << 6)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); } uint32_t num_par_pairs = u32tmp; for (uint32_t i = 0; i < num_par_pairs; i++) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); uint32_t left = 
(uint32_t)ntohl(u32tmp); fread_throw(&u32tmp, sizeof(uint32_t), 1, input); uint32_t right = (uint32_t)ntohl(u32tmp); grammar->parentheses[left] = right; grammar->parentheses_reverse[right] = left; } u32tmp = 0; if (fields & (1 << 7)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); } uint32_t num_par_anchors = u32tmp; for (uint32_t i = 0; i < num_par_anchors; i++) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); uint32_t left = (uint32_t)ntohl(u32tmp); fread_throw(&u32tmp, sizeof(uint32_t), 1, input); uint32_t right = (uint32_t)ntohl(u32tmp); grammar->anchors[left] = right; } u32tmp = 0; if (fields & (1 << 8)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); } uint32_t num_sets = u32tmp; grammar->sets_list.resize(num_sets); for (uint32_t i = 0; i < num_sets; i++) { Set* s = grammar->allocateSet(); uint32_t fields = 0; fread_throw(&u32tmp, sizeof(uint32_t), 1, input); fields = (uint32_t)ntohl(u32tmp); if (fields & (1 << 0)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); s->number = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 1)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); s->hash = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 2)) { fread_throw(&u8tmp, sizeof(uint8_t), 1, input); s->type = u8tmp; } if (fields & (1 << 3)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { trie_unserialize(s->trie, input, *grammar, u32tmp); } fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { trie_unserialize(s->trie_special, input, *grammar, u32tmp); } } if (fields & (1 << 4)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); uint32_t num_set_ops = u32tmp; for (uint32_t j = 0; j < num_set_ops; j++) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); s->set_ops.push_back(u32tmp); } } if (fields & (1 << 5)) { fread_throw(&u32tmp, 
sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); uint32_t num_sets = u32tmp; for (uint32_t j = 0; j < num_sets; j++) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); s->sets.push_back(u32tmp); } } if (fields & (1 << 6)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { ucnv_reset(conv); fread_throw(&cbuffers[0][0], 1, u32tmp, input); i32tmp = ucnv_toUChars(conv, &gbuffers[0][0], CG3_BUFFER_SIZE - 1, &cbuffers[0][0], u32tmp, &err); s->setName(&gbuffers[0][0]); } } grammar->sets_by_contents[s->hash] = s; grammar->sets_list[s->number] = s; } // Actually assign sets to the varstring tags now that sets are loaded for (auto iter : tag_varsets) { Tag* t = grammar->single_tags_list[iter.first]; for (auto uit : iter.second) { Set* s = grammar->sets_list[uit]; t->vs_sets->push_back(s); } } if (fields & (1 << 9)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); grammar->delimiters = grammar->sets_by_contents.find(u32tmp)->second; } if (fields & (1 << 10)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); grammar->soft_delimiters = grammar->sets_by_contents.find(u32tmp)->second; } u32tmp = 0; if (fields & (1 << 11)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); } uint32_t num_contexts = u32tmp; contexts_list.resize(num_contexts); for (uint32_t i = 0; i < num_contexts; i++) { ContextualTest* t = readContextualTest_10043(input); grammar->contexts[t->hash] = t; contexts_list[i] = t; } u32tmp = 0; if (fields & (1 << 12)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); } uint32_t num_rules = u32tmp; grammar->rule_by_number.resize(num_rules); for (uint32_t i = 0; i < num_rules; i++) { Rule* r = grammar->allocateRule(); uint32_t fields = 0; fread_throw(&u32tmp, sizeof(uint32_t), 1, input); fields = (uint32_t)ntohl(u32tmp); if (fields & 
(1 << 0)) { fread_throw(&i32tmp, sizeof(int32_t), 1, input); r->section = (int32_t)ntohl(i32tmp); } if (fields & (1 << 1)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->type = (KEYWORDS)ntohl(u32tmp); } if (fields & (1 << 2)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->line = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 3)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->flags = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 4)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { ucnv_reset(conv); fread_throw(&cbuffers[0][0], 1, u32tmp, input); i32tmp = ucnv_toUChars(conv, &gbuffers[0][0], CG3_BUFFER_SIZE - 1, &cbuffers[0][0], u32tmp, &err); r->setName(&gbuffers[0][0]); } } if (fields & (1 << 5)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->target = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 6)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->wordform = grammar->single_tags_list[(uint32_t)ntohl(u32tmp)]; } if (fields & (1 << 7)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->varname = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 8)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->varvalue = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 9)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); int32_t v = u32tmp; if (u32tmp & (1 << 31)) { u32tmp &= ~(1 << 31); v = u32tmp; v = -v; } r->sub_reading = v; } if (fields & (1 << 10)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->childset1 = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 11)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->childset2 = (uint32_t)ntohl(u32tmp); } if (fields & (1 << 12)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->maplist = grammar->sets_list[(uint32_t)ntohl(u32tmp)]; } if (fields & (1 << 13)) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->sublist = grammar->sets_list[(uint32_t)ntohl(u32tmp)]; } if (fields & (1 << 14)) 
{ fread_throw(&u32tmp, sizeof(uint32_t), 1, input); r->number = (uint32_t)ntohl(u32tmp); } fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); if (u32tmp) { r->dep_target = contexts_list[u32tmp - 1]; } fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); uint32_t num_dep_tests = u32tmp; for (uint32_t j = 0; j < num_dep_tests; j++) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); ContextualTest* t = contexts_list[u32tmp - 1]; r->addContextualTest(t, r->dep_tests); } fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); uint32_t num_tests = u32tmp; for (uint32_t j = 0; j < num_tests; j++) { fread_throw(&u32tmp, sizeof(uint32_t), 1, input); u32tmp = (uint32_t)ntohl(u32tmp); ContextualTest* t = contexts_list[u32tmp - 1]; r->addContextualTest(t, r->tests); } grammar->rule_by_number[r->number] = r; } // Bind the named templates to where they are used for (auto it : deferred_tmpls) { auto tmt = templates.find(it.second); it.first->tmpl = tmt->second; } ucnv_close(conv); // Create the dummy set grammar->allocateDummySet(); grammar->is_binary = false; return 0; }
int prString_FindRegexp(struct VMGlobals *g, int numArgsPushed) { int err; PyrSlot *a = g->sp - 2; // source string PyrSlot *b = g->sp - 1; // pattern PyrSlot *c = g->sp; // offset if (!isKindOfSlot(b, class_string) || (NotInt(c))) return errWrongType; // post("prString_FindRegexp\n"); int maxfind = MAXREGEXFIND; int offset = slotRawInt(c); int stringsize = slotRawObject(a)->size + 1; int patternsize = slotRawObject(b)->size + 1; char *string = (char*)malloc(slotRawObject(a)->size + 1); err = slotStrVal(a, string, slotRawObject(a)->size + 1); if (err){ free(string); return err; } char *pattern = (char*)malloc(slotRawObject(b)->size + 1); err = slotStrVal(b, pattern, slotRawObject(b)->size + 1); if (err) return err; UParseError uerr; UErrorCode status = (UErrorCode)0; UChar *regexStr; UChar *ustring; regexStr = (UChar*)malloc((patternsize)*sizeof(UChar)); u_charsToUChars (pattern, regexStr, patternsize); ustring = (UChar*)malloc((stringsize)*sizeof(UChar)); u_charsToUChars (string+offset, ustring, stringsize-offset); unsigned flags = UREGEX_MULTILINE; int groupNumber = 0; SCRegExRegion * what; int indx = 0; int size = 0; URegularExpression *expression = uregex_open(regexStr, -1, flags, &uerr, &status); if(U_FAILURE(status)) goto nilout; if(!U_FAILURE(status)) { uregex_setText(expression, ustring, -1, &status); what = (SCRegExRegion*)malloc((maxfind)*sizeof(SCRegExRegion)); for(int i=0; i< maxfind; i++) { SCRegExRegion range; range.matched = false; what[i] = range; } int32_t groups = uregex_groupCount(expression, &status) + 1; if(U_FAILURE(status)) goto nilout; // post("groups: %i\n", groups); while (uregex_findNext(expression, &status) && size<maxfind) { if(U_FAILURE(status)) return errNone; for(int i=0; i< groups; ++i){ what[size].group = i; what[size].start = sc_clip(uregex_start(expression, i, &status), 0, stringsize) ; if(U_FAILURE(status)) goto nilout; what[size].end = sc_clip(uregex_end(expression, i, &status), 0, stringsize); what[size].matched = true; // 
post("index:%i, size:%i, start %i, end %i\n", i, size, what[i].start, what[i].end); size = indx++ + 1; if(U_FAILURE(status)) goto nilout; } } PyrObject *result_array = newPyrArray(g->gc, size, 0, true); result_array->size = 0; if (size>0) //(matched) { for (int i = 0; i < size; i++) { if (what[0].matched == false) { result_array->size++; SetNil(result_array->slots+i); } else { result_array->size++; int match_start = what[i].start; int match_length = what[i].end - what[i].start; // post("for i:%i, start %i, end %i\n", i, what[i].start, what[i].end); // char *match = (char*)malloc(match_length); char match[match_length]; strncpy(match, string + offset + match_start, match_length); match[match_length] = 0; PyrObject *array = newPyrArray(g->gc, 2, 0, true); array->size = 2; SetInt(array->slots, match_start + offset); PyrObject *matched_string = (PyrObject*)newPyrString(g->gc, match, 0, true); SetObject(array->slots+1, matched_string); g->gc->GCWrite(matched_string, array->slots + 1); SetObject(result_array->slots + i, array); g->gc->GCWrite(array, result_array->slots + i); } } } else { SetNil(a); } free(what); free(pattern); free(regexStr); free(ustring); free(string); SetObject(a, result_array); g->gc->GCWrite(result_array,a); //uregex_close(expression); return errNone; } nilout: free(string); free(what); free(pattern); free(regexStr); free(ustring); SetNil(a); return errNone; }
static void TestRegexCAPI(void) { UErrorCode status = U_ZERO_ERROR; URegularExpression *re; UChar pat[200]; UChar *minus1; memset(&minus1, -1, sizeof(minus1)); /* Mimimalist open/close */ u_uastrncpy(pat, "abc*", sizeof(pat)/2); re = uregex_open(pat, -1, 0, 0, &status); TEST_ASSERT_SUCCESS(status); uregex_close(re); /* Open with all flag values set */ status = U_ZERO_ERROR; re = uregex_open(pat, -1, UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status); TEST_ASSERT_SUCCESS(status); uregex_close(re); /* Open with an invalid flag */ status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0x40000000, 0, &status); TEST_ASSERT(status == U_REGEX_INVALID_FLAG); uregex_close(re); /* openC with an invalid parameter */ status = U_ZERO_ERROR; re = uregex_openC(NULL, UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status); TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR && re == NULL); /* openC with an invalid parameter */ status = U_USELESS_COLLATOR_ERROR; re = uregex_openC(NULL, UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status); TEST_ASSERT(status == U_USELESS_COLLATOR_ERROR && re == NULL); /* openC open from a C string */ { const UChar *p; int32_t len; status = U_ZERO_ERROR; re = uregex_openC("abc*", 0, 0, &status); TEST_ASSERT_SUCCESS(status); p = uregex_pattern(re, &len, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS above should change too... 
*/ if(U_SUCCESS(status)) { u_uastrncpy(pat, "abc*", sizeof(pat)/2); TEST_ASSERT(u_strcmp(pat, p) == 0); TEST_ASSERT(len==(int32_t)strlen("abc*")); } uregex_close(re); /* TODO: Open with ParseError parameter */ } /* * clone */ { URegularExpression *clone1; URegularExpression *clone2; URegularExpression *clone3; UChar testString1[30]; UChar testString2[30]; UBool result; status = U_ZERO_ERROR; re = uregex_openC("abc*", 0, 0, &status); TEST_ASSERT_SUCCESS(status); clone1 = uregex_clone(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(clone1 != NULL); status = U_ZERO_ERROR; clone2 = uregex_clone(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(clone2 != NULL); uregex_close(re); status = U_ZERO_ERROR; clone3 = uregex_clone(clone2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(clone3 != NULL); u_uastrncpy(testString1, "abcccd", sizeof(pat)/2); u_uastrncpy(testString2, "xxxabcccd", sizeof(pat)/2); status = U_ZERO_ERROR; uregex_setText(clone1, testString1, -1, &status); TEST_ASSERT_SUCCESS(status); result = uregex_lookingAt(clone1, 0, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result==TRUE); status = U_ZERO_ERROR; uregex_setText(clone2, testString2, -1, &status); TEST_ASSERT_SUCCESS(status); result = uregex_lookingAt(clone2, 0, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result==FALSE); result = uregex_find(clone2, 0, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result==TRUE); uregex_close(clone1); uregex_close(clone2); uregex_close(clone3); } /* * pattern() */ { const UChar *resultPat; int32_t resultLen; u_uastrncpy(pat, "hello", sizeof(pat)/2); status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0, NULL, &status); resultPat = uregex_pattern(re, &resultLen, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS above should change too... 
*/ if (U_SUCCESS(status)) { TEST_ASSERT(resultLen == -1); TEST_ASSERT(u_strcmp(resultPat, pat) == 0); } uregex_close(re); status = U_ZERO_ERROR; re = uregex_open(pat, 3, 0, NULL, &status); resultPat = uregex_pattern(re, &resultLen, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS above should change too... */ if (U_SUCCESS(status)) { TEST_ASSERT(resultLen == 3); TEST_ASSERT(u_strncmp(resultPat, pat, 3) == 0); TEST_ASSERT(u_strlen(resultPat) == 3); } uregex_close(re); } /* * flags() */ { int32_t t; status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0, NULL, &status); t = uregex_flags(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t == 0); uregex_close(re); status = U_ZERO_ERROR; re = uregex_open(pat, -1, 0, NULL, &status); t = uregex_flags(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t == 0); uregex_close(re); status = U_ZERO_ERROR; re = uregex_open(pat, -1, UREGEX_CASE_INSENSITIVE | UREGEX_DOTALL, NULL, &status); t = uregex_flags(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t == (UREGEX_CASE_INSENSITIVE | UREGEX_DOTALL)); uregex_close(re); } /* * setText() and lookingAt() */ { UChar text1[50]; UChar text2[50]; UBool result; u_uastrncpy(text1, "abcccd", sizeof(text1)/2); u_uastrncpy(text2, "abcccxd", sizeof(text2)/2); status = U_ZERO_ERROR; u_uastrncpy(pat, "abc*d", sizeof(pat)/2); re = uregex_open(pat, -1, 0, NULL, &status); TEST_ASSERT_SUCCESS(status); /* Operation before doing a setText should fail... 
*/ status = U_ZERO_ERROR; uregex_lookingAt(re, 0, &status); TEST_ASSERT( status== U_REGEX_INVALID_STATE); status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text2, -1, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 5, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 6, &status); result = uregex_lookingAt(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * getText() */ { UChar text1[50]; UChar text2[50]; const UChar *result; int32_t textLength; u_uastrncpy(text1, "abcccd", sizeof(text1)/2); u_uastrncpy(text2, "abcccxd", sizeof(text2)/2); status = U_ZERO_ERROR; u_uastrncpy(pat, "abc*d", sizeof(pat)/2); re = uregex_open(pat, -1, 0, NULL, &status); uregex_setText(re, text1, -1, &status); result = uregex_getText(re, &textLength, &status); TEST_ASSERT(result == text1); TEST_ASSERT(textLength == -1); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text2, 7, &status); result = uregex_getText(re, &textLength, &status); TEST_ASSERT(result == text2); TEST_ASSERT(textLength == 7); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text2, 4, &status); result = uregex_getText(re, &textLength, &status); TEST_ASSERT(result == text2); TEST_ASSERT(textLength == 4); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * matches() */ { UChar text1[50]; UBool result; int len; UChar nullString[] = {0,0,0}; u_uastrncpy(text1, "abcccde", sizeof(text1)/2); status = 
U_ZERO_ERROR; u_uastrncpy(pat, "abc*d", sizeof(pat)/2); re = uregex_open(pat, -1, 0, NULL, &status); uregex_setText(re, text1, -1, &status); result = uregex_matches(re, 0, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 6, &status); result = uregex_matches(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, text1, 6, &status); result = uregex_matches(re, 1, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); uregex_close(re); status = U_ZERO_ERROR; re = uregex_openC(".?", 0, NULL, &status); uregex_setText(re, text1, -1, &status); len = u_strlen(text1); result = uregex_matches(re, len, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_setText(re, nullString, -1, &status); TEST_ASSERT_SUCCESS(status); result = uregex_matches(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * lookingAt() Used in setText test. 
*/ /* * find(), findNext, start, end, reset */ { UChar text1[50]; UBool result; u_uastrncpy(text1, "012rx5rx890rxrx...", sizeof(text1)/2); status = U_ZERO_ERROR; re = uregex_openC("rx", 0, NULL, &status); uregex_setText(re, text1, -1, &status); result = uregex_find(re, 0, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 3); TEST_ASSERT(uregex_end(re, 0, &status) == 5); TEST_ASSERT_SUCCESS(status); result = uregex_find(re, 9, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 11); TEST_ASSERT(uregex_end(re, 0, &status) == 13); TEST_ASSERT_SUCCESS(status); result = uregex_find(re, 14, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_reset(re, 0, &status); result = uregex_findNext(re, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 3); TEST_ASSERT(uregex_end(re, 0, &status) == 5); TEST_ASSERT_SUCCESS(status); result = uregex_findNext(re, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 6); TEST_ASSERT(uregex_end(re, 0, &status) == 8); TEST_ASSERT_SUCCESS(status); status = U_ZERO_ERROR; uregex_reset(re, 12, &status); result = uregex_findNext(re, &status); TEST_ASSERT(result == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 13); TEST_ASSERT(uregex_end(re, 0, &status) == 15); TEST_ASSERT_SUCCESS(status); result = uregex_findNext(re, &status); TEST_ASSERT(result == FALSE); TEST_ASSERT_SUCCESS(status); uregex_close(re); } /* * groupCount */ { int32_t result; status = U_ZERO_ERROR; re = uregex_openC("abc", 0, NULL, &status); result = uregex_groupCount(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result == 0); uregex_close(re); status = U_ZERO_ERROR; re = uregex_openC("abc(def)(ghi(j))", 0, NULL, &status); result = uregex_groupCount(re, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(result == 3); uregex_close(re); } /* * group() */ { UChar text1[80]; UChar buf[80]; UBool 
result; int32_t resultSz; u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); status = U_ZERO_ERROR; re = uregex_openC("abc(.*?)def", 0, NULL, &status); TEST_ASSERT_SUCCESS(status); uregex_setText(re, text1, -1, &status); result = uregex_find(re, 0, &status); TEST_ASSERT(result==TRUE); /* Capture Group 0, the full match. Should succeed. */ status = U_ZERO_ERROR; resultSz = uregex_group(re, 0, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("abc interior def", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); /* Capture group #1. Should succeed. */ status = U_ZERO_ERROR; resultSz = uregex_group(re, 1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING(" interior ", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen(" interior ")); /* Capture group out of range. Error. */ status = U_ZERO_ERROR; uregex_group(re, 2, buf, sizeof(buf)/2, &status); TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); /* NULL buffer, pure pre-flight */ status = U_ZERO_ERROR; resultSz = uregex_group(re, 0, NULL, 0, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); /* Too small buffer, truncated string */ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_group(re, 0, buf, 5, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT_STRING("abc i", buf, FALSE); TEST_ASSERT(buf[5] == (UChar)0xffff); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); /* Output string just fits buffer, no NUL term. 
*/ status = U_ZERO_ERROR; resultSz = uregex_group(re, 0, buf, (int32_t)strlen("abc interior def"), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("abc interior def", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("abc interior def")); TEST_ASSERT(buf[strlen("abc interior def")] == (UChar)0xffff); uregex_close(re); } /* * Regions */ /* SetRegion(), getRegion() do something */ TEST_SETUP(".*", "0123456789ABCDEF", 0) UChar resultString[40]; TEST_ASSERT(uregex_regionStart(re, &status) == 0); TEST_ASSERT(uregex_regionEnd(re, &status) == 16); uregex_setRegion(re, 3, 6, &status); TEST_ASSERT(uregex_regionStart(re, &status) == 3); TEST_ASSERT(uregex_regionEnd(re, &status) == 6); TEST_ASSERT(uregex_findNext(re, &status)); TEST_ASSERT(uregex_group(re, 0, resultString, sizeof(resultString)/2, &status) == 3) TEST_ASSERT_STRING("345", resultString, TRUE); TEST_TEARDOWN; /* find(start=-1) uses regions */ TEST_SETUP(".*", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_end(re, 0, &status) == 6); TEST_TEARDOWN; /* find (start >=0) does not use regions */ TEST_SETUP(".*", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 0); TEST_ASSERT(uregex_end(re, 0, &status) == 16); TEST_TEARDOWN; /* findNext() obeys regions */ TEST_SETUP(".", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_findNext(re,&status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_findNext(re, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 5); TEST_ASSERT(uregex_findNext(re, &status) == FALSE); TEST_TEARDOWN; /* matches(start=-1) uses regions */ /* Also, verify that non-greedy *? succeeds in finding the full match. 
*/ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_matches(re, -1, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_end(re, 0, &status) == 6); TEST_TEARDOWN; /* matches (start >=0) does not use regions */ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_matches(re, 0, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 0); TEST_ASSERT(uregex_end(re, 0, &status) == 16); TEST_TEARDOWN; /* lookingAt(start=-1) uses regions */ /* Also, verify that non-greedy *? finds the first (shortest) match. */ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_lookingAt(re, -1, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 4); TEST_ASSERT(uregex_end(re, 0, &status) == 4); TEST_TEARDOWN; /* lookingAt (start >=0) does not use regions */ TEST_SETUP(".*?", "0123456789ABCDEF", 0); uregex_setRegion(re, 4, 6, &status); TEST_ASSERT(uregex_lookingAt(re, 0, &status) == TRUE); TEST_ASSERT(uregex_start(re, 0, &status) == 0); TEST_ASSERT(uregex_end(re, 0, &status) == 0); TEST_TEARDOWN; /* hitEnd() */ TEST_SETUP("[a-f]*", "abcdefghij", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_hitEnd(re, &status) == FALSE); TEST_TEARDOWN; TEST_SETUP("[a-f]*", "abcdef", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_hitEnd(re, &status) == TRUE); TEST_TEARDOWN; /* requireEnd */ TEST_SETUP("abcd", "abcd", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_requireEnd(re, &status) == FALSE); TEST_TEARDOWN; TEST_SETUP("abcd$", "abcd", 0); TEST_ASSERT(uregex_find(re, 0, &status) == TRUE); TEST_ASSERT(uregex_requireEnd(re, &status) == TRUE); TEST_TEARDOWN; /* anchoringBounds */ TEST_SETUP("abc$", "abcdef", 0); TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == TRUE); uregex_useAnchoringBounds(re, FALSE, &status); 
TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == FALSE); TEST_ASSERT(uregex_find(re, -1, &status) == FALSE); uregex_useAnchoringBounds(re, TRUE, &status); uregex_setRegion(re, 0, 3, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); TEST_ASSERT(uregex_end(re, 0, &status) == 3); TEST_TEARDOWN; /* Transparent Bounds */ TEST_SETUP("abc(?=def)", "abcdef", 0); TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == FALSE); uregex_useTransparentBounds(re, TRUE, &status); TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == TRUE); uregex_useTransparentBounds(re, FALSE, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* No Region */ uregex_setRegion(re, 0, 3, &status); TEST_ASSERT(uregex_find(re, -1, &status) == FALSE); /* with region, opaque bounds */ uregex_useTransparentBounds(re, TRUE, &status); TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* with region, transparent bounds */ TEST_ASSERT(uregex_end(re, 0, &status) == 3); TEST_TEARDOWN; /* * replaceFirst() */ { UChar text1[80]; UChar text2[80]; UChar replText[80]; UChar buf[80]; int32_t resultSz; u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); u_uastrncpy(text2, "No match here.", sizeof(text2)/2); u_uastrncpy(replText, "<$1>", sizeof(replText)/2); status = U_ZERO_ERROR; re = uregex_openC("x(.*?)x", 0, NULL, &status); TEST_ASSERT_SUCCESS(status); /* Normal case, with match */ uregex_setText(re, text1, -1, &status); resultSz = uregex_replaceFirst(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("Replace <aa> x1x x...x.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); /* No match. Text should copy to output with no changes. 
*/ status = U_ZERO_ERROR; uregex_setText(re, text2, -1, &status); resultSz = uregex_replaceFirst(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("No match here.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("No match here.")); /* Match, output just fills buffer, no termination warning. */ status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceFirst(re, replText, -1, buf, strlen("Replace <aa> x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> x1x x...x.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* Do the replaceFirst again, without first resetting anything. * Should give the same results. */ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceFirst(re, replText, -1, buf, strlen("Replace <aa> x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> x1x x...x.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* NULL buffer, zero buffer length */ status = U_ZERO_ERROR; resultSz = uregex_replaceFirst(re, replText, -1, NULL, 0, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); /* Buffer too small by one */ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceFirst(re, replText, -1, buf, strlen("Replace <aa> x1x x...x.")-1, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT_STRING("Replace <aa> x1x x...x", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); uregex_close(re); } /* * replaceAll() */ { UChar text1[80]; UChar text2[80]; UChar replText[80]; UChar 
buf[80]; int32_t resultSz; int32_t expectedResultSize; int32_t i; u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); u_uastrncpy(text2, "No match here.", sizeof(text2)/2); u_uastrncpy(replText, "<$1>", sizeof(replText)/2); expectedResultSize = u_strlen(text1); status = U_ZERO_ERROR; re = uregex_openC("x(.*?)x", 0, NULL, &status); TEST_ASSERT_SUCCESS(status); /* Normal case, with match */ uregex_setText(re, text1, -1, &status); resultSz = uregex_replaceAll(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("Replace <aa> <1> <...>.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace xaax x1x x...x.")); /* No match. Text should copy to output with no changes. */ status = U_ZERO_ERROR; uregex_setText(re, text2, -1, &status); resultSz = uregex_replaceAll(re, replText, -1, buf, sizeof(buf)/2, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("No match here.", buf, TRUE); TEST_ASSERT(resultSz == (int32_t)strlen("No match here.")); /* Match, output just fills buffer, no termination warning. */ status = U_ZERO_ERROR; uregex_setText(re, text1, -1, &status); memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceAll(re, replText, -1, buf, strlen("Replace xaax x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> <1> <...>.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace <aa> <1> <...>.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* Do the replaceFirst again, without first resetting anything. * Should give the same results. 
*/ status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceAll(re, replText, -1, buf, strlen("Replace xaax x1x x...x."), &status); TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); TEST_ASSERT_STRING("Replace <aa> <1> <...>.", buf, FALSE); TEST_ASSERT(resultSz == (int32_t)strlen("Replace <aa> <1> <...>.")); TEST_ASSERT(buf[resultSz] == (UChar)0xffff); /* NULL buffer, zero buffer length */ status = U_ZERO_ERROR; resultSz = uregex_replaceAll(re, replText, -1, NULL, 0, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(resultSz == (int32_t)strlen("Replace <aa> <1> <...>.")); /* Buffer too small. Try every size, which will tickle edge cases * in uregex_appendReplacement (used by replaceAll) */ for (i=0; i<expectedResultSize; i++) { char expected[80]; status = U_ZERO_ERROR; memset(buf, -1, sizeof(buf)); resultSz = uregex_replaceAll(re, replText, -1, buf, i, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); strcpy(expected, "Replace <aa> <1> <...>."); expected[i] = 0; TEST_ASSERT_STRING(expected, buf, FALSE); TEST_ASSERT(resultSz == expectedResultSize); TEST_ASSERT(buf[i] == (UChar)0xffff); } uregex_close(re); } /* * appendReplacement() */ { UChar text[100]; UChar repl[100]; UChar buf[100]; UChar *bufPtr; int32_t bufCap; status = U_ZERO_ERROR; re = uregex_openC(".*", 0, 0, &status); TEST_ASSERT_SUCCESS(status); u_uastrncpy(text, "whatever", sizeof(text)/2); u_uastrncpy(repl, "some other", sizeof(repl)/2); uregex_setText(re, text, -1, &status); /* match covers whole target string */ uregex_find(re, 0, &status); TEST_ASSERT_SUCCESS(status); bufPtr = buf; bufCap = sizeof(buf) / 2; uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("some other", buf, TRUE); /* Match has \u \U escapes */ uregex_find(re, 0, &status); TEST_ASSERT_SUCCESS(status); bufPtr = buf; bufCap = sizeof(buf) / 2; u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2); 
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE); uregex_close(re); } /* * appendTail(). Checked in ReplaceFirst(), replaceAll(). */ /* * split() */ { UChar textToSplit[80]; UChar text2[80]; UChar buf[200]; UChar *fields[10]; int32_t numFields; int32_t requiredCapacity; int32_t spaceNeeded; int32_t sz; u_uastrncpy(textToSplit, "first : second: third", sizeof(textToSplit)/2); u_uastrncpy(text2, "No match here.", sizeof(text2)/2); status = U_ZERO_ERROR; re = uregex_openC(":", 0, NULL, &status); /* Simple split */ uregex_setText(re, textToSplit, -1, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if (U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 3); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second", fields[1], TRUE); TEST_ASSERT_STRING(" third", fields[2], TRUE); TEST_ASSERT(fields[3] == NULL); spaceNeeded = u_strlen(textToSplit) - (numFields - 1) + /* Field delimiters do not appear in output */ numFields; /* Each field gets a NUL terminator */ TEST_ASSERT(spaceNeeded == requiredCapacity); } } uregex_close(re); /* Split with too few output strings available */ status = U_ZERO_ERROR; re = uregex_openC(":", 0, NULL, &status); uregex_setText(re, textToSplit, -1, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... 
*/ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 2); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second: third", fields[1], TRUE); TEST_ASSERT(!memcmp(&fields[2],&minus1,sizeof(UChar*))); spaceNeeded = u_strlen(textToSplit) - (numFields - 1) + /* Field delimiters do not appear in output */ numFields; /* Each field gets a NUL terminator */ TEST_ASSERT(spaceNeeded == requiredCapacity); /* Split with a range of output buffer sizes. */ spaceNeeded = u_strlen(textToSplit) - (numFields - 1) + /* Field delimiters do not appear in output */ numFields; /* Each field gets a NUL terminator */ for (sz=0; sz < spaceNeeded+1; sz++) { memset(fields, -1, sizeof(fields)); status = U_ZERO_ERROR; numFields = uregex_split(re, buf, sz, &requiredCapacity, fields, 10, &status); if (sz >= spaceNeeded) { TEST_ASSERT_SUCCESS(status); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second", fields[1], TRUE); TEST_ASSERT_STRING(" third", fields[2], TRUE); } else { TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); } TEST_ASSERT(numFields == 3); TEST_ASSERT(fields[3] == NULL); TEST_ASSERT(spaceNeeded == requiredCapacity); } } } uregex_close(re); } /* Split(), part 2. Patterns with capture groups. The capture group text * comes out as additional fields. */ { UChar textToSplit[80]; UChar buf[200]; UChar *fields[10]; int32_t numFields; int32_t requiredCapacity; int32_t spaceNeeded; int32_t sz; u_uastrncpy(textToSplit, "first <tag-a> second<tag-b> third", sizeof(textToSplit)/2); status = U_ZERO_ERROR; re = uregex_openC("<(.*?)>", 0, NULL, &status); uregex_setText(re, textToSplit, -1, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... 
*/ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 5); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second", fields[2], TRUE); TEST_ASSERT_STRING("tag-b", fields[3], TRUE); TEST_ASSERT_STRING(" third", fields[4], TRUE); TEST_ASSERT(fields[5] == NULL); spaceNeeded = strlen("first .tag-a. second.tag-b. third."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } } /* Split with too few output strings available (2) */ status = U_ZERO_ERROR; memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 2); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING(" second<tag-b> third", fields[1], TRUE); TEST_ASSERT(!memcmp(&fields[2],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first . second<tag-b> third."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } /* Split with too few output strings available (3) */ status = U_ZERO_ERROR; memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 3, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 3); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second<tag-b> third", fields[2], TRUE); TEST_ASSERT(!memcmp(&fields[3],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first .tag-a. second<tag-b> third."); /* "." 
at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } /* Split with just enough output strings available (5) */ status = U_ZERO_ERROR; memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 5, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 5); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second", fields[2], TRUE); TEST_ASSERT_STRING("tag-b", fields[3], TRUE); TEST_ASSERT_STRING(" third", fields[4], TRUE); TEST_ASSERT(!memcmp(&fields[5],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first .tag-a. second.tag-b. third."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } /* Split, end of text is a field delimiter. */ status = U_ZERO_ERROR; sz = strlen("first <tag-a> second<tag-b>"); uregex_setText(re, textToSplit, sz, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { memset(fields, -1, sizeof(fields)); numFields = uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 9, &status); TEST_ASSERT_SUCCESS(status); /* The TEST_ASSERT_SUCCESS call above should change too... */ if(U_SUCCESS(status)) { TEST_ASSERT(numFields == 4); TEST_ASSERT_STRING("first ", fields[0], TRUE); TEST_ASSERT_STRING("tag-a", fields[1], TRUE); TEST_ASSERT_STRING(" second", fields[2], TRUE); TEST_ASSERT_STRING("tag-b", fields[3], TRUE); TEST_ASSERT(fields[4] == NULL); TEST_ASSERT(fields[8] == NULL); TEST_ASSERT(!memcmp(&fields[9],&minus1,sizeof(UChar*))); spaceNeeded = strlen("first .tag-a. second.tag-b."); /* "." at NUL positions */ TEST_ASSERT(spaceNeeded == requiredCapacity); } } uregex_close(re); } }
U_NAMESPACE_BEGIN

// Build the Whole Script Confusable data
//
// Parses the whole-script-confusables source text, builds two tries
// ("any case" and "lower case") mapping code points to script-set indexes,
// merges duplicate script sets, and serializes the tries plus the run-time
// ScriptSet array into spImpl->fSpoofData.
//
//   spImpl            Spoof detector whose fSpoofData receives the serialized data.
//   confusablesWS     Source text, UTF-8.
//   confusablesWSLen  Length of confusablesWS, passed through to u_strFromUTF8().
//   pe                On failure, pe->line is set to the number of the last input
//                     line reached.  May be NULL.
//   status            ICU error code, in/out.  Nothing is done if it already
//                     indicates a failure on entry.
//
// Capture groups of the parse regular expression, as used below:
//   1: blank or comment line        2: start code point (hex)
//   3: optional range end (hex)     4: source script name
//   5: target script name           7: selects the lower case table
//   8: syntax error
//
// TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
//                     because everything is local to this one build function anyhow,
//                       OR
//                     break this function into more reasonably sized pieces, with
//                     state in WSConfusableDataBuilder.
//
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return;
    }
    URegularExpression *parseRegexp = NULL;
    int32_t             inputLen    = 0;
    UChar              *input       = NULL;
    int32_t             lineNum     = 0;

    UVector            *scriptSets        = NULL;
    uint32_t            rtScriptSetsCount = 2;

    UTrie2             *anyCaseTrie   = NULL;
    UTrie2             *lowerCaseTrie = NULL;

    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);

    UnicodeString pattern(parseExp, -1, US_INV);

    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0:  Code point has no whole script confusables.
    //   1:  Code point is of script Common or Inherited.
    //       These code points do not participate in whole script confusable detection.
    //       (This is logically equivalent to saying that they contain confusables in
    //        all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.

    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    // The first call is a preflight: it is expected to report the required
    // length through U_BUFFER_OVERFLOW_ERROR; anything else ends the build.
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }

        // Pick up the start and optional range end code points from the parsed line.
        UChar32  startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32  endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }

        // Extract the two script names from the source line.  We need these in an 8 bit
        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
        char  srcScriptName[20];
        char  targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript  =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }

        // Build the set of scripts containing confusable characters for
        //   the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        //   as the source script indicated in the input file.  Failure of this check is
        //   an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->sset = new ScriptSet();
                setIndex = scriptSets->size();
                bsset->index = setIndex;
                bsset->rindex = 0;
                if (bsset->sset == NULL) {
                    // bsset has not been added to scriptSets yet, so the cleanup
                    // code below would never see it; release it here.
                    delete bsset;
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->set(targScript, status);
            bsset->sset->set(srcScript, status);

            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }

    // Eliminate duplicate script sets.  At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        int32_t duplicateCount = 0;
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                //   It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                    duplicateCount++;
                }
                // But this doesn't get all.  We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }

    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    //     are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }

    // For code points with script==Common or script==Inherited,
    //   Set the reserved value of 1 into both Tries.  These characters do not participate
    //   in Whole Script Confusable detection; this reserved value is the means
    //   by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }

    // Serialize the data to the Spoof Detector
    //
    // NOTE(review): fAnyCaseTrie / fLowerCaseTrie temporarily point at the builder
    //   tries, which are closed in the cleanup code below.  If one of the
    //   "goto cleanup" exits in this section is taken after such an assignment,
    //   the field may be left referring to a closed trie - confirm how SpoofData
    //   treats these fields on failure.
    {
        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
        // Preflight call: expected to report the size via U_BUFFER_OVERFLOW_ERROR.
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);

        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);

        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets = static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }

    // Open new utrie2s from the serialized data.  We don't want to keep the ones
    //   we just built because we would then have two copies of the data, one internal to
    //   the utries that we have already constructed, and one in the serialized data area.
    //   An alternative would be to not pre-serialize the Trie data, but that makes the
    //   spoof detector data different, depending on how the detector was constructed.
    //   It's simpler to keep the data always the same.

    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
        UTRIE2_16_VALUE_BITS,
        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status);

    // The lower case trie must be opened with its own serialized length,
    // not the length of the any case trie.
    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
        UTRIE2_16_VALUE_BITS,
        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength, NULL, &status);

cleanup:
    if (U_FAILURE(status) && pe != NULL) {
        pe->line = lineNum;
    }
    uregex_close(parseRegexp);
    uprv_free(input);

    int32_t i;
    if (scriptSets != NULL) {
        for (i=0; i<scriptSets->size(); i++) {
            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            delete bsset;
        }
        delete scriptSets;
    }
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
}
/*
 * Forwarding shim around ICU's uregex_open().
 *
 * Every argument is handed to uregex_open() unchanged, and whatever that call
 * returns is handed back to the caller.
 */
URegularExpression *
__hs_uregex_open(const UChar *pattern, int32_t patternLength, uint32_t flags,
                 UParseError *pe, UErrorCode *status)
{
    URegularExpression *compiled;

    compiled = uregex_open(pattern, patternLength, flags, pe, status);
    return compiled;
}
int main(void) { int ret; int32_t l, u; UErrorCode status; UFILE *ustdout, *ustderr; URegularExpression *uregex; UBool case_insensitive = FALSE; UParseError pe = {-1, -1, {0}, {0}}; UChar pattern[] = { 0x0028, // 28, ( 0x005C, // 5C, backslash 0x0070, // 70, p 0x007B, // 7B, { 0x004C, // 4C, L 0x007D, // 7D, } 0x0029, // 29, ) 0x0028, // 28, ( 0x005C, // 5C, backslash 0x0070, // 70, p 0x007B, // 7B, { 0x004E, // 4E, N 0x0064, // 64, d 0x007D, // 7D, } 0x0029, // 29, ) 0 }; UChar string[] = { 0xD835, 0xDE3C, // A 0xD835, 0xDE3C, // A 0xD835, 0xDFE2, // 0 0xD835, 0xDE3D, // B 0xD835, 0xDE3D, // B 0xD835, 0xDFE3, // 1 0xD835, 0xDE3E, // C 0xD835, 0xDE3E, // C 0xD835, 0xDFE4, // 2 0 }; uregex = NULL; ret = EXIT_SUCCESS; status = U_ZERO_ERROR; ustdout = u_finit(stdout, NULL, NULL); ustderr = u_finit(stderr, NULL, NULL); uregex = uregex_open(pattern, -1, case_insensitive ? UREGEX_CASE_INSENSITIVE : 0, &pe, &status); if (U_FAILURE(status)) { if (U_REGEX_RULE_SYNTAX == status) { u_fprintf(ustderr, "Invalid pattern: error at offset %d\n\t%S\n\t%*c\n", pe.offset, pattern, pe.offset, '^'); } goto end; } uregex_setText(uregex, string, -1, &status); if (U_FAILURE(status)) { goto end; } while (uregex_findNext(uregex, &status)) { l = uregex_start(uregex, 0, &status); if (U_FAILURE(status)) { goto end; } u = uregex_end(uregex, 0, &status); if (U_FAILURE(status)) { goto end; } // $0 u_fprintf(ustdout, "Match found at %d position (to %d): %.*S\n", l, u, u - l, string + l); // $1 à $2 (on pourrait aussi utiliser uregex_group avec un groupNum à valeur 0 pour récupérer $0) { UChar buffer[1024]; int32_t i, l, g; l = uregex_groupCount(uregex, &status); if (U_FAILURE(status)) { icu_error(status, "uregex_groupCount"); } for (i = 1; i <= l; i++) { g = uregex_group(uregex, i, buffer, USTRING_SIZE(buffer), &status); if (U_FAILURE(status)) { icu_error(status, "uregex_group"); } u_fprintf(ustdout, "$%d : %S\n", i, buffer); } } } if (U_FAILURE(status)) { goto end; } if (FALSE) { end: ret = 
EXIT_FAILURE; } if (NULL != ustderr) { u_fclose(ustderr); } if (NULL != ustdout) { u_fclose(ustdout); } if (NULL != uregex) { uregex_close(uregex); } u_cleanup(); return ret; }