/*
 * OCaml stub: find the first (unanchored) match of [v_regex] in [v_str] and
 * return a fresh OCaml string holding the text of submatch number [v_sub].
 *
 * Raises the OCaml exception registered as "mlre2__Regex_match_failed" (with
 * the pattern) when the regex does not match at all, and
 * "mlre2__Regex_submatch_did_not_capture" (with the pattern and [v_sub]) when
 * the regex matches but the requested group has no data.
 */
CAMLprim value mlre2__find_first(value v_regex, value v_sub, value v_str) {
  CAMLparam2(v_regex, v_str);
  CAMLlocal1(v_retval);
  CAMLlocalN(error_args, 2);

  const RE2 * re = Regex_val(v_regex);

  /* Validate the submatch index BEFORE allocating anything on the C++ heap.
   * assert_valid_sub is defined elsewhere; presumably it raises an OCaml
   * exception for an out-of-range index (TODO confirm). If it did so after
   * the new[] below, the array would never be freed, and a negative index
   * would already have produced a non-positive element count for new[]. */
  assert_valid_sub(re, v_sub);

  const char* input = String_val(v_str);
  int len = caml_string_length(v_str);
  StringPiece str = StringPiece(input, len);
  int n = Int_val(v_sub) + 1; /* slot 0 is the whole match */
  StringPiece * submatches = new StringPiece[n];

  if (! re->Match(str, 0, str.length(), RE2::UNANCHORED, submatches, n)) {
    /* free manually: raising does not return to this frame */
    delete[] submatches;
    caml_raise_with_string(*caml_named_value("mlre2__Regex_match_failed"),
        re->pattern().c_str());
  }

  StringPiece * sub = submatches + Int_val(v_sub);

  if (!sub->data()) {
    delete[] submatches;
    error_args[0] = caml_copy_string(re->pattern().c_str());
    error_args[1] = v_sub;
    caml_raise_with_args(*caml_named_value("mlre2__Regex_submatch_did_not_capture"),
        2, error_args);
  }

  v_retval = caml_alloc_string(sub->length());
  /* Copy via an offset from a freshly read String_val(v_str) rather than
   * through sub->data(): the allocation above may have moved v_str. */
  memcpy(String_val(v_retval),
      String_val(v_str) + (sub->data() - input),
      sub->length());
  delete[] submatches;
  CAMLreturn(v_retval);
}
void SplitStringKeepEmpty( const StringPiece& full, const StringPiece& delim, std::vector<std::string>* result) { // 单个字符的分隔符转调字符版本的分割函数,要快一些 if (delim.length() == 1) { SplitStringKeepEmpty(full, delim[0], result); return; } result->clear(); if (full.empty() || delim.empty()) return; size_t prev_pos = 0; size_t pos; std::string token; while ((pos = full.find(delim, prev_pos)) != std::string::npos) { token.assign(full.data() + prev_pos, pos - prev_pos); result->push_back(token); prev_pos = pos + delim.length(); } token.assign(full.data() + prev_pos, full.length() - prev_pos); result->push_back(token); }
/* Computes the offset (relative to [input]) at which the next search should
 * start: the larger of "first byte not yet examined" and "first byte after
 * the match". Returns -1 when [remaining] reports a negative length. */
static int new_pos(const char *input, StringPiece &remaining, int startpos, StringPiece &match) {
  if (remaining.length() < 0) {
    return -1;
  }

  /* narrowing these size_t's back to int is safe because StringPiece's track
   * their lengths using ints */
  const size_t examined_end = remaining.data() + startpos - input;
  const size_t matched_end = match.data() - input + match.length();

  if (examined_end > matched_end) {
    return (int) examined_end;
  }
  return (int) matched_end;
}
// Compares two UTF-8 strings by wrapping each one in a UCharIterator and
// delegating to the iterator-based compare(). Returns UCOL_EQUAL without doing
// any work when `status` already holds a failure code.
UCollationResult Collator::compareUTF8(const StringPiece &source,
                                       const StringPiece &target,
                                       UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return UCOL_EQUAL;
    }

    UCharIterator sourceIter;
    UCharIterator targetIter;
    uiter_setUTF8(&sourceIter, source.data(), source.length());
    uiter_setUTF8(&targetIter, target.data(), target.length());

    return compare(sourceIter, targetIter, status);
}
// Converts a UTF-8 byte sequence to a UTF-16 string. Returns an empty string
// when utf8_to_utf16_length() reports a length that is zero or negative.
std::u16string utf8ToUtf16(const StringPiece& utf8) {
  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(utf8.data());

  const ssize_t unitCount = utf8_to_utf16_length(bytes, utf8.length());
  if (unitCount <= 0) {
    return {};
  }

  // Size the destination first, then let the converter write into it.
  std::u16string converted;
  converted.resize(unitCount);
  utf8_to_utf16(bytes, utf8.length(), &*converted.begin());
  return converted;
}
/* Returns how far the search position must advance past the start of [match]
 * so that a zero-length match is not returned again and again. */
static int ensure_progress(StringPiece &str, const StringPiece &match) {
  static RE2 re(".");

  /* A non-empty match already guarantees progress. */
  if (match.length() > 0) {
    return match.length();
  }

  if (str.length() > 0) {
    /* Drop one character from the front of a copy of [str] and report how
       many bytes that was. This goes through a regex call because that is
       the easiest way to step over a multibyte Unicode character. */
    StringPiece rest = str;
    RE2::Consume(&rest, re);
    return (int) (rest.data() - str.data());
  }

  /* we halt on negative length strings, so this value is arbitrary */
  return 1;
}
/// Return the vocID of word "text" if it exist in the vocabulary /// Otherwise return 0 IndexType C_IDVocabulary::returnId(const StringPiece &text) const { StringPiece stemmed(text); if (stem_len_ > 0) { //take first stem_len_ characters stemmed = text.substr(stem_len_); } else if ((stem_len_ < 0) && (text.length() > -stem_len_)) {//take last stem_len_ characters stemmed = text.substr(text.length() + stem_len_, -stem_len_); } //otherwise leave as it is WordId::const_iterator iter = word_id_.find(stemmed); if(iter == word_id_.end()) { return 0; } else { return iter->second; } }
// ------------------------------------- void DigitList::set(const StringPiece &source, UErrorCode &status) { if (U_FAILURE(status)) { return; } // Figure out a max number of digits to use during the conversion, and // resize the number up if necessary. int32_t numDigits = source.length(); if (numDigits > fContext.digits) { fContext.digits = numDigits; char *t = fStorage.resize(sizeof(decNumber) + numDigits, fStorage.getCapacity()); if (t == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } fDecNumber = (decNumber *)fStorage.getAlias(); } fContext.status = 0; uprv_decNumberFromString(fDecNumber, source.data(), &fContext); if ((fContext.status & DEC_Conversion_syntax) != 0) { status = U_DECIMAL_NUMBER_SYNTAX_ERROR; } fHaveDouble = FALSE; }
void DoSplitLines( const StringPiece& full, std::vector<StringType>* result, bool keep_line_endling ) { result->clear(); size_t prev_pos = 0; size_t pos; StringType token; while ((pos = full.find('\n', prev_pos)) != std::string::npos) { token.assign(full.data() + prev_pos, pos - prev_pos + 1); if (!keep_line_endling) RemoveLineEnding(&token); result->push_back(token); prev_pos = pos + 1; } if (prev_pos < full.size()) { token.assign(full.data() + prev_pos, full.length() - prev_pos); if (!keep_line_endling) RemoveLineEnding(&token); result->push_back(token); } }
// For each character in characters_wanted, sets the index corresponding // to the ASCII code of that character to 1 in table. This is used by // the m_find.*_of methods below to tell whether or not a character is in // the lookup table in constant time. // The argument `table' must be an array that is large enough to hold all // the possible values of an unsigned char. Thus it should be be declared // as follows: // bool table[UCHAR_MAX + 1] static inline void BuildLookupTable(const StringPiece& characters_wanted, bool* table) { const size_type length = characters_wanted.length(); const char* const data = characters_wanted.data(); for (size_type i = 0; i < length; ++i) { table[static_cast<unsigned char>(data[i])] = true; } }
/*
 * OCaml stub: collect the text of submatch [v_sub] for every successive
 * (unanchored) match of [v_regex] in [v_str] and return them as an OCaml
 * string list in match order.
 *
 * Raises the OCaml exception registered as "mlre2__Regex_match_failed" (with
 * the pattern) when no match captured the requested submatch.
 */
CAMLprim value mlre2__find_all(value v_regex, value v_sub, value v_str) {
  CAMLparam2(v_regex, v_str);
  CAMLlocal3(v_retval, v_car, v_cons);

  const RE2 * re = Regex_val(v_regex);

  /* Validate the submatch index BEFORE allocating anything on the C++ heap.
   * assert_valid_sub is defined elsewhere; presumably it raises an OCaml
   * exception for an out-of-range index (TODO confirm). If it did so after
   * the new[] below, the array would never be freed, and a negative index
   * would already have produced a non-positive element count for new[]. */
  assert_valid_sub(re, v_sub);

  std::vector<StringPiece> results;
  const char* input = String_val(v_str);
  int len = caml_string_length(v_str);
  StringPiece str = StringPiece(input, len);
  int n = Int_val(v_sub) + 1; /* slot 0 is the whole match */
  int startpos = 0;
  StringPiece * matches = new StringPiece[n];
  StringPiece * sub = matches + Int_val(v_sub);

  while (str.length() > startpos
      && re->Match(str, startpos, str.length(), RE2::UNANCHORED, matches, n)) {
    /* advance past this match, stepping at least one character on an
       empty match so the loop terminates */
    startpos += ensure_progress(str, matches[0]);
    startpos = new_pos(input, str, startpos, matches[0]);
    /* push_back followed by back-to-front consing gives the correct final order */
    if (sub->data()) {
      results.push_back(*sub);
    }
  }

  if (results.empty()) {
    /* free manually: raising does not return to this frame.
     * NOTE(review): [results] is empty here, so skipping its destructor is
     * not expected to leak - confirm for the standard library in use. */
    delete[] matches;
    caml_raise_with_string(*caml_named_value("mlre2__Regex_match_failed"),
        re->pattern().c_str());
  }

  v_retval = Val_emptylist;
  for (std::vector<StringPiece>::reverse_iterator it = results.rbegin();
      it != results.rend(); ++it) {
    v_car = caml_alloc_string(it->length());
    /* Copy via an offset from a freshly read String_val(v_str): the
     * allocation above may have moved v_str. */
    memcpy(String_val(v_car),
        String_val(v_str) + (it->data() - input),
        it->length());
    v_cons = caml_alloc_small(2, Tag_cons);
    Field(v_cons, 0) = v_car;
    Field(v_cons, 1) = v_retval;
    v_retval = v_cons;
  }

  delete[] matches;
  CAMLreturn(v_retval);
}
// Uppercases the UTF-8 text in `src` by handing it to ucasemap_mapUTF8()
// together with the internal UTF-8 to-upper mapper; `sink`, `edits` and
// `errorCode` are passed straight through.
void CaseMap::utf8ToUpper(
        const char *locale, uint32_t options,
        StringPiece src, ByteSink &sink, Edits *edits,
        UErrorCode &errorCode) {
    // Resolve the locale ID to the case-mapping locale code up front.
    const int32_t caseLocale = ustrcase_getCaseLocale(locale);

    // UCASEMAP_BREAK_ITERATOR_NULL is deliberately written without a
    // following comma; presumably the macro supplies its own separator when
    // it expands to an argument (confirm against its definition).
    ucasemap_mapUTF8(
        caseLocale, options, UCASEMAP_BREAK_ITERATOR_NULL
        src.data(), src.length(),
        ucasemap_internalUTF8ToUpper,
        sink, edits, errorCode);
}
void CaseMap::utf8Fold( uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) { ucasemap_mapUTF8( UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL src.data(), src.length(), ucasemap_internalUTF8Fold, sink, edits, errorCode); }
// JNI entry point: parses `text` with the DecimalFormat identified by `addr`,
// starting at the index held by the Java ParsePosition `position`.
//
// On success the ParsePosition's index is updated and a boxed number is
// returned (a BigDecimal or Double when `parseBigDecimal` is set, otherwise a
// Double or Long depending on the parsed type). On a parse error the
// ParsePosition's error index is set and NULL is returned. NULL is also
// returned for an invalid string, an out-of-range start index, a failure to
// obtain the decimal number, or an unsupported result type.
static jobject NativeDecimalFormat_parse(JNIEnv* env, jclass, jlong addr, jstring text,
        jobject position, jboolean parseBigDecimal) {

    static jmethodID gPP_getIndex = env->GetMethodID(JniConstants::parsePositionClass, "getIndex", "()I");
    static jmethodID gPP_setIndex = env->GetMethodID(JniConstants::parsePositionClass, "setIndex", "(I)V");
    static jmethodID gPP_setErrorIndex = env->GetMethodID(JniConstants::parsePositionClass, "setErrorIndex", "(I)V");

    ScopedJavaUnicodeString src(env, text);
    if (!src.valid()) {
        return NULL;
    }

    // Make sure the ParsePosition is valid. icu4c would actually parse a
    // number correctly even with a position of -1, but since the RI fails in
    // that case we have to fail too.
    int startIndex = env->CallIntMethod(position, gPP_getIndex, NULL);
    if (startIndex < 0 || startIndex > env->GetStringLength(text)) {
        return NULL;
    }

    Formattable parsed;
    ParsePosition parsePosition(startIndex);
    DecimalFormat* formatter = toDecimalFormat(addr);
    formatter->parse(src.unicodeString(), parsed, parsePosition);

    // Report the outcome back into the Java ParsePosition.
    if (parsePosition.getErrorIndex() != -1) {
        env->CallVoidMethod(position, gPP_setErrorIndex, parsePosition.getErrorIndex());
        return NULL;
    }
    env->CallVoidMethod(position, gPP_setIndex, parsePosition.getIndex());

    if (parseBigDecimal) {
        UErrorCode status = U_ZERO_ERROR;
        StringPiece decimalText = parsed.getDecimalNumber(status);
        if (U_FAILURE(status)) {
            return NULL;
        }

        int len = decimalText.length();
        const char* data = decimalText.data();

        // NaN and the infinities are returned as a Double rather than being
        // handed to newBigDecimal().
        const bool isNonFinite = strncmp(data, "NaN", 3) == 0 ||
                                 strncmp(data, "Inf", 3) == 0 ||
                                 strncmp(data, "-Inf", 4) == 0;
        if (isNonFinite) {
            double resultDouble = parsed.getDouble(status);
            return doubleValueOf(env, resultDouble);
        }
        return newBigDecimal(env, data, len);
    }

    switch (parsed.getType()) {
        case Formattable::kDouble:
            return doubleValueOf(env, parsed.getDouble());
        case Formattable::kLong:
            return longValueOf(env, parsed.getLong());
        case Formattable::kInt64:
            return longValueOf(env, parsed.getInt64());
        default:
            return NULL;
    }
}
/*
 * OCaml stub: perform one step of iterating over the matches of [v_regex] in
 * [v_input], searching (unanchored) from byte offset [v_pos].
 *
 * [v_max_submatch] selects how many capture groups to report; a negative
 * value means "all groups of the regex".
 *
 * The result is built with the PAIR and SOME macros, which are defined
 * elsewhere (presumably they build an OCaml pair and an option value in their
 * first argument - confirm). When there is no match, or [v_pos] is past the
 * end of the input, the result is PAIR(-1, Val_none). Otherwise it pairs the
 * position for the next search with SOME of an array holding, per submatch,
 * either Val_none or SOME of an (offset, length) pair relative to the start
 * of [v_input].
 */
CAMLprim value mlre2__iter_next(value v_regex, value v_pos, value v_max_submatch, value v_input) {
  CAMLparam2(v_regex, v_input);
  CAMLlocal3(v_retval, v_match_array, v_match);
  /* [v_retval] is the return value.
   * [v_match_array] is the array used to return captured substrings
   * [v_match] is the substring captured by a submatch.
   */

  const RE2 * re = Regex_val(v_regex);
  const char * input = String_val(v_input);
  int startpos = Int_val(v_pos);
  int len = caml_string_length(v_input);
  StringPiece str = StringPiece(input, len);
  /* a negative request means "every capturing group in the regex" */
  int max_submatch = Int_val(v_max_submatch) < 0
    ? re->NumberOfCapturingGroups()
    : Int_val(v_max_submatch);
  /* +1 for whole match ("subpattern zero") */
  int n = 1 + (max_submatch > 0 ? max_submatch : 0);
  StringPiece *submatches = new StringPiece[n];
  StringPiece *sub = submatches; /* extra pointer for iterating over [submatches] */

  if (str.length() < startpos
      || ! re->Match(str, startpos, str.length(), RE2::UNANCHORED, submatches, n)) {
    /* nothing (more) to report */
    PAIR(v_retval, Val_int(-1), Val_none);
  } else {
    /* make sure an empty match still moves the position forward */
    startpos += ensure_progress(str, submatches[0]);
    v_match_array = caml_alloc_tuple(n);
    for (int i = 0; i < n; ++i) {
      sub = submatches + i;
      if (sub->data()) {
        /* report the capture as (offset from start of input, length) */
        PAIR(v_retval, Val_int((int)(sub->data() - input)), Val_int(sub->length()));
        SOME(v_match, v_retval);
      } else v_match = Val_none; /* this group did not participate */
      Store_field(v_match_array, i, v_match);
    }
    SOME(v_match, v_match_array);
    PAIR(v_retval, Val_int(new_pos(input, str, startpos, submatches[0])), v_match);
  }
  /* both branches fall through to here, so the array is always released */
  delete[] submatches;
  CAMLreturn(v_retval);
}
/** * Set the DigitList from a decimal number string. * * The incoming string _must_ be nul terminated, even though it is arriving * as a StringPiece because that is what the decNumber library wants. * We can get away with this for an internal function; it would not * be acceptable for a public API. */ void DigitList::set(StringPiece source, UErrorCode &status, uint32_t /*fastpathBits*/) { if (U_FAILURE(status)) { return; } #if 0 if(fastpathBits==(kFastpathOk|kNoDecimal)) { int32_t size = source.size(); const char *data = source.data(); int64_t r = 0; int64_t m = 1; // fast parse while(size>0) { char ch = data[--size]; if(ch=='+') { break; } else if(ch=='-') { r = -r; break; } else { int64_t d = ch-'0'; //printf("CH[%d]=%c, %d, *=%d\n", size,ch, (int)d, (int)m); r+=(d)*m; m *= 10; } } //printf("R=%d\n", r); set(r); } else #endif { // Figure out a max number of digits to use during the conversion, and // resize the number up if necessary. int32_t numDigits = source.length(); if (numDigits > fContext.digits) { // fContext.digits == fStorage.getCapacity() decNumber *t = fStorage.resize(numDigits, fStorage.getCapacity()); if (t == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } fDecNumber = t; fContext.digits = numDigits; } fContext.status = 0; uprv_decNumberFromString(fDecNumber, source.data(), &fContext); if ((fContext.status & DEC_Conversion_syntax) != 0) { status = U_DECIMAL_NUMBER_SYNTAX_ERROR; } } internalClear(); }
void C_IDVocabulary::Insert(const StringPiece &word, IndexType id) { if (id_word_.size() < id) { id_word_.resize(id + 1); } char *&place = id_word_[id]; if (place) { StringPiece existing(place); if (existing != word) { std::cerr << "Duplicate SALM vocab id " << id << ": " << existing << " and " << word << std::endl; abort(); } else { return; } } place = new char[word.length() + 1]; memcpy(place, word.data(), word.length()); place[word.length()] = 0; std::pair<WordId::iterator, bool> result(word_id_.insert(make_pair(StringPiece(place, word.length()), id))); if (!result.second) { std::cerr << "SALM words " << word << " and " << result.first->second << " have the same id " << id << std::endl; abort(); } }
// Do not assert in this function since it is used by the asssertion code! std::wstring SysMultiByteToWide(const StringPiece& mb, uint32 code_page) { if (mb.empty()) return std::wstring(); int mb_length = static_cast<int>(mb.length()); // Compute the length of the buffer. int charcount = MultiByteToWideChar(code_page, 0, mb.data(), mb_length, NULL, 0); if (charcount == 0) return std::wstring(); std::wstring wide; wide.resize(charcount); MultiByteToWideChar(code_page, 0, mb.data(), mb_length, &wide[0], charcount); return wide; }
// This function merges a vector of string components void JoinStrings( const std::vector<std::string>& components, const StringPiece& delim, std::string* result) { size_t length = 0; for (std::vector<std::string>::const_iterator iter = components.begin(); iter != components.end(); ++iter) { if (iter != components.begin()) { length += delim.length(); } length += iter->size(); } result->reserve(length); return JoinStrings<std::vector<std::string>::const_iterator>( components.begin(), components.end(), delim, result); }
// Replace the first "old" pattern with the "new" pattern in a string std::string ReplaceFirst( const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub) { if (oldsub.empty()) return s.as_string(); std::string res; std::string::size_type pos = s.find(oldsub); if (pos == std::string::npos) return s.as_string(); else { res.append(s.data(), pos); res.append(newsub.data(), newsub.size()); res.append(s.data() + pos + oldsub.size(), s.length() - pos - oldsub.size()); } return res; }
// Replace all the "old" pattern with the "new" pattern in a string std::string ReplaceAll(const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub) { if (oldsub.empty()) return s.as_string(); std::string res; std::string::size_type start_pos = 0; std::string::size_type pos; do { pos = s.find(oldsub, start_pos); if (pos == std::string::npos) { break; } res.append(s.data() + start_pos, pos - start_pos); res.append(newsub.data(), newsub.size()); start_pos = pos + oldsub.size(); } while (true); res.append(s.data() + start_pos, s.length() - start_pos); return res; }
/** Purpose: split one string into several strings, keeping empty tokens.
 *  Parameters:
 *      input   const StringPiece& full             the string to split
 *      input   char delim                          the delimiter character
 *      output  std::vector<std::string>* result    the resulting pieces
 *                                                  (cleared first; stays empty
 *                                                  when full is empty)
 */
void SplitStringKeepEmpty(
    const StringPiece& full,
    char delim,
    std::vector<std::string>* result)
{
    result->clear();

    if (full.empty())
        return;

    size_t token_begin = 0;
    for (size_t hit = full.find(delim, token_begin);
         hit != std::string::npos;
         hit = full.find(delim, token_begin)) {
        result->push_back(std::string(full.data() + token_begin, hit - token_begin));
        token_begin = hit + 1;
    }

    // Whatever follows the last delimiter (possibly nothing) is the final token.
    result->push_back(std::string(full.data() + token_begin, full.length() - token_begin));
}
// Sets this number from the decimal string `str`. Nothing is set when the
// CharString copy leaves `status` in a failure state.
void DecNum::setTo(StringPiece str, UErrorCode& status) {
    // decNumber needs a NUL-terminated buffer; CharString guarantees one,
    // a StringPiece does not, so copy the text first.
    CharString nulTerminated(str, status);
    if (U_SUCCESS(status)) {
        _setTo(nulTerminated.data(), str.length(), status);
    }
}
void StringTest::TestStringPiece() { // Default constructor. StringPiece empty; if(!empty.empty() || empty.data()!=NULL || empty.length()!=0 || empty.size()!=0) { errln("StringPiece() failed"); } // Construct from NULL const char * pointer. StringPiece null(NULL); if(!null.empty() || null.data()!=NULL || null.length()!=0 || null.size()!=0) { errln("StringPiece(NULL) failed"); } // Construct from const char * pointer. static const char *abc_chars="abc"; StringPiece abc(abc_chars); if(abc.empty() || abc.data()!=abc_chars || abc.length()!=3 || abc.size()!=3) { errln("StringPiece(abc_chars) failed"); } // Construct from const char * pointer and length. static const char *abcdefg_chars="abcdefg"; StringPiece abcd(abcdefg_chars, 4); if(abcd.empty() || abcd.data()!=abcdefg_chars || abcd.length()!=4 || abcd.size()!=4) { errln("StringPiece(abcdefg_chars, 4) failed"); } #if U_HAVE_STD_STRING // Construct from std::string. std::string uvwxyz_string("uvwxyz"); StringPiece uvwxyz(uvwxyz_string); if(uvwxyz.empty() || uvwxyz.data()!=uvwxyz_string.data() || uvwxyz.length()!=6 || uvwxyz.size()!=6) { errln("StringPiece(uvwxyz_string) failed"); } #endif // Substring constructor with pos. StringPiece sp(abcd, -1); if(sp.empty() || sp.data()!=abcdefg_chars || sp.length()!=4 || sp.size()!=4) { errln("StringPiece(abcd, -1) failed"); } sp=StringPiece(abcd, 5); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("StringPiece(abcd, 5) failed"); } sp=StringPiece(abcd, 2); if(sp.empty() || sp.data()!=abcdefg_chars+2 || sp.length()!=2 || sp.size()!=2) { errln("StringPiece(abcd, -1) failed"); } // Substring constructor with pos and len. 
sp=StringPiece(abcd, -1, 8); if(sp.empty() || sp.data()!=abcdefg_chars || sp.length()!=4 || sp.size()!=4) { errln("StringPiece(abcd, -1, 8) failed"); } sp=StringPiece(abcd, 5, 8); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("StringPiece(abcd, 5, 8) failed"); } sp=StringPiece(abcd, 2, 8); if(sp.empty() || sp.data()!=abcdefg_chars+2 || sp.length()!=2 || sp.size()!=2) { errln("StringPiece(abcd, -1) failed"); } sp=StringPiece(abcd, 2, -1); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("StringPiece(abcd, 5, -1) failed"); } // static const npos const int32_t *ptr_npos=&StringPiece::npos; if(StringPiece::npos!=0x7fffffff || *ptr_npos!=0x7fffffff) { errln("StringPiece::npos!=0x7fffffff"); } // substr() method with pos, using len=npos. sp=abcd.substr(-1); if(sp.empty() || sp.data()!=abcdefg_chars || sp.length()!=4 || sp.size()!=4) { errln("abcd.substr(-1) failed"); } sp=abcd.substr(5); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("abcd.substr(5) failed"); } sp=abcd.substr(2); if(sp.empty() || sp.data()!=abcdefg_chars+2 || sp.length()!=2 || sp.size()!=2) { errln("abcd.substr(-1) failed"); } // substr() method with pos and len. 
sp=abcd.substr(-1, 8); if(sp.empty() || sp.data()!=abcdefg_chars || sp.length()!=4 || sp.size()!=4) { errln("abcd.substr(-1, 8) failed"); } sp=abcd.substr(5, 8); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("abcd.substr(5, 8) failed"); } sp=abcd.substr(2, 8); if(sp.empty() || sp.data()!=abcdefg_chars+2 || sp.length()!=2 || sp.size()!=2) { errln("abcd.substr(-1) failed"); } sp=abcd.substr(2, -1); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("abcd.substr(5, -1) failed"); } // clear() sp=abcd; sp.clear(); if(!sp.empty() || sp.data()!=NULL || sp.length()!=0 || sp.size()!=0) { errln("abcd.clear() failed"); } // remove_prefix() sp=abcd; sp.remove_prefix(-1); if(sp.empty() || sp.data()!=abcdefg_chars || sp.length()!=4 || sp.size()!=4) { errln("abcd.remove_prefix(-1) failed"); } sp=abcd; sp.remove_prefix(2); if(sp.empty() || sp.data()!=abcdefg_chars+2 || sp.length()!=2 || sp.size()!=2) { errln("abcd.remove_prefix(2) failed"); } sp=abcd; sp.remove_prefix(5); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("abcd.remove_prefix(5) failed"); } // remove_suffix() sp=abcd; sp.remove_suffix(-1); if(sp.empty() || sp.data()!=abcdefg_chars || sp.length()!=4 || sp.size()!=4) { errln("abcd.remove_suffix(-1) failed"); } sp=abcd; sp.remove_suffix(2); if(sp.empty() || sp.data()!=abcdefg_chars || sp.length()!=2 || sp.size()!=2) { errln("abcd.remove_suffix(2) failed"); } sp=abcd; sp.remove_suffix(5); if(!sp.empty() || sp.length()!=0 || sp.size()!=0) { errln("abcd.remove_suffix(5) failed"); } }
/*
 * OCaml stub: returns (as an OCaml int, 1 or 0) whether [v_regex] matches
 * anywhere in [v_str]. No submatches are requested.
 */
CAMLprim value mlre2__matches(value v_regex, value v_str) {
  /* Build the StringPiece from the length OCaml records for the string, as
   * the other stubs in this file do. Constructing it from the bare char
   * pointer measures the text up to the first NUL, which would silently
   * truncate OCaml strings that contain embedded NUL bytes. */
  StringPiece str(String_val(v_str), caml_string_length(v_str));
  return Val_int(Regex_val(v_regex)->Match(str, 0, str.length(),
                                           RE2::UNANCHORED, NULL, 0));
}
// Returns true when `str` begins with `prefix`, comparing the first
// prefix.length() bytes with memcasecmp().
bool StringStartsWithIgnoreCase(const StringPiece& str, const StringPiece& prefix)
{
    // A string shorter than the prefix can never start with it.
    if (str.size() < prefix.size())
        return false;

    return memcasecmp(str.data(), prefix.data(), prefix.length()) == 0;
}