// convert escape char to literal char ucs4string ConvertEscape(const ucs4string &str) { ucs4string out; detail::escape_value<ucs4_t, ucs4_regex_traits::char_class_type> esc; ucs4string::const_iterator begin=str.begin(); ucs4string::const_iterator end=str.end(); compiler_traits<ucs4_regex_traits> ucs4traits; while(begin!=end) { if(*begin=='\\') { if(++begin == end) { //out.push_back('\\'); // last char is '\' throw regex_error(regex_constants::error_escape); } else { esc = detail::parse_escape(begin, end, ucs4traits); out += esc.ch_; } } else { out += *begin; ++begin; } } return out; }
// Select the wide-character variant of a literal and widen it to ucs4_t.
//
// The narrow (char) argument is ignored. The wide string is copied, one
// element at a time, into `inter_str`, and a pointer to that buffer is
// returned. `inter_str` is cleared on every call, so the returned pointer
// only stays meaningful until the next call (or any other modification of
// `inter_str`).
static ucs4_t const *pick(char const *, wchar_t const *cstr)
{
    inter_str.clear();
    for (wchar_t const *cur = cstr; *cur != 0; ++cur)
    {
        inter_str.push_back(ucs4_t(*cur));
    }
    return inter_str.c_str();
}
// Replace the first match of this regexp in `text` with `subst`.
//
// text    - string to search.
// subst   - replacement inserted in place of the matched span.
// matched - out: set to false when matchInternal() yields no region (in
//           which case `text` is returned unchanged), true otherwise.
//
// Returns the text before the match, followed by `subst`, followed by the
// text after the match.
ucs4string Regexp::replace(ucs4string& text, ucs4string& subst, bool& matched)
{
    OnigRegion* const region = matchInternal(text);
    if (region == NULL)
    {
        matched = false;
        return text;
    }

    // The region offsets are divided by sizeof(ucs4char) to turn them into
    // character indices into `text`.
    const ucs4string::size_type matchStart = region->beg[0] / sizeof(ucs4char);
    const ucs4string::size_type matchStop  = region->end[0] / sizeof(ucs4char);

    const ucs4string head = text.substr(0, matchStart);
    const ucs4string tail = text.substr(matchStop, text.size() - matchStop);

    matched = true;
    return (head + subst + tail).data();
}
// Recursively enumerate every way of splitting `to_seg` into consecutive
// pieces and keep the candidate with the lowest accumulated score.
//
// base    - pieces chosen so far, each preceded by a single space.
// base_rr - product of GetRateReciprocal() over the pieces in `base`.
// to_seg  - the remaining, not yet segmented text.
// data    - running best: m_rr is the lowest score seen so far and m_res
//           the corresponding space-separated segmentation.
//
// All proper prefixes of `to_seg` are tried first (recursing on the rest);
// afterwards `to_seg` taken as one piece is scored and recorded unless its
// score is strictly greater than the current best.
void Segmentation::DoSegment(ucs4string base, double base_rr, ucs4string to_seg, SegmentData& data)
{
    const ucs4_t sep = ucs4_t(' ');

    size_t cut = 1;
    while (cut < to_seg.size())
    {
        const ucs4string head = to_seg.substr(0, cut);
        DoSegment(base + sep + head,
                  base_rr * GetRateReciprocal(head),
                  to_seg.substr(cut),
                  data);
        ++cut;
    }

    const double whole_rr = base_rr * GetRateReciprocal(to_seg);
    if (whole_rr > data.m_rr)
    {
        return;
    }

    data.m_rr = whole_rr;
    data.m_res = base + sep + to_seg;
}
int main(int argc, char *argv[]) { if (argc != 3) { std::cerr << "Usage:" << argv[0] << " <ucs4-phrase-file> <ucs4-dictionary>" << std::endl; exit(1); } std::ifstream fin(argv[1], std::ios::binary); if (!fin.is_open()) { std::cerr << "Failed to open " << argv[1] << " for read." << std::endl; exit(2); } std::ofstream fout(argv[2], std::ios::binary); if (!fout.is_open()) { std::cerr << "Failed to open " << argv[2] << " for write." << std::endl; exit(3); } DictionaryGenerator dg; while (!fin.eof()) { ucs4string s; ucs4getline(fin, s); dg.SetmentPhrase(s); } DictMap::const_iterator it = dg.GetDictionary().begin(); DictMap::const_iterator itend = dg.GetDictionary().end(); for (; it!=itend; ++it) { const ucs4string ustr=it->first; unsigned int cnt=it->second; if (cnt==1 && ustr.size()!=1) continue; ucs4putstr(fout, ustr); ucs4putstr(fout, stdtoustr(" ")); ucs4putstr(fout, utoustr(cnt)); ucs4putch(fout, ucs4_t('\n')); } return 0; }
// Read one line of UCS-4 text from a binary stream.
//
// is   - input stream; characters are stored as consecutive 4-byte units.
// ustr - characters read are APPENDED to this string (it is not cleared).
//
// Reading stops after a '\n' unit (which is consumed but not stored), at
// end of input, or when the stream fails. A truncated trailing unit (fewer
// than 4 bytes left in the stream) is discarded instead of being appended
// as a partially filled character.
void ucs4getline(std::istream& is, ucs4string& ustr)
{
    const std::streamsize kUnitBytes = 4;

    for (;;)
    {
        ucs4_t ch = 0;
        // read() only succeeds when all requested bytes were extracted, so
        // a short or failed read ends the line without touching ustr.
        if (!is.read(reinterpret_cast<char *>(&ch), kUnitBytes))
            break;
        if (ch == ucs4_t('\n'))
            break;
        ustr.push_back(ch);
    }
}
unsigned int ustrtou(const ucs4string& ustr) { unsigned int u = 0; for (size_t i=0; i<ustr.size(); ++i) { ucs4_t ch = ustr[i]; if (ch<'0' || ch>'9') break; u = u * 10 + (ch - '0'); } return u; }
// N.B. If you call loadFileUnsafe, be sure that this code is inside the TRY_VM/CATCH_VM void VM::loadFileUnsafe(const ucs4string& file) { Registers r; saveRegisters(&r); const Object loadPort = Object::makeTextualInputFilePort(file.ascii_c_str()); TextualInputPort* p = loadPort.toTextualInputPort(); bool readErrorOccured = false; for (Object o = p->getDatum(readErrorOccured); !o.isEof(); o = p->getDatum(readErrorOccured)) { if (readErrorOccured) { callLexicalViolationImmidiaImmediately(this, "read", p->error()); } evaluateUnsafe(compile(o).toVector()); } restoreRegisters(&r); }
// (Re)build the shift tables for the pattern pat[0..len).
//
// If the stored pattern already equals `pat`, nothing is done. Otherwise
// the pattern is stored, every entry of the 65536-slot table is set to the
// default shift len+1, the overflow map is emptied, and each pattern
// character at index i gets shift len-i. Characters with code <= 0xFFFF go
// into m_UCS2_Table, all others into m_Table. When a character occurs more
// than once, its last occurrence wins.
void Build(const ucs4_t* pat, size_t len)
{
    const bool unchanged =
        m_Pattern.length() == len && IsTheSame(m_Pattern.c_str(), pat, (int)len);
    if (unchanged)
        return;

    m_Pattern.assign(pat, len);

    // Default shift for characters that do not occur in the pattern.
    const int defaultShift = m_Len_1 = (int)len + 1;
    for (size_t slot = 0; slot < 65536; ++slot)
    {
        m_UCS2_Table[slot] = defaultShift;
    }
    m_Table.clear();

    for (size_t pos = 0; pos < len; ++pos)
    {
        const unsigned int code = (unsigned int)(pat[pos]);
        const int shift = (int)(len - pos);
        if (code > 0xFFFF)
        {
            m_Table[code] = shift;
        }
        else
        {
            m_UCS2_Table[code] = shift;
        }
    }
}
// Compile `pattern` into an Oniguruma regex.
//
// pattern      - regular expression source text.
// caseFold     - when true, adds ONIG_OPTION_IGNORECASE.
// isSingleLine - when true, adds ONIG_OPTION_SINGLELINE.
//
// The constructor does not throw on a bad pattern. If onig_new() reports
// anything other than ONIG_NORMAL, isErrorOccured_ is set, errorMessage_
// receives Oniguruma's description of the error code, and irritants_ is
// built from the offending pattern.
Regexp::Regexp(const ucs4string& pattern, bool caseFold, bool isSingleLine) :
    pattern_(pattern),
    isErrorOccured_(false),
    errorMessage_(Object::Nil),
    irritants_(Object::Nil)
{
    // Oniguruma takes the pattern as a [start, end) byte range, so the
    // ucs4char buffer is passed through uint8_t pointers.
    const ucs4char* p = pattern_.data();
    int r = onig_new(&regexp_,
                     (const uint8_t*)p,
                     (const uint8_t*)(p + pattern_.size()),
                     (ONIG_OPTION_DEFAULT) |
                     (caseFold ? ONIG_OPTION_IGNORECASE : 0) |
                     (isSingleLine ? ONIG_OPTION_SINGLELINE : 0),
                     ONIG_ENCODING,
                     ONIG_SYNTAX_RUBY,
                     &einfo_);
    if (r != ONIG_NORMAL)
    {
        char errorMessageBuffer[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str((uint8_t*)errorMessageBuffer, r, &einfo_);
        isErrorOccured_ = true;
        errorMessage_ = errorMessageBuffer;
        irritants_ = L1(Object::makeString(pattern.data()));
    }
}
// Write the characters of `ustr` to `os` as raw bytes, 4 bytes per
// character, straight from the string's buffer. No terminator is written.
void ucs4putstr(std::ostream& os, const ucs4string& ustr)
{
    const char *raw = reinterpret_cast<const char *>(ustr.data());
    os.write(raw, ustr.size() * 4);
}
void TextualOutputPort::format(const VM* theVM, const ucs4string& fmt, Object args) { ucs4string buffer = UC(""); for (uint32_t i = 0; i < fmt.size(); i++) { if (fmt[i] == '~') { i++; if (!buffer.empty()) { putString(buffer); buffer.clear(); } switch (fmt[i]) { case '~': display(theVM, Object::makeChar('~')); break; case '%': display(theVM, Object::makeChar('\n')); break; case 'a': case 'A': case 'd': case 'D': { if (args.isPair()) { display(theVM, args.car()); args = args.cdr(); } else { isErrorOccured_ = true; errorMessage_ = "too few arguments for format string"; irritants_ = Pair::list1(Object::makeString(fmt)); return; } break; } case 's': case 'S': { if (args.isPair()) { putDatum(theVM, args.car()); args = args.cdr(); } else { isErrorOccured_ = true; errorMessage_ = "too few arguments for format string"; irritants_ = Pair::list1(Object::makeString(fmt)); return; } break; } case '\0': i--; break; } } else { buffer += fmt[i]; } } if (!buffer.empty()) { putString(buffer); } flush(); //fflush(stdout); // temp return; }
void TextualOutputPort::putString(const ucs4string& s) { for (ucs4string::size_type i = 0; i < s.size(); i++) { putCharHandleSpecial(s[i]); } }