Example #1
0
// Expand backslash escape sequences in `str` into the characters they denote.
// Characters that are not part of an escape are copied through unchanged.
// Throws regex_error(error_escape) when the input ends with a lone backslash.
ucs4string ConvertEscape(const ucs4string &str)
{
    ucs4string result;

    ucs4string::const_iterator cur = str.begin();
    ucs4string::const_iterator last = str.end();
    compiler_traits<ucs4_regex_traits> ucs4traits;

    while(cur != last)
    {
        // Ordinary character: copy it and move on.
        if(*cur != '\\')
        {
            result += *cur;
            ++cur;
            continue;
        }

        // Step over the backslash; there must be something left to escape.
        if(++cur == last)
        {
            throw regex_error(regex_constants::error_escape);
        }

        // parse_escape receives `cur` by name and is relied on to advance it
        // past the escape it consumed.
        const detail::escape_value<ucs4_t, ucs4_regex_traits::char_class_type> parsed =
            detail::parse_escape(cur, last, ucs4traits);
        result += parsed.ch_;
    }

    return result;
}
Example #2
0
    // Selects the wide-character literal (the narrow one is ignored), widens it
    // element by element into inter_str, and returns a pointer to that buffer.
    // The returned pointer refers to inter_str's storage, so the next call that
    // refills inter_str replaces what it points at.
    static ucs4_t const *pick(char const *, wchar_t const *cstr)
    {
        inter_str.clear();
        for(wchar_t const *cursor = cstr; *cursor != 0; ++cursor)
        {
            inter_str.push_back(ucs4_t(*cursor));
        }

        return inter_str.c_str();
    }
Example #3
0
// Replaces the first match of this regexp in `text` with `subst`.
// Sets `matched` to whether a match was found; without a match `text` is
// returned unchanged.
ucs4string Regexp::replace(ucs4string& text, ucs4string& subst, bool& matched)
{
    OnigRegion* const region = matchInternal(text);
    if (region == NULL) {
        matched = false;
        return text;
    }

    // The region offsets are divided by sizeof(ucs4char) to obtain character
    // indices into `text`.
    const size_t matchStart = region->beg[0] / sizeof(ucs4char);
    const size_t matchStop = region->end[0] / sizeof(ucs4char);

    const ucs4string prefix = text.substr(0, matchStart);
    const ucs4string suffix = text.substr(matchStop, text.size() - matchStop);
    matched = true;
    return (prefix + subst + suffix).data();
}
Example #4
0
// Recursively enumerates every way of cutting `to_seg` into pieces.
//   base    - pieces chosen so far, each preceded by a space
//   base_rr - product of GetRateReciprocal() over the pieces in `base`
//   to_seg  - the text that still has to be segmented
//   data    - best candidate so far; updated when this branch's product is
//             not greater than data.m_rr
void Segmentation::DoSegment(ucs4string base, double base_rr, ucs4string to_seg, SegmentData& data)
{
	// Try each split: the first `cut` characters form the next piece and the
	// remainder is segmented by the recursive call.
	const size_t total = to_seg.size();
	for (size_t cut = 1; cut < total; ++cut)
	{
		ucs4string head = to_seg.substr(0, cut);
		ucs4string tail = to_seg.substr(cut);
		DoSegment(base + ucs4_t(' ') + head, base_rr*GetRateReciprocal(head), tail, data);
	}

	// Finally, treat all of `to_seg` as a single piece.
	const double candidate = base_rr*GetRateReciprocal(to_seg);
	if (!(candidate > data.m_rr))
	{
		data.m_rr = candidate;
		data.m_res = base + ucs4_t(' ') + to_seg;
	}
}
Example #5
0
int main(int argc, char *argv[])
{
	if (argc != 3)
	{
		std::cerr << "Usage:" << argv[0] << " <ucs4-phrase-file> <ucs4-dictionary>" << std::endl;
		exit(1);
	}

	std::ifstream fin(argv[1], std::ios::binary);
	if (!fin.is_open())
	{
		std::cerr << "Failed to open " << argv[1] << " for read." << std::endl;
		exit(2);
	}
	std::ofstream fout(argv[2], std::ios::binary);
	if (!fout.is_open())
	{
		std::cerr << "Failed to open " << argv[2] << " for write." << std::endl;
		exit(3);
	}

	DictionaryGenerator dg;

	while (!fin.eof())
	{
		ucs4string s;

		ucs4getline(fin, s);

		dg.SetmentPhrase(s);
	}

	DictMap::const_iterator it = dg.GetDictionary().begin();
	DictMap::const_iterator itend = dg.GetDictionary().end();
	for (; it!=itend; ++it)
	{
		const ucs4string ustr=it->first;
		unsigned int cnt=it->second;
		if (cnt==1 && ustr.size()!=1)
			continue;
		ucs4putstr(fout, ustr);
		ucs4putstr(fout, stdtoustr(" "));
		ucs4putstr(fout, utoustr(cnt));
		ucs4putch(fout, ucs4_t('\n'));
	}

	return 0;
}
Example #6
0
// Reads one line of raw UCS-4 text from `is` and appends it to `ustr`.
//
// Code units are read 4 bytes at a time straight into a ucs4_t, i.e. in the
// host's byte order. Reading stops at a '\n' code unit (consumed, not stored),
// at end of input, or when the stream fails. `ustr` is appended to, not
// cleared. After an unsuccessful read the stream is left in a failed state,
// which callers can use to detect the end of input.
void ucs4getline(std::istream& is, ucs4string& ustr)
{
	while (!is.eof())
	{
		ucs4_t ch = 0;
		// A failed or short read (truncated file, I/O error) leaves `ch`
		// wholly or partly unwritten; stop rather than store a corrupt unit.
		if (!is.read(reinterpret_cast<char *>(&ch), 4))
			break;

		if (ch == ucs4_t('\n'))
			break;
		ustr.push_back(ch);
	}
}
unsigned int ustrtou(const ucs4string& ustr)
{
	unsigned int u = 0;
	for (size_t i=0; i<ustr.size(); ++i)
	{
		ucs4_t ch = ustr[i];
		if (ch<'0' || ch>'9')
			break;

		u = u * 10 + (ch - '0');
	}
	return u;
}
Example #8
0
// N.B. If you call loadFileUnsafe, be sure that this code is inside the TRY_VM/CATCH_VM
void VM::loadFileUnsafe(const ucs4string& file)
{
    Registers r;
    saveRegisters(&r);
    const Object loadPort = Object::makeTextualInputFilePort(file.ascii_c_str());
    TextualInputPort* p = loadPort.toTextualInputPort();
    bool readErrorOccured = false;
    for (Object o = p->getDatum(readErrorOccured); !o.isEof(); o = p->getDatum(readErrorOccured)) {
        if (readErrorOccured) {
            callLexicalViolationImmidiaImmediately(this, "read", p->error());
        }
        evaluateUnsafe(compile(o).toVector());
    }
    restoreRegisters(&r);
}
Example #9
0
    // Rebuilds the shift tables for the pattern pat[0..len).
    // Does nothing if the stored pattern already equals `pat`.
    //
    // Every slot of m_UCS2_Table is first set to len+1. Then, for each pattern
    // character, its slot is set to the distance from that character to one
    // past the end of the pattern; later occurrences overwrite earlier ones.
    // Code points up to 0xFFFF go in m_UCS2_Table, larger ones in m_Table.
    void Build(const ucs4_t* pat, size_t len)
    {
        const bool unchanged = (m_Pattern.length()==len) && IsTheSame(m_Pattern.c_str(), pat, (int)len);
        if(unchanged) return;
        m_Pattern.assign(pat, len);

        // Default shift for characters that do not occur in the pattern.
        m_Len_1 = (int)len+1;
        const int defaultShift = m_Len_1;
        for(size_t slot=0; slot<65536; ++slot)
        {
            m_UCS2_Table[slot] = defaultShift;
        }

        m_Table.clear();
        for(size_t pos=0; pos<len; ++pos)
        {
            const unsigned int code = (unsigned int)(pat[pos]);
            const int shift = (int)(len-pos);
            if(code > 0xFFFF)
            {
                m_Table[code] = shift;
            }
            else
            {
                m_UCS2_Table[code] = shift;
            }
        }
    }
Example #10
0
// Compiles `pattern` with Oniguruma using Ruby syntax.
//   caseFold     - adds ONIG_OPTION_IGNORECASE
//   isSingleLine - adds ONIG_OPTION_SINGLELINE
// The constructor does not throw on a bad pattern. On failure it sets
// isErrorOccured_, stores Oniguruma's message in errorMessage_ and the
// pattern in irritants_.
Regexp::Regexp(const ucs4string& pattern, bool caseFold, bool isSingleLine) :
    pattern_(pattern),
    isErrorOccured_(false),
    errorMessage_(Object::Nil),
    irritants_(Object::Nil)
{
    // onig_new takes the pattern as a [begin, end) range of bytes.
    const ucs4char* const patternBegin = pattern_.data();
    const ucs4char* const patternEnd = patternBegin + pattern_.size();

    const int status = onig_new(&regexp_,
                                (const uint8_t*)patternBegin,
                                (const uint8_t*)patternEnd,
                                (ONIG_OPTION_DEFAULT) | (caseFold ? ONIG_OPTION_IGNORECASE : 0) | (isSingleLine? ONIG_OPTION_SINGLELINE : 0),
                                ONIG_ENCODING,
                                ONIG_SYNTAX_RUBY,
                                &einfo_);
    if (ONIG_NORMAL == status)
    {
        return;
    }

    // Compilation failed: record a human-readable reason for the caller.
    char errorMessageBuffer[ONIG_MAX_ERROR_MESSAGE_LEN];
    onig_error_code_to_str((uint8_t*)errorMessageBuffer, status, &einfo_);
    isErrorOccured_ = true;
    errorMessage_ = errorMessageBuffer;
    irritants_ = L1(Object::makeString(pattern.data()));
}
Example #11
0
// Writes the characters of `ustr` to `os` as raw bytes, 4 bytes per
// character, exactly as they are laid out in memory. No terminator is written.
void ucs4putstr(std::ostream& os, const ucs4string& ustr)
{
	const char* const raw = reinterpret_cast<const char *>(ustr.data());
	os.write(raw, ustr.size() * 4);
}
Example #12
0
void TextualOutputPort::format(const VM* theVM, const ucs4string& fmt, Object args)
{
    ucs4string buffer = UC("");
    for (uint32_t i = 0; i < fmt.size(); i++) {
        if (fmt[i] == '~') {
            i++;
            if (!buffer.empty()) {
                putString(buffer);
                buffer.clear();
            }
            switch (fmt[i]) {
            case '~':
                display(theVM, Object::makeChar('~'));
                break;
            case '%':
                display(theVM, Object::makeChar('\n'));
                break;
            case 'a':
            case 'A':
            case 'd':
            case 'D':
            {
                if (args.isPair()) {
                    display(theVM, args.car());
                    args = args.cdr();
                } else {
                    isErrorOccured_ = true;
                    errorMessage_ = "too few arguments for format string";
                    irritants_ = Pair::list1(Object::makeString(fmt));
                    return;
                }
                break;
            }
            case 's':
            case 'S':
            {
                if (args.isPair()) {
                    putDatum(theVM, args.car());
                    args = args.cdr();
                } else {
                    isErrorOccured_ = true;
                    errorMessage_ = "too few arguments for format string";
                    irritants_ = Pair::list1(Object::makeString(fmt));
                    return;
                }
                break;
            }
            case '\0':
                i--;
                break;
            }
        } else {
            buffer += fmt[i];
        }
    }

    if (!buffer.empty()) {
        putString(buffer);
    }
    flush();
    //fflush(stdout); // temp
    return;
}
Example #13
0
void TextualOutputPort::putString(const ucs4string& s)
{
    for (ucs4string::size_type i = 0; i < s.size(); i++) {
        putCharHandleSpecial(s[i]);
    }
}