/* valid_utf8p:
    Arguments: a NUL-terminated candidate utf8 string
    Returns: 1 if every sequence up to the terminating NUL is accepted by
     check_utf8_ptr, 0 as soon as check_utf8_ptr yields a non-positive result.
*/
static int valid_utf8p(u8_string s)
{
  for (;;) {
    /* Width of the sequence starting at s, or <= 0 if it is rejected. */
    int width=check_utf8_ptr(s,get_utf8_size(*s));
    if (width <= 0) return 0;
    /* The terminator itself was accepted: the whole string checked out. */
    if (*s == '\0') return 1;
    s=s+width;
  }
}
U8_EXPORT
/* u8_validate:
    Arguments: a possible utf8 string and the number of bytes to examine
    Returns: the number of leading bytes covered by complete sequences

 Walks the buffer one sequence at a time, using get_utf8_size on each lead
 byte to find the sequence length.  Scanning stops at the first lead byte
 for which get_utf8_size is not positive, or at the first sequence that
 would extend past s+len.  Only lead bytes are inspected here; the trailing
 bytes of each sequence are not checked by this function.

 No byte of s is read unless it lies before s+len, so a zero (or negative)
 len returns 0 without touching the buffer.
*/
int u8_validate(u8_string s,int len)
{
  const u8_byte *limit=s+len, *start=s;
  int sz;
  while ((s<limit) && ((sz=get_utf8_size(*s))>0))
    /* A sequence that would run past the limit is incomplete: stop before it. */
    if (s+sz>limit) return s-start;
    else s=s+sz;
  return s-start;
}
U8_EXPORT
/* u8_convert_crlfs:
    Input: a NUL-terminated string which should be UTF-8 encoded
    Output: the buffer of a freshly initialized output stream

 Copies its argument, writing a single '\n' for every "\r\n" pair.  A '\r'
 which is not followed by '\n' is copied unchanged.  Bytes below 0x80 are
 copied as they are.  A multi-byte sequence accepted by check_utf8_ptr is
 read with u8_sgetc and written back out with u8_putc; otherwise the whole
 run of bytes >= 0x80 starting there is written one byte at a time, each
 byte value being passed to u8_putc as a character.

 NOTE(review): the returned pointer is out.u8_outbuf as set up by
 U8_INIT_STATIC_OUTPUT; presumably the caller owns it -- confirm.
*/
u8_string u8_convert_crlfs(u8_string s)
{
  U8_OUTPUT out; U8_INIT_STATIC_OUTPUT(out,32);
  while (*s) {
    if (*s=='\r') {
      if (s[1]=='\n') {
        /* Collapse CR LF into a lone LF. */
        u8_putc(&out,'\n');
        s=s+2;
      }
      else {
        u8_putc(&out,*s++);
      }
    }
    else if (*s<0x80) {
      u8_putc(&out,*s++);
    }
    else if (check_utf8_ptr(s,get_utf8_size(*s))>0) {
      /* Well-formed multi-byte sequence: decode it and re-emit it. */
      int c=u8_sgetc(&s);
      u8_putc(&out,c);
    }
    else {
      /* Rejected sequence: emit the run of high bytes individually. */
      while (*s>=0x80) {
        u8_putc(&out,*s++);
      }
    }
  }
  return (u8_string)out.u8_outbuf;
}
U8_EXPORT
/* u8_validptr:
    Arguments: a pointer to the start of a possible utf8 sequence
    Returns: 1 if the sequence starting at the pointer is accepted,
     0 otherwise.

 Only the single sequence beginning at s is examined: its length is taken
 from get_utf8_size on the first byte, and the result is 1 only when that
 length is positive and check_utf8_ptr returns a positive value for it.
*/
int u8_validptr(const u8_byte *s)
{
  int width=get_utf8_size(*s);
  /* A non-positive size means the first byte cannot start a sequence. */
  if (width<=0) return 0;
  return (check_utf8_ptr(s,width)>0);
}
void scanner::next() { lean_assert(m_curr != EOF); m_spos++; while (m_spos >= static_cast<int>(m_curr_line.size())) { if (m_last_line) { m_curr = EOF; return; } else { m_curr_line.clear(); if (std::getline(m_stream, m_curr_line)) { m_curr_line.push_back('\n'); m_sline++; m_spos = 0; m_upos = 0; m_curr = m_curr_line[m_spos]; m_uskip = get_utf8_size(m_curr); m_uskip--; return; } else { m_last_line = true; m_curr = EOF; return; } } } m_curr = m_curr_line[m_spos]; if (m_uskip > 0) { if (!is_utf8_next(m_curr)) throw_exception("invalid utf-8 sequence character"); m_uskip--; } else { m_upos++; m_uskip = get_utf8_size(m_curr); m_uskip--; } }
/**
   Replace the current line with the next line of m_stream.

   On success a '\n' is appended to the line, m_sline is incremented,
   m_spos and m_upos are reset to 0, m_curr is set to the first byte of
   the line and m_uskip to get_utf8_size of that byte minus one.

   If no further line can be read, m_last_line is set and m_curr
   becomes EOF.
*/
void scanner::fetch_line() {
    m_curr_line.clear();
    if (!std::getline(m_stream, m_curr_line)) {
        m_last_line = true;
        m_curr = EOF;
        return;
    }
    // getline strips the line terminator; put one back so it gets scanned.
    m_curr_line.push_back('\n');
    ++m_sline;
    m_upos = 0;
    m_spos = 0;
    m_curr = m_curr_line[m_spos];
    m_uskip = get_utf8_size(m_curr);
    --m_uskip;
}
/*
 * get_word_n_utf8: byte length of the first `len` characters at p.
 *
 * p    NUL-terminated string to measure.
 * len  number of characters to measure.
 *
 * Returns the total number of bytes occupied by those characters (0 when
 * len <= 0), or -1 when `len` complete characters cannot be read: the string
 * ends first, a character is cut short by the terminator, or get_utf8_size
 * reports a size below 1 for a byte.
 */
int get_word_n_utf8(char *p, int len)
{
    int size = 0;
    int i;

    for (i = 0; i < len; i++) {
        int n;
        int k;

        if (*p == '\0') {
            return -1;
        }

        n = get_utf8_size(p);
        /* A non-positive size would stall or move backwards: reject it. */
        if (n < 1) {
            return -1;
        }
        /* Never step over the terminator inside a truncated sequence. */
        for (k = 1; k < n; k++) {
            if (p[k] == '\0') {
                return -1;
            }
        }

        size += n;
        p += n;
    }

    return size;
}
/**
   Create a scanner reading from strm.

   strm_name is the name stored in m_stream_name; "[unknown]" is used when
   it is null. line is the initial value of both m_line and m_sline.

   The first line is read immediately. If one is available, a '\n' is
   appended to it and m_curr / m_uskip are initialized from its first
   byte. Otherwise the scanner starts with m_last_line set and m_curr
   equal to EOF.
*/
scanner::scanner(std::istream & strm, char const * strm_name, unsigned line):
    m_tokens(nullptr),
    m_stream(strm) {
    if (strm_name)
        m_stream_name = strm_name;
    else
        m_stream_name = "[unknown]";

    m_line        = line;
    m_sline       = line;
    m_upos        = 0;
    m_spos        = 0;
    m_in_notation = false;

    if (!std::getline(m_stream, m_curr_line)) {
        // Empty input: nothing to scan.
        m_last_line = true;
        m_curr      = EOF;
        m_uskip     = 0;
        return;
    }

    m_last_line = false;
    // getline strips the line terminator; put one back so it gets scanned.
    m_curr_line.push_back('\n');
    m_curr  = m_curr_line[m_spos];
    m_uskip = get_utf8_size(m_curr);
    --m_uskip;
}
/*
 * find_a_hanzi_utf8: find the first position at or after p for which
 * is_hanzi_utf8 is true.
 *
 * p      NUL-terminated string to search.
 * hanzi  optional output; when not NULL, its `str` is set to the match and
 *        its `size` to get_utf8_size of the match.
 *
 * Returns the matching position. Returns NULL when the terminator is reached
 * without a match, or when `hanzi` is given and the recorded size is outside
 * 1..4 (in which case `hanzi` has already been written).
 */
char *find_a_hanzi_utf8(char *p, WORDORIGIN *hanzi)
{
    /* Step one character at a time until a match or the terminator. */
    for (; !is_hanzi_utf8(p); p = forward_a_char(p)) {
        if (*p == '\0') {
            return NULL;
        }
    }

    if (hanzi == NULL) {
        return p;
    }

    hanzi->str = p;
    hanzi->size = get_utf8_size(p);
    if (!(hanzi->size >= 1 && hanzi->size <= 4)) {
        return NULL;
    }
    return p;
}
void scanner::next() { lean_assert(m_curr != EOF); m_spos++; while (m_spos >= static_cast<int>(m_curr_line.size())) { if (m_last_line) { m_curr = EOF; return; } else { return fetch_line(); } } m_curr = m_curr_line[m_spos]; if (m_uskip > 0) { if (!is_utf8_next(m_curr)) throw_exception("invalid utf-8 sequence character"); m_uskip--; } else { m_upos++; m_uskip = get_utf8_size(m_curr); m_uskip--; } }
/*
 * forward_a_char_utf8: step over the character starting at p.
 *
 * Returns p advanced by get_utf8_size(p) bytes. No check is made on the
 * size returned or on the bytes skipped.
 */
char *forward_a_char_utf8(char *p)
{
    return &p[get_utf8_size(p)];
}