Ejemplo n.º 1
0
/* valid_utf8p:
    Arguments: a null-terminated candidate utf8 string
    Returns: 1 if the terminating NUL is reached while every sequence
     seen so far got a positive result from check_utf8_ptr; 0 as soon
     as check_utf8_ptr reports a non-positive result.
*/
static int valid_utf8p(u8_string s)
{
  int sz;
  for (sz=check_utf8_ptr(s,get_utf8_size(*s));
       sz > 0;
       sz=check_utf8_ptr(s,get_utf8_size(*s))) {
    /* The NUL is only accepted after its own check came back positive. */
    if (*s == '\0') return 1;
    /* Step over the sequence that was just checked. */
    s+=sz;
  }
  return 0;
}
Ejemplo n.º 2
0
U8_EXPORT
/* u8_validate:
    Arguments: a possible utf8 string and the number of bytes to examine
    Returns: the number of leading bytes covered by complete sequences
     (each sized by get_utf8_size from its first byte) that fit inside
     the first len bytes. Scanning stops at the first byte for which
     get_utf8_size is not positive or whose sequence would extend
     past the end of the window.
*/
int u8_validate(u8_string s,int len)
{
    /* sz is assigned in the loop condition only after s<limit has been
       established, so *s is never read when len is zero. */
    int sz;
    const u8_byte *limit=s+len, *start=s;
    while ((s<limit) && ((sz=get_utf8_size(*s))>0)) {
        /* Compare against the remaining byte count rather than forming
           s+sz, which could point beyond one-past-the-end. */
        if (sz>limit-s) break;
        s=s+sz;
    }
    return s-start;
}
Ejemplo n.º 3
0
U8_EXPORT
/* u8_convert_crlfs:
     Input: a null-terminated string which should be UTF-8 encoded
     Output: the u8_outbuf of the U8_OUTPUT the converted text was
      written to
Copies its argument, writing every "\r\n" pair as a single "\n".
A '\r' that is not followed by '\n', like any other byte below 0x80,
is passed to u8_putc unchanged.  A multi-byte sequence accepted by
check_utf8_ptr is read with u8_sgetc and written back out; when the
check fails, the whole run of bytes >= 0x80 starting there is handed
to u8_putc one byte at a time.
NOTE(review): ownership of the returned buffer is not visible in this
function -- confirm against U8_INIT_STATIC_OUTPUT. */
u8_string u8_convert_crlfs(u8_string s)
{
    U8_OUTPUT out;
    U8_INIT_STATIC_OUTPUT(out,32);
    while (*s) {
        if ((*s=='\r') && (s[1]=='\n')) {
            /* Collapse the CR LF pair into a lone LF. */
            u8_putc(&out,'\n');
            s=s+2;
        }
        else if (*s<0x80) {
            /* Single-byte character; a bare '\r' also lands here. */
            u8_putc(&out,*s++);
        }
        else if (check_utf8_ptr(s,get_utf8_size(*s))<=0) {
            /* Rejected sequence: emit the run of high bytes one by one. */
            while (*s>=0x80) {
                u8_putc(&out,*s++);
            }
        }
        else {
            /* Accepted sequence: decode it and write the character. */
            int c=u8_sgetc(&s);
            u8_putc(&out,c);
        }
    }
    return (u8_string)out.u8_outbuf;
}
Ejemplo n.º 4
0
U8_EXPORT
/* u8_validptr:
    Arguments: a pointer to the first byte of a possible utf8 sequence
    Returns: 1 if get_utf8_size gives a positive size for that byte and
     check_utf8_ptr returns a positive value for the pointer and that
     size; 0 otherwise. Only the sequence starting at s is examined.
*/
int u8_validptr(const u8_byte *s)
{
    const int sz=get_utf8_size(*s);
    /* A non-positive size means the first byte cannot start a sequence. */
    if (sz<=0) return 0;
    return (check_utf8_ptr(s,sz)>0);
}
Ejemplo n.º 5
0
// Advances the scanner by one byte, loading the next line from m_stream
// when the current one is exhausted. Sets m_curr to EOF when no further
// input is available. Throws (via throw_exception) when a byte that is
// expected to continue a multi-byte sequence fails is_utf8_next.
void scanner::next() {
    lean_assert(m_curr != EOF);
    m_spos++;

    if (m_spos >= static_cast<int>(m_curr_line.size())) {
        // The current line has been fully consumed.
        if (m_last_line) {
            m_curr = EOF;
            return;
        }
        m_curr_line.clear();
        if (!std::getline(m_stream, m_curr_line)) {
            // The stream produced nothing more; remember that.
            m_last_line = true;
            m_curr      = EOF;
            return;
        }
        // Re-append the newline that getline dropped, then restart the
        // position counters on the first byte of the new line.
        m_curr_line.push_back('\n');
        m_sline++;
        m_spos  = 0;
        m_upos  = 0;
        m_curr  = m_curr_line[m_spos];
        m_uskip = get_utf8_size(m_curr);
        m_uskip--;
        return;
    }

    m_curr = m_curr_line[m_spos];
    if (m_uskip <= 0) {
        // Start of a new sequence: bump m_upos and record how many more
        // bytes of this sequence are still to come.
        m_upos++;
        m_uskip = get_utf8_size(m_curr);
        m_uskip--;
        return;
    }
    // Still inside a multi-byte sequence.
    if (!is_utf8_next(m_curr))
        throw_exception("invalid utf-8 sequence character");
    m_uskip--;
}
Ejemplo n.º 6
0
// Replaces m_curr_line with the next line read from m_stream.
// On success the line gets a trailing '\n', m_sline is incremented,
// m_spos and m_upos are reset to 0, and m_curr / m_uskip are set from the
// first byte. On failure m_last_line is set and m_curr becomes EOF.
void scanner::fetch_line() {
    m_curr_line.clear();
    if (!std::getline(m_stream, m_curr_line)) {
        m_last_line = true;
        m_curr      = EOF;
        return;
    }
    // getline strips the delimiter; put it back.
    m_curr_line.push_back('\n');
    m_sline++;
    m_spos  = 0;
    m_upos  = 0;
    m_curr  = m_curr_line[m_spos];
    // m_uskip holds the size reported for the first byte, minus that byte.
    m_uskip = get_utf8_size(m_curr);
    m_uskip--;
}
Ejemplo n.º 7
0
/*
 * get_word_n_utf8:
 *   Measures how many bytes the first len characters of p occupy.
 *   Arguments: p   - start of a NUL-terminated string
 *              len - number of characters (not bytes) to measure
 *   Returns: the sum of the sizes reported by get_utf8_size for those
 *            len characters; 0 when len is not positive; -1 when the
 *            string ends before len characters were seen, when
 *            get_utf8_size reports a size below 1, or when a character
 *            is cut short by the terminating NUL.
 */
int get_word_n_utf8(char *p, int len)
{
    int size = 0;
    int i;
    int k;
    int n;
    for(i=0; i<len; i++)
    {
        if(*p == '\0')
        {
            return -1;
        }
        n = get_utf8_size(p);
        if(n < 1)
        {
            /* A non-positive size would stall or reverse the scan. */
            return -1;
        }
        for(k=1; k<n; k++)
        {
            if(p[k] == '\0')
            {
                /* Truncated character: advancing by n would step over
                   the terminator and read past the end of the string. */
                return -1;
            }
        }
        size += n;
        p += n;
    }
    return size;
}
Ejemplo n.º 8
0
// Builds a scanner over strm. strm_name is used as the stream name, or
// "[unknown]" when it is null; line seeds both m_sline and m_line.
// The first line is read immediately so that m_curr holds its first byte;
// if nothing can be read the scanner starts out at EOF.
scanner::scanner(std::istream & strm, char const * strm_name, unsigned line):
    m_tokens(nullptr), m_stream(strm) {
    m_stream_name = strm_name ? strm_name : "[unknown]";
    m_sline = line;
    m_line  = line;
    m_spos  = 0;
    m_upos  = 0;
    m_in_notation = false;

    if (!std::getline(m_stream, m_curr_line)) {
        // No input at all.
        m_last_line = true;
        m_curr      = EOF;
        m_uskip     = 0;
        return;
    }

    m_last_line = false;
    // getline strips the delimiter; put it back.
    m_curr_line.push_back('\n');
    m_curr  = m_curr_line[m_spos];
    // m_uskip holds the size reported for the first byte, minus that byte.
    m_uskip = get_utf8_size(m_curr);
    m_uskip--;
}
Ejemplo n.º 9
0
/*
 * find_a_hanzi_utf8:
 *   Scans forward from p, one character at a time (via forward_a_char),
 *   until is_hanzi_utf8 accepts the current position.
 *   Arguments: p     - start of a NUL-terminated string
 *              hanzi - optional; when not NULL, receives the position
 *                      found (str) and its get_utf8_size (size)
 *   Returns: the position found; NULL when the end of the string is
 *            reached first, or when hanzi is given and the recorded size
 *            is outside 1..4 (hanzi has already been written in that case).
 */
char *find_a_hanzi_utf8(char *p, WORDORIGIN *hanzi)
{
    for(; !is_hanzi_utf8(p); p = forward_a_char(p))
    {
        if(*p == '\0')
        {
            return NULL;
        }
    }

    /* Caller only wants the position. */
    if(hanzi == NULL)
    {
        return p;
    }

    hanzi->str = p;
    hanzi->size = get_utf8_size(p);
    if(hanzi->size < 1 || hanzi->size > 4)
    {
        return NULL;
    }
    return p;
}
Ejemplo n.º 10
0
// Advances the scanner by one byte. When the current line is exhausted it
// either sets m_curr to EOF (if m_last_line is set) or delegates to
// fetch_line(). Throws (via throw_exception) when a byte that is expected
// to continue a multi-byte sequence fails is_utf8_next.
void scanner::next() {
    lean_assert(m_curr != EOF);
    m_spos++;

    if (m_spos >= static_cast<int>(m_curr_line.size())) {
        // The current line has been fully consumed.
        if (m_last_line)
            m_curr = EOF;
        else
            fetch_line();
        return;
    }

    m_curr = m_curr_line[m_spos];
    if (m_uskip <= 0) {
        // Start of a new sequence: bump m_upos and record how many more
        // bytes of this sequence are still to come.
        m_upos++;
        m_uskip = get_utf8_size(m_curr);
        m_uskip--;
        return;
    }
    // Still inside a multi-byte sequence.
    if (!is_utf8_next(m_curr))
        throw_exception("invalid utf-8 sequence character");
    m_uskip--;
}
Ejemplo n.º 11
0
/*
 * forward_a_char_utf8:
 *   Returns the address get_utf8_size(p) bytes after p, i.e. p advanced
 *   by the size that get_utf8_size reports for the character at p.
 */
char *forward_a_char_utf8(char *p)
{
    return &p[get_utf8_size(p)];
}