C++ (Cpp) utf8proc_iterate Examples

Example #1

0

Show file

File: inlines.c Project: LuisMDeveloper/EventBlankApp

// Consume a run of the delimiter byte `c` at the current position and
// return how many were consumed (0 if there were none).
// *can_open / *can_close are set from the code points found immediately
// before and after the run (left-/right-flanking tests); for '_' a run
// may only open or only close, never both.
static int
scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
{
	int run_length = 0;
	int32_t prev_cp = 0;
	int32_t next_cp = 0;
	int nbytes;
	bool opens, closes;

	if (subj->pos != 0) {
		int start = subj->pos - 1;
		// Step back over UTF-8 continuation bytes (10xxxxxx) until the
		// lead byte of the preceding character is reached.
		while (peek_at(subj, start) >> 6 == 2 && start > 0) {
			start -= 1;
		}
		nbytes = utf8proc_iterate(subj->input.data + start,
		                          subj->pos - start, &prev_cp);
		if (nbytes == -1) {
			// Undecodable: treat the preceding character as a newline.
			prev_cp = 10;
		}
	} else {
		// Nothing precedes the run: treat it as following a newline.
		prev_cp = 10;
	}

	while (peek_char(subj) == c) {
		run_length++;
		advance(subj);
	}

	nbytes = utf8proc_iterate(subj->input.data + subj->pos,
	                          subj->input.len - subj->pos, &next_cp);
	if (nbytes == -1) {
		// Undecodable or end of input: treat as a newline.
		next_cp = 10;
	}

	// Left-flanking: not followed by whitespace, and if followed by
	// punctuation then preceded by whitespace or punctuation.
	opens = run_length > 0 && !utf8proc_is_space(next_cp) &&
	        (!utf8proc_is_punctuation(next_cp) ||
	         utf8proc_is_space(prev_cp) ||
	         utf8proc_is_punctuation(prev_cp));
	// Right-flanking: the mirror image of the test above.
	closes = run_length > 0 && !utf8proc_is_space(prev_cp) &&
	         (!utf8proc_is_punctuation(prev_cp) ||
	          utf8proc_is_space(next_cp) ||
	          utf8proc_is_punctuation(next_cp));

	// Underscore runs that flank on both sides can neither open nor close.
	*can_open = opens && !(c == '_' && closes);
	*can_close = closes && !(c == '_' && opens);

	return run_length;
}

Example #2

0

Show file

File: MultiLineTextView.cpp Project: zoujiaqing/specter

/// Splits `str` into lines at '\n' and typesets each line into its own TextView.
///
/// When `wrap` is non-zero the text is first passed through LineWrap() and the
/// result is typeset through the same function. Otherwise the string is walked
/// twice: once to count lines (so m_lines can be reserved up front) and once to
/// build the per-line views. m_width ends up as the widest line's nominal width.
void MultiLineTextView::typesetGlyphs(const std::string& str,
                                      const zeus::CColor& defaultColor,
                                      unsigned wrap)
{
    if (wrap)
    {
        typesetGlyphs(LineWrap(str, wrap), defaultColor);
        return;
    }

    m_width = 0;
    m_lines.clear();

    // Pass 1: count line terminators. The byte budget includes the string's
    // trailing NUL so that the final line is counted as well.
    size_t lineCount = 0;
    size_t bytesLeft = str.size() + 1;
    const utf8proc_uint8_t* cursor = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
    while (bytesLeft)
    {
        utf8proc_int32_t codepoint;
        const utf8proc_ssize_t step = utf8proc_iterate(cursor, -1, &codepoint);
        if (step < 0)
            Log.report(logvisor::Fatal, "invalid UTF-8 char");
        if (codepoint == '\n' || codepoint == '\0')
            ++lineCount;
        bytesLeft -= step;
        cursor += step;
    }

    m_lines.reserve(lineCount);

    // Pass 2: emit one TextView per line, tracking where the current line began.
    bytesLeft = str.size() + 1;
    cursor = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
    const utf8proc_uint8_t* lineStart = cursor;
    while (bytesLeft)
    {
        utf8proc_int32_t codepoint;
        const utf8proc_ssize_t step = utf8proc_iterate(cursor, -1, &codepoint);
        const bool endOfLine = (codepoint == '\n' || codepoint == '\0');
        if (endOfLine)
        {
            m_lines.emplace_back(new TextView(m_viewSystem, *this, m_fontAtlas, m_align, m_lineCapacity));
            m_lines.back()->typesetGlyphs(std::string((char*)lineStart, cursor - lineStart), defaultColor);
            m_width = std::max(m_width, m_lines.back()->nominalWidth());
            // The terminator is a single byte; the next line starts right after it.
            lineStart = cursor + 1;
        }
        bytesLeft -= step;
        cursor += step;
    }

    updateSize();
}

Example #3

0

Show file

File: sslice.c Project: gicho/sst

/*
 * Decodes the first rune of `s` into `*r` without modifying the slice.
 *
 * Returns SSLICE_END when the slice is empty, SSLICE_OK on success, or the
 * SSLICE_* status matching the utf8proc error code when decoding fails
 * (`*r` is left untouched in that case).
 */
SSliceStatus sslice_get_first_rune(SSlice *s, rune *r) {
    rune decoded;
    ssize_t consumed;

    if (sslice_empty(s)) {
        return SSLICE_END;
    }

    consumed = utf8proc_iterate((const unsigned char *)s->data, s->len, &decoded);

    if (consumed >= 1) {
        *r = decoded;
        return SSLICE_OK;
    }

    /* Decoding failed: translate the utf8proc error into a slice status. */
    if (consumed == UTF8PROC_ERROR_NOMEM) {
        return SSLICE_MEMORY_EXHAUSTED;
    }
    if (consumed == UTF8PROC_ERROR_OVERFLOW) {
        return SSLICE_OVERFLOW;
    }
    if (consumed == UTF8PROC_ERROR_INVALIDUTF8) {
        return SSLICE_INVALID_UTF8;
    }
    if (consumed == UTF8PROC_ERROR_NOTASSIGNED) {
        return SSLICE_NOT_ASSIGNED;
    }
    if (consumed == UTF8PROC_ERROR_INVALIDOPTS) {
        return SSLICE_INVALID_OPTS;
    }
    return SSLICE_ERROR_UNKNOWN;
}

Example #4

0

Show file

File: unicode_scripts.c Project: riordan/libpostal

/*
 * Scans `str` (at most `len` bytes) code point by code point and reports the
 * script of its leading run.
 *
 * - A SCRIPT_COMMON character inherits the script of the run seen so far.
 * - Scanning stops at a NUL code point, at the first byte sequence that cannot
 *   be decoded, or when a character's script differs from the current run.
 * - When the script changes, SCRIPT_COMMON characters at the end of the run
 *   are walked back over and excluded from the run length.
 *
 * Returns {script of the run, run length in bytes, whether every code point
 * consumed so far was below MAX_ASCII}.
 */
string_script_t get_string_script(char *str, size_t len) {
    int32_t ch;
    script_t last_script = SCRIPT_UNKNOWN;
    script_t script = SCRIPT_UNKNOWN;

    uint8_t *ptr = (uint8_t *)str;

    size_t script_len = 0;
    size_t idx = 0;

    bool is_ascii = true;

    while (idx < len) {
        // Offer the decoder only the bytes that are still left in the buffer;
        // passing the full length would let it read past the end.
        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        // A non-positive length means the bytes could not be decoded. Stop
        // here rather than adding a negative value to the unsigned offsets.
        if (char_len <= 0) break;

        if (ch == 0) break;

        script = get_char_script((uint32_t)ch);

        if (script == SCRIPT_COMMON && last_script != SCRIPT_UNKNOWN) {
            script = last_script;
        }

        if (last_script != script && last_script != SCRIPT_UNKNOWN && last_script != SCRIPT_COMMON) {
            if (script_len < len) {
                // Trim trailing SCRIPT_COMMON characters off the end of the run.
                while (true) {
                    char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch);
                    // No progress possible (start of string or undecodable
                    // bytes): stop instead of looping forever.
                    if (char_len <= 0 || ch == 0) break;

                    script = get_char_script((uint32_t)ch);
                    if (script != SCRIPT_COMMON) {
                        break;
                    }

                    script_len -= char_len;
                    ptr -= char_len;
                    idx -= char_len;
                }
            }

            break;
        }

        is_ascii = is_ascii && ch < MAX_ASCII;

        ptr += char_len;
        idx += char_len;
        script_len += char_len;

        if (script != SCRIPT_UNKNOWN) {
            last_script = script;
        }

    }

    return (string_script_t) {last_script, script_len, is_ascii};
}

Example #5

0

Show file

File: ustream.c Project: akyl/ml

/*
 * Moves the string stream's cursor back by the stored octet count and decodes
 * the code point found there into ustrm->point.
 *
 * Returns STRM_END when the cursor would fall before the start of the string,
 * WRONG_ENCODING when decoding reports an error, and OK otherwise.
 */
inline int sustream_prev_char(struct ustream *ustrm)
{
	struct sustream *src = (struct sustream *)(ustrm->stream);

	src->cur -= src->octcnt;
	if (src->cur >= src->str) {
		/* Decode at the new position; up to 4 bytes are offered to the decoder. */
		src->octcnt = utf8proc_iterate(src->cur, 4, &ustrm->point);
		if (src->octcnt < 0)
			return WRONG_ENCODING;
		return OK;
	}
	return STRM_END;
}

Example #6

0

Show file

File: tools.cpp Project: NatLibFi/usemarcon

/**
 * Returns the length in bytes of the first character of the NUL-terminated
 * UTF-8 string `p`, together with any combining characters (non-zero
 * combining class) that directly follow it.
 *
 * - If the first character cannot be decoded, 1 is returned so that a caller
 *   can step over the offending byte.
 * - If a following sequence cannot be decoded, the length accumulated so far
 *   is returned, so a multi-byte base character is never split.
 * - If `p` points at the terminator, scanning stops immediately instead of
 *   stepping past the end of the string.
 */
unsigned int utf8_glypheme_length(const char *p)
{
    int32_t cp;
    int len = static_cast<int>(
        utf8proc_iterate(reinterpret_cast<const uint8_t *>(p), -1, &cp));
    if (len <= 0)
        return 1;

    // Empty string: the decoder consumed the terminator itself. Do not look at
    // the bytes behind it.
    if (*p == '\0')
        return static_cast<unsigned int>(len);

    // Check for combining characters
    const char *p2 = p + len;
    while (*p2)
    {
        const int comb_len = static_cast<int>(
            utf8proc_iterate(reinterpret_cast<const uint8_t *>(p2), -1, &cp));
        if (comb_len <= 0)
            break; // undecodable bytes end the cluster; keep the base character whole
        const utf8proc_property_t* prop = utf8proc_get_property(cp);
        if (!prop->combining_class)
            break;
        len += comb_len;
        p2 += comb_len;
    }
    return static_cast<unsigned int>(len);
}

Example #7

0

Show file

File: utf8proc.c Project: SeamusConnor/tmux

int
utf8proc_mbtowc(wchar_t *pwc, const char *s, size_t n)
{
	utf8proc_ssize_t	slen;

	if (s == NULL)
		return (0);

	/*
	 * *pwc == -1 indicates invalid codepoint
	 * slen < 0 indicates an error
	 */
	slen = utf8proc_iterate(s, n, pwc);
	if (*pwc == (wchar_t)-1 || slen < 0)
		return (-1);
	return (slen);
}

Example #8

0

Show file

File: init.c Project: AlexShiLucky/rtems

/*
 * Walks a plain ASCII string one byte at a time and checks that
 * utf8proc_iterate() consumes exactly one byte per step and yields a code
 * point equal to that byte.
 */
static void
test_utf8proc_iterate ( void )
{
  char          text[]  = "The quick brown.fox";
  uint8_t      *bytes   = (uint8_t*)(&text[0]);
  const size_t  total   = strlen ( text );
  size_t        offset  = 0;
  int32_t       decoded;
  ssize_t       consumed;

  while ( offset < total ) {
    consumed = utf8proc_iterate ( bytes + offset, total - offset, &decoded );
    rtems_test_assert ( consumed == 1 );
    rtems_test_assert ( (uint8_t)decoded == bytes[offset]);
    ++offset;
  }
}

Example #9

0

Show file

File: sslice.c Project: gicho/sst

/*
 * Points slice `s` at the NUL-terminated string `cs` (no copy is made).
 *
 * When `validate` is true the whole string is first decoded rune by rune; on
 * the first decoding failure the matching SSLICE_* error is returned and `s`
 * is left unmodified. Returns SSLICE_OK once the slice has been assigned.
 */
SSliceStatus sslice_base_assign(SSlice *s, char *cs, bool validate) {
    size_t total = strlen(cs);

    if (validate) {
        SSlice scan;
        int32_t cp;
        ssize_t step;

        scan.data = cs;
        scan.len = total;

        /* Advance one rune per iteration until every byte has been checked. */
        for (; scan.len > 0; scan.data += step, scan.len -= step) {
            step = utf8proc_iterate(
                (const unsigned char *)scan.data, scan.len, &cp
            );

            if (step >= 1) {
                continue;
            }

            /* Map the utf8proc error code onto the slice status codes. */
            switch (step) {
                case UTF8PROC_ERROR_NOMEM:
                    return SSLICE_MEMORY_EXHAUSTED;
                case UTF8PROC_ERROR_OVERFLOW:
                    return SSLICE_OVERFLOW;
                case UTF8PROC_ERROR_INVALIDUTF8:
                    return SSLICE_INVALID_UTF8;
                case UTF8PROC_ERROR_NOTASSIGNED:
                    return SSLICE_NOT_ASSIGNED;
                case UTF8PROC_ERROR_INVALIDOPTS:
                    return SSLICE_INVALID_OPTS;
                default:
                    return SSLICE_ERROR_UNKNOWN;
            }
        }
    }

    s->data = cs;
    s->len = total;

    return SSLICE_OK;
}

Example #10

0

Show file

File: sslice.c Project: gicho/sst

/*
 * Drops the first rune from the front of `s`.
 *
 * Returns SSLICE_END if the slice is already empty, or if the rune removed was
 * the last one (the slice is cleared in that case); SSLICE_OK when more data
 * remains; otherwise the SSLICE_* status matching the utf8proc decode error.
 */
SSliceStatus sslice_advance_rune(SSlice *s) {
    rune skipped;
    ssize_t consumed;

    if (sslice_empty(s)) {
        return SSLICE_END;
    }

    consumed = utf8proc_iterate(
        (const unsigned char *)s->data, s->len, &skipped
    );

    if (consumed < 1) {
        /* Decoding failed: translate the utf8proc error code. */
        if (consumed == UTF8PROC_ERROR_NOMEM) {
            return SSLICE_MEMORY_EXHAUSTED;
        }
        if (consumed == UTF8PROC_ERROR_OVERFLOW) {
            return SSLICE_OVERFLOW;
        }
        if (consumed == UTF8PROC_ERROR_INVALIDUTF8) {
            return SSLICE_INVALID_UTF8;
        }
        if (consumed == UTF8PROC_ERROR_NOTASSIGNED) {
            return SSLICE_NOT_ASSIGNED;
        }
        if (consumed == UTF8PROC_ERROR_INVALIDOPTS) {
            return SSLICE_INVALID_OPTS;
        }
        return SSLICE_ERROR_UNKNOWN;
    }

    if (consumed >= (ssize_t)s->len) {
        /* That rune was everything that was left. */
        sslice_clear(s);
        return SSLICE_END;
    }

    s->data += consumed;
    s->len -= consumed;

    return SSLICE_OK;
}

Example #11

0

Show file

File: transliteration_table_builder.c Project: BERENZ/libpostal

/*
 * Expands a simple regex-like pattern into a string tree.
 *
 * The pattern is read one code point at a time:
 *   [ ... ]  characters inside are queued and added as alternatives of one token
 *   { ... }  the enclosed bytes are added as a single string
 *   ( and )  are skipped
 *   \        a backslash is skipped
 *   other    each code point becomes its own single-string token
 *
 * A zero-length pattern yields a tree holding one empty token. Returns NULL
 * (after releasing the tree and the scratch set) if the pattern contains bytes
 * that cannot be decoded; otherwise returns the new tree.
 */
string_tree_t *regex_string_tree(char *regex, size_t len) {
    uint8_t *char_ptr = (uint8_t *)regex;
    bool in_set = false;
    bool in_brackets = false;

    int32_t codepoint;
    // NOTE(review): last_codepoint is never assigned after this point, so the
    // "previous character was a backslash" tests below can never be true.
    // Left as-is because changing it would alter the generated output.
    int32_t last_codepoint = 0;
    ssize_t char_len;

    size_t bracket_start = 0;
    size_t bracket_len = 0;

    char temp_char[MAX_UTF8_CHAR_SIZE];
    ssize_t temp_char_len;

    string_tree_t *tree = string_tree_new();

    if (len == 0) {
        // Single token with zero-length
        string_tree_add_string_len(tree, regex, len);
        string_tree_finalize_token(tree);
        return tree;
    }

    uint32_array *char_set = uint32_array_new();

    size_t idx = 0;

    int j;

    bool add_to_index = false;

    while (idx < len) {
        // Offer the decoder only the bytes remaining after idx, so it cannot
        // read past the end of the pattern.
        char_len = utf8proc_iterate(char_ptr, (ssize_t)(len - idx), &codepoint);
        if (char_len <= 0) {
            uint32_array_destroy(char_set);
            string_tree_destroy(tree);
            return NULL;
        }

        if (!(utf8proc_codepoint_valid(codepoint))) {
            idx += char_len;
            char_ptr += char_len;
            continue;
        }

        add_to_index = true;

        if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("begin set\n");
            in_set = true;
            codepoint = BEGIN_SET_CODEPOINT;
            uint32_array_clear(char_set);
        } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_set) {
            log_debug("end set");

            // Flush every queued set member as an alternative of one token.
            for (j = 0; j < char_set->n; j++) {
                temp_char_len = utf8proc_encode_char(char_set->a[j], (uint8_t *)temp_char);
                log_debug("Adding string %.*s\n", (int)temp_char_len, temp_char);
                string_tree_add_string_len(tree, temp_char, temp_char_len);
            }
            string_tree_finalize_token(tree);

            uint32_array_clear(char_set);
            // Add a special codepoint to the sequence to distinguish from an escaped square bracket
            codepoint = END_SET_CODEPOINT;
            in_set = false;
        } else if (codepoint == LCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            in_brackets = true;
            // The bracketed string starts right after the opening brace.
            bracket_start = idx + char_len;
            bracket_len = 0;
            add_to_index = false;
        } else if (codepoint == RCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_brackets) {
            log_debug("Adding bracketed string: %.*s\n", (int) bracket_len, regex + bracket_start);
            string_tree_add_string_len(tree, regex + bracket_start, bracket_len);
            in_brackets = false;
        } else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("group\n");
            add_to_index = false;
        } else if (in_set) {
            log_debug("in set\n");
            // Queue node, we'll add them to the trie
            uint32_array_push(char_set, codepoint);
            add_to_index = false;
        } else if (in_brackets) {
            add_to_index = false;
            bracket_len += char_len;
        } else if (codepoint == BACKSLASH_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            add_to_index = false;
        }

        log_debug("codepoint = %d\n", codepoint);

        if (add_to_index) {
            temp_char_len = utf8proc_encode_char(codepoint, (uint8_t *)temp_char);
            log_debug("char = %.*s\n", (int)temp_char_len, temp_char);
            string_tree_add_string_len(tree, temp_char, temp_char_len);
            string_tree_finalize_token(tree);
        }

        idx += char_len;
        char_ptr += char_len;
    }

    uint32_array_destroy(char_set);

    return tree;

}

Example #12

0

Show file

File: transliteration_table_builder.c Project: BERENZ/libpostal

/*
 * Finds the parenthesised groups in a regex-like pattern.
 *
 * The pattern is read one code point at a time while `pos` counts output
 * positions: a whole [ ... ] set counts as one position (added at the closing
 * bracket), parentheses themselves count as none, and any other character
 * outside a set counts as one. For every ')' a {start, length} pair measured
 * in those positions is appended to the result.
 *
 * Returns NULL for a zero-length pattern or when the pattern contains bytes
 * that cannot be decoded; otherwise returns the array of captures.
 */
group_capture_array *parse_groups(char *regex, size_t len) {
    uint8_t *char_ptr = (uint8_t *)regex;
    bool in_group = false;
    bool in_set = false;

    // NOTE(review): last_codepoint is never assigned after this point, so the
    // backslash-escape tests below can never be true. Left unchanged here.
    int32_t codepoint, last_codepoint = 0;
    ssize_t char_len;

    if (len == 0) {
        return NULL;
    }

    group_capture_array *groups = group_capture_array_new_size(1);

    size_t idx = 0;

    size_t pos = 0;
    size_t group_start = 0;

    while (idx < len) {
        // Offer the decoder only the bytes remaining after idx, so it cannot
        // read past the end of the pattern.
        char_len = utf8proc_iterate(char_ptr, (ssize_t)(len - idx), &codepoint);
        if (char_len <= 0) {
            log_error("char %s had len=%zd\n", char_ptr, char_len);
            // NOTE(review): `groups` is not released on this path; confirm
            // which destructor the array type provides and call it here.
            return NULL;
        }

        if (!(utf8proc_codepoint_valid(codepoint))) {
            idx += char_len;
            char_ptr += char_len;
            pos++;
            continue;
        }

        if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("begin set\n");
            in_set = true;
        } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("end set");
            // The whole set occupies a single position.
            pos++;
            in_set = false;
        } else if (codepoint == LPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("begin group\n");
            in_group = true;
            group_start = pos;
        } else if (codepoint == RPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("close group\n");
            in_group = false;
            group_capture_array_push(groups, (group_capture_t){group_start, pos - group_start});
        } else if (!in_set) {
            log_debug("other char\n");
            pos++;
        }

        idx += char_len;
        char_ptr += char_len;

    }

    return groups;

}

Example #13

0

Show file

File: wrap_font.c Project: exdev/exsdk

// Lua binding: measures `text` with `font` and splits it at the first line break.
//
// Lua arguments (exactly 5, enforced by ex_lua_check_nargs):
//   1. text       - string to process
//   2. font       - light userdata, used as an ex_font_t*
//   3. whitespace - mode string, matched by prefix against "pre-wrap", "pre-line",
//                   "normal", "nowrap" and "pre" (in that order); any other value
//                   leaves wrapping and collapsing switched off
//   4. maxWidth   - integer limit compared against the accumulated glyph advance
//   5. flag       - integer; the value 1 sets both trimWhitespace and beginningOfLine
//
// Lua results (4 values):
//   text1     - the bytes copied for the first line, or nil if nothing was copied
//   text2     - the unprocessed rest of `text` when a line break occurred, else nil
//   width     - the accumulated advance (cur_x)
//   linebreak - boolean, whether processing stopped at a line break
static int __lua_font_wrap_text ( lua_State *_l ) {
    ex_font_t *font;
    const char *text, *whitespace;
    int maxWidth;

    // str: start of the character being processed; nextstr: where processing resumes;
    // word_start / word_start_x: position and width to rewind to when a word overflows
    const char *str, *nextstr, *word_start;
    // newtext: output buffer; newtext_p: write cursor into it
    char *newtext, *newtext_p, *last_newtext;
    int ch, next_ch, len, newlen, cpylen;
    uint ft_index, prev_ft_index;
    int cur_x, word_start_x;
    ex_glyph_t *glyph;
    bool linebreak, beginningOfLine, trimWhitespace, skipcpy; 
    bool wrapword, collapseSpace, collapseLinebreak;

    // get lua arguments
    ex_lua_check_nargs(_l,5);
    text = luaL_checkstring(_l,1);
    luaL_checktype( _l, 2, LUA_TLIGHTUSERDATA );
    font = lua_touserdata(_l,2);
    whitespace = luaL_checkstring(_l,3);
    maxWidth = luaL_checkint(_l,4);
    trimWhitespace = beginningOfLine = (luaL_checkint(_l,5) == 1);

    // the output buffer is as large as the input; it is never NUL-terminated and is
    // handed to Lua with an explicit length (lua_pushlstring below)
    len = strlen(text);
    str = nextstr = word_start = text;
    newtext_p = newtext = last_newtext = ex_malloc( len * sizeof(char) );
    // prev_ft_index is unsigned, so -1 wraps to the largest value
    // NOTE(review): presumably treated as "no previous glyph" by ex_font_get_kerning - confirm
    prev_ft_index = -1;

    // get wrapMode
    // "pre" must be tested last: as a 3-character prefix it would also match
    // "pre-wrap" and "pre-line"
    wrapword = false;
    collapseSpace = collapseLinebreak = false;
    if ( !strncmp( whitespace, "pre-wrap", 8 ) ) {
        wrapword = true;
        collapseSpace = false;
        collapseLinebreak = false;
    }
    else if ( !strncmp( whitespace, "pre-line", 8 ) ) {
        wrapword = true;
        collapseSpace = true;
        collapseLinebreak = false;
    }
    else if ( !strncmp( whitespace, "normal", 6 ) ) {
        wrapword = true;
        collapseSpace = true;
        collapseLinebreak = true;
    }
    else if ( !strncmp( whitespace, "nowrap", 6 ) ) {
        wrapword = false;
        collapseSpace = true;
        collapseLinebreak = true;
    }
    else if ( !strncmp( whitespace, "pre", 3 ) ) {
        wrapword = false;
        collapseSpace = false;
        collapseLinebreak = false;
    }

    // process text
    cur_x = word_start_x = 0;
    linebreak = false;
    while ( *str ) {
        skipcpy = false;
        // decode the current character and move nextstr past it
        // NOTE(review): the return value is not checked; a negative (error) result
        // would move nextstr backwards
        nextstr += utf8proc_iterate ((const uint8_t *)str, -1, &ch);

        // if this is line-break
        if ( ch == '\n' || ch == '\r' ) {
            if ( collapseLinebreak ) {
                ch = ' '; // turn it to space
            }
            else {
                linebreak = true;
            }
        }

        // if this is space
        if ( ch == ' ' || ch == '\t' || ch == '\f' ) {
            if ( collapseSpace ) {
                // look ahead: while the following character is also collapsible
                // white-space, drop the current one by moving str forward
                const char * nextnextstr = nextstr;
                while ( *nextnextstr ) {
                    nextstr = nextnextstr;
                    nextnextstr += utf8proc_iterate ((const uint8_t *)nextnextstr, -1, &next_ch);

                    // if next_ch is white-space, then collapse this char
                    if ( next_ch == ' ' || next_ch == '\t' || next_ch == '\f' ) {
                        str = nextstr;
                        continue;
                    }

                    // if next_ch is line-break and collapseLinebreak is true, then collapse this char
                    if ( next_ch == '\n' || next_ch == '\r' ) {
                        if ( collapseLinebreak ) {
                            str = nextstr;
                            continue;
                        }
                    }

                    //
                    break;
                }

                // skip first-time collapse
                // (leading white-space is dropped entirely when trimWhitespace is set)
                if ( trimWhitespace ) {
                    trimWhitespace = false;
                    str = nextstr;
                    continue;
                }

                // yes, must turn it to space to make sure only one space
                ch = ' ';
            }
        }

        // trimming only applies before the first character that is kept
        trimWhitespace = false;

        // process word-break, word-wrap
        if ( wrapword ) {
            // remember where this word began so an overflow can rewind to it
            word_start = str;
            word_start_x = cur_x;

            // if this character can break
            // NOTE(review): nextstr is never assigned NULL in this function, so this
            // test appears to depend on __can_word_break alone
            if ( nextstr == NULL || __can_word_break (ch) ) {
                // advanced character
                ft_index = ex_font_get_index ( font, ch );
                glyph = ex_font_get_glyph ( font, ft_index );
                cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
                cur_x += glyph->advance_x;
                prev_ft_index = ft_index;

                // check if the word exceed content width
                if ( cur_x > maxWidth ) {
                    if ( !beginningOfLine ) {
                        linebreak = true;

                        // skip copy the white-space if it is at the end of the wrap
                        if ( ch == ' ' || ch == '\t' || ch == '\f' ) {
                            skipcpy = true;
                        }
                        else {
                            // rewind so this character starts the next line
                            nextstr = word_start;
                            cur_x = word_start_x;
                        }
                    }
                }

                beginningOfLine = false;
            }
            else {
                // advanced current character
                ft_index = ex_font_get_index ( font, ch );
                glyph = ex_font_get_glyph ( font, ft_index );
                cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
                cur_x += glyph->advance_x;
                prev_ft_index = ft_index;

                // measure the rest of the word, up to the next breakable character
                const char * nextnextstr = nextstr;
                while ( *nextnextstr ) {
                    nextstr = nextnextstr;
                    nextnextstr += utf8proc_iterate ((const uint8_t *)nextnextstr, -1, &next_ch);

                    // if this character can break
                    if ( __can_word_break (next_ch) ) {
                        break;
                    }

                    // advanced character
                    ft_index = ex_font_get_index ( font, next_ch );
                    glyph = ex_font_get_glyph ( font, ft_index );
                    cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
                    cur_x += glyph->advance_x;
                    prev_ft_index = ft_index;

                    // TODO: process word-break
                    // check if the word exceed content width
                    if ( cur_x > maxWidth ) {
                        if ( !beginningOfLine ) {
                            linebreak = true;

                            // the whole word moves to the next line: rewind and copy nothing
                            nextstr = word_start;
                            cur_x = word_start_x;
                            skipcpy = true;
                            break;
                        }
                    }
                }
            }
        }
        else {
            // advanced character
            ft_index = ex_font_get_index ( font, ch );
            glyph = ex_font_get_glyph ( font, ft_index );
            cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
            cur_x += glyph->advance_x;
            prev_ft_index = ft_index;
        } 

        // copy character to newtext_p
        // (the original bytes between str and nextstr are copied; a rewind above
        // makes cpylen zero so nothing is copied)
        if ( !skipcpy ) {
            cpylen = nextstr - str;
            if ( cpylen > 0 ) {
                strncpy( newtext_p, str, cpylen);
                newtext_p += cpylen;
            }
        }

        // step
        str = nextstr;
        if ( linebreak ) {
            break;
        }
    }

    // text1
    newlen = newtext_p-newtext;
    if ( newlen > 0 ) {
        lua_pushlstring(_l, newtext, newlen);
    }
    else {
        lua_pushnil(_l);
    }

    // text2
    if ( linebreak && *str ) {
        lua_pushstring(_l, str );
    }
    else {
        lua_pushnil(_l);
    }
    lua_pushinteger(_l,cur_x); // width
    lua_pushboolean(_l,linebreak); // line-break

    // the scratch buffer is released once its contents have been pushed to Lua
    ex_free(newtext);

    return 4; // text1(can be nil), text2(can be nil), width of text1, linebreak
}

Example #14

0

Show file

File: render.c Project: apache/lucy-clownfish

// Appends `source` to the renderer's buffer one code point at a time.
//
// Pending line breaks (need_cr) are emitted first. Then each code point is
// written: spaces become break opportunities when wrapping is active,
// newlines reset the column, and everything else goes through either
// cmark_render_code_point (LITERAL) or the renderer's outc callback. When a
// line grows past renderer->width it is re-broken at the last breakable
// position. Undecodable input ends the output early.
static
void S_out(cmark_renderer *renderer,
           const char *source,
           bool wrap,
           cmark_escaping escape)
{
	int total = cmark_strbuf_safe_strlen(source);
	cmark_chunk tail = cmark_chunk_literal("");
	int probe = renderer->buffer->size - 1;
	int offset;
	int nbytes;
	int32_t cp;
	unsigned char following;
	bool over_width;

	wrap = wrap && !renderer->no_wrap;

	// Inside a tight list item at most one line break is emitted.
	if (renderer->in_tight_list_item && renderer->need_cr > 1) {
		renderer->need_cr = 1;
	}

	// Emit the pending line breaks. Newlines already at the end of the
	// buffer are counted (by walking `probe` backwards) instead of adding
	// more.
	for (; renderer->need_cr; renderer->need_cr -= 1) {
		if (probe >= 0 && renderer->buffer->ptr[probe] != '\n') {
			cmark_strbuf_putc(renderer->buffer, '\n');
			if (renderer->need_cr > 1) {
				cmark_strbuf_put(renderer->buffer,
				                 renderer->prefix->ptr,
				                 renderer->prefix->size);
			}
		} else {
			probe -= 1;
		}
		renderer->column = 0;
		renderer->begin_line = true;
	}

	for (offset = 0; offset < total; offset += nbytes) {
		if (renderer->begin_line) {
			cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr,
			                 renderer->prefix->size);
			// note: this assumes prefix is ascii:
			renderer->column = renderer->prefix->size;
		}

		nbytes = utf8proc_iterate((const uint8_t *)source + offset,
		                          total - offset, &cp);
		if (nbytes == -1) {
			// Undecodable input: give up on the rest of the string.
			return;
		}
		// First byte after the current code point, passed to outc below.
		following = source[offset + nbytes];

		if (cp == 32 && wrap) {
			if (!renderer->begin_line) {
				cmark_strbuf_putc(renderer->buffer, ' ');
				renderer->column += 1;
				renderer->begin_line = false;
				// Remember this space as a place the line may be broken.
				renderer->last_breakable = renderer->buffer->size - 1;
				// skip following spaces
				while (source[offset + 1] == ' ') {
					offset++;
				}
			}

		} else if (cp == 10) {
			cmark_strbuf_putc(renderer->buffer, '\n');
			renderer->column = 0;
			renderer->begin_line = true;
			renderer->last_breakable = 0;
		} else if (escape == LITERAL) {
			cmark_render_code_point(renderer, cp);
			renderer->begin_line = false;
		} else {
			(renderer->outc)(renderer, escape, cp, following);
			renderer->begin_line = false;
		}

		// If adding the character went beyond width, look for an
		// earlier place where the line could be broken:
		over_width = renderer->width > 0 &&
		             renderer->column > renderer->width &&
		             !renderer->begin_line &&
		             renderer->last_breakable > 0;
		if (over_width) {

			// copy from last_breakable to remainder
			cmark_chunk_set_cstr(&tail, (char *) renderer->buffer->ptr + renderer->last_breakable + 1);
			// truncate at last_breakable
			cmark_strbuf_truncate(renderer->buffer, renderer->last_breakable);
			// add newline, prefix, and remainder
			cmark_strbuf_putc(renderer->buffer, '\n');
			cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr,
			                 renderer->prefix->size);
			cmark_strbuf_put(renderer->buffer, tail.data, tail.len);
			renderer->column = renderer->prefix->size + tail.len;
			cmark_chunk_free(&tail);
			renderer->last_breakable = 0;
			renderer->begin_line = false;
		}
	}
}

Example #15

0

Show file

File: trie_search.c Project: tomterragni/libpostal

/*
 * Scan the NUL-terminated UTF-8 string `text` for phrases stored in the trie,
 * beginning every lookup at `start_node_id`, and append each phrase found to
 * `*phrases` as {phrase_start, phrase_len, data}. Start and length are byte
 * offsets/counts into `text`.
 *
 * `*phrases` is only allocated (phrase_array_new_size) when the first phrase
 * is pushed; if nothing matches it is left as the caller passed it.
 *
 * Returns false when `text` is NULL, or when a character cannot be decoded
 * (utf8proc_iterate returns <= 0) or is rejected by utf8proc_codepoint_valid;
 * phrases pushed before such a failure remain in `*phrases`. Returns true
 * once the terminating NUL has been processed.
 */
bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, phrase_array **phrases) {
    if (text == NULL) return false;

    ssize_t len, remaining;
    int32_t unich = 0;
    unsigned char ch = '\0';

    const uint8_t *ptr = (const uint8_t *)text;
    // Where to resume scanning if a longer candidate phrase fails to match
    const uint8_t *fail_ptr = ptr;

    uint32_t node_id = start_node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;
    uint32_t next_id;

    bool match = false;
    uint32_t index = 0;
    uint32_t phrase_len = 0;
    uint32_t phrase_start = 0;
    // Initialized so that a match recorded through a terminal node whose
    // base is not negative never pushes an indeterminate value.
    uint32_t data = 0;

    trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN;

    // Cleared whenever a branch has already repositioned `index` itself, so
    // the generic `index += len` at the bottom of the loop is skipped once.
    bool advance_index = true;

    while(1) {
        // Negative length: the input is NUL-terminated, no explicit bound
        len = utf8proc_iterate(ptr, -1, &unich);
        remaining = len;
        if (len <= 0) return false;
        if (!(utf8proc_codepoint_valid(unich))) return false;

        int cat = utf8proc_category(unich);
        bool is_letter = utf8_is_letter(cat);

        // If we're in the middle of a word and the first letter was not a match, skip the word
        if (is_letter && state == SEARCH_STATE_NO_MATCH) { 
            log_debug("skipping\n");
            ptr += len;
            index += len;
            last_state = state;
            continue; 
        }

        // Match in the middle of a word
        if (is_letter && last_state == SEARCH_STATE_MATCH) {
            log_debug("last_state == SEARCH_STATE_MATCH && is_letter\n");
            // Only set match to false so we don't callback
            match = false;
        }

        // Feed the bytes of the current character to the trie one at a time.
        // Note that a `break` below skips the increment expressions, so the
        // branch that breaks is responsible for positioning `ptr` itself.
        for (; remaining > 0; remaining--, ptr++, last_node=node, last_state=state, node_id=next_id) {
            ch = (unsigned char) *ptr;
            log_debug("char=%c\n", ch);

            next_id = trie_get_transition_index(self, node, *ptr);
            node = trie_get_node(self, next_id);

            if (node.check != node_id) {
                // No transition for this byte from the current node
                state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN;
                if (match) {
                    // A shorter phrase had already matched: emit it and
                    // restart scanning right after it.
                    log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n");
                    if (*phrases == NULL) {
                        *phrases = phrase_array_new_size(1);
                    }
                    phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                    index = phrase_start + phrase_len;
                    advance_index = false;
                    // Set the text back to the end of the last phrase
                    ptr = (const uint8_t *)text + index;
                    // Re-decode at the new position; this also refreshes
                    // `unich` for the end-of-string check after the loop.
                    len = utf8proc_iterate(ptr, -1, &unich);
                    log_debug("ptr=%s\n", ptr);
                } else {
                    // Nothing pending: skip the rest of this character's bytes
                    ptr += remaining;
                    log_debug("done with char, now at %s\n", ptr);
                }
                // Reset the walk to the starting node
                fail_ptr = ptr;
                node_id = start_node_id;
                last_node = node = trie_get_node(self, node_id);
                phrase_start = phrase_len = 0;
                last_state = state;
                match = false;
                break;
            } else {
                log_debug("node.check == node_id\n");
                state = SEARCH_STATE_PARTIAL_MATCH;
                if (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN) {
                    // First matching byte of a new candidate phrase
                    log_debug("phrase_start=%u\n", index);
                    phrase_start = index;
                    fail_ptr = ptr + remaining;
                }

                if (node.base < 0) {
                    // Negative base: -base indexes the data array, whose
                    // entry points at the remaining suffix ("tail") of the key.
                    int32_t data_index = -1*node.base;
                    trie_data_node_t data_node = self->data->a[data_index];
                    unsigned char *current_tail = self->tail->a + data_node.tail;

                    size_t tail_len = strlen((char *)current_tail);
                    // Compare against the text after the current byte, unless
                    // the current byte is the terminating NUL.
                    char *query_tail = (char *)(*ptr ? ptr + 1 : ptr);
                    size_t query_tail_len = strlen((char *)query_tail);
                    log_debug("next node tail: %s\n", current_tail);
                    log_debug("query node tail: %s\n", query_tail);

                    if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) {
                        state = SEARCH_STATE_MATCH;
                        log_debug("Tail matches\n");
                        last_state = state;
                        data = data_node.data;
                        log_debug("%u, %d, %zu\n", index, phrase_len, tail_len);
                        // Consume the tail in one step
                        ptr += tail_len;
                        index += tail_len;
                        advance_index = false;
                        phrase_len = index + 1 - phrase_start;
                        match = true;
                    } else if (match) {
                        // The longer candidate failed; fall back to the
                        // shorter phrase that had already matched.
                        log_debug("match is true and longer phrase tail did not match\n");
                        log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len);
                        if (*phrases == NULL) {
                            *phrases = phrase_array_new_size(1);
                        }
                        phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                        ptr = fail_ptr;
                        match = false;
                        index = phrase_start + phrase_len;
                        advance_index = false;
                    }

                } 

                if (ch != '\0') {
                    // A NUL transition out of this node means a complete key
                    // ends here; remember it as the current best match.
                    trie_node_t terminal_node = trie_get_transition(self, node, '\0');
                    if (terminal_node.check == next_id) {
                        log_debug("Transition to NUL byte matched\n");
                        state = SEARCH_STATE_MATCH;
                        match = true;
                        phrase_len = index + (uint32_t)len - phrase_start;
                        if (terminal_node.base < 0) {
                            int32_t data_index = -1*terminal_node.base;
                            trie_data_node_t data_node = self->data->a[data_index];
                            data = data_node.data;
                        }
                        log_debug("Got match with len=%d\n", phrase_len);
                        fail_ptr = ptr;
                    }
                }
            }

        }

        // End of input: flush a pending match, then stop
        if (unich == 0) {
            if (last_state == SEARCH_STATE_MATCH) {
                log_debug("Found match at the end\n");
                if (*phrases == NULL) {
                    *phrases = phrase_array_new_size(1);
                }
                phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
            }
            break;
        }

        if (advance_index) index += len;

        advance_index = true;
        log_debug("index now %u\n", index);
    } // while

    return true;
}

Example #16

0

Show file

File: trie_search.c Project: tomterragni/libpostal

/*
 * Walk the trie from `start_node_id` along the first `len` bytes of the
 * UTF-8 string `word` and report a stored phrase that is a prefix of it.
 *
 * Returns {phrase_start, phrase_len, value} with byte offsets into `word`:
 *   - immediately, when a node with a negative base is reached and its tail
 *     fully matches the following text;
 *   - otherwise, after the walk stops, for the last position at which a NUL
 *     transition (end of a stored key) with data was seen.
 * Returns NULL_PHRASE when no phrase was recorded or a tail comparison fails.
 *
 * When a byte has no transition and the character is a hyphen, or a
 * separator other than ' ', a ' ' transition is tried in its place. A hyphen
 * with no ' ' transition either is skipped without moving in the trie.
 *
 * The walk also stops at the first character that cannot be decoded within
 * the remaining bytes or is rejected by utf8proc_codepoint_valid.
 */
phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {
    log_debug("Call to trie_search_prefixes_from_index\n");
    uint32_t node_id = start_node_id, last_node_id = node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;

    log_debug("last_node_id = %d\n", last_node_id);

    uint32_t value = 0, phrase_start = 0, phrase_len = 0;

    uint8_t *ptr = (uint8_t *)word;

    ssize_t char_len = 0;

    uint32_t idx = 0;

    int32_t codepoint = 0;

    bool first_char = true;

    trie_data_node_t data_node;
    trie_node_t terminal_node;

    while (idx < len) {
        // Bound the decoder by the bytes that remain after `idx`, not by the
        // total length, so it cannot read past the end of the `len` window.
        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &codepoint);
        log_debug("char_len = %zu, char=%d\n", char_len, codepoint);
        if (char_len <= 0) break;
        if (!(utf8proc_codepoint_valid(codepoint))) break;

        bool is_hyphen = utf8_is_hyphen(codepoint);

        int cat = utf8proc_category(codepoint);
        bool is_space = utf8_is_separator(cat);

        uint8_t *char_ptr = ptr;
        size_t i = 0;

        bool skip_char = false;
        bool break_out = false;

        // char_len is known to be positive here, so the cast is safe
        for (i = 0; i < (size_t)char_len; i++) {
            node_id = trie_get_transition_index(self, last_node, *char_ptr);
            node = trie_get_node(self, node_id);
            log_debug("At idx=%zu, char=%.*s\n", i, (int)char_len, char_ptr);

            if (node.check != last_node_id) {
                // No transition for this byte from the previous node
                log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);

                if (is_hyphen || (is_space && *ptr != ' ')) {
                    log_debug("Got hyphen or other separator, trying space instead\n");
                    node_id = trie_get_transition_index(self, last_node, ' ');
                    node = trie_get_node(self, node_id);
                }

                if (is_hyphen && node.check != last_node_id) {
                    // Hyphen with no space transition: step over it in the
                    // text and stay on the same trie node.
                    log_debug("No space transition\n");
                    ptr += char_len;
                    idx += char_len;
                    node_id = last_node_id;
                    node = trie_get_node(self, node_id);
                    skip_char = true;
                    break;
                } else if (node.check != last_node_id) {
                    // Genuine mismatch: end the walk
                    break_out = true;
                    log_debug("Breaking\n");
                    break;
                }
                // NOTE(review): reached when the substituted ' ' transition
                // succeeded. The break skips the last_node/last_node_id
                // update at the bottom of the loop, so the walk continues
                // from the previous node - confirm this is intended.
                break;
            }

            if (first_char) {
                phrase_start = idx;
                first_char = false;
            }

            if (node.base < 0) {
                // Negative base: the rest of the key is stored as a tail
                log_debug("Searching tail\n");

                data_node = trie_get_data_node(self, node);
                uint32_t current_tail_pos = data_node.tail;

                unsigned char *current_tail = self->tail->a + current_tail_pos;

                log_debug("comparing tail: %s vs %s\n", current_tail, char_ptr + 1);
                size_t current_tail_len = strlen((char *)current_tail);

                size_t match_len = i + 1;
                size_t offset = i + 1;
                size_t tail_pos = 0;
                log_debug("offset=%zu\n", offset);

                if (char_len > 1) {
                    // Multi-byte character: the bytes of this character that
                    // have not been consumed yet must match the tail first.
                    log_debug("char_len = %zu\n", char_len);
                    log_debug("Doing strncmp: (%zu) %s vs %s\n", char_len - offset, current_tail, char_ptr + 1);

                    if (strncmp((char *)ptr + offset, (char *)current_tail, char_len - offset) == 0) {
                        match_len += char_len - offset;
                        tail_pos = char_len - offset;
                        log_debug("in char match_len = %zu\n", match_len);
                    } else {
                        return NULL_PHRASE;
                    }
                }

                // Then the rest of the tail must match the text that follows
                // the current character.
                size_t tail_match_len = utf8_common_prefix_len((char *)ptr + char_len, (char *)current_tail + tail_pos, current_tail_len - tail_pos);
                match_len += tail_match_len;
                log_debug("match_len=%zu\n", match_len);
                
                if (tail_match_len == current_tail_len - tail_pos) {
                    if (first_char) phrase_start = idx;
                    phrase_len = (uint32_t)(idx + match_len) - phrase_start;

                    log_debug("tail match! phrase_len=%u\n", phrase_len);
                    value = data_node.data;
                    return (phrase_t){phrase_start, phrase_len, value};
                } else {
                    return NULL_PHRASE;
                }

            } else if (node.check == last_node_id) {
                // A NUL transition out of this node marks the end of a stored
                // key; record it and keep walking for a longer one.
                terminal_node = trie_get_transition(self, node, '\0');
                log_debug("Trying link from %d to terminal node\n", last_node_id);

                if (terminal_node.check == node_id) {
                    log_debug("Transition to NUL byte matched\n");
                    if (terminal_node.base < 0) {
                        phrase_len = (uint32_t)(idx + char_len) - phrase_start;
                        data_node = trie_get_data_node(self, terminal_node);
                        value = data_node.data;
                    }
                    log_debug("Got match with len=%d\n", phrase_len);
                }
            }

            last_node = node;
            last_node_id = node_id;
            log_debug("last_node_id = %d\n", last_node_id);
            char_ptr++;
        }


        if (break_out) {
            break;
        } else if (skip_char) {
            // ptr/idx were already advanced past the skipped hyphen
            continue;
        }

        log_debug("Incrementing index\n");

        idx += char_len;
        ptr += char_len;
    }

    log_debug("exited while loop\n");

    if (phrase_len == 0) return NULL_PHRASE;
 
    return (phrase_t) {phrase_start, phrase_len, value};
}