/**
 * utf8_byteoffset_to_charsoffset_cached:
 * @string: the gchar * you want to count
 * @byteoffset: glong with the byteoffset you want the charoffset for
 *
 * Calculates the UTF-8 character offset in @string for a given byte offset.
 *
 * Results of earlier calls are kept in utf8_offset_cache to speed up multiple
 * calls for the same buffer: the cache is searched from its newest slot
 * downwards for an entry that is not beyond @byteoffset, and only the bytes
 * between that entry and @byteoffset are counted. Slot 0 is never used as a
 * starting point; if the search ends there the characters are counted from
 * the beginning of @string.
 *
 * A result is appended to the cache only when the search stopped at the newest
 * slot, i.e. when @byteoffset lies beyond everything cached so far. The oldest
 * entry is dropped to make room.
 *
 * The cache is reset automatically when @string is a different pointer than
 * the one used in the previous call. If you keep using the same buffer but
 * change its contents in between calls, you have to reset the cache yourself
 * using utf8_offset_cache_reset().
 *
 * The result is undefined if @byteoffset points into the middle of a UTF-8
 * character.
 *
 * Return value: guint with character offset
 **/
guint
utf8_byteoffset_to_charsoffset_cached(gchar *string, glong byteoffset)
{
	guint retval;
	gint i = UTF8_OFFSET_CACHE_SIZE - 1;

	if (byteoffset == 0)
		return 0;

	/* the cache is keyed on the buffer pointer: another buffer invalidates it */
	if (string != utf8_offset_cache.last_buf) {
		utf8_offset_cache_reset();
		utf8_offset_cache.last_buf = string;
	}
#ifdef DEBUG
	/* strlen() returns size_t, cast it so it matches the %d in the format */
	DEBUG_MSG("utf8_byteoffset_to_charsoffset_cached, string %p has strlen %d, looking for byteoffset %ld, starting in cache at i=%d\n", string, (gint) strlen(string),byteoffset,i);
#endif

	/* walk back from the newest slot to the first entry that is not beyond byteoffset */
	while (i > 0 && utf8_offset_cache.last_byteoffset[i] > byteoffset) {
		i--;
	}

	if (i > 0) {
		if (utf8_offset_cache.last_byteoffset[i] == byteoffset) {
#ifdef DEBUG
			DEBUG_MSG("byteoffset %ld is in the cache at i=%d, returning %d\n",byteoffset,i,utf8_offset_cache.last_charoffset[i]);
#endif
			return utf8_offset_cache.last_charoffset[i];
		}
		/* if the byteoffset is in the middle of a multibyte character, this line
		   will fail (but we are not supposed to get called in the middle of a
		   character) */
		retval = g_utf8_pointer_to_offset(string + utf8_offset_cache.last_byteoffset[i],
										  string + byteoffset)
			+ utf8_offset_cache.last_charoffset[i];
#ifdef UTF8_BYTECHARDEBUG
		utf8_offset_cache.numbytes_parsed += (byteoffset - utf8_offset_cache.last_byteoffset[i]);
		utf8_offset_cache.numbytes_cached_parsed += (byteoffset - utf8_offset_cache.last_byteoffset[i]);
		utf8_offset_cache.numcalls_cached_since_reset++;
#endif
	} else {
		/* no usable cache entry, count from the start of the buffer */
		retval = g_utf8_pointer_to_offset(string, string + byteoffset);
#ifdef UTF8_BYTECHARDEBUG
		utf8_offset_cache.numbytes_parsed += byteoffset;
#endif
	}
	DEBUG_MSG(" and byteoffset %ld has charoffset %d\n",byteoffset,retval);

	if (i == (UTF8_OFFSET_CACHE_SIZE - 1)) {
		/* add this new calculation to the cache: move all entries one position
		   back (dropping the oldest) and store the new pair in the last slot.
		   Each array is shifted on its own so this does not depend on the two
		   arrays being adjacent in memory or on their element type. */
		memmove(&utf8_offset_cache.last_byteoffset[0], &utf8_offset_cache.last_byteoffset[1],
				(UTF8_OFFSET_CACHE_SIZE - 1) * sizeof(utf8_offset_cache.last_byteoffset[0]));
		memmove(&utf8_offset_cache.last_charoffset[0], &utf8_offset_cache.last_charoffset[1],
				(UTF8_OFFSET_CACHE_SIZE - 1) * sizeof(utf8_offset_cache.last_charoffset[0]));

		utf8_offset_cache.last_byteoffset[UTF8_OFFSET_CACHE_SIZE - 1] = byteoffset;
		utf8_offset_cache.last_charoffset[UTF8_OFFSET_CACHE_SIZE - 1] = retval;
	}
#ifdef UTF8_BYTECHARDEBUG
	utf8_offset_cache.numcalls_since_reset++;
#endif
	return retval;
}
/**
 * doc_entities_to_utf8:
 * @doc: the #Tdocument whose text is modified
 * @start: start of the range, passed to doc_get_chars(); also used as the
 *         initial character offset between the fetched buffer and the document
 * @end: end of the range, passed to doc_get_chars()
 * @numerical: forwarded unchanged to xmlentity2unichar()
 * @iso8859_1: forwarded unchanged to xmlentity2unichar()
 * @symbols: forwarded unchanged to xmlentity2unichar()
 * @specials: forwarded unchanged to xmlentity2unichar()
 * @xml: forwarded unchanged to xmlentity2unichar()
 *
 * Scans the text between @start and @end for "&...;" sequences in which the
 * ';' is at most 7 bytes after the '&'. The text between the two is handed to
 * xmlentity2unichar(); unless that returns -1, the whole sequence is replaced
 * in the document by the UTF-8 encoding of the returned character through
 * doc_replace_text_backend().
 *
 * The five gboolean flags presumably select which classes of entities are
 * recognised; their exact meaning is defined by xmlentity2unichar().
 *
 * The UTF-8 offset cache is reset on entry and again before returning.
 **/
void
doc_entities_to_utf8(Tdocument * doc, gint start, gint end, gboolean numerical, gboolean iso8859_1,
					 gboolean symbols, gboolean specials, gboolean xml)
{
	gchar *buf;
	const gchar *found;
	/* docoffset is an offset in characters between the buffer and the
	   GtkTextBuffer contents; it shrinks as entities are replaced */
	guint docoffset = start;

	buf = doc_get_chars(doc, start, end);
	utf8_offset_cache_reset();

	/* do not hand a NULL buffer to g_utf8_strchr() */
	found = buf ? g_utf8_strchr(buf, -1, '&') : NULL;
	while (found) {
		gchar *endfound = g_utf8_strchr(found, -1, ';');

		if (endfound && endfound - found <= 7) {
			gchar *entity;
			gunichar unic;

			/* the entity name without the leading '&' and the trailing ';' */
			entity = g_strndup(found + 1, (endfound - found) - 1);
			unic = xmlentity2unichar(entity, numerical, iso8859_1, symbols, specials, xml);
			/* -1 is the "not converted" sentinel; gunichar is unsigned so compare explicitly */
			if (unic != (gunichar) -1) {
				guint cfound, cendfound;
				gchar tmp[7];

				DEBUG_MSG("doc_entities_to_utf8, unic=%d for entity '%s'\n", unic, entity);
				/* g_unichar_to_utf8() does not nul-terminate, so clear the buffer first */
				memset(tmp, 0, sizeof(tmp));
				g_unichar_to_utf8(unic, tmp);

				cfound = utf8_byteoffset_to_charsoffset_cached(buf, (found - buf));
				cendfound = utf8_byteoffset_to_charsoffset_cached(buf, (endfound - buf));

				doc_replace_text_backend(doc, tmp, cfound + docoffset, cendfound + docoffset + 1);
				/* the document lost (cendfound + 1 - cfound) characters and gained one */
				docoffset = docoffset - (cendfound + 1 - cfound) + 1;
			}
			g_free(entity);
			/* continue searching after the ';' */
			found = g_utf8_strchr(g_utf8_next_char(endfound), -1, '&');
		} else {
			/* no usable terminator for this '&', look for the next one */
			found = g_utf8_strchr(g_utf8_next_char(found), -1, '&');
		}
	}

	/* NOTE(review): assumes doc_get_chars() returns a newly allocated string
	   owned by the caller - confirm against its definition */
	g_free(buf);
	/* the offset cache is keyed on the buffer pointer; drop it so a later
	   buffer that happens to be allocated at the same address cannot match
	   entries computed for the freed one */
	utf8_offset_cache_reset();
}