Beispiel #1
0
/**
 * utf8_byteoffset_to_charsoffset_cached:
 * @string: the UTF-8 buffer (gchar *) in which to count
 * @byteoffset: glong with the byte offset you want the character offset for
 *
 * Calculates the UTF-8 character offset in @string that corresponds to the
 * given byte offset.
 *
 * Earlier results are cached to speed up multiple calls for the same buffer.
 * The cache is keyed on the pointer value of @string only: it is reset when
 * a different pointer is passed in. If you keep using the same buffer but
 * change its contents in between calls, you have to reset the cache yourself
 * using the utf8_offset_cache_reset() function.
 *
 * A new result is only stored when the lookup started from the newest cache
 * slot, i.e. when the byte offset stored there is not larger than
 * @byteoffset. Cache slot 0 is never used as a starting point.
 *
 **** the result is undefined if the provided byteoffset is in the middle of a UTF8 character ***
 *
 * Return value: guint with character offset
 **/
guint utf8_byteoffset_to_charsoffset_cached(gchar *string, glong byteoffset) {
	guint retval;
	gint i = UTF8_OFFSET_CACHE_SIZE-1;
	if (byteoffset ==0) return 0;

	if (string != utf8_offset_cache.last_buf) {
		utf8_offset_cache_reset();
		utf8_offset_cache.last_buf = string;
	}
#ifdef DEBUG
	/* strlen() returns a size_t and %p expects a void *, so cast both to match the format */
	DEBUG_MSG("utf8_byteoffset_to_charsoffset_cached, string %p has strlen %lu, looking for byteoffset %ld, starting in cache at i=%d\n", (void *) string, (unsigned long) strlen(string),byteoffset,i);
#endif

	/* walk back from the newest slot to the first entry that is not beyond byteoffset */
	while (i > 0 && utf8_offset_cache.last_byteoffset[i] > byteoffset) {
		i--;
	}

	if (i > 0) {
		if (utf8_offset_cache.last_byteoffset[i] == byteoffset) {
#ifdef DEBUG
	DEBUG_MSG("byteoffset %ld is in the cache at i=%d, returning %d\n",byteoffset,i,utf8_offset_cache.last_charoffset[i]);
#endif
			return utf8_offset_cache.last_charoffset[i];
		}
		/* count only the characters between the cached position and byteoffset.
		   if the byteoffset is in the middle of a multibyte character, this line will fail (but
		   we are not supposed to get called in the middle of a character) */
		retval = g_utf8_pointer_to_offset(string+utf8_offset_cache.last_byteoffset[i], string+byteoffset)+utf8_offset_cache.last_charoffset[i];
#ifdef UTF8_BYTECHARDEBUG
		utf8_offset_cache.numbytes_parsed += (byteoffset - utf8_offset_cache.last_byteoffset[i]);
		utf8_offset_cache.numbytes_cached_parsed += (byteoffset - utf8_offset_cache.last_byteoffset[i]);
		utf8_offset_cache.numcalls_cached_since_reset++;
#endif
	} else {
		/* no usable cache entry: count from the start of the string */
		retval = g_utf8_pointer_to_offset(string, string+byteoffset);
#ifdef UTF8_BYTECHARDEBUG
		utf8_offset_cache.numbytes_parsed += byteoffset;
#endif
	}
	DEBUG_MSG(" and byteoffset %ld has charoffset %d\n",byteoffset,retval);
	if (i == (UTF8_OFFSET_CACHE_SIZE-1)) {
		/* add this new calculation to the cache: move the entries of each array
		   one slot back so the oldest one drops out and the last slot is free.
		   Each array is shifted on its own, so this does not depend on the two
		   arrays being adjacent in memory. */
		memmove(&utf8_offset_cache.last_byteoffset[0], &utf8_offset_cache.last_byteoffset[1],
				(UTF8_OFFSET_CACHE_SIZE-1)*sizeof(utf8_offset_cache.last_byteoffset[0]));
		memmove(&utf8_offset_cache.last_charoffset[0], &utf8_offset_cache.last_charoffset[1],
				(UTF8_OFFSET_CACHE_SIZE-1)*sizeof(utf8_offset_cache.last_charoffset[0]));

		utf8_offset_cache.last_byteoffset[UTF8_OFFSET_CACHE_SIZE-1] = byteoffset;
		utf8_offset_cache.last_charoffset[UTF8_OFFSET_CACHE_SIZE-1] = retval;
	}
#ifdef UTF8_BYTECHARDEBUG
	utf8_offset_cache.numcalls_since_reset++;
#endif
	return retval;
}
Beispiel #2
0
/**
 * doc_entities_to_utf8:
 * @doc: the #Tdocument to modify
 * @start: gint start position of the region, passed to doc_get_chars()
 * @end: gint end position of the region, passed to doc_get_chars()
 * @numerical: passed unchanged to xmlentity2unichar()
 * @iso8859_1: passed unchanged to xmlentity2unichar()
 * @symbols: passed unchanged to xmlentity2unichar()
 * @specials: passed unchanged to xmlentity2unichar()
 * @xml: passed unchanged to xmlentity2unichar()
 *
 * Scans the text of the region for "&...;" sequences of at most 7 bytes
 * between the '&' and the ';'. Every sequence for which
 * xmlentity2unichar() returns something other than -1 is replaced in the
 * document by the UTF-8 encoding of that character, using
 * doc_replace_text_backend().
 *
 * The shared utf8 offset cache is reset at the start of every call.
 *
 * Return value: void
 **/
void
doc_entities_to_utf8(Tdocument * doc, gint start, gint end, gboolean numerical, gboolean iso8859_1,
					 gboolean symbols, gboolean specials, gboolean xml)
{
	/* largest accepted distance in bytes between the '&' and its ';' */
	enum { MAX_ENTITY_SPAN = 7 };
	gchar *buf;
	const gchar *found;
	guint docoffset = start;	/* docoffset is an offset in characters between the buffer and the GtkTextBuffer contents */

	buf = doc_get_chars(doc, start, end);
	utf8_offset_cache_reset();
	if (!buf)
		return;

	found = g_utf8_strchr(buf, -1, '&');
	while (found) {
		gchar *endfound;
		/* a ';' further away than MAX_ENTITY_SPAN is rejected anyway, so do not
		   search the rest of the buffer for every '&' */
		endfound = g_utf8_strchr(found, MAX_ENTITY_SPAN + 1, ';');
		if (endfound && endfound - found <= MAX_ENTITY_SPAN) {
			gchar *entity;
			gunichar unic;

			/* the entity name is the text between the '&' and the ';' */
			entity = g_strndup(found + 1, (endfound - found) - 1);
			/*unic = unichar_for_entity(entity,numerical,iso8859_1,symbols,specials,xml); */
			unic = xmlentity2unichar(entity, numerical, iso8859_1, symbols, specials, xml);
			if (unic != (gunichar) -1) {
				guint cfound, cendfound;
				gchar tmp[7];
				DEBUG_MSG("doc_entities_to_utf8, unic=%d for entity '%s'\n", unic, entity);
				/* zero the buffer first so the encoded character is nul-terminated */
				memset(tmp, 0, 7);
				g_unichar_to_utf8(unic, tmp);

				/* byte positions in buf -> character positions in buf */
				cfound = utf8_byteoffset_to_charsoffset_cached(buf, (found - buf));
				cendfound = utf8_byteoffset_to_charsoffset_cached(buf, (endfound - buf));

				doc_replace_text_backend(doc, tmp, cfound + docoffset, cendfound + docoffset + 1);
				/* the entity (cendfound + 1 - cfound characters) became a single
				   character, so later positions in the document shift back */
				docoffset = docoffset - (cendfound + 1 - cfound) + 1;
			}
			g_free(entity);
			/* continue after the ';' */
			found = g_utf8_strchr(g_utf8_next_char(endfound), -1, '&');
		} else {
			/* no usable ';' for this '&': continue after the '&' */
			found = g_utf8_strchr(g_utf8_next_char(found), -1, '&');
		}
	}
	/* NOTE(review): assumes doc_get_chars() returns a newly allocated string
	   that the caller owns - confirm against its definition */
	g_free(buf);
}