/**
 * Get the index of the last occurrence of a character in a dom string
 *
 * The string's UTF-8 data is walked backwards one character at a time.
 * The returned index is zero-based, counted from the start of the string,
 * so it is directly comparable with the result of dom_string_index().
 *
 * \param str  The string to search in
 * \param chr  UCS4 value to look for
 * \return Zero-based character index of the last match; (uint32_t) -1
 *         converted to off_t if there is no match or the data cannot be
 *         decoded.
 *
 * NOTE(review): where off_t is wider than 32 bits the "not found" value is
 * 4294967295 rather than -1. It is left unchanged here so existing callers
 * keep working -- confirm what callers compare against.
 */
off_t dom_string_rindex(dom_string *str, uint32_t chr)
{
	const uint8_t *s;
	size_t clen, slen;
	uint32_t c;
	off_t coff, index;
	parserutils_error err;

	s = (const uint8_t *) dom_string_data(str);
	slen = dom_string_byte_length(str);
	index = dom_string_length(str);

	while (slen > 0) {
		/* Find the byte offset at which the character that ends at
		 * slen begins.
		 * NOTE(review): coff is zeroed first in case the callee only
		 * writes part of it -- confirm the prevoff pointer type
		 * expected by parserutils_charset_utf8_prev. */
		coff = 0;
		err = parserutils_charset_utf8_prev(s, slen, (off_t *) &coff);

		/* The offset must lie inside the unprocessed prefix, otherwise
		 * the length computed below would wrap around. */
		if (err != PARSERUTILS_OK || coff < 0 ||
				(size_t) coff >= slen) {
			return (uint32_t) -1;
		}

		/* Exactly slen - coff bytes are available from s + coff. */
		err = parserutils_charset_utf8_to_ucs4(s + coff,
				slen - (size_t) coff, &c, &clen);
		if (err != PARSERUTILS_OK) {
			return (uint32_t) -1;
		}

		/* index held the number of characters not yet examined; after
		 * the decrement it is this character's zero-based position. */
		index--;

		if (c == chr) {
			return index;
		}

		/* Everything from coff onwards has now been examined. */
		slen = (size_t) coff;
	}

	return (uint32_t) -1;
}
/**
 * Find the first occurrence of a character in a dom string
 *
 * Decodes the string's UTF-8 data from the front, one character per step,
 * counting characters until one equals \a chr.
 *
 * \param str  The string to search in
 * \param chr  UCS4 value to look for
 * \return Zero-based character index of the first match; (uint32_t) -1
 *         converted to off_t if there is no match or decoding fails.
 *
 * NOTE(review): where off_t is wider than 32 bits the "not found" value is
 * 4294967295 rather than -1 -- confirm what callers compare against.
 */
off_t dom_string_index(dom_string *str, uint32_t chr)
{
	const uint8_t *cursor = (const uint8_t *) dom_string_data(str);
	size_t remaining = dom_string_byte_length(str);
	uint32_t position;

	for (position = 0; remaining > 0; position++) {
		uint32_t decoded;
		size_t width;

		if (parserutils_charset_utf8_to_ucs4(cursor, remaining,
				&decoded, &width) != PARSERUTILS_OK) {
			return (uint32_t) -1;
		}

		if (decoded == chr) {
			return position;
		}

		/* Step past the character just decoded. */
		cursor += width;
		remaining -= width;
	}

	return (uint32_t) -1;
}
/**
 * Check whether a string is a legal NCName.
 * See http://www.w3.org/TR/REC-xml-names/ for the definition.
 *
 * The first character must satisfy is_letter() or be '_'. Every following
 * character must satisfy is_name_char() and must not be ':'.
 *
 * \param name  The name to validate
 * \return true if \a name is valid; false if it is NULL, empty, cannot be
 *         decoded as UTF-8, or contains a disallowed character.
 */
bool _dom_validate_ncname(dom_string *name)
{
	const uint8_t *cursor;
	size_t remaining, width;
	uint32_t cp;

	if (name == NULL)
		return false;

	if (dom_string_length(name) == 0)
		return false;

	cursor = (const uint8_t *) dom_string_data(name);
	remaining = dom_string_byte_length(name);

	/* The leading character has its own, stricter rule. */
	if (parserutils_charset_utf8_to_ucs4(cursor, remaining,
			&cp, &width) != PARSERUTILS_OK)
		return false;

	if (is_letter(cp) == false && cp != (uint32_t) '_')
		return false;

	/* Every remaining character: a name character other than ':'. */
	for (cursor += width, remaining -= width; remaining > 0;
			cursor += width, remaining -= width) {
		if (parserutils_charset_utf8_to_ucs4(cursor, remaining,
				&cp, &width) != PARSERUTILS_OK)
			return false;

		if (is_name_char(cp) == false || cp == (uint32_t) ':')
			return false;
	}

	return true;
}
/**
 * Decode one UTF-8 multibyte sequence into a UCS4 character
 *
 * Thin wrapper around parserutils_charset_utf8_to_ucs4() that maps any
 * decoding failure to U+FFFD instead of reporting an error.
 *
 * NOTE(review): this function has been documented as following RFC2279
 * (which still allowed UCS values outside the UTF-16 range) rather than
 * RFC3629; whether that holds depends on the parserutils decoder -- confirm.
 *
 * \param s_in  The sequence to process
 * \param l     Length of the sequence, in bytes
 * \return The decoded UCS4 character, or 0xfffd if decoding failed
 */
uint32_t utf8_to_ucs4(const char *s_in, size_t l)
{
	const uint8_t *bytes = (const uint8_t *) s_in;
	uint32_t decoded;
	size_t consumed;	/* required by the decoder; not used here */

	if (parserutils_charset_utf8_to_ucs4(bytes, l, &decoded,
			&consumed) == PARSERUTILS_OK) {
		return decoded;
	}

	return 0xfffd;
}
/**
 * Get the UCS4 character at a given character position
 *
 * \param str    The string to read from
 * \param index  Zero-based position of the character
 * \param ch     Pointer to location to receive the UCS4 character
 * \return DOM_NO_ERR on success,
 *         DOM_DOMSTRING_SIZE_ERR if \a index is past the end of the string
 *         or the string data cannot be decoded up to and including that
 *         position.
 *
 * \a ch is only written on success.
 */
dom_exception dom_string_at(dom_string *str, uint32_t index, uint32_t *ch)
{
	const uint8_t *s;
	size_t clen, slen;
	uint32_t c, i;
	parserutils_error err;

	s = (const uint8_t *) dom_string_data(str);
	slen = dom_string_byte_length(str);

	/* Step over the characters that precede the requested one. */
	for (i = 0; i < index && slen > 0; i++) {
		err = parserutils_charset_utf8_char_byte_length(s, &clen);

		/* A reported width larger than what is left means the data is
		 * truncated; stop rather than let slen wrap around. */
		if (err != PARSERUTILS_OK || clen > slen) {
			return DOM_DOMSTRING_SIZE_ERR;
		}

		s += clen;
		slen -= clen;
	}

	/* Ran out of data before reaching the requested position. */
	if (i != index || slen == 0) {
		return DOM_DOMSTRING_SIZE_ERR;
	}

	err = parserutils_charset_utf8_to_ucs4(s, slen, &c, &clen);
	if (err != PARSERUTILS_OK) {
		return DOM_DOMSTRING_SIZE_ERR;
	}

	*ch = c;

	return DOM_NO_ERR;
}