// . s[maxLen] should be the NUL terminator (s[1] is read without a
//   bounds check, so the buffer must be readable up to s[maxLen])
// . returns full length of entity @ "s" if there is a valid one, 0 otherwise
// . sets *c to the iso character the entity represents, or to 0 whenever
//   0 is returned
// . the trailing ';' is optional in this version; it is counted in the
//   returned length when present
// JAB: const-ness for optimizer...
int32_t getEntity_a ( char *s , int32_t maxLen , uint32_t *c ) {
	// give the out-parameter a defined value on every exit path, not
	// just on the ones that reach the decoders below
	*c = 0;

	// ensure there's an & as first char
	if ( s[0] != '&' ) return 0;

	// compute maximum length of entity, if it's indeed an entity
	int32_t len = 1;
	if ( s[len] == '#' ) len++;

	// cut it off after 9 chars to save time
	while ( len < maxLen && len < 9 && is_alnum_a ( s[len] ) ) len++;

	// include the ending ; if any
	if ( len < maxLen && s[len] == ';' ) len++;

	// len can not exceed 10 here (scan stops at 9, plus the optional ';'),
	// so no upper-bound check is needed

	// all entities are 3 or more chars (e.g. "&gt")
	if ( len < 3 ) return 0;

	// . if it's a numeric entity like "&#123;" use the numeric decoders
	// . pass in the whole thing, from the '&' through the ';' (if any)
	if ( s[1] == '#' ) {
		if ( s[2] == 'x' ) *c = getHexadecimalEntity ( s , len );
		else               *c = getDecimalEntity     ( s , len );
	}
	// otherwise, it's text
	else {
		*c = getTextEntity ( s , len );
	}

	// return 0 if not an entity, length of entity if it is an entity
	return *c ? len : 0;
}
// . s[maxLen] should be the NUL terminator (s[1] is read without a
//   bounds check, so the buffer must be readable up to s[maxLen])
// . returns full length of entity @ "s" if there is a valid one, 0 otherwise
// . sets *c to the iso character the entity represents, or to 0 whenever
//   0 is returned
// . the terminating ';' is mandatory and is included in the returned length
// JAB: const-ness for optimizer...
int32_t getEntity_a ( const char *s , int32_t maxLen , uint32_t *c ) {
	// give the out-parameter a defined value on every exit path, not
	// just on the ones that reach the decoders below
	*c = 0;

	// ensure there's an & as first char
	if ( s[0] != '&' ) {
		return 0;
	}

	// compute maximum length of entity, if it's indeed an entity
	int32_t len = 1;
	if ( s[len] == '#' ) {
		len++;
	}

	// cut it off after 9 chars to save time
	while ( len < maxLen && len < 9 && is_alnum_a( s[len] ) ) {
		len++;
	}

	// character entity reference must end with a semicolon.
	// some browsers have lenient parsing, but we don't accept invalid
	// references.
	if ( len == maxLen || s[len] != ';' ) {
		// not a valid character entity reference
		return 0;
	}
	len++;

	// len can not exceed 10 here (scan stops at 9, plus the ';'),
	// so no upper-bound check is needed

	// reject "&;" -- the only way len can still be below 3 at this point
	if ( len < 3 ) {
		return 0;
	}

	// . if it's a numeric entity like "&#123;" use the numeric decoders
	// . pass in the whole thing, from the '&' through the ';'
	if ( s[1] == '#' ) {
		if ( s[2] == 'x' ) {
			*c = getHexadecimalEntity( s, len );
		} else {
			*c = getDecimalEntity( s, len );
		}
	} else {
		// otherwise, it's text
		*c = getTextEntity( s, len );
	}

	// return 0 if not an entity, length of entity if it is an entity
	return *c ? len : 0;
}
// . s[maxLen] should be the NULL // . returns full length of entity @ "s" if there is a valid one, 0 otherwise // . sets *c to the iso character the entity represents (if there is one) // JAB: const-ness for optimizer... int32_t getEntity_a ( const char *s, int32_t maxLen, uint32_t codepoint[2], int32_t *codepointCount, int32_t *utf8Len ) { //TODO: handle multi-codepoint entitites *utf8Len=0; // ensure there's an & as first char if ( s[0] != '&' ) { return 0; } // compute maximum length of entity, if it's indeed an entity int32_t len = 1; if ( s[len] == '#' ) { len++; } // cut it off after <32> chars to save time and also to avoid parsing // obscenely long incorrect entitites (eg an ampersand followed by 2MB of letters) while ( len < maxLen && len < max_entity_name_len && is_alnum_a( s[len] ) ) { len++; } // character entity reference must end with a semicolon. // some browsers have lenient parsing, but we don't accept invalid // references. if ( len == maxLen || s[len] != ';' ) { //not a valid character entity reference return 0; } len++; // we don't have entities longer than what w3c specified if ( len > max_entity_name_len+1 ) { return 0; } // all entites are 3 or more chars (>) if ( len < 3 ) { return 0; } // . if it's a numeric entity like { use this routine // . pass in the whole she-bang: "...;" or "´...; if ( s[1] == '#' ) { if ( s[2] == 'x' ) { codepoint[0] = getHexadecimalEntity( s, len ); *codepointCount = 1; } else { codepoint[0] = getDecimalEntity( s, len ); *codepointCount = 1; } } else { // otherwise, it's a named entity const Entity *entity = getTextEntity( s, len ); if(entity) { memcpy(codepoint, entity->codepoint, entity->codepoints*sizeof(int32_t)); *codepointCount = entity->codepoints; *utf8Len = (int32_t)entity->utf8Len; return len; } else { return 0; //unknown named entity } } // return 0 if not an entity, length of entity if it is an entity if ( codepoint[0] ) { return len; } else { return 0; } }