int32_t stripAccentMarks (char *outbuf, int32_t outbufsize, unsigned char *p, int32_t inbuflen) { char *s = (char *)p; char *send = (char *)p + inbuflen; int32_t cs; char *dst = outbuf; for ( ; s < send ; s += cs ) { // how big is this character? cs = getUtf8CharSize(s); // convert the utf8 character to UChar32 UChar32 uc = utf8Decode ( s ); // break "uc" into decomposition of UChar32s UChar32 ttt[32]; int32_t klen = recursiveKDExpand(uc,ttt,32); if(klen>32){char *xx=NULL;*xx=0;} // sanity if ( dst + 5 > outbuf+outbufsize ) return -1; // if the same, leave it! it had no accent marks or other // modifiers... if ( klen <= 1 ) { gbmemcpy ( dst , s , cs ); dst += cs; continue; } // take the first one as the stripped // convert back to utf8 int32_t stored = utf8Encode ( ttt[0] , dst ); // skip over the stored utf8 char dst += stored; } // sanity. breach check if ( dst > outbuf+outbufsize ) { char *xx=NULL;*xx=0; } // return # of bytes stored into outbuf return dst - outbuf; }
// Table-driven check of utf8Encode(codepoint, buffer, bufferSize):
//  1. with no output buffer it must report the expected length,
//  2. with a large enough buffer it must report the same length and
//     produce the expected bytes,
//  3. with a buffer one byte too short it must return -1.
TEST(Utf8Utils, utf8Encode) {
	static const struct {
		int expected_len;
		uint8_t expected_text[32];
		uint32_t codepoint;
	} kData[] = {
		{ -1, { 0, }, 0x80000000U },
		{ 1, { 0, 0 }, 0 },
		{ 1, { 32, }, 32 },
		{ 1, { 127, }, 127 },
		{ 2, { 0xc2, 0x80, }, 0x80 },
		{ 2, { 0xc2, 0x81, }, 0x81 },
		{ 2, { 0xdf, 0xbf, }, 0x7ff },
		{ 3, { 0xe0, 0xa0, 0x80 }, 0x800 },
		{ 3, { 0xe7, 0xbf, 0xbf }, 0x7fff },
		{ 3, { 0xef, 0xbf, 0xbf }, 0xffff },
		{ 4, { 0xf0, 0x90, 0x80, 0x80 }, 0x10000 },
		{ 4, { 0xf7, 0xbf, 0xbf, 0xbf }, 0x1fffff },
	};
	const size_t kDataSize = ARRAYLEN(kData);

	for (size_t n = 0; n < kDataSize; ++n) {
		const int expectedLen = kData[n].expected_len;
		uint8_t buffer[32] = { 0, };

		// First, check length without an output buffer.
		int len = utf8Encode(kData[n].codepoint, NULL, 0);
		EXPECT_EQ(expectedLen, len) << "#" << n;

		// Second, check length with an output buffer.
		len = utf8Encode(kData[n].codepoint, buffer, sizeof(buffer));
		EXPECT_EQ(expectedLen, len) << "#" << n;
		// Walk the expected byte count, not the returned one, so a
		// bogus return value cannot index outside the 32-byte arrays.
		for (int ii = 0; ii < expectedLen; ++ii) {
			EXPECT_EQ(kData[n].expected_text[ii], buffer[ii])
				<< "#" << n << " @" << ii;
		}

		// Third, check length with a buffer that is too short.
		// The size is derived from the expected length: using the
		// value returned above would turn a wrong -1 into a huge
		// size_t and hide the failure (or overflow "buffer").
		if (expectedLen > 0) {
			len = utf8Encode(kData[n].codepoint, buffer,
					 (size_t)(expectedLen - 1));
			EXPECT_EQ(-1, len) << "#" << n;
		}
	}
}
static bool initEntityTable(){ if ( ! s_isInitialized ) { // set up the hash table if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) ) return log("build: Could not init table of " "HTML entities."); // now add in all the stop words int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity); for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64b ( s_entities[i].entity ); // grab the unicode code point UChar32 up = s_entities[i].unicode; // now we are 100% up if ( ! up ) { char *xx=NULL;*xx=0; } // point to it char *buf = (char *)s_entities[i].utf8; // if uchar32 not 0 then set the utf8 with it int32_t len = utf8Encode(up,buf); // // make my own mods to make parsing easier // if ( up == 160 ) { // nbsp buf[0] = ' '; len = 1; } // // end custom mods // // set length s_entities[i].utf8Len = len; // check it if ( len == 0 ) { char *xx=NULL;*xx=0; } // must not exist! if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;} // store the entity index in the hash table as score if ( ! s_table.addTerm ( &h, i+1 ) ) return false; } s_isInitialized = true; } return true; }
static bool initEntityTable(){ if ( ! s_isInitialized ) { // set up the hash table if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) { log("build: Could not init table of HTML entities."); return false; } // now add in all the html entities const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity); for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64b ( s_entities[i].entity ); // convert the unicode codepoints to an utf8 string char *buf = (char *)s_entities[i].utf8; for(int j=0; j<s_entities[i].codepoints; j++) { UChar32 codepoint = s_entities[i].codepoint[j]; int32_t len = utf8Encode(codepoint,buf); if ( len == 0 ) { g_process.shutdownAbort(true); } // make modification to make parsing easier if ( codepoint == 160 ) { // nbsp buf[0] = ' '; len = 1; } buf += len; } s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8); // must not exist! if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);} // store the entity index in the hash table as score if ( ! s_table.addTerm(h, i+1) ) return false; } s_isInitialized = true; } return true; }
bool Words::set ( Xml *xml, bool computeWordIds , long niceness , long node1 , long node2 ) { // prevent setting with the same string if ( m_xml == xml ) { char *xx=NULL;*xx=0; } reset(); m_xml = xml; m_version = xml->getVersion(); //m_version = xml->getVersion(); // quick test if ( ! s_tested ) { // only do once s_tested = true; // set c to a curling quote in unicode long c = 0x201c; // 0x235e; // encode it into utf8 char dst[5]; // point to it char *p = dst; // put space in there *p++ = ' '; // "numBytes" is how many bytes it stored into 'dst" long numBytes = utf8Encode ( c , p ); // must be 2 bytes i guess if ( numBytes != 3 ) { char *xx=NULL; *xx=0; } // check it long size = getUtf8CharSize(p); if ( size != 3 ) { char *xx=NULL; *xx=0; } // is that punct if ( ! is_punct_utf8 ( p ) ) { char *xx=NULL;*xx=0; } // make sure can pair across //unsigned char bits = getPunctuationBits ( dst , 4 ); // must be able to pair across //if ( ! ( bits & D_CAN_PAIR_ACROSS ) ) { char *xx=NULL;*xx=0;} } // if xml is empty, bail if ( ! xml->getContent() ) return true; long numNodes = xml->getNumNodes(); if ( numNodes <= 0 ) return true; // . can be given a range, if node2 is -1 that means all! // . range is half-open: [node1, node2) if ( node2 < 0 ) node2 = numNodes; // sanity check if ( node1 > node2 ) { char *xx=NULL;*xx=0; } char *start = xml->getNode(node1); char *end = xml->getNode(node2-1) + xml->getNodeLen(node2-1); long size = end - start; m_preCount = countWords( start , size , niceness ); // allocate based on the approximate count if ( ! allocateWordBuffers(m_preCount, true)) return false; // are we done? for ( long k = node1 ; k < node2 && m_numWords < m_preCount ; k++ ){ // get the kth node char *node = xml->getNode (k); long nodeLen = xml->getNodeLen(k); // is the kth node a tag? if ( ! 
xml->isTag(k) ) { char c = node[nodeLen]; node[nodeLen] = '\0'; addWords(node,nodeLen,computeWordIds,niceness); node[nodeLen] = c; continue; } // it is a tag m_words [m_numWords] = node; m_wordLens [m_numWords] = nodeLen; m_tagIds [m_numWords] = xml->getNodeId(k); m_wordIds [m_numWords] = 0LL; m_nodes [m_numWords] = k; // we have less than 127 HTML tags, so set // the high bit for back tags if ( xml->isBackTag(k)) { m_tagIds[m_numWords] |= BACKBIT; } //log(LOG_DEBUG, "Words: Word %ld: got tag %s%s (%d)", // m_numWords, // isBackTag(m_numWords)?"/":"", // g_nodes[getTagId(m_numWords)].m_nodeName, // getTagId(m_numWords)); m_numWords++; // used by XmlDoc.cpp m_numTags++; continue; } return true; }
static bool initEntityTable(){ if ( ! s_isInitialized ) { // set up the hash table if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) ) return log("build: Could not init table of " "HTML entities."); // now add in all the stop words int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity); for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64b ( s_entities[i].entity ); // grab the unicode code point UChar32 up = s_entities[i].unicode; // now we are 100% up if ( ! up ) { char *xx=NULL;*xx=0; } // point to it char *buf = (char *)s_entities[i].utf8; // if uchar32 not 0 then set the utf8 with it int32_t len = utf8Encode(up,buf); // // make my own mods to make parsing easier // if ( up == 160 ) { // nbsp buf[0] = ' '; len = 1; } // make all quotes equal '\"' (34 decimal) // double and single curling quotes //http://www.dwheeler.com/essays/quotes-test-utf-8.html // “, 201d, 2018, 2019 (unicode values, not utf8) // &ldquo, &rdquo, &lsquo, &rsquo /* if ( up == 171 || up == 187 || up == 8216 || up == 8217 || up == 8218 || up == 8220 || up == 8221 || up == 8222 || up == 8249 || up == 8250 ) { buf[0] = '\"'; len = 1; } // and normalize all dashes (mdash,ndash) if ( up == 8211 || up == 8212 ) { buf[0] = '-'; len = 1; } */ // // end custom mods // // set length s_entities[i].utf8Len = len; // check it if ( len == 0 ) { char *xx=NULL;*xx=0; } // must not exist! if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;} // store the entity index in the hash table as score if ( ! s_table.addTerm ( &h, i+1 ) ) return false; } s_isInitialized = true; } return true; }
// . if "doSpecial" is true, then we don't touch <, > and & int32_t htmlDecode( char *dst, const char *src, int32_t srcLen, bool doSpecial ) { //special-case optimization if ( srcLen == 0 ) { return 0; } char * const start = dst; const char * const srcEnd = src + srcLen; for ( ; src < srcEnd ; ) { if ( *src != '&' ) { *dst++ = *src++; } else { // Ok, we have an ampersand. So decode it into unicode/utf8, do a few special // checks, and in general store the resulting string in dst[] // store decoded entity char into dst[j] uint32_t codepoint[2]; int32_t codepointCount; int32_t utf8Len=0; // "skip" is how many bytes the entites was in "src" int32_t skip = getEntity_a( src, srcEnd - src, codepoint, &codepointCount, &utf8Len ); // If the entity is invalid/unknown then store it as text //@todo BR: Temporary fix for named html entities where the utf8 length is // longer than the html entity name. This causes problems for XmlDoc that // calls this function with the same buffer as input and output if ( skip == 0 || utf8Len > skip) { //todo: if doSpecial then make it an & // but the decoding is done in-place (bad idea) so we cannot expand the output *dst++ = *src++; continue; } // . special mapping // . make < and > special so Xml::set() still works // . and make & special so we do not screw up summaries if ( doSpecial ) { if ( codepoint[0] == '<' || codepoint[0] == '>' || codepoint[0] == '&' ) { int32_t entityLen = 4; const char* entityStr = ""; if (codepoint[0] == '<') { entityStr = "<"; } else if (codepoint[0] == '>') { entityStr = ">"; } else { entityStr = "&"; entityLen = 5; } memcpy(dst, entityStr, entityLen); src += skip; dst += entityLen; continue; } /// @todo verify if we need to replace " with ' // some tags have " in their value strings // so we have to preserve that! 
// use curling quote: //http://www.dwheeler.com/essays/quotes-test-utf-8.html // curling double and single quotes resp: // “ ” ‘ ” if ( codepoint[0] == '\"' ) { *dst = '\''; dst++; src += skip; continue; } } int32_t totalUtf8Bytes = 0; for ( int i=0; i<codepointCount; i++) { // . store it into "dst" in utf8 format int32_t numBytes = utf8Encode ( codepoint[i], dst ); totalUtf8Bytes += numBytes; // sanity check. do not eat our tail if dst == src if ( totalUtf8Bytes > skip ) { g_process.shutdownAbort(true); } // advance dst ptr dst += numBytes; } // skip over the encoded entity in the source string src += skip; } } // NUL term *dst = '\0'; return dst - start; }
int32_t ucToAny(char *outbuf, int32_t outbufsize, char *charset_out, char *inbuf, int32_t inbuflen, char *charset_in, int32_t ignoreBadChars , int32_t niceness ){ if (inbuflen == 0) return 0; // alias for iconv char *csAlias = charset_in; if (!strncmp(charset_in, "x-windows-949", 13)) csAlias = "CP949"; // Treat all latin1 as windows-1252 extended charset if (!strncmp(charset_in, "ISO-8859-1", 10) ) csAlias = "WINDOWS-1252"; iconv_t cd = gbiconv_open(charset_out, csAlias); int32_t numBadChars = 0; if (cd == (iconv_t)-1) { log("uni: Error opening input conversion" " descriptor for %s: %s (%d)\n", charset_in, strerror(errno),errno); return 0; } //if (normalized) *normalized = false; char *pin = (char*)inbuf; size_t inRemaining = inbuflen; char *pout = (char*)outbuf; size_t outRemaining = outbufsize; int res = 0; if (outbuf == NULL || outbufsize == 0) { // just find the size needed for conversion #define TMP_SIZE 32 char buf[TMP_SIZE]; int32_t len = 0; while (inRemaining) { QUICKPOLL(niceness); pout = buf; outRemaining = TMP_SIZE; res = iconv(cd, &pin, &inRemaining, &pout, &outRemaining); if (res < 0 && errno){ // convert the next TMP_SIZE block if (errno == E2BIG) { len += TMP_SIZE; continue; } gbiconv_close(cd); return 0; // other error } len += TMP_SIZE-outRemaining; //len >>= 1; // sizeof UChar len += 1; // NULL terminated gbiconv_close(cd); return len; } } while (inRemaining && outRemaining) { QUICKPOLL(niceness); //printf("Before - in: %d, out: %d\n", //inRemaining, outRemaining); res = iconv(cd,&pin, &inRemaining, &pout, &outRemaining); if (res < 0 && errno){ //printf("errno: %s (%d)\n", strerror(errno), errno); g_errno = errno; switch(errno) { case EILSEQ: numBadChars++; if (ignoreBadChars >= 0 && numBadChars > ignoreBadChars) goto done; utf8Encode('?', pout); pout++;outRemaining --; pin++; inRemaining--; g_errno = 0; continue; case EINVAL: numBadChars++; utf8Encode('?', pout); pout++;outRemaining --; pin++; inRemaining--; g_errno=0; continue; // go ahead and 
flag an error now // if there is a bad character, we've // probably misguessed the charset case E2BIG: //log("uni: error converting to UTF-8: %s", // strerror(errno)); goto done; default: log("uni: unknown error occurred " "converting to UTF-8: %s (%d)", strerror(errno), errno); goto done; } } } done: gbiconv_close(cd); int32_t len = (outbufsize - outRemaining) ; len = len>=outbufsize-1?outbufsize-2:len; //len >>= 1; //len = outbuf[len]=='\0'?len-1:len; outbuf[len] = '\0'; static char eflag = 1; if (numBadChars) { if ( eflag ) log(LOG_DEBUG, "uni: ucToAny: got %"INT32" bad chars " "in conversion 2. Only reported once.", numBadChars); // this flag makes it so no bad characters are reported // in subsequent conversions //eflag = 0; } if (res < 0 && g_errno) return 0; return len ; }