C++ (Cpp) utf8Encode Examples

Example #1

0

Show file

File: Unicode.cpp Project: DeadNumbers/open-source-search-engine

// Strips accent marks and other modifiers from a UTF-8 buffer.
//
// Every input character is decoded and expanded with recursiveKDExpand().
// If the expansion has more than one code point, only the first one is
// re-encoded into outbuf; otherwise the original bytes are copied unchanged.
//
// outbuf, outbufsize : destination buffer and its capacity in bytes
// p, inbuflen        : source UTF-8 bytes and their length in bytes
//
// Returns the number of bytes stored into outbuf, or -1 if fewer than 5 bytes
// of room are left in outbuf when a character is about to be stored.
// The output is not NUL-terminated by this function.
int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,
		       unsigned char *p, int32_t inbuflen) {
	char *s = (char *)p;
	char *send = (char *)p + inbuflen;
	int32_t cs;
	char *dst = outbuf;
	for ( ; s < send ; s += cs ) {
		// how big is this character?
		cs = getUtf8CharSize(s);
		// a multi-byte sequence cut off by the end of the input must
		// not be decoded or copied: that would read past "send".
		// stop here and return what has been converted so far.
		if ( cs > send - s ) break;
		// convert the utf8 character to UChar32
		UChar32 uc = utf8Decode ( s );
		// break "uc" into decomposition of UChar32s
		UChar32 ttt[32];
		int32_t klen = recursiveKDExpand(uc,ttt,32);
		// sanity: the expansion must fit in ttt[]; crash on purpose
		if(klen>32){char *xx=NULL;*xx=0;}
		// make sure there is room for the character we are about
		// to store (checked with a 5 byte margin)
		if ( dst + 5 > outbuf+outbufsize ) return -1;
		// if the same, leave it! it had no accent marks or other
		// modifiers...
		if ( klen <= 1 ) {
			gbmemcpy ( dst , s , cs );
			dst += cs;
			continue;
		}
		// take the first one as the stripped
		// convert back to utf8
		int32_t stored = utf8Encode ( ttt[0] , dst );
		// skip over the stored utf8 char
		dst += stored;
	}
	// sanity. breach check
	if ( dst > outbuf+outbufsize ) { char *xx=NULL;*xx=0; }
	// return # of bytes stored into outbuf
	return dst - outbuf;
}

Example #2

0

Show file

File: Utf8Utils_unittest.cpp Project: Dorahe/platform_external_qemu

// Exercises utf8Encode() against a table of code points. For each entry it
// checks:
//   1. the length reported when no output buffer is given,
//   2. the length and the bytes produced with a large-enough buffer,
//   3. that -1 is returned when the buffer is one byte too short.
TEST(Utf8Utils, utf8Encode) {
    static const struct {
        int expected_len;           // expected return value (-1 == error)
        uint8_t expected_text[32];  // expected encoded bytes
        uint32_t codepoint;         // input code point
    } kData[] = {
        { -1, { 0, }, 0x80000000U },
        { 1, { 0, 0 }, 0 },
        { 1, { 32, }, 32 },
        { 1, { 127, }, 127 },
        { 2, { 0xc2, 0x80, }, 0x80 },
        { 2, { 0xc2, 0x81, }, 0x81 },
        { 2, { 0xdf, 0xbf, }, 0x7ff },
        { 3, { 0xe0, 0xa0, 0x80 }, 0x800 },
        { 3, { 0xe7, 0xbf, 0xbf }, 0x7fff },
        { 3, { 0xef, 0xbf, 0xbf }, 0xffff },
        { 4, { 0xf0, 0x90, 0x80, 0x80 }, 0x10000 },
        { 4, { 0xf7, 0xbf, 0xbf, 0xbf }, 0x1fffff },
    };
    const size_t kDataSize = ARRAYLEN(kData);
    for (size_t n = 0; n < kDataSize; ++n) {
        uint8_t buffer[32] = { 0, };
        const int expectedLen = kData[n].expected_len;

        // First, check length without an output buffer.
        int len = utf8Encode(kData[n].codepoint, NULL, 0);
        EXPECT_EQ(expectedLen, len) << "#" << n;

        // Second, check length with an output buffer.
        len = utf8Encode(kData[n].codepoint, buffer, sizeof(buffer));
        EXPECT_EQ(expectedLen, len) << "#" << n;
        // Compare exactly the expected number of bytes. Bounding the loop
        // by the table value (not by the returned length) keeps the indices
        // inside both 32-byte arrays even if utf8Encode() misbehaves.
        for (int ii = 0; ii < expectedLen; ++ii) {
            EXPECT_EQ(kData[n].expected_text[ii], buffer[ii])
                    << "#" << n << " @" << ii;
        }

        // Third, check length with a buffer that is too short. The size is
        // derived from the expected length, not from the previous result:
        // a failed call above (-1) would otherwise turn into a huge size_t.
        if (expectedLen > 0) {
            len = utf8Encode(kData[n].codepoint, buffer,
                             (size_t)(expectedLen - 1));
            EXPECT_EQ(-1, len) << "#" << n;
        }
    }
}

Example #3

0

Show file

File: Entities.cpp Project: lemire/open-source-search-engine

static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;

			// now we are 100% up
			if ( ! up ) { char *xx=NULL;*xx=0; }

			// point to it
			char *buf = (char *)s_entities[i].utf8;

			// if uchar32 not 0 then set the utf8 with it
			int32_t len = utf8Encode(up,buf);

			//
			// make my own mods to make parsing easier
			//

			if ( up == 160 ) {  // nbsp
				buf[0] = ' ';
				len = 1;
			}

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}

Example #4

0

Show file

File: Entities.cpp Project: privacore/open-source-search-engine

// Builds the HTML entity lookup table the first time it is called.
//
// Each entry of s_entities may carry several code points; they are encoded
// back to back into the entry's utf8 buffer and the total byte count is stored
// in utf8Len. The hash of the entity name is then mapped to (index + 1) in
// s_table.
//
// Returns true once the table is (or already was) initialized, false if the
// table cannot be set up or a term cannot be added.
static bool initEntityTable(){
	// already built by an earlier call
	if ( s_isInitialized ) {
		return true;
	}

	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
		log("build: Could not init table of HTML entities.");
		return false;
	}

	// walk all the html entities
	const int32_t entityCount =
		(int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
	for ( int32_t i = 0 ; i < entityCount ; i++ ) {
		int64_t h = hash64b ( s_entities[i].entity );

		// "out" advances through the entity's utf8 buffer as each
		// code point is encoded
		char *out = (char *)s_entities[i].utf8;
		int j = 0;
		while ( j < s_entities[i].codepoints ) {
			UChar32 codepoint = s_entities[i].codepoint[j];
			int32_t len = utf8Encode(codepoint,out);
			// a zero-length encoding is fatal
			if ( len == 0 ) { g_process.shutdownAbort(true); }

			// to make parsing easier, nbsp (160) becomes a
			// plain one-byte space
			if ( codepoint == 160 ) {
				out[0] = ' ';
				len = 1;
			}

			out += len;
			j++;
		}

		// total number of utf8 bytes written for this entity
		s_entities[i].utf8Len = (size_t)(out-s_entities[i].utf8);

		// the same entity name must not be added twice
		if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
		// store the entity index (plus one) in the table as score
		if ( ! s_table.addTerm(h, i+1) ) return false;
	}

	s_isInitialized = true;
	return true;
}

Example #5

0

Show file

File: Words.cpp Project: BillWangCS/open-source-search-engine

// Fills this Words object from the nodes of "xml".
//
// . nodes in the half-open range [node1, node2) are used; a negative node2
//   means "through the last node"
// . text nodes are handed to addWords(), tag nodes become one word each
// . returns true on success (including when there is nothing to do) and
//   false if the word buffers cannot be allocated
bool Words::set ( Xml *xml, 
		  bool computeWordIds , 
		  long niceness ,
		  long node1 ,
		  long node2 ) {
	// setting twice from the same Xml is not allowed; crash on purpose
	if ( m_xml == xml ) { char *xx=NULL;*xx=0; }
	reset();
	m_xml     = xml;
	m_version = xml->getVersion();

	// one-time self test of the utf8 helpers
	if ( ! s_tested ) {
		s_tested = true;
		// a curling quote in unicode
		long quote = 0x201c;
		// scratch buffer: one space followed by the encoded quote
		char scratch[5];
		scratch[0] = ' ';
		char *enc = scratch + 1;
		// encoding must report 3 bytes
		long numBytes = utf8Encode ( quote , enc );
		if ( numBytes != 3 ) { char *xx=NULL; *xx=0; }
		// and the size read back from the bytes must agree
		long encSize = getUtf8CharSize(enc);
		if ( encSize != 3 ) { char *xx=NULL; *xx=0; }
		// the quote must be classified as punctuation
		if ( ! is_punct_utf8 ( enc ) ) { char *xx=NULL;*xx=0; }
	}

	// if xml is empty, bail
	if ( ! xml->getContent() ) return true;

	long numNodes = xml->getNumNodes();
	if ( numNodes <= 0 ) return true;

	// negative node2 means all nodes
	if ( node2 < 0 ) node2 = numNodes;
	// sanity check on the range
	if ( node1 > node2 ) { char *xx=NULL;*xx=0; }

	// byte span from the first node through the end of the last node
	long  lastNode = node2 - 1;
	char *start    = xml->getNode(node1);
	char *end      = xml->getNode(lastNode) + xml->getNodeLen(lastNode);
	long  size     = end - start;

	m_preCount = countWords( start , size , niceness );

	// allocate based on the approximate count
	if ( ! allocateWordBuffers(m_preCount, true)) return false;

	// stop at the end of the range or once the preallocated
	// word slots are used up
	for ( long k = node1 ; k < node2 && m_numWords < m_preCount ; k++ ){
		char *node    = xml->getNode   (k);
		long  nodeLen = xml->getNodeLen(k);

		if ( xml->isTag(k) ) {
			// a tag becomes a single word with a zero word id
			m_words    [m_numWords] = node;
			m_wordLens [m_numWords] = nodeLen;
			m_tagIds   [m_numWords] = xml->getNodeId(k);
			m_wordIds  [m_numWords] = 0LL;
			m_nodes    [m_numWords] = k;
			// we have less than 127 HTML tags, so set
			// the high bit for back tags
			if ( xml->isBackTag(k)) {
				m_tagIds[m_numWords] |= BACKBIT;
			}
			m_numWords++;
			// used by XmlDoc.cpp
			m_numTags++;
		}
		else {
			// text node: NUL-terminate it temporarily while
			// addWords() runs, then restore the saved byte
			char saved = node[nodeLen];
			node[nodeLen] = '\0';
			addWords(node,nodeLen,computeWordIds,niceness);
			node[nodeLen] = saved;
		}
	}
	return true;
}

Example #6

0

Show file

File: Entities.cpp Project: BlaBlaNet/open-source-search-engine

static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// now we are 100% up
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to it
			char *buf = (char *)s_entities[i].utf8;
			// if uchar32 not 0 then set the utf8 with it
			int32_t len = utf8Encode(up,buf);
			//
			// make my own mods to make parsing easier
			//
			if ( up == 160 ) {  // nbsp
				buf[0] = ' '; len = 1; }
			// make all quotes equal '\"' (34 decimal)
			// double and single curling quotes
			//http://www.dwheeler.com/essays/quotes-test-utf-8.html
			// &#x201c, 201d, 2018, 2019 (unicode values, not utf8)
			// &ldquo, &rdquo, &lsquo, &rsquo
			/*
			if ( up == 171 ||
			     up == 187 ||
			     up == 8216 ||
			     up == 8217 ||
			     up == 8218 ||
			     up == 8220 ||
			     up == 8221 ||
			     up == 8222 ||
			     up == 8249 ||
			     up == 8250 ) {
				buf[0] = '\"'; len = 1; }
			// and normalize all dashes (mdash,ndash)
			if ( up == 8211 || up == 8212 ) {
				buf[0] = '-'; len = 1; }
			*/

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}

Example #7

0

Show file

File: fctypes.cpp Project: privacore/open-source-search-engine

// . decodes the html entities found in src[0..srcLen) into utf8 in "dst"
//   and NUL-terminates "dst"
// . returns the number of bytes stored in "dst", not counting the NUL
// . returns 0 right away (without touching "dst") when srcLen is 0
// . if "doSpecial" is true, then entities that decode to '<', '>' or '&'
//   are written out as "&lt;", "&gt;" and "&amp;", and an entity that
//   decodes to '"' is written as a single quote
// . an unknown entity, or one whose utf8 form is longer than the entity
//   text itself, is copied through as plain text
int32_t htmlDecode( char *dst, const char *src, int32_t srcLen, bool doSpecial ) {
	// special-case optimization
	if ( srcLen == 0 ) {
		return 0;
	}

	char * const dstStart = dst;
	const char * const srcEnd = src + srcLen;

	while ( src < srcEnd ) {
		// anything but an ampersand is copied as-is
		if ( *src != '&' ) {
			*dst++ = *src++;
			continue;
		}

		// we have an ampersand: try to decode the entity that
		// starts here into one or two code points
		uint32_t codepoint[2];
		int32_t codepointCount;
		int32_t utf8Len = 0;

		// "skip" is how many bytes the entity occupies in "src"
		int32_t skip = getEntity_a( src, srcEnd - src, codepoint, &codepointCount, &utf8Len );

		// invalid/unknown entity: store the '&' as text.
		//@todo BR: Temporary fix for named html entities where the
		// utf8 length is longer than the html entity name. XmlDoc
		// calls this function with the same buffer as input and
		// output, so the output must never outgrow the input.
		if ( skip == 0 || utf8Len > skip ) {
			//todo: if doSpecial then make it an &amp; (not
			// possible while decoding is done in-place)
			*dst++ = *src++;
			continue;
		}

		// . special mapping
		// . keep &lt; and &gt; encoded so Xml::set() still works
		// . and keep &amp; encoded so we do not screw up summaries
		if ( doSpecial ) {
			const uint32_t first = codepoint[0];

			// pick the replacement text, if this is one of the
			// three protected characters
			const char *entityStr = NULL;
			int32_t entityLen = 0;
			if ( first == '<' ) {
				entityStr = "&lt;";
				entityLen = 4;
			} else if ( first == '>' ) {
				entityStr = "&gt;";
				entityLen = 4;
			} else if ( first == '&' ) {
				entityStr = "&amp;";
				entityLen = 5;
			}

			if ( entityStr ) {
				memcpy(dst, entityStr, entityLen);
				src += skip;
				dst += entityLen;
				continue;
			}

			/// @todo verify if we need to replace " with '

			// some tags have &quot; in their value strings, so
			// a decoded double quote is stored as a single quote.
			// (curling quotes &ldquo; &rdquo; &lsquo; &rsquo; are
			// described at
			// http://www.dwheeler.com/essays/quotes-test-utf-8.html)
			if ( first == '\"' ) {
				*dst++ = '\'';
				src += skip;
				continue;
			}
		}

		// store every decoded code point into "dst" in utf8 format
		int32_t totalUtf8Bytes = 0;
		for ( int i = 0 ; i < codepointCount ; i++ ) {
			int32_t numBytes = utf8Encode ( codepoint[i], dst );
			totalUtf8Bytes += numBytes;

			// sanity check. do not eat our tail if dst == src
			if ( totalUtf8Bytes > skip ) {
				g_process.shutdownAbort(true);
			}

			// advance dst ptr
			dst += numBytes;
		}

		// skip over the encoded entity in the source string
		src += skip;
	}

	// NUL term
	*dst = '\0';

	return dst - dstStart;
}

Example #8

0

Show file

File: Unicode.cpp Project: DeadNumbers/open-source-search-engine

int32_t 	ucToAny(char *outbuf, int32_t outbufsize, char *charset_out,
		 char *inbuf, int32_t inbuflen, char *charset_in,
		 int32_t ignoreBadChars , int32_t niceness ){
	if (inbuflen == 0) return 0;
	// alias for iconv
	char *csAlias = charset_in;
	if (!strncmp(charset_in, "x-windows-949", 13))
		csAlias = "CP949";

	// Treat all latin1 as windows-1252 extended charset
	if (!strncmp(charset_in, "ISO-8859-1", 10) )
		csAlias = "WINDOWS-1252";
	
	iconv_t cd = gbiconv_open(charset_out, csAlias);
	int32_t numBadChars = 0;
	if (cd == (iconv_t)-1) {	
		log("uni: Error opening input conversion"
		    " descriptor for %s: %s (%d)\n", 
		    charset_in,
		    strerror(errno),errno);
		return 0;		
	}

	//if (normalized) *normalized = false;
	char *pin = (char*)inbuf;
	size_t inRemaining = inbuflen;
	char *pout = (char*)outbuf;
	size_t outRemaining = outbufsize;
	int res = 0;
	if (outbuf == NULL || outbufsize == 0) {
		// just find the size needed for conversion
#define TMP_SIZE 32
		char buf[TMP_SIZE];
		int32_t len = 0;
		while (inRemaining) {
			QUICKPOLL(niceness);
			pout = buf;
			outRemaining = TMP_SIZE;
			res = iconv(cd, &pin, &inRemaining, 
				    &pout, &outRemaining);
			if (res < 0 && errno){
				// convert the next TMP_SIZE block
				if (errno == E2BIG) { 
					len += TMP_SIZE; 
					continue;
				}
				gbiconv_close(cd);
				return 0; // other error
			}
			len += TMP_SIZE-outRemaining;
			//len >>= 1; // sizeof UChar
			len += 1; // NULL terminated
			gbiconv_close(cd);
			return len;			
		}
	}

	while (inRemaining && outRemaining) {
		QUICKPOLL(niceness);
		//printf("Before - in: %d, out: %d\n", 
		//inRemaining, outRemaining);
		res = iconv(cd,&pin, &inRemaining,
				&pout, &outRemaining);

		if (res < 0 && errno){
			//printf("errno: %s (%d)\n", strerror(errno), errno);
			g_errno = errno;
			switch(errno) {
			case EILSEQ:
				numBadChars++;

 				if (ignoreBadChars >= 0 &&
				    numBadChars > ignoreBadChars) goto done;
				utf8Encode('?', pout);
				pout++;outRemaining --;
 				pin++; inRemaining--;
				g_errno = 0;
 				continue;
			case EINVAL:
				numBadChars++;

				utf8Encode('?', pout); 
				pout++;outRemaining --;
				pin++; inRemaining--;
				g_errno=0;
				continue;
				// go ahead and flag an error now
				// if there is a bad character, we've 
				// probably misguessed the charset

			case E2BIG:
				//log("uni: error converting to UTF-8: %s",
				//    strerror(errno));
				goto done;
			default:
				log("uni: unknown error occurred "
				    "converting to UTF-8: %s (%d)",
				    strerror(errno), errno);
				goto done;
			}
		}
	}
done:
	gbiconv_close(cd);
	int32_t len =  (outbufsize - outRemaining) ;
	len = len>=outbufsize-1?outbufsize-2:len;
	//len >>= 1;
	//len = outbuf[len]=='\0'?len-1:len;
	outbuf[len] = '\0';
	static char eflag = 1;
	if (numBadChars) {
		if ( eflag )
			log(LOG_DEBUG, "uni: ucToAny: got %"INT32" bad chars "
			    "in conversion 2. Only reported once.",
			    numBadChars);
		// this flag makes it so no bad characters are reported
		// in subsequent conversions
		//eflag = 0;
	}
	if (res < 0 && g_errno) return 0; 
	return len ;
}