// . tokenize the string "s" into "words"
// . "s" must be NULL terminated and is assumed to hold no HTML entities
// . NOTE: "s" is referenced, not copied, so do not free it from under us
// . tags are not handled here, only plain text
// . a word is as close to an English word as we can get it, BUT a run of
//   punctuation characters also counts as a word
// . returns false if the word buffers could not be allocated, otherwise
//   whatever addWords() returns
bool Words::set( char *s, bool computeWordIds ) {
	reset();

	// counting punct/alnum boundaries gives a rough upper bound on the
	// number of words, which sizes the buffers
	m_preCount = countWords ( s );

	const bool haveBuffers = allocateWordBuffers( m_preCount );
	if ( haveBuffers ) {
		// 0x7fffffff is the largest positive 32-bit value for the
		// length argument
		return addWords( s, 0x7fffffff, computeWordIds );
	}

	return false;
}
// . set words from the string "s", recording the caller-supplied buffer
//   "buf" of "bufSize" bytes in m_localBuf2/m_localBufSize2
// . word ids are always computed
// . returns false if the word buffers could not be allocated, otherwise
//   whatever addWords() returns
bool Words::setxi ( char *s , char *buf, long bufSize, long niceness ) {
	// being set twice from the very same string is treated as a logic
	// error: crash on purpose with a NULL write
	if ( s == m_s ) { char *xx=NULL;*xx=0; }

	reset();

	// remember the string; the sanity check above compares against it
	m_s             = s;
	m_version       = TITLEREC_CURRENT_VERSION;
	m_localBuf2     = buf;
	m_localBufSize2 = bufSize;

	// rough upper bound on the number of words, from counting
	// punct/alnum boundaries
	m_preCount = countWords ( s , niceness );

	if ( allocateWordBuffers(m_preCount) ) {
		// true = compute the word ids
		return addWords(s,0x7fffffff, true, niceness );
	}
	return false;
}
// . set words from a string
// . assume no HTML entities in the string "s"
// . s must be NULL terminated
// . NOTE: do not free "s" from under us cuz we reference it
// . break up the string ,"s", into "words".
// . doesn't do tags, only text nodes in "xml"
// . our definition of a word is as close to English as we can get it
// . BUT we also consider a string of punctuation characters to be a word
// . "version" is stored in m_version
// . returns false if the word buffers could not be allocated, otherwise
//   whatever addWords() returns
bool Words::set ( char *s , long version, 
		  bool computeWordIds ,
		  long niceness ) {

	// prevent setting with the same string (deliberate crash)
	if ( m_s == s ) { char *xx=NULL;*xx=0; }

	reset();

	// record the version once; must happen after reset()
	m_version = version;
	// save for sanity check
	m_s = s;

	// determine rough upper bound on number of words by counting
	// punct/alnum boundaries
	m_preCount = countWords ( s , niceness );
	if (!allocateWordBuffers(m_preCount)) return false;

	return addWords(s,0x7fffffff, computeWordIds, niceness );
}
// . set words from the text starting at "s" and ending just before "send"
// . the byte at *send is temporarily overwritten with a NUL while we work
//   and is restored before returning, so "send" must be writable
// . addWords() is told to scan for tags, and word ids are always computed
// . returns false if the word buffers could not be allocated, otherwise
//   whatever addWords() returns
bool Words::set11 ( char *s , char *send , long niceness ) {
	reset();
	m_s       = s;
	m_version = TITLEREC_CURRENT_VERSION;
	// this will make addWords() scan for tags
	m_hasTags = true;

	// terminate the range in place, remembering the byte we clobber
	const char clobbered = *send;
	*send = '\0';

	// rough upper bound on the number of words, from counting
	// punct/alnum boundaries
	m_preCount = countWords ( s , niceness );

	// second argument true = tagIds
	bool ok = allocateWordBuffers(m_preCount,true);
	// only set the words if the allocation worked
	if ( ok ) ok = addWords(s,0x7fffffff, true, niceness );

	// bring the original byte back, on success and on failure alike
	*send = clobbered;
	return ok;
}
// . set words from every node of "xml"
// . each tag node becomes a single word carrying its tag id (with BACKBIT
//   or'ed in for back tags); each text node is tokenized by addWords()
// . the word buffers are sized to half the content length
// . returns true right away if the xml has no content
// . returns false only if the word buffers could not be allocated
bool Words::set2 ( Xml *xml, 
		   bool computeWordIds ,
		   long niceness) {
	reset();
	m_xml = xml;
	m_version = xml->getVersion();

	// if xml is empty, bail (same behavior as the other Xml-based
	// setters) rather than dereferencing a NULL content pointer
	if ( ! xml->getContent() ) return true;

	// . the upper bound on the number of words is half the content length
	// . an older per-character alnum count used to run here, but its
	//   result was always overwritten by this value, and it walked one
	//   byte past the terminator on empty or 1-byte content, so it is gone
	m_preCount = xml->getContentLen() / 2;

	// true = tagIds
	if (!allocateWordBuffers(m_preCount, true)) return false;

	long numNodes = xml->getNumNodes();
	// stop at the last node or when the buffers are full
	for ( long k = 0 ; k < numNodes && m_numWords < m_preCount ; k++ ) {
		// get the kth node
		char *node    = xml->getNode   (k);
		long  nodeLen = xml->getNodeLen(k);
		// is the kth node a tag?
		if ( xml->isTag(k) ) {
			m_words         [m_numWords] = node;
			m_wordLens      [m_numWords] = nodeLen;
			m_tagIds        [m_numWords] = xml->getNodeId(k);
			m_wordIds       [m_numWords] = 0LL;
			m_nodes         [m_numWords] = k;
			// we have less than 127 HTML tags, so set 
			// the high bit for back tags
			if ( xml->isBackTag(k)) {
				m_tagIds[m_numWords] |= BACKBIT;
			}
			m_numWords++;
			// used by XmlDoc.cpp
			m_numTags++;
			continue;
		}
		// otherwise it's a text node: NUL-terminate it in place for
		// addWords(), then restore the byte we overwrote
		char c = node[nodeLen];
		node[nodeLen] = '\0';
		// NOTE(review): the addWords() result is ignored here, as it
		// always has been; confirm whether a failure should abort
		addWords(node, nodeLen,computeWordIds, niceness);
		node[nodeLen] = c;
	}
	return true;
}
// . set words from the nodes of "xml" in the half-open range [node1, node2)
// . a negative node2 means "through the last node"
// . each tag node becomes a single word carrying its tag id (with BACKBIT
//   or'ed in for back tags); each text node is tokenized by addWords()
// . returns true without adding words if the xml has no content, has no
//   nodes, or the requested range is empty
// . returns false only if the word buffers could not be allocated
bool Words::set ( Xml *xml, 
		  bool computeWordIds , 
		  long niceness ,
		  long node1 ,
		  long node2 ) {
	// prevent setting with the same xml (deliberate crash)
	if ( m_xml == xml ) { char *xx=NULL;*xx=0; }
	reset();
	m_xml = xml;
	m_version = xml->getVersion();

	// quick one-time self test of the utf8 helpers
	if ( ! s_tested ) {
		// only do once
		s_tested = true;
		// set c to a curling quote in unicode
		long c = 0x201c; // 0x235e;
		// encode it into utf8
		char dst[5];
		// point to it
		char *p = dst;
		// put space in there
		*p++ = ' ';
		// "numBytes" is how many bytes it stored into "dst"
		long numBytes = utf8Encode ( c , p );
		// U+201C must encode to 3 bytes
		if ( numBytes != 3 ) { char *xx=NULL; *xx=0; }
		// and the decoder must agree on that size
		long size = getUtf8CharSize(p);
		if ( size != 3 ) { char *xx=NULL; *xx=0; }
		// it must be classified as punctuation
		if ( ! is_punct_utf8 ( p ) ) { char *xx=NULL;*xx=0; }
	}

	// if xml is empty, bail
	if   ( ! xml->getContent() ) return true;

	long numNodes = xml->getNumNodes();
	if ( numNodes <= 0 ) return true;

	// . can be given a range, if node2 is -1 that means all!
	// . range is half-open: [node1, node2)
	if ( node2 < 0 ) node2 = numNodes;
	// sanity check
	if ( node1 > node2 ) { char *xx=NULL;*xx=0; }
	// an empty range has no words; bail before touching node2-1, which
	// would be node -1 when node1 and node2 are both 0
	if ( node1 == node2 ) return true;

	// the text spanned by the range, from the start of the first node to
	// the end of the last one
	char *start = xml->getNode(node1);
	char *end   = xml->getNode(node2-1) + xml->getNodeLen(node2-1);
	long  size  = end - start;

	m_preCount = countWords( start , size , niceness );

	// allocate based on the approximate count (true = tagIds)
	if ( ! allocateWordBuffers(m_preCount, true)) return false;

	// stop at the end of the range or when the buffers are full
	for ( long k = node1 ; k < node2 && m_numWords < m_preCount ; k++ ){
		// get the kth node
		char *node    = xml->getNode   (k);
		long  nodeLen = xml->getNodeLen(k);
		// is the kth node a tag?
		if ( ! xml->isTag(k) ) {
			// text node: NUL-terminate it in place for addWords(),
			// then restore the byte we overwrote
			char c = node[nodeLen];
			node[nodeLen] = '\0';
			// NOTE(review): the addWords() result is ignored here,
			// as it always has been; confirm whether a failure
			// should abort
			addWords(node,nodeLen,computeWordIds,niceness);
			node[nodeLen] = c;
			continue;
		}
		// it is a tag
		m_words    [m_numWords] = node;
		m_wordLens [m_numWords] = nodeLen;
		m_tagIds   [m_numWords] = xml->getNodeId(k);
		m_wordIds  [m_numWords] = 0LL;
		m_nodes    [m_numWords] = k;
		// we have less than 127 HTML tags, so set 
		// the high bit for back tags
		if ( xml->isBackTag(k)) {
			m_tagIds[m_numWords] |= BACKBIT;
		}

		m_numWords++;
		// used by XmlDoc.cpp
		m_numTags++;
	}
	return true;
}
// . set words from the nodes of "xml" in the half-open range [node1, node2)
// . a negative node2 means "through the last node"
// . each tag node becomes a single word carrying its tag id (with BACKBIT
//   or'ed in for back tags); each text node is tokenized by addWords()
// . returns true without adding words if the xml has no content, has no
//   nodes, or the requested range is empty
// . returns false only if the word buffers could not be allocated
bool Words::set( Xml *xml, bool computeWordIds, int32_t node1, int32_t node2 ) {
	// prevent setting with the same xml
	if ( m_xml == xml ) gbshutdownLogicError();

	reset();

	m_xml = xml;

	// if xml is empty, bail
	if ( !xml->getContent() ) {
		return true;
	}

	int32_t numNodes = xml->getNumNodes();
	if ( numNodes <= 0 ) {
		return true;
	}

	// . can be given a range, if node2 is -1 that means all!
	// . range is half-open: [node1, node2)
	if ( node2 < 0 ) {
		node2 = numNodes;
	}

	// sanity check
	if ( node1 > node2 ) gbshutdownLogicError();

	// an empty range has no words; bail before touching node2 - 1, which
	// would be node -1 when node1 and node2 are both 0
	if ( node1 == node2 ) {
		return true;
	}

	// the text spanned by the range, from the start of the first node to
	// the end of the last one
	char *start = xml->getNode(node1);
	char *end = xml->getNode( node2 - 1 ) + xml->getNodeLen( node2 - 1 );
	int32_t  size  = end - start;

	m_preCount = countWords( start , size );

	// allocate based on the approximate count (true = tagIds)
	if ( !allocateWordBuffers( m_preCount, true ) ) {
		return false;
	}

	// stop at the end of the range or when the buffers are full
	for ( int32_t k = node1; k < node2 && m_numWords < m_preCount; ++k ) {
		// get the kth node
		char *node = xml->getNode( k );
		int32_t nodeLen = xml->getNodeLen( k );

		// is the kth node a tag?
		if ( !xml->isTag( k ) ) {
			/// @todo ALC why are we adding NULL and restoring it after?
			/// addWords should be change to use nodeLen and not null terminated string
			char c = node[nodeLen];
			node[nodeLen] = '\0';
			// NOTE(review): the addWords() result is ignored here,
			// as it always has been; confirm whether a failure
			// should abort
			addWords( node, nodeLen, computeWordIds );
			node[nodeLen] = c;
			continue;
		}

		// it is a tag
		m_words    [m_numWords] = node;
		m_wordLens [m_numWords] = nodeLen;
		m_tagIds   [m_numWords] = xml->getNodeId(k);
		m_wordIds  [m_numWords] = 0LL;
		m_nodes    [m_numWords] = k;

		// we have less than 127 HTML tags, so set 
		// the high bit for back tags
		if ( xml->isBackTag(k)) {
			m_tagIds[m_numWords] |= BACKBIT;
		}

		m_numWords++;

		// used by XmlDoc.cpp
		m_numTags++;
	}

	return true;
}