// . set words from a string // . assume no HTML entities in the string "s" // . s must be NULL terminated // . NOTE: do not free "s" from under us cuz we reference it // . break up the string ,"s", into "words". // . doesn't do tags, only text nodes in "xml" // . our definition of a word is as close to English as we can get it // . BUT we also consider a string of punctuation characters to be a word bool Words::set( char *s, bool computeWordIds ) { reset(); // determine rough upper bound on number of words by counting // punct/alnum boundaries m_preCount = countWords ( s ); if ( !allocateWordBuffers( m_preCount ) ) { return false; } return addWords( s, 0x7fffffff, computeWordIds ); }
bool Words::setxi ( char *s , char *buf, long bufSize, long niceness ) { // prevent setting with the same string if ( m_s == s ) { char *xx=NULL;*xx=0; } reset(); m_version = TITLEREC_CURRENT_VERSION; // save for sanity check m_s = s; m_localBuf2 = buf; m_localBufSize2 = bufSize; // determine rough upper bound on number of words by counting // punct/alnum boundaries m_preCount = countWords ( s , niceness ); if (!allocateWordBuffers(m_preCount)) return false; bool computeWordIds = true; return addWords(s,0x7fffffff, computeWordIds, niceness ); }
// . set words from a string // . assume no HTML entities in the string "s" // . s must be NULL terminated // . NOTE: do not free "s" from under us cuz we reference it // . break up the string ,"s", into "words". // . doesn't do tags, only text nodes in "xml" // . our definition of a word is as close to English as we can get it // . BUT we also consider a string of punctuation characters to be a word bool Words::set ( char *s , long version, bool computeWordIds , long niceness ) { // prevent setting with the same string if ( m_s == s ) { char *xx=NULL;*xx=0; } reset(); m_version = version; // save for sanity check m_s = s; m_version = version; // determine rough upper bound on number of words by counting // punct/alnum boundaries m_preCount = countWords ( s , niceness ); if (!allocateWordBuffers(m_preCount)) return false; return addWords(s,0x7fffffff, computeWordIds, niceness ); }
bool Words::set11 ( char *s , char *send , long niceness ) { reset(); m_version = TITLEREC_CURRENT_VERSION; m_s = s; // this will make addWords() scan for tags m_hasTags = true; // save it char saved = *send; // null term *send = '\0'; // determine rough upper bound on number of words by counting // punct/alnum boundaries m_preCount = countWords ( s , niceness ); // true = tagIds bool status = allocateWordBuffers(m_preCount,true); // deal with error now if ( ! status ) { *send = saved; return false; } // and set the words status = addWords(s,0x7fffffff, true, niceness ); // bring it back *send = saved; // return error? return status; }
bool Words::set2 ( Xml *xml, bool computeWordIds , long niceness) { reset(); m_xml = xml; m_version = xml->getVersion(); m_version = xml->getVersion(); register char *p = (char *)xml->getContent(); if ( *p ) p++; register long x = 0; ploop: //if ( is_alnum(*(p-1)) ^ is_alnum(*p) ) x++; //if ( is_alnum(*p ) ) x++; //x += g_map_is_alpha[*p] ; if ( is_alnum_utf8(p) ) x++; //if ( isalnum(*p) ) x++; //if ( g_map_is_alpha[*p] ) x++; //x++; p++; if ( *p ) goto ploop; m_preCount = x; m_preCount = xml->getContentLen() / 2; //if ( m_preCount > 9000 ) m_preCount = 9000; //m_preCount = 9000; if (!allocateWordBuffers(m_preCount, true)) return false; long numNodes = xml->getNumNodes(); // are we done? for ( long k = 0 ; k < numNodes && m_numWords < m_preCount ; k++ ) { // get the kth node char *node = xml->getNode (k); long nodeLen = xml->getNodeLen(k); // is the kth node a tag? if ( xml->isTag(k) ) { m_words [m_numWords] = node; m_wordLens [m_numWords] = nodeLen; m_tagIds [m_numWords] = xml->getNodeId(k); m_wordIds [m_numWords] = 0LL; m_nodes [m_numWords] = k; // we have less than 127 HTML tags, so set // the high bit for back tags if ( xml->isBackTag(k)) { m_tagIds[m_numWords] |= BACKBIT; } //log(LOG_DEBUG, "Words: Word %ld: got tag %s%s (%d)", // m_numWords, // isBackTag(m_numWords)?"/":"", // g_nodes[getTagId(m_numWords)].m_nodeName, // getTagId(m_numWords)); m_numWords++; // used by XmlDoc.cpp m_numTags++; continue; } // otherwise it's a text node char c = node[nodeLen]; node[nodeLen] = '\0'; addWords(node, nodeLen,computeWordIds, niceness); node[nodeLen] = c; } return true; }
bool Words::set ( Xml *xml, bool computeWordIds , long niceness , long node1 , long node2 ) { // prevent setting with the same string if ( m_xml == xml ) { char *xx=NULL;*xx=0; } reset(); m_xml = xml; m_version = xml->getVersion(); //m_version = xml->getVersion(); // quick test if ( ! s_tested ) { // only do once s_tested = true; // set c to a curling quote in unicode long c = 0x201c; // 0x235e; // encode it into utf8 char dst[5]; // point to it char *p = dst; // put space in there *p++ = ' '; // "numBytes" is how many bytes it stored into 'dst" long numBytes = utf8Encode ( c , p ); // must be 2 bytes i guess if ( numBytes != 3 ) { char *xx=NULL; *xx=0; } // check it long size = getUtf8CharSize(p); if ( size != 3 ) { char *xx=NULL; *xx=0; } // is that punct if ( ! is_punct_utf8 ( p ) ) { char *xx=NULL;*xx=0; } // make sure can pair across //unsigned char bits = getPunctuationBits ( dst , 4 ); // must be able to pair across //if ( ! ( bits & D_CAN_PAIR_ACROSS ) ) { char *xx=NULL;*xx=0;} } // if xml is empty, bail if ( ! xml->getContent() ) return true; long numNodes = xml->getNumNodes(); if ( numNodes <= 0 ) return true; // . can be given a range, if node2 is -1 that means all! // . range is half-open: [node1, node2) if ( node2 < 0 ) node2 = numNodes; // sanity check if ( node1 > node2 ) { char *xx=NULL;*xx=0; } char *start = xml->getNode(node1); char *end = xml->getNode(node2-1) + xml->getNodeLen(node2-1); long size = end - start; m_preCount = countWords( start , size , niceness ); // allocate based on the approximate count if ( ! allocateWordBuffers(m_preCount, true)) return false; // are we done? for ( long k = node1 ; k < node2 && m_numWords < m_preCount ; k++ ){ // get the kth node char *node = xml->getNode (k); long nodeLen = xml->getNodeLen(k); // is the kth node a tag? if ( ! xml->isTag(k) ) { char c = node[nodeLen]; node[nodeLen] = '\0'; addWords(node,nodeLen,computeWordIds,niceness); node[nodeLen] = c; continue; } // it is a tag m_words [m_numWords] = node; m_wordLens [m_numWords] = nodeLen; m_tagIds [m_numWords] = xml->getNodeId(k); m_wordIds [m_numWords] = 0LL; m_nodes [m_numWords] = k; // we have less than 127 HTML tags, so set // the high bit for back tags if ( xml->isBackTag(k)) { m_tagIds[m_numWords] |= BACKBIT; } //log(LOG_DEBUG, "Words: Word %ld: got tag %s%s (%d)", // m_numWords, // isBackTag(m_numWords)?"/":"", // g_nodes[getTagId(m_numWords)].m_nodeName, // getTagId(m_numWords)); m_numWords++; // used by XmlDoc.cpp m_numTags++; continue; } return true; }
bool Words::set( Xml *xml, bool computeWordIds, int32_t node1, int32_t node2 ) { // prevent setting with the same string if ( m_xml == xml ) gbshutdownLogicError(); reset(); m_xml = xml; // if xml is empty, bail if ( !xml->getContent() ) { return true; } int32_t numNodes = xml->getNumNodes(); if ( numNodes <= 0 ) { return true; } // . can be given a range, if node2 is -1 that means all! // . range is half-open: [node1, node2) if ( node2 < 0 ) { node2 = numNodes; } // sanity check if ( node1 > node2 ) gbshutdownLogicError(); char *start = xml->getNode(node1); char *end = xml->getNode( node2 - 1 ) + xml->getNodeLen( node2 - 1 ); int32_t size = end - start; m_preCount = countWords( start , size ); // allocate based on the approximate count if ( !allocateWordBuffers( m_preCount, true ) ) { return false; } // are we done? for ( int32_t k = node1; k < node2 && m_numWords < m_preCount; ++k ) { // get the kth node char *node = xml->getNode( k ); int32_t nodeLen = xml->getNodeLen( k ); // is the kth node a tag? if ( !xml->isTag( k ) ) { /// @todo ALC why are we adding NULL and restoring it after? /// addWords should be change to use nodeLen and not null terminated string char c = node[nodeLen]; node[nodeLen] = '\0'; addWords( node, nodeLen, computeWordIds ); node[nodeLen] = c; continue; } // it is a tag m_words [m_numWords] = node; m_wordLens [m_numWords] = nodeLen; m_tagIds [m_numWords] = xml->getNodeId(k); m_wordIds [m_numWords] = 0LL; m_nodes [m_numWords] = k; // we have less than 127 HTML tags, so set // the high bit for back tags if ( xml->isBackTag(k)) { m_tagIds[m_numWords] |= BACKBIT; } m_numWords++; // used by XmlDoc.cpp m_numTags++; continue; } return true; }