// a quickie // this url gives a m_preCount that is too low. why? // http://go.tfol.com/163/speed.asp long countWords ( char *p , long plen , long niceness ) { char *pend = p + plen; long count = 1; loop: // sequence of punct for ( ; p < pend && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) { // breathe QUICKPOLL ( niceness ); // in case being set from xml tags, count as words now if ( *p=='<') count++; } count++; // sequence of alnum for ( ; p < pend && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) // breathe QUICKPOLL ( niceness ); count++; if ( p < pend ) goto loop; // some extra for good meaure return count+10; }
// a quickie // this url gives a m_preCount that is too low. why? // http://go.tfol.com/163/speed.asp static int32_t countWords ( const char *p , int32_t plen ) { const char *pend = p + plen; int32_t count = 1; while ( p < pend ) { // sequence of punct for ( ; p < pend && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) { // in case being set from xml tags, count as words now if ( *p == '<' ) { count++; } } count++; // sequence of alnum for ( ; p < pend && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) ; count++; }; // some extra for good meaure return count+10; }
// does word qualify as a subtitle delimeter? bool isWordQualified ( char *wp , int32_t wlen ) { // must be punct word if ( is_alnum_utf8( wp ) ) { return false; } // scan the chars int32_t x; for ( x = 0; x < wlen; x++ ) { if ( wp[x] == ' ' ) { continue; } break; } // does it qualify as a subtitle delimeter? bool qualified = false; if ( x < wlen ) { qualified = true; } // fix amazon.com from splitting on period if ( wlen == 1 ) { qualified = false; } return qualified; }
static int32_t countWords ( const char *p ) { int32_t count = 1; while ( *p ) { // sequence of punct for ( ; *p && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) { // in case being set from xml tags, count as words now if ( *p=='<') count++; } count++; // sequence of alnum for ( ; *p && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) ; count++; } // some extra for good meaure return count+10; }
// . copy just words in [t0,t1) // . returns false on error and sets g_errno bool Title::copyTitle(Words *w, int32_t t0, int32_t t1) { // skip initial punct const char *const *wp = w->getWords(); const int32_t *wlens = w->getWordLens(); int32_t nw = w->getNumWords(); // sanity check if ( t1 < t0 ) { char *xx = NULL; *xx = 0; } // don't breech number of words if ( t1 > nw ) { t1 = nw; } // no title? if ( nw == 0 || t0 == t1 ) { reset(); return true; } const char *end = wp[t1-1] + wlens[t1-1] ; // allocate title int32_t need = end - wp[t0]; // add 3 bytes for "..." and 1 for \0 need += 5; // return false if could not hold the title if ( need > MAX_TITLE_LEN ) { m_title[0] = '\0'; m_titleLen = 0; log("query: Could not alloc %" PRId32" bytes for title.",need); return false; } // point to the title to transcribe const char *src = wp[t0]; const char *srcEnd = end; // include a \" or \' if ( t0 > 0 && ( src[-1] == '\'' || src[-1] == '\"' ) ) { src--; } // and remove terminating | or : for ( ; srcEnd > src && (srcEnd[-1] == ':' || srcEnd[-1] == ' ' || srcEnd[-1] == '-' || srcEnd[-1] == '\n' || srcEnd[-1] == '\r' || srcEnd[-1] == '|' ) ; srcEnd-- ); // store in here char *dst = m_title; // leave room for "...\0" char *dstEnd = m_title + need - 4; // size of character in bytes, usually 1 char cs ; // point to last punct char char *lastp = dst;//NULL; int32_t charCount = 0; // copy the node @p into "dst" for ( ; src < srcEnd ; src += cs , dst += cs ) { // get src size cs = getUtf8CharSize ( src ); // break if we are full! if ( dst + cs >= dstEnd ) { break; } // or hit our max char limit if ( charCount++ >= m_maxTitleLen ) { break; } // skip unwanted character if (isUtf8UnwantedSymbols(src)) { dst -= cs; continue; } // remember last punct for cutting purposes if ( ! is_alnum_utf8 ( src ) ) { lastp = dst; } // encode it as an html entity if asked to if ( *src == '<' ) { if ( dst + 4 >= dstEnd ) { break; } gbmemcpy ( dst , "<" , 4 ); dst += 4 - cs; continue; } // encode it as an html entity if asked to if ( *src == '>' ) { if ( dst + 4 >= dstEnd ) { break; } gbmemcpy ( dst , ">" , 4 ); dst += 4 - cs; continue; } // if more than 1 byte in char, use gbmemcpy if ( cs == 1 ) { *dst = *src; } else { gbmemcpy ( dst , src , cs ); } } // null term always *dst = '\0'; // do not split a word in the middle! if ( src < srcEnd ) { if ( lastp ) { gbmemcpy ( lastp , "...\0" , 4 ); dst = lastp + 3; } else { gbmemcpy ( dst , "...\0" , 4 ); dst += 3; } } // set size. does not include the terminating \0 m_titleLen = dst - m_title; return true; }
bool Words::set2 ( Xml *xml, bool computeWordIds , long niceness) { reset(); m_xml = xml; m_version = xml->getVersion(); m_version = xml->getVersion(); register char *p = (char *)xml->getContent(); if ( *p ) p++; register long x = 0; ploop: //if ( is_alnum(*(p-1)) ^ is_alnum(*p) ) x++; //if ( is_alnum(*p ) ) x++; //x += g_map_is_alpha[*p] ; if ( is_alnum_utf8(p) ) x++; //if ( isalnum(*p) ) x++; //if ( g_map_is_alpha[*p] ) x++; //x++; p++; if ( *p ) goto ploop; m_preCount = x; m_preCount = xml->getContentLen() / 2; //if ( m_preCount > 9000 ) m_preCount = 9000; //m_preCount = 9000; if (!allocateWordBuffers(m_preCount, true)) return false; long numNodes = xml->getNumNodes(); // are we done? for ( long k = 0 ; k < numNodes && m_numWords < m_preCount ; k++ ) { // get the kth node char *node = xml->getNode (k); long nodeLen = xml->getNodeLen(k); // is the kth node a tag? if ( xml->isTag(k) ) { m_words [m_numWords] = node; m_wordLens [m_numWords] = nodeLen; m_tagIds [m_numWords] = xml->getNodeId(k); m_wordIds [m_numWords] = 0LL; m_nodes [m_numWords] = k; // we have less than 127 HTML tags, so set // the high bit for back tags if ( xml->isBackTag(k)) { m_tagIds[m_numWords] |= BACKBIT; } //log(LOG_DEBUG, "Words: Word %ld: got tag %s%s (%d)", // m_numWords, // isBackTag(m_numWords)?"/":"", // g_nodes[getTagId(m_numWords)].m_nodeName, // getTagId(m_numWords)); m_numWords++; // used by XmlDoc.cpp m_numTags++; continue; } // otherwise it's a text node char c = node[nodeLen]; node[nodeLen] = '\0'; addWords(node, nodeLen,computeWordIds, niceness); node[nodeLen] = c; } return true; }
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) { long i = 0; long j; //long k = 0; long wlen; //unsigned long e; //long skip; long badCount = 0; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) goto done; if ( ! s[i] ) goto done; if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) { if ( m_numWords >= m_preCount ) goto done; // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if ( s[i+1]=='/' ) { // skip over / m_tagIds [m_numWords] = ::getTagId(s+i+2); m_tagIds [m_numWords] |= BACKBIT; } else m_tagIds [m_numWords] = ::getTagId(s+i+1); // word start m_words [m_numWords] = s + i; m_wordIds [m_numWords] = 0LL; // skip till end long tagLen = getTagLen(s+i); // ,niceness); m_wordLens [m_numWords] = tagLen; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; //for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i)); for ( ; s[i] ; i += getUtf8CharSize(s+i)){ // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) break; // breathe QUICKPOLL(niceness); // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) continue; // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; goto uptop; } // get an alnum word j = i; again: //for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) ); for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // breathe QUICKPOLL(niceness); // simple ascii? if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORCHAR is set, that means its an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like char without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanes chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // allow for words like we're dave's and i'm if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){ i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; // . Lars says it's better to leave the accented chars intact // . google agrees // . but what about "re'sume"? if ( computeWordIds ) { long long h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; // until we get an accent removal algo, comment this // out and possibly use the query synonym pipeline // to search without accents. MDW //long long h2 = hash64AsciiLowerE(&s[j],wlen); //if ( h2 != h ) m_stripWordIds [m_numWords] = h2; //else m_stripWordIds [m_numWords] = 0LL; //m_stripWordIds[m_numWords] = 0; } if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // break on \0 or MAX_WORDS //if ( ! s[i] ) goto done; // get a punct word goto uptop; /* j = i; // delineate the "punctuation" word for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i)); // bad utf8 could cause us to breach the node, so watch out! if ( i > nodeLen ) { badCount++; i = nodeLen; } // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [m_numWords ] = &s[j]; m_wordLens [m_numWords ] = wlen; m_wordIds [m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; */ done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); char *xx = NULL; *xx = 0; } // compute total length if ( m_numWords <= 0 ) m_totalLen = 0; else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1]; if ( badCount ) log("words: had %li bad utf8 chars",badCount); return true; }
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) { int32_t i = 0; int32_t j; int32_t wlen; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) { goto done; } if ( ! s[i] ) { goto done; } if ( !is_alnum_utf8( s + i ) ) { if ( m_numWords >= m_preCount ) { goto done; } // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if( m_tagIds ) { if ( s[i + 1] == '/' ) { // skip over / m_tagIds[m_numWords] = ::getTagId( s + i + 2 ); m_tagIds[m_numWords] |= BACKBIT; } else { m_tagIds[m_numWords] = ::getTagId( s + i + 1 ); } } m_words[m_numWords] = s + i; m_wordIds[m_numWords] = 0LL; // skip till end int32_t tagLen = getTagLen( s + i ); m_wordLens[m_numWords] = tagLen; m_nodes[m_numWords] = 0; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; for ( ; s[i] ; i += getUtf8CharSize(s+i)) { // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) { break; } // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) { continue; } // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) { continue; } // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; m_nodes [ m_numWords ] = 0; if (m_tagIds) { m_tagIds[m_numWords] = 0; } m_numWords++; goto uptop; } // get an alnum word j = i; again: for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // simple ascii? if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORCHAR is set, that means its an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like char without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanes chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // comma is ok if like ,ddd!d if ( s[i]==',' && i-j <= 3 && is_digit(s[i-1]) ) { // if word so far is 2 or 3 chars, make sure digits if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo; if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo; // scan forward while ( s[i] == ',' && is_digit(s[i+1]) && is_digit(s[i+2]) && is_digit(s[i+3]) && ! is_digit(s[i+4]) ) { i += 4; } } // decimal point? if ( s[i] == '.' && is_digit(s[i-1]) && is_digit(s[i+1]) ) { // allow the decimal point i++; // skip over string of digits while ( is_digit(s[i]) ) i++; } nogo: // allow for words like we're dave's and i'm if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) { i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; if ( computeWordIds ) { int64_t h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; } m_nodes[m_numWords] = 0; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // get a punct word goto uptop; done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); gbshutdownLogicError(); } return true; }