// a quickie // this url gives a m_preCount that is too low. why? // http://go.tfol.com/163/speed.asp long countWords ( char *p , long plen , long niceness ) { char *pend = p + plen; long count = 1; loop: // sequence of punct for ( ; p < pend && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) { // breathe QUICKPOLL ( niceness ); // in case being set from xml tags, count as words now if ( *p=='<') count++; } count++; // sequence of alnum for ( ; p < pend && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) // breathe QUICKPOLL ( niceness ); count++; if ( p < pend ) goto loop; // some extra for good meaure return count+10; }
// a quickie // this url gives a m_preCount that is too low. why? // http://go.tfol.com/163/speed.asp static int32_t countWords ( const char *p , int32_t plen ) { const char *pend = p + plen; int32_t count = 1; while ( p < pend ) { // sequence of punct for ( ; p < pend && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) { // in case being set from xml tags, count as words now if ( *p == '<' ) { count++; } } count++; // sequence of alnum for ( ; p < pend && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) ; count++; }; // some extra for good meaure return count+10; }
// . returns true if [txt, txt+tlen) is made only of ascii bytes and
//   well-formed multi-byte utf8 sequences, false otherwise
// . a NULL or empty buffer is considered valid
// . a multi-byte char that is cut off by the end of the buffer makes the
//   text invalid; its missing bytes are never read
bool verifyUtf8 ( const char *txt , int32_t tlen ) {
	if ( ! txt || tlen <= 0 ) return true;
	int32_t size;
	const char *p = txt;
	const char *pend = txt + tlen;
	for ( ; p < pend ; p += size ) {
		size = getUtf8CharSize(p);
		// a non-positive size would make us loop forever or walk
		// backwards, so treat it as invalid
		if ( size < 1 ) return false;
		// skip if ascii
		if ( ! (p[0] & 0x80) ) continue;
		// ok, it's a utf8 char, it must have both hi bits set
		if ( (p[0] & 0xc0) != 0xc0 ) return false;
		// if only one byte, we are done.. how can that be?
		if ( size == 1 ) return false;
		// the whole sequence must lie inside the buffer, otherwise
		// the continuation-byte checks below would read past "pend"
		if ( size > pend - p ) return false;
		// successive utf8 chars must have & 0xc0 be equal to 0x80
		// but the first char it must equal 0xc0, both set
		if ( (p[1] & 0xc0) != 0x80 ) return false;
		if ( size == 2 ) continue;
		if ( (p[2] & 0xc0) != 0x80 ) return false;
		if ( size == 3 ) continue;
		if ( (p[3] & 0xc0) != 0x80 ) return false;
	}
	// we must land exactly on the end of the buffer
	if ( p != pend ) return false;
	return true;
}
int32_t stripAccentMarks (char *outbuf, int32_t outbufsize, unsigned char *p, int32_t inbuflen) { char *s = (char *)p; char *send = (char *)p + inbuflen; int32_t cs; char *dst = outbuf; for ( ; s < send ; s += cs ) { // how big is this character? cs = getUtf8CharSize(s); // convert the utf8 character to UChar32 UChar32 uc = utf8Decode ( s ); // break "uc" into decomposition of UChar32s UChar32 ttt[32]; int32_t klen = recursiveKDExpand(uc,ttt,32); if(klen>32){char *xx=NULL;*xx=0;} // sanity if ( dst + 5 > outbuf+outbufsize ) return -1; // if the same, leave it! it had no accent marks or other // modifiers... if ( klen <= 1 ) { gbmemcpy ( dst , s , cs ); dst += cs; continue; } // take the first one as the stripped // convert back to utf8 int32_t stored = utf8Encode ( ttt[0] , dst ); // skip over the stored utf8 char dst += stored; } // sanity. breach check if ( dst > outbuf+outbufsize ) { char *xx=NULL;*xx=0; } // return # of bytes stored into outbuf return dst - outbuf; }
long utf8ToAscii(char *outbuf, long outbufsize, unsigned char *p, long inbuflen) { // inbuf char *dst = outbuf; unsigned char *pend = p + inbuflen; char *dend = outbuf + outbufsize; char cs; for ( ; p < pend ; p += cs ) { // do not breach if ( dst >= dend ) break; // get the size cs = getUtf8CharSize(p); // deal with one ascii char quickly if ( cs == 1 ) { *dst++ = *p; continue; } // we do not know how to convert this! if ( cs != 2 ) return -1; // standard crap char *table ; if ( *p == 0xc3 ) table = ascii_c3; else if ( *p == 0xc4 ) table = ascii_c4; else if ( *p == 0xc5 ) table = ascii_c5; else if ( *p == 0xc6 ) table = ascii_c6; else return -1; if ( p[1] < 0x80 ) return -1; if ( p[1] > 0xbf ) return -1; *dst++ = table[p[1]-0x80]; } return dst - outbuf; }
// . returns true if any character in the utf8 text [s, send) is alphabetic
// . one-byte chars are tested with is_alpha_a(), all others with
//   is_alpha_utf8()
bool has_alpha_utf8 ( char *s , char *send ) {
	char *cur = s;
	while ( cur < send ) {
		char width = getUtf8CharSize ( cur );
		if ( width == 1 ) {
			// plain ascii byte
			if ( is_alpha_a ( *cur ) ) return true;
		}
		else {
			// multi-byte character
			if ( is_alpha_utf8 ( cur ) ) return true;
		}
		cur += width;
	}
	return false;
}
static int32_t countWords ( const char *p ) { int32_t count = 1; while ( *p ) { // sequence of punct for ( ; *p && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) { // in case being set from xml tags, count as words now if ( *p=='<') count++; } count++; // sequence of alnum for ( ; *p && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) ; count++; } // some extra for good meaure return count+10; }
// return false and set g_errno on error bool Synonyms::addStripped ( char *w , long wlen , HashTableX *dt ) { // avoid overflow if ( wlen > 200 ) return true; // require utf8 bool hadUtf8 = false; char size; for ( long i = 0 ; i < wlen ; i += size ) { size = getUtf8CharSize(w+i); if ( size == 1 ) continue; hadUtf8 = true; break; } if ( ! hadUtf8 ) return true; // filter out accent marks char abuf[256]; //long alen = utf8ToAscii(abuf,256,(unsigned char *)w,wlen); long alen = stripAccentMarks(abuf,256,(unsigned char *)w,wlen); // skip if can't convert to ascii... (unsupported letter) if ( alen < 0 ) return true; // if same as original word, skip if ( wlen==alen && strncmp(abuf,w,wlen) == 0 ) return true; // hash it uint64_t h2 = hash64Lower_utf8(abuf,alen); // do not add dups if ( dt->isInTable ( &h2 ) ) return true; // add to dedup table. return false with g_errno set if ( ! dt->addKey ( &h2 ) ) return false; // store that *m_aidsPtr++ = h2; *m_wids0Ptr++ = 0LL; *m_wids1Ptr++ = 0LL; *m_termPtrsPtr++ = NULL; *m_termOffsPtr++ = m_synWordBuf.length(); *m_termLensPtr++ = alen; *m_numAlnumWordsPtr++ = 1; *m_numAlnumWordsInBasePtr++ = 1; *m_srcPtr++ = SOURCE_GENERATED; m_synWordBuf.safeStrcpy(abuf); m_synWordBuf.pushChar('\0'); return true; }
unsigned char getCharacterLanguage ( char *utf8Char ) { // romantic? char cs = getUtf8CharSize ( utf8Char ); // can't say what language it is if ( cs == 1 ) return langUnknown; // convert to 32 bit unicode UChar32 c = utf8Decode ( utf8Char ); UCScript us = ucGetScript ( c ); // arabic? this also returns for persian!! fix? if ( us == ucScriptArabic ) return langArabic; if ( us == ucScriptCyrillic ) return langRussian; if ( us == ucScriptHebrew ) return langHebrew; if ( us == ucScriptGreek ) return langGreek; return langUnknown; }
// . copy just words in [t0,t1) // . returns false on error and sets g_errno bool Title::copyTitle(Words *w, int32_t t0, int32_t t1) { // skip initial punct const char *const *wp = w->getWords(); const int32_t *wlens = w->getWordLens(); int32_t nw = w->getNumWords(); // sanity check if ( t1 < t0 ) { char *xx = NULL; *xx = 0; } // don't breech number of words if ( t1 > nw ) { t1 = nw; } // no title? if ( nw == 0 || t0 == t1 ) { reset(); return true; } const char *end = wp[t1-1] + wlens[t1-1] ; // allocate title int32_t need = end - wp[t0]; // add 3 bytes for "..." and 1 for \0 need += 5; // return false if could not hold the title if ( need > MAX_TITLE_LEN ) { m_title[0] = '\0'; m_titleLen = 0; log("query: Could not alloc %" PRId32" bytes for title.",need); return false; } // point to the title to transcribe const char *src = wp[t0]; const char *srcEnd = end; // include a \" or \' if ( t0 > 0 && ( src[-1] == '\'' || src[-1] == '\"' ) ) { src--; } // and remove terminating | or : for ( ; srcEnd > src && (srcEnd[-1] == ':' || srcEnd[-1] == ' ' || srcEnd[-1] == '-' || srcEnd[-1] == '\n' || srcEnd[-1] == '\r' || srcEnd[-1] == '|' ) ; srcEnd-- ); // store in here char *dst = m_title; // leave room for "...\0" char *dstEnd = m_title + need - 4; // size of character in bytes, usually 1 char cs ; // point to last punct char char *lastp = dst;//NULL; int32_t charCount = 0; // copy the node @p into "dst" for ( ; src < srcEnd ; src += cs , dst += cs ) { // get src size cs = getUtf8CharSize ( src ); // break if we are full! if ( dst + cs >= dstEnd ) { break; } // or hit our max char limit if ( charCount++ >= m_maxTitleLen ) { break; } // skip unwanted character if (isUtf8UnwantedSymbols(src)) { dst -= cs; continue; } // remember last punct for cutting purposes if ( ! 
is_alnum_utf8 ( src ) ) { lastp = dst; } // encode it as an html entity if asked to if ( *src == '<' ) { if ( dst + 4 >= dstEnd ) { break; } gbmemcpy ( dst , "<" , 4 ); dst += 4 - cs; continue; } // encode it as an html entity if asked to if ( *src == '>' ) { if ( dst + 4 >= dstEnd ) { break; } gbmemcpy ( dst , ">" , 4 ); dst += 4 - cs; continue; } // if more than 1 byte in char, use gbmemcpy if ( cs == 1 ) { *dst = *src; } else { gbmemcpy ( dst , src , cs ); } } // null term always *dst = '\0'; // do not split a word in the middle! if ( src < srcEnd ) { if ( lastp ) { gbmemcpy ( lastp , "...\0" , 4 ); dst = lastp + 3; } else { gbmemcpy ( dst , "...\0" , 4 ); dst += 3; } } // set size. does not include the terminating \0 m_titleLen = dst - m_title; return true; }
bool Log::logR ( int64_t now, int32_t type, const char *msg, bool forced ) { if ( ! g_loggingEnabled ) { return true; } // return true if we should not log this if ( ! forced && ! shouldLog ( type , msg ) ) { return true; } // get "msg"'s length int32_t msgLen = strlen ( msg ); ScopedLock sl(s_lock); // do a timestamp, too. use the time synced with host #0 because // it is easier to debug because all log timestamps are in sync. if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore(); // . skip all logging if power out, we do not want to screw things up // . allow logging for 10 seconds after power out though if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){ return false; } // chop off any spaces at the end of the msg. while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--; // a tmp buffer char tt [ MAX_LINE_LEN ]; char *p = tt; if (m_logPrefix) { if ( m_logTimestamps ) { if( m_logReadableTimestamps ) { time_t now_t = (time_t)(now / 1000); struct tm tm_buf; struct tm *stm = localtime_r(&now_t,&tm_buf); p += sprintf ( p , "%04d%02d%02d-%02d%02d%02d-%03d %04" PRId32" ", stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec,(int)(now%1000), g_hostdb.m_hostId ); } else { if ( g_hostdb.getNumHosts() <= 999 ) p += sprintf ( p , "%" PRIu64 " %03" PRId32 " ", (uint64_t)now , g_hostdb.m_hostId ); else if ( g_hostdb.getNumHosts() <= 9999 ) p += sprintf ( p , "%" PRIu64" %04" PRId32" ", (uint64_t)now , g_hostdb.m_hostId ); else if ( g_hostdb.getNumHosts() <= 99999 ) p += sprintf ( p , "%" PRIu64" %05" PRId32" ", (uint64_t)now , g_hostdb.m_hostId ); } } // Get thread id. pthread_self instead? 
unsigned tid=(unsigned)syscall(SYS_gettid); p += sprintf(p, "%06u ", tid); // Log level p += sprintf(p, "%s ", getTypeString(type)); } // then message itself const char *x = msg; int32_t avail = (MAX_LINE_LEN) - (p - tt) - 1; if ( msgLen > avail ) msgLen = avail; if ( *x == ':' ) x++; if ( *x == ' ' ) x++; strncpy ( p , x , avail ); // capitalize for consistency. no, makes grepping log msgs harder. //if ( is_alpha_a(*p) ) *p = to_upper_a(*p); p += strlen(p); // back up over spaces while ( p[-1] == ' ' ) p--; // end in period or ? or ! //if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' ) // *p++ = '.'; *p ='\0'; // the total length, not including the \0 int32_t tlen = p - tt; // . filter out nasty chars from the message // . replace with ~'s char cs; char *ttp = tt; char *ttpend = tt + tlen; for ( ; ttp < ttpend ; ttp += cs ) { cs = getUtf8CharSize ( ttp ); if ( is_binary_utf8 ( ttp ) ) { for ( int32_t k = 0 ; k < cs ; k++ ) *ttp++ = '.'; // careful not to skip the already skipped bytes cs = 0; continue; } } // . if filesize would be too big then make a new log file // . should make a new m_fd if ( m_logFileSize + tlen+1 > MAXLOGFILESIZE && g_conf.m_logToFile ) makeNewLogFile(); if ( m_fd >= 0 ) { write ( m_fd , tt , tlen ); write ( m_fd , "\n", 1 ); m_logFileSize += tlen + 1; } else { // print it out for now fprintf ( stderr, "%s\n", tt ); } return false; }
// . splits the NUL-terminated text "s" into alternating punctuation, tag
//   and alnum words, appending them to m_words/m_wordLens/m_wordIds
//   (and m_tagIds) starting at slot m_numWords
// . scanning stops at the first \0, once i reaches nodeLen, or once
//   m_numWords reaches m_preCount
// . word ids are only hashed for alnum words, and only if computeWordIds
// . "niceness" is only handed to QUICKPOLL()
// . always returns true; crashes on purpose if m_numWords ends up larger
//   than m_preCount
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
	// i is the read offset into s, j the start of the current alnum word
	long  i = 0;
	long  j;
	//long  k = 0;
	long  wlen;
	//unsigned long e;
	//long  skip;
	// never incremented in this function, so the log at the end is dead
	long badCount = 0;

	// true while we are past the single apostrophe allowed in a word
	bool hadApostrophe = false;

	// script of the last word char seen, used to split on script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) goto done;

	if ( ! s[i] ) goto done;

	// ----- non-alnum char: emit either a tag word or a punct word -----
	if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) {

		if ( m_numWords >= m_preCount ) goto done;

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			// NOTE(review): m_tagIds is written here without the NULL
			// check used further down -- confirm it is always
			// allocated when m_hasTags is set
			if ( s[i+1]=='/' ) {
				// skip over /
				m_tagIds [m_numWords] = ::getTagId(s+i+2);
				m_tagIds [m_numWords] |= BACKBIT;
			}
			else
				m_tagIds [m_numWords] = ::getTagId(s+i+1);
			// word start
			m_words    [m_numWords] = s + i;
			m_wordIds  [m_numWords] = 0LL;
			// skip till end
			long tagLen = getTagLen(s+i); // ,niceness);
			m_wordLens [m_numWords] = tagLen;
			m_numWords++;
			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		//for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i));
		for ( ; s[i] ; i += getUtf8CharSize(s+i)){
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) break;
			// breathe
			QUICKPOLL(niceness);
			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) continue;
				// update
				oldScript = ucScriptCommon;
				// otherwise, stop we got alnum
				break;
			}
			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );
			// stop if word char
			if ( ! ucIsWordChar ( c ) ) continue;
			// update first though
			oldScript = ucGetScript ( c );
			// then stop
			break;
		}
		// record the punct word; it has no word id and no tag id
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
		goto uptop;
	}

	// ----- alnum char: scan to the end of the alnum word -----
	// get an alnum word
	j = i;
 again:
	//for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) );
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// breathe
		QUICKPOLL(niceness);
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// good stuff? ignorable/extending chars stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// stop on this crap too i guess. like japanese chars?
		// (the char itself is included, so it ends the current word)
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}

	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// allow for words like we're dave's and i'm
	// (only one apostrophe per word: the flag blocks a second one)
	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;

	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;
	// . Lars says it's better to leave the accented chars intact
	// . google agrees
	// . but what about "re'sume"?
	if ( computeWordIds ) {
		long long h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
		// until we get an accent removal algo, comment this
		// out and possibly use the query synonym pipeline
		// to search without accents. MDW
		//long long h2 = hash64AsciiLowerE(&s[j],wlen);
		//if ( h2 != h ) m_stripWordIds [m_numWords] = h2;
		//else           m_stripWordIds [m_numWords] = 0LL;
		//m_stripWordIds[m_numWords] = 0;
	}
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// break on \0 or MAX_WORDS
	//if ( ! s[i] ) goto done;
	// get a punct word
	goto uptop;
	/*
	  j = i;
	  // delineate the "punctuation" word
	  for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i));
	  // bad utf8 could cause us to breach the node, so watch out!
	  if ( i > nodeLen ) {
	  badCount++;
	  i = nodeLen;
	  }
	  // get word length
	  wlen = i - j;
	  if ( m_numWords >= m_preCount ) goto done;
	  m_words        [m_numWords  ] = &s[j];
	  m_wordLens     [m_numWords  ] = wlen;
	  m_wordIds      [m_numWords  ] = 0LL;
	  if (m_tagIds) m_tagIds[m_numWords] = 0;
	  m_numWords++;
	*/

 done:
	// bad programming warning
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC,
		    "build: words: set: Fix counting routine.");
		char *xx = NULL; *xx = 0;
	}
	// compute total length: from s to the end of the last word stored
	if ( m_numWords <= 0 ) m_totalLen = 0;
	else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1];

	if ( badCount ) log("words: had %li bad utf8 chars",badCount);

	return true;
}
bool Words::set ( Xml *xml, bool computeWordIds , long niceness , long node1 , long node2 ) { // prevent setting with the same string if ( m_xml == xml ) { char *xx=NULL;*xx=0; } reset(); m_xml = xml; m_version = xml->getVersion(); //m_version = xml->getVersion(); // quick test if ( ! s_tested ) { // only do once s_tested = true; // set c to a curling quote in unicode long c = 0x201c; // 0x235e; // encode it into utf8 char dst[5]; // point to it char *p = dst; // put space in there *p++ = ' '; // "numBytes" is how many bytes it stored into 'dst" long numBytes = utf8Encode ( c , p ); // must be 2 bytes i guess if ( numBytes != 3 ) { char *xx=NULL; *xx=0; } // check it long size = getUtf8CharSize(p); if ( size != 3 ) { char *xx=NULL; *xx=0; } // is that punct if ( ! is_punct_utf8 ( p ) ) { char *xx=NULL;*xx=0; } // make sure can pair across //unsigned char bits = getPunctuationBits ( dst , 4 ); // must be able to pair across //if ( ! ( bits & D_CAN_PAIR_ACROSS ) ) { char *xx=NULL;*xx=0;} } // if xml is empty, bail if ( ! xml->getContent() ) return true; long numNodes = xml->getNumNodes(); if ( numNodes <= 0 ) return true; // . can be given a range, if node2 is -1 that means all! // . range is half-open: [node1, node2) if ( node2 < 0 ) node2 = numNodes; // sanity check if ( node1 > node2 ) { char *xx=NULL;*xx=0; } char *start = xml->getNode(node1); char *end = xml->getNode(node2-1) + xml->getNodeLen(node2-1); long size = end - start; m_preCount = countWords( start , size , niceness ); // allocate based on the approximate count if ( ! allocateWordBuffers(m_preCount, true)) return false; // are we done? for ( long k = node1 ; k < node2 && m_numWords < m_preCount ; k++ ){ // get the kth node char *node = xml->getNode (k); long nodeLen = xml->getNodeLen(k); // is the kth node a tag? if ( ! 
xml->isTag(k) ) { char c = node[nodeLen]; node[nodeLen] = '\0'; addWords(node,nodeLen,computeWordIds,niceness); node[nodeLen] = c; continue; } // it is a tag m_words [m_numWords] = node; m_wordLens [m_numWords] = nodeLen; m_tagIds [m_numWords] = xml->getNodeId(k); m_wordIds [m_numWords] = 0LL; m_nodes [m_numWords] = k; // we have less than 127 HTML tags, so set // the high bit for back tags if ( xml->isBackTag(k)) { m_tagIds[m_numWords] |= BACKBIT; } //log(LOG_DEBUG, "Words: Word %ld: got tag %s%s (%d)", // m_numWords, // isBackTag(m_numWords)?"/":"", // g_nodes[getTagId(m_numWords)].m_nodeName, // getTagId(m_numWords)); m_numWords++; // used by XmlDoc.cpp m_numTags++; continue; } return true; }
// . splits the NUL-terminated text "s" into alternating punctuation, tag
//   and alnum words, appending them to m_words/m_wordLens/m_wordIds/
//   m_nodes (and m_tagIds if allocated) starting at slot m_numWords
// . scanning stops at the first \0, once i reaches nodeLen, or once
//   m_numWords reaches m_preCount
// . word ids are only hashed for alnum words, and only if computeWordIds
// . always returns true; calls gbshutdownLogicError() if m_numWords ends
//   up larger than m_preCount
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
	// i is the read offset into s, j the start of the current alnum word
	int32_t  i = 0;
	int32_t  j;
	int32_t  wlen;

	// true while we are past the single apostrophe allowed in a word
	bool hadApostrophe = false;

	// script of the last word char seen, used to split on script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) {
		goto done;
	}

	if ( ! s[i] ) {
		goto done;
	}

	// ----- non-alnum char: emit either a tag word or a punct word -----
	if ( !is_alnum_utf8( s + i ) ) {
		if ( m_numWords >= m_preCount ) {
			goto done;
		}

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if( m_tagIds ) {
				if ( s[i + 1] == '/' ) {
					// skip over /
					m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
					m_tagIds[m_numWords] |= BACKBIT;
				} else {
					m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
				}
			}

			m_words[m_numWords] = s + i;
			m_wordIds[m_numWords] = 0LL;

			// skip till end
			int32_t tagLen = getTagLen( s + i );
			m_wordLens[m_numWords] = tagLen;
			m_nodes[m_numWords] = 0;
			m_numWords++;

			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) {
				break;
			}

			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) {
					continue;
				}

				// update
				oldScript = ucScriptCommon;

				// otherwise, stop we got alnum
				break;
			}

			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );

			// stop if word char
			if ( ! ucIsWordChar ( c ) ) {
				continue;
			}

			// update first though
			oldScript = ucGetScript ( c );

			// then stop
			break;
		}
		// record the punct word; it has no word id and no tag id
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		m_nodes        [ m_numWords  ] = 0;

		if (m_tagIds) {
			m_tagIds[m_numWords] = 0;
		}

		m_numWords++;
		goto uptop;
	}

	// ----- alnum char: scan to the end of the alnum word -----
	// get an alnum word
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// good stuff? ignorable/extending chars stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// stop on this crap too i guess. like japanese chars?
		// (the char itself is included, so it ends the current word)
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}

	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// comma is ok if like ,ddd!d
	// (the word so far must be 1 to 3 digits, then each ",ddd" group
	// that is not followed by a fourth digit is pulled into the word)
	if ( s[i]==',' &&
	     i-j <= 3 &&
	     is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward
		while ( s[i] == ',' &&
			is_digit(s[i+1]) &&
			is_digit(s[i+2]) &&
			is_digit(s[i+3]) &&
			! is_digit(s[i+4]) ) {
			i += 4;
		}
	}

	// decimal point? only between two digits
	if ( s[i] == '.' &&
	     is_digit(s[i-1]) &&
	     is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}

 nogo:

	// allow for words like we're dave's and i'm
	// (only one apostrophe per word: the flag blocks a second one)
	if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;

	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;

	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}

	m_nodes[m_numWords] = 0;
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		gbshutdownLogicError();
	}

	return true;
}
// . return the score of the highest-scoring window containing match #mm
// . window is defined by the half-open interval [a,b) where a and b are
//   word #'s in the Words array indicated by match #mm
// . on return *besta/*bestb hold the window and *lasta holds its left
//   edge; if there is no usable window *besta and *bestb are -1, *lasta
//   is the match's word # and 0 is returned
// . gotIt[] is indexed by query word # and is incremented (up to 100) for
//   every query word matched inside the window
// . retired[] is indexed by query word # and is only read: a positive
//   entry lowers the score of that query word
// . no code path in this function returns -1 or sets g_errno
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta, int32_t *besta, int32_t *bestb, char *gotIt, char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . no window if the match word was already used in another excerpt
	//   (D_USED) or sits in a script/style/select/title section
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxExcerptLen
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone",
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P or DIV tag
		if ( tids && ( tids[a-1] == TAG_LI ||
		               tids[a-1] == TAG_TR ||
		               tids[a-1] == TAG_P  ||
		               tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) &&
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart    = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;

		// do not break right after a "strong connector", like
		// apostrophe
		while ( a < matchWordNum && a > 0 &&
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;

		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	// grow the window to the right
	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}

		// stop once the excerpt would be too long
		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}

		// remember the last word starting with a quote, but only if
		// the window itself started on a quote
		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
		               tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P)  ||
		               tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// remove more than one punct words. if we're ending on a quote
		// keep it
		// NOTE(review): wids[b-2] and bb[b-2] index -1 when b == 1 --
		// confirm that can not happen here
		while ( b > matchWordNum && !wids[b-2] && endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector", like apostrophe
		while ( b > matchWordNum && (bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as long as its word # is >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// for debug
	//char buf[5000];
	//char *xp = buf;
	SafeBuf xp;

	// wtf?
	if ( b > nw ) {
		b = nw;
	}

	// score every word in [a,b)
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out: append the word's non-binary chars to xp
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section: script, style, select or title
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t  wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&  wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		if ( qwn < 0 || qwn >= m_q->m_numWords ){g_process.shutdownAbort(true);}

		// undo old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// scale by this query word's weight in m_wordWeights
		// (per the original comment: based on tf, from 0.1 to 1.0)
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, it is worth nothing
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same window,
				// it may not give a good summary. give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",t,qwn,
				       m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	// score before the bonuses/penalties below, only used for the debug
	// log (note it is narrowed from 64 to 32 bits)
	int32_t oldScore = score;

	// apply the bonus if it starts a sentence or a fragment
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much, so let's make
		// this an 8k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma, gets 4k
			score += 4000;
		}

		// 1k if the match word is very close to the start of the
		// window, i.e. fewer than 7 word positions from "a"
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if its less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 20k.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	// by 20k per tag crossed
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	// minus 8k for containing a url
	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32" a=%05" PRId32" b=%05" PRId32" %s",
		     (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		     xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}
// . set the filtered position of each word // . used by Summary.cpp to determine how many chars are in the summary, // be those chars single byte or utf8 chars that are 4 bytes // . returns false and sets g_errno on error // . if f is non-NULL store filtered words into there. back to back spaces // are eliminated. bool Pos::set ( Words *words , Sections *sections , char *f , char *fend, long *len , long a , long b , char *buf , long bufSize ) { // free m_buf in case this is a second call if ( ! f ) reset(); long nw = words->getNumWords(); long *wlens = words->m_wordLens; nodeid_t *tids = words->getTagIds(); // m_tagIds; char **wp = words->m_words; //long *ss = NULL; //long long *wids = words->m_wordIds; //if ( scores ) ss = scores->m_scores; // save start point for filtering char *fstart = f; // -1 is the default value if ( b == -1 ) b = nw; // alloc array if need to long need = (nw+1) * 4; // do not destroy m_pos/m_numWords if only filtering into a buffer if ( f ) goto skip; m_needsFree = false; m_buf = m_localBuf; if ( need > POS_LOCALBUFSIZE && need < bufSize ) m_buf = buf; else if ( need > POS_LOCALBUFSIZE ) { m_buf = (char *)mmalloc(need,"Pos"); m_needsFree = true; } // bail on error if ( ! m_buf ) return false; m_bufSize = need; m_pos = (long *)m_buf; m_numWords = nw; skip: // this is the CHARACTER count. long pos = 0; bool trunc = false; char *p , *pend; //char *nextp; //long skip; char* lastBreak = NULL; // utf8 char //long c; // its size in bytes //char cs; // shortcut //Section **sp = NULL; //if ( sections ) sp = sections->m_sectionPtrs; //long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE; // flag for stopping back-to-back spaces. only count those as one char. bool lastSpace = false; long maxCharSize = 4; // we are utf8 for ( long i = a ; i < b ; i++ ) { if (trunc) break; // set pos for the ith word to "pos" if ( ! f ) m_pos[i] = pos; // if inside a bad tag, skip it //if ( sp && (sp[i]->m_flags & badFlags) ) continue; // is tag? 
if ( tids && tids[i] ) { // if not breaking, does nothing if ( ! g_nodes[tids[i]&0x7f].m_isBreaking ) continue; // list tag? <li> if ( tids[i] == TAG_LI ) { if ( f ) { if ((fend - f > maxCharSize)) { *f++ = '*'; } else { trunc = true; } } pos++; lastSpace = false; continue; } // if had a previous breaking tag and no non-tag // word after it, do not count back-to-back spaces if ( lastSpace ) continue; // if had a br tag count it as a '.' if ( tids[i] ) { // == 20 ) { // <br> // are we filtering? if ( f && f != fstart ) { if ((fend-f>2*maxCharSize)) { *f++ = '.'; *f++ = ' '; } else trunc = true; } // count as double periods //pos += 3; // no, just single period. pos += 2; lastSpace = true; continue; } // are we filtering? if ( f ) { if ((fend-f > maxCharSize)) { *f++ = ' '; } else trunc = true; } // count as a single space pos++; // do not allow back-to-back spaces lastSpace = true; continue; } // scan through all chars discounting back-to-back spaces // assume filters out to the same # of chars p = wp[i] ; pend = p + wlens[i]; unsigned char cs = 0; for ( ; p < pend ; p += cs ) { // get size cs = getUtf8CharSize(p); // do not count space if one before if ( is_wspace_utf8 (p) ) { if ( lastSpace ) continue; lastSpace = true; // are we filtering? if ( f ) { if (fend-f > 1 ) { lastBreak = f; *f++ = ' '; } else trunc = true; } pos++; continue; } if ( f ) { if (fend-f > cs) { // change '|' to commas if ( *p == '|' ) *f++ = ','; else if ( cs == 1 ) *f++ = *p; else { memcpy(f,p,cs); f += cs; } } else trunc = true; } pos++; lastSpace = false; } } if (trunc) { if(lastBreak == NULL) { *len = 0; return false; } else if(f) f = lastBreak; } // set pos for the END of the last word here (used in Summary.cpp) if ( ! f ) m_pos[nw] = pos; // NULL terminate f else { *len = f - fstart; } if ( fend-f > maxCharSize) { *f = '\0'; } // Success return true; }
bool Log::logR ( long long now , long type , char *msg , bool asterisk , bool forced ) { // filter if we should //if ( forced ) goto skipfilter; // return true if we should not log this if ( ! forced && ! shouldLog ( type , msg ) ) return true; // skipfilter: // can we log if we're a sig handler? don't take changes if ( g_inSigHandler ) return logLater ( now , type , msg , NULL ); //if ( g_inSigHandler ) return false; // get "msg"'s length long msgLen = gbstrlen ( msg ); #ifdef PTHREADS // lock for threads pthread_mutex_lock ( &s_lock ); #endif // do a timestamp, too. use the time synced with host #0 because // it is easier to debug because all log timestamps are in sync. if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore(); // . skip all logging if power out, we do not want to screw things up // . allow logging for 10 seconds after power out though if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){ #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } //if ( now == 0 ) now = g_nowApprox; // chop off any spaces at the end of the msg. 
while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--; // get this pid pid_t pid = getpidtid(); // a tmp buffer char tt [ MAX_LINE_LEN ]; char *p = tt; char *pend = tt + MAX_LINE_LEN; /* // print timestamp, hostid, type if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); */ // print timestamp, hostid, type if ( m_logTimestamps ) { if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li ", now , g_hostdb.m_hostId ); p += gbstrlen ( p ); } // msg resource char *x = msg; long cc = 7; // the first 7 bytes or up to the : must be ascii //while ( p < pend && *x && is_alnum_a(*x) ) { *p++ = *x++; cc--; } // space pad //while ( cc-- > 0 ) *p++ = ' '; // ignore the label for now... while ( p < pend && *x && is_alnum_a(*x) ) { x++; cc--; } // thread id if in "thread" if ( pid != s_pid && s_pid != -1 ) { //sprintf ( p , "[%li] " , (long)getpid() ); sprintf ( p , "[%lu] " , (unsigned long)pid ); p += gbstrlen ( p ); } // then message itself long avail = (MAX_LINE_LEN) - (p - tt) - 1; if ( msgLen > avail ) msgLen = avail; if ( *x == ':' ) x++; if ( *x == ' ' ) x++; strncpy ( p , x , avail ); // capitalize for consistency. no, makes grepping log msgs harder. //if ( is_alpha_a(*p) ) *p = to_upper_a(*p); p += gbstrlen(p); // back up over spaces while ( p[-1] == ' ' ) p--; // end in period or ? or ! //if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' 
) // *p++ = '.'; *p ='\0'; // the total length, not including the \0 long tlen = p - tt; // call sprintf, but first make sure we have room in m_buf and in // the arrays. who know how much room the sprintf is going to need??? // NOTE: TODO: this is shaky -- fix it! if ( m_bufPtr + tlen >= 1024 * 32 || m_numErrors >= MAX_LOG_MSGS){ // this sets m_bufPtr to 0 if ( ! dumpLog ( ) ) { fprintf(stderr,"Log::log: could not dump to file!\n"); #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } } // . filter out nasty chars from the message // . replace with ~'s char cs; char *ttp = tt; char *ttpend = tt + tlen; for ( ; ttp < ttpend ; ttp += cs ) { cs = getUtf8CharSize ( ttp ); if ( is_binary_utf8 ( ttp ) ) { for ( long k = 0 ; k < cs ; k++ ) *ttp++ = '.'; // careful not to skip the already skipped bytes cs = 0; continue; } // convert \n's and \r's to spaces if ( *ttp == '\n' ) *ttp = ' '; if ( *ttp == '\r' ) *ttp = ' '; if ( *ttp == '\t' ) *ttp = ' '; } if ( m_fd >= 0 ) { write ( m_fd , tt , tlen ); write ( m_fd , "\n", 1 ); } else { // print it out for now fprintf ( stderr, "%s\n", tt ); } // set the stuff in the array m_errorMsg [m_numErrors] = msg; m_errorMsgLen [m_numErrors] = msgLen; m_errorTime [m_numErrors] = now; m_errorType [m_numErrors] = type; // increase the # of errors m_numErrors++; #ifdef PTHREADS // unlock for threads pthread_mutex_unlock ( &s_lock ); #endif return false; }
// . parse the \0 terminated "json" text into JsonItems stored back to back
//   in m_sb, each followed by its decoded value string and a \0
// . objects and arrays are pushed onto m_stack (up to MAXJSONPARENTS deep)
//   while their members are parsed
// . strings, numbers and true/false become leaf items named after the most
//   recently seen field name. true/false are stored as JT_NUMBER 1/0.
// . "niceness" is handed through to safeDecodeJSONToUtf8()
// . returns a pointer to the start of m_sb, or NULL if json is NULL, on
//   allocation failure, when nesting is too deep, or on malformed json (a
//   string with no field name, or a field name before any '{' or '[')
// . m_sb is reserved up front; if it reallocates anyway we crash on purpose
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {

	m_prev     = NULL;
	m_stackPtr = 0;
	m_sb.purge();

	JsonItem *ji = NULL;

	if ( ! json ) return NULL;

	// how much space will we need to avoid any reallocs?
	char   *p       = json;
	bool    inQuote = false;
	int32_t need    = 0;
	for ( ; *p ; p++ ) {
		// ignore any escaped char. also \x1234
		if ( *p == '\\' ) {
			if ( p[1] ) p++;
			continue;
		}
		if ( *p == '\"' ) inQuote = ! inQuote;
		if ( inQuote ) continue;
		if ( *p == '{' || *p == ',' || *p == '[' || *p == ':' )
			// +1 for null terminating string of each item
			need += sizeof(JsonItem) + 1;
	}
	// plus the length of the string to store it decoded etc.
	need += p - json;
	// plus a \0 for the value and a \0 for the name of each jsonitem
	need += 2;
	// prevent cores for now
	need += 10;
	// . to prevent safebuf from reallocating do this
	// . safeMemcpy() calls reserve(m_length+len) and reserves
	//   tries to alloc m_length + (m_length+len) so since,
	//   m_length+len should never be more than "need" we need to
	//   double up here
	need *= 2;
	// this should be enough
	if ( ! m_sb.reserve ( need ) ) return NULL;
	// for testing if we realloc
	char *mem = m_sb.getBufStart();

	// how many bytes to advance p by at the end of each iteration
	int32_t size;

	// the current field name, applied to the next value we see
	char   *NAME    = NULL;
	int32_t NAMELEN = 0;

	// reset p
	p = json;
	// json maybe bad utf8 causing us to miss the \0 char, so use "pend"
	char *pend = json + gbstrlen(json);

	// scan
	for ( ; p < pend ; p += size ) {
		// get size
		size = getUtf8CharSize ( p );

		// skip spaces
		if ( is_wspace_a (*p) ) continue;

		// skip commas
		if ( *p == ',' ) continue;

		// did we hit a '{'? that means the existing json item
		// is a parent of the item(s) inside the {}'s
		if ( *p == '{' ) {
			// . this indicates the start of a json object
			// . named after the current field, like in
			//   \"stats\":{\"fetchTime\":2069,....}
			ji = addNewItem();
			if ( ! ji ) return NULL;
			ji->m_type    = JT_OBJECT;
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			ji = NULL;
			continue;
		}

		// end of object: pop the stack
		if ( *p == '}' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				m_stackPtr--;
			}
			continue;
		}

		// array of things?
		if ( *p == '[' ) {
			// make a newitem to put on stack
			ji = addNewItem();
			if ( ! ji ) return NULL;
			ji->m_type = JT_ARRAY;
			// start of array hack. HACK! remember where the
			// array text starts so ']' can compute its length.
			ji->m_valueArray = p;
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// init to a bogus value. should be set below.
			// at least this should avoid a core in XmlDoc.cpp
			// getTokenizedDiffbotReply()
			ji->m_valueLen = 0;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			ji = NULL;
			continue;
		}

		// end of array: pop the stack
		if ( *p == ']' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				// start of array hack. HACK!
				char *start = (char *)px->m_valueArray;
				// include ending ']' in length of array
				px->m_valueLen = p - start + 1;
				m_stackPtr--;
			}
			continue;
		}

		// a quote?
		if ( *p == '\"' ) {
			// find end of quote
			char *end = p + 1;
			for ( ; *end ; end++ ) {
				// skip two chars if escaped
				if ( *end == '\\' && end[1] ) {
					end++;
					continue;
				}
				// this quote is unescaped then
				if ( *end == '\"' ) break;
			}
			// step over the closing quote. if the string was
			// never closed "end" is on the \0, so stay there
			// rather than reading past the end of the buffer.
			char *x = ( *end ) ? end + 1 : end;
			// skip spaces
			for ( ; *x && is_wspace_a(*x) ; x++ );
			// define the string
			char   *str  = p + 1;
			int32_t slen = end - str;
			// . if a colon follows, it was a field
			if ( *x == ':' ) {
				// we can't be the first thing in the safebuf
				// json must start with { or [ i guess
				// otherwise getFirstItem() won't work!
				if ( m_sb.m_length==0 ) {
					g_errno = EBADJSONPARSER;
					return NULL;
				}
				// let's push this now so we can \0 term
				char *savedStr = m_sb.getBuf();
				m_sb.safeMemcpy ( str , slen );
				m_sb.pushChar('\0');
				// just set the name cursor
				NAME    = savedStr;
				NAMELEN = slen;
			}
			// . otherwise, it was field value, so index it
			// . TODO: later make field names compounded to
			//   better represent nesting?
			// . added 'else if (NAME){' fix for json=\"too small\"
			else if ( NAME ) {
				// make a new one in safebuf
				ji = addNewItem();
				if ( ! ji ) return NULL;
				// we are a string
				ji->m_type    = JT_STRING;
				// use name cursor
				ji->m_name    = NAME;
				ji->m_nameLen = NAMELEN;
				// get length decoded
				int32_t curr = m_sb.length();
				// store decoded string right after jsonitem
				if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
								 niceness ))
					return NULL;
				// store length decoded json
				ji->m_valueLen = m_sb.length() - curr;
				// end with a \0
				m_sb.pushChar('\0');
				// ok, this one is done
				ji = NULL;
			}
			else {
				log("json: fieldless name in json");
				g_errno = EBADJSONPARSER;
				return NULL;
			}
			// skip over the string
			size = 0;
			p    = x;
			continue;
		}

		// true or false?
		if ( (*p == 't' && strncmp(p,"true",4)==0) ||
		     (*p == 'f' && strncmp(p,"false",5)==0) ) {
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// what is the length of it?
			int32_t slen = 4;
			ji->m_valueLong   = 1;
			ji->m_valueDouble = 1.0;
			if ( *p == 'f' ) {
				slen = 5;
				ji->m_valueLong   = 0;
				ji->m_valueDouble = 0;
			}
			// copy the literal as a string as well
			int32_t curr = m_sb.length();
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			ji->m_type    = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// skip over the whole literal. (this used to advance
			// one byte and rely on the rest of the letters not
			// matching anything; the result is the same.)
			size = slen;
			continue;
		}

		// if we hit a digit they might not be in quotes like
		// "crawled":123
		if ( is_digit ( *p ) ||
		     // like .123 ?
		     ( *p == '.' && is_digit(p[1]) ) ) {
			// find end of the number
			char *end = p + 1;
			// . allow '.' for decimal numbers
			// . TODO: allow E for exponent
			for ( ; *end && (is_digit(*end) || *end=='.');end++) ;
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// define the string. back up over negative sign?
			char *str = p;
			if ( str > json && str[-1] == '-' ) str--;
			// measure AFTER backing up, otherwise the copy of
			// a negative number loses its last digit
			int32_t slen = end - str;
			// decode
			ji->m_valueLong   = atol(str);
			ji->m_valueDouble = atof(str);
			// copy the number as a string as well
			int32_t curr = m_sb.length();
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			ji->m_type    = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// skip over the string
			size = 0;
			p    = end;
			continue;
		}
	}

	// for testing if we realloc. the items hold pointers into m_sb so a
	// realloc would have left them dangling: crash deliberately.
	char *memEnd = m_sb.getBufStart();
	if ( mem != memEnd ) { char *xx=NULL;*xx=0; }

	return (JsonItem *)m_sb.getBufStart();
}