bool Log::logR ( int64_t now, int32_t type, const char *msg, bool forced ) { if ( ! g_loggingEnabled ) { return true; } // return true if we should not log this if ( ! forced && ! shouldLog ( type , msg ) ) { return true; } // get "msg"'s length int32_t msgLen = strlen ( msg ); ScopedLock sl(s_lock); // do a timestamp, too. use the time synced with host #0 because // it is easier to debug because all log timestamps are in sync. if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore(); // . skip all logging if power out, we do not want to screw things up // . allow logging for 10 seconds after power out though if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){ return false; } // chop off any spaces at the end of the msg. while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--; // a tmp buffer char tt [ MAX_LINE_LEN ]; char *p = tt; if (m_logPrefix) { if ( m_logTimestamps ) { if( m_logReadableTimestamps ) { time_t now_t = (time_t)(now / 1000); struct tm tm_buf; struct tm *stm = localtime_r(&now_t,&tm_buf); p += sprintf ( p , "%04d%02d%02d-%02d%02d%02d-%03d %04" PRId32" ", stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec,(int)(now%1000), g_hostdb.m_hostId ); } else { if ( g_hostdb.getNumHosts() <= 999 ) p += sprintf ( p , "%" PRIu64 " %03" PRId32 " ", (uint64_t)now , g_hostdb.m_hostId ); else if ( g_hostdb.getNumHosts() <= 9999 ) p += sprintf ( p , "%" PRIu64" %04" PRId32" ", (uint64_t)now , g_hostdb.m_hostId ); else if ( g_hostdb.getNumHosts() <= 99999 ) p += sprintf ( p , "%" PRIu64" %05" PRId32" ", (uint64_t)now , g_hostdb.m_hostId ); } } // Get thread id. pthread_self instead? 
unsigned tid=(unsigned)syscall(SYS_gettid); p += sprintf(p, "%06u ", tid); // Log level p += sprintf(p, "%s ", getTypeString(type)); } // then message itself const char *x = msg; int32_t avail = (MAX_LINE_LEN) - (p - tt) - 1; if ( msgLen > avail ) msgLen = avail; if ( *x == ':' ) x++; if ( *x == ' ' ) x++; strncpy ( p , x , avail ); // capitalize for consistency. no, makes grepping log msgs harder. //if ( is_alpha_a(*p) ) *p = to_upper_a(*p); p += strlen(p); // back up over spaces while ( p[-1] == ' ' ) p--; // end in period or ? or ! //if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' ) // *p++ = '.'; *p ='\0'; // the total length, not including the \0 int32_t tlen = p - tt; // . filter out nasty chars from the message // . replace with ~'s char cs; char *ttp = tt; char *ttpend = tt + tlen; for ( ; ttp < ttpend ; ttp += cs ) { cs = getUtf8CharSize ( ttp ); if ( is_binary_utf8 ( ttp ) ) { for ( int32_t k = 0 ; k < cs ; k++ ) *ttp++ = '.'; // careful not to skip the already skipped bytes cs = 0; continue; } } // . if filesize would be too big then make a new log file // . should make a new m_fd if ( m_logFileSize + tlen+1 > MAXLOGFILESIZE && g_conf.m_logToFile ) makeNewLogFile(); if ( m_fd >= 0 ) { write ( m_fd , tt , tlen ); write ( m_fd , "\n", 1 ); m_logFileSize += tlen + 1; } else { // print it out for now fprintf ( stderr, "%s\n", tt ); } return false; }
bool Log::logR ( long long now , long type , char *msg , bool asterisk , bool forced ) { // filter if we should //if ( forced ) goto skipfilter; // return true if we should not log this if ( ! forced && ! shouldLog ( type , msg ) ) return true; // skipfilter: // can we log if we're a sig handler? don't take changes if ( g_inSigHandler ) return logLater ( now , type , msg , NULL ); //if ( g_inSigHandler ) return false; // get "msg"'s length long msgLen = gbstrlen ( msg ); #ifdef PTHREADS // lock for threads pthread_mutex_lock ( &s_lock ); #endif // do a timestamp, too. use the time synced with host #0 because // it is easier to debug because all log timestamps are in sync. if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore(); // . skip all logging if power out, we do not want to screw things up // . allow logging for 10 seconds after power out though if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){ #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } //if ( now == 0 ) now = g_nowApprox; // chop off any spaces at the end of the msg. 
while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--; // get this pid pid_t pid = getpidtid(); // a tmp buffer char tt [ MAX_LINE_LEN ]; char *p = tt; char *pend = tt + MAX_LINE_LEN; /* // print timestamp, hostid, type if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); */ // print timestamp, hostid, type if ( m_logTimestamps ) { if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li ", now , g_hostdb.m_hostId ); p += gbstrlen ( p ); } // msg resource char *x = msg; long cc = 7; // the first 7 bytes or up to the : must be ascii //while ( p < pend && *x && is_alnum_a(*x) ) { *p++ = *x++; cc--; } // space pad //while ( cc-- > 0 ) *p++ = ' '; // ignore the label for now... while ( p < pend && *x && is_alnum_a(*x) ) { x++; cc--; } // thread id if in "thread" if ( pid != s_pid && s_pid != -1 ) { //sprintf ( p , "[%li] " , (long)getpid() ); sprintf ( p , "[%lu] " , (unsigned long)pid ); p += gbstrlen ( p ); } // then message itself long avail = (MAX_LINE_LEN) - (p - tt) - 1; if ( msgLen > avail ) msgLen = avail; if ( *x == ':' ) x++; if ( *x == ' ' ) x++; strncpy ( p , x , avail ); // capitalize for consistency. no, makes grepping log msgs harder. //if ( is_alpha_a(*p) ) *p = to_upper_a(*p); p += gbstrlen(p); // back up over spaces while ( p[-1] == ' ' ) p--; // end in period or ? or ! //if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' 
) // *p++ = '.'; *p ='\0'; // the total length, not including the \0 long tlen = p - tt; // call sprintf, but first make sure we have room in m_buf and in // the arrays. who know how much room the sprintf is going to need??? // NOTE: TODO: this is shaky -- fix it! if ( m_bufPtr + tlen >= 1024 * 32 || m_numErrors >= MAX_LOG_MSGS){ // this sets m_bufPtr to 0 if ( ! dumpLog ( ) ) { fprintf(stderr,"Log::log: could not dump to file!\n"); #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } } // . filter out nasty chars from the message // . replace with ~'s char cs; char *ttp = tt; char *ttpend = tt + tlen; for ( ; ttp < ttpend ; ttp += cs ) { cs = getUtf8CharSize ( ttp ); if ( is_binary_utf8 ( ttp ) ) { for ( long k = 0 ; k < cs ; k++ ) *ttp++ = '.'; // careful not to skip the already skipped bytes cs = 0; continue; } // convert \n's and \r's to spaces if ( *ttp == '\n' ) *ttp = ' '; if ( *ttp == '\r' ) *ttp = ' '; if ( *ttp == '\t' ) *ttp = ' '; } if ( m_fd >= 0 ) { write ( m_fd , tt , tlen ); write ( m_fd , "\n", 1 ); } else { // print it out for now fprintf ( stderr, "%s\n", tt ); } // set the stuff in the array m_errorMsg [m_numErrors] = msg; m_errorMsgLen [m_numErrors] = msgLen; m_errorTime [m_numErrors] = now; m_errorType [m_numErrors] = type; // increase the # of errors m_numErrors++; #ifdef PTHREADS // unlock for threads pthread_mutex_unlock ( &s_lock ); #endif return false; }
// . build and score one excerpt window [a,b) around match #mm
// . a and b are word #'s in the Words class that match #mm belongs to; the
//   interval is half-open, so word #b is not part of the excerpt
// . "matches"       the match list; matches->m_matches[mm] is the center
// . "lasta"         in/out: the left edge never moves below *lasta; on
//                   return it holds this call's "a" (or the match's word #
//                   when no window was produced)
// . "besta/bestb"   out: the chosen [a,b), or -1/-1 when no window was
//                   produced
// . "gotIt"         in/out: per-query-word counters, indexed by query word
//                   #, incremented (up to 100) for each match scored here
// . "retired"       in: per-query-word values, indexed by query word #; a
//                   value > 0 reduces that word's score
// . "maxExcerptLen" bound on the window size, compared against differences
//                   of entries in the match's pos[] array
// . returns the window's score, or 0 when no window was produced
// . NOTE(review): an older comment here said "return -1 and set g_errno on
//   error", but no path in this function returns -1
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta, int32_t *besta, int32_t *bestb, char *gotIt, char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut. sp stays NULL when the match has no sections.
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . no window if the match word was already used (D_USED) or sits in
	//   a script/style/select/title section
	// . NOTE(review): an older comment said section ptrs get NULLified for
	//   already-used words, yet sp[matchWordNum] is dereferenced here
	//   without a NULL check -- confirm the entries are never NULL
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	// first (i.e. rightmost, since we walk left) fragment start we pass
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	// rough count of words with a word id; recounted exactly further down
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxExcerptLen
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone",
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// . stop if it's the start of a sentence, too
		// . stop before a title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P or DIV tag
		if ( tids && ( tids[a-1] == TAG_LI || tids[a-1] == TAG_TR || tids[a-1] == TAG_P || tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if it's the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) && words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;

		// do not break right after a "strong connector", like
		// apostrophe
		while ( a < matchWordNum && a > 0 && ( bb[a-1] & D_IS_STRONG_CONNECTOR ) ) a++;

		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	// last word # at which we saw a '"' while extending right; only
	// tracked when the window started on a quote
	int32_t endQuoteWordNum = -1;
	// number of LI/TR/P/DIV back tags the right side of the window crosses
	int32_t numTagsCrossed = 0;

	// extend "b" to the right until one of the stop conditions below hits
	for ( ; b <= nw; b++ ) {
		// ran out of words
		if ( b == nw ) {
			break;
		}

		// including word #b would make the excerpt too long
		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}

		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) || tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// but only stop once we have more than 10 words
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P) || tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// but only stop once we have more than 10 words
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	// NOTE(review): b-2 is indexed below without checking b >= 2 --
	// confirm b == 1 cannot reach this block with a non-word at index 0
	if ( b > matchWordNum && !wids[b-1]){
		// pull "b" back over runs of punct words. as written this only
		// happens when an end quote was recorded, and it never moves
		// "b" back to or before that quote.
		// NOTE(review): an older comment said "if we're ending on a
		// quote keep it" -- confirm that requiring
		// endQuoteWordNum != -1 here is intended
		while ( b > matchWordNum && !wids[b-2] && endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector", like apostrophe
		while ( b > matchWordNum && (bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as long as the previous match's
	//   word # is >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// debug text, only filled in when g_conf.m_logDebugSummary is set
	SafeBuf xp;

	// wtf? clamp just in case
	if ( b > nw ) {
		b = nw;
	}

	// score every word in [a,b), left to right
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out: append the word's non-binary chars to xp
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in a bad section (script, style, select, title)
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' && wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		// an out-of-range query word # is fatal
		if ( qwn < 0 || qwn >= m_q->m_numWords ){g_process.shutdownAbort(true);}

		// undo old score; a match is rescored from scratch below
		score -= t;

		// add 100000 per match
		t = 100000;

		// scale by this query word's weight
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// a query stop word is worth nothing
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same window,
				// it may not give a good summary. give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",t,qwn, m_wordWeights[qwn]);
		}

		// inc the query word count for this window, capped at 100
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	// score before the bonuses/penalties below; only used for the debug
	// log, and note it narrows the 64-bit score to 32 bits
	int32_t oldScore = score;

	// apply the bonus if it starts a sentence or a fragment
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// keep these bonuses small relative to what a single match
		// contributes: 8000 if the window starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma: 4000
			score += 4000;
		}

		// 1000 more if the match word is fewer than 7 word slots
		// from the start of the window
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if it is less than 7 words.
	// reduce the score by 20000, but still give it a decent score.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	// by 20000 per tag crossed
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	// a url in the excerpt costs 8000
	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32" a=%05" PRId32" b=%05" PRId32" %s", (int32_t)score,oldScore,(int32_t)a,(int32_t)b, xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}