// Walks the UTF-8 text in [s, send) one character at a time and returns
// true as soon as an alphabetic character is seen, false if the range is
// exhausted without finding one.
// . single-byte characters are tested with is_alpha_a()
// . multi-byte characters are tested with is_alpha_utf8()
bool has_alpha_utf8 ( char *s , char *send ) {
	while ( s < send ) {
		// number of bytes occupied by the character starting at s
		char charLen = getUtf8CharSize ( s );

		if ( charLen == 1 ) {
			if ( is_alpha_a ( *s ) ) {
				return true;
			}
		}
		else if ( is_alpha_utf8 ( s ) ) {
			return true;
		}

		// step over the whole character, not just one byte
		s += charLen;
	}

	return false;
}
// Builds the query-biased summary for a document into m_summary[].
//
// . repeatedly asks getBestWindow() for the best scoring window around each
//   usable match, filters the winning window's words into m_summary via
//   pos->filter(), and marks those words D_USED
// . falls back to getDefaultSummary() when there are no matches at all, or
//   when the loop ends without having written anything
//
// Parameters, as used by the code below:
// . xml               - passed to getDefaultSummary(); xml->getVersion() is
//                       handed to pos->filter()
// . words, sections   - only forwarded to getDefaultSummary()
// . pos               - filters the winning window into the summary buffer
// . q                 - the query; q->m_numWords sizes the per-query-word
//                       scratch arrays, q->m_orig is only logged
// . maxSummaryLen     - bytes of m_summary[] that may be written; must be
//                       < MAX_SUMMARY_LEN so the trailing \0 still fits
// . maxNumLines       - max number of counted excerpts; must be <= 256.
//                       if <= 0 we return true right after the word weights
//                       are initialized
// . numDisplayLines   - saved in m_numDisplayLines; m_displayLen records how
//                       many summary bytes had been written once that many
//                       excerpts were counted
// . maxNumCharsPerLine- max excerpt length handed to getBestWindow()
// . f                 - url, only used in the "infinite loop" log message
// . matches           - the query term matches to build windows around
// . titleBuf/titleBufLen - title text, used to reject an excerpt that
//                       duplicates the title
//
// Sets m_summaryLen (which counts the trailing \0), m_displayLen,
// m_numExcerpts and m_summaryExcerptLen[].
//
// returns false and sets g_errno on error
// (the EBUFTOOSMALL paths return whatever log() returns - presumably false,
//  confirm against log()'s declaration. the two allocation-failure paths
//  return false without touching g_errno in this function.)
bool Summary::setSummary ( Xml *xml, Words *words, Sections *sections, Pos *pos, Query *q, int32_t maxSummaryLen, 
			   int32_t maxNumLines, int32_t numDisplayLines, int32_t maxNumCharsPerLine, Url *f, 
			   Matches *matches, char *titleBuf, int32_t titleBufLen ) {
	m_numDisplayLines = numDisplayLines;
	m_displayLen      = 0;

	// assume we got maxnumlines of summary
	// (the "+ 6" allows per-excerpt overhead on top of the excerpt text)
	if ( (maxNumCharsPerLine + 6) * maxNumLines > maxSummaryLen ) {
		// enforce a floor of 10 chars per excerpt
		if ( maxNumCharsPerLine < 10 ) {
			maxNumCharsPerLine = 10;
		}

		// s_flag makes sure we only log this warning once per process
		// NOTE(review): the message says the excerpt length was
		// "decreased", but the visible code only ever raises it to the
		// floor of 10 -- confirm whether a reduction step was lost.
		static char s_flag = 1;
		if ( s_flag ) {
			s_flag = 0;
			log("query: Warning. "
			    "Max summary excerpt length decreased to "
			    "%" PRId32" chars because max summary excerpts and "
			    "max summary length are too big.",
			    maxNumCharsPerLine);
		}
	}

	// . sanity check
	// . summary must fit in m_summary[]
	// . leave room for tailing \0
	if ( maxSummaryLen >= MAX_SUMMARY_LEN ) {
		g_errno = EBUFTOOSMALL;
		return log("query: Summary too big to hold in buffer of %" PRId32" bytes.",(int32_t)MAX_SUMMARY_LEN);
	}

	// do not overrun the final*[] buffers
	if ( maxNumLines > 256 ) {
		g_errno = EBUFTOOSMALL;
		return log("query: More than 256 summary lines requested.");
	}

	// Nothing to match...print beginning of content as summary
	if ( matches->m_numMatches == 0 && maxNumLines > 0 ) {
		return getDefaultSummary ( xml, words, sections, pos, maxSummaryLen );
	}

	// one float weight per query word. use the in-object buffer when the
	// request is under 128 bytes, otherwise allocate.
	int32_t need1 = q->m_numWords * sizeof(float);
	m_wordWeightSize = need1;
	if ( need1 < 128 ) {
		m_wordWeights = (float *)m_tmpWordWeightsBuf;
	} else {
		m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
	}

	// allocation failed
	if ( ! m_wordWeights ) {
		return false;
	}

	/// @todo ALC fix word weights
	/// non-working logic is removed in commit 5eacee9063861e859b54ec62035a600aa8af25df

	// . compute our word weights wrt each query. words which are more rare
	//   have a higher weight. We use this to weight the terms importance
	//   when generating the summary.
	// . used by the proximity algo
	// . used in setSummaryScores() for scoring summaries
	// . for now every query word simply gets the same weight of 1.0
	for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
		m_wordWeights[i] = 1.0;
	}

	// convenience
	m_maxNumCharsPerLine = maxNumCharsPerLine;
	m_q = q;

	// set the max excerpt len to the max summary excerpt len
	int32_t maxExcerptLen = m_maxNumCharsPerLine;

	// state for the runaway-loop guard in the excerpt loop below
	int32_t lastNumFinal = 0;
	int32_t maxLoops = 1024;

	// if just computing absScore2...
	if ( maxNumLines <= 0 ) {
		return true;
	}

	// p is the write cursor into m_summary, pend is the write limit
	char *p = m_summary;
	char *pend = m_summary + maxSummaryLen;

	m_numExcerpts = 0;

	// one scratch buffer holding three char arrays of m_numWords each:
	// retired[], maxGotIt[] and gotIt[]. use the in-object buffer when
	// the request is under 128 bytes, otherwise allocate.
	int32_t need2 = (1+1+1) * m_q->m_numWords;
	m_buf4Size = need2;
	if ( need2 < 128 ) {
		m_buf4 = m_tmpBuf4;
	} else {
		m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
	}

	// allocation failed
	if ( ! m_buf4 ) {
		return false;
	}

	// carve the three arrays out of m_buf4
	char *x = m_buf4;
	char *retired = x;
	x += m_q->m_numWords;
	char *maxGotIt = x;
	x += m_q->m_numWords;
	char *gotIt = x;

	// . the "maxGotIt" count vector accumulates into "retired"
	// . that is how we keep track of what query words we used for previous
	//   summary excerpts so we try to get diversified excerpts with
	//   different query terms/words in them
	//char retired  [ MAX_QUERY_WORDS ];
	memset ( retired, 0, m_q->m_numWords * sizeof(char) );

	// some query words are already matched in the title
	for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
		if ( matches->m_qwordFlags[i] & MF_TITLEGEN ) {
			retired [ i ] = 1;
		}
	}

	// true once an excerpt has been emitted (every emitted excerpt is
	// marked as ending in an ellipsis, see below)
	bool hadEllipsis = false;

	//
	// Loop over all words that match a query term. The matching words
	// could be from any one of the 3 Words arrays above. Find the
	// highest scoring window around each term. And then find the highest
	// of those over all the matching terms.
	//
	// numFinal counts the excerpts that "count". several paths below do
	// numFinal-- to cancel out the loop's numFinal++ so that a rejected
	// or short excerpt does not use up one of the maxNumLines slots.
	//
	int32_t numFinal;
	for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ) {
		// remember how much of the summary is "displayable" once we
		// have counted m_numDisplayLines excerpts
		if ( numFinal == m_numDisplayLines ) {
			m_displayLen = p - m_summary;
		}

		// reset these at the top of each loop
		Match *maxm;
		int64_t maxScore = 0;
		int32_t maxa = 0;
		int32_t maxb = 0;
		int32_t maxi  = -1;
		int32_t lasta = -1;

		// runaway-loop guard: each pass on which numFinal has the same
		// value as on the previous pass (which also includes the very
		// first pass) uses up one of the 1024 allowed loops
		if(lastNumFinal == numFinal) {
			if(maxLoops-- <= 0) {
				log(LOG_WARN, "query: got infinite loop bug, query is %s url is %s", m_q->m_orig, f->getUrl());
				break;
			}
		}
		lastNumFinal = numFinal;

		// loop through all the matches and see which is best
		for ( int32_t i = 0 ; i < matches->m_numMatches ; i++ ) {
			// [a,b) is the word window getBestWindow() picks
			int32_t a , b;
			// reset lasta if we changed words class
			if ( i > 0 && matches->m_matches[i-1].m_words != matches->m_matches[i].m_words ) {
				lasta = -1;
			}

			// only use matches in title, etc.
			mf_t flags = matches->m_matches[i].m_flags;

			// a match is usable only if it carries at least one of
			// these four flags
			bool skip = true;
			if ( flags & MF_METASUMM ) {
				skip = false;
			}
			if ( flags & MF_METADESC ) {
				skip = false;
			}
			if ( flags & MF_BODY ) {
				skip = false;
			}
			if ( flags & MF_RSSDESC ) {
				skip = false;
			}

			if ( skip ) {
				continue;
			}

			// ask him for the query words he matched
			//char gotIt [ MAX_QUERY_WORDS ];
			// clear it for him
			memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );

			// . get score of best window around this match
			// . do not allow left post of window to be <= lasta to
			//   avoid repeating the same window.
			int64_t score = getBestWindow (matches, i, &lasta, &a, &b, gotIt, retired, maxExcerptLen);

			// USE THIS BUF BELOW TO DEBUG THE ABOVE CODE. 
			// PRINTS OUT THE SUMMARY
			/*
			//if ( score >=12000 ) {
			char buf[10*1024];
			char *xp = buf;
			if ( i == 0 )
				log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
			sprintf(xp, "score=%08" PRId32" a=%05" PRId32" b=%05" PRId32" ",
				(int32_t)score,(int32_t)a,(int32_t)b);
			xp += strlen(xp);
			for ( int32_t j = a; j < b; j++ ){
				//int32_t s = scores->m_scores[j];
				int32_t s = 0;
				if ( s < 0 ) continue;
				char e = 1;
				int32_t len = words->getWordLen(j);
				for(int32_t k=0;k<len;k +=e){
					char c = words->m_words[j][k];
					//if ( is_binary( c ) ) continue;
					*xp = c;
					xp++;
				}
				//p += strlen(p);
				if ( s == 0 ) continue;
				sprintf ( xp ,"(%" PRId32")",s);
				xp += strlen(xp);
			}
			log (LOG_WARN,"query: summary: %s", buf);
			//}
			*/

			// prints out the best window with the score
			/*
			char buf[MAX_SUMMARY_LEN];
			  char *bufPtr = buf;
			  char *bufPtrEnd = p + MAX_SUMMARY_LEN;
			  if ( i == 0 )
			  log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
			  int32_t len = 0;
			  Words *ww  = matches->m_matches[i].m_words;
			  //Sections *ss = matches->m_matches[i].m_sections;
			  //if ( ss->m_numSections <= 0 ) ss = NULL;
			  //len=pos->filter(bufPtr, bufPtrEnd, ww, a, b, NULL);
			  //log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			  //score);
			  log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			  score);
			*/

			// skip if was in title or something
			if ( score <= 0 ) {
				continue;
			}

			// skip if not a winner
			// (on a tie the earlier match keeps the win)
			if ( maxi >= 0 && score <= maxScore ) {
				continue;
			}

			// we got a new winner
			maxi     = i;
			maxa     = a;
			maxb     = b;
			maxScore = score;

			// save this too
			gbmemcpy ( maxGotIt , gotIt , m_q->m_numWords );
		}

		// retire the query words in the winning summary

		//log( LOG_WARN,"summary: took %" PRId64" ms to finish getbestwindo",
		//    gettimeofdayInMilliseconds() - stget );

		// all done if no winner was made
		if ( maxi == -1 || maxa == -1 || maxb == -1) {
			break;
		}

		// who is the winning match?
		maxm = &matches->m_matches[maxi];
		Words *ww = maxm->m_words;

		// we now use "m_swbits" for the summary bits since they are
		// of size sizeof(swbit_t), a int16_t at this point
		swbit_t *bb = maxm->m_bits->m_swbits;

		// this should be impossible
		// (if it happens anyway, log it and clamp the window to the
		//  last word of the document)
		if ( maxa > ww->getNumWords() || maxb > ww->getNumWords() ) {
			log ( LOG_WARN,"query: summary starts or ends after "
			      "document is over! maxa=%" PRId32" maxb=%" PRId32" nw=%" PRId32,
			      maxa, maxb, ww->getNumWords() );
			maxa = ww->getNumWords() - 1;
			maxb = ww->getNumWords();
		}

		// start out assuming we DO precede the excerpt with an
		// ellipsis; the test below turns that off again
		bool needEllipsis = true;

		// first character of the first word in the window
		const char *c = ww->getWord(maxa)+0;

		// rule of thumb, don't use ellipsis if the first letter is capital, or a non letter
		// is punct word before us pair acrossable? if so then we probably are not the start of a sentence.
		// or if into the sample and previous excerpt had an ellipsis do not bother using one for us.
		if ( !is_alpha_utf8(c) || is_upper_utf8(c) ||
		     (bb[maxa] & D_STARTS_SENTENCE) ||
		     (p > m_summary && hadEllipsis)) {
			needEllipsis = false;
		}

		if ( needEllipsis ) {
			// break out if no room for "..."
			if ( p + 4 + 2 > pend ) {
				break;
			}

			// space first?
			if ( p > m_summary ) {
				*p++ = ' ';
			}

			// 3 utf8 bytes for the ellipsis plus a trailing space
			memcpy ( p, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// separate summary excerpts with a single space.
		if ( p > m_summary ) {
			if ( p + 2 > pend ) {
				break;
			}

			*p++ = ' ';
		}

		// assume we need a trailing ellipsis
		// (nothing below clears this again, so every emitted excerpt
		//  gets a trailing ellipsis if there is room)
		needEllipsis = true;

		// so next excerpt does not need to have an ellipsis if we
		// have one at the end of this excerpt
		hadEllipsis = needEllipsis;

		// start with quote?
		if ( (bb[maxa] & D_IN_QUOTES) && p + 1 < pend ) {
			// preceed with quote
			*p++ = '\"';
		}

		// . filter the words into p
		// . removes back to back spaces
		// . converts html entities
		// . filters in stores words in [a,b) interval
		// . p is NOT advanced here, only further down once the
		//   excerpt has been accepted
		int32_t len = pos->filter( ww, maxa, maxb, false, p, pend, xml->getVersion() );

		// break out if did not fit
		if ( len == 0 ) {
			break;
		}

		// don't consider it if it is a substring of the title
		// NOTE(review): the test also requires len == titleBufLen, so
		// only an excerpt of exactly the title's length can be
		// rejected here -- confirm that is intended.
		// NOTE(review): on both "goto skip" paths below p is not
		// advanced past the filtered text, but any separator space,
		// leading ellipsis or quote already written above stays in
		// the summary -- confirm that is acceptable.
		if ( len == titleBufLen && strncasestr(titleBuf, p, titleBufLen, len) ) {
			// don't consider this one
			numFinal--;
			goto skip;
		}

		// don't consider it if the length wasn't anything nice
		if ( len < 5 ){
			numFinal--;
			goto skip;
		}

		// otherwise, keep going
		p += len;

		// now we just indicate which query terms we got
		for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
			// do not breach
			// (retired[] is a char array, stop counting at 100)
			if ( retired[i] >= 100 ) {
				continue;
			}
			retired [ i ] += maxGotIt [ i ];
		}

		// add all the scores of the excerpts to the doc summary score.
		// zero out scores of the winning sample so we don't get them 
		// again. use negative one billion to ensure that we don't get
		// them again
		// (what the code actually does is flag each word D_USED)
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it as used
			bb[j] |= D_USED;
		}

		// if we ended on punct that can be paired across we need
		// to add an ellipsis
		if ( needEllipsis ) {
			if ( p + 4 + 2 > pend ) {
				break;
			}
			// a leading space plus 3 utf8 bytes for the ellipsis
			memcpy ( p, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// try to put in a small summary excerpt if we have atleast
		// half of the normal excerpt length left
		if ( maxExcerptLen == m_maxNumCharsPerLine && len <= ( m_maxNumCharsPerLine / 2 + 1 ) ) {
			// the next window may only be half as long
			maxExcerptLen = m_maxNumCharsPerLine / 2;

			// don't count it in the finals since we try to get a small excerpt
			numFinal--;
		} else if ( m_numExcerpts < MAX_SUMMARY_EXCERPTS && m_numExcerpts >= 0 ) {
			// record the summary length up to the end of this excerpt
			m_summaryExcerptLen[m_numExcerpts] = p - m_summary;
			m_numExcerpts++;

			// also reset maxExcerptLen
			maxExcerptLen = m_maxNumCharsPerLine;
		}
	
	skip:
		// zero out the scores so they will not be used in others
		// (rejected windows are marked D_USED too, so the next pass
		//  is steered to a different window)
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it
			bb[j] |= D_USED;
		}
	}

	// we ended without exceeding the display line count, so the whole
	// summary written so far is displayable
	if ( numFinal <= m_numDisplayLines ) {
		m_displayLen = p - m_summary;
	}

	// free the mem we used if we allocated it
	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
		m_buf4 = NULL;
	}

	// If we still didn't find a summary, get the default summary
	if ( p == m_summary ) {
		bool status = getDefaultSummary ( xml, words, sections, pos, maxSummaryLen );
		if ( m_numDisplayLines > 0 ) {
			m_displayLen = m_summaryLen;
		}
		
		return status;
	}

	// we wrote a summary, so NULL terminate it (the empty-summary case
	// already returned above)
	*p++ = '\0';

	// set length
	// (p was advanced past the \0, so this length counts it)
	m_summaryLen = p - m_summary;

	// sanity check: a summary this long means something went badly wrong
	if ( m_summaryLen > 50000 ) {
		g_process.shutdownAbort(true);
	}

	return true;
}