// . like strstr but haystack may not be NULL terminated // . needle, however, IS null terminated char *strncasestr ( char *haystack , const char *needle , int32_t haystackSize, int32_t needleSize ) { int32_t n = haystackSize - needleSize ; for ( int32_t i = 0 ; i <= n ; i++ ) { // keep looping if first chars do not match if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) ) continue; // if needle was only 1 char it's a match if ( ! needle[1] ) return &haystack[i]; // compare the whole strings now if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 ) return &haystack[i]; } return NULL; }
// . hash the first "len" bytes of "s" after lowercasing each with to_lower_a()
// . every byte is mixed in by XORing the g_hashtab entry selected by the
//   byte's position (truncated to 8 bits) and its lowercased value
// . "startHash" is the initial value, so calls can be chained
uint32_t hash32Lower_a ( char *s,int32_t len,uint32_t startHash) {
	uint32_t hash = startHash;
	for ( int32_t pos = 0 ; pos < len ; pos++ ) {
		// the table row is the position mod 256
		unsigned char row = (unsigned char)pos;
		// the table column is the lowercased byte
		unsigned char col = (unsigned char)to_lower_a(s[pos]);
		hash ^= (uint32_t) g_hashtab [row] [col];
	}
	return hash;
}
uint8_t getLanguageFromAbbr2 ( char *str , int32_t len ) { // truncate if ( len > 5 ) len = 5; // copy it and check it char lang[6]; for ( int32_t j = 0 ; j < len ; j++ ) lang[j] = to_lower_a(str[j]); lang[len]='\0'; return getLanguageFromAbbr(lang); }
// . hash the first "len" bytes of "s" after lowercasing each with to_lower_a()
// . every byte is mixed in by XORing the g_hashtab entry selected by the
//   byte's position (truncated to 8 bits) and its lowercased value
// . "startHash" is the initial value, so calls can be chained
unsigned long hash32Lower_a ( char *s,long len,unsigned long startHash){
	unsigned long hash = startHash;
	for ( long pos = 0 ; pos < len ; pos++ ) {
		// the table row is the position mod 256
		unsigned char row = (unsigned char)pos;
		// the table column is the lowercased byte
		unsigned char col = (unsigned char)to_lower_a(s[pos]);
		hash ^= (unsigned long) g_hashtab [row] [col];
	}
	return hash;
}
// . case-insensitive search for the NUL terminated "needle" in the first
//   "haylen" bytes of "haystack" (which need not be NUL terminated)
// . returns a ptr to the first match inside "haystack", or NULL if there is
//   none, if either pointer is NULL, or if the needle is empty
char *strncasestr( char *haystack, int32_t haylen, const char *needle){
	if ( ! haystack || ! needle ) return NULL;
	int32_t needleLen = strlen(needle);
	// nothing to look for, or the needle cannot fit
	if ( needleLen <= 0 || needleLen > haylen ) return NULL;
	// last offset at which the whole needle still fits in the haystack
	int32_t last = haylen - needleLen;
	// . try every start offset in turn
	// . the old single-pass scan reset its match count on a mismatch
	//   without looking at the haystack again, so it missed matches that
	//   overlap a failed partial match, like "ab" in "aab"
	for ( int32_t start = 0 ; start <= last ; start++ ) {
		int32_t matchLen = 0;
		while ( matchLen < needleLen ) {
			char c1 = to_lower_a(haystack[start + matchLen]);
			char c2 = to_lower_a(needle[matchLen]);
			if ( c1 != c2 ) break;
			// we matched another character
			matchLen++;
		}
		// we've matched the whole string
		if ( matchLen == needleLen ) return haystack + start;
	}
	return NULL;
}
// get the id from a 2 character country code uint8_t getCountryId ( char *cc ) { static bool s_init = false; static char buf[2000]; static HashTableX ht; char tmp[4]; if ( ! s_init ) { s_init = true; // hash them up ht.set ( 4 , 1 , -1,buf,2000,false,MAX_NICENESS,"ctryids"); // now add in all the country codes long n = (long) sizeof(s_countryCode) / sizeof(char *); for ( long i = 0 ; i < n ; i++ ) { char *s = (char *)s_countryCode[i]; //long slen = gbstrlen ( s ); // sanity check if ( !s[0] || !s[1] || s[2]) { char *xx=NULL;*xx=0; } // map it to a 4 byte key tmp[0]=s[0]; tmp[1]=s[1]; tmp[2]=0; tmp[3]=0; // a val of 0 does not mean empty in HashTableX, // that is an artifact of HashTableT uint8_t val = i; // +1; // add 1 cuz 0 means lang unknown if ( ! ht.addKey ( tmp , &val ) ) { char *xx=NULL;*xx=0; } } } // lookup tmp[0]=to_lower_a(cc[0]); tmp[1]=to_lower_a(cc[1]); tmp[2]=0; tmp[3]=0; long slot = ht.getSlot ( tmp ); if ( slot < 0 ) return 0; void *val = ht.getValueFromSlot ( slot ); return *(uint8_t *)val ; }
// . a new interface so Msg3b can call this with "s" set to NULL // . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageParser2 ( TcpSocket *s , HttpRequest *r , State8 *st , long long docId , Query *q , // in query term space, not imap space long long *termFreqs , // in imap space float *termFreqWeights , // in imap space float *affWeights , void *state , void (* callback)(void *state) ) { //log("parser: read sock=%li",s->m_sd); // might a simple request to addsomething to validated.*.txt file // from XmlDoc::print() or XmlDoc::validateOutput() char *add = r->getString("add",NULL); //long long uh64 = r->getLongLong("uh64",0LL); char *uh64str = r->getString("uh64",NULL); //char *divTag = r->getString("div",NULL); if ( uh64str ) { // convert add to number long addNum = 0; if ( to_lower_a(add[0])=='t' ) // "true" or "false"? addNum = 1; // convert it. skip beginning "str" inserted to prevent // javascript from messing with the long long since it // was rounding it! //long long uh64 = atoll(uh64str);//+3); // urldecode that //long divTagLen = gbstrlen(divTag); //long newLen = urlDecode ( divTag , divTag , divTagLen ); // null term? //divTag[newLen] = '\0'; // do it. this is defined in XmlDoc.cpp //addCheckboxSpan ( uh64 , divTag , addNum ); // make basic reply char *reply; reply = "HTTP/1.0 200 OK\r\n" "Connection: Close\r\n"; // that is it! send a basic reply ok bool status = g_httpServer.sendDynamicPage( s , reply, gbstrlen(reply), -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); return status; } // make a state if ( st ) st->m_freeIt = false; if ( ! st ) { try { st = new (State8); } catch ( ... 
) { g_errno = ENOMEM; log("PageParser: new(%i): %s", (int)sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));} mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; } // msg3b uses this to get a score from the query st->m_state = state; st->m_callback = callback; st->m_q = q; st->m_termFreqs = termFreqs; st->m_termFreqWeights = termFreqWeights; st->m_affWeights = affWeights; //st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; st->m_recompute = false; //st->m_url.reset(); // do not allow more than one to be launched at a time if in // a quickpoll. will cause quickpoll in quickpoll. g_inPageParser = true; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( st->m_collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); if ( ! coll ) return sendErrorReply ( st , ENOCOLLREC ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); //long ulen = 0; //char *u = r->getString ( "u" , &ulen , NULL /*default*/); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given if ( ! st->m_u || ! 
st->m_u[0] ) st->m_docId = r->getLongLong ("docid",-1); else st->m_docId = -1; // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = st->m_r.getString("u",&st->m_ulen,NULL); // should we recycle link info? st->m_recycle = r->getLong("recycle",0); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? 
st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header SafeBuf *xbuf = &st->m_xbuf; xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">\n"); // print standard header g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r ); // print the standard header for admin pages char *dd = ""; char *rr = ""; char *rr2 = ""; char *render = ""; char *oips = ""; char *us = ""; if ( st->m_u && st->m_u[0] ) us = st->m_u; //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn ); if ( st->m_old ) dd = " checked"; if ( st->m_recycle ) rr = " checked"; if ( st->m_recycle2 ) rr2 = " checked"; if ( st->m_render ) render = " checked"; if ( st->m_oips ) oips = " checked"; xbuf->safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); long clen; char *contentParm = r->getString("content",&clen,""); // print the input form xbuf->safePrintf ( "<style>\n" "h2{font-size: 12px; color: #666666;}\n" ".gbtag { border: 1px solid gray;" "background: #ffffef;display:inline;}\n" ".gbcomment { border: 1px solid gray;" "color: #888888; font-style:italic; " "background: #ffffef;display:inline;}\n" ".token { border: 1px solid gray;" "background: #f0ffff;display:inline;}\n" ".spam { border: 1px solid gray;" "background: #af0000;" "color: #ffffa0;}" ".hs {color: #009900;}" "</style>\n" "<center>" "<table %s>" "<tr><td colspan=5><center><b>" "Parser" "</b></center></td></tr>\n" "<tr class=poo>" "<td>" "<b>url</b>" "<br><font size=-2>" "Type in <b>FULL</b> url to parse." 
"</font>" "</td>" "</td>" "<td>" "<input type=text name=u value=\"%s\" size=\"40\">\n" "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Parser version to use: " "</td>" "<td>" "<input type=text name=\"version\" size=\"4\" value=\"-1\"> " "</td>" "<td>" "(-1 means to use latest title rec version)<br>" "</td>" "</tr>" */ /* "<tr class=poo>" "<td>" "Hop count to use: " "</td>" "<td>" "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> " "</td>" "<td>" "(-1 is unknown. For root urls hopcount is always 0)<br>" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>use cached</b>" "<br><font size=-2>" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=old value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Reparse root:" "</td>" "<td>" "<input type=checkbox name=artr value=1%s> " "</td>" "<td>" "Apply selected ruleset to root to update quality" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>recycle link info</b>" "<br><font size=-2>" "Recycle the link info from the title rec" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=recycle value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Recycle Link Info Imported:" "</td>" "<td>" "<input type=checkbox name=recycleimp value=1%s> " "</td>" "<td>" "Recycle the link info imported from other coll" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>render html</b>" "<br><font size=-2>" "Render document content as HTML" "</font>" "</td>" "<td>" "<input type=checkbox name=render value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Lookup outlinks' ruleset, ips, quality:" "</td>" "<td>" "<input type=checkbox name=oips value=1%s> " "</td>" "<td>" "To compute quality lookup IP addresses of roots " "of outlinks." "</td>" "</tr>" "<tr class=poo>" "<td>" "LinkInfo Coll:" "</td>" "<td>" "<input type=text name=\"oli\" size=\"10\" value=\"\"> " "</td>" "<td>" "Leave empty usually. Uses this coll to lookup link info." 
"</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>optional query</b>" "<br><font size=-2>" "Leave empty usually. For title generation only." "</font>" "</td>" "<td>" "<input type=text name=\"q\" size=\"20\" value=\"\"> " "</td>" "</tr>", TABLE_STYLE, us , dd, rr, render ); xbuf->safePrintf( "<tr class=poo>" "<td>" "<b>content type below is</b>" "<br><font size=-2>" "Is the content below HTML? XML? JSON?" "</font>" "</td>" "<td>" //"<input type=checkbox name=xml value=1> " "<select name=ctype>\n" "<option value=%li selected>HTML</option>\n" "<option value=%li selected>XML</option>\n" "<option value=%li selected>JSON</option>\n" "</select>\n" "</td>" "</tr>", (long)CT_HTML, (long)CT_XML, (long)CT_JSON ); xbuf->safePrintf( "<tr class=poo>" "<td><b>content</b>" "<br><font size=-2>" "Use this content for the provided <i>url</i> " "rather than downloading it from the web." "</td>" "<td>" "<textarea rows=10 cols=80 name=content>" "%s" "</textarea>" "</td>" "</tr>" "</table>" "</center>" "</form>" "<br>", //oips , contentParm ); xbuf->safePrintf( "<center>" "<input type=submit value=Submit>" "</center>" ); // just print the page if no url given if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st ); XmlDoc *xd = &st->m_xd; // set this up SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. 
bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; uint8_t contentType = CT_HTML; if ( r->getBool("xml",0) ) contentType = CT_XML; contentType = r->getLong("ctype",contentType);//CT_HTML); // if facebook, load xml content from title rec... bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/"); if ( isFacebook && ! content ) { long long docId = g_titledb.getProbableDocId(st->m_u); sprintf(sreq.m_url ,"%llu", docId ); sreq.m_isPageReindex = true; } // hack if ( content ) { st->m_dbuf.purge(); st->m_dbuf.safeStrcpy(content); //char *data = strstr(content,"\r\n\r\n"); //long dataPos = 0; //if ( data ) dataPos = (data + 4) - content; //st->m_dbuf.convertJSONtoXML(0,dataPos); //st->m_dbuf.decodeJSON(0); content = st->m_dbuf.getBufStart(); } // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , &st->m_wbuf , 0 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip contentType )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , processLoop ); // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag if ( st->m_recycle ) xd->m_recycleContent = true; return processLoop ( st ); }
// . build and send the cached copy of a document
// . "state" is a State2. its XmlDoc is loaded from the old title rec, the
//   utf8 content is optionally wrapped in a header/disclaimer, stripped
//   and/or query-highlighted, and the result is sent on st->m_socket as
//   html, xml or json depending on st->m_format
// . registered as the XmlDoc callback, so it is called again each time a
//   blocked step completes
// . once a reply has been sent here "st" is deleted
// . returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);

	// is the doc marked noarchive?
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;

	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}

	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	long startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href: the redirect url if there is one, else the first url
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
	}

	// xml and json replies start from an empty buffer
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	// the spider date in GMT, as text
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// gmtime() returns NULL if it cannot convert the time.
		// leave tbuf empty then.
		if ( timeStruct )
			strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	if ( printDisclaimer ) {
		sb->safePrintf(
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  "><td>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		// then the rest
		sb->safePrintf(
			   "<span style=\"%s\">. "
			   "Gigablast is not responsible for the content of "
			   "this page.</span>", styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&c=%s&rtq=%li&"
			      "d=%lli&strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>",
			      q , st->m_coll ,
			      (long)st->m_rtq,
			      st->m_docId, styleLink );

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
		}
	}

	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// use this host's ip and http port
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// . the query url encoded
	// . keep room for the "&d=<docid>" suffix written below. the encoded
	//   query used to be allowed to fill the whole buffer, so the suffix
	//   could be written past the end of thisUrl.
	const long suffixRoom = 32;
	long room = (long)(thisUrlEnd - x) - suffixRoom;
	long elen = 0;
	if ( room > 0 ) elen = urlEncode ( x , room , q , qlen );
	x += elen;
	// separate cgi vars with a &
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);

	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// declare up here
	Matches m;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	m.addMatches ( &qw );
	long hilen = 0;

	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( sb ,
				 &qw , // words to highlight
				 &m  , // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll ,
				 (char *)thisUrl );// base url for ClcknScrll
		// now an hr
		sb->safeStrcpy("</span></table></table>\n");
	}

	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON ) includeHeader = false;
	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;
		else                         sb->m_length=startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		// cast so the arg matches %lu whatever time_t is
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			       (unsigned long)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		// cast so the arg matches %lu whatever time_t is
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",
			       (unsigned long)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// . identify start of <title> tag we wrote out
	// . only look at the bytes written so far and keep every lookahead
	//   inside them. the scan used to run to getBufEnd() and peek up to
	//   6 bytes past wherever it stood.
	char *sbstart  = sb->getBufStart();
	long  sblen    = sb->length();
	long  titleOff = -1;
	long  titleLen = 0;
	for ( long ti = 0 ; sbstart && ti + 6 <= sblen ; ti++ ) {
		char *t = sbstart + ti;
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		long xi = ti + 5;
		// max - to keep things fast
		long maxi = xi + 500;
		// and never leave the written data
		long endi = maxi;
		if ( endi > sblen ) endi = sblen;
		// skip to the end of the opening tag
		for ( ; xi < endi && sbstart[xi] && sbstart[xi] != '>' ; xi++ );
		xi++;
		// find end
		long ei = xi;
		for ( ; ei < endi && sbstart[ei] ; ei++ ) {
			// "</title" needs 7 bytes
			if ( ei + 7 > sblen ) continue;
			char *e = sbstart + ei;
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		// as before, a title that runs to the end of the data
		// before the 500 byte limit is accepted too
		if ( xi <= sblen && ei < maxi ) {
			titleOff = xi;
			titleLen = ei - xi;
		}
		break;
	}

	// . copy the title out of "sb" now
	// . the prints below may grow and move sb's buffer, which would
	//   leave pointers into it dangling
	SafeBuf titleCopy;
	if ( titleOff >= 0 && titleLen > 0 )
		titleCopy.safeMemcpy ( sbstart + titleOff , titleLen );

	// . print title at top!
	// . consider moving
	if ( titleOff >= 0 ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		long printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			if ( titleCopy.length() > 0 )
				sb->safeMemcpy ( titleCopy.getBufStart() ,
						 titleCopy.length() );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );
	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {
		sb->safePrintf("<pre>");
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 ) {
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;

	// xml and json replies collect the content in "tmp" first so it can
	// be encoded into "sb" below
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		// matches of the query against the document words
		Matches docMatches;
		docMatches.setQuery ( &qq );
		docMatches.addMatches ( &ww );
		hilen = hi.set ( xb ,
				 &ww , &docMatches ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		// no newline after the opening quote: a raw control
		// character inside a json string makes the reply invalid
		sb->safePrintf("\t\"content\":\"");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}

	// if it is content-type text, add a </pre>
	if ( pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
	}

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			return sendErrorReply ( st , g_errno );
		}
		// "sb" takes over the encoded buffer
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);

	// and convey the status
	return status;
}
// Return the value of the specified "field" within this node. // the case of "field" does not matter. char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) { // reset this to 0 *valueLen = 0; // scan for the field name in our node int32_t flen = gbstrlen(field); char inQuotes = '\0'; int32_t i; // scan the characters in the node, looking for the field name in ascii for ( i = 1; i + flen < m_nodeLen ; i++ ) { // skip the field if it's quoted if ( inQuotes) { if (m_node[i] == inQuotes ) inQuotes = 0; continue; } // set inQuotes to the quote if we're in quotes if ( (m_node[i]=='\"' || m_node[i]=='\'')){ inQuotes = m_node[i]; continue; } // a field name must be preceeded by non-alnum if ( is_alnum_a ( m_node[i-1] ) ) continue; // the first character of this field shout match field[0] if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue; // field just be immediately followed by an = or space if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue; // field names must match if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue; // break cuz we got a match for our field name break; } // return NULL if no matching field if ( i + flen >= m_nodeLen ) return NULL; // advance i over the fieldname so it pts to = or space i += flen; // advance i over spaces while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // advance over the equal sign, return NULL if does not exist if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL; // advance i over spaces after the equal sign while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // now parse out the value of this field (could be in quotes) inQuotes = '\0'; // set inQuotes to the quote if we're in quotes if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++]; // mark this as the start of the value int start=i; // advance i until we hit a space, or we hit a that quote if inQuotes if (inQuotes) { while (i<m_nodeLen && m_node[i] != inQuotes ) i++; } else { while ( i<m_nodeLen && 
!is_wspace_a(m_node[i])&& m_node[i]!='>') i++; } // set the length of the value *valueLen = i - start; // return a ptr to the value return m_node + start; }
// . return the value of the specified "field" within this html tag, "s" // . the case of "field" does not matter char *getFieldValue ( char *s , long slen , char *field , long *valueLen ) { // reset this to 0 *valueLen = 0; // scan for the field name in our node long flen = gbstrlen(field); char inQuotes = '\0'; long i; // make it sane if ( slen > 2000 ) slen = 2000; for ( i = 1; i + flen < slen ; i++ ) { // skip the field if it's quoted if ( inQuotes) { if (s[i] == inQuotes ) inQuotes = 0; continue; } // set inQuotes to the quote if we're in quotes if ( (s[i]=='\"' || s[i]=='\'')){ inQuotes = s[i]; continue; } // if not in quote tag might end if ( s[i] == '>' && ! inQuotes ) return NULL; // a field name must be preceeded by non-alnum if ( is_alnum_a ( s[i-1] ) ) continue; // the first character of this field shout match field[0] if ( to_lower_a (s[i]) != to_lower_a(field[0] )) continue; // field just be immediately followed by an = or space if (s[i+flen]!='='&&!is_wspace_a(s[i+flen]))continue; // field names must match if ( strncasecmp ( &s[i], field, flen ) != 0 ) continue; // break cuz we got a match for our field name break; } // return NULL if no matching field if ( i + flen >= slen ) return NULL; // advance i over the fieldname so it pts to = or space i += flen; // advance i over spaces while ( i < slen && is_wspace_a ( s[i] ) ) i++; // advance over the equal sign, return NULL if does not exist if ( i < slen && s[i++] != '=' ) return NULL; // advance i over spaces after the equal sign while ( i < slen && is_wspace_a ( s[i] ) ) i++; // now parse out the value of this field (could be in quotes) inQuotes = '\0'; // set inQuotes to the quote if we're in quotes if ( s[i]=='\"' || s[i]=='\'') inQuotes = s[i++]; // mark this as the start of the value int start=i; // advance i until we hit a space, or we hit a that quote if inQuotes if (inQuotes) while (i<slen && s[i] != inQuotes ) i++; else while ( i<slen &&!is_wspace_a(s[i])&&s[i]!='>')i++; // set the length of 
the value *valueLen = i - start; // return a ptr to the value return s + start; }
// . Scan the words in "words" and append one Match to m_matches[] for every
//   alnum word (or phrase starting at that word) whose id is found in the
//   query hash table (m_qtableIds[] / m_qtableWordNums[]).
// . Each call counts as one "match group" (m_numMatchGroups is bumped).
// . "phrases" and "sections" may be NULL; "sections", "bits", "pos" and
//   "flags" are stored in every Match created, and "flags" is also OR'ed
//   into m_qwordFlags[] for the matched query word.
// . Lookup order for each word: the word id itself, then the word id with a
//   trailing 's removed, then the phrase id (if "phrases" was given).
// . Query stop words that match alone are kept on a "match stack" and are
//   removed again if the word required to follow them does not match.
// . Stops once m_numMatches reaches MAX_MATCHES.
// . Always returns true.
// . TODO: support stemming later. each word should then have multiple ids.
bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bits *bits, Pos *pos, mf_t flags ) {
	// if no query term, bail.
	if ( m_numSlots <= 0 ) {
		return true;
	}

	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}

	// NOTE(review): sp is set once here and never advanced inside the
	// word loop below, so the badFlags test always looks at this same
	// Section -- confirm whether a per-word section was intended
	Section *sp = NULL;
	if ( sections ) {
		sp = sections->m_sections;
	}

	// extra flags OR'ed into each Match; only ever set to 0 in here
	mf_t eflag = 0;

	m_numMatchGroups++;

	const int64_t *pids = NULL;
	if ( phrases ) {
		pids = phrases->getPhraseIds2();
	}

	// set convenience vars
	// presumably m_numSlots is a power of two so this works as a
	// modulus mask -- TODO confirm where the table is built
	uint32_t mask = m_numSlots - 1;
	const int64_t *wids = words->getWordIds();
	const int32_t *wlens = words->getWordLens();
	const char * const *wptrs = words->getWordPtrs();
	nodeid_t *tids = words->getTagIds();
	int32_t nw = words->getNumWords();
	// current slot in the query hash table
	int32_t n;
	// # of trailing entries in m_matches[] that are provisional
	int32_t matchStack = 0;
	// raw word id and doc word # the next match must have for the
	// provisional entries to be kept
	int64_t nextMatchWordIdMustBeThis = 0;
	int32_t nextMatchWordPos = 0;
	// doc word # of the last match; -3 so "i-2 != lasti" holds at start
	int32_t lasti = -3;

	// sanity check, abort the process if breached
	if ( getNumXmlNodes() > 512 ) { g_process.shutdownAbort(true); }

	int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;

	// query word # we matched, and how many query words / doc words
	// the match spans
	int32_t qwn;
	int32_t numQWords;
	int32_t numWords;

	//
	// . set m_matches[] array
	// . loop over all words in the document
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		//if (tids && (tids[i] ) == TAG_A)
		//	inAnchTag = true;
		//else if (tids && (tids[i]&BACKBITCOMP) == TAG_A)
		//	inAnchTag = false;

		if ( tids && tids[i] ){
			// tagIds don't have wids and are skipped
			continue;
		}

		// skip if wid is 0, it is not an alnum word then
		if ( ! wids[i] ) {
			continue;
		}

		// count the number of alnum words
		m_numAlnums++;

		// clear this
		eflag = 0;

		// NO NO, a score of -1 means in a select tag, and
		// we do index that!! so only skip if wscores is 0 now.
		// -1 means in script, style, select or marquee. it is
		// indexed but with very little weight... this is really
		// a hack in Scores.cpp and should be fixed.
		// in Scores.cpp we set even the select tag stuff to -1...
		//if ( wscores && wscores[i] == -1 ) continue;
		if ( sp && (sp->m_flags & badFlags) ) continue;

		// . does it match a query term?
		// . hash to the slot in the hash table
		n = ((uint32_t)wids[i]) & mask;
		//n2 = swids[i]?((uint32_t)swids[i]) & mask:n;

	chain1:
		// skip if slot is empty (doesn't match query term)
		//if ( ! m_qtableIds[n] && ! m_qtableIds[n2]) continue;
		if ( ! m_qtableIds[n] ) goto tryPhrase;
		// otherwise chain: step to the next slot, wrapping at the end
		if ( (m_qtableIds[n] != wids[i]) ) {
			if ( m_qtableIds[n] && ++n >= m_numSlots ) n = 0;
			goto chain1;
		}
		// we got one!
		goto gotMatch;

		//
		// fix so we highlight "woman's" when query term is "woman"
		// for 'spiritual books for women' query
		//
	tryPhrase:
		// try without 's if it had it
		if ( wlens[i] >= 3 &&
		     wptrs[i][wlens[i]-2] == '\'' &&
		     to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
			// move 's from word hash... very tricky
			int64_t nwid = wids[i];
			// undo hash64Lower_utf8 in hash.h
			nwid ^= g_hashtab[wlens[i]-1][(uint8_t)'s'];
			nwid ^= g_hashtab[wlens[i]-2][(uint8_t)'\''];
			n = ((uint32_t)nwid) & mask;
		chain2:
			if ( ! m_qtableIds[n] ) goto tryPhrase2;
			if ( (m_qtableIds[n] != nwid) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots ) n=0;
				goto chain2;
			}
			qwn = m_qtableWordNums[n];
			numWords = 1;
			numQWords = 1;
			// we got one!
			goto gotMatch2;
		}

	tryPhrase2:
		// the single word did not match, try the phrase id of the
		// phrase starting at this word
		if ( pids && pids[i] ) {
			n = ((uint32_t)pids[i]) & mask;
		chain3:
			if ( ! m_qtableIds[n] ) continue;
			if ( (m_qtableIds[n] != pids[i]) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots)n = 0;
				goto chain3;
			}
			// what query word # do we match?
			qwn = m_qtableWordNums[n];
			// get that query word #
			QueryWord *qw = &m_q->m_qwords[qwn];
			// . do we match it as a single word?
			// . did they search for "bluetribe" ...?
			if ( qw->m_rawWordId == pids[i] ) {
				// set our # of words basically to 3
				numWords = 3;
				// matching a single query word
				numQWords = 1;
				// got a match
				goto gotMatch2;
			}
			if ( qw->m_phraseId == pids[i] ) {
				// might match more if we had more query
				// terms in the quote
				numWords = getNumWordsInMatch( words, i, n, &numQWords, &qwn, true );
				// this is 0 if we were an unmatched quote
				if ( numWords <= 0 ) continue;
				// got a match
				goto gotMatch2;
			}
			// otherwise we are matching a query phrase id
			log("matches: wtf? query word not matched for "
			    "highlighting... strange.");
			// assume one word for now
			numWords = 1;
			numQWords = 1;
			goto gotMatch2;
		}

		//
		// shucks, no match
		//
		continue;

	gotMatch:
		// what query word # do we match?
		qwn = m_qtableWordNums[n];

		// . how many words are in this match?
		// . it may match a single word or a phrase or both
		// . this will be 1 for just matching a single word, and
		//   multiple words for quotes/phrases. The number of words
		//   in both cases will included unmatched punctuation words
		//   and tags in between matching words.
		numQWords = 0;
		numWords = getNumWordsInMatch( words, i, n, &numQWords, &qwn, true );
		// this is 0 if we were an unmatched quote
		if ( numWords <= 0 ) continue;

	gotMatch2:
		// get query word
		QueryWord *qw = &m_q->m_qwords[qwn];
		// point to next word in the query (qwn+2; presumably qwn+1
		// is the punctuation word in between -- TODO confirm)
		QueryWord *nq = NULL;
		if ( qwn+2 < m_q->m_numWords ) nq = &m_q->m_qwords[qwn+2];

		// . if only one word matches and its a stop word, make sure
		//   it's next to the correct words in the query
		// . if phraseId is 0, that means we do not start a phrase,
		//   because stop words can start phrases if they are the
		//   first word, are capitalized, or have breaking punct before
		//   them.
		if ( numWords == 1 &&
		     ! qw->m_inQuotes &&
		     m_q->m_numWords > 2 &&
		     qw->m_wordSign == '\0' &&
		     (nq && nq->m_wordId) && // no field names can follow
		     //(qw->m_isQueryStopWord || qw->m_isStopWord ) ) {
		     // we no longer consider single alnum chars to be
		     // query stop words as stated in StopWords.cpp to fix
		     // the query 'j. w. eagan'
		     qw->m_isQueryStopWord ) {
			// if stop word does not start a phrase in the query
			// then he must have a matched word before him in the
			// document. if he doesn't then do not count as a match
			if ( qw->m_phraseId == 0LL && i-2 != lasti ) {
				// peel off anybody before us
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;
				continue;
			}
			// if we already have a match stack, we must
			// be in nextMatchWordPos
			if ( matchStack && nextMatchWordPos != i ) {
				// peel off anybody before us
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;
				//continue;
			}
			// if the phraseId is 0 and the previous word
			// is a match, then we're ok, but put us on a stack
			// so if we lose a match, we'll be erased
			// (this shadows the outer nq; the enclosing condition
			// already required the outer nq to be non-NULL, so
			// qwn+2 is a valid query word # here)
			QueryWord *nq = &m_q->m_qwords[qwn+2];
			// next match is only required if next word in query
			// is indeed valid.
			if ( nq->m_wordId && nq->m_fieldCode == 0 ) {
				nextMatchWordIdMustBeThis = nq->m_rawWordId;
				nextMatchWordPos = i + 2;
				matchStack++;
			}
		}
		else if ( matchStack ) {
			// if the last word matched was a stop word, we have to
			// match otherwise we have to remove the whole stack.
			if ( qw->m_rawWordId != nextMatchWordIdMustBeThis ||
			     i > nextMatchWordPos ) {
				m_numMatches -= matchStack;
				// ensure we never go negative like for
				// www.experian.com query
				if ( m_numMatches < 0 ) m_numMatches = 0;
			}
			// always reset this here if we're not a stop word
			matchStack = 0;
		}

		// record word # of last match
		lasti = i;

		// otherwise, store it in our m_matches[] array
		// NOTE(review): nothing in this function checks that
		// m_numMatches < MAX_MATCHES before the first store of a
		// call -- confirm m_matches[] has room when an earlier call
		// already reached MAX_MATCHES
		Match *m = &m_matches[m_numMatches];

		// the word # in the doc, and how many of 'em are in the match
		m->m_wordNum = i;
		m->m_numWords = numWords;

		// the word # in the query, and how many of 'em we match
		m->m_qwordNum = qwn;
		m->m_numQWords = numQWords;

		// get the first query word # of this match
		qw = &m_q->m_qwords[qwn];

		// convenience, used by Summary.cpp
		m->m_words = words;
		m->m_sections = sections;
		m->m_bits = bits;
		m->m_pos = pos;
		m->m_flags = flags | eflag ;

		// add to our vector. we want to know where each QueryWord
		// is. i.e. in the title, link text, meta tag, etc. so
		// the proximity algo in Summary.cpp can use that info.
		m_qwordFlags[qwn] |= flags;

		// advance
		m_numMatches++;

		// keep going until we have MAX_MATCHES
		if ( m_numMatches < MAX_MATCHES ) {
			continue;
		}

		break;
	}

	// peel off any provisional stop word matches still on the stack
	m_numMatches -= matchStack;
	if ( m_numMatches < 0 ) m_numMatches = 0;

	return true;
}
char *getMatches2 ( Needle *needles , int32_t numNeedles , char *haystack , int32_t haystackSize , char *linkPos , int32_t *needleNum , bool stopAtFirstMatch , bool *hadPreMatch , bool saveQuickTables , int32_t niceness ) { // assume not if ( hadPreMatch ) *hadPreMatch = false; // empty haystack? then no matches if ( ! haystack || haystackSize <= 0 ) return NULL; // JAB: no needles? then no matches if ( ! needles || numNeedles <= 0 ) return NULL; //char tmp[8192]; //char *t = tmp; //char *tend = tmp + 8192; // reset counts to 0 //if ( ! stopAtFirstMatch ) // for ( int32_t i=0 ; i < numNeedles ; i++ ) // needles[i].m_count = 0; // are we responsible for init'ing string lengths? this is much // faster than having to specify lengths manually. for ( int32_t i=0 ; i < numNeedles; i++ ) { // breathe QUICKPOLL(niceness); // clear needles[i].m_count = 0; needles[i].m_firstMatch = NULL; // set the string size in bytes if not provided if ( needles[i].m_stringSize == 0 ) needles[i].m_stringSize = gbstrlen(needles[i].m_string); } // . set up the quick tables. // . utf16 is not as effective here because half the bytes are zeroes! // . TODO: use a static cache of like 4 of these tables where the key // is the Needles ptr ... done int32_t numNeedlesToInit = numNeedles; char space[256 * 4 * sizeof(BITVEC)]; char *buf = NULL; BITVEC *s0; BITVEC *s1; BITVEC *s2; BITVEC *s3; /* static bool s_quickTableInit = false; static char s_qtbuf[128*(12+1)*2]; int32_t slot = -1; if(saveQuickTables) { if ( ! 
s_quickTableInit ) { s_quickTableInit = true; s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx"); } uint64_t key = (uint32_t)needles; slot = s_quickTables.getSlot(&key); if ( slot >= 0 ) { buf = s_quickTables.getValueFromSlot(slot); numNeedlesToInit = 0; } } */ if(!buf) { buf = space; memset ( buf , 0 , sizeof(BITVEC)*256*4); } /* if( useQuickTables && slot == -1 ) { //buf = (char*)mcalloc(sizeof(uint32_t)*256*5, // "matches"); if(buf) s_quickTables.addKey(&key, &buf); //sanity check, no reason why there needs to be a //limit, I just don't expect there to be this many //static needles at this point. if(s_quickTables.getNumSlotsUsed() > 32){ char *xx=NULL; *xx = 0; } } */ // try 64 bit bit vectors now since we doubled # of needles int32_t offset = 0; s0 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s1 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s2 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s3 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; BITVEC mask; // set the letter tables, s0[] through sN[], for each needle for ( int32_t i = 0 ; i < numNeedlesToInit ; i++ ) { // breathe QUICKPOLL(niceness); unsigned char *w = (unsigned char *)needles[i].m_string; unsigned char *wend = w + needles[i].m_stringSize; // BITVEC is now 64 bits mask = (1<<(i&0x3f)); // (1<<(i%64)); // if the needle is small, fill up the remaining letter tables // with its mask... so it matches any character in haystack. 
s0[(unsigned char)to_lower_a(*w)] |= mask; s0[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s1[j] |= mask; s2[j] |= mask; s3[j] |= mask; } continue; } s1[(unsigned char)to_lower_a(*w)] |= mask; s1[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s2[j] |= mask; s3[j] |= mask; } continue; } s2[(unsigned char)to_lower_a(*w)] |= mask; s2[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s3[j] |= mask; } continue; } s3[(unsigned char)to_lower_a(*w)] |= mask; s3[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; } // return a ptr to the first match if we should, this is it char *retVal = NULL; // debug vars //int32_t debugCount = 0; //int32_t pp = 0; // now find the first needle in the haystack unsigned char *p = (unsigned char *)haystack; unsigned char *pend = (unsigned char *)haystack + haystackSize; char *dend = (char *)pend; // do not breach! pend -= 4; for ( ; p < pend ; p++ ) { // breathe QUICKPOLL(niceness); //if ( (char *)p - (char *)haystack >= 12508 ) // log("hey"); // analytics... // is this a possible match? (this should be VERY fast) mask = s0[*(p+0)]; if ( ! mask ) continue; mask &= s1[*(p+1)]; if ( ! mask ) continue; mask &= s2[*(p+2)]; if ( ! mask ) continue; mask &= s3[*(p+3)]; if ( ! mask ) continue; //debugCount++; /* // display char oo[148]; char *xx ; xx = oo; //gbmemcpy ( xx , p , 8 ); for ( int32_t k = 0 ; k < 5 ; k++ ) { *xx++ = p[k]; } gbmemcpy ( xx , "..." , 3 ); xx += 3; */ // // XXX: do a hashtable lookup here so we have the candidate // matches in a chain... // XXX: for small needles which match frequently let's have // a single char hash table, a 2 byte char hash table, // etc. so if we have small needles we check the hash // in those tables first, but only if mask & SMALL_NEEDLE // is true! 
the single byte needle hash table can just // be a lookup table. just XOR the bytes together for // the hash. // XXX: just hash the mask into a table to get candidate // matches in a chain? but there's 4B hashes!! // we got a good candidate, loop through all the needles for ( int32_t j = 0 ; j < numNeedles ; j++ ) { // skip if does not match mask, will save time if ( ! ((1<<(j&0x3f)) & mask) ) continue; if( needles[j].m_stringSize > 3) { // ensure first 4 bytes matches this needle's if (needles[j].m_string[0]!=to_lower_a(*(p+0))) continue; if (needles[j].m_string[1]!=to_lower_a(*(p+1))) continue; if (needles[j].m_string[2]!=to_lower_a(*(p+2))) continue; if (needles[j].m_string[3]!=to_lower_a(*(p+3))) continue; } // get needle size int32_t msize = needles[j].m_stringSize; // can p possibly be big enough? if ( pend - p < msize ) continue; // needle is "m" now char *m = needles[j].m_string; char *mend = needles[j].m_stringSize + m; // use a tmp ptr for ptr into haystack char *d = (char *)p; // skip first 4 bytes since we know they match if(msize > 3) { d += 4; m += 4; } // loop over each char in "m" //for ( ; *m ; m++ ) { for ( ; m < mend ; m++ ) { //while ( ! *d && d < dend ) d++; //while ( ! *m && m < mend ) m++; // if we are a non alnum, that will match // any string of non-alnums, like a space // for instance. the 0 byte does not count // because it is used in utf16 a lot. this // may trigger some false matches in utf16 // but, oh well... this way "link partner" // will match "link - partner" in the haystk if ( is_wspace_a(*m) && m < mend ) { // skip all in "d" then. while (d<dend&&is_wspace_a(*d)) d++; // advance m then continue; } // make sure we match otherwise if ( *m != to_lower_a(*d) ) break; // ok, we matched, go to next d++; } // if not null, keep going if ( m < mend ) continue; // if this needle is "special" AND it occurs AFTER // linkPos, then do not consider it a match. 
this is // if we have a comment section indicator, like // "div id=\"comment" AND it occurs AFTER linkPos // (the char ptr to our link in the haystack) then // the match does not count. if ( linkPos && needles[j].m_isSection && (char *)p>linkPos ) { // record this for LinkText.cpp if ( hadPreMatch ) *hadPreMatch = true; continue; } // store ptr if NULL if ( ! needles[j].m_firstMatch ) needles[j].m_firstMatch = (char *)p; // return ptr to needle in "haystack" if ( stopAtFirstMatch ) { // ok, we got a match if ( needleNum ) *needleNum = j; //return (char *)p; retVal = (char *)p; p = pend; break; } // otherwise, just count it needles[j].m_count++; // see if we match another needle, fixes bug // of matching "anal" but not "analy[tics]" continue; // advance to next char in the haystack break; } // ok, we did not match any needles, advance p and try again } // // HACK: // // repeat above loop but for the last 4 characters in haystack!! // this fixes a electric fence mem breach core // // it is slower because we check for \0 // pend += 4; for ( ; p < pend ; p++ ) { // breathe QUICKPOLL(niceness); //if ( (char *)p - (char *)haystack >= 12508 ) // log("hey"); // is this a possible match? (this should be VERY fast) mask = s0[*(p+0)]; if ( ! mask ) continue; if ( p+1 < pend ) { mask &= s1[*(p+1)]; if ( ! mask ) continue; } if ( p+2 < pend ) { mask &= s2[*(p+2)]; if ( ! mask ) continue; } if ( p+3 < pend ) { mask &= s3[*(p+3)]; if ( ! mask ) continue; } //debugCount++; /* // display char oo[148]; char *xx ; xx = oo; //gbmemcpy ( xx , p , 8 ); for ( int32_t k = 0 ; k < 5 ; k++ ) { *xx++ = p[k]; } gbmemcpy ( xx , "..." , 3 ); xx += 3; */ // // XXX: do a hashtable lookup here so we have the candidate // matches in a chain... // XXX: for small needles which match frequently let's have // a single char hash table, a 2 byte char hash table, // etc. so if we have small needles we check the hash // in those tables first, but only if mask & SMALL_NEEDLE // is true! 
the single byte needle hash table can just // be a lookup table. just XOR the bytes together for // the hash. // XXX: just hash the mask into a table to get candidate // matches in a chain? but there's 4B hashes!! // we got a good candidate, loop through all the needles for ( int32_t j = 0 ; j < numNeedles ; j++ ) { // skip if does not match mask, will save time if ( ! ((1<<(j&0x3f)) & mask) ) continue; if( needles[j].m_stringSize > 3) { // ensure first 4 bytes matches this needle's if (needles[j].m_string[0]!=to_lower_a(*(p+0))) continue; if (!p[1] || needles[j].m_string[1]!=to_lower_a(*(p+1))) continue; if (!p[2] || needles[j].m_string[2]!=to_lower_a(*(p+2))) continue; if (!p[3] || needles[j].m_string[3]!=to_lower_a(*(p+3))) continue; } // get needle size int32_t msize = needles[j].m_stringSize; // can p possibly be big enough? if ( pend - p < msize ) continue; // needle is "m" now char *m = needles[j].m_string; char *mend = needles[j].m_stringSize + m; // use a tmp ptr for ptr into haystack char *d = (char *)p; // skip first 4 bytes since we know they match if(msize > 3) { d += 4; m += 4; } // loop over each char in "m" //for ( ; *m ; m++ ) { for ( ; m < mend ; m++ ) { //while ( ! *d && d < dend ) d++; //while ( ! *m && m < mend ) m++; // if we are a non alnum, that will match // any string of non-alnums, like a space // for instance. the 0 byte does not count // because it is used in utf16 a lot. this // may trigger some false matches in utf16 // but, oh well... this way "link partner" // will match "link - partner" in the haystk if ( is_wspace_a(*m) && m < mend ) { // skip all in "d" then. while (d<dend&&is_wspace_a(*d)) d++; // advance m then continue; } // make sure we match otherwise if ( *m != to_lower_a(*d) ) break; // ok, we matched, go to next d++; } // if not null, keep going if ( m < mend ) continue; // if this needle is "special" AND it occurs AFTER // linkPos, then do not consider it a match. 
this is // if we have a comment section indicator, like // "div id=\"comment" AND it occurs AFTER linkPos // (the char ptr to our link in the haystack) then // the match does not count. if ( linkPos && needles[j].m_isSection && (char *)p>linkPos ) { // record this for LinkText.cpp if ( hadPreMatch ) *hadPreMatch = true; continue; } // store ptr if NULL if ( ! needles[j].m_firstMatch ) needles[j].m_firstMatch = (char *)p; // return ptr to needle in "haystack" if ( stopAtFirstMatch ) { // ok, we got a match if ( needleNum ) *needleNum = j; //return (char *)p; retVal = (char *)p; p = pend; break; } // otherwise, just count it needles[j].m_count++; // advance to next char in the haystack break; } // ok, we did not match any needles, advance p and try again } //if ( debugCount > 0 ) pp = haystackSize / debugCount; //log("build: debug count = %"INT32" uc=%"INT32" hsize=%"INT32" " // "1 in %"INT32" chars matches.", // debugCount,(int32_t)isHaystackUtf16,haystackSize,pp); // before we exit, clean up return retVal; }