void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml , Sections *sections , XmlDoc *xd ) { // not valid for now m_thumbnailValid = false; // reset our array of image node candidates m_numImages = 0; // flag it m_setCalled = true; // strange... if ( m_imgReply ) { char *xx=NULL;*xx=0; } // save this m_xml = xml; m_pageUrl = pageUrl; // // first add any open graph candidate. // basically they page telling us the best image straight up. // int32_t node2 = -1; int32_t startNode = 0; // . field can be stuff like "summary","description","keywords",... // . if "convertHtmlEntites" is true we change < to < and > to > // . <meta property="og:image" content="http://example.com/rock2.jpg"/> // . <meta property="og:image" content="http://example.com/rock3.jpg"/> ogimgloop: char ubuf[2000]; int32_t ulen = xml->getMetaContent( ubuf, 1999, "og:image", 8, "property", startNode, &node2 ); // update this in case goto ogimgloop is called startNode = node2 + 1; // see section below for explanation of what we are storing here... if ( node2 >= 0 ) { // save it m_imageNodes[m_numImages] = node2; Query q; if ( ulen > MAX_URL_LEN ) goto ogimgloop; // set it to the full url Url iu; // use "pageUrl" as the baseUrl iu.set( pageUrl, ubuf, ulen ); // skip if invalid domain or TLD if ( iu.getDomainLen() <= 0 ) goto ogimgloop; // for looking it up on disk to see if unique or not char buf[2000]; // if we don't put in quotes it expands '|' into // the "PiiPe" operator in Query.cpp snprintf ( buf , 1999, "gbimage:\"%s\"",iu.getUrl()); // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp if ( ! q.set2 ( buf , langUnknown , false ) ) return; // sanity test if ( q.getNumTerms() != 1 ) { char *xx=0;*xx=0; } // store the termid m_termIds[m_numImages] = q.getTermId(0); // advance the counter m_numImages++; // try to get more graph images if we have some room if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop; } //m_pageSite = pageSite; // scan the words int32_t nw = words->getNumWords(); nodeid_t *tids = words->getTagIds(); int64_t *wids = words->getWordIds(); //int32_t *scores = scoresArg->m_scores; Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // not if we don't have any identified sections if ( sections && sections->m_numSections <= 0 ) sp = NULL; // the positive scored window int32_t firstPosScore = -1; int32_t lastPosScore = -1; int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT; // find positive scoring window for ( int32_t i = 0 ; i < nw ; i++ ) { // skip if in bad section if ( sp && (sp[i]->m_flags & badFlags) ) continue; if ( wids[i] != 0 ) continue; // set first positive scoring guy if ( firstPosScore == -1 ) firstPosScore = i; // keep track of last guy lastPosScore = i; } // sanity check if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; } // . pedal firstPosScore back until we hit a section boundary // . i.e. stop once we hit a front/back tag pair, like <div> and </div> char tc[512]; memset ( tc , 0 , 512 ); int32_t a = firstPosScore; for ( ; a >= 0 ; a-- ) { // get the tid nodeid_t tid = tids[a]; // remove back bit, if any tid &= BACKBITCOMP; // skip if not a tag, or a generic xml tag if ( tid <= 1 ) continue; // mark it if ( words->isBackTag(a) ) tc[tid] |= 0x02; else tc[tid] |= 0x01; // continue if not a full front/back pair if ( tc[tid] != 0x03 ) continue; // continue if not a "section" type tag (see Scores.cpp) if ( tid != TAG_DIV && tid != TAG_TEXTAREA && tid != TAG_TR && tid != TAG_TD && tid != TAG_TABLE ) continue; // ok we should stop now break; } // min is 0 if ( a < 0 ) a = 0; // now look for the image urls within this window for ( int32_t i = a ; i < lastPosScore ; i++ ) { // skip if not <img> tag if (tids[i] != TAG_IMG ) continue; // get the node num into Xml.cpp::m_nodes[] array int32_t nn = words->getNodes()[i]; // check width to rule out small decorating imgs int32_t width = xml->getLong(nn,nn+1,"width", -1 ); if ( width != -1 && width < 50 ) continue; // same with height int32_t height = xml->getLong(nn,nn+1, "height", -1 ); if ( height != -1 && height < 50 ) continue; // get the url of the image int32_t srcLen; char *src = xml->getString(nn,"src",&srcLen); // skip if none if ( srcLen <= 2 ) continue; // set it to the full url Url iu; // use "pageUrl" as the baseUrl iu.set( pageUrl, src, srcLen ); // skip if invalid domain or TLD if ( iu.getDomainLen() <= 0 ) continue; // skip if not from same domain as page url //int32_t dlen = pageUrl->getDomainLen(); //if ( iu.getDomainLen() != dlen ) continue; //if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue // get the full url char *u = iu.getUrl(); int32_t ulen = iu.getUrlLen(); // skip common crap if ( strncasestr(u,ulen,"logo" ) ) continue; if ( strncasestr(u,ulen,"comment" ) ) continue; if ( strncasestr(u,ulen,"print" ) ) continue; if ( strncasestr(u,ulen,"subscribe" ) ) continue; if ( strncasestr(u,ulen,"header" ) ) continue; if ( strncasestr(u,ulen,"footer" ) ) continue; if ( strncasestr(u,ulen,"menu" ) ) continue; if ( strncasestr(u,ulen,"button" ) ) continue; if ( strncasestr(u,ulen,"banner" ) ) continue; if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue; if ( strncasestr(u,ulen,"ads.webfeat." ) ) continue; if ( strncasestr(u,ulen,"xads.zedo." ) ) continue; // save it m_imageNodes[m_numImages] = nn; // before we lookup the image url to see if it is unique we // must first make sure that we have an adequate number of // permalinks from this same site with this same hop count. // we need at least 10 before we extract image thumbnails. char buf[2000]; // set the query Query q; // if we do have 10 or more, then we lookup the image url to // make sure it is indeed unique sprintf ( buf , "gbimage:\"%s\"",u); // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp if ( ! q.set2 ( buf , langUnknown , false ) ) // return true with g_errno set on error return; // store the termid m_termIds[m_numImages] = q.getTermId(0); // advance the counter m_numImages++; // break if full if ( m_numImages >= MAX_IMAGES ) break; } }
// . returns false if blocked, true otherwise // . sets g_errno on error // . make a web page displaying the config of this host // . call g_httpServer.sendDynamicPage() to send it bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) { // . get fields from cgi field of the requested url // . get the search query long queryLen = 0; char *query = r->getString ( "q" , &queryLen , NULL /*default*/); // ensure query not too big if ( queryLen >= MAX_QUERY_LEN ) { g_errno = EQUERYTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // get the collection long collLen = 0; char *coll = r->getString("c",&collLen); if ( ! coll || ! coll[0] ) { //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); collLen = gbstrlen(coll); } // ensure collection not too big if ( collLen >= MAX_COLL_LEN ) { g_errno = ECOLLTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // make a state State10 *st ; try { st = new (State10); } catch ( ... ) { g_errno = ENOMEM; log("PageIndexdb: new(%i): %s", sizeof(State10),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State10) , "PageIndexdb" ); // password, too long pwdLen = 0 ; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // get # of records to retreive from IndexList st->m_numRecs = r->getLong ( "numRecs" , 100 ); // use disk, tree, or cache? st->m_useDisk = r->getLong ("ud" , 0 ); st->m_useTree = r->getLong ("ut" , 0 ); st->m_useCache = r->getLong ("uc" , 0 ); st->m_useDatedb= r->getLong ("ub" , 0 ); st->m_add = r->getLong ("add", 0 ); st->m_del = r->getLong ("del", 0 ); // get the termId, if any, from the cgi vars st->m_termId = r->getLongLong ("t", 0LL ) ; // get docid and score st->m_docId = r->getLongLong ("d", 0LL ); st->m_score = r->getLong ("score", 0 ); // copy query/collection memcpy ( st->m_query , query , queryLen ); st->m_queryLen = queryLen; st->m_query [ queryLen ] ='\0'; //memcpy ( st->m_coll , coll , collLen ); //st->m_collLen = collLen; //st->m_coll [ collLen ] ='\0'; st->m_coll = coll; // save the TcpSocket st->m_socket = s; // and if the request is local/internal or not st->m_isAdmin = g_collectiondb.isAdmin ( r , s ); st->m_isLocal = r->isLocal(); st->m_r.copy ( r ); // . check for add/delete request if ( st->m_add || st->m_del ) { key_t startKey = g_indexdb.makeStartKey ( st->m_termId ); key_t endKey = g_indexdb.makeEndKey ( st->m_termId ); // construct the key to add/delete st->m_key = g_indexdb.makeKey ( st->m_termId, st->m_score , st->m_docId , st->m_del ); // make an RdbList out of the key st->m_keyList.set ( (char*)&st->m_key, sizeof(key_t), (char*)&st->m_key, sizeof(key_t), startKey, endKey, 0, false, true ); log ( LOG_INFO, "build: adding indexdb key to indexdb: " "%lx %llx", st->m_key.n1, st->m_key.n0 ); // call msg1 to add/delete key if ( ! st->m_msg1.addList ( &st->m_keyList, RDB_INDEXDB, st->m_coll, st, addedKeyWrapper, false, MAX_NICENESS ) ) return false; // continue to page if no block return gotIndexList ( st ); } if ( ! st->m_query[0] ) return gotIndexList(st); // . set query class // . a boolFlag of 0 means query is not boolean Query q; q.set2 ( query , langUnknown , true ); // 0 = boolFlag, not boolean! // reset st->m_msg36.m_termFreq = 0LL; // if query was provided, use that, otherwise use termId if ( q.getNumTerms() > 0 ) st->m_termId = q.getTermId(0); // skip if nothing else return gotTermFreq ( st ); // get the termfreq of this term! if ( ! st->m_msg36.getTermFreq ( coll , 0 , st->m_termId, st , gotTermFreqWrapper ) ) return false; // otherwise, we didn't block return gotTermFreq ( st ); }