// for procog
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {

	// make a state
	State8 *st;
	try { st = new (State8); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageParser: new(%i): %s",
		    (int)sizeof(State8),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,
						   mstrerror(g_errno));
	}
	mnew ( st , sizeof(State8) , "PageParser" );
	st->m_freeIt = true;
	st->m_state  = NULL;
	//st->m_callback = callback;
	//st->m_q = q;
	//st->m_termFreqs       = termFreqs;
	//st->m_termFreqWeights = termFreqWeights;
	//st->m_affWeights      = affWeights;
	//st->m_total           = (score_t)-1;
	st->m_indexCode  = 0;
	st->m_blocked    = false;
	st->m_didRootDom = false;
	st->m_didRootWWW = false;
	st->m_wasRootDom = false;
	st->m_u = NULL;

	// password, too
	long pwdLen = 0;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0  ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen] = '\0';

	// save socket ptr
	st->m_s = s;
	st->m_r.copy ( r );

	// get the collection
	char *coll = r->getString ( "c" , &st->m_collLen , NULL /*default*/);
	if ( ! coll ) coll = g_conf.m_defaultColl;
	if ( ! coll ) coll = "main";
	long collLen = gbstrlen(coll);
	if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS );
	strcpy ( st->m_coll , coll );

	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 )
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;

	// default to 0 if not provided
	st->m_hopCount = r->getLong("hc",0);

	long old = r->getLong ( "old", 0 );

	// set query
	long qlen;
	char *qs = r->getString("q",&qlen,NULL);
	if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );

	// url will override docid if given
	st->m_docId = r->getLongLong ("d",-1);
	st->m_docId = r->getLongLong ("docid",st->m_docId);

	long  ulen;
	char *u = st->m_r.getString("u",&ulen,NULL);
	if ( ! u ) u = st->m_r.getString("url",&ulen,NULL);
	if ( ! u && st->m_docId == -1LL )
		return sendErrorReply ( st , EBADREQUEST );

	// set url in state class (may have length 0)
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u    = u;
	st->m_ulen = 0;
	if ( u ) st->m_ulen = gbstrlen(u);

	// should we recycle link info?
	st->m_recycle   = r->getLong("recycle"   ,1);
	st->m_recycle2  = r->getLong("recycleimp",0);
	st->m_render    = r->getLong("render"    ,0);
	st->m_recompute = r->getLong("recompute" ,0);
	// for quality computation... takes way longer cuz we have to
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that IP
	// range.
	st->m_oips      = r->getLong("oips"      ,0);
	//st->m_page = r->getLong("page",1);

	long linkInfoLen = 0;
	// default is NULL
	char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else st->m_linkInfoColl[0] = '\0';

	// set the flag in our SafeBuf class so that Words.cpp knows to show
	// html or html source depending on this value
	//st->m_xbuf.m_renderHtml = st->m_render;

	// should we use the old title rec?
	st->m_old = old;
	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();
	// no more setting the default root quality to 30; instead, if we do
	// not know it, set it to -1
	st->m_rootQuality = -1;

	// header
	//xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
	//		 "content=\"text/html; charset=utf-8\">\n");

	XmlDoc *xd = &st->m_xd;

	long isXml = r->getLong("xml",0);

	// if we got a docid, use that
	if ( st->m_docId != -1 ) {
		// return error reply if g_errno is set
		if ( ! xd->set3 ( st->m_docId, st->m_coll, 0 ) ) // niceness
			return sendErrorReply ( st , g_errno );
		// make this our callback in case something blocks
		xd->setCallback ( st , gotXmlDoc );
		xd->m_pbuf = &st->m_wbuf;
		// reset this flag
		st->m_donePrinting = false;
		// . set xd from the old title rec if recycle is true
		// . can also use XmlDoc::m_loadFromOldTitleRec flag
		//if ( st->m_recycle ) xd->m_recycleContent = true;
		xd->m_recycleContent = true; // force this on
		//xd->m_useSiteLinkBuf = true;
		//xd->m_usePageLinkBuf = true;
		if ( isXml ) xd->m_printInXml = true;
		// now tell it to fetch the old title rec
		if ( ! xd->loadFromOldTitleRec () )
			// return false if this blocks
			return false;
		return gotXmlDoc ( st );
	}

	// set this up
	SpiderRequest sreq;
	sreq.reset();
	if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
	long firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser  = 1;
	sreq.m_hopCount      = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp       = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32  = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	long  contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content          = r->getUnencodedContent    ();
		contentLen       = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	//uint8_t contentType = CT_HTML;
	//if ( isXml ) contentType = CT_XML;
	long ctype = r->getLong("ctype",CT_HTML);

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq       ,
			  NULL        ,
			  st->m_coll  ,
			  // we need this so the term table is set!
			  &st->m_wbuf , // XmlDoc::m_pbuf
			  0           , // niceness. try 0 now! was 1 (PP_NICENESS)
			  content     ,
			  false       , // deleteFromIndex
			  0           , // forced ip
			  ctype       ) )
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );

	// make this our callback in case something blocks
	xd->setCallback ( st , gotXmlDoc );

	// reset this flag
	st->m_donePrinting = false;

	// prevent a core here in the event we download the page content
	xd->m_crawlDelayValid = true;
	xd->m_crawlDelay      = 0;

	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	//if ( st->m_recycle ) xd->m_recycleContent = true;
	// only recycle if docid is given!!
	if ( st->m_recycle ) xd->m_recycleContent = true;

	//xd->m_useSiteLinkBuf = true;
	//xd->m_usePageLinkBuf = true;
	if ( isXml ) xd->m_printInXml = true;

	return gotXmlDoc ( st );
}
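// A note on the non-blocking convention used above and throughout this file:
// XmlDoc setters/loaders like set3()/set4()/loadFromOldTitleRec() return
// false either because they FAILED (g_errno is set) or because they BLOCKED
// (the registered callback fires later and must resume the work). A minimal
// sketch of the calling pattern, assuming a hypothetical handler
// "myCallback" that is not part of this file:
//
//	if ( ! xd->set3 ( docId , coll , 0 ) )
//		return sendErrorReply ( st , g_errno ); // failed outright
//	xd->setCallback ( st , myCallback );            // in case we block
//	if ( ! xd->loadFromOldTitleRec() )
//		return false;           // blocked; myCallback() resumes us
//	return myCallback ( st );       // completed inline; finish now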
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
	// get the collection
	long  collLen = 0;
	char *coll    = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// ensure collection not too big
	if ( collLen >= MAX_COLL_LEN ) {
		g_errno = ECOLLTOOBIG;
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// get the collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("query: Archived copy retrieval failed. "
		    "No collection record found for "
		    "collection \"%s\".",coll);
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// does this collection ban this IP?
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		//log("PageGet::sendDynamicReply0: permission denied for %s",
		//    iptoa(s->m_ip) );
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// . get fields from cgi field of the requested url
	// . get the search query
	long  qlen = 0;
	char *q = r->getString ( "q" , &qlen , NULL /*default*/);
	// ensure query not too big
	if ( qlen >= MAX_QUERY_LEN-1 ) {
		g_errno = EQUERYTOOBIG;
		return g_httpServer.sendErrorReply (s,500,mstrerror(g_errno));
	}
	// the docId
	long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
	// get url
	char *url = r->getString ( "u" , NULL );
	if ( docId == 0 && ! url ) {
		g_errno = EMISSINGINPUT;
		return g_httpServer.sendErrorReply (s,500,mstrerror(g_errno));
	}
	// . should we do a sequential lookup?
	// . we need to match summary here so we need to know this
	//bool seq = r->getLong ( "seq" , false );
	// restrict to root file?
	bool rtq = r->getLong ( "rtq" , false );

	// . get the titleRec
	// . TODO: redirect client to a better http server to save bandwidth
	State2 *st;
	try { st = new (State2); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageGet: new(%i): %s",
		    (int)sizeof(State2),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	mnew ( st , sizeof(State2) , "PageGet1" );
	// save the socket and if Host: is local in the Http request Mime
	st->m_socket  = s;
	st->m_isAdmin = g_conf.isCollAdmin ( s , r );
	st->m_isLocal = r->isLocal();
	st->m_docId   = docId;
	st->m_printed = false;
	// include header ... "this page cached by Gigablast on..."
	st->m_includeHeader     = r->getLong ("ih"    , true  );
	st->m_includeBaseHref   = r->getLong ("ibh"   , false );
	st->m_queryHighlighting = r->getLong ("qh"    , true  );
	st->m_strip             = r->getLong ("strip" , 0     );
	st->m_clickAndScroll    = r->getLong ("cas"   , true  );
	st->m_cnsPage           = r->getLong ("cnsp"  , true  );
	char *langAbbr = r->getString("qlang",NULL);
	st->m_langId = langUnknown;
	if ( langAbbr ) {
		uint8_t langId = getLangIdFromAbbr ( langAbbr );
		st->m_langId = langId;
	}
	strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
	// store query for query highlighting
	st->m_netTestResults = r->getLong ("rnettest", false );
	//if ( st->m_netTestResults ) {
	//	mdelete ( st , sizeof(State2) , "PageGet1" );
	//	delete ( st );
	//	return sendPageNetResult( s );
	//}
	if ( q && qlen > 0 ) strcpy ( st->m_q , q );
	else                 st->m_q[0] = '\0';
	st->m_qlen = qlen;
	//st->m_seq = seq;
	st->m_rtq = rtq;
	st->m_boolFlag  = r->getLong ("bq", 2 /*default is 2*/ );
	st->m_isBanned  = false;
	st->m_noArchive = false;
	st->m_socket = s;
	st->m_format = r->getReplyFormat();
	// default to 0 niceness
	st->m_niceness = 0;
	st->m_r.copy ( r );
	//st->m_cr = cr;
	st->m_printDisclaimer = true;
	if ( st->m_cnsPage )
		st->m_printDisclaimer = false;
	if ( st->m_strip ) // ! st->m_evbits.isEmpty() )
		st->m_printDisclaimer = false;

	// should we cache it?
	char useCache = r->getLong ( "usecache" , 1 );
	char rcache   = r->getLong ( "rcache"   , 1 );
	char wcache   = r->getLong ( "wcache"   , 1 );
	long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
	if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
	if ( rcache   == 0 )   cacheAge = 0;

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// url based?
	if ( url ) {
		SpiderRequest sreq;
		sreq.reset();
		strcpy(sreq.m_url, url );
		sreq.setDataSize();
		// this returns false if "coll" is invalid
		if ( ! xd->set4 ( &sreq , NULL , coll , NULL ,
				  st->m_niceness ) )
			goto hadSetError;
	}
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	// . use st->m_coll since XmlDoc just points to it!
	// . this returns false if "coll" is invalid
	else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
	hadSetError:
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete ( st );
		// preserve the error set4()/set3() set, if any
		if ( ! g_errno ) g_errno = ENOMEM;
		log("PageGet: set3: %s", mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
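// For reference, the cache controls above combine as follows; this table
// just restates the two "if"s on usecache/rcache (cacheAge of 0 means do
// not read from the cache):
//
//	usecache  rcache  ->  cacheAge   wcache   effect
//	   1        1         as given   as given read + write cache
//	   1        0         0          as given write only, no read
//	   0        any       0          0        bypass cache entirely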
// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageParser2 ( TcpSocket   *s ,
		       HttpRequest *r ,
		       State8      *st ,
		       long long    docId ,
		       Query       *q ,
		       // in query term space, not imap space
		       long long   *termFreqs ,
		       // in imap space
		       float       *termFreqWeights ,
		       // in imap space
		       float       *affWeights ,
		       void        *state ,
		       void       (* callback)(void *state) ) {

	//log("parser: read sock=%li",s->m_sd);

	// might be a simple request to add something to a validated.*.txt
	// file from XmlDoc::print() or XmlDoc::validateOutput()
	char *add = r->getString("add",NULL);
	//long long uh64 = r->getLongLong("uh64",0LL);
	char *uh64str = r->getString("uh64",NULL);
	//char *divTag = r->getString("div",NULL);
	if ( uh64str ) {
		// convert add to number. guard against a missing "add" parm.
		long addNum = 0;
		if ( add && to_lower_a(add[0]) == 't' ) // "true" or "false"?
			addNum = 1;
		// convert it. skip beginning "str" inserted to prevent
		// javascript from messing with the long long since it
		// was rounding it!
		//long long uh64 = atoll(uh64str);//+3);
		// urldecode that
		//long divTagLen = gbstrlen(divTag);
		//long newLen = urlDecode ( divTag , divTag , divTagLen );
		// null term?
		//divTag[newLen] = '\0';
		// do it. this is defined in XmlDoc.cpp
		//addCheckboxSpan ( uh64 , divTag , addNum );
		// make basic reply
		char *reply;
		reply = "HTTP/1.0 200 OK\r\n"
			"Connection: Close\r\n";
		// that is it! send a basic reply ok
		bool status = g_httpServer.sendDynamicPage( s ,
							    reply,
							    gbstrlen(reply),
							    -1,    // cachetime
							    false, // postreply?
							    NULL,  // ctype
							    -1,    // httpstatus
							    NULL,  // cookie
							    "utf-8");
		return status;
	}

	// make a state
	if ( st ) st->m_freeIt = false;
	if ( ! st ) {
		try { st = new (State8); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("PageParser: new(%i): %s",
			    (int)sizeof(State8),mstrerror(g_errno));
			return g_httpServer.sendErrorReply(s,500,
							   mstrerror(g_errno));
		}
		mnew ( st , sizeof(State8) , "PageParser" );
		st->m_freeIt = true;
	}
	// msg3b uses this to get a score from the query
	st->m_state           = state;
	st->m_callback        = callback;
	st->m_q               = q;
	st->m_termFreqs       = termFreqs;
	st->m_termFreqWeights = termFreqWeights;
	st->m_affWeights      = affWeights;
	//st->m_total         = (score_t)-1;
	st->m_indexCode  = 0;
	st->m_blocked    = false;
	st->m_didRootDom = false;
	st->m_didRootWWW = false;
	st->m_wasRootDom = false;
	st->m_u          = NULL;
	st->m_recompute  = false;
	//st->m_url.reset();

	// do not allow more than one to be launched at a time if in
	// a quickpoll. will cause quickpoll in quickpoll.
	g_inPageParser = true;

	// password, too
	long pwdLen = 0;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0  ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen] = '\0';

	// save socket ptr
	st->m_s = s;
	st->m_r.copy ( r );

	// get the collection
	char *coll = r->getString ( "c" , &st->m_collLen , NULL /*default*/);
	if ( st->m_collLen > MAX_COLL_LEN )
		return sendErrorReply ( st , ENOBUFS );
	if ( ! coll )
		return sendErrorReply ( st , ENOCOLLREC );
	strcpy ( st->m_coll , coll );

	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 )
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;

	// default to 0 if not provided
	st->m_hopCount = r->getLong("hc",0);

	//long ulen = 0;
	//char *u = r->getString ( "u" , &ulen , NULL /*default*/);
	long old = r->getLong ( "old", 0 );

	// set query
	long qlen;
	char *qs = r->getString("q",&qlen,NULL);
	if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );

	// set url in state class (may have length 0)
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);

	// url will override docid if given (we set st->m_u just above so
	// this check actually sees the url from the request)
	if ( ! st->m_u || ! st->m_u[0] )
		st->m_docId = r->getLongLong ("docid",-1);
	else
		st->m_docId = -1;

	// should we recycle link info?
	st->m_recycle  = r->getLong("recycle"   ,0);
	st->m_recycle2 = r->getLong("recycleimp",0);
	st->m_render   = r->getLong("render"    ,0);
	// for quality computation... takes way longer cuz we have to
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that IP
	// range.
	st->m_oips     = r->getLong("oips"      ,0);

	long linkInfoLen = 0;
	// default is NULL
	char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else st->m_linkInfoColl[0] = '\0';

	// set the flag in our SafeBuf class so that Words.cpp knows to show
	// html or html source depending on this value
	st->m_xbuf.m_renderHtml = st->m_render;

	// should we use the old title rec?
	st->m_old = old;
	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();
	// no more setting the default root quality to 30; instead, if we do
	// not know it, set it to -1
	st->m_rootQuality = -1;

	// header
	SafeBuf *xbuf = &st->m_xbuf;
	xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
			 "content=\"text/html; charset=utf-8\">\n");

	// print standard header
	g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r );

	// print the standard header for admin pages
	char *dd     = "";
	char *rr     = "";
	char *rr2    = "";
	char *render = "";
	char *oips   = "";
	char *us     = "";
	if ( st->m_u && st->m_u[0] ) us = st->m_u;
	//if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn );
	if ( st->m_old      ) dd     = " checked";
	if ( st->m_recycle  ) rr     = " checked";
	if ( st->m_recycle2 ) rr2    = " checked";
	if ( st->m_render   ) render = " checked";
	if ( st->m_oips     ) oips   = " checked";

	xbuf->safePrintf(
			 "<style>"
			 ".poo { background-color:#%s;}\n"
			 "</style>\n" ,
			 LIGHT_BLUE );

	long clen;
	char *contentParm = r->getString("content",&clen,"");

	// print the input form
	xbuf->safePrintf (
			  "<style>\n"
			  "h2{font-size: 12px; color: #666666;}\n"
			  ".gbtag { border: 1px solid gray;"
			  "background: #ffffef;display:inline;}\n"
			  ".gbcomment { border: 1px solid gray;"
			  "color: #888888; font-style:italic; "
			  "background: #ffffef;display:inline;}\n"
			  ".token { border: 1px solid gray;"
			  "background: #f0ffff;display:inline;}\n"
			  ".spam { border: 1px solid gray;"
			  "background: #af0000;"
			  "color: #ffffa0;}"
			  ".hs {color: #009900;}"
			  "</style>\n"
			  "<center>"
			  "<table %s>"
			  "<tr><td colspan=5><center><b>"
			  "Parser"
			  "</b></center></td></tr>\n"

			  "<tr class=poo>"
			  "<td>"
			  "<b>url</b>"
			  "<br><font size=-2>"
			  "Type in <b>FULL</b> url to parse."
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=text name=u value=\"%s\" size=\"40\">\n"
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Parser version to use: "
			  "</td>"
			  "<td>"
			  "<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
			  "</td>"
			  "<td>"
			  "(-1 means to use latest title rec version)<br>"
			  "</td>"
			  "</tr>"
			  */

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Hop count to use: "
			  "</td>"
			  "<td>"
			  "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> "
			  "</td>"
			  "<td>"
			  "(-1 is unknown. For root urls hopcount is always 0)<br>"
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>use cached</b>"
			  "<br><font size=-2>"
			  "Load page from cache (titledb)?"
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=old value=1%s> "
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Reparse root:"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=artr value=1%s> "
			  "</td>"
			  "<td>"
			  "Apply selected ruleset to root to update quality"
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>recycle link info</b>"
			  "<br><font size=-2>"
			  "Recycle the link info from the title rec."
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=recycle value=1%s> "
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Recycle Link Info Imported:"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=recycleimp value=1%s> "
			  "</td>"
			  "<td>"
			  "Recycle the link info imported from other coll"
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>render html</b>"
			  "<br><font size=-2>"
			  "Render document content as HTML"
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=render value=1%s> "
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Lookup outlinks' ruleset, ips, quality:"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=oips value=1%s> "
			  "</td>"
			  "<td>"
			  "To compute quality lookup IP addresses of roots "
			  "of outlinks."
			  "</td>"
			  "</tr>"

			  "<tr class=poo>"
			  "<td>"
			  "LinkInfo Coll:"
			  "</td>"
			  "<td>"
			  "<input type=text name=\"oli\" size=\"10\" value=\"\"> "
			  "</td>"
			  "<td>"
			  "Leave empty usually. Uses this coll to lookup link info."
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>optional query</b>"
			  "<br><font size=-2>"
			  "Leave empty usually. For title generation only."
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=text name=\"q\" size=\"20\" value=\"\"> "
			  "</td>"
			  "</tr>",
			  TABLE_STYLE,
			  us ,
			  dd,
			  rr,
			  render );

	xbuf->safePrintf(
			 "<tr class=poo>"
			 "<td>"
			 "<b>content type below is</b>"
			 "<br><font size=-2>"
			 "Is the content below HTML? XML? JSON?"
			 "</font>"
			 "</td>"
			 "<td>"
			 //"<input type=checkbox name=xml value=1> "
			 "<select name=ctype>\n"
			 "<option value=%li selected>HTML</option>\n"
			 "<option value=%li>XML</option>\n"
			 "<option value=%li>JSON</option>\n"
			 "</select>\n"
			 "</td>"
			 "</tr>",
			 (long)CT_HTML,
			 (long)CT_XML,
			 (long)CT_JSON );

	xbuf->safePrintf(
			 "<tr class=poo>"
			 "<td><b>content</b>"
			 "<br><font size=-2>"
			 "Use this content for the provided <i>url</i> "
			 "rather than downloading it from the web."
			 "</font>"
			 "</td>"
			 "<td>"
			 "<textarea rows=10 cols=80 name=content>"
			 "%s"
			 "</textarea>"
			 "</td>"
			 "</tr>"
			 "</table>"
			 "</center>"
			 "</form>"
			 "<br>",
			 //oips ,
			 contentParm );

	xbuf->safePrintf(
			 "<center>"
			 "<input type=submit value=Submit>"
			 "</center>"
			 );

	// just print the page if no url given
	if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );

	XmlDoc *xd = &st->m_xd;

	// set this up
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url,st->m_u);
	long firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser  = 1;
	sreq.m_hopCount      = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp       = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32  = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	long  contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content          = r->getUnencodedContent    ();
		contentLen       = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	uint8_t contentType = CT_HTML;
	if ( r->getBool("xml",0) ) contentType = CT_XML;
	contentType = r->getLong("ctype",contentType);

	// if facebook, load xml content from title rec...
	bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
	if ( isFacebook && ! content ) {
		long long docId = g_titledb.getProbableDocId(st->m_u);
		sprintf(sreq.m_url ,"%llu", docId );
		sreq.m_isPageReindex = true;
	}

	// hack: copy the content into our own buffer so we own it
	if ( content ) {
		st->m_dbuf.purge();
		st->m_dbuf.safeStrcpy(content);
		//char *data = strstr(content,"\r\n\r\n");
		//long dataPos = 0;
		//if ( data ) dataPos = (data + 4) - content;
		//st->m_dbuf.convertJSONtoXML(0,dataPos);
		//st->m_dbuf.decodeJSON(0);
		content = st->m_dbuf.getBufStart();
	}

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq       ,
			  NULL        ,
			  st->m_coll  ,
			  &st->m_wbuf ,
			  0           , // niceness. was PP_NICENESS
			  content     ,
			  false       , // deleteFromIndex
			  0           , // forced ip
			  contentType ) )
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );

	// make this our callback in case something blocks
	xd->setCallback ( st , processLoop );

	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	if ( st->m_recycle ) xd->m_recycleContent = true;

	return processLoop ( st );
}
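// Both sendPageAnalyze() and sendPageParser2() fabricate a "first IP" for
// the SpiderRequest by hashing the url, since these admin tools never do a
// real DNS lookup (hence sreq.m_fakeFirstIp = 1). A minimal sketch of that
// convention; the helper name is ours, not part of the codebase:
static long fakeFirstIp ( char *u ) {
	long firstIp = hash32n ( u );
	// 0 and -1 are invalid/reserved ip values, so nudge them to 1
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	return firstIp;
}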
bool Msg7::inject ( char    *url ,
		    long     forcedIp ,
		    char    *content ,
		    long     contentLen ,
		    bool     recycleContent,
		    uint8_t  contentType,
		    char    *coll ,
		    bool     quickReply ,
		    char    *username ,
		    char    *pwd ,
		    long     niceness,
		    void    *state ,
		    void   (*callback)(void *state),
		    long     firstIndexed,
		    long     lastSpidered,
		    long     hopCount,
		    char     newOnly,
		    short    charset,
		    char     spiderLinks,
		    char     deleteIt,
		    char     hasMime,
		    bool     doConsistencyTesting ) {

	m_quickReply = quickReply;

	// store coll
	if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
	long collLen = gbstrlen ( coll );
	if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
	strncpy ( m_coll , coll , collLen );
	m_coll [ collLen ] = '\0';

	// store user
	//long ulen = 0;
	//if ( username ) ulen = gbstrlen(username);
	//if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;}
	//if ( username ) strcpy( m_username, username );

	// store password
	//long pwdLen = 0;
	//if ( pwd ) pwdLen = gbstrlen(pwd);
	//m_pwd [ 0 ] ='\0';
	//if ( pwdLen > 31 ) pwdLen = 31;
	//if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen );
	//m_pwd [ pwdLen ] = '\0';

	// store url
	if ( ! url ) { g_errno = 0; return true; }
	long urlLen = gbstrlen(url);
	if ( urlLen > MAX_URL_LEN ) { g_errno = EBADENGINEER; return true; }
	// skip injecting if no url given! just print the admin page.
	if ( urlLen <= 0 ) return true;
	//strcpy ( m_url , url );

	if ( g_repairMode ) { g_errno = EREPAIRING; return true; }

	// send template reply if no content supplied
	if ( ! content && ! recycleContent ) {
		log("inject: no content supplied to inject command and "
		    "recycleContent is false.");
		//return true;
	}

	// clean url?
	// normalize and add www. if it needs it
	Url uu;
	uu.set ( url , gbstrlen(url) , true );
	// remove >'s i guess and store in st1->m_url[] buffer
	char cleanUrl[MAX_URL_LEN+1];
	urlLen = cleanInput ( cleanUrl,
			      MAX_URL_LEN,
			      uu.getUrl(),
			      uu.getUrlLen() );

	// this can go on the stack since set4() copies it
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url, cleanUrl );
	// parentdocid of 0
	long firstIp = hash32n(cleanUrl);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	sreq.setKey( firstIp , 0LL, false );
	sreq.m_isInjecting   = 1;
	sreq.m_isPageInject  = 1;
	sreq.m_hopCount      = hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp       = firstIp;

	// shortcut
	XmlDoc *xd = &m_xd;

	// log it now
	//log("inject: injecting doc %s",cleanUrl);

	static char s_dummy[3];
	// sometimes the content is indeed NULL...
	if ( newOnly && ! content ) {
		// don't let it be NULL because then xmldoc will
		// try to download the page!
		s_dummy[0] = '\0';
		content = s_dummy;
		//char *xx=NULL;*xx=0;
	}

	// . use the enormous power of our new XmlDoc class
	// . this returns false with g_errno set on error
	if ( //m_needsSet &&
	     ! xd->set4 ( &sreq   ,
			  NULL    ,
			  m_coll  ,
			  NULL    , // pbuf
			  // give it a niceness of 1, we have to be
			  // careful since we are a niceness of 0!!!!
			  niceness, // 1 ,
			  // inject this content
			  content ,
			  deleteIt, // false, // deleteFromIndex ,
			  forcedIp ,
			  contentType ,
			  lastSpidered ,
			  hasMime ) ) {
		// g_errno should be set if that returned false
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		return true;
	}

	// do not re-call the set
	//m_needsSet = false;

	// make this our callback in case something blocks
	xd->setCallback ( state , callback );

	xd->m_doConsistencyTesting = doConsistencyTesting;

	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	if ( recycleContent ) xd->m_recycleContent = true;

	// othercrap
	if ( firstIndexed ) {
		xd->m_firstIndexedDate      = firstIndexed;
		xd->m_firstIndexedDateValid = true;
	}
	if ( lastSpidered ) {
		xd->m_spideredTime      = lastSpidered;
		xd->m_spideredTimeValid = true;
	}
	if ( hopCount != -1 ) {
		xd->m_hopCount      = hopCount;
		xd->m_hopCountValid = true;
	}
	if ( charset != -1 && charset != csUnknown ) {
		xd->m_charset      = charset;
		xd->m_charsetValid = true;
	}

	// avoid looking up ip of each outlink to add "firstip" tag to tagdb
	// because that can be slow!!!!!!!
	xd->m_spiderLinks      = spiderLinks;
	xd->m_spiderLinks2     = spiderLinks;
	xd->m_spiderLinksValid = true;

	// . newOnly is true --> do not inject if document is already indexed!
	// . maybe just set indexCode
	xd->m_newOnly = newOnly;

	// do not re-lookup the robots.txt
	xd->m_isAllowed      = true;
	xd->m_isAllowedValid = true;
	xd->m_crawlDelay      = -1; // unknown
	xd->m_crawlDelayValid = true;

	// set this now
	g_inPageInject = true;

	// log it now
	//log("inject: indexing injected doc %s",cleanUrl);

	// . now tell it to index
	// . this returns false if blocked
	bool status = xd->indexDoc ( );

	// log it. i guess only for errors when it does not block?
	// because xmldoc.cpp::indexDoc calls logIt()
	if ( status ) xd->logIt();

	// undo it
	g_inPageInject = false;

	// note that it blocked
	//if ( ! status ) log("inject: blocked for %s",cleanUrl);

	// return false if it blocked
	return status;
}
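// A minimal usage sketch for Msg7::inject() under the signature above. The
// url, collection name, argument values and "injectDone" handler are
// made-up examples, not taken from any caller in this codebase:
//
//	static void injectDone ( void *state ) { /* reply to client here */ }
//
//	bool done = msg7->inject ( "http://www.example.com/" ,
//				   0         , // forcedIp: none
//				   NULL      , // content: download it
//				   0         , // contentLen
//				   false     , // recycleContent
//				   CT_HTML   , // contentType
//				   "main"    , // coll
//				   false     , // quickReply
//				   NULL      , // username
//				   NULL      , // pwd
//				   1         , // niceness
//				   msg7      , // state
//				   injectDone, // callback
//				   0         , // firstIndexed
//				   0         , // lastSpidered
//				   -1        , // hopCount: unknown
//				   0         , // newOnly
//				   csUnknown , // charset
//				   1         , // spiderLinks
//				   0         , // deleteIt
//				   0         , // hasMime
//				   false     ); // doConsistencyTesting
//	// returns false if it blocked; injectDone() fires on completion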