// Handles an incoming title-rec import request.
//
// The payload in slot->m_readBuf is a 4-byte collection number followed by
// the title rec itself. The rec is loaded into a freshly allocated XmlDoc
// with set2() and then indexed with indexDoc().
//
// slot    - the request slot; its read buffer carries the payload and it is
//           the slot handed to sendReply().
// netnice - not used by this function.
//
// sendReply() is called from here on every path except when indexDoc()
// returns false (would block); for that case doneInjectingWrapper10 has been
// registered as the XmlDoc's callback with the XmlDoc itself as state.
void handleRequest7 ( UdpSlot *slot , long netnice ) {
	char *titleRec     = slot->m_readBuf;
	long  titleRecSize = slot->m_readBufSize;

	// the payload must at least hold the 4-byte collnum header, otherwise
	// the read below would run past the end of the buffer
	if ( ! titleRec || titleRecSize < 4 ) {
		log("PageInject: import failed: request too short (%li bytes)",
		    titleRecSize);
		sendReply(slot);
		return;
	}

	// NOTE(review): this reads sizeof(long) bytes but only skips 4, so it
	// assumes a 4-byte long -- confirm for the target platform
	long collnum = *(long *)titleRec;
	titleRec     += 4;
	titleRecSize -= 4;

	// resolve the collection BEFORE allocating the XmlDoc so that the
	// early return below has nothing to clean up (the doc used to be
	// allocated first and was leaked when the collection was missing).
	// a negative collnum would index before the start of m_recs.
	// NOTE(review): the upper bound of m_recs is not visible from here and
	// is therefore still unchecked -- confirm against Collectiondb
	CollectionRec *cr = NULL;
	if ( collnum >= 0 ) cr = g_collectiondb.m_recs[collnum];
	if ( ! cr ) {
		sendReply(slot);
		return;
	}

	XmlDoc *xd;
	try { xd = new (XmlDoc); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageInject: import failed: new(%i): %s",
		    (int)sizeof(XmlDoc),mstrerror(g_errno));
		sendReply(slot);
		return;
	}
	mnew ( xd, sizeof(XmlDoc) , "PageInject" );

	// we are injecting a titlerec from an import operation so use set2()
	xd->set2 ( titleRec ,
		   titleRecSize ,
		   cr->m_coll ,
		   NULL , // pbuf
		   MAX_NICENESS ,
		   NULL ); // sreq

	// log it i guess
	log("inject: importing %s",xd->m_firstUrl.getUrl());

	// the doc is its own callback state
	xd->m_state     = xd;
	xd->m_callback1 = doneInjectingWrapper10;

	xd->m_isImporting      = true;
	xd->m_isImportingValid = true;

	// hack: stash the slot on the doc
	xd->m_slot = slot;

	// then index it. return if it would block.
	if ( ! xd->indexDoc() ) return;

	// completed without blocking, so reply right away.
	// NOTE(review): xd is not released on this path -- confirm whether
	// that is intended or whether it should be freed before replying
	sendReply ( slot );
}
bool Msg7::inject ( char *url , long forcedIp , char *content , long contentLen , bool recycleContent, uint8_t contentType, char *coll , bool quickReply , char *username , char *pwd , long niceness, void *state , void (*callback)(void *state), long firstIndexed, long lastSpidered, long hopCount, char newOnly, short charset, char spiderLinks, char deleteIt, char hasMime, bool doConsistencyTesting ) { m_quickReply = quickReply; // store coll if ( ! coll ) { g_errno = ENOCOLLREC; return true; } long collLen = gbstrlen ( coll ); if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN; strncpy ( m_coll , coll , collLen ); m_coll [ collLen ] = '\0'; // store user //long ulen = 0; //if ( username ) ulen = gbstrlen(username); //if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;} //if ( username ) strcpy( m_username, username ); // store password //long pwdLen = 0; //if ( pwd ) pwdLen = gbstrlen(pwd); //m_pwd [ 0 ] ='\0'; //if ( pwdLen > 31 ) pwdLen = 31; //if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen ); //m_pwd [ pwdLen ] = '\0'; // store url if ( ! url ) { g_errno = 0; return true; } long urlLen = gbstrlen(url); if ( urlLen > MAX_URL_LEN ) {g_errno = EBADENGINEER; return true; } // skip injecting if no url given! just print the admin page. if ( urlLen <= 0 ) return true; //strcpy ( m_url , url ); if ( g_repairMode ) { g_errno = EREPAIRING; return true; } // send template reply if no content supplied if ( ! content && ! recycleContent ) { log("inject: no content supplied to inject command and " "recycleContent is false."); //return true; } // clean url? // normalize and add www. 
if it needs it Url uu; uu.set ( url , gbstrlen(url) , true ); // remove >'s i guess and store in st1->m_url[] buffer char cleanUrl[MAX_URL_LEN+1]; urlLen = cleanInput ( cleanUrl, MAX_URL_LEN, uu.getUrl(), uu.getUrlLen() ); // this can go on the stack since set4() copies it SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url, cleanUrl ); // parentdocid of 0 long firstIp = hash32n(cleanUrl); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; sreq.setKey( firstIp,0LL, false ); sreq.m_isInjecting = 1; sreq.m_isPageInject = 1; sreq.m_hopCount = hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; // shortcut XmlDoc *xd = &m_xd; // log it now //log("inject: injecting doc %s",cleanUrl); static char s_dummy[3]; // sometims the content is indeed NULL... if ( newOnly && ! content ) { // don't let it be NULL because then xmldoc will // try to download the page! s_dummy[0] = '\0'; content = s_dummy; //char *xx=NULL;*xx=0; } } // . use the enormous power of our new XmlDoc class // . this returns false with g_errno set on error if ( //m_needsSet && ! xd->set4 ( &sreq , NULL , m_coll , NULL , // pbuf // give it a niceness of 1, we have to be // careful since we are a niceness of 0!!!! niceness, // 1 , // inject this content content , deleteIt, // false, // deleteFromIndex , forcedIp , contentType , lastSpidered , hasMime )) { // g_errno should be set if that returned false if ( ! g_errno ) { char *xx=NULL;*xx=0; } return true; } // do not re-call the set //m_needsSet = false; // make this our callback in case something blocks xd->setCallback ( state , callback ); xd->m_doConsistencyTesting = doConsistencyTesting; // . set xd from the old title rec if recycle is true // . 
can also use XmlDoc::m_loadFromOldTitleRec flag if ( recycleContent ) xd->m_recycleContent = true; // othercrap if ( firstIndexed ) { xd->m_firstIndexedDate = firstIndexed; xd->m_firstIndexedDateValid = true; } if ( lastSpidered ) { xd->m_spideredTime = lastSpidered; xd->m_spideredTimeValid = true; } if ( hopCount != -1 ) { xd->m_hopCount = hopCount; xd->m_hopCountValid = true; } if ( charset != -1 && charset != csUnknown ) { xd->m_charset = charset; xd->m_charsetValid = true; } // avoid looking up ip of each outlink to add "firstip" tag to tagdb // because that can be slow!!!!!!! xd->m_spiderLinks = spiderLinks; xd->m_spiderLinks2 = spiderLinks; xd->m_spiderLinksValid = true; // . newOnly is true --> do not inject if document is already indexed! // . maybe just set indexCode xd->m_newOnly = newOnly; // do not re-lookup the robots.txt xd->m_isAllowed = true; xd->m_isAllowedValid = true; xd->m_crawlDelay = -1; // unknown xd->m_crawlDelayValid = true; // set this now g_inPageInject = true; // log it now //log("inject: indexing injected doc %s",cleanUrl); // . now tell it to index // . this returns false if blocked bool status = xd->indexDoc ( ); // log it. i guess only for errors when it does not block? // because xmldoc.cpp::indexDoc calls logIt() if ( status ) xd->logIt(); // undo it g_inPageInject = false; // note that it blocked //if ( ! status ) log("inject: blocked for %s",cleanUrl); // return false if it blocked return status; }