//////////////////////////////////////////////////////////////////////////
// check whether the user requesting to log in from the client side is in the repository
// if so, return a Login type message
// else return a warning.
Message MessageHandler::LoginMessage()
{
    strVal path = _repositoryPath + "Users.xml";
    XmlDoc doc;
    if (!doc.LoadXmlFile(path))
        return WarningMessage("Users file broken, please contact administrator.");
    strVal name = GetMessageName();
    strVal tag = "User";
    strVal content;
    xmlRep rep;
    vector<XmlDoc> users = doc.Children(tag);
    for (size_t i = 0; i < users.size(); ++i)
    {
        strVal userName = users[i].Children("Name")[0].InnerText();
        if (userName == name)
        {
            _loginUsers[_curIP] = name;
            return _msg;
        }
    }
    return WarningMessage("User is not in the repository.");
}
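// A minimal sketch of the Users.xml layout that LoginMessage() above appears to
// expect, inferred from doc.Children("User") and the nested "Name" lookup; the
// root element name and the sample user names are assumptions, not taken from
// the source:
//
//   <Users>
//     <User>
//       <Name>alice</Name>
//     </User>
//     <User>
//       <Name>bob</Name>
//     </User>
//   </Users>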
void handleRequest7 ( UdpSlot *slot , long netnice ) {
    //m_state    = state;
    //m_callback = callback;

    // shortcut
    XmlDoc *xd;
    try { xd = new (XmlDoc); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("PageInject: import failed: new(%i): %s",
            (int)sizeof(XmlDoc),mstrerror(g_errno));
        sendReply(slot);
        return;
    }
    mnew ( xd, sizeof(XmlDoc) , "PageInject" );
    //xd->reset();

    char *titleRec     = slot->m_readBuf;
    long  titleRecSize = slot->m_readBufSize;

    long collnum = *(long *)titleRec;
    titleRec     += 4;
    titleRecSize -= 4;

    CollectionRec *cr = g_collectiondb.m_recs[collnum];
    if ( ! cr ) { sendReply(slot); return; }

    // if injecting a titlerec from an import operation use set2()
    //if ( m_sbuf.length() > 0 ) {
    xd->set2 ( titleRec,      //m_sbuf.getBufStart() ,
               titleRecSize,  //m_sbuf.length() ,
               cr->m_coll ,
               NULL,          // pbuf
               MAX_NICENESS ,
               NULL );        // sreq

    // log it i guess
    log("inject: importing %s",xd->m_firstUrl.getUrl());

    // call this when done indexing
    //xd->m_masterState = this;
    //xd->m_masterLoop  = doneInjectingWrapper9;
    xd->m_state     = xd;//this;
    xd->m_callback1 = doneInjectingWrapper10;

    xd->m_isImporting      = true;
    xd->m_isImportingValid = true;

    // hack this
    xd->m_slot = slot;

    // then index it
    if ( ! xd->indexDoc() )
        // return if would block
        return;

    // all done?
    //return true;
    sendReply ( slot );
}
bool gotReplyWrapperxd ( void *state ) {
    // grab it
    XmlDoc *xd = (XmlDoc *)state;
    // get it
    UdpSlot *slot = (UdpSlot *)xd->m_slot;
    // parse the request
    Msg20Request *req = (Msg20Request *)slot->m_readBuf;
    // print time
    long long took = gettimeofdayInMilliseconds() - xd->m_setTime;
    // if there is a backlog of msg20 summary generation requests this
    // is really not the cpu it took to make the summary, but how long it
    // took to get the reply. this request might have had to wait for the
    // other summaries to finish computing before it got its turn,
    // meanwhile its clock was ticking. TODO: make this better?
    // only do for niceness 0 otherwise it gets interrupted by quickpoll
    // and can take a long time.
    if ( (req->m_isDebug || took > 100) && req->m_niceness == 0 )
        log("query: Took %lli ms to compute summary for d=%lli u=%s "
            "niceness=%li",
            took, xd->m_docId,xd->m_firstUrl.m_url, xd->m_niceness );
    // error?
    if ( g_errno ) { xd->m_reply.sendReply ( xd ); return true; }
    // this should not block now
    Msg20Reply *reply = xd->getMsg20Reply ( );
    // sanity check, should not block here now
    if ( reply == (void *)-1 ) { char *xx=NULL;*xx=0; }
    // NULL means error, -1 means blocked. on error g_errno should be set
    if ( ! reply && ! g_errno ) { char *xx=NULL;*xx=0; }
    // send it off. will send an error reply if g_errno is set
    return reply->sendReply ( xd );
}
TEST_F(XmlDocTest, PosdbGetMetaListNewDoc) {
    const char *url = "http://www.example.test/index.html";
    char contentNew[] = "<html><head><title>my title</title></head><body>new document</body></html>";

    XmlDoc xmlDocNew;
    initializeDocForPosdb(&xmlDocNew, url, contentNew);

    xmlDocNew.getMetaList(false);

    auto metaListKeys = parseMetaList(xmlDocNew.m_metaList, xmlDocNew.m_metaListSize);

    // make sure positive special key is there (to clear out existing negative special key)
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, false));
    EXPECT_FALSE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, true));

    // make sure title & body text is indexed
    // title
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "mytitle"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("mytitle"), false));

    // body
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("new"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("document"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("newdocument"), false));

    /// @todo ALC add other terms
}
//////////////////////////////////////////////////////////////////////////
// Extract the InnerText of the Name tag in the message
strVal MessageHandler::GetMessageName()
{
    strVal tag = "Name";
    XmlDoc doc = _msg.Doc();
    vector<XmlDoc> names = doc.Children(tag);
    if (names.size() <= 0)
        return "";
    return names[0].InnerText();
}
void CClientInfo::Config(const string& fileName)
{
    ifstream inFile(fileName.c_str(), ifstream::in);
    string inXml, inLine;
    while (getline(inFile, inLine))
        inXml += inLine;

    XmlVec confData(inXml.begin(), inXml.end());
    confData.push_back('\0');

    XmlDoc xmlDoc;
    try {
        xmlDoc.parse<0>(&confData[0]);
    } catch (...) {
        LOG("ERROR: failed to load the server config file.");
        return;
    }

    XmlNodePtr pNode = NULL;
    if ((pNode = xmlDoc.first_node("server_list"))) {
        XmlNodePtr pItem = pNode->first_node("server");
        while (pItem) {
            int pId = HAS_ATTR(pItem, "id") ? atoi(ATTR_VALUE(pItem, "id")) : -1;
            string ipStr = HAS_ATTR(pItem, "ip") ? ATTR_VALUE(pItem, "ip") : "127.0.0.1";
            int port = HAS_ATTR(pItem, "port") ? atoi(ATTR_VALUE(pItem, "port")) : 12345;
            m_svrMap[pId] = pair<string, int>(ipStr, port);
            pItem = pItem->next_sibling("server");
        }
    }

    if ((pNode = xmlDoc.first_node("parameters"))) {
        m_connNum = HAS_NODE(pNode, "conn_num") ? atoi(NODE_VALUE(pNode, "conn_num")) : 1;
        m_fixRate = HAS_NODE(pNode, "fix_rate") ? atoi(NODE_VALUE(pNode, "fix_rate")) : 0;
        m_initTimeout = HAS_NODE(pNode, "init_timeout") ? atoi(NODE_VALUE(pNode, "init_timeout")) : 20;
        m_videoLen = HAS_NODE(pNode, "video_len") ? atoi(NODE_VALUE(pNode, "video_len")) : 278;
        m_initBufferSize = HAS_NODE(pNode, "init_buffer_size") ? atoi(NODE_VALUE(pNode, "init_buffer_size")) : 10;
    }

    LOG("--------------------------------------------------");
    svrMap_t::const_iterator cIt = m_svrMap.begin();
    for (; cIt != m_svrMap.end(); ++cIt) {
        LOG("|server: id=%d, %s:%d.", (*cIt).first,
            (*cIt).second.first.c_str(), (*cIt).second.second);
    }
    LOG("|number of connections: %d.", m_connNum);
    LOG("|video length: %d.", m_videoLen);
    LOG("|fix rate: %d.", m_fixRate);
    LOG("|init buffer timeout: %d.", m_initTimeout);
    LOG("|init buffer size: %d.", m_initBufferSize);
    LOG("--------------------------------------------------");

    xmlDoc.clear();

    m_rateVec.push_back(350);
    m_rateVec.push_back(470);
    m_rateVec.push_back(630);
    m_rateVec.push_back(845);
    m_rateVec.push_back(1130);
    m_rateVec.push_back(1520);
    m_rateVec.push_back(2040);
}
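// A minimal example of the config file shape CClientInfo::Config() above appears
// to expect, inferred from the node and attribute names it reads; the sample
// values and the exact top-level layout are assumptions, not taken from the
// source:
//
//   <server_list>
//     <server id="0" ip="192.168.1.10" port="12345"/>
//     <server id="1" ip="192.168.1.11" port="12346"/>
//   </server_list>
//   <parameters>
//     <conn_num>2</conn_num>
//     <fix_rate>0</fix_rate>
//     <init_timeout>20</init_timeout>
//     <video_len>278</video_len>
//     <init_buffer_size>10</init_buffer_size>
//   </parameters>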
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the titleRec of "docId" given via cgi
// . call g_httpServer.sendDynamicPage() to send it
bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) {
    // get the docId from the cgi vars
    long long docId = r->getLongLong ("d", 0LL );
    // set up a msg22 to get the next titleRec
    State4 *st ;
    try { st = new (State4); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("PageTitledb: new(%i): %s",
            (int)sizeof(State4),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    mnew ( st , sizeof(State4) , "PageTitledb");
    // save the socket
    st->m_socket = s;
    // copy it
    st->m_r.copy ( r );
    // remember if http request is internal/local or not
    st->m_isRootAdmin = g_conf.isCollAdmin ( s , r );
    st->m_isLocal     = r->isLocal();
    st->m_docId       = docId;
    // password, too
    st->m_pwd = r->getString ( "pwd" );
    // get the collection
    long  collLen = 0;
    char *coll    = st->m_r.getString("c",&collLen);
    if ( ! coll || ! coll[0] ) {
        //coll = g_conf.m_defaultColl;
        coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
        collLen = gbstrlen(coll);
    }
    st->m_coll    = coll;
    st->m_collLen = collLen;
    // just print page if no docid provided
    if ( ! docId ) return gotTitleRec ( st );
    // get the handy XmlDoc
    XmlDoc *xd = &st->m_xd;
    // use 0 for niceness
    xd->set3 ( docId , coll , 0 );
    // callback
    xd->setCallback ( st , gotTitleRec );
    // . and tell it to load from old title rec
    // . this sets all the member vars from it and also sets
    //   m_titleRecBuf to contain the actual compressed title rec
    if ( ! xd->loadFromOldTitleRec ( ) ) return false;
    // we got it without blocking. cached?
    return gotTitleRec ( st );
}
// when XmlDoc::inject() completes it calls this
void doneInjectingWrapper9 ( void *state ) {
    Msg7 *msg7 = (Msg7 *)state;
    msg7->m_inUse = false;
    // shortcut
    XmlDoc *xd = &msg7->m_xd;
    GigablastRequest *gr = &msg7->m_gr;
    if ( gr->m_getSections && ! gr->m_gotSections ) {
        // do not re-call
        gr->m_gotSections = true;
        // new callback now, same state
        xd->m_callback1 = doneInjectingWrapper9;
        // and if it blocks internally, it will call
        // getInlineSectionVotingBuf until it completes then it will
        // call xd->m_callback
        xd->m_masterLoop = NULL;
        // get sections
        SafeBuf *buf = xd->getInlineSectionVotingBuf();
        // if it returns -1 wait for it to call wrapper10 when done
        if ( buf == (void *)-1 ) return;
        // error?
        if ( ! buf )
            log("inject: error getting sections: %s",
                mstrerror(g_errno));
    }
 loop:
    // if we were injecting delimited documents...
    char *delim = gr->m_contentDelim;
    if ( delim && ! delim[0] ) delim = NULL;
    if ( delim && msg7->m_start ) {
        // do another injection. returns false if it blocks
        if ( ! msg7->inject ( msg7->m_state , msg7->m_callback ) )
            return;
    }
    if ( msg7->m_start && delim ) goto loop;
    // and we call the original caller
    msg7->m_callback ( msg7->m_state );
}
XmlType getXmlTypeNoThrow(const XmlDoc& doc) //throw()
{
    if (doc.root().getNameAs<std::string>() == "FreeFileSync")
    {
        std::string type;
        if (doc.root().getAttribute("XmlType", type))
        {
            if (type == "GUI")
                return XML_TYPE_GUI;
            else if (type == "BATCH")
                return XML_TYPE_BATCH;
            else if (type == "GLOBAL")
                return XML_TYPE_GLOBAL;
        }
    }
    return XML_TYPE_OTHER;
}
//////////////////////////////////////////////////////////////////////////
// check whether the uploaded file is not in the repository
// or the user owns the file
bool MessageHandler::OKtoCheckin(strVal fileName)
{
    fileName = GetKeyName(fileName) + ".xml";
    XmlDoc doc;
    // package not in the repository, ok to check in
    if (!doc.LoadXmlFile(_repositoryPath + _metaFolder + fileName))
        if (!doc.LoadXmlFile(_repositoryPath + _checkinFoler + fileName))
            return true;
    vector<XmlDoc> elems = doc.Children("owner");
    if (elems.size() <= 0)
        return false;
    // checkin is allowed only if the user is the package owner
    strVal owner = elems[0].InnerText();
    return owner == _curUser;
}
void setXmlType(XmlDoc& doc, XmlType type) //throw()
{
    switch (type)
    {
        case XML_TYPE_GUI:
            doc.root().setAttribute("XmlType", "GUI");
            break;
        case XML_TYPE_BATCH:
            doc.root().setAttribute("XmlType", "BATCH");
            break;
        case XML_TYPE_GLOBAL:
            doc.root().setAttribute("XmlType", "GLOBAL");
            break;
        case XML_TYPE_OTHER:
            assert(false);
            break;
    }
}
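// A minimal usage sketch for the two helpers above. It assumes "doc" is an
// already-loaded XmlDoc whose root element is named "FreeFileSync", which is
// what getXmlTypeNoThrow() checks for; the function name below is illustrative
// and not part of the original code.
void tagAsGuiConfigIfUntyped(XmlDoc& doc) //throw()
{
    if (getXmlTypeNoThrow(doc) == XML_TYPE_OTHER) // no or unrecognized "XmlType" attribute yet
        setXmlType(doc, XML_TYPE_GUI);            // stamp XmlType="GUI" onto the root element
}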
TEST_F(XmlDocTest, PosdbGetMetaListChangedDoc) {
    const char *url = "http://www.example.test/index.html";
    char contentOld[] = "<html><head><title>my title</title></head><body>old document</body></html>";
    char contentNew[] = "<html><head><title>my title</title></head><body>new document</body></html>";

    XmlDoc *xmlDocOld = new XmlDoc();
    mnew(xmlDocOld, sizeof(*xmlDocOld), "XmlDoc");
    initializeDocForPosdb(xmlDocOld, url, contentOld);

    XmlDoc xmlDocNew;
    initializeDocForPosdb(&xmlDocNew, url, contentNew);
    xmlDocNew.m_oldDocValid = true;
    xmlDocNew.m_oldDoc = xmlDocOld;

    xmlDocNew.getMetaList(false);

    auto metaListKeys = parseMetaList(xmlDocNew.m_metaList, xmlDocNew.m_metaListSize);

    // make sure no special key is inserted (positive or negative)
    EXPECT_FALSE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, false));
    EXPECT_FALSE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, true));

    // make sure title & body text is indexed (with difference between old & new document deleted)
    // title
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "mytitle"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("mytitle"), false));

    // body
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("new"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("document"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("newdocument"), false));

    // deleted terms
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("old"), true));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("olddocument"), true));

    /// @todo ALC add other terms
}
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
    // get it
    State60 *st = (State60 *)state;
    // get the tcp socket from the state
    TcpSocket *s = st->m_socket;
    // get it
    XmlDoc *xd = &st->m_xd;
    if ( ! xd->m_loaded ) {
        // setting just the docid. niceness is 0.
        xd->set3 ( st->m_docId , st->m_coll , 0 );
        // callback
        xd->setCallback ( state , processLoop );
        // . and tell it to load from the old title rec
        // . if it returns false it blocked and will call our callback
        //   processLoop() when it completes
        if ( ! xd->loadFromOldTitleRec ( ) ) return false;
    }
    if ( g_errno ) return sendErrorReply ( st , g_errno );
    // get the utf8 content
    char **utf8 = xd->getUtf8Content();
    //int32_t len = xd->size_utf8Content - 1;
    // wait if blocked???
    if ( utf8 == (void *)-1 ) return false;
    // strange
    if ( xd->size_utf8Content <= 0 )
        return sendErrorReply ( st , EBADENGINEER );
    // alloc error?
    if ( ! utf8 ) return sendErrorReply ( st , g_errno );
    // get this host
    Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
    if ( ! h ) return sendErrorReply ( st , EBADENGINEER );
    // make it into an editable page now for the turk guy
    return sendTurkPageReply ( st );
}
//////////////////////////////////////////////////////////////////////////
// generate a Dependency type message
Message MessageHandler::DependencyMessage()
{
    const strVal name = GetMessageName();
    if (name == "*.*")
        return AllPackageMessage();
    strVal path = _repositoryPath + _metaFolder + name + ".xml";
    strVal tag = "references";
    strVal content;
    XmlDoc doc;
    xmlRep rep;
    if (!doc.LoadXmlFile(path))
        return WarningMessage(name + ": the package is not in the repository.");
    vector<XmlDoc> refs = doc.Children(tag);
    if (refs.size() <= 0)
        return WarningMessage(name + ": the package does not depend on other packages.");
    // find all dependency children under the references tag
    refs = refs[0].Children();
    if (refs.size() <= 0)
        return WarningMessage(name + ": the package does not depend on other packages.");
    tag = "Name";
    for (size_t i = 0; i < refs.size(); ++i)
    {
        content = refs[i].InnerText();
        content = GetKeyName(content);
        xmlElem elem(tag, content);
        rep.addSibling(elem);
    }
    tag = MsgType::EnumToString(MsgType::Dependency);
    rep.makeParent(tag);
    return Message(rep.xmlStr());
}
void XmlSignature::sign(XmlDoc &xml, const Sec::Key &key)
{
    try
    {
        XmlElementPtr rootNode{ xml.root() };
        XmlElementPtr sigNode{ std::make_shared<XmlElement>(
            *_signature->createBlankSignature(xml, CANON_C14N_COM, SIGNATURE_RSA, HASH_SHA1)) };
        rootNode->add(*sigNode);

        DSIGReference *ref = _signature->createReference(MAKE_UNICODE_STRING(""));
        ref->appendEnvelopedSignatureTransform();

        _signature->setSigningKey(key);
        _signature->sign();
        _signed = true;
    }
    catch (XSECException &ex)
    {
        rethrowWithMessage(ex, "An error occurred while signing XML");
    }
}
//////////////////////////////////////////////////////////////////////////
// generate a Package type message
Message MessageHandler::PackageMessage()
{
    const strVal name = GetMessageName();
    strVal path = _repositoryPath + _metaFolder + name + ".xml";
    XmlDoc doc;
    strVal prefix = "\n ";
    if (!doc.LoadXmlFile(path))
        return WarningMessage(name + ": the package is not in the repository.");
    vector<XmlDoc> refs;
    strVal packInfo;
    if ((refs = doc.Children("head")).size() > 0)
        packInfo += prefix + refs[0].InnerText();
    if ((refs = doc.Children("implement")).size() > 0)
        packInfo += prefix + refs[0].InnerText();
    if (packInfo.size() > 0)
        packInfo = "Package files: " + packInfo;
    else
        return WarningMessage("Package metadata file broken, please contact administrator.");
    strVal dependency = "\n \nDependencies: ";
    if ((refs = doc.Children("references")).size() > 0)
        if ((refs = refs[0].Children("reference")).size() > 0)
            for (size_t i = 0; i < refs.size(); ++i)
                dependency += prefix + refs[i].InnerText();
    strVal owner = "\n \nPackage Owner: ";
    if ((refs = doc.Children("owner")).size() > 0)
        owner += prefix + refs[0].InnerText();
    xmlRep rep(xmlElem("Name", packInfo + dependency + owner));
    rep.makeParent(MsgType::EnumToString(MsgType::Package));
    return Message(rep.xmlStr());
}
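// A minimal sketch of the per-package metadata file that OKtoCheckin(),
// DependencyMessage() and PackageMessage() above appear to read. The element
// names come from the Children() calls in those functions; the root element
// name and the sample values are assumptions, not taken from the source:
//
//   <package>
//     <head>MyPackage.h</head>
//     <implement>MyPackage.cpp</implement>
//     <references>
//       <reference>OtherPackage</reference>
//     </references>
//     <owner>alice</owner>
//   </package>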
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
    // get the collection
    long  collLen = 0;
    char *coll    = r->getString("c",&collLen);
    if ( ! coll || ! coll[0] ) {
        //coll = g_conf.m_defaultColl;
        coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
        collLen = gbstrlen(coll);
    }
    // ensure collection not too big
    if ( collLen >= MAX_COLL_LEN ) {
        g_errno = ECOLLTOOBIG;
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // get the collection rec
    CollectionRec *cr = g_collectiondb.getRec ( coll );
    if ( ! cr ) {
        g_errno = ENOCOLLREC;
        log("query: Archived copy retrieval failed. "
            "No collection record found for "
            "collection \"%s\".",coll);
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // does this collection ban this IP?
    if ( ! cr->hasSearchPermission ( s ) ) {
        g_errno = ENOPERM;
        //log("PageGet::sendDynamicReply0: permission denied for %s",
        //    iptoa(s->m_ip) );
        g_msg = " (error: permission denied)";
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // . get fields from cgi field of the requested url
    // . get the search query
    long  qlen = 0;
    char *q = r->getString ( "q" , &qlen , NULL /*default*/);
    // ensure query not too big
    if ( qlen >= MAX_QUERY_LEN-1 ) {
        g_errno=EQUERYTOOBIG;
        return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
    }
    // the docId
    long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
    // get url
    char *url = r->getString ( "u",NULL);
    if ( docId == 0 && ! url ) {
        g_errno = EMISSINGINPUT;
        return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
    }
    // . should we do a sequential lookup?
    // . we need to match summary here so we need to know this
    //bool seq = r->getLong ( "seq" , false );
    // restrict to root file?
    bool rtq = r->getLong ( "rtq" , false );
    // . get the titleRec
    // . TODO: redirect client to a better http server to save bandwidth
    State2 *st ;
    try { st = new (State2); }
    catch (... ) {
        g_errno = ENOMEM;
        log("PageGet: new(%i): %s",
            (int)sizeof(State2),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    mnew ( st , sizeof(State2) , "PageGet1" );
    // save the socket and if Host: is local in the Http request Mime
    st->m_socket  = s;
    st->m_isAdmin = g_conf.isCollAdmin ( s , r );
    st->m_isLocal = r->isLocal();
    st->m_docId   = docId;
    st->m_printed = false;
    // include header ... "this page cached by Gigablast on..."
    st->m_includeHeader     = r->getLong ("ih"    , true );
    st->m_includeBaseHref   = r->getLong ("ibh"   , false );
    st->m_queryHighlighting = r->getLong ("qh"    , true );
    st->m_strip             = r->getLong ("strip" , 0 );
    st->m_clickAndScroll    = r->getLong ("cas"   , true );
    st->m_cnsPage           = r->getLong ("cnsp"  , true );
    char *langAbbr = r->getString("qlang",NULL);
    st->m_langId = langUnknown;
    if ( langAbbr ) {
        uint8_t langId = getLangIdFromAbbr ( langAbbr );
        st->m_langId = langId;
    }
    strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
    // store query for query highlighting
    st->m_netTestResults = r->getLong ("rnettest", false );
    //if( st->m_netTestResults ) {
    //    mdelete ( st , sizeof(State2) , "PageGet1" );
    //    delete ( st );
    //    return sendPageNetResult( s );
    //}
    if ( q && qlen > 0 ) strcpy ( st->m_q , q );
    else                 st->m_q[0] = '\0';
    st->m_qlen = qlen;
    //st->m_seq = seq;
    st->m_rtq = rtq;
    st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ );
    st->m_isBanned  = false;
    st->m_noArchive = false;
    st->m_socket = s;
    st->m_format = r->getReplyFormat();
    // default to 0 niceness
    st->m_niceness = 0;
    st->m_r.copy ( r );
    //st->m_cr = cr;
    st->m_printDisclaimer = true;
    if ( st->m_cnsPage ) st->m_printDisclaimer = false;
    if ( st->m_strip ) // ! st->m_evbits.isEmpty() )
        st->m_printDisclaimer = false;
    // should we cache it?
    char useCache = r->getLong ( "usecache" , 1 );
    char rcache   = r->getLong ( "rcache"   , 1 );
    char wcache   = r->getLong ( "wcache"   , 1 );
    long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
    if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
    if ( rcache   == 0 )   cacheAge = 0;
    // . fetch the TitleRec
    // . a max cache age of 0 means not to read from the cache
    XmlDoc *xd = &st->m_xd;
    // url based?
    if ( url ) {
        SpiderRequest sreq;
        sreq.reset();
        strcpy(sreq.m_url, url );
        sreq.setDataSize();
        // this returns false if "coll" is invalid
        if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
            goto hadSetError;
    }
    // . when getTitleRec() is called it will load the old one
    //   since XmlDoc::m_setFromTitleRec will be true
    // . niceness is 0
    // . use st->m_coll since XmlDoc just points to it!
    // . this returns false if "coll" is invalid
    else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
    hadSetError:
        mdelete ( st , sizeof(State2) , "PageGet1" );
        delete ( st );
        g_errno = ENOMEM;
        log("PageGet: set3: %s", mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // if it blocks while it loads title rec, it will re-call this routine
    xd->setCallback ( st , processLoopWrapper );
    // good to go!
    return processLoop ( st );
}
// returns false if blocked, true otherwise bool processLoop ( void *state ) { // get it State2 *st = (State2 *)state; // get the tcp socket from the state TcpSocket *s = st->m_socket; // get it XmlDoc *xd = &st->m_xd; if ( ! xd->m_loaded ) { // setting just the docid. niceness is 0. //xd->set3 ( st->m_docId , st->m_coll , 0 ); // callback xd->setCallback ( state , processLoop ); // . and tell it to load from the old title rec // . this sets xd->m_oldTitleRec/m_oldTitleRecSize // . this sets xd->ptr_* and all other member vars from // the old title rec if found in titledb. if ( ! xd->loadFromOldTitleRec ( ) ) return false; } if ( g_errno ) return sendErrorReply ( st , g_errno ); // now force it to load old title rec //char **tr = xd->getTitleRec(); SafeBuf *tr = xd->getTitleRecBuf(); // blocked? return false if so. it will call processLoop() when it rets if ( tr == (void *)-1 ) return false; // we did not block. check for error? this will free "st" too. if ( ! tr ) return sendErrorReply ( st , g_errno ); // if title rec was empty, that is a problem if ( xd->m_titleRecBuf.length() == 0 ) return sendErrorReply ( st , ENOTFOUND); // set callback char *na = xd->getIsNoArchive(); // wait if blocked if ( na == (void *)-1 ) return false; // error? if ( ! na ) return sendErrorReply ( st , g_errno ); // forbidden? allow turkeys through though... if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE ); SafeBuf *sb = &st->m_sb; // &page=4 will print rainbow sections if ( ! st->m_printed && st->m_r.getLong("page",0) ) { // do not repeat this call st->m_printed = true; // this will call us again since we called // xd->setCallback() above to us if ( ! xd->printDocForProCog ( sb , &st->m_r ) ) return false; } char *contentType = "text/html"; char format = st->m_format; if ( format == FORMAT_XML ) contentType = "text/xml"; if ( format == FORMAT_JSON ) contentType = "application/json"; // if we printed a special page (like rainbow sections) then return now if ( st->m_printed ) { bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen, sb->getBufStart(), sb->getLength(), -1,false, //"text/html", contentType, -1, NULL, "utf8" ); // nuke state2 mdelete ( st , sizeof(State2) , "PageGet1" ); delete (st); return status; } /* // this was calling XmlDoc and setting sections, etc. to // get the SpiderReply junk... no no no // is it banned or filtered? this ignores the TagRec in the titleRec // and uses msg8a to get it fresh instead char *vi = xd->getIsFiltered();//Visible( ); // wait if blocked if ( vi == (void *)-1 ) return false; // error? if ( ! vi ) return sendErrorReply ( st , g_errno ); // banned? if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED); */ // get the utf8 content char **utf8 = xd->getUtf8Content(); //long len = xd->size_utf8Content - 1; // wait if blocked??? if ( utf8 == (void *)-1 ) return false; // strange if ( xd->size_utf8Content<=0) { log("pageget: utf8 content <= 0"); return sendErrorReply(st,EBADENGINEER ); } // alloc error? if ( ! utf8 ) return sendErrorReply ( st , g_errno ); // get this host Host *h = g_hostdb.getHost ( g_hostdb.m_hostId ); if ( ! 
h ) { log("pageget: hostid %li is bad",g_hostdb.m_hostId); return sendErrorReply(st,EBADENGINEER ); } char *content = xd->ptr_utf8Content; long contentLen = xd->size_utf8Content - 1; // shortcut char strip = st->m_strip; // alloc buffer now //char *buf = NULL; //long bufMaxSize = 0; //bufMaxSize = len + ( 32 * 1024 ) ; //bufMaxSize = contentLen + ( 32 * 1024 ) ; //buf = (char *)mmalloc ( bufMaxSize , "PageGet2" ); //char *p = buf; //char *bufEnd = buf + bufMaxSize; //if ( ! buf ) { // return sendErrorReply ( st , g_errno ); //} // for undoing the header //char *start1 = p; long startLen1 = sb->length(); // we are always utfu if ( strip != 2 ) sb->safePrintf( "<meta http-equiv=\"Content-Type\" " "content=\"text/html;charset=utf8\">\n"); // base href //Url *base = &xd->m_firstUrl; //if ( xd->ptr_redirUrl.m_url[0] ) // base = &xd->m_redirUrl; char *base = xd->ptr_firstUrl; if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl; //Url *redir = *xd->getRedirUrl(); if ( strip != 2 ) { sb->safePrintf ( "<BASE HREF=\"%s\">" , base ); //p += gbstrlen ( p ); } // default colors in case css files missing if ( strip != 2 ) { sb->safePrintf( "\n<style type=\"text/css\">\n" "body{background-color:white;color:black;}\n" "</style>\n"); //p += gbstrlen ( p ); } //char format = st->m_format; if ( format == FORMAT_XML ) sb->reset(); if ( format == FORMAT_JSON ) sb->reset(); // for undoing the stuff below long startLen2 = sb->length();//p; // query should be NULL terminated char *q = st->m_q; long qlen = st->m_qlen; char styleTitle[128] = "font-size:14px;font-weight:600;" "color:#000000;"; char styleText[128] = "font-size:14px;font-weight:400;" "color:#000000;"; char styleLink[128] = "font-size:14px;font-weight:400;" "color:#0000ff;"; char styleTell[128] = "font-size:14px;font-weight:600;" "color:#cc0000;"; // get the url of the title rec Url *f = xd->getFirstUrl(); bool printDisclaimer = st->m_printDisclaimer; if ( xd->m_contentType == CT_JSON ) printDisclaimer = false; if ( format == FORMAT_XML ) printDisclaimer = false; if ( format == FORMAT_JSON ) printDisclaimer = false; char tbuf[100]; tbuf[0] = 0; time_t lastSpiderDate = xd->m_spideredTime; if ( printDisclaimer || format == FORMAT_XML || format == FORMAT_JSON ) { struct tm *timeStruct = gmtime ( &lastSpiderDate ); strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct); } // We should always be displaying this disclaimer. // - May eventually want to display this at a different location // on the page, or on the click 'n' scroll browser page itself // when this page is not being viewed solo. // CNS: if ( ! st->m_clickNScroll ) { if ( printDisclaimer ) { sb->safePrintf(//sprintf ( p , //"<BASE HREF=\"%s\">" //"<table border=1 width=100%%>" //"<tr><td>" "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" cellpadding=\"10\" " //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\"" "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">" "<tr" //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\"" "><td>" //"<font face=times,sans-serif color=black size=-1>" "<span style=\"%s\">" "This is Gigablast's cached page of </span>" "<a href=\"%s\" style=\"%s\">%s</a>" "" , styleTitle, f->getUrl(), styleLink, f->getUrl() ); //p += gbstrlen ( p ); // then the rest //sprintf(p , sb->safePrintf( "<span style=\"%s\">. 
" "Gigablast is not responsible for the content of " "this page.</span>", styleTitle ); //p += gbstrlen ( p ); sb->safePrintf ( "<br/><span style=\"%s\">" "Cached: </span>" "<span style=\"%s\">", styleTitle, styleText ); //p += gbstrlen ( p ); // then the spider date in GMT // time_t lastSpiderDate = xd->m_spideredTime; // struct tm *timeStruct = gmtime ( &lastSpiderDate ); // char tbuf[100]; // strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct); //p += gbstrlen ( p ); sb->safeStrcpy(tbuf); // Moved over from PageResults.cpp sb->safePrintf( "</span> - <a href=\"" "/get?" "q=%s&c=%s&rtq=%li&" "d=%lli&strip=1\"" " style=\"%s\">" "[stripped]</a>", q , st->m_coll , (long)st->m_rtq, st->m_docId, styleLink ); // a link to alexa if ( f->getUrlLen() > 5 ) { sb->safePrintf( " - <a href=\"http:" "//web.archive.org/web/*/%s\"" " style=\"%s\">" "[older copies]</a>" , f->getUrl(), styleLink ); } if (st->m_noArchive){ sb->safePrintf( " - <span style=\"%s\"><b>" "[NOARCHIVE]</b></span>", styleTell ); } if (st->m_isBanned){ sb->safePrintf(" - <span style=\"%s\"><b>" "[BANNED]</b></span>", styleTell ); } // only print this if we got a query if ( qlen > 0 ) { sb->safePrintf("<br/><br/><span style=\"%s\"> " "These search terms have been " "highlighted: ", styleText ); //p += gbstrlen ( p ); } } // how much space left in p? //long avail = bufEnd - p; // . make the url that we're outputting for (like in PageResults.cpp) // . "thisUrl" is the baseUrl for click & scroll char thisUrl[MAX_URL_LEN]; char *thisUrlEnd = thisUrl + MAX_URL_LEN; char *x = thisUrl; // . use the external ip of our gateway // . construct the NAT mapped port // . you should have used iptables to map port to the correct // internal ip:port //unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp; //unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort // local check //if ( st->m_isLocal ) { unsigned long ip = h->m_ip; unsigned short port = h->m_httpPort; //} //sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port ); // . we no longer put the port in here // . but still need http:// since we use <base href=> if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip)); else sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port); x += gbstrlen ( x ); // the query url encoded long elen = urlEncode ( x , thisUrlEnd - x , q , qlen ); x += elen; // separate cgi vars with a & //sprintf ( x, "&seq=%li&rtq=%lid=%lli", // (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId()); sprintf ( x, "&d=%lli",st->m_docId ); x += gbstrlen(x); // set our query for highlighting Query qq; qq.set2 ( q, st->m_langId , true ); // print the query terms into our highlight buffer Highlight hi; // make words so we can set the scores to ignore fielded terms Words qw; qw.set ( q , // content being highlighted, utf8 qlen , // content being highlighted, utf8 TITLEREC_CURRENT_VERSION, true , // computeIds false ); // hasHtmlEntities? // . assign scores of 0 to query words that should be ignored // . TRICKY: loop over words in qq.m_qwords, but they should be 1-1 // with words in qw. // . sanity check //if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;} // declare up here Matches m; // do the loop //Scores ss; //ss.set ( &qw , NULL ); //for ( long i = 0 ; i < qq.m_numWords ; i++ ) // if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0; // now set m.m_matches[] to those words in qw that match a query word // or phrase in qq. m.setQuery ( &qq ); //m.addMatches ( &qw , &ss , true ); m.addMatches ( &qw ); long hilen = 0; // CNS: if ( ! 
st->m_clickNScroll ) { // and highlight the matches if ( printDisclaimer ) { hilen = hi.set ( //p , //avail , sb , &qw , // words to highlight &m , // matches relative to qw false , // doSteming false , // st->m_clickAndScroll , (char *)thisUrl );// base url for ClcknScrll //p += hilen; // now an hr //memcpy ( p , "</span></table></table>\n" , 24 ); p += 24; sb->safeStrcpy("</span></table></table>\n"); } bool includeHeader = st->m_includeHeader; // do not show header for json object display if ( xd->m_contentType == CT_JSON ) includeHeader = false; if ( format == FORMAT_XML ) includeHeader = false; if ( format == FORMAT_JSON ) includeHeader = false; //mfree(uq, uqCapacity, "PageGet"); // undo the header writes if we should if ( ! includeHeader ) { // including base href is off by default when not including // the header, so the caller must explicitly turn it back on if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2; else sb->m_length=startLen1;//p=start1; } //sb->safeStrcpy(tbuf); if ( format == FORMAT_XML ) { sb->safePrintf("<response>\n"); sb->safePrintf("<statusCode>0</statusCode>\n"); sb->safePrintf("<statusMsg>Success</statusMsg>\n"); sb->safePrintf("<url><![CDATA["); sb->cdataEncode(xd->m_firstUrl.m_url); sb->safePrintf("]]></url>\n"); sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId); sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n", lastSpiderDate); sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf); } if ( format == FORMAT_JSON ) { sb->safePrintf("{\"response\":{\n"); sb->safePrintf("\t\"statusCode\":0,\n"); sb->safePrintf("\t\"statusMsg\":\"Success\",\n"); sb->safePrintf("\t\"url\":\""); sb->jsonEncode(xd->m_firstUrl.m_url); sb->safePrintf("\",\n"); sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId); sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate); sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf); } // identify start of <title> tag we wrote out char *sbstart = sb->getBufStart(); char *sbend = sb->getBufEnd(); char *titleStart = NULL; char *titleEnd = NULL; for ( char *t = sbstart ; t < sbend ; t++ ) { // title tag? if ( t[0]!='<' ) continue; if ( to_lower_a(t[1])!='t' ) continue; if ( to_lower_a(t[2])!='i' ) continue; if ( to_lower_a(t[3])!='t' ) continue; if ( to_lower_a(t[4])!='l' ) continue; if ( to_lower_a(t[5])!='e' ) continue; // point to it char *x = t + 5; // max - to keep things fast char *max = x + 500; for ( ; *x && *x != '>' && x < max ; x++ ); x++; // find end char *e = x; for ( ; *e && e < max ; e++ ) { if ( e[0]=='<' && to_lower_a(e[1])=='/' && to_lower_a(e[2])=='t' && to_lower_a(e[3])=='i' && to_lower_a(e[4])=='t' && to_lower_a(e[5])=='l' && to_lower_a(e[6])=='e' ) break; } if ( e < max ) { titleStart = x; titleEnd = e; } break; } // . print title at top! // . consider moving if ( titleStart ) { char *ebuf = st->m_r.getString("eb"); if ( ! ebuf ) ebuf = ""; //p += sprintf ( p , sb->safePrintf( "<table border=1 " "cellpadding=10 " "cellspacing=0 " "width=100%% " "color=#ffffff>" ); long printLinks = st->m_r.getLong("links",0); if ( ! printDisclaimer && printLinks ) sb->safePrintf(//p += sprintf ( p , // first put cached and live link "<tr>" "<td bgcolor=lightyellow>" // print cached link //"<center>" " " "<b>" "<a " "style=\"font-size:18px;font-weight:600;" "color:#000000;\" " "href=\"" "/get?" 
"c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">" "cached link</a>" " " "<a " "style=\"font-size:18px;font-weight:600;" "color:#000000;\" " "href=%s>live link</a>" "</b>" //"</center>" "</td>" "</tr>\n" ,st->m_coll ,st->m_docId ,ebuf ,thisUrl // st->ptr_ubuf ); if ( printLinks ) { sb->safePrintf(//p += sprintf ( p , "<tr><td bgcolor=pink>" "<span style=\"font-size:18px;" "font-weight:600;" "color:#000000;\">" " " "<b>PAGE TITLE:</b> " ); long tlen = titleEnd - titleStart; sb->safeMemcpy ( titleStart , tlen ); sb->safePrintf ( "</span></td></tr>" ); } sb->safePrintf( "</table><br>\n" ); } // is the content preformatted? bool pre = false; char ctype = (char)xd->m_contentType; if ( ctype == CT_TEXT ) pre = true ; // text/plain if ( ctype == CT_DOC ) pre = true ; // filtered msword if ( ctype == CT_PS ) pre = true ; // filtered postscript if ( format == FORMAT_XML ) pre = false; if ( format == FORMAT_JSON ) pre = false; // if it is content-type text, add a <pre> if ( pre ) {//p + 5 < bufEnd && pre ) { sb->safePrintf("<pre>"); //p += 5; } if ( st->m_strip == 1 ) contentLen = stripHtml( content, contentLen, (long)xd->m_version, st->m_strip ); // it returns -1 and sets g_errno on error, line OOM if ( contentLen == -1 ) { //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } Xml xml; Words ww; // if no highlighting, skip it bool queryHighlighting = st->m_queryHighlighting; if ( st->m_strip == 2 ) queryHighlighting = false; // do not do term highlighting if json if ( xd->m_contentType == CT_JSON ) queryHighlighting = false; SafeBuf tmp; SafeBuf *xb = sb; if ( format == FORMAT_XML ) xb = &tmp; if ( format == FORMAT_JSON ) xb = &tmp; if ( ! queryHighlighting ) { xb->safeMemcpy ( content , contentLen ); //p += contentLen ; } else { // get the content as xhtml (should be NULL terminated) //Words *ww = xd->getWords(); if ( ! xml.set ( content , contentLen , false , 0 , false , TITLEREC_CURRENT_VERSION , false , 0 , CT_HTML ) ) { // niceness is 0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } // sanity check //if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; } // how much space left in p? 
//avail = bufEnd - p; Matches m; m.setQuery ( &qq ); m.addMatches ( &ww ); hilen = hi.set ( xb , // p , avail , &ww , &m , false /*doStemming?*/ , st->m_clickAndScroll , thisUrl /*base url for click & scroll*/); //p += hilen; log(LOG_DEBUG, "query: Done highlighting cached page content"); } if ( format == FORMAT_XML ) { sb->safePrintf("\t<content><![CDATA["); sb->cdataEncode ( xb->getBufStart() ); sb->safePrintf("]]></content>\n"); sb->safePrintf("</response>\n"); } if ( format == FORMAT_JSON ) { sb->safePrintf("\t\"content\":\"\n"); sb->jsonEncode ( xb->getBufStart() ); sb->safePrintf("\"\n}\n}\n"); } // if it is content-type text, add a </pre> if ( pre ) { // p + 6 < bufEnd && pre ) { sb->safeMemcpy ( "</pre>" , 6 ); //p += 6; } // calculate bufLen //long bufLen = p - buf; long ct = xd->m_contentType; // now filter the entire buffer to escape out the xml tags // so it is displayed nice SafeBuf newbuf; if ( ct == CT_XML ) { // encode the xml tags into <tagname> sequences if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() , sb->getLength(), 0)){// niceness=0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } // free out buffer that we alloc'd before returning since this // should have copied it into another buffer //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); // reassign //buf = newbuf.getBufStart(); //bufLen = newbuf.length(); sb->stealBuf ( &newbuf ); } // now encapsulate it in html head/tail and send it off // sendErr: contentType = "text/html"; if ( strip == 2 ) contentType = "text/xml"; // xml is usually buggy and this throws browser off //if ( ctype == CT_XML ) contentType = "text/xml"; if ( xd->m_contentType == CT_JSON ) contentType = "application/json"; if ( format == FORMAT_XML ) contentType = "text/xml"; if ( format == FORMAT_JSON ) contentType = "application/json"; // safebuf, sb, is a member of "st" so this should copy the buffer // when it constructs the http reply, and we gotta call delete(st) // AFTER this so sb is still valid. bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen, sb->getBufStart(), sb->getLength(), -1,false, contentType, -1, NULL, "utf8" ); // nuke state2 mdelete ( st , sizeof(State2) , "PageGet1" ); delete (st); // free out buffer that we alloc'd before returning since this // should have copied it into another buffer //if ( ct == CT_XML ) newbuf.purge(); //else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); // and convey the status return status; }
bool Msg7::inject ( char *url , long forcedIp , char *content , long contentLen , bool recycleContent, uint8_t contentType, char *coll , bool quickReply , char *username , char *pwd , long niceness, void *state , void (*callback)(void *state), long firstIndexed, long lastSpidered, long hopCount, char newOnly, short charset, char spiderLinks, char deleteIt, char hasMime, bool doConsistencyTesting ) { m_quickReply = quickReply; // store coll if ( ! coll ) { g_errno = ENOCOLLREC; return true; } long collLen = gbstrlen ( coll ); if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN; strncpy ( m_coll , coll , collLen ); m_coll [ collLen ] = '\0'; // store user //long ulen = 0; //if ( username ) ulen = gbstrlen(username); //if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;} //if ( username ) strcpy( m_username, username ); // store password //long pwdLen = 0; //if ( pwd ) pwdLen = gbstrlen(pwd); //m_pwd [ 0 ] ='\0'; //if ( pwdLen > 31 ) pwdLen = 31; //if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen ); //m_pwd [ pwdLen ] = '\0'; // store url if ( ! url ) { g_errno = 0; return true; } long urlLen = gbstrlen(url); if ( urlLen > MAX_URL_LEN ) {g_errno = EBADENGINEER; return true; } // skip injecting if no url given! just print the admin page. if ( urlLen <= 0 ) return true; //strcpy ( m_url , url ); if ( g_repairMode ) { g_errno = EREPAIRING; return true; } // send template reply if no content supplied if ( ! content && ! recycleContent ) { log("inject: no content supplied to inject command and " "recycleContent is false."); //return true; } // clean url? // normalize and add www. if it needs it Url uu; uu.set ( url , gbstrlen(url) , true ); // remove >'s i guess and store in st1->m_url[] buffer char cleanUrl[MAX_URL_LEN+1]; urlLen = cleanInput ( cleanUrl, MAX_URL_LEN, uu.getUrl(), uu.getUrlLen() ); // this can go on the stack since set4() copies it SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url, cleanUrl ); // parentdocid of 0 long firstIp = hash32n(cleanUrl); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; sreq.setKey( firstIp,0LL, false ); sreq.m_isInjecting = 1; sreq.m_isPageInject = 1; sreq.m_hopCount = hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; // shortcut XmlDoc *xd = &m_xd; // log it now //log("inject: injecting doc %s",cleanUrl); static char s_dummy[3]; // sometims the content is indeed NULL... if ( newOnly && ! content ) { // don't let it be NULL because then xmldoc will // try to download the page! s_dummy[0] = '\0'; content = s_dummy; //char *xx=NULL;*xx=0; } } // . use the enormous power of our new XmlDoc class // . this returns false with g_errno set on error if ( //m_needsSet && ! xd->set4 ( &sreq , NULL , m_coll , NULL , // pbuf // give it a niceness of 1, we have to be // careful since we are a niceness of 0!!!! niceness, // 1 , // inject this content content , deleteIt, // false, // deleteFromIndex , forcedIp , contentType , lastSpidered , hasMime )) { // g_errno should be set if that returned false if ( ! g_errno ) { char *xx=NULL;*xx=0; } return true; } // do not re-call the set //m_needsSet = false; // make this our callback in case something blocks xd->setCallback ( state , callback ); xd->m_doConsistencyTesting = doConsistencyTesting; // . set xd from the old title rec if recycle is true // . 
can also use XmlDoc::m_loadFromOldTitleRec flag if ( recycleContent ) xd->m_recycleContent = true; // othercrap if ( firstIndexed ) { xd->m_firstIndexedDate = firstIndexed; xd->m_firstIndexedDateValid = true; } if ( lastSpidered ) { xd->m_spideredTime = lastSpidered; xd->m_spideredTimeValid = true; } if ( hopCount != -1 ) { xd->m_hopCount = hopCount; xd->m_hopCountValid = true; } if ( charset != -1 && charset != csUnknown ) { xd->m_charset = charset; xd->m_charsetValid = true; } // avoid looking up ip of each outlink to add "firstip" tag to tagdb // because that can be slow!!!!!!! xd->m_spiderLinks = spiderLinks; xd->m_spiderLinks2 = spiderLinks; xd->m_spiderLinksValid = true; // . newOnly is true --> do not inject if document is already indexed! // . maybe just set indexCode xd->m_newOnly = newOnly; // do not re-lookup the robots.txt xd->m_isAllowed = true; xd->m_isAllowedValid = true; xd->m_crawlDelay = -1; // unknown xd->m_crawlDelayValid = true; // set this now g_inPageInject = true; // log it now //log("inject: indexing injected doc %s",cleanUrl); // . now tell it to index // . this returns false if blocked bool status = xd->indexDoc ( ); // log it. i guess only for errors when it does not block? // because xmldoc.cpp::indexDoc calls logIt() if ( status ) xd->logIt(); // undo it g_inPageInject = false; // note that it blocked //if ( ! status ) log("inject: blocked for %s",cleanUrl); // return false if it blocked return status; }
bool sendTurkPageReply ( State60 *st ) { XmlDoc *xd = &st->m_xd; //char *content = xd->ptr_utf8Content; //int32_t contentLen = xd->size_utf8Content - 1; // count the total number of EventDesc classes for all evids //char *evd = xd->ptr_eventData; //EventDisplay *ed = (EventDisplay *)evd; //char *addr = evd + (int32_t)ed->m_addr; //char timeZoneOffset = getTimeZoneFromAddr ( addr ); // in case getSections() block come right back in xd->setCallback ( st , xdcallback ); // . set niceness to 1 so all this processing doesn't slow queries down // . however, g_niceness should still be zero... hmmm... xd->m_niceness = 1; // default to 1 niceness st->m_niceness = 1; // now set the sections class Sections *ss = xd->getSections(); // now for each section with alnum text, telescope up as far as // possible without containing anymore alnum text than what it // contained. set SEC_CONTROL bit. such sections will have the // 2 green/blue dots, that are used for turning on/off title/desc. // but really the indians will only turn off sections that should // not have a title/desc. for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) { // breathe QUICKPOLL(st->m_niceness); // skip if does not have text if ( si->m_firstWordPos < 0 ) continue; // otherwise, find biggest parent that contains just that text Section *p = si->m_parent; Section *last = si; for ( ; p ; p = p->m_parent ) { if ( p->m_firstWordPos != si->m_firstWordPos ) break; if ( p->m_lastWordPos != si->m_lastWordPos ) break; last = p; } // set that bit then last->m_flags |= SEC_CONTROL; // and speed up the loop si = last; } // * now each SEC_CONTROL sections have a fence activated by a turker // * an event title or description can not span a fence. it must be // confined within a fence. however, it is allowed to include // title or description from a "title section". // * hold shift down to designate as title section when clicking it // * show the raw text of each event changing as you fence // sections in or out. show in a right frame. // * show list of events on page in the top frame. can toggle them // all individually. // * and remove no-display from all tags so we can see everything. // * highlight addresses, not just dates. // * each section hash has its own unique bg color when activated // * with a single click, completely reject an event because: // contains bad time, address, title or desc. specify which so // we can improve our algo. // * when selecting an individual event, scroll to its tod... // * remove all color from webpage that we can so our colors show up // * remove all imgs. just src them to dev null. // * allow for entering a custom title for an event or all events // that are or will ever appear on the page. // * when displaying the text of the events, use hyphens to // delineate the section topology. strike out text as a section // fence is activated. // * when a section is activated is it easier to just redownload // the whole text of the page? maybe just the text frame? // * clicking on an individual sentence section should just remove // that sentence. that is kinda a special content hash removal // tag. like "Click here for video." // * when an event id is selected i guess activate its bgcolor to // be light blue for all sentences currently in the event that // are not in activated sections. (make exception for designated // title sections). so we need multiple tags for each events // sentence div section. if sentence is split use multiple div tags // then to keep the order. 
so each event sentence would have // <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and // 10. that way we can activate it when one of those event ids is // activated. SafeBuf sb; // int16_tcuts if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; } Words *words = &xd->m_words; int32_t nw = words->getNumWords(); char **wptrs = words->getWords(); int32_t *wlens = words->getWordLens(); nodeid_t *tids = words->getTagIds(); // a special array for printing </div> tags char *endCounts = (char *)mcalloc ( nw ,"endcounts"); if ( ! endCounts ) return sendErrorReply ( st , g_errno ); // // now loop over all the words. if word starts a section that has // SEC_CONTROL bit set, and print out the section hash and a color // tag to be activated if the turkey activates us. // CAUTION: word may start multiple sections. // for ( int32_t i = 0 ; i < nw ; i++ ) { // get section ptr Section *sj = ss->m_sectionPtrs[i]; // sanity check. sj must be first section ptr that starts @ a if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) { char *xx=NULL;*xx=0; } // . does word #i start a section? // . if section is control, print out the control while ( sj && sj->m_a == i ) { // print this section's hash if ( sj->m_flags & SEC_CONTROL) { // after the turkeys have made all the edits // they need to submit the changes they made. // how can we get that data sent back to the // back end? we need to send back the colors // of the sections that have been activated // i guess. just do a loop over them. sb.safePrintf("<div nobreak gbsecid=%"UINT32" " "bgcolor=#%"XINT32" " "onclick=gbtogglecolor()>", (uint32_t)sj->m_tagHash, (uint32_t)sj->m_tagHash); // sanity check if ( sj->m_b < 0 ) { char *xx=NULL;*xx=0; } if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; } // and inc the /div count for that word endCounts[sj->m_b-1]++; } // try next section too sj = sj->m_next; } // if this is a tag, remove any coloring if ( tids[i] ) { } // print the word, be it a tag, alnum, punct sb.safeMemcpy ( wptrs[i] , wlens[i] ); // end a div tag? if ( ! endCounts[i] ) continue; // might be many so loop it for ( int32_t j = 0 ; j < endCounts[i] ; j++ ) sb.safePrintf("</div>"); } return false; }
// . this is called
// . destroys the UdpSlot if false is returned
void handleRequest20 ( UdpSlot *slot , long netnice ) {
    // . check g_errno
    // . before, we were not sending a reply back here and we continued
    //   to process the request, even though it was empty. the slot
    //   had a NULL m_readBuf because it could not alloc mem for the read
    //   buf i'm assuming. and the slot was saved in a line below here...
    //   state20->m_msg22.m_parent = slot;
    if ( g_errno ) {
        log("net: Msg20 handler got error: %s.",mstrerror(g_errno));
        g_udpServer.sendErrorReply ( slot , g_errno );
        return;
    }
    // ensure request is big enough
    if ( slot->m_readBufSize < (long)sizeof(Msg20Request) ) {
        g_udpServer.sendErrorReply ( slot , EBADREQUESTSIZE );
        return;
    }
    // parse the request
    Msg20Request *req = (Msg20Request *)slot->m_readBuf;
    // . turn the string offsets into ptrs in the request
    // . this is "destructive" on "request"
    long nb = req->deserialize();
    // sanity check
    if ( nb != slot->m_readBufSize ) { char *xx = NULL; *xx = 0; }
    // sanity check, the size includes the \0
    if ( req->size_coll <= 1 || *req->ptr_coll == '\0' ) {
        log("query: Got empty collection in msg20 handler. FIX!");
        char *xx =NULL; *xx = 0;
    }
    // if it's not stored locally that's an error
    if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {
        log("query: Got msg20 request for non-local docId %lli",
            req->m_docId);
        g_udpServer.sendErrorReply ( slot , ENOTLOCAL );
        return;
    }
    // sanity
    if ( req->m_docId == 0 && ! req->ptr_ubuf ) { char *xx=NULL;*xx=0; }

    long long startTime = gettimeofdayInMilliseconds();

    // alloc a new state to get the titlerec
    XmlDoc *xd;
    try { xd = new (XmlDoc); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("query: msg20 new(%i): %s",
            (int)sizeof(XmlDoc), mstrerror(g_errno));
        g_udpServer.sendErrorReply ( slot, g_errno );
        return;
    }
    mnew ( xd , sizeof(XmlDoc) , "xd20" );
    // ok, let's use the new XmlDoc.cpp class now!
    xd->set20 ( req );
    // encode slot
    xd->m_slot = slot;
    // set the callback
    xd->setCallback ( xd , gotReplyWrapperxd );
    // set set time
    xd->m_setTime = startTime;
    xd->m_cpuSummaryStartTime = 0;
    // . now ask for the msg20 reply!
    // . TODO: move the parse state cache into just a cache of the
    //   XmlDoc itself, and put that cache logic into XmlDoc.cpp so
    //   it can be used more generally.
    Msg20Reply *reply = xd->getMsg20Reply ( );
    // this is just blocked
    if ( reply == (void *)-1 ) return;
    // got it?
    gotReplyWrapperxd ( xd );
}
void gotDatedbList ( State60 *st ) {
    // must only be run on host #0 since we need just one lock table
    if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
    // load turk lock table if we need to
    static bool s_init = false;
    if ( ! s_init ) {
        s_init = true;
        if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
            log("turk: failed to init turk lock table");
        if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
            log("turk: failed to load turk lock table");
    }
    time_t now = getTimeGlobal();
    // shortcut
    RdbList *list = &st->m_list;
    // the best docid
    int64_t best = 0LL;
    // scan the list to get urls/docids to turk out
    for ( ; ! list->isExhausted() ; ) {
        // get rec
        char *k = list->getCurrentKey();
        // skip that
        list->skipCurrentRecord();
        // skip if negative
        if ( (k[0] & 0x01) == 0x00 ) continue;
        // get the docid
        int64_t docid = g_datedb.getDocId ( k );
        // skip if locked
        TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
        // if there check time
        if ( tt && now - tt->m_lockTime > 3600 ) {
            // remove it
            g_turkLocks.removeKey(&docid);
            // nuke tt
            tt = NULL;
        }
        // if still there, skip it and try next one
        if ( tt ) continue;
        // ok, we got a good docid to dish out
        best = docid;
        break;
    }

    SafeBuf sb;
    // print description so they can click a button to start the turk
    sb.safePrintf("<html>\n"
                  "<title>Event Editor</title>\n"
                  "<body>\n"
                  "<table width=\"100%%\" border=\"0\">\n"
                  "<tr><td style=\"background-color:#0079ba;\">\n"
                  "<center><font color=#00000>"
                  "<h2>Event Editor</h2>\n"
                  "</font></center></td>"
                  "</tr></table>");
    // if we had no docid, give user an empty msg
    if ( ! best ) {
        sb.safePrintf("<center>Nothing currently available to edit. "
                      "Please try again later.</center>"
                      "</body></html>\n");
        sendReply ( &sb );
        return;
    }
    // lock it!
    TurkLock tt;
    strcpy ( tt.m_user , st->m_user );
    tt.m_lockTime = now;
    if ( ! g_lockTable.addLock ( &tt ) ) {
        sendErrorReply ( st , g_errno );
        return;
    }
    // . fetch the TitleRec
    // . a max cache age of 0 means not to read from the cache
    XmlDoc *xd = &st->m_xd;
    // . when getTitleRec() is called it will load the old one
    //   since XmlDoc::m_setFromTitleRec will be true
    // . niceness is 0
    xd->set3 ( best , st->m_coll , 0 );
    // if it blocks while it loads title rec, it will re-call this routine
    xd->setCallback ( st , processLoopWrapper );
    // good to go!
    processLoop ( st );
}
// . returns false if blocked and callback will be called, true otherwise
// . sets g_errno on error
bool Msg7::inject ( void *state ,
                    void (*callback)(void *state)
                    //long spiderLinksDefault ,
                    //char *collOverride
                    ) {

    GigablastRequest *gr = &m_gr;

    char *coll2 = gr->m_coll;
    CollectionRec *cr = g_collectiondb.getRec ( coll2 );
    if ( ! cr ) { g_errno = ENOCOLLREC; return true; }

    m_state = state;
    m_callback = callback;

    // shortcut
    XmlDoc *xd = &m_xd;

    if ( ! gr->m_url ) {
        log("inject: no url provided to inject");
        g_errno = EBADURL;
        return true;
    }

    //char *coll = cr->m_coll;

    // test
    //diffbotReply = "{\"request\":{\"pageUrl\":\"http://www.washingtonpost.com"
    //    "/2011/03/10/ABe7RaQ_moreresults.html\",\"api\":\"article\","
    //    "\"version\":3},\"objects\":[...]}";

    if ( g_repairMode ) { g_errno = EREPAIRING; return true; }

    // this will be NULL if the "content" was empty or not given
    char *content = gr->m_content;

    // . try the uploaded file if nothing in the text area
    // . this will be NULL if the "content" was empty or not given
    if ( ! content ) content = gr->m_contentFile;

    if ( m_firstTime ) {
        m_firstTime = false;
        m_start = content;
    }

    // save current start since we update it next
    char *start = m_start;

    // if this is empty we are done
    //if ( ! start ) return true;

    char *delim = gr->m_contentDelim;
    if ( delim && ! delim[0] ) delim = NULL;

    if ( m_fixMe ) {
        // we had made the first delimiter char a \0 to index the
        // previous document, now put it back to what it was
        *m_start = *delim;
        // i guess unset this
        m_fixMe = false;
    }

    // if we had a delimiter...
    if ( delim ) {
        // we've saved m_start as "start" above, so find the next
        // delimiter after it and set that to m_start.
        // add +1 to avoid an infinite loop
        m_start = strstr(start+1,delim);
        // for injecting "start" set this to \0
        if ( m_start ) {
            // null term it
            *m_start = '\0';
            // put back the original char on the next round...?
            m_fixMe = true;
        }
    }

    // this is the url of the injected content
    m_injectUrlBuf.safeStrcpy ( gr->m_url );

    bool modifiedUrl = false;

    // if we had a delimiter we must make a fake url
    //if ( delim ) {
    //    // if user had a <url> or <doc> or <docid> field use that
    //    char *hint = strcasestr ( start , "<url>" );
    //    if ( hint ) {
    //        modifiedUrl = true;
    //        ...
    //    }
    //}

    // if we had a delimiter, thus denoting multiple items/documents to
    // be injected, we must create a unique url for each item.
    if ( delim && ! modifiedUrl ) {
        // use hash of the content
        long long ch64 = hash64n ( start , 0LL );
        // normalize it
        Url u;
        u.set ( gr->m_url );
        // reset it
        m_injectUrlBuf.reset();
        // by default append a -<ch64> to the provided url
        m_injectUrlBuf.safePrintf("%s-%llu",u.getUrl(),ch64);
    }

    // count them
    m_injectCount++;

    m_inUse = true;

    if ( ! xd->injectDoc ( m_injectUrlBuf.getBufStart() ,
                           cr ,
                           start , // content
                           gr->m_diffbotReply,
                           gr->m_hasMime, // content starts with http mime?
                           gr->m_hopCount,
                           gr->m_charset,
                           gr->m_deleteUrl,
                           gr->m_contentTypeStr, // text/html text/xml
                           gr->m_spiderLinks ,
                           gr->m_newOnly, // index iff new
                           this ,
                           doneInjectingWrapper9 ) )
        // we blocked...
        return false;

    m_inUse = false;

    return true;
}
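// --- illustrative sketch (not part of the original source) ---
// When a delimiter marks multiple documents inside one injected buffer,
// each chunk needs its own URL, so the code above appends a 64-bit hash
// of the chunk to the base URL. A rough sketch of that idea; splitChunks()
// and the FNV-style hash are illustrative stand-ins, not the codebase's
// hash64n().
#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

static uint64_t hashChunk ( const std::string &s ) {
    uint64_t h = 1469598103934665603ULL;          // FNV-1a offset basis
    for ( unsigned char c : s ) { h ^= c; h *= 1099511628211ULL; }
    return h;
}

// split "buf" on "delim" and give every chunk a unique synthetic URL
static std::vector<std::pair<std::string,std::string> >
splitChunks ( const std::string &buf ,
              const std::string &delim ,
              const std::string &baseUrl ) {
    std::vector<std::pair<std::string,std::string> > out;
    size_t pos = 0;
    while ( pos < buf.size() ) {
        size_t next = buf.find ( delim , pos + 1 );
        if ( next == std::string::npos ) next = buf.size();
        std::string chunk = buf.substr ( pos , next - pos );
        char url[1024];
        snprintf ( url , sizeof(url) , "%s-%llu" ,
                   baseUrl.c_str() ,
                   (unsigned long long)hashChunk(chunk) );
        out.push_back ( std::make_pair ( std::string(url) , chunk ) );
        pos = next;
    }
    return out;
}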
// for procog bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) { // make a state State8 *st; try { st = new (State8); } catch ( ... ) { g_errno = ENOMEM; log("PageParser: new(%i): %s", (int)sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));} mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; st->m_state = NULL; //st->m_callback = callback; //st->m_q = q; //st->m_termFreqs = termFreqs; //st->m_termFreqWeights = termFreqWeights; //st->m_affWeights = affWeights; //st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( ! coll ) coll = g_conf.m_defaultColl; if ( ! coll ) coll = "main"; long collLen = gbstrlen(coll); if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given st->m_docId = r->getLongLong ("d",-1); st->m_docId = r->getLongLong ("docid",st->m_docId); long ulen; char *u = st->m_r.getString("u",&ulen,NULL); if ( ! u ) u = st->m_r.getString("url",&ulen,NULL); if ( ! u && st->m_docId == -1LL ) return sendErrorReply ( st , EBADREQUEST ); // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = u; st->m_ulen = 0; if ( u ) st->m_ulen = gbstrlen(u); // should we recycle link info? st->m_recycle = r->getLong("recycle",1); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); st->m_recompute = r->getLong("recompute" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); //st->m_page = r->getLong("page",1); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value //st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header //xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " // "content=\"text/html; charset=utf-8\">\n"); XmlDoc *xd = &st->m_xd; long isXml = r->getLong("xml",0); // if got docid, use that if ( st->m_docId != -1 ) { if ( ! 
xd->set3 ( st->m_docId, st->m_coll, 0 ) ) // niceness // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , gotXmlDoc ); xd->m_pbuf = &st->m_wbuf; // reset this flag st->m_donePrinting = false; // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag //if ( st->m_recycle ) xd->m_recycleContent = true; xd->m_recycleContent = true; // force this on //xd->m_useSiteLinkBuf = true; //xd->m_usePageLinkBuf = true; if ( isXml ) xd->m_printInXml = true; // now tell it to fetch the old title rec if ( ! xd->loadFromOldTitleRec () ) // return false if this blocks return false; return gotXmlDoc ( st ); } // set this up SpiderRequest sreq; sreq.reset(); if ( st->m_u ) strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; //uint8_t contentType = CT_HTML; //if ( isXml ) contentType = CT_XML; long ctype = r->getLong("ctype",CT_HTML); // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , // we need this so the term table is set! &st->m_wbuf , // XmlDoc::m_pbuf 0, // try 0 now! 1 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip ctype )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , gotXmlDoc ); // reset this flag st->m_donePrinting = false; // prevent a core here in the event we download the page content xd->m_crawlDelayValid = true; xd->m_crawlDelay = 0; // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag //if ( st->m_recycle ) xd->m_recycleContent = true; // only recycle if docid is given!! if ( st->m_recycle ) xd->m_recycleContent = true; // force this on //xd->m_useSiteLinkBuf = true; //xd->m_usePageLinkBuf = true; if ( isXml ) xd->m_printInXml = true; return gotXmlDoc ( st ); }
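// --- illustrative sketch (not part of the original source) ---
// sendPageAnalyze() above builds a SpiderRequest by hand and, lacking a
// real DNS lookup, derives a "fake first IP" from a 32-bit hash of the
// URL, nudging 0 and -1 to 1 so the value stays valid. A small sketch of
// that clamping rule; FakeRequest and hash32() are illustrative stand-ins
// for SpiderRequest and hash32n().
#include <cstdint>
#include <cstring>

struct FakeRequest {             // hypothetical stand-in for SpiderRequest
    char    m_url[1024];
    int32_t m_firstIp;
    bool    m_fakeFirstIp;
};

static int32_t hash32 ( const char *s ) {
    uint32_t h = 2166136261u;                    // FNV-1a, 32-bit
    for ( ; *s ; s++ ) { h ^= (unsigned char)*s; h *= 16777619u; }
    return (int32_t)h;
}

static void setFakeFirstIp ( FakeRequest *req , const char *url ) {
    strncpy ( req->m_url , url , sizeof(req->m_url)-1 );
    req->m_url[sizeof(req->m_url)-1] = '\0';
    int32_t firstIp = hash32 ( url );
    // 0 and -1 are reserved/invalid IP sentinels, so nudge them to 1
    if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
    req->m_firstIp     = firstIp;
    req->m_fakeFirstIp = true;
}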
// . a new interface so Msg3b can call this with "s" set to NULL // . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageParser2 ( TcpSocket *s , HttpRequest *r , State8 *st , long long docId , Query *q , // in query term space, not imap space long long *termFreqs , // in imap space float *termFreqWeights , // in imap space float *affWeights , void *state , void (* callback)(void *state) ) { //log("parser: read sock=%li",s->m_sd); // might a simple request to addsomething to validated.*.txt file // from XmlDoc::print() or XmlDoc::validateOutput() char *add = r->getString("add",NULL); //long long uh64 = r->getLongLong("uh64",0LL); char *uh64str = r->getString("uh64",NULL); //char *divTag = r->getString("div",NULL); if ( uh64str ) { // convert add to number long addNum = 0; if ( to_lower_a(add[0])=='t' ) // "true" or "false"? addNum = 1; // convert it. skip beginning "str" inserted to prevent // javascript from messing with the long long since it // was rounding it! //long long uh64 = atoll(uh64str);//+3); // urldecode that //long divTagLen = gbstrlen(divTag); //long newLen = urlDecode ( divTag , divTag , divTagLen ); // null term? //divTag[newLen] = '\0'; // do it. this is defined in XmlDoc.cpp //addCheckboxSpan ( uh64 , divTag , addNum ); // make basic reply char *reply; reply = "HTTP/1.0 200 OK\r\n" "Connection: Close\r\n"; // that is it! send a basic reply ok bool status = g_httpServer.sendDynamicPage( s , reply, gbstrlen(reply), -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); return status; } // make a state if ( st ) st->m_freeIt = false; if ( ! st ) { try { st = new (State8); } catch ( ... ) { g_errno = ENOMEM; log("PageParser: new(%i): %s", (int)sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));} mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; } // msg3b uses this to get a score from the query st->m_state = state; st->m_callback = callback; st->m_q = q; st->m_termFreqs = termFreqs; st->m_termFreqWeights = termFreqWeights; st->m_affWeights = affWeights; //st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; st->m_recompute = false; //st->m_url.reset(); // do not allow more than one to be launched at a time if in // a quickpoll. will cause quickpoll in quickpoll. g_inPageParser = true; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( st->m_collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); if ( ! coll ) return sendErrorReply ( st , ENOCOLLREC ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); //long ulen = 0; //char *u = r->getString ( "u" , &ulen , NULL /*default*/); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given if ( ! st->m_u || ! 
st->m_u[0] ) st->m_docId = r->getLongLong ("docid",-1); else st->m_docId = -1; // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = st->m_r.getString("u",&st->m_ulen,NULL); // should we recycle link info? st->m_recycle = r->getLong("recycle",0); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header SafeBuf *xbuf = &st->m_xbuf; xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">\n"); // print standard header g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r ); // print the standard header for admin pages char *dd = ""; char *rr = ""; char *rr2 = ""; char *render = ""; char *oips = ""; char *us = ""; if ( st->m_u && st->m_u[0] ) us = st->m_u; //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn ); if ( st->m_old ) dd = " checked"; if ( st->m_recycle ) rr = " checked"; if ( st->m_recycle2 ) rr2 = " checked"; if ( st->m_render ) render = " checked"; if ( st->m_oips ) oips = " checked"; xbuf->safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); long clen; char *contentParm = r->getString("content",&clen,""); // print the input form xbuf->safePrintf ( "<style>\n" "h2{font-size: 12px; color: #666666;}\n" ".gbtag { border: 1px solid gray;" "background: #ffffef;display:inline;}\n" ".gbcomment { border: 1px solid gray;" "color: #888888; font-style:italic; " "background: #ffffef;display:inline;}\n" ".token { border: 1px solid gray;" "background: #f0ffff;display:inline;}\n" ".spam { border: 1px solid gray;" "background: #af0000;" "color: #ffffa0;}" ".hs {color: #009900;}" "</style>\n" "<center>" "<table %s>" "<tr><td colspan=5><center><b>" "Parser" "</b></center></td></tr>\n" "<tr class=poo>" "<td>" "<b>url</b>" "<br><font size=-2>" "Type in <b>FULL</b> url to parse." "</font>" "</td>" "</td>" "<td>" "<input type=text name=u value=\"%s\" size=\"40\">\n" "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Parser version to use: " "</td>" "<td>" "<input type=text name=\"version\" size=\"4\" value=\"-1\"> " "</td>" "<td>" "(-1 means to use latest title rec version)<br>" "</td>" "</tr>" */ /* "<tr class=poo>" "<td>" "Hop count to use: " "</td>" "<td>" "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> " "</td>" "<td>" "(-1 is unknown. For root urls hopcount is always 0)<br>" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>use cached</b>" "<br><font size=-2>" "Load page from cache (titledb)?" 
"</font>" "</td>" "<td>" "<input type=checkbox name=old value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Reparse root:" "</td>" "<td>" "<input type=checkbox name=artr value=1%s> " "</td>" "<td>" "Apply selected ruleset to root to update quality" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>recycle link info</b>" "<br><font size=-2>" "Recycle the link info from the title rec" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=recycle value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Recycle Link Info Imported:" "</td>" "<td>" "<input type=checkbox name=recycleimp value=1%s> " "</td>" "<td>" "Recycle the link info imported from other coll" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>render html</b>" "<br><font size=-2>" "Render document content as HTML" "</font>" "</td>" "<td>" "<input type=checkbox name=render value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Lookup outlinks' ruleset, ips, quality:" "</td>" "<td>" "<input type=checkbox name=oips value=1%s> " "</td>" "<td>" "To compute quality lookup IP addresses of roots " "of outlinks." "</td>" "</tr>" "<tr class=poo>" "<td>" "LinkInfo Coll:" "</td>" "<td>" "<input type=text name=\"oli\" size=\"10\" value=\"\"> " "</td>" "<td>" "Leave empty usually. Uses this coll to lookup link info." "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>optional query</b>" "<br><font size=-2>" "Leave empty usually. For title generation only." "</font>" "</td>" "<td>" "<input type=text name=\"q\" size=\"20\" value=\"\"> " "</td>" "</tr>", TABLE_STYLE, us , dd, rr, render ); xbuf->safePrintf( "<tr class=poo>" "<td>" "<b>content type below is</b>" "<br><font size=-2>" "Is the content below HTML? XML? JSON?" "</font>" "</td>" "<td>" //"<input type=checkbox name=xml value=1> " "<select name=ctype>\n" "<option value=%li selected>HTML</option>\n" "<option value=%li selected>XML</option>\n" "<option value=%li selected>JSON</option>\n" "</select>\n" "</td>" "</tr>", (long)CT_HTML, (long)CT_XML, (long)CT_JSON ); xbuf->safePrintf( "<tr class=poo>" "<td><b>content</b>" "<br><font size=-2>" "Use this content for the provided <i>url</i> " "rather than downloading it from the web." "</td>" "<td>" "<textarea rows=10 cols=80 name=content>" "%s" "</textarea>" "</td>" "</tr>" "</table>" "</center>" "</form>" "<br>", //oips , contentParm ); xbuf->safePrintf( "<center>" "<input type=submit value=Submit>" "</center>" ); // just print the page if no url given if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st ); XmlDoc *xd = &st->m_xd; // set this up SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! 
content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; uint8_t contentType = CT_HTML; if ( r->getBool("xml",0) ) contentType = CT_XML; contentType = r->getLong("ctype",contentType);//CT_HTML); // if facebook, load xml content from title rec... bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/"); if ( isFacebook && ! content ) { long long docId = g_titledb.getProbableDocId(st->m_u); sprintf(sreq.m_url ,"%llu", docId ); sreq.m_isPageReindex = true; } // hack if ( content ) { st->m_dbuf.purge(); st->m_dbuf.safeStrcpy(content); //char *data = strstr(content,"\r\n\r\n"); //long dataPos = 0; //if ( data ) dataPos = (data + 4) - content; //st->m_dbuf.convertJSONtoXML(0,dataPos); //st->m_dbuf.decodeJSON(0); content = st->m_dbuf.getBufStart(); } // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , &st->m_wbuf , 0 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip contentType )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , processLoop ); // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag if ( st->m_recycle ) xd->m_recycleContent = true; return processLoop ( st ); }
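// --- illustrative sketch (not part of the original source) ---
// sendPageParser2() takes the test content either from the url-encoded
// "content" form field or, if that is absent, from the raw (unencoded)
// POST body. A minimal sketch of that fallback plus percent-decoding;
// the Request struct and urlDecode() helper are assumptions, not the
// HttpRequest API.
#include <cstdlib>
#include <string>

struct Request {                  // hypothetical stand-in for HttpRequest
    std::string contentField;     // url-encoded "content=" form value
    std::string rawBody;          // unencoded POST body
};

static std::string urlDecode ( const std::string &in ) {
    std::string out;
    for ( size_t i = 0 ; i < in.size() ; i++ ) {
        if ( in[i] == '+' ) { out += ' '; continue; }
        if ( in[i] == '%' && i + 2 < in.size() ) {
            char hex[3] = { in[i+1] , in[i+2] , 0 };
            out += (char)strtol ( hex , NULL , 16 );
            i += 2;
            continue;
        }
        out += in[i];
    }
    return out;
}

// prefer the form field; fall back to the raw body, which is not encoded
static std::string getContent ( const Request &r ) {
    if ( ! r.contentField.empty() ) return urlDecode ( r.contentField );
    return r.rawBody;
}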
void handleRequest7 ( UdpSlot *slot , int32_t netnice ) { InjectionRequest *ir = (InjectionRequest *)slot->m_readBuf; // now just supply the first guy's char ** and size ptr if ( ! deserializeMsg2 ( &ir->ptr_url, &ir->size_url ) ) { log("inject: error deserializing inject request from " "host ip %s port %i",iptoa(slot->m_ip),(int)slot->m_port); g_errno = EBADREQUEST; g_udpServer.sendErrorReply(slot,g_errno); //g_corruptCount++; return; } // the url can be like xyz.com. so need to do another corruption // test for ia if ( ! ir->ptr_url ) { // || strncmp(ir->ptr_url,"http",4) != 0 ) { //log("inject: trying to inject NULL or non http url."); log("inject: trying to inject NULL url."); g_errno = EBADURL; //g_corruptCount++; g_udpServer.sendErrorReply(slot,g_errno); return; } CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum ); if ( ! cr ) { log("inject: cr rec is null %i", ir->m_collnum); g_errno = ENOCOLLREC; g_udpServer.sendErrorReply(slot,g_errno); return; } XmlDoc *xd; try { xd = new (XmlDoc); } catch ( ... ) { g_errno = ENOMEM; log("PageInject: import failed: new(%i): %s", (int)sizeof(XmlDoc),mstrerror(g_errno)); g_udpServer.sendErrorReply(slot,g_errno); return; } mnew ( xd, sizeof(XmlDoc) , "PageInject" ); xd->m_injectionSlot = slot; xd->m_injectStartTime = gettimeofdayInMilliseconds(); // add to linked list xd->m_nextInject = NULL; xd->m_prevInject = NULL; if ( s_injectTail ) { s_injectTail->m_nextInject = xd; xd->m_prevInject = s_injectTail; s_injectTail = xd; } else { s_injectHead = xd; s_injectTail = xd; } if(ir->ptr_content && ir->ptr_content[ir->size_content - 1]) { // XmlDoc expects this buffer to be null terminated. char *xx=NULL;*xx=0; } if ( ! xd->injectDoc ( ir->ptr_url , // m_injectUrlBuf.getBufStart() , cr , ir->ptr_content , // start , // content , ir->ptr_diffbotReply, // if this doc is a 'container doc' then // hasMime applies to the SUBDOCS only!! ir->m_hasMime, // content starts with http mime? ir->m_hopCount, ir->m_charset, ir->m_deleteUrl, // warcs/arcs include the mime so we don't // look at this in that case in // XmlDoc::injectDoc() when it calls set4() ir->ptr_contentTypeStr, // text/html text/xml ir->m_spiderLinks , ir->m_newOnly, // index iff new xd, // state , sendUdpReply7 , // extra shit ir->m_firstIndexed, ir->m_lastSpidered , // the ip of the url being injected. // use 0 if unknown and it won't be valid. ir->m_injectDocIp , ir->ptr_contentDelim, ir->ptr_metadata, ir->size_metadata, ir->size_content - 1 // there should be a null in that last byte ) ) // we blocked... return; // if injected without blocking, send back reply sendUdpReply7 ( xd ); }
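// --- illustrative sketch (not part of the original source) ---
// handleRequest7() strings every in-flight injection XmlDoc onto an
// intrusive doubly-linked list via m_nextInject/m_prevInject and the
// s_injectHead/s_injectTail globals. A compact sketch of that append and
// unlink logic; the Node type here is an illustrative stand-in for XmlDoc.
struct Node {                        // hypothetical stand-in for XmlDoc
    Node *m_nextInject;
    Node *m_prevInject;
};

static Node *s_head = 0;
static Node *s_tail = 0;

static void appendInject ( Node *n ) {
    n->m_nextInject = 0;
    n->m_prevInject = 0;
    if ( s_tail ) {                  // non-empty: link after the tail
        s_tail->m_nextInject = n;
        n->m_prevInject = s_tail;
        s_tail = n;
    }
    else {                           // empty list: n is head and tail
        s_head = n;
        s_tail = n;
    }
}

static void removeInject ( Node *n ) {
    if ( n->m_prevInject ) n->m_prevInject->m_nextInject = n->m_nextInject;
    else                   s_head = n->m_nextInject;
    if ( n->m_nextInject ) n->m_nextInject->m_prevInject = n->m_prevInject;
    else                   s_tail = n->m_prevInject;
    n->m_nextInject = 0;
    n->m_prevInject = 0;
}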
bool processLoop ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; if ( st->m_u && st->m_u[0] ) { // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp CollectionRec *cr = xd->getCollRec(); if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123")) // use same dir that XmlDoc::getTestDir() would use //saveTestBuf ( "test-page-parser" ); saveTestBuf("qa"); // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf char *metalist = xd->getMetaList ( ); if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked if ( metalist == (void *)-1 ) return false; // for debug... if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // print it out xd->printDoc( xbuf ); } // print reason we can't analyze it (or index it) //if ( st->m_indexCode != 0 ) { // xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>", // mstrerror(st->m_indexCode)); //} // we are done g_inPageParser = false; // print the final tail //p += g_httpServer.printTail ( p , pend - p ); //log("parser: send sock=%li",st->m_s->m_sd); // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
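// --- illustrative sketch (not part of the original source) ---
// processLoop() relies on the tri-state return convention used throughout
// this code: getMetaList() yields NULL on error (with g_errno set),
// (char *)-1 when the operation blocked and a callback will fire later,
// or a real pointer when the result is ready. A sketch of reacting to
// that convention; fetchMetaList(), handleError() and g_err are
// hypothetical names, not the real API.
#include <cstdio>

static int g_err = 0;                       // stand-in for g_errno

static char *fetchMetaList ( bool ready , bool fail ) {
    if ( fail    ) { g_err = 1; return 0; }  // error: NULL + errno set
    if ( ! ready ) return (char *)-1;        // would block: sentinel -1
    static char list[] = "metalist";         // done: real pointer
    return list;
}

static void handleError ( int err ) { printf("error %d\n", err); }

// returns false if we blocked and will be re-entered via callback
static bool step ( bool ready , bool fail ) {
    char *metalist = fetchMetaList ( ready , fail );
    if ( ! metalist ) { handleError ( g_err ); return true; }
    if ( metalist == (char *)-1 ) return false;   // blocked
    printf("got %s\n", metalist);
    return true;
}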
bool sendReply ( void *state ) { // get the state properly Msg7 *msg7= (Msg7 *) state; GigablastRequest *gr = &msg7->m_gr; // extract info from state TcpSocket *sock = gr->m_socket; XmlDoc *xd = &msg7->m_xd; // log it //if ( msg7->m_url[0] ) xd->logIt(); // msg7 has the docid for what we injected, iff g_errno is not set //long long docId = msg7->m_msg7.m_docId; //long hostId = msg7->m_msg7.m_hostId; long long docId = xd->m_docId; long hostId = 0;//msg7->m_msg7.m_hostId; // set g_errno to index code if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno ) g_errno = xd->m_indexCode; char format = gr->m_hr.getReplyFormat(); // no url parm? if ( ! g_errno && ! gr->m_url && format != FORMAT_HTML ) g_errno = EMISSINGINPUT; if ( g_errno && g_errno != EDOCUNCHANGED ) { long save = g_errno; mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); g_errno = save; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,save,msg,NULL); } char abuf[320]; SafeBuf am(abuf,320,0,false); am.setLabel("injbuf"); char *ct = NULL; // a success reply, include docid and url i guess if ( format == FORMAT_XML ) { am.safePrintf("<response>\n"); am.safePrintf("\t<statusCode>%li</statusCode>\n", (long)g_errno); am.safePrintf("\t<statusMsg><![CDATA["); am.cdataEncode(mstrerror(g_errno)); am.safePrintf("]]></statusMsg>\n"); am.safePrintf("\t<docId>%lli</docId>\n",xd->m_docId); if ( gr->m_getSections ) { SafeBuf *secBuf = xd->getInlineSectionVotingBuf(); am.safePrintf("\t<htmlSrc><![CDATA["); if ( secBuf->length() ) am.cdataEncode(secBuf->getBufStart()); am.safePrintf("]]></htmlSrc>\n"); } am.safePrintf("</response>\n"); ct = "text/xml"; } if ( format == FORMAT_JSON ) { am.safePrintf("{\"response\":{\n"); am.safePrintf("\t\"statusCode\":%li,\n",(long)g_errno); am.safePrintf("\t\"statusMsg\":\""); am.jsonEncode(mstrerror(g_errno)); am.safePrintf("\",\n"); am.safePrintf("\t\"docId\":%lli,\n",xd->m_docId); if ( gr->m_getSections ) { SafeBuf *secBuf = xd->getInlineSectionVotingBuf(); am.safePrintf("\t\"htmlSrc\":\""); if ( secBuf->length() ) am.jsonEncode(secBuf->getBufStart()); am.safePrintf("\",\n"); } // subtract ",\n" am.m_length -= 2; am.safePrintf("\n}\n}\n"); ct = "application/json"; } if ( format == FORMAT_XML || format == FORMAT_JSON ) { mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage(sock, am.getBufStart(), am.length(), 0, false, ct ); } // // debug // /* // now get the meta list, in the process it will print out a // bunch of junk into msg7->m_pbuf if ( xd->m_docId ) { char *metalist = xd->getMetaList ( 1,1,1,1,1,1 ); if ( ! metalist || metalist==(void *)-1){char *xx=NULL;*xx=0;} // print it out SafeBuf *pbuf = &msg7->m_sbuf; xd->printDoc( pbuf ); bool status = g_httpServer.sendDynamicPage( msg7->m_socket , pbuf->getBufStart(), pbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now mdelete ( st , sizeof(Msg7) , "PageInject" ); delete (st); // return the status return status; } */ // // end debug // char *url = gr->m_url; // . if we're talking w/ a robot he doesn't care about this crap // . send him back the error code (0 means success) if ( url && gr->m_shortReply ) { char buf[1024*32]; char *p = buf; // return docid and hostid if ( ! 
g_errno ) p += sprintf ( p , "0,docId=%lli,hostId=%li," , docId , hostId ); // print error number here else p += sprintf ( p , "%li,0,0,", (long)g_errno ); // print error msg out, too or "Success" p += sprintf ( p , "%s", mstrerror(g_errno)); mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage ( sock,buf, gbstrlen(buf) , -1/*cachetime*/); } SafeBuf sb; // print admin bar g_pages.printAdminTop ( &sb, sock , &gr->m_hr ); // print a response msg if rendering the page after a submission if ( g_errno ) sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>" "</center>", mstrerror(g_errno) , g_errno); else if ( (gr->m_url&&gr->m_url[0]) || (gr->m_queryToScrape&&gr->m_queryToScrape[0]) ) sb.safePrintf ( "<center><b>Successfully injected %s</b>" "</center><br>" , xd->m_firstUrl.m_url ); // print the table of injection parms g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // clear g_errno, if any, so our reply send goes through g_errno = 0; // calculate buffer length //long bufLen = p - buf; // nuke state mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); // . send this page // . encapsulates in html header and tail // . make a Mime // . i thought we need -2 for cacheTime, but i guess not return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1/*cachetime*/); }
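// --- illustrative sketch (not part of the original source) ---
// sendReply() renders the same status/docId pair as either XML or JSON
// depending on the requested format, and in the JSON branch trims the
// trailing ",\n" before closing the object (as the am.m_length -= 2 line
// does above). A sketch of that dispatch using std::string; buildReply()
// and Format are illustrative names, and the message is not escaped here.
#include <cstdio>
#include <string>

enum Format { FMT_XML , FMT_JSON };

static std::string buildReply ( Format fmt , int status ,
                                const char *msg , long long docId ) {
    char buf[256];
    std::string out;
    if ( fmt == FMT_XML ) {
        snprintf ( buf , sizeof(buf) ,
                   "<response>\n"
                   "\t<statusCode>%d</statusCode>\n"
                   "\t<statusMsg><![CDATA[%s]]></statusMsg>\n"
                   "\t<docId>%lld</docId>\n"
                   "</response>\n" , status , msg , docId );
        return std::string ( buf );
    }
    out += "{\"response\":{\n";
    snprintf ( buf , sizeof(buf) , "\t\"statusCode\":%d,\n" , status );
    out += buf;
    snprintf ( buf , sizeof(buf) , "\t\"statusMsg\":\"%s\",\n" , msg );
    out += buf;
    snprintf ( buf , sizeof(buf) , "\t\"docId\":%lld,\n" , docId );
    out += buf;
    // drop the trailing ",\n" so the object stays valid JSON
    out.erase ( out.size() - 2 );
    out += "\n}\n}\n";
    return out;
}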
bool gotXmlDoc ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // if we loaded from old title rec, it should be there! // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp //if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123")) // // use same dir that XmlDoc::getTestDir() would use // saveTestBuf ( "test-page-parser" ); // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; bool printIt = false; if ( st->m_u && st->m_u[0] ) printIt = true; if ( st->m_docId != -1LL ) printIt = true; if ( st->m_donePrinting ) printIt = false; // do not re-call this if printDocForProCog blocked... (check length()) if ( printIt ) { // mark as done st->m_donePrinting = true; // always re-compute the page inlinks dynamically, do not // use the ptr_linkInfo1 stored in titlerec!! // NO! not if set from titlerec/docid if ( st->m_recompute ) xd->m_linkInfo1Valid = false; // try a recompute regardless, because we do not store the // bad inlinkers, and ppl want to see why they are bad! //xd->m_linkInfo1Valid = false; // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf //char *metalist = xd->getMetaList ( ); //if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked //if ( metalist == (void *)-1 ) return false; // for debug... //if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // . print it out // . returns false if blocks, true otherwise // . sets g_errno on error if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) ) return false; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); } long isXml = st->m_r.getLong("xml",0); char ctype2 = CT_HTML; if ( isXml ) ctype2 = CT_XML; // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? &ctype2, -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
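// --- illustrative sketch (not part of the original source) ---
// gotXmlDoc() can be re-entered after printDocForProCog() blocks, so it
// sets st->m_donePrinting before the call that may block to keep the
// output from being generated twice. A tiny sketch of that re-entry
// guard; PrintState and printOnce() are hypothetical names.
struct PrintState {
    bool m_donePrinting;
};

// pretend this returns false when it blocks and a callback re-enters us
static bool printDoc ( PrintState * ) { return true; }

static bool printOnce ( PrintState *st ) {
    // already printed on a previous entry? nothing to do
    if ( st->m_donePrinting ) return true;
    // mark done *before* the call that can block, so a re-entry
    // triggered by its callback does not print a second time
    st->m_donePrinting = true;
    return printDoc ( st );
}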
// . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotTitleRec ( void *state ) { // cast the State4 out State4 *st = (State4 *) state; // get the socket TcpSocket *s = st->m_socket; SafeBuf sb; // get it's docId long long docId = st->m_docId; // make the query string for passing to different hosts char qs[64]; sprintf(qs,"&d=%lli",docId); if ( docId==0LL ) qs[0] = 0; // print standard header sb.reserve2x ( 32768 ); g_pages.printAdminTop (&sb, st->m_socket, &st->m_r ); //PAGE_TITLEDB, // st->m_username,//NULL , // st->m_coll , st->m_pwd , s->m_ip , qs ); // shortcut XmlDoc *xd = &st->m_xd; // . deal with errors // . print none if non title rec at or after the provided docId if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) { // print docId in box sb.safePrintf ( "<center>\nEnter docId: " "<input type=text name=d value=%lli size=15>", docId); sb.safePrintf ( "</form><br>\n" ); if ( docId == 0 ) sb.safePrintf("<br>"); else if ( g_errno ) sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno)); else sb.safePrintf("<br><br>No titleRec for that docId " "or higher"); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "\n</center>" ); mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage ( s , sb.getBufStart(), sb.length() ); } // print docId in box sb.safePrintf ("<br>\n" "<center>Enter docId: " "<input type=text name=d value=%lli size=15>", docId ); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "</form><br>\n" ); //char *coll = st->m_coll; Title *ti = xd->getTitle(); if ( ! ti ) { log ( "admin: Could not set title" ); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // sanity check. should not block if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; } // print it out xd->printDoc ( &sb ); // don't forget to cleanup mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length()); }
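// --- illustrative sketch (not part of the original source) ---
// gotTitleRec() tells the admin which shard (and its twins) should hold a
// docId via getShardNumFromDocId() and g_hostdb.getShard(). A toy sketch
// of the general docId -> shard -> first-host mapping; the modulo scheme
// and the Host/HostDb structs here are assumptions for illustration, not
// Gigablast's actual shard layout.
#include <cstdint>
#include <vector>

struct Host { long m_hostId; };

struct HostDb {
    long numShards;
    std::vector<std::vector<Host> > shards;   // shards[i] = twins of shard i
    const Host *firstHostOfShard ( long shard ) const {
        if ( shard < 0 || shard >= numShards ) return 0;
        if ( shards[shard].empty() ) return 0;
        return &shards[shard][0];
    }
};

// assumption: docIds are spread over shards with a simple modulo
static long shardNumFromDocId ( int64_t docId , long numShards ) {
    if ( numShards <= 0 ) return 0;
    return (long)( (uint64_t)docId % (uint64_t)numShards );
}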