//////////////////////////////////////////////////////////////////////////
// check whether the user requesting to log in from the client side is in the repository
// if so, return a Login type message
// else return a warning.
Message MessageHandler::LoginMessage()
{
    strVal path = _repositoryPath + "Users.xml";
    XmlDoc doc;
    if (!doc.LoadXmlFile(path))
        return WarningMessage("Users file broken, please contact administrator.");
    strVal name = GetMessageName();
    strVal tag = "User";
    strVal content;
    xmlRep rep;
    vector<XmlDoc> users = doc.Children(tag);
    for (size_t i = 0; i < users.size(); ++i)
    {
        strVal userName = users[i].Children("Name")[0].InnerText();
        if (userName == name)
        {
            _loginUsers[_curIP] = name;
            return _msg;
        }
    }
    return WarningMessage("User is not in the repository.");
}
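// A minimal sketch of the Users.xml layout that LoginMessage() above appears to
// expect, inferred from doc.Children("User") and the nested "Name" lookup; the
// root element name and the sample user names are assumptions, not taken from
// the source:
//
//   <Users>
//     <User>
//       <Name>alice</Name>
//     </User>
//     <User>
//       <Name>bob</Name>
//     </User>
//   </Users>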
void handleRequest7 ( UdpSlot *slot , long netnice ) {
    //m_state    = state;
    //m_callback = callback;

    // shortcut
    XmlDoc *xd;
    try { xd = new (XmlDoc); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("PageInject: import failed: new(%i): %s",
            (int)sizeof(XmlDoc),mstrerror(g_errno));
        sendReply(slot);
        return;
    }
    mnew ( xd, sizeof(XmlDoc) , "PageInject" );
    //xd->reset();

    char *titleRec     = slot->m_readBuf;
    long  titleRecSize = slot->m_readBufSize;

    long collnum = *(long *)titleRec;
    titleRec     += 4;
    titleRecSize -= 4;

    CollectionRec *cr = g_collectiondb.m_recs[collnum];
    if ( ! cr ) { sendReply(slot); return; }

    // if injecting a titlerec from an import operation use set2()
    //if ( m_sbuf.length() > 0 ) {
    xd->set2 ( titleRec,      //m_sbuf.getBufStart() ,
               titleRecSize,  //m_sbuf.length() ,
               cr->m_coll ,
               NULL,          // pbuf
               MAX_NICENESS ,
               NULL );        // sreq

    // log it i guess
    log("inject: importing %s",xd->m_firstUrl.getUrl());

    // call this when done indexing
    //xd->m_masterState = this;
    //xd->m_masterLoop  = doneInjectingWrapper9;
    xd->m_state     = xd;//this;
    xd->m_callback1 = doneInjectingWrapper10;

    xd->m_isImporting      = true;
    xd->m_isImportingValid = true;

    // hack this
    xd->m_slot = slot;

    // then index it
    if ( ! xd->indexDoc() )
        // return if would block
        return;

    // all done?
    //return true;
    sendReply ( slot );
}
bool gotReplyWrapperxd ( void *state ) {
    // grab it
    XmlDoc *xd = (XmlDoc *)state;
    // get it
    UdpSlot *slot = (UdpSlot *)xd->m_slot;
    // parse the request
    Msg20Request *req = (Msg20Request *)slot->m_readBuf;
    // print time
    long long took = gettimeofdayInMilliseconds() - xd->m_setTime;
    // if there is a backlog of msg20 summary generation requests this
    // is really not the cpu it took to make the summary, but how long it
    // took to get the reply. this request might have had to wait for the
    // other summaries to finish computing before it got its turn,
    // meanwhile its clock was ticking. TODO: make this better?
    // only do for niceness 0 otherwise it gets interrupted by quickpoll
    // and can take a long time.
    if ( (req->m_isDebug || took > 100) && req->m_niceness == 0 )
        log("query: Took %lli ms to compute summary for d=%lli u=%s "
            "niceness=%li",
            took, xd->m_docId,xd->m_firstUrl.m_url, xd->m_niceness );
    // error?
    if ( g_errno ) { xd->m_reply.sendReply ( xd ); return true; }
    // this should not block now
    Msg20Reply *reply = xd->getMsg20Reply ( );
    // sanity check, should not block here now
    if ( reply == (void *)-1 ) { char *xx=NULL;*xx=0; }
    // NULL means error, -1 means blocked. on error g_errno should be set
    if ( ! reply && ! g_errno ) { char *xx=NULL;*xx=0; }
    // send it off. will send an error reply if g_errno is set
    return reply->sendReply ( xd );
}
TEST_F(XmlDocTest, PosdbGetMetaListNewDoc) {
    const char *url = "http://www.example.test/index.html";
    char contentNew[] = "<html><head><title>my title</title></head><body>new document</body></html>";

    XmlDoc xmlDocNew;
    initializeDocForPosdb(&xmlDocNew, url, contentNew);

    xmlDocNew.getMetaList(false);

    auto metaListKeys = parseMetaList(xmlDocNew.m_metaList, xmlDocNew.m_metaListSize);

    // make sure positive special key is there (to clear out existing negative special key)
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, false));
    EXPECT_FALSE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, true));

    // make sure title & body text is indexed
    // title
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "mytitle"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("mytitle"), false));

    // body
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("new"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("document"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("newdocument"), false));

    /// @todo ALC add other terms
}
//////////////////////////////////////////////////////////////////////////
// Extract the InnerText of the Name tag in the message
strVal MessageHandler::GetMessageName()
{
    strVal tag = "Name";
    XmlDoc doc = _msg.Doc();
    vector<XmlDoc> names = doc.Children(tag);
    if (names.size() <= 0)
        return "";
    return names[0].InnerText();
}
void CClientInfo::Config(const string& fileName)
{
    ifstream inFile(fileName.c_str(), ifstream::in);
    string inXml, inLine;
    while (getline(inFile, inLine))
        inXml += inLine;

    XmlVec confData(inXml.begin(), inXml.end());
    confData.push_back('\0');

    XmlDoc xmlDoc;
    try {
        xmlDoc.parse<0>(&confData[0]);
    } catch (...) {
        LOG("ERROR: failed to load the server config file.");
        return;
    }

    XmlNodePtr pNode = NULL;
    if ((pNode = xmlDoc.first_node("server_list"))) {
        XmlNodePtr pItem = pNode->first_node("server");
        while (pItem) {
            int pId = HAS_ATTR(pItem, "id") ? atoi(ATTR_VALUE(pItem, "id")) : -1;
            string ipStr = HAS_ATTR(pItem, "ip") ? ATTR_VALUE(pItem, "ip") : "127.0.0.1";
            int port = HAS_ATTR(pItem, "port") ? atoi(ATTR_VALUE(pItem, "port")) : 12345;
            m_svrMap[pId] = pair<string, int>(ipStr, port);
            pItem = pItem->next_sibling("server");
        }
    }

    if ((pNode = xmlDoc.first_node("parameters"))) {
        m_connNum = HAS_NODE(pNode, "conn_num") ? atoi(NODE_VALUE(pNode, "conn_num")) : 1;
        m_fixRate = HAS_NODE(pNode, "fix_rate") ? atoi(NODE_VALUE(pNode, "fix_rate")) : 0;
        m_initTimeout = HAS_NODE(pNode, "init_timeout") ? atoi(NODE_VALUE(pNode, "init_timeout")) : 20;
        m_videoLen = HAS_NODE(pNode, "video_len") ? atoi(NODE_VALUE(pNode, "video_len")) : 278;
        m_initBufferSize = HAS_NODE(pNode, "init_buffer_size") ? atoi(NODE_VALUE(pNode, "init_buffer_size")) : 10;
    }

    LOG("--------------------------------------------------");
    svrMap_t::const_iterator cIt = m_svrMap.begin();
    for (; cIt != m_svrMap.end(); ++cIt) {
        LOG("|server: id=%d, %s:%d.", (*cIt).first,
            (*cIt).second.first.c_str(), (*cIt).second.second);
    }
    LOG("|number of connections: %d.", m_connNum);
    LOG("|video length: %d.", m_videoLen);
    LOG("|fix rate: %d.", m_fixRate);
    LOG("|init buffer timeout: %d.", m_initTimeout);
    LOG("|init buffer size: %d.", m_initBufferSize);
    LOG("--------------------------------------------------");

    xmlDoc.clear();

    m_rateVec.push_back(350);
    m_rateVec.push_back(470);
    m_rateVec.push_back(630);
    m_rateVec.push_back(845);
    m_rateVec.push_back(1130);
    m_rateVec.push_back(1520);
    m_rateVec.push_back(2040);
}
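// A minimal example of the config file shape CClientInfo::Config() above appears
// to expect, inferred from the node and attribute names it reads; the sample
// values and the exact top-level layout are assumptions, not taken from the
// source:
//
//   <server_list>
//     <server id="0" ip="192.168.1.10" port="12345"/>
//     <server id="1" ip="192.168.1.11" port="12346"/>
//   </server_list>
//   <parameters>
//     <conn_num>2</conn_num>
//     <fix_rate>0</fix_rate>
//     <init_timeout>20</init_timeout>
//     <video_len>278</video_len>
//     <init_buffer_size>10</init_buffer_size>
//   </parameters>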
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the titleRec of "docId" given via cgi
// . call g_httpServer.sendDynamicPage() to send it
bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) {
    // get the docId from the cgi vars
    long long docId = r->getLongLong ("d", 0LL );
    // set up a msg22 to get the next titleRec
    State4 *st ;
    try { st = new (State4); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("PageTitledb: new(%i): %s",
            (int)sizeof(State4),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    mnew ( st , sizeof(State4) , "PageTitledb");
    // save the socket
    st->m_socket = s;
    // copy it
    st->m_r.copy ( r );
    // remember if http request is internal/local or not
    st->m_isRootAdmin = g_conf.isCollAdmin ( s , r );
    st->m_isLocal     = r->isLocal();
    st->m_docId       = docId;
    // password, too
    st->m_pwd = r->getString ( "pwd" );
    // get the collection
    long  collLen = 0;
    char *coll    = st->m_r.getString("c",&collLen);
    if ( ! coll || ! coll[0] ) {
        //coll = g_conf.m_defaultColl;
        coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
        collLen = gbstrlen(coll);
    }
    st->m_coll    = coll;
    st->m_collLen = collLen;
    // just print page if no docid provided
    if ( ! docId ) return gotTitleRec ( st );
    // get the handy XmlDoc
    XmlDoc *xd = &st->m_xd;
    // use 0 for niceness
    xd->set3 ( docId , coll , 0 );
    // callback
    xd->setCallback ( st , gotTitleRec );
    // . and tell it to load from old title rec
    // . this sets all the member vars from it and also sets
    //   m_titleRecBuf to contain the actual compressed title rec
    if ( ! xd->loadFromOldTitleRec ( ) ) return false;
    // we got it without blocking. cached?
    return gotTitleRec ( st );
}
// when XmlDoc::inject() completes it calls this
void doneInjectingWrapper9 ( void *state ) {
    Msg7 *msg7 = (Msg7 *)state;
    msg7->m_inUse = false;
    // shortcut
    XmlDoc *xd = &msg7->m_xd;
    GigablastRequest *gr = &msg7->m_gr;
    if ( gr->m_getSections && ! gr->m_gotSections ) {
        // do not re-call
        gr->m_gotSections = true;
        // new callback now, same state
        xd->m_callback1 = doneInjectingWrapper9;
        // and if it blocks internally, it will call
        // getInlineSectionVotingBuf until it completes then it will
        // call xd->m_callback
        xd->m_masterLoop = NULL;
        // get sections
        SafeBuf *buf = xd->getInlineSectionVotingBuf();
        // if it returns -1 wait for it to call wrapper10 when done
        if ( buf == (void *)-1 ) return;
        // error?
        if ( ! buf )
            log("inject: error getting sections: %s",
                mstrerror(g_errno));
    }
 loop:
    // if we were injecting delimited documents...
    char *delim = gr->m_contentDelim;
    if ( delim && ! delim[0] ) delim = NULL;
    if ( delim && msg7->m_start ) {
        // do another injection. returns false if it blocks
        if ( ! msg7->inject ( msg7->m_state , msg7->m_callback ) )
            return;
    }
    if ( msg7->m_start && delim ) goto loop;
    // and we call the original caller
    msg7->m_callback ( msg7->m_state );
}
XmlType getXmlTypeNoThrow(const XmlDoc& doc) //throw()
{
    if (doc.root().getNameAs<std::string>() == "FreeFileSync")
    {
        std::string type;
        if (doc.root().getAttribute("XmlType", type))
        {
            if (type == "GUI")
                return XML_TYPE_GUI;
            else if (type == "BATCH")
                return XML_TYPE_BATCH;
            else if (type == "GLOBAL")
                return XML_TYPE_GLOBAL;
        }
    }
    return XML_TYPE_OTHER;
}
//////////////////////////////////////////////////////////////////////////
// check whether the uploaded file is not in the repository
// or the user owns the file
bool MessageHandler::OKtoCheckin(strVal fileName)
{
    fileName = GetKeyName(fileName) + ".xml";
    XmlDoc doc;
    // package not in the repository, ok to check in
    if (!doc.LoadXmlFile(_repositoryPath + _metaFolder + fileName))
        if (!doc.LoadXmlFile(_repositoryPath + _checkinFoler + fileName))
            return true;
    vector<XmlDoc> elems = doc.Children("owner");
    if (elems.size() <= 0)
        return false;
    // checkin is allowed only if the user is the package owner
    strVal owner = elems[0].InnerText();
    return owner == _curUser;
}
void setXmlType(XmlDoc& doc, XmlType type) //throw()
{
    switch (type)
    {
        case XML_TYPE_GUI:
            doc.root().setAttribute("XmlType", "GUI");
            break;
        case XML_TYPE_BATCH:
            doc.root().setAttribute("XmlType", "BATCH");
            break;
        case XML_TYPE_GLOBAL:
            doc.root().setAttribute("XmlType", "GLOBAL");
            break;
        case XML_TYPE_OTHER:
            assert(false);
            break;
    }
}
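// A minimal usage sketch for the two helpers above. It assumes "doc" is an
// already-loaded XmlDoc whose root element is named "FreeFileSync", which is
// what getXmlTypeNoThrow() checks for; the function name below is illustrative
// and not part of the original code.
void tagAsGuiConfigIfUntyped(XmlDoc& doc) //throw()
{
    if (getXmlTypeNoThrow(doc) == XML_TYPE_OTHER) // no or unrecognized "XmlType" attribute yet
        setXmlType(doc, XML_TYPE_GUI);            // stamp XmlType="GUI" onto the root element
}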
TEST_F(XmlDocTest, PosdbGetMetaListChangedDoc) {
    const char *url = "http://www.example.test/index.html";
    char contentOld[] = "<html><head><title>my title</title></head><body>old document</body></html>";
    char contentNew[] = "<html><head><title>my title</title></head><body>new document</body></html>";

    XmlDoc *xmlDocOld = new XmlDoc();
    mnew(xmlDocOld, sizeof(*xmlDocOld), "XmlDoc");
    initializeDocForPosdb(xmlDocOld, url, contentOld);

    XmlDoc xmlDocNew;
    initializeDocForPosdb(&xmlDocNew, url, contentNew);
    xmlDocNew.m_oldDocValid = true;
    xmlDocNew.m_oldDoc = xmlDocOld;

    xmlDocNew.getMetaList(false);

    auto metaListKeys = parseMetaList(xmlDocNew.m_metaList, xmlDocNew.m_metaListSize);

    // make sure no special key is inserted (positive or negative)
    EXPECT_FALSE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, false));
    EXPECT_FALSE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, POSDB_DELETEDOC_TERMID, true));

    // make sure title & body text is indexed (with difference between old & new document deleted)
    // title
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title", "mytitle"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("my"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("title"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("mytitle"), false));

    // body
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("new"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("document"), false));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("newdocument"), false));

    // deleted terms
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("old"), true));
    EXPECT_TRUE(posdbFindRecord(metaListKeys, xmlDocNew.m_docId, hashWord("olddocument"), true));

    /// @todo ALC add other terms
}
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
    // get it
    State60 *st = (State60 *)state;
    // get the tcp socket from the state
    TcpSocket *s = st->m_socket;
    // get it
    XmlDoc *xd = &st->m_xd;
    if ( ! xd->m_loaded ) {
        // setting just the docid. niceness is 0.
        xd->set3 ( st->m_docId , st->m_coll , 0 );
        // callback
        xd->setCallback ( state , processLoop );
        // . and tell it to load from the old title rec
        // . if it returns false it blocked and will call our callback
        //   processLoop() when it completes
        if ( ! xd->loadFromOldTitleRec ( ) ) return false;
    }
    if ( g_errno ) return sendErrorReply ( st , g_errno );
    // get the utf8 content
    char **utf8 = xd->getUtf8Content();
    //int32_t len = xd->size_utf8Content - 1;
    // wait if blocked???
    if ( utf8 == (void *)-1 ) return false;
    // strange
    if ( xd->size_utf8Content <= 0 )
        return sendErrorReply ( st , EBADENGINEER );
    // alloc error?
    if ( ! utf8 ) return sendErrorReply ( st , g_errno );
    // get this host
    Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
    if ( ! h ) return sendErrorReply ( st , EBADENGINEER );
    // make it into an editable page now for the turk guy
    return sendTurkPageReply ( st );
}
//////////////////////////////////////////////////////////////////////////
// generate a Dependency type message
Message MessageHandler::DependencyMessage()
{
    const strVal name = GetMessageName();
    if (name == "*.*")
        return AllPackageMessage();
    strVal path = _repositoryPath + _metaFolder + name + ".xml";
    strVal tag = "references";
    strVal content;
    XmlDoc doc;
    xmlRep rep;
    if (!doc.LoadXmlFile(path))
        return WarningMessage(name + ": the package is not in the repository.");
    vector<XmlDoc> refs = doc.Children(tag);
    if (refs.size() <= 0)
        return WarningMessage(name + ": the package does not depend on other packages.");
    // find all dependency children under the references tag
    refs = refs[0].Children();
    if (refs.size() <= 0)
        return WarningMessage(name + ": the package does not depend on other packages.");
    tag = "Name";
    for (size_t i = 0; i < refs.size(); ++i)
    {
        content = refs[i].InnerText();
        content = GetKeyName(content);
        xmlElem elem(tag, content);
        rep.addSibling(elem);
    }
    tag = MsgType::EnumToString(MsgType::Dependency);
    rep.makeParent(tag);
    return Message(rep.xmlStr());
}
void XmlSignature::sign(XmlDoc &xml, const Sec::Key &key)
{
    try
    {
        XmlElementPtr rootNode{ xml.root() };
        XmlElementPtr sigNode{ std::make_shared<XmlElement>(
            *_signature->createBlankSignature(xml, CANON_C14N_COM, SIGNATURE_RSA, HASH_SHA1)) };
        rootNode->add(*sigNode);

        DSIGReference *ref = _signature->createReference(MAKE_UNICODE_STRING(""));
        ref->appendEnvelopedSignatureTransform();

        _signature->setSigningKey(key);
        _signature->sign();
        _signed = true;
    }
    catch (XSECException &ex)
    {
        rethrowWithMessage(ex, "An error occurred while signing XML");
    }
}
//////////////////////////////////////////////////////////////////////////
// generate a Package type message
Message MessageHandler::PackageMessage()
{
    const strVal name = GetMessageName();
    strVal path = _repositoryPath + _metaFolder + name + ".xml";
    XmlDoc doc;
    strVal prefix = "\n ";
    if (!doc.LoadXmlFile(path))
        return WarningMessage(name + ": the package is not in the repository.");
    vector<XmlDoc> refs;
    strVal packInfo;
    if ((refs = doc.Children("head")).size() > 0)
        packInfo += prefix + refs[0].InnerText();
    if ((refs = doc.Children("implement")).size() > 0)
        packInfo += prefix + refs[0].InnerText();
    if (packInfo.size() > 0)
        packInfo = "Package files: " + packInfo;
    else
        return WarningMessage("Package metadata file broken, please contact administrator.");
    strVal dependency = "\n \nDependencies: ";
    if ((refs = doc.Children("references")).size() > 0)
        if ((refs = refs[0].Children("reference")).size() > 0)
            for (size_t i = 0; i < refs.size(); ++i)
                dependency += prefix + refs[i].InnerText();
    strVal owner = "\n \nPackage Owner: ";
    if ((refs = doc.Children("owner")).size() > 0)
        owner += prefix + refs[0].InnerText();
    xmlRep rep(xmlElem("Name", packInfo + dependency + owner));
    rep.makeParent(MsgType::EnumToString(MsgType::Package));
    return Message(rep.xmlStr());
}
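// A minimal sketch of the per-package metadata file that OKtoCheckin(),
// DependencyMessage() and PackageMessage() above appear to read. The element
// names come from the Children() calls in those functions; the root element
// name and the sample values are assumptions, not taken from the source:
//
//   <package>
//     <head>MyPackage.h</head>
//     <implement>MyPackage.cpp</implement>
//     <references>
//       <reference>OtherPackage</reference>
//     </references>
//     <owner>alice</owner>
//   </package>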
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
    // get the collection
    long  collLen = 0;
    char *coll    = r->getString("c",&collLen);
    if ( ! coll || ! coll[0] ) {
        //coll = g_conf.m_defaultColl;
        coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
        collLen = gbstrlen(coll);
    }
    // ensure collection not too big
    if ( collLen >= MAX_COLL_LEN ) {
        g_errno = ECOLLTOOBIG;
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // get the collection rec
    CollectionRec *cr = g_collectiondb.getRec ( coll );
    if ( ! cr ) {
        g_errno = ENOCOLLREC;
        log("query: Archived copy retrieval failed. "
            "No collection record found for "
            "collection \"%s\".",coll);
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // does this collection ban this IP?
    if ( ! cr->hasSearchPermission ( s ) ) {
        g_errno = ENOPERM;
        //log("PageGet::sendDynamicReply0: permission denied for %s",
        //    iptoa(s->m_ip) );
        g_msg = " (error: permission denied)";
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // . get fields from cgi field of the requested url
    // . get the search query
    long  qlen = 0;
    char *q = r->getString ( "q" , &qlen , NULL /*default*/);
    // ensure query not too big
    if ( qlen >= MAX_QUERY_LEN-1 ) {
        g_errno=EQUERYTOOBIG;
        return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
    }
    // the docId
    long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
    // get url
    char *url = r->getString ( "u",NULL);
    if ( docId == 0 && ! url ) {
        g_errno = EMISSINGINPUT;
        return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
    }
    // . should we do a sequential lookup?
    // . we need to match summary here so we need to know this
    //bool seq = r->getLong ( "seq" , false );
    // restrict to root file?
    bool rtq = r->getLong ( "rtq" , false );
    // . get the titleRec
    // . TODO: redirect client to a better http server to save bandwidth
    State2 *st ;
    try { st = new (State2); }
    catch (... ) {
        g_errno = ENOMEM;
        log("PageGet: new(%i): %s",
            (int)sizeof(State2),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    mnew ( st , sizeof(State2) , "PageGet1" );
    // save the socket and if Host: is local in the Http request Mime
    st->m_socket  = s;
    st->m_isAdmin = g_conf.isCollAdmin ( s , r );
    st->m_isLocal = r->isLocal();
    st->m_docId   = docId;
    st->m_printed = false;
    // include header ... "this page cached by Gigablast on..."
    st->m_includeHeader     = r->getLong ("ih"    , true );
    st->m_includeBaseHref   = r->getLong ("ibh"   , false );
    st->m_queryHighlighting = r->getLong ("qh"    , true );
    st->m_strip             = r->getLong ("strip" , 0 );
    st->m_clickAndScroll    = r->getLong ("cas"   , true );
    st->m_cnsPage           = r->getLong ("cnsp"  , true );
    char *langAbbr = r->getString("qlang",NULL);
    st->m_langId = langUnknown;
    if ( langAbbr ) {
        uint8_t langId = getLangIdFromAbbr ( langAbbr );
        st->m_langId = langId;
    }
    strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
    // store query for query highlighting
    st->m_netTestResults = r->getLong ("rnettest", false );
    //if( st->m_netTestResults ) {
    //    mdelete ( st , sizeof(State2) , "PageGet1" );
    //    delete ( st );
    //    return sendPageNetResult( s );
    //}
    if ( q && qlen > 0 ) strcpy ( st->m_q , q );
    else                 st->m_q[0] = '\0';
    st->m_qlen = qlen;
    //st->m_seq = seq;
    st->m_rtq = rtq;
    st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ );
    st->m_isBanned  = false;
    st->m_noArchive = false;
    st->m_socket = s;
    st->m_format = r->getReplyFormat();
    // default to 0 niceness
    st->m_niceness = 0;
    st->m_r.copy ( r );
    //st->m_cr = cr;
    st->m_printDisclaimer = true;
    if ( st->m_cnsPage ) st->m_printDisclaimer = false;
    if ( st->m_strip ) // ! st->m_evbits.isEmpty() )
        st->m_printDisclaimer = false;
    // should we cache it?
    char useCache = r->getLong ( "usecache" , 1 );
    char rcache   = r->getLong ( "rcache"   , 1 );
    char wcache   = r->getLong ( "wcache"   , 1 );
    long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
    if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
    if ( rcache   == 0 )   cacheAge = 0;
    // . fetch the TitleRec
    // . a max cache age of 0 means not to read from the cache
    XmlDoc *xd = &st->m_xd;
    // url based?
    if ( url ) {
        SpiderRequest sreq;
        sreq.reset();
        strcpy(sreq.m_url, url );
        sreq.setDataSize();
        // this returns false if "coll" is invalid
        if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
            goto hadSetError;
    }
    // . when getTitleRec() is called it will load the old one
    //   since XmlDoc::m_setFromTitleRec will be true
    // . niceness is 0
    // . use st->m_coll since XmlDoc just points to it!
    // . this returns false if "coll" is invalid
    else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
    hadSetError:
        mdelete ( st , sizeof(State2) , "PageGet1" );
        delete ( st );
        g_errno = ENOMEM;
        log("PageGet: set3: %s", mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // if it blocks while it loads title rec, it will re-call this routine
    xd->setCallback ( st , processLoopWrapper );
    // good to go!
    return processLoop ( st );
}
// returns false if blocked, true otherwise bool processLoop ( void *state ) { // get it State2 *st = (State2 *)state; // get the tcp socket from the state TcpSocket *s = st->m_socket; // get it XmlDoc *xd = &st->m_xd; if ( ! xd->m_loaded ) { // setting just the docid. niceness is 0. //xd->set3 ( st->m_docId , st->m_coll , 0 ); // callback xd->setCallback ( state , processLoop ); // . and tell it to load from the old title rec // . this sets xd->m_oldTitleRec/m_oldTitleRecSize // . this sets xd->ptr_* and all other member vars from // the old title rec if found in titledb. if ( ! xd->loadFromOldTitleRec ( ) ) return false; } if ( g_errno ) return sendErrorReply ( st , g_errno ); // now force it to load old title rec //char **tr = xd->getTitleRec(); SafeBuf *tr = xd->getTitleRecBuf(); // blocked? return false if so. it will call processLoop() when it rets if ( tr == (void *)-1 ) return false; // we did not block. check for error? this will free "st" too. if ( ! tr ) return sendErrorReply ( st , g_errno ); // if title rec was empty, that is a problem if ( xd->m_titleRecBuf.length() == 0 ) return sendErrorReply ( st , ENOTFOUND); // set callback char *na = xd->getIsNoArchive(); // wait if blocked if ( na == (void *)-1 ) return false; // error? if ( ! na ) return sendErrorReply ( st , g_errno ); // forbidden? allow turkeys through though... if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE ); SafeBuf *sb = &st->m_sb; // &page=4 will print rainbow sections if ( ! st->m_printed && st->m_r.getLong("page",0) ) { // do not repeat this call st->m_printed = true; // this will call us again since we called // xd->setCallback() above to us if ( ! xd->printDocForProCog ( sb , &st->m_r ) ) return false; } char *contentType = "text/html"; char format = st->m_format; if ( format == FORMAT_XML ) contentType = "text/xml"; if ( format == FORMAT_JSON ) contentType = "application/json"; // if we printed a special page (like rainbow sections) then return now if ( st->m_printed ) { bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen, sb->getBufStart(), sb->getLength(), -1,false, //"text/html", contentType, -1, NULL, "utf8" ); // nuke state2 mdelete ( st , sizeof(State2) , "PageGet1" ); delete (st); return status; } /* // this was calling XmlDoc and setting sections, etc. to // get the SpiderReply junk... no no no // is it banned or filtered? this ignores the TagRec in the titleRec // and uses msg8a to get it fresh instead char *vi = xd->getIsFiltered();//Visible( ); // wait if blocked if ( vi == (void *)-1 ) return false; // error? if ( ! vi ) return sendErrorReply ( st , g_errno ); // banned? if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED); */ // get the utf8 content char **utf8 = xd->getUtf8Content(); //long len = xd->size_utf8Content - 1; // wait if blocked??? if ( utf8 == (void *)-1 ) return false; // strange if ( xd->size_utf8Content<=0) { log("pageget: utf8 content <= 0"); return sendErrorReply(st,EBADENGINEER ); } // alloc error? if ( ! utf8 ) return sendErrorReply ( st , g_errno ); // get this host Host *h = g_hostdb.getHost ( g_hostdb.m_hostId ); if ( ! 
h ) { log("pageget: hostid %li is bad",g_hostdb.m_hostId); return sendErrorReply(st,EBADENGINEER ); } char *content = xd->ptr_utf8Content; long contentLen = xd->size_utf8Content - 1; // shortcut char strip = st->m_strip; // alloc buffer now //char *buf = NULL; //long bufMaxSize = 0; //bufMaxSize = len + ( 32 * 1024 ) ; //bufMaxSize = contentLen + ( 32 * 1024 ) ; //buf = (char *)mmalloc ( bufMaxSize , "PageGet2" ); //char *p = buf; //char *bufEnd = buf + bufMaxSize; //if ( ! buf ) { // return sendErrorReply ( st , g_errno ); //} // for undoing the header //char *start1 = p; long startLen1 = sb->length(); // we are always utfu if ( strip != 2 ) sb->safePrintf( "<meta http-equiv=\"Content-Type\" " "content=\"text/html;charset=utf8\">\n"); // base href //Url *base = &xd->m_firstUrl; //if ( xd->ptr_redirUrl.m_url[0] ) // base = &xd->m_redirUrl; char *base = xd->ptr_firstUrl; if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl; //Url *redir = *xd->getRedirUrl(); if ( strip != 2 ) { sb->safePrintf ( "<BASE HREF=\"%s\">" , base ); //p += gbstrlen ( p ); } // default colors in case css files missing if ( strip != 2 ) { sb->safePrintf( "\n<style type=\"text/css\">\n" "body{background-color:white;color:black;}\n" "</style>\n"); //p += gbstrlen ( p ); } //char format = st->m_format; if ( format == FORMAT_XML ) sb->reset(); if ( format == FORMAT_JSON ) sb->reset(); // for undoing the stuff below long startLen2 = sb->length();//p; // query should be NULL terminated char *q = st->m_q; long qlen = st->m_qlen; char styleTitle[128] = "font-size:14px;font-weight:600;" "color:#000000;"; char styleText[128] = "font-size:14px;font-weight:400;" "color:#000000;"; char styleLink[128] = "font-size:14px;font-weight:400;" "color:#0000ff;"; char styleTell[128] = "font-size:14px;font-weight:600;" "color:#cc0000;"; // get the url of the title rec Url *f = xd->getFirstUrl(); bool printDisclaimer = st->m_printDisclaimer; if ( xd->m_contentType == CT_JSON ) printDisclaimer = false; if ( format == FORMAT_XML ) printDisclaimer = false; if ( format == FORMAT_JSON ) printDisclaimer = false; char tbuf[100]; tbuf[0] = 0; time_t lastSpiderDate = xd->m_spideredTime; if ( printDisclaimer || format == FORMAT_XML || format == FORMAT_JSON ) { struct tm *timeStruct = gmtime ( &lastSpiderDate ); strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct); } // We should always be displaying this disclaimer. // - May eventually want to display this at a different location // on the page, or on the click 'n' scroll browser page itself // when this page is not being viewed solo. // CNS: if ( ! st->m_clickNScroll ) { if ( printDisclaimer ) { sb->safePrintf(//sprintf ( p , //"<BASE HREF=\"%s\">" //"<table border=1 width=100%%>" //"<tr><td>" "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" cellpadding=\"10\" " //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\"" "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">" "<tr" //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\"" "><td>" //"<font face=times,sans-serif color=black size=-1>" "<span style=\"%s\">" "This is Gigablast's cached page of </span>" "<a href=\"%s\" style=\"%s\">%s</a>" "" , styleTitle, f->getUrl(), styleLink, f->getUrl() ); //p += gbstrlen ( p ); // then the rest //sprintf(p , sb->safePrintf( "<span style=\"%s\">. 
" "Gigablast is not responsible for the content of " "this page.</span>", styleTitle ); //p += gbstrlen ( p ); sb->safePrintf ( "<br/><span style=\"%s\">" "Cached: </span>" "<span style=\"%s\">", styleTitle, styleText ); //p += gbstrlen ( p ); // then the spider date in GMT // time_t lastSpiderDate = xd->m_spideredTime; // struct tm *timeStruct = gmtime ( &lastSpiderDate ); // char tbuf[100]; // strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct); //p += gbstrlen ( p ); sb->safeStrcpy(tbuf); // Moved over from PageResults.cpp sb->safePrintf( "</span> - <a href=\"" "/get?" "q=%s&c=%s&rtq=%li&" "d=%lli&strip=1\"" " style=\"%s\">" "[stripped]</a>", q , st->m_coll , (long)st->m_rtq, st->m_docId, styleLink ); // a link to alexa if ( f->getUrlLen() > 5 ) { sb->safePrintf( " - <a href=\"http:" "//web.archive.org/web/*/%s\"" " style=\"%s\">" "[older copies]</a>" , f->getUrl(), styleLink ); } if (st->m_noArchive){ sb->safePrintf( " - <span style=\"%s\"><b>" "[NOARCHIVE]</b></span>", styleTell ); } if (st->m_isBanned){ sb->safePrintf(" - <span style=\"%s\"><b>" "[BANNED]</b></span>", styleTell ); } // only print this if we got a query if ( qlen > 0 ) { sb->safePrintf("<br/><br/><span style=\"%s\"> " "These search terms have been " "highlighted: ", styleText ); //p += gbstrlen ( p ); } } // how much space left in p? //long avail = bufEnd - p; // . make the url that we're outputting for (like in PageResults.cpp) // . "thisUrl" is the baseUrl for click & scroll char thisUrl[MAX_URL_LEN]; char *thisUrlEnd = thisUrl + MAX_URL_LEN; char *x = thisUrl; // . use the external ip of our gateway // . construct the NAT mapped port // . you should have used iptables to map port to the correct // internal ip:port //unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp; //unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort // local check //if ( st->m_isLocal ) { unsigned long ip = h->m_ip; unsigned short port = h->m_httpPort; //} //sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port ); // . we no longer put the port in here // . but still need http:// since we use <base href=> if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip)); else sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port); x += gbstrlen ( x ); // the query url encoded long elen = urlEncode ( x , thisUrlEnd - x , q , qlen ); x += elen; // separate cgi vars with a & //sprintf ( x, "&seq=%li&rtq=%lid=%lli", // (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId()); sprintf ( x, "&d=%lli",st->m_docId ); x += gbstrlen(x); // set our query for highlighting Query qq; qq.set2 ( q, st->m_langId , true ); // print the query terms into our highlight buffer Highlight hi; // make words so we can set the scores to ignore fielded terms Words qw; qw.set ( q , // content being highlighted, utf8 qlen , // content being highlighted, utf8 TITLEREC_CURRENT_VERSION, true , // computeIds false ); // hasHtmlEntities? // . assign scores of 0 to query words that should be ignored // . TRICKY: loop over words in qq.m_qwords, but they should be 1-1 // with words in qw. // . sanity check //if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;} // declare up here Matches m; // do the loop //Scores ss; //ss.set ( &qw , NULL ); //for ( long i = 0 ; i < qq.m_numWords ; i++ ) // if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0; // now set m.m_matches[] to those words in qw that match a query word // or phrase in qq. m.setQuery ( &qq ); //m.addMatches ( &qw , &ss , true ); m.addMatches ( &qw ); long hilen = 0; // CNS: if ( ! 
st->m_clickNScroll ) { // and highlight the matches if ( printDisclaimer ) { hilen = hi.set ( //p , //avail , sb , &qw , // words to highlight &m , // matches relative to qw false , // doSteming false , // st->m_clickAndScroll , (char *)thisUrl );// base url for ClcknScrll //p += hilen; // now an hr //memcpy ( p , "</span></table></table>\n" , 24 ); p += 24; sb->safeStrcpy("</span></table></table>\n"); } bool includeHeader = st->m_includeHeader; // do not show header for json object display if ( xd->m_contentType == CT_JSON ) includeHeader = false; if ( format == FORMAT_XML ) includeHeader = false; if ( format == FORMAT_JSON ) includeHeader = false; //mfree(uq, uqCapacity, "PageGet"); // undo the header writes if we should if ( ! includeHeader ) { // including base href is off by default when not including // the header, so the caller must explicitly turn it back on if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2; else sb->m_length=startLen1;//p=start1; } //sb->safeStrcpy(tbuf); if ( format == FORMAT_XML ) { sb->safePrintf("<response>\n"); sb->safePrintf("<statusCode>0</statusCode>\n"); sb->safePrintf("<statusMsg>Success</statusMsg>\n"); sb->safePrintf("<url><![CDATA["); sb->cdataEncode(xd->m_firstUrl.m_url); sb->safePrintf("]]></url>\n"); sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId); sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n", lastSpiderDate); sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf); } if ( format == FORMAT_JSON ) { sb->safePrintf("{\"response\":{\n"); sb->safePrintf("\t\"statusCode\":0,\n"); sb->safePrintf("\t\"statusMsg\":\"Success\",\n"); sb->safePrintf("\t\"url\":\""); sb->jsonEncode(xd->m_firstUrl.m_url); sb->safePrintf("\",\n"); sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId); sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate); sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf); } // identify start of <title> tag we wrote out char *sbstart = sb->getBufStart(); char *sbend = sb->getBufEnd(); char *titleStart = NULL; char *titleEnd = NULL; for ( char *t = sbstart ; t < sbend ; t++ ) { // title tag? if ( t[0]!='<' ) continue; if ( to_lower_a(t[1])!='t' ) continue; if ( to_lower_a(t[2])!='i' ) continue; if ( to_lower_a(t[3])!='t' ) continue; if ( to_lower_a(t[4])!='l' ) continue; if ( to_lower_a(t[5])!='e' ) continue; // point to it char *x = t + 5; // max - to keep things fast char *max = x + 500; for ( ; *x && *x != '>' && x < max ; x++ ); x++; // find end char *e = x; for ( ; *e && e < max ; e++ ) { if ( e[0]=='<' && to_lower_a(e[1])=='/' && to_lower_a(e[2])=='t' && to_lower_a(e[3])=='i' && to_lower_a(e[4])=='t' && to_lower_a(e[5])=='l' && to_lower_a(e[6])=='e' ) break; } if ( e < max ) { titleStart = x; titleEnd = e; } break; } // . print title at top! // . consider moving if ( titleStart ) { char *ebuf = st->m_r.getString("eb"); if ( ! ebuf ) ebuf = ""; //p += sprintf ( p , sb->safePrintf( "<table border=1 " "cellpadding=10 " "cellspacing=0 " "width=100%% " "color=#ffffff>" ); long printLinks = st->m_r.getLong("links",0); if ( ! printDisclaimer && printLinks ) sb->safePrintf(//p += sprintf ( p , // first put cached and live link "<tr>" "<td bgcolor=lightyellow>" // print cached link //"<center>" " " "<b>" "<a " "style=\"font-size:18px;font-weight:600;" "color:#000000;\" " "href=\"" "/get?" 
"c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">" "cached link</a>" " " "<a " "style=\"font-size:18px;font-weight:600;" "color:#000000;\" " "href=%s>live link</a>" "</b>" //"</center>" "</td>" "</tr>\n" ,st->m_coll ,st->m_docId ,ebuf ,thisUrl // st->ptr_ubuf ); if ( printLinks ) { sb->safePrintf(//p += sprintf ( p , "<tr><td bgcolor=pink>" "<span style=\"font-size:18px;" "font-weight:600;" "color:#000000;\">" " " "<b>PAGE TITLE:</b> " ); long tlen = titleEnd - titleStart; sb->safeMemcpy ( titleStart , tlen ); sb->safePrintf ( "</span></td></tr>" ); } sb->safePrintf( "</table><br>\n" ); } // is the content preformatted? bool pre = false; char ctype = (char)xd->m_contentType; if ( ctype == CT_TEXT ) pre = true ; // text/plain if ( ctype == CT_DOC ) pre = true ; // filtered msword if ( ctype == CT_PS ) pre = true ; // filtered postscript if ( format == FORMAT_XML ) pre = false; if ( format == FORMAT_JSON ) pre = false; // if it is content-type text, add a <pre> if ( pre ) {//p + 5 < bufEnd && pre ) { sb->safePrintf("<pre>"); //p += 5; } if ( st->m_strip == 1 ) contentLen = stripHtml( content, contentLen, (long)xd->m_version, st->m_strip ); // it returns -1 and sets g_errno on error, line OOM if ( contentLen == -1 ) { //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } Xml xml; Words ww; // if no highlighting, skip it bool queryHighlighting = st->m_queryHighlighting; if ( st->m_strip == 2 ) queryHighlighting = false; // do not do term highlighting if json if ( xd->m_contentType == CT_JSON ) queryHighlighting = false; SafeBuf tmp; SafeBuf *xb = sb; if ( format == FORMAT_XML ) xb = &tmp; if ( format == FORMAT_JSON ) xb = &tmp; if ( ! queryHighlighting ) { xb->safeMemcpy ( content , contentLen ); //p += contentLen ; } else { // get the content as xhtml (should be NULL terminated) //Words *ww = xd->getWords(); if ( ! xml.set ( content , contentLen , false , 0 , false , TITLEREC_CURRENT_VERSION , false , 0 , CT_HTML ) ) { // niceness is 0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } // sanity check //if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; } // how much space left in p? 
//avail = bufEnd - p; Matches m; m.setQuery ( &qq ); m.addMatches ( &ww ); hilen = hi.set ( xb , // p , avail , &ww , &m , false /*doStemming?*/ , st->m_clickAndScroll , thisUrl /*base url for click & scroll*/); //p += hilen; log(LOG_DEBUG, "query: Done highlighting cached page content"); } if ( format == FORMAT_XML ) { sb->safePrintf("\t<content><![CDATA["); sb->cdataEncode ( xb->getBufStart() ); sb->safePrintf("]]></content>\n"); sb->safePrintf("</response>\n"); } if ( format == FORMAT_JSON ) { sb->safePrintf("\t\"content\":\"\n"); sb->jsonEncode ( xb->getBufStart() ); sb->safePrintf("\"\n}\n}\n"); } // if it is content-type text, add a </pre> if ( pre ) { // p + 6 < bufEnd && pre ) { sb->safeMemcpy ( "</pre>" , 6 ); //p += 6; } // calculate bufLen //long bufLen = p - buf; long ct = xd->m_contentType; // now filter the entire buffer to escape out the xml tags // so it is displayed nice SafeBuf newbuf; if ( ct == CT_XML ) { // encode the xml tags into <tagname> sequences if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() , sb->getLength(), 0)){// niceness=0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } // free out buffer that we alloc'd before returning since this // should have copied it into another buffer //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); // reassign //buf = newbuf.getBufStart(); //bufLen = newbuf.length(); sb->stealBuf ( &newbuf ); } // now encapsulate it in html head/tail and send it off // sendErr: contentType = "text/html"; if ( strip == 2 ) contentType = "text/xml"; // xml is usually buggy and this throws browser off //if ( ctype == CT_XML ) contentType = "text/xml"; if ( xd->m_contentType == CT_JSON ) contentType = "application/json"; if ( format == FORMAT_XML ) contentType = "text/xml"; if ( format == FORMAT_JSON ) contentType = "application/json"; // safebuf, sb, is a member of "st" so this should copy the buffer // when it constructs the http reply, and we gotta call delete(st) // AFTER this so sb is still valid. bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen, sb->getBufStart(), sb->getLength(), -1,false, contentType, -1, NULL, "utf8" ); // nuke state2 mdelete ( st , sizeof(State2) , "PageGet1" ); delete (st); // free out buffer that we alloc'd before returning since this // should have copied it into another buffer //if ( ct == CT_XML ) newbuf.purge(); //else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); // and convey the status return status; }
bool Msg7::inject ( char *url , long forcedIp , char *content , long contentLen , bool recycleContent, uint8_t contentType, char *coll , bool quickReply , char *username , char *pwd , long niceness, void *state , void (*callback)(void *state), long firstIndexed, long lastSpidered, long hopCount, char newOnly, short charset, char spiderLinks, char deleteIt, char hasMime, bool doConsistencyTesting ) { m_quickReply = quickReply; // store coll if ( ! coll ) { g_errno = ENOCOLLREC; return true; } long collLen = gbstrlen ( coll ); if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN; strncpy ( m_coll , coll , collLen ); m_coll [ collLen ] = '\0'; // store user //long ulen = 0; //if ( username ) ulen = gbstrlen(username); //if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;} //if ( username ) strcpy( m_username, username ); // store password //long pwdLen = 0; //if ( pwd ) pwdLen = gbstrlen(pwd); //m_pwd [ 0 ] ='\0'; //if ( pwdLen > 31 ) pwdLen = 31; //if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen ); //m_pwd [ pwdLen ] = '\0'; // store url if ( ! url ) { g_errno = 0; return true; } long urlLen = gbstrlen(url); if ( urlLen > MAX_URL_LEN ) {g_errno = EBADENGINEER; return true; } // skip injecting if no url given! just print the admin page. if ( urlLen <= 0 ) return true; //strcpy ( m_url , url ); if ( g_repairMode ) { g_errno = EREPAIRING; return true; } // send template reply if no content supplied if ( ! content && ! recycleContent ) { log("inject: no content supplied to inject command and " "recycleContent is false."); //return true; } // clean url? // normalize and add www. if it needs it Url uu; uu.set ( url , gbstrlen(url) , true ); // remove >'s i guess and store in st1->m_url[] buffer char cleanUrl[MAX_URL_LEN+1]; urlLen = cleanInput ( cleanUrl, MAX_URL_LEN, uu.getUrl(), uu.getUrlLen() ); // this can go on the stack since set4() copies it SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url, cleanUrl ); // parentdocid of 0 long firstIp = hash32n(cleanUrl); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; sreq.setKey( firstIp,0LL, false ); sreq.m_isInjecting = 1; sreq.m_isPageInject = 1; sreq.m_hopCount = hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; // shortcut XmlDoc *xd = &m_xd; // log it now //log("inject: injecting doc %s",cleanUrl); static char s_dummy[3]; // sometims the content is indeed NULL... if ( newOnly && ! content ) { // don't let it be NULL because then xmldoc will // try to download the page! s_dummy[0] = '\0'; content = s_dummy; //char *xx=NULL;*xx=0; } } // . use the enormous power of our new XmlDoc class // . this returns false with g_errno set on error if ( //m_needsSet && ! xd->set4 ( &sreq , NULL , m_coll , NULL , // pbuf // give it a niceness of 1, we have to be // careful since we are a niceness of 0!!!! niceness, // 1 , // inject this content content , deleteIt, // false, // deleteFromIndex , forcedIp , contentType , lastSpidered , hasMime )) { // g_errno should be set if that returned false if ( ! g_errno ) { char *xx=NULL;*xx=0; } return true; } // do not re-call the set //m_needsSet = false; // make this our callback in case something blocks xd->setCallback ( state , callback ); xd->m_doConsistencyTesting = doConsistencyTesting; // . set xd from the old title rec if recycle is true // . 
can also use XmlDoc::m_loadFromOldTitleRec flag if ( recycleContent ) xd->m_recycleContent = true; // othercrap if ( firstIndexed ) { xd->m_firstIndexedDate = firstIndexed; xd->m_firstIndexedDateValid = true; } if ( lastSpidered ) { xd->m_spideredTime = lastSpidered; xd->m_spideredTimeValid = true; } if ( hopCount != -1 ) { xd->m_hopCount = hopCount; xd->m_hopCountValid = true; } if ( charset != -1 && charset != csUnknown ) { xd->m_charset = charset; xd->m_charsetValid = true; } // avoid looking up ip of each outlink to add "firstip" tag to tagdb // because that can be slow!!!!!!! xd->m_spiderLinks = spiderLinks; xd->m_spiderLinks2 = spiderLinks; xd->m_spiderLinksValid = true; // . newOnly is true --> do not inject if document is already indexed! // . maybe just set indexCode xd->m_newOnly = newOnly; // do not re-lookup the robots.txt xd->m_isAllowed = true; xd->m_isAllowedValid = true; xd->m_crawlDelay = -1; // unknown xd->m_crawlDelayValid = true; // set this now g_inPageInject = true; // log it now //log("inject: indexing injected doc %s",cleanUrl); // . now tell it to index // . this returns false if blocked bool status = xd->indexDoc ( ); // log it. i guess only for errors when it does not block? // because xmldoc.cpp::indexDoc calls logIt() if ( status ) xd->logIt(); // undo it g_inPageInject = false; // note that it blocked //if ( ! status ) log("inject: blocked for %s",cleanUrl); // return false if it blocked return status; }
bool sendTurkPageReply ( State60 *st ) { XmlDoc *xd = &st->m_xd; //char *content = xd->ptr_utf8Content; //int32_t contentLen = xd->size_utf8Content - 1; // count the total number of EventDesc classes for all evids //char *evd = xd->ptr_eventData; //EventDisplay *ed = (EventDisplay *)evd; //char *addr = evd + (int32_t)ed->m_addr; //char timeZoneOffset = getTimeZoneFromAddr ( addr ); // in case getSections() block come right back in xd->setCallback ( st , xdcallback ); // . set niceness to 1 so all this processing doesn't slow queries down // . however, g_niceness should still be zero... hmmm... xd->m_niceness = 1; // default to 1 niceness st->m_niceness = 1; // now set the sections class Sections *ss = xd->getSections(); // now for each section with alnum text, telescope up as far as // possible without containing anymore alnum text than what it // contained. set SEC_CONTROL bit. such sections will have the // 2 green/blue dots, that are used for turning on/off title/desc. // but really the indians will only turn off sections that should // not have a title/desc. for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) { // breathe QUICKPOLL(st->m_niceness); // skip if does not have text if ( si->m_firstWordPos < 0 ) continue; // otherwise, find biggest parent that contains just that text Section *p = si->m_parent; Section *last = si; for ( ; p ; p = p->m_parent ) { if ( p->m_firstWordPos != si->m_firstWordPos ) break; if ( p->m_lastWordPos != si->m_lastWordPos ) break; last = p; } // set that bit then last->m_flags |= SEC_CONTROL; // and speed up the loop si = last; } // * now each SEC_CONTROL sections have a fence activated by a turker // * an event title or description can not span a fence. it must be // confined within a fence. however, it is allowed to include // title or description from a "title section". // * hold shift down to designate as title section when clicking it // * show the raw text of each event changing as you fence // sections in or out. show in a right frame. // * show list of events on page in the top frame. can toggle them // all individually. // * and remove no-display from all tags so we can see everything. // * highlight addresses, not just dates. // * each section hash has its own unique bg color when activated // * with a single click, completely reject an event because: // contains bad time, address, title or desc. specify which so // we can improve our algo. // * when selecting an individual event, scroll to its tod... // * remove all color from webpage that we can so our colors show up // * remove all imgs. just src them to dev null. // * allow for entering a custom title for an event or all events // that are or will ever appear on the page. // * when displaying the text of the events, use hyphens to // delineate the section topology. strike out text as a section // fence is activated. // * when a section is activated is it easier to just redownload // the whole text of the page? maybe just the text frame? // * clicking on an individual sentence section should just remove // that sentence. that is kinda a special content hash removal // tag. like "Click here for video." // * when an event id is selected i guess activate its bgcolor to // be light blue for all sentences currently in the event that // are not in activated sections. (make exception for designated // title sections). so we need multiple tags for each events // sentence div section. if sentence is split use multiple div tags // then to keep the order. 
so each event sentence would have // <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and // 10. that way we can activate it when one of those event ids is // activated. SafeBuf sb; // int16_tcuts if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; } Words *words = &xd->m_words; int32_t nw = words->getNumWords(); char **wptrs = words->getWords(); int32_t *wlens = words->getWordLens(); nodeid_t *tids = words->getTagIds(); // a special array for printing </div> tags char *endCounts = (char *)mcalloc ( nw ,"endcounts"); if ( ! endCounts ) return sendErrorReply ( st , g_errno ); // // now loop over all the words. if word starts a section that has // SEC_CONTROL bit set, and print out the section hash and a color // tag to be activated if the turkey activates us. // CAUTION: word may start multiple sections. // for ( int32_t i = 0 ; i < nw ; i++ ) { // get section ptr Section *sj = ss->m_sectionPtrs[i]; // sanity check. sj must be first section ptr that starts @ a if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) { char *xx=NULL;*xx=0; } // . does word #i start a section? // . if section is control, print out the control while ( sj && sj->m_a == i ) { // print this section's hash if ( sj->m_flags & SEC_CONTROL) { // after the turkeys have made all the edits // they need to submit the changes they made. // how can we get that data sent back to the // back end? we need to send back the colors // of the sections that have been activated // i guess. just do a loop over them. sb.safePrintf("<div nobreak gbsecid=%"UINT32" " "bgcolor=#%"XINT32" " "onclick=gbtogglecolor()>", (uint32_t)sj->m_tagHash, (uint32_t)sj->m_tagHash); // sanity check if ( sj->m_b < 0 ) { char *xx=NULL;*xx=0; } if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; } // and inc the /div count for that word endCounts[sj->m_b-1]++; } // try next section too sj = sj->m_next; } // if this is a tag, remove any coloring if ( tids[i] ) { } // print the word, be it a tag, alnum, punct sb.safeMemcpy ( wptrs[i] , wlens[i] ); // end a div tag? if ( ! endCounts[i] ) continue; // might be many so loop it for ( int32_t j = 0 ; j < endCounts[i] ; j++ ) sb.safePrintf("</div>"); } return false; }
// . this is called
// . destroys the UdpSlot if false is returned
void handleRequest20 ( UdpSlot *slot , long netnice ) {
    // . check g_errno
    // . before, we were not sending a reply back here and we continued
    //   to process the request, even though it was empty. the slot
    //   had a NULL m_readBuf because it could not alloc mem for the read
    //   buf i'm assuming. and the slot was saved in a line below here...
    //   state20->m_msg22.m_parent = slot;
    if ( g_errno ) {
        log("net: Msg20 handler got error: %s.",mstrerror(g_errno));
        g_udpServer.sendErrorReply ( slot , g_errno );
        return;
    }
    // ensure request is big enough
    if ( slot->m_readBufSize < (long)sizeof(Msg20Request) ) {
        g_udpServer.sendErrorReply ( slot , EBADREQUESTSIZE );
        return;
    }
    // parse the request
    Msg20Request *req = (Msg20Request *)slot->m_readBuf;
    // . turn the string offsets into ptrs in the request
    // . this is "destructive" on "request"
    long nb = req->deserialize();
    // sanity check
    if ( nb != slot->m_readBufSize ) { char *xx = NULL; *xx = 0; }
    // sanity check, the size includes the \0
    if ( req->size_coll <= 1 || *req->ptr_coll == '\0' ) {
        log("query: Got empty collection in msg20 handler. FIX!");
        char *xx =NULL; *xx = 0;
    }
    // if it's not stored locally that's an error
    if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {
        log("query: Got msg20 request for non-local docId %lli",
            req->m_docId);
        g_udpServer.sendErrorReply ( slot , ENOTLOCAL );
        return;
    }
    // sanity
    if ( req->m_docId == 0 && ! req->ptr_ubuf ) { char *xx=NULL;*xx=0; }

    long long startTime = gettimeofdayInMilliseconds();

    // alloc a new state to get the titlerec
    XmlDoc *xd;
    try { xd = new (XmlDoc); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("query: msg20 new(%i): %s",
            (int)sizeof(XmlDoc), mstrerror(g_errno));
        g_udpServer.sendErrorReply ( slot, g_errno );
        return;
    }
    mnew ( xd , sizeof(XmlDoc) , "xd20" );
    // ok, let's use the new XmlDoc.cpp class now!
    xd->set20 ( req );
    // encode slot
    xd->m_slot = slot;
    // set the callback
    xd->setCallback ( xd , gotReplyWrapperxd );
    // set set time
    xd->m_setTime = startTime;
    xd->m_cpuSummaryStartTime = 0;
    // . now ask for the msg20 reply!
    // . TODO: move the parse state cache into just a cache of the
    //   XmlDoc itself, and put that cache logic into XmlDoc.cpp so
    //   it can be used more generally.
    Msg20Reply *reply = xd->getMsg20Reply ( );
    // this is just blocked
    if ( reply == (void *)-1 ) return;
    // got it?
    gotReplyWrapperxd ( xd );
}
void gotDatedbList ( State60 *st ) {
    // must only be run on host #0 since we need just one lock table
    if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
    // load turk lock table if we need to
    static bool s_init = false;
    if ( ! s_init ) {
        s_init = true;
        if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
            log("turk: failed to init turk lock table");
        if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
            log("turk: failed to load turk lock table");
    }
    time_t now = getTimeGlobal();
    // shortcut
    RdbList *list = &st->m_list;
    // the best docid
    int64_t best = 0LL;
    // scan the list to get urls/docids to turk out
    for ( ; ! list->isExhausted() ; ) {
        // get rec
        char *k = list->getCurrentKey();
        // skip that
        list->skipCurrentRecord();
        // skip if negative
        if ( (k[0] & 0x01) == 0x00 ) continue;
        // get the docid
        int64_t docid = g_datedb.getDocId ( k );
        // skip if locked
        TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
        // if there check time
        if ( tt && now - tt->m_lockTime > 3600 ) {
            // remove it
            g_turkLocks.removeKey(&docid);
            // nuke tt
            tt = NULL;
        }
        // if still there, skip it and try next one
        if ( tt ) continue;
        // ok, we got a good docid to dish out
        best = docid;
        break;
    }

    SafeBuf sb;
    // print description so they can click a button to start the turk
    sb.safePrintf("<html>\n"
                  "<title>Event Editor</title>\n"
                  "<body>\n"
                  "<table width=\"100%%\" border=\"0\">\n"
                  "<tr><td style=\"background-color:#0079ba;\">\n"
                  "<center><font color=#00000>"
                  "<h2>Event Editor</h2>\n"
                  "</font></center></td>"
                  "</tr></table>");
    // if we had no docid, give user an empty msg
    if ( ! best ) {
        sb.safePrintf("<center>Nothing currently available to edit. "
                      "Please try again later.</center>"
                      "</body></html>\n");
        sendReply ( &sb );
        return;
    }
    // lock it!
    TurkLock tt;
    strcpy ( tt.m_user , st->m_user );
    tt.m_lockTime = now;
    if ( ! g_lockTable.addLock ( &tt ) ) {
        sendErrorReply ( st , g_errno );
        return;
    }
    // . fetch the TitleRec
    // . a max cache age of 0 means not to read from the cache
    XmlDoc *xd = &st->m_xd;
    // . when getTitleRec() is called it will load the old one
    //   since XmlDoc::m_setFromTitleRec will be true
    // . niceness is 0
    xd->set3 ( best , st->m_coll , 0 );
    // if it blocks while it loads title rec, it will re-call this routine
    xd->setCallback ( st , processLoopWrapper );
    // good to go!
    processLoop ( st );
}
// . returns false if blocked and callback will be called, true otherwise
// . sets g_errno on error
bool Msg7::inject ( void *state ,
                    void (*callback)(void *state)
                    //long spiderLinksDefault ,
                    //char *collOverride
                    ) {

    GigablastRequest *gr = &m_gr;

    char *coll2 = gr->m_coll;
    CollectionRec *cr = g_collectiondb.getRec ( coll2 );
    if ( ! cr ) { g_errno = ENOCOLLREC; return true; }

    m_state = state;
    m_callback = callback;

    // shortcut
    XmlDoc *xd = &m_xd;

    if ( ! gr->m_url ) {
        log("inject: no url provided to inject");
        g_errno = EBADURL;
        return true;
    }

    //char *coll = cr->m_coll;

    // test
    //diffbotReply = "{\"request\":{\"pageUrl\":\"http://www.washingtonpost.com"
    //    "/2011/03/10/ABe7RaQ_moreresults.html\",\"api\":\"article\","
    //    "\"version\":3},\"objects\":[...]}";

    if ( g_repairMode ) { g_errno = EREPAIRING; return true; }

    // this will be NULL if the "content" was empty or not given
    char *content = gr->m_content;

    // . try the uploaded file if nothing in the text area
    // . this will be NULL if the "content" was empty or not given
    if ( ! content ) content = gr->m_contentFile;

    if ( m_firstTime ) {
        m_firstTime = false;
        m_start = content;
    }

    // save current start since we update it next
    char *start = m_start;

    // if this is empty we are done
    //if ( ! start ) return true;

    char *delim = gr->m_contentDelim;
    if ( delim && ! delim[0] ) delim = NULL;

    if ( m_fixMe ) {
        // we had made the first delimiter char a \0 to index the
        // previous document, now put it back to what it was
        *m_start = *delim;
        // i guess unset this
        m_fixMe = false;
    }

    // if we had a delimiter...
    if ( delim ) {
        // we've saved m_start as "start" above, so find the next
        // delimiter after it and set that to m_start.
        // add +1 to avoid an infinite loop
        m_start = strstr(start+1,delim);
        // for injecting "start" set this to \0
        if ( m_start ) {
            // null term it
            *m_start = '\0';
            // put back the original char on the next round...?
            m_fixMe = true;
        }
    }

    // this is the url of the injected content
    m_injectUrlBuf.safeStrcpy ( gr->m_url );

    bool modifiedUrl = false;

    // if we had a delimiter we must make a fake url
    //if ( delim ) {
    //    // if user had a <url> or <doc> or <docid> field use that
    //    char *hint = strcasestr ( start , "<url>" );
    //    if ( hint ) {
    //        modifiedUrl = true;
    //        ...
    //    }
    //}

    // if we had a delimiter, thus denoting multiple items/documents to
    // be injected, we must create a unique url for each item.
    if ( delim && ! modifiedUrl ) {
        // use hash of the content
        long long ch64 = hash64n ( start , 0LL );
        // normalize it
        Url u;
        u.set ( gr->m_url );
        // reset it
        m_injectUrlBuf.reset();
        // by default append a -<ch64> to the provided url
        m_injectUrlBuf.safePrintf("%s-%llu",u.getUrl(),ch64);
    }

    // count them
    m_injectCount++;

    m_inUse = true;

    if ( ! xd->injectDoc ( m_injectUrlBuf.getBufStart() ,
                           cr ,
                           start , // content
                           gr->m_diffbotReply,
                           gr->m_hasMime, // content starts with http mime?
                           gr->m_hopCount,
                           gr->m_charset,
                           gr->m_deleteUrl,
                           gr->m_contentTypeStr, // text/html text/xml
                           gr->m_spiderLinks ,
                           gr->m_newOnly, // index iff new
                           this ,
                           doneInjectingWrapper9 ) )
        // we blocked...
        return false;

    m_inUse = false;

    return true;
}
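// --- illustrative sketch (not part of the original source) ---
// When a delimiter marks multiple documents inside one injected buffer,
// each chunk needs its own URL, so the code above appends a 64-bit hash
// of the chunk to the base URL. A rough sketch of that idea; splitChunks()
// and the FNV-style hash are illustrative stand-ins, not the codebase's
// hash64n().
#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

static uint64_t hashChunk ( const std::string &s ) {
    uint64_t h = 1469598103934665603ULL;          // FNV-1a offset basis
    for ( unsigned char c : s ) { h ^= c; h *= 1099511628211ULL; }
    return h;
}

// split "buf" on "delim" and give every chunk a unique synthetic URL
static std::vector<std::pair<std::string,std::string> >
splitChunks ( const std::string &buf ,
              const std::string &delim ,
              const std::string &baseUrl ) {
    std::vector<std::pair<std::string,std::string> > out;
    size_t pos = 0;
    while ( pos < buf.size() ) {
        size_t next = buf.find ( delim , pos + 1 );
        if ( next == std::string::npos ) next = buf.size();
        std::string chunk = buf.substr ( pos , next - pos );
        char url[1024];
        snprintf ( url , sizeof(url) , "%s-%llu" ,
                   baseUrl.c_str() ,
                   (unsigned long long)hashChunk(chunk) );
        out.push_back ( std::make_pair ( std::string(url) , chunk ) );
        pos = next;
    }
    return out;
}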
// for procog bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) { // make a state State8 *st; try { st = new (State8); } catch ( ... ) { g_errno = ENOMEM; log("PageParser: new(%i): %s", (int)sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));} mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; st->m_state = NULL; //st->m_callback = callback; //st->m_q = q; //st->m_termFreqs = termFreqs; //st->m_termFreqWeights = termFreqWeights; //st->m_affWeights = affWeights; //st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( ! coll ) coll = g_conf.m_defaultColl; if ( ! coll ) coll = "main"; long collLen = gbstrlen(coll); if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given st->m_docId = r->getLongLong ("d",-1); st->m_docId = r->getLongLong ("docid",st->m_docId); long ulen; char *u = st->m_r.getString("u",&ulen,NULL); if ( ! u ) u = st->m_r.getString("url",&ulen,NULL); if ( ! u && st->m_docId == -1LL ) return sendErrorReply ( st , EBADREQUEST ); // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = u; st->m_ulen = 0; if ( u ) st->m_ulen = gbstrlen(u); // should we recycle link info? st->m_recycle = r->getLong("recycle",1); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); st->m_recompute = r->getLong("recompute" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); //st->m_page = r->getLong("page",1); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value //st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header //xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " // "content=\"text/html; charset=utf-8\">\n"); XmlDoc *xd = &st->m_xd; long isXml = r->getLong("xml",0); // if got docid, use that if ( st->m_docId != -1 ) { if ( ! 
xd->set3 ( st->m_docId, st->m_coll, 0 ) ) // niceness // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , gotXmlDoc ); xd->m_pbuf = &st->m_wbuf; // reset this flag st->m_donePrinting = false; // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag //if ( st->m_recycle ) xd->m_recycleContent = true; xd->m_recycleContent = true; // force this on //xd->m_useSiteLinkBuf = true; //xd->m_usePageLinkBuf = true; if ( isXml ) xd->m_printInXml = true; // now tell it to fetch the old title rec if ( ! xd->loadFromOldTitleRec () ) // return false if this blocks return false; return gotXmlDoc ( st ); } // set this up SpiderRequest sreq; sreq.reset(); if ( st->m_u ) strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; //uint8_t contentType = CT_HTML; //if ( isXml ) contentType = CT_XML; long ctype = r->getLong("ctype",CT_HTML); // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , // we need this so the term table is set! &st->m_wbuf , // XmlDoc::m_pbuf 0, // try 0 now! 1 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip ctype )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , gotXmlDoc ); // reset this flag st->m_donePrinting = false; // prevent a core here in the event we download the page content xd->m_crawlDelayValid = true; xd->m_crawlDelay = 0; // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag //if ( st->m_recycle ) xd->m_recycleContent = true; // only recycle if docid is given!! if ( st->m_recycle ) xd->m_recycleContent = true; // force this on //xd->m_useSiteLinkBuf = true; //xd->m_usePageLinkBuf = true; if ( isXml ) xd->m_printInXml = true; return gotXmlDoc ( st ); }
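// --- illustrative sketch (not part of the original source) ---
// sendPageAnalyze() above builds a SpiderRequest by hand and, lacking a
// real DNS lookup, derives a "fake first IP" from a 32-bit hash of the
// URL, nudging 0 and -1 to 1 so the value stays valid. A small sketch of
// that clamping rule; FakeRequest and hash32() are illustrative stand-ins
// for SpiderRequest and hash32n().
#include <cstdint>
#include <cstring>

struct FakeRequest {             // hypothetical stand-in for SpiderRequest
    char    m_url[1024];
    int32_t m_firstIp;
    bool    m_fakeFirstIp;
};

static int32_t hash32 ( const char *s ) {
    uint32_t h = 2166136261u;                    // FNV-1a, 32-bit
    for ( ; *s ; s++ ) { h ^= (unsigned char)*s; h *= 16777619u; }
    return (int32_t)h;
}

static void setFakeFirstIp ( FakeRequest *req , const char *url ) {
    strncpy ( req->m_url , url , sizeof(req->m_url)-1 );
    req->m_url[sizeof(req->m_url)-1] = '\0';
    int32_t firstIp = hash32 ( url );
    // 0 and -1 are reserved/invalid IP sentinels, so nudge them to 1
    if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
    req->m_firstIp     = firstIp;
    req->m_fakeFirstIp = true;
}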
// . a new interface so Msg3b can call this with "s" set to NULL // . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageParser2 ( TcpSocket *s , HttpRequest *r , State8 *st , long long docId , Query *q , // in query term space, not imap space long long *termFreqs , // in imap space float *termFreqWeights , // in imap space float *affWeights , void *state , void (* callback)(void *state) ) { //log("parser: read sock=%li",s->m_sd); // might a simple request to addsomething to validated.*.txt file // from XmlDoc::print() or XmlDoc::validateOutput() char *add = r->getString("add",NULL); //long long uh64 = r->getLongLong("uh64",0LL); char *uh64str = r->getString("uh64",NULL); //char *divTag = r->getString("div",NULL); if ( uh64str ) { // convert add to number long addNum = 0; if ( to_lower_a(add[0])=='t' ) // "true" or "false"? addNum = 1; // convert it. skip beginning "str" inserted to prevent // javascript from messing with the long long since it // was rounding it! //long long uh64 = atoll(uh64str);//+3); // urldecode that //long divTagLen = gbstrlen(divTag); //long newLen = urlDecode ( divTag , divTag , divTagLen ); // null term? //divTag[newLen] = '\0'; // do it. this is defined in XmlDoc.cpp //addCheckboxSpan ( uh64 , divTag , addNum ); // make basic reply char *reply; reply = "HTTP/1.0 200 OK\r\n" "Connection: Close\r\n"; // that is it! send a basic reply ok bool status = g_httpServer.sendDynamicPage( s , reply, gbstrlen(reply), -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); return status; } // make a state if ( st ) st->m_freeIt = false; if ( ! st ) { try { st = new (State8); } catch ( ... ) { g_errno = ENOMEM; log("PageParser: new(%i): %s", (int)sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));} mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; } // msg3b uses this to get a score from the query st->m_state = state; st->m_callback = callback; st->m_q = q; st->m_termFreqs = termFreqs; st->m_termFreqWeights = termFreqWeights; st->m_affWeights = affWeights; //st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; st->m_recompute = false; //st->m_url.reset(); // do not allow more than one to be launched at a time if in // a quickpoll. will cause quickpoll in quickpoll. g_inPageParser = true; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( st->m_collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); if ( ! coll ) return sendErrorReply ( st , ENOCOLLREC ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); //long ulen = 0; //char *u = r->getString ( "u" , &ulen , NULL /*default*/); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given if ( ! st->m_u || ! 
st->m_u[0] ) st->m_docId = r->getLongLong ("docid",-1); else st->m_docId = -1; // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = st->m_r.getString("u",&st->m_ulen,NULL); // should we recycle link info? st->m_recycle = r->getLong("recycle",0); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header SafeBuf *xbuf = &st->m_xbuf; xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">\n"); // print standard header g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r ); // print the standard header for admin pages char *dd = ""; char *rr = ""; char *rr2 = ""; char *render = ""; char *oips = ""; char *us = ""; if ( st->m_u && st->m_u[0] ) us = st->m_u; //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn ); if ( st->m_old ) dd = " checked"; if ( st->m_recycle ) rr = " checked"; if ( st->m_recycle2 ) rr2 = " checked"; if ( st->m_render ) render = " checked"; if ( st->m_oips ) oips = " checked"; xbuf->safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); long clen; char *contentParm = r->getString("content",&clen,""); // print the input form xbuf->safePrintf ( "<style>\n" "h2{font-size: 12px; color: #666666;}\n" ".gbtag { border: 1px solid gray;" "background: #ffffef;display:inline;}\n" ".gbcomment { border: 1px solid gray;" "color: #888888; font-style:italic; " "background: #ffffef;display:inline;}\n" ".token { border: 1px solid gray;" "background: #f0ffff;display:inline;}\n" ".spam { border: 1px solid gray;" "background: #af0000;" "color: #ffffa0;}" ".hs {color: #009900;}" "</style>\n" "<center>" "<table %s>" "<tr><td colspan=5><center><b>" "Parser" "</b></center></td></tr>\n" "<tr class=poo>" "<td>" "<b>url</b>" "<br><font size=-2>" "Type in <b>FULL</b> url to parse." "</font>" "</td>" "</td>" "<td>" "<input type=text name=u value=\"%s\" size=\"40\">\n" "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Parser version to use: " "</td>" "<td>" "<input type=text name=\"version\" size=\"4\" value=\"-1\"> " "</td>" "<td>" "(-1 means to use latest title rec version)<br>" "</td>" "</tr>" */ /* "<tr class=poo>" "<td>" "Hop count to use: " "</td>" "<td>" "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> " "</td>" "<td>" "(-1 is unknown. For root urls hopcount is always 0)<br>" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>use cached</b>" "<br><font size=-2>" "Load page from cache (titledb)?" 
"</font>" "</td>" "<td>" "<input type=checkbox name=old value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Reparse root:" "</td>" "<td>" "<input type=checkbox name=artr value=1%s> " "</td>" "<td>" "Apply selected ruleset to root to update quality" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>recycle link info</b>" "<br><font size=-2>" "Recycle the link info from the title rec" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=recycle value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Recycle Link Info Imported:" "</td>" "<td>" "<input type=checkbox name=recycleimp value=1%s> " "</td>" "<td>" "Recycle the link info imported from other coll" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>render html</b>" "<br><font size=-2>" "Render document content as HTML" "</font>" "</td>" "<td>" "<input type=checkbox name=render value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Lookup outlinks' ruleset, ips, quality:" "</td>" "<td>" "<input type=checkbox name=oips value=1%s> " "</td>" "<td>" "To compute quality lookup IP addresses of roots " "of outlinks." "</td>" "</tr>" "<tr class=poo>" "<td>" "LinkInfo Coll:" "</td>" "<td>" "<input type=text name=\"oli\" size=\"10\" value=\"\"> " "</td>" "<td>" "Leave empty usually. Uses this coll to lookup link info." "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>optional query</b>" "<br><font size=-2>" "Leave empty usually. For title generation only." "</font>" "</td>" "<td>" "<input type=text name=\"q\" size=\"20\" value=\"\"> " "</td>" "</tr>", TABLE_STYLE, us , dd, rr, render ); xbuf->safePrintf( "<tr class=poo>" "<td>" "<b>content type below is</b>" "<br><font size=-2>" "Is the content below HTML? XML? JSON?" "</font>" "</td>" "<td>" //"<input type=checkbox name=xml value=1> " "<select name=ctype>\n" "<option value=%li selected>HTML</option>\n" "<option value=%li selected>XML</option>\n" "<option value=%li selected>JSON</option>\n" "</select>\n" "</td>" "</tr>", (long)CT_HTML, (long)CT_XML, (long)CT_JSON ); xbuf->safePrintf( "<tr class=poo>" "<td><b>content</b>" "<br><font size=-2>" "Use this content for the provided <i>url</i> " "rather than downloading it from the web." "</td>" "<td>" "<textarea rows=10 cols=80 name=content>" "%s" "</textarea>" "</td>" "</tr>" "</table>" "</center>" "</form>" "<br>", //oips , contentParm ); xbuf->safePrintf( "<center>" "<input type=submit value=Submit>" "</center>" ); // just print the page if no url given if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st ); XmlDoc *xd = &st->m_xd; // set this up SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! 
content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; uint8_t contentType = CT_HTML; if ( r->getBool("xml",0) ) contentType = CT_XML; contentType = r->getLong("ctype",contentType);//CT_HTML); // if facebook, load xml content from title rec... bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/"); if ( isFacebook && ! content ) { long long docId = g_titledb.getProbableDocId(st->m_u); sprintf(sreq.m_url ,"%llu", docId ); sreq.m_isPageReindex = true; } // hack if ( content ) { st->m_dbuf.purge(); st->m_dbuf.safeStrcpy(content); //char *data = strstr(content,"\r\n\r\n"); //long dataPos = 0; //if ( data ) dataPos = (data + 4) - content; //st->m_dbuf.convertJSONtoXML(0,dataPos); //st->m_dbuf.decodeJSON(0); content = st->m_dbuf.getBufStart(); } // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , &st->m_wbuf , 0 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip contentType )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , processLoop ); // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag if ( st->m_recycle ) xd->m_recycleContent = true; return processLoop ( st ); }
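// --- illustrative sketch (not part of the original source) ---
// sendPageParser2() takes the test content either from the url-encoded
// "content" form field or, if that is absent, from the raw (unencoded)
// POST body. A minimal sketch of that fallback plus percent-decoding;
// the Request struct and urlDecode() helper are assumptions, not the
// HttpRequest API.
#include <cstdlib>
#include <string>

struct Request {                  // hypothetical stand-in for HttpRequest
    std::string contentField;     // url-encoded "content=" form value
    std::string rawBody;          // unencoded POST body
};

static std::string urlDecode ( const std::string &in ) {
    std::string out;
    for ( size_t i = 0 ; i < in.size() ; i++ ) {
        if ( in[i] == '+' ) { out += ' '; continue; }
        if ( in[i] == '%' && i + 2 < in.size() ) {
            char hex[3] = { in[i+1] , in[i+2] , 0 };
            out += (char)strtol ( hex , NULL , 16 );
            i += 2;
            continue;
        }
        out += in[i];
    }
    return out;
}

// prefer the form field; fall back to the raw body, which is not encoded
static std::string getContent ( const Request &r ) {
    if ( ! r.contentField.empty() ) return urlDecode ( r.contentField );
    return r.rawBody;
}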
void handleRequest7 ( UdpSlot *slot , int32_t netnice ) { InjectionRequest *ir = (InjectionRequest *)slot->m_readBuf; // now just supply the first guy's char ** and size ptr if ( ! deserializeMsg2 ( &ir->ptr_url, &ir->size_url ) ) { log("inject: error deserializing inject request from " "host ip %s port %i",iptoa(slot->m_ip),(int)slot->m_port); g_errno = EBADREQUEST; g_udpServer.sendErrorReply(slot,g_errno); //g_corruptCount++; return; } // the url can be like xyz.com. so need to do another corruption // test for ia if ( ! ir->ptr_url ) { // || strncmp(ir->ptr_url,"http",4) != 0 ) { //log("inject: trying to inject NULL or non http url."); log("inject: trying to inject NULL url."); g_errno = EBADURL; //g_corruptCount++; g_udpServer.sendErrorReply(slot,g_errno); return; } CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum ); if ( ! cr ) { log("inject: cr rec is null %i", ir->m_collnum); g_errno = ENOCOLLREC; g_udpServer.sendErrorReply(slot,g_errno); return; } XmlDoc *xd; try { xd = new (XmlDoc); } catch ( ... ) { g_errno = ENOMEM; log("PageInject: import failed: new(%i): %s", (int)sizeof(XmlDoc),mstrerror(g_errno)); g_udpServer.sendErrorReply(slot,g_errno); return; } mnew ( xd, sizeof(XmlDoc) , "PageInject" ); xd->m_injectionSlot = slot; xd->m_injectStartTime = gettimeofdayInMilliseconds(); // add to linked list xd->m_nextInject = NULL; xd->m_prevInject = NULL; if ( s_injectTail ) { s_injectTail->m_nextInject = xd; xd->m_prevInject = s_injectTail; s_injectTail = xd; } else { s_injectHead = xd; s_injectTail = xd; } if(ir->ptr_content && ir->ptr_content[ir->size_content - 1]) { // XmlDoc expects this buffer to be null terminated. char *xx=NULL;*xx=0; } if ( ! xd->injectDoc ( ir->ptr_url , // m_injectUrlBuf.getBufStart() , cr , ir->ptr_content , // start , // content , ir->ptr_diffbotReply, // if this doc is a 'container doc' then // hasMime applies to the SUBDOCS only!! ir->m_hasMime, // content starts with http mime? ir->m_hopCount, ir->m_charset, ir->m_deleteUrl, // warcs/arcs include the mime so we don't // look at this in that case in // XmlDoc::injectDoc() when it calls set4() ir->ptr_contentTypeStr, // text/html text/xml ir->m_spiderLinks , ir->m_newOnly, // index iff new xd, // state , sendUdpReply7 , // extra shit ir->m_firstIndexed, ir->m_lastSpidered , // the ip of the url being injected. // use 0 if unknown and it won't be valid. ir->m_injectDocIp , ir->ptr_contentDelim, ir->ptr_metadata, ir->size_metadata, ir->size_content - 1 // there should be a null in that last byte ) ) // we blocked... return; // if injected without blocking, send back reply sendUdpReply7 ( xd ); }
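// --- illustrative sketch (not part of the original source) ---
// handleRequest7() strings every in-flight injection XmlDoc onto an
// intrusive doubly-linked list via m_nextInject/m_prevInject and the
// s_injectHead/s_injectTail globals. A compact sketch of that append and
// unlink logic; the Node type here is an illustrative stand-in for XmlDoc.
struct Node {                        // hypothetical stand-in for XmlDoc
    Node *m_nextInject;
    Node *m_prevInject;
};

static Node *s_head = 0;
static Node *s_tail = 0;

static void appendInject ( Node *n ) {
    n->m_nextInject = 0;
    n->m_prevInject = 0;
    if ( s_tail ) {                  // non-empty: link after the tail
        s_tail->m_nextInject = n;
        n->m_prevInject = s_tail;
        s_tail = n;
    }
    else {                           // empty list: n is head and tail
        s_head = n;
        s_tail = n;
    }
}

static void removeInject ( Node *n ) {
    if ( n->m_prevInject ) n->m_prevInject->m_nextInject = n->m_nextInject;
    else                   s_head = n->m_nextInject;
    if ( n->m_nextInject ) n->m_nextInject->m_prevInject = n->m_prevInject;
    else                   s_tail = n->m_prevInject;
    n->m_nextInject = 0;
    n->m_prevInject = 0;
}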
bool processLoop ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; if ( st->m_u && st->m_u[0] ) { // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp CollectionRec *cr = xd->getCollRec(); if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123")) // use same dir that XmlDoc::getTestDir() would use //saveTestBuf ( "test-page-parser" ); saveTestBuf("qa"); // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf char *metalist = xd->getMetaList ( ); if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked if ( metalist == (void *)-1 ) return false; // for debug... if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // print it out xd->printDoc( xbuf ); } // print reason we can't analyze it (or index it) //if ( st->m_indexCode != 0 ) { // xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>", // mstrerror(st->m_indexCode)); //} // we are done g_inPageParser = false; // print the final tail //p += g_httpServer.printTail ( p , pend - p ); //log("parser: send sock=%li",st->m_s->m_sd); // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
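// --- illustrative sketch (not part of the original source) ---
// processLoop() relies on the tri-state return convention used throughout
// this code: getMetaList() yields NULL on error (with g_errno set),
// (char *)-1 when the operation blocked and a callback will fire later,
// or a real pointer when the result is ready. A sketch of reacting to
// that convention; fetchMetaList(), handleError() and g_err are
// hypothetical names, not the real API.
#include <cstdio>

static int g_err = 0;                       // stand-in for g_errno

static char *fetchMetaList ( bool ready , bool fail ) {
    if ( fail    ) { g_err = 1; return 0; }  // error: NULL + errno set
    if ( ! ready ) return (char *)-1;        // would block: sentinel -1
    static char list[] = "metalist";         // done: real pointer
    return list;
}

static void handleError ( int err ) { printf("error %d\n", err); }

// returns false if we blocked and will be re-entered via callback
static bool step ( bool ready , bool fail ) {
    char *metalist = fetchMetaList ( ready , fail );
    if ( ! metalist ) { handleError ( g_err ); return true; }
    if ( metalist == (char *)-1 ) return false;   // blocked
    printf("got %s\n", metalist);
    return true;
}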
bool sendReply ( void *state ) { // get the state properly Msg7 *msg7= (Msg7 *) state; GigablastRequest *gr = &msg7->m_gr; // extract info from state TcpSocket *sock = gr->m_socket; XmlDoc *xd = &msg7->m_xd; // log it //if ( msg7->m_url[0] ) xd->logIt(); // msg7 has the docid for what we injected, iff g_errno is not set //long long docId = msg7->m_msg7.m_docId; //long hostId = msg7->m_msg7.m_hostId; long long docId = xd->m_docId; long hostId = 0;//msg7->m_msg7.m_hostId; // set g_errno to index code if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno ) g_errno = xd->m_indexCode; char format = gr->m_hr.getReplyFormat(); // no url parm? if ( ! g_errno && ! gr->m_url && format != FORMAT_HTML ) g_errno = EMISSINGINPUT; if ( g_errno && g_errno != EDOCUNCHANGED ) { long save = g_errno; mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); g_errno = save; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,save,msg,NULL); } char abuf[320]; SafeBuf am(abuf,320,0,false); am.setLabel("injbuf"); char *ct = NULL; // a success reply, include docid and url i guess if ( format == FORMAT_XML ) { am.safePrintf("<response>\n"); am.safePrintf("\t<statusCode>%li</statusCode>\n", (long)g_errno); am.safePrintf("\t<statusMsg><![CDATA["); am.cdataEncode(mstrerror(g_errno)); am.safePrintf("]]></statusMsg>\n"); am.safePrintf("\t<docId>%lli</docId>\n",xd->m_docId); if ( gr->m_getSections ) { SafeBuf *secBuf = xd->getInlineSectionVotingBuf(); am.safePrintf("\t<htmlSrc><![CDATA["); if ( secBuf->length() ) am.cdataEncode(secBuf->getBufStart()); am.safePrintf("]]></htmlSrc>\n"); } am.safePrintf("</response>\n"); ct = "text/xml"; } if ( format == FORMAT_JSON ) { am.safePrintf("{\"response\":{\n"); am.safePrintf("\t\"statusCode\":%li,\n",(long)g_errno); am.safePrintf("\t\"statusMsg\":\""); am.jsonEncode(mstrerror(g_errno)); am.safePrintf("\",\n"); am.safePrintf("\t\"docId\":%lli,\n",xd->m_docId); if ( gr->m_getSections ) { SafeBuf *secBuf = xd->getInlineSectionVotingBuf(); am.safePrintf("\t\"htmlSrc\":\""); if ( secBuf->length() ) am.jsonEncode(secBuf->getBufStart()); am.safePrintf("\",\n"); } // subtract ",\n" am.m_length -= 2; am.safePrintf("\n}\n}\n"); ct = "application/json"; } if ( format == FORMAT_XML || format == FORMAT_JSON ) { mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage(sock, am.getBufStart(), am.length(), 0, false, ct ); } // // debug // /* // now get the meta list, in the process it will print out a // bunch of junk into msg7->m_pbuf if ( xd->m_docId ) { char *metalist = xd->getMetaList ( 1,1,1,1,1,1 ); if ( ! metalist || metalist==(void *)-1){char *xx=NULL;*xx=0;} // print it out SafeBuf *pbuf = &msg7->m_sbuf; xd->printDoc( pbuf ); bool status = g_httpServer.sendDynamicPage( msg7->m_socket , pbuf->getBufStart(), pbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now mdelete ( st , sizeof(Msg7) , "PageInject" ); delete (st); // return the status return status; } */ // // end debug // char *url = gr->m_url; // . if we're talking w/ a robot he doesn't care about this crap // . send him back the error code (0 means success) if ( url && gr->m_shortReply ) { char buf[1024*32]; char *p = buf; // return docid and hostid if ( ! 
g_errno ) p += sprintf ( p , "0,docId=%lli,hostId=%li," , docId , hostId ); // print error number here else p += sprintf ( p , "%li,0,0,", (long)g_errno ); // print error msg out, too or "Success" p += sprintf ( p , "%s", mstrerror(g_errno)); mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage ( sock,buf, gbstrlen(buf) , -1/*cachetime*/); } SafeBuf sb; // print admin bar g_pages.printAdminTop ( &sb, sock , &gr->m_hr ); // print a response msg if rendering the page after a submission if ( g_errno ) sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>" "</center>", mstrerror(g_errno) , g_errno); else if ( (gr->m_url&&gr->m_url[0]) || (gr->m_queryToScrape&&gr->m_queryToScrape[0]) ) sb.safePrintf ( "<center><b>Successfully injected %s</b>" "</center><br>" , xd->m_firstUrl.m_url ); // print the table of injection parms g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // clear g_errno, if any, so our reply send goes through g_errno = 0; // calculate buffer length //long bufLen = p - buf; // nuke state mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); // . send this page // . encapsulates in html header and tail // . make a Mime // . i thought we need -2 for cacheTime, but i guess not return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1/*cachetime*/); }
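// --- illustrative sketch (not part of the original source) ---
// sendReply() renders the same status/docId pair as either XML or JSON
// depending on the requested format, and in the JSON branch trims the
// trailing ",\n" before closing the object (as the am.m_length -= 2 line
// does above). A sketch of that dispatch using std::string; buildReply()
// and Format are illustrative names, and the message is not escaped here.
#include <cstdio>
#include <string>

enum Format { FMT_XML , FMT_JSON };

static std::string buildReply ( Format fmt , int status ,
                                const char *msg , long long docId ) {
    char buf[256];
    std::string out;
    if ( fmt == FMT_XML ) {
        snprintf ( buf , sizeof(buf) ,
                   "<response>\n"
                   "\t<statusCode>%d</statusCode>\n"
                   "\t<statusMsg><![CDATA[%s]]></statusMsg>\n"
                   "\t<docId>%lld</docId>\n"
                   "</response>\n" , status , msg , docId );
        return std::string ( buf );
    }
    out += "{\"response\":{\n";
    snprintf ( buf , sizeof(buf) , "\t\"statusCode\":%d,\n" , status );
    out += buf;
    snprintf ( buf , sizeof(buf) , "\t\"statusMsg\":\"%s\",\n" , msg );
    out += buf;
    snprintf ( buf , sizeof(buf) , "\t\"docId\":%lld,\n" , docId );
    out += buf;
    // drop the trailing ",\n" so the object stays valid JSON
    out.erase ( out.size() - 2 );
    out += "\n}\n}\n";
    return out;
}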
bool gotXmlDoc ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // if we loaded from old title rec, it should be there! // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp //if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123")) // // use same dir that XmlDoc::getTestDir() would use // saveTestBuf ( "test-page-parser" ); // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; bool printIt = false; if ( st->m_u && st->m_u[0] ) printIt = true; if ( st->m_docId != -1LL ) printIt = true; if ( st->m_donePrinting ) printIt = false; // do not re-call this if printDocForProCog blocked... (check length()) if ( printIt ) { // mark as done st->m_donePrinting = true; // always re-compute the page inlinks dynamically, do not // use the ptr_linkInfo1 stored in titlerec!! // NO! not if set from titlerec/docid if ( st->m_recompute ) xd->m_linkInfo1Valid = false; // try a recompute regardless, because we do not store the // bad inlinkers, and ppl want to see why they are bad! //xd->m_linkInfo1Valid = false; // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf //char *metalist = xd->getMetaList ( ); //if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked //if ( metalist == (void *)-1 ) return false; // for debug... //if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // . print it out // . returns false if blocks, true otherwise // . sets g_errno on error if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) ) return false; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); } long isXml = st->m_r.getLong("xml",0); char ctype2 = CT_HTML; if ( isXml ) ctype2 = CT_XML; // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? &ctype2, -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
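// --- illustrative sketch (not part of the original source) ---
// gotXmlDoc() can be re-entered after printDocForProCog() blocks, so it
// sets st->m_donePrinting before the call that may block to keep the
// output from being generated twice. A tiny sketch of that re-entry
// guard; PrintState and printOnce() are hypothetical names.
struct PrintState {
    bool m_donePrinting;
};

// pretend this returns false when it blocks and a callback re-enters us
static bool printDoc ( PrintState * ) { return true; }

static bool printOnce ( PrintState *st ) {
    // already printed on a previous entry? nothing to do
    if ( st->m_donePrinting ) return true;
    // mark done *before* the call that can block, so a re-entry
    // triggered by its callback does not print a second time
    st->m_donePrinting = true;
    return printDoc ( st );
}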
// . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotTitleRec ( void *state ) { // cast the State4 out State4 *st = (State4 *) state; // get the socket TcpSocket *s = st->m_socket; SafeBuf sb; // get it's docId long long docId = st->m_docId; // make the query string for passing to different hosts char qs[64]; sprintf(qs,"&d=%lli",docId); if ( docId==0LL ) qs[0] = 0; // print standard header sb.reserve2x ( 32768 ); g_pages.printAdminTop (&sb, st->m_socket, &st->m_r ); //PAGE_TITLEDB, // st->m_username,//NULL , // st->m_coll , st->m_pwd , s->m_ip , qs ); // shortcut XmlDoc *xd = &st->m_xd; // . deal with errors // . print none if non title rec at or after the provided docId if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) { // print docId in box sb.safePrintf ( "<center>\nEnter docId: " "<input type=text name=d value=%lli size=15>", docId); sb.safePrintf ( "</form><br>\n" ); if ( docId == 0 ) sb.safePrintf("<br>"); else if ( g_errno ) sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno)); else sb.safePrintf("<br><br>No titleRec for that docId " "or higher"); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "\n</center>" ); mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage ( s , sb.getBufStart(), sb.length() ); } // print docId in box sb.safePrintf ("<br>\n" "<center>Enter docId: " "<input type=text name=d value=%lli size=15>", docId ); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "</form><br>\n" ); //char *coll = st->m_coll; Title *ti = xd->getTitle(); if ( ! ti ) { log ( "admin: Could not set title" ); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // sanity check. should not block if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; } // print it out xd->printDoc ( &sb ); // don't forget to cleanup mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length()); }
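// --- illustrative sketch (not part of the original source) ---
// gotTitleRec() tells the admin which shard (and its twins) should hold a
// docId via getShardNumFromDocId() and g_hostdb.getShard(). A toy sketch
// of the general docId -> shard -> first-host mapping; the modulo scheme
// and the Host/HostDb structs here are assumptions for illustration, not
// Gigablast's actual shard layout.
#include <cstdint>
#include <vector>

struct Host { long m_hostId; };

struct HostDb {
    long numShards;
    std::vector<std::vector<Host> > shards;   // shards[i] = twins of shard i
    const Host *firstHostOfShard ( long shard ) const {
        if ( shard < 0 || shard >= numShards ) return 0;
        if ( shards[shard].empty() ) return 0;
        return &shards[shard][0];
    }
};

// assumption: docIds are spread over shards with a simple modulo
static long shardNumFromDocId ( int64_t docId , long numShards ) {
    if ( numShards <= 0 ) return 0;
    return (long)( (uint64_t)docId % (uint64_t)numShards );
}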