コード例 #1
0
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the titleRec of "docId" given via cgi
// . call g_httpServer.sendDynamicPage() to send it
bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) {
	// get the docId from the cgi vars
	long long docId = r->getLongLong ("d", 0LL );
	// set up a msg22 to get the next titleRec
	State4 *st ;
	try { st = new (State4); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageTitledb: new(%i): %s", 
		    (int)sizeof(State4),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
	mnew ( st , sizeof(State4) , "PageTitledb");
	// save the socket
	st->m_socket = s;
	// copy it
	st->m_r.copy ( r );
	// remember if http request is internal/local or not
	st->m_isRootAdmin = g_conf.isCollAdmin ( s , r );
	st->m_isLocal = r->isLocal();
	st->m_docId   = docId;
	// password, too
	st->m_pwd = r->getString ( "pwd" );
	// get the collection
	long  collLen = 0;
	char *coll    = st->m_r.getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll    = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	st->m_coll    = coll;
	st->m_collLen = collLen;

	// just print page if no docid provided
	if ( ! docId ) return gotTitleRec ( st );

	// get the handy XmlDoc
	XmlDoc *xd = &st->m_xd;
	// use 0 for niceness
	xd->set3 ( docId , coll , 0 );
	// callback
	xd->setCallback ( st , gotTitleRec );
	// . and tell it to load from old title rec
	// . this sets all the member vars from it and also sets
	//   m_titleRecBuf to contain the actual compressed title rec
	if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	// we got it without blocking. cached?
	return gotTitleRec ( st );
}
コード例 #2
0
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State60 *st = (State60 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . if it returns false it blocked and will call our callback
		//   processLoop() when it completes
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );


	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//int32_t   len  = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) return sendErrorReply(st,EBADENGINEER );
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) return sendErrorReply ( st , EBADENGINEER );
	
	// make it into an editable page now for the turk guy
	sendTurkPageReply ( st );
}
コード例 #3
0
// for procog
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {

	// make a state
	State8 *st;
	try { st = new (State8); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageParser: new(%i): %s", 
		    (int)sizeof(State8),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,
						   mstrerror(g_errno));}
	mnew ( st , sizeof(State8) , "PageParser" );
	st->m_freeIt = true;
	st->m_state           = NULL;
	//st->m_callback        = callback;
	//st->m_q               = q;
	//st->m_termFreqs       = termFreqs;
	//st->m_termFreqWeights = termFreqWeights;
	//st->m_affWeights      = affWeights;
	//st->m_total           = (score_t)-1;
	st->m_indexCode       = 0;
	st->m_blocked         = false;
	st->m_didRootDom      = false;
	st->m_didRootWWW      = false;
	st->m_wasRootDom      = false;
	st->m_u               = NULL;

	// password, too
	long pwdLen = 0;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen]='\0';

	// save socket ptr
	st->m_s = s;
	st->m_r.copy ( r );

	// get the collection
	char *coll    = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
	if ( ! coll ) coll = g_conf.m_defaultColl;
	if ( ! coll ) coll = "main";
	long collLen = gbstrlen(coll);
	if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS );
	strcpy ( st->m_coll , coll );

	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 ) 
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
	// default to 0 if not provided
	st->m_hopCount = r->getLong("hc",0);
	long  old     = r->getLong   ( "old", 0 );
	// set query
	long qlen;
	char *qs = r->getString("q",&qlen,NULL);
	if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );
	// url will override docid if given
	st->m_docId = r->getLongLong ("d",-1);
	st->m_docId = r->getLongLong ("docid",st->m_docId);

	long ulen;
	char *u = st->m_r.getString("u",&ulen,NULL);
	if ( ! u ) u = st->m_r.getString("url",&ulen,NULL);
	if ( ! u && st->m_docId == -1LL ) 
		return sendErrorReply ( st , EBADREQUEST );

	// set url in state class (may have length 0)
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u = u;
	st->m_ulen = 0;
	if ( u ) st->m_ulen = gbstrlen(u);
	// should we recycle link info?
	st->m_recycle  = r->getLong("recycle",1);
	st->m_recycle2 = r->getLong("recycleimp",0);
	st->m_render   = r->getLong("render" ,0);
	st->m_recompute = r->getLong("recompute" ,0);
	// for quality computation... takes way longer cuz we have to 
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that IP
	// range.
	st->m_oips     = r->getLong("oips"    ,0);
	//st->m_page = r->getLong("page",1);

	long  linkInfoLen  = 0;
	// default is NULL
	char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else st->m_linkInfoColl[0] = '\0';
	
	// set the flag in our SafeBuf class so that Words.cpp knows to show 
	// html or html source depending on this value
	//st->m_xbuf.m_renderHtml = st->m_render;

	// should we use the old title rec?
	st->m_old    = old;
	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();
 	//no more setting the default root quality to 30, instead if we do not
 	// know it setting it to -1
 	st->m_rootQuality=-1;

	// header
	//xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
	//		 "content=\"text/html; charset=utf-8\">\n");

	XmlDoc *xd = &st->m_xd;

	long isXml = r->getLong("xml",0);

	// if got docid, use that
	if ( st->m_docId != -1 ) {
		if ( ! xd->set3 ( st->m_docId,
				  st->m_coll,
				  0 ) ) // niceness
			// return error reply if g_errno is set
			return sendErrorReply ( st , g_errno );
		// make this our callback in case something blocks
		xd->setCallback ( st , gotXmlDoc );
		xd->m_pbuf = &st->m_wbuf;
		// reset this flag
		st->m_donePrinting = false;
		// . set xd from the old title rec if recycle is true
		// . can also use XmlDoc::m_loadFromOldTitleRec flag
		//if ( st->m_recycle ) xd->m_recycleContent = true;
		xd->m_recycleContent = true;
		// force this on
		//xd->m_useSiteLinkBuf = true;
		//xd->m_usePageLinkBuf = true;
		if ( isXml ) xd->m_printInXml = true;
		// now tell it to fetch the old title rec
		if ( ! xd->loadFromOldTitleRec () )
			// return false if this blocks
			return false;
		return gotXmlDoc ( st );
	}			

	// set this up
	SpiderRequest sreq;
	sreq.reset();
	if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
	long firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser = 1;
	sreq.m_hopCount = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32 = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	long  contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content    = r->getUnencodedContent    ();
		contentLen = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	//uint8_t contentType = CT_HTML;
	//if ( isXml ) contentType = CT_XML;
	long ctype = r->getLong("ctype",CT_HTML);

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq       ,
			  NULL        ,
			  st->m_coll  ,
			  // we need this so the term table is set!
			  &st->m_wbuf        , // XmlDoc::m_pbuf
			  0, // try 0 now! 1 ,//PP_NICENESS ))
			  content ,
			  false, // deletefromindex
			  0, // forced ip
			  ctype ))
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );
	// make this our callback in case something blocks
	xd->setCallback ( st , gotXmlDoc );
	// reset this flag
	st->m_donePrinting = false;
	// prevent a core here in the event we download the page content
	xd->m_crawlDelayValid = true;
	xd->m_crawlDelay = 0;
	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	//if ( st->m_recycle ) xd->m_recycleContent = true;
	// only recycle if docid is given!!
	if ( st->m_recycle ) xd->m_recycleContent = true;
	// force this on
	//xd->m_useSiteLinkBuf = true;
	//xd->m_usePageLinkBuf = true;
	if ( isXml ) xd->m_printInXml = true;

	return gotXmlDoc ( st );
}
コード例 #4
0
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
	// get the collection
	long  collLen = 0;
	char *coll    = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll    = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// ensure collection not too big
	if ( collLen >= MAX_COLL_LEN ) { 
		g_errno = ECOLLTOOBIG; 
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); 
	}
	// get the collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("query: Archived copy retrieval failed. "
		    "No collection record found for "
		    "collection \"%s\".",coll);
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// does this collection ban this IP?
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		//log("PageGet::sendDynamicReply0: permission denied for %s",
		//    iptoa(s->m_ip) );
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// . get fields from cgi field of the requested url
	// . get the search query
	long  qlen = 0;
	char *q = r->getString ( "q" , &qlen , NULL /*default*/);
	// ensure query not too big
	if ( qlen >= MAX_QUERY_LEN-1 ) { 
		g_errno=EQUERYTOOBIG; 
		return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
	}
	// the docId
	long  long docId = r->getLongLong ( "d" , 0LL /*default*/ );
	// get url
	char *url = r->getString ( "u",NULL);

	if ( docId == 0 && ! url ) {
		g_errno = EMISSINGINPUT;
		return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
	}


	// . should we do a sequential lookup?
	// . we need to match summary here so we need to know this
	//bool seq = r->getLong ( "seq" , false );
	// restrict to root file?
	bool rtq = r->getLong ( "rtq" , false );

	// . get the titleRec
	// . TODO: redirect client to a better http server to save bandwidth
	State2 *st ;
	try { st = new (State2); }
	catch (... ) {
		g_errno = ENOMEM;
		log("PageGet: new(%i): %s", 
		    (int)sizeof(State2),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
	mnew ( st , sizeof(State2) , "PageGet1" );
	// save the socket and if Host: is local in the Http request Mime
	st->m_socket   = s;
	st->m_isAdmin  = g_conf.isCollAdmin ( s , r );
	st->m_isLocal  = r->isLocal();
	st->m_docId    = docId;
	st->m_printed  = false;
	// include header ... "this page cached by Gigablast on..."
	st->m_includeHeader     = r->getLong ("ih"    , true  );
	st->m_includeBaseHref   = r->getLong ("ibh"   , false );
	st->m_queryHighlighting = r->getLong ("qh"    , true  );
	st->m_strip             = r->getLong ("strip" , 0     );
	st->m_clickAndScroll    = r->getLong ("cas"   , true  );
	st->m_cnsPage           = r->getLong ("cnsp"  , true );
	char *langAbbr = r->getString("qlang",NULL);
	st->m_langId = langUnknown;
	if ( langAbbr ) {
		uint8_t langId = getLangIdFromAbbr ( langAbbr );
		st->m_langId = langId;
	}
	strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
	// store query for query highlighting
	st->m_netTestResults    = r->getLong ("rnettest", false );
	//if( st->m_netTestResults ) {
	//	mdelete ( st , sizeof(State2) , "PageGet1" );
	//	delete ( st );
	//	return sendPageNetResult( s );
	//}
	if ( q && qlen > 0 ) strcpy ( st->m_q , q );
	else                 st->m_q[0] = '\0';
	st->m_qlen = qlen;
	//st->m_seq      = seq;
	st->m_rtq      = rtq;
	st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ );
	st->m_isBanned = false;
	st->m_noArchive = false;
	st->m_socket = s;
	st->m_format = r->getReplyFormat();
	// default to 0 niceness
	st->m_niceness = 0;
	st->m_r.copy ( r );
	//st->m_cr = cr;
	st->m_printDisclaimer = true;
	if ( st->m_cnsPage )
		st->m_printDisclaimer = false;
	if ( st->m_strip ) // ! st->m_evbits.isEmpty() ) 
		st->m_printDisclaimer = false;
	
	// should we cache it?
	char useCache = r->getLong ( "usecache" ,  1 );
	char rcache   = r->getLong ( "rcache"   ,  1 );
	char wcache   = r->getLong ( "wcache"   ,  1 );
	long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
	if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
	if ( rcache   == 0 )   cacheAge = 0; 
	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// url based?
	if ( url ) {
		SpiderRequest sreq;
		sreq.reset();
		strcpy(sreq.m_url, url );
		sreq.setDataSize();
		// this returns false if "coll" is invalid
		if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) ) 
			goto hadSetError;
	}
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	// . use st->m_coll since XmlDoc just points to it!
	// . this returns false if "coll" is invalid
	else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
	hadSetError:
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete ( st );
		g_errno = ENOMEM;
		log("PageGet: set3: %s", mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
コード例 #5
0
void gotDatedbList ( State60 *st ) {

	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

	// load turk lock table if we need to
	bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();
	// int16_tcut
	RdbList *list = &st->m_list;
	// the best docid
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		TurkLock *tt = (TurkLock *)g_turkLock.getValue(&docid);
		// if there check time
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			g_turkLock.removeKey(&docId);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docId;
		break;
	}

	SafeBuf sb;

	// print description so they can clikc a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	TurkLock tt;
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}