Exemplo n.º 1
0
bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
	SafeBuf sb;
	sb.safePrintf ( "http://%s:%li%s"
			, iptoa(g_hostdb.m_myHost->m_ip)
			, (long)g_hostdb.m_myHost->m_port
			, path
			);
	Url u;
	u.set ( sb.getBufStart() );
	if ( ! g_httpServer.getDoc ( u.getUrl() ,
				     0 , // ip
				     0 , // offset
				     -1 , // size
				     0 , // ifmodsince
				     NULL ,
				     callback ,
				     60*1000, // timeout
				     0, // proxyip
				     0, // proxyport
				     -1, // maxtextdoclen
				     -1, // maxotherdoclen
				     NULL ) ) // useragent
		return false;
	// error?
	log("qa: getUrl error: %s",mstrerror(g_errno));
	return true;
}	
Exemplo n.º 2
0
// for example, RENAME log000 to log000-bak20131104-181932
static bool renameCurrentLogFile ( ) {
	File f;
	char tmp[16];
	sprintf(tmp,"log%03" PRId32,g_hostdb.m_hostId);
	f.set ( g_hostdb.m_dir , tmp );
	// make new filename like log000-bak20131104-181932
	time_t now = time(NULL);
	struct tm tm_buf;
	tm *tm1 = gmtime_r(&now,&tm_buf);
	char tmp2[64];
	strftime(tmp2,64,"%Y%m%d-%H%M%S",tm1);
	SafeBuf newName;
	if ( ! newName.safePrintf ( "%slog%03" PRId32"-bak%s",
				    g_hostdb.m_dir,
				    g_hostdb.m_hostId,
				    tmp2 ) ) {
		fprintf(stderr,"log rename failed\n");
		return false;
	}
	// rename log000 to log000-2013_11_04-18:19:32
	if ( f.doesExist() ) {
		//fprintf(stdout,"renaming file\n");
		f.rename ( newName.getBufStart() );
	}
	return true;
}
// "xd" is the XmlDoc that just completed injecting
void ImportState::saveFileBookMark ( ) { //Msg7 *msg7 ) {

	long long minOff = -1LL;
	long minFileId = -1;

	//long fileId  = msg7->m_hackFileId;
	//long long fileOff = msg7->m_hackFileOff;

	// if there is one outstanding the preceeded us, we can't update
	// the bookmark just yet.
	for ( long i = 0 ; i < m_numPtrs ; i++ ) {
		Multicast *mcast = &m_ptrs[i];
		if ( ! mcast->m_inUse ) continue;
		if ( minOff == -1 ) {
			minOff = mcast->m_hackFileOff;
			minFileId = mcast->m_hackFileId;
			continue;
		}
		if ( mcast->m_hackFileId > minFileId ) 
			continue;
		if ( mcast->m_hackFileId == minFileId &&
		     mcast->m_hackFileOff > minOff ) 
			continue;
		minOff = mcast->m_hackFileOff;
		minFileId = mcast->m_hackFileId;
	}

	char fname[256];
	sprintf(fname,"%slasttitledbinjectinfo.dat",g_hostdb.m_dir);
	SafeBuf ff;
	ff.safePrintf("%llu,%lu",minOff,minFileId);//_fileOffset,m_bfFileId);
	ff.save ( fname );
}
// draw a HORIZONTAL line in html
void Statsdb::drawLine3 ( SafeBuf &sb ,
		 long x1 , 
		 long x2 ,
		 long fy1 , 
		 long color ,
		 long width ) {

	// do not draw repeats in the case we have a ton of points to plot
	long key32 ;
	key32 = hash32h ( x1  , 0 );
	key32 = hash32h ( x2  , key32);
	key32 = hash32h ( fy1 , key32);
	key32 = hash32h ( color , key32);
	key32 = hash32h ( width , key32);
	if ( m_dupTable.isInTable(&key32) ) return;
	m_dupTable.addKey(&key32);

	sb.safePrintf("<div style=\"position:absolute;"
		      "left:%li;"
		      "bottom:%li;"
		      "background-color:#%lx;"
		      "z-index:-5;"
		      "min-height:%lipx;"
		      "min-width:%lipx;\"></div>\n"
		      , x1 + m_bx
		      , (fy1 - width/2) + m_by
		      , color
		      , width
		      , x2 - x1
		      );
}
Exemplo n.º 5
0
// draw a HORIZONTAL line in html
void Statsdb::drawLine3 ( SafeBuf &sb ,
		 int32_t x1 , 
		 int32_t x2 ,
		 int32_t fy1 , 
		 int32_t color ,
		 int32_t width ) {

	// do not draw repeats in the case we have a ton of points to plot
	int32_t key32 ;
	key32 = hash32h ( x1  , 0 );
	key32 = hash32h ( x2  , key32);
	key32 = hash32h ( fy1 , key32);
	key32 = hash32h ( color , key32);
	key32 = hash32h ( width , key32);
	if ( m_dupTable.isInTable(&key32) ) return;
	m_dupTable.addKey(&key32);

	sb.safePrintf("<div style=\"position:absolute;"
				  "left:%" PRId32";"
				  "bottom:%" PRId32";"
				  "background-color:#%" PRIx32";"
				  "z-index:-5;"
				  "min-height:%" PRId32"px;"
				  "min-width:%" PRId32"px;\""
				  " class=\"color-%" PRIx32"\"></div>\n"
		      , x1 + m_bx
		      , (fy1 - width/2) + m_by
		      , color
		      , width
		      , x2 - x1
		      , color
		      );
}
bool saveHashTable ( ) {
	if ( s_ht.m_numSlotsUsed <= 0 ) return true;
	SafeBuf fn;
	fn.safePrintf("%s/qa/",g_hostdb.m_dir);
	log("qa: saving crctable.dat");
	s_ht.save ( fn.getBufStart() , "crctable.dat" );
	return true;
}
bool qascrape ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}


	// scrape it
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&"
			       "format=xml&qts=test");
		if ( ! getUrl ( sb.getBufStart() , 999 ) )
			return false;
	}
		


	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=test",
				-1310551262 ) )
			return false;
	}


	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SCRAPE TEST");
		return true;
	}

	return true;
}
// . the url being reuqested
// . removes &code= facebook cruft
bool HttpRequest::getCurrentUrl ( SafeBuf &cu ) {
	// makre sure we got enough room
	if ( ! cu.reserve ( m_hostLen + 64 + m_plen + 1 + 1 ) ) return false;
	// need a "Host: "
	char *host = m_host;
	if ( ! host ) host = APPSUBDOMAIN;
	cu.safePrintf("http");
	if ( m_isSSL ) cu.pushChar('s');
	cu.safePrintf("://%s",host);
	char *path = m_path;
	long  plen = m_plen;
	if ( ! path ) {
		path = "/";
		plen = 1;
	}
	// . scan path and change \0 back to = or &
	// . similar logic in HttpServer.cpp for logging!
	char *dst = cu.getBuf();
	char *src = path;
	char *srcEnd = path + plen;
	char dd = '=';
	for ( ; src < srcEnd ; src++ , dst++ ) {
		*dst = *src;
		if ( *src ) continue;
		*dst = dd;
		if ( dd == '=' ) dd = '&';
		else             dd = '=';
	}
	*dst = '\0';
	// cut it off at facebook's &code=
	char *buf = cu.getBufStart();
	char *code = strstr( buf,"&code=");
	// fix for eventguru.com/blog.html?code=
	if ( ! code ) code = strstr(buf,"?code=");
	// hack that off if there
	if ( code ) {
		*code = '\0';
		dst = code;
	}
	// update length
	cu.setLength( dst - cu.getBufStart() );
	return true;
}
// . run a series of tests to ensure that gb is functioning properly
// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
//   ensure consistency between tests for exact replays
bool qatest ( ) {

	if ( s_registered ) {
		g_loop.unregisterSleepCallback(NULL,qatestWrapper);
		s_registered = false;
	}

	if ( ! s_callback ) s_callback = qatest;

	if ( ! g_qaSock ) return true;


	// returns true when done, false when blocked
	//if ( ! qainject ( ) ) return false;

	// returns true when done, false when blocked
	//if ( ! qaspider ( ) ) return false;

	long n = sizeof(s_qatests)/sizeof(QATest);
	for ( long i = 0 ; i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		if ( ! qt->m_doTest ) continue;
		// store that
		s_qt = qt;
		// point to flags
		s_flags = qt->m_flags;
		// call the qatest
		if ( ! qt->m_func() ) return false;
	}

	// save this
	saveHashTable();
	// do not reset since we don't reload it above!
	//s_ht.reset();

	//if ( g_numErrors )
	//	g_qaOutput.safePrintf("<input type=submit value=submit><br>");

	g_qaOutput.safePrintf("<br>DONE RUNNING QA TESTS<br>");


	// . print the output
	// . the result of each test is stored in the g_qaOutput safebuf
	g_httpServer.sendDynamicPage(g_qaSock,
				     g_qaOutput.getBufStart(),
				     g_qaOutput.length(),
				     -1/*cachetime*/);

	g_qaOutput.purge();

	g_qaSock = NULL;

	return true;
}
Exemplo n.º 10
0
bool deleteUrls ( ) {
	static long s_ii2 = 0;
	for ( ; s_ii2 < s_numUrls ; ) {
		// pre-inc it
		s_ii2++;
		// reject using html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
		sb.urlEncode ( s_urlPtrs[s_ii2] );
		return getUrl ( sb.getBufStart() , qatestWrapper );
	}
	return true;
}
void doneInjectingLinksWrapper ( void *state ) {
	Msg7 *msg7 = (Msg7 *)state;
	SafeBuf *sb = &msg7->m_sb;
	// copy the serps into ou rbuf
	if ( ! g_errno ) {
		// print header
		if ( sb->length() == 0 ) {
			// print header of page
			sb->safePrintf("<?xml version=\"1.0\" "
				       "encoding=\"UTF-8\" ?>\n"
				       "<response>\n" );
		}
		// serp header
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t<googleResults>\n");
		else
			sb->safePrintf("\t<bingResults>\n");
		// print results
		sb->safeMemcpy(&msg7->m_xd.m_serpBuf);
		// end that
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t</googleResults>\n");
		else
			sb->safePrintf("\t</bingResults>\n");
	}
	// do bing now
	if ( msg7->m_round == 1 ) {
		// return if it blocks
		if ( ! msg7->scrapeQuery() ) return;
	}

	// otherwise, parse out the search results so steve can display them
	if ( g_errno )
		sb->safePrintf("<error><![CDATA[%s]]></error>\n",
			       mstrerror(g_errno));
	// print header of page
	sb->safePrintf("</response>\n");
	// page is not more than 32k
	//char buf[1024*32];
	//char *p = buf;
	// return docid and hostid
	//p += sprintf ( p , "scraping status ");
	// print error msg out, too or "Success"
	//p += sprintf ( p , "%s", mstrerror(g_errno));
	TcpSocket *sock = msg7->m_socket;
	g_httpServer.sendDynamicPage ( sock, 
				       sb->getBufStart(),
				       sb->length(),
				       -1/*cachetime*/);
	// hopefully sb buffer is copied becaues this will free it:
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
}
Exemplo n.º 12
0
// ensure search results are consistent
bool searchTest2 () {
	long nq = sizeof(s_queries)/sizeof(char *);
	for ( ; s_qi2 < nq ; ) {
		// pre-inc it
		s_qi2++;
		// inject using html api
		SafeBuf sb;
		// qa=1 tell gb to exclude "variable" or "random" things
		// from the serps so we can checksum it consistently
		sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
		sb.urlEncode ( s_queries[s_qi2] );
		return getUrl ( sb.getBufStart() , doneSearching2 );
	}
	return true;
}	
// returns false if blocked, true otherwise, like on quick connect error
bool getUrl( char *path , long checkCRC = 0 , char *post = NULL ) {

	SafeBuf sb;
	sb.safePrintf ( "http://%s:%li%s"
			, iptoa(g_hostdb.m_myHost->m_ip)
			, (long)g_hostdb.m_myHost->m_httpPort
			, path
			);

	s_checkCRC = checkCRC;

	bool doPost = true;
	if ( strncmp ( path , "/search" , 7 ) == 0 )
		doPost = false;

	//Url u;
	s_url.set ( sb.getBufStart() );
	log("qa: getting %s",sb.getBufStart());
	if ( ! g_httpServer.getDoc ( s_url.getUrl() ,
				     0 , // ip
				     0 , // offset
				     -1 , // size
				     0 , // ifmodsince
				     NULL ,
				     gotReplyWrapper,
				     999999*1000, // timeout ms
				     0, // proxyip
				     0, // proxyport
				     -1, // maxtextdoclen
				     -1, // maxotherdoclen
				     NULL , // useragent
				     "HTTP/1.0" , // protocol
				     doPost , // doPost
				     NULL , // cookie
				     NULL , // additionalHeader
				     NULL , // fullRequest
				     post ) )
		return false;
	// error?
	processReply ( NULL , 0 );
	//log("qa: getUrl error: %s",mstrerror(g_errno));
	return true;
}	
Exemplo n.º 14
0
// draw a HORIZONTAL line in html
void drawLine2 ( SafeBuf &sb ,
		 long x1 , 
		 long x2 ,
		 long fy1 , 
		 long color ,
		 long width ) {

	sb.safePrintf("<div style=\"position:absolute;"
		      "left:%li;"
		      "top:%li;"
		      "background-color:#%06lx;"
		      "z-index:-5;"
		      "min-height:%lipx;"
		      "min-width:%lipx;\"></div>\n"
		      , x1
		      , (fy1 - width/2) - 20 //- 300
		      , color
		      , width
		      , x2 - x1
		      );
}
Exemplo n.º 15
0
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . called either from 
//   1) doDocIdSplitLoop
//   2) or getDocIds2() if only 1 docidsplit
bool Msg39::getLists () {

	if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
	// . ask Indexdb for the IndexLists we need for these termIds
	// . each rec in an IndexList is a termId/score/docId tuple

	//
	// restrict to docid range?
	//
	// . get the docid start and end
	// . do docid paritioning so we can send to all hosts
	//   in the network, not just one stripe
	int64_t docIdStart = 0;
	int64_t docIdEnd = MAX_DOCID;
	// . restrict to this docid?
	// . will really make gbdocid:| searches much faster!
	int64_t dr = m_tmpq.m_docIdRestriction;
	if ( dr ) {
		docIdStart = dr;
		docIdEnd   = dr + 1;
	}
	// . override
	// . this is set from Msg39::doDocIdSplitLoop() to compute 
	//   search results in stages, so that we do not load massive
	//   termlists into memory and got OOM (out of memory)
	if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
	if ( m_r->m_maxDocId != -1 ) docIdEnd   = m_r->m_maxDocId+1;
	
	// if we have twins, then make sure the twins read different
	// pieces of the same docid range to make things 2x faster
	//bool useTwins = false;
	//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
	//if ( useTwins ) {
	//	int64_t delta2 = ( docIdEnd - docIdStart ) / 2;
	//	if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
	//	else                      docIdStart = docIdStart + delta2;
	//}
	// new striping logic:
	int32_t numStripes = g_hostdb.getNumStripes();
	int64_t delta2 = ( docIdEnd - docIdStart ) / numStripes;
	int32_t stripe = g_hostdb.getMyHost()->m_stripe;
	docIdStart += delta2 * stripe; // is this right?
	docIdEnd = docIdStart + delta2;
	// add 1 to be safe so we don't lose a docid
	docIdEnd++;
	// TODO: add triplet support later for this to split the
	// read 3 ways. 4 ways for quads, etc.
	//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
	// do not go over MAX_DOCID  because it gets masked and
	// ends up being 0!!! and we get empty lists
	if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
	// remember so Msg2.cpp can use them to restrict the termlists 
	// from "whiteList" as well
	m_docIdStart = docIdStart;
	m_docIdEnd   = docIdEnd;
	

	//
	// set startkey/endkey for each term/termlist
	//
	for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
		// breathe
		QUICKPOLL ( m_r->m_niceness );
		// int16_tcuts
		QueryTerm *qterm = &m_tmpq.m_qterms[i];
		char *sk = qterm->m_startKey;
		char *ek = qterm->m_endKey;
		// get the term id
		int64_t tid = m_tmpq.getTermId(i);
		// if only 1 stripe
		//if ( g_hostdb.getNumStripes() == 1 ) {
		//	docIdStart = 0;
		//	docIdEnd   = MAX_DOCID;
		//}
		// debug
		if ( m_debug )
			log("query: setting sk/ek for docids %"INT64""
			    " to %"INT64" for termid=%"INT64""
			    , docIdStart
			    , docIdEnd
			    , tid
			    );
		// store now in qterm
		g_posdb.makeStartKey ( sk , tid , docIdStart );
		g_posdb.makeEndKey   ( ek , tid , docIdEnd   );
		qterm->m_ks = sizeof(POSDBKEY);//key144_t);
	}

	// debug msg
	if ( m_debug || g_conf.m_logDebugQuery ) {
		for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
			// get the term in utf8
			//char bb[256];
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
			char *tpc = qt->m_term + qt->m_termLen;
			char  tmp = *tpc;
			*tpc = '\0';
			char sign = qt->m_termSign;
			if ( sign == 0 ) sign = '0';
			QueryWord *qw = qt->m_qword;
			int32_t wikiPhrId = qw->m_wikiPhraseId;
			if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
			char leftwikibigram = 0;
			char rightwikibigram = 0;
			if ( qt->m_leftPhraseTerm &&
			     qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
				leftwikibigram = 1;
			if ( qt->m_rightPhraseTerm &&
			     qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
				rightwikibigram = 1;
			/*
			char c = m_tmpq.getTermSign(i);
			char tt[512];
			int32_t ttlen = m_tmpq.getTermLen(i);
			if ( ttlen > 254 ) ttlen = 254;
			if ( ttlen < 0   ) ttlen = 0;
			// old:painful: convert each term from unicode to ascii
			gbmemcpy ( tt , m_tmpq.getTerm(i) , ttlen );
			*/
			int32_t isSynonym = 0;
			QueryTerm *st = qt->m_synonymOf;
			if ( st ) isSynonym = true;
			SafeBuf sb;
			// now we can display it
			//tt[ttlen]='\0';
			//if ( c == '\0' ) c = ' ';
			sb.safePrintf(
			     "query: msg39: [%"PTRFMT"] "
			     "query term #%"INT32" \"%s\" "
			     "phr=%"INT32" termId=%"UINT64" rawTermId=%"UINT64" "
			     //"estimatedTermFreq=%"INT64" (+/- ~16000) "
			     "tfweight=%.02f "
			     "sign=%c "
			     "numPlusses=%hhu "
			     "required=%"INT32" "
			     "fielcode=%"INT32" "

			     "ebit=0x%0"XINT64" "
			     "impBits=0x%0"XINT64" "

			     "wikiphrid=%"INT32" "
			     "leftwikibigram=%"INT32" "
			     "rightwikibigram=%"INT32" "
			     //"range.startTermNum=%hhi range.endTermNum=%hhi "
			     //"minRecSizes=%"INT32" "
			     "readSizeInBytes=%"INT32" "
			     //"ebit=0x%"XINT64" "
			     //"impBits=0x%"XINT64" "
			     "hc=%"INT32" "
			     "component=%"INT32" "
			     "otermLen=%"INT32" "
			     "isSynonym=%"INT32" "
			     "querylangid=%"INT32" " ,
			     (PTRTYPE)this ,
			     i          ,
			     qt->m_term,//bb ,
			     (int32_t)m_tmpq.isPhrase (i) ,
			     m_tmpq.getTermId      (i) ,
			     m_tmpq.getRawTermId   (i) ,
			     ((float *)m_r->ptr_termFreqWeights)[i] ,
			     sign , //c ,
			     0 , 
			     (int32_t)qt->m_isRequired,
			     (int32_t)qt->m_fieldCode,

			     (int64_t)qt->m_explicitBit  ,
			     (int64_t)qt->m_implicitBits ,

			     wikiPhrId,
			     (int32_t)leftwikibigram,
			     (int32_t)rightwikibigram,
			     ((int32_t *)m_r->ptr_readSizes)[i]         ,
			     //(int64_t)m_tmpq.m_qterms[i].m_explicitBit  ,
			     //(int64_t)m_tmpq.m_qterms[i].m_implicitBits ,
			     (int32_t)m_tmpq.m_qterms[i].m_hardCount ,
			     (int32_t)m_tmpq.m_componentCodes[i],
			     (int32_t)m_tmpq.getTermLen(i) ,
			     isSynonym,
			     (int32_t)m_tmpq.m_langId ); // ,tt
			// put it back
			*tpc = tmp;
			if ( st ) {
				int32_t stnum = st - m_tmpq.m_qterms;
				sb.safePrintf("synofterm#=%"INT32"",stnum);
				//sb.safeMemcpy(st->m_term,st->m_termLen);
				sb.pushChar(' ');
				sb.safePrintf("synwid0=%"INT64" ",qt->m_synWids0);
				sb.safePrintf("synwid1=%"INT64" ",qt->m_synWids1);
				sb.safePrintf("synalnumwords=%"INT32" ",
					      qt->m_numAlnumWordsInSynonym);
				// like for synonym "nj" it's base,
				// "new jersey" has 2 alnum words!
				sb.safePrintf("synbasealnumwords=%"INT32" ",
					      qt->m_numAlnumWordsInBase);
			}
			logf(LOG_DEBUG,"%s",sb.getBufStart());

		}
		m_tmpq.printBooleanTree();
	}
	// timestamp log
	if ( m_debug ) 
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Getting %"INT32" index lists ",
		     (PTRTYPE)this,m_tmpq.getNumTerms());
	// . now get the index lists themselves
	// . return if it blocked
	// . not doing a merge (last parm) means that the lists we receive
	//   will be an appending of a bunch of lists so keys won't be in order
	// . merging is uneccessary for us here because we hash the keys anyway
	// . and merging takes up valuable cpu time
	// . caution: the index lists returned from Msg2 are now compressed
	// . now i'm merging because it's 10 times faster than hashing anyway
	//   and the reply buf should now always be <= minRecSizes so we can
	//   pre-allocate one better, and, 3) this should fix the yahoo.com 
	//   reindex bug
	char rdbId = RDB_POSDB;

	// . TODO: MDW: fix
	// . partap says there is a bug in this??? we can't cache UOR'ed lists?
	bool checkCache = false;
	// split is us????
	//int32_t split = g_hostdb.m_myHost->m_group;
	int32_t split = g_hostdb.m_myHost->m_shardNum;
	// call msg2
	if ( ! m_msg2.getLists ( rdbId                      ,
				 m_r->m_collnum,//m_r->ptr_coll              ,
				 m_r->m_maxAge              ,
				 m_r->m_addToCache          ,
				 //m_tmpq.m_qterms ,
				 &m_tmpq,
				 m_r->ptr_whiteList,
				 // we need to restrict docid range for
				 // whitelist as well! this is from
				 // doDocIdSplitLoop()
				 m_docIdStart,
				 m_docIdEnd,
				 // how much of each termlist to read in bytes
				 (int32_t *)m_r->ptr_readSizes ,
				 //m_tmpq.getNumTerms()       , // numLists
				 // 1-1 with query terms
				 m_lists                    ,
				 this                       ,
				 controlLoopWrapper,//gotListsWrapper      ,
				 m_r                        ,
				 m_r->m_niceness            ,
				 true                       , // do merge?
				 m_debug                  ,
				 NULL                       ,  // best hostids
				 m_r->m_restrictPosdbForQuery  ,
				 split                      ,
				 checkCache                 )) {
		m_blocked = true;
		return false;
	}

	// error?
	//if ( g_errno ) { 
	//	log("msg39: Had error getting termlists2: %s.",
	//	    mstrerror(g_errno));
	//	// don't bail out here because we are in docIdSplitLoop()
	//	//sendReply (m_slot,this,NULL,0,0,true);
	//	return true; 
	//}
	
	//return gotLists ( true );
	return true;
}
void Statsdb::drawHR ( float z ,
		       float ymin , 
		       float ymax ,
		       //GIFPlotter *plotter ,
		       SafeBuf &gw,
		       Label *label ,
		       float zoff ,
		       long color ) {

	// convert into yspace
	float z2 = ((float)DY * (float)(z - ymin)) /(float)(ymax-ymin);
	// avoid collisions with other graphs
	z2 += zoff;
	// border
	//z2 += m_by;
	// round off error
	z2 += 0.5;
	// for adjusatmnet
	float ptsPerPixel = (ymax-ymin)/ (float)DY;
	// make an adjustment to the label then! -- Commented out because it's currently not used.
	float zadj = zoff * ptsPerPixel;

	//#ifdef _USEPLOTTER_

	// use the color specified from addStat_r() for this line/pt
	//plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
	//		    ((color >>  8) & 0xff) << 8 ,
	//		    ((color >>  0) & 0xff) << 8 );

	// horizontal line
	//plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
	long width = 1;
	drawLine3 ( m_gw, 0, DX , (long)z2,color, width); 


	// make label
	char tmp[128];
	// . use "graphHash" to map to unit display
	// . this is a disk read volume
	sprintf(tmp,label->m_format,z +zadj);//* label->m_yscalar);

	/*
	// a white shadow
	plotter->pencolor ( 0xffff,0xffff,0xffff );
	plotter->move ( m_bx + 80 + 2 , z2 + 10 - 2 );
	plotter->alabel     ( 'c' , 'c' , tmp );
	
	// a black shadow
	plotter->pencolor ( 0 , 0 , 0 );
	plotter->move ( m_bx + 80 + 1 , z2 + 10 - 1 );
	plotter->alabel     ( 'c' , 'c' , tmp );
	
	//long color = label->m_color;
	// use the color specified from addStat_r() for this line/pt
	plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
			    ((color >>  8) & 0xff) << 8 ,
			    ((color >>  0) & 0xff) << 8 );
	
	// move cursor
	plotter->move ( m_bx + 80 , z2 + 10 );
	// plot label
	plotter->alabel     ( 'c' , 'c' , tmp );
	*/

	// LABEL
	gw.safePrintf("<div style=\"position:absolute;"
		      "left:%li;"
		      "bottom:%li;"
		      "color:#%lx;"
		      "z-index:110;"
		      "font-size:14px;"
		      "min-height:20px;"
		      "min-width:3px;\">%s</div>\n"
		      , (long)(m_bx)
		      , (long)z2 +m_by
		      , color
		      // the label:
		      , tmp
		      );
	
}
bool sendTurkPageReply ( State60 *st ) {

	XmlDoc *xd = &st->m_xd;
	//char *content    = xd->ptr_utf8Content;
	//int32_t  contentLen = xd->size_utf8Content - 1;

	// count the total number of EventDesc classes for all evids
	//char *evd = xd->ptr_eventData;
	//EventDisplay *ed = (EventDisplay *)evd;
	//char *addr = evd + (int32_t)ed->m_addr;
	//char timeZoneOffset = getTimeZoneFromAddr ( addr );

	// in case getSections() block come right back in
	xd->setCallback ( st , xdcallback );

	// . set niceness to 1 so all this processing doesn't slow queries down
	// . however, g_niceness should still be zero... hmmm...
	xd->m_niceness = 1;

	// default to 1 niceness
	st->m_niceness = 1;

	// now set the sections class
	Sections *ss = xd->getSections();

	// now for each section with alnum text, telescope up as far as 
	// possible without containing anymore alnum text than what it 
	// contained. set SEC_CONTROL bit. such sections will have the
	// 2 green/blue dots, that are used for turning on/off title/desc.
	// but really the indians will only turn off sections that should
	// not have a title/desc.
	for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(st->m_niceness);
		// skip if does not have text
		if ( si->m_firstWordPos < 0 ) continue;
		// otherwise, find biggest parent that contains just that text
		Section *p    = si->m_parent;
		Section *last = si;
		for ( ; p ; p = p->m_parent ) {
			if ( p->m_firstWordPos != si->m_firstWordPos ) break;
			if ( p->m_lastWordPos  != si->m_lastWordPos  ) break;
			last = p;
		}
		// set that bit then
		last->m_flags |= SEC_CONTROL;
		// and speed up the loop
		si = last;
	}

	// * now each SEC_CONTROL sections have a fence activated by a turker

	// * an event title or description can not span a fence. it must be
	//   confined within a fence. however, it is allowed to include
	//   title or description from a "title section".

	// * hold shift down to designate as title section when clicking it

	// * show the raw text of each event changing as you fence
	//   sections in or out.  show in a right frame.

	// * show list of events on page in the top frame. can toggle them
	//   all individually.

	// * and remove no-display from all tags so we can see everything.

	// * highlight addresses, not just dates.

	// * each section hash has its own unique bg color when activated

	// * with a single click, completely reject an event because:
	//   contains bad time, address, title or desc. specify which so
	//   we can improve our algo.

	// * when selecting an individual event, scroll to its tod...

	// * remove all color from webpage that we can so our colors show up

	// * remove all imgs. just src them to dev null.

	// * allow for entering a custom title for an event or all events
	//   that are or will ever appear on the page. 

	// * when displaying the text of the events, use hyphens to
	//   delineate the section topology. strike out text as a section
	//   fence is activated.

	// * when a section is activated is it easier to just redownload
	//   the whole text of the page? maybe just the text frame?

	// * clicking on an individual sentence section should just remove
	//   that sentence. that is kinda a special content hash removal
	//   tag. like "Click here for video."

	// * when an event id is selected i guess activate its bgcolor to
	//   be light blue for all sentences currently in the event that
	//   are not in activated sections. (make exception for designated 
	//   title sections). so we need multiple tags for each events
	//   sentence div section. if sentence is split use multiple div tags
	//   then to keep the order. so each event sentence would have 
	//   <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and
	//   10. that way we can activate it when one of those event ids is
	//   activated.


	SafeBuf sb;

	// int16_tcuts
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words     *words = &xd->m_words;
	int32_t       nw    = words->getNumWords();
	char     **wptrs = words->getWords();
	int32_t      *wlens = words->getWordLens();
	nodeid_t  *tids  = words->getTagIds();

	// a special array for printing </div> tags
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );


	// 
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, and print out the section hash and a color
	// tag to be activated if the turkey activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) { 
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ a
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkeys have made all the edits
				// they need to submit the changes they made.
				// how can we get that data sent back to the
				// back end? we need to send back the colors
				// of the sections that have been activated
				// i guess. just do a loop over them.
				sb.safePrintf("<div nobreak gbsecid=%"UINT32" "
					      "bgcolor=#%"XINT32" "
					      "onclick=gbtogglecolor()>",
					      (uint32_t)sj->m_tagHash,
					      (uint32_t)sj->m_tagHash);
				// sanity check
				if ( sj->m_b < 0  ) { char *xx=NULL;*xx=0; }
				if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; }
				// and inc the /div count for that word
				endCounts[sj->m_b-1]++;
			}
			// try next section too
			sj = sj->m_next;
		}
		// if this is a tag, remove any coloring
		if ( tids[i] ) {
		}
		// print the word, be it a tag, alnum, punct
		sb.safeMemcpy ( wptrs[i] , wlens[i] );
		// end a div tag?
		if ( ! endCounts[i] ) continue;
		// might be many so loop it
		for ( int32_t j = 0 ; j < endCounts[i] ; j++ )
			sb.safePrintf("</div>");
	}			







	return false;
}
void gotDatedbList ( State60 *st ) {

	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

	// load turk lock table if we need to
	bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();
	// int16_tcut
	RdbList *list = &st->m_list;
	// the best docid
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		TurkLock *tt = (TurkLock *)g_turkLock.getValue(&docid);
		// if there check time
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			g_turkLock.removeKey(&docId);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docId;
		break;
	}

	SafeBuf sb;

	// print description so they can clikc a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	TurkLock tt;
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
void doneReindexing ( void *state ) {
	// cast it
	State13 *st = (State13 *)state;

	GigablastRequest *gr = &st->m_gr;

	// note it
	if ( gr->m_query && gr->m_query[0] )
		log(LOG_INFO,"admin: Done with query reindex. %s",
		    mstrerror(g_errno));

	////
	//
	// print the html page
	//
	/////

	HttpRequest *hr = &gr->m_hr;

	char format = hr->getReplyFormat();

	SafeBuf sb;

	const char *ct = "text/html";
	if ( format == FORMAT_JSON ) ct = "application/json";
	if ( format == FORMAT_XML  ) ct = "text/xml";

	if ( format == FORMAT_XML ) {
		sb.safePrintf("<response>\n"
			      "\t<statusCode>0</statusCode>\n"
			      "\t<statusMsg>Success</statusMsg>\n"
			      "\t<matchingResults>%" PRId32"</matchingResults>\n"
			      "</response>"
			      , st->m_msg1c.m_numDocIdsAdded
			      );
		g_httpServer.sendDynamicPage ( gr->m_socket,
					       sb.getBufStart(),
					       sb.length(),
					       -1,
					       false,ct);
		mdelete ( st , sizeof(State13) , "PageTagdb" );
		delete (st);
		return;
	}

	if ( format == FORMAT_JSON ) {
		sb.safePrintf("{\"response\":{\n"
			      "\t\"statusCode\":0,\n"
			      "\t\"statusMsg\":\"Success\",\n"
			      "\t\"matchingResults\":%" PRId32"\n"
			      "}\n"
			      "}\n"
			      , st->m_msg1c.m_numDocIdsAdded
			      );
		g_httpServer.sendDynamicPage ( gr->m_socket,
					       sb.getBufStart(),
					       sb.length(),
					       -1,
					       false,ct);
		mdelete ( st , sizeof(State13) , "PageTagdb" );
		delete (st);
		return;
	}



	g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr );

	sb.safePrintf("<style>"
		       ".poo { background-color:#%s;}\n"
		       "</style>\n" ,
		       LIGHT_BLUE );


	//
	// print error msg if any
	//

	if ( gr->m_query && gr->m_query[0] && ! g_errno )
		sb.safePrintf ( "<center><font color=red><b>Success. "
			  "Added %" PRId32" docid(s) to "
			  "spider queue.</b></font></center><br>" , 
			  st->m_msg1c.m_numDocIdsAdded );

	if ( gr->m_query && gr->m_query[0] && g_errno )
		sb.safePrintf ( "<center><font color=red><b>Error. "
				 "%s</b></font></center><br>" , 
				 mstrerror(g_errno));


	// print the reindex interface
	g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr  );


	g_httpServer.sendDynamicPage ( gr->m_socket,
				       sb.getBufStart(),
				       sb.length(),
				       -1,
				       false);

	mdelete ( st , sizeof(State13) , "PageTagdb" );
	delete (st);
}
Exemplo n.º 20
0
bool sendReply ( void *state ) {
	GigablastRequest *gr = (GigablastRequest *)state;

	// in order to see what sites are being added log it, then we can
	// more easily remove sites from sitesearch.gigablast.com that are
	// being added but not being searched
	SafeBuf xb;
	if ( gr->m_urlsBuf ) {
		xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
		log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) );
	}

	char format = gr->m_hr.getReplyFormat();
	TcpSocket *sock = gr->m_socket;

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		bool status = g_httpServer.sendSuccessReply ( gr );
		// nuke state
		mdelete ( gr , sizeof(gr) , "PageAddUrl" );
		delete (gr);
		return status;
	}

	int32_t ulen = 0;
	const char *url = gr->m_urlsBuf;
	if ( url ) ulen = gbstrlen (url);

	// re-null it out if just http://
	bool printUrl = true;
	if ( ulen == 0 ) printUrl = false;
	if ( ! gr->m_urlsBuf       ) printUrl = false;
	if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7))
		printUrl = false;
	if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8))
		printUrl = false;

	// page is not more than 32k
	char buf[1024*32+MAX_URL_LEN*2];
	SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);

	g_pages.printAdminTop ( &sb , sock , &gr->m_hr );

	// if there was an error let them know
	SafeBuf mbuf;

	if ( g_errno ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno);
		mbuf.safePrintf("</font></center>");
	} else if ( printUrl ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("<b><u>");
		mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
		mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>");
		mbuf.safePrintf("</font></center>");
	}

	if ( mbuf.length() ) {
		sb.safeStrcpy( mbuf.getBufStart() );
	}

	g_parms.printParmTable ( &sb , sock , &gr->m_hr );

	// print the final tail
	g_pages.printTail ( &sb, true ); // admin?

	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// nuke state
	mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
	delete (gr);

	return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime
}
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	/*
	  // this was calling XmlDoc and setting sections, etc. to
	  // get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long   len  = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// alloc buffer now
	//char *buf = NULL;
	//long  bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p          = buf;
	//char *bufEnd     = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}

	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();

	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
		//p += gbstrlen ( p );
	}

	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();//p;

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {

		sb->safePrintf(//sprintf ( p , 
			  //"<BASE HREF=\"%s\">"
			  //"<table border=1 width=100%%>"
			  //"<tr><td>"
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			  "><td>"
			  //"<font face=times,sans-serif color=black size=-1>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p , 
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		//p += gbstrlen ( p );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		//p += gbstrlen ( p );

		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%li&amp;"
			      "d=%lli&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (long)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
			//p += gbstrlen ( p );
		}
		
	}

	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long  ip   =g_conf.m_mainExternalIp  ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;

	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doSteming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}

	//sb->safeStrcpy(tbuf);



	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			      lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		//p += sprintf ( p , 
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		long printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p , 
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;

		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail , 
				 &ww , &m ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}

	// calculate bufLen
	//long bufLen = p - buf;

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);


	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer

	//if      ( ct == CT_XML ) newbuf.purge();
	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
	
	// and convey the status
	return status;
}
Exemplo n.º 22
0
//
// new code for drawing graph in html with absolute divs instead
// of using GIF plotter library which had issues
//
void Stats::printGraphInHtml ( SafeBuf &sb ) {

	// gif size
	char tmp[64];
	sprintf ( tmp , "%lix%li", (long)DX+40 , (long)DY+40 ); // "1040x440"

	// 20 pixel borders
	//int bx = 10;
	//int by = 30;
	// define the space with boundaries 100 unit wide boundaries
	//plotter.space ( -bx , -by , DX + bx , DY + by );
	// draw the x-axis
	//plotter.line ( 0 , 0 , DX , 0  );
	// draw the y-axis
	//plotter.line ( 0 , 0 ,  0 , DY );

	// find time ranges
	long long t2 = 0;
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// set min/max
		if ( m_pts[i].m_endTime   > t2 ) t2 = m_pts[i].m_endTime;
	}
	// now compute the start time for the graph
	long long t1 = 0x7fffffffffffffffLL;
	// now recompute t1
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// can't be behind more than 1 second
		if ( m_pts[i].m_startTime   < t2 - DT ) continue;
		// otherwise, it's a candidate for the first time
		if ( m_pts[i].m_startTime < t1 ) t1 = m_pts[i].m_startTime;
	}

	//
	// main graphing window
	//
	sb.safePrintf("<div style=\"position:relative;"
		      "background-color:#c0c0c0;"

		      // match style of tables
		      "border-radius:10px;"
		      "border:#6060f0 2px solid;"
		      
		      //"overflow-y:hidden;"
		      "overflow-x:hidden;"
		      "z-index:-10;"
		      // the tick marks we print below are based on it
		      // being a window of the last 20 seconds... and using
		      // DX pixels
		      "min-width:%lipx;"
		      "min-height:%lipx;"
		      //"width:100%%;"
		      //"min-height:600px;"
		      //"margin-top:10px;"
		      "margin-bottom:10px;"
		      //"margin-right:10px;"
		      //"margin-left:10px;"
		      "\">"
		      ,(long)DX
		      ,(long)DY +20); // add 10 more for "2s" labels etc.

	// 10 x-axis tick marks
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		//plotter.line ( x , -20 , x , 20 );
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:0;"
			      "background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\"></div>\n"
			      , (long)x-1
			      );
		// generate label
		//char buf [ 32 ];
		//sprintf ( buf , "%li" , 
		//	  (long)(DT * (long long)x / (long long)DX) );
		// LABEL
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:20;"
			      //"background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\">%lis</div>\n"
			      , (long)x-10
			      // the label:
			      ,(long)(DT * (long long)x / (long long)DX)/1000
			      );

		// move cursor
		//plotter.move ( x , -by / 2 - 9 );
		// plot label
		//plotter.alabel     ( 'c' , 'c' , buf );
	}

	// . each line consists of several points
	// . we need to know each point for adding otherlines
	// . is about [400/6][1024] = 70k
	// . each line can contain multiple data points
	// . each data point is expressed as a horizontal line segment
	void *lrgBuf;
	long lrgSize = 0;
	lrgSize += MAX_LINES * MAX_POINTS * sizeof(StatPoint *);
	lrgSize += MAX_LINES * sizeof(long);
	lrgBuf = (char *) mmalloc(lrgSize, "Stats.cpp"); 
	if (! lrgBuf) {
	    log("could not allocate memory for local buffer in Stats.cpp"
		"%li bytes needed", lrgSize);
	    return;
	}
	char *lrgPtr = (char *)lrgBuf;
	StatPoint **points = (StatPoint **)lrgPtr;   
	lrgPtr += MAX_LINES * MAX_POINTS * sizeof(StatPoint *);
	long *numPoints = (long *)lrgPtr;
	lrgPtr += MAX_LINES * sizeof(long);
	memset ( (char *)numPoints , 0 , MAX_LINES * sizeof(long) );

	// store the data points into "lines"
	long count = MAX_POINTS;
	for ( long i = m_next ; count >= 0 ; i++ , count-- ) {
		// wrap around the array
		if ( i >= MAX_POINTS ) i = 0;
		// skip point if empty
		if ( m_pts[i].m_startTime == 0 ) continue;
		// skip if too early
		if ( m_pts[i].m_endTime < t1 ) continue;
		// . find the lowest line the will hold us
		// . this adds point to points[x][n] where x is determined
		addPoint ( points , numPoints , &m_pts[i] );
	}

	int y1 = 21;
	// plot the points (lines) in each line
	for ( long i = 0 ; i < MAX_LINES    ; i++ ) {
		// increase vert
		y1 += MAX_WIDTH + 1;
		// wrap back down if necessary
		if ( y1 >= DY ) y1 = 21;
		// plt all points in this row
	for ( long j = 0 ; j < numPoints[i] ; j++ ) {
		// get the point
		StatPoint *p =  points[MAX_POINTS * i + j];
		// transform time to x coordinates
		int x1 = (p->m_startTime - t1) * (long long)DX / DT;
		int x2 = (p->m_endTime   - t1) * (long long)DX / DT;
		// if x2 is negative, skip it
		if ( x2 < 0 ) continue;
		// if x1 is negative, boost it to -2
		if ( x1 < 0 ) x1 = -2;
		// . line thickness is function of read/write size
		// . take logs
		int w = (int)log(((double)p->m_numBytes)/8192.0) + 3;
		//log("log of %li is %i",m_pts[i].m_numBytes,w);
		if ( w < 3         ) w = 3;
		if ( w > MAX_WIDTH ) w = MAX_WIDTH;
		//plotter.linewidth ( w );       
		// use the color specified from addStat_r() for this line/pt
		//plotter.pencolor ( ((p->m_color >> 16) & 0xff) << 8 ,
		//		   ((p->m_color >>  8) & 0xff) << 8 ,
		//		   ((p->m_color >>  0) & 0xff) << 8 );
		// ensure at least 3 units wide for visibility
		if ( x2 < x1 + 3 ) x2 = x1 + 3;
		// . flip the y so we don't have to scroll the browser down
		// . DY does not include the axis and tick marks
		long fy1 = DY - y1 + 20 ;
		// plot it
		//plotter.line ( x1 , fy1 , x2 , fy1 );
		drawLine2 ( sb , x1 , x2 , fy1 , p->m_color , w );
		// debug msg
		//log("line (%i,%i, %i,%i) ", x1 , vert , x2 , vert );
		//log("bytes = %li width = %li ", m_pts[i].m_numBytes,w);
		//log("st=%i, end=%i color=%lx " ,
		//      (int)m_pts[i].m_startTime , 
		//      (int)m_pts[i].m_endTime   , 
		//      m_pts[i].m_color );
	}
	}

	sb.safePrintf("</div>\n");

	mfree(lrgBuf, lrgSize, "Stats.cpp");
}
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotTitleRec ( void *state ) {
	// cast the State4 out
	State4 *st = (State4 *) state;
	// get the socket
	TcpSocket *s = st->m_socket;

	SafeBuf sb;
	// get it's docId
	long long docId = st->m_docId;
	// make the query string for passing to different hosts
	char  qs[64];
	sprintf(qs,"&d=%lli",docId);
	if ( docId==0LL ) qs[0] = 0;
	// print standard header
	sb.reserve2x ( 32768 );
	g_pages.printAdminTop (&sb, st->m_socket, &st->m_r );
	//PAGE_TITLEDB,
	//		       st->m_username,//NULL ,
	//		       st->m_coll , st->m_pwd , s->m_ip , qs );
	// shortcut
	XmlDoc *xd = &st->m_xd;

	// . deal with errors
	// . print none if non title rec at or after the provided docId
	if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) {
		// print docId in box
		sb.safePrintf (  "<center>\nEnter docId: "
				 "<input type=text name=d value=%lli size=15>",
				 docId);
		sb.safePrintf ( "</form><br>\n" );
		if ( docId == 0 ) 
			sb.safePrintf("<br>");
		else if ( g_errno ) 
			sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno));
		else 
			sb.safePrintf("<br><br>No titleRec for that docId "
				      "or higher");
		// print where it should be
		//unsigned long gid = getGroupIdFromDocId ( docId );
		//Host *hosts = g_hostdb.getGroup(gid);
		long shardNum = getShardNumFromDocId ( docId );
		Host *hosts = g_hostdb.getShard ( shardNum );
		long hostId = -1;
		if ( hosts ) hostId = hosts[0].m_hostId;
		sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
		sb.safePrintf ( "\n</center>" );
		mdelete ( st , sizeof(State4) , "PageTitledb");
		delete (st);
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		return g_httpServer.sendDynamicPage ( s , 
						      sb.getBufStart(),
						      sb.length() );
	}
	// print docId in box
	sb.safePrintf ("<br>\n"
		       "<center>Enter docId: "
		       "<input type=text name=d value=%lli size=15>", docId );
	// print where it should be
	//unsigned long gid = getGroupIdFromDocId ( docId );
	//Host *hosts = g_hostdb.getGroup(gid);
	long shardNum = getShardNumFromDocId ( docId );
	Host *hosts = g_hostdb.getShard ( shardNum );
	long hostId = -1;
	if ( hosts ) hostId = hosts[0].m_hostId;
	sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
	sb.safePrintf ( "</form><br>\n" );

	//char *coll    = st->m_coll;

	Title *ti = xd->getTitle();
	if ( ! ti ) {
		log ( "admin: Could not set title" );
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// sanity check. should not block
	if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; }

	// print it out
	xd->printDoc ( &sb );

	// don't forget to cleanup
	mdelete ( st , sizeof(State4) , "PageTitledb");
	delete (st);
	// now encapsulate it in html head/tail and send it off
	return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
}
Exemplo n.º 24
0
// . now come here when we got the necessary index lists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::intersectLists ( ) { // bool updateReadInfo ) {
	// bail on error
	if ( g_errno ) { 
	hadError:
		log("msg39: Had error getting termlists: %s.",
		    mstrerror(g_errno));
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply (m_slot,this,NULL,0,0,true);
		return true; 
	}
	// timestamp log
	if ( m_debug ) {
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Got %"INT32" lists in %"INT64" ms"
		    , (PTRTYPE)this,m_tmpq.getNumTerms(),
		     gettimeofdayInMilliseconds() - m_startTime);
		m_startTime = gettimeofdayInMilliseconds();
	}

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// ensure collection not deleted from under us
	CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		goto hadError;
	}

	// . set the IndexTable so it can set it's score weights from the
	//   termFreqs of each termId in the query
	// . this now takes into account the special termIds used for sorting
	//   by date (0xdadadada and 0xdadadad2 & TERMID_MASK)
	// . it should weight them so much so that the summation of scores
	//   from other query terms cannot make up for a lower date score
	// . this will actually calculate the top
	// . this might also change m_tmpq.m_termSigns 
	// . this won't do anything if it was already called
	m_posdbTable.init ( &m_tmpq                ,
			    m_debug              ,
			    this                   ,
			    &m_tt                  ,
			    m_r->m_collnum,//ptr_coll          , 
			    &m_msg2 , // m_lists                ,
			    //m_tmpq.m_numTerms      , // m_numLists
			    m_r                              );

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// . we have to do this here now too
	// . but if we are getting weights, we don't need m_tt!
	// . actually we were using it before for rat=0/bool queries but
	//   i got rid of NO_RAT_SLOTS
	if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply ( m_slot , this , NULL , 0 , 0 , true);
		return true;
	}

	// if msg2 had ALL empty lists we can cut it int16_t
	if ( m_posdbTable.m_topTree->m_numNodes == 0 ) {
		//estimateHitsAndSendReply ( );
		return true;
	}
		

	// we have to allocate this with each call because each call can
	// be a different docid range from doDocIdSplitLoop.
	if ( ! m_posdbTable.allocWhiteListTable() ) {
		log("msg39: Had error allocating white list table: %s.",
		    mstrerror(g_errno));
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply (m_slot,this,NULL,0,0,true);
		return true; 
	}


	// do not re do it if doing docid range splitting
	m_allocedTree = true;


	// . now we must call this separately here, not in allocTopTree()
	// . we have to re-set the QueryTermInfos with each docid range split
	//   since it will set the list ptrs from the msg2 lists
	if ( ! m_posdbTable.setQueryTermInfo () ) return true;

	// print query term bit numbers here
	for ( int32_t i = 0 ; m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
		QueryTerm *qt = &m_tmpq.m_qterms[i];
		//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
		char *tpc = qt->m_term + qt->m_termLen;
		char  tmp = *tpc;
		*tpc = '\0';
		SafeBuf sb;
		sb.safePrintf("query: msg39: BITNUM query term #%"INT32" \"%s\" "
			      "bitnum=%"INT32" ", i , qt->m_term, qt->m_bitNum );
		// put it back
		*tpc = tmp;
		logf(LOG_DEBUG,"%s",sb.getBufStart());
	}


	// timestamp log
	if ( m_debug ) {
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Preparing to intersect "
		     "took %"INT64" ms",
		     (PTRTYPE)this, 
		    gettimeofdayInMilliseconds() - m_startTime );
		m_startTime = gettimeofdayInMilliseconds();
	}

	// time it
	int64_t start = gettimeofdayInMilliseconds();
	int64_t diff;

	// . don't bother making a thread if lists are small
	// . look at STAGE? in IndexReadInfo.cpp to see how we read in stages
	// . it's always saying msg39 handler is hogging cpu...could this be it
	//if ( m_msg2.getTotalRead() < 2000*8 ) goto skipThread;

	// debug
	//goto skipThread;

	// . NOW! let's do this in a thread so we can continue to service
	//   incoming requests
	// . don't launch more than 1 thread at a time for this
	// . set callback when thread done

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// . create the thread
	// . only one of these type of threads should be launched at a time
	if ( ! m_debug &&
	     g_threads.call ( INTERSECT_THREAD  , // threadType
			      m_r->m_niceness   ,
			      this              , // top 4 bytes must be cback
			      controlLoopWrapper2,//threadDoneWrapper ,
			      addListsWrapper   ) ) {
		m_blocked = true;
		return false;
	}
	// if it failed
	//log(LOG_INFO,"query: Intersect thread creation failed. Doing "
	//    "blocking. Hurts performance.");
	// check tree
	if ( m_tt.m_nodes == NULL ) {
		log(LOG_LOGIC,"query: msg39: Badness."); 
		char *xx = NULL; *xx = 0; }

	// sometimes we skip the thread
	//skipThread:
	// . addLists() should never have a problem
	// . g_errno should be set by prepareToAddLists() above if there is
	//   going to be a problem
	//if ( m_r->m_useNewAlgo )
	m_posdbTable.intersectLists10_r ( );
	//else
	//	m_posdbTable.intersectLists9_r ( );

	// time it
	diff = gettimeofdayInMilliseconds() - start;
	if ( diff > 10 ) log("query: Took %"INT64" ms for intersection",diff);

	// returns false if blocked, true otherwise
	//return addedLists ();
	return true;
}
Exemplo n.º 25
0
bool Log::init ( char *filename ) {
	// set the main process id
	//s_pid = getpidtid();
	setPid();
	// init these
	m_numErrors =  0;
	m_bufPtr    =  0;
	m_fd        = -1;
	m_disabled  = false;

#ifdef DEBUG
	g_dbufSize = 4096;
	g_dbuf = (char*)mmalloc(g_dbufSize,"Log: DebugBuffer");
	if (!g_dbuf) fprintf(stderr, "Unable to init debug buffer");
#endif
	//	m_hostname  = g_conf.m_hostname;
	//	m_port      = port;
	// is there a filename to log our errors to?
	m_filename = filename;
	if ( ! m_filename ) return true;

	// skip this for now
	//return true;

	//
	// RENAME log000 to log000-2013_11_04-18:19:32
	//
	if ( g_conf.m_runAsDaemon ) {
		File f;
		char tmp[16];
		sprintf(tmp,"log%03li",g_hostdb.m_hostId);
		f.set ( g_hostdb.m_dir , tmp );
		// make new filename like log000-2013_11_04-18:19:32
		time_t now = getTimeLocal();
		tm *tm1 = gmtime((const time_t *)&now);
		char tmp2[64];
		strftime(tmp2,64,"%Y_%m_%d-%T",tm1);
		SafeBuf newName;
		if ( ! newName.safePrintf ( "%slog%03li-%s",
					    g_hostdb.m_dir,
					    g_hostdb.m_hostId,
					    tmp2 ) ) {
			fprintf(stderr,"log rename failed\n");
			return false;
		}
		// rename log000 to log000-2013_11_04-18:19:32
		if ( f.doesExist() ) {
			//fprintf(stdout,"renaming file\n");
			f.rename ( newName.getBufStart() );
		}
	}


	// open it for appending.
	// create with -rw-rw-r-- permissions if it's not there.
	m_fd = open ( m_filename , 
		      O_APPEND | O_CREAT | O_RDWR , 
		      S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
	if ( m_fd >= 0 ) return true;
	// bitch to stderr and return false on error
	fprintf(stderr,"could not open log file %s for appending\n",
		m_filename);
	return false;
}
Exemplo n.º 26
0
bool sendReply ( void *state , bool addUrlEnabled ) {
	// allow others to add now
	//s_inprogress = false;
	// get the state properly
	//gr *st1 = (gr *) state;
	GigablastRequest *gr = (GigablastRequest *)state;
	// in order to see what sites are being added log it, then we can
	// more easily remove sites from sitesearch.gigablast.com that are
	// being added but not being searched
	SafeBuf xb;
	if ( gr->m_urlsBuf ) {
		xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
		log(LOG_INFO,"http: add url %s (%s)",
		    xb.getBufStart(),mstrerror(g_errno));
	}

	char format = gr->m_hr.getReplyFormat();
	TcpSocket *sock    = gr->m_socket;

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		bool status = g_httpServer.sendSuccessReply ( gr );
		// nuke state
		mdelete ( gr , sizeof(gr) , "PageAddUrl" );
		delete (gr);
		return status;
	}


	long ulen = 0;
	char *url = gr->m_urlsBuf;
	if ( url ) ulen = gbstrlen (url);

	// re-null it out if just http://
	bool printUrl = true;
	if ( ulen == 0 ) printUrl = false;
	if ( ! gr->m_urlsBuf       ) printUrl = false;
	if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7))
		printUrl = false;
	if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8))
		printUrl = false;

	// page is not more than 32k
	char buf[1024*32+MAX_URL_LEN*2];
	SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);
	
	//char rawbuf[1024*8];
	//SafeBuf rb(rawbuf, 1024*8);	
	//rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
	//rb.safePrintf("<status>\n");
	//CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll );
	
	// collection name

	char tt [ 128 ];
	tt[0] = '\0';

	g_pages.printAdminTop ( &sb , sock , &gr->m_hr );

	// display url
	//char *url = gr->m_urlsBuf;
	//if ( url && ! url[0] ) url = NULL;

	// watch out for NULLs
	if ( ! url ) url = "http://";

	// if there was an error let them know
	//char msg[MAX_URL_LEN + 1024];
	SafeBuf mbuf;
	//char *pm = "";
	if ( g_errno ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", 
				mstrerror(g_errno) , g_errno);
		mbuf.safePrintf("</font></center>");
		//pm = msg;
		//rb.safePrintf("Error adding url(s): %s[%i]", 
		//	      mstrerror(g_errno) , g_errno);
	}
	else if ( printUrl ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("<b><u>");
		mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
		mbuf.safePrintf("</u></b> added to spider "
				 "queue "
				 "successfully<br><br>");
		mbuf.safePrintf("</font></center>");
		//rb.safePrintf("%s added to spider "
		//	      "queue successfully", url );
		//pm = msg;
		//url = "http://";
		//else
		//	pm = "Don't forget to <a href=/gigaboost.html>"
		//		"Gigaboost</a> your URL.";
	}


	if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() );

	g_parms.printParmTable ( &sb , sock , &gr->m_hr );

	// print the final tail
	g_pages.printTail ( &sb, true ); // admin?
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// nuke state
	mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
	delete (gr);

	return g_httpServer.sendDynamicPage (sock, 
					     sb.getBufStart(), 
					     sb.length(),
					     -1 ); // cachetime
}
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
	// get collection name
	//int32_t  nclen;
	//char *nc   = r->getString ( "nc" , &nclen );
	//int32_t  cpclen;
	//char *cpc  = r->getString ( "cpc" , &cpclen );

	g_errno = 0;

	//bool cast = r->getLong("cast",0);

	const char *msg = NULL;

	// if any host in network is dead, do not do this
	//if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";

	char format = r->getReplyFormat();


	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// no addcoll given?
		int32_t  page = g_pages.getDynamicPageNumber ( r );
		const char *addcoll = r->getString("addcoll",NULL);
		const char *delcoll = r->getString("delcoll",NULL);
		if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
		if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
		if ( page == PAGE_ADDCOLL && ! addcoll ) {
			g_errno = EBADENGINEER;
			const char *msg = "no addcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		if ( page == PAGE_DELCOLL && ! delcoll ) {
			g_errno = EBADENGINEER;
			const char *msg = "no delcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		return g_httpServer.sendSuccessReply(s,format);
	}

	// error?
	const char *action = r->getString("action",NULL);
	const char *addColl = r->getString("addcoll",NULL);


	char  buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);


	//
	// CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
	//

	SafeBuf gtmp;
	char *gmsg = NULL;
	// is it too big?
	if ( action && addColl && strlen(addColl) > MAX_COLL_LEN ) {
		gtmp.safePrintf("search engine name is too long");
		gmsg = gtmp.getBufStart();
	}
	// from Collectiondb.cpp::addNewColl() ensure coll name is legit
	const char *x = addColl;
	for ( ; x && *x ; x++ ) {
		if ( is_alnum_a(*x) ) continue;
		if ( *x == '-' ) continue;
		if ( *x == '_' ) continue; // underscore now allowed
		break;
	}
	if ( x && *x ) {
		g_errno = EBADENGINEER;
		gtmp.safePrintf("<font color=red>Error. \"%s\" is a "
				"malformed name because it "
				"contains the '%c' character.</font><br><br>",
				addColl,*x);
		gmsg = gtmp.getBufStart();
	}

	//
	// END GIGABOT ERRORS
	//



	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	// if added the coll successfully, do not print same page, jump to
	// printing the basic settings page so they can add sites to it.
	// crap, this GET request, "r", is missing the "c" parm sometimes.
	// we need to use the "addcoll" parm anyway. maybe print a meta
	// redirect then?
	char guide = r->getLong("guide",0);
	// do not redirect if gmsg is set, there was a problem with the name
	if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) {
		//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
		// just redirect to it
		if ( addColl )
			p.safePrintf("<meta http-equiv=Refresh "
				      "content=\"0; URL=/admin/settings"
				      "?guide=1&c=%s\">",
				      addColl);
		return g_httpServer.sendDynamicPage (s,
						     p.getBufStart(),
						     p.length());
	}


	// print standard header
	g_pages.printAdminTop ( &p , s , r , NULL, 
				"onload=document."
				"getElementById('acbox').focus();");


	if ( g_errno ) {
		msg = mstrerror( g_errno );
	}

	if ( msg && ! guide ) {
		const char *cc = "deleting";
		if ( add ) cc = "adding";
		p.safePrintf (
			  "<center>\n"
			  "<font color=red>"
			  "<b>Error %s collection: %s. "
			  "See log file for details.</b>"
			  "</font>"
			  "</center><br>\n",cc,msg);
	}

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	if ( add && guide )
		printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg );



	// print the add collection box
	if ( add /*&& (! nc[0] || g_errno ) */ ) {

		const char *t1 = "Add Collection";
		if ( guide ) t1 = "Add Search Engine";

		p.safePrintf (
			  "<center>\n<table %s>\n"
			   "<tr class=hdrow><td colspan=2>"
			  "<center><b>%s</b></center>"
			  "</td></tr>\n"
			  ,TABLE_STYLE
			  ,t1
			      );
		const char *t2 = "collection";
		if ( guide ) t2 = "search engine";
		const char *str = addColl;
		if ( ! addColl ) str = "";
		p.safePrintf (
			      "<tr bgcolor=#%s>"
			      "<td><b>name of new %s to add</td>\n"
			      "<td><input type=text name=addcoll size=30 "
			      "id=acbox "
			      "value=\"%s\">"
			      "</td></tr>\n"
			      , LIGHT_BLUE
			      , t2 
			      , str
			      );

		// don't show the clone box if we are under gigabot the guide
		if ( ! guide )
			p.safePrintf(
				     "<tr bgcolor=#%s>"
				     "<td><b>clone settings from this "
				     "collection</b>"
				     "<br><font size=1>Copy settings from "
				     "this pre-existing collection. Leave "
				     "blank to "
				     "accept default values.</font></td>\n"
				     "<td><input type=text name=clonecoll "
				     "size=30>"
				     "</td>"
				     "</tr>"
				     , LIGHT_BLUE
				     );

		// collection pwds
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection passwords"
			     "</b>"
			     "<br><font size=1>List of white space separated "
			     "passwords allowed to adminster collection."
			     "</font>"
			     "</td>\n"
			     "<td><input type=text name=collpwd "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		// ips box for security
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection ips"
			     "</b>"

			     "<br><font size=1>List of white space separated "
			     "IPs allowed to adminster collection."
			     "</font>"

			     "</td>\n"
			     "<td><input type=text name=collips "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		// now list collections from which to copy the config
		//p.safePrintf (
		//	  "<tr><td><b>copy configuration from this "
		//	  "collection</b><br><font size=1>Leave blank to "
		//	  "accept default values.</font></td>\n"
		//	  "<td><input type=text name=cpc value=\"%s\" size=30>"
		//	  "</td></tr>\n",coll);
		p.safePrintf ( "</table></center><br>\n");

		// wrap up the form started by printAdminTop
		g_pages.printAdminBottom ( &p );
		int32_t bufLen = p.length();
		return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
	}

	// if we added a collection, print its page
	//if ( add && nc[0] && ! g_errno ) 
	//	return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH ,
	//					  nc , pwd );

	if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip;

	// print all collections out in a checklist so you can check the
	// ones you want to delete, the values will be the id of that collectn
	p.safePrintf (
		  "<center>\n<table %s>\n"
		  "<tr class=hdrow><td><center><b>Delete Collections"
		  "</b></center></td></tr>\n"
		  "<tr bgcolor=#%s><td>"
		  "<center><b>Select the collections you wish to delete. "
		  //"<font color=red>This feature is currently under "
		  //"development.</font>"
		  "</b></center></td></tr>\n"
		  "<tr bgcolor=#%s><td>"
		  // table within a table
		  "<center><table width=20%%>\n",
		  TABLE_STYLE,
		  LIGHT_BLUE,
		  DARK_BLUE
		      );

	for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
		CollectionRec *cr = g_collectiondb.m_recs[i];
		if ( ! cr ) continue;
		p.safePrintf (
			  "<tr bgcolor=#%s><td>"
			  "<input type=checkbox name=delcoll value=\"%s\"> "
			  "%s</td></tr>\n",
			  DARK_BLUE,
			  cr->m_coll,cr->m_coll);
	}
	p.safePrintf( "</table></center></td></tr></table><br>\n" );
skip:
	// wrap up the form started by printAdminTop
	g_pages.printAdminBottom ( &p );
	int32_t bufLen = p.length();
	return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
}
// . displays the stats for a username
// . show stats for every day we have them for
// . in a big list
// . if they click the day display all docids evaluated for that day
// . show the accuracy for that day too
// . how many docs they edited
// . how many of those docs were verified by another
// . and if there was consensus
void gotTransdbList ( State60 *st ) {

	// get today's time range
	time_t now = getTimeGlobal();
	// get start of today
	time_t dayStart = now / (24*3600);

	SafeBuf sb;

	// int16_tcut
	TcpSocket *s = st->m_s;

	// make about 200k of mem to write into
	if ( ! sb.reserve ( 200000 ) ) 
		return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno));

	// print description so they can clikc a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");
	// print the content
	sb.safePrintf("<center><font size=4><blink>"
		      "<b><a href=\"/pageturk?c=%s&edit=1\">"
		      "Click here to start editing.</a></b></blink>"
		      "</font><br><i>Please take your "
		      "time to read the information below before you begin"
		      "</i><br><font color=\"red\" size=2> Warning: Adult "
		      "content might be presented to you."
		      " You should be above 18 years of age to continue."
		      "</center></font>",st->m_coll);

	sb.safePrintf("<font face=arial,sans-serif color=black size=3>"
		      "<p>By clicking <i>Start Voting</i>, you will be "
		       "presented with an interface for editing events. "
		      "The editor will display a modified web page that "
		      "contains one or more events. Each event's description "
		      "will be highlight with a blue background. You can "
		      "toggle whether a particular event is displayed by "
		      "clicking on that event's ID. You can highlight one or "
		      "multiple event descriptions at the same time. "
		      "</p><p>"
		      "By clicking on the section icons in the web page you "
		      "can tell the editor that a virtual fence should be "
		      "erected around that section. The fence will make sure "
		      "that event descriptions can not span across it. Each "
		      "event description must be fully contained either "
		      "inside or outside the fence. However, you can also "
		      "declare a section as a title section, which means that "
		      "the text that the title section contains is free to be "
		      "used by any event description."
		      "</p>\n"
		      "<p>When you are done erecting section fences, you "
		      "submit your changes. The more changes you make the "
		      "more points you earn. Other users may evaluate " 
		      "your edits for accuracy. You will be paid based on the "
		      "points you earn as well as your accuracy. All "
		      "transactions are listed in the table below.</p>"
		      "<p>You may not change your username or password "
		      "but you can change your email address. Your email "
		      "address will be used to pay you with PayPal every "
		      "Friday. Paypal fees will be deducted on your end. By "
		      "using this service you agree to all stated Terms & "
		      "Conditions.</p>"
		      "</font>\n");

	// get the user record
	User *uu = g_users.getUser ( username );
	// print out their info, like paypal email
	sb.safePrintf("<table>\n"
		      "<tr><td colspan=10><center>Your Info</center>"
		      "</td></tr>\n"
		      "<tr>"
		      "<td>Email</td>"
		      "<td><input type=text value=%s></td>"
		      "<td>email address used to pay with paypal</td>"
		      "</tr>\n"
		      "<tr><td colspan=10><input type=submit value=update>"
		      "</td></tr>\n"
		      "</table>\n" ,
		      uu->m_payPalEmail );

	// print your stats here now
	sb.safePrintf("<table>\n"
		      "<tr><td colspan=10><center>Your Stats</center>"
		      "</td></tr>\n"
		      "<tr>"
		      "<td>date</td>"
		      "<td>action</td>"
		      "<td>amount</td>"
		      "<td>desc</td>"
		      "</tr>\n");

	// int16_tcut
	RdbList *list = &st->m_list;

	int32_t lastDay        = -1;
	int32_t totalReceives  = 0;
	int32_t totalSubmits   = 0;
	int32_t totalPasses    = 0;
	int32_t totalFails     = 0;

	// scan the list
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *rec      = list->getCurrentRecord();
		char *data     = list->getCurrentData();
		int32_t  dataSize = list->getCurrentDataSize();
		// skip that
		list->skipCurrentRecord();
		// skip if negative
		if ( (rec[0] & 0x01) == 0x00 ) continue;
		// get the time (global time - sync'd with host #0)
		time_t tt = g_transdb.getTimeStamp ( rec );
		// get day #
		int32_t daynum = tt / (24*3600);
		// is it today?
		bool isToday = ( daynum >= dayStart );
		// point to the Transaction
		Trans *trans = (Trans *)data;
		// if is today, print it out verbatim
		if ( isToday ) {
			// print it in html row format to match table above
			//printTrans ( &sb , rec );
			sb.safePrintf("<tr>");
			// make it into a nice date
			time_t dd = lastDay * 86400;
			struct tm *timeStruct = localtime ( &dd );
			char ppp[100];
			strftime(ppp,100,"%H:%M:%S",timeStruct);
			// print last days stats first
			sb.safePrintf("<td>%s</td>",ppp);
			// then stats
			if ( trans->m_actionType == AT_RECEIVE_DOC )
				sb.safePrintf("<td>receive</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_SUBMIT_DOC )
				sb.safePrintf("<td>submit</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_PASS_DOC )
				sb.safePrintf("<td>verify</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64" was verified "
					      "by user=\"%s\"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId,
					      trans->m_desc);
			else if ( trans->m_actionType == AT_FAIL_DOC )
				sb.safePrintf("<td>verify</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64" was deemed to "
					      "be incorrect "
					      "by user=\"%s\"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId,
					      trans->m_desc);
			else if ( trans->m_actionType == AT_ACCURACY_EVAL)
				sb.safePrintf("<td>accuracy eval</td>"
					      "<td>%.02f</td>"
					      "<td>docid=%"UINT64"</td>",
					      trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_CHARGE)
				sb.safePrintf("<td>credit</td>"
					      "<td>%.02f</td>"
					      "<td>You made money.</td>",
					      trans->m_number);
			else if ( trans->m_actionType == AT_PAYMENT)
				sb.safePrintf("<td>payment</td>"
					      "<td>%.02f</td>"
					      "<td>We paid you.</td>",
					      trans->m_number);
			else if ( trans->m_actionType == AT_LOGIN)
				sb.safePrintf("<td>login</td>"
					      "<td>-</td>"
					      "<td>You logged in.</td>");
			else if ( trans->m_actionType == AT_LOGOUT)
				sb.safePrintf("<td>logout</td>"
					      "<td>-</td>"
					      "<td>You logged out.</td>");
			else if ( trans->m_actionType == AT_AUTO_LOGOUT)
				sb.safePrintf("<td>logout</td>"
					      "<td>-</td>"
					      "<td>You were auto "
					      "logged out.</td>");
			else {
				char *xx=NULL;*xx=0; }
			sb.safePrintf("</tr>\n");
			continue;
		}
		// if does not match last day, print out that last day's stats
		// and reset for next guy
		if ( daynum != lastDay && lastDay != -1 ) {
			// make it into a nice date
			time_t dd = lastDay * 86400;
			struct tm *timeStruct = localtime ( &dd );
			char ppp[100];
			strftime(ppp,100,"%b-%d-%Y",timeStruct);
			// print last days stats first
			sb.safePrintf("<td>%s</td>",ppp);
			// then stats
			sb.safePrintf("<tr>"
				      "<td>receive</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total received</td>"
				      "</tr>\n",
				      totalReceives);
			sb.safePrintf("<tr>"
				      "<td>submit</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total submitted</td>"
				      "</tr>\n",
				      totalSubmits);
			sb.safePrintf("<tr>"
				      "<td>pass</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total accuracy tests passed</td>"
				      "</tr>\n",
				      totalPasses);
			sb.safePrintf("<tr>"
				      "<td>fail</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total accuracy tests failed</td>"
				      "</tr>\n",
				      totalFails);
			// reset as well
			totalReceived = 0;
			totalSubmits  = 0;
			totalPasses   = 0;
			totalFails    = 0;
		}
		// remember last day # we processed for accumulating stats
		lastDay = daynum;
		// accum stats
		if ( trans->m_actionType == AT_RECEIVE_DOC )
			totalReceives++;
		if ( trans->m_actionType == AT_SUBMIT_DOC )
			totalSubmits++;
		if ( trans->m_actionType == AT_PASS_DOC )
			totalPasses++;
		if ( trans->m_actionType == AT_FAIL_DOC )
			totalFails++;
	}

	sb.safePrintf("</body></html>\n");

	sendReply ( &sb );
}
Exemplo n.º 29
0
// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageParser2 ( TcpSocket   *s , 
		       HttpRequest *r ,
		       State8      *st ,
		       long long    docId ,
		       Query       *q ,
		       // in query term space, not imap space
		       long long   *termFreqs       ,
		       // in imap space
		       float       *termFreqWeights ,
		       // in imap space
		       float       *affWeights      ,
		       void        *state ,
		       void       (* callback)(void *state) ) {

	//log("parser: read sock=%li",s->m_sd);

	// might a simple request to addsomething to validated.*.txt file
	// from XmlDoc::print() or XmlDoc::validateOutput()
	char *add = r->getString("add",NULL);
	//long long uh64 = r->getLongLong("uh64",0LL);
	char *uh64str = r->getString("uh64",NULL);
	//char *divTag = r->getString("div",NULL);
	if ( uh64str ) {
		// convert add to number
		long addNum = 0;
		if ( to_lower_a(add[0])=='t' ) // "true" or "false"?
			addNum = 1;
		// convert it. skip beginning "str" inserted to prevent
		// javascript from messing with the long long since it
		// was rounding it!
		//long long uh64 = atoll(uh64str);//+3);
		// urldecode that
		//long divTagLen = gbstrlen(divTag);
		//long newLen  = urlDecode ( divTag , divTag , divTagLen );
		// null term?
		//divTag[newLen] = '\0';
		// do it. this is defined in XmlDoc.cpp
		//addCheckboxSpan ( uh64 , divTag , addNum );
		// make basic reply
		char *reply;
		reply = "HTTP/1.0 200 OK\r\n"
			"Connection: Close\r\n";
		// that is it! send a basic reply ok
		bool status = g_httpServer.sendDynamicPage( s , 
							    reply,
							    gbstrlen(reply),
							    -1, //cachtime
							    false ,//postreply?
							    NULL, //ctype
							    -1 , //httpstatus
							    NULL,//cookie
							    "utf-8");
		return status;
	}

	// make a state
	if (   st ) st->m_freeIt = false;
	if ( ! st ) {
		try { st = new (State8); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("PageParser: new(%i): %s", 
			    (int)sizeof(State8),mstrerror(g_errno));
			return g_httpServer.sendErrorReply(s,500,
						       mstrerror(g_errno));}
		mnew ( st , sizeof(State8) , "PageParser" );
		st->m_freeIt = true;
	}
	// msg3b uses this to get a score from the query
	st->m_state           = state;
	st->m_callback        = callback;
	st->m_q               = q;
	st->m_termFreqs       = termFreqs;
	st->m_termFreqWeights = termFreqWeights;
	st->m_affWeights      = affWeights;
	//st->m_total           = (score_t)-1;
	st->m_indexCode       = 0;
	st->m_blocked         = false;
	st->m_didRootDom      = false;
	st->m_didRootWWW      = false;
	st->m_wasRootDom      = false;
	st->m_u               = NULL;
	st->m_recompute       = false;
	//st->m_url.reset();

	// do not allow more than one to be launched at a time if in 
	// a quickpoll. will cause quickpoll in quickpoll.
	g_inPageParser = true;

	// password, too
	long pwdLen = 0;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen]='\0';

	// save socket ptr
	st->m_s = s;
	st->m_r.copy ( r );
	// get the collection
	char *coll    = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
	if ( st->m_collLen > MAX_COLL_LEN )
		return sendErrorReply ( st , ENOBUFS );
	if ( ! coll )
		return sendErrorReply ( st , ENOCOLLREC );
	strcpy ( st->m_coll , coll );

	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 ) 
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
	// default to 0 if not provided
	st->m_hopCount = r->getLong("hc",0);
	//long  ulen    = 0;
	//char *u     = r->getString ( "u" , &ulen     , NULL /*default*/);
	long  old     = r->getLong   ( "old", 0 );
	// set query
	long qlen;
	char *qs = r->getString("q",&qlen,NULL);
	if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );
	// url will override docid if given
	if ( ! st->m_u || ! st->m_u[0] ) 
		st->m_docId = r->getLongLong ("docid",-1);
	else    
		st->m_docId = -1;
	// set url in state class (may have length 0)
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);
	// should we recycle link info?
	st->m_recycle  = r->getLong("recycle",0);
	st->m_recycle2 = r->getLong("recycleimp",0);
	st->m_render   = r->getLong("render" ,0);
	// for quality computation... takes way longer cuz we have to 
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that IP
	// range.
	st->m_oips     = r->getLong("oips"    ,0);

	long  linkInfoLen  = 0;
	// default is NULL
	char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else st->m_linkInfoColl[0] = '\0';
	
	// set the flag in our SafeBuf class so that Words.cpp knows to show 
	// html or html source depending on this value
	st->m_xbuf.m_renderHtml = st->m_render;

	// should we use the old title rec?
	st->m_old    = old;
	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();
 	//no more setting the default root quality to 30, instead if we do not
 	// know it setting it to -1
 	st->m_rootQuality=-1;







	// header
	SafeBuf *xbuf = &st->m_xbuf;
	xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
			 "content=\"text/html; charset=utf-8\">\n");

	// print standard header
	g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r );


	// print the standard header for admin pages
	char *dd     = "";
	char *rr     = "";
	char *rr2    = "";
	char *render = "";
	char *oips   = "";
	char *us     = "";
	if ( st->m_u && st->m_u[0] ) us = st->m_u;
	//if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn );
	if ( st->m_old ) dd = " checked";
	if ( st->m_recycle            ) rr     = " checked";
	if ( st->m_recycle2           ) rr2    = " checked";
	if ( st->m_render             ) render = " checked";
	if ( st->m_oips               ) oips   = " checked";

	xbuf->safePrintf(
			 "<style>"
			 ".poo { background-color:#%s;}\n"
			 "</style>\n" ,
			 LIGHT_BLUE );


	long clen;
	char *contentParm = r->getString("content",&clen,"");
	
	// print the input form
	xbuf->safePrintf (
		       "<style>\n"
		       "h2{font-size: 12px; color: #666666;}\n"

		       ".gbtag { border: 1px solid gray;"
		  	"background: #ffffef;display:inline;}\n"
		  	".gbcomment { border: 1px solid gray;"
		  	"color: #888888; font-style:italic; "
		  	"background: #ffffef;display:inline;}\n"

		       ".token { border: 1px solid gray;"
		       "background: #f0ffff;display:inline;}\n"
		       ".spam { border: 1px solid gray;"
		       "background: #af0000;"
		       "color: #ffffa0;}"
		       ".hs {color: #009900;}"
		       "</style>\n"
		       "<center>"

		  "<table %s>"

			  "<tr><td colspan=5><center><b>"
			  "Parser"
			  "</b></center></td></tr>\n"

		  "<tr class=poo>"
		  "<td>"
		  "<b>url</b>"
			  "<br><font size=-2>"
			  "Type in <b>FULL</b> url to parse."
			  "</font>"
		  "</td>"

		  "</td>"
		  "<td>"
		  "<input type=text name=u value=\"%s\" size=\"40\">\n"
		  "</td>"
		  "</tr>"


			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Parser version to use: "
		  "</td>"
		  "<td>"
		  "<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
		  "</td>"
		  "<td>"
		  "(-1 means to use latest title rec version)<br>"
		  "</td>"
		  "</tr>"
			  */

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Hop count to use: "
		  "</td>"
		  "<td>"
		  "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> "
		  "</td>"
		  "<td>"
		  "(-1 is unknown. For root urls hopcount is always 0)<br>"
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
			  "<b>use cached</b>"

			  "<br><font size=-2>"
			  "Load page from cache (titledb)?"
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=checkbox name=old value=1%s> "
		  "</td>"
		  "</tr>"

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Reparse root:"
		  "</td>"
		  "<td>"
		  "<input type=checkbox name=artr value=1%s> "
		  "</td>"
		  "<td>"
		  "Apply selected ruleset to root to update quality"
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
		  "<b>recycle link info</b>"

			  "<br><font size=-2>"
			  "Recycle the link info from the title rec"
			  "Load page from cache (titledb)?"
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=checkbox name=recycle value=1%s> "
		  "</td>"
		  "</tr>"

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Recycle Link Info Imported:"
		  "</td>"
		  "<td>"
		  "<input type=checkbox name=recycleimp value=1%s> "
		  "</td>"
		  "<td>"
		  "Recycle the link info imported from other coll"
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
		  "<b>render html</b>"

			  "<br><font size=-2>"
			  "Render document content as HTML"
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=checkbox name=render value=1%s> "
		  "</td>"
		  "</tr>"

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Lookup outlinks' ruleset, ips, quality:"
		  "</td>"
		  "<td>"
		  "<input type=checkbox name=oips value=1%s> "
		  "</td>"
		  "<td>"
		  "To compute quality lookup IP addresses of roots "
		  "of outlinks."
		  "</td>"
		  "</tr>"

		  "<tr class=poo>"
		  "<td>"
		  "LinkInfo Coll:"
		  "</td>"
		  "<td>"
		  "<input type=text name=\"oli\" size=\"10\" value=\"\"> "
		  "</td>"
		  "<td>"
		  "Leave empty usually. Uses this coll to lookup link info."
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
		  "<b>optional query</b>"

			  "<br><font size=-2>"
			  "Leave empty usually. For title generation only."
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=text name=\"q\" size=\"20\" value=\"\"> "
		  "</td>"
		       "</tr>",

			  TABLE_STYLE,
			  us ,
			   dd,
			   rr, 
			   render
			  );

	xbuf->safePrintf(
		  "<tr class=poo>"
			  "<td>"
			  "<b>content type below is</b>"
			  "<br><font size=-2>"
			  "Is the content below HTML? XML? JSON?"
			  "</font>"
			  "</td>"

		  "<td>"
		       //"<input type=checkbox name=xml value=1> "
		       "<select name=ctype>\n"
		       "<option value=%li selected>HTML</option>\n"
		       "<option value=%li selected>XML</option>\n"
		       "<option value=%li selected>JSON</option>\n"
		       "</select>\n"

		  "</td>"
		       "</tr>",
		       (long)CT_HTML,
		       (long)CT_XML,
		       (long)CT_JSON
			  );

	xbuf->safePrintf(

			  "<tr class=poo>"
			  "<td><b>content</b>"
			  "<br><font size=-2>"
			  "Use this content for the provided <i>url</i> "
			  "rather than downloading it from the web."
			  "</td>"

			  "<td>"
			  "<textarea rows=10 cols=80 name=content>"
			  "%s"
			  "</textarea>"
			  "</td>"
			  "</tr>"

		  "</table>"
		  "</center>"
		  "</form>"
		  "<br>",

			  //oips ,
			  contentParm );



	xbuf->safePrintf(
			 "<center>"
			 "<input type=submit value=Submit>"
			 "</center>"
			 );


	// just print the page if no url given
	if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );


	XmlDoc *xd = &st->m_xd;
	// set this up
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url,st->m_u);
	long firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser = 1;
	sreq.m_hopCount = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32 = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	long  contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content    = r->getUnencodedContent    ();
		contentLen = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	uint8_t contentType = CT_HTML;
	if ( r->getBool("xml",0) ) contentType = CT_XML;

	contentType = r->getLong("ctype",contentType);//CT_HTML);


	// if facebook, load xml content from title rec...
	bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
	if ( isFacebook && ! content ) {
		long long docId = g_titledb.getProbableDocId(st->m_u);
		sprintf(sreq.m_url ,"%llu", docId );
		sreq.m_isPageReindex = true;
	}

	// hack
	if ( content ) {
		st->m_dbuf.purge();
		st->m_dbuf.safeStrcpy(content);
		//char *data = strstr(content,"\r\n\r\n");
		//long dataPos = 0;
		//if ( data ) dataPos = (data + 4) - content;
		//st->m_dbuf.convertJSONtoXML(0,dataPos);
		//st->m_dbuf.decodeJSON(0);
		content = st->m_dbuf.getBufStart();
	}

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq       ,
			  NULL        ,
			  st->m_coll  ,
			  &st->m_wbuf        ,
			  0 ,//PP_NICENESS ))
			  content ,
			  false, // deletefromindex
			  0, // forced ip
			  contentType ))
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );
	// make this our callback in case something blocks
	xd->setCallback ( st , processLoop );
	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	if ( st->m_recycle ) xd->m_recycleContent = true;

	return processLoop ( st );
}
bool sendReply ( void *state ) {
	StateCatdb *st = (StateCatdb*)state;
	// check for error
	if (g_errno) {
		if (st->m_catLookup)
			log("PageCatdb: Msg8b had error getting Site Rec: %s",
			    mstrerror(g_errno));
		else
			log("PageCatdb: Msg2a had error generating Catdb: %s",
			    mstrerror(g_errno));
		st->m_catLookup = false;
		g_errno = 0;
	}
	long long endTime = gettimeofdayInMilliseconds();
	// page buffer
	SafeBuf sb;
	sb.reserve(64*1024);
	// . print standard header
	// . do not print big links if only an assassin, just print host ids
	g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );

	sb.safePrintf(
		      "<style>"
		      ".poo { background-color:#%s;}\n"
		      "</style>\n" ,
		      LIGHT_BLUE );


	sb.safePrintf ( "<table %s>"
			"<tr><td colspan=2>"
			"<center><font size=+1><b>Catdb</b></font></center>"
			"</td></tr>", TABLE_STYLE );

	// instructions
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td colspan=3>"
		      "<font size=-2>"
		      "<center>"
		      "Don't just start using this, you need to follow the "
		      "instructions in the <i>admin guide</i> for adding "
		      "DMOZ support."
		      "</center>"
		      "</font>"
		      "</td>"
		      "</tr>"
		      ,DARK_BLUE
		      );

	// print the generate Catdb link
	sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
			"Update Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	sb.safePrintf ( "<tr class=poo>"
			"<td>Generate New Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
			"Generate Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	if (st->m_genCatdb)
		sb.safePrintf ( "<tr class=poo>"
				"<td> Catdb Generation took %lli ms."
				"</td></tr>",
				endTime - st->m_startTime );
	// print Url Catgory Lookup
	sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>"
			"<td><input type=text name=caturl size=80"
			" value=\"");
	if (st->m_catLookup) {
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
	}
	sb.safePrintf("\"></center></td></tr>" );
	// print Url Info if Lookup was done
	if (st->m_catLookup) {
		sb.safePrintf("<tr><td>");
		// print the url
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
		sb.safePrintf(" (%lli ms)</td><td>",
				endTime - st->m_startTime );
		// print each category id and path
		for (long i = 0; i < st->m_catRec.m_numCatids; i++) {
			sb.safePrintf("<b>[%li] ",
					st->m_catRec.m_catids[i]);
			g_categories->printPathFromId(&sb,
					st->m_catRec.m_catids[i]);
			sb.safePrintf("</b><br>");
			// lookup title and summary
			char  title[1024];
			long  titleLen = 0;
			char  summ[4096];
			long  summLen = 0;
			char  anchor[256];
			unsigned char anchorLen = 0;
			g_categories->getTitleAndSummary(
					st->m_url.getUrl(),
					st->m_url.getUrlLen(),
					st->m_catRec.m_catids[i],
					title,
					&titleLen,
					1023,
					summ,
					&summLen,
					4098,
					anchor,
					&anchorLen,
					255 );
			title[titleLen] = '\0';
			summ[summLen] = '\0';
			anchor[anchorLen] = '\0';
			// print title and summary
			sb.safePrintf("<b>Title:</b> %s<br>"
					"<b>Summary:</b> %s<br>",
					title, summ);
			if (anchorLen > 0)
				sb.safePrintf("<b>Anchor:</b> %s<br>",
						anchor);
			sb.safePrintf("<br>");
		}
		sb.safePrintf("<b>Filenum:</b> %li<br>",
				st->m_catRec.m_filenum);
		// print indirect catids
		if (st->m_catRec.m_numIndCatids > 0) {
			sb.safePrintf("<hr><b>Indirect Catids [%li]:"
					"</b><br>\n",
					st->m_catRec.m_numIndCatids );
			for (long i = 0;
				  i < st->m_catRec.m_numIndCatids; i++) {
				sb.safePrintf("%lu<br>",
					st->m_catRec.m_indCatids[i]);
			}
		}
		sb.safePrintf("</td></tr>");
	}
	// end it
	sb.safePrintf ( "</center></td></tr></table>" );
	// print submit button
	sb.safePrintf ( "<br><center>"
			"<input type=submit value=\"Submit\" border=0>"
			"</form></center>" );

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// extract the socket
	TcpSocket *s = st->m_socket;
	// clear the state
	mdelete ( st, sizeof(StateCatdb), "PageCatdb" );
	delete st;
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length());
}