// . serialize this Msg20Reply into a newly allocated buffer and send it
//   back over the udp slot stored in "xd"
// . if g_errno is set on entry, or the buffer allocation fails, an error
//   reply is sent instead
// . "xd" is always deleted in here, so the caller must not touch it again
// . always returns true
bool Msg20Reply::sendReply ( XmlDoc *xd ) {

	// grab the slot now, "xd" is deleted before we send anything
	UdpSlot *slot = (UdpSlot *)xd->m_slot;

	// note the error we were called with, if any
	if ( g_errno )
		log("query: Had error generating msg20 reply for d=%lli: "
		    "%s",m_docId, mstrerror(g_errno));

	// now create a buffer to store title/summary/url/docLen and send back
	long  need = 0;
	char *buf  = NULL;
	if ( ! g_errno ) {
		need = getStoredSize();
		buf  = (char *)mmalloc ( need , "Msg20Reply" );
	}

	// had an error on entry, or the allocation above failed
	if ( ! buf ) {
		// preserve g_errno across the delete so the error we report
		// is the one that got us here (doneInjectingWrapper10 guards
		// its XmlDoc delete the same way)
		long err = g_errno;
		mdelete ( xd, sizeof(XmlDoc) , "Msg20" );
		delete ( xd );
		g_errno = err;
		g_udpServer.sendErrorReply ( slot , g_errno ) ;
		return true;
	}

	// should never have an error!
	long used = serialize ( buf , need );

	// sanity, we must have filled exactly what getStoredSize() promised
	if ( used != need ) { char *xx=NULL;*xx=0; }

	// use blue for our color
	long color = 0x0000ff;
	// but use dark blue for niceness > 0
	if ( xd->m_niceness > 0 ) color = 0x0000b0;

	// sanity check, content must be valid by now
	if ( ! xd->m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
	// for records. the check above guarantees the size is valid.
	long clen = xd->size_utf8Content - 1;
	// show it in performance graph
	if ( xd->m_startTimeValid )
		g_stats.addStat_r ( clen                         ,
				    xd->m_startTime              ,
				    gettimeofdayInMilliseconds() ,
				    color                        );

	// . del the doc at this point, we've copied all the data into reply
	mdelete ( xd , sizeof(XmlDoc) , "xd20" );
	delete ( xd );

	// "buf" is passed both as the reply and as the allocation
	g_udpServer.sendReply_ass ( buf , need , buf , need , slot );

	return true;
}
// . release the Msg5 helpers, the reply buffer and the multicast buffer
// . leaves m_handyList alone on purpose, see the note at the bottom
void Msg0::reset ( ) {
	// only destroy the Msg5s whose delete flag is set
	if ( m_deleteMsg5 && m_msg5 ) {
		mdelete ( m_msg5 , sizeof(Msg5) , "Msg0" );
		delete ( m_msg5 );
	}
	if ( m_deleteMsg5b && m_msg5b ) {
		mdelete ( m_msg5b , sizeof(Msg5) , "Msg0" );
		delete ( m_msg5b );
	}
	// either way, forget the pointers
	m_msg5b = NULL;
	m_msg5  = NULL;

	// free the reply buffer, if any
	if ( m_replyBuf )
		mfree ( m_replyBuf, m_replyBufSize, "Msg0" );
	m_replyBufSize = 0;
	m_replyBuf     = NULL;

	// . do NOT call m_handyList.freeList() here. reset() is called after
	//   the msg5 completes and that was destroying the handylist, so it
	//   is freed in the destructor now.

	// and finally the multicast buffer
	if ( ! m_mcasts ) return;
	mfree(m_mcasts,sizeof(Multicast),"msg0mcast");
	m_mcasts = NULL;
}
void gotMulticastReplyWrapper ( void *state , void *state2 ) {

	Multicast *mcast = (Multicast *)state;
	//msg7->gotMsg7Reply();

	ImportState *is = mcast->m_importState;

	is->m_numIn++;

	log("import: imported %lli docs (off=%lli)",
	    is->m_numIn,is->m_fileOffset);

	if ( ! is->importLoop() ) return;

	// we will be called again when this multicast reply comes in...
	if ( is->m_numIn < is->m_numOut ) return;

	log("inject: import is done");

	CollectionRec *cr = g_collectiondb.getRec ( is->m_collnum );
	// signify to qa.cpp that we are done
	if ( cr ) cr->m_importState = NULL;

	mdelete ( is, sizeof(ImportState) , "impstate");
	delete (is);
}
// . send the page in "sb" back on the socket held by the State60 "st"
// . "st" is not declared in this function, it comes from an enclosing scope
// . "st" is deleted in here
// . returns the status of HttpServer::sendDynamicPage()
// . NOTE(review): "sb" is read after "st" is deleted. if "sb" lives inside
//   of "st" that is a use-after-free -- confirm against the caller.
bool sendReply ( SafeBuf *sb ) {
	// grab the socket before the state goes away
	TcpSocket *sock = st->m_s;
	// nuke state60
	mdelete ( st , sizeof(State60) , "turk1" );
	delete (st);
	// the page to send back
	char    *page    = sb->getBufStart();
	int32_t  pageLen = sb->length();
	// do not send a trailing \0 if the length happens to include one
	if ( pageLen > 0 && page[pageLen-1] == '\0' ) pageLen--;
	// send it and convey the status
	return g_httpServer.sendDynamicPage ( sock ,
					      page ,
					      pageLen ,
					      -1 , // cachetime
					      false , // POSTReply?
					      "text/html" ,
					      -1 , // httpstatus
					      NULL , // cookie
					      "utf8" ); // charset
}
// . free the m_buf1 buffer of "st", if any, and mdelete() "st"
// . a NULL "st" is ignored
// . NOTE(review): elsewhere mdelete() is followed by a delete of the
//   object, but not here. confirm that the caller deletes "st".
void Blaster::freeStateBD(StateBD *st){
	if ( st ) {
		// Free stateBD's buf
		if ( st->m_buf1 )
			mfree ( st->m_buf1 , st->m_buf1MaxLen , "Blaster5" );
		mdelete ( st , sizeof(StateBD) , "Blaster3" );
	}
}
// when XmlDoc::inject() complets it calls this
void doneInjectingWrapper10 ( void *state ) {
	XmlDoc *xd = (XmlDoc *)state;
	UdpSlot *slot = (UdpSlot *)xd->m_slot;
	long err = g_errno;
	mdelete ( xd, sizeof(XmlDoc) , "PageInject" );
	delete (xd);
	g_errno = err;
	sendReply ( slot );
}
// when XmlDoc::inject() complets it calls this
void doneInjectingWrapper10 ( void *state ) {
	XmlDoc *xd = (XmlDoc *)state;
	UdpSlot *slot = (UdpSlot *)xd->m_slot;
	int32_t err = g_errno;
	mdelete ( xd, sizeof(XmlDoc) , "PageInject" );
	delete (xd);
	g_errno = err;
	if ( g_errno ) g_udpServer.sendErrorReply(slot,g_errno);
	else           g_udpServer.sendReply_ass(NULL,0,NULL,0,slot);
}
// . delete every CollectionRec we hold and zero out the counts
void Collectiondb::reset() {
	log("db: resetting collectiondb.");
	long i = 0;
	while ( i < m_numRecs ) {
		// empty slots are skipped
		if ( m_recs[i] ) {
			mdelete ( m_recs[i], sizeof(CollectionRec),
				  "CollectionRec" );
			delete ( m_recs[i] );
			m_recs[i] = NULL;
		}
		i++;
	}
	m_numRecsUsed = 0;
	m_numRecs     = 0;
}
// . send back a reply to the originator of the msg7 injection request
// . "state" is the XmlDoc that was injected. it is unlinked from the
//   s_injectHead/s_injectTail list and deleted in here.
// . on success the reply is the 4 byte index code followed by the 8 byte
//   docid, stored in the slot's m_tmpBuf
void sendUdpReply7 ( void *state ) {

	XmlDoc *xd = (XmlDoc *)state;

	// unlink from the neighbors
	if ( xd->m_nextInject )
		xd->m_nextInject->m_prevInject = xd->m_prevInject;
	if ( xd->m_prevInject )
		xd->m_prevInject->m_nextInject = xd->m_nextInject;
	// fix up the list ends if we were one of them
	if ( s_injectHead == xd ) s_injectHead = xd->m_nextInject;
	if ( s_injectTail == xd ) s_injectTail = xd->m_prevInject;
	xd->m_prevInject = NULL;
	xd->m_nextInject = NULL;

	// need this after "xd" is gone
	UdpSlot *slot = xd->m_injectionSlot;

	// a different color in the stats if we had an index code
	uint32_t statColor = xd->m_indexCode ? 0xaaddaa : 0xccffcc;
	g_stats.addStat_r ( xd->m_rawUtf8ContentSize,
			    xd->m_injectStartTime,
			    gettimeofdayInMilliseconds(),
			    statColor );

	// . injecting a warc seems to not set m_indexCodeValid to true
	//   for the container doc, so default to -1 and a docid of 0
	int32_t indexCode = -1;
	if ( xd->m_indexCodeValid ) indexCode = xd->m_indexCode;
	int64_t docId = 0;
	if ( xd->m_docIdValid ) docId = xd->m_docId;

	// done with the doc
	mdelete ( xd, sizeof(XmlDoc) , "PageInject" );
	delete (xd);

	if ( g_errno ) {
		g_udpServer.sendErrorReply(slot,g_errno);
		return;
	}

	// . the index code is 0 on success, otherwise it is the errno
	// . then comes the docid
	char *tmp = slot->m_tmpBuf;
	char *end = tmp;
	memcpy ( end , (char *)&indexCode , 4 );
	end += 4;
	memcpy ( end , (char *)&docId , 8 );
	end += 8;

	g_udpServer.sendReply_ass(tmp,(end-tmp),NULL,0,slot);
}
// . called when a scrape round has finished, "state" is the Msg7
// . appends the serps of this round to msg7->m_sb as xml
// . if this was round 1 the next scrape is launched via scrapeQuery(). if
//   that blocks we return and expect to be called again.
// . otherwise the xml is closed, sent on msg7->m_socket and the Msg7 is
//   deleted
void doneInjectingLinksWrapper ( void *state ) {
	Msg7    *msg7 = (Msg7 *)state;
	SafeBuf *sb   = &msg7->m_sb;
	// round 1 holds the google results, any other round is bing
	const bool firstRound = ( msg7->m_round == 1 );

	// copy the serps into our buf
	if ( ! g_errno ) {
		// print header of page if nothing is in there yet
		if ( sb->length() == 0 )
			sb->safePrintf("<?xml version=\"1.0\" "
				       "encoding=\"UTF-8\" ?>\n"
				       "<response>\n" );
		// serp header
		if ( firstRound ) sb->safePrintf("\t<googleResults>\n");
		else              sb->safePrintf("\t<bingResults>\n");
		// print results
		sb->safeMemcpy(&msg7->m_xd.m_serpBuf);
		// end that
		if ( firstRound ) sb->safePrintf("\t</googleResults>\n");
		else              sb->safePrintf("\t</bingResults>\n");
	}

	// do bing now, return if it blocks
	if ( firstRound && ! msg7->scrapeQuery() ) return;

	// report the error, if any
	if ( g_errno )
		sb->safePrintf("<error><![CDATA[%s]]></error>\n",
			       mstrerror(g_errno));
	// close the page
	sb->safePrintf("</response>\n");

	// send it off
	TcpSocket *sock = msg7->m_socket;
	g_httpServer.sendDynamicPage ( sock,
				       sb->getBufStart(),
				       sb->length(),
				       -1/*cachetime*/);
	// hopefully sb buffer is copied because this will free it:
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
}
Example #11
0
/*******************************************************************
  Part of Sub-level Kurtosis Calculate:
     sum(Zjk*ones(1,p).*(data_proj))./sum(Zjk)

  Multiplies mZjk and data element-wise (mmDotMul), sums the product
  with msum(..., 'c', ...) into meanZjk and divides every entry of
  meanZjk by sumZjk.
*******************************************************************/
void kurtmodel(matrix *mZjk, double sumZjk, matrix *data, vector *meanZjk)
{
    matrix weighted;   /* scratch matrix, same size as data */
    int k;

    mnew(&weighted, data->m, data->n);

    mmDotMul(mZjk, data, &weighted);
    msum(&weighted, 'c', meanZjk);

    /* normalize by the total weight */
    for (k = 0; k < meanZjk->l; k++)
        meanZjk->pr[k] /= sumZjk;

    mdelete(&weighted);
}
Example #12
0
/*
 * Insert n shuffled INDEX[] entries into the map, reshuffle, then delete
 * a random number of them: rand() % (n/2).
 *
 * When n/2 is 0 nothing is deleted. Previously that case evaluated
 * "rand() % 0", which is undefined behavior.
 */
static void
test(struct map *m , int n, int start) {
	int i;
	int half = n/2;
	int ndel = 0;

	init(n,start);
	shuffle(n);
	for (i=0;i<n;i++) {
		mnew(m,INDEX[i]);
	}

	shuffle(n);
	/* guard the modulo, see above */
	if (half != 0) {
		ndel = rand() % half;
	}
	for (i=0;i<ndel;i++) {
		mdelete(m,INDEX[i]);
	}
}
// . this must always be called sometime AFTER handleRequest() is called
// . "msg39" will be NULL if ENOMEM or something
// . if msg39 has a callback we were entered from a local call: invoke the
//   callback if we had blocked and return without sending a udp reply and
//   without deleting msg39, since the caller handles that
// . otherwise send "reply", or an error reply if g_errno is set, on
//   "slot" and delete msg39
void sendReply ( UdpSlot *slot , Msg39 *msg39 , char *reply , int32_t replyLen ,
		 int32_t replyMaxSize , bool hadError ) {
	// debug msg
	const bool debug = g_conf.m_logDebugQuery || (msg39&&msg39->m_debug);
	if ( debug )
		logf(LOG_DEBUG,"query: msg39: [%" PTRFMT "] "
		     "Sending reply len=%" INT32 ".",
		     (PTRTYPE)msg39,replyLen);

	// sanity, an error without g_errno set is a logic bug
	if ( hadError && ! g_errno ) { char *xx=NULL;*xx=0; }

	if ( msg39 ) {
		// no longer in use
		msg39->m_inUse = false;
		// . local call and not a udp slot? then return control to
		//   the caller.
		// . do not delete ourselves because we will be re-used
		//   probably and caller handles that now.
		if ( msg39->m_callback ) {
			// if we blocked call user callback
			if ( msg39->m_blocked )
				msg39->m_callback ( msg39->m_state );
			// not sending back a udp reply
			return;
		}
	}

	// clear g_errno before sending but remember what it was
	const int32_t err = g_errno;
	g_errno = 0;

	// send an error reply if g_errno was set
	if ( err )
		g_udpServer.sendErrorReply ( slot , err ) ;
	else
		g_udpServer.sendReply_ass ( reply        ,
					    replyLen     ,
					    reply        ,
					    replyMaxSize ,
					    slot         );

	// always delete ourselves when done handling the request
	if ( ! msg39 ) return;
	mdelete ( msg39 , sizeof(Msg39) , "Msg39" );
	delete (msg39);
}
Example #14
0
/*
 * Complex counterpart of kurtmodel(): multiplies mZjk (paired with a
 * freshly created imaginary part) with the complex data via cmmDotMul(),
 * sums the real and imaginary products with msum(..., 'c', ...) and
 * divides both result vectors by sumZjk.
 */
void ckurtmodel(matrix *mZjk, double sumZjk,
                matrix *data_re, matrix *data_im,
                vector *meanZjk_re, vector *meanZjk_im)
{
    matrix prod_re;    /* real part of the product */
    matrix prod_im;    /* imaginary part of the product */
    matrix weight_im;  /* imaginary part handed in alongside mZjk */
    int k;

    mnew(&prod_re, data_re->m, data_re->n);
    mnew(&prod_im, data_im->m, data_im->n);
    mnew(&weight_im, mZjk->m, mZjk->n);

    cmmDotMul(mZjk, &weight_im, data_re, data_im, &prod_re, &prod_im);

    msum(&prod_re, 'c', meanZjk_re);
    msum(&prod_im, 'c', meanZjk_im);

    /* the loop bound is the length of the real vector */
    for (k = 0; k < meanZjk_re->l; k++) {
        meanZjk_re->pr[k] /= sumZjk;
        meanZjk_im->pr[k] /= sumZjk;
    }

    mdelete(&prod_re);
    mdelete(&prod_im);
    mdelete(&weight_im);
}
// . this may be called from a signal handler
// . we call from a signal handler to keep msg21 zippy
// . this may be called twice, onece from sig handler and next time not
//   from the sig handler
void doneSending_ass ( void *state , UdpSlot *slot ) {
	// point to our state
	State00 *st0 = (State00 *)state;
	// this is nULL if we hit the cache above
	if ( ! st0 ) return;
	// this might be inaccurate cuz sig handler can't call it!
	int64_t now = gettimeofdayInMilliseconds();
	// log the stats
	if ( g_conf.m_logTimingNet ) {
		double mbps ;
		mbps = (((double)slot->m_sendBufSize) * 8.0 / (1024.0*1024.0))/
			(((double)slot->m_startTime)/1000.0);
		log("net: msg0: Sent %"INT32" bytes of data in %"INT64" ms (%3.1fMbps) "
		      "(niceness=%"INT32").",
		      slot->m_sendBufSize , now - slot->m_startTime , mbps ,
		      st0->m_niceness );
	}
	// can't go any further if we're in a sig handler
	//if ( g_inSigHandler ) return;
	// . mark it in pinkish purple
	// . BUT, do not add stats here for tagdb, we get WAY too many lookups
	//   and it clutters the performance graph
	if ( st0->m_rdbId == RDB_TAGDB ) {
	}
	else if(slot->m_niceness > 0) {
		g_stats.addStat_r ( slot->m_sendBufSize , 
				    st0->m_startTime ,
				    now ,
				    //"transmit_data_nice",
				    0x00aa00aa);
	} 
	else {
		g_stats.addStat_r ( slot->m_sendBufSize , 
				    st0->m_startTime ,
				    now ,
				    //"transmit_data",
				    0x00ff00ff );
	}


	// release st0 now
	mdelete ( st0 , sizeof(State00) , "Msg0" );
	delete ( st0 );
}
// . put this Msg7 back into its initial state
// . resets the round counter and m_sbuf, deletes the XmlDoc m_xd and
//   frees the m_sir buffer
void Msg7::reset() {
	m_round = 0;
	m_sbuf.reset();
	// nuke the doc, if we have one
	if ( m_xd != NULL ) {
		mdelete ( m_xd, sizeof(XmlDoc) , "PageInject" );
		delete (m_xd);
		m_xd = NULL;
	}
	// nothing more to do if there is no m_sir buffer
	if ( m_sir == NULL ) return;
	mfree ( m_sir , m_sirSize , "m7ir" );
	m_sir = NULL;
}
// returns true
bool sendErrorReply ( void *state , long err ) {
	// ensure this is set
	if ( ! err ) { char *xx=NULL;*xx=0; }
	// get it
	State8 *st = (State8 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_s;

	char tmp [ 1024*32 ] ;
	sprintf ( tmp , "<b>had server-side error: %s</b><br>",
		  mstrerror(g_errno));
	// nuke state8
	mdelete ( st , sizeof(State8) , "PageGet1" );
	delete (st);
	// erase g_errno for sending
	//g_errno = 0;
	// . now encapsulate it in html head/tail and send it off
	//return g_httpServer.sendDynamicPage ( s , tmp , gbstrlen(tmp) );
	return g_httpServer.sendErrorReply ( s, err, mstrerror(err) );
}
// . called when the query reindex is done, "state" is the State13
// . sends the result back on the request's socket as xml, json or as the
//   html admin page, depending on the reply format of the request
// . the State13 is deleted in here
void doneReindexing ( void *state ) {
	// cast it
	State13 *st = (State13 *)state;

	GigablastRequest *gr = &st->m_gr;

	// did the request carry a non-empty query?
	const bool haveQuery = ( gr->m_query && gr->m_query[0] );

	// note it
	if ( haveQuery )
		log(LOG_INFO,"admin: Done with query reindex. %s",
		    mstrerror(g_errno));

	// what format does the requester want?
	HttpRequest *hr = &gr->m_hr;
	const char format = hr->getReplyFormat();
	const bool isXml  = ( format == FORMAT_XML  );
	const bool isJson = ( format == FORMAT_JSON );

	// the content type follows from the format, xml wins over json
	const char *ct = "text/html";
	if      ( isXml  ) ct = "text/xml";
	else if ( isJson ) ct = "application/json";

	SafeBuf sb;

	//
	// xml and json get a short status reply and we are done
	//
	if ( isXml || isJson ) {
		if ( isXml )
			sb.safePrintf("<response>\n"
				      "\t<statusCode>0</statusCode>\n"
				      "\t<statusMsg>Success</statusMsg>\n"
				      "\t<matchingResults>%" PRId32
				      "</matchingResults>\n"
				      "</response>"
				      , st->m_msg1c.m_numDocIdsAdded
				      );
		else
			sb.safePrintf("{\"response\":{\n"
				      "\t\"statusCode\":0,\n"
				      "\t\"statusMsg\":\"Success\",\n"
				      "\t\"matchingResults\":%" PRId32 "\n"
				      "}\n"
				      "}\n"
				      , st->m_msg1c.m_numDocIdsAdded
				      );
		g_httpServer.sendDynamicPage ( gr->m_socket,
					       sb.getBufStart(),
					       sb.length(),
					       -1,
					       false,ct);
		mdelete ( st , sizeof(State13) , "PageTagdb" );
		delete (st);
		return;
	}

	//
	// otherwise print the html page
	//
	g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr );

	sb.safePrintf("<style>"
		       ".poo { background-color:#%s;}\n"
		       "</style>\n" ,
		       LIGHT_BLUE );

	// . print the outcome if we had a query
	// . g_errno is deliberately read again for the second check
	if ( haveQuery ) {
		if ( ! g_errno )
			sb.safePrintf ( "<center><font color=red><b>Success. "
					"Added %" PRId32 " docid(s) to "
					"spider queue.</b></font></center><br>" ,
					st->m_msg1c.m_numDocIdsAdded );
		if ( g_errno )
			sb.safePrintf ( "<center><font color=red><b>Error. "
					"%s</b></font></center><br>" ,
					mstrerror(g_errno));
	}

	// print the reindex interface
	g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr  );

	// the html page is sent without an explicit content type
	g_httpServer.sendDynamicPage ( gr->m_socket,
				       sb.getBufStart(),
				       sb.length(),
				       -1,
				       false);

	mdelete ( st , sizeof(State13) , "PageTagdb" );
	delete (st);
}
// . send the reply page for an injection request back over http
// . "state" is the Msg7 of the request, it is always deleted in here
// . if msg7->m_quickReply is set we send a short comma-separated status
//   line, otherwise the html injection form
// . everything is printed into a 32k stack buffer. all prints are bounded
//   by the end of that buffer, so an oversized page is truncated rather
//   than overflowing the stack.
// . returns whatever HttpServer::sendDynamicPage() returns
bool sendReply ( void *state ) {
	// get the state properly
	Msg7 *msg7= (Msg7 *) state;
	// extract info from state
	TcpSocket *s = msg7->m_socket;

	XmlDoc *xd = &msg7->m_xd;

	// the doc has the docid for what we injected, iff g_errno is not set
	long long docId  = xd->m_docId;
	// the host id is no longer available, always report 0
	long      hostId = 0;

	// page is not more than 32k
	char buf[1024*32];
	// get an active ptr into buf, and its end
	char *p    = buf;
	char *pend = buf + sizeof(buf);

	// . if we're talking w/ a robot he doesn't care about this crap
	// . send him back the error code (0 means success)
	if ( msg7->m_quickReply ) {
		// set g_errno to index code
		if ( xd->m_indexCodeValid &&
		     xd->m_indexCode &&
		     ! g_errno )
			g_errno = xd->m_indexCode;
		// return docid and hostid
		if ( ! g_errno )
			snprintf ( p , pend - p ,
				   "0,docId=%lli,hostId=%li," ,
				   docId , hostId );
		// print error number here
		else
			snprintf ( p , pend - p ,
				   "%li,0,0,", (long)g_errno );
		p += gbstrlen ( p );
		// print error msg out, too or "Success"
		snprintf ( p , pend - p , "%s", mstrerror(g_errno));
		// nuke state
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		return g_httpServer.sendDynamicPage ( s, buf , gbstrlen(buf) ,
						      -1/*cachetime*/);
	}

	// print admin bar
	p = g_pages.printAdminTop ( p , pend , PAGE_INJECT,
				    NULL, // username
				    msg7->m_coll ,
				    NULL ,  // pwd
				    s->m_ip );

	// if there was an error let them know
	char msg[1024];
	const char *pm = "";
	if ( g_errno ) {
		snprintf ( msg , sizeof(msg) ,
			   "Error injecting url: <b>%s[%i]</b>",
			   mstrerror(g_errno) , g_errno);
		pm = msg;
	}

	// . make a table, each row will be an injectable parameter
	// . the %s after "Inject URL" used to show the collection name and
	//   is always empty now
	if ( p < pend ) {
		snprintf ( p , pend - p ,
		  "<center>"
		  "<b>%s</b>\n\n" // the url msg

		  "<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
		  "<tr><td  bgcolor=#%s colspan=2>"
		  "<center>"
		  "<b>"
		  "Inject URL</b>%s"
		  "<br>"
		  "</td></tr>\n\n"

		  "<tr><td><b>url</b></td>"
		  "<td>\n"
		  "<input type=text name=u value=\"\" size=50>"
		  "</td></tr>\n\n"

		  "<tr><td><b>query to scrape</b></td>"
		  "<td>\n"
		  "<input type=text name=qts value=\"\" size=50>"
		  "</td></tr>\n\n"

		  "<tr><td><b>spider links</b></td>"
		  "<td>\n"
		  "<input type=radio name=spiderlinks value=0>no &nbsp; "
		  "<input type=radio name=spiderlinks value=1 checked>yes "
		  "<br>"
		  "<font size=1>Should we add the page's outlinks to "
		  "spiderdb for spidering? "
		  "Default: yes"
		  "</font>"
		  "</td></tr>\n\n"

		  "<tr><td><b>inject scraped links</b></td>"
		  "<td>\n"
		  "<input type=radio name=injectlinks value=0 checked>no &nbsp; "
		  "<input type=radio name=injectlinks value=1>yes "
		  "</td></tr>\n\n"

		  "<tr><td><b>collection</b></td>"
		  "<td>\n"
		  "<input type=text name=c value=\"%s\" size=15>"
		  "</td></tr>\n\n"

		  "<tr><td><b>quick reply?</b><br>"
		  "<font size=1>Should reply be short? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=quick value=0 checked>no &nbsp; "
		  "<input type=radio name=quick value=1>yes "
		  "</td></tr>\n\n"

		  "<tr><td><b>only inject new docs?</b><br>"
		  "<font size=1>Skips injection if docs already indexed. "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=newonly value=0 checked>no &nbsp; "
		  "<input type=radio name=newonly value=1>yes "
		  "</td></tr>\n\n"

		  "<tr><td><b>delete?</b><br>"
		  "<font size=1>Should this url be deleted from the index? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=delete value=0 checked>no &nbsp; "
		  "<input type=radio name=delete value=1>yes "
		  "</td></tr>\n\n"

		  "<tr><td><b>recycle content?</b><br>"
		  "<font size=1>Should page content be recycled if "
		  "reindexing? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=recycle value=0 checked>no &nbsp; "
		  "<input type=radio name=recycle value=1>yes "
		  "</td></tr>\n\n"

		  "<tr><td><b>ip</b><br>"
		  "<font size=1>IP address of the url. If blank then "
		  "Gigablast will look up. "
		  "Default: blank"
		  "</td>"
		  "<td>\n<input type=text name=ip value=\"\" size=15>"
		  "</td></tr>\n\n"

		  "<tr><td><b>dedup?</b><br>"
		  "<font size=1>Should this url be skipped if there is "
		  "already  a url in the index from this same domain with "
		  "this same content? "
		  "Default: yes"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=dedup value=0>no &nbsp; "
		  "<input type=radio name=dedup value=1 checked>yes "
		  "</td></tr>\n\n" ,
		  pm ,
		  LIGHT_BLUE , DARK_BLUE , "" , msg7->m_coll );
		p += gbstrlen ( p );
	}

	// . the rest of the table
	// . the help text of "content has mime" used to be a copy of the
	//   help text of the "ip" row
	if ( p < pend ) {
		snprintf ( p , pend - p ,
		  "<tr><td><b>content has mime</b><br>"
		  "<font size=1>Does the content below begin with a MIME "
		  "header? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=hasmime value=0 checked>no &nbsp; "
		  "<input type=radio name=hasmime value=1>yes "
		  "</td></tr>\n\n"

		  "<tr><td colspan=2>"
		  "<center>"
		  "<b>content</b><br>"
		  "<font size=1>Enter the content here. Enter MIME header "
		  "first if \"content has mime\" is set to true above. "
		  "Separate MIME from actual content with two returns."
		  "<br>"
		  "<input type=submit value=Submit>"
		  "<br>"
		  "\n"
		  "<textarea rows=32 cols=80 name=content>"
		  "</textarea>"
		  "<br>"
		  "<br>\n\n"
		  "<input type=submit value=Submit>"
		  "</center>"
		  "</td></tr></table>\n"
		  "</form>\n"
		  );
		p += gbstrlen ( p );
	}

	// close the page
	if ( p < pend ) {
		snprintf ( p , pend - p , "\n</body>\n</html>\n" );
		p += gbstrlen ( p );
	}

	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// calculate buffer length
	long bufLen = p - buf;
	// nuke state
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
	// . send this page
	// . i thought we need -2 for cacheTime, but i guess not
	return g_httpServer.sendDynamicPage (s, buf, bufLen, -1/*cachetime*/);
}
Example #20
0
/*
 * Callback: print the id of "obj" and delete that id from the map that
 * is passed in as the user data pointer "ud".
 */
static void
check(void *ud, struct object * obj) {
	printf("%u ",obj->id);
	/* "ud" is the struct map to delete from */
	mdelete((struct map *)ud, obj->id);
}
Example #21
0
// . called with the State00 of the request as "state" once the list is
//   ready. "listb" and "msg5xx" are not used, the list and msg5 are taken
//   from the state instead.
// . if g_errno is set the state is deleted and an error reply is sent
// . otherwise the list in the state is sent as the reply and
//   doneSending_ass() is registered with the state as the callback
// . for linkdb the list is first compacted in place, see below
// . slot should be auto-nuked upon transmission or error
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN" );
	
	// get the state
	State00 *st0 = (State00 *)state;
	// extract the udp slot and list and msg5
	UdpSlot   *slot =  st0->m_slot;
	RdbList   *list = &st0->m_list;
	Msg5      *msg5 = &st0->m_msg5;
	UdpServer *us   =  st0->m_us;

	// timing debug
	if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) {
		//log("Msg0:hndled request %" PRIu64,gettimeofdayInMilliseconds());
		int32_t size = -1;
		// "list" points into st0 so this is always taken
		if ( list ) size     = list->getListSize();
		log(LOG_TIMING|LOG_DEBUG,
		    "net: msg0: Handled request for data. "
		    "Now sending data termId=%" PRIu64" size=%" PRId32
		    " transId=%" PRId32" ip=%s port=%i took=%" PRId64" "
		    "(niceness=%" PRId32").",
		    g_posdb.getTermId(msg5->m_startKey),
		    size,slot->m_transId,
		    iptoa(slot->m_ip),slot->m_port,
		    gettimeofdayInMilliseconds() - st0->m_startTime ,
		    st0->m_niceness );
	}

	// on error nuke the list and it's data
	if ( g_errno ) {
		// "slot" and "us" were copied out of st0 above so they can
		// still be used after this
		mdelete ( st0 , sizeof(State00) , "Msg0" );
		delete (st0);
		// TODO: free "slot" if this send fails
		
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}

	QUICKPOLL(st0->m_niceness);
	// point to the serialized list in "list"
	char *data      = list->getList();
	int32_t  dataSize  = list->getListSize();
	char *alloc     = list->getAlloc();
	int32_t  allocSize = list->getAllocSize();
	// tell list not to free the data since it is a reply so UdpServer
	// will free it when it destroys the slot
	list->setOwnData ( false );
	// keep track of stats
	Rdb *rdb = getRdbFromId ( st0->m_rdbId );
	if ( rdb ) rdb->sentReplyGet ( dataSize );
	// TODO: can we free any memory here???

	// keep track of how long it takes to complete the send
	// (m_startTime is re-used for that, doneSending_ass() reads it)
	st0->m_startTime = gettimeofdayInMilliseconds();
	// debug point
	int32_t oldSize = msg5->m_minRecSizes;
	int32_t newSize = msg5->m_minRecSizes + 20;
	// watch for wrap around
	if ( newSize < oldSize ) newSize = 0x7fffffff;
	// complain if we send over 100 bytes more than that. only done for
	// lists with a fixed data size of 0.
	if ( dataSize > newSize && list->getFixedDataSize() == 0 &&
	     // do not annoy me with these linkdb msgs
	     dataSize > newSize+100 ) 
		log(LOG_LOGIC,"net: msg0: Sending more data than what was "
		    "requested. Ineffcient. Bad engineer. dataSize=%" PRId32" "
		    "minRecSizes=%" PRId32".",dataSize,oldSize);
		    
	//
	// for linkdb lists, remove all the keys that have the same IP32
	// and store a count of what we removed somewhere
	//
	if ( st0->m_rdbId == RDB_LINKDB ) {
		// store compressed list on itself. "dst" never gets ahead of
		// the record being read since we only ever drop records.
		char *dst = list->m_list;
		// keep stats
		int32_t totalOrigLinks = 0;
		int32_t ipDups = 0;
		// the linker ip of the last record we kept. starts out as 0,
		// so a leading record with a linker ip of 0 counts as a dup.
		int32_t lastIp32 = 0;
		char *listEnd = list->getListEnd();
		// compress the list
		for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
			// breathe
			QUICKPOLL ( st0->m_niceness );
			// count it
			totalOrigLinks++;
			// get rec
			char *rec = list->getCurrentRec();
			int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec );
			// same as one before?
			if ( ip32 == lastIp32 && 
			     // are we the last rec? include that for
			     // advancing the m_nextKey in Linkdb more 
			     // efficiently.
			     rec + LDBKS < listEnd ) {
				ipDups++;
				continue;
			}
			// store it. every record is copied as LDBKS bytes.
			gbmemcpy (dst , rec , LDBKS );
			dst += LDBKS;
			// update it
			lastIp32 = ip32;
		}
		// . if we removed one key, store the stats
		// . caller should recognize reply is not a multiple of
		//   the linkdb key size LDBKS and no its there!
		// . storing the stats is commented out, so right now the
		//   reply is not changed by this
		if ( ipDups ) {
			//*(int32_t *)dst = totalOrigLinks;
			//dst += 4;
			//*(int32_t *)dst = ipDups;
			//dst += 4;
		}
		// update list parms
		list->m_listSize = dst - list->m_list;
		list->m_listEnd  = list->m_list + list->m_listSize;
		// re-read these for the compacted list. "alloc" and
		// "allocSize" are left as they were.
		data      = list->getList();
		dataSize  = list->getListSize();
	}


	//log("sending replySize=%" PRId32" min=%" PRId32,dataSize,msg5->m_minRecSizes);
	// . TODO: dataSize may not equal list->getListMaxSize() so
	//         Mem class may show an imblanace
	// . now g_udpServer is responsible for freeing data/dataSize
	// . the "true" means to call doneSending_ass() from the signal handler
	//   if need be
	st0->m_us->sendReply_ass( data, dataSize, alloc, allocSize, slot, st0, doneSending_ass, -1, -1, true );

	logTrace( g_conf.m_logTraceMsg0, "END" );
}	
Example #22
0
File: veCov.cpp Project: NCIP/visda
/*******************************************************************
 Subroutine to compute the inverse matrix and determinant
   matrix *cov:        the pointer to the covariance matrix
   matrix *inv_cov:    the pointer to the inverse covariance matrix
   matrix *cov_mat:    the pointer to the approximate covariance matrix
                       when singular. If nonsingular, it equals cov
   double *det_cov:    the pointer to the determinant

 return value: '1' - exited successfully
               '0' - exited with a warning/error
*******************************************************************/
int veCov(matrix *cov, matrix *inv_cov, matrix *cov_mat, 
		  double *det_cov)
{
	int i, j;
	matrix eigvec_re;
	matrix eigvec_im;
	vector eigval_re;
	vector eigval_im;
    int *eig_order;
	int eig_info;
	int num_v;  // the number of eigenvalue
	int rank_c; 
	double sum_v;
	double factor = 0.02;
	double ass_value;
	double min_real;

    mnew(&eigvec_re, cov->m, cov->n);
    mnew(&eigvec_im, cov->m, cov->n);
	vnew(&eigval_re, cov->n);
	vnew(&eigval_im, cov->n);
    eig_order = new int[cov->n];
    

    // the eigenvector and eigenvalue of covariance matrix
    eig_info = eig(cov, &eigvec_re, &eigvec_im, &eigval_re, &eigval_im);
	//vprint(&eigval_re);
	//vprint(&eigval_im);

	if (!eig_info) {
		printf(" The eigenvalue computation failed! \n");
		return 0;
		//....
	}
	
	// the rank of covariance matrix
	num_v = cov->n;
	
	/*rank_c = num_v;
	for (i=0; i<num_v; i++) {
		if ((fabs(*(eigval_re.pr+i)) < ZEROTHRESH) && (fabs(*(eigval_im.pr+i)) < ZEROTHRESH)) {
			rank_c--;
		}
	}
	printf("rank = %d", rank_c);*/

	rank_c = rank(cov, TOLERANCE);

	// compute the inverse and determinate
    if (rank_c == num_v) {  // nonsingular
		inv(cov, inv_cov);
		mcopy(cov, cov_mat);
		*det_cov = det(cov);
	
	} else {  // singular
		min_real = pow(10, (((double)-250) / ((double) cov->m)));

		/*for (i=0; i<num_v; i++) {
			if ((*(eigval_re.pr+i) < ZEROTHRESH) || (*(eigval_im.pr+i) != 0)) {
				*(eigval_re.pr+i) = 0;  // ???? keep the real part of complex or not
				*(eigval_im.pr+i) = 0;
			}
		}
		sort(&eigval_re, eig_order, 'd'); */

		for (i=0; i<num_v; i++) {
			// when negtive real eigenvalue, change to absolute value
			//   to ensure all the real eigenvalues are positive
			if ((eigval_re.pr[i] < 0) && (eigval_im.pr[i] == 0)) {
				eigval_re.pr[i] *= -1;
				// the i-th column of eigenvector should also be changed the sign
				for (j=0; j<(eigvec_re.m); j++) {
					eigvec_re.pr[j*(eigvec_re.n)+i] *= -1;
				}
			}
		}

		//vprint(&eigval_re);
		//vprint(&eigval_im);

		// sort real eigenvalues descendingly, put complex ones at the end
		sorteig(&eigval_re, &eigval_im, eig_order);

		for (i=rank_c; i<num_v; i++) {
				*(eigval_re.pr+i) = 0;
				*(eigval_im.pr+i) = 0;
		}

		//vprint(&eigval_re);
		//vprint(&eigval_im);

		sum_v = vsum(&eigval_re);

		ass_value = factor * sum_v / (num_v - rank_c);

		if (ass_value < (0.5 * (*(eigval_re.pr+rank_c)) * (1 - factor))) {
			if (ass_value > min_real) {
				for (i=rank_c; i<num_v; i++) {
					*(eigval_re.pr+i) = ass_value;
				}
				for (i=0; i<rank_c; i++) {
					*(eigval_re.pr+i) *= 1 - factor;
				}
			} else {
				for (i=rank_c; i<num_v; i++) {
					*(eigval_re.pr+i) = min_real;
				}
			}
		} else {
			ass_value = 0.5 * (*(eigval_re.pr+rank_c)) * (1 - factor);
			if (ass_value > min_real) {
				for (i=rank_c; i<num_v; i++) {
					*(eigval_re.pr+i) = ass_value;
				}
				for (i=0; i<rank_c; i++) {
					*(eigval_re.pr+i) = *(eigval_re.pr+i) - ass_value * (num_v - rank_c) * (*(eigval_re.pr+i)) / sum_v;
				}
			} else {
				for (i=rank_c; i<num_v; i++) {
					*(eigval_re.pr+i) = min_real;
				}
			}
		}
        
		//vprint(&eigval_re);
		//vprint(&eigval_im);

		matrix eigvec_re_sorted;
		matrix eigvec_re_sorted_t;
		mnew(&eigvec_re_sorted, num_v, num_v);
		mnew(&eigvec_re_sorted_t, num_v, num_v);

		sortcols(eig_order, &eigvec_re, &eigvec_re_sorted);
		transpose(&eigvec_re_sorted, &eigvec_re_sorted_t); 

		matrix inv_eig_vl_s;
		mnew(&inv_eig_vl_s, num_v, num_v);
		for (i=1; i<num_v; i++) {
			*(inv_eig_vl_s.pr + i*num_v + i) = 1 / (*(eigval_re.pr+i));    
		}
		
		matrix tmp;
		mnew(&tmp, num_v, num_v);

		mmMul(&eigvec_re_sorted, &inv_eig_vl_s, &tmp);
		mmMul(&tmp, &eigvec_re_sorted_t, inv_cov);

		matrix diag_eigval;
		mnew(&diag_eigval, num_v, num_v);
		for (i=0; i<num_v; i++) {
			*(diag_eigval.pr + i*num_v + i) = *(eigval_re.pr+i);    
		}
		mmMul(&eigvec_re_sorted, &diag_eigval, &tmp);
		mmMul(&tmp, &eigvec_re_sorted_t, cov_mat);

		*det_cov = 1;
		for (i=0; i<num_v; i++) {
			*det_cov = (*det_cov) * (*(eigval_re.pr+i)); 
		}

		mdelete(&inv_eig_vl_s);
		mdelete(&eigvec_re_sorted);
		mdelete(&eigvec_re_sorted_t);
		mdelete(&tmp);
		mdelete(&diag_eigval);

	}

	#ifdef _DEBUG
	printf("rank = %d \n", rank_c);
	printf("\n det_cov = %e \n", *det_cov);
	printf("inv_cov = \n");
	mprint(inv_cov);
	printf("cov_mat = \n");
	mprint(cov_mat);
	#endif

    mdelete(&eigvec_re);
	mdelete(&eigvec_im);
	vdelete(&eigval_re);
	vdelete(&eigval_im);
    delete []eig_order;

    return 1;
}
// . entry point for the "/get" (cached page) request
// . validates the collection ("c"), query ("q"), docid ("d") and url ("u")
//   cgi parms, allocates a State2, points its XmlDoc at the requested
//   docid/url and hands off to processLoop()
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
	// get the collection
	long  collLen = 0;
	char *coll    = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll    = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// ensure collection not too big
	if ( collLen >= MAX_COLL_LEN ) { 
		g_errno = ECOLLTOOBIG; 
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); 
	}
	// get the collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("query: Archived copy retrieval failed. "
		    "No collection record found for "
		    "collection \"%s\".",coll);
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// does this collection ban this IP?
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		//log("PageGet::sendDynamicReply0: permission denied for %s",
		//    iptoa(s->m_ip) );
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// . get fields from cgi field of the requested url
	// . get the search query
	long  qlen = 0;
	char *q = r->getString ( "q" , &qlen , NULL /*default*/);
	// ensure query not too big (it is strcpy'd into st->m_q below)
	if ( qlen >= MAX_QUERY_LEN-1 ) { 
		g_errno=EQUERYTOOBIG; 
		return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
	}
	// the docId
	long  long docId = r->getLongLong ( "d" , 0LL /*default*/ );
	// get url
	char *url = r->getString ( "u",NULL);

	// need at least one way to identify the document
	if ( docId == 0 && ! url ) {
		g_errno = EMISSINGINPUT;
		return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
	}


	// . should we do a sequential lookup?
	// . we need to match summary here so we need to know this
	//bool seq = r->getLong ( "seq" , false );
	// restrict to root file?
	bool rtq = r->getLong ( "rtq" , false );

	// . get the titleRec
	// . TODO: redirect client to a better http server to save bandwidth
	State2 *st ;
	try { st = new (State2); }
	catch (... ) {
		g_errno = ENOMEM;
		log("PageGet: new(%i): %s", 
		    (int)sizeof(State2),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
	mnew ( st , sizeof(State2) , "PageGet1" );
	// save the socket and if Host: is local in the Http request Mime
	st->m_socket   = s;
	st->m_isAdmin  = g_conf.isCollAdmin ( s , r );
	st->m_isLocal  = r->isLocal();
	st->m_docId    = docId;
	st->m_printed  = false;
	// include header ... "this page cached by Gigablast on..."
	st->m_includeHeader     = r->getLong ("ih"    , true  );
	st->m_includeBaseHref   = r->getLong ("ibh"   , false );
	st->m_queryHighlighting = r->getLong ("qh"    , true  );
	st->m_strip             = r->getLong ("strip" , 0     );
	st->m_clickAndScroll    = r->getLong ("cas"   , true  );
	st->m_cnsPage           = r->getLong ("cnsp"  , true );
	char *langAbbr = r->getString("qlang",NULL);
	st->m_langId = langUnknown;
	if ( langAbbr ) {
		uint8_t langId = getLangIdFromAbbr ( langAbbr );
		st->m_langId = langId;
	}
	// collLen < MAX_COLL_LEN was enforced above
	strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
	// store query for query highlighting
	st->m_netTestResults    = r->getLong ("rnettest", false );
	//if( st->m_netTestResults ) {
	//	mdelete ( st , sizeof(State2) , "PageGet1" );
	//	delete ( st );
	//	return sendPageNetResult( s );
	//}
	if ( q && qlen > 0 ) strcpy ( st->m_q , q );
	else                 st->m_q[0] = '\0';
	st->m_qlen = qlen;
	//st->m_seq      = seq;
	st->m_rtq      = rtq;
	st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ );
	st->m_isBanned = false;
	st->m_noArchive = false;
	st->m_format = r->getReplyFormat();
	// default to 0 niceness
	st->m_niceness = 0;
	st->m_r.copy ( r );
	//st->m_cr = cr;
	// the disclaimer banner is suppressed for click-n-scroll pages and
	// for stripped output
	st->m_printDisclaimer = true;
	if ( st->m_cnsPage )
		st->m_printDisclaimer = false;
	if ( st->m_strip ) // ! st->m_evbits.isEmpty() ) 
		st->m_printDisclaimer = false;
	
	// should we cache it?
	// NOTE(review): these values are computed but not consumed anywhere
	// in this function.
	char useCache = r->getLong ( "usecache" ,  1 );
	char rcache   = r->getLong ( "rcache"   ,  1 );
	char wcache   = r->getLong ( "wcache"   ,  1 );
	long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
	if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
	if ( rcache   == 0 )   cacheAge = 0; 
	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// url based?
	if ( url ) {
		SpiderRequest sreq;
		sreq.reset();
		// "u" comes straight off the wire, so make sure it fits before
		// the unbounded strcpy below. assumes sreq.m_url is an inline
		// char array (it is copied into directly).
		if ( gbstrlen(url) >= (long)sizeof(sreq.m_url) ) {
			mdelete ( st , sizeof(State2) , "PageGet1" );
			delete ( st );
			g_errno = EURLTOOBIG;
			log("PageGet: url parm too long");
			return g_httpServer.sendErrorReply(s,500,
							   mstrerror(g_errno));
		}
		strcpy(sreq.m_url, url );
		sreq.setDataSize();
		// this returns false if "coll" is invalid
		if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) ) 
			goto hadSetError;
	}
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	// . use st->m_coll since XmlDoc just points to it!
	// . this returns false if "coll" is invalid
	else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
	hadSetError:
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete ( st );
		// keep whatever error set3()/set4() reported; only fall back
		// to ENOMEM when nothing more specific was recorded
		if ( ! g_errno ) g_errno = ENOMEM;
		log("PageGet: set3: %s", mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . query re-index interface
// . call g_httpServer.sendDynamicPage() to send it
// . allocates a State13 that is handed to doneReindexing() on every path
//   except the missing-collection error, which frees it here
bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
	// make a state
	State13 *st ;
	try { st = new (State13); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// label matches the mnew() note below (was a copy-pasted
		// "PageTagdb")
		log("PageReindex: new(%i): %s", 
		    (int)sizeof(State13),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
	mnew ( st , sizeof(State13) , "PageReindex" );

	// set this. also sets gr->m_hr
	GigablastRequest *gr = &st->m_gr;
	// this will fill in GigablastRequest so all the parms we need are set
	g_parms.setGigablastRequest ( s , r , gr );

	TcpSocket *sock = gr->m_socket;

	// get collection rec
	CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll );
	// bail if no collection rec found
	if ( ! cr ) {
		g_errno = ENOCOLLREC;

		// g_errno should be set so it will return an error response
		g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
		// use the same accounting label the state was registered with
		mdelete ( st , sizeof(State13) , "PageReindex" );
		delete (st);
		return true;

	}


	collnum_t collnum = cr->m_collnum;

	// if no query send back the page blanked out i guess
	if ( ! gr->m_query || ! gr->m_query[0] ) {
		doneReindexing ( st );
		return true;
	}

	// no permission? only master or collection admins may reindex
	bool isMasterAdmin = g_conf.isMasterAdmin ( s , r );
	bool isCollAdmin = g_conf.isCollAdmin ( s , r );
	if ( ! isMasterAdmin &&
	     ! isCollAdmin ) {
		g_errno = ENOPERM;
		doneReindexing ( st );
		return true;
	}

	int32_t langId = getLangIdFromAbbr ( gr->m_qlang );

	// let msg1c do all the work now. a false return means it blocked and
	// will call doneReindexing(st) itself when finished.
	if ( ! st->m_msg1c.reindexQuery ( gr->m_query ,
					  collnum,
					  gr->m_srn , // startNum ,
					  gr->m_ern , // endNum   ,
					  (bool)gr->m_forceDel,
					  langId,
					  st ,
					  doneReindexing ) )
		return false;

	// no waiting
	doneReindexing ( st );
	return true;
}
// . builds and sends the cached-page reply for a State2
// . "state" is the State2 allocated by sendPageGet(); it is freed here
//   right after the reply is handed to g_httpServer (the sendErrorReply()
//   helper is documented by the original code as freeing it on error)
// . this is re-entered as a callback: any XmlDoc getter that returns -1
//   has blocked and will call us again, so every step must be repeatable
// . returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// is the doc marked noarchive?
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? admins are still allowed through
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// alloc error? report the real g_errno before looking at the size,
	// otherwise a failed fetch was reported as EBADENGINEER
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	long startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
	}

	// the html preamble is meaningless for the api formats
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	// human readable spider date, used by the disclaimer and the api
	// formats
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// NOTE(review): q, st->m_coll and (further down) the "eb" parm are
	// interpolated into HTML attributes without escaping. They originate
	// from the http request, so this looks like an injection vector --
	// confirm and encode them with the project's html/url encoders.
	if ( printDisclaimer ) {

		sb->safePrintf(
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  "><td>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		// then the rest
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );

		// then the spider date in GMT
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%li&amp;"
			      "d=%lli&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (long)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to the wayback machine
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
		}
		
	}

	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the ip/port of this host
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// . the query url encoded
	// . hold back room for the "&d=<docid>" tail written below: the
	//   encoded query can expand to fill whatever space it is given, and
	//   the tail used to be appended with an unbounded sprintf
	const long tailRoom = 32;
	long elen = urlEncode ( x , thisUrlEnd - x - tailRoom , q , qlen );
	x += elen;
	// separate cgi vars with a &
	snprintf ( x, thisUrlEnd - x , "&d=%lli",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// declare up here
	Matches m;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	m.addMatches ( &qw );
	long hilen = 0;

	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doSteming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		// close the disclaimer table
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;
		else                         sb->m_length=startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		// time_t is not guaranteed to be unsigned long; cast to match
		// the format specifier
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			      (unsigned long)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",
			       (unsigned long)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// . identify a <title> tag in what we wrote out so far
	// . only scan the bytes actually written (start + length); the
	//   original scanned up to getBufEnd() and also read t[1..5] and
	//   e[1..6] without checking they were still inside the buffer
	// . the title text is copied into titleBuf right away because "sb"
	//   is appended to below, which may move its storage and would leave
	//   raw pointers into it dangling
	SafeBuf titleBuf;
	bool haveTitle = false;
	char *sbstart = sb->getBufStart();
	char *sbend   = sbstart ? sbstart + sb->length() : sbstart;
	for ( char *t = sbstart ; sbstart && t + 5 < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *ts = t + 5;
		// max - to keep things fast, clipped to the written data
		char *max = ( sbend - ts > 500 ) ? ts + 500 : sbend;
		// skip to the end of the opening tag
		for ( ; ts < max && *ts && *ts != '>' ; ts++ );
		// opening tag never closed inside the window? give up.
		if ( ts >= max || *ts != '>' ) break;
		ts++;
		// find the closing tag
		char *e = ts;
		bool closed = false;
		for ( ; e < max && e + 6 < sbend && *e ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' ) {
				closed = true;
				break;
			}
		}
		if ( closed ) {
			long tlen = e - ts;
			if ( tlen > 0 ) titleBuf.safeMemcpy ( ts , tlen );
			haveTitle = true;
		}
		// only the first <title is considered
		break;
	}

	// . print title at top!
	// . consider moving
	if ( haveTitle ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		long printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			// copy from our private snapshot, not from sb itself
			if ( titleBuf.length() > 0 )
				sb->safeMemcpy ( titleBuf.getBufStart() ,
						 titleBuf.length() );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {
		sb->safePrintf("<pre>");
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 ) {
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;

	// for the api formats the document body goes into a scratch buffer
	// first so it can be encoded into the envelope afterwards
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}

		// matches of the query within the document words
		Matches dm;
		dm.setQuery ( &qq );
		dm.addMatches ( &ww );
		hilen = hi.set ( xb ,
				 &ww , &dm ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		// nothing to encode (and possibly no buffer) when empty
		if ( xb->length() > 0 )
			sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		// no raw newline after the opening quote: an unescaped
		// control character inside a JSON string makes the whole
		// reply unparseable
		sb->safePrintf("\t\"content\":\"");
		if ( xb->length() > 0 )
			sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
	}

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			return sendErrorReply ( st , g_errno );
		}
		// take over the encoded copy
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);

	// and convey the status
	return status;
}
// . add url page for admin, users use sendPageAddUrl() in PageRoot.cpp
// . "sock" is the requesting socket and "hr" its parsed http request
// . the url(s) to add are read from the "urls" cgi parm; for xml/json
//   reply formats both the "c" and "urls" parms are required
// . returns false if blocked, otherwise the reply has been sent
// . sets g_errno on error
bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) {

	// refuse to add anything if in read-only mode
	if ( g_conf.m_readOnlyMode ) {
		g_errno = EREADONLYMODE;
		const char *msg = mstrerror(g_errno);
		return g_httpServer.sendErrorReply(sock,500,msg);
	}

	// . get fields from cgi field of the requested url
	// . "urls" stays NULL if the parm was not supplied
	int32_t  urlLen = 0;
	const char *urls = hr->getString ( "urls" , &urlLen , NULL /*default*/);

	char format = hr->getReplyFormat();

	const char *c = hr->getString("c");
	
	// api style requests must name the collection...
	if ( ! c && (format == FORMAT_XML || format == FORMAT_JSON) ) {
		g_errno = EMISSINGINPUT;
		const char *msg = "missing c parm. See /admin/api to see parms.";
		return g_httpServer.sendErrorReply(sock,500,msg);
	}

	// ...and supply the urls
	if ( ! urls && (format == FORMAT_XML || format == FORMAT_JSON) ) {
		g_errno = EMISSINGINPUT;
		const char *msg = "missing urls parm. See /admin/api to see parms.";
		return g_httpServer.sendErrorReply(sock,500,msg);
	}


	// get collection rec
	CollectionRec *cr = g_collectiondb.getRec ( hr );
	// complain if no collection rec found
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		const char *msg = mstrerror(g_errno);
		return g_httpServer.sendErrorReply(sock,500,msg);
	}


	// make a new state
	GigablastRequest *gr;
	try { gr = new (GigablastRequest); }
	catch ( ... ) { 
		g_errno = ENOMEM;
		log( LOG_WARN, "PageAddUrl: new(%i): %s", (int)sizeof(GigablastRequest),mstrerror(g_errno) );
		return g_httpServer.sendErrorReply(sock, 500, mstrerror(g_errno));
	}
	mnew ( gr , sizeof(GigablastRequest) , "PageAddUrl" );


	// this will fill in GigablastRequest so all the parms we need are set
	// set this. also sets gr->m_hr
	g_parms.setGigablastRequest ( sock , hr , gr );

	// if no url given, just print a blank page
	if ( ! urls ) return sendReply (  gr );

	// build the spider request meta list into gr->m_listBuf
	bool status = getSpiderRequestMetaList ( (char*)urls, &gr->m_listBuf , gr->m_harvestLinks, NULL );
	int32_t size = gr->m_listBuf.length();
	
	// error / not list
	if ( ! status || !size ) {
		// an empty list means we were given nothing usable
		if ( !size ) {
			g_errno = EMISSINGINPUT;
		}

		// send the error while gr is still alive, then nuke it.
		// the size must match what mnew() was given above, not the
		// size of the pointer.
		bool rc = g_httpServer.sendErrorReply(gr);
		mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
		delete gr;
		return rc;
	}

	// add to spiderdb, passing gr as the state for the callback
	if ( ! gr->m_msg4.addMetaList( &(gr->m_listBuf), cr->m_collnum, gr, addedUrlsToSpiderdbWrapper, 0 ) ) {
		// blocked!
		return false;
	}

	// did not block, print page!
	sendReply ( gr );
	return true;
}
bool processLoop ( void *state ) {
	// cast it
	State8 *st = (State8 *)state;
	// get the xmldoc
	XmlDoc *xd = &st->m_xd;

	// error?
	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// shortcut
	SafeBuf *xbuf = &st->m_xbuf;

	if ( st->m_u && st->m_u[0] ) {
		// . save the ips.txt file if we are the test coll
		// . saveTestBuf() is a function in Msge1.cpp
		CollectionRec *cr = xd->getCollRec();
		if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123"))
			// use same dir that XmlDoc::getTestDir() would use
			//saveTestBuf ( "test-page-parser" );
			saveTestBuf("qa");
		// now get the meta list, in the process it will print out a 
		// bunch of junk into st->m_xbuf
		char *metalist = xd->getMetaList ( );
		if ( ! metalist ) return sendErrorReply ( st , g_errno );
		// return false if it blocked
		if ( metalist == (void *)-1 ) return false;
		// for debug...
		if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false );
		// print it out
		xd->printDoc( xbuf );
	}

	// print reason we can't analyze it (or index it)
	//if ( st->m_indexCode != 0 ) {
	//	xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>", 
	//			  mstrerror(st->m_indexCode));
	//}

	// we are done
	g_inPageParser = false;

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	//log("parser: send sock=%li",st->m_s->m_sd);
	
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage( st->m_s , 
						    xbuf->getBufStart(), 
						    xbuf->length() ,
						    -1, //cachtime
						    false ,//postreply?
						    NULL, //ctype
						    -1 , //httpstatus
						    NULL,//cookie
						    "utf-8");
	// delete the state now
	if ( st->m_freeIt ) {
		mdelete ( st , sizeof(State8) , "PageParser" );
		delete (st);
	}
	// return the status
	return status;
}
// . sends the reply for the admin add url page and frees the state
// . "state" is the GigablastRequest for this request; it is deleted
//   before this function returns
// . "addUrlEnabled" is not used by this function
// . for xml/json reply formats a success reply is sent, otherwise the
//   html admin page is printed with either the g_errno error message or
//   a note that the url(s) were added
// . returns the result of the send call
bool sendReply ( void *state , bool addUrlEnabled ) {
	// get the state properly
	GigablastRequest *gr = (GigablastRequest *)state;
	// in order to see what sites are being added log it, then we can
	// more easily remove sites from sitesearch.gigablast.com that are
	// being added but not being searched
	SafeBuf xb;
	if ( gr->m_urlsBuf ) {
		xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
		log(LOG_INFO,"http: add url %s (%s)",
		    xb.getBufStart(),mstrerror(g_errno));
	}

	char format = gr->m_hr.getReplyFormat();
	// grab the socket now since gr is deleted before the html send below
	TcpSocket *sock    = gr->m_socket;

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		bool status = g_httpServer.sendSuccessReply ( gr );
		// nuke state. use the size of the object, like the html
		// path below does, not the size of the pointer.
		mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
		delete (gr);
		return status;
	}


	long ulen = 0;
	char *url = gr->m_urlsBuf;
	if ( url ) ulen = gbstrlen (url);

	// . do not echo the url back if it is empty or just a bare scheme
	// . compare against "url", the same buffer ulen was measured from
	bool printUrl = ( ulen > 0 );
	if ( ulen==7 && !strncasecmp(url,"http://",7) )
		printUrl = false;
	if ( ulen==8 && !strncasecmp(url,"https://",8) )
		printUrl = false;

	// page is not more than 32k
	char buf[1024*32+MAX_URL_LEN*2];
	SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);

	g_pages.printAdminTop ( &sb , sock , &gr->m_hr );

	// if there was an error let them know, otherwise confirm the add
	SafeBuf mbuf;
	if ( g_errno ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", 
				mstrerror(g_errno) , g_errno);
		mbuf.safePrintf("</font></center>");
	}
	else if ( printUrl ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("<b><u>");
		mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
		mbuf.safePrintf("</u></b> added to spider "
				 "queue "
				 "successfully<br><br>");
		mbuf.safePrintf("</font></center>");
	}


	if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() );

	g_parms.printParmTable ( &sb , sock , &gr->m_hr );

	// print the final tail
	g_pages.printTail ( &sb, true ); // admin?
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// nuke state. everything needed below (sock, sb) is local.
	mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
	delete (gr);

	return g_httpServer.sendDynamicPage (sock, 
					     sb.getBufStart(), 
					     sb.length(),
					     -1 ); // cachetime
}
// . "state" is the State8 for this request
// . prints the embedded XmlDoc into st->m_xbuf via printDocForProCog()
//   (at most once per state, guarded by st->m_donePrinting) and then
//   sends the buffer back as the page
// . the reply is sent as xml if the "xml" parm of st->m_r is non-zero,
//   otherwise as html
// . returns false if printDocForProCog() blocked, otherwise the result
//   of the send (or of sendErrorReply() on error)
// . deletes the state when st->m_freeIt is set
bool gotXmlDoc ( void *state ) {
	// cast it
	State8 *st = (State8 *)state;
	// get the xmldoc
	XmlDoc *xd = &st->m_xd;

	// error?
	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// shortcut
	SafeBuf *xbuf = &st->m_xbuf;

	// print if we were given a url or a docid, but only once
	bool printIt = false;
	if ( st->m_u && st->m_u[0] ) printIt = true;
	if ( st->m_docId != -1LL ) printIt = true;
	if ( st->m_donePrinting ) printIt = false;

	// do not re-call this if printDocForProCog blocked...
	if ( printIt ) {
		// mark as done before the call so that re-entry after a
		// block skips the print
		st->m_donePrinting = true;
		// invalidate the link info so it is not taken from the
		// stored ptr_linkInfo1, but only when asked to recompute
		if ( st->m_recompute )
			xd->m_linkInfo1Valid = false;
		// . print it out
		// . returns false if blocks, true otherwise
		// . sets g_errno on error
		if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) )
			return false;
		// error?
		if ( g_errno ) return sendErrorReply ( st , g_errno );
	}

	// . the content type argument is a mime type string, so pass one
	//   rather than the address of a single CT_* code byte
	// . cast so this works whether the parameter is char* or const char*
	long isXml = st->m_r.getLong("xml",0);
	char *ctype2 = (char *)"text/html";
	if ( isXml ) ctype2 = (char *)"text/xml";
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage( st->m_s , 
						    xbuf->getBufStart(), 
						    xbuf->length() ,
						    -1, //cachtime
						    false ,//postreply?
						    ctype2,
						    -1 , //httpstatus
						    NULL,//cookie
						    "utf-8");
	// delete the state now
	if ( st->m_freeIt ) {
		mdelete ( st , sizeof(State8) , "PageParser" );
		delete (st);
	}
	// return the status
	return status;
}
// . add url page for admin, users use sendPageAddUrl() in PageRoot.cpp
// . "sock" is the requesting socket and "hr" its parsed http request
// . the url(s) to add are read from the "urls" cgi parm; for xml/json
//   reply formats both the "c" and "urls" parms are required
// . returns false if blocked, otherwise the reply has been sent
// . sets g_errno on error
bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) {

	// refuse to add anything if in read-only mode
	if ( g_conf.m_readOnlyMode ) {
		g_errno = EREADONLYMODE;
		char *msg = mstrerror(g_errno);
		return g_httpServer.sendErrorReply(sock,500,msg);
	}

	// . get fields from cgi field of the requested url
	// . "urls" stays NULL if the parm was not supplied
	long  urlLen = 0;
	char *urls = hr->getString ( "urls" , &urlLen , NULL /*default*/);

	char format = hr->getReplyFormat();

	char *c = hr->getString("c");
	
	// api style requests must name the collection...
	if ( ! c && (format == FORMAT_XML || format == FORMAT_JSON) ) {
		g_errno = EMISSINGINPUT;
		char *msg = "missing c parm. See /admin/api to see parms.";
		return g_httpServer.sendErrorReply(sock,500,msg);
	}

	// ...and supply the urls
	if ( ! urls && (format == FORMAT_XML || format == FORMAT_JSON) ) {
		g_errno = EMISSINGINPUT;
		char *msg = "missing urls parm. See /admin/api to see parms.";
		return g_httpServer.sendErrorReply(sock,500,msg);
	}


	// get collection rec
	CollectionRec *cr = g_collectiondb.getRec ( hr );
	// complain if no collection rec found
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		char *msg = mstrerror(g_errno);
		return g_httpServer.sendErrorReply(sock,500,msg);
	}


	// make a new state
	GigablastRequest *gr;
	try { gr = new (GigablastRequest); }
	catch ( ... ) { 
		g_errno = ENOMEM;
		// sizeof yields a size_t, cast it to match the %i
		log("PageAddUrl: new(%i): %s", 
		    (int)sizeof(GigablastRequest),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(sock,500,
						   mstrerror(g_errno)); 
	}
	mnew ( gr , sizeof(GigablastRequest) , "PageAddUrl" );


	// this will fill in GigablastRequest so all the parms we need are set
	// set this. also sets gr->m_hr
	g_parms.setGigablastRequest ( sock , hr , gr );

	// if no url given, just print a blank page
	if ( ! urls ) return sendReply (  gr , true );

	// build the spider request meta list into gr->m_listBuf
	bool status = true;
	if ( ! getSpiderRequestMetaList ( urls,
					  // a safebuf
					  &gr->m_listBuf ,
					  gr->m_harvestLinks, // spiderLinks?
					  NULL ) )
		status = false;

	// empty?
	long size = gr->m_listBuf.length();
	
	// error?
	if ( ! status ) {
		// . send the error reply while gr is still alive, THEN nuke it
		// . the size must match what mnew() was given above, not the
		//   size of the pointer
		bool rc = g_httpServer.sendErrorReply(gr);
		mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
		delete (gr);
		return rc;
	}
	// if not list
	if ( ! size ) {
		// an empty list means we were given nothing usable
		g_errno = EMISSINGINPUT;
		bool rc = g_httpServer.sendErrorReply(gr);
		// nuke it only after the reply has been sent
		mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
		delete (gr);
		return rc;
	}

	// add to spiderdb, passing gr as the state for the callback
	if ( ! gr->m_msg4.addMetaList( gr->m_listBuf.getBufStart() ,
				       gr->m_listBuf.length(),
				       cr->m_coll,
				       gr ,
				       addedUrlsToSpiderdbWrapper,
				       0 // niceness
				       ) )
		// blocked!
		return false;

	// did not block, print page!
	sendReply ( gr , true );
	return true;
}