bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
	SafeBuf p;
	char getBuf[64]; // holds extra values for GET method
	char formBuf[256]; // holds extra values for forms
	snprintf(getBuf, 64, "c=%s", 
		 r->getString("c", 0, ""));
	snprintf(formBuf, 256, 
		 "<input type=hidden name=\"c\" value=\"%s\">",
		 //"<input type=hidden name=\"pwd\" value=\"%s\">",
		 r->getString("c", 0, ""));
	g_pages.printAdminTop( &p, s, r);
	
	if (r->getLong("cancel", 0) != 0) {
		g_thesaurus.cancelRebuild();
		p.safePrintf("<br><br>\n");
		p.safePrintf(
		  "<center><b><font color=#ff0000>"
		  "rebuild canceled"
		  "</font></b></center>");
	}

	if (r->getLong("rebuild", 0) != 0) {
		bool full = r->getLong("full", 0);
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.rebuild(0, full)) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error starting rebuild, check log for details"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "rebuild started"
			  "</font></b></center>");
		}
	}
	
	if (r->getLong("rebuildaff", 0) != 0) {
		bool full = r->getLong("full", 0);
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.rebuildAffinity(0, full)) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error starting rebuild, check log for details"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "rebuild started"
			  "</font></b></center>");
		}
	}

	if (r->getLong("distribute", 0) != 0) {
		char cmd[1024];
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.m_affinityState) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "cannot distribute during rebuild"
			  "</font></b></center>");
		} else {
			for ( long i = 0; i < g_hostdb.getNumHosts() ; i++ ) {
				Host *h = g_hostdb.getHost(i);
				snprintf(cmd, 512,
					"rcp -r "
					"./dict/thesaurus.* "
					"%s:%s/dict/ &",
					iptoa(h->m_ip),
					h->m_dir);
				log(LOG_INFO, "admin: %s", cmd);
				system( cmd );
			}
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "data distributed"
			  "</font></b></center>");
		}	
	}

	if (r->getLong("reload", 0) != 0) {
		p.safePrintf("<br><br>\n");
		if (r->getLong("cast", 0) != 0) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "reload command broadcast"
			  "</font></b></center>");
		} else if (g_thesaurus.init()) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "thesaurus data reloaded"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error reloading thesaurus data"
			  "</font></b></center>");
		}
	}

	long manualAddLen = 0;
	char *manualAdd = NULL;
	SafeBuf manualAddBuf;
	if ((manualAdd = r->getString("manualadd", &manualAddLen))) {
		trimWhite(manualAdd);
		manualAddLen = gbstrlen(manualAdd);
		File manualFile;
		manualFile.set(g_hostdb.m_dir, "dict/thesaurus-manual.txt");
		if (manualFile.open(O_WRONLY | O_CREAT | O_TRUNC) &&
			(manualFile.write(manualAdd, manualAddLen, 0) ==
			 manualAddLen)) {
			char newl = '\n'; // for write()
			if (manualAdd[manualAddLen-1] != '\n')
				manualFile.write(&newl, 1, manualAddLen);
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "updated manual add file sucessfully"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error writing manual add file"
			  "</font></b></center>");
		}
	} else {
		char ff[PATH_MAX];
		snprintf(ff, PATH_MAX, "%sdict/thesaurus-manual.txt",
			g_hostdb.m_dir);
		if (manualAddBuf.fillFromFile(ff)) {
			if (*(manualAddBuf.getBuf()-1) != '\n')
				manualAddBuf.pushChar('\n');
			manualAdd = manualAddBuf.getBufStart();
			manualAddLen = manualAddBuf.length();
		}
	}

	long affinityAddLen = 0;
	char *affinityAdd = NULL;
	SafeBuf affinityAddBuf;
	if ((affinityAdd = r->getString("affinityadd", &affinityAddLen))) {
		trimWhite(affinityAdd);
		affinityAddLen = gbstrlen(affinityAdd);
		File affinityFile;
		affinityFile.set(g_hostdb.m_dir, 
			"dict/thesaurus-affinity.txt");
		if (affinityFile.open(O_WRONLY | O_CREAT | O_TRUNC) &&
			(affinityFile.write(affinityAdd, affinityAddLen, 0) ==
			 affinityAddLen)) {
			char newl = '\n'; // for write()
			if (affinityAdd[affinityAddLen-1] != '\n')
				affinityFile.write(&newl, 1, affinityAddLen);
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "updated affinity add file sucessfully"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error writing affinity add file"
			  "</font></b></center>");
		}
	} else {
		char ff[PATH_MAX];
		snprintf(ff, PATH_MAX, "%sdict/thesaurus-affinity.txt",
			g_hostdb.m_dir);
		if (affinityAddBuf.fillFromFile(ff)) {
			if (*(affinityAddBuf.getBuf()-1) != '\n')
				affinityAddBuf.pushChar('\n');
			affinityAdd = affinityAddBuf.getBufStart();
			affinityAddLen = affinityAddBuf.length();
		}
	}
	

	char *syn = r->getString("synonym");
	long len = 0;
	if (syn) len = gbstrlen(syn);

	if (len) {
		SynonymInfo info;
		bool r = g_thesaurus.getAllInfo(syn, &info, len, SYNBIT_ALL);
		p.safePrintf("<br><br>\n");
		p.safePrintf ( 
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Synonym List (%ld)</b></center>"
		  "</td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE, info.m_numSyns);
		if (r) {
			p.safePrintf("<tr>"
			  "<td align=right><tt>%s</tt></td>"
			  "<td align=left>"
			  "<tt>1.000/%08lX (1.000/%08lX)</tt>"
			  "</td>"
			  "</tr>\n", syn, MAX_AFFINITY, MAX_AFFINITY);
			for (long i = 0; i < info.m_numSyns; i++) {
				// get the reverse affinity as well
				long aff = g_thesaurus.getAffinity(
					info.m_syn[i], syn,
					info.m_len[i], len);
				p.safePrintf( 
				  "<tr>"
				  "<td width=40%% align=right>"
				  "<tt>");
				p.safeMemcpy(info.m_syn[i], info.m_len[i]);
				p.safePrintf("</tt>"
				  "</td>"
				  "<td width=60%% align=left>"
				  "<tt>");
				if (info.m_affinity[i] >= 0) {
					p.safePrintf("%0.3f/%08lX ",
				  	  (float)info.m_affinity[i] 
					  	/ MAX_AFFINITY,
					  info.m_affinity[i]);
				} else {
					p.safePrintf("u ");
				}
				if (aff >= 0) {
					p.safePrintf("(%0.3f/%08lX) ",
					  (float)aff / MAX_AFFINITY, 
					  aff);
				} else {
					p.safePrintf("(u) ");
				}
				p.safePrintf("(%ld) (%ld) (%ld) (%ld) "
					     "(%lld) (%lld)",
				  (long)info.m_type[i], (long)info.m_sort[i],
				  info.m_firstId[i], info.m_lastId[i],
				  info.m_leftSynHash[i], 
				  info.m_rightSynHash[i]);
				for (int j = info.m_firstId[i]; 
					j <= info.m_lastId[i];
					j++) {
					p.safePrintf(" (%lld)",
						info.m_termId[j]);
				}
				p.safePrintf(
				  "</tt>"
				  "</td>"
				  "</tr>\n");
			}
			p.safePrintf("</table>");
		} else {
			p.safePrintf("<tr>"
			  "<td align=center><font color=#FF0000>"
			  "synonym not found: %s"
			  "</font></td>"
			  "</tr>\n",
			  syn);
		}
	}

	p.safePrintf ( "<br><br>\n" );

	p.safePrintf ( 
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Thesaurus Controls"
		  "</b></center></td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE);
	
	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>rebuild all data</b><br>"
		  "<font size=1>"
		  "rebuilds synonyms and then begins the rebuild process for "
		  "affinity data; this should only be run on one host, as the "
		  "data is copied when the process is finished; full rebuild "
		  "does not use existing affinity data"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?rebuild=1&%s\">"
		  "rebuild all data</a> <a href=\"/master/thesaurus?"
		  "rebuild=1&full=1&%s\">(full)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf, getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>distribute data</b><br>"
		  "<font size=1>"
		  "distributes all thesaurus data to all hosts, this is "
		  "normally done automatically but if there was a problem "
		  "with the copy, this lets you do it manually"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?distribute=1&%s\">"
		  "distribute data</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>reload data</b><br>"
		  "<font size=1>"
		  "reloads the synonyms and affinity table on this host only"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b>"
		  "<a href=\"/master/thesaurus?reload=1&cast=0&%s\">"
		  "reload data</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>reload data (all hosts)</b><br>"
		  "<font size=1>"
		  "reloads the synonyms and affinity table on all hosts"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b>"
		  "<a href=\"/master/thesaurus?reload=1&cast=1&%s\">"
		  "reload data (all hosts)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>list synonyms</b><br>"
		  "<font size=1>"
		  "enter a word here to list all synonym entries and their "
		  "affinities"
		  "</font>"
		  "</td>"
		  "<td width=12%%>"
		  "<form action=\"/master/thesaurus>\">"
		  "<input type=text name=synonym size=20>"
		  "<input type=submit value=Submit>"
		  "%s"
		  "</form></td>"
		  "</tr>\n", formBuf);
		
	p.safePrintf (
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Affinity Controls"
		  "</b></center></td>"
		  "</tr>\n",
		  DARK_BLUE);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>cancel running rebuild</b><br>"
		  "<font size=1>"
		  "cancels the rebuild and throws all intermediate data away"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?cancel=1&%s\">"
		  "cancel running rebuild</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);
	
	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>rebuild affinity only</b><br>"
		  "<font size=1>"
		  "begins the rebuild process for affinity data, has no "
		  "effect if a rebuild is already in progress; full rebuild "
		  "does not reuse existing affinity data"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?rebuildaff=1&%s\">"
		  "rebuild affinity</a> <a href=\"/master/thesaurus?"
		  "rebuildaff=1&full=1&%s\">(full)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf, getBuf);
	
	p.safePrintf (
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Manual File Controls"
		  "</b></td>"
		  "</tr>\n",
		  DARK_BLUE);

	p.safePrintf (
		  "<tr>"
		  "<td align=center colspan=2>");
	
	p.safePrintf(
		  "<b>manually added pairs</b><br>\n"
		  "<font size=1>place word pairs here that should be linked "
		  "as synonyms, one pair per line, seperated by a pipe '|' "
		  "character, optionally followed by another pipe and a type "
		  "designation; any badly formatted lines will be silently "
		  "ignored</font><br>\n"
		  "<form action=\"/master/thesaurus\" method=post>"
		  "<textarea name=\"manualadd\" rows=20 cols=80>");

	if (manualAdd && manualAddLen) {
		p.htmlEncode(manualAdd, manualAddLen, true);
	}
	
	p.safePrintf (
		  "</textarea><br>"
		  "<input type=submit value=Submit>"
		  "<input type=reset value=Reset>"
		  "%s"
		  "</form></td>"
		  "</tr>\n",
		  formBuf);

	
	p.safePrintf (
		  "<tr>"
		  "<td align=center colspan=2>"
		  "<b>affinity value overrides</b><br>\n"
		  "<font size=1>place word/phrase pairs here that should have "
		  "there affinity values overridden, format is "
		  "\"word1|word2|value\", where value is a floating point, "
		  "integer (either decimal or hex), or the word \"max\"; "
		  "any badly formatted lines will be silently ignored; note "
		  "that these pairs will only work if the thesaurus otherwise "
		  "has an entry for them, so add them to the manual add file "
		  "above if need be</font><br>\n"
		  "<form action=\"/master/thesaurus\" method=post>"
		  "<textarea name=\"affinityadd\" rows=20 cols=80>");

	if (affinityAdd && affinityAddLen) {
		p.htmlEncode(affinityAdd, affinityAddLen, true);
	}
	
	p.safePrintf (
		  "</textarea><br>"
		  "<input type=submit value=Submit>"
		  "<input type=reset value=Reset>"
		  "%s"
		  "</form></td>"
		  "</tr>\n", 
		  formBuf);


	p.safePrintf ( "</table>\n" );
	p.safePrintf ( "<br><br>\n" );

	p.safePrintf (
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Affinity Builder Status"
		  "</b></td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE);

	long long a, b, c, d, e, f, g, h, i, j, k;
	StateAffinity *aff = g_thesaurus.m_affinityState;
	if (!aff) {
		p.safePrintf (
		  "<tr><td colspan=2>"
		  "<center><b>Not running</b></center>"
		  "</td></tr>\n");
		a = b = c = d = e = f = g = h = i = j = k = 0;
	} else {
		a = aff->m_oldTable->getNumSlotsUsed();
		b = aff->m_oldTable->getNumSlotsUsed() - aff->m_n;
		c = aff->m_n;
		d = (gettimeofdayInMilliseconds() - aff->m_time) / 1000;
		if (!d || !(c / d)) { 
			e = 0;
		} else {
			e = b / (c / d);
		}
		f = aff->m_sent;
		g = aff->m_recv;
		h = aff->m_errors;
		i = aff->m_old;
		j = aff->m_cache;
		k = aff->m_hitsTable.getNumSlotsUsed();
	}
	p.safePrintf (
		  "<tr><td><b># of total pairs</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of pairs remaining</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of pairs processed</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>elapsed time in seconds</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>estimated remaining time in seconds</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of requests sent</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of requests received</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of request errors</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of old values reused</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of cache hits</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>cache size</b></td>"
		  "<td>%lli</td></tr>\n",
		  a, b, c, d, e, f, g, h, i, j, k);
	p.safePrintf ( "</table>\n" );

	return g_httpServer.sendDynamicPage ( s, p.getBufStart(), p.length() );
}
// . sets m_fileOffset and m_bf
// . returns false and sets g_errno on error
// . returns false if nothing to read too... but does not set g_errno
bool ImportState::setCurrentTitleFileAndOffset ( ) {

	// leave m_bf and m_fileOffset alone if there is more to read
	if ( m_fileOffset < m_bfFileSize )
		return true;

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if ( ! cr ) return false;

	log("import: import finding next file");
	
	// if ( m_offIsValid ) {
	// 	//*off = m_fileOffset;
	// 	return &m_bf; 
	// }
	//m_offIsValid = true;

	// look for titledb0001.dat etc. files in the 
	// workingDir/inject/ subdir
	SafeBuf ddd;
	ddd.safePrintf("%sinject",cr->m_importDir.getBufStart());
	// now use the one provided. we should also provide the # of threads
	if ( cr->m_importDir.getBufStart() && 
	     cr->m_importDir.getBufStart()[0] ) {
		ddd.reset();
		ddd.safeStrcpy ( cr->m_importDir.getBufStart() );
	}

	//
	// assume we are the first filename
	// set s_fileId to the minimum
	//
	Dir dir;
	dir.set(ddd.getBufStart());

	if ( ! dir.open() ) return false;

	// assume none
	long minFileId = -1;

	// getNextFilename() writes into this
	char pattern[64]; strcpy ( pattern , "titledb*" );
	char *filename;
	while ( ( filename = dir.getNextFilename ( pattern ) ) ) {
		// filename must be a certain length
		long filenameLen = gbstrlen(filename);
		// we need at least "titledb0001.dat"
		if ( filenameLen < 15 ) continue;
		// ensure filename starts w/ our m_dbname
		if ( strncmp ( filename , "titledb", 7 ) != 0 )
			continue;
		// skip if not .dat file
		if ( ! strstr ( filename , ".dat" ) )
			continue;
		// then a 4 digit number should follow
		char *s = filename + 7;
		if ( ! isdigit(*(s+0)) ) continue;
		if ( ! isdigit(*(s+1)) ) continue;
		if ( ! isdigit(*(s+2)) ) continue;
		if ( ! isdigit(*(s+3)) ) continue;
		// convert digit to id
		long id = atol(s);
		// . do not accept files we've already processed
		// . -1 means we haven't processed any yet
		if ( m_bfFileId >= 0 && id <= m_bfFileId ) continue;
		// the min of those we haven't yet processed/injected
		if ( id < minFileId || minFileId < 0 ) minFileId = id;
	}

	// get where we left off
	if ( ! m_loadedPlaceHolder ) {
		// read where we left off from file if possible
		char fname[256];
		sprintf(fname,"%slasttitledbinjectinfo.dat",g_hostdb.m_dir);
		SafeBuf ff;
		ff.fillFromFile(fname);
		if ( ff.length() > 1 ) {
			m_loadedPlaceHolder = true;
			// get the placeholder
			sscanf ( ff.getBufStart() 
				 , "%llu,%lu"
				 , &m_fileOffset
				 , &minFileId
				 );
		}
	}

	// if no files! return false to indicate we are done
	if ( minFileId == -1 ) return false;

	// set up s_bf then
	//if ( m_bfFileId != minFileId ) {
	SafeBuf tmp;
	tmp.safePrintf("titledb%04li-000.dat"
		       //,dir.getDirname()
		       ,minFileId);
	m_bf.set ( dir.getDirname() ,tmp.getBufStart() );
	if ( ! m_bf.open( O_RDONLY ) ) {
		log("inject: import: could not open %s%s for reading",
		    dir.getDirname(),tmp.getBufStart());
		return false;
	}
	m_bfFileId = minFileId;
	// reset ptr into file
	//*off = 0;
	// and set this
	m_bfFileSize = m_bf.getFileSize();

	m_fileOffset = 0;
	//}

	log("import: importing from file %s",m_bf.getFilename());

	return true;//&m_bf;
}
Пример #3
0
// this should be called when all docs have finished spidering
void Test::stopIt ( ) {

	// sanity
	if ( m_isAdding ) { char *xx=NULL;*xx=0; }
	// flag that we are done
	m_isRunning = false;

	// print time
	log("test: took %lli ms to complete injections.",
	    gettimeofdayInMilliseconds() - m_testStartTime );

	// get this before setting testParserEnabled to false
	char *testDir = g_test.getTestDir();

	// turn this off now too
	g_conf.m_testParserEnabled = false;
	g_conf.m_testSpiderEnabled = false;



	// save all!
	bool disabled = g_threads.m_disabled;
	g_threads.disableThreads();
	// save it blocking style
	g_process.save();
	if ( ! disabled ) g_threads.enableThreads();

	// save ips.txt
	saveTestBuf ( testDir );

	log("test: test completed. making qa.html");

	//
	//
	// NOW MAKE THE qa.html FILE
	//
	//

	// only analyze up to last 7 runs
	long start = m_runId - 7;
	if ( start < 0 ) start = 0;

	SafeBuf sb;
	sb.safePrintf("<table border=1>\n");
	sb.safePrintf("<tr>"
		      "<td><b><nobr>run id</nobr></b></td>"
		      "<td><b><nobr>conf diff</nobr></b></td>"
		      "<td><b><nobr>coll diff</nobr></b></td>"
		      "<td><b><nobr>run info</nobr></b></td>"
		      "</tr>\n");

	// take diffs between this run and the last run for confparms
	for ( long i = m_runId ; i > start ; i-- ) {
		// shortcut
		char *dir = g_hostdb.m_dir;
		// make diff filename
		char diff1[200];
		sprintf(diff1,"%s/%s/run.%li.confparms.txt.diff",dir,
			testDir,i);
		File f1;
		f1.set(diff1);
		if ( ! f1.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%li.confparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%li.confparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff1);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		long fs1 = f1.getFileSize();
		sb.safePrintf("<tr><td>%li</td><td>%li</td>", i,fs1);

		// make diff filename
		char diff2[200];
		sprintf(diff2,"%s/%s/run.%li.collparms.txt.diff",dir,
			testDir,i);
		File f2;
		f2.set(diff2);
		if ( ! f2.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%li.collparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%li.collparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff2);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		long fs2 = f2.getFileSize();
		sb.safePrintf("<td>%li</td>", fs2);

		// the version
		char vf[200]; 
		sprintf(vf,"%s/%s/run.%li.version.txt",dir,testDir,i);
		File f3; 
		f3.set ( vf );
		long fs3 = f3.getFileSize();
		char vbuf[1000];
		vbuf[0] = 0;
		if ( fs3 > 0 ) {
			f3.open(O_RDONLY);
			long rs = f3.read(vbuf,fs3,0);
			vbuf[fs3] = '\0';
			if ( rs <= 0 ) continue;
			f3.close();
		}
		// show it
		sb.safePrintf("<td><pre>%s</pre></td></tr>\n", vbuf);
	}
	sb.safePrintf("</table>\n");
	sb.safePrintf("<br>\n");


	//
	// now diff each parser output file for each url in urls.txt
	//


	//
	// loop over url buf first so we can print one table per url
	//

	char *next = NULL;
	// reset the url buf ptr
	m_urlPtr = m_urlBuf;
	// count em
	long count = 0;

	// ptrs to each url table
	long  un = 0;
	long  uptr [5000]; // offsets now, not char ptr since buf gets reallocd
	char  udiff[5000];
	long  ulen [5000];
	long  uhits[5000]; // critical errors! validateOutput() choked!
	long  uunchecked[5000]; // events/addresses found but were not validatd
	long  umiss[5000];
	long  usort[5000];
	long  uevents[5000];
	SafeBuf tmp;

	long niceness = MAX_NICENESS;

	// advance to next url
	for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) {
		// breathe
		QUICKPOLL(niceness);
		// we converted all non-url chars into \0's so skip those!
		for ( ; m_urlPtr<m_urlEnd && !*m_urlPtr ; m_urlPtr++ );
		// breach check
		if ( m_urlPtr >= m_urlEnd ) break;
		// set this up
		next = m_urlPtr;
		// compute next url ptr
		for ( ; next < m_urlEnd && *next ; next++ );
		// point to this url
		char *u = m_urlPtr;
		// get hash
		long long h = hash64 ( u , gbstrlen(u) );
		// shortcut
		char *dir = g_hostdb.m_dir;


		// print into a secondary safe buf with a ptr to
		// it so we can sort that and transfer into the
		// primary safebuf later
		uptr[un] = tmp.length();
		// assume no diff
		udiff[un] = 0;

		// print number
		tmp.safePrintf("%li) ",count++);
		// . link to our stored http server reply
		// . TODO: link it to our [cached] copy in the test coll!!!
		char local[1200];
		sprintf(local,"/%s/doc.%llu.html",testDir,h);
		tmp.safePrintf("<a href=\"%s\"><b>%s</b></a> ",local,u);
		// link to live page
		tmp.safePrintf(" <a href=\"%s\">live</a> ",u);
		// link to page parser
		char ubuf[2000];
		urlEncode(ubuf,2000,u,gbstrlen(u),true);
		tmp.safePrintf(" <a href=\"/master/parser?c=test&"
			       "u=%s\">parser</a> ",ubuf);
		//tmp.safePrintf(" (%llu)",h);
		tmp.safePrintf("<br>\n");
		//tmp.safePrintf("<br>\n");
		tmp.safePrintf("<table border=1>\n");
		tmp.safePrintf("<tr>"
			      "<td><b><nobr>run id</nobr></b></td>"
			      "<td><b><nobr>crit hits</nobr></b></td>"
			      "<td><b><nobr>crit errors</nobr></b></td>"
			      "<td><b><nobr># e</nobr></b></td>"
			      "<td><b><nobr>unchecked</nobr></b></td>"
			      "<td><b><nobr>diff chars</nobr></b></td>"
			      "<td><b><nobr>diff file</nobr></b></td>"
			      "<td><b><nobr>full output</nobr></b></td>"
			      "</tr>\n");

		//SafeBuf sd;

		// loop over all the runs now, starting with latest run first
		for ( long ri = m_runId ; ri >= start ; ri-- ) {

			QUICKPOLL(niceness);

			// the diff filename
			char pdiff[200];
			sprintf(pdiff,"%s/%s/parse.%llu.%li.html.diff",dir,
				testDir,h,ri);
			File f;
			f.set(pdiff);
			long fs = f.getFileSize();
			if ( ! f.doesExist() && ri > 0 ) {
				// make the parse filename
				char pbuf1[200];
				char pbuf2[200];
				sprintf(pbuf1,"%s/%s/parse.%llu.%li.html",
					dir,testDir,h,ri);
				sprintf(pbuf2,"%s/%s/parse.%llu.%li.html",
					dir,testDir,h,ri-1);
				// sanity check
				//File tf; tf.set(pbuf1);
				//if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;}
				// tmp file name
				char tmp1[200];
				char tmp2[200];
				sprintf(tmp1,"%s/%s/t1.html",dir,testDir);
				sprintf(tmp2,"%s/%s/t2.html",dir,testDir);
				// filter first
				char cmd[600];
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf1,tmp1);
				system(cmd);
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf2,tmp2);
				system(cmd);
				// make the system cmd to do the diff
				sprintf(cmd,
					"echo \"<pre>\" > %s ; "
					"diff -w --text %s %s "
					// ignore this table header row
					//" | grep -v \"R#4\""
					" >> %s",
					pdiff,
					tmp1,tmp2,pdiff);
				log("test: system(\"%s\")",cmd);
				system(cmd);
				// try again
				f.set(pdiff);
				fs = f.getFileSize();
			}

			QUICKPOLL(niceness);

			// this means 0 . it just has the <pre> tag in it!
			if ( fs < 0 || fs == 6 ) fs = 0;
			// . if no diff and NOT current run, do not print it
			// . print it if the run right before the current 
			//   now always too
			if ( ri != m_runId && ri != m_runId-1 && fs == 0 ) 
				continue;
			// relative filename
			char rel[200];
			sprintf(rel,"/%s/parse.%llu.%li.html.diff",
				testDir,h,ri);
			char full[200];
			sprintf(full,"/%s/parse.%llu.%li.html",
				testDir,h,ri);
			char validate[200];
			sprintf(validate,
				"/%s/parse-shortdisplay.%llu.%li.html",
				testDir,h,ri);
			// use red font for current run that has a diff!
			char *t1 = "";
			char *t2 = "";
			if ( ri == m_runId && fs != 0 ) {
				t1 = "<font color=pink><b>";
				t2 = "</b></font>";
				// a diff
				udiff[un] = 1;
			}

			// . get critical errors
			// . i.e. XmlDoc::validateOutput() could not validate
			//   a particular event or address that was in the
			//   url's "validated.uh64.txt" file since the admin
			//   clicked on the checkbox in the page parser output
			// . if we do not find such a tag in the parser output
			//   any more then Spider.cpp creates this file!
			if ( ri == m_runId ) {
				char cfile[256];
				sprintf(cfile,"%s/%s/critical.%llu.%li.txt",
					g_hostdb.m_dir,testDir,h,ri);
				SafeBuf ttt;
				ttt.fillFromFile(cfile);
				// first long is misses, then hits then events
				umiss[un] = 0;
				uhits[un] = 0;
				uevents[un] = 0;
				uunchecked[un] = 0;
				if ( ttt.length() >= 3 )
					sscanf(ttt.getBufStart(),
					       "%li %li %li %li",
					       &umiss[un],
					       &uhits[un],
					       &uevents[un],
					       &uunchecked[un]);
				usort[un] = umiss[un] + uunchecked[un];
				//File cf;
				//cf.set(cfile);
				//if ( cf.doesExist()) ucrit[un] = 1;
				//else                 ucrit[un] = 0;
			}

			// more critical?
			if ( ri == m_runId && umiss[un] != 0 ) {
				t1 = "<font color=red><b>";
				t2 = "</b></font>";
			}

			// . these are good to have
			// . if you don't have 1+ critical hits then you
			//   probably need to be validate by the qa guy
			char *uhb1 = "";
			char *uhb2 = "";
			if ( ri == m_runId && uhits[un] != 0 ) {
				uhb1 = "<font color=green><b>**";
				uhb2 = "**</b></font>";
			}

			QUICKPOLL(niceness);

			char *e1 = "<td>";
			char *e2 = "</td>";
			long ne = uevents[un];
			if ( ne ) { 
				e1="<td bgcolor=orange><b><font color=brown>"; 
				e2="</font></b></td>"; 
			}
			char *u1 = "<td>";
			char *u2 = "</td>";
			if ( uunchecked[un] ) {
				u1="<td bgcolor=purple><b><font color=white>"; 
				u2="</font></b></td>"; 
			}
				
			// print the row!
			tmp.safePrintf("<tr>"
				      "<td>%s%li%s</td>"
				       "<td>%s%li%s</td>" // critical hits
				       "<td>%s%li%s</td>" // critical misses
				       "%s%li%s" // # events
				       "%s%li%s" // unchecked
				       "<td>%s%li%s</td>" // filesize of diff
				      // diff filename
				      "<td><a href=\"%s\">%s%s%s</a></td>"
				      // full parser output
				      "<td>"
				       "<a href=\"%s\">full</a> | "
				       "<a href=\"%s\">validate</a> "
				       "</td>"
				      "</tr>\n",
				      t1,ri,t2,
				      uhb1,uhits[un],uhb2,
				      t1,umiss[un],t2,
				      e1,ne,e2,
				       u1,uunchecked[un],u2,
				      t1,fs,t2,
				      rel,t1,rel,t2,
				      full,
				       validate);


			// only fill "sd" for the most recent guy
			if ( ri != m_runId ) continue;

			// now concatenate the parse-shortdisplay file
			// to this little table so qa admin can check/uncheck
			// validation checkboxes for addresses and events
			//sprintf(cfile,
			//	"%s/test/parse-shortdisplay.%llu.%li.html",
			//	g_hostdb.m_dir,h,ri);
			//sd.fillFromFile ( cfile );
		}
		// end table
		tmp.safePrintf("</table>\n");

		// . and a separate little section for the checkboxes
		// . should already be in tables, etc.
		// . each checkbox should provide its own uh64 when it
		//   calls senddiv() when clicked now
		//tmp.cat ( sd );

		tmp.safePrintf("<br>\n");
		tmp.safePrintf("<br>\n");
		// set this
		ulen[un] = tmp.length() - uptr[un] ;
		// sanity check
		if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; }
		// inc it
		un++;
		// increase the 5000!!
		if ( un >= 5000 ) { char *xx=NULL; *xx=0; }
	}


	char flag ;
 bubble:
	flag = 0;
	// sort the url tables
	for ( long i = 0 ; i < un - 1 ; i++ ) {
		QUICKPOLL(niceness);
		if ( usort[i] >  usort[i+1] ) continue;
		if ( usort[i] == usort[i+1] ) 
			if ( udiff[i] >= udiff[i+1] ) continue;
		// swap em
		long  tp = uptr[i];
		long  td = udiff[i];
		long  um = umiss[i];
		long  us = usort[i];
		long  uh = uhits[i];
		long  tl = ulen [i];
		uptr[i] = uptr[i+1];
		umiss[i] = umiss[i+1];
		usort[i] = usort[i+1];
		uhits[i] = uhits[i+1];
		udiff[i] = udiff[i+1];
		ulen[i]  = ulen[i+1];
		uptr[i+1] = tp;
		umiss[i+1] = um;
		usort[i+1] = us;
		uhits[i+1] = uh;
		udiff[i+1] = td;
		ulen [i+1] = tl;
		flag = 1;
	}
	if ( flag ) goto bubble;

	// transfer into primary safe buf now
	for ( long i = 0 ; i < un ; i++ ) 
		sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]);


	sb.safePrintf("</html>\n");

	char dfile[200];
	sprintf(dfile,"%s/%s/qa.html",g_hostdb.m_dir,testDir);
	sb.dumpToFile ( dfile );

	// free the buffer of urls
	reset();

	// turn off spiders
	g_conf.m_spideringEnabled = 0;

	// all done
	return;
}