C++ (Cpp) SafeBuf Examples

Programming Language: C++ (Cpp)

Class/Type: SafeBuf

Examples at hotexamples.com: 30

C++ (Cpp) SafeBuf - 30 examples found. These are the top rated real world C++ (Cpp) examples of SafeBuf extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

safePrintf(30)

getBufStart(30)

safeMemcpy(12)

urlEncode(8)

nullTerm(8)

reserve(7)

pushChar(5)

reset(5)

getBuf(4)

fillFromFile(3)

safeStrcpy(3)

load(3)

pushLong(3)

htmlEncode(3)

safeTruncateEllipsis(3)

detachBuf(2)

getBufPtr(2)

replaceChar(1)

save(1)

reserve2x(1)

setLength(1)

stealBuf(1)

cdataEncode(1)

pushFloat(1)

removeLastChar(1)

pushLongLong(1)

purge(1)

base64Encode(1)

length(1)

jsonEncode(1)

incrementLength(1)

htmlEncodeXmlTags(1)

getLength(1)

getCapacity(1)

dumpToFile(1)

getBufEnd(1)

base64Decode(1)

Example #1

Show file

File: AutoBan.cpp Project: RevBooyah/open-source-search-engine

// Removes the tokenLen-byte token that begins at tokenStart from the
// NUL-terminated list containing it, by sliding the remainder of the list
// (terminator included) down on top of the token.
// The copy stops as soon as the terminator has been moved, so nothing past
// the end of the list is read.
// Assumes tokenStart points at a complete tokenLen-byte token inside the
// list (presumably what findToken() hands back -- verify against findToken).
static void removeTokenInPlace ( char *tokenStart , long tokenLen ) {
	char       *to   = tokenStart;
	const char *from = tokenStart + tokenLen;
	while ( ( *to++ = *from++ ) != '\0' ) ;
}

// . builds the autoban admin page in a SafeBuf and sends it on socket "s"
// . before printing the lower tables it applies the edits carried by the
//   request "r":
//     resetcodes              -> setCodesFromConf()
//     clear=<ip>              -> drop the ip from the table and both conf lists
//     allow=<ip> / deny=<ip>  -> add the ip to one conf list, drop from other
//     validCodes              -> replace g_conf.m_validCodes
//     banIps / allowIps       -> replace the corresponding conf list wholesale
// . "longview" selects whether the watched-ips tables are printed at all and
//   "showAllIps" is the minimum day count for a row in "Queries Today"
// . returns what g_httpServer.sendDynamicPage() (or sendErrorReply() when
//   the index array cannot be allocated) returns
// . NOTE(review): "coll" comes straight from the request and is printed into
//   links without encoding -- confirm whether it needs url/html encoding
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
	SafeBuf sb(512 * 512,"autobbuf");
	//read in all of the possible cgi parms off the bat:
	//long  user     = g_pages.getUserType( s , r );
	char *username = g_users.getUsername(r);
	//char *pwd  = r->getString ("pwd");

	char *coll = r->getString ("c");

	long banIpsLen;
	char *banIps = r->getString ("banIps" , &banIpsLen , NULL);

	long allowIpsLen;
	char *allowIps = r->getString ("allowIps" , &allowIpsLen , NULL);

	long clearLen;
	char *clear = r->getString ("clear" , &clearLen , NULL);

	// set whenever one of the conf ip lists was edited; triggers a
	// re-parse of the lists further down
	bool changed = false;

	long validCodesLen;
	char *validCodes = r->getString ("validCodes", &validCodesLen, NULL);

	long showAllIps = r->getLong("showAllIps", 0);
	long showLongView = r->getLong("longview", 0);

	// do it all from parm now
	//long banRegexLen;
	//char *banRegex = r->getString("banRegex", &banRegexLen, NULL);

	g_pages.printAdminTop ( &sb, PAGE_AUTOBAN, username,
				coll , NULL , s->m_ip );

	// MDW: moved to here

	long now = getTime();

	// scratch outputs for getCalendarFromMs()
	long days;
	long hours;
	long minutes;
	long secs;
	long msecs;

	if(r->getLong("resetcodes", 0)) {
		setCodesFromConf();
	}

	//
	// "Code Usage" table: one row per occupied slot of m_ht
	//
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	getCalendarFromMs((now - m_codeResetTime) * 1000,
			  &days,
			  &hours,
			  &minutes,
			  &secs,
			  &msecs);
	sb.safePrintf("<tr><td colspan=18 bgcolor=#%s>"
		      "<center><b>Code Usage "
		      "(<a href=\"/master/"
		      "autoban?c=%s&resetcodes=1\">reset</a> "
		      "%li days %li hours %li "
		      "minutes %li sec ago)"
		      "</b></center></td></tr>",
		      DARK_BLUE,
		      coll,
		      days,
		      hours,
		      minutes,
		      secs);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>Code</b></center></td>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Query Count</b></center></td>"

		      "<td><center><b>Bytes Read</b></center></td>"
		      "<td><center><b>Bytes Sent</b></center></td>"

		      "<td><center><b>Outstanding Count</b></center></td>"
		      "<td><center><b>Most Ever Outstanding</b></center></td>"
		      "<td><center><b>Max Outstanding</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);

	for(long i = 0; i < m_ht.getNumSlots(); i++) {
		// a zero key marks an empty slot
		if ( m_ht.getKey ( i ) == 0 ) continue;
		CodeVal *cv = m_ht.getValuePointerFromSlot ( i );
		if ( ! cv ) continue;

		sb.safePrintf("<tr>");
		sb.safePrintf("<td>");
		sb.copyToken(cv->m_code);//m_codeVals[i].m_code);
		sb.safePrintf("</td>");
		sb.safePrintf("<td><center>%s</center> </td>",
			      iptoa(cv->m_ip));
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_count);

		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_bytesRead);
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_bytesSent);

		sb.safePrintf("<td><center>%li</center></td>",
			      cv->m_outstanding);
		sb.safePrintf("<td><center>%li</center></td>",
			      cv->m_maxEver);
		// a limit other than 50 is printed in bold
		if ( cv->m_maxOutstanding != 50 )
			sb.safePrintf("<td><center><b>%li</b></center></td>",
				      cv->m_maxOutstanding);
		else
			sb.safePrintf("<td><center>%li</center></td>",
				      cv->m_maxOutstanding);

		sb.safePrintf("</tr>");
	}
	sb.safePrintf ("</table><br><br>\n" );

	//
	// clear=<ip>: forget the ip and strip it from both conf lists
	//
	if(clear && clearLen < 64) {
		long ip = atoip(clear, clearLen);
		if(ip) {
			removeIp(ip);
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, clear, clearLen);
			ipbuf[clearLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      clearLen);
			if(beginning)
				removeTokenInPlace ( beginning , clearLen );
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      clearLen);
			if(beginning)
				removeTokenInPlace ( beginning , clearLen );
			changed = true;
		}
	}

	//
	// allow=<ip>: append to the allow list, strip from the ban list
	//
	long allowLen;
	char *allow = r->getString ( "allow" , &allowLen , NULL );
	if(allow && allowLen < 64) {
		long ip = atoip(allow, allowLen);

		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, allow, allowLen);
			ipbuf[allowLen] = '\0';
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      allowLen);
			if(!beginning) {
				//its not present, so add it.
				char *p = g_conf.m_allowIps;
				while(*p) p++;
				// room for '\n' + ip + terminator?
				if(p - g_conf.m_allowIps + allowLen + 2
				   < AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,allowLen);
					*(p + allowLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>"
						      "Not enough stack space "
						      "to fit allowIps.  "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>",
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_allowIps +
						      allowLen + 2);
					// list is full: leave the ban list
					// and "changed" untouched
					goto dontRemove1;
				}
			}
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      allowLen);
			//remove it from banned if present.
			if(beginning)
				removeTokenInPlace ( beginning , allowLen );

			changed = true;
		}
	}
 dontRemove1:
	//
	// deny=<ip>: append to the ban list, strip from the allow list
	//
	long denyLen;
	char *deny = r->getString ( "deny" , &denyLen , NULL );
	if(deny && denyLen < 64) {
		long ip = atoip(deny, denyLen);

		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, deny, denyLen);
			ipbuf[denyLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf, denyLen);
			if(!beginning) {
				//its not present, so add it.
				char *p =g_conf.m_banIps;
				while(*p) p++;
				// room for '\n' + ip + terminator?
				if(p - g_conf.m_banIps + denyLen + 2 <
				   AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,denyLen);
					*(p + denyLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>Not "
						      "enough stack space "
						      "to fit bannedIPs.  "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>",
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_banIps +
						      denyLen + 2);
					// list is full: leave the allow list
					// and "changed" untouched
					goto dontRemove2;
				}
			}
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      denyLen);
			//remove it from allowed list if present.
			if(beginning)
				removeTokenInPlace ( beginning , denyLen );
			changed = true;
		}
	}
 dontRemove2:

	if(!g_conf.m_doAutoBan) {
		sb.safePrintf("<center><font color=red><b>Autoban is disabled, "
			      "turn it on in Master Controls.</b></font></center><br>");
	}

	//
	// validCodes: replace the conf copy if it fits, then re-parse it
	//
	if(validCodes) {
		if(validCodesLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit codes.  "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      validCodesLen);
			validCodes = NULL;
			validCodesLen = 0;
		}
		else {
			memcpy(g_conf.m_validCodes, validCodes, validCodesLen);
			g_conf.m_validCodes[validCodesLen] = '\0';
			trimWhite(g_conf.m_validCodes);
			setCodesFromConf();
		}
	}

	//first remove all of the ips in the conf, then add the passed in
	//  ones to the conf parm;
	if (banIps) {
		//ack, the browser puts in crlf when this comes back, so
		//we will have a longer string here than the one we sent
		//out. trim back all extraneous whitespace before we do
		//bounds checking.
		trimWhite(banIps);
		banIpsLen = gbstrlen(banIps);
		if(banIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit bannedIps.  "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      banIpsLen);
			// truncate so the copy below stays inside the buffer
			banIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_banIps, banIps, banIpsLen);
		g_conf.m_banIps[banIpsLen] = '\0';
		changed = true;
	}
	if (allowIps) {
		trimWhite(allowIps);
		allowIpsLen = gbstrlen(allowIps);

		if(allowIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit allowIps.  "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      allowIpsLen);
			// truncate so the copy below stays inside the buffer
			allowIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_allowIps, allowIps, allowIpsLen);
		g_conf.m_allowIps[allowIpsLen] = '\0';
		changed = true;
	}
	// any edit above means the conf lists must be re-parsed
	if(changed) {
		trimWhite(g_conf.m_allowIps);
		trimWhite(g_conf.m_banIps);
		setFromConf();
	}

	//
	// "Add IPs" form
	//
	sb.safePrintf("\n<table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=2 bgcolor=#%s>"
		      "<center><b>Add IPs</b></center></td></tr>",
		      DARK_BLUE);

	g_parms.printParms (&sb, s, r);

	sb.safePrintf ("<tr><td>"
		       "<center>"
		       "<input type=submit value=\"Update\" "
		       "method=\"POST\" border=0>"
		       "</center></td></tr>");

	sb.safePrintf ("</table><br><br>\n" );

	// short view: just a link to the long view, then we are done.
	// nothing has been allocated yet so there is nothing to free.
	if(!showLongView) {
		sb.safePrintf("<b><a href=\"autoban"
			      "?c=%s"
			      "&showAllIps=%li"
			      "&longview=1\">Show watched ips table...</a></b>",
			      coll,
			      showAllIps);
		return g_httpServer.sendDynamicPage ( s ,
						      sb.getBufStart() ,
						      sb.length() ,
						      -1 ,
						      false);
	}

	/////////////////////////////////////////////////////////////////////

	//
	// "Watched Ips" table
	//
	sb.safePrintf("\n<table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);

	sb.safePrintf("<tr><td colspan=3 bgcolor=#%s>"
		      "<center><b>Watched Ips</b></center></td></tr>",
		      DARK_BLUE);

	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Description</b></center></td>"
		      //		      "<td><center><b>Time Added</b></center></td>"
		      "<td><center><b>Allow/Deny/Clear</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);

	// indices of the occupied slots of m_detectKeys, sorted with ip_cmp.
	// must be mfree()'d on every return path from here on.
	long *sortedIndices = (long*)mmalloc(m_tableSize * sizeof(long),
					     "AutoBanH");

	if(!sortedIndices) {
		return g_httpServer.sendErrorReply(s,500,mstrerror(ENOMEM));
	}

	long numEntries = 0;
	for(long i = 0; i < m_tableSize; i++) {
		if(m_detectKeys[i] == 0) continue;
		sortedIndices[numEntries++] = i;
	}
	// presumably ip_cmp reads the keys through this global -- verify
	SorterTable = m_detectKeys;

	gbsort(sortedIndices, numEntries, sizeof(long), ip_cmp);

	//lets put each class of watched ip in its own safebuf then cat
	//them together at the end.

	SafeBuf allowed;
	SafeBuf banned;
	SafeBuf feedLeachers;
	SafeBuf cowBots;
	SafeBuf *e;

	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		if(m_detectKeys[i] == 0) continue;
		//if(!(m_detectVals[i].m_flags & FROMCONF)) continue;
		const bool fromConf = m_detectVals[i].m_flags & FROMCONF;
		const bool isAllowed =
			( m_detectVals[i].m_flags & ALLOW ) && fromConf;
		// a DENY that came from the conf list is an explicit ban
		const bool explicitBan =
			( m_detectVals[i].m_flags & DENY ) && fromConf;
		unsigned short dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;

		bool day =    dayCount >= g_conf.m_numFreeQueriesPerDay;
		bool minute = minuteCount >= g_conf.m_numFreeQueriesPerMinute;

		const char *description;
		const char *color;

		if(isAllowed) {
			color = GREEN;
			description = "Allowed";
			e = &allowed;
		}
		else if(explicitBan) {
			color = RED;
			description = "Banned";
			e = &banned;
		}
		else if(minute) {
			color = RED;
			description = "Cow Bot";
			e = &cowBots;
		}
		else if(day) {
			color = RED;
			description = "Feed Leacher";
			e = &feedLeachers;
		}
		else {
			//this can happen when someone was banned due to
			//exceeding the quota, then the quota was lowered.

			m_detectVals[i].m_flags &= ~DENY;
			//log("autoban: ohshit-banning %s",iptoa(s->m_ip));
			continue;
		}

		e->safePrintf("<tr>");

		e->safePrintf("<td bgcolor=#%s><center>%s</center></td><td>"
			      "<center>%s</center></td>"

// 			      "<td><center>"
// 			      "%li days %li hrs %li min ago"
// 			      "</center></td>"

			      "<td><center><a href=\"/master/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">"
			      "allow/</a>"

			      "<a href=\"/master/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">"
			      "deny/</a>"

			      "<a href=\"/master/"
			      "autoban?c=%s&clear=%s&showAllIps=%li\">"
			      "clear</a></center>"
			      "</td>",color,
			      iptoa(m_detectKeys[i]),
			      description,

			      //      days,hours,minutes,

			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps,
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps,
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps);
		e->safePrintf("</tr>");
	}

	sb.cat(allowed);
	sb.cat(banned);
	sb.cat(feedLeachers);
	sb.cat(cowBots);

	sb.safePrintf ("</table><br><br>\n" );

	// MDW moved from here

	//
	// "Control Panel": links that pick the showAllIps threshold
	//
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);

	sb.safePrintf("<tr><td colspan=5 bgcolor=#%s>"
		      "<center><b>Control Panel</b></center></td></tr>",
		      DARK_BLUE);

	sb.safePrintf("<tr>"
		      "<td bgcolor=#%s><center><b>Show Ips by Number of Queries"
		      "</b></center></td>",
		      LIGHT_BLUE);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=0\">"
		      "0 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=1\">"
		      "1 Query</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=10\">"
		      "10 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=100\">"
		      "100 Queries</a></b>"
		      "</font></center></td></tr>",
		      coll);

	sb.safePrintf ("</table><br><br>\n");

	// showAllIps of 0 means the "Queries Today" table is not wanted
	if(!showAllIps) {

		char* ss = (char*) sb.getBufStart();
		long sslen = sb.length();
		mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");

		return g_httpServer.sendDynamicPage ( s , ss , sslen , -1 , false);
	}

	//
	// "Queries Today" table
	//
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);

	sb.safePrintf("<tr><td colspan=6 bgcolor=#%s>"
		      "<center><b>Queries Today</b></center></td></tr>",
		      DARK_BLUE);

	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Minute count</b></center></td>"
		      "<td><center><b>Day count</b></center></td>"
		      "<td><center><b>Time Until Reset</b></center></td>"
		      "<td><center><b>Times Banned</b></center></td>"
		      "<td><center><b>Allow/Deny</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);

	char minBuf[128];
	char dayBuf[128];
	// ip of the previous row masked with 0x00ffffff; a row whose masked
	// ip equals it gets its ip printed in bold
	unsigned long lastIpGroup = 0;
	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		long  dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;

		// for entries not set from the conf, show expired counters
		// as zero (the day count only if the ip is not denied)
		if(!(m_detectVals[i].m_flags & FROMCONF)) {
			if(m_detectVals[i].m_minuteExpires < now)
				minuteCount = 0;
			if(!(m_detectVals[i].m_flags & DENY) &&
			   m_detectVals[i].m_dayExpires < now)
				dayCount = 0;
		}
		//a hack:
		if( dayCount < showAllIps) continue;

		const char *color = YELLOW;

		if(m_detectVals[i].m_flags & ALLOW) {
			color = GREEN;
			snprintf(minBuf, sizeof(minBuf), "--");
			snprintf(dayBuf, sizeof(dayBuf), "%li", dayCount);
		}
		else if(m_detectVals[i].m_flags & DENY) {
			color = RED;
			snprintf(minBuf, sizeof(minBuf), "--");
			snprintf(dayBuf, sizeof(dayBuf), "%li", dayCount);
		}
		else {
			snprintf(minBuf, sizeof(minBuf), "%li",
				 (long)minuteCount);
			snprintf(dayBuf, sizeof(dayBuf), "%li",
				 (long)dayCount);
		}

		unsigned long thisIpGroup = (unsigned long)m_detectKeys[i] &
			0x00ffffff;
		const bool sameGroup = ( thisIpGroup == lastIpGroup );

		sb.safePrintf("<tr><center>");

		if(m_detectVals[i].m_flags & FROMCONF) {
			// conf entries never reset and have no ban count
			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>"
				      "<td><center><font color=red>"
				      "<b>NEVER</b>"
				      "</font></center></td>"
				      "<td><center>--</center></td>",
				      color,
				      sameGroup?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      sameGroup?"</b>":"",
				      minBuf,
				      dayBuf);
		}
		else {
			//they haven't done a query since being unbanned,
			//unban them now so we don't get negative resets displayed.
			/*
			  no, don't unban the bots!!! MDW yippy project
			if(m_detectVals[i].m_dayExpires < now) {
				m_detectVals[i].m_flags &= ~DENY;
				//log("autoban: dayexpire-unbanning %s",
				//    iptoa(ip));
				m_detectVals[i].m_dayExpires = now + ONE_DAY;
				m_detectVals[i].m_minuteExpires = now + 60;
				m_detectVals[i].m_dayCount = 0;
				m_detectVals[i].m_minuteCount = 0;
				sb.safePrintf("</center></tr>");
				continue;
			}
			*/

			// time left until the day counter expires
			getCalendarFromMs((m_detectVals[i].m_dayExpires - now)* 1000,
					  &days,
					  &hours,
					  &minutes,
					  &secs,
					  &msecs);

			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>"
				      "<td><center><font color=red>"
				      "<b>%li days %li hrs %li min %li sec</b>"
				      "</font></center></td>"
				      "<td><center>%i</center></td>",
				      color,
				      sameGroup?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      sameGroup?"</b>":"",
				      minBuf,
				      dayBuf,
				      days, hours, minutes, secs,
				      m_detectVals[i].m_timesBanned);
		}
		sb.safePrintf("<td><center>"
			      "<a href=\"/master/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">"
			      "allow/</a>"
			      "<a href=\"/master/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">"
			      "deny</a></center>"
			      "</td>",
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps,
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps);

		sb.safePrintf("</center></tr>");
		lastIpGroup = thisIpGroup;
	}

	sb.safePrintf ("</table><br><br>\n" );

	char* ss = (char*) sb.getBufStart();
	long sslen = sb.length();

	mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");

	return g_httpServer.sendDynamicPage ( s , ss , sslen , -1 , false);
}

Example #2

Show file

File: PageReindex.cpp Project: privacore/open-source-search-engine

void doneReindexing ( void *state ) {
	// cast it
	State13 *st = (State13 *)state;

	GigablastRequest *gr = &st->m_gr;

	// note it
	if ( gr->m_query && gr->m_query[0] )
		log(LOG_INFO,"admin: Done with query reindex. %s",
		    mstrerror(g_errno));

	////
	//
	// print the html page
	//
	/////

	HttpRequest *hr = &gr->m_hr;

	char format = hr->getReplyFormat();

	SafeBuf sb;

	const char *ct = "text/html";
	if ( format == FORMAT_JSON ) ct = "application/json";
	if ( format == FORMAT_XML  ) {
		ct = "text/xml";

		sb.safePrintf("<response>\n"
			      "\t<statusCode>0</statusCode>\n"
			      "\t<statusMsg>Success</statusMsg>\n"
			      "\t<matchingResults>%" PRId32"</matchingResults>\n"
			      "</response>"
			      , st->m_msg1c.m_numDocIdsAdded
			      );
		g_httpServer.sendDynamicPage ( gr->m_socket,
					       sb.getBufStart(),
					       sb.length(),
					       -1,
					       false,ct);
		mdelete ( st , sizeof(State13) , "PageTagdb" );
		delete (st);
		return;
	}

	if ( format == FORMAT_JSON ) {
		sb.safePrintf("{\"response\":{\n"
			      "\t\"statusCode\":0,\n"
			      "\t\"statusMsg\":\"Success\",\n"
			      "\t\"matchingResults\":%" PRId32"\n"
			      "}\n"
			      "}\n"
			      , st->m_msg1c.m_numDocIdsAdded
			      );
		g_httpServer.sendDynamicPage ( gr->m_socket,
					       sb.getBufStart(),
					       sb.length(),
					       -1,
					       false,ct);
		mdelete ( st , sizeof(State13) , "PageTagdb" );
		delete (st);
		return;
	}



	g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr );

	sb.safePrintf("<style>"
		       ".poo { background-color:#%s;}\n"
		       "</style>\n" ,
		       LIGHT_BLUE );


	//
	// print error msg if any
	//

	if ( gr->m_query && gr->m_query[0] && ! g_errno )
		sb.safePrintf ( "<center><font color=red><b>Success. "
			  "Added %" PRId32" docid(s) to "
			  "spider queue.</b></font></center><br>" , 
			  st->m_msg1c.m_numDocIdsAdded );

	if ( gr->m_query && gr->m_query[0] && g_errno )
		sb.safePrintf ( "<center><font color=red><b>Error. "
				 "%s</b></font></center><br>" , 
				 mstrerror(g_errno));


	// print the reindex interface
	g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr  );


	g_httpServer.sendDynamicPage ( gr->m_socket,
				       sb.getBufStart(),
				       sb.length(),
				       -1,
				       false);

	mdelete ( st , sizeof(State13) , "PageTagdb" );
	delete (st);
}

Example #3

Show file

File: Turkdb.cpp Project: DeadNumbers/open-source-search-engine

// . called once st->m_list holds the datedb list to scan
// . picks the first docid in the list that is not locked by a turk (locks
//   older than 3600 seconds are dropped and ignored), locks it for
//   st->m_user and starts loading its document through st->m_xd
// . when no docid is available it sends a "nothing to edit" page instead
void gotDatedbList ( State60 *st ) {

	// must only be run on host #0 since we need just one lock table
	// (the NULL write is a deliberate crash)
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

	// load turk lock table if we need to. the flag has to be static so
	// that the table is set up and loaded on the first call only; as a
	// plain local it was false on every call.
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();
	// shortcut
	RdbList *list = &st->m_list;
	// the best docid, 0 means none found
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// advance now so every "continue" below moves to the next rec
		list->skipCurrentRecord();
		// skip if negative (low bit of the first key byte is clear)
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// look for an existing lock on it
		TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
		// a lock held for more than an hour is considered stale
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			g_turkLocks.removeKey(&docid);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docid;
		break;
	}

	SafeBuf sb;

	// print description so they can click a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	// NOTE(review): the lock carries the user and time but nothing here
	// stores "best" in it, and it is added through g_lockTable rather
	// than g_turkLocks -- confirm how addLock() keys the entry.
	// NOTE(review): strcpy() is unbounded; confirm st->m_user always
	// fits in tt.m_user.
	TurkLock tt;
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go! (plain call: this function returns void)
	processLoop ( st );
}

Example #4

Show file

File: PageParser.cpp Project: BILObilo/open-source-search-engine

// . "state" is the State8 of a page-parser request
// . prints the document into st->m_xbuf (at most once per request, guarded
//   by st->m_donePrinting) and sends that buffer as the http reply
// . returns false if printDocForProCog() blocked, otherwise the result of
//   sendErrorReply() or sendDynamicPage()
// . frees the state after sending when st->m_freeIt is set
bool gotXmlDoc ( void *state ) {
    // cast it
    State8 *st = (State8 *)state;
    // get the xmldoc
    XmlDoc *xd = &st->m_xd;

    // if we loaded from old title rec, it should be there!

    // . save the ips.txt file if we are the test coll
    // . saveTestBuf() is a function in Msge1.cpp
    //if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "test"))
    //	// use same dir that XmlDoc::getTestDir() would use
    //	saveTestBuf ( "test-page-parser" );

    // error?
    if ( g_errno ) return sendErrorReply ( st , g_errno );

    // shortcut
    SafeBuf *xbuf = &st->m_xbuf;

    // print only when a url or a docid was given and we have not printed
    // already
    bool printIt = false;
    if ( st->m_u && st->m_u[0] ) printIt = true;
    if ( st->m_docId != -1LL ) printIt = true;
    if ( st->m_donePrinting ) printIt = false;

    // do not re-call this if printDocForProCog blocked... (check length())
    if ( printIt ) {
        // mark as done
        st->m_donePrinting = true;
        // always re-compute the page inlinks dynamically, do not
        // use the ptr_linkInfo1 stored in titlerec!!
        // NO! not if set from titlerec/docid
        if ( st->m_recompute )
            xd->m_linkInfo1Valid = false;
        // try a recompute regardless, because we do not store the
        // bad inlinkers, and ppl want to see why they are bad!
        //xd->m_linkInfo1Valid = false;
        // now get the meta list, in the process it will print out a
        // bunch of junk into st->m_xbuf
        //char *metalist = xd->getMetaList ( );
        //if ( ! metalist ) return sendErrorReply ( st , g_errno );
        // return false if it blocked
        //if ( metalist == (void *)-1 ) return false;
        // for debug...
        //if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false );
        // . print it out
        // . returns false if blocks, true otherwise
        // . sets g_errno on error
        if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) )
            return false;
        // error?
        if ( g_errno ) return sendErrorReply ( st , g_errno );
    }

    // the content-type argument is a NUL-terminated mime string, so hand
    // over real text rather than the address of a one-byte CT_* code.
    // writable arrays so this works whether the parameter is declared
    // "char *" or "const char *".
    static char s_htmlType[] = "text/html";
    static char s_xmlType [] = "text/xml";
    long  isXml = st->m_r.getLong("xml",0);
    char *ctype = isXml ? s_xmlType : s_htmlType;

    // now encapsulate it in html head/tail and send it off
    bool status = g_httpServer.sendDynamicPage( st->m_s ,
                  xbuf->getBufStart(),
                  xbuf->length() ,
                  -1, //cachtime
                  false ,//postreply?
                  ctype,
                  -1 , //httpstatus
                  NULL,//cookie
                  "utf-8");
    // delete the state now
    if ( st->m_freeIt ) {
        mdelete ( st , sizeof(State8) , "PageParser" );
        delete (st);
    }
    // return the status
    return status;
}

Example #5

Show file

File: PageAddUrl.cpp Project: lemire/open-source-search-engine

// . "state" is the GigablastRequest of an add-url request
// . logs the (truncated) urls that were added, then replies: json/xml
//   requests get sendSuccessReply(), everything else gets the html admin
//   page with a status message and the parm table
// . frees the request on both paths and returns the send call's result
// . NOTE(review): m_urlsBuf is request data and is printed into the html
//   page as-is -- confirm whether it needs html encoding
bool sendReply ( void *state ) {
	GigablastRequest *gr = (GigablastRequest *)state;

	// in order to see what sites are being added log it, then we can
	// more easily remove sites from sitesearch.gigablast.com that are
	// being added but not being searched
	SafeBuf xb;
	if ( gr->m_urlsBuf ) {
		xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
		log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) );
	}

	char format = gr->m_hr.getReplyFormat();
	// keep the socket; gr is deleted before the html page is sent
	TcpSocket *sock = gr->m_socket;

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		bool status = g_httpServer.sendSuccessReply ( gr );
		// nuke state. report the size of the object, not of the
		// pointer, same as the html path below.
		mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
		delete (gr);
		return status;
	}

	int32_t ulen = 0;
	const char *url = gr->m_urlsBuf;
	if ( url ) ulen = gbstrlen (url);

	// re-null it out if just http://
	// the scheme check looks at the same buffer ulen was measured on
	bool printUrl = true;
	if ( ulen == 0 ) printUrl = false;
	if ( ! url     ) printUrl = false;
	if ( ulen==7 && printUrl && !strncasecmp(url,"http://",7))
		printUrl = false;
	if ( ulen==8 && printUrl && !strncasecmp(url,"https://",8))
		printUrl = false;

	// page is not more than 32k
	char buf[1024*32+MAX_URL_LEN*2];
	SafeBuf sb(buf, sizeof(buf));

	g_pages.printAdminTop ( &sb , sock , &gr->m_hr );

	// if there was an error let them know
	SafeBuf mbuf;

	if ( g_errno ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno);
		mbuf.safePrintf("</font></center>");
	} else if ( printUrl ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("<b><u>");
		mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
		mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>");
		mbuf.safePrintf("</font></center>");
	}

	if ( mbuf.length() ) {
		sb.safeStrcpy( mbuf.getBufStart() );
	}

	g_parms.printParmTable ( &sb , sock , &gr->m_hr );

	// print the final tail
	g_pages.printTail ( &sb, true ); // admin?

	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// nuke state
	mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
	delete (gr);

	return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime
}

Example #6

Show file

File: PageCatdb.cpp Project: BILObilo/open-source-search-engine

// . build and send the Catdb admin page for the request held in "state"
// . "state" is a StateCatdb*; it is deleted here before the page is sent
// . if g_errno is set on entry the error is logged, the lookup results are
//   suppressed (m_catLookup is cleared) and g_errno is reset
// . returns whatever g_httpServer.sendDynamicPage() returns
bool sendReply ( void *state ) {
	StateCatdb *st = (StateCatdb*)state;
	// check for error
	if (g_errno) {
		if (st->m_catLookup)
			log("PageCatdb: Msg8b had error getting Site Rec: %s",
			    mstrerror(g_errno));
		else
			log("PageCatdb: Msg2a had error generating Catdb: %s",
			    mstrerror(g_errno));
		// do not try to print lookup results we do not have
		st->m_catLookup = false;
		g_errno = 0;
	}
	// used below to report how long the lookup/generation took
	long long endTime = gettimeofdayInMilliseconds();
	// page buffer
	SafeBuf sb;
	sb.reserve(64*1024);
	// . print standard header
	// . do not print big links if only an assassin, just print host ids
	g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );

	sb.safePrintf(
		      "<style>"
		      ".poo { background-color:#%s;}\n"
		      "</style>\n" ,
		      LIGHT_BLUE );


	sb.safePrintf ( "<table %s>"
			"<tr><td colspan=2>"
			"<center><font size=+1><b>Catdb</b></font></center>"
			"</td></tr>", TABLE_STYLE );

	// instructions
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td colspan=3>"
		      "<font size=-2>"
		      "<center>"
		      "Don't just start using this, you need to follow the "
		      "instructions in the <i>admin guide</i> for adding "
		      "DMOZ support."
		      "</center>"
		      "</font>"
		      "</td>"
		      "</tr>"
		      ,DARK_BLUE
		      );

	// print the generate Catdb link
	sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
			"Update Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	sb.safePrintf ( "<tr class=poo>"
			"<td>Generate New Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
			"Generate Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	if (st->m_genCatdb)
		sb.safePrintf ( "<tr class=poo>"
				"<td> Catdb Generation took %lli ms."
				"</td></tr>",
				endTime - st->m_startTime );
	// print Url Category Lookup
	sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>"
			"<td><input type=text name=caturl size=80"
			" value=\"");
	// NOTE(review): the url is copied into the page verbatim (not html
	// encoded) -- confirm whether it can contain untrusted characters
	if (st->m_catLookup) {
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
	}
	sb.safePrintf("\"></center></td></tr>" );
	// print Url Info if Lookup was done
	if (st->m_catLookup) {
		sb.safePrintf("<tr><td>");
		// print the url
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
		sb.safePrintf(" (%lli ms)</td><td>",
				endTime - st->m_startTime );
		// print each category id and path
		for (long i = 0; i < st->m_catRec.m_numCatids; i++) {
			sb.safePrintf("<b>[%li] ",
					st->m_catRec.m_catids[i]);
			g_categories->printPathFromId(&sb,
					st->m_catRec.m_catids[i]);
			sb.safePrintf("</b><br>");
			// lookup title and summary
			char  title[1024];
			long  titleLen = 0;
			char  summ[4096];
			long  summLen = 0;
			char  anchor[256];
			unsigned char anchorLen = 0;
			// . each max length is the buffer size minus one so
			//   the '\0' stored below always lands inside the
			//   buffer
			// . the summary limit used to be 4098, which is past
			//   the end of the 4096 byte "summ" buffer
			g_categories->getTitleAndSummary(
					st->m_url.getUrl(),
					st->m_url.getUrlLen(),
					st->m_catRec.m_catids[i],
					title,
					&titleLen,
					1023,
					summ,
					&summLen,
					4095,
					anchor,
					&anchorLen,
					255 );
			title[titleLen] = '\0';
			summ[summLen] = '\0';
			anchor[anchorLen] = '\0';
			// print title and summary
			sb.safePrintf("<b>Title:</b> %s<br>"
					"<b>Summary:</b> %s<br>",
					title, summ);
			if (anchorLen > 0)
				sb.safePrintf("<b>Anchor:</b> %s<br>",
						anchor);
			sb.safePrintf("<br>");
		}
		sb.safePrintf("<b>Filenum:</b> %li<br>",
				st->m_catRec.m_filenum);
		// print indirect catids
		if (st->m_catRec.m_numIndCatids > 0) {
			sb.safePrintf("<hr><b>Indirect Catids [%li]:"
					"</b><br>\n",
					st->m_catRec.m_numIndCatids );
			for (long i = 0;
				  i < st->m_catRec.m_numIndCatids; i++) {
				sb.safePrintf("%lu<br>",
					st->m_catRec.m_indCatids[i]);
			}
		}
		sb.safePrintf("</td></tr>");
	}
	// end it
	sb.safePrintf ( "</center></td></tr></table>" );
	// print submit button
	sb.safePrintf ( "<br><center>"
			"<input type=submit value=\"Submit\" border=0>"
			"</form></center>" );

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// extract the socket before the state that holds it is destroyed
	TcpSocket *s = st->m_socket;
	// clear the state
	mdelete ( st, sizeof(StateCatdb), "PageCatdb" );
	delete st;
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length());
}

Example #7

Show file

File: PageParser.cpp Project: BILObilo/open-source-search-engine

// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "st" may be NULL, in which case a State8 is allocated here and flagged
//   (m_freeIt) as owned by this request; a caller-supplied "st" is reused
// . "state"/"callback", "q", "termFreqs", "termFreqWeights" and "affWeights"
//   are only stored into the State8 here
// . the "docId" parameter is not read by this function; the docid comes
//   from the "docid" cgi parm instead
// . if the request carries a "uh64" parm it is treated as a simple ajax
//   request and only a minimal reply is sent
bool sendPageParser2 ( TcpSocket   *s ,
                       HttpRequest *r ,
                       State8      *st ,
                       long long    docId ,
                       Query       *q ,
                       // in query term space, not imap space
                       long long   *termFreqs       ,
                       // in imap space
                       float       *termFreqWeights ,
                       // in imap space
                       float       *affWeights      ,
                       void        *state ,
                       void       (* callback)(void *state) ) {

    //log("parser: read sock=%li",s->m_sd);

    // might a simple request to addsomething to validated.*.txt file
    // from XmlDoc::print() or XmlDoc::validateOutput()
    char *add = r->getString("add",NULL);
    //long long uh64 = r->getLongLong("uh64",0LL);
    char *uh64str = r->getString("uh64",NULL);
    //char *divTag = r->getString("div",NULL);
    if ( uh64str ) {
        // . convert add to number
        // . "add" defaults to NULL so it must be checked before it is
        //   dereferenced; a request with "uh64" but no "add" used to crash
        // . addNum is only consumed by the commented-out
        //   addCheckboxSpan() call below
        long addNum = 0;
        if ( add && to_lower_a(add[0])=='t' ) // "true" or "false"?
            addNum = 1;
        (void)addNum;
        // convert it. skip beginning "str" inserted to prevent
        // javascript from messing with the long long since it
        // was rounding it!
        //long long uh64 = atoll(uh64str);//+3);
        // urldecode that
        //long divTagLen = gbstrlen(divTag);
        //long newLen  = urlDecode ( divTag , divTag , divTagLen );
        // null term?
        //divTag[newLen] = '\0';
        // do it. this is defined in XmlDoc.cpp
        //addCheckboxSpan ( uh64 , divTag , addNum );
        // make basic reply
        char *reply;
        reply = "HTTP/1.0 200 OK\r\n"
                "Connection: Close\r\n";
        // that is it! send a basic reply ok
        bool status = g_httpServer.sendDynamicPage( s ,
                      reply,
                      gbstrlen(reply),
                      -1, //cachtime
                      false ,//postreply?
                      NULL, //ctype
                      -1 , //httpstatus
                      NULL,//cookie
                      "utf-8");
        return status;
    }

    // make a state
    if (   st ) st->m_freeIt = false;
    if ( ! st ) {
        try {
            st = new (State8);
        }
        catch ( ... ) {
            g_errno = ENOMEM;
            // cast so the argument matches the %i conversion
            log("PageParser: new(%i): %s",
                (int)sizeof(State8),mstrerror(g_errno));
            return g_httpServer.sendErrorReply(s,500,
                                               mstrerror(g_errno));
        }
        mnew ( st , sizeof(State8) , "PageParser" );
        st->m_freeIt = true;
    }
    // msg3b uses this to get a score from the query
    st->m_state           = state;
    st->m_callback        = callback;
    st->m_q               = q;
    st->m_termFreqs       = termFreqs;
    st->m_termFreqWeights = termFreqWeights;
    st->m_affWeights      = affWeights;
    st->m_total           = (score_t)-1;
    st->m_indexCode       = 0;
    st->m_blocked         = false;
    st->m_didRootDom      = false;
    st->m_didRootWWW      = false;
    st->m_wasRootDom      = false;
    st->m_u               = NULL;
    st->m_recompute       = false;
    //st->m_url.reset();

    // do not allow more than one to be launched at a time if in
    // a quickpoll. will cause quickpoll in quickpoll.
    g_inPageParser = true;

    // password, too. truncated to 31 chars plus the terminating '\0'
    long pwdLen = 0;
    char *pwd = r->getString ( "pwd" , &pwdLen );
    if ( pwdLen > 31 ) pwdLen = 31;
    if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
    st->m_pwd[pwdLen]='\0';

    // save socket ptr
    st->m_s = s;
    st->m_r.copy ( r );
    // get the collection
    char *coll    = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
    if ( st->m_collLen > MAX_COLL_LEN )
        return sendErrorReply ( st , ENOBUFS );
    // . the default is NULL, so a request without a "c" parm used to
    //   hand a NULL source to strcpy()
    // . store an empty collection name in that case instead
    if ( coll ) strcpy ( st->m_coll , coll );
    else        st->m_coll[0] = '\0';

    // version to use, if -1 use latest
    st->m_titleRecVersion = r->getLong("version",-1);
    if ( st->m_titleRecVersion == -1 )
        st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
    // default to 0 if not provided
    st->m_hopCount = r->getLong("hc",0);
    //long  ulen    = 0;
    //char *u     = r->getString ( "u" , &ulen     , NULL /*default*/);
    long  old     = r->getLong   ( "old", 0 );
    // set query
    long qlen;
    char *qs = r->getString("q",&qlen,NULL);
    if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );
    // . set url in state class (may be NULL or have length 0)
    // . taken from our own copy of the request, st->m_r
    // . this has to happen BEFORE the docid decision below. it used to be
    //   done after it, when m_u was still the NULL assigned above, so the
    //   url never actually overrode the docid
    //if ( u ) st->m_url.set ( u , ulen );
    //st->m_urlLen = ulen;
    st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);
    // url will override docid if given
    if ( ! st->m_u || ! st->m_u[0] )
        st->m_docId = r->getLongLong ("docid",-1);
    else
        st->m_docId = -1;
    // should we recycle link info?
    st->m_recycle  = r->getLong("recycle",0);
    st->m_recycle2 = r->getLong("recycleimp",0);
    st->m_render   = r->getLong("render" ,0);
    // for quality computation... takes way longer cuz we have to
    // lookup the IP address of every outlink, so we can get its root
    // quality using Msg25 which needs to filter out voters from that IP
    // range.
    st->m_oips     = r->getLong("oips"    ,0);

    long  linkInfoLen  = 0;
    // default is NULL
    // NOTE(review): this copy is not length checked; the size of
    // m_linkInfoColl is not visible here -- confirm it cannot overflow
    char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
    if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
    else st->m_linkInfoColl[0] = '\0';

    // set the flag in our SafeBuf class so that Words.cpp knows to show
    // html or html source depending on this value
    st->m_xbuf.m_renderHtml = st->m_render;

    // should we use the old title rec?
    st->m_old    = old;
    // are we coming from a local machine?
    st->m_isLocal = r->isLocal();
    //no more setting the default root quality to 30, instead if we do not
    // know it setting it to -1
    st->m_rootQuality=-1;

    // header
    SafeBuf *xbuf = &st->m_xbuf;
    xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
                     "content=\"text/html; charset=utf-8\">\n");

    // print standard header
    g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r );


    // . values substituted into the input form below
    // . each checkbox string is either "" or " checked"
    char *dd     = "";
    char *rr     = "";
    char *rr2    = "";
    char *render = "";
    char *oips   = "";
    char *us     = "";
    if ( st->m_u && st->m_u[0] ) us = st->m_u;
    //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn );
    if ( st->m_old ) dd = " checked";
    if ( st->m_recycle            ) rr     = " checked";
    if ( st->m_recycle2           ) rr2    = " checked";
    if ( st->m_render             ) render = " checked";
    if ( st->m_oips               ) oips   = " checked";

    xbuf->safePrintf(
        "<style>"
        ".poo { background-color:#%s;}\n"
        "</style>\n" ,
        LIGHT_BLUE );


    long clen;
    char *contentParm = r->getString("content",&clen,"");

    // . print the input form
    // . the %s conversions are, in order: TABLE_STYLE, url, "use cached"
    //   checkbox, "recycle link info" checkbox, "render html" checkbox and
    //   the content textarea
    // . NOTE(review): "us" and "contentParm" come straight from the
    //   request and are printed without html encoding -- confirm
    xbuf->safePrintf (
        "<style>\n"
        "h2{font-size: 12px; color: #666666;}\n"

        ".gbtag { border: 1px solid gray;"
        "background: #ffffef;display:inline;}\n"
        ".gbcomment { border: 1px solid gray;"
        "color: #888888; font-style:italic; "
        "background: #ffffef;display:inline;}\n"

        ".token { border: 1px solid gray;"
        "background: #f0ffff;display:inline;}\n"
        ".spam { border: 1px solid gray;"
        "background: #af0000;"
        "color: #ffffa0;}"
        ".hs {color: #009900;}"
        "</style>\n"
        "<center>"

        "<table %s>"

        "<tr><td colspan=5><center><b>"
        "Parser"
        "</b></center></td></tr>\n"

        "<tr class=poo>"
        "<td>"
        "<b>url</b>"
        "<br><font size=-2>"
        "Type in <b>FULL</b> url to parse."
        "</font>"
        "</td>"

        "</td>"
        "<td>"
        "<input type=text name=u value=\"%s\" size=\"40\">\n"
        "</td>"
        "</tr>"


        /*
        "<tr class=poo>"
        "<td>"
        "Parser version to use: "
        "</td>"
        "<td>"
        "<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
        "</td>"
        "<td>"
        "(-1 means to use latest title rec version)<br>"
        "</td>"
        "</tr>"
         */

        /*
        "<tr class=poo>"
        "<td>"
        "Hop count to use: "
        "</td>"
        "<td>"
        "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> "
        "</td>"
        "<td>"
        "(-1 is unknown. For root urls hopcount is always 0)<br>"
        "</td>"
        "</tr>"
         */

        "<tr class=poo>"
        "<td>"
        "<b>use cached</b>"

        "<br><font size=-2>"
        "Load page from cache (titledb)?"
        "</font>"

        "</td>"
        "<td>"
        "<input type=checkbox name=old value=1%s> "
        "</td>"
        "</tr>"

        /*
        "<tr class=poo>"
        "<td>"
        "Reparse root:"
        "</td>"
        "<td>"
        "<input type=checkbox name=artr value=1%s> "
        "</td>"
        "<td>"
        "Apply selected ruleset to root to update quality"
        "</td>"
        "</tr>"
         */

        "<tr class=poo>"
        "<td>"
        "<b>recycle link info</b>"

        "<br><font size=-2>"
        "Recycle the link info from the title rec"
        "Load page from cache (titledb)?"
        "</font>"

        "</td>"
        "<td>"
        "<input type=checkbox name=recycle value=1%s> "
        "</td>"
        "</tr>"

        /*
        "<tr class=poo>"
        "<td>"
        "Recycle Link Info Imported:"
        "</td>"
        "<td>"
        "<input type=checkbox name=recycleimp value=1%s> "
        "</td>"
        "<td>"
        "Recycle the link info imported from other coll"
        "</td>"
        "</tr>"
         */

        "<tr class=poo>"
        "<td>"
        "<b>render html</b>"

        "<br><font size=-2>"
        "Render document content as HTML"
        "</font>"

        "</td>"
        "<td>"
        "<input type=checkbox name=render value=1%s> "
        "</td>"
        "</tr>"

        /*
        "<tr class=poo>"
        "<td>"
        "Lookup outlinks' ruleset, ips, quality:"
        "</td>"
        "<td>"
        "<input type=checkbox name=oips value=1%s> "
        "</td>"
        "<td>"
        "To compute quality lookup IP addresses of roots "
        "of outlinks."
        "</td>"
        "</tr>"

        "<tr class=poo>"
        "<td>"
        "LinkInfo Coll:"
        "</td>"
        "<td>"
        "<input type=text name=\"oli\" size=\"10\" value=\"\"> "
        "</td>"
        "<td>"
        "Leave empty usually. Uses this coll to lookup link info."
        "</td>"
        "</tr>"
         */

        "<tr class=poo>"
        "<td>"
        "<b>optional query</b>"

        "<br><font size=-2>"
        "Leave empty usually. For title generation only."
        "</font>"

        "</td>"
        "<td>"
        "<input type=text name=\"q\" size=\"20\" value=\"\"> "
        "</td>"
        "</tr>"



        "<tr class=poo>"
        "<td>"
        "<b>content below is xml</b>"
        "<br><font size=-2>"
        "Is the content below XML?"
        "</font>"
        "</td>"

        "<td>"
        "<input type=checkbox name=xml value=1> "

        "</td>"
        "</tr>"




        "<tr class=poo>"
        "<td><b>content</b>"
        "<br><font size=-2>"
        "Use this content for the provided <i>url</i> "
        "rather than downloading it from the web."
        "</td>"

        "<td>"
        "<textarea rows=10 cols=80 name=content>"
        "%s"
        "</textarea>"
        "</td>"
        "</tr>"

        "</table>"
        "</center>"
        "</form>"
        "<br>",

        TABLE_STYLE,
        us ,
        //(long)st->m_hopCount,
        //rtu,
        dd,
        //artr ,
        rr,
        //rr2,
        render ,
        //oips ,
        contentParm );



    xbuf->safePrintf(
        "<center>"
        "<input type=submit value=Submit>"
        "</center>"
    );


    // just print the page if no url given
    if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );


    XmlDoc *xd = &st->m_xd;
    // set this up
    // NOTE(review): the url is copied into sreq.m_url without a length
    // check -- confirm it is bounded elsewhere
    SpiderRequest sreq;
    sreq.reset();
    strcpy(sreq.m_url,st->m_u);
    // fake a first ip from a hash of the url; never use 0 or -1
    long firstIp = hash32n(st->m_u);
    if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
    // parentdocid of 0
    sreq.setKey( firstIp, 0LL, false );
    sreq.m_isPageParser = 1;
    sreq.m_hopCount = st->m_hopCount;
    sreq.m_hopCountValid = 1;
    sreq.m_fakeFirstIp   = 1;
    sreq.m_firstIp = firstIp;
    Url nu;
    nu.set(sreq.m_url);
    sreq.m_domHash32 = nu.getDomainHash32();
    sreq.m_siteHash32 = nu.getHostHash32();

    // . get provided content if any
    // . will be NULL if none provided
    // . "content" may contain a MIME
    long  contentLen = 0;
    char *content = r->getString ( "content" , &contentLen , NULL );
    // is the "content" url-encoded? default is true.
    bool contentIsEncoded = true;
    // mark doesn't like to url-encode his content
    if ( ! content ) {
        content    = r->getUnencodedContent    ();
        contentLen = r->getUnencodedContentLen ();
        contentIsEncoded = false;
    }
    // ensure null
    if ( contentLen == 0 ) content = NULL;

    uint8_t contentType = CT_HTML;
    if ( r->getBool("xml",0) ) contentType = CT_XML;

    // if facebook, load xml content from title rec...
    bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
    if ( isFacebook && ! content ) {
        // . replace the url in the request with the probable docid
        // . named so it does not shadow the "docId" parameter
        long long probableDocId = g_titledb.getProbableDocId(st->m_u);
        sprintf(sreq.m_url ,"%llu", probableDocId );
        sreq.m_isPageReindex = true;
    }

    // hack: keep our own copy of the content in the state so it does not
    // point into the request
    if ( content ) {
        st->m_dbuf.purge();
        st->m_dbuf.safeStrcpy(content);
        //char *data = strstr(content,"\r\n\r\n");
        //long dataPos = 0;
        //if ( data ) dataPos = (data + 4) - content;
        //st->m_dbuf.convertJSONtoXML(0,dataPos);
        //st->m_dbuf.decodeJSON(0);
        content = st->m_dbuf.getBufStart();
    }

    // . use the enormous power of our new XmlDoc class
    // . this returns false if blocked
    if ( ! xd->set4 ( &sreq       ,
                      NULL        ,
                      st->m_coll  ,
                      &st->m_wbuf        ,
                      0 ,//PP_NICENESS ))
                      content ,
                      false, // deletefromindex
                      0, // forced ip
                      contentType ))
        // return error reply if g_errno is set
        return sendErrorReply ( st , g_errno );
    // make this our callback in case something blocks
    xd->setCallback ( st , processLoop );
    // . set xd from the old title rec if recycle is true
    // . can also use XmlDoc::m_loadFromOldTitleRec flag
    if ( st->m_recycle ) xd->m_recycleContent = true;

    return processLoop ( st );
}

Example #8

Show file

File: qa.cpp Project: firatkarakusoglu/open-source-search-engine

// . QA test for json indexing and querying on the 'qatest123' collection
// . written as a re-entrant state machine: each step is guarded by its own
//   slot in s_flags[] so that a step already issued is skipped when the
//   function is called again
// . returns false whenever a step reports false from getUrl() or after a
//   wait() has been scheduled; returns true once every step has run
// . every step MUST use a distinct s_flags[] slot, otherwise the later step
//   sharing the slot is silently skipped
bool qajson ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}


	// add the 50 urls
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;

		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=0"
			      "&urls="//www.walmart.com+ibm.com"
			      );
		sb.urlEncode ( s_ubuf4 );
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}


	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	//static bool s_k1 = false;
	if ( ! s_flags[5] ) {
		// wait 5 seconds, call sleep timer... then call qatest()
		//usleep(5000000); // 5 seconds
		wait(3.0);
		s_flags[5] = true;
		return false;
	}

	// fetch the spider status page. slot 15 belongs to this step only.
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	//static bool s_k2 = false;
	if ( ! s_flags[6] ) {
		// ensure spiders are done. 
		// "Nothing currently available to spider"
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			// not done yet: re-arm the wait and the status fetch
			s_flags[5] = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}

		

	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=type%3Ajson+meta.authors%3Appk",
				-1310551262 ) )
			return false;
	}

	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=100&"
				"q=type%3Ajson",
				-1310551262 ) )
			return false;
	}

	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfacetstr%3Ameta.authors",
				-1310551262 ) )
			return false;
	}

	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		// this has > 50 values for the facet field hash
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfacetstr%3Astrings.key",
				-1310551262 ) )
			return false;
	}


	// other query tests...
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=inurl2%3Aquirksmode.org%2Fm%2F",
				-1310551262 ) )
			return false;
	}

	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=site%3Aquirksmode.org",
				-1310551262 ) )
			return false;
	}
	

	// test gbfieldmatch:field:"quoted value" query to ensure it converts
	// the quoted value into the right int32
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfieldmatch%3Astrings.key"
				"%3Ainvestigate-tweet",
				-1310551262 ) )
			return false;
	}

	// . this step used to be guarded by s_flags[15], which the status
	//   page fetch above has always set by now, so it never ran
	// . slot 11 is not used by any other step in this function
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfieldmatch%3Astrings.key"
				"%3A\"Maemo+Browser\"",
				-1310551262 ) )
			return false;
	}

	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfieldmatch%3Astrings.key"
				"%3A\"Google+Wireless+Transcoder\"",
				-1310551262 ) )
			return false;
	}

	// this should have no results, not capitalized
	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfieldmatch%3Astrings.key%3A\"samsung\"",
				-1310551262 ) )
			return false;
	}

	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfieldmatch%3Astrings.key%3ASamsung",
				-1310551262 ) )
			return false;
	}

	// . this step used to reuse s_flags[18] from the step right above
	//   and so was always skipped
	// . slot 19 is not used by any other step in this function
	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
				"q=gbfieldmatch%3Astrings.key%3A\"Samsung\"",
				-1310551262 ) )
			return false;
	}



	//static bool s_fee2 = false;
	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA JSON TEST");
		return true;
	}

	return true;
}

Example #9

Show file

File: qa.cpp Project: firatkarakusoglu/open-source-search-engine

// . serve the QA admin page on "sock" for request "hr"
// . three modes:
//   1. "ajax" parm set: store the given crc ("crc") for the test url hash
//      ("uh") in s_ht and reply with just the url hash
//   2. "action" field present: record which tests were checked, reset the
//      QA state and start qatest(); returns false if qatest() returns false
//   3. otherwise: print the table of available tests
// . returns true in every case except the qatest() one noted above
bool sendPageQA ( TcpSocket *sock , HttpRequest *hr ) {
	char pbuf[32768];
	SafeBuf sb(pbuf, 32768);

	//char format = hr->getReplyFormat();

	// set this. also sets gr->m_hr
	GigablastRequest gr;
	// this will fill in GigablastRequest so all the parms we need are set
	g_parms.setGigablastRequest ( sock , hr , &gr );


	//
	// . handle a request to update the crc for this test
	// . test id identified by "ajaxUrlHash" which is the hash of the test's url
	//   and the test name, QATest::m_testName
	long ajax = hr->getLong("ajax",0);
	unsigned long ajaxUrlHash ;
	ajaxUrlHash = (unsigned long long)hr->getLongLong("uh",0LL);
	unsigned long ajaxCrc ;
	ajaxCrc = (unsigned long long)hr->getLongLong("crc",0LL);

	if ( ajax ) {
		// make sure it is initialized
		if ( s_ht.m_ks ) {
			// overwrite current value with provided one because 
			// the user click on an override checkbox to update 
			// the crc
			s_ht.addKey ( &ajaxUrlHash , &ajaxCrc );
			saveHashTable();
		}
		// send back the urlhash so the checkbox can turn the
		// bg color of the "diff" gray
		SafeBuf sb3;
		sb3.safePrintf("%lu",ajaxUrlHash);
		g_httpServer.sendDynamicPage(sock,
					     sb3.getBufStart(),
					     sb3.length(),
					     -1/*cachetime*/);
		return true;
	}
		

	// if they hit the submit button, begin the tests
	long submit = hr->hasField("action");

	long n = sizeof(s_qatests)/sizeof(QATest);


	// only one QA run at a time
	if ( submit && g_qaInProgress ) {
		g_errno = EINPROGRESS;
		g_httpServer.sendErrorReply(sock,g_errno,mstrerror(g_errno));
		return true;
	}

	// set m_doTest from the "test<i>" checkboxes
	for ( long i = 0 ; submit && i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		// . big enough for "test" plus any long value
		// . snprintf so the name can never overrun the buffer (the
		//   old 10 byte buffer only had room for 5 digits)
		char tmp[32];
		snprintf(tmp,sizeof(tmp),"test%li",i);
		qt->m_doTest = hr->getLong(tmp,0);
	}

	if ( submit ) {
		// reset all the static thingies
		resetFlags();
		// save socket
		g_qaSock = sock;
		g_numErrors = 0;
		g_qaOutput.reset();
		g_qaOutput.safePrintf("<html><body>"
				      "<title>QA Test Results</title>\n");

		// . '#e0e0e0' is the gray the comments below ask for
		// . it used to be '0xe0e0e0', which is not a valid css color
		//   value, so the background was never changed
		g_qaOutput.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n"
				      // update s_ht with the new crc for this test
				      "function submitchanges(urlhash,crc) "
				      "{\n "
				      "var client=new XMLHttpRequest();\n"
				      "client.onreadystatechange=gotsubmitreplyhandler;"
				      "var "
				      "u='/admin/qa?ajax=1&uh='+urlhash+'&crc='+crc;\n"
				      "client.open('GET',u);\n"
				      "client.send();\n"
				      
				      // use that to fix background to gray
				      "var w=document.getElementById(urlhash);\n"
				      // set background color
				      "w.style.backgroundColor = '#e0e0e0';\n"

				      // gear spinning after checkbox
				      "}\n\n "

				      // call this when we got the reply that the 
				      // checkbox went through
				      "function gotsubmitreplyhandler() {\n"
				      // return if reply is not fully ready
				      "if(this.readyState != 4 )return;\n"
				      // if error or empty reply then do nothing
				      "if(!this.responseText)return;\n"
				      // response text is the urlhash32, unsigned long
				      "var id=this.responseText;\n"
				      // use that to fix background to gray
				      "var w=document.getElementById(id);\n"
				      // set background color
				      "w.style.backgroundColor = '#e0e0e0';\n"
				      "}\n\n"

				      "</SCRIPT> ");
		// and run the qa test loop
		if ( ! qatest( ) ) return false;
		// what happened?
		log("qa: qatest completed without blocking");
	}

	// show tests, all checked by default, to perform

	g_pages.printAdminTop ( &sb , sock , hr );

	// javascript helper that toggles every "test<i>" checkbox
	sb.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n"
		     "function checkAll(name, num)\n "
		      "{ "
		      "    for (var i = 0; i < num; i++) {\n"
		      "      var e = document.getElementById(name + i);\n"
		      //"alert(name+i);"
		      "      e.checked = !e.checked ;\n "
		      "  }\n"
		      "}\n\n "

		      "</SCRIPT> ");

	//sb.safePrintf("<form name=\"fo\">");

	sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
	sb.safePrintf("<tr class=hdrow><td colspan=2>"
		      "<center><b>QA Tests</b></center>"
		      "</td></tr>");

	// header row
	sb.safePrintf("<tr><td><b>Do Test?</b> <a style=cursor:hand;"
		      "cursor:pointer; "
		      "onclick=\"checkAll('test', %li);\">(toggle)</a>",n);
	sb.safePrintf("</td><td><b>Test Name</b></td></tr>\n");
	
	// . we keep the ptr to each test in an array
	// . print out each qa function, alternating the row color
	for ( long i = 0 ; i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		char *bg;
		if ( i % 2 == 0 ) bg = LIGHT_BLUE;
		else              bg = DARK_BLUE;
		sb.safePrintf("<tr bgcolor=#%s>"
			      "<td><input type=checkbox value=1 name=test%li "
			      "id=test%li></td>"
			      "<td>%s"
			      "<br>"
			      "<font color=gray size=-1>%s</font>"
			      "</td>"
			      "</tr>\n"
			      , bg
			      , i
			      , i
			      , qt->m_testName
			      , qt->m_testDesc
			      );
	}

	sb.safePrintf("</table>\n<br>\n");
	//	      "</form>\n");

	g_pages.printAdminBottom ( &sb , hr );


	g_httpServer.sendDynamicPage(sock,
				     sb.getBufStart(),
				     sb.length(),
				     -1/*cachetime*/);

	return true;
}

Example #10

Show file

File: qa.cpp Project: firatkarakusoglu/open-source-search-engine

// . digest the http reply for the url the current qa test just requested
// . masks out the fields that change from run to run (timestamps, counters,
//   etc.), checksums what is left with qa_hash32() and compares that to the
//   crc remembered in s_ht under the hash of the url + test name
// . if s_checkCRC is false the caller does not care about the reply and we
//   return right after computing the checksum
// . otherwise the unmodified reply is saved as <g_hostdb.m_dir>qa/content.<crc>
//   and a "passed test", "first time testing" or "FAILED TEST" (with the
//   output of diff) html snippet is appended to g_qaOutput
// . "reply" may be NULL, in which case the content is treated as empty
// . "replyLen" is the length of "reply" in bytes
void processReply ( char *reply , long replyLen ) {

	// store our current reply. never hand a NULL ptr to safeMemcpy()
	SafeBuf fb2;
	if ( reply ) fb2.safeMemcpy(reply,replyLen );
	fb2.nullTerm();

	// log that we got the reply. a NULL ptr for %s is undefined behavior
	// so substitute a printable placeholder
	log("qa: got reply(len=%li)(errno=%s)=%s",
	    replyLen,mstrerror(g_errno),reply ? reply : "(null)");

	char *content = NULL;
	long  contentLen = 0;

	// get mime
	if ( reply ) {
		HttpMime mime;
		mime.set ( reply, replyLen , NULL );
		// only hash content since mime has a timestamp in it
		content = mime.getContent();
		contentLen = mime.getContentLen();
		// sanity check: content must be \0 terminated, core if not
		if ( content && contentLen>0 && content[contentLen] ) { 
			char *xx=NULL;*xx=0; }
	}

	// use a writable empty buffer rather than a string literal because
	// "content" is a non-const ptr that gets passed to markOut() below
	static char s_emptyContent[1] = { '\0' };
	if ( ! content ) {
		content = s_emptyContent;
		contentLen = 0;
	}

	// let the qa test functions inspect the content of the last reply
	s_content = content;

	// take out <responseTimeMS>
	markOut ( content , "<currentTimeUTC>");
	markOut ( content , "<responseTimeMS>");

	// until i figure this one out, take it out
	markOut ( content , "<docsInCollection>");

	// until i figure this one out, take it out
	markOut ( content , "<hits>");

	// for those links in the html pages
	markOut ( content, "rand64=");

	// for json
	markOut ( content , "\"currentTimeUTC\":" );
	markOut ( content , "\"responseTimeMS\":");
	markOut ( content , "\"docsInCollection\":");

	// for xml
	// NOTE(review): these three repeat the calls above. kept as-is since
	// markOut() is not visible here and may only handle one occurrence
	// per call -- confirm before removing
	markOut ( content , "<currentTimeUTC>" );
	markOut ( content , "<responseTimeMS>");
	markOut ( content , "<docsInCollection>");

	// indexed 1 day ago
	markOut ( content,"indexed:");
	// modified 1 day ago
	markOut ( content,"modified:");

	// s_gigabitCount... it is perpetually incrementing static counter
	// in PageResults.cpp
	markOut(content,"ccc(");
	markOut(content,"id=fd");
	markOut(content,"id=sd");

	// for some reason the term freq seems to change a little in
	// the scoring table
	markOut(content,"id=tf");

	// make checksum. we ignore back to back spaces so this
	// hash works for <docsInCollection>10 vs <docsInCollection>9
	long contentCRC = 0; 
	if ( content ) contentCRC = qa_hash32 ( content );

	// the crcs are printed with %lu below, so hand printf the unsigned
	// type it expects
	const unsigned long newCrc = (unsigned long)contentCRC;

	// note it
	log("qa: got contentCRC of %lu",newCrc);

	// this means caller does not care about the response
	if ( ! s_checkCRC ) return;

	//
	// otherwise compare the crc to the one we saw last time for this
	// url and test, and do a diff if it changed so we can see why
	//

	// hash url
	long urlHash32 = hash32n ( s_url.getUrl() );

	// combine test function too since two tests may use the same url
	long nameHash = hash32n ( s_qt->m_testName );

	// combine together
	urlHash32 = hash32h ( nameHash , urlHash32 );

	const unsigned long urlKey = (unsigned long)urlHash32;

	// one-time setup: init the url hash -> crc table, make sure the
	// qa/ dir exists and load the table saved by a previous run
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		// NOTE(review): table is set up with 4 byte keys and values
		// but is accessed through "long" ptrs below, which only
		// lines up when long is 32 bits -- confirm build target
		s_ht.set(4,4,1024,NULL,0,false,0,"qaht");
		char dir[1024];
		snprintf(dir,sizeof(dir),"%sqa",g_hostdb.m_dir);
		long status = ::mkdir ( dir ,
					S_IRUSR | S_IWUSR | S_IXUSR | 
					S_IRGRP | S_IWGRP | S_IXGRP | 
					S_IROTH | S_IXOTH );
		// it is fine if the dir is already there
	        if ( status == -1 && errno != EEXIST && errno )
			log("qa: Failed to make directory %s: %s.",
			    dir,mstrerror(errno));
		// try to load from disk
		SafeBuf fn;
		fn.safePrintf("%s/qa/",g_hostdb.m_dir);
		log("qa: loading crctable.dat");
		s_ht.load ( fn.getBufStart() , "crctable.dat" );
	}

	// save the unmodified reply under its crc so it can be linked to
	// and diffed. bound the write since m_dir is of unknown length.
	char fn2[1024];
	snprintf(fn2,sizeof(fn2),"%sqa/content.%lu",g_hostdb.m_dir,newCrc);
	fb2.save ( fn2 );

	// look up in hashtable to see what reply crc should be
	long *val = (long *)s_ht.getValue ( &urlHash32 );

	// just return if the same
	if ( val && contentCRC == *val ) {
		g_qaOutput.safePrintf("<b style=color:green;>"
				      "passed test</b><br>%s : "
				      "<a href=%s>%s</a> (urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>"
				      "%lu</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlKey,
				      newCrc,
				      newCrc);
		return;
	}

	// never seen this url/test before? then remember its crc
	if ( ! val ) {
		// add it so we know
		s_ht.addKey ( &urlHash32 , &contentCRC );
		g_qaOutput.safePrintf("<b style=color:blue;>"
				      "first time testing</b><br>%s : "
				      "<a href=%s>%s</a> "
				      "(urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>%lu"
				      "</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlKey,
				      newCrc,
				      newCrc);
		return;
	}

	const unsigned long oldCrc = (unsigned long)*val;

	log("qa: crc changed for url %s from %li to %li",
	    s_url.getUrl(),*val,contentCRC);

	// get response on file
	// NOTE(review): fb1 is not used after this; the load is kept only
	// to leave the original sequence of calls untouched
	SafeBuf fb1;
	char fn1[1024];
	snprintf(fn1,sizeof(fn1),"%sqa/content.%lu",g_hostdb.m_dir,oldCrc);
	fb1.load(fn1);
	fb1.nullTerm();

	// do the diff between the two replies so we can see what changed.
	// size the command for both filenames plus the fixed text.
	char cmd[ sizeof(fn1) + sizeof(fn2) + 64 ];
	snprintf(cmd,sizeof(cmd),"diff %s %s > /tmp/diffout",fn1,fn2);
	log("qa: %s\n",cmd);
	system(cmd);

	g_numErrors++;
	
	g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
			      "<a href=%s>%s</a> (urlhash=%lu)<br>"

			      "<input type=checkbox name=urlhash%lu value=1 "
			      // use ajax to update test crc. if you undo your
			      // check then it should put the old val back.
			      // when you first click the checkbox it should
			      // gray out the diff i guess.
			      "onclick=submitchanges(%lu,%lu);> "
			      "Accept changes"

			      "<br>"
			      "original on left, new on right. "
			      "oldcrc = <a href=/qa/content.%lu>%lu</a>"

			      " != <a href=/qa/content.%lu>%lu</a> = newcrc"
			      "<br>diff output follows:<br>"
			      "<pre id=%lu style=background-color:0xffffff;>",
			      s_qt->m_testName,
			      s_url.getUrl(),
			      s_url.getUrl(),
			      urlKey,

			      // input checkbox name field
			      urlKey,

			      // submitchanges() parms
			      urlKey, 
			      newCrc,

			      // original/old content.%lu
			      oldCrc,
			      oldCrc,

			      // new content.%lu
			      newCrc,
			      newCrc,

			      // for the pre tag id:
			      urlKey);


	// store the diff in the output. if the diff file could not be
	// loaded there is no buffer to encode, so print nothing for it.
	SafeBuf sb;
	sb.load("/tmp/diffout");
	sb.nullTerm();
	if ( sb.getBufStart() )
		g_qaOutput.htmlEncode ( sb.getBufStart() );

	g_qaOutput.safePrintf("</pre><br><hr>");
}

Example #11

Show file

File: qa.cpp Project: firatkarakusoglu/open-source-search-engine

bool qaspider2 ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	//static bool s_z1 = false;
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

	       "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      //	       "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      // sitepages is a little fuzzy so take it
			      // out for this test and use hopcount!!!
			      //"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

	       "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"

		);
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to 
	// a few sites
	// these should auto seed so no need to use addurl
	//static bool s_z2 = false;
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode(//walmart has too many pages at depth 1, so remove it
			     //"tag:shallow www.walmart.com\r\n"
			     "tag:shallow http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}
		

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	//static bool s_k1 = false;
	if ( ! s_flags[4] ) {
		//usleep(5000000); // 5 seconds
		s_flags[4] = true;
		wait(3.0);
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	//static bool s_k2 = false;
	if ( ! s_flags[5] ) {
		// ensure spiders are done. 
		// "Nothing currently available to spider"
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[4] = false;
			s_flags[14] = false;
			goto checkagain;
		}
		s_flags[5] = true;
	}




	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1310551262 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	//static bool s_t0 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=500&"
				"q=gbhopcount%3A0",
				999 ) )
			return false;
	}
	
	// check facet sections query for walmart
	//static bool s_y5 = false;
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=0&"
				"q=gbfacetstr%3Agbxpathsitehash3311332088",
				999 ) )
			return false;
	}

	// wait for some reason
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		wait(1.5);
		return false;
	}



	//static bool s_y6 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash3311332088&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	//static bool s_y7 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	//static bool s_y8 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}


	// delete the collection
	//static bool s_fee = false;
	// if ( ! s_flags[12] ) {
	// 	s_flags[12] = true;
	// 	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	// 		return false;
	// }

	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER2 TEST");
		return true;
	}

	return true;
}

Example #12

Show file

File: HttpRequest.cpp Project: privacore/open-source-search-engine

// . parse an incoming request
// . return false and set g_errno on error
// . CAUTION: we destroy "req" by replacing it's last char with a \0
// . last char must be \n or \r for it to be a proper request anyway
bool HttpRequest::set ( char *origReq , int32_t origReqLen , TcpSocket *sock ) {
	// reset number of cgi field terms
	reset();

	 if ( ! m_reqBuf.reserve ( origReqLen + 1 ) ) {
		 log("http: failed to copy request: %s",mstrerror(g_errno));
		 return false;
	 }

	 // copy it to avoid mangling it
	 m_reqBuf.safeMemcpy ( origReq , origReqLen );
	 // NULL term
	 m_reqBuf.pushChar('\0');

	 m_reqBufValid = true;

	 // and point to that
	 char *req    = m_reqBuf.getBufStart();

	 if( !req ) { 
		 log(LOG_ERROR, "http: req is NULL");
		 g_errno = EBADREQUEST; 
		 return false; 
	 }


	 int32_t  reqLen = m_reqBuf.length() - 1;

	 // save this
	 m_userIP = sock ? sock->m_ip : 0;
	 m_isSSL  = sock ? (sock->m_ssl!=NULL) : false;

	 // TcpServer should always give us a NULL terminated request
	 if ( req[reqLen] != '\0' ) { g_process.shutdownAbort(true); }
	 
	 // how long is the first line, the primary request
	 // int32_t i;
	 // for ( i = 0 ; i<reqLen && i<MAX_REQ_LEN && 
	 //	       req[i]!='\n' && req[i]!='\r'; i++);
	 // . now fill up m_buf, used to log the request
	 // . make sure the url was encoded correctly
	 // . we don't want assholes encoding every char so we can't see what
	 //   url they are submitting to be spidered/indexed
	 // . also, don't de-code encoded ' ' '+' '?' '=' '&' because that would
	 //   change the meaning of the url
	 // . and finally, non-ascii chars that don't display correctly
	 // . this should NULL terminate m_buf, too
	 // . turn this off for now, just try to log a different way
	 // m_bufLen = urlNormCode ( m_buf , MAX_REQ_LEN - 1 , req , i );
	 // ensure it's big enough to be a valid request
	 if ( reqLen < 5 ) { 
		 log(LOG_WARN, "http: got reqlen %" PRId32"<5 = %s",reqLen,req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }

	 int32_t cmdLen = 0;

	 // or if first line too long
	 //if ( i >= 1024 )  { g_errno = EBADREQUEST; return false; }
	 // get the type, must be GET or HEAD
	 if      ( strncmp ( req , "GET "  , 4 ) == 0 ) {
		 m_requestType = RT_GET;
		 cmdLen = 3;
	 }
	 // these means a compressed reply was requested. use by query
	 // compression proxies.
	 else if ( strncmp ( req , "ZET "  , 4 ) == 0 ) {
		 m_requestType = RT_GET;
		 cmdLen = 3;
	 }
	 else if ( strncmp ( req , "HEAD " , 5 ) == 0 ) {
		 m_requestType = RT_HEAD;
		 cmdLen = 4;
	 }
	 else if ( strncmp ( req , "POST " , 5 ) == 0 )  {
		 m_requestType = RT_POST;
		 cmdLen = 4;
	 }
	 else if ( strncmp ( req , "CONNECT " , 8 ) == 0 ) {
		 // take this out until it stops losing descriptors and works
		 //m_requestType = RT_CONNECT;
		 //cmdLen = 7;
		 // we no longer insert section info. emmanuel gets section
		 // info when injecting a doc now i think in PageInject.cpp.
		 // we do not proxy https requests because we can't
		 // decrypt the page contents to cache them or to insert
		 // the sectiondb voting markup, so it's kinda pointless...
		 // and i'm not aiming to be a full-fledge squid proxy.
		 log("http: CONNECT request not supported because we "
		   "can't insert section markup and we can't cache: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 else { 
		 log("http: got bad request cmd: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . NULL terminate the request (a destructive operation!)
	 // . this removes the last \n in the trailing \r\n 
	 // . shit, but it f***s up POST requests
	 if ( m_requestType != RT_POST ) { 
		 req [ reqLen - 1 ] = '\0'; 
		 reqLen--; 
	 }

	 // POST requests can be absolutely huge if you are injecting a 100MB
	 // file, so limit our strstrs to the end of the mime
	 char *d = NULL;
	 char  dc;
	 // check for body if it was a POST request
	 if ( m_requestType == RT_POST ) {
		 d = strstr ( req , "\r\n\r\n" );
		 if ( d ) { dc = *d; *d = '\0'; }
		 else log("http: Got POST request without \\r\\n\\r\\n.");
	 }

	 // is it a proxy request?
	 m_isSquidProxyRequest = false;
	 if ( strncmp ( req + cmdLen + 1, "http://" ,7) == 0 ||
	      strncmp ( req + cmdLen + 1, "https://",8) == 0 ) {
		 m_isSquidProxyRequest = true;
		 // set url parms for it
		 m_squidProxiedUrl = req + cmdLen + 1;
		 char *p = m_squidProxiedUrl + 7;
		 if ( *p == '/' ) p++; // https:// ?
		 // stop at whitespace or \0
		 for ( ; *p && ! is_wspace_a(*p) ; p++ );
		 // that's the length of it
		 m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	 }
	 else if ( m_requestType == RT_CONNECT ) {
		 m_isSquidProxyRequest = true;
		 // set url parms for it
		 m_squidProxiedUrl = req + cmdLen + 1;
		 // usually its like CONNECT diffbot.com:443
		 char *p = m_squidProxiedUrl;
		 // stop at whitespace or \0
		 for ( ; *p && ! is_wspace_a(*p) ; p++ );
		 // that's the length of it
		 m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	 }

	 // check authentication
	 char *auth = NULL;
	 if ( m_isSquidProxyRequest && req )
		 auth = strstr(req,"Proxy-authorization: Basic ");

	 //if ( m_isSquidProxyRequest && ! auth ) {
	 //	 log("http: no auth in proxy request %s",req);
	 //	 g_errno = EBADREQUEST; 
	 //	 return false; 
	 //}

	 SafeBuf tmp;
	 if ( auth ) {
		 // find end of it
		 char *p = auth;
		 for ( ; *p && *p != '\r' && *p != '\n' ; p++ );
		 tmp.base64Decode ( auth , p - auth );
	 }

	 // assume incorrect username/password
	 bool matched = false;
	 if ( m_isSquidProxyRequest ) {
		 // now try to match in g_conf.m_proxyAuth safebuf of
		 // username:password space-separated list
		 char *p = g_conf.m_proxyAuth.getBufStart();
		 // loop over those
		 for ( ; p && *p ; ) {
			 // skip initial white space
			 for ( ; *p && is_wspace_a(*p); p++ );
			 // skip to end of username:password thing
			 char *end = p;
			 for ( ; *end && !is_wspace_a(*end); end++);
			 // save
			 char *start = p;
			 // advance
			 p = end;
			 // this is always a match
			 if ( end-start == 3 && strncmp(start,"*:*",3) == 0 ) {
				 matched = true;
				 break;
			 }
			 // compare now
			 if ( tmp.length() != end-start ) 
				 continue;
			 if ( strncmp(tmp.getBufStart(),start,end-start) != 0 )
				 continue;
			 // we got a match
			 matched = true;
			 break;
		 }
	 }

	 // incorrect username:passwrod?
	 if ( m_isSquidProxyRequest && ! matched ) {
		 log("http: bad username:password in proxy request %s",req);
		 g_errno = EPERMDENIED;
		 return false; 
	 }

	 // if proxy request to download a url through us, we are done
	 if ( m_isSquidProxyRequest ) return true;

	 bool multipart = false;
	 if ( m_requestType == 2 ) { // is POST?
		 char *cd ;
		 cd = gb_strcasestr(req,"Content-Type: multipart/form-data");
		 if ( cd ) multipart = true;
	 }

	 // . point to the file path 
	 // . skip over the "GET "
	 int32_t filenameStart = 4 ;
	 // skip over extra char if it's a "HEAD " request
	 if ( m_requestType == RT_HEAD || m_requestType == RT_POST ) 
		 filenameStart++;

	 // are we a redirect?
	 int32_t i = filenameStart;
	 m_redirLen = 0;
	 if ( strncmp ( &req[i] , "/?redir=" , 8 ) == 0 ) {
		 for ( int32_t k = i+8; k<reqLen && m_redirLen<126 ; k++) {
			 if ( req[k] == '\r' ) break;
			 if ( req[k] == '\n' ) break;
			 if ( req[k] == '\t' ) break;
			 if ( req[k] ==  ' ' ) break;
			 m_redir[m_redirLen++] = req[k];
		 }
	 }
	 m_redir[m_redirLen] = '\0';

	 // find a \n space \r or ? that delimits the filename
	 for ( i = filenameStart ; i < reqLen ; i++ ) {
		 if ( is_wspace_a ( req [ i ] ) ) break;
		 if ( req [ i ] == '?' ) break;
	 }

	 // now calc the filename length
	 m_filenameLen = i - filenameStart;
	 // return false and set g_errno if it's 0
	 if ( m_filenameLen <= 0  ) { 
		 log("http: got filenameLen<=0: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . bitch if too big
	 // . leave room for strcatting "index.html" below
	 if ( m_filenameLen >= MAX_HTTP_FILENAME_LEN - 10 ) { 
		 log("http: got filenameLen>=max");
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . decode the filename into m_filename and reassign it's length
	 // . decode %2F to / , etc...
	 m_filenameLen = urlDecode(m_filename,req+filenameStart,m_filenameLen);
	 // NULL terminate m_filename
	 m_filename [ m_filenameLen ] = '\0';
	 // does it have a file extension AFTER the last / in the filename?
	 bool hasExtension = false;
	 for ( int32_t j = m_filenameLen-1 ; j >= 0 ; j-- ) {
		 if ( m_filename[j] == '.' ) { hasExtension = true; break; }
		 if ( m_filename[j] == '/' ) break;
	 }
	 // if it has no file extension append a /index.html
	 if ( ! hasExtension && m_filename [ m_filenameLen - 1 ] == '/' ) {
		 strcat ( m_filename , "index.html" );
		 m_filenameLen = strlen ( m_filename );
	 }


	 // . uses the TcpSocket::m_readBuf
	 // . if *p was ? then keep going
	 m_origUrlRequest = origReq + filenameStart;
	 char *p = origReq + m_filenameLen;
	 for ( ; *p && ! is_wspace_a(*p) ; p++ );
	 m_origUrlRequestLen = p - m_origUrlRequest;

	 // set file offset/size defaults
	 m_fileOffset = 0;
	 // -1 means ALL the file from m_fileOffset onwards
	 m_fileSize   = -1;  
	 // "e" points to where the range actually starts, if any
	 //char *e;
	 // . TODO: speed up by doing one strstr for Range: and maybe range:
	 // . do they have a Range: 0-100\n in the mime denoting a partial get?
	 //char *s = strstr ( req ,"Range:bytes=" );
	 //e = s + 12;
	 // try alternate formats
	 //if ( ! s ) { s = strstr ( req ,"Range: bytes=" ); e = s + 13; }
	 //if ( ! s ) { s = strstr ( req ,"Range: "       ); e = s +  7; }
	 // parse out the range if we got one
	 //if ( s ) {
	 //	int32_t x = 0;
	 //	sscanf ( e ,"%" PRId32"-%" PRId32 , &m_fileOffset , &x );
	 //	// get all file if range's 2nd number is non-existant
	 //	if ( x == 0 ) m_fileSize = -1;
	 //	else          m_fileSize = x - m_fileOffset;
	 //	// ensure legitimacy
	 //	if ( m_fileOffset < 0 ) m_fileOffset = 0;
	 //}
	 // reset our hostname
	 m_hostLen = 0;
	 // assume request is NOT from local network
	 //m_isMasterAdmin = false;
	 m_isLocal = false;
	 // get the virtual hostname they want to use
	 char *s = strstr ( req ,"Host:" );
	 // try alternate formats
	 if ( ! s ) s = strstr ( req , "host:" ); 
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // parse out the host if we got one
	 if ( s ) {
		 // skip field name, host:
		 s += 5;
		 // skip e to beginning of the host name after "host:"
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the host name
		 char *end = s;
		 while ( *end && !is_wspace_a(*end) ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get host len
		 m_hostLen = end - s;
		 // truncate if too big
		 if ( m_hostLen >= 255 ) m_hostLen = 254;
		 // copy into hostname
		 gbmemcpy ( m_host , s , m_hostLen );
	 }
	 // NULL terminate it
	 m_host [ m_hostLen ] = '\0';

	 // get Referer: field
	 s = strstr ( req ,"Referer:" );
	 // find another
	 if ( ! s ) s = strstr ( req ,"referer:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // assume no referer
	 m_refLen = 0;
	 // parse out the referer if we got one
	 if ( s ) {
		 // skip field name, referer:
		 s += 8;
		 // skip e to beginning of the host name after ':'
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the host name
		 char *end = s;
		 while ( *end && !is_wspace_a(*end) ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get len
		 m_refLen = end - s;
		 // truncate if too big
		 if ( m_refLen >= 255 ) m_refLen = 254;
		 // copy into m_ref
		 gbmemcpy ( m_ref , s , m_refLen );
	 }
	 // NULL terminate it
	 m_ref [ m_refLen ] = '\0';

	 // get User-Agent: field
	 s = strstr ( req ,"User-Agent:" );
	 // find another
	 if ( ! s ) s = strstr ( req ,"user-agent:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // assume empty
	 int32_t len = 0;
	 // parse out the referer if we got one
	 if ( s ) {
		 // skip field name, referer:
		 s += 11;
		 // skip e to beginning of the host name after ':'
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the agent name
		 char *end = s;
		 while ( *end && *end!='\n' && *end!='\r' ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get agent len
		 len = end - s;
		 // truncate if too big
		 if ( len > 127 ) len = 127;
		 // copy into m_userAgent
		 gbmemcpy ( m_userAgent , s , len );
	 }
	 // NULL terminate it
	 m_userAgent [ len ] = '\0';

	 // get Cookie: field
	 s = strstr ( req, "Cookie:" );
	 // find another
	 if ( !s ) s = strstr ( req, "cookie:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) != '\n' ) s = NULL;
	 // assume empty
	 // m_cookieBufLen = 0;
	 m_cookiePtr = s;
	 // parse out the cookie if we got one
	 if ( s ) {
		 // skip field name, Cookie:
		 s += 7;
		 // skip s to beginning of cookie after ':'
		 while ( *s == ' ' || *s == '\t' ) s++;
		 // find end of the cookie
		 char *end = s;
		 while ( *end && *end != '\n' && *end != '\r' ) end++;
		 // save length
		 m_cookieLen = end - m_cookiePtr;
		 // get cookie len
		 //m_cookieBufLen = end - s;
		 // trunc if too big
		 //if (m_cookieBufLen > 1023) m_cookieBufLen = 1023;
		 // copy into m_cookieBuf
		 //gbmemcpy(m_cookieBuf, s, m_cookieBufLen);
	 }
	 // NULL terminate it
	 if ( m_cookiePtr ) m_cookiePtr[m_cookieLen] = '\0';
	 //m_cookieBuf[m_cookieBufLen] = '\0';
	 // convert every '&' in cookie to a \0 for parsing the fields
	 // for ( int32_t j = 0 ; j < m_cookieBufLen ; j++ ) 
	 //	 if ( m_cookieBuf[j] == '&' ) m_cookieBuf[j] = '\0';

	 // mark it as cgi if it has a ?
	 bool isCgi = ( req [ i ] == '?' ) ;
	 // reset m_filename length to exclude the ?* stuff
	 if ( isCgi ) {
		 // skip over the '?'
		 i++;
		 // find a space the delmits end of cgi
		 int32_t j;
		 for ( j = i; j < reqLen; j++) if (is_wspace_a(req[j])) break;
		 // now add it
		 if ( ! addCgi ( &req[i] , j-i ) ) return false;
		 // update i
		 i = j;
	 }

	 // . set path ptrs
	 // . the whole /cgi/14.cgi?coll=xxx&..... thang
	 m_path = req + filenameStart;
	 m_plen = i - filenameStart;
	 // we're local if hostname is 192.168.[0|1].y
	 //if ( strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) {
	 //	m_isMasterAdmin = true; m_isLocal = true; }
	 //if ( strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) {
	 //	m_isMasterAdmin = true; m_isLocal = true; }
	 //if(strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) m_isLocal = true;
	 //if(strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) m_isLocal = true;
	 if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0) 
		 m_isLocal = true;
	 if ( sock && strncmp(iptoa(sock->m_ip),"10.",3) == 0) 
		 m_isLocal = true;

	 // gotta scan all ips in hosts.conf as well...
	 // if we are coming from any of our own hosts.conf c blocks
	 // consider ourselves local
	 uint32_t last = 0;
	 for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
		 Host *h = g_hostdb.getHost(i);
		 // save time with this check
		 if ( h->m_ip == last ) continue;
		 // update it
		 last = h->m_ip;
		 // returns number of top bytes in comon
		 int32_t nt = sock ? ipCmp ( sock->m_ip , h->m_ip ) : 0;
		 // at least be in the same c-block as a host in hosts.conf
		 if ( nt < 3 ) continue;
		 m_isLocal = true;
		 break;
	 }
		 

	 // connectips/adminips
	 // for ( int32_t i = 0 ; i < g_conf.m_numConnectIps ; i++ ) {
	 // 	 if ( sock->m_ip != g_conf.m_connectIps[i] ) continue;
	 // 	 m_isLocal = true;
	 // 	 break;
	 // }

	 // 127.0.0.1
	 if ( sock && sock->m_ip == 16777343 )
		 m_isLocal = true;

	 // . TODO: now add any cgi data from a POST.....
	 // . look after the mime
	 //char *d = NULL;
	 // check for body if it was a POST request
	 //if ( m_requestType == RT_POST ) d = strstr ( req , "\r\n\r\n" );

	 // return true now if no cgi stuff to parse
	 if ( d ) {
	 	// now put d's char back, just in case... does it really matter?
		*d = dc;

		 char *post    = d + 4;
		 int32_t  postLen = reqLen-(d+4-req) ;
		 // post sometimes has a \r or\n after it
		 while ( postLen > 0 && post[postLen-1]=='\r' ) postLen--;
		 // add it to m_cgiBuf, filter and everything
		 if ( ! addCgi ( post , postLen ) ) return false;
	 }

	 // Put '\0' back into the HttpRequest buffer...
	 // crap, not if we are multi-part unencoded stuff...
	 if ( m_cgiBuf && ! multipart ) {
		 // do not mangle the "ucontent"!
		 int32_t cgiBufLen = m_cgiBufLen;
		 cgiBufLen -= m_ucontentLen;
		 char *buf = m_cgiBuf;
		 for (int32_t i = 0; i < cgiBufLen ; i++) 
			 if (buf[i] == '&') buf[i] = '\0';
		 // don't decode the ucontent= field!
		 int32_t decodeLen = m_cgiBufLen;
		 // so subtract that
		 if ( m_ucontent ) decodeLen -= m_ucontentLen;
		 // decode everything. fixed for %00 in &content= so it
		 // doesn't set our parms when injecting.
		 int32_t len = urlDecodeNoZeroes(m_cgiBuf,m_cgiBuf,decodeLen);
		 // we're parsing crap after the null if the last parm 
		 // has no value
		 //memset(m_cgiBuf+len, '\0', m_cgiBufLen-len);
		 m_cgiBufLen = len;
		 // ensure that is null i guess
		 if ( ! m_ucontent ) m_cgiBuf[len] = '\0';
	 }
	
	 if (m_cgiBuf2){
		 char *buf = m_cgiBuf2;
		 for (int32_t i = 0; i < m_cgiBuf2Size-1 ; i++) 
			 if (buf[i] == '&') buf[i] = '\0';
		 // decode everything. fixed for %00 in &content= so it
		 // doesn't set our parms when injecting.
		 int32_t len = urlDecodeNoZeroes ( m_cgiBuf2 , 
						   m_cgiBuf2 , 
						   m_cgiBuf2Size);
		 memset(m_cgiBuf2+len, '\0', m_cgiBuf2Size-len);
	 }
	 // . parse the fields after the ? in a cgi filename
	 // . or fields in the content if it's a POST
	 // . m_cgiBuf must be and is NULL terminated for this
	 parseFields ( m_cgiBuf , m_cgiBufLen );
	 // Add extra parms to the request.  
	 if (m_cgiBuf2Size){
		 parseFields(m_cgiBuf2, m_cgiBuf2Size);
	 }

	 // urldecode the cookie buf too!!
	 if ( m_cookiePtr ) {
		 char *p = m_cookiePtr;
		 for (int32_t i = 0; i < m_cookieLen ; i++) {
			 //if (p[i] == '&') p[i] = '\0';
			 // cookies are separated with ';' in the request only
			 if (p[i] == ';') p[i] = '\0';
			 // a hack for the metacookie=....
			 // which uses &'s to separate its subcookies
			 // this is a hack for msie's limit of 50 cookies
			 if ( p[i] == '&' ) p[i] = '\0';
			 // set m_metaCookie to start of meta cookie
			 if ( p[i] == 'm' && p[i+1] == 'e' &&
			      strncmp(p,"metacookie",10) == 0 )
				 m_metaCookie = p;
		 }
		 int32_t len = urlDecode ( m_cookiePtr , 
					m_cookiePtr,
					m_cookieLen );
		 // we're parsing crap after the null if the last parm 
		 // has no value
		 memset(m_cookiePtr+len, '\0', m_cookieLen-len);
		 m_cookieLen = len;
	 }

	 return true;
 }

Example #13

Show file

File: HttpRequest.cpp Project: privacore/open-source-search-engine

// . form an HTTP request 
// . use size 0 for HEAD requests
// . use size -1 for GET whole doc requests
// . build the raw HTTP request for "url" and store it in m_reqBuf
// . fill in your own offset/size for partial GET requests:
//     size ==  0               : HEAD request (m_requestType = RT_HEAD)
//     size == -1 , offset <= 0 : plain request for the whole document
//     size == -1 , offset >  0 : "Range: bytes=<offset>-"
//     any other size           : "Range: bytes=<offset>-<offset+size>"
// . if doPost is true the command is POST (even when size is 0) and
//   whatever follows the first '?' in the path is sent as the body
// . "url" is split in place at that '?' while the request is being built
//   and is restored before returning. all validation happens before the
//   split so an error return never leaves "url" modified.
// . returns false and sets g_errno on error
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set (char *url,int32_t offset,int32_t size,time_t ifModifiedSince,
		       const char *userAgent, const char *proto, bool doPost,
		       const char *cookieJar, const char *additionalHeader,
		       // if posting something, how many bytes is it?
		       int32_t postContentLen ,
		       // are we sending the request through an http proxy?
		       // if so this will be non-zero
		       int32_t proxyIp ,
		       const char *proxyUsernamePwd ) {

	m_reqBufValid = false;

	int32_t hlen = 0;
	int32_t port = 80;
	const char *hptr = getHostFast ( url , &hlen , &port );
	char *path = getPathFast ( url );

	// . use the full url if sending to an http proxy
	// . HACK: do NOT do this if it is httpS because we end up
	//   using the http tunnel using the CONNECT cmd and the squid proxy
	//   will just forward/proxy just the entire tcp packets.
	if ( proxyIp && strncmp(url,"https://",8) != 0 ) path = url;

	// the host (plus an optional ":port" suffix) is copied in here.
	// the extra 8 bytes leave room for ':' + 5 port digits + '\0'.
	char host[1024+8];

	// if no legit host
	if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; }
	// host must fit in our buffer, otherwise the copy below would
	// run off the end of the stack array
	if ( hlen > (int32_t)sizeof(host) - 8 ) {
		g_errno = EBADURL; return false; }
	// sanity check. port is only 16 bits. a negative port would also
	// print as up to 10 digits and overrun the slack in host[].
	if ( port < 0 || port > (int32_t)0xffff ) {
		g_errno = EBADURL; return false; }

	// . for a POST, cut the url at the '?' so the query string becomes
	//   the request body
	// . the '?' is put back at the bottom of this function
	char *pathEnd  = NULL;
	const char *postData = NULL;
	if ( doPost ) {
		pathEnd  = strstr(path,"?");
		if ( pathEnd ) {
			*pathEnd = '\0';
			postData = pathEnd + 1;
		}
	}

	// return false and set g_errno if url too big
	//if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) { 
	//	g_errno = EURLTOOBIG; return false;}
	// assume request type is a GET
	m_requestType = RT_GET;//0;
	// get the host NULL terminated (length was bounds checked above)
	memcpy ( host , hptr , hlen );
	host [ hlen ] = '\0';
	// then port, only if it is not the default
	if ( port != 80 ) {
		snprintf ( host + hlen , sizeof(host) - hlen ,
			   ":%" PRIu32 , (uint32_t)port );
		hlen += strlen ( host + hlen );
	}
	// the if-modified-since field. currently always empty because the
	// code that would fill it in is compiled out below.
	const char *ims = "";
#if 0
	char  ibuf[64];
	if ( ifModifiedSince ) {
		struct tm tm_buf;
		char buf[64];
		// NOTE: ctime appends a \n 
		snprintf(ibuf, sizeof(ibuf), "If-Modified-Since: %s UTC",
			asctime_r(gmtime_r(&ifModifiedSince,&tm_buf),buf));
		// get the length
		int32_t ilen = strlen(ibuf);
		if( ilen && ilen < (int32_t)sizeof(ibuf)-1 ) {
			// hack off \n from ctime - replace with \r\n\0
			ibuf [ ilen - 1 ] = '\r';
			ibuf [ ilen     ] = '\n';
			ibuf [ ilen + 1 ] = '\0';
			// set ims to this string
			ims = ibuf;
		}
	}
	// . until we fix if-modified-since, take it out
	// . seems like we are being called with it as true when should not be
	ims="";
#endif

	// . use one in conf file if caller did not provide
	// . this is usually Gigabot/1.0
	if ( ! userAgent ) userAgent = g_conf.m_spiderUserAgent;
	// accept anything
	const char *accept = "*/*";

	// HEAD if no bytes are wanted, but a POST always wins
	const char *cmd = "GET";
	if ( size == 0 ) cmd = "HEAD";
	if ( doPost    ) cmd = "POST";

	// crap, can't spider nyt.com if we are 1.0, so use 1.0 but also
	// note Connection: Close\r\n when making requests
	//proto = "HTTP/1.1";

	// . optional "Proxy-Authorization" header line
	// . "tmp" owns the memory "up" points to, so it must stay alive
	//   until the request has been printed
	SafeBuf tmp;
	const char *up = "";
	if ( proxyUsernamePwd && proxyUsernamePwd[0] ) {
		tmp.safePrintf("Proxy-Authorization: Basic ");
		tmp.base64Encode (proxyUsernamePwd,strlen(proxyUsernamePwd));
		tmp.safePrintf("\r\n");
		up = tmp.getBufStart();
	}

	// . now use "Accept-Language: en" to tell servers we prefer english
	// . i removed keep-alive connection since some connections close on
	//   non-200 ok http statuses and we think they're open since close
	//   signal (read 0 bytes) may have been delayed
	// . only the plain (non-range, non-HEAD) request below sends the
	//   Accept-Encoding line
	const char *acceptEncoding = "";
	if ( g_conf.m_gzipDownloads )
		acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n";

	// start from an empty request buffer
	m_reqBuf.purge();
	// indicate this is good
	m_reqBufValid = true;

	if ( size == 0 ) {
		// 1 for HEAD requests
		m_requestType = RT_HEAD;
		m_reqBuf.safePrintf (
			   "%s %s %s\r\n"
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   "Accept-Language: en\r\n"
			   "Accept: %s\r\n"
			   "%s"
			   ,
			   cmd ,
			   path ,
			   proto ,
			   host ,
			   ims ,
			   userAgent ,
			   accept ,
			   up );
	}
	else if ( size != -1 ) {
		// NOTE(review): an HTTP byte range is inclusive at both
		// ends, so "offset + size" asks for size+1 bytes. left
		// unchanged because callers may depend on it - confirm.
		m_reqBuf.safePrintf (
			   "%s %s %s\r\n"
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   "Accept-Language: en\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%" PRId32 "-%" PRId32 "\r\n"
			   "%s"
			   ,
			   cmd ,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
			   offset + size ,
			   up );
	}
	else if ( offset > 0 ) {
		// size is -1: everything from "offset" to the end
		m_reqBuf.safePrintf (
			   "%s %s %s\r\n"
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   "Accept-Language: en\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%" PRId32 "-\r\n"
			   "%s"
			   ,
			   cmd ,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
			   up );
	}
	else {
		// the whole document. header order here differs from the
		// branches above (User-Agent and Accept come before Host).
		m_reqBuf.safePrintf (
			   "%s %s %s\r\n"
			   "User-Agent: %s\r\n"
			   "Accept: */*\r\n"
			   "Host: %s\r\n"
			   "%s"
			   "Connection: Close\r\n"
			   "%s"
			   "%s"
			   ,
			   cmd ,
			   path ,
			   proto ,
			   userAgent ,
			   host ,
			   ims ,
			   acceptEncoding ,
			   up );
	}

	// caller supplied header line, we add the line terminator
	if ( additionalHeader )
		m_reqBuf.safePrintf("%s\r\n",additionalHeader );

	// cookie here
	if ( cookieJar ) {
		HttpMime::addCookieHeader(cookieJar, url, &m_reqBuf);
	}

	// print content-type: if we have a body to post
	if ( postData ) {
		// dammit... recaptcha does not work without this!!!!
		m_reqBuf.safePrintf (
			      "Content-Type: "
			      "application/x-www-form-urlencoded\r\n");
	}

	if ( doPost ) {
		// we need the content-length even if postData is NULL
		int32_t contentLen = 0;
		if ( postData ) contentLen = strlen(postData);
		// this overrides if provided. -1 is default
		if ( postContentLen >= 0 ) contentLen = postContentLen;
		m_reqBuf.safePrintf ("Content-Length: %" PRId32 "\r\n",
				     contentLen );
		// blank line ends the headers, then the body
		m_reqBuf.safePrintf("\r\n");
		if ( postData ) m_reqBuf.safePrintf("%s",postData);
	}
	else {
		// blank line ends the headers
		m_reqBuf.safePrintf("\r\n");
	}

	// restore url buffer
	if ( pathEnd ) *pathEnd = '?';

	return true;
}

Example #14

Show file

File: Log.cpp Project: LetsUnlockiPhone/open-source-search-engine

// . reset the logger state and, if "filename" is not NULL, open that file
//   for appending and remember its descriptor in m_fd
// . when running as a daemon an existing "log<hostId>" file in
//   g_hostdb.m_dir is first renamed to "log<hostId>-<timestamp>"
// . returns true on success (or when there is no filename to log to)
// . returns false after complaining on stderr otherwise
bool Log::init ( char *filename ) {
	// set the main process id
	//s_pid = getpidtid();
	setPid();
	// init these
	m_numErrors =  0;
	m_bufPtr    =  0;
	m_fd        = -1;
	m_disabled  = false;

#ifdef DEBUG
	g_dbufSize = 4096;
	g_dbuf = (char*)mmalloc(g_dbufSize,"Log: DebugBuffer");
	if (!g_dbuf) fprintf(stderr, "Unable to init debug buffer");
#endif
	//	m_hostname  = g_conf.m_hostname;
	//	m_port      = port;
	// is there a filename to log our errors to?
	m_filename = filename;
	if ( ! m_filename ) return true;

	//
	// RENAME log000 to log000-2013_11_04-18:19:32
	//
	if ( g_conf.m_runAsDaemon ) {
		File f;
		// "log" + every digit a long can have + '\0' fits in here.
		// snprintf so a surprising host id can never overrun it.
		char tmp[32];
		snprintf(tmp,sizeof(tmp),"log%03li",g_hostdb.m_hostId);
		f.set ( g_hostdb.m_dir , tmp );
		// make new filename like log000-2013_11_04-18:19:32
		time_t now = getTimeLocal();
		tm *tm1 = gmtime(&now);
		char tmp2[64];
		tmp2[0] = '\0';
		// gmtime() returns NULL if the time cannot be represented
		// and strftime() returns 0 if the result did not fit. in
		// both cases we have no usable timestamp.
		if ( ! tm1 ||
		     strftime(tmp2,sizeof(tmp2),"%Y_%m_%d-%T",tm1) == 0 ) {
			fprintf(stderr,"log rename failed\n");
			return false;
		}
		SafeBuf newName;
		if ( ! newName.safePrintf ( "%slog%03li-%s",
					    g_hostdb.m_dir,
					    g_hostdb.m_hostId,
					    tmp2 ) ) {
			fprintf(stderr,"log rename failed\n");
			return false;
		}
		// rename log000 to log000-2013_11_04-18:19:32
		if ( f.doesExist() ) {
			//fprintf(stdout,"renaming file\n");
			f.rename ( newName.getBufStart() );
		}
	}


	// open it for appending.
	// create with -rw-rw-r-- permissions if it's not there.
	m_fd = open ( m_filename , 
		      O_APPEND | O_CREAT | O_RDWR , 
		      S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
	if ( m_fd >= 0 ) return true;
	// bitch to stderr and return false on error
	fprintf(stderr,"could not open log file %s for appending\n",
		m_filename);
	return false;
}

Example #15

Show file

File: PageTitledb.cpp Project: chushuai/open-source-search-engine

// . build the titledb admin page for the docId in the State4 "state"
//   and send it on the state's TcpSocket
// . if there was an error, the docId is 0 or no title rec was loaded, a
//   short page with the docId form and a message is sent instead
// . the State4 is freed on every path out of this function
// . returns whatever the HttpServer send call returns
//   (presumably false if blocked, true otherwise - see HttpServer)
bool gotTitleRec ( void *state ) {
	// cast the State4 out
	State4 *st = (State4 *) state;
	// get the socket. it is not part of "st" so it is still usable
	// after "st" has been deleted below.
	TcpSocket *s = st->m_socket;

	SafeBuf sb;
	// get it's docId
	long long docId = st->m_docId;
	// print standard header
	sb.reserve2x ( 32768 );
	g_pages.printAdminTop (&sb, st->m_socket, &st->m_r );
	// shortcut
	XmlDoc *xd = &st->m_xd;

	// . deal with errors
	// . print none if non title rec at or after the provided docId
	if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) {
		// print docId in box
		sb.safePrintf (  "<center>\nEnter docId: "
				 "<input type=text name=d value=%lli size=15>",
				 docId);
		sb.safePrintf ( "</form><br>\n" );
		if ( docId == 0 ) 
			sb.safePrintf("<br>");
		else if ( g_errno ) 
			sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno));
		else 
			sb.safePrintf("<br><br>No titleRec for that docId "
				      "or higher");
		// print where it should be
		long shardNum = getShardNumFromDocId ( docId );
		Host *hosts = g_hostdb.getShard ( shardNum );
		long hostId = -1;
		if ( hosts ) hostId = hosts[0].m_hostId;
		sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
		sb.safePrintf ( "\n</center>" );
		mdelete ( st , sizeof(State4) , "PageTitledb");
		delete (st);
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		return g_httpServer.sendDynamicPage ( s , 
						      sb.getBufStart(),
						      sb.length() );
	}
	// print docId in box
	sb.safePrintf ("<br>\n"
		       "<center>Enter docId: "
		       "<input type=text name=d value=%lli size=15>", docId );
	// print where it should be
	long shardNum = getShardNumFromDocId ( docId );
	Host *hosts = g_hostdb.getShard ( shardNum );
	long hostId = -1;
	if ( hosts ) hostId = hosts[0].m_hostId;
	sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
	sb.safePrintf ( "</form><br>\n" );

	Title *ti = xd->getTitle();
	if ( ! ti ) {
		log ( "admin: Could not set title" );
		// send the reply first since it needs the current g_errno,
		// then release the state so this path does not leak it
		bool status = g_httpServer.sendErrorReply(s,500,
							  mstrerror(g_errno));
		mdelete ( st , sizeof(State4) , "PageTitledb");
		delete (st);
		return status;
	}
	// sanity check. should not block. deliberately crash if it did.
	if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; }

	// print it out
	xd->printDoc ( &sb );

	// don't forget to cleanup. "sb" is a local so it outlives "st".
	mdelete ( st , sizeof(State4) , "PageTitledb");
	delete (st);
	// now encapsulate it in html head/tail and send it off
	return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
}

Example #16

Show file

File: qa.cpp Project: firatkarakusoglu/open-source-search-engine

//
// the injection qa test suite
//
bool qainject1 ( ) {

	//if ( ! s_callback ) s_callback = qainject1;

	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// this only loads once
	loadUrls();
	long max = s_ubuf2.length()/(long)sizeof(char *);
	//max = 1;

	//
	// inject urls, return false if not done yet
	//
	//static bool s_x4 = false;
	if ( ! s_flags[2] ) {
		// TODO: try delimeter based injection too
		//static long s_ii = 0;
		for ( ; s_flags[20] < max ; ) {
			// inject using html api
			SafeBuf sb;
			sb.safePrintf("&c=qatest123&deleteurl=0&"
				      "format=xml&u=");
			sb.urlEncode ( s_urlPtrs[s_flags[20]] );
			// the content
			sb.safePrintf("&hasmime=1");
			// sanity
			//if ( strstr(s_urlPtrs[s_flags[20]],"wdc.htm") )
			//	log("hey");
			sb.safePrintf("&content=");
			sb.urlEncode(s_contentPtrs[s_flags[20]] );
			sb.nullTerm();
			// pre-inc it in case getUrl() blocks
			s_flags[20]++;//ii++;
			if ( ! getUrl("/admin/inject",
				      0, // no idea what crc to expect
				      sb.getBufStart()) )
				return false;
		}
		s_flags[2] = true;
	}

	// +the
	//static bool s_x5 = false;
	if ( ! s_flags[3] ) {
		wait(1.5);
		s_flags[3] = true;
		return false;
	}

	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}

	// sports news
	//static bool s_x7 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=sports+news",2009472889 ) )
		     return false;
	}

	// 'washer & dryer' does some algorithmic synonyms 'washer and dryer'
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"debug=1&q=washer+%26+dryer",9999 ) )
		     return false;
	}

	//
	// mdw: query reindex test
	//
	// if ( ! s_flags[30] ) {
	// 	s_flags[30] = true;
	// 	if ( ! getUrl ( "/admin/reindex?c=qatest123&qa=1&format=xml&"
	// 			"debug=1&q=sports",9999 ) )
	// 		return false;
	// }

	// // temp end it here
	// return true;

	//
	// eject/delete the urls
	//
	//static long s_ii2 = 0;
	for ( ; s_flags[5] < max ; ) {
		// reject using html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
			       "format=xml&u=");
		sb.urlEncode ( s_urlPtrs[s_flags[5]] );
		sb.nullTerm();
		// pre-inc it in case getUrl() blocks
		//s_ii2++;
		s_flags[5]++;
		if ( ! getUrl ( sb.getBufStart() , 0 ) )
			return false;
	}

	//
	// make sure no results left, +the
	//
	if ( ! s_flags[6] ) {
		wait(1.5);
		s_flags[6] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=2&format=xml&q=%2Bthe",
				-1672870556 ) )
			return false;
	}

	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
			"QA INJECT TEST 1");
		//if ( s_callback == qainject ) exit(0);
		return true;
	}


	return true;
}

Example #17

Show file

File: Stats.cpp Project: BILObilo/open-source-search-engine

//
// new code for drawing graph in html with absolute divs instead
// of using GIF plotter library which had issues
//
// . appends one outer <div> to "sb" containing x-axis tick marks, their
//   labels and one horizontal segment per StatPoint in m_pts[] that
//   falls in the time window ending at the latest m_endTime seen
// . the outer <div> is always closed, even if we run out of memory
//
void Stats::printGraphInHtml ( SafeBuf &sb ) {

	// find the end of the time range: the latest end time of any point
	long long t2 = 0;
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// set max
		if ( m_pts[i].m_endTime   > t2 ) t2 = m_pts[i].m_endTime;
	}
	// now compute the start time for the graph
	long long t1 = 0x7fffffffffffffffLL;
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// can't start more than DT before the end of the range
		if ( m_pts[i].m_startTime   < t2 - DT ) continue;
		// otherwise, it's a candidate for the first time
		if ( m_pts[i].m_startTime < t1 ) t1 = m_pts[i].m_startTime;
	}

	//
	// main graphing window
	//
	sb.safePrintf("<div style=\"position:relative;"
		      "background-color:#c0c0c0;"

		      // match style of tables
		      "border-radius:10px;"
		      "border:#6060f0 2px solid;"

		      "overflow-x:hidden;"
		      "z-index:-10;"
		      // the tick marks we print below are spread over
		      // DX pixels
		      "min-width:%lipx;"
		      "min-height:%lipx;"
		      "margin-bottom:10px;"
		      "\">"
		      ,(long)DX
		      ,(long)DY +20); // extra room for the "2s" labels etc.

	// x-axis tick marks, one every DX/20 pixels
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:0;"
			      "background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\"></div>\n"
			      , (long)x-1
			      );
		// LABEL
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:20;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\">%lis</div>\n"
			      , (long)x-10
			      // the label: DT scaled to this x, over 1000
			      ,(long)(DT * (long long)x / (long long)DX)/1000
			      );
	}

	// . each line consists of several points
	// . we need to know each point for adding otherlines
	// . each line can contain multiple data points
	// . each data point is expressed as a horizontal line segment
	// . one allocation holds a MAX_LINES x MAX_POINTS table of point
	//   pointers followed by MAX_LINES point counts
	void *lrgBuf;
	long lrgSize = 0;
	lrgSize += MAX_LINES * MAX_POINTS * sizeof(StatPoint *);
	lrgSize += MAX_LINES * sizeof(long);
	lrgBuf = (char *) mmalloc(lrgSize, "Stats.cpp"); 
	if (! lrgBuf) {
	    log("could not allocate memory for local buffer in Stats.cpp. "
		"%li bytes needed", lrgSize);
	    // do not leave the graphing window div open
	    sb.safePrintf("</div>\n");
	    return;
	}
	char *lrgPtr = (char *)lrgBuf;
	StatPoint **points = (StatPoint **)lrgPtr;   
	lrgPtr += MAX_LINES * MAX_POINTS * sizeof(StatPoint *);
	long *numPoints = (long *)lrgPtr;
	lrgPtr += MAX_LINES * sizeof(long);
	memset ( (char *)numPoints , 0 , MAX_LINES * sizeof(long) );

	// . store the data points into "lines"
	// . visit each of the MAX_POINTS slots exactly once, starting at
	//   m_next. the old "count >= 0" test ran one time too many and
	//   handed the slot at m_next to addPoint() twice.
	long count = MAX_POINTS;
	for ( long i = m_next ; count > 0 ; i++ , count-- ) {
		// wrap around the array
		if ( i >= MAX_POINTS ) i = 0;
		// skip point if empty
		if ( m_pts[i].m_startTime == 0 ) continue;
		// skip if too early
		if ( m_pts[i].m_endTime < t1 ) continue;
		// . find the lowest line the will hold us
		// . this adds point to points[x][n] where x is determined
		addPoint ( points , numPoints , &m_pts[i] );
	}

	int y1 = 21;
	// plot the points (lines) in each line
	for ( long i = 0 ; i < MAX_LINES    ; i++ ) {
		// increase vert
		y1 += MAX_WIDTH + 1;
		// wrap back down if necessary
		if ( y1 >= DY ) y1 = 21;
		// plt all points in this row
		for ( long j = 0 ; j < numPoints[i] ; j++ ) {
			// get the point
			StatPoint *p =  points[MAX_POINTS * i + j];
			// transform time to x coordinates
			int x1 = (p->m_startTime - t1) * (long long)DX / DT;
			int x2 = (p->m_endTime   - t1) * (long long)DX / DT;
			// if x2 is negative, skip it
			if ( x2 < 0 ) continue;
			// if x1 is negative, boost it to -2
			if ( x1 < 0 ) x1 = -2;
			// . line thickness is function of read/write size
			// . take logs, but only of a positive size since
			//   log() of 0 or less is -inf/NaN and converting
			//   that to int is undefined
			int w = 3;
			if ( p->m_numBytes > 0 )
				w = (int)log(((double)p->m_numBytes)/8192.0)
					+ 3;
			if ( w < 3         ) w = 3;
			if ( w > MAX_WIDTH ) w = MAX_WIDTH;
			// ensure at least 3 units wide for visibility
			if ( x2 < x1 + 3 ) x2 = x1 + 3;
			// . flip the y so we don't have to scroll the
			//   browser down
			// . DY does not include the axis and tick marks
			long fy1 = DY - y1 + 20 ;
			// plot it in the color stored with the point
			drawLine2 ( sb , x1 , x2 , fy1 , p->m_color , w );
		}
	}

	sb.safePrintf("</div>\n");

	mfree(lrgBuf, lrgSize, "Stats.cpp");
}

Example #18

Show file

File: qa.cpp Project: firatkarakusoglu/open-source-search-engine

//
// the delimeter based injection qa test
//
// . re-entrant step function: every step is guarded by an entry in
//   s_flags[] that is set before the step's request is issued
// . whenever getUrl() returns false, or after a wait() has been
//   scheduled, we return false; steps whose flag is already set are
//   skipped on the next call
// . returns true once every step has run
//
bool qainject2 ( ) {

	//if ( ! s_callback ) s_callback = qainject2;

	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}


	//
	// try delimeter based injecting
	//
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123&deleteurl=0&"
			      "delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
			      "hasmime=1&content=");
		// use injectme3 file
		SafeBuf ubuf;
		ubuf.load("./injectme3");
		// . if the file could not be loaded there may be no buffer
		//   at all, so do not hand a NULL string to urlEncode()
		// . we still send the request: its reply will then not
		//   match the expected checksum passed to getUrl()
		if ( ubuf.length() > 0 && ubuf.getBufStart() )
			sb.urlEncode(ubuf.getBufStart());
		else
			log("qa: could not load ./injectme3");
		// the buffer is passed on as a plain string below, so
		// terminate it the same way qainject1() does
		sb.nullTerm();
		if ( ! getUrl ( "/admin/inject",
				// check reply, seems to have only a single 
				// docid in it
				-1970198487, sb.getBufStart()) )
			return false;
	}

	// schedule a pause before the query checks
	if ( ! s_flags[8] ) {
		wait(1.5);
		s_flags[8] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				-1804253505 ) )
			return false;
	}

	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=1"
				,-1874756636 ) )
			return false;
	}

	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&hacr=1"
				,1651330319 ) )
			return false;
	}

	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&sc=1"
				,-1405546537 ) )
			return false;
	}


	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}


	// announce success, but only once
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
			"QA INJECT TEST 2");
	}

	return true;
}

Example #19

Show file

File: PageGet.cpp Project: NEXUS1000/open-source-search-engine

// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isMasterAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	/*
	  // this was calling XmlDoc and setting sections, etc. to
	  // get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isMasterAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//int32_t   len  = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %"INT32" is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	int32_t  contentLen = xd->size_utf8Content - 1;

	// int16_tcut
	char strip = st->m_strip;

	// alloc buffer now
	//char *buf = NULL;
	//int32_t  bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p          = buf;
	//char *bufEnd     = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}

	// for undoing the header
	//char *start1 = p;
	int32_t startLen1 = sb->length();

	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
		//p += gbstrlen ( p );
	}

	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	if ( xd->m_contentType == CT_JSON ) sb->reset();
	if ( xd->m_contentType == CT_XML  ) sb->reset();
	if ( xd->m_contentType == CT_STATUS ) sb->reset();

	// for undoing the stuff below
	int32_t startLen2 = sb->length();//p;

	// query should be NULL terminated
	char *q    = st->m_qsb.getBufStart();
	int32_t  qlen = st->m_qsb.getLength(); // m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( xd->m_contentType == CT_STATUS )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {

		sb->safePrintf(//sprintf ( p , 
			  //"<BASE HREF=\"%s\">"
			  //"<table border=1 width=100%%>"
			  //"<tr><td>"
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			  "><td>"
			  //"<font face=times,sans-serif color=black size=-1>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p , 
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		//p += gbstrlen ( p );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		//p += gbstrlen ( p );

		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%"INT32"&amp;"
			      "d=%"INT64"&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (int32_t)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
			//p += gbstrlen ( p );
		}
		
	}

	// how much space left in p?
	//int32_t avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//uint32_t  ip   =g_conf.m_mainExternalIp  ; // h->m_externalIp;
	//uint16_t port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	uint32_t  ip   = h->m_ip;
	uint16_t port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%"INT32"/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	int32_t elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%"INT32"&rtq=%"INT32"d=%"INT64"",
	//	  (int32_t)st->m_seq,(int32_t)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%"INT64"",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( int32_t i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	int32_t hilen = 0;

	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doSteming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//gbmemcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;
	if ( xd->m_contentType == CT_XML )
		includeHeader = false;
	if ( xd->m_contentType == CT_STATUS )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}

	//sb->safeStrcpy(tbuf);



	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%"UINT64"</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%"INT32"</cachedTimeUTC>\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%"UINT64",\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%"INT32",\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	
	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;

	char ctype = (char)xd->m_contentType;

	// do not calc title or print it if doc is xml or json
	if ( ctype == CT_XML ) sbend = sbstart;
	if ( ctype == CT_JSON ) sbend = sbstart;
	if ( ctype == CT_STATUS ) sbend = sbstart;

	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		//p += sprintf ( p , 
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		int32_t printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p , 
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%"INT64"&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			int32_t tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(int32_t)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;
	if ( xd->m_contentType == CT_STATUS )
		queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		xb->nullTerm();
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;

		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail , 
				 &ww , &m ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}

	// calculate bufLen
	//int32_t bufLen = p - buf;

	/*

	  MDW: return the xml page as is now. 9/28/2014

	int32_t ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}
	*/

	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( xd->m_contentType == CT_STATUS )
		contentType = "application/json";

	if ( xd->m_contentType == CT_XML )
		contentType = "test/xml";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);


	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer

	//if      ( ct == CT_XML ) newbuf.purge();
	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
	
	// and convey the status
	return status;
}

Example #20

Show file

File: qa.cpp Project: firatkarakusoglu/open-source-search-engine

// . QA "spider1" test driver for the 'qatest123' collection
// . every step below is guarded by its own s_flags[] slot; the slot is set
//   when the step is started so that a later re-entry skips that step
// . returns false right after starting a wait() timer and whenever a
//   getUrl() call returns false; returns true once the last step has run
// . the numeric second argument passed to some getUrl() calls is the
//   expected checksum of the reply (see the addcoll step)
bool qaspider1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		// "%%" is a literal '%' here because this goes through
		// safePrintf(), so e.g. "%%21" is sent as "%21"
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

	       "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      //	       "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      // just one spider out allowed for consistency
	       "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

	       "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"

		);
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to 
	// a few sites
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
			     "tag:shallow site:http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}
		
	//
	// use the add url interface now
	// walmart.com above was not seeded because of the site: directive
	// so this will seed it.
	//
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com"
			      );
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		//sb.urlEncode(s_urls1);
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	if ( ! s_flags[5] ) {
		// wait 5 seconds, call sleep timer... then call qatest()
		//usleep(5000000); // 5 seconds
		wait(3.0);
		s_flags[5] = true;
		return false;
	}

	// fetch the spider status page; the reply is inspected below
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	if ( ! s_flags[6] ) {
		// ensure spiders are done. 
		// "Nothing currently available to spider"
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			// not done yet: re-arm the wait and status steps
			// and poll again
			s_flags[5] = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}


	// wait for index msg4 to not be cached to ensure all results indexed
	if ( ! s_flags[22] ) {
		s_flags[22] = true;
		wait(1.5);
		// hand control back, exactly like the wait(3.0) step above,
		// so the delay really elapses before the queries below are
		// issued and the timer does not re-enter us mid-request
		return false;
	}


	// verify no results for gbhopcount:2 query
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1672870556 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A0",
				908338607 ) )
			return false;
	}
	
	// check facet sections query for walmart
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
				"q=gbfacetstr%3Agbxpathsitehash2492664135",
				55157060 ) )
			return false;
	}

	// cached page for one docid, default format
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}


	// delete the collection
	// (step disabled: the collection is left in place)
	// if ( ! s_flags[13] ) {
	// 	s_flags[13] = true;
	// 	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	// 		return false;
	// }

	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=site2%3Awww.walmart.com+"
				"gbsortby%3Agbspiderdate",
				999 ) )
			return false;
	}

	// xpath is like a title here i think. check the returned
	// facet table in the left column
	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
				"q=gbfacetstr%3Agbxpathsitehash3624590799"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetint%3Agbhopcount"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
				"q=gbfacetint%3Alog.score"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[21] ) {
		s_flags[21] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetfloat%3Atalks.rating"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[23] ) {
		s_flags[23] = true;
		// test facets mixed with gigabits in left hand column
		if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
				"q=gbfacetint%3Agbhopcount+walmart"
				, 999 ) )
			return false;
	}


	// log completion only the first time we get this far
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		log("qa: SUCCESSFULLY COMPLETED "
			"QA SPIDER1 TEST");
		return true;
	}

	return true;
}

Example #21

Show file

File: PageParser.cpp Project: BILObilo/open-source-search-engine

bool processLoop ( void *state ) {
    // cast it
    State8 *st = (State8 *)state;
    // get the xmldoc
    XmlDoc *xd = &st->m_xd;

    // error?
    if ( g_errno ) return sendErrorReply ( st , g_errno );

    // shortcut
    SafeBuf *xbuf = &st->m_xbuf;

    if ( st->m_u && st->m_u[0] ) {
        // . save the ips.txt file if we are the test coll
        // . saveTestBuf() is a function in Msge1.cpp
        CollectionRec *cr = xd->getCollRec();
        if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") )
            // use same dir that XmlDoc::getTestDir() would use
            saveTestBuf ( "test-page-parser" );
        // now get the meta list, in the process it will print out a
        // bunch of junk into st->m_xbuf
        char *metalist = xd->getMetaList ( );
        if ( ! metalist ) return sendErrorReply ( st , g_errno );
        // return false if it blocked
        if ( metalist == (void *)-1 ) return false;
        // for debug...
        if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false );
        // print it out
        xd->printDoc( xbuf );
    }

    // print reason we can't analyze it (or index it)
    //if ( st->m_indexCode != 0 ) {
    //	xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>",
    //			  mstrerror(st->m_indexCode));
    //}

    // we are done
    g_inPageParser = false;

    // print the final tail
    //p += g_httpServer.printTail ( p , pend - p );

    //log("parser: send sock=%li",st->m_s->m_sd);

    // now encapsulate it in html head/tail and send it off
    bool status = g_httpServer.sendDynamicPage( st->m_s ,
                  xbuf->getBufStart(),
                  xbuf->length() ,
                  -1, //cachtime
                  false ,//postreply?
                  NULL, //ctype
                  -1 , //httpstatus
                  NULL,//cookie
                  "utf-8");
    // delete the state now
    if ( st->m_freeIt ) {
        mdelete ( st , sizeof(State8) , "PageParser" );
        delete (st);
    }
    // return the status
    return status;
}

Example #22

Show file

File: PageIndexdb.cpp Project: BKJackson/open-source-search-engine

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;
	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0  ,
				    0  ,
				    0  ,    // max cache age
				    false , // add to cache?
				    RDB_DATEDB  , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2  ,
				    (char *)&startKey  ,
				    (char *)&endKey    ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree   , // include tree?
				    //st->m_useCache  , // include cache?
				    //false     , // add to cache?
				    //0         , // startFileNum
				    //numFiles  , // numFiles
				    st        , // state
				    gotIndexListWrapper2 ,
				    0  ) )  // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}


void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/
	// get the socket
	TcpSocket *s = st->m_socket;
	// don't allow pages bigger than 128k in cache
	//char  buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;
	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/
	// check box " checked" strings
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb) ubs = " checked";
	if ( st->m_useTree  ) uts = " checked";
	if ( st->m_useDisk  ) uds = " checked";
	if ( st->m_useCache ) ucs = " checked";
	if ( st->m_add      ) add = " checked";
	if ( st->m_del      ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; 
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf ( 
		  "<center>\n"
		  "<table cellpadding=2><tr><td colspan=4>"
		  "useDatedb:<input type=checkbox value=1 name=ub%s> "
		  "useTree:<input type=checkbox value=1 name=ut%s> "
		  "useDisk:<input type=checkbox value=1 name=ud%s> "
		  "useCache:<input type=checkbox value=1 name=uc%s> "
		  "ADD:<input type=checkbox value=1 name=add%s> "
		  "DELETE:<input type=checkbox value=1 name=del%s>"
		  "</td></tr><tr><td>"
		  "query:"
		  "</td><td>"
		  "<input type=text name=q value=\"%s\" size=20>"
		  "</td><td>"
		  "collection:"
		  "</td><td>"
		  "<input type=text name=c value=\"%s\" size=10>"
		  "</td></tr><tr><td>"
		  "termId:"
		  "</td><td>"
		  "<input type=text name=t value=%lli size=20>"
		  "</td><td>"
		  "numRecs:"
		  "</td><td>"
		  "<input type=text name=numRecs value=%li size=10> "
		  "</td></tr><tr><td>"
		  "docId:"
		  "</td><td>"
		  "<input type=text name=d value=%lli size=20> "
		  "</td><td>"
		  "score:"
		  "</td><td>"
		  "<input type=text name=score value=%li size=10> "
		  "</td><td>"
		  "<input type=submit value=ok border=0>"
		  "</td></tr>"
		  "<tr><td colspan=2>"
		  "term appears in about %lli docs +/- %li"
		  "</td></tr>"
		  //"<tr><td colspan=2>"
		  //"this indexlist held by host #%li and twins"
		  //"</td></tr>"
		  "</table>"
		  "</form><br><br>" ,
		  ubs, uts, uds, ucs, add, del,
		  st->m_query , st->m_coll , st->m_termId  , 
		  st->m_numRecs  ,
		  st->m_docId , (long)st->m_score ,
		  st->m_termFreq ,
		  2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * 
		  base->getNumFiles() );
		  //hostnum );

	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s , 
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf ( 
		  "<table cellpadding=1 border=1>" 
		  "<tr><td>#</td><td>score</td>"
		  "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for (   st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted  () ;
		st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 ) 
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf ( 
			  "<tr><td>%li.</td>"
			  "<td>%s%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td>"
			  "<td>"
			  "0x%02lx"
			  "</td>"
			  "</tr>\n" ,
			  i++,
			  ds, (int)st->m_list.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId ,
			  (long)dh );
	}	
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() ) 
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>" 
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for (   st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted  () ;
		st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p , 
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId );
		p += gbstrlen ( p );
	}	
	*/
	if ( ! st->m_list.isEmpty() ) 
		pbuf->safePrintf ( "</table>" );


	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );		
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	pbuf->safePrintf ( "</center>\n");
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s , 
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}

Example #23

Show file

File: PageAddColl.cpp Project: lemire/open-source-search-engine

// . serves the admin "add collection" (add=true) or "delete collection"
//   (add=false) page
// . XML/JSON requests are only checked for the presence of the
//   addcoll/delcoll parm and then get a generic success reply
// . HTML requests get either the "add" form or the checklist of existing
//   collections to delete
// . the "addcoll" parm comes straight from the HTTP request, so it is
//   html-encoded wherever it is echoed back without having passed the
//   name validation below
// . returns the result of whichever g_httpServer send call was made
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
#ifdef PRIVACORE_SAFE_VERSION
	g_errno = EBADENGINEER;
	// string literals are const; sendErrorReply() is handed a
	// "const char *" further below as well
	const char *msg = "Function disabled by PRIVACORE_SAFE_VERSION define";
	return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
#else
	g_errno = 0;

	// only set further below, from g_errno
	const char *msg = NULL;

	// . adding/deleting the collection itself is NOT done here anymore
	// . Pages.cpp now broadcasts addcoll as CommandAddColl() parm.

	char format = r->getReplyFormat();

	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// which page was requested?
		int32_t  page = g_pages.getDynamicPageNumber ( r );
		// accept both spellings of each parm
		const char *addcoll = r->getString("addcoll",NULL);
		const char *delcoll = r->getString("delcoll",NULL);
		if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
		if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
		// no addcoll given?
		if ( page == PAGE_ADDCOLL && ! addcoll ) {
			g_errno = EBADENGINEER;
			const char *emsg = "no addcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,emsg,NULL);
		}
		// no delcoll given?
		if ( page == PAGE_DELCOLL && ! delcoll ) {
			g_errno = EBADENGINEER;
			const char *emsg = "no delcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,emsg,NULL);
		}
		return g_httpServer.sendSuccessReply(s,format);
	}

	const char *action  = r->getString("action",NULL);
	const char *addColl = r->getString("addcoll",NULL);

	// the page is built in here, starting with a stack buffer
	char  buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);


	//
	// CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
	//

	SafeBuf gtmp;
	char *gmsg = NULL;
	// is it too big?
	if ( action && addColl && gbstrlen(addColl) > MAX_COLL_LEN ) {
		gtmp.safePrintf("search engine name is too long");
		gmsg = gtmp.getBufStart();
	}
	// from Collectiondb.cpp::addNewColl() ensure coll name is legit.
	// "x" ends up on the first illegal character, if there is one.
	const char *x = addColl;
	for ( ; x && *x ; x++ ) {
		if ( is_alnum_a(*x) ) continue;
		if ( *x == '-' ) continue;
		if ( *x == '_' ) continue; // underscore now allowed
		break;
	}
	if ( x && *x ) {
		g_errno = EBADENGINEER;
		// . the name and the offending char are user input that just
		//   FAILED validation, so html-encode both before they are
		//   put into the page
		// . NOTE(review): relies on the three argument form
		//   SafeBuf::htmlEncode(s,len,encodePoundSign) - confirm
		//   against this tree's SafeBuf.h
		gtmp.safePrintf("<font color=red>Error. \"");
		gtmp.htmlEncode ( addColl , gbstrlen(addColl) , true );
		gtmp.safePrintf("\" is a "
				"malformed name because it "
				"contains the '");
		gtmp.htmlEncode ( x , 1 , true );
		// the trailing safePrintf() leaves the buffer terminated
		gtmp.safePrintf("' character.</font><br><br>");
		gmsg = gtmp.getBufStart();
	}

	//
	// END GIGABOT ERRORS
	//



	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	// if added the coll successfully, do not print same page, jump to
	// printing the basic settings page so they can add sites to it.
	// crap, this GET request, "r", is missing the "c" parm sometimes.
	// we need to use the "addcoll" parm anyway. maybe print a meta
	// redirect then?
	//
	// compare against 0 rather than narrowing the long into a char, which
	// would read e.g. guide=256 as "off"
	const bool guide = ( r->getLong("guide",0) != 0 );
	// do not redirect if gmsg is set, there was a problem with the name
	if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) {
		// . just redirect to the settings page
		// . gmsg is NULL here, so the loop above found nothing but
		//   alnum, '-' and '_' chars in addColl and it can be
		//   printed as-is
		if ( addColl )
			p.safePrintf("<meta http-equiv=Refresh "
				      "content=\"0; URL=/admin/settings"
				      "?guide=1&c=%s\">",
				      addColl);
		return g_httpServer.sendDynamicPage (s,
						     p.getBufStart(),
						     p.length());
	}


	// print standard header
	g_pages.printAdminTop ( &p , s , r , NULL, 
				"onload=document."
				"getElementById('acbox').focus();");


	if ( g_errno ) {
		msg = mstrerror( g_errno );
	}

	if ( msg && ! guide ) {
		const char *cc = "deleting";
		if ( add ) cc = "adding";
		p.safePrintf (
			  "<center>\n"
			  "<font color=red>"
			  "<b>Error %s collection: %s. "
			  "See log file for details.</b>"
			  "</font>"
			  "</center><br>\n",cc,msg);
	}

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	if ( add && guide )
		printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg );



	// print the add collection box
	if ( add ) {

		const char *t1 = "Add Collection";
		if ( guide ) t1 = "Add Search Engine";

		p.safePrintf (
			  "<center>\n<table %s>\n"
			   "<tr class=hdrow><td colspan=2>"
			  "<center><b>%s</b></center>"
			  "</td></tr>\n"
			  ,TABLE_STYLE
			  ,t1
			      );
		const char *t2 = "collection";
		if ( guide ) t2 = "search engine";
		// . the name box is pre-filled with whatever was submitted
		// . that value may be the malformed name rejected above, so
		//   it is html-encoded to keep it inside the attribute
		p.safePrintf (
			      "<tr bgcolor=#%s>"
			      "<td><b>name of new %s to add</td>\n"
			      "<td><input type=text name=addcoll size=30 "
			      "id=acbox "
			      "value=\""
			      , LIGHT_BLUE
			      , t2 
			      );
		if ( addColl )
			p.htmlEncode ( addColl , gbstrlen(addColl) , true );
		p.safePrintf (
			      "\">"
			      "</td></tr>\n"
			      );

		// don't show the clone box if we are under gigabot the guide
		if ( ! guide )
			p.safePrintf(
				     "<tr bgcolor=#%s>"
				     "<td><b>clone settings from this "
				     "collection</b>"
				     "<br><font size=1>Copy settings from "
				     "this pre-existing collection. Leave "
				     "blank to "
				     "accept default values.</font></td>\n"
				     "<td><input type=text name=clonecoll "
				     "size=30>"
				     "</td>"
				     "</tr>"
				     , LIGHT_BLUE
				     );

		// collection pwds
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection passwords"
			     "</b>"
			     "<br><font size=1>List of white space separated "
			     "passwords allowed to adminster collection."
			     "</font>"
			     "</td>\n"
			     "<td><input type=text name=collpwd "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		// ips box for security
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection ips"
			     "</b>"

			     "<br><font size=1>List of white space separated "
			     "IPs allowed to adminster collection."
			     "</font>"

			     "</td>\n"
			     "<td><input type=text name=collips "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		p.safePrintf ( "</table></center><br>\n");

		// wrap up the form started by printAdminTop
		g_pages.printAdminBottom ( &p );
		int32_t bufLen = p.length();
		return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
	}

	// . from here on this is the "delete collection" page
	// . the checklist is only printed if at least one rec is in use
	if ( g_collectiondb.m_numRecsUsed > 0 ) {
		// print all collections out in a checklist so you can check
		// the ones you want to delete, the values will be the name
		// of that collection
		p.safePrintf (
			  "<center>\n<table %s>\n"
			  "<tr class=hdrow><td><center><b>Delete Collections"
			  "</b></center></td></tr>\n"
			  "<tr bgcolor=#%s><td>"
			  "<center><b>Select the collections you wish to delete. "
			  "</b></center></td></tr>\n"
			  "<tr bgcolor=#%s><td>"
			  // table within a table
			  "<center><table width=20%%>\n",
			  TABLE_STYLE,
			  LIGHT_BLUE,
			  DARK_BLUE
			      );

		for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
			CollectionRec *cr = g_collectiondb.m_recs[i];
			// slots can be empty
			if ( ! cr ) continue;
			p.safePrintf (
				  "<tr bgcolor=#%s><td>"
				  "<input type=checkbox name=delcoll value=\"%s\"> "
				  "%s</td></tr>\n",
				  DARK_BLUE,
				  cr->m_coll,cr->m_coll);
		}
		p.safePrintf( "</table></center></td></tr></table><br>\n" );
	}

	// wrap up the form started by printAdminTop
	g_pages.printAdminBottom ( &p );
	int32_t bufLen = p.length();
	return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
#endif
}

Example #24

Show file

File: Msg39.cpp Project: abhayprakash/open-source-search-engine

// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::getLists () {

	if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
	// . ask Indexdb for the IndexLists we need for these termIds
	// . each rec in an IndexList is a termId/score/docId tuple

	//
	// restrict to docid range?
	//
	// . get the docid start and end
	// . do docid paritioning so we can send to all hosts
	//   in the network, not just one stripe
	long long docIdStart = 0;
	long long docIdEnd = MAX_DOCID;
	// . restrict to this docid?
	// . will really make gbdocid:| searches much faster!
	long long dr = m_tmpq.m_docIdRestriction;
	if ( dr ) {
		docIdStart = dr;
		docIdEnd   = dr + 1;
	}
	// . override
	// . this is set from Msg39::doDocIdSplitLoop() to compute 
	//   search results in stages, so that we do not load massive
	//   termlists into memory and got OOM (out of memory)
	if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
	if ( m_r->m_maxDocId != -1 ) docIdEnd   = m_r->m_maxDocId+1;
	
	// if we have twins, then make sure the twins read different
	// pieces of the same docid range to make things 2x faster
	//bool useTwins = false;
	//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
	//if ( useTwins ) {
	//	long long delta2 = ( docIdEnd - docIdStart ) / 2;
	//	if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
	//	else                      docIdStart = docIdStart + delta2;
	//}
	// new striping logic:
	long numStripes = g_hostdb.getNumStripes();
	long long delta2 = ( docIdEnd - docIdStart ) / numStripes;
	long stripe = g_hostdb.getMyHost()->m_stripe;
	docIdStart += delta2 * stripe; // is this right?
	docIdEnd = docIdStart + delta2;
	// add 1 to be safe so we don't lose a docid
	docIdEnd++;
	// TODO: add triplet support later for this to split the
	// read 3 ways. 4 ways for quads, etc.
	//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
	// do not go over MAX_DOCID  because it gets masked and
	// ends up being 0!!! and we get empty lists
	if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
	// remember so Msg2.cpp can use them to restrict the termlists 
	// from "whiteList" as well
	m_docIdStart = docIdStart;
	m_docIdEnd   = docIdEnd;
	

	//
	// set startkey/endkey for each term/termlist
	//
	for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
		// breathe
		QUICKPOLL ( m_r->m_niceness );
		// shortcuts
		QueryTerm *qterm = &m_tmpq.m_qterms[i];
		char *sk = qterm->m_startKey;
		char *ek = qterm->m_endKey;
		// get the term id
		long long tid = m_tmpq.getTermId(i);
		// if only 1 stripe
		//if ( g_hostdb.getNumStripes() == 1 ) {
		//	docIdStart = 0;
		//	docIdEnd   = MAX_DOCID;
		//}
		// store now in qterm
		g_posdb.makeStartKey ( sk , tid , docIdStart );
		g_posdb.makeEndKey   ( ek , tid , docIdEnd   );
		qterm->m_ks = sizeof(POSDBKEY);//key144_t);
	}

	// debug msg
	if ( m_debug || g_conf.m_logDebugQuery ) {
		for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
			// get the term in utf8
			//char bb[256];
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
			char *tpc = qt->m_term + qt->m_termLen;
			char  tmp = *tpc;
			*tpc = '\0';
			char sign = qt->m_termSign;
			if ( sign == 0 ) sign = '0';
			QueryWord *qw = qt->m_qword;
			long wikiPhrId = qw->m_wikiPhraseId;
			if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
			char leftwikibigram = 0;
			char rightwikibigram = 0;
			if ( qt->m_leftPhraseTerm &&
			     qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
				leftwikibigram = 1;
			if ( qt->m_rightPhraseTerm &&
			     qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
				rightwikibigram = 1;
			/*
			char c = m_tmpq.getTermSign(i);
			char tt[512];
			long ttlen = m_tmpq.getTermLen(i);
			if ( ttlen > 254 ) ttlen = 254;
			if ( ttlen < 0   ) ttlen = 0;
			// old:painful: convert each term from unicode to ascii
			memcpy ( tt , m_tmpq.getTerm(i) , ttlen );
			*/
			long isSynonym = 0;
			QueryTerm *st = qt->m_synonymOf;
			if ( st ) isSynonym = true;
			SafeBuf sb;
			// now we can display it
			//tt[ttlen]='\0';
			//if ( c == '\0' ) c = ' ';
			sb.safePrintf(
			     "query: msg39: [%lu] query term #%li \"%s\" "
			     "phr=%li termId=%llu rawTermId=%llu "
			     //"estimatedTermFreq=%lli (+/- ~16000) "
			     "tfweight=%.02f "
			     "sign=%c "
			     "numPlusses=%hhu "
			     "required=%li "
			     "fielcode=%li "

			     "ebit=0x%0llx "
			     "impBits=0x%0llx "

			     "wikiphrid=%li "
			     "leftwikibigram=%li "
			     "rightwikibigram=%li "
			     //"range.startTermNum=%hhi range.endTermNum=%hhi "
			     //"minRecSizes=%li "
			     "readSizeInBytes=%li "
			     //"ebit=0x%llx "
			     //"impBits=0x%llx "
			     "hc=%li "
			     "component=%li "
			     "otermLen=%li "
			     "isSynonym=%li "
			     "querylangid=%li ",
			     (long)this ,
			     i          ,
			     qt->m_term,//bb ,
			     (long)m_tmpq.isPhrase (i) ,
			     m_tmpq.getTermId      (i) ,
			     m_tmpq.getRawTermId   (i) ,
			     ((float *)m_r->ptr_termFreqWeights)[i] ,
			     sign , //c ,
			     0 , 
			     (long)qt->m_isRequired,
			     (long)qt->m_fieldCode,

			     (long long)qt->m_explicitBit  ,
			     (long long)qt->m_implicitBits ,

			     wikiPhrId,
			     (long)leftwikibigram,
			     (long)rightwikibigram,
			     ((long *)m_r->ptr_readSizes)[i]         ,
			     //(long long)m_tmpq.m_qterms[i].m_explicitBit  ,
			     //(long long)m_tmpq.m_qterms[i].m_implicitBits ,
			     (long)m_tmpq.m_qterms[i].m_hardCount ,
			     (long)m_tmpq.m_componentCodes[i],
			     (long)m_tmpq.getTermLen(i) ,
			     isSynonym,
			     (long)m_tmpq.m_langId); // ,tt
			// put it back
			*tpc = tmp;
			if ( st ) {
				long stnum = st - m_tmpq.m_qterms;
				sb.safePrintf("synofterm#=%li",stnum);
				//sb.safeMemcpy(st->m_term,st->m_termLen);
				sb.pushChar(' ');
				sb.safePrintf("synwid0=%lli ",qt->m_synWids0);
				sb.safePrintf("synwid1=%lli ",qt->m_synWids1);
				sb.safePrintf("synalnumwords=%li ",
					      qt->m_numAlnumWordsInSynonym);
				// like for synonym "nj" it's base,
				// "new jersey" has 2 alnum words!
				sb.safePrintf("synbasealnumwords=%li ",
					      qt->m_numAlnumWordsInBase);
			}
			logf(LOG_DEBUG,"%s",sb.getBufStart());

		}
		m_tmpq.printBooleanTree();
	}
	// timestamp log
	if ( m_debug ) 
		log(LOG_DEBUG,"query: msg39: [%lu] Getting %li index lists ",
		     (long)this,m_tmpq.getNumTerms());
	// . now get the index lists themselves
	// . return if it blocked
	// . not doing a merge (last parm) means that the lists we receive
	//   will be an appending of a bunch of lists so keys won't be in order
	// . merging is uneccessary for us here because we hash the keys anyway
	// . and merging takes up valuable cpu time
	// . caution: the index lists returned from Msg2 are now compressed
	// . now i'm merging because it's 10 times faster than hashing anyway
	//   and the reply buf should now always be <= minRecSizes so we can
	//   pre-allocate one better, and, 3) this should fix the yahoo.com 
	//   reindex bug
	char rdbId = RDB_POSDB;

	// . TODO: MDW: fix
	// . partap says there is a bug in this??? we can't cache UOR'ed lists?
	bool checkCache = false;
	// split is us????
	//long split = g_hostdb.m_myHost->m_group;
	long split = g_hostdb.m_myHost->m_shardNum;
	// call msg2
	if ( ! m_msg2.getLists ( rdbId                      ,
				 m_r->ptr_coll              ,
				 m_r->m_maxAge              ,
				 m_r->m_addToCache          ,
				 //m_tmpq.m_qterms ,
				 &m_tmpq,
				 m_r->ptr_whiteList,
				 // we need to restrict docid range for
				 // whitelist as well! this is from
				 // doDocIdSplitLoop()
				 m_docIdStart,
				 m_docIdEnd,
				 // how much of each termlist to read in bytes
				 (long *)m_r->ptr_readSizes ,
				 //m_tmpq.getNumTerms()       , // numLists
				 m_lists                    ,
				 this                       ,
				 gotListsWrapper            ,
				 m_r                        ,
				 m_r->m_niceness            ,
				 true                       , // do merge?
				 m_debug                  ,
				 NULL                       ,  // best hostids
				 m_r->m_restrictPosdbForQuery  ,
				 split                      ,
				 checkCache                 )) {
		m_blocked = true;
		return false;
	}

	// error?
	if ( g_errno ) { 
		log("msg39: Had error getting termlists2: %s.",
		    mstrerror(g_errno));
		// don't bail out here because we are in docIdSplitLoop()
		//sendReply (m_slot,this,NULL,0,0,true);
		return true; 
	}
	
	return gotLists ( true );
}

Example #25

Show file

File: PageAddUrl.cpp Project: nikhs/open-source-search-engine

// . sends the HTTP reply for an add-url request and frees the request state
// . "state" is the GigablastRequest of the request; it is deleted on every
//   path before this returns, so the caller must not touch it afterwards
// . "addUrlEnabled" is not referenced in this function
// . JSON/XML requests get a generic success reply, everything else gets the
//   admin page with a status message and the parm table
// . returns the result of the g_httpServer send call
bool sendReply ( void *state , bool addUrlEnabled ) {
	// get the state properly
	GigablastRequest *gr = (GigablastRequest *)state;

	// . in order to see what sites are being added log it, then we can
	//   more easily remove sites from sitesearch.gigablast.com that are
	//   being added but not being searched
	// . "xb" keeps the ellipsis-truncated url text for reuse below
	SafeBuf xb;
	if ( gr->m_urlsBuf ) {
		xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
		log(LOG_INFO,"http: add url %s (%s)",
		    xb.getBufStart(),mstrerror(g_errno));
	}

	char format = gr->m_hr.getReplyFormat();
	// grab the socket now, "gr" is deleted before the html page is sent
	TcpSocket *sock    = gr->m_socket;

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		bool status = g_httpServer.sendSuccessReply ( gr );
		// . nuke state
		// . use the size of the object, not of the pointer, so this
		//   matches the mdelete() on the html path below
		mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
		delete (gr);
		return status;
	}

	// the submitted url text, if any
	long ulen = 0;
	char *url = gr->m_urlsBuf;
	if ( url ) ulen = gbstrlen (url);

	// . only report the url as added if something beyond a bare scheme
	//   was submitted
	// . the comparison has to be on the buffer whose length was just
	//   measured (m_urlsBuf); ulen is unrelated to gr->m_url
	bool printUrl = ( url && ulen > 0 );
	if ( printUrl && ulen==7 && !strncasecmp(url,"http://",7))
		printUrl = false;
	if ( printUrl && ulen==8 && !strncasecmp(url,"https://",8))
		printUrl = false;

	// page is not more than 32k
	char buf[1024*32+MAX_URL_LEN*2];
	SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);

	g_pages.printAdminTop ( &sb , sock , &gr->m_hr );

	// . build the status message
	// . if there was an error let them know
	SafeBuf mbuf;
	if ( g_errno ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", 
				mstrerror(g_errno) , g_errno);
		mbuf.safePrintf("</font></center>");
	}
	else if ( printUrl ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("<b><u>");
		// . the url text is user input, so html-encode it rather
		//   than copying it into the page verbatim
		// . "xb" already holds the same truncated text that was
		//   logged above (printUrl implies m_urlsBuf was not NULL)
		// . NOTE(review): relies on the three argument form
		//   SafeBuf::htmlEncode(s,len,encodePoundSign) - confirm
		//   against this tree's SafeBuf.h
		if ( xb.length() > 0 )
			mbuf.htmlEncode ( xb.getBufStart() ,
					  xb.length()      ,
					  true             );
		mbuf.safePrintf("</u></b> added to spider "
				 "queue "
				 "successfully<br><br>");
		mbuf.safePrintf("</font></center>");
	}

	if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() );

	g_parms.printParmTable ( &sb , sock , &gr->m_hr );

	// print the final tail
	g_pages.printTail ( &sb, true ); // admin?
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// nuke state
	mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
	delete (gr);

	return g_httpServer.sendDynamicPage (sock, 
					     sb.getBufStart(), 
					     sb.length(),
					     -1 ); // cachetime
}

Example #26

Show file

File: Msg3a.cpp Project: rdhananjaya/open-source-search-engine

bool Msg3a::gotAllSplitReplies ( ) {

    // if any of the split requests had an error, give up and set m_errno
    // but don't set if for non critical errors like query truncation
    if ( m_errno ) {
        g_errno = m_errno;
        return true;
    }

    // also reset the finalbuf and the oldNumTopDocIds
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;
    }

    // update our estimated total hits
    m_numTotalEstimatedHits = 0;

    for ( long i = 0; i < m_numHosts ; i++ ) {
        // get that host that gave us the reply
        //Host *h = g_hostdb.getHost(i);
        // . get the reply from multicast
        // . multicast should have destroyed all slots, but saved reply
        // . we are responsible for freeing the reply
        // . we need to call this even if g_errno or m_errno is
        //   set so we can free the replies in Msg3a::reset()
        // . if we don't call getBestReply() on it multicast should
        //   free it, because Multicast::m_ownReadBuf is still true
        Multicast *m = &m_mcast[i];
        bool freeit = false;
        long  replySize = 0;
        long  replyMaxSize;
        char *rbuf;
        Msg39Reply *mr;
        // . only get it if the reply not already full
        // . if reply already processed, skip
        // . perhaps it had no more docids to give us or all termlists
        //   were exhausted on its disk and this is a re-call
        // . we have to re-process it for count m_numTotalEstHits, etc.
        rbuf = m->getBestReply ( &replySize    ,
                                 &replyMaxSize ,
                                 &freeit       ,
                                 true          ); //stealIt?
        // cast it
        mr = (Msg39Reply *)rbuf;
        // in case of mem leak, re-label from "mcast" to this so we
        // can determine where it came from, "Msg3a-GBR"
        relabel( rbuf, replyMaxSize , "Msg3a-GBR" );
        // . we must be able to free it... we must own it
        // . this is true if we should free it, but we should not have
        //   to free it since it is owned by the slot?
        if ( freeit ) {
            log(LOG_LOGIC,"query: msg3a: Steal failed.");
            char *xx = NULL;
            *xx=0;
        }
        // bad reply?
        if ( ! mr ) {
            log(LOG_LOGIC,"query: msg3a: Bad NULL reply.");
            m_reply       [i] = NULL;
            m_replyMaxSize[i] = 0;
            // it might have been timd out, just ignore it!!
            continue;
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            // all reply buffers should be freed on reset()
            return true;
        }
        // how did this happen?
        if ( replySize < 29 && ! mr->m_errno ) {
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.",
                replySize);
            // all reply buffers should be freed on reset()
            return true;
        }

        // can this be non-null? we shouldn't be overwriting one
        // without freeing it...
        if ( m_reply[i] )
            // note the mem leak now
            log("query: mem leaking a 0x39 reply");

        // cast it and set it
        m_reply       [i] = mr;
        m_replyMaxSize[i] = replyMaxSize;
        // deserialize it (just sets the ptr_ and size_ member vars)
        //mr->deserialize ( );
        deserializeMsg ( sizeof(Msg39Reply) ,
                         &mr->size_docIds,
                         &mr->size_clusterRecs,
                         &mr->ptr_docIds,
                         mr->m_buf );

        // sanity check
        if ( mr->m_nqt != m_q->getNumTerms() ) {
            g_errno = EBADREPLY;
            m_errno = EBADREPLY;
            log("query: msg3a: Split reply qterms=%li != %li.",
                (long)mr->m_nqt,(long)m_q->getNumTerms() );
            return true;
        }
        // return if split had an error, but not for a non-critical
        // error like query truncation
        if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
            g_errno = mr->m_errno;
            m_errno = mr->m_errno;
            log("query: msg3a: Split had error: %s",
                mstrerror(g_errno));
            return true;
        }
        // skip down here if reply was already set
        //skip:
        // add of the total hits from each split, this is how many
        // total results the lastest split is estimated to be able to
        // return
        // . THIS should now be exact since we read all termlists
        //   of posdb...
        m_numTotalEstimatedHits += mr->m_estimatedHits;

        // debug log stuff
        if ( ! m_debug ) continue;
        // cast these for printing out
        long long *docIds    = (long long *)mr->ptr_docIds;
        score_t   *scores    = (score_t   *)mr->ptr_scores;
        // print out every docid in this split reply
        for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
            // print out score_t
            logf( LOG_DEBUG,
                  "query: msg3a: [%lu] %03li) "
                  "split=%li docId=%012llu domHash=0x%02lx "
                  "score=%lu"                     ,
                  (unsigned long)this                      ,
                  j                                        ,
                  i                                        ,
                  docIds [j] ,
                  (long)g_titledb.getDomHash8FromDocId(docIds[j]),
                  (long)scores[j] );
        }
    }

    // this seems to always return true!
    mergeLists ( );

    if ( ! m_r->m_useSeoResultsCache ) return true;

    // now cache the reply
    SafeBuf cr;
    long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4);
    long need = sizeof(key_t) + 4 + dataSize;
    bool status = cr.reserve ( need );
    // sanity
    if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) {
        char *xx=NULL;
        *xx=0;
    }
    // ignore errors
    g_errno = 0;
    // return on error with g_errno cleared if cache add failed
    if ( ! status ) return true;
    // add to buf otherwise
    cr.safeMemcpy ( &m_ckey , sizeof(key_t) );
    cr.safeMemcpy ( &dataSize , 4 );
    long now = getTimeGlobal();
    cr.pushLong ( now );
    cr.pushLong ( m_numDocIds );
    cr.pushLong ( m_numTotalEstimatedHits );//Results );
    long max = m_numDocIds;
    // then the docids
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLongLong(m_docIds[i] );
    for ( long i = 0 ; i < max ; i++ )
        cr.pushFloat(m_scores[i]);
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLong(getSiteHash26(i));
    // sanity
    if ( cr.length() != need ) {
        char *xx=NULL;
        *xx=0;
    }
    // make these
    key_t startKey;
    key_t endKey;
    startKey = m_ckey;
    // clear delbit
    startKey.n0 &= 0xfffffffffffffffeLL;
    // end key is us
    endKey = m_ckey;
    // that is the single record
    m_seoCacheList.set ( cr.getBufStart() ,
                         cr.length(),
                         cr.getBufStart(), // alloc
                         cr.getCapacity(), // alloc size
                         (char *)&startKey,
                         (char *)&endKey,
                         -1, // fixeddatasize
                         true, // owndata?
                         false,// use half keys?
                         sizeof(key_t) );
    // do not allow cr to free it, msg1 will
    cr.detachBuf();
    // note it
    //log("seopipe: storing ckey=%s q=%s"
    //    ,KEYSTR(&m_ckey,12)
    //    ,m_r->ptr_query
    //    );
    //log("msg1: sending niceness=%li",(long)m_r->m_niceness);
    // this will often block, but who cares!? it just sends a request off
    if ( ! m_msg1.addList ( &m_seoCacheList ,
                            RDB_SERPDB,//RDB_CACHEDB,
                            m_r->ptr_coll,
                            this, // state
                            gotSerpdbReplyWrapper, // callback
                            false, // forcelocal?
                            m_r->m_niceness ) ) {
        //log("blocked");
        return false;
    }

    // we can safely delete m_msg17... just return true
    return true;
}

Example #27

Show file

File: Turkdb.cpp Project: DeadNumbers/open-source-search-engine

// . displays the stats for a username
// . show stats for every day we have them for
// . in a big list
// . if they click the day display all docids evaluated for that day
// . show the accuracy for that day too
// . how many docs they edited
// . how many of those docs were verified by another
// . and if there was consensus
void gotTransdbList ( State60 *st ) {

	// get today's time range
	time_t now = getTimeGlobal();
	// get start of today
	time_t dayStart = now / (24*3600);

	SafeBuf sb;

	// int16_tcut
	TcpSocket *s = st->m_s;

	// make about 200k of mem to write into
	if ( ! sb.reserve ( 200000 ) ) 
		return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno));

	// print description so they can clikc a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");
	// print the content
	sb.safePrintf("<center><font size=4><blink>"
		      "<b><a href=\"/pageturk?c=%s&edit=1\">"
		      "Click here to start editing.</a></b></blink>"
		      "</font><br><i>Please take your "
		      "time to read the information below before you begin"
		      "</i><br><font color=\"red\" size=2> Warning: Adult "
		      "content might be presented to you."
		      " You should be above 18 years of age to continue."
		      "</center></font>",st->m_coll);

	sb.safePrintf("<font face=arial,sans-serif color=black size=3>"
		      "<p>By clicking <i>Start Voting</i>, you will be "
		       "presented with an interface for editing events. "
		      "The editor will display a modified web page that "
		      "contains one or more events. Each event's description "
		      "will be highlight with a blue background. You can "
		      "toggle whether a particular event is displayed by "
		      "clicking on that event's ID. You can highlight one or "
		      "multiple event descriptions at the same time. "
		      "</p><p>"
		      "By clicking on the section icons in the web page you "
		      "can tell the editor that a virtual fence should be "
		      "erected around that section. The fence will make sure "
		      "that event descriptions can not span across it. Each "
		      "event description must be fully contained either "
		      "inside or outside the fence. However, you can also "
		      "declare a section as a title section, which means that "
		      "the text that the title section contains is free to be "
		      "used by any event description."
		      "</p>\n"
		      "<p>When you are done erecting section fences, you "
		      "submit your changes. The more changes you make the "
		      "more points you earn. Other users may evaluate " 
		      "your edits for accuracy. You will be paid based on the "
		      "points you earn as well as your accuracy. All "
		      "transactions are listed in the table below.</p>"
		      "<p>You may not change your username or password "
		      "but you can change your email address. Your email "
		      "address will be used to pay you with PayPal every "
		      "Friday. Paypal fees will be deducted on your end. By "
		      "using this service you agree to all stated Terms & "
		      "Conditions.</p>"
		      "</font>\n");

	// get the user record
	User *uu = g_users.getUser ( username );
	// print out their info, like paypal email
	sb.safePrintf("<table>\n"
		      "<tr><td colspan=10><center>Your Info</center>"
		      "</td></tr>\n"
		      "<tr>"
		      "<td>Email</td>"
		      "<td><input type=text value=%s></td>"
		      "<td>email address used to pay with paypal</td>"
		      "</tr>\n"
		      "<tr><td colspan=10><input type=submit value=update>"
		      "</td></tr>\n"
		      "</table>\n" ,
		      uu->m_payPalEmail );

	// print your stats here now
	sb.safePrintf("<table>\n"
		      "<tr><td colspan=10><center>Your Stats</center>"
		      "</td></tr>\n"
		      "<tr>"
		      "<td>date</td>"
		      "<td>action</td>"
		      "<td>amount</td>"
		      "<td>desc</td>"
		      "</tr>\n");

	// int16_tcut
	RdbList *list = &st->m_list;

	int32_t lastDay        = -1;
	int32_t totalReceives  = 0;
	int32_t totalSubmits   = 0;
	int32_t totalPasses    = 0;
	int32_t totalFails     = 0;

	// scan the list
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *rec      = list->getCurrentRecord();
		char *data     = list->getCurrentData();
		int32_t  dataSize = list->getCurrentDataSize();
		// skip that
		list->skipCurrentRecord();
		// skip if negative
		if ( (rec[0] & 0x01) == 0x00 ) continue;
		// get the time (global time - sync'd with host #0)
		time_t tt = g_transdb.getTimeStamp ( rec );
		// get day #
		int32_t daynum = tt / (24*3600);
		// is it today?
		bool isToday = ( daynum >= dayStart );
		// point to the Transaction
		Trans *trans = (Trans *)data;
		// if is today, print it out verbatim
		if ( isToday ) {
			// print it in html row format to match table above
			//printTrans ( &sb , rec );
			sb.safePrintf("<tr>");
			// make it into a nice date
			time_t dd = lastDay * 86400;
			struct tm *timeStruct = localtime ( &dd );
			char ppp[100];
			strftime(ppp,100,"%H:%M:%S",timeStruct);
			// print last days stats first
			sb.safePrintf("<td>%s</td>",ppp);
			// then stats
			if ( trans->m_actionType == AT_RECEIVE_DOC )
				sb.safePrintf("<td>receive</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_SUBMIT_DOC )
				sb.safePrintf("<td>submit</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_PASS_DOC )
				sb.safePrintf("<td>verify</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64" was verified "
					      "by user=\"%s\"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId,
					      trans->m_desc);
			else if ( trans->m_actionType == AT_FAIL_DOC )
				sb.safePrintf("<td>verify</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64" was deemed to "
					      "be incorrect "
					      "by user=\"%s\"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId,
					      trans->m_desc);
			else if ( trans->m_actionType == AT_ACCURACY_EVAL)
				sb.safePrintf("<td>accuracy eval</td>"
					      "<td>%.02f</td>"
					      "<td>docid=%"UINT64"</td>",
					      trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_CHARGE)
				sb.safePrintf("<td>credit</td>"
					      "<td>%.02f</td>"
					      "<td>You made money.</td>",
					      trans->m_number);
			else if ( trans->m_actionType == AT_PAYMENT)
				sb.safePrintf("<td>payment</td>"
					      "<td>%.02f</td>"
					      "<td>We paid you.</td>",
					      trans->m_number);
			else if ( trans->m_actionType == AT_LOGIN)
				sb.safePrintf("<td>login</td>"
					      "<td>-</td>"
					      "<td>You logged in.</td>");
			else if ( trans->m_actionType == AT_LOGOUT)
				sb.safePrintf("<td>logout</td>"
					      "<td>-</td>"
					      "<td>You logged out.</td>");
			else if ( trans->m_actionType == AT_AUTO_LOGOUT)
				sb.safePrintf("<td>logout</td>"
					      "<td>-</td>"
					      "<td>You were auto "
					      "logged out.</td>");
			else {
				char *xx=NULL;*xx=0; }
			sb.safePrintf("</tr>\n");
			continue;
		}
		// if does not match last day, print out that last day's stats
		// and reset for next guy
		if ( daynum != lastDay && lastDay != -1 ) {
			// make it into a nice date
			time_t dd = lastDay * 86400;
			struct tm *timeStruct = localtime ( &dd );
			char ppp[100];
			strftime(ppp,100,"%b-%d-%Y",timeStruct);
			// print last days stats first
			sb.safePrintf("<td>%s</td>",ppp);
			// then stats
			sb.safePrintf("<tr>"
				      "<td>receive</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total received</td>"
				      "</tr>\n",
				      totalReceives);
			sb.safePrintf("<tr>"
				      "<td>submit</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total submitted</td>"
				      "</tr>\n",
				      totalSubmits);
			sb.safePrintf("<tr>"
				      "<td>pass</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total accuracy tests passed</td>"
				      "</tr>\n",
				      totalPasses);
			sb.safePrintf("<tr>"
				      "<td>fail</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total accuracy tests failed</td>"
				      "</tr>\n",
				      totalFails);
			// reset as well
			totalReceived = 0;
			totalSubmits  = 0;
			totalPasses   = 0;
			totalFails    = 0;
		}
		// remember last day # we processed for accumulating stats
		lastDay = daynum;
		// accum stats
		if ( trans->m_actionType == AT_RECEIVE_DOC )
			totalReceives++;
		if ( trans->m_actionType == AT_SUBMIT_DOC )
			totalSubmits++;
		if ( trans->m_actionType == AT_PASS_DOC )
			totalPasses++;
		if ( trans->m_actionType == AT_FAIL_DOC )
			totalFails++;
	}

	sb.safePrintf("</body></html>\n");

	sendReply ( &sb );
}

Example #28

Show file

File: Statsdb.cpp Project: FlavioFalcao/open-source-search-engine

// . draws one horizontal reference line for the graph value "z" and prints
//   its text label as an absolutely positioned <div>
// . "ymin"/"ymax" are the value range that is mapped onto DY2 vertical units
// . "gw" receives the html for the label
// . "label" supplies the printf-style format (m_format) for the label text
// . "zoff" is a vertical offset, in the same units as DY2, used to keep this
//   line from colliding with other graphs
// . "color" is printed in hex as the css color of the label and is passed
//   on to drawLine3()
// . NOTE(review): ymax == ymin makes both divisions below divide by zero;
//   callers presumably never pass an empty range -- confirm.
void Statsdb::drawHR ( float z ,
		       float ymin , 
		       float ymax ,
		       //GIFPlotter *plotter ,
		       SafeBuf &gw,
		       Label *label ,
		       float zoff ,
		       long color ) {

	// convert into yspace
	float z2 = ((float)DY2 * (float)(z - ymin)) /(float)(ymax-ymin);
	// avoid collisions with other graphs
	z2 += zoff;
	// border
	//z2 += m_by;
	// round off error
	z2 += 0.5;
	// for adjustment
	float ptsPerPixel = (ymax-ymin)/ (float)DY2;
	// convert the offset back into graph units. it is added to the
	// value printed in the label below so the label matches where the
	// line is actually drawn.
	float zadj = zoff * ptsPerPixel;

	//#ifdef _USEPLOTTER_

	// use the color specified from addStat_r() for this line/pt
	//plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
	//		    ((color >>  8) & 0xff) << 8 ,
	//		    ((color >>  0) & 0xff) << 8 );

	// horizontal line
	//plotter->line ( m_bx, (long)z2 , DX2 + m_bx, (long)z2 );
	long width = 1;
	// NOTE(review): the line goes into the member m_gw while the label
	// below goes into the "gw" parameter. presumably callers pass m_gw
	// so both are the same buffer -- confirm.
	drawLine3 ( m_gw, 0, DX2 , (long)z2,color, width); 


	// make label
	char tmp[128];
	// . use "graphHash" to map to unit display
	// . this is a disk read volume
	// . bounded print: the format comes from the Label so we can not
	//   assume the result fits into "tmp"
	snprintf(tmp,sizeof(tmp),label->m_format,z +zadj);//* label->m_yscalar);

	/*
	// a white shadow
	plotter->pencolor ( 0xffff,0xffff,0xffff );
	plotter->move ( m_bx + 80 + 2 , z2 + 10 - 2 );
	plotter->alabel     ( 'c' , 'c' , tmp );
	
	// a black shadow
	plotter->pencolor ( 0 , 0 , 0 );
	plotter->move ( m_bx + 80 + 1 , z2 + 10 - 1 );
	plotter->alabel     ( 'c' , 'c' , tmp );
	
	//long color = label->m_color;
	// use the color specified from addStat_r() for this line/pt
	plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
			    ((color >>  8) & 0xff) << 8 ,
			    ((color >>  0) & 0xff) << 8 );
	
	// move cursor
	plotter->move ( m_bx + 80 , z2 + 10 );
	// plot label
	plotter->alabel     ( 'c' , 'c' , tmp );
	*/

	// LABEL
	gw.safePrintf("<div style=\"position:absolute;"
		      "left:%li;"
		      "bottom:%li;"
		      "color:#%lx;"
		      "z-index:110;"
		      "font-size:14px;"
		      "min-height:20px;"
		      "min-width:3px;\">%s</div>\n"
		      , (long)(m_bx)
		      , (long)z2 +m_by
		      , color
		      // the label:
		      , tmp
		      );
	
}

Example #29

Show file

File: Turkdb.cpp Project: DeadNumbers/open-source-search-engine

// . builds the turk editing view of the document held in st->m_xd
// . first marks, for every section that has text, the outermost ancestor
//   that spans exactly the same words with SEC_CONTROL
// . then re-prints every word of the document into a SafeBuf, wrapping each
//   SEC_CONTROL section in a clickable <div> ... </div> pair
// . returns the value of sendErrorReply() if the scratch array can not be
//   allocated, otherwise false
// . NOTE(review): this looks unfinished. the generated SafeBuf is never sent
//   anywhere and "endCounts" is never freed before returning -- confirm and
//   release it with the allocator's matching free call.
bool sendTurkPageReply ( State60 *st ) {

	XmlDoc *xd = &st->m_xd;
	//char *content    = xd->ptr_utf8Content;
	//int32_t  contentLen = xd->size_utf8Content - 1;

	// count the total number of EventDesc classes for all evids
	//char *evd = xd->ptr_eventData;
	//EventDisplay *ed = (EventDisplay *)evd;
	//char *addr = evd + (int32_t)ed->m_addr;
	//char timeZoneOffset = getTimeZoneFromAddr ( addr );

	// in case getSections() block come right back in
	xd->setCallback ( st , xdcallback );

	// . set niceness to 1 so all this processing doesn't slow queries down
	// . however, g_niceness should still be zero... hmmm...
	xd->m_niceness = 1;

	// default to 1 niceness
	st->m_niceness = 1;

	// now set the sections class
	// NOTE(review): the result is used without checking for an error or
	// a "blocked" return value -- confirm what getSections() can return.
	Sections *ss = xd->getSections();

	// now for each section with alnum text, telescope up as far as 
	// possible without containing anymore alnum text than what it 
	// contained. set SEC_CONTROL bit. such sections will have the
	// 2 green/blue dots, that are used for turning on/off title/desc.
	// but really the indians will only turn off sections that should
	// not have a title/desc.
	for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(st->m_niceness);
		// skip if does not have text
		if ( si->m_firstWordPos < 0 ) continue;
		// otherwise, find biggest parent that contains just that text
		Section *p    = si->m_parent;
		Section *last = si;
		for ( ; p ; p = p->m_parent ) {
			if ( p->m_firstWordPos != si->m_firstWordPos ) break;
			if ( p->m_lastWordPos  != si->m_lastWordPos  ) break;
			last = p;
		}
		// set that bit then
		last->m_flags |= SEC_CONTROL;
		// and speed up the loop
		// NOTE(review): this only terminates if "last" never comes
		// before "si" in the m_next chain -- confirm the ordering.
		si = last;
	}

	// * now each SEC_CONTROL sections have a fence activated by a turker

	// * an event title or description can not span a fence. it must be
	//   confined within a fence. however, it is allowed to include
	//   title or description from a "title section".

	// * hold shift down to designate as title section when clicking it

	// * show the raw text of each event changing as you fence
	//   sections in or out.  show in a right frame.

	// * show list of events on page in the top frame. can toggle them
	//   all individually.

	// * and remove no-display from all tags so we can see everything.

	// * highlight addresses, not just dates.

	// * each section hash has its own unique bg color when activated

	// * with a single click, completely reject an event because:
	//   contains bad time, address, title or desc. specify which so
	//   we can improve our algo.

	// * when selecting an individual event, scroll to its tod...

	// * remove all color from webpage that we can so our colors show up

	// * remove all imgs. just src them to dev null.

	// * allow for entering a custom title for an event or all events
	//   that are or will ever appear on the page. 

	// * when displaying the text of the events, use hyphens to
	//   delineate the section topology. strike out text as a section
	//   fence is activated.

	// * when a section is activated is it easier to just redownload
	//   the whole text of the page? maybe just the text frame?

	// * clicking on an individual sentence section should just remove
	//   that sentence. that is kinda a special content hash removal
	//   tag. like "Click here for video."

	// * when an event id is selected i guess activate its bgcolor to
	//   be light blue for all sentences currently in the event that
	//   are not in activated sections. (make exception for designated 
	//   title sections). so we need multiple tags for each events
	//   sentence div section. if sentence is split use multiple div tags
	//   then to keep the order. so each event sentence would have 
	//   <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and
	//   10. that way we can activate it when one of those event ids is
	//   activated.


	SafeBuf sb;

	// shortcuts
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words     *words = &xd->m_words;
	int32_t       nw    = words->getNumWords();
	char     **wptrs = words->getWords();
	int32_t      *wlens = words->getWordLens();
	nodeid_t  *tids  = words->getTagIds();

	// a special array for printing </div> tags. endCounts[i] is the
	// number of </div> tags to print right after word #i.
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );


	// 
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, and print out the section hash and a color
	// tag to be activated if the turkey activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) { 
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ a
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkeys have made all the edits
				// they need to submit the changes they made.
				// how can we get that data sent back to the
				// back end? we need to send back the colors
				// of the sections that have been activated
				// i guess. just do a loop over them.
				sb.safePrintf("<div nobreak gbsecid=%" UINT32 " "
					      "bgcolor=#%" XINT32 " "
					      "onclick=gbtogglecolor()>",
					      (uint32_t)sj->m_tagHash,
					      (uint32_t)sj->m_tagHash);
				// sanity check. m_b is used as "m_b-1" to
				// index endCounts[] below so it must be in
				// [1,nw]. the old "< 0" test let m_b == 0
				// through, which would write endCounts[-1].
				if ( sj->m_b < 1  ) { char *xx=NULL;*xx=0; }
				if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; }
				// and inc the /div count for that word
				endCounts[sj->m_b-1]++;
			}
			// try next section too
			sj = sj->m_next;
		}
		// if this is a tag, remove any coloring
		// (placeholder, nothing is done for tags yet)
		if ( tids[i] ) {
		}
		// print the word, be it a tag, alnum, punct
		sb.safeMemcpy ( wptrs[i] , wlens[i] );
		// end a div tag?
		if ( ! endCounts[i] ) continue;
		// might be many so loop it
		for ( int32_t j = 0 ; j < endCounts[i] ; j++ )
			sb.safePrintf("</div>");
	}			







	return false;
}

Example #30

Show file

File: Summary.cpp Project: exename/open-source-search-engine

// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are 
//   word #'s in the Words array indicated by match #m
// . "matches"/"mm": the window is grown around matches->m_matches[mm]
// . "lasta" (in/out): on entry it limits how far left "a" may move, on exit
//   it is set to the "a" we picked (or to the match's word # on the early
//   exits below)
// . "besta"/"bestb" (out): set to a and b, or both to -1 on the early exits
// . "gotIt": indexed by query word #, incremented (up to 100) for every
//   query word matched inside the window
// . "retired": indexed by query word #, a value > 0 reduces the points a
//   match of that query word earns
// . "maxExcerptLen": maximum window length, measured as a difference of
//   values in the pos[] array
// . the early exits return 0. no path in this function returns -1 or sets
//   g_errno.
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
                                 int32_t *besta, int32_t *bestb, char *gotIt,
                                 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut. "sp" stays NULL if the match has no Sections class.
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . no window if the match word was already used in another excerpt
	//   (D_USED) or sits in a script, style, select or title section
	// . NOTE(review): sp[matchWordNum] is dereferenced without a NULL
	//   check -- presumably every word has a section ptr; confirm.
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can 
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	// word # of the first fragment start we pass while moving left
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	// did we stop moving left at a natural boundary?
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxExcerptLen
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone", 
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P or DIV tag
		if ( tids && ( tids[a-1] == TAG_LI ||
		               tids[a-1] == TAG_TR ||
		               tids[a-1] == TAG_P  ||
		               tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) && 
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart    = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}

		// approximate count of words that have a word id
		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;

		// do not break right after a "strong connector", like 
		// apostrophe
		while ( a < matchWordNum && a > 0 && 
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;

		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	// word # of the last quote seen while moving right, only tracked if
	// the window started on a quote
	int32_t endQuoteWordNum = -1;
	// number of LI/TR/P/DIV back tags inside the right part of the window
	int32_t numTagsCrossed = 0;

	// now grow the window to the right by increasing "b"
	for ( ; b <= nw; b++ ) {
		// ran out of words
		if ( b == nw ) {
			break;
		}

		// stop once the window would reach maxExcerptLen
		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}

		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
		               tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages 
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P)  ||
		               tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// remove more than one punct words. if we're ending on a quote
		// keep it
		// NOTE(review): as written this loop only runs when an end
		// quote was recorded (endQuoteWordNum != -1) -- confirm that
		// is the intent.
		while ( b > matchWordNum && !wids[b-2] && endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector", like apostrophe
		while ( b > matchWordNum && (bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as long as the previous match's
	//   word # is >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented 
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// for debug. only written to if g_conf.m_logDebugSummary is set.
	//char buf[5000];
	//char *xp = buf;
	SafeBuf xp;

	// clamp b so the scoring loop can not run past the last word
	if ( b > nw ) {
		b = nw;
	}

	// score every word in [a,b), matches included
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out. copies the word one utf8 char at a time,
		// skipping chars that is_binary_utf8() flags.
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section: script, style, select or title
		// (see "badFlags" above)
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t  wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&  wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		// abort the process if the query word # is out of range
		if ( qwn < 0 || qwn >= m_q->m_numWords ){g_process.shutdownAbort(true);}

		// undo old score, the word is re-scored as a match below
		score -= t;

		// add 100000 per match
		t = 100000;

		// scale by the weight of this query word
		// (presumably tf based, from 0.1 to 1.0 -- confirm)
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word it earns no points (was 10000)
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same window,
				// it may not give a good summary. give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",t,qwn,
				       m_wordWeights[qwn]);
		}

		// inc the query word count for this window, capped at 100
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	// score before the bonuses/penalties below, only used for the debug
	// log (it is narrowed to 32 bits here)
	int32_t oldScore = score;

	// apply the bonus if it starts a sentence or a fragment
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much, so let's make
		// this an 8000 pt bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma, gets 4000
			score += 4000;
		}

		// 1k if the match word is very close to the start of the
		// window, i.e. fewer than 7 words (of any kind) after "a"
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if its less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 20000.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	// by 20000 per tag crossed
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	// a "://" in the window costs 8000
	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32" a=%05" PRId32" b=%05" PRId32" %s",
		     (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		     xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}