// . serves the /master/autoban admin page
// . flow: print code-usage table, apply any edits from cgi parms
//   (clear/allow/deny single ips, or wholesale banIps/allowIps/validCodes
//   text boxes), then print the watched-ips and queries-today tables
// . "s" is the client socket, "r" the parsed request
// . returns whatever sendDynamicPage/sendErrorReply returns
//   (false if blocked, true otherwise)
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
	// reply page accumulates in here (256k initial buffer)
	SafeBuf sb(512 * 512,"autobbuf");
	//read in all of the possible cgi parms off the bat:
	//long user = g_pages.getUserType( s , r );
	char *username = g_users.getUsername(r);
	//char *pwd = r->getString ("pwd");
	char *coll = r->getString ("c");
	long banIpsLen;
	char *banIps = r->getString ("banIps" , &banIpsLen , NULL);
	long allowIpsLen;
	char *allowIps = r->getString ("allowIps" , &allowIpsLen , NULL);
	long clearLen;
	char *clear = r->getString ("clear" , &clearLen , NULL);
	// set true whenever we mutate g_conf ip lists; triggers setFromConf()
	bool changed = false;
	long validCodesLen;
	char *validCodes = r->getString ("validCodes", &validCodesLen, NULL);
	long showAllIps = r->getLong("showAllIps", 0);
	long showLongView = r->getLong("longview", 0);
	// do it all from parm now
	//long banRegexLen;
	//char *banRegex = r->getString("banRegex", &banRegexLen, NULL);

	// char *ss = sb.getBuf();
	// char *ssend = sb.getBufEnd();
	g_pages.printAdminTop ( &sb, PAGE_AUTOBAN, username, coll , NULL ,
				s->m_ip );
	//sb.incrementLength(sss - ss);

	// MDW: moved to here
	long now = getTime();

	// scratch fields filled by getCalendarFromMs() below
	long days;
	long hours;
	long minutes;
	long secs;
	long msecs;

	// optional parm: wipe the per-code usage counters
	if(r->getLong("resetcodes", 0)) {
		setCodesFromConf();
	}

	// ------- code usage table -------
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	getCalendarFromMs((now - m_codeResetTime) * 1000,
			  &days,
			  &hours,
			  &minutes,
			  &secs,
			  &msecs);
	sb.safePrintf("<tr><td colspan=18 bgcolor=#%s>"
		      "<center><b>Code Usage "
		      "(<a href=\"/master/"
		      "autoban?c=%s&resetcodes=1\">reset</a> "
		      "%li days %li hours %li "
		      "minutes %li sec ago)"
		      "</b></center></td></tr>",
		      DARK_BLUE,
		      coll,
		      days, hours, minutes, secs);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>Code</b></center></td>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Query Count</b></center></td>"
		      "<td><center><b>Bytes Read</b></center></td>"
		      "<td><center><b>Bytes Sent</b></center></td>"
		      "<td><center><b>Outstanding Count</b></center></td>"
		      "<td><center><b>Most Ever Outstanding</b></center></td>"
		      "<td><center><b>Max Outstanding</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);
	// one row per registered code in the hash table
	for(long i = 0; i < m_ht.getNumSlots(); i++) {
		// skip empty slots
		if ( m_ht.getKey ( i ) == 0 ) continue;
		CodeVal *cv = m_ht.getValuePointerFromSlot ( i );
		if ( ! cv ) continue;

		sb.safePrintf("<tr>");
		sb.safePrintf("<td>");
		sb.copyToken(cv->m_code);//m_codeVals[i].m_code);
		sb.safePrintf("</td>");
		sb.safePrintf("<td><center>%s</center> </td>",
			      iptoa(cv->m_ip));
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_count);
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_bytesRead);
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_bytesSent);
		sb.safePrintf("<td><center>%li</center></td>",
			      cv->m_outstanding);
		sb.safePrintf("<td><center>%li</center></td>",
			      cv->m_maxEver);
		// bold the max-outstanding cell when it differs from the
		// apparent default of 50
		if ( cv->m_maxOutstanding != 50 )
			sb.safePrintf("<td><center><b>%li</b></center></td>",
				      cv->m_maxOutstanding);
		else
			sb.safePrintf("<td><center>%li</center></td>",
				      cv->m_maxOutstanding);
		sb.safePrintf("</tr>");
	}
	sb.safePrintf ("</table><br><br>\n" );

	// ------- "clear" parm: drop an ip from both conf lists -------
	// (64 is the ipbuf size below; longer strings are silently ignored)
	if(clear && clearLen < 64) {
		long ip = atoip(clear, clearLen);
		if(ip) {
			removeIp(ip);
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, clear, clearLen);
			ipbuf[clearLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      clearLen);
			if(beginning) {
				// splice the token out of the conf string
				// by shifting the tail left in place
				char *to = beginning;
				char *from = beginning + clearLen;
				while(*to) *to++ = *from++;
			}
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      clearLen);
			if(beginning) {
				char *to = beginning;
				char *from = beginning + clearLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
		}
	}

	// ------- "allow" parm: add ip to allow list, drop from ban -------
	long allowLen;
	char *allow = r->getString ( "allow" , &allowLen , NULL );
	if(allow && allowLen < 64) {
		long ip = atoip(allow, allowLen);
		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, allow, allowLen);
			ipbuf[allowLen] = '\0';
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      allowLen);
			if(!beginning) {
				//its not present, so add it.
				// walk to the NUL terminator of the conf
				// string, then append "\n" + token
				char *p = g_conf.m_allowIps;
				while(*p) p++;
				if(p - g_conf.m_allowIps + allowLen + 2 <
				   AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,allowLen);
					*(p + allowLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>"
						      "Not enough stack space "
						      "to fit allowIps. "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>",
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_allowIps +
						      allowLen + 2);
					goto dontRemove1;
				}
			}
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      allowLen);
			if(beginning) {
				//remove it from banned if present.
				char *to = beginning;
				char *from = beginning + allowLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
		}
	}
 dontRemove1:
	// ------- "deny" parm: add ip to ban list, drop from allow -------
	long denyLen;
	char *deny = r->getString ( "deny" , &denyLen , NULL );
	if(deny && denyLen < 64) {
		long ip = atoip(deny, denyLen);
		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, deny, denyLen);
			ipbuf[denyLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      denyLen);
			if(!beginning) {
				//its not present, so add it.
				char *p =g_conf.m_banIps;
				while(*p) p++;
				if(p - g_conf.m_banIps + denyLen + 2 <
				   AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,denyLen);
					*(p + denyLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>Not "
						      "enough stack space "
						      "to fit bannedIPs. "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>",
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_banIps +
						      denyLen + 2);
					goto dontRemove2;
				}
			}
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      denyLen);
			if(beginning) {
				//remove it from allowed list if present.
				char *to = beginning;
				char *from = beginning + denyLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
		}
	}
 dontRemove2:
	if(!g_conf.m_doAutoBan) {
		sb.safePrintf("<center><font color=red><b>Autoban is "
			      "disabled, "
			      "turn it on in Master Controls."
			      "</b></font></center><br>");
	}

	// ------- "validCodes" parm: replace the whole code list -------
	if(validCodes) {
		if(validCodesLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit codes. "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      validCodesLen);
			validCodes = NULL;
			validCodesLen = 0;
		}
		else {
			memcpy(g_conf.m_validCodes, validCodes,
			       validCodesLen);
			g_conf.m_validCodes[validCodesLen] = '\0';
			trimWhite(g_conf.m_validCodes);
			setCodesFromConf();
		}
	}

	//first remove all of the ips in the conf, then add the passed in
	// ones to the conf parm;
	if (banIps) {
		//ack, the browser puts in crlf when this comes back, so
		//we will have a longer string here than the one we sent
		//out. trim back all extrainious whitespace before we do
		//bounds checking.
		trimWhite(banIps);
		banIpsLen = gbstrlen(banIps);
		if(banIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit bannedIps. "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      banIpsLen);
			// truncate rather than overflow the conf buffer
			banIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_banIps, banIps, banIpsLen);
		g_conf.m_banIps[banIpsLen] = '\0';
		changed = true;
	}
	if (allowIps) {
		trimWhite(allowIps);
		allowIpsLen = gbstrlen(allowIps);
		if(allowIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit allowIps. "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      allowIpsLen);
			allowIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_allowIps, allowIps, allowIpsLen);
		g_conf.m_allowIps[allowIpsLen] = '\0';
		changed = true;
	}

	// re-sync the in-memory tables from the (possibly edited) conf text
	if(changed) {
		trimWhite(g_conf.m_allowIps);
		trimWhite(g_conf.m_banIps);
		setFromConf();
	}

	// ------- "Add IPs" form (parm table + submit button) -------
	sb.safePrintf("\n<table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=2 bgcolor=#%s>"
		      "<center><b>Add IPs</b></center></td></tr>",
		      DARK_BLUE);

	// ss = sb.getBuf();
	// ssend = sb.getBufEnd();
	g_parms.printParms (&sb, s, r);
	// sb.incrementLength(sss - ss);

	sb.safePrintf ("<tr><td>"
		       "<center>"
		       "<input type=submit value=\"Update\" "
		       "method=\"POST\" border=0>"
		       "</center></td></tr>");
	sb.safePrintf ("</table><br><br>\n" );

	// short view: just link to the long view and send what we have
	if(!showLongView) {
		sb.safePrintf("<b><a href=\"autoban"
			      "?c=%s"
			      "&showAllIps=%li"
			      "&longview=1\">Show watched ips table...</a></b>",
			      coll, showAllIps);
		return g_httpServer.sendDynamicPage ( s ,
						      sb.getBufStart() ,
						      sb.length() ,
						      -1 ,
						      false);
	}

	/////////////////////////////////////////////////////////////////////
	// ------- watched ips table -------
	sb.safePrintf("\n<table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=3 bgcolor=#%s>"
		      "<center><b>Watched Ips</b></center></td></tr>",
		      DARK_BLUE);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Description</b></center></td>"
		      // "<td><center><b>Time Added</b></center></td>"
		      "<td><center><b>Allow/Deny/Clear</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);

	// sort the occupied table slots by ip; freed on every return below
	long *sortedIndices = (long*)mmalloc(m_tableSize * sizeof(long),
					     "AutoBanH");
	if(!sortedIndices) {
		return g_httpServer.sendErrorReply(s,500,mstrerror(ENOMEM));
	}
	long numEntries = 0;
	for(long i = 0; i < m_tableSize; i++) {
		if(m_detectKeys[i] == 0) continue;
		sortedIndices[numEntries++] = i;
	}
	SorterTable = m_detectKeys;
	gbsort(sortedIndices, numEntries, sizeof(long), ip_cmp);

	//lets put each class of watched ip in its own safebuf then cat
	//them together at the end.
	SafeBuf allowed;
	SafeBuf banned;
	SafeBuf feedLeachers;
	SafeBuf cowBots;
	SafeBuf *e;
	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		if(m_detectKeys[i] == 0) continue;
		//if(!(m_detectVals[i].m_flags & FROMCONF)) continue;
		// classify this ip: explicit allow/ban from conf, or an
		// automatic ban from exceeding a per-minute/per-day quota
		bool allow = m_detectVals[i].m_flags & ALLOW &&
			m_detectVals[i].m_flags & FROMCONF;
		bool deny  = m_detectVals[i].m_flags & DENY &&
			m_detectVals[i].m_flags & FROMCONF;
		bool explicitban = deny &&
			m_detectVals[i].m_flags & FROMCONF;
		unsigned short dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;
		bool day = dayCount >= g_conf.m_numFreeQueriesPerDay;
		bool minute = minuteCount >= g_conf.m_numFreeQueriesPerMinute;
		char *description;
		char *color;
		if(allow) {
			color = GREEN;
			description = "Allowed";
			e = &allowed;
		}
		else if(explicitban) {
			color = RED;
			description = "Banned";
			e = &banned;
		}
		else if(minute) {
			color = RED;
			description = "Cow Bot";
			e = &cowBots;
		}
		else if(day) {
			color = RED;
			description = "Feed Leacher";
		 	e = &feedLeachers;
		}
		else {
			//this can happen when someone was banned due to
			//exceeding the quota, then the quota was lowered.
			m_detectVals[i].m_flags &= ~DENY;
			//log("autoban: ohshit-banning %s",iptoa(s->m_ip));
			continue;
		}
		e->safePrintf("<tr>");
		e->safePrintf("<td bgcolor=#%s><center>%s</center></td><td>"
			      "<center>%s</center></td>"
			      // "<td><center>"
			      // "%li days %li hrs %li min ago"
			      // "</center></td>"
			      "<td><center><a href=\"/master/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">"
			      "allow/</a>"
			      "<a href=\"/master/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">"
			      "deny/</a>"
			      "<a href=\"/master/"
			      "autoban?c=%s&clear=%s&showAllIps=%li\">"
			      "clear</a></center>"
			      "</td>",color,
			      iptoa(m_detectKeys[i]),
			      description,
			      // days,hours,minutes,
			      coll, iptoa(m_detectKeys[i]), showAllIps,
			      coll, iptoa(m_detectKeys[i]), showAllIps,
			      coll, iptoa(m_detectKeys[i]), showAllIps);
		e->safePrintf("</tr>");
	}
	sb.cat(allowed);
	sb.cat(banned);
	sb.cat(feedLeachers);
	sb.cat(cowBots);
	sb.safePrintf ("</table><br><br>\n" );

	// MDW moved from here

	// ------- control panel: filter links for the table below -------
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=5 bgcolor=#%s>"
		      "<center><b>Control Panel</b></center></td></tr>",
		      DARK_BLUE);
	sb.safePrintf("<tr>"
		      "<td bgcolor=#%s><center><b>Show Ips by Number of "
		      "Queries"
		      "</b></center></td>",
		      LIGHT_BLUE);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=0\">"
		      "0 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=1\">"
		      "1 Query</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=10\">"
		      "10 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=100\">"
		      "100 Queries</a></b>"
		      "</font></center></td></tr>",
		      coll);
	sb.safePrintf ("</table><br><br>\n");

	// showAllIps==0 means skip the per-ip query table entirely
	if(!showAllIps) {
		char* ss = (char*) sb.getBufStart();
		long sslen = sb.length();
		mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");
		return g_httpServer.sendDynamicPage ( s ,
						      ss ,
						      sslen ,
						      -1 ,
						      false);
	}

	// ------- queries-today table -------
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=6 bgcolor=#%s>"
		      "<center><b>Queries Today</b></center></td></tr>",
		      DARK_BLUE);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Minute count</b></center></td>"
		      "<td><center><b>Day count</b></center></td>"
		      "<td><center><b>Time Until Reset</b></center></td>"
		      "<td><center><b>Times Banned</b></center></td>"
		      "<td><center><b>Allow/Deny</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);
	char minBuf[128];
	char dayBuf[128];
	// ips sharing the same /24 get bolded; track the previous group
	unsigned long lastIpGroup = 0;
	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		long dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;
		// counts of non-conf ips decay once their windows expire
		if(!(m_detectVals[i].m_flags & FROMCONF)) {
			if(m_detectVals[i].m_minuteExpires < now)
				minuteCount = 0;
			if(!(m_detectVals[i].m_flags & DENY) &&
			   m_detectVals[i].m_dayExpires < now)
				dayCount = 0;
		}
		//a hack: reuse showAllIps as the min-query-count filter
		if( dayCount < showAllIps) continue;

		char *color = YELLOW;
		if(m_detectVals[i].m_flags & ALLOW) {
			color = GREEN;
			snprintf(minBuf, 128, "--");
			snprintf(dayBuf, 128, "%li", dayCount);
		}
		else if(m_detectVals[i].m_flags & DENY) {
			color = RED;
			snprintf(minBuf, 128, "--");
			snprintf(dayBuf, 128, "%li", dayCount);
		}
		else {
			snprintf(minBuf, 128, "%li", (long)minuteCount);
			snprintf(dayBuf, 128, "%li", (long)dayCount);
		}
		unsigned long thisIpGroup = (unsigned long)m_detectKeys[i] &
			0x00ffffff;
		sb.safePrintf("<tr><center>");
		if(m_detectVals[i].m_flags & FROMCONF) {
			// conf-pinned ips never reset
			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s"
				      "</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>"
				      "<td><center><font color=red>"
				      "<b>NEVER</b>"
				      "</font></center></td>"
				      "<td><center>--</center></td>",
				      color,
				      (thisIpGroup == lastIpGroup)?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      (thisIpGroup == lastIpGroup)?"</b>":"",
				      minBuf,
				      dayBuf);
		}
		else {
			//they haven't done a query since being unbanned,
			//unban them now so we don't get negative resets
			//displayed.
			/* no, don't unban the bots!!! MDW yippy project
			if(m_detectVals[i].m_dayExpires < now) {
				m_detectVals[i].m_flags &= ~DENY;
				//log("autoban: dayexpire-unbanning %s",
				//    iptoa(ip));
				m_detectVals[i].m_dayExpires = now + ONE_DAY;
				m_detectVals[i].m_minuteExpires = now + 60;
				m_detectVals[i].m_dayCount = 0;
				m_detectVals[i].m_minuteCount = 0;
				sb.safePrintf("</center></tr>");
				continue;
			}
			*/
			getCalendarFromMs((m_detectVals[i].m_dayExpires - now)*
					  1000,
					  &days,
					  &hours,
					  &minutes,
					  &secs,
					  &msecs);
			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s"
				      "</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>"
				      "<td><center><font color=red>"
				      "<b>%li days %li hrs %li min %li sec</b>"
				      "</font></center></td>"
				      "<td><center>%i</center></td>",
				      color,
				      (thisIpGroup == lastIpGroup)?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      (thisIpGroup == lastIpGroup)?"</b>":"",
				      minBuf,
				      dayBuf,
				      days, hours, minutes, secs,
				      m_detectVals[i].m_timesBanned);
		}
		sb.safePrintf("<td><center>"
			      "<a href=\"/master/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">"
			      "allow/</a>"
			      "<a href=\"/master/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">"
			      "deny</a></center>"
			      "</td>",
			      coll, iptoa(m_detectKeys[i]), showAllIps,
			      coll, iptoa(m_detectKeys[i]), showAllIps);
		sb.safePrintf("</center></tr>");
		lastIpGroup = thisIpGroup;
	}
	sb.safePrintf ("</table><br><br>\n" );
	char* ss = (char*) sb.getBufStart();
	long sslen = sb.length();
	mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");
	return g_httpServer.sendDynamicPage ( s ,
					      ss ,
					      sslen ,
					      -1 ,
					      false);
}
void doneReindexing ( void *state ) { // cast it State13 *st = (State13 *)state; GigablastRequest *gr = &st->m_gr; // note it if ( gr->m_query && gr->m_query[0] ) log(LOG_INFO,"admin: Done with query reindex. %s", mstrerror(g_errno)); //// // // print the html page // ///// HttpRequest *hr = &gr->m_hr; char format = hr->getReplyFormat(); SafeBuf sb; const char *ct = "text/html"; if ( format == FORMAT_JSON ) ct = "application/json"; if ( format == FORMAT_XML ) { ct = "text/xml"; sb.safePrintf("<response>\n" "\t<statusCode>0</statusCode>\n" "\t<statusMsg>Success</statusMsg>\n" "\t<matchingResults>%" PRId32"</matchingResults>\n" "</response>" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } if ( format == FORMAT_JSON ) { sb.safePrintf("{\"response\":{\n" "\t\"statusCode\":0,\n" "\t\"statusMsg\":\"Success\",\n" "\t\"matchingResults\":%" PRId32"\n" "}\n" "}\n" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr ); sb.safePrintf("<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); // // print error msg if any // if ( gr->m_query && gr->m_query[0] && ! g_errno ) sb.safePrintf ( "<center><font color=red><b>Success. " "Added %" PRId32" docid(s) to " "spider queue.</b></font></center><br>" , st->m_msg1c.m_numDocIdsAdded ); if ( gr->m_query && gr->m_query[0] && g_errno ) sb.safePrintf ( "<center><font color=red><b>Error. " "%s</b></font></center><br>" , mstrerror(g_errno)); // print the reindex interface g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); }
// . scans the datedb list in st->m_list for an unlocked docid to hand to a
//   turk editor, locks it, then kicks off the title-rec load via processLoop
// . must only run on host #0, which owns the single turk lock table
// . sends an "empty" page if no unlocked docid is available
void gotDatedbList ( State60 *st ) {
	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
	// load turk lock table if we need to
	// BUGFIX: was "bool s_init = false;" (automatic storage), so the
	// guard reset on every call and the table was re-set()/re-load()ed
	// each time; static makes this a true run-once init
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();

	// shortcut
	RdbList *list = &st->m_list;

	// the best docid found so far (0 means none)
	int64_t best = 0LL;

	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative (low bit clear = delete key)
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		// BUGFIX: was g_turkLock (typo); the table initialized
		// above is g_turkLocks
		TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
		// if there, check time: expire locks older than an hour
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			// BUGFIX: was "&docId" (undeclared identifier);
			// the local variable is "docid"
			g_turkLocks.removeKey(&docid);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docid;
		break;
	}

	SafeBuf sb;
	// print description so they can click a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	TurkLock tt;
	// NOTE(review): unbounded copy -- assumes st->m_user always fits
	// in tt.m_user; confirm and switch to a bounded copy if not
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	// NOTE(review): lock is added to g_lockTable but looked up above in
	// g_turkLocks -- confirm these refer to the same table, otherwise
	// the add and the lookup are out of sync
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
bool gotXmlDoc ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // if we loaded from old title rec, it should be there! // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp //if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "test")) // // use same dir that XmlDoc::getTestDir() would use // saveTestBuf ( "test-page-parser" ); // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; bool printIt = false; if ( st->m_u && st->m_u[0] ) printIt = true; if ( st->m_docId != -1LL ) printIt = true; if ( st->m_donePrinting ) printIt = false; // do not re-call this if printDocForProCog blocked... (check length()) if ( printIt ) { // mark as done st->m_donePrinting = true; // always re-compute the page inlinks dynamically, do not // use the ptr_linkInfo1 stored in titlerec!! // NO! not if set from titlerec/docid if ( st->m_recompute ) xd->m_linkInfo1Valid = false; // try a recompute regardless, because we do not store the // bad inlinkers, and ppl want to see why they are bad! //xd->m_linkInfo1Valid = false; // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf //char *metalist = xd->getMetaList ( ); //if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked //if ( metalist == (void *)-1 ) return false; // for debug... //if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // . print it out // . returns false if blocks, true otherwise // . sets g_errno on error if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) ) return false; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); } long isXml = st->m_r.getLong("xml",0); char ctype = CT_HTML; if ( isXml ) ctype = CT_XML; // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? 
&ctype, -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
bool sendReply ( void *state ) { GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) ); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return status; } int32_t ulen = 0; const char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // if there was an error let them know SafeBuf mbuf; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>"); mbuf.safePrintf("</font></center>"); } if ( mbuf.length() ) { sb.safeStrcpy( mbuf.getBufStart() ); } g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? 
// clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
bool sendReply ( void *state ) { StateCatdb *st = (StateCatdb*)state; // check for error if (g_errno) { if (st->m_catLookup) log("PageCatdb: Msg8b had error getting Site Rec: %s", mstrerror(g_errno)); else log("PageCatdb: Msg2a had error generating Catdb: %s", mstrerror(g_errno)); st->m_catLookup = false; g_errno = 0; } long long endTime = gettimeofdayInMilliseconds(); // page buffer SafeBuf sb; sb.reserve(64*1024); // . print standard header // . do not print big links if only an assassin, just print host ids g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r ); sb.safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); sb.safePrintf ( "<table %s>" "<tr><td colspan=2>" "<center><font size=+1><b>Catdb</b></font></center>" "</td></tr>", TABLE_STYLE ); // instructions sb.safePrintf("<tr bgcolor=#%s>" "<td colspan=3>" "<font size=-2>" "<center>" "Don't just start using this, you need to follow the " "instructions in the <i>admin guide</i> for adding " "DMOZ support." "</center>" "</font>" "</td>" "</tr>" ,DARK_BLUE ); // print the generate Catdb link sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=2\">" "Update Catdb</a> " "</center></td></tr>", st->m_coll ); sb.safePrintf ( "<tr class=poo>" "<td>Generate New Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=1\">" "Generate Catdb</a> " "</center></td></tr>", st->m_coll ); if (st->m_genCatdb) sb.safePrintf ( "<tr class=poo>" "<td> Catdb Generation took %lli ms." 
"</td></tr>", endTime - st->m_startTime ); // print Url Catgory Lookup sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>" "<td><input type=text name=caturl size=80" " value=\""); if (st->m_catLookup) { sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); } sb.safePrintf("\"></center></td></tr>" ); // print Url Info if Lookup was done if (st->m_catLookup) { sb.safePrintf("<tr><td>"); // print the url sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); sb.safePrintf(" (%lli ms)</td><td>", endTime - st->m_startTime ); // print each category id and path for (long i = 0; i < st->m_catRec.m_numCatids; i++) { sb.safePrintf("<b>[%li] ", st->m_catRec.m_catids[i]); g_categories->printPathFromId(&sb, st->m_catRec.m_catids[i]); sb.safePrintf("</b><br>"); // lookup title and summary char title[1024]; long titleLen = 0; char summ[4096]; long summLen = 0; char anchor[256]; unsigned char anchorLen = 0; g_categories->getTitleAndSummary( st->m_url.getUrl(), st->m_url.getUrlLen(), st->m_catRec.m_catids[i], title, &titleLen, 1023, summ, &summLen, 4098, anchor, &anchorLen, 255 ); title[titleLen] = '\0'; summ[summLen] = '\0'; anchor[anchorLen] = '\0'; // print title and summary sb.safePrintf("<b>Title:</b> %s<br>" "<b>Summary:</b> %s<br>", title, summ); if (anchorLen > 0) sb.safePrintf("<b>Anchor:</b> %s<br>", anchor); sb.safePrintf("<br>"); } sb.safePrintf("<b>Filenum:</b> %li<br>", st->m_catRec.m_filenum); // print indirect catids if (st->m_catRec.m_numIndCatids > 0) { sb.safePrintf("<hr><b>Indirect Catids [%li]:" "</b><br>\n", st->m_catRec.m_numIndCatids ); for (long i = 0; i < st->m_catRec.m_numIndCatids; i++) { sb.safePrintf("%lu<br>", st->m_catRec.m_indCatids[i]); } } sb.safePrintf("</td></tr>"); } // end it sb.safePrintf ( "</center></td></tr></table>" ); // print submit button sb.safePrintf ( "<br><center>" "<input type=submit value=\"Submit\" border=0>" "</form></center>" ); // print the final tail //p += g_httpServer.printTail ( p , pend - p ); // 
clear g_errno, if any, so our reply send goes through g_errno = 0; // extract the socket TcpSocket *s = st->m_socket; // clear the state mdelete ( st, sizeof(StateCatdb), "PageCatdb" ); delete st; // . send this page // . encapsulates in html header and tail // . make a Mime return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length()); }
// . a new interface so Msg3b can call this with "s" set to NULL // . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageParser2 ( TcpSocket *s , HttpRequest *r , State8 *st , long long docId , Query *q , // in query term space, not imap space long long *termFreqs , // in imap space float *termFreqWeights , // in imap space float *affWeights , void *state , void (* callback)(void *state) ) { //log("parser: read sock=%li",s->m_sd); // might a simple request to addsomething to validated.*.txt file // from XmlDoc::print() or XmlDoc::validateOutput() char *add = r->getString("add",NULL); //long long uh64 = r->getLongLong("uh64",0LL); char *uh64str = r->getString("uh64",NULL); //char *divTag = r->getString("div",NULL); if ( uh64str ) { // convert add to number long addNum = 0; if ( to_lower_a(add[0])=='t' ) // "true" or "false"? addNum = 1; // convert it. skip beginning "str" inserted to prevent // javascript from messing with the long long since it // was rounding it! //long long uh64 = atoll(uh64str);//+3); // urldecode that //long divTagLen = gbstrlen(divTag); //long newLen = urlDecode ( divTag , divTag , divTagLen ); // null term? //divTag[newLen] = '\0'; // do it. this is defined in XmlDoc.cpp //addCheckboxSpan ( uh64 , divTag , addNum ); // make basic reply char *reply; reply = "HTTP/1.0 200 OK\r\n" "Connection: Close\r\n"; // that is it! send a basic reply ok bool status = g_httpServer.sendDynamicPage( s , reply, gbstrlen(reply), -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); return status; } // make a state if ( st ) st->m_freeIt = false; if ( ! st ) { try { st = new (State8); } catch ( ... 
) { g_errno = ENOMEM; log("PageParser: new(%i): %s", sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno)); } mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; } // msg3b uses this to get a score from the query st->m_state = state; st->m_callback = callback; st->m_q = q; st->m_termFreqs = termFreqs; st->m_termFreqWeights = termFreqWeights; st->m_affWeights = affWeights; st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; st->m_recompute = false; //st->m_url.reset(); // do not allow more than one to be launched at a time if in // a quickpoll. will cause quickpoll in quickpoll. g_inPageParser = true; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( st->m_collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); //long ulen = 0; //char *u = r->getString ( "u" , &ulen , NULL /*default*/); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given if ( ! st->m_u || ! 
st->m_u[0] ) st->m_docId = r->getLongLong ("docid",-1); else st->m_docId = -1; // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = st->m_r.getString("u",&st->m_ulen,NULL); // should we recycle link info? st->m_recycle = r->getLong("recycle",0); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? 
st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header SafeBuf *xbuf = &st->m_xbuf; xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">\n"); // print standard header g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r ); // print the standard header for admin pages char *dd = ""; char *rr = ""; char *rr2 = ""; char *render = ""; char *oips = ""; char *us = ""; if ( st->m_u && st->m_u[0] ) us = st->m_u; //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn ); if ( st->m_old ) dd = " checked"; if ( st->m_recycle ) rr = " checked"; if ( st->m_recycle2 ) rr2 = " checked"; if ( st->m_render ) render = " checked"; if ( st->m_oips ) oips = " checked"; xbuf->safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); long clen; char *contentParm = r->getString("content",&clen,""); // print the input form xbuf->safePrintf ( "<style>\n" "h2{font-size: 12px; color: #666666;}\n" ".gbtag { border: 1px solid gray;" "background: #ffffef;display:inline;}\n" ".gbcomment { border: 1px solid gray;" "color: #888888; font-style:italic; " "background: #ffffef;display:inline;}\n" ".token { border: 1px solid gray;" "background: #f0ffff;display:inline;}\n" ".spam { border: 1px solid gray;" "background: #af0000;" "color: #ffffa0;}" ".hs {color: #009900;}" "</style>\n" "<center>" "<table %s>" "<tr><td colspan=5><center><b>" "Parser" "</b></center></td></tr>\n" "<tr class=poo>" "<td>" "<b>url</b>" "<br><font size=-2>" "Type in <b>FULL</b> url to parse." 
"</font>" "</td>" "</td>" "<td>" "<input type=text name=u value=\"%s\" size=\"40\">\n" "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Parser version to use: " "</td>" "<td>" "<input type=text name=\"version\" size=\"4\" value=\"-1\"> " "</td>" "<td>" "(-1 means to use latest title rec version)<br>" "</td>" "</tr>" */ /* "<tr class=poo>" "<td>" "Hop count to use: " "</td>" "<td>" "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> " "</td>" "<td>" "(-1 is unknown. For root urls hopcount is always 0)<br>" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>use cached</b>" "<br><font size=-2>" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=old value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Reparse root:" "</td>" "<td>" "<input type=checkbox name=artr value=1%s> " "</td>" "<td>" "Apply selected ruleset to root to update quality" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>recycle link info</b>" "<br><font size=-2>" "Recycle the link info from the title rec" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=recycle value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Recycle Link Info Imported:" "</td>" "<td>" "<input type=checkbox name=recycleimp value=1%s> " "</td>" "<td>" "Recycle the link info imported from other coll" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>render html</b>" "<br><font size=-2>" "Render document content as HTML" "</font>" "</td>" "<td>" "<input type=checkbox name=render value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Lookup outlinks' ruleset, ips, quality:" "</td>" "<td>" "<input type=checkbox name=oips value=1%s> " "</td>" "<td>" "To compute quality lookup IP addresses of roots " "of outlinks." "</td>" "</tr>" "<tr class=poo>" "<td>" "LinkInfo Coll:" "</td>" "<td>" "<input type=text name=\"oli\" size=\"10\" value=\"\"> " "</td>" "<td>" "Leave empty usually. Uses this coll to lookup link info." 
"</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>optional query</b>" "<br><font size=-2>" "Leave empty usually. For title generation only." "</font>" "</td>" "<td>" "<input type=text name=\"q\" size=\"20\" value=\"\"> " "</td>" "</tr>" "<tr class=poo>" "<td>" "<b>content below is xml</b>" "<br><font size=-2>" "Is the content below XML?" "</font>" "</td>" "<td>" "<input type=checkbox name=xml value=1> " "</td>" "</tr>" "<tr class=poo>" "<td><b>content</b>" "<br><font size=-2>" "Use this content for the provided <i>url</i> " "rather than downloading it from the web." "</td>" "<td>" "<textarea rows=10 cols=80 name=content>" "%s" "</textarea>" "</td>" "</tr>" "</table>" "</center>" "</form>" "<br>", TABLE_STYLE, us , //(long)st->m_hopCount, //rtu, dd, //artr , rr, //rr2, render , //oips , contentParm ); xbuf->safePrintf( "<center>" "<input type=submit value=Submit>" "</center>" ); // just print the page if no url given if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st ); XmlDoc *xd = &st->m_xd; // set this up SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! 
content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; uint8_t contentType = CT_HTML; if ( r->getBool("xml",0) ) contentType = CT_XML; // if facebook, load xml content from title rec... bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/"); if ( isFacebook && ! content ) { long long docId = g_titledb.getProbableDocId(st->m_u); sprintf(sreq.m_url ,"%llu", docId ); sreq.m_isPageReindex = true; } // hack if ( content ) { st->m_dbuf.purge(); st->m_dbuf.safeStrcpy(content); //char *data = strstr(content,"\r\n\r\n"); //long dataPos = 0; //if ( data ) dataPos = (data + 4) - content; //st->m_dbuf.convertJSONtoXML(0,dataPos); //st->m_dbuf.decodeJSON(0); content = st->m_dbuf.getBufStart(); } // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , &st->m_wbuf , 0 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip contentType )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , processLoop ); // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag if ( st->m_recycle ) xd->m_recycleContent = true; return processLoop ( st ); }
bool qajson ( ) { // // delete the 'qatest123' collection // //static bool s_x1 = false; if ( ! s_flags[0] ) { s_flags[0] = true; if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) ) return false; } // // add the 'qatest123' collection // //static bool s_x2 = false; if ( ! s_flags[1] ) { s_flags[1] = true; if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , // checksum of reply expected 238170006 ) ) return false; } // add the 50 urls if ( ! s_flags[3] ) { s_flags[3] = true; SafeBuf sb; sb.safePrintf("&c=qatest123" "&format=json" "&strip=1" "&spiderlinks=0" "&urls="//www.walmart.com+ibm.com" ); sb.urlEncode ( s_ubuf4 ); // . now a list of websites we want to spider // . the space is already encoded as + if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) ) return false; } // // wait for spidering to stop // checkagain: // wait until spider finishes. check the spider status page // in json to see when completed //static bool s_k1 = false; if ( ! s_flags[5] ) { // wait 5 seconds, call sleep timer... then call qatest() //usleep(5000000); // 5 seconds wait(3.0); s_flags[5] = true; return false; } if ( ! s_flags[15] ) { s_flags[15] = true; if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) ) return false; } //static bool s_k2 = false; if ( ! s_flags[6] ) { // ensure spiders are done. // "Nothing currently available to spider" if ( s_content&&!strstr(s_content,"Nothing currently avail")){ s_flags[5] = false; s_flags[15] = false; goto checkagain; } s_flags[6] = true; } if ( ! s_flags[7] ) { s_flags[7] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&" "q=type%3Ajson+meta.authors%3Appk", -1310551262 ) ) return false; } if ( ! s_flags[8] ) { s_flags[8] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=100&" "q=type%3Ajson", -1310551262 ) ) return false; } if ( ! s_flags[9] ) { s_flags[9] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfacetstr%3Ameta.authors", -1310551262 ) ) return false; } if ( ! 
s_flags[10] ) { s_flags[10] = true; // this has > 50 values for the facet field hash if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfacetstr%3Astrings.key", -1310551262 ) ) return false; } // other query tests... if ( ! s_flags[12] ) { s_flags[12] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=inurl2%3Aquirksmode.org%2Fm%2F", -1310551262 ) ) return false; } if ( ! s_flags[13] ) { s_flags[13] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=site%3Aquirksmode.org", -1310551262 ) ) return false; } // test gbfieldmatch:field:"quoted value" query to ensure it converts // the quoted value into the right int32 if ( ! s_flags[14] ) { s_flags[14] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key" "%3Ainvestigate-tweet", -1310551262 ) ) return false; } if ( ! s_flags[15] ) { s_flags[15] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key" "%3A\"Maemo+Browser\"", -1310551262 ) ) return false; } if ( ! s_flags[16] ) { s_flags[16] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key" "%3A\"Google+Wireless+Transcoder\"", -1310551262 ) ) return false; } // this should have no results, not capitalized if ( ! s_flags[17] ) { s_flags[17] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key%3A\"samsung\"", -1310551262 ) ) return false; } if ( ! s_flags[18] ) { s_flags[18] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key%3ASamsung", -1310551262 ) ) return false; } if ( ! s_flags[18] ) { s_flags[18] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key%3A\"Samsung\"", -1310551262 ) ) return false; } //static bool s_fee2 = false; if ( ! s_flags[20] ) { s_flags[20] = true; log("qa: SUCCESSFULLY COMPLETED " "QA JSON TEST"); return true; } return true; }
// . serve the /admin/qa page and drive the qa test suite
// . three modes, decided from the request:
//     1. ajax=1   : store a user-approved crc for a test (checkbox click)
//     2. action=… : the submit button was hit — reset state and run qatest()
//     3. otherwise: just render the list of tests with checkboxes
// . always returns true except when qatest() blocks (returns false)
bool sendPageQA ( TcpSocket *sock , HttpRequest *hr ) {
	char pbuf[32768];
	SafeBuf sb(pbuf, 32768);

	//char format = hr->getReplyFormat();
	// set this. also sets gr->m_hr
	GigablastRequest gr;
	// this will fill in GigablastRequest so all the parms we need are set
	g_parms.setGigablastRequest ( sock , hr , &gr );

	//
	// . handle a request to update the crc for this test
	// . test id identified by "ajaxUrlHash" which is the hash of the
	//   test's url and the test name, QATest::m_testName
	// . NOTE(review): narrowing from getLongLong() to unsigned long —
	//   presumably intentional 32-bit hashes; confirm on 64-bit builds
	//
	long ajax = hr->getLong("ajax",0);
	unsigned long ajaxUrlHash ;
	ajaxUrlHash = (unsigned long long)hr->getLongLong("uh",0LL);
	unsigned long ajaxCrc ;
	ajaxCrc = (unsigned long long)hr->getLongLong("crc",0LL);

	if ( ajax ) {
		// make sure it is initialized
		if ( s_ht.m_ks ) {
			// overwrite current value with provided one because
			// the user click on an override checkbox to update
			// the crc
			s_ht.addKey ( &ajaxUrlHash , &ajaxCrc );
			saveHashTable();
		}
		// send back the urlhash so the checkbox can turn the
		// bg color of the "diff" gray
		SafeBuf sb3;
		sb3.safePrintf("%lu",ajaxUrlHash);
		g_httpServer.sendDynamicPage(sock,
					     sb3.getBufStart(),
					     sb3.length(),
					     -1/*cachetime*/);
		return true;
	}

	// if they hit the submit button, begin the tests
	long submit = hr->hasField("action");
	long n = sizeof(s_qatests)/sizeof(QATest);

	// refuse to start a second run while one is already going
	if ( submit && g_qaInProgress ) {
		g_errno = EINPROGRESS;
		g_httpServer.sendErrorReply(sock,g_errno,mstrerror(g_errno));
		return true;
	}

	// set m_doTest from the test%li checkboxes (only when submitting)
	for ( long i = 0 ; submit && i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		char tmp[10];
		sprintf(tmp,"test%li",i);
		qt->m_doTest = hr->getLong(tmp,0);
	}

	if ( submit ) {
		// reset all the static thingies
		resetFlags();
		// save socket so the async pipeline can reply later
		g_qaSock = sock;
		g_numErrors = 0;
		g_qaOutput.reset();
		g_qaOutput.safePrintf("<html><body>"
				      "<title>QA Test Results</title>\n");
		// client-side helpers: submitchanges() posts an approved crc
		// back via the ajax=1 path above; the reply handler grays
		// out the corresponding diff block
		g_qaOutput.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n"

				      // update s_ht with the new crc for
				      // this test
				      "function submitchanges(urlhash,crc) "
				      "{\n "
				      "var client=new XMLHttpRequest();\n"
				      "client.onreadystatechange="
				      "gotsubmitreplyhandler;"
				      "var "
				      "u='/admin/qa?ajax=1&uh='+urlhash+"
				      "'&crc='+crc;\n"
				      "client.open('GET',u);\n"
				      "client.send();\n"

				      // use that to fix background to gray
				      "var w=document.getElementById"
				      "(urlhash);\n"
				      // set background color
				      "w.style.backgroundColor = "
				      "'0xe0e0e0';\n"
				      // gear spinning after checkbox
				      "}\n\n "

				      // call this when we got the reply that
				      // the checkbox went through
				      "function gotsubmitreplyhandler() {\n"
				      // return if reply is not fully ready
				      "if(this.readyState != 4 )return;\n"
				      // if error or empty reply then do
				      // nothing
				      "if(!this.responseText)return;\n"
				      // response text is the urlhash32,
				      // unsigned long
				      "var id=this.responseText;\n"
				      // use that to fix background to gray
				      "var w=document.getElementById(id);\n"
				      // set background color
				      "w.style.backgroundColor = "
				      "'0xe0e0e0';\n"
				      "}\n\n"

				      "</SCRIPT> ");

		// and run the qa test loop; false means it blocked and will
		// re-enter asynchronously
		if ( ! qatest( ) ) return false;

		// what happened?
		log("qa: qatest completed without blocking");
	}

	// show tests, all checked by default, to perform
	g_pages.printAdminTop ( &sb , sock , hr );

	// checkbox toggle-all helper
	sb.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n"
		      "function checkAll(name, num)\n "
		      "{ "
		      "    for (var i = 0; i < num; i++) {\n"
		      "      var e = document.getElementById(name + i);\n"
		      //"alert(name+i);"
		      "      e.checked = !e.checked ;\n "
		      "    }\n"
		      "}\n\n "
		      "</SCRIPT> ");

	//sb.safePrintf("<form name=\"fo\">");

	sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
	sb.safePrintf("<tr class=hdrow><td colspan=2>"
		      "<center><b>QA Tests</b></center>"
		      "</td></tr>");
	// header row
	sb.safePrintf("<tr><td><b>Do Test?</b> <a style=cursor:hand;"
		      "cursor:pointer; "
		      "onclick=\"checkAll('test', %li);\">(toggle)</a>",n);
	sb.safePrintf("</td><td><b>Test Name</b></td></tr>\n");

	// . we keep the ptr to each test in an array
	// . print out each qa function, alternating row colors
	for ( long i = 0 ; i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		char *bg;
		if ( i % 2 == 0 ) bg = LIGHT_BLUE;
		else              bg = DARK_BLUE;
		sb.safePrintf("<tr bgcolor=#%s>"
			      "<td><input type=checkbox value=1 name=test%li "
			      "id=test%li></td>"
			      "<td>%s"
			      "<br>"
			      "<font color=gray size=-1>%s</font>"
			      "</td>"
			      "</tr>\n"
			      , bg
			      , i
			      , i
			      , qt->m_testName
			      , qt->m_testDesc
			      );
	}

	sb.safePrintf("</table>\n<br>\n");
	//	      "</form>\n");

	g_pages.printAdminBottom ( &sb , hr );

	g_httpServer.sendDynamicPage(sock,
				     sb.getBufStart(),
				     sb.length(),
				     -1/*cachetime*/);
	return true;
}
// . handle the HTTP reply for the current qa test request
// . masks out volatile fields (timestamps, response times, counters) via
//   markOut(), hashes what remains, and compares the crc against the one
//   recorded in s_ht (persisted as qa/crctable.dat)
// . match      -> log "passed" into g_qaOutput
// . no record  -> record the crc as the new baseline ("first time testing")
// . mismatch   -> save both replies under qa/content.<crc>, shell out to
//   diff, and append the html-encoded diff to g_qaOutput as a failure
void processReply ( char *reply , long replyLen ) {

	// store our current reply
	SafeBuf fb2;
	fb2.safeMemcpy(reply,replyLen );
	fb2.nullTerm();

	// log that we got the reply
	log("qa: got reply(len=%li)(errno=%s)=%s",
	    replyLen,mstrerror(g_errno),reply);

	char *content = NULL;
	long  contentLen = 0;

	// get mime
	if ( reply ) {
		HttpMime mime;
		mime.set ( reply, replyLen , NULL );
		// only hash content since mime has a timestamp in it
		content = mime.getContent();
		contentLen = mime.getContentLen();
		// deliberate crash (poor man's assert) if the content is not
		// NUL-terminated right at contentLen
		if ( content && contentLen>0 && content[contentLen] ) {
			char *xx=NULL;*xx=0; }
	}

	if ( ! content ) {
		content = "";
		contentLen = 0;
	}

	// expose the (masked) content to the qa drivers, e.g. for the
	// "Nothing currently available to spider" checks
	s_content = content;

	// mask out volatile fields so the crc is stable across runs.
	// take out <responseTimeMS>
	markOut ( content , "<currentTimeUTC>");
	markOut ( content , "<responseTimeMS>");

	// until i figure this one out, take it out
	markOut ( content , "<docsInCollection>");

	// until i figure this one out, take it out
	markOut ( content , "<hits>");

	// for those links in the html pages
	markOut ( content, "rand64=");

	// for json
	markOut ( content , "\"currentTimeUTC\":" );
	markOut ( content , "\"responseTimeMS\":");
	markOut ( content , "\"docsInCollection\":");

	// for xml (repeated; harmless since markOut already ran on these)
	markOut ( content , "<currentTimeUTC>" );
	markOut ( content , "<responseTimeMS>");
	markOut ( content , "<docsInCollection>");

	// indexed 1 day ago
	markOut ( content,"indexed:");
	// modified 1 day ago
	markOut ( content,"modified:");

	// s_gigabitCount... it is perpetually incrementing static counter
	// in PageResults.cpp
	markOut(content,"ccc(");
	markOut(content,"id=fd");
	markOut(content,"id=sd");

	// for some reason the term freq seems to change a little in
	// the scoring table
	markOut(content,"id=tf");

	// make checksum. we ignore back to back spaces so this
	// hash works for <docsInCollection>10 vs <docsInCollection>9
	long contentCRC = 0;
	if ( content ) contentCRC = qa_hash32 ( content );

	// note it
	log("qa: got contentCRC of %lu",contentCRC);

	// (old direct-compare-to-expected-crc logic removed; kept here as a
	// reminder that we used to bail out early on a match)
	/*
	if ( contentCRC == s_expectedCRC ) {
		// save content if good
		char fn3[1024];
		sprintf(fn3,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
		File ff;
		ff.set ( fn3 );
		if ( ! ff.doesExist() ) {
			// if not there yet then save it
			fb2.save(fn3);
		}
		// . continue on with the qa process
		// . which qa function that may be
		//s_callback();
		return;
	}
	*/

	//
	// if crc of content does not match what was expected then do a diff
	// so we can see why not
	//

	// this means caller does not care about the response
	if ( ! s_checkCRC ) {
		//s_callback();
		return;
	}

	//const char *emsg = "qa: bad contentCRC of %li should be %li "
	//	   "\n";//"phase=%li\n";
	//fprintf(stderr,emsg,contentCRC,s_expectedCRC);//,s_phase-1);

	// hash url
	long urlHash32 = hash32n ( s_url.getUrl() );

	// combine test function too since two tests may use the same url
	long nameHash = hash32n ( s_qt->m_testName );

	// combine together
	urlHash32 = hash32h ( nameHash , urlHash32 );

	// one-time init of the crc table: create qa/ dir and load the
	// persisted baseline crcs
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		s_ht.set(4,4,1024,NULL,0,false,0,"qaht");
		// make symlink
		//char cmd[512];
		//snprintf(cmd,"cd %s/html ;ln -s ../qa ./qa",
		//	 g_hostdb.m_dir);
		//system(cmd);
		char dir[1024];
		snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
		long status = ::mkdir ( dir ,
					S_IRUSR | S_IWUSR | S_IXUSR |
					S_IRGRP | S_IWGRP | S_IXGRP |
					S_IROTH | S_IXOTH );
		if ( status == -1 && errno != EEXIST && errno )
			log("qa: Failed to make directory %s: %s.",
			    dir,mstrerror(errno));
		// try to load from disk
		SafeBuf fn;
		fn.safePrintf("%s/qa/",g_hostdb.m_dir);
		log("qa: loading crctable.dat");
		s_ht.load ( fn.getBufStart() , "crctable.dat" );
	}

	// save this reply to disk keyed by its crc
	char fn2[1024];
	sprintf(fn2,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
	fb2.save ( fn2 );

	// look up in hashtable to see what reply crc should be
	long *val = (long *)s_ht.getValue ( &urlHash32 );

	// just return if the same
	if ( val && contentCRC == *val ) {
		g_qaOutput.safePrintf("<b style=color:green;>"
				      "passed test</b><br>%s : "
				      "<a href=%s>%s</a> (urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>"
				      "%lu</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	if ( ! val ) {
		// add it so we know (learning mode: first run records the
		// baseline crc instead of failing)
		s_ht.addKey ( &urlHash32 , &contentCRC );
		g_qaOutput.safePrintf("<b style=color:blue;>"
				      "first time testing</b><br>%s : "
				      "<a href=%s>%s</a> "
				      "(urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>%lu"
				      "</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	log("qa: crc changed for url %s from %li to %li",
	    s_url.getUrl(),*val,contentCRC);

	// get response on file
	SafeBuf fb1;
	char fn1[1024];
	sprintf(fn1,"%sqa/content.%lu",g_hostdb.m_dir, *val);
	fb1.load(fn1);
	fb1.nullTerm();

	// do the diff between the two replies so we can see what changed
	char cmd[1024];
	sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
	log("qa: %s\n",cmd);
	system(cmd);

	g_numErrors++;

	g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
			      "<a href=%s>%s</a> (urlhash=%lu)<br>"

			      "<input type=checkbox name=urlhash%lu value=1 "
			      // use ajax to update test crc. if you undo your
			      // check then it should put the old val back.
			      // when you first click the checkbox it should
			      // gray out the diff i guess.
			      "onclick=submitchanges(%lu,%lu);> "
			      "Accept changes"

			      "<br>"
			      "original on left, new on right. "
			      "oldcrc = <a href=/qa/content.%lu>%lu</a>"

			      " != <a href=/qa/content.%lu>%lu</a> = newcrc"
			      "<br>diff output follows:<br>"
			      "<pre id=%lu style=background-color:0xffffff;>",
			      s_qt->m_testName,
			      s_url.getUrl(),
			      s_url.getUrl(),
			      urlHash32,

			      // input checkbox name field
			      urlHash32,

			      // submitchanges() parms
			      urlHash32,
			      contentCRC,

			      // original/old content.%lu
			      *val,
			      *val,

			      // new content.%lu
			      contentCRC,
			      contentCRC,

			      // for the pre tag id:
			      urlHash32);

	// store in output
	SafeBuf sb;
	sb.load("/tmp/diffout");
	g_qaOutput.htmlEncode ( sb.getBufStart() );
	g_qaOutput.safePrintf("</pre><br><hr>");

	// if this is zero allow it to slide by. it is learning mode i guess.
	// so we can learn what crc we need to use.
	// otherwise, stop right there for debugging
	//if ( s_expectedCRC != 0 ) exit(1);

	// keep on going
	//s_callback();
}
// . run the second spider qa test suite (hopcount-restricted crawl of
//   www.ibm.com, then facet/gbhopcount query checks)
// . driven by the s_flags[] state machine: each step sets its flag, fires a
//   request through getUrl() and returns false; the reply handler re-enters
//   this function to run the next step
// . returns true once every step has completed
bool qaspider2 ( ) {

	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      //"fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      // sitepages is a little fuzzy so take it
			      // out for this test and use hopcount!!!
			      //"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

			      "fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to
	// a few sites
	// these should auto seed so no need to use addurl
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode(//walmart has too many pages at depth 1, so
			     //remove it
			     //"tag:shallow www.walmart.com\r\n"
			     "tag:shallow http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	if ( ! s_flags[4] ) {
		//usleep(5000000); // 5 seconds
		s_flags[4] = true;
		wait(3.0);
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	if ( ! s_flags[5] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		// if not done yet, clear the wait/status flags and loop
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[4] = false;
			s_flags[14] = false;
			goto checkagain;
		}
		s_flags[5] = true;
	}

	// verify no results for gbhopcount:2 query (crawl stayed shallow)
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1310551262 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=500&"
				"q=gbhopcount%3A0",
				999 ) )
			return false;
	}

	// check facet sections query for walmart
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=0&"
				"q=gbfacetstr%3Agbxpathsitehash3311332088",
				999 ) )
			return false;
	}

	// wait for some reason
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		wait(1.5);
		return false;
	}

	// /get (cached page) with facet query, default format
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash3311332088&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// delete the collection
	// (disabled so the collection can be inspected after the run)
	// if ( ! s_flags[12] ) {
	//	s_flags[12] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	// }

	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER2 TEST");
		return true;
	}

	return true;
}
// . parse an incoming request
// . return false and set g_errno on error
// . CAUTION: we destroy "req" by replacing it's last char with a \0
// . last char must be \n or \r for it to be a proper request anyway
bool HttpRequest::set ( char *origReq , int32_t origReqLen , TcpSocket *sock ) {
	// reset number of cgi field terms
	reset();

	// reserve room for a private, NUL-terminated copy of the request;
	// all destructive parsing below happens on that copy, not on the
	// caller's buffer
	if ( ! m_reqBuf.reserve ( origReqLen + 1 ) ) {
		log("http: failed to copy request: %s",mstrerror(g_errno));
		return false;
	}

	// copy it to avoid mangling it
	m_reqBuf.safeMemcpy ( origReq , origReqLen );
	// NULL term
	m_reqBuf.pushChar('\0');
	m_reqBufValid = true;

	// and point to that
	char *req = m_reqBuf.getBufStart();
	if( !req ) {
		log(LOG_ERROR, "http: req is NULL");
		g_errno = EBADREQUEST;
		return false;
	}
	// length of the copy, excluding the NUL we just appended
	int32_t reqLen = m_reqBuf.length() - 1;

	// save requester ip / ssl state (sock may be NULL for internal use)
	m_userIP = sock ? sock->m_ip : 0;
	m_isSSL = sock ? (sock->m_ssl!=NULL) : false;

	// TcpServer should always give us a NULL terminated request
	if ( req[reqLen] != '\0' ) { g_process.shutdownAbort(true); }

	// how long is the first line, the primary request
	// int32_t i;
	// for ( i = 0 ; i<reqLen && i<MAX_REQ_LEN &&
	//       req[i]!='\n' && req[i]!='\r'; i++);

	// (urlNormCode-based request logging is disabled for now)
	// m_bufLen = urlNormCode ( m_buf , MAX_REQ_LEN - 1 , req , i );

	// ensure it's big enough to be a valid request
	if ( reqLen < 5 ) {
		log(LOG_WARN, "http: got reqlen %" PRId32"<5 = %s",reqLen,req);
		g_errno = EBADREQUEST;
		return false;
	}

	// length of the method verb WITHOUT its trailing space
	// ("GET" -> 3, "HEAD"/"POST" -> 4)
	int32_t cmdLen = 0;

	// or if first line too long
	//if ( i >= 1024 ) { g_errno = EBADREQUEST; return false; }

	// get the type, must be GET or HEAD (or POST)
	if ( strncmp ( req , "GET " , 4 ) == 0 ) {
		m_requestType = RT_GET;
		cmdLen = 3;
	}
	// these means a compressed reply was requested. use by query
	// compression proxies.
	else if ( strncmp ( req , "ZET " , 4 ) == 0 ) {
		m_requestType = RT_GET;
		cmdLen = 3;
	}
	else if ( strncmp ( req , "HEAD " , 5 ) == 0 ) {
		m_requestType = RT_HEAD;
		cmdLen = 4;
	}
	else if ( strncmp ( req , "POST " , 5 ) == 0 ) {
		m_requestType = RT_POST;
		cmdLen = 4;
	}
	else if ( strncmp ( req , "CONNECT " , 8 ) == 0 ) {
		// take this out until it stops losing descriptors and works
		//m_requestType = RT_CONNECT;
		//cmdLen = 7;
		// we no longer insert section info. emmanuel gets section
		// info when injecting a doc now i think in PageInject.cpp.
		// we do not proxy https requests because we can't
		// decrypt the page contents to cache them or to insert
		// the sectiondb voting markup, so it's kinda pointless...
		// and i'm not aiming to be a full-fledge squid proxy.
		log("http: CONNECT request not supported because we "
		    "can't insert section markup and we can't cache: %s",req);
		g_errno = EBADREQUEST;
		return false;
	}
	else {
		log("http: got bad request cmd: %s",req);
		g_errno = EBADREQUEST;
		return false;
	}

	// . NULL terminate the request (a destructive operation!)
	// . this removes the last \n in the trailing \r\n
	// . skipped for POST because the body must remain intact
	if ( m_requestType != RT_POST ) {
		req [ reqLen - 1 ] = '\0';
		reqLen--;
	}

	// POST requests can be absolutely huge if you are injecting a 100MB
	// file, so limit our strstrs to the end of the mime
	char *d = NULL;
	char dc;
	// check for body if it was a POST request; temporarily truncate the
	// request at the header/body blank line so header strstrs stay cheap
	// ("d"/"dc" are restored further below before the body is parsed)
	if ( m_requestType == RT_POST ) {
		d = strstr ( req , "\r\n\r\n" );
		if ( d ) { dc = *d; *d = '\0'; }
		else log("http: Got POST request without \\r\\n\\r\\n.");
	}

	// is it a proxy request? (absolute url on the request line)
	m_isSquidProxyRequest = false;
	if ( strncmp ( req + cmdLen + 1, "http://" ,7) == 0 ||
	     strncmp ( req + cmdLen + 1, "https://",8) == 0 ) {
		m_isSquidProxyRequest = true;
		// set url parms for it
		m_squidProxiedUrl = req + cmdLen + 1;
		char *p = m_squidProxiedUrl + 7;
		if ( *p == '/' ) p++; // https:// ?
		// stop at whitespace or \0
		for ( ; *p && ! is_wspace_a(*p) ; p++ );
		// that's the length of it
		m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	}
	// NOTE(review): RT_CONNECT is never assigned above (the CONNECT
	// branch returns an error), so this branch looks unreachable --
	// confirm before relying on it
	else if ( m_requestType == RT_CONNECT ) {
		m_isSquidProxyRequest = true;
		// set url parms for it
		m_squidProxiedUrl = req + cmdLen + 1;
		// usually its like CONNECT diffbot.com:443
		char *p = m_squidProxiedUrl;
		// stop at whitespace or \0
		for ( ; *p && ! is_wspace_a(*p) ; p++ );
		// that's the length of it
		m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	}

	// check authentication
	char *auth = NULL;
	if ( m_isSquidProxyRequest && req )
		auth = strstr(req,"Proxy-authorization: Basic ");

	//if ( m_isSquidProxyRequest && ! auth ) {
	//	log("http: no auth in proxy request %s",req);
	//	g_errno = EBADREQUEST;
	//	return false;
	//}

	SafeBuf tmp;
	if ( auth ) {
		// find end of it
		char *p = auth;
		for ( ; *p && *p != '\r' && *p != '\n' ; p++ );
		// NOTE(review): this decodes starting at the header NAME,
		// not at the base64 payload (auth + 27) -- verify that
		// base64Decode tolerates/skips the non-base64 prefix
		tmp.base64Decode ( auth , p - auth );
	}

	// assume incorrect username/password
	bool matched = false;
	if ( m_isSquidProxyRequest ) {
		// now try to match in g_conf.m_proxyAuth safebuf of
		// username:password space-separated list
		char *p = g_conf.m_proxyAuth.getBufStart();
		// loop over those
		for ( ; p && *p ; ) {
			// skip initial white space
			for ( ; *p && is_wspace_a(*p); p++ );
			// skip to end of username:password thing
			char *end = p;
			for ( ; *end && !is_wspace_a(*end); end++);
			// save
			char *start = p;
			// advance
			p = end;
			// "*:*" is a wildcard entry: always a match
			if ( end-start == 3 &&
			     strncmp(start,"*:*",3) == 0 ) {
				matched = true;
				break;
			}
			// compare now
			if ( tmp.length() != end-start ) continue;
			if ( strncmp(tmp.getBufStart(),start,end-start) != 0 )
				continue;
			// we got a match
			matched = true;
			break;
		}
	}

	// incorrect username:password?
	if ( m_isSquidProxyRequest && ! matched ) {
		log("http: bad username:password in proxy request %s",req);
		g_errno = EPERMDENIED;
		return false;
	}

	// if proxy request to download a url through us, we are done
	if ( m_isSquidProxyRequest ) return true;

	// is the POST body multipart/form-data? (affects decoding below)
	bool multipart = false;
	if ( m_requestType == 2 ) { // is POST?
		char *cd ;
		cd = gb_strcasestr(req,"Content-Type: multipart/form-data");
		if ( cd ) multipart = true;
	}

	// . point to the file path
	// . skip over the "GET "
	int32_t filenameStart = 4 ;
	// skip over extra char if it's a "HEAD " or "POST " request
	if ( m_requestType == RT_HEAD || m_requestType == RT_POST )
		filenameStart++;

	// are we a redirect?
	int32_t i = filenameStart;
	m_redirLen = 0;
	if ( strncmp ( &req[i] , "/?redir=" , 8 ) == 0 ) {
		// copy the redirect target, capped at 126 chars
		for ( int32_t k = i+8; k<reqLen && m_redirLen<126 ; k++) {
			if ( req[k] == '\r' ) break;
			if ( req[k] == '\n' ) break;
			if ( req[k] == '\t' ) break;
			if ( req[k] == ' ' ) break;
			m_redir[m_redirLen++] = req[k];
		}
	}
	m_redir[m_redirLen] = '\0';

	// find a \n space \r or ? that delimits the filename
	for ( i = filenameStart ; i < reqLen ; i++ ) {
		if ( is_wspace_a ( req [ i ] ) ) break;
		if ( req [ i ] == '?' ) break;
	}
	// now calc the filename length
	m_filenameLen = i - filenameStart;
	// return false and set g_errno if it's 0
	if ( m_filenameLen <= 0 ) {
		log("http: got filenameLen<=0: %s",req);
		g_errno = EBADREQUEST;
		return false;
	}
	// . bitch if too big
	// . leave room for strcatting "index.html" below
	if ( m_filenameLen >= MAX_HTTP_FILENAME_LEN - 10 ) {
		log("http: got filenameLen>=max");
		g_errno = EBADREQUEST;
		return false;
	}
	// . decode the filename into m_filename and reassign it's length
	// . decode %2F to / , etc...
	m_filenameLen = urlDecode(m_filename,req+filenameStart,m_filenameLen);
	// NULL terminate m_filename
	m_filename [ m_filenameLen ] = '\0';

	// does it have a file extension AFTER the last / in the filename?
	bool hasExtension = false;
	for ( int32_t j = m_filenameLen-1 ; j >= 0 ; j-- ) {
		if ( m_filename[j] == '.' ) { hasExtension = true; break; }
		if ( m_filename[j] == '/' ) break;
	}
	// if it has no file extension append a /index.html
	if ( ! hasExtension && m_filename [ m_filenameLen - 1 ] == '/' ) {
		strcat ( m_filename , "index.html" );
		m_filenameLen = strlen ( m_filename );
	}

	// . uses the TcpSocket::m_readBuf
	// . if *p was ? then keep going
	m_origUrlRequest = origReq + filenameStart;
	// NOTE(review): this scan starts at origReq + m_filenameLen, not
	// origReq + filenameStart + m_filenameLen, and m_filenameLen was
	// just reassigned by urlDecode() above -- confirm
	// m_origUrlRequestLen comes out right for HEAD/POST requests
	char *p = origReq + m_filenameLen;
	for ( ; *p && ! is_wspace_a(*p) ; p++ );
	m_origUrlRequestLen = p - m_origUrlRequest;

	// set file offset/size defaults
	m_fileOffset = 0;
	// -1 means ALL the file from m_fileOffset onwards
	m_fileSize = -1;

	// (Range: header parsing is currently disabled)
	// "e" points to where the range actually starts, if any
	//char *e;
	// . TODO: speed up by doing one strstr for Range: and maybe range:
	// . do they have a Range: 0-100\n in the mime denoting a partial get?
	//char *s = strstr ( req ,"Range:bytes=" );
	//e = s + 12;
	// try alternate formats
	//if ( ! s ) { s = strstr ( req ,"Range: bytes=" ); e = s + 13; }
	//if ( ! s ) { s = strstr ( req ,"Range: " ); e = s + 7; }
	// parse out the range if we got one
	//if ( s ) {
	//	int32_t x = 0;
	//	sscanf ( e ,"%" PRId32"-%" PRId32 , &m_fileOffset , &x );
	//	// get all file if range's 2nd number is non-existant
	//	if ( x == 0 ) m_fileSize = -1;
	//	else m_fileSize = x - m_fileOffset;
	//	// ensure legitimacy
	//	if ( m_fileOffset < 0 ) m_fileOffset = 0;
	//}

	// reset our hostname
	m_hostLen = 0;
	// assume request is NOT from local network
	//m_isMasterAdmin = false;
	m_isLocal = false;

	// get the virtual hostname they want to use
	char *s = strstr ( req ,"Host:" );
	// try alternate formats
	if ( ! s ) s = strstr ( req , "host:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	// parse out the host if we got one
	if ( s ) {
		// skip field name, host:
		s += 5;
		// skip to beginning of the host name after "host:"
		while ( *s==' ' || *s=='\t' ) s++;
		// find end of the host name
		char *end = s;
		while ( *end && !is_wspace_a(*end) ) end++;
		// . now *end should be \0, \n, \r, ' ', ...
		// . get host len
		m_hostLen = end - s;
		// truncate if too big
		if ( m_hostLen >= 255 ) m_hostLen = 254;
		// copy into hostname
		gbmemcpy ( m_host , s , m_hostLen );
	}
	// NULL terminate it
	m_host [ m_hostLen ] = '\0';

	// get Referer: field
	s = strstr ( req ,"Referer:" );
	// find another
	if ( ! s ) s = strstr ( req ,"referer:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	// assume no referer
	m_refLen = 0;
	// parse out the referer if we got one
	if ( s ) {
		// skip field name, referer:
		s += 8;
		// skip to beginning of the value after ':'
		while ( *s==' ' || *s=='\t' ) s++;
		// find end of the value
		char *end = s;
		while ( *end && !is_wspace_a(*end) ) end++;
		// . now *end should be \0, \n, \r, ' ', ...
		// . get len
		m_refLen = end - s;
		// truncate if too big
		if ( m_refLen >= 255 ) m_refLen = 254;
		// copy into m_ref
		gbmemcpy ( m_ref , s , m_refLen );
	}
	// NULL terminate it
	m_ref [ m_refLen ] = '\0';

	// get User-Agent: field
	s = strstr ( req ,"User-Agent:" );
	// find another
	if ( ! s ) s = strstr ( req ,"user-agent:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	// assume empty
	int32_t len = 0;
	// parse out the user agent if we got one
	if ( s ) {
		// skip field name
		s += 11;
		// skip to beginning of the value after ':'
		while ( *s==' ' || *s=='\t' ) s++;
		// find end of the agent name -- agents may contain spaces,
		// so only \r or \n terminates it
		char *end = s;
		while ( *end && *end!='\n' && *end!='\r' ) end++;
		// get agent len
		len = end - s;
		// truncate if too big
		if ( len > 127 ) len = 127;
		// copy into m_userAgent
		gbmemcpy ( m_userAgent , s , len );
	}
	// NULL terminate it
	m_userAgent [ len ] = '\0';

	// get Cookie: field
	s = strstr ( req, "Cookie:" );
	// find another
	if ( !s ) s = strstr ( req, "cookie:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) != '\n' ) s = NULL;
	// m_cookiePtr points INTO req at the "Cookie:" header, or NULL
	// m_cookieBufLen = 0;
	m_cookiePtr = s;
	// parse out the cookie if we got one
	if ( s ) {
		// skip field name, Cookie:
		s += 7;
		// skip s to beginning of cookie after ':'
		while ( *s == ' ' || *s == '\t' ) s++;
		// find end of the cookie
		char *end = s;
		while ( *end && *end != '\n' && *end != '\r' ) end++;
		// save length -- measured from m_cookiePtr, i.e. it
		// INCLUDES the "Cookie:" prefix
		m_cookieLen = end - m_cookiePtr;
		// get cookie len
		//m_cookieBufLen = end - s;
		// trunc if too big
		//if (m_cookieBufLen > 1023) m_cookieBufLen = 1023;
		// copy into m_cookieBuf
		//gbmemcpy(m_cookieBuf, s, m_cookieBufLen);
	}
	// NULL terminate it
	if ( m_cookiePtr ) m_cookiePtr[m_cookieLen] = '\0';
	//m_cookieBuf[m_cookieBufLen] = '\0';

	// mark it as cgi if it has a ?
	bool isCgi = ( req [ i ] == '?' ) ;
	// reset m_filename length to exclude the ?* stuff
	if ( isCgi ) {
		// skip over the '?'
		i++;
		// find a space that delimits end of cgi
		int32_t j;
		for ( j = i; j < reqLen; j++)
			if (is_wspace_a(req[j])) break;
		// now add it
		if ( ! addCgi ( &req[i] , j-i ) ) return false;
		// update i
		i = j;
	}

	// . set path ptrs
	// . the whole /cgi/14.cgi?coll=xxx&..... thang
	m_path = req + filenameStart;
	m_plen = i - filenameStart;

	// we're local if the requester ip is in a private range
	if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0)
		m_isLocal = true;
	if ( sock && strncmp(iptoa(sock->m_ip),"10.",3) == 0)
		m_isLocal = true;

	// gotta scan all ips in hosts.conf as well...
	// if we are coming from any of our own hosts.conf c blocks
	// consider ourselves local
	uint32_t last = 0;
	for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
		Host *h = g_hostdb.getHost(i);
		// save time with this check
		if ( h->m_ip == last ) continue;
		// update it
		last = h->m_ip;
		// returns number of top bytes in comon
		int32_t nt = sock ? ipCmp ( sock->m_ip , h->m_ip ) : 0;
		// at least be in the same c-block as a host in hosts.conf
		if ( nt < 3 ) continue;
		m_isLocal = true;
		break;
	}

	// 127.0.0.1 (loopback packed as a little-endian int) is local too
	if ( sock && sock->m_ip == 16777343 ) m_isLocal = true;

	// . now add any cgi data from a POST.....
	// . "d" was set above when we truncated the request at \r\n\r\n
	if ( d ) {
		// now put d's char back, just in case... does it really
		// matter?
		*d = dc;
		char *post = d + 4;
		int32_t postLen = reqLen-(d+4-req) ;
		// post sometimes has a \r or \n after it
		while ( postLen > 0 && post[postLen-1]=='\r' ) postLen--;
		// add it to m_cgiBuf, filter and everything
		if ( ! addCgi ( post , postLen ) ) return false;
	}

	// Put '\0' back into the HttpRequest buffer...
	// crap, not if we are multi-part unencoded stuff...
	if ( m_cgiBuf && ! multipart ) {
		// do not mangle the "ucontent"!
		int32_t cgiBufLen = m_cgiBufLen;
		cgiBufLen -= m_ucontentLen;
		char *buf = m_cgiBuf;
		// split fields on '&' so parseFields() sees NUL-separated
		// name=value terms
		for (int32_t i = 0; i < cgiBufLen ; i++)
			if (buf[i] == '&') buf[i] = '\0';
		// don't decode the ucontent= field!
		int32_t decodeLen = m_cgiBufLen;
		// so subtract that
		if ( m_ucontent ) decodeLen -= m_ucontentLen;
		// decode everything. fixed for %00 in &content= so it
		// doesn't set our parms when injecting.
		int32_t len = urlDecodeNoZeroes(m_cgiBuf,m_cgiBuf,decodeLen);
		// we're parsing crap after the null if the last parm
		// has no value
		//memset(m_cgiBuf+len, '\0', m_cgiBufLen-len);
		m_cgiBufLen = len;
		// ensure that is null i guess
		if ( ! m_ucontent ) m_cgiBuf[len] = '\0';
	}

	if (m_cgiBuf2){
		char *buf = m_cgiBuf2;
		for (int32_t i = 0; i < m_cgiBuf2Size-1 ; i++)
			if (buf[i] == '&') buf[i] = '\0';
		// decode everything. fixed for %00 in &content= so it
		// doesn't set our parms when injecting.
		int32_t len = urlDecodeNoZeroes ( m_cgiBuf2 ,
						  m_cgiBuf2 ,
						  m_cgiBuf2Size);
		memset(m_cgiBuf2+len, '\0', m_cgiBuf2Size-len);
	}

	// . parse the fields after the ? in a cgi filename
	// . or fields in the content if it's a POST
	// . m_cgiBuf must be and is NULL terminated for this
	parseFields ( m_cgiBuf , m_cgiBufLen );

	// Add extra parms to the request.
	if (m_cgiBuf2Size){
		parseFields(m_cgiBuf2, m_cgiBuf2Size);
	}

	// urldecode the cookie buf too!!
	if ( m_cookiePtr ) {
		char *p = m_cookiePtr;
		for (int32_t i = 0; i < m_cookieLen ; i++) {
			//if (p[i] == '&') p[i] = '\0';
			// cookies are separated with ';' in the request only
			if (p[i] == ';') p[i] = '\0';
			// a hack for the metacookie=....
			// which uses &'s to separate its subcookies
			// this is a hack for msie's limit of 50 cookies
			if ( p[i] == '&' ) p[i] = '\0';
			// set m_metaCookie to start of meta cookie
			// NOTE(review): the strncmp compares "p" (start of
			// cookie), not "&p[i]", so this can only match at
			// the very beginning -- confirm intent
			if ( p[i] == 'm' && p[i+1] == 'e' &&
			     strncmp(p,"metacookie",10) == 0 )
				m_metaCookie = p;
		}
		int32_t len = urlDecode ( m_cookiePtr ,
					  m_cookiePtr,
					  m_cookieLen );
		// we're parsing crap after the null if the last parm
		// has no value
		memset(m_cookiePtr+len, '\0', m_cookieLen-len);
		m_cookieLen = len;
	}
	return true;
}
// . form an HTTP request // . use size 0 for HEAD requests // . use size -1 for GET whole doc requests // . fill in your own offset/size for partial GET requests // . returns false and sets g_errno on error // . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not) bool HttpRequest::set (char *url,int32_t offset,int32_t size,time_t ifModifiedSince, const char *userAgent, const char *proto, bool doPost, const char *cookieJar, const char *additionalHeader, // if posting something, how many bytes is it? int32_t postContentLen , // are we sending the request through an http proxy? // if so this will be non-zero int32_t proxyIp , const char *proxyUsernamePwd ) { m_reqBufValid = false; int32_t hlen ; int32_t port = 80; const char *hptr = getHostFast ( url , &hlen , &port ); char *path = getPathFast ( url ); // . use the full url if sending to an http proxy // . HACK: do NOT do this if it is httpS because we end up // using the http tunnel using the CONNECT cmd and the squid proxy // will just forward/proxy just the entire tcp packets. if ( proxyIp && strncmp(url,"https://",8) != 0 ) path = url; char *pathEnd = NULL; const char *postData = NULL; if ( doPost ) { pathEnd = strstr(path,"?"); if ( pathEnd ) { *pathEnd = '\0'; postData = pathEnd + 1; } } // if no legit host if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; } // sanity check. 
port is only 16 bits if ( port > (int32_t)0xffff ) { g_errno = EBADURL; return false; } // return false and set g_errno if url too big //if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) { // g_errno = EURLTOOBIG; return false;} // assume request type is a GET m_requestType = RT_GET;//0; // get the host NULL terminated char host[1024+8]; //int32_t hlen = url->getHostLen(); strncpy ( host , hptr , hlen ); host [ hlen ] = '\0'; // then port //uint16_t port = url->getPort(); if ( port != 80 ) { sprintf ( host + hlen , ":%" PRIu32 , (uint32_t)port ); hlen += strlen ( host + hlen ); } // the if-modified-since field const char *ims = ""; #if 0 char ibuf[64]; if ( ifModifiedSince ) { struct tm tm_buf; char buf[64]; // NOTE: ctime appends a \n snprintf(ibuf, sizeof(ibuf), "If-Modified-Since: %s UTC", asctime_r(gmtime_r(&ifModifiedSince,&tm_buf),buf)); // get the length int32_t ilen = strlen(ibuf); if( ilen && ilen < (int32_t)sizeof(ibuf)-1 ) { // hack off \n from ctime - replace with \r\n\0 ibuf [ ilen - 1 ] = '\r'; ibuf [ ilen ] = '\n'; ibuf [ ilen + 1 ] = '\0'; // set ims to this string ims = ibuf; } } // . until we fix if-modified-since, take it out // . seems like we are being called with it as true when should not be ims=""; #endif // . use one in conf file if caller did not provide // . this is usually Gigabot/1.0 if ( ! 
userAgent ) userAgent = g_conf.m_spiderUserAgent; // accept only these const char *accept = "*/*"; /* "text/html, " "text/plain, " "text/xml, " "application/pdf, " "application/msword, " "application/vnd.ms-excel, " "application/mspowerpoint, " "application/postscript"; */ const char *cmd = "GET"; if ( size == 0 ) cmd = "HEAD"; if ( doPost ) cmd = "POST"; // crap, can't spider nyt.com if we are 1.0, so use 1.0 but also // note Connection: Close\r\n when making requests //proto = "HTTP/1.1"; SafeBuf tmp; const char *up = ""; if ( proxyUsernamePwd && proxyUsernamePwd[0] ) { tmp.safePrintf("Proxy-Authorization: Basic "); tmp.base64Encode (proxyUsernamePwd,strlen(proxyUsernamePwd)); tmp.safePrintf("\r\n"); up = tmp.getBufStart(); } // . now use "Accept-Language: en" to tell servers we prefer english // . i removed keep-alive connection since some connections close on // non-200 ok http statuses and we think they're open since close // signal (read 0 bytes) may have been delayed const char* acceptEncoding = ""; // the scraper is getting back gzipped search results from goog, // so disable this for now // i am re-enabling now for testing... if(g_conf.m_gzipDownloads) acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n"; // i thought this might stop wikipedia from forcing gzip on us // but it did not! 
// else // acceptEncoding = "Accept-Encoding:\r\n"; // char *p = m_buf; // init the safebuf to point to this buffer in our class to avoid // a potential alloc // m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false, csUTF8 ); m_reqBuf.purge(); // indicate this is good m_reqBufValid = true; if ( size == 0 ) { // 1 for HEAD requests m_requestType = RT_HEAD; m_reqBuf.safePrintf ( "%s %s %s\r\n" "Host: %s\r\n" "%s" "User-Agent: %s\r\n" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" "Accept-Language: en\r\n" //"Accept: */*\r\n\r\n" , "Accept: %s\r\n" "%s" , cmd, path , proto, host , ims , userAgent , accept , up ); } else if ( size != -1 ) m_reqBuf.safePrintf ( "%s %s %s\r\n" "Host: %s\r\n" "%s" "User-Agent: %s\r\n" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" "Accept-Language: en\r\n" //"Accept: */*\r\n" "Accept: %s\r\n" "Range: bytes=%" PRId32"-%" PRId32"\r\n" "%s" , cmd, path , proto , host , ims , userAgent , accept , offset , offset + size , up); else if ( offset > 0 ) // size is -1 m_reqBuf.safePrintf ( "%s %s %s\r\n" "Host: %s\r\n" "%s" "User-Agent: %s\r\n" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" "Accept-Language: en\r\n" //"Accept: */*\r\n" "Accept: %s\r\n" "Range: bytes=%" PRId32"-\r\n" "%s" , cmd, path , proto , host , ims , userAgent , accept , offset , up ); // Wget's request: // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n // firefox's request: // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: 
__utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n else { // until we fix if-modified-since, take it out //ims=""; //userAgent = "Wget/1.10.2"; //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7"; //proto = "HTTP/1.0"; m_reqBuf.safePrintf ( "%s %s %s\r\n" "User-Agent: %s\r\n" "Accept: */*\r\n" "Host: %s\r\n" "%s" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" //"Accept-Language: en\r\n" "%s" "%s" , //"Accept: %s\r\n\r\n" , //"\r\n", cmd, path , proto , userAgent , host , ims , acceptEncoding, up ); //accept ); } if ( additionalHeader ) m_reqBuf.safePrintf("%s\r\n",additionalHeader ); // cookie here if (cookieJar) { HttpMime::addCookieHeader(cookieJar, url, &m_reqBuf); } // print content-length: if post if ( postData ) { // dammit... recaptcha does not work without this!!!! m_reqBuf.safePrintf ( "Content-Type: " "application/x-www-form-urlencoded\r\n"); } // we need this if doing a post even if postData is NULL if ( doPost ) { int32_t contentLen = 0; if ( postData ) contentLen = strlen(postData); // this overrides if provided. -1 is default if ( postContentLen >= 0 ) contentLen = postContentLen; m_reqBuf.safePrintf ("Content-Length: %" PRId32"\r\n", contentLen ); m_reqBuf.safePrintf("\r\n"); if ( postData ) m_reqBuf.safePrintf("%s",postData); // log it for debug //log("captch: %s",m_buf); } if ( ! doPost ) { // ! postData ) { m_reqBuf.safePrintf("\r\n"); } // restore url buffer if ( pathEnd ) *pathEnd = '?'; return true; }
bool Log::init ( char *filename ) { // set the main process id //s_pid = getpidtid(); setPid(); // init these m_numErrors = 0; m_bufPtr = 0; m_fd = -1; m_disabled = false; #ifdef DEBUG g_dbufSize = 4096; g_dbuf = (char*)mmalloc(g_dbufSize,"Log: DebugBuffer"); if (!g_dbuf) fprintf(stderr, "Unable to init debug buffer"); #endif // m_hostname = g_conf.m_hostname; // m_port = port; // is there a filename to log our errors to? m_filename = filename; if ( ! m_filename ) return true; // skip this for now //return true; // // RENAME log000 to log000-2013_11_04-18:19:32 // if ( g_conf.m_runAsDaemon ) { File f; char tmp[16]; sprintf(tmp,"log%03li",g_hostdb.m_hostId); f.set ( g_hostdb.m_dir , tmp ); // make new filename like log000-2013_11_04-18:19:32 time_t now = getTimeLocal(); tm *tm1 = gmtime((const time_t *)&now); char tmp2[64]; strftime(tmp2,64,"%Y_%m_%d-%T",tm1); SafeBuf newName; if ( ! newName.safePrintf ( "%slog%03li-%s", g_hostdb.m_dir, g_hostdb.m_hostId, tmp2 ) ) { fprintf(stderr,"log rename failed\n"); return false; } // rename log000 to log000-2013_11_04-18:19:32 if ( f.doesExist() ) { //fprintf(stdout,"renaming file\n"); f.rename ( newName.getBufStart() ); } } // open it for appending. // create with -rw-rw-r-- permissions if it's not there. m_fd = open ( m_filename , O_APPEND | O_CREAT | O_RDWR , S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ); if ( m_fd >= 0 ) return true; // bitch to stderr and return false on error fprintf(stderr,"could not open log file %s for appending\n", m_filename); return false; }
// . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotTitleRec ( void *state ) { // cast the State4 out State4 *st = (State4 *) state; // get the socket TcpSocket *s = st->m_socket; SafeBuf sb; // get it's docId long long docId = st->m_docId; // make the query string for passing to different hosts char qs[64]; sprintf(qs,"&d=%lli",docId); if ( docId==0LL ) qs[0] = 0; // print standard header sb.reserve2x ( 32768 ); g_pages.printAdminTop (&sb, st->m_socket, &st->m_r ); //PAGE_TITLEDB, // st->m_username,//NULL , // st->m_coll , st->m_pwd , s->m_ip , qs ); // shortcut XmlDoc *xd = &st->m_xd; // . deal with errors // . print none if non title rec at or after the provided docId if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) { // print docId in box sb.safePrintf ( "<center>\nEnter docId: " "<input type=text name=d value=%lli size=15>", docId); sb.safePrintf ( "</form><br>\n" ); if ( docId == 0 ) sb.safePrintf("<br>"); else if ( g_errno ) sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno)); else sb.safePrintf("<br><br>No titleRec for that docId " "or higher"); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "\n</center>" ); mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage ( s , sb.getBufStart(), sb.length() ); } // print docId in box sb.safePrintf ("<br>\n" "<center>Enter docId: " "<input type=text name=d value=%lli size=15>", docId ); // print where it should be //unsigned long gid = 
getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "</form><br>\n" ); //char *coll = st->m_coll; Title *ti = xd->getTitle(); if ( ! ti ) { log ( "admin: Could not set title" ); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // sanity check. should not block if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; } // print it out xd->printDoc ( &sb ); // don't forget to cleanup mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length()); }
//
// the injection qa test suite
//
// . a re-entrant state machine: each step marks its s_flags[] slot done
//   BEFORE issuing its request, and any getUrl()/wait() that blocks makes
//   us return false; the completion callback re-enters this function and
//   already-finished steps are skipped
// . s_flags[20] and s_flags[5] double as loop cursors over the url list
// . returns true only once the whole test has completed
bool qainject1 ( ) {
	//if ( ! s_callback ) s_callback = qainject1;
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}
	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}
	// this only loads once
	loadUrls();
	long max = s_ubuf2.length()/(long)sizeof(char *);
	//max = 1;
	//
	// inject urls, return false if not done yet
	//
	//static bool s_x4 = false;
	if ( ! s_flags[2] ) {
		// TODO: try delimeter based injection too
		//static long s_ii = 0;
		for ( ; s_flags[20] < max ; ) {
			// inject using html api
			SafeBuf sb;
			sb.safePrintf("&c=qatest123&deleteurl=0&"
				      "format=xml&u=");
			sb.urlEncode ( s_urlPtrs[s_flags[20]] );
			// the content
			sb.safePrintf("&hasmime=1");
			// sanity
			//if ( strstr(s_urlPtrs[s_flags[20]],"wdc.htm") )
			//	log("hey");
			sb.safePrintf("&content=");
			sb.urlEncode(s_contentPtrs[s_flags[20]] );
			sb.nullTerm();
			// pre-inc it in case getUrl() blocks
			s_flags[20]++;//ii++;
			if ( ! getUrl("/admin/inject",
				      0, // no idea what crc to expect
				      sb.getBufStart()) )
				return false;
		}
		s_flags[2] = true;
	}
	// +the
	//static bool s_x5 = false;
	if ( ! s_flags[3] ) {
		wait(1.5);
		s_flags[3] = true;
		return false;
	}
	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}
	// sports news
	//static bool s_x7 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=sports+news",2009472889 ) )
			return false;
	}
	// 'washer & dryer' does some algorithmic synonyms 'washer and dryer'
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"debug=1&q=washer+%26+dryer",9999 ) )
			return false;
	}
	//
	// mdw: query reindex test
	//
	// if ( ! s_flags[30] ) {
	//	s_flags[30] = true;
	//	if ( ! getUrl ( "/admin/reindex?c=qatest123&qa=1&format=xml&"
	//			"debug=1&q=sports",9999 ) )
	//		return false;
	// }
	//
	// temp end it here
	// return true;
	//
	// eject/delete the urls
	//
	//static long s_ii2 = 0;
	for ( ; s_flags[5] < max ; ) {
		// reject using html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
			       "format=xml&u=");
		sb.urlEncode ( s_urlPtrs[s_flags[5]] );
		sb.nullTerm();
		// pre-inc it in case getUrl() blocks
		//s_ii2++;
		s_flags[5]++;
		if ( ! getUrl ( sb.getBufStart() , 0 ) )
			return false;
	}
	//
	// make sure no results left, +the
	//
	if ( ! s_flags[6] ) {
		wait(1.5);
		s_flags[6] = true;
		return false;
	}
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=2&format=xml&q=%2Bthe",
				-1672870556 ) )
			return false;
	}
	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA INJECT TEST 1");
		//if ( s_callback == qainject ) exit(0);
		return true;
	}
	return true;
}
// // new code for drawing graph in html with absolute divs instead // of using GIF plotter library which had issues // void Stats::printGraphInHtml ( SafeBuf &sb ) { // gif size char tmp[64]; sprintf ( tmp , "%lix%li", (long)DX+40 , (long)DY+40 ); // "1040x440" // 20 pixel borders //int bx = 10; //int by = 30; // define the space with boundaries 100 unit wide boundaries //plotter.space ( -bx , -by , DX + bx , DY + by ); // draw the x-axis //plotter.line ( 0 , 0 , DX , 0 ); // draw the y-axis //plotter.line ( 0 , 0 , 0 , DY ); // find time ranges long long t2 = 0; for ( long i = 0 ; i < MAX_POINTS ; i++ ) { // skip empties if ( m_pts[i].m_startTime == 0 ) continue; // set min/max if ( m_pts[i].m_endTime > t2 ) t2 = m_pts[i].m_endTime; } // now compute the start time for the graph long long t1 = 0x7fffffffffffffffLL; // now recompute t1 for ( long i = 0 ; i < MAX_POINTS ; i++ ) { // skip empties if ( m_pts[i].m_startTime == 0 ) continue; // can't be behind more than 1 second if ( m_pts[i].m_startTime < t2 - DT ) continue; // otherwise, it's a candidate for the first time if ( m_pts[i].m_startTime < t1 ) t1 = m_pts[i].m_startTime; } // // main graphing window // sb.safePrintf("<div style=\"position:relative;" "background-color:#c0c0c0;" // match style of tables "border-radius:10px;" "border:#6060f0 2px solid;" //"overflow-y:hidden;" "overflow-x:hidden;" "z-index:-10;" // the tick marks we print below are based on it // being a window of the last 20 seconds... and using // DX pixels "min-width:%lipx;" "min-height:%lipx;" //"width:100%%;" //"min-height:600px;" //"margin-top:10px;" "margin-bottom:10px;" //"margin-right:10px;" //"margin-left:10px;" "\">" ,(long)DX ,(long)DY +20); // add 10 more for "2s" labels etc. 
// 10 x-axis tick marks for ( int x = DX/20 ; x <= DX ; x += DX/20 ) { // tick mark //plotter.line ( x , -20 , x , 20 ); sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:0;" "background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\"></div>\n" , (long)x-1 ); // generate label //char buf [ 32 ]; //sprintf ( buf , "%li" , // (long)(DT * (long long)x / (long long)DX) ); // LABEL sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:20;" //"background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\">%lis</div>\n" , (long)x-10 // the label: ,(long)(DT * (long long)x / (long long)DX)/1000 ); // move cursor //plotter.move ( x , -by / 2 - 9 ); // plot label //plotter.alabel ( 'c' , 'c' , buf ); } // . each line consists of several points // . we need to know each point for adding otherlines // . is about [400/6][1024] = 70k // . each line can contain multiple data points // . each data point is expressed as a horizontal line segment void *lrgBuf; long lrgSize = 0; lrgSize += MAX_LINES * MAX_POINTS * sizeof(StatPoint *); lrgSize += MAX_LINES * sizeof(long); lrgBuf = (char *) mmalloc(lrgSize, "Stats.cpp"); if (! lrgBuf) { log("could not allocate memory for local buffer in Stats.cpp" "%li bytes needed", lrgSize); return; } char *lrgPtr = (char *)lrgBuf; StatPoint **points = (StatPoint **)lrgPtr; lrgPtr += MAX_LINES * MAX_POINTS * sizeof(StatPoint *); long *numPoints = (long *)lrgPtr; lrgPtr += MAX_LINES * sizeof(long); memset ( (char *)numPoints , 0 , MAX_LINES * sizeof(long) ); // store the data points into "lines" long count = MAX_POINTS; for ( long i = m_next ; count >= 0 ; i++ , count-- ) { // wrap around the array if ( i >= MAX_POINTS ) i = 0; // skip point if empty if ( m_pts[i].m_startTime == 0 ) continue; // skip if too early if ( m_pts[i].m_endTime < t1 ) continue; // . find the lowest line the will hold us // . 
this adds point to points[x][n] where x is determined addPoint ( points , numPoints , &m_pts[i] ); } int y1 = 21; // plot the points (lines) in each line for ( long i = 0 ; i < MAX_LINES ; i++ ) { // increase vert y1 += MAX_WIDTH + 1; // wrap back down if necessary if ( y1 >= DY ) y1 = 21; // plt all points in this row for ( long j = 0 ; j < numPoints[i] ; j++ ) { // get the point StatPoint *p = points[MAX_POINTS * i + j]; // transform time to x coordinates int x1 = (p->m_startTime - t1) * (long long)DX / DT; int x2 = (p->m_endTime - t1) * (long long)DX / DT; // if x2 is negative, skip it if ( x2 < 0 ) continue; // if x1 is negative, boost it to -2 if ( x1 < 0 ) x1 = -2; // . line thickness is function of read/write size // . take logs int w = (int)log(((double)p->m_numBytes)/8192.0) + 3; //log("log of %li is %i",m_pts[i].m_numBytes,w); if ( w < 3 ) w = 3; if ( w > MAX_WIDTH ) w = MAX_WIDTH; //plotter.linewidth ( w ); // use the color specified from addStat_r() for this line/pt //plotter.pencolor ( ((p->m_color >> 16) & 0xff) << 8 , // ((p->m_color >> 8) & 0xff) << 8 , // ((p->m_color >> 0) & 0xff) << 8 ); // ensure at least 3 units wide for visibility if ( x2 < x1 + 3 ) x2 = x1 + 3; // . flip the y so we don't have to scroll the browser down // . DY does not include the axis and tick marks long fy1 = DY - y1 + 20 ; // plot it //plotter.line ( x1 , fy1 , x2 , fy1 ); drawLine2 ( sb , x1 , x2 , fy1 , p->m_color , w ); // debug msg //log("line (%i,%i, %i,%i) ", x1 , vert , x2 , vert ); //log("bytes = %li width = %li ", m_pts[i].m_numBytes,w); //log("st=%i, end=%i color=%lx " , // (int)m_pts[i].m_startTime , // (int)m_pts[i].m_endTime , // m_pts[i].m_color ); } } sb.safePrintf("</div>\n"); mfree(lrgBuf, lrgSize, "Stats.cpp"); }
// . second injection qa test: delimiter-based multi-doc injection from
//   the local ./injectme3 file, followed by search queries with pinned
//   reply checksums, then collection teardown
// . same re-entrant s_flags[] state-machine protocol as qainject1:
//   returns false while a stage is blocked/in progress, true when done
bool qainject2 ( ) {

	//if ( ! s_callback ) s_callback = qainject2;

	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	//
	// try delimeter based injecting
	//
	//static bool s_y2 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123&deleteurl=0&"
			      "delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
			      "hasmime=1&content=");
		// use injectme3 file
		SafeBuf ubuf;
		ubuf.load("./injectme3");
		sb.urlEncode(ubuf.getBufStart());
		if ( ! getUrl ( "/admin/inject",
				// check reply, seems to have only a single
				// docid in it
				-1970198487,
				sb.getBufStart()) )
			return false;
	}

	// now query check; give the injection a moment to settle
	//static bool s_y4 = false;
	if ( ! s_flags[8] ) {
		wait(1.5);
		s_flags[8] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				-1804253505 ) )
			return false;
	}

	//static bool s_y5 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=1" ,-1874756636 ) )
			return false;
	}

	//static bool s_y6 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&hacr=1" ,1651330319 ) )
			return false;
	}

	//static bool s_y7 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&sc=1" ,-1405546537 ) )
			return false;
	}

	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA INJECT TEST 2");
		//if ( s_callback == qainject ) exit(0);
		return true;
	}

	return true;
}
// . serves the cached copy of a document (the /get "cached page" view)
//   back on the TcpSocket in "st", optionally with query-term
//   highlighting, a disclaimer header, and XML/JSON envelopes
// . returns false if blocked, true otherwise
// . re-entered as a callback each time a blocking stage (title-rec load,
//   noarchive check, utf8 content fetch, rainbow-section print)
//   completes; finished stages fall through on re-entry
// . frees "st" (via sendErrorReply or explicitly) before returning true
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// callback: re-enter this function when the load completes
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// now force it to load old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isMasterAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;

	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host, needed below to build the click-and-scroll base url
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %"INT32" is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}

	char *content = xd->ptr_utf8Content;
	int32_t contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	int32_t startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");

	// base href: prefer the redirect url if the doc redirected
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
	}

	// the html header above does not apply to structured formats
	if ( format == FORMAT_XML  ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();
	if ( xd->m_contentType == CT_JSON   ) sb->reset();
	if ( xd->m_contentType == CT_XML    ) sb->reset();
	if ( xd->m_contentType == CT_STATUS ) sb->reset();

	// for undoing the stuff below
	int32_t startLen2 = sb->length();

	// query should be NULL terminated
	char *q = st->m_qsb.getBufStart();
	int32_t qlen = st->m_qsb.getLength();

	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128]  = "font-size:14px;font-weight:400;"
			       "color:#000000;";
	char styleLink[128]  = "font-size:14px;font-weight:400;"
			       "color:#0000ff;";
	char styleTell[128]  = "font-size:14px;font-weight:600;"
			       "color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON   ) printDisclaimer = false;
	if ( xd->m_contentType == CT_STATUS ) printDisclaimer = false;
	if ( format == FORMAT_XML  ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	// format the spider date in GMT if anything below needs it
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	if ( printDisclaimer ) {
		sb->safePrintf(
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR
			       "\" cellpadding=\"10\" "
			       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			       "<tr"
			       "><td>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       "" , styleTitle, f->getUrl(), styleLink,
			       f->getUrl() );
		// then the rest
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content of "
			       "this page.</span>", styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%"INT32"&"
				"d=%"INT64"&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll ,
				(int32_t)st->m_rtq,
				st->m_docId, styleLink );

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
		}
	}

	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	uint32_t ip   = h->m_ip;
	uint16_t port = h->m_httpPort;
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	int32_t elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	sprintf ( x, "&d=%"INT64"",st->m_docId );
	x += gbstrlen(x);

	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q ,    // content being highlighted, utf8
		 qlen , // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true , // computeIds
		 false ); // hasHtmlEntities?
	// declare up here
	Matches m;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	m.addMatches ( &qw );
	int32_t hilen = 0;

	// and highlight the matches in the disclaimer header
	if ( printDisclaimer ) {
		hilen = hi.set ( sb ,
				 &qw , // words to highlight
				 &m ,  // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll
				 (char *)thisUrl );// base url for ClcknScrll
		// now an hr
		sb->safeStrcpy("</span></table></table>\n");
	}

	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON   ) includeHeader = false;
	if ( xd->m_contentType == CT_XML    ) includeHeader = false;
	if ( xd->m_contentType == CT_STATUS ) includeHeader = false;
	if ( format == FORMAT_XML  ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;
		else                         sb->m_length=startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%"UINT64"</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%"INT32"</cachedTimeUTC>\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%"UINT64",\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%"INT32",\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	char ctype = (char)xd->m_contentType;
	// do not calc title or print it if doc is xml or json
	if ( ctype == CT_XML    ) sbend = sbstart;
	if ( ctype == CT_JSON   ) sbend = sbstart;
	if ( ctype == CT_STATUS ) sbend = sbstart;

	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		int32_t printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%"INT64"&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			int32_t tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );
	}

	// is the content preformatted?
	bool pre = false;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML  ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {
		sb->safePrintf("<pre>");
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(int32_t)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON   ) queryHighlighting = false;
	if ( xd->m_contentType == CT_STATUS ) queryHighlighting = false;

	// structured formats encode the content through "tmp" so it can be
	// cdata/json escaped into sb below; html writes into sb directly
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML  ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		xb->nullTerm();
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb ,
				 &ww , &m ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		// BUGFIX: was "\t\"content\":\"\n" -- the raw newline landed
		// INSIDE the json string value, which RFC 8259 forbids
		// (unescaped control chars), making the reply invalid json
		sb->safePrintf("\t\"content\":\"");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}

	// if it is content-type text, add a </pre>
	if ( pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
	}

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON   ) contentType = "application/json";
	if ( xd->m_contentType == CT_STATUS ) contentType = "application/json";
	// NOTE(review): "test/xml" looks like a typo for "text/xml", but the
	// disabled line above suggests a bogus type may be deliberate to keep
	// the browser from xml-rendering -- confirm intent before changing
	if ( xd->m_contentType == CT_XML ) contentType = "test/xml";
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// and convey the status
	return status;
}
// . spider qa test: configures url filters and a site list, seeds
//   walmart.com/ibm.com via the addurl api, polls the spider status
//   page until "Nothing currently available to spider", then runs a
//   battery of hopcount/facet queries with pinned reply checksums
// . same re-entrant s_flags[] state-machine protocol as the inject
//   tests: returns false while blocked/in progress, true when complete
bool qaspider1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}
	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}
	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	//static bool s_z1 = false;
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"
			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
			      // take out hopcount for now, just test quotas
			      // "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
			      // just one spider out allowed for consistency
			      "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}
	// set the site list to
	// a few sites
	//static bool s_z2 = false;
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
			     "tag:shallow site:http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}
	//
	// use the add url interface now
	// walmart.com above was not seeded because of the site: directive
	// so this will seed it.
	//
	//static bool s_y2 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com"
			      );
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		//sb.urlEncode(s_urls1);
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}
	//
	// wait for spidering to stop
	//
 checkagain:
	// wait until spider finishes. check the spider status page
	// in json to see when completed
	//static bool s_k1 = false;
	if ( ! s_flags[5] ) {
		// wait 5 seconds, call sleep timer... then call qatest()
		//usleep(5000000); // 5 seconds
		wait(3.0);
		s_flags[5] = true;
		return false;
	}
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}
	//static bool s_k2 = false;
	if ( ! s_flags[6] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		// s_content holds the reply of the status fetch above; if
		// spiders are still running, clear both poll flags and loop
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[5] = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}
	// wait for index msg4 to not be cached to ensure all results indexed
	if ( ! s_flags[22] ) {
		s_flags[22] = true;
		wait(1.5);
	}
	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1672870556 ) )
			return false;
	}
	// but some for gbhopcount:0 query
	//static bool s_t0 = false;
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A0",
				908338607 ) )
			return false;
	}
	// check facet sections query for walmart
	//static bool s_y5 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
				"q=gbfacetstr%3Agbxpathsitehash2492664135",
				55157060 ) )
			return false;
	}
	//static bool s_y6 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}
	// in xml
	//static bool s_y7 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}
	// and json
	//static bool s_y8 = false;
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}
	// delete the collection
	//static bool s_fee = false;
	// if ( ! s_flags[13] ) {
	//	s_flags[13] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	// }
	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=site2%3Awww.walmart.com+"
				"gbsortby%3Agbspiderdate",
				999 ) )
			return false;
	}
	// xpath is like a title here i think. check the returned
	// facet table in the left column
	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
				"q=gbfacetstr%3Agbxpathsitehash3624590799" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetint%3Agbhopcount" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
				"q=gbfacetint%3Alog.score" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[21] ) {
		s_flags[21] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetfloat%3Atalks.rating" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[23] ) {
		s_flags[23] = true;
		// test facets mixed with gigabits in left hand column
		if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
				"q=gbfacetint%3Agbhopcount+walmart" ,
				999 ) )
			return false;
	}
	//static bool s_fee2 = false;
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER1 TEST");
		return true;
	}
	return true;
}
bool processLoop ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; if ( st->m_u && st->m_u[0] ) { // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp CollectionRec *cr = xd->getCollRec(); if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") ) // use same dir that XmlDoc::getTestDir() would use saveTestBuf ( "test-page-parser" ); // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf char *metalist = xd->getMetaList ( ); if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked if ( metalist == (void *)-1 ) return false; // for debug... if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // print it out xd->printDoc( xbuf ); } // print reason we can't analyze it (or index it) //if ( st->m_indexCode != 0 ) { // xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>", // mstrerror(st->m_indexCode)); //} // we are done g_inPageParser = false; // print the final tail //p += g_httpServer.printTail ( p , pend - p ); //log("parser: send sock=%li",st->m_s->m_sd); // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE(review): the old gotIndexList2() body and its callback wrappers
//   were merged into this function by commenting them out below; the live
//   code resumes after the first closing */
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;

	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0 ,
				    0 ,
				    0 , // max cache age
				    false , // add to cache?
				    RDB_DATEDB , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2 ,
				    (char *)&startKey ,
				    (char *)&endKey ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree , // include tree?
				    //st->m_useCache , // include cache?
				    //false , // add to cache?
				    //0 , // startFileNum
				    //numFiles , // numFiles
				    st , // state
				    gotIndexListWrapper2 ,
				    0 ) ) // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}

void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/

	// get the socket
	TcpSocket *s = st->m_socket;

	// don't allow pages bigger than 128k in cache
	//char buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;

	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/

	// check box " checked" strings for the admin form below
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb ) ubs = " checked";
	if ( st->m_useTree   ) uts = " checked";
	if ( st->m_useDisk   ) uds = " checked";
	if ( st->m_useCache  ) ucs = " checked";
	if ( st->m_add       ) add = " checked";
	if ( st->m_del       ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf (
		"<center>\n"
		"<table cellpadding=2><tr><td colspan=4>"
		"useDatedb:<input type=checkbox value=1 name=ub%s> "
		"useTree:<input type=checkbox value=1 name=ut%s> "
		"useDisk:<input type=checkbox value=1 name=ud%s> "
		"useCache:<input type=checkbox value=1 name=uc%s> "
		"ADD:<input type=checkbox value=1 name=add%s> "
		"DELETE:<input type=checkbox value=1 name=del%s>"
		"</td></tr><tr><td>"
		"query:"
		"</td><td>"
		"<input type=text name=q value=\"%s\" size=20>"
		"</td><td>"
		"collection:"
		"</td><td>"
		"<input type=text name=c value=\"%s\" size=10>"
		"</td></tr><tr><td>"
		"termId:"
		"</td><td>"
		"<input type=text name=t value=%lli size=20>"
		"</td><td>"
		"numRecs:"
		"</td><td>"
		"<input type=text name=numRecs value=%li size=10> "
		"</td></tr><tr><td>"
		"docId:"
		"</td><td>"
		"<input type=text name=d value=%lli size=20> "
		"</td><td>"
		"score:"
		"</td><td>"
		"<input type=text name=score value=%li size=10> "
		"</td><td>"
		"<input type=submit value=ok border=0>"
		"</td></tr>"
		"<tr><td colspan=2>"
		"term appears in about %lli docs +/- %li"
		"</td></tr>"
		//"<tr><td colspan=2>"
		//"this indexlist held by host #%li and twins"
		//"</td></tr>"
		"</table>"
		"</form><br><br>" ,
		ubs, uts, uds, ucs, add, del,
		st->m_query ,
		st->m_coll ,
		st->m_termId ,
		st->m_numRecs ,
		st->m_docId ,
		(long)st->m_score ,
		st->m_termFreq ,
		// +/- fudge factor derived from page size and file count
		2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * base->getNumFiles() );
		//hostnum );

	// error or nothing to show: send what we have and clean up
	if ( g_errno || (st->m_list.isEmpty() ) ) { //&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s ,
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf (
		"<table cellpadding=1 border=1>"
		"<tr><td>#</td><td>score</td>"
		"<td>docId</td><td>domHash</td></tr>");
	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for ( st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted () ;
	      st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		// NOTE(review): ip/port are computed but only referenced by
		// the commented-out href format below
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 )
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		// "date/" prefix for the score column when datedb is on
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf (
			"<tr><td>%li.</td>"
			"<td>%s%i</td>"
			"<td>"
			//"<a href=http://%s:%hu/master/titledb?d=%llu>"
			"<a href=/master/titledb?c=%s&d=%llu>"
			"%llu"
			//"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			"</td>"
			"<td>"
			"0x%02lx"
			"</td>"
			"</tr>\n" ,
			i++,
			ds,
			(int)st->m_list.getCurrentScore() ,
			//iptoa(ip) , port ,
			st->m_coll,
			docId ,
			docId ,
			(long)dh );
	}
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() )
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>"
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");
	// now print the score/docId of datedb list
	i = 0;
	for ( st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted () ;
	      st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p ,
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId ,
			  docId );
		p += gbstrlen ( p );
	}
	*/

	if ( ! st->m_list.isEmpty() )
		pbuf->safePrintf ( "</table>" );

	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	pbuf->safePrintf ( "</center>\n");

	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s ,
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}
// . serves both the "add collection" and "delete collection" admin pages
// . "add" selects which page variant to render
// . XML/JSON requests get a success/error reply only; HTML gets the full form
// . returns false if blocked, true otherwise; sets g_errno on error
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {

#ifdef PRIVACORE_SAFE_VERSION
	// whole feature is compiled out in the safe build
	g_errno = EBADENGINEER;
	char *msg = "Function disabled by PRIVACORE_SAFE_VERSION define";
	return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
#else
	// get collection name
	//int32_t nclen;
	//char *nc = r->getString ( "nc" , &nclen );
	//int32_t cpclen;
	//char *cpc = r->getString ( "cpc" , &cpclen );

	g_errno = 0;

	//bool cast = r->getLong("cast",0);

	// error text shown on the HTML page, if any
	// NOTE(review): only set below via mstrerror(g_errno) since the
	// dead-host check is commented out
	const char *msg = NULL;

	// if any host in network is dead, do not do this
	//if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";

	// . are we adding a collection?
	// . return if error adding, might already exist!
	// . g_errno should be set
	// . WE DO NOT NEED THIS ANYMORE. Pages.cpp now broadcasts
	//   addcoll as CommandAddColl() parm.
	/*
	if ( nclen > 0 && add && ! cast ) {
		// do not allow "main" that is used for the "" collection
		// for backwards compatibility
		//if ( strcmp ( nc , "main" ) != 0 )
		g_collectiondb.addRec (nc,cpc,cpclen,true,(collnum_t)-1,
				       false , // isdump?
				       true ) ;// save it?
		//else
		//	log("admin: \"main\" collection is forbidden.");
	}
	if ( ! add && ! cast )
		g_collectiondb.deleteRecs ( r ) ;
	*/

	char format = r->getReplyFormat();

	// API callers (xml/json) just need to know the parm was accepted;
	// the actual add/delete is broadcast elsewhere (see note above)
	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// no addcoll given?
		int32_t page = g_pages.getDynamicPageNumber ( r );
		const char *addcoll = r->getString("addcoll",NULL);
		const char *delcoll = r->getString("delcoll",NULL);
		// accept camelCase spellings too
		if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
		if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
		if ( page == PAGE_ADDCOLL && ! addcoll ) {
			g_errno = EBADENGINEER;
			// shadows the outer "msg" intentionally
			const char *msg = "no addcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		if ( page == PAGE_DELCOLL && ! delcoll ) {
			g_errno = EBADENGINEER;
			const char *msg = "no delcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		return g_httpServer.sendSuccessReply(s,format);
	}

	// error?
	const char *action  = r->getString("action",NULL);
	const char *addColl = r->getString("addcoll",NULL);

	// add our ip to the list
	//char *ips = r->getString("collips",NULL);
	//char *pwds = r->getString("collpwd",NULL);

	// stack-backed page buffer
	char buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);

	//
	// CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
	//
	SafeBuf gtmp;
	char *gmsg = NULL;
	// is it too big?
	if ( action && addColl && gbstrlen(addColl) > MAX_COLL_LEN ) {
		gtmp.safePrintf("search engine name is too long");
		gmsg = gtmp.getBufStart();
	}
	// from Collectiondb.cpp::addNewColl() ensure coll name is legit:
	// only alnum, '-' and '_' are allowed; x stops at first bad char
	const char *x = addColl;
	for ( ; x && *x ; x++ ) {
		if ( is_alnum_a(*x) ) continue;
		if ( *x == '-' ) continue;
		if ( *x == '_' ) continue; // underscore now allowed
		break;
	}
	if ( x && *x ) {
		g_errno = EBADENGINEER;
		gtmp.safePrintf("<font color=red>Error. \"%s\" is a "
				"malformed name because it "
				"contains the '%c' character.</font><br><br>",
				addColl,*x);
		gmsg = gtmp.getBufStart();
	}
	//
	// END GIGABOT ERRORS
	//

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	// if added the coll successfully, do not print same page, jump to
	// printing the basic settings page so they can add sites to it.
	// crap, this GET request, "r", is missing the "c" parm sometimes.
	// we need to use the "addcoll" parm anyway. maybe print a meta
	// redirect then?
	// NOTE(review): narrowing long->char; used only as a boolean flag
	char guide = r->getLong("guide",0);

	// do not redirect if gmsg is set, there was a problem with the name
	if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) {
		//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
		// just redirect to it
		if ( addColl )
			p.safePrintf("<meta http-equiv=Refresh "
				     "content=\"0; URL=/admin/settings"
				     "?guide=1&c=%s\">",
				     addColl);
		return g_httpServer.sendDynamicPage (s,
						     p.getBufStart(),
						     p.length());
	}

	// print standard header; focus the add-collection box on load
	g_pages.printAdminTop ( &p , s , r , NULL,
				"onload=document."
				"getElementById('acbox').focus();");

	if ( g_errno ) {
		msg = mstrerror( g_errno );
	}

	if ( msg && ! guide ) {
		const char *cc = "deleting";
		if ( add ) cc = "adding";
		p.safePrintf (
			"<center>\n"
			"<font color=red>"
			"<b>Error %s collection: %s. "
			"See log file for details.</b>"
			"</font>"
			"</center><br>\n",cc,msg);
	}

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	if ( add && guide )
		printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg );

	// print the add collection box
	if ( add /*&& (! nc[0] || g_errno ) */ ) {

		const char *t1 = "Add Collection";
		if ( guide ) t1 = "Add Search Engine";
		p.safePrintf (
			"<center>\n<table %s>\n"
			"<tr class=hdrow><td colspan=2>"
			"<center><b>%s</b></center>"
			"</td></tr>\n"
			,TABLE_STYLE
			,t1 );
		const char *t2 = "collection";
		if ( guide ) t2 = "search engine";
		// pre-fill the box with the name they tried, if any
		const char *str = addColl;
		if ( ! addColl ) str = "";
		p.safePrintf (
			"<tr bgcolor=#%s>"
			"<td><b>name of new %s to add</td>\n"
			"<td><input type=text name=addcoll size=30 "
			"id=acbox "
			"value=\"%s\">"
			"</td></tr>\n"
			, LIGHT_BLUE
			, t2
			, str );
		// don't show the clone box if we are under gigabot the guide
		if ( ! guide )
			p.safePrintf(
				"<tr bgcolor=#%s>"
				"<td><b>clone settings from this "
				"collection</b>"
				"<br><font size=1>Copy settings from "
				"this pre-existing collection. Leave "
				"blank to "
				"accept default values.</font></td>\n"
				"<td><input type=text name=clonecoll "
				"size=30>"
				"</td>"
				"</tr>"
				, LIGHT_BLUE );
		// collection pwds
		p.safePrintf(
			"<tr bgcolor=#%s>"
			"<td><b>collection passwords"
			"</b>"
			"<br><font size=1>List of white space separated "
			"passwords allowed to adminster collection."
			"</font>"
			"</td>\n"
			"<td><input type=text name=collpwd "
			"size=60>"
			"</td>"
			"</tr>"
			, LIGHT_BLUE );
		// ips box for security
		p.safePrintf(
			"<tr bgcolor=#%s>"
			"<td><b>collection ips"
			"</b>"
			"<br><font size=1>List of white space separated "
			"IPs allowed to adminster collection."
			"</font>"
			"</td>\n"
			"<td><input type=text name=collips "
			"size=60>"
			"</td>"
			"</tr>"
			, LIGHT_BLUE );
		// now list collections from which to copy the config
		//p.safePrintf (
		//	"<tr><td><b>copy configuration from this "
		//	"collection</b><br><font size=1>Leave blank to "
		//	"accept default values.</font></td>\n"
		//	"<td><input type=text name=cpc value=\"%s\" size=30>"
		//	"</td></tr>\n",coll);
		p.safePrintf ( "</table></center><br>\n");
		// wrap up the form started by printAdminTop
		g_pages.printAdminBottom ( &p );
		int32_t bufLen = p.length();
		return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
	}

	// if we added a collection, print its page
	//if ( add && nc[0] && ! g_errno )
	//	return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH ,
	//					  nc , pwd );

	// nothing to delete? skip straight to the page footer
	if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip;

	// print all collections out in a checklist so you can check the
	// ones you want to delete, the values will be the id of that collectn
	p.safePrintf (
		"<center>\n<table %s>\n"
		"<tr class=hdrow><td><center><b>Delete Collections"
		"</b></center></td></tr>\n"
		"<tr bgcolor=#%s><td>"
		"<center><b>Select the collections you wish to delete. "
		//"<font color=red>This feature is currently under "
		//"development.</font>"
		"</b></center></td></tr>\n"
		"<tr bgcolor=#%s><td>"
		// table within a table
		"<center><table width=20%%>\n",
		TABLE_STYLE,
		LIGHT_BLUE,
		DARK_BLUE );

	for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
		CollectionRec *cr = g_collectiondb.m_recs[i];
		// slots can be empty (deleted collections)
		if ( ! cr ) continue;
		p.safePrintf (
			"<tr bgcolor=#%s><td>"
			"<input type=checkbox name=delcoll value=\"%s\"> "
			"%s</td></tr>\n",
			DARK_BLUE,
			cr->m_coll,cr->m_coll);
	}

	p.safePrintf( "</table></center></td></tr></table><br>\n" );

 skip:
	// wrap up the form started by printAdminTop
	g_pages.printAdminBottom ( &p );
	int32_t bufLen = p.length();
	return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
#endif
}
// . computes the posdb start/end keys for every query term, restricted to
//   this host's docid stripe, then launches Msg2 to fetch the termlists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::getLists () {
	if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
	// . ask Indexdb for the IndexLists we need for these termIds
	// . each rec in an IndexList is a termId/score/docId tuple

	//
	// restrict to docid range?
	//
	// . get the docid start and end
	// . do docid paritioning so we can send to all hosts
	//   in the network, not just one stripe
	long long docIdStart = 0;
	long long docIdEnd   = MAX_DOCID;
	// . restrict to this docid?
	// . will really make gbdocid:| searches much faster!
	long long dr = m_tmpq.m_docIdRestriction;
	if ( dr ) {
		docIdStart = dr;
		docIdEnd   = dr + 1;
	}
	// . override
	// . this is set from Msg39::doDocIdSplitLoop() to compute
	//   search results in stages, so that we do not load massive
	//   termlists into memory and got OOM (out of memory)
	if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
	if ( m_r->m_maxDocId != -1 ) docIdEnd   = m_r->m_maxDocId+1;

	// if we have twins, then make sure the twins read different
	// pieces of the same docid range to make things 2x faster
	//bool useTwins = false;
	//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
	//if ( useTwins ) {
	//	long long delta2 = ( docIdEnd - docIdStart ) / 2;
	//	if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
	//	else docIdStart = docIdStart + delta2;
	//}

	// new striping logic: each stripe takes an equal slice of the
	// [docIdStart,docIdEnd) range
	long numStripes = g_hostdb.getNumStripes();
	long long delta2 = ( docIdEnd - docIdStart ) / numStripes;
	long stripe = g_hostdb.getMyHost()->m_stripe;
	docIdStart += delta2 * stripe;
	// is this right?
	docIdEnd = docIdStart + delta2;
	// add 1 to be safe so we don't lose a docid
	docIdEnd++;

	// TODO: add triplet support later for this to split the
	// read 3 ways. 4 ways for quads, etc.
	//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}

	// do not go over MAX_DOCID because it gets masked and
	// ends up being 0!!! and we get empty lists
	if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;

	// remember so Msg2.cpp can use them to restrict the termlists
	// from "whiteList" as well
	m_docIdStart = docIdStart;
	m_docIdEnd   = docIdEnd;

	//
	// set startkey/endkey for each term/termlist
	//
	for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
		// breathe
		QUICKPOLL ( m_r->m_niceness );
		// shortcuts
		QueryTerm *qterm = &m_tmpq.m_qterms[i];
		char *sk = qterm->m_startKey;
		char *ek = qterm->m_endKey;
		// get the term id
		long long tid = m_tmpq.getTermId(i);
		// if only 1 stripe
		//if ( g_hostdb.getNumStripes() == 1 ) {
		//	docIdStart = 0;
		//	docIdEnd   = MAX_DOCID;
		//}
		// store now in qterm
		g_posdb.makeStartKey ( sk , tid , docIdStart );
		g_posdb.makeEndKey   ( ek , tid , docIdEnd );
		qterm->m_ks = sizeof(POSDBKEY);//key144_t);
	}

	// debug msg: dump every query term's attributes to the log
	if ( m_debug || g_conf.m_logDebugQuery ) {
		for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
			// get the term in utf8
			//char bb[256];
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
			// temporarily NUL-terminate the term in place so it
			// can be printed with %s; restored below
			char *tpc = qt->m_term + qt->m_termLen;
			char tmp  = *tpc;
			*tpc = '\0';
			char sign = qt->m_termSign;
			if ( sign == 0 ) sign = '0';
			QueryWord *qw = qt->m_qword;
			long wikiPhrId = qw->m_wikiPhraseId;
			if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
			char leftwikibigram  = 0;
			char rightwikibigram = 0;
			if ( qt->m_leftPhraseTerm &&
			     qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
				leftwikibigram = 1;
			if ( qt->m_rightPhraseTerm &&
			     qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
				rightwikibigram = 1;
			/*
			char c = m_tmpq.getTermSign(i);
			char tt[512];
			long ttlen = m_tmpq.getTermLen(i);
			if ( ttlen > 254 ) ttlen = 254;
			if ( ttlen < 0   ) ttlen = 0;
			// old:painful: convert each term from unicode to ascii
			memcpy ( tt , m_tmpq.getTerm(i) , ttlen );
			*/
			long isSynonym = 0;
			QueryTerm *st = qt->m_synonymOf;
			if ( st ) isSynonym = true;
			SafeBuf sb;
			// now we can display it
			//tt[ttlen]='\0';
			//if ( c == '\0' ) c = ' ';
			sb.safePrintf(
				"query: msg39: [%lu] query term #%li \"%s\" "
				"phr=%li termId=%llu rawTermId=%llu "
				//"estimatedTermFreq=%lli (+/- ~16000) "
				"tfweight=%.02f "
				"sign=%c "
				"numPlusses=%hhu "
				"required=%li "
				"fielcode=%li "
				"ebit=0x%0llx "
				"impBits=0x%0llx "
				"wikiphrid=%li "
				"leftwikibigram=%li "
				"rightwikibigram=%li "
				//"range.startTermNum=%hhi range.endTermNum=%hhi "
				//"minRecSizes=%li "
				"readSizeInBytes=%li "
				//"ebit=0x%llx "
				//"impBits=0x%llx "
				"hc=%li "
				"component=%li "
				"otermLen=%li "
				"isSynonym=%li "
				"querylangid=%li ",
				(long)this ,
				i ,
				qt->m_term,//bb ,
				(long)m_tmpq.isPhrase (i) ,
				m_tmpq.getTermId (i) ,
				m_tmpq.getRawTermId (i) ,
				((float *)m_r->ptr_termFreqWeights)[i] ,
				sign ,
				//c ,
				0 ,
				(long)qt->m_isRequired,
				(long)qt->m_fieldCode,
				(long long)qt->m_explicitBit  ,
				(long long)qt->m_implicitBits ,
				wikiPhrId,
				(long)leftwikibigram,
				(long)rightwikibigram,
				((long *)m_r->ptr_readSizes)[i] ,
				//(long long)m_tmpq.m_qterms[i].m_explicitBit  ,
				//(long long)m_tmpq.m_qterms[i].m_implicitBits ,
				(long)m_tmpq.m_qterms[i].m_hardCount ,
				(long)m_tmpq.m_componentCodes[i],
				(long)m_tmpq.getTermLen(i) ,
				isSynonym,
				(long)m_tmpq.m_langId); // ,tt
			// put it back
			*tpc = tmp;
			if ( st ) {
				long stnum = st - m_tmpq.m_qterms;
				sb.safePrintf("synofterm#=%li",stnum);
				//sb.safeMemcpy(st->m_term,st->m_termLen);
				sb.pushChar(' ');
				sb.safePrintf("synwid0=%lli ",qt->m_synWids0);
				sb.safePrintf("synwid1=%lli ",qt->m_synWids1);
				sb.safePrintf("synalnumwords=%li ",
					      qt->m_numAlnumWordsInSynonym);
				// like for synonym "nj" it's base,
				// "new jersey" has 2 alnum words!
				sb.safePrintf("synbasealnumwords=%li ",
					      qt->m_numAlnumWordsInBase);
			}
			logf(LOG_DEBUG,"%s",sb.getBufStart());
		}
		m_tmpq.printBooleanTree();
	}
	// timestamp log
	if ( m_debug )
		log(LOG_DEBUG,"query: msg39: [%lu] Getting %li index lists ",
		    (long)this,m_tmpq.getNumTerms());
	// . now get the index lists themselves
	// . return if it blocked
	// . not doing a merge (last parm) means that the lists we receive
	//   will be an appending of a bunch of lists so keys won't be in order
	// . merging is uneccessary for us here because we hash the keys anyway
	// . and merging takes up valuable cpu time
	// . caution: the index lists returned from Msg2 are now compressed
	// . now i'm merging because it's 10 times faster than hashing anyway
	//   and the reply buf should now always be <= minRecSizes so we can
	//   pre-allocate one better, and, 3) this should fix the yahoo.com
	//   reindex bug
	char rdbId = RDB_POSDB;

	// . TODO: MDW: fix
	// . partap says there is a bug in this??? we can't cache UOR'ed lists?
	bool checkCache = false;

	// split is us????
	//long split = g_hostdb.m_myHost->m_group;
	long split = g_hostdb.m_myHost->m_shardNum;

	// call msg2
	if ( ! m_msg2.getLists ( rdbId ,
				 m_r->ptr_coll ,
				 m_r->m_maxAge ,
				 m_r->m_addToCache ,
				 //m_tmpq.m_qterms ,
				 &m_tmpq,
				 m_r->ptr_whiteList,
				 // we need to restrict docid range for
				 // whitelist as well! this is from
				 // doDocIdSplitLoop()
				 m_docIdStart,
				 m_docIdEnd,
				 // how much of each termlist to read in bytes
				 (long *)m_r->ptr_readSizes ,
				 //m_tmpq.getNumTerms() , // numLists
				 m_lists ,
				 this ,
				 gotListsWrapper ,
				 m_r ,
				 m_r->m_niceness ,
				 true , // do merge?
				 m_debug ,
				 NULL , // best hostids
				 m_r->m_restrictPosdbForQuery ,
				 split ,
				 checkCache )) {
		// blocked; gotListsWrapper will be called when done
		m_blocked = true;
		return false;
	}
	// error?
	if ( g_errno ) {
		log("msg39: Had error getting termlists2: %s.",
		    mstrerror(g_errno));
		// don't bail out here because we are in docIdSplitLoop()
		//sendReply (m_slot,this,NULL,0,0,true);
		return true;
	}

	return gotLists ( true );
}
bool sendReply ( void *state , bool addUrlEnabled ) { // allow others to add now //s_inprogress = false; // get the state properly //gr *st1 = (gr *) state; GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log(LOG_INFO,"http: add url %s (%s)", xb.getBufStart(),mstrerror(g_errno)); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return status; } long ulen = 0; char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); //char rawbuf[1024*8]; //SafeBuf rb(rawbuf, 1024*8); //rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"); //rb.safePrintf("<status>\n"); //CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll ); // collection name char tt [ 128 ]; tt[0] = '\0'; g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // display url //char *url = gr->m_urlsBuf; //if ( url && ! url[0] ) url = NULL; // watch out for NULLs if ( ! 
url ) url = "http://"; // if there was an error let them know //char msg[MAX_URL_LEN + 1024]; SafeBuf mbuf; //char *pm = ""; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); //pm = msg; //rb.safePrintf("Error adding url(s): %s[%i]", // mstrerror(g_errno) , g_errno); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b> added to spider " "queue " "successfully<br><br>"); mbuf.safePrintf("</font></center>"); //rb.safePrintf("%s added to spider " // "queue successfully", url ); //pm = msg; //url = "http://"; //else // pm = "Don't forget to <a href=/gigaboost.html>" // "Gigaboost</a> your URL."; } if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() ); g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? // clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
bool Msg3a::gotAllSplitReplies ( ) { // if any of the split requests had an error, give up and set m_errno // but don't set if for non critical errors like query truncation if ( m_errno ) { g_errno = m_errno; return true; } // also reset the finalbuf and the oldNumTopDocIds if ( m_finalBuf ) { mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" ); m_finalBuf = NULL; m_finalBufSize = 0; } // update our estimated total hits m_numTotalEstimatedHits = 0; for ( long i = 0; i < m_numHosts ; i++ ) { // get that host that gave us the reply //Host *h = g_hostdb.getHost(i); // . get the reply from multicast // . multicast should have destroyed all slots, but saved reply // . we are responsible for freeing the reply // . we need to call this even if g_errno or m_errno is // set so we can free the replies in Msg3a::reset() // . if we don't call getBestReply() on it multicast should // free it, because Multicast::m_ownReadBuf is still true Multicast *m = &m_mcast[i]; bool freeit = false; long replySize = 0; long replyMaxSize; char *rbuf; Msg39Reply *mr; // . only get it if the reply not already full // . if reply already processed, skip // . perhaps it had no more docids to give us or all termlists // were exhausted on its disk and this is a re-call // . we have to re-process it for count m_numTotalEstHits, etc. rbuf = m->getBestReply ( &replySize , &replyMaxSize , &freeit , true ); //stealIt? // cast it mr = (Msg39Reply *)rbuf; // in case of mem leak, re-label from "mcast" to this so we // can determine where it came from, "Msg3a-GBR" relabel( rbuf, replyMaxSize , "Msg3a-GBR" ); // . we must be able to free it... we must own it // . this is true if we should free it, but we should not have // to free it since it is owned by the slot? if ( freeit ) { log(LOG_LOGIC,"query: msg3a: Steal failed."); char *xx = NULL; *xx=0; } // bad reply? if ( ! 
mr ) { log(LOG_LOGIC,"query: msg3a: Bad NULL reply."); m_reply [i] = NULL; m_replyMaxSize[i] = 0; // it might have been timd out, just ignore it!! continue; // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; // all reply buffers should be freed on reset() return true; } // how did this happen? if ( replySize < 29 && ! mr->m_errno ) { // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.", replySize); // all reply buffers should be freed on reset() return true; } // can this be non-null? we shouldn't be overwriting one // without freeing it... if ( m_reply[i] ) // note the mem leak now log("query: mem leaking a 0x39 reply"); // cast it and set it m_reply [i] = mr; m_replyMaxSize[i] = replyMaxSize; // deserialize it (just sets the ptr_ and size_ member vars) //mr->deserialize ( ); deserializeMsg ( sizeof(Msg39Reply) , &mr->size_docIds, &mr->size_clusterRecs, &mr->ptr_docIds, mr->m_buf ); // sanity check if ( mr->m_nqt != m_q->getNumTerms() ) { g_errno = EBADREPLY; m_errno = EBADREPLY; log("query: msg3a: Split reply qterms=%li != %li.", (long)mr->m_nqt,(long)m_q->getNumTerms() ); return true; } // return if split had an error, but not for a non-critical // error like query truncation if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) { g_errno = mr->m_errno; m_errno = mr->m_errno; log("query: msg3a: Split had error: %s", mstrerror(g_errno)); return true; } // skip down here if reply was already set //skip: // add of the total hits from each split, this is how many // total results the lastest split is estimated to be able to // return // . THIS should now be exact since we read all termlists // of posdb... m_numTotalEstimatedHits += mr->m_estimatedHits; // debug log stuff if ( ! 
m_debug ) continue; // cast these for printing out long long *docIds = (long long *)mr->ptr_docIds; score_t *scores = (score_t *)mr->ptr_scores; // print out every docid in this split reply for ( long j = 0; j < mr->m_numDocIds ; j++ ) { // print out score_t logf( LOG_DEBUG, "query: msg3a: [%lu] %03li) " "split=%li docId=%012llu domHash=0x%02lx " "score=%lu" , (unsigned long)this , j , i , docIds [j] , (long)g_titledb.getDomHash8FromDocId(docIds[j]), (long)scores[j] ); } } // this seems to always return true! mergeLists ( ); if ( ! m_r->m_useSeoResultsCache ) return true; // now cache the reply SafeBuf cr; long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4); long need = sizeof(key_t) + 4 + dataSize; bool status = cr.reserve ( need ); // sanity if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) { char *xx=NULL; *xx=0; } // ignore errors g_errno = 0; // return on error with g_errno cleared if cache add failed if ( ! status ) return true; // add to buf otherwise cr.safeMemcpy ( &m_ckey , sizeof(key_t) ); cr.safeMemcpy ( &dataSize , 4 ); long now = getTimeGlobal(); cr.pushLong ( now ); cr.pushLong ( m_numDocIds ); cr.pushLong ( m_numTotalEstimatedHits );//Results ); long max = m_numDocIds; // then the docids for ( long i = 0 ; i < max ; i++ ) cr.pushLongLong(m_docIds[i] ); for ( long i = 0 ; i < max ; i++ ) cr.pushFloat(m_scores[i]); for ( long i = 0 ; i < max ; i++ ) cr.pushLong(getSiteHash26(i)); // sanity if ( cr.length() != need ) { char *xx=NULL; *xx=0; } // make these key_t startKey; key_t endKey; startKey = m_ckey; // clear delbit startKey.n0 &= 0xfffffffffffffffeLL; // end key is us endKey = m_ckey; // that is the single record m_seoCacheList.set ( cr.getBufStart() , cr.length(), cr.getBufStart(), // alloc cr.getCapacity(), // alloc size (char *)&startKey, (char *)&endKey, -1, // fixeddatasize true, // owndata? false,// use half keys? 
sizeof(key_t) ); // do not allow cr to free it, msg1 will cr.detachBuf(); // note it //log("seopipe: storing ckey=%s q=%s" // ,KEYSTR(&m_ckey,12) // ,m_r->ptr_query // ); //log("msg1: sending niceness=%li",(long)m_r->m_niceness); // this will often block, but who cares!? it just sends a request off if ( ! m_msg1.addList ( &m_seoCacheList , RDB_SERPDB,//RDB_CACHEDB, m_r->ptr_coll, this, // state gotSerpdbReplyWrapper, // callback false, // forcelocal? m_r->m_niceness ) ) { //log("blocked"); return false; } // we can safely delete m_msg17... just return true return true; }
// . displays the stats for a username // . show stats for every day we have them for // . in a big list // . if they click the day display all docids evaluated for that day // . show the accuracy for that day too // . how many docs they edited // . how many of those docs were verified by another // . and if there was consensus void gotTransdbList ( State60 *st ) { // get today's time range time_t now = getTimeGlobal(); // get start of today time_t dayStart = now / (24*3600); SafeBuf sb; // int16_tcut TcpSocket *s = st->m_s; // make about 200k of mem to write into if ( ! sb.reserve ( 200000 ) ) return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno)); // print description so they can clikc a button to start the turk sb.safePrintf("<html>\n" "<title>Event Editor</title>\n" "<body>\n" "<table width=\"100%%\" border=\"0\">\n" "<tr><td style=\"background-color:#0079ba;\">\n" "<center><font color=#00000>" "<h2>Event Editor</h2>\n" "</font></center></td>" "</tr></table>"); // print the content sb.safePrintf("<center><font size=4><blink>" "<b><a href=\"/pageturk?c=%s&edit=1\">" "Click here to start editing.</a></b></blink>" "</font><br><i>Please take your " "time to read the information below before you begin" "</i><br><font color=\"red\" size=2> Warning: Adult " "content might be presented to you." " You should be above 18 years of age to continue." "</center></font>",st->m_coll); sb.safePrintf("<font face=arial,sans-serif color=black size=3>" "<p>By clicking <i>Start Voting</i>, you will be " "presented with an interface for editing events. " "The editor will display a modified web page that " "contains one or more events. Each event's description " "will be highlight with a blue background. You can " "toggle whether a particular event is displayed by " "clicking on that event's ID. You can highlight one or " "multiple event descriptions at the same time. 
" "</p><p>" "By clicking on the section icons in the web page you " "can tell the editor that a virtual fence should be " "erected around that section. The fence will make sure " "that event descriptions can not span across it. Each " "event description must be fully contained either " "inside or outside the fence. However, you can also " "declare a section as a title section, which means that " "the text that the title section contains is free to be " "used by any event description." "</p>\n" "<p>When you are done erecting section fences, you " "submit your changes. The more changes you make the " "more points you earn. Other users may evaluate " "your edits for accuracy. You will be paid based on the " "points you earn as well as your accuracy. All " "transactions are listed in the table below.</p>" "<p>You may not change your username or password " "but you can change your email address. Your email " "address will be used to pay you with PayPal every " "Friday. Paypal fees will be deducted on your end. By " "using this service you agree to all stated Terms & " "Conditions.</p>" "</font>\n"); // get the user record User *uu = g_users.getUser ( username ); // print out their info, like paypal email sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Info</center>" "</td></tr>\n" "<tr>" "<td>Email</td>" "<td><input type=text value=%s></td>" "<td>email address used to pay with paypal</td>" "</tr>\n" "<tr><td colspan=10><input type=submit value=update>" "</td></tr>\n" "</table>\n" , uu->m_payPalEmail ); // print your stats here now sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Stats</center>" "</td></tr>\n" "<tr>" "<td>date</td>" "<td>action</td>" "<td>amount</td>" "<td>desc</td>" "</tr>\n"); // int16_tcut RdbList *list = &st->m_list; int32_t lastDay = -1; int32_t totalReceives = 0; int32_t totalSubmits = 0; int32_t totalPasses = 0; int32_t totalFails = 0; // scan the list for ( ; ! 
list->isExhausted() ; ) { // get rec char *rec = list->getCurrentRecord(); char *data = list->getCurrentData(); int32_t dataSize = list->getCurrentDataSize(); // skip that list->skipCurrentRecord(); // skip if negative if ( (rec[0] & 0x01) == 0x00 ) continue; // get the time (global time - sync'd with host #0) time_t tt = g_transdb.getTimeStamp ( rec ); // get day # int32_t daynum = tt / (24*3600); // is it today? bool isToday = ( daynum >= dayStart ); // point to the Transaction Trans *trans = (Trans *)data; // if is today, print it out verbatim if ( isToday ) { // print it in html row format to match table above //printTrans ( &sb , rec ); sb.safePrintf("<tr>"); // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%H:%M:%S",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats if ( trans->m_actionType == AT_RECEIVE_DOC ) sb.safePrintf("<td>receive</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_SUBMIT_DOC ) sb.safePrintf("<td>submit</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_PASS_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was verified " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_FAIL_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was deemed to " "be incorrect " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_ACCURACY_EVAL) sb.safePrintf("<td>accuracy eval</td>" "<td>%.02f</td>" "<td>docid=%"UINT64"</td>", trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_CHARGE) sb.safePrintf("<td>credit</td>" "<td>%.02f</td>" "<td>You made 
money.</td>", trans->m_number); else if ( trans->m_actionType == AT_PAYMENT) sb.safePrintf("<td>payment</td>" "<td>%.02f</td>" "<td>We paid you.</td>", trans->m_number); else if ( trans->m_actionType == AT_LOGIN) sb.safePrintf("<td>login</td>" "<td>-</td>" "<td>You logged in.</td>"); else if ( trans->m_actionType == AT_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You logged out.</td>"); else if ( trans->m_actionType == AT_AUTO_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You were auto " "logged out.</td>"); else { char *xx=NULL;*xx=0; } sb.safePrintf("</tr>\n"); continue; } // if does not match last day, print out that last day's stats // and reset for next guy if ( daynum != lastDay && lastDay != -1 ) { // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats sb.safePrintf("<tr>" "<td>receive</td>" "<td>%"INT32"</td>" "<td>Total received</td>" "</tr>\n", totalReceives); sb.safePrintf("<tr>" "<td>submit</td>" "<td>%"INT32"</td>" "<td>Total submitted</td>" "</tr>\n", totalSubmits); sb.safePrintf("<tr>" "<td>pass</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests passed</td>" "</tr>\n", totalPasses); sb.safePrintf("<tr>" "<td>fail</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests failed</td>" "</tr>\n", totalFails); // reset as well totalReceived = 0; totalSubmits = 0; totalPasses = 0; totalFails = 0; } // remember last day # we processed for accumulating stats lastDay = daynum; // accum stats if ( trans->m_actionType == AT_RECEIVE_DOC ) totalReceives++; if ( trans->m_actionType == AT_SUBMIT_DOC ) totalSubmits++; if ( trans->m_actionType == AT_PASS_DOC ) totalPasses++; if ( trans->m_actionType == AT_FAIL_DOC ) totalFails++; } sb.safePrintf("</body></html>\n"); sendReply ( &sb ); }
void Statsdb::drawHR ( float z , float ymin , float ymax , //GIFPlotter *plotter , SafeBuf &gw, Label *label , float zoff , long color ) { // convert into yspace float z2 = ((float)DY2 * (float)(z - ymin)) /(float)(ymax-ymin); // avoid collisions with other graphs z2 += zoff; // border //z2 += m_by; // round off error z2 += 0.5; // for adjusatmnet float ptsPerPixel = (ymax-ymin)/ (float)DY2; // make an adjustment to the label then! -- Commented out because it's currently not used. float zadj = zoff * ptsPerPixel; //#ifdef _USEPLOTTER_ // use the color specified from addStat_r() for this line/pt //plotter->pencolor ( ((color >> 16) & 0xff) << 8 , // ((color >> 8) & 0xff) << 8 , // ((color >> 0) & 0xff) << 8 ); // horizontal line //plotter->line ( m_bx, (long)z2 , DX2 + m_bx, (long)z2 ); long width = 1; drawLine3 ( m_gw, 0, DX2 , (long)z2,color, width); // make label char tmp[128]; // . use "graphHash" to map to unit display // . this is a disk read volume sprintf(tmp,label->m_format,z +zadj);//* label->m_yscalar); /* // a white shadow plotter->pencolor ( 0xffff,0xffff,0xffff ); plotter->move ( m_bx + 80 + 2 , z2 + 10 - 2 ); plotter->alabel ( 'c' , 'c' , tmp ); // a black shadow plotter->pencolor ( 0 , 0 , 0 ); plotter->move ( m_bx + 80 + 1 , z2 + 10 - 1 ); plotter->alabel ( 'c' , 'c' , tmp ); //long color = label->m_color; // use the color specified from addStat_r() for this line/pt plotter->pencolor ( ((color >> 16) & 0xff) << 8 , ((color >> 8) & 0xff) << 8 , ((color >> 0) & 0xff) << 8 ); // move cursor plotter->move ( m_bx + 80 , z2 + 10 ); // plot label plotter->alabel ( 'c' , 'c' , tmp ); */ // LABEL gw.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:%li;" "color:#%lx;" "z-index:110;" "font-size:14px;" "min-height:20px;" "min-width:3px;\">%s</div>\n" , (long)(m_bx) , (long)z2 +m_by , color // the label: , tmp ); }
// . build the turk event-editor markup for the doc in st->m_xd
// . marks SEC_CONTROL sections, then emits the page words wrapped in
//   clickable <div> fences into "sb"
// . returns false on the normal path (see NOTE at bottom) and the
//   result of sendErrorReply() on allocation failure
bool sendTurkPageReply ( State60 *st ) {
	XmlDoc *xd = &st->m_xd;

	// in case getSections() blocks come right back in
	xd->setCallback ( st , xdcallback );

	// . set niceness to 1 so all this processing doesn't slow queries down
	// . however, g_niceness should still be zero... hmmm...
	xd->m_niceness = 1;

	// default to 1 niceness
	st->m_niceness = 1;

	// now set the sections class
	Sections *ss = xd->getSections();

	// now for each section with alnum text, telescope up as far as
	// possible without containing anymore alnum text than what it
	// contained. set SEC_CONTROL bit. such sections will have the
	// 2 green/blue dots, that are used for turning on/off title/desc.
	// but really the indians will only turn off sections that should
	// not have a title/desc.
	for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(st->m_niceness);
		// skip if does not have text
		if ( si->m_firstWordPos < 0 ) continue;
		// otherwise, find biggest parent that contains just that text
		Section *p    = si->m_parent;
		Section *last = si;
		for ( ; p ; p = p->m_parent ) {
			if ( p->m_firstWordPos != si->m_firstWordPos ) break;
			if ( p->m_lastWordPos  != si->m_lastWordPos  ) break;
			last = p;
		}
		// set that bit then
		last->m_flags |= SEC_CONTROL;
		// and speed up the loop
		si = last;
	}

	// * now each SEC_CONTROL sections have a fence activated by a turker
	// * an event title or description can not span a fence. it must be
	//   confined within a fence. however, it is allowed to include
	//   title or description from a "title section".
	// * hold shift down to designate as title section when clicking it
	// * show the raw text of each event changing as you fence
	//   sections in or out. show in a right frame.
	// * show list of events on page in the top frame. can toggle them
	//   all individually.
	// * each section hash has its own unique bg color when activated
	// * when an event id is selected activate its bgcolor for all
	//   sentences currently in the event that are not in activated
	//   sections; each event sentence div carries one attr per event id
	//   (e.g. <div ev1=1 ev2=1 ev10=1>...</div>) so it can be activated
	//   when any of those ids is.
	SafeBuf sb;

	// shortcuts
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words     *words = &xd->m_words;
	int32_t    nw    = words->getNumWords();
	char     **wptrs = words->getWords();
	int32_t   *wlens = words->getWordLens();
	nodeid_t  *tids  = words->getTagIds();

	// a special array for printing </div> tags
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );

	//
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, and print out the section hash and a color
	// tag to be activated if the turkey activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ a
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkeys have made all the edits
				// they need to submit the changes they made.
				// we need to send back the colors of the
				// sections that have been activated.
				sb.safePrintf("<div nobreak gbsecid=%"UINT32" "
					      "bgcolor=#%"XINT32" "
					      "onclick=gbtogglecolor()>",
					      (uint32_t)sj->m_tagHash,
					      (uint32_t)sj->m_tagHash);
				// sanity check
				if ( sj->m_b <  0 ) { char *xx=NULL;*xx=0; }
				if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; }
				// and inc the /div count for that word
				endCounts[sj->m_b-1]++;
			}
			// try next section too
			sj = sj->m_next;
		}
		// if this is a tag, remove any coloring
		// TODO(review): empty placeholder -- decoloring was never
		// implemented
		if ( tids[i] ) {
		}
		// print the word, be it a tag, alnum, punct
		sb.safeMemcpy ( wptrs[i] , wlens[i] );
		// end a div tag?
		if ( ! endCounts[i] ) continue;
		// might be many so loop it
		for ( int32_t j = 0 ; j < endCounts[i] ; j++ )
			sb.safePrintf("</div>");
	}

	// BUGFIX: endCounts was mcalloc'd above and leaked on this path;
	// free it before returning (same mfree(ptr,size,label) convention
	// used elsewhere in this file)
	mfree ( endCounts , nw , "endcounts" );

	// NOTE(review): "sb" is built but never sent from here -- confirm
	// the continuation sends it, otherwise the page is dropped
	return false;
}
// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
// . "lasta" is in/out: the left fence of the previous call's window, used
//   to avoid overlapping windows; on return it holds this window's "a"
// . "gotIt"/"retired" are per-query-word counters used to penalize words
//   already matched in this window / in previously-won windows
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
				 int32_t *besta, int32_t *bestb, char *gotIt,
				 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in
	//   another summary.
	// . bail out early if the match word was already used, or sits in a
	//   section we never excerpt from (script/style/select/title)
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) ||
	     ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as int32_t as we stay within maxNumCharsPerLine
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta;
	      a-- ) {
		// . don't include any "dead zone",
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) ||
		     ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}
		// don't go beyond an LI, TR, P tag
		if ( tids && ( tids[a-1] == TAG_LI ||
			       tids[a-1] == TAG_TR ||
			       tids[a-1] == TAG_P  ||
			       tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}
		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) &&
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart = true;
			break;
		}
		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) &&
		     !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}
		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;

		// do not break right after a "strong connector", like
		// apostrophe
		while ( a < matchWordNum && a > 0 &&
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;

		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	// . extend the right fence "b" forward while we stay within the
	//   excerpt budget and don't hit used/title words
	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}
		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}
		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}
		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}
		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}
		if ( wids[b] ) {
			wordCount++;
		}
		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
			       tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}
		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P) ||
			       tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// remove more than one punct words. if we're ending on a quote
		// keep it
		while ( b > matchWordNum && !wids[b-2] &&
			endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector",
		// like apostrophe
		while ( b > matchWordNum &&
			(bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as int32_t as >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// for debug: accumulates a printable trace of the window's words and
	// per-word scores, logged at the bottom
	SafeBuf xp;

	// wtf?
	if ( b > nw ) {
		b = nw;
	}

	// first score from the starting match down to a, including match
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section, marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&
			     wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		if ( qwn < 0 || qwn >= m_q->m_numWords ){
			g_process.shutdownAbort(true);}

		// undo old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, make it 10000 pts
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same
				// window, it may not give a good summary.
				// give a heavy penalty
				t -= 200000;
			}
		}
		else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",
				       t,qwn, m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	// remember the pre-bonus score for the debug log below
	// NOTE(review): narrows int64_t to int32_t -- only used for logging
	int32_t oldScore = score;

	// apply the bonus if it starts or a sentence
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much, so let's make
		// this a 20k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		}
		else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is very close to the
		// start of a sentence, lets say 3 alphawords
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if its less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 5M.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32
		    " a=%05" PRId32" b=%05" PRId32" %s",
		    (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		    xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}