Code Example #1
// init our rdb
bool Titledb::init ( ) {

	// key sanity tests
	int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
	int64_t docId = 123456789;
	key96_t k = makeKey(docId,uh48,false);
	if ( getDocId(&k) != docId ) { g_process.shutdownAbort(true);}
	if ( getUrlHash48(&k) != uh48 ) { g_process.shutdownAbort(true);}

	const char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
	Url uu;
	uu.set(url);
	const char *d1 = uu.getDomain();
	int32_t  dlen1 = uu.getDomainLen();
	int32_t dlen2 = 0;
	const char *d2 = getDomFast ( url , &dlen2 );
	if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
	if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

	// another one
	url = "http://ok/";
	uu.set(url);
	const char *d1a = uu.getDomain();
	dlen1 = uu.getDomainLen();
	dlen2 = 0;
	const char *d2a = getDomFast ( url , &dlen2 );
	if ( d1a || d2a ) { g_process.shutdownAbort(true); }
	if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

	// . what's max # of tree nodes?
	// . assume avg TitleRec size (compressed html doc) is about 1k we get:
	// . NOTE: overhead is about 32 bytes per node
	int32_t maxTreeNodes  = g_conf.m_titledbMaxTreeMem / (1*1024);

	// initialize our own internal rdb
	return m_rdb.init ( "titledb"                   ,
			    -1                          , // fixed record size
			    //g_conf.m_titledbMinFilesToMerge ,
			    // this should not really be changed...
			    -1,
			    g_conf.m_titledbMaxTreeMem  ,
			    maxTreeNodes                ,
			    false,                         // half keys?
			    12,             // key size
			    false,          //isCollectionLess
			    false);         //useIndexFile

	// validate
	//return verify ( );
}
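
The sanity tests at the top of init() rely on makeKey(), getDocId() and getUrlHash48() being exact inverses for the bits they pack into the 96-bit titledb key. Below is a minimal, self-contained sketch of that round-trip check; the Key96 struct and its bit layout are invented for illustration and are not Titledb's actual key layout.

#include <cassert>
#include <cstdint>

// hypothetical 96-bit key: a 32-bit high word plus a 64-bit low word
struct Key96 {
	uint32_t n1; // high 32 bits
	uint64_t n0; // low 64 bits
};

// illustrative packing only (NOT Titledb's real layout): the docid is split
// across n1 and the top 16 bits of n0, the 48-bit url hash fills the rest of n0
static Key96 makeKey ( int64_t docId , int64_t uh48 ) {
	Key96 k;
	k.n1 = (uint32_t)(docId >> 16);
	k.n0 = ((uint64_t)(docId & 0xffff) << 48) |
	       ((uint64_t)uh48 & 0x0000ffffffffffffULL);
	return k;
}

static int64_t getDocId ( const Key96 *k ) {
	return ((int64_t)k->n1 << 16) | (int64_t)(k->n0 >> 48);
}

static int64_t getUrlHash48 ( const Key96 *k ) {
	return (int64_t)(k->n0 & 0x0000ffffffffffffULL);
}

int main ( ) {
	// same round trip as the sanity tests above, with assert() in place
	// of g_process.shutdownAbort()
	int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
	int64_t docId = 123456789;
	Key96 k = makeKey ( docId , uh48 );
	assert ( getDocId     ( &k ) == docId );
	assert ( getUrlHash48 ( &k ) == uh48  );
	return 0;
}
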
Code Example #2
// init our rdb
bool Titledb::init ( ) {

	// key sanity tests
	int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
	int64_t docId = 123456789;
	key_t k = makeKey(docId,uh48,false);
	if ( getDocId(&k) != docId ) { char *xx=NULL;*xx=0;}
	if ( getUrlHash48(&k) != uh48 ) { char *xx=NULL;*xx=0;}

	char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
	Url uu;
	uu.set(url);
	char *d1 = uu.getDomain();
	int32_t  dlen1 = uu.getDomainLen();
	int32_t dlen2 = 0;
	char *d2 = getDomFast ( url , &dlen2 );
	if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }
	// another one
	url = "http://ok/";
	uu.set(url);
	d1 = uu.getDomain();
	dlen1 = uu.getDomainLen();
	dlen2 = 0;
	d2 = getDomFast ( url , &dlen2 );
	if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }


	int64_t maxMem = 200000000; // 200MB

	// . what's max # of tree nodes?
	// . assume avg TitleRec size (compressed html doc) is about 1k we get:
	// . NOTE: overhead is about 32 bytes per node
	int32_t maxTreeNodes  = maxMem  / (1*1024);

	// . we now use a disk page cache for titledb as opposed to the
	//   old rec cache. i am trying to do away with the Rdb::m_cache rec
	//   cache in favor of cleverly used disk page caches, because
	//   the rec caches are not real-time and get stale.
	// . just hard-code 30MB for now
	int32_t pcmem    = 30000000; // = g_conf.m_titledbMaxDiskPageCacheMem;
	// f**k that we need all the mem!
	//pcmem = 0;
	// do not use any page cache if doing tmp cluster in order to
	// prevent swapping
	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
	int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
	// init the page cache
	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
	if ( ! m_pc.init ( "titledb",
			   RDB_TITLEDB,
			   pcmem    ,
			   pageSize ) )
		return log("db: Titledb init failed.");

	// each entry in the cache is usually just a single record, no lists
	//int32_t maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);
	// initialize our own internal rdb
	if ( ! m_rdb.init ( g_hostdb.m_dir              ,
			    "titledb"                   ,
			    true                        , // dedup same keys?
			    -1                          , // fixed record size
			    //g_hostdb.m_groupMask          ,
			    //g_hostdb.m_groupId            ,
			    //g_conf.m_titledbMinFilesToMerge , 
			    // this should not really be changed...
			    -1,//3,//230  minfilestomerge mintomerge
			    maxMem, // g_conf.m_titledbMaxTreeMem  ,
			    maxTreeNodes                ,
			    // now we balance so Sync.cpp can order huge lists
			    true                        , // balance tree?
			    // turn off cache for now because the page cache
			    // is just as fast and does not get out of date
			    // so bad??
			    //0                         ,
			    0,//g_conf.m_titledbMaxCacheMem ,
			    0,//maxCacheNodes               ,
			    false                       ,// half keys?
			    false                       ,// g_conf.m_titledbSav
			    &m_pc               , // page cache ptr
			    true                        ) )// is titledb?
		return false;
	return true;
	// validate
	//return verify ( );
}
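
The tree sizing in both versions of init() is simple arithmetic: divide the memory budget by an assumed ~1 KB average compressed TitleRec. A quick standalone check of that estimate, also factoring in the ~32 bytes of per-node overhead mentioned in the comment (the numbers are the ones hard-coded above):

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main ( ) {
	int64_t maxMem       = 200000000; // 200MB tree budget, as hard-coded above
	int32_t avgRecSize   = 1*1024;    // assumed avg compressed TitleRec size
	int32_t nodeOverhead = 32;        // per-node overhead noted in the comment
	// the estimate init() actually uses
	int32_t maxTreeNodes = (int32_t)(maxMem / avgRecSize);
	// what it would be if the node overhead were counted too
	int32_t withOverhead = (int32_t)(maxMem / (avgRecSize + nodeOverhead));
	printf ( "maxTreeNodes ignoring overhead: %" PRId32 "\n" , maxTreeNodes );
	printf ( "maxTreeNodes with 32B overhead: %" PRId32 "\n" , withOverhead );
	return 0;
}
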
Code Example #3
void Blaster::gotDoc2 ( void *state, TcpSocket *s){
	StateBD *st=(StateBD *)state;
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blaster: Lost the Request in gotDoc2");
		m_launched--;
		//No need to point p2
		// Free stateBD
		freeStateBD(st);
		return;
	}
	
	// . don't let TcpServer free m_buf when socket is recycled/closed
	// . we own it now and are responsible for freeing it
	//	s->m_readBuf = NULL;

	long long now = gettimeofdayInMilliseconds();
	// So now after getting both docIds, get their contents
	char *reply1 = st->m_buf1 ;
	long  size1  = st->m_buf1Len;
	HttpMime mime1;
	mime1.set ( reply1 , size1 , NULL );
	char *content1    = reply1 + mime1.getMimeLen();
	long  content1Len = size1  - mime1.getMimeLen();
	unsigned long h = hash32 ( content1 , content1Len );
	// log msg
	if ( g_errno ) 
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     st->m_u2   , 
		     mstrerror(g_errno)   );
	else
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) "
		     "(hash=%lx) %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     h ,
		     st->m_u2       );


	if (m_verbose){
		log(LOG_WARN,"blaster: content1len=%li, Content1 is =%s",
		    content1Len,content1);
		log(LOG_WARN,"\n");
	}
	char *reply2 = s->m_readBuf ;
	long  size2  = s->m_readOffset;
	HttpMime mime2;
	mime2.set ( reply2 , size2 , NULL );
	char *content2    = reply2 + mime2.getMimeLen();
	long  content2Len = size2  - mime2.getMimeLen();
	if (m_verbose)	
		log(LOG_WARN,"blaster: content2len=%li, Content2 is =%s",
		    content2Len,content2);

	// Now that we've got the contents, let's get the url links out
	// of these pages. Passing them to function getSearchLinks should
	// get the first x links out.
	/*	st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3");
	st->m_links2=st->m_links1+100*MAX_URL_LEN;
	st->m_numLinks1=100;
	st->m_numLinks2=100;*/

	/*	long numLinks1=getSearchLinks(content1,content1Len,
				      st->m_links1,st->m_numLinks1);
	long numLinks2=getSearchLinks(content2,content2Len,
	st->m_links2,st->m_numLinks2);*/


	content1[content1Len]='\0';
	//short csEnum1= get_iana_charset(mime1.getCharset(), 
	//				mime1.getCharsetLen());
	/*	if (csEnum1== csUnknown)
		log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
	Xml xml1;
	// assume utf8
	if (!xml1.set(content1, 
		     content1Len,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
	}
	Links links1;
	Url parent; parent.set ( st->m_u1);
	if (!links1.set(false , // useRelNoFollow
			&xml1,
			&parent,//mime1.getLocationUrl(), parent Url
			false, // setLinkHashes
			NULL  , // baseUrl
			TITLEREC_CURRENT_VERSION, // version
			0 , // niceness
			false , // parent is permalink?
			NULL )) { // oldLinks
		log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2");
	}

	content2[content2Len]='\0';
	//short csEnum2= get_iana_charset(mime2.getCharset(), 
	//				mime2.getCharsetLen());
	/*	if (csEnum2== csUnknown)
		log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
	Xml xml2;
	if (!xml2.set(content2, 
		     content2Len,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
	}
	Links links2;
	parent.set(st->m_u2);
	if (!links2.set(0,//siterec xml
			&xml2,
			&parent,//&st->m_u2,//mime2.getLocationUrl(),
			false,
			NULL,
			TITLEREC_CURRENT_VERSION,
			0,
			false,
			NULL)){
		log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2");
	}
	

	// put the hash of the sites into a hashtable, since we have
	// about 100 or so of them
	HashTableT<unsigned long, bool> urlHash;
	// put the urls from doc2 into the hashtable, but first check if
	// they are links to google or gigablast (for now). For msn and
	// yahoo we have to add other checks.
	char domain2[256];
	long dlen = 0;
	char *dom = getDomFast ( st->m_u2 , &dlen );
	if ( dom ) strncpy(domain2,dom,dlen);
	domain2[dlen]='\0';
	for (long i=0;i<links2.getNumLinks();i++){
		// The dots check if exactly google or gigablast are present
		// in the link
		char *ss=links2.getLink(i);
		char *p;
		p=strstr(ss,domain2);
		if(p) continue;
		p=strstr(ss,"google.");
		if(p) continue;
		p=strstr(ss,"cache:");  //googles cache page
		if(p) continue;
		p= strstr(ss,"gigablast.");
		if(p) continue;
		p= strstr(ss,"web.archive.org");//older copies on gigablast
		if(p) continue;
		p= strstr(ss,"search.yahoo.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.msn.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"s.teoma.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.dmoz.org");//from gigablast search
		if(p) continue;
		p= strstr(ss,"www.answers.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"cc.msncache.com");//msn's cache page
		if(p) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: link in Doc2=%s"
			    ,links2.getLink(i));
		unsigned long h=hash32Lower_a(links2.getLink(i),
					    links2.getLinkLen(i));
		//should i check for conflict. no, because it doesn't matter
		urlHash.addKey(h,1);
	}
	// now check if the urls from doc1 are in doc2. save the ones
	// that are not in there for later.
	/*	long numUrlsToCheck=links2.getNumLinks();*/
	long numUrlsNotFound=0;
	/*if (numLinks1<numUrlsToCheck)
	numUrlsToCheck=numLinks1;*/
	char domain1[256];
	dlen = 0;
	dom = getDomFast ( st->m_u1 ,&dlen );
	if ( dom ) strncpy(domain1,dom,dlen);
	domain1[dlen]='\0';
	for (long i=0;i<links1.getNumLinks();i++){
		char *ss=links1.getLink(i);
		char *p;
		p=strstr(ss,domain1);
		if(p) continue;
		p=strstr(ss,"google.");
		if(p) continue;
		p=strstr(ss,"cache:");  //googles cache page
		if(p) continue;
		p= strstr(ss,"gigablast.");
		if(p) continue;
		p= strstr(ss,"web.archive.org");//older copies on gigablast
		if(p) continue;
		p= strstr(ss,"search.yahoo.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.msn.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"s.teoma.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.dmoz.org");//from gigablast search
		if(p) continue;
		p= strstr(ss,"www.answers.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"cc.msncache.com");//msn's cache page
		if(p) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: link in Doc1=%s"
			    ,links1.getLink(i));
		unsigned long h=hash32Lower_a(links1.getLink(i),
					    links1.getLinkLen(i));
		long slot= urlHash.getSlot(h);		
		if(slot!=-1) continue;

		// if url is not present, get its doc.
		if (m_verbose || m_justDisplay)
			log(LOG_WARN,"blaster: NOT FOUND %s in %s"
			    ,links1.getLink(i),domain2);
		numUrlsNotFound++;
		//Don't do anything else if just have to display the urls
		if (m_justDisplay) continue;
		//now get the doc of these urls
		//initialize
		st->m_numUrlDocsReceived=0;

		StateBD2 *st2;
		try { st2 = new (StateBD2); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("blaster: Failed. "
			    "Could not allocate %li bytes for query. "
			    "Returning HTTP status of 500.",
			    (long)sizeof(StateBD2));
			return;
		}
		mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
		//Point to the big state;
		st2->m_st=st;
		//Msg16 does 6 redirects, so I do 6 too
		st2->m_numRedirects=6;
		//st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
		st2->m_url = links1.getLink(i);
		// No need for a proxy ip here, since we are fetching
		// doc's from different IPs. Faster this way
		bool status = g_httpServer.getDoc ( st2->m_url, // url
						    0,//ip
						    0 ,  // offset
						    -1 ,  // size
						    0 , // ifModifiedSince
						    st2,  // state
						    gotDocWrapper3, // callback
						    60*1000, // timeout
						    0, // proxy ip
						    0, // proxy port
						    30*1024*1024, //maxLen
						    30*1024*1024);//maxOtherLen
		// continue if it blocked
		if ( ! status ) continue;
		// If not blocked, there is an error.
		st->m_numUrlDocsReceived++;
	}
	st->m_numUrlDocsSent=numUrlsNotFound;

	// There might have been an error while sending the docs, so check
	// for that here
	if ( st->m_numUrlDocsReceived > 0 && 
	     st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){
		log(LOG_WARN,"blaster: %li docs could not be sent due to "
		    "error",st->m_numUrlDocsReceived);
		m_launched--;
		freeStateBD(st);
		return;
	}
		
	if (numUrlsNotFound==0){
		//job done for this pair
		log(LOG_WARN,"blaster: All urls from %s found in "
		    "%s",domain1,domain2);
		m_launched--;
		// Free stateBD
		freeStateBD(st);
		return;
	}
	log(LOG_WARN,"blaster: %li urls from %s Not found in %s",
	    numUrlsNotFound,domain1,domain2);
	if(m_justDisplay){
		m_launched--;
		// Free stateBD
		freeStateBD(st);
	}
	return;
}
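
What gotDoc2() does, once the filters are stripped away, is a set difference over link hashes: every link in doc2 goes into a hash table, then every doc1 link whose hash is absent gets reported. Here is a minimal sketch of that technique using standard containers; std::hash stands in for hash32Lower_a() (so, unlike the original, it is not case-insensitive), and the domain / search-engine skip list is omitted.

#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

// report the links present in links1 (doc1) but missing from links2 (doc2)
static std::vector<std::string> linksNotFound (
	const std::vector<std::string> &links1 ,
	const std::vector<std::string> &links2 ) {
	std::hash<std::string> h;          // stand-in for hash32Lower_a()
	std::unordered_set<size_t> urlHash;
	for ( const std::string &u : links2 )
		urlHash.insert ( h(u) );   // index every doc2 link
	std::vector<std::string> missing;
	for ( const std::string &u : links1 )
		if ( urlHash.find ( h(u) ) == urlHash.end() )
			missing.push_back ( u ); // doc1 link not in doc2
	return missing;
}

int main ( ) {
	std::vector<std::string> doc1 = { "http://a.com/1" , "http://a.com/2" };
	std::vector<std::string> doc2 = { "http://a.com/2" };
	for ( const std::string &u : linksNotFound ( doc1 , doc2 ) )
		printf ( "NOT FOUND %s\n" , u.c_str() );
	return 0;
}
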
Code Example #4
void handleRequest22 ( UdpSlot *slot , long netnice ) {
	// shortcut
	UdpServer *us = &g_udpServer;
	// get the request
	Msg22Request *r = (Msg22Request *)slot->m_readBuf;
       // get this
	//char *coll = g_collectiondb.getCollName ( r->m_collnum );

	// sanity check
	long  requestSize = slot->m_readBufSize;
	if ( requestSize < r->getMinSize() ) {
		log("db: Got bad request size of %li bytes for title record. "
		    "Need at least 28.",  requestSize );
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase; 
	if ( ! (tbase=getRdbBase(RDB_TITLEDB,r->m_collnum) ) ) {
		log("db: Could not get title rec in collection # %li "
		    "because rdbbase is null.",
		    (long)r->m_collnum);
		g_errno = EBADENGINEER;
		us->sendErrorReply ( slot , g_errno ); 
		return; 
	}

	// overwrite what is in there so niceness conversion algo works
	r->m_niceness = netnice;

	// if just checking tfndb, do not do the cache lookup in clusterdb
	if ( r->m_justCheckTfndb ) r->m_maxCacheAge = 0;

	// keep track of stats
	//if    (r->m_justCheckTfndb)
	//       g_tfndb.getRdb()->readRequestGet(requestSize);
	//      else    
       g_titledb.getRdb()->readRequestGet  (requestSize);

       // breathe
       QUICKPOLL ( r->m_niceness);

       // sanity check
       if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }


       // make the state now
       State22 *st ;
       try { st = new (State22); }
       catch ( ... ) {
	       g_errno = ENOMEM;
	       log("query: Msg22: new(%i): %s", sizeof(State22),
		   mstrerror(g_errno));
	       us->sendErrorReply ( slot , g_errno );
	       return;
       }
       mnew ( st , sizeof(State22) , "Msg22" );
       
       // store ptr to the msg22request
       st->m_r = r;
       // save for sending back reply
       st->m_slot = slot;

       // then tell slot not to free it since m_r references it!
       // so we'll have to free it when we destroy State22
       st->m_slotAllocSize = slot->m_readBufMaxSize;
       st->m_slotReadBuf   = slot->m_readBuf;
       slot->m_readBuf = NULL;

       // . make the keys for getting recs from tfndb
       // . url recs map docid to the title file # that contains the titleRec
       //key_t uk1 ;
       //key_t uk2 ;
       // . if docId was explicitly specified...
       // . we may get multiple tfndb recs
       if ( ! r->m_url[0] ) {
	       // there are no del bits in tfndb
	       //uk1 = g_tfndb.makeMinKey ( r->m_docId );
	       //uk2 = g_tfndb.makeMaxKey ( r->m_docId );
	       st->m_docId1 = r->m_docId;
	       st->m_docId2 = r->m_docId;
       }

       // but if we are requesting an available docid, it might be taken
       // so try the range
       if ( r->m_getAvailDocIdOnly ) {
	       long long pd = r->m_docId;
	       long long d1 = g_titledb.getFirstProbableDocId ( pd );
	       long long d2 = g_titledb.getLastProbableDocId  ( pd );
	       // sanity - bad url with bad subdomain?
	       if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	       // make sure we get a decent sample in titledb then in 
	       // case the docid we wanted is not available
	       st->m_docId1 = d1;
	       st->m_docId2 = d2;
       }

       // . otherwise, url was given, like from Msg15
       // . we may get multiple tfndb recs
       if ( r->m_url[0] ) {
	       long  dlen = 0;
	       // this causes ip based urls to be inconsistent with the call
	       // to getProbableDocId(url) below
	       char *dom  = getDomFast ( r->m_url , &dlen );
	       // bogus url?
	       if ( ! dom ) {
		       log("msg22: got bad url in request: %s",r->m_url);
		       g_errno = EBADURL;
		       us->sendErrorReply ( slot , g_errno ); 
		       mdelete ( st , sizeof(State22) , "Msg22" );
		       delete ( st );
		       return; 
	       }
	       long long pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
	       long long d1 = g_titledb.getFirstProbableDocId ( pd );
	       long long d2 = g_titledb.getLastProbableDocId  ( pd );
	       // sanity - bad url with bad subdomain?
	       if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	       // there are no del bits in tfndb
	       //uk1 = g_tfndb.makeMinKey ( d1 );
	       //uk2 = g_tfndb.makeMaxKey ( d2 );
	       // store these
	       st->m_pd     = pd;
	       st->m_docId1 = d1;
	       st->m_docId2 = d2;
	       st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
       }
       
       QUICKPOLL ( r->m_niceness );
       
       /*
       // shortcut
       Rdb *tdb = g_titledb.getRdb();

       // init this
       st->m_tfn2 = -1;
       // skip tfndb lookup if we can. saves some time.
       if ( g_conf.m_readOnlyMode && 
	    // must not be a *url* lookup, it must be a docid lookup
	    ! r->m_url[0] &&
	    // tree must be empty too i guess
	    tdb->getTree()->getNumUsedNodes() ==0 ) {
	       // the RdbBase contains the BigFiles for tfndb
	       RdbBase *base = tdb->m_bases[r->m_collnum];
	       // can only have one titledb file
	       if ( base->getNumFiles() == 1 ) {
		       // now we can get RdbBase
		       st->m_tfn2 = base->m_fileIds2[0];
		       // sanity check
		       if ( st->m_tfn2 < 0 ) { char *xx = NULL; *xx = 0; }
	       }
       }

       // check the tree for this docid
       RdbTree *tt = tdb->getTree();
       // make titledb keys
       key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
       key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );
       long  n        = tt->getNextNode ( r->m_collnum , startKey );
       
       // there should only be one match, one titlerec per docid!
       for ( ; n >= 0 ; n = tt->getNextNode ( n ) ) {
	       // break if collnum does not match. we exceeded our tree range.
	       if ( tt->getCollnum ( n ) != r->m_collnum ) break;
	       // get the key of this node
	       key_t k = *(key_t *)tt->getKey(n);
	       // if passed limit, break out, no match
	       if ( k > endKey ) break;
	       // if we had a url make sure uh48 matches
	       if ( r->m_url[0] ) {
		       // get it
		       long long uh48 = g_titledb.getUrlHash48(&k);
		       // sanity check
		       if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
		       // we must match this exactly
		       if ( uh48 != st->m_uh48 ) continue;
	       }
	       // . if we matched a negative key, then skip
	       // . just break out here and enter the normal logic
	       // . it should load tfndb and find that it is not in tfndb
	       //   because when you add a negative key to titledb in
	       //   Rdb::addList, it adds a negative rec to tfndb immediately
	       // . NO! because we add the negative key to the tree when we
	       //   delete the old titledb rec, then we add the new one!
	       //   when a negative key is added Rdb::addRecord() removes
	       //   the positive key (and vice versa) from the tree.
	       if ( KEYNEG((char *)&k) ) continue;
	       // if just checking for its existence, we are done
	       if ( r->m_justCheckTfndb ) {
		       us->sendReply_ass ( NULL,0,NULL,0,slot);
		       // don't forget to free the state
		       mdelete ( st , sizeof(State22) , "Msg22" );
		       delete ( st );
		       return;
	       }
	       // ok, we got a match, return it
	       char *data     = tt->getData     ( n );
	       long  dataSize = tt->getDataSize ( n );
	       // weird!
	       if ( dataSize == 0 ) { char *xx=NULL;*xx=0; }
	       // send the whole rec back
	       long need = 12 + 4 + dataSize;
	       // will this copy it? not!
	       char *buf = (char *)mmalloc ( need , "msg22t" );
	       if ( ! buf ) {
		       us->sendErrorReply ( slot , g_errno ); 
		       mdelete ( st , sizeof(State22) , "Msg22" );
		       delete ( st );
		       return; 
	       }
	       // log it
	       if ( g_conf.m_logDebugSpider )
		       logf(LOG_DEBUG,"spider: found %s in titledb tree",
			    r->m_url);
	       // store in the buf for sending
	       char *p = buf;
	       // store key
	       *(key_t *)p = k; p += sizeof(key_t);
	       // then dataSize
	       *(long *)p = dataSize; p += 4;
	       // then the data
	       memcpy ( p , data , dataSize ); p += dataSize;
	       // send off the record
	       us->sendReply_ass (buf, need,buf, need,slot);
	       // don't forget to free the state
	       mdelete ( st , sizeof(State22) , "Msg22" );
	       delete ( st );
	       return;
       }

       // if we did not need to consult tfndb cuz we only have one file
       if ( st->m_tfn2 >= 0 ) {
	       gotUrlListWrapper ( st , NULL , NULL );
	       return;
       }

       // . get the list of url recs for this docid range
       // . this should not block, tfndb SHOULD all be in memory all the time
       // . use 500 million for min recsizes to get all in range
       // . no, using 500MB causes problems for RdbTree::getList, so use
       //   100k. how many recs can there be?
       if ( ! st->m_msg5.getList ( RDB_TFNDB         ,
				   coll              ,
				   &st->m_ulist      ,
				   uk1               , // startKey
				   uk2               , // endKey
				   // use 0x7fffffff precisely because it
				   // will determine exactly how much the
				   // tree list needs to allocate in Msg5.cpp
				   0x7fffffff        , // minRecSizes
				   true              , // includeTree?
				   false             , // addToCache?
				   0                 , // max cache age
				   0                 , // startFileNum
				   -1                , // numFiles (-1 =all)
				   st                ,
				   gotUrlListWrapper ,
				   r->m_niceness     ,
				   true              ))// error correction?
	       return ;
       // we did not block
       gotUrlListWrapper ( st , NULL , NULL );
}

static void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) ;

void gotUrlListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
	// shortcuts
	State22   *st = (State22 *)state;
	UdpServer *us = &g_udpServer;

	// bail on error
	if ( g_errno ) { 
		log("db: Had error getting info from tfndb: %s.",
		    mstrerror(g_errno));
		log("db: uk1.n1=%li n0=%lli uk2.n1=%li n0=%lli "
		    "d1=%lli d2=%lli.",
		    ((key_t *)st->m_msg5.m_startKey)->n1 ,
		    ((key_t *)st->m_msg5.m_startKey)->n0 ,
		    ((key_t *)st->m_msg5.m_endKey)->n1   ,
		    ((key_t *)st->m_msg5.m_endKey)->n0   ,
		    st->m_docId1 ,
		    st->m_docId2 );
		us->sendErrorReply ( st->m_slot , g_errno ); 
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// shortcuts
	RdbList      *ulist = &st->m_ulist;
	Msg22Request *r     = st->m_r;
	char         *coll  = g_collectiondb.getCollName ( r->m_collnum );

	// point to top just in case
	ulist->resetListPtr();

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase = getRdbBase(RDB_TITLEDB,coll);

	// set probable docid
	long long pd = 0LL;
	if ( r->m_url[0] ) {
		pd = g_titledb.getProbableDocId(r->m_url);
		// sanity
		if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; }
	}

	// . these are both meant to be available docids
	// . if ad2 gets exhausted we use ad1
	long long ad1 = st->m_docId1;
	long long ad2 = pd;


	long tfn = -1;
	// sanity check. make sure did not load from tfndb if did not need to
	if ( ! ulist->isExhausted() && st->m_tfn2 >= 0 ) {char *xx=NULL;*xx=0;}
	// if only one titledb file and none in memory use it
	if ( st->m_tfn2 >= 0 ) tfn = st->m_tfn2;

	// we may have multiple tfndb recs but we should NEVER have to read
	// multiple titledb files...
	for ( ; ! ulist->isExhausted() ; ulist->skipCurrentRecord() ) {
		// breathe
		QUICKPOLL ( r->m_niceness );
		// get first rec
		key_t k = ulist->getCurrentKey();
		// . skip negative keys
		// . seems to happen when we have tfndb in the tree...
		if ( KEYNEG((char *)&k) ) continue;

		// if we have a url and no docid, we gotta check uh48!
		if ( r->m_url[0] && g_tfndb.getUrlHash48(&k)!=st->m_uh48){
			// get docid of that guy
			long long dd = g_tfndb.getDocId(&k);
			// if matches avail docid, inc it
			if ( dd == ad1 ) ad1++;
			if ( dd == ad2 ) ad2++;
			// try next tfndb key
			continue;
		}
		// . get file num this rec is stored in
		// . this is updated right after the file num is merged by
		//   scanning all records in tfndb. this is very quick if all
		//   of tfndb is in memory, otherwise, it might take a few
		//   seconds. update call done in RdbMerge::incorporateMerge().
		tfn = g_tfndb.getTfn ( &k );
		// i guess we got a good match!
		break;
	}

	// sanity check. 255 used to mean in spiderdb or in tree
	if ( tfn >= 255 ) { char *xx=NULL;*xx=0; }


	// maybe no available docid if we breached our range
	if ( ad1 >= pd           ) ad1 = 0LL;
	if ( ad2 >  st->m_docId2 ) ad2 = 0LL;
	// get best
	long long ad = ad2;
	// but wrap around if we need to
	if ( ad == 0LL ) ad = ad1;

	// breathe
	QUICKPOLL ( r->m_niceness);

	// . log if different
	// . if our url rec was in there, this could still be different
	//   if there was another url rec in there with the same docid and
	//   a different extension, but with a tfn of 255, meaning that it
	//   is just in spiderdb and not in titledb yet. so it hasn't been
	//   assigned a permanent docid...
	// . another way "ad" may be different now is from the old bug which
	//   did not chain the docid properly because it limited the docid
	//   chaining to one titleRec file. so conceivably we can have 
	//   different docs sharing the same docids, but with different 
	//   url hash extensions. for instance, on host #9 we have:
	//   00f3b2ff63aec3a9 docId=261670033643 e=0x58 tfn=117 clean=0 half=0 
	//   00f3b2ff63af66c9 docId=261670033643 e=0x6c tfn=217 clean=0 half=0 
	// . Msg16 will only use the avail docid if the titleRec is not found
	if ( r->m_url[0] && pd != ad ) {
		//log(LOG_INFO,"build: Docid %lli collided. %s Changing "
		//
		// http://www.airliegardens.org/events.asp?dt=2&date=8/5/2011
		//
		// COLLIDES WITH
		//
		// http://www.bbonline.com/i/chicago.html
		//
		// collision alert!
		log("spider: Docid %lli collided. %s Changing "
		    "to %lli.", r->m_docId , r->m_url , ad );
		// debug this for now
		//char *xx=NULL;*xx=0; 
	}

	// remember it
	st->m_availDocId = ad;

	// if tfn is -1 then it was not in titledb
	if ( tfn == -1 ) { 
		// store docid in reply
		char *p = st->m_slot->m_tmpBuf;
		// send back the available docid
		*(long long *)p = ad;
		// send it
		us->sendReply_ass ( p , 8 , p , 8 , st->m_slot );
		// don't forget to free state
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// sanity
	if ( tfn < 0 ) { char *xx=NULL;*xx=0; }

	// breathe
	QUICKPOLL ( r->m_niceness );

	// ok, if just "checking tfndb" no need to go further
	if ( r->m_justCheckTfndb ) {
		// send back a good reply (empty means found!)
		us->sendReply_ass ( NULL,0,NULL,0,st->m_slot);
		// don't forget to free the state
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// . compute the file scan range
	// . tfn is now equivalent to Rdb's id2, a secondary file id, it
	//   follows the hyphen in "titledb0001-023.dat"
	// . default to just scan the root file AND the tree, cuz we're
	//   assuming restrictToRoot was set to true so we did not get a tfndb
	//   list
	// . even if a file number is given, always check the tree in case
	//   it got re-spidered
	// . shit, but we can still miss it if it gets dumped right after
	//   our thread is spawned, in which case we'd fall back to the old
	//   version. no. because if its in the tree now we get it before
	//   spawning a thread. there is no blocking. TRICKY. so if it is in 
	//   the tree at this point we'll get it, but may end up scanning the
	//   file with the older version of the doc... not too bad.
	long startFileNum = tbase->getFileNumFromId2 ( tfn );

	// if tfn refers to a missing titledb file...
	if ( startFileNum < 0 ) {
		if ( r->m_url[0] ) log("db: titledb missing url %s",r->m_url);
		else        log("db: titledb missing docid %lli", r->m_docId);
		us->sendErrorReply ( st->m_slot,ENOTFOUND );
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st ); 
		return ;
	}

	// save this
	st->m_tfn = tfn;
	*/
	// make the cacheKey ourself, since Msg5 would make the key wrong
	// since it would base it on startFileNum and numFiles
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

	// . load the list of title recs from disk now
	// . our file range should be solid
	// . use 500 million for min recsizes to get all in range
	if ( ! st->m_msg5.getList ( RDB_TITLEDB       ,
				    r->m_collnum ,
				    &st->m_tlist      ,
				    startKey          , // startKey
				    endKey            , // endKey
				    500000000         , // minRecSizes
				    true              , // includeTree
				    false,//r->m_addToCache   , // addToCache?
				    0,//r->m_maxCacheAge  , // max cache age
				    0,//startFileNum      ,
				    -1                 , // numFiles
				    st , // state             ,
				    gotTitleList      ,
				    r->m_niceness     ,
				    true              , // do error correct?
				    &cacheKey         ,
				    0                 , // retry num
				    -1                , // maxRetries
				    true              , // compensate for merge
				    -1LL              , // sync point
				    &st->m_msg5b      ) ) return ;

	// we did not block, nice... in cache?
	gotTitleList ( st , NULL , NULL );
}
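
The (commented-out) tree-hit path above replies with a flat buffer laid out as <12-byte key><4-byte dataSize><data>. Below is a small sketch of packing that layout; packTitleReply() is an invented helper, and the 12-byte key is passed as raw bytes rather than the real key_t.

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// pack <key><dataSize><data> into one malloc'd buffer, as the tree-hit path
// above does before sendReply_ass(); the caller owns and frees the buffer
static char *packTitleReply ( const char *key12 , const char *data ,
			      int32_t dataSize , int32_t *needOut ) {
	int32_t need = 12 + 4 + dataSize;
	char *buf = (char *)malloc ( need );
	if ( ! buf ) return NULL;
	char *p = buf;
	memcpy ( p , key12 , 12 );          p += 12;       // store key
	memcpy ( p , &dataSize , 4 );       p += 4;        // then dataSize
	memcpy ( p , data , dataSize );     p += dataSize; // then the data
	*needOut = need;
	return buf;
}

int main ( ) {
	char key[12] = {0};
	const char *rec = "compressed titlerec bytes";
	int32_t need = 0;
	char *buf = packTitleReply ( key , rec , (int32_t)strlen(rec) , &need );
	if ( buf ) { printf ( "packed %" PRId32 " bytes\n" , need ); free ( buf ); }
	return 0;
}
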
Code Example #5
void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
	// shortcut
	UdpServer *us = &g_udpServer;
	// get the request
	Msg22Request *r = (Msg22Request *)slot->m_readBuf;

	// sanity check
	int32_t  requestSize = slot->m_readBufSize;
	if ( requestSize < r->getMinSize() ) {
		log("db: Got bad request size of %" PRId32" bytes for title record. "
		    "Need at least 28.",  requestSize );
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase = getRdbBase( RDB_TITLEDB, r->m_collnum );
	if ( ! tbase ) {
		log("db: Could not get title rec in collection # %" PRId32" because rdbbase is null.", (int32_t)r->m_collnum);
		g_errno = EBADENGINEER;
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno ); 
		return; 
	}

	// overwrite what is in there so niceness conversion algo works
	r->m_niceness = netnice;

	// if just checking tfndb, do not do the cache lookup in clusterdb
	if ( r->m_justCheckTfndb ) {
		r->m_maxCacheAge = 0;
	}

	g_titledb.getRdb()->readRequestGet  (requestSize);

	// breathe
	QUICKPOLL ( r->m_niceness);

	// sanity check
	if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }


	// make the state now
	State22 *st ;
	try { st = new (State22); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("query: Msg22: new(%" PRId32"): %s", (int32_t)sizeof(State22),
		mstrerror(g_errno));
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}
	mnew ( st , sizeof(State22) , "Msg22" );

	// store ptr to the msg22request
	st->m_r = r;
	// save for sending back reply
	st->m_slot = slot;

	// then tell slot not to free it since m_r references it!
	// so we'll have to free it when we destroy State22
	st->m_slotAllocSize = slot->m_readBufMaxSize;
	st->m_slotReadBuf   = slot->m_readBuf;
	slot->m_readBuf = NULL;

	// . if docId was explicitly specified...
	// . we may get multiple tfndb recs
	if ( ! r->m_url[0] ) {
	   st->m_docId1 = r->m_docId;
	   st->m_docId2 = r->m_docId;
	}

	// but if we are requesting an available docid, it might be taken
	// so try the range
	if ( r->m_getAvailDocIdOnly ) {
	   int64_t pd = r->m_docId;
	   int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
	   int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
	   // sanity - bad url with bad subdomain?
	   if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	   // make sure we get a decent sample in titledb then in
	   // case the docid we wanted is not available
	   st->m_docId1 = d1;
	   st->m_docId2 = d2;
	}

	// . otherwise, url was given, like from Msg15
	// . we may get multiple tfndb recs
	if ( r->m_url[0] ) {
	   int32_t  dlen = 0;
	   // this causes ip based urls to be inconsistent with the call
	   // to getProbableDocId(url) below
	   char *dom  = getDomFast ( r->m_url , &dlen );
	   // bogus url?
	   if ( ! dom ) {
	       log("msg22: got bad url in request: %s from "
		   "hostid %" PRId32" for msg22 call ",
		   r->m_url,slot->m_host->m_hostId);
	       g_errno = EBADURL;
	       log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
	       us->sendErrorReply ( slot , g_errno );
	       mdelete ( st , sizeof(State22) , "Msg22" );
	       delete ( st );
	       return;
	   }
	   int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
	   int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
	   int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
	   // sanity - bad url with bad subdomain?
	   if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	   // store these
	   st->m_pd     = pd;
	   st->m_docId1 = d1;
	   st->m_docId2 = d2;
	   st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
	}

	QUICKPOLL ( r->m_niceness );

	// make the cacheKey ourself, since Msg5 would make the key wrong
	// since it would base it on startFileNum and numFiles
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

	// . load the list of title recs from disk now
	// . our file range should be solid
	// . use 500 million for min recsizes to get all in range
	if ( ! st->m_msg5.getList ( RDB_TITLEDB       ,
				    r->m_collnum ,
				    &st->m_tlist      ,
				    startKey          , // startKey
				    endKey            , // endKey
				    500000000         , // minRecSizes
				    true              , // includeTree
				    false,//r->m_addToCache   , // addToCache?
				    0,//r->m_maxCacheAge  , // max cache age
				    0,//startFileNum      ,
				    -1                 , // numFiles
				    st , // state             ,
				    gotTitleList      ,
				    r->m_niceness     ,
				    true              , // do error correct?
				    &cacheKey         ,
				    0                 , // retry num
				    -1                , // maxRetries
				    true              , // compensate for merge
				    -1LL              ) ) // sync point
		return ;

	// we did not block, nice... in cache?
	gotTitleList ( st , NULL , NULL );
}
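
Both versions of handleRequest22() widen a url's probable docid into the [getFirstProbableDocId(), getLastProbableDocId()] bracket so that, if that docid is already owned by a different url, a neighboring docid in the same bracket can be handed back instead (the "available docid" in the older code's comments). A hedged sketch of that probing idea follows; pickAvailDocId() and its isTaken() predicate are invented stand-ins for the titledb/tfndb scan.

#include <cstdint>
#include <cstdio>
#include <functional>

// . pick a free docid inside [d1,d2], preferring the probable docid pd
// . isTaken() is a hypothetical stand-in for "a different url already owns
//   this docid in titledb"
// . returns 0 if the whole bracket is exhausted
static int64_t pickAvailDocId ( int64_t pd , int64_t d1 , int64_t d2 ,
				const std::function<bool(int64_t)> &isTaken ) {
	// walk forward from the preferred docid first
	for ( int64_t d = pd ; d <= d2 ; d++ )
		if ( ! isTaken ( d ) ) return d;
	// then wrap around to the bottom of the bracket
	for ( int64_t d = d1 ; d < pd ; d++ )
		if ( ! isTaken ( d ) ) return d;
	return 0LL;
}

int main ( ) {
	// toy collision: docids 1000-1003 are taken, bracket is [995,1010]
	auto taken = [] ( int64_t d ) { return d >= 1000 && d <= 1003; };
	printf ( "avail docid = %lld\n" ,
		 (long long)pickAvailDocId ( 1000 , 995 , 1010 , taken ) );
	return 0;
}
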
Code Example #6
// . returns true if all done!
// . returns false if still doing stuff
bool Test::injectLoop ( ) {

	long  dlen   ;
	char *dom    ;
	long  fakeIp ;

 loop:
	// advance to next url
	for ( ; m_urlPtr < m_urlEnd && ! *m_urlPtr ; m_urlPtr++ ) ;
	// all done?
	if ( m_urlPtr >= m_urlEnd ) {
		// flush em out
		if ( ! flushMsg4Buffers ( this , injectedWrapper ) ) 
			return false;
		// note it
		m_isAdding = false;
		// all done
		return true;
	}
	// error means all done
	if ( m_errno ) { m_isAdding = false; return true; }
	// point to it
	char *u = m_urlPtr;
	// advance to point to the next url for the next loop!
	for ( ; m_urlPtr < m_urlEnd && *m_urlPtr ; m_urlPtr++ ) ;

	// hash it
	long long h = hash64b ( u );
	// dedup it lest we freeze up and stopIt() never gets called because
	// m_urlsAdded is never decremented all the way to zero in Spider.cpp
	if ( m_dt.isInTable ( &h ) ) goto loop;
	// add it. return true with g_errno set on error
	if ( ! m_dt.addKey ( &h ) ) goto hadError;

	// make the SpiderRequest from it
	m_sreq.reset();
	// url
	strcpy ( m_sreq.m_url , u );
	// get domain of url
	dom = getDomFast ( m_sreq.m_url , &dlen );
	// make a fake ip
	fakeIp = 0x123456;
	// use domain if we got that
	if ( dom && dlen ) fakeIp = hash32 ( dom , dlen );
	// first ip is fake
	m_sreq.m_firstIp = fakeIp; // 0x123456;
	// these too
	m_sreq.m_domHash32  = fakeIp;
	m_sreq.m_hostHash32 = fakeIp;
	m_sreq.m_siteHash32 = fakeIp;
	m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
	// this crap is fake
	m_sreq.m_isInjecting = 1;
	// use test-spider subdir for storing pages and spider times?
	if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
	// use this later
	m_sreq.m_hasContent = 0;
	// injected requests use this as the spider time i guess
	// so we can sort them by this
	m_sreq.m_addedTime = ++s_count;

	// no, because to compute XmlDoc::m_min/maxPubDate we need this to
	// be valid for our test run.. no no we will fix it to be
	// basically 2 days before spider time in the code...
	//m_sreq.m_addedTime = spiderTime;

	m_sreq.m_fakeFirstIp = 1;

	// make the key (parentDocId=0)
	m_sreq.setKey ( fakeIp, 0LL , false );
	// test it
	if ( g_spiderdb.getFirstIp(&m_sreq.m_key) != fakeIp ) {
		char *xx=NULL;*xx=0;}
	// sanity check. check for http(s)://
	if ( m_sreq.m_url[0] != 'h' ) { char *xx=NULL;*xx=0; }

	// reset this
	g_errno = 0;

	// count it
	m_urlsAdded++;

	// note it
	//log("crazyout: %s",m_sreq.m_url );
	logf(LOG_DEBUG,"spider: injecting test url %s",m_sreq.m_url);

	// the receiving end will realize that we are injecting into the test
	// collection and use the "/test/" subdir to load the file
	// "ips.txt" to do our ip lookups, and search for any downloads in
	// that subdirectory as well.
	if ( ! m_msg4.addMetaList ( (char *)&m_sreq     ,
				    m_sreq.getRecSize() ,
				    m_coll              ,
				    NULL                ,
				    injectedWrapper     ,
				    MAX_NICENESS        ,
				    RDB_SPIDERDB        ) )
		// return false if blocked
		return false;
	// error?
	if ( g_errno ) {
		// jump down here from above on error
	hadError:
		// save it
		m_errno = g_errno;
		// flag it
		m_isAdding = false;
		// note it
		log("test: inject had error: %s",mstrerror(g_errno));
		// stop, we are all done!
		return true;
	}
	// add the next spider request
	goto loop;
}
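
injectLoop() never resolves DNS; it fabricates m_firstIp (and the domain/host/site hashes) from a 32-bit hash of the url's domain, so every injected url on the same domain shards and throttles together. Below is a small sketch of that idea, with an FNV-1a hash standing in for hash32() and the same 0x123456 fallback used above.

#include <cstdint>
#include <cstdio>
#include <cstring>

// stand-in for hash32(): 32-bit FNV-1a over the domain bytes
static uint32_t hashDomain ( const char *dom , int32_t dlen ) {
	uint32_t h = 2166136261u;
	for ( int32_t i = 0 ; i < dlen ; i++ ) {
		h ^= (unsigned char)dom[i];
		h *= 16777619u;
	}
	return h;
}

int main ( ) {
	const char *dom = "example.com";
	int32_t dlen = (int32_t)strlen ( dom );
	// fall back to the constant used above when no domain was extracted
	uint32_t fakeIp = 0x123456;
	if ( dom && dlen ) fakeIp = hashDomain ( dom , dlen );
	// every url on this domain now maps to the same fake first ip
	printf ( "fake first ip for %s = 0x%08x\n" , dom , fakeIp );
	return 0;
}
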
Code Example #7
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
	// . get fields from cgi field of the requested url
	// . get the search query
	long  urlLen = 0;
	char *url = r->getString ( "u" , &urlLen , NULL /*default*/);

	// see if they provided a url of a file of urls if they did not
	// provide a url to add directly
	//bool isAdmin = g_collectiondb.isAdmin ( r , s );
	bool isAdmin = r->getIsLocal();
	long  ufuLen = 0;
	char *ufu = NULL;
	if ( isAdmin )
		// get the url of a file of urls (ufu)
		ufu = r->getString ( "ufu" , &ufuLen , NULL );

	// can't be too long, that's obnoxious
	if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
		g_errno = EBUFTOOSMALL;
		g_msg = " (error: url too long)";
		return g_httpServer.sendErrorReply(s,500,"url too long");
	}
	// get the collection
	long  collLen = 0;
	char *coll    = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll    = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// get collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	// bitch if no collection rec found
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		g_msg = " (error: no collection)";
		return g_httpServer.sendErrorReply(s,500,"no coll rec");
	}
	// . make sure the ip is not banned
	// . we may also have an exclusive list of IPs for private collections
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// make a new state
	State1 *st1 ;
	try { st1 = new (State1); }
	catch ( ... ) { 
		g_errno = ENOMEM;
		log("PageAddUrl: new(%i): %s", 
		    (int)sizeof(State1),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
	mnew ( st1 , sizeof(State1) , "PageAddUrl" );
	// save socket and isAdmin
	st1->m_socket  = s;
	st1->m_isAdmin = isAdmin;

	// assume no url buf yet, set below
	//st1->m_ubuf      = NULL;
	//st1->m_ubufAlloc = NULL;
	//st1->m_metaList  = NULL;

	// save the url
	st1->m_url[0] = '\0';
	if ( url ) {
		// normalize and add www. if it needs it
		Url uu;
		uu.set ( url , gbstrlen(url) , true );
		// remove >'s i guess and store in st1->m_url[] buffer
		st1->m_urlLen=cleanInput ( st1->m_url,
					   MAX_URL_LEN, 
					   uu.getUrl(),
					   uu.getUrlLen() );
		// point to that as the url "buf" to add
		//st1->m_ubuf      = st1->m_url;
		//st1->m_ubufSize  = urlLen;
		//st1->m_ubufAlloc = NULL; // do not free it!
	}

	// save the "ufu" (url of file of urls)
	st1->m_ufu[0] = '\0';
	st1->m_ufuLen  = ufuLen;
	memcpy ( st1->m_ufu , ufu , ufuLen );
	st1->m_ufu[ufuLen] = '\0';

	st1->m_doTuringTest = cr->m_doTuringTest;
	char *username     = g_users.getUsername(r);
	if(username) strcpy(st1->m_username,username);
	//st1->m_user    = g_pages.getUserType ( s , r );
	st1->m_spiderLinks = true;
	st1->m_strip   = true;
	//st1->m_raw = r->getLong("raw",0);

	// init state2
	for ( long i = 0; i < 5; i++ ){
		st1->m_state2[i].m_buf = NULL;
		st1->m_state2[i].m_bufLen = 0;
		st1->m_state2[i].m_bufMaxLen = 0;
	}

	// save the collection name in the State1 class
	if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
	strncpy ( st1->m_coll , coll , collLen );
	st1->m_coll [ collLen ] = '\0';

	// assume they answered turing test correctly
	st1->m_goodAnswer = true;
	// if addurl is turned off, just print "disabled" msg
	if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false );
	// can also be turned off in the collection rec
	if ( ! cr->m_addUrlEnabled    ) return sendReply ( st1 , false );
	// or if in read-only mode
	if (   g_conf.m_readOnlyMode  ) return sendReply ( st1 , false );
	// cannot add if another Msg10 from here is still in progress
	if ( s_inprogress ) return sendReply ( st1 , true );
	// use now as the spiderTime

	// get ip of submitter
	//unsigned long h = ipdom ( s->m_ip );
	// . use top 2 bytes now, some isps have large blocks
	// . if this causes problems, then they can do pay for inclusion
	unsigned long h = iptop ( s->m_ip );
	long codeLen;
	char* code = r->getString("code", &codeLen);
	if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
		long uipLen = 0;
		char* uip = r->getString("uip",&uipLen);
		long hip = 0;
		//use the uip when we have a raw query to test if 
		//we can submit
		if(uip) {
			hip = atoip(uip, uipLen);
			h = iptop( hip );
		}
	}


	st1->m_strip = r->getLong("strip",0);
	// Remember, for cgi, if the box is not checked, then it is not 
	// reported in the request, so set default return value to 0
	long spiderLinks = r->getLong("spiderLinks",-1);
	// also support all lowercase like PageInject.cpp uses
	if ( spiderLinks == -1 )
		spiderLinks = r->getLong("spiderlinks",0);

	// . should we force it into spiderdb even if already in there
	// . use to manually update spider times for a url
	// . however, will not remove old scheduled spider times
	// . mdw: made force on by default
	st1->m_forceRespider = r->getLong("force",1); // 0);

	long now = getTimeGlobal();

	// . allow 1 submit every 1 hour
	// . restrict by submitter domain ip
	if ( ! st1->m_isAdmin &&
	     ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
		// return error page
		g_errno = ETOOEARLY;
		return sendReply ( st1 , true );
	}


	//st1->m_query = r->getString( "qts", &st1->m_queryLen );


	// check it, if turing test is enabled for this collection
	if ( ! st1->m_isAdmin && cr->m_doTuringTest && 
	     ! g_turingTest.isHuman(r) )  {
		// log note so we know it didn't make it
		g_msg = " (error: bad answer)";
		//log("PageAddUrl:: addurl failed for %s : bad answer",
		//    iptoa(s->m_ip));
		st1->m_goodAnswer = false;
		return sendReply ( st1 , true /*addUrl enabled?*/ );
	}

	//if ( st1->m_queryLen > 0 )
	//	return getPages( st1 );

	// if no url given, just print a blank page
	if ( ! url ) return sendReply (  st1 , true );


	//
	// make a SpiderRequest
	//

	SpiderRequest *sreq = &st1->m_sreq;
	// reset it
	sreq->reset();
	// make the probable docid
	long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
	// make one up, like we do in PageReindex.cpp
	long firstIp = (probDocId & 0xffffffff);
	// . now fill it up
	// . TODO: calculate the other values... lazy!!! (m_isRSSExt, 
	//         m_siteNumInlinks,...)
	sreq->m_isNewOutlink = 1;
	sreq->m_isAddUrl     = 1;
	sreq->m_addedTime    = now;
	sreq->m_fakeFirstIp   = 1;
	sreq->m_probDocId     = probDocId;
	sreq->m_firstIp       = firstIp;
	sreq->m_hopCount      = 0;
	// its valid if root
	Url uu; uu.set ( st1->m_url );
	if ( uu.isRoot() ) sreq->m_hopCountValid = true;
	// too big?
	//long len = st1->m_urlLen;
	// the url! includes \0
	strcpy ( sreq->m_url , st1->m_url );
	// call this to set sreq->m_dataSize now
	sreq->setDataSize();
	// make the key dude -- after setting url
	sreq->setKey ( firstIp , 0LL, false );
	// need a fake first ip lest we core!
	//sreq->m_firstIp = (pdocId & 0xffffffff);
	// how to set m_firstIp? i guess addurl can be throttled independently
	// of the other urls???  use the hash of the domain for it!
	long  dlen;
	char *dom = getDomFast ( st1->m_url , &dlen );
	// fake it for this...
	//sreq->m_firstIp = hash32 ( dom , dlen );
	// sanity
	if ( ! dom ) {
		g_errno = EBADURL;
		return sendReply ( st1 , true );
	}
	// shortcut
	Msg4 *m = &st1->m_msg4;
	// now add that to spiderdb using msg4
	if ( ! m->addMetaList ( (char *)sreq    ,
				sreq->getRecSize() ,
				coll            ,
				st1             , // state
				addedStuff      ,
				MAX_NICENESS    ,
				RDB_SPIDERDB    ) )
		// we blocked
		return false;

	// send back the reply
	return sendReply ( st1 , true );
}
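
The add-url throttle above keys its daily quota on iptop(), the top bytes of the submitter's IP, before asking canSubmit() whether the cap (cr->m_maxAddUrlsPerIpDomPerDay) has been hit. Here is a minimal in-memory sketch of that kind of keyed quota; canSubmitSketch() and its map are invented, and the real canSubmit() lives elsewhere and tracks its time windows differently.

#include <cstdint>
#include <cstdio>
#include <unordered_map>

// hypothetical in-memory version of the canSubmit() idea: cap add-url
// submissions per ip block per day
struct IpQuota { int64_t dayStart; int32_t count; };
static std::unordered_map<uint32_t, IpQuota> s_quota;

static bool canSubmitSketch ( uint32_t ipTop , int64_t now , int32_t maxPerDay ) {
	IpQuota &q = s_quota[ipTop];          // zero-initialized on first use
	if ( now - q.dayStart >= 86400 ) {    // start a new one-day window
		q.dayStart = now;
		q.count    = 0;
	}
	if ( q.count >= maxPerDay ) return false; // over the daily cap
	q.count++;
	return true;
}

int main ( ) {
	uint32_t ipTop = 0x0a0b0000; // top bytes of a submitter ip, like iptop()
	int64_t  now   = 1700000000; // seconds, like getTimeGlobal()
	for ( int i = 0 ; i < 4 ; i++ )
		printf ( "submit %d: %s\n" , i ,
			 canSubmitSketch ( ipTop , now , 3 ) ? "ok" : "rejected" );
	return 0;
}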