// returns the number of bytes stored into "buf"
int32_t Msg20Reply::serialize ( char *buf , int32_t bufSize ) {
	// copy the easy stuff
	char *p = buf;
	gbmemcpy ( p , (char *)this , sizeof(Msg20Reply) );
	p += (int32_t)sizeof(Msg20Reply);
	// then store the strings!
	int32_t  *sizePtr = &size_tbuf;
	int32_t  *sizeEnd = &size_note;
	char **strPtr  = &ptr_tbuf;
	//char **strEnd= &ptr_note;
	for ( ; sizePtr <= sizeEnd ;  ) {
		// sometimes the ptr is NULL but size is positive
		// so watch out for that
		if ( *strPtr ) {
			gbmemcpy ( p , *strPtr , *sizePtr );
			// advance our destination ptr
			p += *sizePtr;
		}
		// advance both ptrs to next string
		sizePtr++;
		strPtr++;
	}
	int32_t used = p - buf;
	// sanity check, core on overflow of supplied buffer
	if ( used > bufSize ) { char *xx = NULL; *xx = 0; }
	// return it
	return used;
}
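// A minimal sketch (an assumption, not code from this source) of the
// matching deserialize walk that the layout above implies: the fixed-size
// header is followed by each non-NULL string's bytes in ptr_tbuf..ptr_note
// order, so the ptrs can be re-pointed at the appended data in place.
static void deserializeMsg20ReplySketch ( Msg20Reply *r ) {
	char *p = (char *)r + sizeof(Msg20Reply);
	int32_t  *sizePtr = &r->size_tbuf;
	int32_t  *sizeEnd = &r->size_note;
	char    **strPtr  = &r->ptr_tbuf;
	for ( ; sizePtr <= sizeEnd ; sizePtr++, strPtr++ ) {
		// NULL ptrs had nothing copied for them above, so they
		// consume no bytes in the serialized buffer
		if ( ! *strPtr ) continue;
		*strPtr = p;
		p += *sizePtr;
	}
}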
int32_t Msg2b::serialize ( char *buf, int32_t bufLen ) {
	// make sure we have room
	int32_t storedSize = getStoredSize();
	if (bufLen < storedSize)
		return -1;
	char *p = buf;
	// m_dirId + m_numSubCats + m_catBufferLen
	*(int32_t *)p = m_dirId;        p += sizeof(int32_t);
	*(int32_t *)p = m_numSubCats;   p += sizeof(int32_t);
	*(int32_t *)p = m_catBufferLen; p += sizeof(int32_t);
	// sub cats
	gbmemcpy(p, m_subCats, sizeof(SubCategory)*m_numSubCats);
	p += sizeof(SubCategory)*m_numSubCats;
	// cat buffer
	gbmemcpy(p, m_catBuffer, m_catBufferLen);
	p += m_catBufferLen;
	// sanity check
	if (p - buf != storedSize) {
		log("Msg2b: Bad serialize size, %i != %"INT32", bad engineer.",
		    p - buf, storedSize);
		char *xx = NULL; *xx = 0;
	}
	// return bytes stored
	return storedSize;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . tells ALL hosts update to this coll rec
bool Msg30::update ( CollectionRec *rec      ,
		     bool           deleteIt ,
		     void          *state    , 
		     void         (* callback)(void *state ) ) {
	// bail if this is an interface machine, don't modify the main
	if ( g_conf.m_interfaceMachine ) return true;
	// remember callback parms
	m_state    = state;
	m_callback = callback;
	// quick delete send
	if ( deleteIt ) {
		// include the terminating \0
		m_sendBufSize = gbstrlen ( rec->m_coll ) + 1;
		gbmemcpy ( m_sendBuf , rec->m_coll , m_sendBufSize );
	}
	else {
		// serialize the rec into m_sendBuf
		m_sendBufSize = sizeof(CollectionRec);
		gbmemcpy ( m_sendBuf , rec , sizeof(CollectionRec) );
	}
	// reset some parms
	m_requests = 0;
	m_replies  = 0;
	// send a Msg30 to all hosts so they update it!
	int32_t n = g_hostdb.getNumHosts();
	for ( int32_t i = 0; i < n ; i++ ) {
		// get the ith host
		Host *h = g_hostdb.getHost ( i );
		// not to THIS host, however
		if ( h->m_hostId == g_hostdb.m_hostId ) continue;
		// # requests sent
		m_requests++;
		// . send to him
		// . returns false and sets g_errno on error
		// . NEVER timeout... like Msg10 (add url)
		if ( g_udpServer.sendRequest ( m_sendBuf    ,// request
					       m_sendBufSize,// requestLen
					       0x30         ,// msgType 0x30
					       h->m_ip      ,
					       h->m_port    ,
					       h->m_hostId  ,
					       NULL         , // slotPtr
					       this         , // state data
					       gotReplyWrapper30 ,
					       60*60*24*365 ) ) continue;
		// log it
		log("admin: Error sending collection record update request: "
		    "%s.",mstrerror(g_errno));
		// got it w/o blocking?? how?
		g_errno = 0;
		// mark reply as read
		m_replies++;
	}
	// return false if waiting on any reply
	if ( m_replies < m_requests ) return false;
	// got em all!
	return true;
}
char *serializeMsg ( int32_t  baseSize ,
		     int32_t *firstSizeParm ,
		     int32_t *lastSizeParm ,
		     char **firstStrPtr ,
		     void *thisPtr ,
		     int32_t *retSize     ,
		     char *userBuf     ,
		     int32_t  userBufSize ,
		     bool  makePtrsRefNewBuf ) {
	// make a buffer to serialize into
	char *buf  = NULL;
	//int32_t  need = getStoredSize();
	int32_t need = getMsgStoredSize(baseSize,firstSizeParm,lastSizeParm);
	// big enough?
	if ( need <= userBufSize ) buf = userBuf;
	// alloc if we should
	if ( ! buf ) buf = (char *)mmalloc ( need , "Ra" );
	// bail on error, g_errno should be set
	if ( ! buf ) return NULL;
	// set how many bytes we will serialize into
	*retSize = need;
	// copy the easy stuff
	char *p = buf;
	gbmemcpy ( p , (char *)thisPtr , baseSize );//getBaseSize() );
	p += baseSize; // getBaseSize();
	// then store the strings!
	int32_t  *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
	int32_t  *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displayMetas
	char **strPtr  = firstStrPtr;//getFirstStrPtr  (); // &ptr_qbuf;
	for ( ; sizePtr <= sizeEnd ;  ) {
		// if we are NULL, we are a "bookmark", so
		// we alloc'd space for it, but don't copy into
		//   the space until after this call to serialize()
		if ( ! *strPtr ) goto skip;
		// sanity check -- cannot copy onto ourselves
		if ( p > *strPtr && p < *strPtr + *sizePtr ) {
			g_process.shutdownAbort(true); }
		// copy the string into the buffer
		gbmemcpy ( p , *strPtr , *sizePtr );
	skip:
		// . make it point into the buffer now
		// . MDW: why? that is causing problems for the re-call in
		//   Msg3a, it calls this twice with the same "m_r"
		if ( makePtrsRefNewBuf ) *strPtr = p;
		// advance our destination ptr
		p += *sizePtr;
		// advance both ptrs to next string
		sizePtr++;
		strPtr++;
	}
	return buf;
}
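// What getMsgStoredSize() must return for the loop above to fit is a hedged
// assumption here (its body is not shown in this excerpt): the fixed base
// size plus every size_* value, including the NULL "bookmark" entries, since
// serializeMsg() advances p by *sizePtr even when it skips the copy.
static int32_t getMsgStoredSizeSketch ( int32_t  baseSize ,
					int32_t *firstSizeParm ,
					int32_t *lastSizeParm ) {
	int32_t need = baseSize;
	for ( int32_t *s = firstSizeParm ; s <= lastSizeParm ; s++ )
		need += *s;
	return need;
}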
// . reply to a request to update (or delete) a collection record
// . MUST call g_udpServer::sendReply or sendErrorReply() so slot can
//   be destroyed
void handleRequest30 ( UdpSlot *slot , int32_t niceness ) {
	// get what we've read
	char *readBuf     = slot->m_readBuf;
	int32_t  readBufSize = slot->m_readBufSize;
	// is it a delete?
	if ( readBufSize < (int32_t)sizeof(CollectionRec) ) {
		char *coll = readBuf;
		g_collectiondb.deleteRec ( coll );
		return;
	}
	CollectionRec *cr = (CollectionRec *)readBuf;
	// otherwise add/update it
	if ( ! g_collectiondb.addRec (cr->m_coll,NULL,0,true,-1,
				      false , // isdump?
				      true  )) { // save it?
		log("admin: Failed to update new record.");
		g_udpServer.sendErrorReply ( slot , g_errno );
		return;
	}
	// set it
	CollectionRec *nr = g_collectiondb.getRec ( cr->m_coll );
	if ( ! nr ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,"admin: New collection added but not found.");
		g_udpServer.sendErrorReply ( slot , g_errno );
		return;
	}
	// set to what it should be
	gbmemcpy ( nr , cr , sizeof(CollectionRec) );
	// always return a reply immediately, even though list not loaded yet
	g_udpServer.sendReply_ass ( NULL , 0 , NULL , 0 , slot );
}
// returns false with g_errno set on error
bool HttpRequest::copy(const HttpRequest *r) {
	//highly suspect code here!
	//todo: make a normal and much safer operator=()
	SafeBuf raw_tmp_copy;
	memcpy(&raw_tmp_copy, &m_reqBuf, sizeof(raw_tmp_copy));
	gbmemcpy ( this , r , sizeof(HttpRequest) );
	memcpy(&m_reqBuf,&raw_tmp_copy,sizeof(m_reqBuf));
	
	// we copy it and update the ptrs below
	m_reqBuf.purge();
	if ( ! m_reqBuf.safeMemcpy ( &r->m_reqBuf ) )
		return false;

	// fix ptrs
	const char *sbuf = r->m_reqBuf.getBufStart();
	char *dbuf =    m_reqBuf.getBufStart();
	for ( int32_t i = 0 ; i < m_numFields ; i++ ) {
		m_fields     [i] = dbuf + (r->m_fields     [i] - sbuf);
		m_fieldValues[i] = NULL;
		if ( r->m_fieldValues[i] )
			m_fieldValues[i] = dbuf + (r->m_fieldValues[i] - sbuf);
	}
	m_cookiePtr  = dbuf + (r->m_cookiePtr  - sbuf );
	m_metaCookie = dbuf + (r->m_metaCookie - sbuf );
	m_ucontent   = dbuf + (r->m_ucontent   - sbuf );
	m_path       = dbuf + (r->m_path       - sbuf );
	m_cgiBuf     = dbuf + (r->m_cgiBuf     - sbuf );
	// not supported yet. we'd have to allocate it
	if ( m_cgiBuf2 ) { g_process.shutdownAbort(true); }
	return true;
}
void File::set ( char *filename ) {
	// reset m_filename
	m_filename[0] = '\0';
	//m_filename.reset();
	// return if NULL
	if ( ! filename ) { 
		log ( LOG_LOGIC,"disk: Provided filename is NULL");
		return;
	}
	// bail if too long
	int32_t len = gbstrlen ( filename );
	// account for terminating '\0'
	if ( len + 1 >= MAX_FILENAME_LEN ) { 
	 	log ( "disk: Provdied filename %s length of %"INT32" "
		      "is bigger "
	 	      "than %"INT32".",filename,len,
		      (int32_t)MAX_FILENAME_LEN-1); 
	 	return; 
	}
	// if we already had another file open then we must close it first.
	if ( m_fd >= 0 ) close();
	// copy into m_filename and NULL terminate
	gbmemcpy ( m_filename , filename , len );
	m_filename [ len ] = '\0';
	//m_filename.setLabel   ("sbfnm");
	//m_filename.safeStrcpy ( filename );
	m_calledSet  = true;
	// TODO: make this a bool returning function if ( ! m_filename ) g_log
}
int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,
		       unsigned char *p, int32_t inbuflen) {
	char *s = (char *)p;
	char *send = (char *)p + inbuflen;
	int32_t cs;
	char *dst = outbuf;
	for ( ; s < send ; s += cs ) {
		// how big is this character?
		cs = getUtf8CharSize(s);
		// convert the utf8 character to UChar32
		UChar32 uc = utf8Decode ( s );
		// break "uc" into decomposition of UChar32s
		UChar32 ttt[32];
		int32_t klen = recursiveKDExpand(uc,ttt,32);
		if(klen>32){char *xx=NULL;*xx=0;}
		// sanity
		if ( dst + 5 > outbuf+outbufsize ) return -1;
		// if the same, leave it! it had no accent marks or other
		// modifiers...
		if ( klen <= 1 ) {
			gbmemcpy ( dst , s , cs );
			dst += cs;
			continue;
		}
		// take the first one as the stripped
		// convert back to utf8
		int32_t stored = utf8Encode ( ttt[0] , dst );
		// skip over the stored utf8 char
		dst += stored;
	}
	// sanity. breach check
	if ( dst > outbuf+outbufsize ) { char *xx=NULL;*xx=0; }
	// return # of bytes stored into outbuf
	return dst - outbuf;
}
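// Hedged usage sketch for stripAccentMarks() (buffer size and input string
// are illustrative only): each UTF-8 character is decomposed and only the
// base codepoint is kept, so "café" comes back as "cafe". Returns the byte
// count written, or -1 if outbuf is too small; the caller NUL-terminates
// since the function does not.
//
//	char out[64];
//	int32_t n = stripAccentMarks ( out , 64 ,
//				       (unsigned char *)"café" , 5 );
//	if ( n >= 0 ) out[n] = '\0';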
// copy "src" to ourselves
void Msg20::copyFrom ( Msg20 *src ) {
	gbmemcpy ( this , src , sizeof(Msg20) );
	// if the Msg20Reply was actually in src->m_replyBuf[] we have to
	// re-serialize into our this->m_replyBuf[] in order for the ptrs
	// to be correct
	//if ( (char *)src->m_r == src->m_replyBuf ) {
	//	// ok, point our Msg20Reply to our m_replyBuf[]
	//	m_r = (Msg20Reply *)m_replyBuf;
	//	// serialize the reply into that buf
	//	src->m_r->serialize ( m_replyBuf , MSG20_MAX_REPLY_SIZE );
	//	// then change the offsets to ptrs into this->m_replyBuf
	//	m_r->deserialize ();
	//}

	// make sure it does not free it!
	src->m_r = NULL;
	// why would we need to re-do the request? dunno, pt to it just in case
	if ( src->m_request == src->m_requestBuf ) 
		m_request = m_requestBuf;
	// make sure destructor does not free this
	src->m_request = NULL;
	// but should free its request, m_request, if it does not point to
	// src->m_requestBuf[], but an allocated buffer
	src->destructor();
}
// . if a dump is not going on this uses the primary mem space
// . if a dump is going on and this key has already been dumped
//   (we check RdbDump::getFirstKey()/getLastKey()) add it to the
//   secondary mem space, otherwise add it to the primary mem space
//void *RdbMem::dupData ( key_t key , char *data , int32_t dataSize ) {
void *RdbMem::dupData ( char *key , char *data , int32_t dataSize ,
			collnum_t collnum ) {
	char *s = (char *) allocData ( key , dataSize , collnum );
	if ( ! s ) return NULL;
	gbmemcpy ( s , data , dataSize );
	return s;
}
bool Flags::resize( int32_t size ) { 
	if ( size < 0           ) return false;
	if ( size == m_numFlags ) return true;
	
	char *newFlags = (char *)mcalloc( size*sizeof(char), "Flags" );
	if ( ! newFlags ) return false;

	m_numSet     = 0;
	m_highestSet = NoMax;
	m_lowestSet  = NoMin;
	if ( m_flags ) {
		// copy as many of old flags over as possible
		int32_t min = m_numFlags;
		if ( min > size ) min = size;
		gbmemcpy( newFlags, m_flags, min*sizeof(char) );
		mfree( m_flags, m_numFlags*sizeof(char), "Flags" );
		m_flags = NULL;
		// find new values for member variables
		for ( int32_t i = 0; i < min; i++ ) {
			if ( newFlags[i] ) {
				m_numSet++;
				if ( i > m_highestSet ) m_highestSet = i;
				if ( i < m_lowestSet  ) m_lowestSet  = i;
			}
		}
	}
	m_flags      = newFlags;
	m_numFlags   = size;

	return (m_flags != NULL);
}
// make a redirect mime
void HttpMime::makeRedirMime ( const char *redir , int32_t redirLen ) {
	char *p = m_buf;
	gbmemcpy ( p , "HTTP/1.0 302 RD\r\nLocation: " , 27 );
	p += 27;
	if ( redirLen > 600 ) redirLen = 600;
	gbmemcpy ( p , redir , redirLen );
	p += redirLen;
	*p++ = '\r';
	*p++ = '\n';
	*p++ = '\r';
	*p++ = '\n';
	*p = '\0';
	m_mimeLen = p - m_buf;
	if ( m_mimeLen > 1023 ) { g_process.shutdownAbort(true); }
	// set the mime's length
	//m_bufLen = strlen ( m_buf );
}
// make a redirect mime
void HttpMime::makeRedirMime ( const char *redir , int32_t redirLen ) {
	char *p = m_buf;
	gbmemcpy ( p , "HTTP/1.0 302 RD\r\nLocation: " , 27 );
	p += 27;
	if ( redirLen > 600 ) redirLen = 600;
	gbmemcpy ( p , redir , redirLen );
	p += redirLen;
	*p++ = '\r';
	*p++ = '\n';
	*p++ = '\r';
	*p++ = '\n';
	*p = '\0';
	m_bufLen = p - m_buf;
	if ( m_bufLen > 1023 ) { char *xx=NULL;*xx=0; }
	// set the mime's length
	//m_bufLen = gbstrlen ( m_buf );
}
static uint8_t s_getCountryFromSpec(char *str) {
	char code[6];
	memset(code, 0, 6);
	// clamp the copy so a long spec cannot overflow code[]
	int32_t len = s_wordLen(str);
	if(len > 5) len = 5;
	gbmemcpy(code, str, len);
	// lowercase any uppercase letters (include 'A' and 'Z' themselves)
	for(int x = 0; x < 6; x++)
		if(code[x] >= 'A' && code[x] <= 'Z') code[x] -= ('A' - 'a');
	// locale-style spec like "en_US" or "en-us"? use the country part
	if(code[2] == '_' || code[2] == '-')
		return g_countryCode.getIndexOfAbbr(&code[3]);
	return g_countryCode.getIndexOfAbbr(code);
}
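// Hedged usage sketch (the string literals are illustrative, not from the
// source): the helper accepts either a bare country abbreviation or a
// locale-style spec with a separator at position 2.
//
//	uint8_t us = s_getCountryFromSpec ( (char *)"en_US" ); // -> "us"
//	uint8_t de = s_getCountryFromSpec ( (char *)"de"    ); // -> "de"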
int32_t Words::isFloat  ( int32_t n, float& f) {
	char buf[128];
	char *p = buf;
	int32_t offset = 0;
	while(isPunct(n+offset) && 
	      !(m_words[n+offset][0] == '.' || 
		m_words[n+offset][0] == '-')) offset++;

	gbmemcpy(buf, getWord(n), getWordLen(n));
	buf[getWordLen(n)] = '\0';
	log(LOG_WARN, "trying to get %s %"INT32"", buf, offset);
	

	if(isNum(n)) {
		if(1 + n < m_numWords && 
		   isPunct(n+1) && m_words[n+1][0] == '.') {
			if(2 + n < m_numWords && isNum(n+2)) {
				gbmemcpy(p, m_words[n], m_wordLens[n]);
				p += m_wordLens[n];
				gbmemcpy(p, ".", 1);
				p++;
				gbmemcpy(p, m_words[n+2], m_wordLens[n+2]);
				p += m_wordLens[n+2];
				// NUL terminate before atof()
				*p = '\0';
				f = atof(buf);
				return 3 + offset;
			}
			else {
				return offset;
			}
		} else if(n > 0 && isPunct(n-1) && m_wordLens[n-1] > 0 &&
			  (m_words[n-1][m_wordLens[n-1]-1] == '.' ||
			   m_words[n-1][m_wordLens[n-1]-1] == '-')) {
			//hmm, we just skipped the period as punct?
			sprintf(buf, "0.%s",m_words[n]);
			f = atof(buf);
			return 1 + offset;
		}
		else {
			f = atof(m_words[n]);
			return 1 + offset;
		}
	}

	//does this have a period in front?
	if(isPunct(n) && (m_words[n][0] == '.' || m_words[n][0] == '-')) {
		if(1 + n < m_numWords && isNum(n+1)) {
			gbmemcpy(p, m_words[n], m_wordLens[n]);
			p += m_wordLens[n];
			gbmemcpy(p, m_words[n+1], m_wordLens[n+1]);
			p += m_wordLens[n+1];
			// NUL terminate before atof()
			*p = '\0';
			f = atof(buf);
			return 2 + offset;
		}
	}
	return offset;
}
// Relabels memory in table.  Returns true on success, false on failure.
// Purpose is for times when UdpSlot's buffer is not owned and freed by someone
// else.  Now we can verify that passed memory is freed.
bool Mem::lblMem( void *mem, int32_t size, const char *note ) {
	logTrace( g_conf.m_logTraceMem, "mem=%p size=%" PRId32" note=%s", mem, size, note );

	// seems to be a bad bug in this...
	return true;

#if 0
	bool val = false;

	// Make sure we're not relabeling a NULL or dummy memory address,
	// if so, error then exit
	if ( !mem ) {
		//log( "mem: lblMem: Mem addr (0x%08X) invalid/NULL, not "
		//     "relabeling.", mem );
		return val;
	}

	uint32_t u = ( PTRTYPE ) mem * ( PTRTYPE ) 0x4bf60ade;
	uint32_t h = u % ( uint32_t ) m_memtablesize;
	// chain to bucket
	while ( s_mptrs[ h ] ) {
		if ( s_mptrs[ h ] == mem ) {
			if ( s_sizes[ h ] != size ) {
				val = false;
				log( LOG_WARN, "mem: lblMem: Mem addr (0x%08" PTRFMT") exists, size is %" PRId32" off.",
				     ( PTRTYPE ) mem,
				     s_sizes[ h ] - size );
				break;
			}
			int32_t len = gbstrlen( note );
			if ( len > 15 ) len = 15;
			char *here = &s_labels[ h * 16 ];
			gbmemcpy ( here, note, len );
			// make sure NULL terminated
			here[ len ] = '\0';
			val = true;
			break;
		}
		h++;
		if ( h == m_memtablesize ) h = 0;
	}

	if ( !val ) {
		log( "mem: lblMem: Mem addr (0x%08" PTRFMT") not found.", ( PTRTYPE ) mem );
	}

	return val;
#endif
}
// . return ptr to the buffer we serialize into
// . return NULL and set g_errno on error
char *Msg20Request::serialize ( int32_t *retSize     ,
				char *userBuf     ,
				int32_t  userBufSize ) {
	// make a buffer to serialize into
	char *buf  = NULL;
	int32_t  need = getStoredSize();
	// big enough?
	if ( need <= userBufSize ) buf = userBuf;
	// alloc if we should
	if ( ! buf ) buf = (char *)mmalloc ( need , "Msg20Ra" );
	// bail on error, g_errno should be set
	if ( ! buf ) return NULL;
	// set how many bytes we will serialize into
	*retSize = need;
	// copy the easy stuff
	char *p = buf;
	gbmemcpy ( p , (char *)this , sizeof(Msg20Request) );
	p += (int32_t)sizeof(Msg20Request);
	// then store the strings!
	int32_t  *sizePtr = &size_qbuf;
	int32_t  *sizeEnd = &size_displayMetas;
	char **strPtr  = &ptr_qbuf;
	for ( ; sizePtr <= sizeEnd ;  ) {
		// sanity check -- cannot copy onto ourselves
		if ( p > *strPtr && p < *strPtr + *sizePtr ) {
			char *xx = NULL; *xx = 0; }
		// copy the string into the buffer
		gbmemcpy ( p , *strPtr , *sizePtr );
		// advance our destination ptr
		p += *sizePtr;
		// advance both ptrs to next string
		sizePtr++;
		strPtr++;
	}
	return buf;
}
int32_t getRandomWords(char *buf, char *bufend, int32_t numWords) {
	int32_t totalWords = s_windices.length() / sizeof(int32_t);
	char *p = buf;
	while(1) {
		int32_t wordNum = rand() % totalWords;
		int32_t windex = *(int32_t*)(&s_windices[wordNum*sizeof(int32_t)]);
		int32_t wlen = gbstrlen(&s_words[windex]);
		if(wlen + 1 + p >= bufend) return p - buf;
		gbmemcpy(p, &s_words[windex], wlen);
		p += wlen;
		if(--numWords <= 0) return p - buf;
		*p++ = '+';
	}
	return p - buf;
}
int32_t atoip ( char *s , int32_t slen ) {
	// point to it
	char *p = s;
	if ( s[slen] ) {
		// copy into buffer and NULL terminate
		char buf[1024];
		if ( slen >= 1024 ) slen = 1023;
		gbmemcpy ( buf , s , slen );
		buf [ slen ] = '\0';
		// point to that
		p = buf;
	}
	// convert to int
	struct in_addr in;
	in.s_addr = 0;
	inet_aton ( p , &in );
	// ensure this really is an int32_t before returning ip
	if ( sizeof(in_addr) == 4 ) return in.s_addr;
	// otherwise bitch and return 0
	//log("ip:bad inet_aton"); 
	return 0; 
}
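// Hedged usage sketch for atoip() (the address literal is illustrative
// only): the slice need not be NUL terminated, and a failed parse returns 0.
//
//	const char *s = "10.5.66.32";
//	int32_t ip = atoip ( (char *)s , gbstrlen(s) );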
void startSpidering ( ) {
	// url class for parsing/normalizing url
	Url u;
	// count total urls done
	static int64_t s_startTime = 0;
	// set startTime
	if ( s_startTime == 0 ) s_startTime = gettimeofdayInMilliseconds();
	// get time now
	int64_t now = gettimeofdayInMilliseconds();
	// elapsed time to do all urls
	double took = (double)(now - s_startTime) / 1000.0 ;
	// log this every 20 urls
	if ( s_printIt && s_total > 0 && ( s_total % 20 ) == 0 ) {
		logf(LOG_INFO,"did %"INT32" urls in %f seconds. %f urls per second."
		    " threads now = %"INT32".",
		    s_total ,  took , ((double)s_total) / took, s_launched);
		s_printIt = false;
	}
	// did we wait long enough?
	if ( now - s_lastTime < s_wait ) return;
	s_lastTime = now;
	// . use HttpServer.getDoc() to fetch it
	// . fetch X at a time
	while ( (s_server || s_p < s_pend) && s_launched < s_maxNumThreads ) {
		// clear any error
		g_errno = 0;
		//append s_append to the url
		char url[MAX_URL_LEN];
		char *p = url;
		char *pend = url + MAX_URL_LEN;
		char *t = NULL;

		if(s_server) {
			int32_t len = gbstrlen(s_server);
			gbmemcpy ( p, s_server, len);
			p += len;
			p += getRandomWords(p, pend, s_numRandWords);
			int32_t appendLen = gbstrlen(s_append);
			if ( p + appendLen < pend ) {
				gbmemcpy ( p, s_append, gbstrlen(s_append) );
				p += gbstrlen(s_append);
			}
			*p++ = '\0';
			u.set ( url , p - url, false, false, false, false, false, 0x7fffffff );
			t = g_mem.strdup(url, "saved url");
		}
		else {
			gbmemcpy ( p, s_p, gbstrlen(s_p));
			p += gbstrlen ( s_p );
			if ( gbstrlen(s_p) + gbstrlen(s_append) < MAX_URL_LEN ) {
				gbmemcpy ( p, s_append, gbstrlen(s_append) );
				p += gbstrlen(s_append);
			}
			//null end
			*p ='\0';

			// make into a url class
			u.set ( url , gbstrlen(url), false, false, false, false, false, 0x7fffffff  );
			// set port if port switch is true
			//if ( s_portSwitch ) {
			//	int32_t r = rand() % 32;
			//	u.setPort ( 8000 + r );
			//}
			// save s_p
			t = s_p;
			// skip to next url
			s_p += gbstrlen ( s_p ) + 1;
		}
		// count it
		s_launched++;
		// get it
		bool status = g_httpServer.getDoc ( u.getUrl() , // url
						    0, // ip
						    0 ,  // offset
						    -1 ,  // size
						    0 , // ifModifiedSince
						    (void *)t ,  // state
						    gotDocWrapper, // callback
						    20*1000, // timeout
						    0, // proxy ip
						    0, // proxy port
						    30*1024*1024, //maxLen
						    30*1024*1024);//maxOtherLen
		// continue if it blocked
		if ( ! status ) continue;
		// otherwise, got it right away
		s_launched--;
		// log msg
		log("got doc1 %s: %s", u.getUrl() , mstrerror(g_errno) );
		// we gotta wait
		break;
	}
	// bail if not done yet
	//if ( s_launched > 0 ) return;
	if ( s_server || s_p < s_pend ) return;
	// otherwise, we're all done
	logf(LOG_INFO,"blaster: did %"INT32" urls in %f seconds. %f urls per "
	     "second.",
	    s_total ,  took , ((double)s_total) / took );
	// exit now
	exit ( 0 );
}
void Msg39::estimateHitsAndSendReply ( ) {

	// no longer in use
	m_inUse = false;

	// now this for the query loop on the QueryLogEntries.
	m_topDocId50 = 0LL;
	m_topScore50 = 0.0;

	// a little hack for the seo pipeline in xmldoc.cpp
	m_topDocId  = 0LL;
	m_topScore  = 0.0;
	m_topDocId2 = 0LL;
	m_topScore2 = 0.0;
	int32_t ti = m_tt.getHighNode();
	if ( ti >= 0 ) {
		TopNode *t = &m_tt.m_nodes[ti];
		m_topDocId = t->m_docId;
		m_topScore = t->m_score;
	}
	// try the 2nd one too
	int32_t ti2 = -1;
	if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
	if ( ti2 >= 0 ) {
		TopNode *t2 = &m_tt.m_nodes[ti2];
		m_topDocId2 = t2->m_docId;
		m_topScore2 = t2->m_score;
	}

	// convenience ptrs. we will store the docids/scores into these arrays
	int64_t *topDocIds;
	double    *topScores;
	key_t     *topRecs;

	// numDocIds counts docs in all tiers when using toptree.
	int32_t numDocIds = m_tt.m_numUsedNodes;

	// the msg39 reply we send back
	int32_t  replySize;
	char *reply;

	//m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;

	// make the reply?
	Msg39Reply mr;

	// this is what you want to look at if there is no seo.cpp module...
	if ( ! m_callback ) {
		// if we got clusterdb recs in here, use 'em
		if ( m_gotClusterRecs ) numDocIds = m_numVisible;
		
		// don't send more than the docs that are asked for
		if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet;

		// # of QueryTerms in query
		int32_t nqt = m_tmpq.m_numTerms;
		// start setting the stuff
		mr.m_numDocIds = numDocIds;
		// copy # estimated hits into 8 bytes of reply
		//int64_t est = m_posdbTable.m_estimatedTotalHits;
		// ensure it has at least as many results as we got
		//if ( est < numDocIds ) est = numDocIds;
		// or if too big...
		//if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;
		// . total estimated hits
		// . this is now an EXACT count!
		mr.m_estimatedHits = m_numTotalHits;
		// sanity check
		mr.m_nqt = nqt;
		// the m_errno if any
		mr.m_errno = m_errno;
		// shortcut
		PosdbTable *pt = &m_posdbTable;
		// the score info, in no particular order right now
		mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
		mr.size_scoreInfo = pt->m_scoreInfoBuf.length();
		// that has offset references into posdbtable::m_pairScoreBuf 
		// and m_singleScoreBuf, so we need those too now
		mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
		mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
		mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
		mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();
		// save some time since seo.cpp gets from posdbtable directly,
		// so we can avoid serializing/copying this stuff at least
		if ( ! m_r->m_makeReply ) {
			mr.size_scoreInfo      = 0;
			mr.size_pairScoreBuf   = 0;
			mr.size_singleScoreBuf = 0;
		}
		//mr.m_sectionStats    = pt->m_sectionStats;
		// reserve space for these guys, we fill them in below
		mr.ptr_docIds       = NULL;
		mr.ptr_scores       = NULL;
		mr.ptr_clusterRecs  = NULL;
		// this is how much space to reserve
		mr.size_docIds      = 8 * numDocIds; // int64_t
		mr.size_scores      = sizeof(double) * numDocIds; // double
		// if not doing site clustering, we won't have these perhaps...
		if ( m_gotClusterRecs ) 
			mr.size_clusterRecs = sizeof(key_t) *numDocIds;
		else    
			mr.size_clusterRecs = 0;

		#define MAX_FACETS 20000

		/////////////////
		//
		// FACETS
		//
		/////////////////

		// We can have multiple gbfacet: terms in a query so
		// serialize all the QueryTerm::m_facetHashTables into
		// Msg39Reply::ptr_facetHashList.
		//
		// combine the facet hash lists of each query term into
		// a list of lists. each list is preceded by the query term
		// id of the query term (like gbfacet:xpathsitehash12345)
		// followed by a 4 byte length of the following 32-bit
		// facet values
		int32_t need = 0;
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			HashTableX *ft = &qt->m_facetHashTable;
			if ( ft->m_numSlotsUsed == 0 ) continue;
			int32_t used = ft->m_numSlotsUsed;
			// limit for memory
			if ( used > (int32_t)MAX_FACETS ) {
				log("msg39: truncating facet list to 20000 "
				    "from %"INT32" for %s",used,qt->m_term);
				used = (int32_t)MAX_FACETS;
			}
			// store query term id 64 bit
			need += 8;
			// then size
			need += 4;
			// then buckets. keys and counts
			need += (4+sizeof(FacetEntry)) * used;
		}
		// allocate
		SafeBuf tmp;
		if ( ! tmp.reserve ( need ) ) {
			log("query: Could not allocate memory "
			    "to hold reply facets");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		// point to there
		char *p = tmp.getBufStart();
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			// get all the facet hashes and their counts
			HashTableX *ft = &qt->m_facetHashTable;
			// skip if none
			if ( ft->m_numSlotsUsed == 0 ) continue;
			// store query term id 64 bit
			*(int64_t *)p = qt->m_termId;
			p += 8;
			int32_t used = ft->getNumSlotsUsed();
			if ( used > (int32_t)MAX_FACETS ) 
				used = (int32_t)MAX_FACETS;
			// store count
			*(int32_t *)p = used;
			p += 4;
			int32_t count = 0;
			// for sanity check
			char *pend = p + (used * (4+sizeof(FacetEntry)));
			// serialize the key/val pairs
			for ( int32_t k = 0 ; k < ft->m_numSlots ; k++ ) {
				// skip empty buckets
				if ( ! ft->m_flags[k] ) continue;
				// store key. the hash of the facet value.
				*(int32_t *)p = ft->getKey32FromSlot(k); p += 4;
				// then store count
				//*(int32_t *)p = ft->getVal32FromSlot(k); p += 4;
				// now this has a docid on it so we can
				// lookup the text of the facet in Msg40.cpp
				FacetEntry *fe;
				fe = (FacetEntry *)ft->getValFromSlot(k);
				// sanity
				// no, count can be zero if its a range facet
				// that was never added to. we add those
			// empty FacetEntries only for range facets
				// in Posdb.cpp
				//if(fe->m_count == 0 ) { char *xx=NULL;*xx=0;}
				gbmemcpy ( p , fe , sizeof(FacetEntry) );
				p += sizeof(FacetEntry);
				// do not breach
				if ( ++count >= (int32_t)MAX_FACETS ) break;
			}
			// sanity check
			if ( p != pend ) { char *xx=NULL;*xx=0; }
			// do the next query term
		}
		// now point to that so it can be serialized below
		mr.ptr_facetHashList  = tmp.getBufStart();
		mr.size_facetHashList = p - tmp.getBufStart();//tmp.length();

		/////////////
		//
		// END FACETS
		//
		/////////////


		// . that is pretty much it,so serialize it into buffer,"reply"
		// . mr.ptr_docIds, etc., will point into the buffer so we can
		//   re-serialize into it below from the tree
		// . returns NULL and sets g_errno on error
		// . "true" means we should make mr.ptr_* reference into the 
		//   newly  serialized buffer.
		reply = serializeMsg ( sizeof(Msg39Reply), // baseSize
				       &mr.size_docIds, // firstSizeParm
				       &mr.size_clusterRecs,//lastSizePrm
				       &mr.ptr_docIds , // firstStrPtr
				       &mr , // thisPtr
				       &replySize , 
				       NULL , 
				       0 , 
				       true ) ;
		if ( ! reply ) {
			log("query: Could not allocated memory "
			    "to hold reply of docids to send back.");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		topDocIds    = (int64_t *) mr.ptr_docIds;
		topScores    = (double    *) mr.ptr_scores;
		topRecs      = (key_t     *) mr.ptr_clusterRecs;
	}

	int32_t docCount = 0;
	// loop over all results in the TopTree
	for ( int32_t ti = m_tt.getHighNode() ; ti >= 0 ; 
	      ti = m_tt.getPrev(ti) ) {
		// get the guy
		TopNode *t = &m_tt.m_nodes[ti];
		// skip if clusterLevel is bad!
		if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK ) 
			continue;

		// if not sending back a reply... we were called from seo.cpp
		// State3f logic to evaluate a QueryLogEntry, etc.
		if ( m_callback ) {
			// skip results past #50
			if ( docCount > 50 ) continue;
			// set this
			m_topScore50 = t->m_score;
			m_topDocId50 = t->m_docId;
			// that's it
			continue;
		}

		// get the docid ptr
		//char      *diptr = t->m_docIdPtr;
		//int64_t  docId = getDocIdFromPtr(diptr);
		// sanity check
		if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
		//add it to the reply
		topDocIds         [docCount] = t->m_docId;
		topScores         [docCount] = t->m_score;
		if ( m_tt.m_useIntScores ) 
			topScores[docCount] = (double)t->m_intScore;
		// supply clusterdb rec? only for full splits
		if ( m_gotClusterRecs ) 
			topRecs [docCount] = t->m_clusterRec;
		//topExplicits      [docCount] = 
		//	getNumBitsOn(t->m_explicits)
		docCount++;

		// 50th score? set this for seo.cpp. if less than 50 results
		// we want the score of the last doc then.
		if ( docCount <= 50 ) m_topScore50 = t->m_score;
		
		if ( m_debug ) {
			logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
			    "%03"INT32") docId=%012"UINT64" sum=%.02f",
			    (PTRTYPE)this, docCount,
			    t->m_docId,t->m_score);
		}
		//don't send more than the docs that are wanted
		if ( docCount >= numDocIds ) break;
	}
 	if ( docCount > 300 && m_debug )
		log("query: Had %"INT32" nodes in top tree",docCount);

	// this is sensitive info
	if ( m_debug ) {
		log(LOG_DEBUG,
		    "query: msg39: [%"PTRFMT"] "
		    "Intersected lists took %"INT64" (%"INT64") "
		    "ms "
		    "docIdsToGet=%"INT32" docIdsGot=%"INT32" "
		    "q=%s",
		    (PTRTYPE)this                        ,
		    m_posdbTable.m_addListsTime       ,
		    gettimeofdayInMilliseconds() - m_startTime ,
		    m_r->m_docsToGet                       ,
		    numDocIds                         ,
		    m_tmpq.getQuery()                 );
	}


	// if we blocked because we used a thread then call callback if
	// summoned from a msg3f handler and not a msg39 handler
	if ( m_callback ) {
		// if we blocked call user callback
		if ( m_blocked ) m_callback ( m_state );
		// if not sending back a udp reply, return now
		return;
	}

	// now send back the reply
	sendReply(m_slot,this,reply,replySize,replySize,false);
	return;
}
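// A minimal reader-side sketch (an assumption about how Msg40.cpp walks
// ptr_facetHashList, not code from this source) of the facet layout built
// above: for each facet query term, an 8-byte termId, a 4-byte entry count,
// then that many (4-byte value hash, FacetEntry) pairs.
static void walkFacetHashListSketch ( char *p , int32_t size ) {
	char *pend = p + size;
	while ( p < pend ) {
		int64_t termId = *(int64_t *)p; p += 8;
		int32_t count  = *(int32_t *)p; p += 4;
		for ( int32_t k = 0 ; k < count ; k++ ) {
			int32_t hash = *(int32_t *)p; p += 4;
			FacetEntry *fe = (FacetEntry *)p;
			p += sizeof(FacetEntry);
			// ... use termId, hash and fe->m_count here ...
			(void)termId; (void)hash; (void)fe;
		}
	}
}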
// . slot should be auto-nuked upon transmission or error
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN" );
	
	// get the state
	State00 *st0 = (State00 *)state;
	// extract the udp slot and list and msg5
	UdpSlot   *slot =  st0->m_slot;
	RdbList   *list = &st0->m_list;
	Msg5      *msg5 = &st0->m_msg5;
	UdpServer *us   =  st0->m_us;

	// timing debug
	if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) {
		//log("Msg0:hndled request %" PRIu64,gettimeofdayInMilliseconds());
		int32_t size = -1;
		if ( list ) size     = list->getListSize();
		log(LOG_TIMING|LOG_DEBUG,
		    "net: msg0: Handled request for data. "
		    "Now sending data termId=%" PRIu64" size=%" PRId32
		    " transId=%" PRId32" ip=%s port=%i took=%" PRId64" "
		    "(niceness=%" PRId32").",
		    g_posdb.getTermId(msg5->m_startKey),
		    size,slot->m_transId,
		    iptoa(slot->m_ip),slot->m_port,
		    gettimeofdayInMilliseconds() - st0->m_startTime ,
		    st0->m_niceness );
	}

	// on error nuke the list and its data
	if ( g_errno ) {
		mdelete ( st0 , sizeof(State00) , "Msg0" );
		delete (st0);
		// TODO: free "slot" if this send fails
		
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}

	QUICKPOLL(st0->m_niceness);
	// point to the serialized list in "list"
	char *data      = list->getList();
	int32_t  dataSize  = list->getListSize();
	char *alloc     = list->getAlloc();
	int32_t  allocSize = list->getAllocSize();
	// tell list not to free the data since it is a reply so UdpServer
	// will free it when it destroys the slot
	list->setOwnData ( false );
	// keep track of stats
	Rdb *rdb = getRdbFromId ( st0->m_rdbId );
	if ( rdb ) rdb->sentReplyGet ( dataSize );
	// TODO: can we free any memory here???

	// keep track of how long it takes to complete the send
	st0->m_startTime = gettimeofdayInMilliseconds();
	// debug point
	int32_t oldSize = msg5->m_minRecSizes;
	int32_t newSize = msg5->m_minRecSizes + 20;
	// watch for wrap around
	if ( newSize < oldSize ) newSize = 0x7fffffff;
	if ( dataSize > newSize && list->getFixedDataSize() == 0 &&
	     // do not annoy me with these linkdb msgs
	     dataSize > newSize+100 ) 
		log(LOG_LOGIC,"net: msg0: Sending more data than what was "
		    "requested. Ineffcient. Bad engineer. dataSize=%" PRId32" "
		    "minRecSizes=%" PRId32".",dataSize,oldSize);
		    
	//
	// for linkdb lists, remove all the keys that have the same IP32
	// and store a count of what we removed somewhere
	//
	if ( st0->m_rdbId == RDB_LINKDB ) {
		// store compressed list on itself
		char *dst = list->m_list;
		// keep stats
		int32_t totalOrigLinks = 0;
		int32_t ipDups = 0;
		int32_t lastIp32 = 0;
		char *listEnd = list->getListEnd();
		// compress the list
		for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
			// breathe
			QUICKPOLL ( st0->m_niceness );
			// count it
			totalOrigLinks++;
			// get rec
			char *rec = list->getCurrentRec();
			int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec );
			// same as one before?
			if ( ip32 == lastIp32 && 
			     // are we the last rec? include that for
			     // advancing the m_nextKey in Linkdb more 
			     // efficiently.
			     rec + LDBKS < listEnd ) {
				ipDups++;
				continue;
			}
			// store it
			gbmemcpy (dst , rec , LDBKS );
			dst += LDBKS;
			// update it
			lastIp32 = ip32;
		}
		// . if we removed one key, store the stats
		// . caller should recognize reply is not a multiple of
		//   the linkdb key size LDBKS and know it's there!
		if ( ipDups ) {
			//*(int32_t *)dst = totalOrigLinks;
			//dst += 4;
			//*(int32_t *)dst = ipDups;
			//dst += 4;
		}
		// update list parms
		list->m_listSize = dst - list->m_list;
		list->m_listEnd  = list->m_list + list->m_listSize;
		data      = list->getList();
		dataSize  = list->getListSize();
	}


	//log("sending replySize=%" PRId32" min=%" PRId32,dataSize,msg5->m_minRecSizes);
	// . TODO: dataSize may not equal list->getListMaxSize() so
	//         Mem class may show an imbalance
	// . now g_udpServer is responsible for freeing data/dataSize
	// . the "true" means to call doneSending_ass() from the signal handler
	//   if need be
	st0->m_us->sendReply_ass( data, dataSize, alloc, allocSize, slot, st0, doneSending_ass, -1, -1, true );

	logTrace( g_conf.m_logTraceMsg0, "END" );
}	
// . return false if blocked, true otherwise
// . set g_errno on error
// . read list of keys in [startKey,endKey] range
// . read at least "minRecSizes" bytes of keys in that range
// . the "m_endKey" of resulting, merged list may have a smaller endKey
//   than the argument, "endKey" due to limitation by "minRecSizes"
// . resulting list will contain ALL keys between ITS [m_startKey,m_endKey]
// . final merged list "should" try to have a size of at least "minRecSizes"
//   but due to negative/positive rec elimination may be less
// . the endKey of the lists we read may be <= "endKey" provided
// . we try to shrink the endKey if minRecSizes is >= 0 in order to
//   avoid excessive reading
// . by shrinking the endKey we cannot take into account the size of deleted
//   records, so therefore we may fall short of "minRecSizes" in actuality,
//   in fact, the returned list may even be empty with a shrunken endKey
// . we merge all lists read from disk into the provided "list"
// . caller should call Msg3::getList(int32_t i) and Msg3::getNumLists() to retrieve
// . this makes the query engine faster since we don't need to merge the docIds
//   and can just send them across the network separately and they will be
//   hashed into IndexTable's table w/o having to do time-wasting merging.
// . caller can specify array of filenums to read from so incremental syncing
//   in Sync class can just read from titledb*.dat files that were formed
//   since the last sync point.
bool Msg3::readList  ( char           rdbId         ,
		       collnum_t collnum ,
		       const char       *startKeyArg   ,
		       const char       *endKeyArg     ,
		       int32_t           minRecSizes   , // max size of scan
		       int32_t           startFileNum  , // first file to scan
		       int32_t           numFiles      , // rel. to startFileNum
		       void          *state         , // for callback
		       void        (* callback ) ( void *state ) ,
		       int32_t           niceness      ,
		       int32_t           retryNum      ,
		       int32_t           maxRetries    ,
		       bool           compensateForMerge ,
		       bool           justGetEndKey ,
		       bool           allowPageCache ,
		       bool           hitDisk        ) {

	// set this to true to validate
	m_validateCache = false;//true;

	// clear, this MUST be done so if we return true g_errno is correct
	g_errno = 0;
	// assume lists are not checked for corruption
	m_listsChecked = false;
	// warn
	if ( minRecSizes < -1 ) {
		log(LOG_LOGIC,"db: Msg3 got minRecSizes of %" PRId32", changing "
		    "to -1.",minRecSizes);
		minRecSizes = -1;
	}
	// reset m_alloc and data in all lists in case we are a re-call
	reset();
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3.");
	// remember the callback
	m_rdbId              = rdbId;
	m_collnum = collnum;
	m_callback           = callback;
	m_state              = state;
	m_niceness           = niceness;
	m_numScansCompleted  = 0;
	m_retryNum           = retryNum;
	m_maxRetries         = maxRetries;
	m_compensateForMerge = compensateForMerge;
	m_allowPageCache     = allowPageCache;
	m_hitDisk            = hitDisk;
	m_hadCorruption      = false;
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( m_rdbId );
	// reset the group error
	m_errno    = 0;
	// . reset all our lists 
	// . these are reset in call the RdbScan::setRead() below
	//for ( int32_t i = 0 ; i < MAX_RDB_FILES ; i++ ) m_lists[i].reset();
	// . ensure startKey last bit clear, endKey last bit set
	// . no! this warning is now only in Msg5
	// . if RdbMerge is merging some files, not involving the root 
	//   file, then we can expect to get a lot of unmatched negative recs.
	// . as a consequence, our endKeys may often be negative. This means
	//   it may not annihilate with the positive key, but we should only
	//   miss like this at the boundaries of the lists we fetch.
	// . so in that case RdbList::merge will stop merging once the
	//   minRecSizes limit is reached even if it means ending on a negative
	//   rec key
	//if ( (startKey.n0 & 0x01) == 0x01 ) 
	if ( !KEYNEG(startKeyArg) )
		log(LOG_REMIND,"net: msg3: StartKey lastbit set."); 
	if (  KEYNEG(endKeyArg) )
		log(LOG_REMIND,"net: msg3: EndKey lastbit clear."); 

	// declare vars here because of 'goto skip' below
	int32_t mergeFileNum = -1 ;
	int32_t max ;

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return true;
	}

	// store the file numbers in the array, these are the files we read
	m_numFileNums = 0;

	// save startFileNum here, just for recall
	m_startFileNum = startFileNum;
	m_numFiles     = numFiles;

	// . if we have a merge going on, we may have to change startFileNum
	// . if some files get unlinked because merge completes then our 
	//   reads will detect the error and loop back here
	// . we launch our reads right after this without giving up the cpu
	//   and we use file descriptors, so any changes to Rdb::m_files[]
	//   should not hurt us
	// . WARNING: just make sure you don't lose control of cpu until after
	//   you call RdbScan::set()
	// . we use hasMergeFile() instead of isMerging() because he may not 
	//   be merging cuz he got suspended or he restarted and
	//   hasn't called attemptMerge() yet, but he may still contain it
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,
		    "net: msg3: "
		    "c=%" PRId32" hmf=%" PRId32" sfn=%" PRId32" msfn=%" PRId32" nf=%" PRId32" db=%s.",
		     (int32_t)compensateForMerge,(int32_t)base->hasMergeFile(),
		     (int32_t)startFileNum,(int32_t)base->m_mergeStartFileNum-1,
		     (int32_t)numFiles,base->m_dbname);
	int32_t pre = -10;
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum >= base->m_mergeStartFileNum - 1 &&
	     (startFileNum > 0 || numFiles != -1) ) {
		// now also include the file being merged into, but only
		// if we are reading from a file being merged...
		if ( startFileNum < base->m_mergeStartFileNum +
		     base->m_numFilesToMerge - 1 )
			//m_fileNums [ m_numFileNums++ ] =
			//	base->m_mergeStartFileNum - 1;
			pre = base->m_mergeStartFileNum - 1;
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,
			   "net: msg3: startFileNum from %" PRId32" to %" PRId32" (mfn=%" PRId32")",
			    startFileNum,startFileNum+1,mergeFileNum);
		// if merge file was inserted before us, inc our file number
		startFileNum++;
	}
	// adjust num files if we need to, as well
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum < base->m_mergeStartFileNum - 1 &&
	     numFiles != -1 &&
	     startFileNum + numFiles - 1 >= base->m_mergeStartFileNum - 1 ) {
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,"net: msg3: numFiles up one.");
		// if merge file was inserted before us, inc our file number
		numFiles++;
	}

	// . how many rdb files does this base have?
	// . IMPORTANT: this can change since files are unstable because they
	//   might have all got merged into one!
	// . so do this check to make sure we're safe... especially if
	//   there was an error before and we called readList() on ourselves
	max = base->getNumFiles();
	// -1 means we should scan ALL the files in the base
	if ( numFiles == -1 ) numFiles = max;
	// limit it by startFileNum, however
	if ( numFiles > max - startFileNum ) numFiles = max - startFileNum;
	// set g_errno and return true if it is < 0
	if ( numFiles < 0 ) { 
		log(LOG_LOGIC,
		   "net: msg3: readList: numFiles = %" PRId32" < 0 (max=%" PRId32")(sf=%" PRId32")",
		    numFiles , max , startFileNum );
		g_errno = EBADENGINEER; 
		// force core dump
		char *xx=NULL;*xx=0;
		return true; 
	}

	// . allocate buffer space
	// . m_scans, m_startpg, m_endpg, m_hintKeys, m_hintOffsets,
	//   m_fileNums, m_lists
	int32_t chunk = sizeof(RdbScan) + // m_scans
		4 +                    // m_startpg
		4 +                    // m_endpg
		//sizeof(key_t) +        // m_hintKeys
		m_ks +                 // m_hintKeys
		4 +                    // m_hintOffsets
		4 +                    // m_fileNums
		sizeof(RdbList) ;      // m_lists
	int32_t nn   = numFiles;
	if ( pre != -10 ) nn++;
	m_numChunks = nn;
	int32_t need = nn * (chunk);
	m_alloc = m_buf;
	if ( need > (int32_t)MSG3_BUF_SIZE ) {
		m_allocSize = need;
		m_alloc = (char *)mcalloc ( need , "Msg3" );
		if ( ! m_alloc ) {
			log("disk: Could not allocate %" PRId32" bytes read "
			    "structures to read %s.",need,base->m_dbname);
			return true;
		}
	}
	char *p = m_alloc;
	m_scans       = (RdbScan *)p; p += nn * sizeof(RdbScan);
	m_startpg     = (int32_t    *)p; p += nn * 4;
	m_endpg       = (int32_t    *)p; p += nn * 4;
	//m_hintKeys    = (key_t   *)p; p += nn * sizeof(key_t);
	m_hintKeys    = (char    *)p; p += nn * m_ks;
	m_hintOffsets = (int32_t    *)p; p += nn * 4;
	m_fileNums    = (int32_t    *)p; p += nn * 4;
	m_lists       = (RdbList *)p; p += nn * sizeof(RdbList);
	// sanity check
	if ( p - m_alloc != need ) {
		log(LOG_LOGIC,"disk: Bad malloc in Msg3.cpp.");
		char *xx = NULL; *xx = 0;
	}
	// call constructors
	for ( int32_t i = 0 ; i < nn ; i++ ) m_lists[i].constructor();
	// make fix from up top
	if ( pre != -10 ) m_fileNums [ m_numFileNums++ ] = pre;

	// store them all
	for ( int32_t i = startFileNum ; i < startFileNum + numFiles ; i++ )
		m_fileNums [ m_numFileNums++ ] = i;

	// . remove file nums that are being unlinked after a merge now
	// . keep it here (below skip: label) so sync point reads can use it
	int32_t n = 0;
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		// skip those that are being unlinked after the merge
		if ( base->m_isUnlinking && 
		     m_fileNums[i] >= base->m_mergeStartFileNum &&
		     m_fileNums[i] <  base->m_mergeStartFileNum + 
		                      base->m_numFilesToMerge      )
			continue;
		// otherwise, keep it
		m_fileNums[n++] = m_fileNums[i];
	}
	m_numFileNums = n;

	// . if root file is being merged, he's file #0, & root file is file #1
	// . this is a hack so caller gets what he wants
	//if ( startFileNum == 0 && base->getFileId(0) == 0 && numFiles == 1 )
	//	numFiles = 2;

	// remember the file range we should scan
	m_numScansStarted    = 0;
	m_numScansCompleted  = 0;
	//m_startKey           = startKey;
	//m_endKey             = endKey;
	//m_constrainKey       = endKey; // set in case justGetEndKey is true
	KEYSET(m_startKey,startKeyArg,m_ks);
	KEYSET(m_endKey,endKeyArg,m_ks);
	KEYSET(m_constrainKey,endKeyArg,m_ks); // set in case justGetEndKey is true
	m_minRecSizes        = minRecSizes;
	m_compensateForMerge = compensateForMerge;
	// bail if 0 files to scan -- no! need to set startKey/endKey
	if ( numFiles == 0 ) return true;
	// don't read anything if endKey < startKey
	//if ( m_startKey > m_endKey ) return true;
	if ( KEYCMP(m_startKey,m_endKey,m_ks)>0 ) return true;
	// keep the original in tact in case g_errno == ETRYAGAIN
	//m_endKeyOrig        = endKey;
	KEYSET(m_endKeyOrig,endKeyArg,m_ks);
	m_minRecSizesOrig   = minRecSizes;
	// start reading at this key
	m_fileStartKey = startKeyArg;
	// start the timer, keep it fast for clusterdb though
	if ( g_conf.m_logTimingDb ) m_startTime = gettimeofdayInMilliseconds();
	// translate base to an id, for the sake of m_msg0
	//char baseId = m_msg0->getRdbId ( base );
	// map ptrs
	RdbMap **maps = base->getMaps();
	// . we now boost m_minRecSizes to account for negative recs 
	// . but not if only reading one list, cuz it won't get merged and
	//   it will be too big to send back
	if ( m_numFileNums > 1 ) compensateForNegativeRecs ( base );
	// . often endKey is too big for an efficient read of minRecSizes bytes
	//   because we end up reading too much from all the files
	// . this will set m_startpg[i], m_endpg[i] for each RdbScan/RdbFile
	//   to ensure we read "minRecSizes" worth of records, not much more
	// . returns the new endKey for all ranges
	// . now this just overwrites m_endKey
	//m_endKey = setPageRanges ( base           ,
	setPageRanges ( base           ,
			m_fileNums     ,
			m_numFileNums  ,
			m_fileStartKey , // start reading @ key
			m_endKey       , // stop reading @ key
			m_minRecSizes  );

	// . NEVER let m_endKey be a negative key, because it will 
	//   always be unmatched, since delbit is cleared
	// . adjusting it here ensures our generated hints are valid
	// . we will use this key to call constrain() with
	//m_constrainKey = m_endKey;
	//if ( ( m_constrainKey.n0 & 0x01) == 0x00 ) 
	//	m_constrainKey -= (uint32_t)1;
	KEYSET(m_constrainKey,m_endKey,m_ks);
	if ( KEYNEG(m_constrainKey) )
		KEYSUB(m_constrainKey,m_ks);

	// Msg5 likes to get the endkey for getting the list from the tree
	if ( justGetEndKey ) return true;

	// sanity check
	if ( m_numFileNums > nn ) {
		log(LOG_LOGIC,"disk: Failed sanity check in Msg3.");
		char *xx = NULL; *xx = 0;
	}

	// debug msg
	//log("msg3 getting list (msg5=%" PRIu32")",m_state);
	// . MDW removed this -- go ahead an end on a delete key
	// . RdbMerge might not pick it up this round, but oh well
	// . so we can have both positive and negative co-existing in same file
	// make sure the last bit is set so we don't end on a delete key
	//m_endKey.n0 |= 0x01LL;
	// . now start reading/scanning the files
	// . our m_scans array starts at 0
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		// get the page range
		//int32_t p1 = m_startpg [ i ];
		//int32_t p2 = m_endpg   [ i ];
		//#ifdef GBSANITYCHECK
		int32_t fn = m_fileNums[i];
		// this can happen somehow!
		if ( fn < 0 ) {
			log(LOG_LOGIC,"net: msg3: fn=%" PRId32". Bad engineer.",fn);
			continue;
		}
		// sanity check
		if ( i > 0 && m_fileNums[i-1] >= fn ) {
			log(LOG_LOGIC,
			    "net: msg3: files must be read in order "
			    "from oldest to newest so RdbList::indexMerge_r "
			    "works properly. Otherwise, corruption will "
			    "result. ");
			char *xx = NULL; *xx = 0;
			return true;
		}
		// . sanity check?
		// . no, we must get again since we turn on endKey's last bit
		int32_t p1 , p2;
		maps[fn]->getPageRange ( m_fileStartKey , 
					m_endKey       , 
					&p1            , 
					&p2            ,
					NULL           );
		//if ( p1 != p1c || p2 != p2c ) {
		//	fprintf(stderr,"Msg3::bad page range\n");
		//	sleep(50000);
		//}
		// sanity check, each endpg's key should be > endKey
		//if ( p2 < maps[fn]->getNumPages() && 
		//     maps[fn]->getKey ( p2 ) <= m_endKey ) {
		//	fprintf(stderr,"Msg3::bad page range 2\n");
		//	sleep(50000);
		//}
		//#endif
		//int32_t p1 , p2; 
		//maps[fn]->getPageRange (startKey,endKey,minRecSizes,&p1,&p2);
		// now get some read info
		int64_t offset      = maps[fn]->getAbsoluteOffset ( p1 );
		int32_t      bytesToRead = maps[fn]->getRecSizes ( p1, p2, false);
		// max out the endkey for this list
		// debug msg
		//#ifdef _DEBUG_		
		//if ( minRecSizes == 2000000 ) 
		//log("Msg3:: reading %" PRId32" bytes from file #%" PRId32,bytesToRead,i);
		//#endif
		// inc our m_numScans
		m_numScansStarted++;
		// . keep stats on our disk accesses
		// . count disk seeks (assuming no fragmentation)
		// . count disk bytes read
		if ( bytesToRead > 0 ) {
			base->m_rdb->didSeek (             );
			base->m_rdb->didRead ( bytesToRead );
		}
		// . the startKey may be different for each RdbScan class
		// . RdbLists must have all keys within their [startKey,endKey]
		// . therefore set startKey individually from first page in map
		// . this endKey must be >= m_endKey 
		// . this startKey must be < m_startKey
		//key_t startKey = maps[fn]->getKey ( p1 );
		//key_t endKey   = maps[fn]->getKey ( p2 );
		char startKey2 [ MAX_KEY_BYTES ];
		char endKey2   [ MAX_KEY_BYTES ];
		maps[fn]->getKey ( p1 , startKey2 );
		maps[fn]->getKey ( p2 , endKey2 );
		//char *startKey = maps[fn]->getKeyPtr ( p1 );
		//char *endKey   = maps[fn]->getKeyPtr ( p2 );
		// store in here
		m_startpg [ i ] = p1;
		m_endpg   [ i ] = p2;

		// . we read UP TO that endKey, so reduce by 1
		// . but iff p2 is NOT the last page in the map/file
		// . maps[fn]->getKey(lastPage) will return the LAST KEY
		//   and maps[fn]->getOffset(lastPage) the length of the file
		//if ( maps[fn]->getNumPages()!=p2) endKey -=(uint32_t)1;
		if ( maps[fn]->getNumPages() != p2 ) KEYSUB(endKey2,m_ks);
		// otherwise, if we're reading all pages, then force the
		// endKey to virtual infinity
		//else endKey.setMax();
		else KEYMAX(endKey2,m_ks);

		// . set up the hints
		// . these are only used if we are only reading from 1 file
		// . these are used to call constrain() so we can constrain
		//   the end of the list w/o looping through all the recs
		//   in the list
		int32_t h2 = p2 ;
		// decrease by one page if we're on the last page
		if ( h2 > p1 && maps[fn]->getNumPages() == h2 ) h2--;
		// . decrease hint page until key is <= endKey on that page
		//   AND offset is NOT -1 because the old way would give
		//   us hints past the endkey
		// . also decrease so we can constrain on minRecSizes in
		//   case we're the only list being read
		// . use >= m_minRecSizes instead of >, otherwise we may
		//   never be able to set "size" in RdbList::constrain()
		//   because "p" could equal "maxPtr" right away
		while ( h2 > p1 && 
			//( maps[fn]->getKey   (h2) > m_constrainKey ||
		      (KEYCMP(maps[fn]->getKeyPtr(h2),m_constrainKey,m_ks)>0||
			  maps[fn]->getOffset(h2) == -1            ||
			  maps[fn]->getAbsoluteOffset(h2) - offset >=
			  m_minRecSizes ) )
			h2--;
		// now set the hint
		m_hintOffsets [ i ] = maps[fn]->getAbsoluteOffset ( h2 ) -
			              maps[fn]->getAbsoluteOffset ( p1 ) ;
		//m_hintKeys    [ i ] = maps[fn]->getKey            ( h2 );
		KEYSET(&m_hintKeys[i*m_ks],maps[fn]->getKeyPtr(h2),m_ks);

		// reset g_errno before calling setRead()
		g_errno = 0;
		// . this fix is now in RdbList::checklist_r()
		// . we can now have dup keys, so, we may read in
		//   a rec with key "lastMinKey" even though we don't read
		//   in the first key on the end page, so don't subtract 1...
		//if ( endKey != m_endKeyOrig ) 
		//	endKey += (uint32_t) 1;

		// timing debug
		if ( g_conf.m_logTimingDb )
			log(LOG_TIMING,
			    "net: msg: reading %" PRId32" bytes from %s file #%" PRId32" "
			     "(niceness=%" PRId32")",
			     bytesToRead,base->m_dbname,i,m_niceness);

		// log huge reads, those hurt us
		if ( bytesToRead > 150000000 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes at offset %" PRId64" "
			    "from %s.",
			    bytesToRead,offset,base->m_dbname);
		}

		// if any keys in the map are the same report corruption
		char tmpKey    [16];
		char lastTmpKey[16];
		int32_t ccount = 0;
		if ( bytesToRead     > 10000000      && 
		     bytesToRead / 2 > m_minRecSizes &&
		     base->m_fixedDataSize >= 0        ) {
			for ( int32_t pn = p1 ; pn <= p2 ; pn++ ) {
				maps[fn]->getKey ( pn , tmpKey );
				if ( KEYCMP(tmpKey,lastTmpKey,m_ks) == 0 ) 
					ccount++;
				gbmemcpy(lastTmpKey,tmpKey,m_ks);
			}
		}
		if ( ccount > 10 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes from %s file #"
			     "%" PRId32" when min "
			     "required is %" PRId32". Map is corrupt and has %" PRId32" "
			     "identical consecutive page keys because the "
			     "map was \"repaired\" because out of order keys "
			     "in the index.",
			     (int32_t)bytesToRead,
			     base->m_dbname,fn,
			     (int32_t)m_minRecSizes,
			     (int32_t)ccount);
			m_numScansCompleted++;
			m_errno = ECORRUPTDATA;
			m_hadCorruption = true;
			//m_maxRetries = 0;
			break;
		}

		////////
		//
		// try to get from PAGE CACHE
		//
		////////
		BigFile *ff = base->getFile(m_fileNums[i]);
		RdbCache *rpc = getDiskPageCache ( m_rdbId );
		if ( ! m_allowPageCache ) rpc = NULL;
		// . vfd is unique 64 bit file id
		// . if file is opened vfd is -1, only set in call to open()
		int64_t vfd = ff->getVfd();
		key192_t ck = makeCacheKey ( vfd , offset, bytesToRead);
		char *rec; int32_t recSize;
		bool inCache = false;
		if ( rpc && vfd != -1 && ! m_validateCache ) 
			inCache = rpc->getRecord ( (collnum_t)0 , // collnum
						   (char *)&ck , 
						   &rec , 
						   &recSize ,
						   true , // copy?
						   -1 , // maxAge, none 
						   true ); // inccounts?
		m_scans[i].m_inPageCache = false;
		if ( inCache ) {
			m_scans[i].m_inPageCache = true;
			m_numScansCompleted++;
			// now we have to store this value, 6 or 12 so
			// we can modify the hint appropriately
			m_scans[i].m_shifted = *rec;
			m_lists[i].set ( rec +1,
					 recSize-1 ,
					 rec , // alloc
					 recSize , // allocSize
					 startKey2 ,
					 endKey2 ,
					 base->m_fixedDataSize ,
					 true , // owndata
					 base->useHalfKeys() ,
					 getKeySizeFromRdbId ( m_rdbId ) );
			continue;
		}

		// . do the scan/read of file #i
		// . this returns false if blocked, true otherwise
		// . this will set g_errno on error
		bool done = m_scans[i].setRead (base->getFile(m_fileNums[i]),
						base->m_fixedDataSize ,
						 offset                 ,
						 bytesToRead            ,
						 startKey2              ,
						 endKey2                ,
						m_ks                    ,
						 &m_lists[i]            ,
						 this                   ,
						 doneScanningWrapper    ,
						 base->useHalfKeys()    ,
						m_rdbId,
						 m_niceness             ,
						 m_allowPageCache       ,
						 m_hitDisk              ) ;
		// . damn, usually the above will indirectly launch a thread
		//   to do the reading, but it sets g_errno to EINTR,
		//   "interrupted system call"!
		// . i guess the thread does the read w/o blocking and then
		//   queues the signal on g_loop's queue before it exits
		// . try ignoring, and keep going
		if ( g_errno == EINTR ) {
			log("net: Interrupted system call while reading file. "
			    "Ignoring.");
			g_errno = 0;
		}
		// debug msg
		//fprintf(stderr,"Msg3:: reading %" PRId32" bytes from file #%" PRId32","
		//	"done=%" PRId32",offset=%" PRId64",g_errno=%s,"
		//	"startKey=n1=%" PRIu32",n0=%" PRIu64",  "
		//	"endKey=n1=%" PRIu32",n0=%" PRIu64"\n",
		//	bytesToRead,i,(int32_t)done,offset,mstrerror(g_errno),
		//	m_startKey,m_endKey);
		//if ( bytesToRead == 0 )
		//	fprintf(stderr,"shit\n");
		// if it did not block then it completed, so count it
		if ( done ) m_numScansCompleted++;
		// break on an error, and remember g_errno in case we block
		if ( g_errno && g_errno != ENOTHREADSLOTS ) { 
			int32_t tt = LOG_WARN;
			if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
			log(tt,"disk: Reading %s had error: %s.",
			    base->m_dbname, mstrerror(g_errno));
			m_errno = g_errno; 
			break; 
		}
	}
	// debug test
	//if ( rand() % 100 <= 10 ) m_errno = EIO;

	// if we blocked, return false
	if ( m_numScansCompleted < m_numScansStarted ) return false;
	// . if all scans completed without blocking then wrap it up & ret true
	// . doneScanning may now block if it finds data corruption and must
	//   get the list remotely
	return doneScanning();
}
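
// The page-cache lookup above keys each cached read by the file's vfd, the
// byte offset and the read size. Below is a minimal standalone sketch of how
// such a 192-bit key could be packed; the real makeCacheKey()/key192_t may
// lay the fields out differently, so treat this layout as an assumption made
// only for illustration.
#include <cstdint>
#include <cstring>

struct CacheKey192 {          // simplified stand-in for key192_t
	uint64_t n2;          // unique file id (vfd)
	uint64_t n1;          // byte offset of the read
	uint64_t n0;          // number of bytes read
};

static CacheKey192 makeDiskCacheKey ( int64_t vfd ,
				      int64_t offset ,
				      int64_t bytesToRead ) {
	CacheKey192 k;
	memset ( &k , 0 , sizeof(k) );
	k.n2 = (uint64_t)vfd;
	k.n1 = (uint64_t)offset;
	k.n0 = (uint64_t)bytesToRead;
	return k;
}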
Example #24
// . copy just words in [t0,t1)
// . returns false on error and sets g_errno
bool Title::copyTitle(Words *w, int32_t t0, int32_t t1) {
	// get the word pointers, lengths and word count
	const char *const *wp    = w->getWords();
	const int32_t     *wlens = w->getWordLens();
	int32_t            nw    = w->getNumWords();

	// sanity check
	if ( t1 < t0 ) { char *xx = NULL; *xx = 0; }

	// don't breech number of words
	if ( t1 > nw ) {
		t1 = nw;
	}

	// no title?
	if ( nw == 0 || t0 == t1 ) {
		reset();
		return true;
	}

	const char *end = wp[t1-1] + wlens[t1-1] ;

	// compute how many bytes the title needs
	int32_t need = end - wp[t0];

	// add 3 bytes for "..." and 1 for \0
	need += 5;

	// return false if the fixed title buffer cannot hold it
	if ( need > MAX_TITLE_LEN ) {
		m_title[0] = '\0';
		m_titleLen = 0;
		log("query: Title needs %" PRId32" bytes, more than the "
		    "maximum of %" PRId32".",need,(int32_t)MAX_TITLE_LEN);
		return false;
	}

	// point to the title to transcribe
	const char *src    = wp[t0];
	const char *srcEnd = end;

	// include a \" or \'
	if ( t0 > 0 && ( src[-1] == '\'' || src[-1] == '\"' ) ) {
		src--;
	}

	// and remove terminating | or :
	for ( ; 
	      srcEnd > src && 
		      (srcEnd[-1] == ':' || 
		       srcEnd[-1] == ' ' ||
		       srcEnd[-1] == '-' ||
		       srcEnd[-1] == '\n' ||
		       srcEnd[-1] == '\r' ||
		       srcEnd[-1] == '|'   )    ; 
	      srcEnd-- );

	// store in here
	char *dst    = m_title;

	// leave room for "...\0"
	char *dstEnd = m_title + need - 4;

	// size of character in bytes, usually 1
	char cs ;

	// points to the last punct char copied so far; defaults to the start
	char *lastp = dst;

	int32_t charCount = 0;
	// copy the node @p into "dst"
	for ( ; src < srcEnd ; src += cs , dst += cs ) {
		// get src size
		cs = getUtf8CharSize ( src );

		// break if we are full!
		if ( dst + cs >= dstEnd ) {
			break;
		}

		// or hit our max char limit
		if ( charCount++ >= m_maxTitleLen ) {
			break;
		}

		// skip unwanted character
		if (isUtf8UnwantedSymbols(src)) {
			dst -= cs;
			continue;
		}

		// remember last punct for cutting purposes
		if ( ! is_alnum_utf8 ( src ) ) {
			lastp = dst;
		}

		// encode it as an html entity if asked to
		if ( *src == '<' ) {
			if ( dst + 4 >= dstEnd ) {
				break;
			}

			gbmemcpy ( dst , "&lt;" , 4 );
			dst += 4 - cs;
			continue;
		}

		// encode it as an html entity if asked to
		if ( *src == '>' ) {
			if ( dst + 4 >= dstEnd ) {
				break;
			}

			gbmemcpy ( dst , "&gt;" , 4 );
			dst += 4 - cs;
			continue;
		}

		// if more than 1 byte in char, use gbmemcpy
		if ( cs == 1 ) {
			*dst = *src;
		} else {
			gbmemcpy ( dst , src , cs );
		}
	}

	// null term always
	*dst = '\0';
	
	// do not split a word in the middle!
	if ( src < srcEnd ) { 
		if ( lastp ) {
			gbmemcpy ( lastp , "...\0" , 4 );
			dst = lastp + 3;
		} else {
			gbmemcpy ( dst   , "...\0" , 4 );
			dst += 3;
		}
	}

	// set size. does not include the terminating \0
	m_titleLen = dst - m_title;

	return true;
}
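
// copyTitle() above truncates on a UTF-8 character boundary and, when it has
// to cut, backs up to the last non-alphanumeric byte before appending "...".
// Here is a stripped-down standalone sketch of the same idea; utf8CharSize()
// is only an approximation of the real getUtf8CharSize() helper.
#include <cstring>
#include <cctype>

static int utf8CharSize ( const char *p ) {
	unsigned char c = (unsigned char)*p;
	if ( c < 0x80 )           return 1; // ascii
	if ( (c & 0xe0) == 0xc0 ) return 2;
	if ( (c & 0xf0) == 0xe0 ) return 3;
	if ( (c & 0xf8) == 0xf0 ) return 4;
	return 1;                           // stray continuation byte
}

// copy src into dst (capacity dstSize), never splitting a multi-byte
// character; append "..." at the last word break if we had to cut
static void copyTitleSketch ( char *dst , int dstSize , const char *src ) {
	if ( dstSize < 5 ) { if ( dstSize > 0 ) dst[0] = '\0'; return; }
	char       *d     = dst;
	char       *dEnd  = dst + dstSize - 4;   // leave room for "...\0"
	char       *lastp = dst;                 // last punct copied so far
	const char *s     = src;
	for ( ; *s ; ) {
		int cs = utf8CharSize ( s );
		if ( d + cs >= dEnd ) break;     // buffer is full
		if ( ! isalnum((unsigned char)*s) ) lastp = d;
		memcpy ( d , s , cs );
		d += cs; s += cs;
	}
	if ( *s ) memcpy ( lastp , "..." , 4 );  // cut: ellipsis at last break
	else      *d = '\0';
}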
Example #25
// . when a dump completes we free the primary mem space and make
//   the secondary mem space the new primary mem space
void RdbMem::freeDumpedMem( RdbTree *tree ) {
	// bail if we have no mem
	if ( m_memSize == 0 ) return;
		
    log("rdbmem: start freeing dumped mem");

    //char *memEnd = m_mem + m_memSize;

    // this should still be true so allocData() returns m_ptr2 ptrs
    if ( ! m_rdb->m_inDumpLoop ) { g_process.shutdownAbort(true); }

    // count how many data nodes we had to move to avoid corruption
    int32_t count = 0;
    int32_t scanned = 0;
    for ( int32_t i = 0 ; i < tree->m_minUnusedNode ; i++ ) {
		// give up control to handle search query stuff of niceness 0
		QUICKPOLL ( MAX_NICENESS );
		// skip node if parents is -2 (unoccupied)
		if ( tree->m_parents[i] == -2 ) continue;
		scanned++;
		// get the ptr
		char *data = tree->m_data[i];
		if ( ! data ) continue;
		// how could its data not be stored in here?
		// if ( data < m_mem ) {
		//      log("rdbmem: bad data1");
		//      continue;
		// }
		// if ( data >= memEnd ) {
		//      log("rdbmem: bad data2");
		//      continue;
		// }
		// is it in primary mem? m_ptr1 mem was just dumped
		// if growing upward
		bool needsMove = false;
		// if the primary mem (that was dumped) is
		// growing upwards
		if ( m_ptr1 < m_ptr2 ) {
			// and the node data is in it...
			if ( data < m_ptr1 )
				needsMove = true;
		}
		// growing downward otherwise
		else if ( data >= m_ptr1 ) {
			needsMove = true;
		}
		if ( ! needsMove ) continue;
		// move it. m_inDumpLoop should still
		// be true so we will get added to
		// m_ptr2
		int32_t size;
		if ( tree->m_sizes ) size = tree->m_sizes[i];
		else size = tree->m_fixedDataSize;
			
		if ( size < 0 ) { g_process.shutdownAbort(true); }
		if ( size == 0 ) continue;
			
		// m_inDumpLoop is still true at this point so
		// so allocData should return m_ptr2 guys
		char *newData = (char *)allocData(NULL,size,0);
		if ( ! newData ) {
			log("rdbmem: failed to alloc %i "
				"bytes node %i",(int)size,(int)i);
			continue;
		}
		// debug test
		bool stillNeedsMove = false;
		if ( m_ptr1 < m_ptr2 ) {
			// and the node data is in it...
			if ( newData < m_ptr1 )
				stillNeedsMove = true;
		}
		// growing downward otherwise
		else if ( newData >= m_ptr1 ) {
			stillNeedsMove = true;
		}
		if ( stillNeedsMove ) {// this should never happen!!
			log("rdbmem: olddata=0x%" PTRFMT" newdata=0x%" PTRFMT,
				(PTRTYPE)data, (PTRTYPE)newData);
			log("rdbmem: still needs move!");
		}
		count++;
		gbmemcpy(newData,data,size);
		tree->m_data[i] = newData;
	}
	if ( count > 0 )
		log("rdbmem: moved %i tree nodes for %s",(int)count,
		    m_rdb->m_dbname);
		
	log("rdbmem: stop freeing dumped mem. scanned %i nodes.",(int)scanned);		

	// save primary ptr
	char *tmp = m_ptr1;
	// debug
	//logf(LOG_DEBUG,
	//     "db: freeing dumped mem ptr1=%" PRIx32" ptr2=%" PRIx32".",m_ptr1,m_ptr2);
	// primary pointer, m_ptr1, becomes m_ptr2
	m_ptr1 = m_ptr2;
	// secondary ptr becomes primary
	m_ptr2 = tmp;
	// reset secondary (old primary mem was dumped out to disk)
	if ( m_ptr2 > m_ptr1 ) m_ptr2  = m_mem + m_memSize;
	else                   m_ptr2  = m_mem;
	// no longer 90% full
	m_is90PercentFull = false;
}
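
// freeDumpedMem() relies on RdbMem's two-region scheme: while the old
// primary region is being dumped, new records land in the secondary region,
// and the two swap when the dump finishes. A toy sketch of just that swap;
// the field names are simplified stand-ins, not the real RdbMem members.
#include <cstddef>

struct TwoRegionMem {
	char  *m_mem;      // one big allocation
	size_t m_memSize;
	char  *m_ptr1;     // primary region grows from one end
	char  *m_ptr2;     // secondary region grows from the other end

	// called once the primary region has been dumped to disk
	void swapAfterDump ( ) {
		char *tmp = m_ptr1;
		m_ptr1 = m_ptr2;   // secondary becomes the new primary
		m_ptr2 = tmp;
		// reset the now-empty secondary to its end of the buffer
		if ( m_ptr2 > m_ptr1 ) m_ptr2 = m_mem + m_memSize;
		else                   m_ptr2 = m_mem;
	}
};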
Example #26
bool Images::downloadImages () {
	// all done if we got a valid thumbnail
	//if ( m_thumbnailValid ) return true;

	int32_t  srcLen;
	char *src = NULL;
	int32_t node;

	// . download each leftover image
	// . stop as soon as we get one with good dimensions
	// . make a thumbnail of that one
	for (  ; m_j < m_numImages ; m_j++ , m_phase = 0 ) {

		// did collection get nuked?
		CollectionRec *cr = g_collectiondb.getRec(m_collnum);
		if ( ! cr ) { g_errno = ENOCOLLREC; return true; }

		// clear error
		g_errno = 0;

		if ( m_phase == 0 ) {
			// advance
			m_phase++;
			// only if not diffbot, we set "src" above for it
			// get img tag node
			node = m_imageNodes[m_j];
			// get the url of the image
			src = getImageUrl ( m_j , &srcLen );
			// use "pageUrl" as the baseUrl
			m_imageUrl.set( m_pageUrl, src, srcLen );
			// if we should stop, stop
			if ( m_stopDownloading ) break;
			// skip if bad or not unique
			if ( m_errors[m_j] ) continue;
			// set status msg
			sprintf ( m_statusBuf ,"downloading image %" PRId32,m_j);
			// point to it
			if ( m_xd ) m_xd->setStatus ( m_statusBuf );
		}

		// get image ip
		if ( m_phase == 1 ) {
			// advance
			m_phase++;
			// this increments phase if it should
			if ( ! getImageIp() ) return false;
			// error?
			if ( g_errno ) continue;
		}

		// download the actual image
		if ( m_phase == 2 ) {
			// advance
			m_phase++;
			// download image data
			if ( ! downloadImage() ) return false;
			// error downloading?
			if ( g_errno ) continue;
		}

		// get thumbnail using threaded call to netpbm stuff
		if ( m_phase == 3 ) {
			// advance
			m_phase++;
			// call pnmscale etc. to make thumbnail
			if ( ! makeThumb() ) return false;
			// error downloading?
			if ( g_errno ) continue;
		}

		// error making thumb or just not a good thumb size?
		if ( ! m_thumbnailValid ) {
			// free old image we downloaded, if any
			m_msg13.reset();
			// i guess do this too, it was pointing at it in msg13
			m_imgReply = NULL;
			// try the next image candidate
			continue;
		}

		// it's a keeper
		int32_t urlSize = m_imageUrl.getUrlLen() + 1; // include \0
		// . make our ThumbnailArray out of it
		int32_t need = 0;
		// the array itself
		need += sizeof(ThumbnailArray);
		// and each thumbnail it contains
		need += urlSize;
		need += m_thumbnailSize;
		need += sizeof(ThumbnailInfo);
		// reserve it; bail (without failing the doc) if we cannot
		if ( ! m_imageBuf.reserve ( need ) ) { g_errno = 0; return true; }
		// point to array
		ThumbnailArray *ta =(ThumbnailArray *)m_imageBuf.getBufStart();
		// set that as much as possible, version...
		ta->m_version = 0;
		// and thumb count
		ta->m_numThumbnails = 1;
		// now store the thumbnail info
		ThumbnailInfo *ti = ta->getThumbnailInfo (0);
		// and set our one thumbnail
		ti->m_origDX = m_dx;
		ti->m_origDY = m_dy;
		ti->m_dx = m_tdx;
		ti->m_dy = m_tdy;
		ti->m_urlSize = urlSize;
		ti->m_dataSize = m_thumbnailSize;
		// now copy the data over sequentially
		char *p = ti->m_buf;
		// the image url
		gbmemcpy(p,m_imageUrl.getUrl(),urlSize);
		p += urlSize;
		// the image thumbnail data
		gbmemcpy(p,m_imgData,m_thumbnailSize);
		p += m_thumbnailSize;
		// update buf length of course
		m_imageBuf.setLength ( p - m_imageBuf.getBufStart() );

		// validate the buffer
		m_imageBufValid = true;

		// save mem. do this after because m_imgData uses m_msg13's
		// reply buf to store the thumbnail for now...
		m_msg13.reset();
		m_imgReply = NULL;

		g_errno = 0;

		return true;
	}

	// don't tell caller EBADIMG it will make him fail to index doc
	g_errno = 0;

	return true;
}
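
// downloadImages() is re-entrant: m_phase records how far the current image
// got, every blocking step returns false, and its callback re-enters the
// same loop later to resume where it left off. A stripped-down sketch of
// that pattern; the class and step names here are illustrative only.
class PhasedWorker {
public:
	// returns false if blocked (a callback will re-enter work()),
	// true when every item has been processed
	bool work ( ) {
		for ( ; m_j < m_numItems ; m_j++ , m_phase = 0 ) {
			// bump the phase *before* each step so that when a
			// blocked step calls us back we resume at the next one
			if ( m_phase == 0 ) { m_phase++; if ( ! stepA() ) return false; }
			if ( m_phase == 1 ) { m_phase++; if ( ! stepB() ) return false; }
			if ( m_phase == 2 ) { m_phase++; if ( ! stepC() ) return false; }
		}
		return true;
	}
private:
	// each step returns false if it launched async work that will
	// call work() again when it completes
	bool stepA ( ) { return true; }
	bool stepB ( ) { return true; }
	bool stepC ( ) { return true; }
	int m_j        = 0;
	int m_numItems = 0;
	int m_phase    = 0;
};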
Example #27
// . slot should be auto-nuked upon transmission or error
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
	// get the state
	State00 *st0 = (State00 *)state;
	// extract the udp slot and list and msg5
	UdpSlot   *slot =  st0->m_slot;
	RdbList   *list = &st0->m_list;
	Msg5      *msg5 = &st0->m_msg5;
	UdpServer *us   =  st0->m_us;
	// sanity check -- ensure they match
	//if ( niceness != st0->m_niceness )
	//	log("Msg0: niceness mismatch");
	// debug msg
	//if ( niceness != 0 ) 
	//	log("HEY! niceness is not 0");
	// timing debug
	if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) {
		//log("Msg0:hndled request %"UINT64"",gettimeofdayInMilliseconds());
		int32_t size = -1;
		if ( list ) size     = list->getListSize();
		log(LOG_TIMING|LOG_DEBUG,
		    "net: msg0: Handled request for data. "
		    "Now sending data termId=%"UINT64" size=%"INT32""
		    " transId=%"INT32" ip=%s port=%i took=%"INT64" "
		    "(niceness=%"INT32").",
		    g_posdb.getTermId(msg5->m_startKey),
		    size,slot->m_transId,
		    iptoa(slot->m_ip),slot->m_port,
		    gettimeofdayInMilliseconds() - st0->m_startTime ,
		    st0->m_niceness );
	}
	// debug
	//if ( ! msg5->m_includeTree )
	//	log("hotit\n");
	// on error nuke the list and its data
	if ( g_errno ) {
		mdelete ( st0 , sizeof(State00) , "Msg0" );
		delete (st0);
		// TODO: free "slot" if this send fails
		us->sendErrorReply ( slot , g_errno );
		return;
	}

	QUICKPOLL(st0->m_niceness);
	// point to the serialized list in "list"
	char *data      = list->getList();
	int32_t  dataSize  = list->getListSize();
	char *alloc     = list->getAlloc();
	int32_t  allocSize = list->getAllocSize();
	// tell list not to free the data since it is a reply so UdpServer
	// will free it when it destroys the slot
	list->setOwnData ( false );
	// keep track of stats
	Rdb *rdb = getRdbFromId ( st0->m_rdbId );
	if ( rdb ) rdb->sentReplyGet ( dataSize );
	// TODO: can we free any memory here???

	// keep track of how long it takes to complete the send
	st0->m_startTime = gettimeofdayInMilliseconds();
	// debug point
	int32_t oldSize = msg5->m_minRecSizes;
	int32_t newSize = msg5->m_minRecSizes + 20;
	// watch for wrap around
	if ( newSize < oldSize ) newSize = 0x7fffffff;
	if ( dataSize > newSize && list->getFixedDataSize() == 0 &&
	     // do not annoy me with these linkdb msgs
	     dataSize > newSize+100 ) 
		log(LOG_LOGIC,"net: msg0: Sending more data than what was "
		    "requested. Ineffcient. Bad engineer. dataSize=%"INT32" "
		    "minRecSizes=%"INT32".",dataSize,oldSize);
	/*
	// always compress these lists
	if ( st0->m_rdbId == RDB_SECTIONDB ) { // && 1 == 3) {

		// get sh48, the sitehash
		key128_t *startKey = (key128_t *)msg5->m_startKey ;
		int64_t sh48 = g_datedb.getTermId(startKey);

		// debug
		//log("msg0: got sectiondblist from disk listsize=%"INT32"",
		//    list->getListSize());

		if ( dataSize > 50000 )
			log("msg0: sending back list rdb=%"INT32" "
			    "listsize=%"INT32" sh48=0x%"XINT64"",
			    (int32_t)st0->m_rdbId,
			    dataSize,
			    sh48);

		// save it
		int32_t origDataSize = dataSize;
		// store compressed list on itself
		char *dst = list->m_list;
		// warn if niceness is 0!
		if ( st0->m_niceness == 0 )
			log("msg0: compressing sectiondb list at niceness 0!");
		// compress the list
		uint32_t lastVoteHash32 = 0LL;
		SectionVote *lastVote = NULL;
		for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
			// breathe
			QUICKPOLL ( st0->m_niceness );
			// get rec
			char *rec = list->getCurrentRec();
			// for ehre
			key128_t *key = (key128_t *)rec;
			// the score is the bit which is was set in 
			// Section::m_flags for that docid
			int32_t secType = g_indexdb.getScore ( (char *)key );
			// 0 means it probably used to count # of voters
			// from this site, so i don't think xmldoc uses
			// that any more
			if ( secType == SV_SITE_VOTER ) continue;
			// treat key like a datedb key and get the taghash
			uint32_t h32 = g_datedb.getDate ( key );
			// get data/vote from the current record in the 
			// sectiondb list
			SectionVote *sv=(SectionVote *)list->getCurrentData ();
			// get the average score for this doc
			float avg = sv->m_score ;
			if ( sv->m_numSampled > 0.0 ) avg /= sv->m_numSampled;
			// if same as last guy, add to it
			if ( lastVoteHash32 == h32 && lastVote ) {
				// turn possible multi-vote into single docid
				// into a single vote, with the score averaged.
				lastVote->m_score += avg;
				lastVote->m_numSampled++;
				continue;
			}
			// otherwise, add in a new guy!
			*(key128_t *)dst = *key;
			dst += sizeof(key128_t);
			// the new vote
			SectionVote *dsv = (SectionVote *)dst;
			dsv->m_score = avg;
			dsv->m_numSampled = 1;
			// set this
			lastVote = dsv;
			lastVoteHash32 = h32;
			// skip over
			dst += sizeof(SectionVote);
		}
		// update the list size now for sending back
		dataSize = dst - data;
		// if the list was over the requested minrecsizes we need
		// to set a flag so that the caller will do a re-call.
		// so making the entire odd, will be the flag.
	        if ( origDataSize > msg5->m_minRecSizes && 
		     dataSize < origDataSize ) {
			*dst++ = '\0';
			dataSize++;
		}

		// debug
		//log("msg0: compressed sectiondblist from disk "
		//    "newlistsize=%"INT32"", dataSize);
		
		// use this timestamp
		int32_t now = getTimeLocal();//Global();
		// finally, cache this sucker
		s_sectiondbCache.addRecord ( msg5->m_coll,
					     (char *)startKey,//(char *)&sh48
					     data, 
					     dataSize ,
					     now );
		// ignore errors
		g_errno = 0;
	}
	*/
		    
	//
	// for linkdb lists, remove all the keys that have the same IP32
	// and store a count of what we removed somewhere
	//
	if ( st0->m_rdbId == RDB_LINKDB ) {
		// store compressed list on itself
		char *dst = list->m_list;
		// keep stats
		int32_t totalOrigLinks = 0;
		int32_t ipDups = 0;
		int32_t lastIp32 = 0;
		char *listEnd = list->getListEnd();
		// compress the list
		for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
			// breathe
			QUICKPOLL ( st0->m_niceness );
			// count it
			totalOrigLinks++;
			// get rec
			char *rec = list->getCurrentRec();
			int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec );
			// same as one before?
			if ( ip32 == lastIp32 && 
			     // are we the last rec? include that for
			     // advancing the m_nextKey in Linkdb more 
			     // efficiently.
			     rec + LDBKS < listEnd ) {
				ipDups++;
				continue;
			}
			// store it
			gbmemcpy (dst , rec , LDBKS );
			dst += LDBKS;
			// update it
			lastIp32 = ip32;
		}
		// . if we removed one key, store the stats
		// . caller should recognize the reply is not a multiple of
		//   the linkdb key size LDBKS and know it's there
		if ( ipDups ) {
			//*(int32_t *)dst = totalOrigLinks;
			//dst += 4;
			//*(int32_t *)dst = ipDups;
			//dst += 4;
		}
		// update list parms
		list->m_listSize = dst - list->m_list;
		list->m_listEnd  = list->m_list + list->m_listSize;
		data      = list->getList();
		dataSize  = list->getListSize();
	}


	//log("sending replySize=%"INT32" min=%"INT32"",dataSize,msg5->m_minRecSizes);
	// . TODO: dataSize may not equal list->getListMaxSize() so
	//         Mem class may show an imbalance
	// . now g_udpServer is responsible for freeing data/dataSize
	// . the "true" means to call doneSending_ass() from the signal handler
	//   if need be
	st0->m_us->sendReply_ass  ( data            ,
				    dataSize        ,
				    alloc           , // alloc
				    allocSize       , // alloc size
				    slot            ,
				    60              ,
				    st0             ,
				    doneSending_ass ,
				    -1              ,
				    -1              ,
				    true            );
}	
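
// The RDB_LINKDB branch above rewrites the list in place, keeping only the
// first record per linker IP (and always the final record so the caller's
// next-key cursor still advances). A standalone sketch of that in-place
// dedup over fixed-size records; the 28-byte size mirrors LDBKS and the
// key accessor is a stand-in for the real g_linkdb helper.
#include <cstdint>
#include <cstring>

static const int32_t kRecSize = 28;   // assumed fixed linkdb record size

// returns the new byte length of the list after removing ip duplicates
static int32_t dedupByLinkerIp ( char *list , int32_t listSize ,
				 int32_t (*getIp)(const char *rec) ) {
	char   *dst    = list;
	char   *end    = list + listSize;
	int32_t lastIp = 0;
	for ( char *rec = list ; rec + kRecSize <= end ; rec += kRecSize ) {
		int32_t ip     = getIp ( rec );
		bool    isLast = ( rec + kRecSize >= end );
		if ( ip == lastIp && ! isLast ) continue;   // duplicate ip
		memmove ( dst , rec , kRecSize );           // keep this one
		dst += kRecSize;
		lastIp = ip;
	}
	return dst - list;
}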
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
	// are we the admin?
	//bool isAdmin    = g_collectiondb.hasPermission ( r , s );
	// get the collection record
	CollectionRec *cr = g_collectiondb.getRec ( r );
	if ( ! cr ) {
		log("admin: No collection record found "
		    "for specified collection name. Could not add sites to "
		    "tagdb. Returning HTTP status of 500.");
		return g_httpServer.sendErrorReply ( s , 500 ,
						  "collection does not exist");
	}
	/*
	bool isAssassin = cr->isAssassin ( s->m_ip );
	if ( isAdmin ) isAssassin = true;
	// bail if permission denied
	if ( ! isAssassin && ! cr->hasPermission ( r , s ) ) {
		log("admin: Bad collection name or password. Could not add "
		    "sites to tagdb. Permission denied.");
		return sendPagexxxx ( s , r , 
						    "Collection name or "
						    "password is incorrect");
	}
	*/


	// no permmission?
	bool isMasterAdmin = g_conf.isMasterAdmin ( s , r );
	bool isCollAdmin = g_conf.isCollAdmin ( s , r );
	if ( ! isMasterAdmin &&
	     ! isCollAdmin ) {
		g_errno = ENOPERM;
		g_httpServer.sendErrorReply(s,g_errno,mstrerror(g_errno));
		return true;
	}


	// get the collection
	int32_t collLen = 0;
	char *coll   = r->getString("c", &collLen, NULL);
	// check for generate catdb command
	int32_t genCatdb = r->getLong("gencatdb", 0);
	// check for a lookup url
	int32_t urlLen = 0;
	char *url   = r->getString("caturl", &urlLen, NULL);
	// create the State
	StateCatdb *st;
	try { st = new (StateCatdb); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("catdb: Unable to allocate %"INT32" bytes for StateCatdb",
		    (int32_t)sizeof(StateCatdb) );
		return true;
	}
	mnew ( st, sizeof(StateCatdb), "PageCatdb" );
	// fill the state
	st->m_socket = s;
	st->m_r.copy(r);
	// copy collection
	if (collLen > MAX_COLL_LEN) collLen = MAX_COLL_LEN - 1;
	gbmemcpy(st->m_coll, coll, collLen);
	st->m_coll[collLen] = '\0';
	st->m_collLen = collLen;
	// defaults
	st->m_catLookup = false;
	st->m_genCatdb  = false;
	st->m_startTime = gettimeofdayInMilliseconds();
	// generate catdb if requested
	if (genCatdb == 1) {
		st->m_genCatdb = true;
		if (!st->m_msg2a.makeCatdb ( st->m_coll,
					     st->m_collLen,
					     false,
					     st,
					     sendReplyWrapper ) )
			return false;
	}
	// update catdb from .new files
	else if (genCatdb == 2) {
		st->m_genCatdb = true;
		if (!st->m_msg2a.makeCatdb ( st->m_coll,
					     st->m_collLen,
					     true,
					     st,
					     sendReplyWrapper ) )
			return false;
	}
	// lookup a url if requested
	else if (url && urlLen > 0) {
		st->m_catLookup = true;
		// set the url
		st->m_url.set(url, urlLen);
		// call msg8b to lookup in catdb
		if (!st->m_msg8b.getCatRec ( &st->m_url,
					     NULL,//st->m_coll,
					     0,//st->m_collLen,
					      true,
					      1,
					      &st->m_catRec,
					      st,
					      gotCatInfoWrapper))
					      //RDB_CATDB ) )
					      //RDB_TAGDB ) )
			return false;
	}
	// otherwise return the regular page
	return sendReply ( st );
}
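
// sendPageCatdb() follows the blocking convention used throughout this code:
// allocate a state object, start the async call with a callback, return
// false if it blocked, and let the callback build and send the reply. A bare
// sketch of that convention; the state and function names are illustrative,
// not real Gigablast APIs.
struct ReqState {
	void *m_socket;    // whatever the reply will be written to
};

static bool finishReply ( ReqState *st ) {
	// ... build and send the reply here ...
	delete st;
	return true;
}

static void asyncDoneWrapper ( void *state ) {
	finishReply ( (ReqState *)state );
}

// returns false if we must wait for a callback, true if fully handled
static bool handleRequest ( void *socket ,
			    bool (*startAsync)(void *state,
					       void (*cb)(void *state)) ) {
	ReqState *st = new ReqState;
	st->m_socket = socket;
	// startAsync() returns false if it blocked; it will then call
	// asyncDoneWrapper() when the work completes
	if ( ! startAsync ( st , asyncDoneWrapper ) ) return false;
	// completed without blocking, so finish synchronously
	return finishReply ( st );
}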
// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and 
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
bool buildProxyTable ( ) {

	// scan the NEW list of proxy ip/port pairs in g_conf
	char *p = g_conf.m_proxyIps.getBufStart();

	HashTableX tmptab;
	tmptab.set(8,0,16,NULL,0,false,"tmptab");

	// scan the user inputted space-separated list of ip:ports
	// (optional username:password@ip:port)
	for ( ; *p ; ) {
		// skip white space
		if ( is_wspace_a(*p) ) { p++; continue; }

		// skip http://
		if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }

		// scan in an ip:port
		char *s = p; char *portStr = NULL;
		int32_t dc = 0, pc = 0, gc = 0, bc = 0;
		const char *msg;

		char *usernamePwd = NULL;
		int32_t usernamePwdLen = 0;
		char *ipStart = p;

		// scan all characters until we hit \0 or another whitespace
		for ( ; *s && !is_wspace_a(*s); s++) {

			if ( *s == '@' ) {
				// must be username:pwd
				if ( pc != 1 ) {
					msg = "bad username:password";
					goto hadError;
				}
				usernamePwd = p;
				usernamePwdLen = s - p;
				if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
					msg = "username:password too long";
					goto hadError;
				}
				dc = 0;
				gc = 0;
				bc = 0;
				pc = 0;
				portStr = NULL;
				ipStart = s+1;
				continue;
			}

			if ( *s == '.' ) { dc++; continue; }
			if ( *s == ':' ) { portStr=s; pc++; continue; }
			if ( is_digit(*s) ) { gc++; continue; }
			bc++;
			continue;
		}
		// ensure it is a legit ip:port combo
		msg = NULL;
		if ( gc < 4 ) 
			msg = "not enough digits for an ip";
		if ( pc > 1 )
			msg = "too many colons";
		if ( dc != 3 )
			msg = "need 3 dots for an ip address";
		if ( bc )
			msg = "got illegal char in ip:port listing";
		if ( msg ) {
		hadError:
			char c = *s;
			*s = '\0';
			log("buf: %s for %s",msg,p);
			*s = c;
			return false;
		}

		// convert it
		int32_t iplen = s - ipStart;
		if ( portStr ) iplen = portStr - ipStart;
		int32_t ip = atoip(ipStart,iplen);
		// another sanity check
		if ( ip == 0 || ip == -1 ) {
			log("spider: got bad proxy ip for %s",p);
			return false;
		}

		// and the port default is 80
		int32_t port = 80;
		if ( portStr ) port = atol2(portStr+1,s-portStr-1);
		if ( port < 0 || port > 65535 ) {
			log("spider: got bad proxy port for %s",p);
			return false;
		}


		// . we got a legit ip:port
		// . see if already in our table
		uint64_t ipKey = (uint32_t)ip;
		ipKey <<= 16;
		ipKey |= (uint16_t)(port & 0xffff);

		// also store into tmptable to see what we need to remove
		tmptab.addKey(&ipKey);

		// see if in table
		int32_t islot = s_iptab.getSlot( &ipKey);

		// advance p
		p = s;

		// if in there, keep it as is
		if ( islot >= 0 ) continue;

		// otherwise add new entry
		SpiderProxy newThing;
		memset ( &newThing , 0 , sizeof(SpiderProxy));
		newThing.m_ip = ip;
		newThing.m_port = port;
		newThing.m_lastDownloadTookMS = -1;
		newThing.m_lastSuccessfulTestMS = -1;

		gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen);
		// ensure it is NULL terminated
		newThing.m_usernamePwd[usernamePwdLen] = '\0';

		if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
			return false;
	}		

 redo:
	int32_t removed = 0;
	// scan all SpiderProxies in tmptab
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty buckets in hashtable s_iptab
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the key
		int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
		// must also exist in tmptab, otherwise it got removed by user
		if ( tmptab.isInTable ( &key ) ) continue;
		// skip if not in table
		if ( s_iptab.getSlot ( &key ) < 0 ) {
			log("sproxy: iptable hashing messed up");
			continue;
		}
		// shoot, it got removed. not in the new list of ip:ports
		s_iptab.removeKey ( &key );
		removed++;
		// hashtable is messed up now, start over
		//goto redo;
	}
	if ( removed ) goto redo;
	return true;
}
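
// buildProxyTable() accepts entries of the form [user:pwd@]ip[:port] and
// keys the hash table on (ip << 16) | port. Below is a small standalone
// sketch of that key construction plus a simplified ip:port parse; the real
// parser above also validates digit/dot counts and the username:password
// prefix, and atoip() may use a different byte order than assumed here.
#include <cstdint>
#include <cstdio>

// pack an ip and port into the 64-bit key used for the proxy table
static uint64_t makeProxyKey ( int32_t ip , int32_t port ) {
	uint64_t key = (uint32_t)ip;
	key <<= 16;
	key |= (uint16_t)(port & 0xffff);
	return key;
}

// parse "a.b.c.d[:port]", defaulting the port to 80
static bool parseIpPort ( const char *s , int32_t *ip , int32_t *port ) {
	unsigned a , b , c , d;
	int p = 80;
	int n = sscanf ( s , "%u.%u.%u.%u:%d" , &a , &b , &c , &d , &p );
	if ( n < 4 ) return false;
	if ( a > 255 || b > 255 || c > 255 || d > 255 ) return false;
	if ( p < 0 || p > 65535 ) return false;
	*ip   = (int32_t)( (a << 24) | (b << 16) | (c << 8) | d );
	*port = p;
	return true;
}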
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the config of this host
// . call g_httpServer.sendDynamicPage() to send it
bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
	// build the page into a 64k stack-backed SafeBuf
	char  buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);
	p.setLabel ( "perfgrph" );

	// print standard header
	g_pages.printAdminTop ( &p , s , r );



	// password, too
	//int32_t pwdLen = 0;
	//char *pwd = r->getString ( "pwd" , &pwdLen );

	int32_t autoRefresh = r->getLong("rr", 0);
	if(autoRefresh > 0) {
		p.safePrintf("<script language=\"JavaScript\"><!--\n ");

		p.safePrintf( "\nfunction timeit() {\n"
			     " setTimeout('loadXMLDoc(\"%s&refresh=1"
			     "&dontlog=1\")',\n"
			     " 500);\n }\n ",  
			     r->getRequest() + 4/*skip over GET*/); 
		p.safePrintf(
		     "var req;"
		     "function loadXMLDoc(url) {"
		     "    if (window.XMLHttpRequest) {"
		     "        req = new XMLHttpRequest();" 
		     "        req.onreadystatechange = processReqChange;"
		     "        req.open(\"GET\", url, true);"
		     "        req.send(null);"
		     "    } else if (window.ActiveXObject) {"
		     "        req = new ActiveXObject(\"Microsoft.XMLHTTP\");"
		     "        if (req) {"
		     "            req.onreadystatechange = processReqChange;"
		     "            req.open(\"GET\", url, true);"
		     "            req.send();"
		     "        }"
		     "    }"
		     "}"
		     "function processReqChange() {"
		     "    if (req.readyState == 4) {"
		     "        if (req.status == 200) {"
		     "        var uniq = new Date();"
		     "        uniq.getTime();"
		     "   document.diskgraph.src=\"/diskGraph%" PRId32".gif?\" + uniq;"
		     "        timeit();"
		     "    } else {"
	     //   "            alert(\"There was a problem retrieving \");"
		     "        }"
		     "    }"
		     "} \n ",g_hostdb.m_hostId);

		p.safePrintf( "// --></script>");

		p.safePrintf("<body onLoad=\"timeit();\">"); 
	}


	//get the 'path' part of the request.
	char rbuf[1024];
	if(r->getRequestLen() > 1023) {
		gbmemcpy( rbuf, r->getRequest(), 1023);
	}
	else {
		gbmemcpy( rbuf, r->getRequest(), r->getRequestLen());
	}
	char* rbufEnd = rbuf;
	//skip GET
	while (!isspace(*rbufEnd)) rbufEnd++;
	//skip space(s)
	while (isspace(*rbufEnd)) rbufEnd++;
	//skip request path
	while (!isspace(*rbufEnd)) rbufEnd++;
	*rbufEnd = '\0';
	//char* refresh = strstr(rbuf, "&rr=");


	// print resource table
	// columns are the dbs
	p.safePrintf(
		       //"<center>Disk Statistics<br><br>"
		       "<center>"
		       //"<br>"
		       //"<img name=\"diskgraph\" 
		       //src=/diskGraph%" PRId32".gif><br><br>",
		       //g_hostdb.m_hostId );
		     );

	// now try using absolute divs instead of a GIF
	g_stats.printGraphInHtml ( p );

	/*
	if(autoRefresh > 0) {
		if(refresh) *(refresh+4) = '0';
		p.safePrintf(
			     "<center><a href=\"%s\">Auto Refresh Off</a>"
			     "</center>",
			     rbuf + 4);  // skip over GET
		p.safePrintf( "<input type=\"hidden\" "
			      "name=\"dontlog\" value=\"1\">");
		
	}
	else {
		char* rr = "";
		if(refresh) *(refresh+4) = '1';
		else rr = "&rr=1";
		p.safePrintf(
			     "<center><a href=\"%s%s\">Auto Refresh</a>"
			     "</center>",
			     rbuf + 4, rr);  // skip over "GET "
	}
	*/

	// print the key
	p.safePrintf (
		      "<br>"
		       "<center>"
		       //"<table %s>"
		       //"<tr>%s</tr></table>"

		       "<style>"
		       ".poo { background-color:#%s;}\n"
		       "</style>\n"


		       "<table %s>"

		       // black
		       "<tr class=poo>"
		       "<td bgcolor=#000000>&nbsp; &nbsp;</td>"
		       "<td> High priority disk read. "
		       "Thicker lines for bigger reads.</td>"

		       // grey
		       "<td bgcolor=#808080>&nbsp; &nbsp;</td>"
		       "<td> Low priority disk read. "
		       "Thicker lines for bigger reads.</td>"
		       "</tr>"


		       // red
		       "<tr class=poo>"
		       "<td bgcolor=#ff0000>&nbsp; &nbsp;</td>"
		       "<td> Disk write. "
		       "Thicker lines for bigger writes.</td>"

		       // light brown
		       "<td bgcolor=#b58869>&nbsp; &nbsp;</td>"
		       "<td> Processing end user query. No raw= parm.</td>"
		       "</tr>"


		       // dark brown
		       "<tr class=poo>"
		       "<td bgcolor=#753d30>&nbsp; &nbsp;</td>"
		       "<td> Processing raw query. Has raw= parm.</td>"

		       // blue
		       "<td bgcolor=#0000ff>&nbsp; &nbsp;</td>"
		       "<td> Summary extraction for one document.</td>"
		       "</tr>"


		       // pinkish purple
		       "<tr class=poo>"
		       "<td bgcolor=#aa00aa>&nbsp; &nbsp;</td>"
		       "<td> Send data over network. (low priority)"
		       "Thicker lines for bigger sends.</td>"

		       // yellow
		       "<td bgcolor=#aaaa00>&nbsp; &nbsp;</td>"
		       "<td> Read all termlists (msg2). (low priority)"
		       "Thicker lines for bigger reads.</td>"
		       "</tr>"

		       // pinkish purple
		       "<tr class=poo>"
		       "<td bgcolor=#ff00ff>&nbsp; &nbsp;</td>"
		       "<td> Send data over network.  (high priority)"
		       "Thicker lines for bigger sends.</td>"

		       // light yellow
		       "<td bgcolor=#ffff00>&nbsp; &nbsp;</td>"
		       "<td> Read all termlists (msg2). (high priority)"
		       "Thicker lines for bigger reads.</td>"
		       "</tr>"


		       // dark purple
		       "<tr class=poo>"
		       "<td bgcolor=#8220ff>&nbsp; &nbsp;</td>"
		       "<td> Get all summaries for results.</td>"

		       // turquoise
		       "<td bgcolor=#00ffff>&nbsp; &nbsp;</td>"
		       "<td> Merge multiple disk reads. Real-time searching. "
		       "Thicker lines for bigger merges.</td>"
		       "</tr>"


		       // white
		       "<tr class=poo>"
		       "<td bgcolor=#ffffff>&nbsp; &nbsp;</td>"
		       "<td> Uncompress cached document.</td>"

		       // orange
		       "<td bgcolor=#fea915>&nbsp; &nbsp;</td>"
		       "<td> Parse a document. Blocks CPU.</td>"
		       "</tr>"


		       // bright green
		       "<tr class=poo>"
		       "<td bgcolor=#00ff00>&nbsp; &nbsp;</td>"
		       "<td> Compute search results. "
		       "All terms required. rat=1.</td>"

		       // dark green
		       "<td bgcolor=#008000>&nbsp; &nbsp;</td>"
		       "<td> Compute search results. "
		       "Not all terms required. rat=0.</td>"
		       "</tr>"

		       // bright green
		       "<tr class=poo>"
		       "<td bgcolor=#ccffcc>&nbsp; &nbsp;</td>"
		       "<td> Inject a document"
		       "</td>"

		       // dark green
		       "<td bgcolor=#FFFACD>&nbsp; &nbsp;</td>"
		       "<td> Compute related pages. "
		       "</td>"
		       "</tr>"

		       "<tr class=poo>"

		       "<td bgcolor=#d1e1ff>&nbsp; &nbsp;</td>"
		       "<td> Compute Gigabits. "
		       "</td>"

		       "<td bgcolor=#009fe5>&nbsp; &nbsp;</td>"
		       "<td> Quick Poll. "
		       "</td>"

		       "</tr>"


		       "<tr class=poo>"

		       "<td bgcolor=#0000b0>&nbsp; &nbsp;</td>"
		       "<td> \"Summary\" extraction (low priority) "
		       "</td>"

		       "<td bgcolor=#ffffff>&nbsp; &nbsp;</td>"
		       "<td> &nbsp; "
		       "</td>"

		       "</tr>"
		       

		       "</table>"
		       "</center>"
		       , LIGHT_BLUE 
		       , TABLE_STYLE
		       //,g_stats.m_keyCols.getBufStart() && 
		       //g_conf.m_dynamicPerfGraph ? 
		       //g_stats.m_keyCols.getBufStart() : ""
		       );

	if(autoRefresh > 0) p.safePrintf("</body>"); 

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	int32_t bufLen = p.length();
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	return g_httpServer.sendDynamicPage ( s, p.getBufStart(), bufLen );
}
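
// sendPagePerf() above shows the usual dynamic-page skeleton in this code
// base: a stack-backed SafeBuf, printAdminTop() for the standard header,
// then g_httpServer.sendDynamicPage() to ship it. A bare outline of that
// skeleton for a hypothetical page handler; it assumes the surrounding
// Gigablast headers and globals (SafeBuf, Pages, HttpServer) are available.
bool sendPageExample ( TcpSocket *s , HttpRequest *r ) {
	// build the page into a 64k stack-backed buffer
	char buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);
	p.setLabel ( "examplepg" );
	// standard admin header and navigation
	g_pages.printAdminTop ( &p , s , r );
	// the page body goes here
	p.safePrintf ( "<center>example page body</center>" );
	// wrap it in a mime and send it back on the socket
	return g_httpServer.sendDynamicPage ( s , p.getBufStart() , p.length() );
}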