Example #1
DxArchiverParameters::DxArchiverParameters(const DxArchiverParameters& rhs):
   SeekerParameterGroup(rhs.getCommand()),
   internal_(new DxArchiverParametersInternal(*rhs.internal_))
{
   setSite(rhs.getSite());
   addParameters();
}
Example #2
DxParameters::DxParameters(const DxParameters& rhs):
   SeekerParameterGroup(rhs.getCommand(), rhs.getDbTableName(), 
			rhs.getIdColNameInActsTable()),
   internal_(new DxParametersInternal(*rhs.internal_))
{
   setSite(rhs.getSite());
   addParameters();
}
Example #3
void Voronoi::ParabolaNode::_move(std::unique_ptr<ParabolaNode> parabola)
{
	_leftSibling = parabola->_leftSibling;
	_rightSibling = parabola->_rightSibling;
	_leftChild = std::move(parabola->_leftChild);
	_rightChild = std::move(parabola->_rightChild);
	setSite(parabola->site());
	setEdge(parabola->edge());
	setEvent(parabola->event());
}
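Example #3 transfers ownership of a subtree: the unique_ptr children are moved out of the absorbed node (leaving it empty), while the sibling pointers and the site/edge/event handles are plain copies. Below is a minimal, self-contained sketch of the same pattern using a hypothetical Node type; it is not part of the Voronoi sources above.

#include <memory>
#include <utility>

// Hypothetical stand-in type; not part of the Voronoi sources above.
struct Node
{
	std::unique_ptr<Node> left;
	std::unique_ptr<Node> right;
	int payload = 0;

	// Absorb another node's state: unique_ptr children are moved
	// (the donor ends up empty), plain data members are copied.
	void absorb(std::unique_ptr<Node> donor)
	{
		left = std::move(donor->left);
		right = std::move(donor->right);
		payload = donor->payload;
	}	// donor is destroyed here, owning nothing
};

int main()
{
	auto donor = std::make_unique<Node>();
	donor->left = std::make_unique<Node>();
	donor->payload = 42;

	Node target;
	target.absorb(std::move(donor));	// target now owns the child subtree
	return target.left ? 0 : 1;
}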
Example #4
DxParameters& DxParameters::operator=(const DxParameters& rhs)
{
   if (this == &rhs) {
      return *this;
   }
  
   setCommand(rhs.getCommand());
   eraseParamList();
   setSite(rhs.getSite());
   delete internal_;
   internal_ = new DxParametersInternal(*rhs.internal_);
   addParameters();
   return *this;
}
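Example #4 guards against self-assignment and then deletes internal_ before allocating the replacement; if the new DxParametersInternal copy construction threw, the object would be left holding a dangling pointer. Below is a minimal copy-and-swap sketch with hypothetical stand-in types (not the DxParameters code above) that produces the same deep copy with the strong exception guarantee.

#include <utility>

// Hypothetical stand-in types; not the parameter classes shown above.
struct Impl { int value = 0; };

class Params {
public:
   Params() : internal_(new Impl) {}
   Params(const Params& rhs) : internal_(new Impl(*rhs.internal_)) {}
   ~Params() { delete internal_; }

   // Copy-and-swap: build the copy first, then swap; the old Impl is
   // not deleted until the new one has been constructed successfully.
   Params& operator=(const Params& rhs)
   {
      Params tmp(rhs);                      // may throw; *this is untouched
      std::swap(internal_, tmp.internal_);  // nothrow
      return *this;                         // tmp's destructor frees the old Impl
   }

private:
   Impl* internal_;
};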
Example #5
Voronoi::ParabolaNode * Voronoi::ParabolaNode::emplaceParabola(const Point & site)
{
	if (!isValid()) {
		setSite(site);
		return this;
	}

	ParabolaNode * parabola = findParabola(site);
	assert(parabola->site().y() >= site.y());

	// disable the event that has this parabola's site as the middle of a triple.
	parabola->setEvent(nullptr);

	// Handle special case when: one_parabola_in_root && parabolaSite.y() == site.y()
	const Point parabolaSite = parabola->site();
	if (isLeaf(this) && parabolaSite.y() == site.y()) {
		if (site.x() < parabolaSite.x()) {
			parabola->_createChildren(site, parabolaSite);
			return parabola->leftChild();
		}
		else {
			parabola->_createChildren(parabolaSite, site);
			return parabola->rightChild();
		}
	}
	else if (site.x() < parabolaSite.x()) {
		// Create new parabola branch
		parabola->_createChildren(Point(), parabolaSite);
		parabola->leftChild()->_createChildren(parabolaSite, site);

		// Set edge to right sibling from the original parabola
		parabola->_rightChild->setEdge(parabola->edge());  // the rightmost parabola
		parabola->setEdge(nullptr);

		return parabola->leftChild()->rightChild();
	}
	else {
		// Create new parabola branch
		parabola->_createChildren(parabolaSite, Point());
		parabola->rightChild()->_createChildren(site, parabolaSite);

		// Set edge to right sibling from the original parabola
		parabola->rightChild()->_rightChild->setEdge(parabola->edge());  // the rightmost parabola
		parabola->setEdge(nullptr);

		return parabola->rightChild()->leftChild();
	}
}
// . returns false if blocked, returns true and sets g_errno on error
// . returns true with m_allDone set to false to process another subsite
// . we use voters to set SEC_VOTE_STATIC and SEC_VOTE_DYNAMIC flags
//   in addition to SEC_VOTE_TEXTY and SEC_VOTE_UNIQUE
bool SiteGetter::gotSiteList ( ) {
	// assume not trying again
	m_tryAgain = false;
	// error?
	if ( g_errno ) {
		// timeouts usually...
		log("site: sitegetter gotList: %s",mstrerror(g_errno));
		// mark it so caller knows
		m_errno = g_errno;
		// so try again without increasing m_pathDepth
		// i've seen a host return EBADRDBID for some reason
		// and put host #0 in an infinite log spam loop so stop it
		if ( g_errno != EBADRDBID ) m_tryAgain = true;
		return true;
	}
	// how many urls at this path depth?
	int32_t count = ( m_list.getListSize() - 6 ) / 6;
	// if we do not have enough to qualify this as a subsite path depth
	// try the next
	if ( count < 100 ) { 
		// increment and try again
		m_pathDepth++; 
		// clear just in case
		g_errno = 0;
		// get another list if we can, m_allDone is not true yet
		if ( m_pathDepth < m_maxPathDepth ) {
			m_tryAgain = true;
			return true;
		}
	}

	// ok, i guess this indicates we have a subsite level
	m_sitePathDepth = m_pathDepth;

	// this basically means none!
	if ( m_pathDepth >= m_maxPathDepth ) m_sitePathDepth = -1;

	// . sets m_site and m_siteLen from m_url
	// . this returns false if blocked, true otherwise
	return setSite ( ) ;
}
// . also sets m_sitePathDepth to what it should be
// . -1 indicates unknown (not enough data, etc.) or host/domain is the site
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . sets m_site to reference into "url" so XmlDoc::updateTagdb() can just
//   pass a bunch of site ptrs to msg9a
// . "url" MUST BE NORMALIZED via Url.cpp. so using Links' buffer is ok!
// . TODO: consider setting "state" to null if your url host has tons of inlinks
bool SiteGetter::getSite ( char   *url      ,
			   TagRec *gr       ,
			   int32_t    timestamp,
			   //char   *coll     ,
			   collnum_t collnum,
			   int32_t    niceness ,
			   //bool    addTags  ,
			   void   *state    ,
			   void (* callback)(void *state) ) {
	
	// save it
	m_gr       = gr;
	m_url      = url;
	//m_coll     = coll;
	m_collnum = collnum;
	//m_addTags  = addTags;
	m_state    = state;
	m_callback = callback;
	m_timestamp= timestamp;
	m_niceness = niceness;
	m_errno    = 0;

	// is it domain only?
	m_hasSubdomain = ::hasSubdomain ( url );

	// reset
	m_siteLen = 0;
	m_site[0] = '\0';
	m_allDone  = false;
	m_addedTag.reset();

	// set this to unknown for now
	m_sitePathDepth    = -1;
	m_oldSitePathDepth = -1;

	// reset this just in case
	g_errno = 0;

	//
	// HARDCODED algos
	//
	// ~ /user/ /users/ /profile/ myspace facebook linkedin
	//
	if ( setRecognizedSite ( ) ) {
		m_allDone = true;
		return true;
	}

	// bail if nothing else we can do
	if ( ! gr ) return setSite ( ) ;

	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	// g_errno should be set if this is NULL
	if ( ! cr ) return true;
	//if ( ! cr->m_subsiteDetectionEnabled ) return true;

	// check the current tag for an age
	Tag *tag = gr->getTag("sitepathdepth");
	// if there and the age is young, skip it
	int32_t age = -1;
	//int32_t now = getTimeGlobal();
	//if ( tag ) age = now - tag->m_timestamp;
	// to parse consistently for the qa test "qatest123" coll use 
	// "timestamp" as the "current time"
	if ( tag ) age = timestamp - tag->m_timestamp;
	// if there, at least get it (might be -1)
	if ( tag ) m_oldSitePathDepth = atol ( tag->getTagData() );
	// . if older than 10 days, we need to redo it
	// . if caller gives us a timestamp of 0, never redo it!
	if ( age > 10*24*60*60 && timestamp != 0 ) age = -1;

	//if ( strstr(m_url,"http://www.topix.com/yp/albuquerque/c/community-religion-and-spirituality-churches") )
	//	log("hey");

	// . if our site quality is low, forget about dividing it up too
	// . if age is valid, skip it
	// . also if caller does not want a callback, like XmlDoc.cpp,
	//   then use whatever we got
	if ( age >= 0 || ! m_state ) { // || hostRootNumInlinks < 500 ) {
		// do not add to tagdb
		m_state = NULL;
		// just use what we had, it is not expired
		m_sitePathDepth = m_oldSitePathDepth;
		// . now set the site with m_sitePathDepth
		// . sanity check, should not block since m_state is NULL
		if ( ! setSite () ) { char *xx=NULL;*xx=0; }
		// we did not block
		return true;
	}

	// right now we only run on host #0 so we do not flood the cluster
	// with queries...
	if ( g_hostdb.m_hostId != 0 ) { 
		// do not add to tagdb and do not block!
		m_state = NULL;
		// . use a sitepathdepth of -1 by default then, until host #0
		//   has a chance to evaluate
		// . a sitepathdepth of -1 means to use the full hostname
		//   as the site
		m_sitePathDepth = -1;
		// sanity check, should not block since m_state is NULL
		if ( ! setSite () ) { char *xx=NULL;*xx=0; }
		// we did not block
		return true;
	}

	// . initial path depth
	// . this actually includes the first subdir name, up to, but not
	//   including the /, according to Url::getPathEnd()
	// . start with the broadest site as our possible subsite first
	//   in order to reduce errors i guess. because if we have examples:
	//   xyz.com/fred/
	//   xyz.com/jamie/
	//   xyz.com/bob/ ...
	//   and we also have:
	//   xyz.com/home/users/fred/
	//   xyz.com/home/users/jamie/
	//   xyz.com/home/users/bob/ ...
	//   then we need the first set to take precedence!
	m_pathDepth = 0;

	// set our full url class. do not addWWW
	//m_u.set ( m_url , gbstrlen(m_url) , false );

	// must have http:// i guess
	if ( strncmp(m_url,"http",4) ) { 
		// don't let bad input from pageparser core us!
		g_errno = EBADURL;
		return true;
	}

	// how many can we do? false = countFilename?
	//m_maxPathDepth = m_u.getPathDepth ( false );

	// . pathDepth==0 for "www.xyz.com"
	// . pathDepth==0 for "www.xyz.com/"
	// . pathDepth==0 for "www.xyz.com/foo"
	// . pathDepth==1 for "www.xyz.com/foo/"
	// . pathDepth==1 for "www.xyz.com/foo/x"
	// . pathDepth==2 for "www.xyz.com/foo/x/"
	// . pathDepth==2 for "www.xyz.com/foo/x/y"
	// . true --> we have the protocol, http:// in m_url
	m_maxPathDepth = getPathDepth ( m_url , true );

	// get it. return false if it blocked.
	return getSiteList();
}
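The comment block above the getPathDepth() call spells out the depth expected for several URL shapes. Below is a small, self-contained sketch of a hypothetical helper (not Gigablast's Url.cpp implementation) that reproduces exactly those values by counting '/' characters after the hostname.

#include <cassert>
#include <cstring>

// Hypothetical helper reproducing the depths listed in the comment above:
// depth = number of '/' in the path, minus one, never below zero.
static int pathDepth ( const char *url ) {
	const char *p = std::strstr ( url , "://" );	// skip an optional protocol
	p = p ? p + 3 : url;
	p = std::strchr ( p , '/' );			// start of the path, if any
	int slashes = 0;
	for ( ; p && *p ; p++ ) if ( *p == '/' ) slashes++;
	return slashes > 0 ? slashes - 1 : 0;
}

int main () {
	assert ( pathDepth("http://www.xyz.com")         == 0 );
	assert ( pathDepth("http://www.xyz.com/")        == 0 );
	assert ( pathDepth("http://www.xyz.com/foo")     == 0 );
	assert ( pathDepth("http://www.xyz.com/foo/")    == 1 );
	assert ( pathDepth("http://www.xyz.com/foo/x")   == 1 );
	assert ( pathDepth("http://www.xyz.com/foo/x/")  == 2 );
	assert ( pathDepth("http://www.xyz.com/foo/x/y") == 2 );
	return 0;
}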
// . returns false if blocked, true otherwise
// . returns true on error and sets g_errno
bool SiteGetter::getSiteList ( ) {

top:
	// . setSite() will return TRUE and set g_errno on error, and returns
	//   false if it blocked adding a tag, which will call callback once
	//   tag is added
	// . stop at this point
	if ( m_pathDepth >= 3 ) return setSite();
	// or if no more
	if ( m_pathDepth >= m_maxPathDepth ) return setSite();

	// . make the termid
	// . but here it is based on "m_pathDepth" which ranges
	//   from 1 to N
	// . if m_pathDepth==0 use "www.xyz.com" as site
	// . if m_pathDepth==1 use "www.xyz.com/foo/" as site ...
	char *pend = getPathEnd ( m_url , m_pathDepth );
	// hash up to that
	//char *host = m_u.getHost();
	char *host = getHostFast ( m_url , NULL );
	// hash the prefix first to match XmlDoc::hashNoSplit()
	char *prefix = "siteterm";
	// hash that and we will incorporate it to match XmlDoc::hashNoSplit()
	int64_t ph = hash64 ( prefix , gbstrlen(prefix) );
	// . this should match basically what is in XmlDoc.cpp::hash()
	// . and this now does not include pages that have no outlinks 
	//   "underneath" them.
	int64_t termId = hash64 ( host , pend - host , ph ) & TERMID_MASK;

	// get all pages that have this as their termid!
	key144_t start ;
	key144_t end   ;
	g_posdb.makeStartKey ( &start, termId );
	g_posdb.makeEndKey   ( &end  , termId );

	// . now see how many urls are at this path depth from this hostname
	// . if it is a huge # then we know they are all subsites!
	//   because it is too bushy to be anything else
	// . i'd say 100 nodes is good enough to qualify as a homestead site

	int32_t minRecSizes = 5000000;
	// get the group this list is in
	//uint32_t gid ;
	//gid = getGroupId ( RDB_POSDB , (char *)&start , false ); //split?
	//uint32_t shardNum ;
	//shardNum = getShardNum( RDB_POSDB , (char *)&start , false ); //split?

	// i guess this is split by termid and not docid????
	int32_t shardNum = g_hostdb.getShardNumByTermId ( &start );

	// we need a group #. the column #.
	//int32_t split = g_hostdb.getGroupNum ( gid );
	// shortcut
	Msg0 *m = &m_msg0;
	// get the list. returns false if blocked.
	if ( ! m->getList ( -1                 , // hostId
			    0                  , // ip
			    0                  , // port
			    0                  , // maxCacheAge
			    false              , // addToCache
			    RDB_POSDB        ,
			    m_collnum             ,
			    &m_list            ,
			    (char *)&start     ,
			    (char *)&end       ,
			    minRecSizes        ,
			    this               ,
			    gotSiteListWrapper ,
			    m_niceness         , // MAX_NICENESS
			    // default parms follow
			    true  ,  // doErrorCorrection?
			    true  ,  // includeTree?
			    true  ,  // doMerge?
			    -1    ,  // firstHostId
			    0     ,  // startFileNum
			    -1    ,  // numFiles
			    999999,  // timeout
			    -1    ,  // syncPoint
			    -1    ,  // preferLocalReads
			    NULL  ,  // msg5
			    NULL  ,  // msg5b
			    false ,  // isrealmerge?
			    true  ,  // allowpagecache?
			    false ,  // forceLocalIndexdb?
			    false ,  // doIndexdbSplit? nosplit
			    shardNum ) )//split ))
		return false;

	// return false if this blocked
	if ( ! gotSiteList() ) return false;
	// error?
	if ( g_errno ) return true;
	// or all done
	if ( m_allDone ) return true;
	// otherwise, try the next path component!
	goto top;
}
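getSiteList() forms the posdb termid in two stages: it hashes the literal prefix "siteterm", then hashes the host-plus-path span with that value as the seed, to match XmlDoc::hashNoSplit(). Below is a self-contained sketch of the same seeded composition using a generic FNV-1a hash; hash64(), TERMID_MASK and the real key layout are Gigablast internals and are only approximated here.

#include <cstdint>
#include <cstring>

// Generic seeded 64-bit FNV-1a; a stand-in for Gigablast's hash64().
static uint64_t fnv1a64 ( const void *data , size_t len , uint64_t seed ) {
	const unsigned char *p = (const unsigned char *)data;
	uint64_t h = seed ? seed : 14695981039346656037ULL;
	for ( size_t i = 0 ; i < len ; i++ ) { h ^= p[i]; h *= 1099511628211ULL; }
	return h;
}

// two-stage composition mirroring getSiteList(): hash the prefix first,
// then hash "host up to pend" seeded with the prefix hash
static uint64_t siteTermId ( const char *host , const char *pend ) {
	const char *prefix = "siteterm";
	uint64_t ph   = fnv1a64 ( prefix , std::strlen(prefix) , 0 );
	uint64_t full = fnv1a64 ( host , (size_t)(pend - host) , ph );
	// placeholder 48-bit mask; the real TERMID_MASK is defined elsewhere
	return full & 0x0000ffffffffffffULL;
}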