/// Copy constructor.
///
/// Initializes the SeekerParameterGroup base with rhs's command, deep-copies
/// rhs's internal parameter object into a freshly allocated one, copies the
/// site, and then calls addParameters() for the new object.
DxArchiverParameters::DxArchiverParameters(const DxArchiverParameters& rhs)
    : SeekerParameterGroup(rhs.getCommand()),
      internal_(new DxArchiverParametersInternal(*rhs.internal_))
{
    // The site is not passed to the base-class constructor, so copy it here.
    this->setSite(rhs.getSite());

    // Must run after internal_ has been set up in the initializer list.
    this->addParameters();
}
/// Copy constructor.
///
/// Initializes the SeekerParameterGroup base with rhs's command, database
/// table name, and activities-table id column name; deep-copies rhs's
/// internal parameter object; copies the site; and then calls
/// addParameters() for the new object.
DxParameters::DxParameters(const DxParameters& rhs)
    : SeekerParameterGroup(rhs.getCommand(),
                           rhs.getDbTableName(),
                           rhs.getIdColNameInActsTable()),
      internal_(new DxParametersInternal(*rhs.internal_))
{
    // The site is not passed to the base-class constructor, so copy it here.
    this->setSite(rhs.getSite());

    // Must run after internal_ has been set up in the initializer list.
    this->addParameters();
}
void Voronoi::ParabolaNode::_move(std::unique_ptr<ParabolaNode> parabola) { _leftSibling = parabola->_leftSibling; _rightSibling = parabola->_rightSibling; _leftChild = std::move(parabola->_leftChild); _rightChild = std::move(parabola->_rightChild); setSite(parabola->site()); setEdge(parabola->edge()); setEvent(parabola->event()); }
/// Copy assignment.
///
/// Copies the command and site from rhs, clears the current parameter list,
/// replaces the internal parameter object with a deep copy of rhs's, and
/// calls addParameters() again. Self-assignment is a no-op.
///
/// NOTE(review): unlike the copy constructor, this does not carry over the
/// database table name or the activities-table id column name — confirm
/// whether that is intentional.
///
/// @param rhs object to copy from.
/// @return reference to *this.
DxParameters& DxParameters::operator=(const DxParameters& rhs)
{
    if (this == &rhs)
    {
        return *this;
    }

    setCommand(rhs.getCommand());
    eraseParamList();
    setSite(rhs.getSite());

    // Allocate the replacement BEFORE releasing the current object. If the
    // allocation or the copy throws, internal_ still points at a live object
    // instead of dangling at already-deleted memory.
    DxParametersInternal* replacement = new DxParametersInternal(*rhs.internal_);
    delete internal_;
    internal_ = replacement;

    addParameters();

    return *this;
}
/// Inserts a parabola for `site` into the tree rooted at this node.
///
/// If this node is not valid it simply adopts `site` and is returned.
/// Otherwise the node returned by findParabola(site) is located, its event is
/// cleared, and it is split via _createChildren() so that a node for `site`
/// sits next to nodes carrying the located node's site.
///
/// @param site site to insert.
/// @return the node that represents `site`.
Voronoi::ParabolaNode * Voronoi::ParabolaNode::emplaceParabola(const Point & site)
{
    if (!isValid()) {
        setSite(site);
        return this;
    }

    ParabolaNode * target = findParabola(site);
    assert(target->site().y() >= site.y());

    // Disable the event that includes target's site in the middle of a triple.
    target->setEvent(nullptr);

    // Keep a copy: the node is restructured by _createChildren() below.
    const Point targetSite = target->site();
    const bool insertOnLeft = site.x() < targetSite.x();

    // Special case: a single parabola in the root whose site has the same y
    // as the new site. The node is split into exactly two children, ordered
    // by x.
    if (isLeaf(this) && targetSite.y() == site.y()) {
        if (insertOnLeft) {
            target->_createChildren(site, targetSite);
            return target->leftChild();
        }
        target->_createChildren(targetSite, site);
        return target->rightChild();
    }

    if (insertOnLeft) {
        // New branch: the left child is split again into
        // (targetSite, site), the right child keeps targetSite.
        target->_createChildren(Point(), targetSite);
        target->leftChild()->_createChildren(targetSite, site);

        // Hand the original node's edge to the rightmost parabola.
        target->_rightChild->setEdge(target->edge());
        target->setEdge(nullptr);

        return target->leftChild()->rightChild();
    }

    // New branch: the left child keeps targetSite, the right child is split
    // again into (site, targetSite).
    target->_createChildren(targetSite, Point());
    target->rightChild()->_createChildren(site, targetSite);

    // Hand the original node's edge to the rightmost parabola.
    target->rightChild()->_rightChild->setEdge(target->edge());
    target->setEdge(nullptr);

    return target->rightChild()->leftChild();
}
// . returns false if blocked, returns true and sets g_errno on error // . returns true with m_allDone set to false to process another subsite // . we use voters to set SEC_VOTE_STATIC and SEC_VOTE_DYNAMIC flags // in addition to SEC_VOTE_TEXTY and SEC_VOTE_UNIQUE bool SiteGetter::gotSiteList ( ) { // assume not trying again m_tryAgain = false; // error? if ( g_errno ) { // timeouts usually... log("site: sitegetter gotList: %s",mstrerror(g_errno)); // mark it so caller knows m_errno = g_errno; // so try again without increasing m_pathDepth // i've seen a host return EBADRDBID for some reason // and put host #0 in an infinite log spam loop so stop it if ( g_errno != EBADRDBID ) m_tryAgain = true; return true; } // how many urls at this path depth? int32_t count = ( m_list.getListSize() - 6 ) / 6; // if we do not have enough to quality this as a subsite path depth // try the next if ( count < 100 ) { // increment and try again m_pathDepth++; // clear just in case g_errno = 0; // get another list if we can, m_allDone is no true yet if ( m_pathDepth < m_maxPathDepth ) { m_tryAgain = true; return true; } } // ok, i guess this indicates we have a subsite level m_sitePathDepth = m_pathDepth; // this basically means none! if ( m_pathDepth >= m_maxPathDepth ) m_sitePathDepth = -1; // . sets m_site and m_siteLen from m_url // . this returns false if blocked, true otherwise return setSite ( ) ; }
// . also sets m_sitePathDepth to what it should be // . -1 indicates unknown (not enough data, etc.) or host/domain is the site // . returns false if blocked, true otherwise // . returns true and sets g_errno on error // . sets m_site to reference into "url" so XmlDoc::updateTagdb() can just // pass a bunch of site ptrs to msg9a // . "url" MUST BE NORMALIZED via Url.cpp. so using Links' buffer is ok! // . TODO: consider setting "state" to null if your url host has tons of inlinx bool SiteGetter::getSite ( char *url , TagRec *gr , int32_t timestamp, //char *coll , collnum_t collnum, int32_t niceness , //bool addTags , void *state , void (* callback)(void *state) ) { // save it m_gr = gr; m_url = url; //m_coll = coll; m_collnum = collnum; //m_addTags = addTags; m_state = state; m_callback = callback; m_timestamp= timestamp; m_niceness = niceness; m_errno = 0; // is it domain only? m_hasSubdomain = ::hasSubdomain ( url ); // reset m_siteLen = 0; m_site[0] = '\0'; m_allDone = false; m_addedTag.reset(); // set this to unknown for now m_sitePathDepth = -1; m_oldSitePathDepth = -1; // reset this just in case g_errno = 0; // // HARDCODED algos // // ~ /user/ /users/ /profile/ myspace facebook linkedin // if ( setRecognizedSite ( ) ) { m_allDone = true; return true; } // bail if nothing else we can do if ( ! gr ) return setSite ( ) ; CollectionRec *cr = g_collectiondb.getRec ( collnum ); // g_errno should be set if this is NULL if ( ! cr ) return true; //if ( ! cr->m_subsiteDetectionEnabled ) return true; // check the current tag for an age Tag *tag = gr->getTag("sitepathdepth"); // if there and the age is young, skip it int32_t age = -1; //int32_t now = getTimeGlobal(); //if ( tag ) age = now - tag->m_timestamp; // to parse conssitently for the qa test "qatest123" coll use // "timestamp" as the "current time" if ( tag ) age = timestamp - tag->m_timestamp; // if there, at least get it (might be -1) if ( tag ) m_oldSitePathDepth = atol ( tag->getTagData() ); // . 
if older than 10 days, we need to redo it // . if caller give us a timestamp of 0, never redo it! if ( age > 10*24*60*60 && timestamp != 0 ) age = -1; //if ( strstr(m_url,"http://www.topix.com/yp/albuquerque/c/community-religion-and-spirituality-churches") ) // log("hey"); // . if our site quality is low, forget about dividing it up too // . if age is valid, skip it // . also if caller does not want a callback, like XmlDoc.cpp, // then use whatever we got if ( age >= 0 || ! m_state ) { // || hostRootNumInlinks < 500 ) { // do not add to tagdb m_state = NULL; // just use what we had, it is not expired m_sitePathDepth = m_oldSitePathDepth; // . now set the site with m_sitePathDepth // . sanity check, should not block since m_state is NULL if ( ! setSite () ) { char *xx=NULL;*xx=0; } // we did not block return true; } // right now we only run on host #0 so we do not flood the cluster // with queries... if ( g_hostdb.m_hostId != 0 ) { // do not add to tagdb and do not block! m_state = NULL; // . use a sitepathdepth of -1 by default then, until host #0 // has a chance to evaluate // . a sitepathdepth of -1 means to use the full hostname // as the site m_sitePathDepth = -1; // sanity check, should not block since m_state is NULL if ( ! setSite () ) { char *xx=NULL;*xx=0; } // we did not block return true; } // . initial path depth // . this actually includes the first subdir name, up to, but not // including the /, according to Url::getPathEnd() // . start with the broadest site as our possible subsite first // in order to reduce errors i guess. because if we have examples: // xyz.com/fred/ // xyz.com/jamie/ // xyz.com/bob/ ... // and we also have: // xyz.com/home/users/fred/ // xyz.com/home/users/jamie/ // xyz.com/home/users/bob/ ... // then we need the first set to take precedence! m_pathDepth = 0; // set our fill url class. 
do not addWWW //m_u.set ( m_url , gbstrlen(m_url) , false ); // must have http:// i guess if ( strncmp(m_url,"http",4) ) { g_errno = EBADURL; return true; // don't let bad input from pageparser core us! char *xx=NULL;*xx=0; } // how many can we do? false = countFilename? //m_maxPathDepth = m_u.getPathDepth ( false ); // . pathDepth==0 for "www.xyz.com" // . pathDepth==0 for "www.xyz.com/" // . pathDepth==0 for "www.xyz.com/foo" // . pathDepth==1 for "www.xyz.com/foo/" // . pathDepth==1 for "www.xyz.com/foo/x" // . pathDepth==2 for "www.xyz.com/foo/x/" // . pathDepth==2 for "www.xyz.com/foo/x/y" // . true --> we have the protocol, http:// in m_url m_maxPathDepth = getPathDepth ( m_url , true ); // get it. return false if it blocked. return getSiteList(); }
// . returns false if blocked, true otherwise // . returns true on error and sets g_errno bool SiteGetter::getSiteList ( ) { top: // . setSite() will return TRUE and set g_errno on error, and returns // false if it blocked adding a tag, which will call callback once // tag is added // . stop at this point if ( m_pathDepth >= 3 ) return setSite(); // or if no more if ( m_pathDepth >= m_maxPathDepth ) return setSite(); // . make the termid // . but here we get are based on "m_pathDepth" which ranges // from 1 to N // . if m_pathDepth==0 use "www.xyz.com" as site // . if m_pathDepth==1 use "www.xyz.com/foo/" as site ... char *pend = getPathEnd ( m_url , m_pathDepth ); // hash up to that //char *host = m_u.getHost(); char *host = getHostFast ( m_url , NULL ); // hash the prefix first to match XmlDoc::hashNoSplit() char *prefix = "siteterm"; // hash that and we will incorporate it to match XmlDoc::hashNoSplit() int64_t ph = hash64 ( prefix , gbstrlen(prefix) ); // . this should match basically what is in XmlDoc.cpp::hash() // . and this now does not include pages that have no outlinks // "underneath" them. int64_t termId = hash64 ( host , pend - host , ph ) & TERMID_MASK; // get all pages that have this as their termid! key144_t start ; key144_t end ; g_posdb.makeStartKey ( &start, termId ); g_posdb.makeEndKey ( &end , termId ); // . now see how many urls art at this path depth from this hostname // . if it is a huge # then we know they are all subsites! // because it is too bushy to be anything else // . i'd say 100 nodes is good enough to qualify as a homestead site int32_t minRecSizes = 5000000; // get the group this list is in //uint32_t gid ; //gid = getGroupId ( RDB_POSDB , (char *)&start , false ); //split? //uint32_t shardNum ; //shardNum = getShardNum( RDB_POSDB , (char *)&start , false ); //split? // i guess this is split by termid and not docid???? int32_t shardNum = g_hostdb.getShardNumByTermId ( &start ); // we need a group #. the column #. 
//int32_t split = g_hostdb.getGroupNum ( gid ); // int16_tcut Msg0 *m = &m_msg0; // get the list. returns false if blocked. if ( ! m->getList ( -1 , // hostId 0 , // ip 0 , // port 0 , // maxCacheAge false , // addToCache RDB_POSDB , m_collnum , &m_list , (char *)&start , (char *)&end , minRecSizes , this , gotSiteListWrapper , m_niceness , // MAX_NICENESS // default parms follow true , // doErrorCorrection? true , // includeTree? true , // doMerge? -1 , // firstHostId 0 , // startFileNum -1 , // numFiles 999999, // timeout -1 , // syncPoint -1 , // preferLocalReads NULL , // msg5 NULL , // msg5b false , // isrealmerge? true , // allowpagecache? false , // forceLocalIndexdb? false , // doIndexdbSplit? nosplit shardNum ) )//split )) return false; // return false if this blocked if ( ! gotSiteList() ) return false; // error? if ( g_errno ) return true; // or all done if ( m_allDone ) return true; // otherwise, try the next path component! goto top; }