// Slightly modified from getTextEntity int16_t get_iana_charset(const char *cs, int len) { if (!s_isInitialized){ // set up the hash table if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) { log(LOG_WARN, "build: Could not init table of IANA Charsets."); return csUnknown; } // now add in all the charset entries int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset); // turn off quickpolling char saved = g_conf.m_useQuickpoll; g_conf.m_useQuickpoll = false; for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) ); // store the charset index in the hash table as score if ( ! s_table.addTerm(h, i+1) ) { log(LOG_WARN, "build: add term failed"); return csUnknown; } } g_conf.m_useQuickpoll = saved; s_isInitialized = true; } int64_t h = hash64Lower_a ( cs , len ); // get the entity index from table (stored in the score field) int32_t i = (int32_t) s_table.getScore(h); // return 0 if no match if ( i == 0 ) return csUnknown; // return the iso character return (int16_t)s_charsets[i-1].mib_enum; }
// Slightly modified from getTextEntity short get_iana_charset(char *cs, int len) { if (!s_isInitialized){ // set up the hash table if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") ) return log("build: Could not init table of " "IANA Charsets."); // now add in all the charset entries long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset); // turn off quickpolling char saved = g_conf.m_useQuickpoll; g_conf.m_useQuickpoll = false; for ( long i = 0 ; i < n ; i++ ) { long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) ); // store the charset index in the hash table as score if ( ! s_table.addTerm(&h, i+1) ) return log("build: add term failed"); } g_conf.m_useQuickpoll = saved; s_isInitialized = true; } long long h = hash64Lower_a ( cs , len ); // get the entity index from table (stored in the score field) long i = (long) s_table.getScore ( &h ); // return 0 if no match if ( i == 0 ) return csUnknown; // return the iso character return (short)s_charsets[i-1].mib_enum; }
bool AdultBit::isDirty ( char *s , int32_t len ) { static bool s_isInitialized = false; static char *s_dirty[] = { "anal", "analsex", "b*****b", "blowjobs", "boob", "boobs", "clitoris", "c**k", "cocks", "cum", "dick", "dicks", "g******g", "gangbangs", "gangbanging", "movie", "movies", "oral", "oralsex", "p**n", "porno", "pussy", "pussies", "sex", "sexy", "tit", "t**s", "video", "videos", "xxx", "xxxx", "xxxx" }; if ( ! s_isInitialized ) { // set up the hash table if ( ! s_dtable.set ( 8,4,sizeof(s_dirty )*2,NULL,0,false,0, "adulttab")) return log("build: Error initializing " "dirty word hash table." ); // now add in all the dirty words int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *); for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64b ( s_dirty [i] ); if ( ! s_dtable.addTerm (&h, i+1) ) return false; } s_isInitialized = true; } // compute the hash of the word "s" int64_t h = hash64Lower_a ( s , len ); // get from table return s_dtable.getScore ( &h ); }
// . is "s" an HTML entity? (ascii representative of an iso char) // . return the 32-bit unicode char it represents // . returns 0 if none // . JAB: const-ness for optimizer... static const Entity *getTextEntity ( const char *s , int32_t len ) { if ( !initEntityTable()) return 0; // take the ; off, if any if ( s[len-1] == ';' ) len--; // compute the hash of the entity including &, but not ; int64_t h = hash64 ( s , len ); // get the entity index from table (stored in the score field) int32_t i = (int32_t) s_table.getScore(h); // return 0 if no match if ( i == 0 ) return NULL; // point to the utf8 char. these is 1 or 2 bytes it seems return s_entities+i-1; }
// . is "s" an HTML entity? (ascii representative of an iso char) // . return the 32-bit unicode char it represents // . returns 0 if none // . JAB: const-ness for optimizer... uint32_t getTextEntity ( const char *s , int32_t len ) { if ( !initEntityTable()) return 0; // take the ; off, if any if ( s[len-1] == ';' ) len--; // compute the hash of the entity including &, but not ; int64_t h = hash64 ( s , len ); // get the entity index from table (stored in the score field) int32_t i = (int32_t) s_table.getScore ( &h ); // return 0 if no match if ( i == 0 ) return 0; // point to the utf8 char. these is 1 or 2 bytes it seems char *p = (char *)s_entities[i-1].utf8; // encode into unicode uint32_t c = utf8Decode ( p ); // return that return c; }
bool AdultBit::isObscene ( char *s , int32_t len ) { static bool s_isInitialized = false; static char *s_obscene[] = { "c**t", "clits", // "cum", magna cum laude "cums", "cumshot", "c**t", "cunts", "milf", "rimjob", "felch", "f**k", "f****d", "f****r", "f*****g", "f***s", "w***e", "w****s" }; if ( ! s_isInitialized ) { // set up the hash table if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0, "obscenetab") ) return log("build: Error initializing " "obscene word hash table." ); // now add in all the stop words int32_t n = sizeof(s_obscene) / sizeof(char *); for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64b ( s_obscene[i] ); if ( ! s_otable.addTerm ( &h, i+1 ) ) return false; } s_isInitialized = true; } // compute the hash of the word "s" int64_t h = hash64Lower_a ( s , len ); // get from table return s_otable.getScore ( &h ); }
// . is "s" an HTML entity? (ascii representative of an iso char) // . return the 32-bit unicode char it represents // . returns 0 if none // . JAB: const-ness for optimizer... uint32_t getTextEntity ( char *s , int32_t len ) { if ( !initEntityTable()) return 0; // take the ; off, if any if ( s[len-1] == ';' ) len--; // compute the hash of the entity including &, but not ; int64_t h = hash64 ( s , len ); // get the entity index from table (stored in the score field) int32_t i = (int32_t) s_table.getScore ( &h ); // return 0 if no match if ( i == 0 ) return 0; // point to the utf8 char. these is 1 or 2 bytes it seems char *p = (char *)s_entities[i-1].utf8; // encode into unicode uint32_t c = utf8Decode ( p ); // return that return c; // return the iso character //printf("Converted text entity \""); //for(int si=0;si<len;si++)putchar(s[si]); //printf("\" to 0x%x(%d)\"%c\"\n",s_entities[i-1].c,s_entities[i-1].c, // s_entities[i-1].c); //return (uint32_t)s_entities[i-1].c; }
// . cluster the docids based on the clusterRecs // . returns false and sets g_errno on error // . if maxDocIdsPerHostname is -1 do not do hostname clsutering bool setClusterLevels ( const key96_t *clusterRecs, const int64_t *docIds, int32_t numRecs , int32_t maxDocIdsPerHostname , bool doHostnameClustering , bool familyFilter , bool isDebug , // output to clusterLevels[] char *clusterLevels ) { if ( numRecs <= 0 ) return true; // skip if not clustering on anything //if ( ! doHostnameClustering && ! familyFilter ) { // memset ( clusterLevels, CR_OK, numRecs ); // return true; //} // how many negative site hashes do we have? // count how many docids we got, they are a cgi value, so represented // in ascii separated by +'s. i.e. "12345+435322+3439333333" //HashTableT <int64_t,char> sht; //if ( ! hashFromString ( &sht , noSiteIds ) ) return false; //bool checkNegative = ( sht.getNumSlotsUsed() > 0 ); HashTableX ctab; // init to 2*numRecs for speed. use 0 for niceness! if ( ! ctab.set ( 8 , 4 , numRecs * 2,NULL,0,false,"clustertab" ) ) return false; // time it u_int64_t startTime = gettimeofdayInMilliseconds(); // init loop counter vars int32_t count = 0; uint32_t score = 0; char *crec ; int64_t h ; char *level ; bool fakeIt ; for(int32_t i=0; i<numRecs; i++) { crec = (char *)&clusterRecs[i]; // . set this cluster level // . right now will be CR_ERROR_CLUSTERDB or CR_OK... level = &clusterLevels[i]; // sanity check if ( *level == CR_UNINIT ) gbshutdownLogicError(); // and the adult bit, for cleaning the results if ( familyFilter && g_clusterdb.hasAdultContent ( crec ) ) { *level = CR_DIRTY; continue; } // if error looking up in clusterdb, use a 8 bit domainhash from docid fakeIt = (*level==CR_ERROR_CLUSTERDB); // assume ok, show it, it is visible *level = CR_OK; // site hash comes next if(!doHostnameClustering) continue; // . get the site hash // . these are only 32 bits! if(fakeIt) h = Titledb::getDomHash8FromDocId(docIds[i]); else h = g_clusterdb.getSiteHash26 ( crec ); // inc this count! if ( fakeIt ) { g_stats.m_filterStats[CR_ERROR_CLUSTERDB]++; } // if it matches a siteid on our black list //if ( checkNegative && sht.getSlot((int64_t)h) > 0 ) { // *level = CR_BLACKLISTED_SITE; goto loop; } // look it up score = ctab.getScore(h) ; // if still visible, just continue if ( score < (uint32_t)maxDocIdsPerHostname ) { if ( ! ctab.addTerm(h)) return false; continue; } // otherwise, no lonegr visible *level = CR_CLUSTERED; } // debug for ( int32_t i = 0 ; i < numRecs && isDebug ; i++ ) { crec = (char *)&clusterRecs[i]; uint32_t siteHash26=g_clusterdb.getSiteHash26(crec); logf(LOG_DEBUG,"query: msg51: hit #%" PRId32") sitehash26=%" PRIu32" " "rec.n0=%" PRIx64" docid=%" PRId64" cl=%" PRId32" (%s)", (int32_t)count++, (int32_t)siteHash26, clusterRecs[i].n0, (int64_t)docIds[i], (int32_t)clusterLevels[i], g_crStrings[(int32_t)clusterLevels[i]] ); } //log(LOG_DEBUG,"build: numVisible=%" PRId32" numClustered=%" PRId32" numErrors=%" PRId32, // *numVisible,*numClustered,*numErrors); // show time uint64_t took = gettimeofdayInMilliseconds() - startTime; if ( took > 3 ) log(LOG_INFO,"build: Took %" PRId64" ms to do clustering.",took); // we are all done return true; }