// Slightly modified from getTextEntity int16_t get_iana_charset(const char *cs, int len) { if (!s_isInitialized){ // set up the hash table if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) { log(LOG_WARN, "build: Could not init table of IANA Charsets."); return csUnknown; } // now add in all the charset entries int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset); // turn off quickpolling char saved = g_conf.m_useQuickpoll; g_conf.m_useQuickpoll = false; for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) ); // store the charset index in the hash table as score if ( ! s_table.addTerm(h, i+1) ) { log(LOG_WARN, "build: add term failed"); return csUnknown; } } g_conf.m_useQuickpoll = saved; s_isInitialized = true; } int64_t h = hash64Lower_a ( cs , len ); // get the entity index from table (stored in the score field) int32_t i = (int32_t) s_table.getScore(h); // return 0 if no match if ( i == 0 ) return csUnknown; // return the iso character return (int16_t)s_charsets[i-1].mib_enum; }
// Slightly modified from getTextEntity short get_iana_charset(char *cs, int len) { if (!s_isInitialized){ // set up the hash table if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") ) return log("build: Could not init table of " "IANA Charsets."); // now add in all the charset entries long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset); // turn off quickpolling char saved = g_conf.m_useQuickpoll; g_conf.m_useQuickpoll = false; for ( long i = 0 ; i < n ; i++ ) { long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) ); // store the charset index in the hash table as score if ( ! s_table.addTerm(&h, i+1) ) return log("build: add term failed"); } g_conf.m_useQuickpoll = saved; s_isInitialized = true; } long long h = hash64Lower_a ( cs , len ); // get the entity index from table (stored in the score field) long i = (long) s_table.getScore ( &h ); // return 0 if no match if ( i == 0 ) return csUnknown; // return the iso character return (short)s_charsets[i-1].mib_enum; }
// . returns non-zero (the stored score) if word "s" of length "len" matches
//   an entry in the static dirty-word list below; 0 otherwise
// . lazily builds the word hash table s_dtable on first call
bool AdultBit::isDirty ( char *s , int32_t len ) {
	static bool s_isInitialized = false;
	// NOTE(review): several literals appear pre-censored with '*' in this
	// copy ("b*****b", "c**k", ...) and "xxxx" is listed twice -- confirm
	// against the canonical word list
	static char *s_dirty[] = {
		"anal",
		"analsex",
		"b*****b",
		"blowjobs",
		"boob",
		"boobs",
		"clitoris",
		"c**k",
		"cocks",
		"cum",
		"dick",
		"dicks",
		"g******g",
		"gangbangs",
		"gangbanging",
		"movie",
		"movies",
		"oral",
		"oralsex",
		"p**n",
		"porno",
		"pussy",
		"pussies",
		"sex",
		"sexy",
		"tit",
		"t**s",
		"video",
		"videos",
		"xxx",
		"xxxx",
		"xxxx"
	};
	if ( ! s_isInitialized ) {
		// set up the hash table; sized at twice the array byte-size
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty )*2,NULL,0,false,0,
				      "adulttab"))
			return log("build: Error initializing "
				   "dirty word hash table." );
		// now add in all the dirty words
		int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// hash64b (case-sensitive) matches the lowercase
			// lookup below because every entry is lowercase
			int64_t h = hash64b ( s_dirty [i] );
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		}
		s_isInitialized = true;
	}
	// compute the case-insensitive hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );
	// get from table; 0 score means "not a dirty word"
	return s_dtable.getScore ( &h );
}
// . one-time build of the HTML-entity hash table s_table, mapping
//   hash64b(entity name) -> index+1, and of each entity's utf8 encoding
// . returns false on table-init or addTerm failure; true otherwise
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
				   "HTML entities.");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_entities) /
			(int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// deliberate crash (null write) if the entity has no
			// codepoint -- the table data would be corrupt
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to this entity's utf8 output buffer
			char *buf = (char *)s_entities[i].utf8;
			// if uchar32 not 0 then set the utf8 with it
			int32_t len = utf8Encode(up,buf);
			//
			// make my own mods to make parsing easier
			//
			if ( up == 160 ) { // nbsp -> plain space
				buf[0] = ' ';
				len = 1;
			}
			//
			// end custom mods
			//
			// set length of the encoded utf8 sequence
			s_entities[i].utf8Len = len;
			// deliberate crash if encoding produced nothing
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist! duplicate entity names are a
			// data error -- crash on purpose
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	}
	return true;
}
// . returns non-zero (the stored score) if word "s" of length "len" matches
//   an entry in the static obscene-word list below; 0 otherwise
// . lazily builds the word hash table s_otable on first call
bool AdultBit::isObscene ( char *s , int32_t len ) {
	static bool s_isInitialized = false;
	// NOTE(review): several literals appear pre-censored with '*' in this
	// copy ("c**t", "f**k", ...) -- confirm against the canonical list
	static char *s_obscene[] = {
		"c**t",
		"clits",
		// "cum",  magna cum laude -- too many false positives
		"cums",
		"cumshot",
		"c**t",
		"cunts",
		"milf",
		"rimjob",
		"felch",
		"f**k",
		"f****d",
		"f****r",
		"f*****g",
		"f***s",
		"w***e",
		"w****s"
	};
	if ( ! s_isInitialized ) {
		// set up the hash table; sized at twice the array byte-size
		if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
				      "obscenetab") )
			return log("build: Error initializing "
				   "obscene word hash table." );
		// now add in all the stop words
		int32_t n = sizeof(s_obscene) / sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// hash64b (case-sensitive) matches the lowercase
			// lookup below because every entry is lowercase
			int64_t h = hash64b ( s_obscene[i] );
			if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	}
	// compute the case-insensitive hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );
	// get from table; 0 score means "not an obscene word"
	return s_otable.getScore ( &h );
}
static bool initEntityTable(){ if ( ! s_isInitialized ) { // set up the hash table if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) { log("build: Could not init table of HTML entities."); return false; } // now add in all the html entities const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity); for ( int32_t i = 0 ; i < n ; i++ ) { int64_t h = hash64b ( s_entities[i].entity ); // convert the unicode codepoints to an utf8 string char *buf = (char *)s_entities[i].utf8; for(int j=0; j<s_entities[i].codepoints; j++) { UChar32 codepoint = s_entities[i].codepoint[j]; int32_t len = utf8Encode(codepoint,buf); if ( len == 0 ) { g_process.shutdownAbort(true); } // make modification to make parsing easier if ( codepoint == 160 ) { // nbsp buf[0] = ' '; len = 1; } buf += len; } s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8); // must not exist! if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);} // store the entity index in the hash table as score if ( ! s_table.addTerm(h, i+1) ) return false; } s_isInitialized = true; } return true; }
// . cluster the docids based on the clusterRecs
// . returns false and sets g_errno on error
// . if maxDocIdsPerHostname is -1 do not do hostname clsutering
//   (the unsigned compare below makes -1 behave as "unlimited")
bool setClusterLevels ( const key96_t *clusterRecs,
			const int64_t *docIds,
			int32_t numRecs ,
			int32_t maxDocIdsPerHostname ,
			bool doHostnameClustering ,
			bool familyFilter ,
			bool isDebug ,
			// output to clusterLevels[]
			char *clusterLevels ) {
	// nothing to do for an empty result set
	if ( numRecs <= 0 ) return true;
	// skip if not clustering on anything
	//if ( ! doHostnameClustering && ! familyFilter ) {
	//	memset ( clusterLevels, CR_OK, numRecs );
	//	return true;
	//}
	// how many negative site hashes do we have?
	// count how many docids we got, they are a cgi value, so represented
	// in ascii separated by +'s. i.e. "12345+435322+3439333333"
	//HashTableT <int64_t,char> sht;
	//if ( ! hashFromString ( &sht , noSiteIds ) ) return false;
	//bool checkNegative = ( sht.getNumSlotsUsed() > 0 );
	// counts docids seen per site hash
	HashTableX ctab;
	// init to 2*numRecs for speed. use 0 for niceness!
	if ( ! ctab.set ( 8 , 4 , numRecs * 2,NULL,0,false,"clustertab" ) )
		return false;
	// time it
	u_int64_t startTime = gettimeofdayInMilliseconds();
	// init loop counter vars
	int32_t count = 0;
	uint32_t score = 0;
	char *crec ;
	int64_t h ;
	char *level ;
	bool fakeIt ;
	for(int32_t i=0; i<numRecs; i++) {
		crec = (char *)&clusterRecs[i];
		// . set this cluster level
		// . right now will be CR_ERROR_CLUSTERDB or CR_OK...
		level = &clusterLevels[i];
		// sanity check: caller must have pre-set every level
		if ( *level == CR_UNINIT ) gbshutdownLogicError();
		// and the adult bit, for cleaning the results
		if ( familyFilter && g_clusterdb.hasAdultContent ( crec ) ) {
			*level = CR_DIRTY;
			continue;
		}
		// if error looking up in clusterdb, use a 8 bit domainhash
		// from docid
		fakeIt = (*level==CR_ERROR_CLUSTERDB);
		// assume ok, show it, it is visible
		*level = CR_OK;
		// site hash comes next
		if(!doHostnameClustering) continue;
		// . get the site hash
		// . these are only 32 bits!
		if(fakeIt)
			h = Titledb::getDomHash8FromDocId(docIds[i]);
		else
			h = g_clusterdb.getSiteHash26 ( crec );
		// inc this count!
		if ( fakeIt ) {
			g_stats.m_filterStats[CR_ERROR_CLUSTERDB]++;
		}
		// if it matches a siteid on our black list
		//if ( checkNegative && sht.getSlot((int64_t)h) > 0 ) {
		//	*level = CR_BLACKLISTED_SITE; goto loop; }
		// look up how many docids we have already shown for this site
		score = ctab.getScore(h) ;
		// if still visible, just continue
		// NOTE: when maxDocIdsPerHostname is -1 the cast makes the
		// limit UINT_MAX, so nothing ever gets clustered away
		if ( score < (uint32_t)maxDocIdsPerHostname ) {
			if ( ! ctab.addTerm(h)) return false;
			continue;
		}
		// otherwise, no lonegr visible
		*level = CR_CLUSTERED;
	}
	// debug
	for ( int32_t i = 0 ; i < numRecs && isDebug ; i++ ) {
		crec = (char *)&clusterRecs[i];
		uint32_t siteHash26=g_clusterdb.getSiteHash26(crec);
		logf(LOG_DEBUG,"query: msg51: hit #%" PRId32") "
		     "sitehash26=%" PRIu32" "
		     "rec.n0=%" PRIx64" docid=%" PRId64" cl=%" PRId32" (%s)",
		     (int32_t)count++,
		     (int32_t)siteHash26,
		     clusterRecs[i].n0,
		     (int64_t)docIds[i],
		     (int32_t)clusterLevels[i],
		     g_crStrings[(int32_t)clusterLevels[i]] );
	}
	//log(LOG_DEBUG,"build: numVisible=%" PRId32" numClustered=%" PRId32
	//    " numErrors=%" PRId32,
	//    *numVisible,*numClustered,*numErrors);
	// show time
	uint64_t took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 3 )
		log(LOG_INFO,"build: Took %" PRId64" ms to do clustering.",
		    took);
	// we are all done
	return true;
}
// . one-time build of the HTML-entity hash table s_table, mapping
//   hash64b(entity name) -> index+1, and of each entity's utf8 encoding
// . returns false on table-init or addTerm failure; true otherwise
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
				   "HTML entities.");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_entities) /
			(int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// deliberate crash (null write) if the entity has no
			// codepoint -- the table data would be corrupt
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to this entity's utf8 output buffer
			char *buf = (char *)s_entities[i].utf8;
			// if uchar32 not 0 then set the utf8 with it
			int32_t len = utf8Encode(up,buf);
			//
			// make my own mods to make parsing easier
			//
			if ( up == 160 ) { // nbsp -> plain space
				buf[0] = ' ';
				len = 1;
			}
			// make all quotes equal '\"' (34 decimal)
			// double and single curling quotes
			//http://www.dwheeler.com/essays/quotes-test-utf-8.html
			// “, 201d, 2018, 2019 (unicode values, not utf8)
			// &ldquo, &rdquo, &lsquo, &rsquo
			// (kept disabled -- historical quote/dash
			//  normalization experiment)
			/*
			if ( up == 171 ||
			     up == 187 ||
			     up == 8216 ||
			     up == 8217 ||
			     up == 8218 ||
			     up == 8220 ||
			     up == 8221 ||
			     up == 8222 ||
			     up == 8249 ||
			     up == 8250 ) {
				buf[0] = '\"';
				len = 1;
			}
			// and normalize all dashes (mdash,ndash)
			if ( up == 8211 || up == 8212 ) {
				buf[0] = '-';
				len = 1;
			}
			*/
			//
			// end custom mods
			//
			// set length of the encoded utf8 sequence
			s_entities[i].utf8Len = len;
			// deliberate crash if encoding produced nothing
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist! duplicate entity names are a
			// data error -- crash on purpose
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	}
	return true;
}