// . returns true if document is adult, false otherwise bool AdultBit::getBit ( char *s , int32_t niceness) { // rudimentary adult detection algorithm int32_t i = 0; int32_t dirties = 0; int32_t j; int32_t slen; loop: // skip until we hit an alpha while ( s[i] && ! is_alpha_a(s[i]) ) i++; // return if done if ( ! s[i] ) return false; // . point to char after this alpha // . return if none j = i + 1; // find end of the alpha char sequence while ( s[j] && is_alpha_a(s[j]) ) j++; // skip over 1 or 2 letter words slen = j - i; if ( slen <= 2 ) { i = j; goto loop; } // it's adult content if it has just 1 obscene word if ( isObscene ( (char *) s+i , slen ) ) return true; // W = non-dirty word // D = dirty word // . = sequence of punctuation/num and/or 1 to 2 letter words // dirty sequences: // . D . D . D . (dirties=6) // . D . W . D . D . (dirties=5) // . basically, if 3 out of 4 words in a subsequence are // "dirty" then the whole document is "adult" content if ( isDirty ( (char *) s+i , slen ) ) { dirties += 2; if ( dirties >= 5 ) return true; i = j; goto loop; } dirties--; if ( dirties < 0 ) dirties = 0; QUICKPOLL((niceness)); i = j; goto loop; }
bool has_alpha_utf8 ( char *s , char *send ) { char cs = 0; for ( ; s < send ; s += cs ) { cs = getUtf8CharSize ( s ); if ( cs == 1 ) { if (is_alpha_a(*s)) return true; continue; } if ( is_alpha_utf8(s) ) return true; } return false; }