コード例 #1
0
// . walks the NUL-terminated text "s" one alphabetic word at a time
// . returns true if the document is judged adult, false otherwise
// . "niceness" is only handed through to QUICKPOLL()
bool AdultBit::getBit ( char *s , int32_t niceness) {

	// running score: +2 for every dirty word, -1 for every other
	// word of 3+ letters, never allowed to go below zero
	int32_t score = 0;
	// index of the next char of s that has not been examined yet
	int32_t pos   = 0;

	for ( ; ; ) {
		// move up to the first letter of the next word
		while ( s[pos] && ! is_alpha_a(s[pos]) ) pos++;
		// reached the terminating NUL without tripping any rule
		if ( s[pos] == '\0' ) return false;

		// locate one past the last letter of this word
		int32_t wordEnd = pos + 1;
		while ( s[wordEnd] && is_alpha_a(s[wordEnd]) ) wordEnd++;

		// remember the word, then step the cursor past it so every
		// path below can simply "continue"
		char          *word    = s + pos;
		const int32_t  wordLen = wordEnd - pos;
		pos = wordEnd;

		// words of 1 or 2 letters are treated as filler and do not
		// affect the score
		if ( wordLen <= 2 ) continue;

		// a single obscene word is enough to flag the document
		if ( isObscene ( word , wordLen ) ) return true;

		// W = non-dirty word
		// D = dirty word
		// . = run of punctuation/digits and/or 1-2 letter words
		// examples of sequences that reach the threshold:
		// . D . D . D .     (score=6)
		// . D . W . D . D . (score=5)
		// . so roughly 3 dirty words out of 4 consecutive words
		//   marks the whole document as adult
		if ( isDirty ( word , wordLen ) ) {
			score += 2;
			if ( score >= 5 ) return true;
			continue;
		}

		// a non-dirty word cools the score off, with a floor of zero
		if ( score > 0 ) score--;

		QUICKPOLL((niceness));
	}
}
コード例 #2
0
// . returns true if the text in [s,send) contains at least one character
//   that is_alpha_a() (size-1 characters) or is_alpha_utf8() (all other
//   characters) accepts, false otherwise
// . "s" points at the first byte, "send" at one past the last byte
// . a trailing character whose reported size would extend past "send" is
//   treated as the end of the text instead of being examined
bool has_alpha_utf8 ( char *s , char *send ) {
	char cs = 0;
	for ( ; s < send ; s += cs ) {
		// how many bytes to step over for the character at s
		cs = getUtf8CharSize ( s );
		// size-1 characters go through the plain ascii test
		if ( cs == 1 ) {
			if (is_alpha_a(*s)) return true;
			continue;
		}
		// is_alpha_utf8() is only given s, not send, so it has no way
		// to know where the buffer stops; never hand it a character
		// that is cut short by the end of the range
		if ( cs > send - s ) break;
		if ( is_alpha_utf8(s) ) return true;
	}
	return false;
}