Пример #1
0
/// \ingroup waUtility 
/// \fn string extract_text( const string &text, const int option, const size_t len )
/// Convert full-width characters to half-width and extract the body text.
///
/// The source is first passed through sbc_to_dbc(); when EXTRACT_HTML is
/// selected the result is additionally passed through extract_html(). The
/// remaining bytes are then scanned: bytes for which isalpha() holds are
/// lower-cased with tolower(), and bytes that fall into a selected category
/// are dropped. A GBK double-byte character (lead byte 0x81-0xFE followed by
/// a trail byte 0x40-0x7E or 0x80-0xFE) is copied verbatim as one unit, so
/// its trail byte is never lower-cased, filtered, or cut off by \p len.
/// \param text Source string
/// \param option Filter selection, a bitwise combination of
/// - EXTRACT_ALPHA filter letters
/// - EXTRACT_DIGIT filter digits
/// - EXTRACT_PUNCT filter punctuation
/// - EXTRACT_SPACE filter whitespace
/// - EXTRACT_HTML filter HTML markup
/// - the documented default is EXTRACT_ALL, i.e. all of the above
///   (the default itself is declared outside this definition)
/// \param len Length limit in bytes; when greater than 0 scanning stops once
/// the result holds len bytes (or one byte fewer if the next unit is a
/// double-byte character that would not fit). Documented default is 0.
/// \return The converted and extracted string; empty if everything was
/// filtered out. If \p text is empty or \p option is not positive, \p text is
/// returned unchanged.
string extract_text( const string &text, const int option, const size_t len ) {
	if ( text.empty() || option<=0 )
		return text;
	
	string converted = sbc_to_dbc( text );
	
	// strip HTML first; if that is the only requested filter we are done
	if ( option&EXTRACT_HTML )
		converted = extract_html( converted );
	if ( option == EXTRACT_HTML )
		return converted;

	const size_t size = converted.length();
	string extracted;
	extracted.reserve( size );
	
	// size_t index: an unsigned int counter could wrap before reaching size
	for ( size_t i=0; i<size; ++i ) {
		unsigned char c = converted[i];
		
		// is GBK double-byte char: keep lead and trail byte together so the
		// trail byte (which may look like an ASCII letter or symbol) is not
		// mistaken for a single-byte character
		if ( c>=0x81 && c<=0xFE && i+1<size && !is_punctuation(c) ) {
			unsigned char trail = converted[i+1];
			if ( trail>=0x40 && trail<=0xFE && trail!=0x7F ) {
				// do not emit half a character when the limit is reached
				if ( len>0 && extracted.length()+2>len )
					break;
				extracted += c;
				extracted += trail;
				++i;
				if ( len>0 && extracted.length()>=len )
					break;
				continue;
			}
		}
		
		if ( isalpha(c) )
			c = tolower( c );
		
		// single byte that is always kept: an unpaired high byte, or a
		// non-letter in 0x40-0x7E that is_punctuation() does not reject
		// (retained from the original heuristic)
		if ( !is_punctuation(c) && !isalpha(c) && 
			 ((c>=0x81&&c<=0xFE) || (c>=0x40&&c<=0x7E)) )
			extracted += c;
		// is alpha
		else if ( option&EXTRACT_ALPHA && isalpha(c) )
			continue;
		// is digit
		else if ( option&EXTRACT_DIGIT && isdigit(c) )
			continue;
		// is punct
		else if ( option&EXTRACT_PUNCT && (ispunct(c)||is_punctuation(c)) )
			continue;
		// is space
		else if ( option&EXTRACT_SPACE && (isspace(c)||isblank(c)) )
			continue;
		// other 
		else
			extracted += c;
		
		// enough
		if ( len>0 && extracted.length()>=len )
			break;
	}
	
	return extracted;
}
Пример #2
0
/*
 * Process incoming stream.
 *
 * Accounts for the r bytes that were just appended to sock_obj->buffer and
 * then handles them in one of two ways:
 *  - While sock_obj->extracted is still unset, extract_html() is run on the
 *    buffered data. If it returns a position, the bytes from that position
 *    to the end of the buffer are moved to the buffer start and fed to
 *    HASH_UPDATE, and the buffer is marked empty. If it returns nothing,
 *    only the last 4 buffered bytes are retained for the next call.
 *  - Once sock_obj->extracted is set, every buffered byte is fed to
 *    HASH_UPDATE and the buffer is marked empty.
 * When req->verbose is set the processed data is also dumped.
 *
 * sock_obj: connection state (buffer, size, total_size, extracted).
 * r:        number of bytes newly placed in the buffer by the caller.
 * Returns 0 in all cases.
 */
int
http_process_stream(SOCK * sock_obj, int r)
{
	int body_len;

	sock_obj->size += r;
	sock_obj->total_size += r;

	/* Split point already known from an earlier call: hash all of it. */
	if (sock_obj->extracted) {
		if (!sock_obj->size)
			return 0;

		if (req->verbose)
			dump_buffer(sock_obj->buffer, r);
		HASH_UPDATE(sock_obj, sock_obj->buffer, sock_obj->size);
		sock_obj->size = 0;
		return 0;
	}

	if (req->verbose)
		printf(HTTP_HEADER_HEXA);

	sock_obj->extracted = extract_html(sock_obj->buffer, sock_obj->size);

	if (!sock_obj->extracted) {
		/* Still inside the header: dump only the bytes that just arrived. */
		if (req->verbose)
			http_dump_header(sock_obj->buffer + (sock_obj->size - r),
					 r);

		/*
		 * No 2*CR/LF found yet: shrink the buffer to its last 4 bytes
		 * (presumably so a terminator split across reads is still seen).
		 */
		if (sock_obj->size > 4) {
			memmove(sock_obj->buffer,
				sock_obj->buffer + sock_obj->size - 4, 4);
			sock_obj->size = 4;
		}
		return 0;
	}

	/* Dump the newly arrived part that lies before the split point. */
	if (req->verbose)
		http_dump_header(sock_obj->buffer + (sock_obj->size - r),
				 (sock_obj->extracted - sock_obj->buffer)
				 - (sock_obj->size - r));

	/* Bytes that follow the split point in this same buffer. */
	body_len = sock_obj->size - (sock_obj->extracted - sock_obj->buffer);
	if (body_len != 0) {
		if (req->verbose) {
			printf(HTML_HEADER_HEXA);
			dump_buffer(sock_obj->extracted, body_len);
		}
		/* Regions may overlap, hence memmove rather than memcpy. */
		memmove(sock_obj->buffer, sock_obj->extracted, body_len);
		HASH_UPDATE(sock_obj, sock_obj->buffer, body_len);
	}

	/* Everything buffered has been consumed. */
	sock_obj->size = 0;
	return 0;
}