/// \ingroup waUtility /// \fn string extract_text( const string &text, const int option, const size_t len ) /// 全角半角字符转换并提取正文 /// \param text 源字符串 /// \param option 过滤范围选项,可选值组合有 /// - EXTRACT_ALPHA 过滤字母 /// - EXTRACT_DIGIT 过滤数字 /// - EXTRACT_PUNCT 过滤标点 /// - EXTRACT_SPACE 过滤空白 /// - EXTRACT_HTML 过滤HTML代码 /// - 默认值为EXTRACT_ALL即以上全部 /// \param len 过滤长度,大于0时只截取前len个有效字符,默认为0 /// \return 转换提取结果字符串,若源字符串内容被全部过滤则返回空 string extract_text( const string &text, const int option, const size_t len ) { if ( text=="" || option<=0 ) return text; string converted = sbc_to_dbc( text ); // is HTML if ( option&EXTRACT_HTML ) converted = extract_html( converted ); if ( option == EXTRACT_HTML ) return converted; string extracted; extracted.reserve( text.length() ); for ( unsigned int i=0; i<converted.length(); ++i ) { unsigned char c = converted[i]; if ( isalpha(c) ) c = tolower( c ); // is GBK char if ( !is_punctuation(c) && !isalpha(c) && ((c>=0x81&&c<=0xFE) || (c>=0x40&&c<=0x7E) || (c>=0xA1&&c<=0xFE)) ) extracted += c; // is alpha else if ( option&EXTRACT_ALPHA && isalpha(c) ) continue; // is digit else if ( option&EXTRACT_DIGIT && isdigit(c) ) continue; // is punct else if ( option&EXTRACT_PUNCT && (ispunct(c)||is_punctuation(c)) ) continue; // is space else if ( option&EXTRACT_SPACE && (isspace(c)||isblank(c)) ) continue; // other else extracted += c; // enough if ( len>0 && extracted.length()>=len ) break; } return extracted; }
/*
 * Process incoming stream.
 *
 * Accounts for r new bytes in sock_obj (size and total_size) and then handles
 * the buffer in one of three ways:
 *   - sock_obj->extracted already set: the whole buffer is fed to HASH_UPDATE
 *     and the buffer is emptied;
 *   - extract_html() returns a non-null pointer: the bytes from that pointer to
 *     the end of the buffer are moved to the buffer start, fed to HASH_UPDATE,
 *     and the buffer is emptied;
 *   - extract_html() returns null: only the last 4 bytes of the buffer are
 *     kept for the next call.
 * When req->verbose is set, the handled data is also dumped.
 *
 * NOTE(review): r is presumably the byte count of the latest read appended to
 * sock_obj->buffer, and extract_html() presumably returns the start of the
 * body once the blank line ending the HTTP header is seen - confirm against
 * the caller and extract_html().
 *
 * Always returns 0.
 */
int http_process_stream(SOCK * sock_obj, int r)
{
    int body_len;

    sock_obj->size += r;
    sock_obj->total_size += r;

    /* Body start was located on an earlier call: hash what is buffered. */
    if (sock_obj->extracted) {
        if (sock_obj->size) {
            if (req->verbose)
                dump_buffer(sock_obj->buffer, r);
            HASH_UPDATE(sock_obj, sock_obj->buffer, sock_obj->size);
            sock_obj->size = 0;
        }
        return 0;
    }

    if (req->verbose)
        printf(HTTP_HEADER_HEXA);

    sock_obj->extracted = extract_html(sock_obj->buffer, sock_obj->size);

    if (!sock_obj->extracted) {
        if (req->verbose)
            http_dump_header(sock_obj->buffer + (sock_obj->size - r), r);
        /* minimize buffer: no 2*CR/LF found yet, so keep only the last 4 bytes */
        if (sock_obj->size > 4) {
            memmove(sock_obj->buffer, sock_obj->buffer + sock_obj->size - 4, 4);
            sock_obj->size = 4;
        }
        return 0;
    }

    /* Dump the part of the newly received data that precedes the extracted pointer. */
    if (req->verbose)
        http_dump_header(sock_obj->buffer + (sock_obj->size - r),
                         (sock_obj->extracted - sock_obj->buffer) - (sock_obj->size - r));

    /* Bytes in the buffer from the extracted pointer to the end. */
    body_len = sock_obj->size - (sock_obj->extracted - sock_obj->buffer);
    if (body_len) {
        if (req->verbose) {
            printf(HTML_HEADER_HEXA);
            dump_buffer(sock_obj->extracted, body_len);
        }
        memmove(sock_obj->buffer, sock_obj->extracted, body_len);
        HASH_UPDATE(sock_obj, sock_obj->buffer, body_len);
    }

    /* Everything buffered has been handled. */
    sock_obj->size = 0;
    return 0;
}