// 打开在这里 void operator()( const boost::system::error_code &ec ) { if( ec ) { // 报告出错。 m_sender( boost::str( boost::format("@%s, 获取url有错 %s") % m_speaker % ec.message() ) ); return; } // 根据 content_type 了, 如果不是 text/html 的就不要继续下去了. avhttp::response_opts opt = m_httpstream->response_options(); if( ! is_html( opt.find( avhttp::http_options::content_type ) ) ) { // 报告类型就可以 m_sender( boost::str( boost::format("%s 发的 ⇪ 类型是 %s ") % m_speaker % opt.find( avhttp::http_options::content_type ) ) ); return; } m_content = std::make_shared<boost::array<char, 512>>(); m_html_page = std::make_shared<html::dom>(); m_httpstream->async_read_some(boost::asio::buffer(*m_content, 512), *this); }
/*
 * Count the characters of the text found between the opening tag and the
 * closing tag of an element, e.g. the "abc" in "<p> abc </p>".
 *
 * Spaces directly after the opening tag and directly before the closing tag
 * are not counted. Inside the text, a '&' is delegated to is_html() (which
 * returns the amount to add and may advance the index), and a ' ' counts as
 * one character after which toNextChar() may advance the index
 * (presumably over the rest of a run of spaces - confirm against the helper).
 *
 * html: NUL-terminated string expected to start with the opening tag.
 *
 * Returns the computed count, or 0 when html is NULL, empty, or does not
 * contain both a '>' ending the opening tag and a later '<' starting the
 * closing tag.
 *
 * The printf calls are debugging traces and are kept unchanged.
 */
int html_count(char* html){
    int min, index, max, compte;

    /* Nothing to count; also avoids handing NULL to printf("%s"). */
    if(html == NULL){
        return 0;
    }
    printf("html_count recupere: %s\n", html);

    compte = min = index = 0;
    if(html[0] == '\0'){
        return 0;
    }

    /* Find the '>' closing the opening tag, never reading past the NUL. */
    do{
        min++;
    }while(html[min] != '\0' && html[min] != '>');
    if(html[min] == '\0'){
        return 0;
    }
    min++;      /* first character after the opening tag */
    printf("min: %d\n", min);

    /* Find the last '<', which must lie at or after the content start. */
    max = length(html);
    do{
        max--;
    }while(max >= min && html[max] != '<');
    if(max < min){
        return 0;
    }
    printf("max: %d\n", max);

    /*
     * Skip leading spaces. A plain while loop is required here: min already
     * designates the first content character, so stepping before testing
     * would drop that character whenever the text does not start with a
     * space. The scan stops at html[max] ('<') at the latest.
     */
    int minStripSpace = min;
    while(html[minStripSpace] == ' '){
        minStripSpace++;
    }
    printf("minStripSpace: %d => %c\n", minStripSpace, html[minStripSpace]);

    /*
     * Skip trailing spaces. max designates the '<' itself, so stepping
     * before testing is correct on this side. The scan stops at the '>' at
     * index min-1 at the latest.
     */
    int maxStripSpace = max;
    do{
        maxStripSpace--;
    }while(html[maxStripSpace] == ' ');
    printf("maxStripSpace: %d => %c\n", maxStripSpace, html[maxStripSpace]);

    for(index = minStripSpace; index < maxStripSpace+1; index++){
        printf("%c, index=%d\n",html[index], index);
        if(html[index] == '&'){
            compte += is_html(html, &index, maxStripSpace);
        }else if(html[index] == ' '){
            printf("espace\n");
            compte++;
            toNextChar(html, &index);
        }
        else{
            compte++;
        }
    }
    return compte;
}
PRIVATE int HTGuess_flush (HTStream * me) { if (!me->transparent) { HTResponse * response = me->response; /* ** First we look for magic tokens and evaluate the contents of the buffer ** that we are investigating. */ if (me->cnt) { HTTRACE(STREAM_TRACE, "GUESSING.... Result of content analysis: Text=%d%% Newlines=%d%% Ctrl=%d%% High=%d%%\n" _ (int)(100*me->text_cnt/me->cnt + 0.5) _ (int)(100*me->lf_cnt /me->cnt + 0.5) _ (int)(100*me->ctrl_cnt/me->cnt + 0.5) _ (int)(100*me->high_cnt/me->cnt + 0.5)); } if (!me->ctrl_cnt || me->text_cnt + me->lf_cnt >= 16 * (me->ctrl_cnt + me->high_cnt)) { char *ptr; /* some kind of text */ *me->write_ptr = 0; /* terminate buffer */ if (me->high_cnt > 0) HTResponse_setContentTransferEncoding(response, WWW_CODING_8BIT); else HTResponse_setContentTransferEncoding(response, WWW_CODING_7BIT); if (is_html(me->buffer)) HTResponse_setFormat(response, HTAtom_for("text/html")); else if (!strncmp(me->buffer, "%!", 2)) HTResponse_setFormat(response, HTAtom_for("application/postscript")); else if (strstr(me->buffer, "#define") && strstr(me->buffer, "_width") && strstr(me->buffer, "_bits")) HTResponse_setFormat(response, HTAtom_for("image/x-xbitmap")); else if ((ptr = strstr(me->buffer, "converted with BinHex"))!=NULL) HTResponse_setContentTransferEncoding(response, WWW_CODING_MACBINHEX); else if (!strncmp(me->buffer, "begin ", 6)) HTResponse_setContentTransferEncoding(response, WWW_CODING_BASE64); else HTResponse_setFormat(response, WWW_PLAINTEXT); } else { if (!strncmp(me->buffer, "GIF", 3)) HTResponse_setFormat(response, WWW_GIF); else if (!strncmp(me->buffer, "\377\330\377\340", 4)) HTResponse_setFormat(response, WWW_JPEG); else if (!strcmp(me->buffer, "MM")) /* MM followed by a zero */ HTResponse_setFormat(response, WWW_TIFF); else if (!strncmp(me->buffer, "\211PNG\r\n\032\n", 8)) HTResponse_setFormat(response, WWW_PNG); else if (!strncmp(me->buffer, ".snd", 4)) HTResponse_setFormat(response, WWW_AUDIO); else if (!strncmp(me->buffer, 
"\037\235", 2)) HTResponse_addEncoding(response, WWW_CODING_COMPRESS); else if (!strncmp(me->buffer, "\037\213", 2)) HTResponse_addEncoding(response, WWW_CODING_GZIP); else HTResponse_setFormat(response, WWW_BINARY); } /* ** If we couldn't find any magic tokens then we try and look at the suffix ** of the URL file name and use our own bindings to see if that gives any ** results. */ if (HTResponse_format(response) == WWW_UNKNOWN) { HTParentAnchor * anchor = HTRequest_anchor(me->request); char * addr = HTAnchor_physical(anchor); HTTRACE(STREAM_TRACE, "GUESSING.... Hmm - trying local bindings\n"); HTBind_getResponseBindings (response, addr); } /* ** If nothing worked then give up and say binary... */ if (HTResponse_format(response) == WWW_UNKNOWN) { HTTRACE(STREAM_TRACE, "GUESSING.... That's it - I'm giving up!\n"); HTResponse_setFormat(response, WWW_BINARY); } HTTRACE(STREAM_TRACE, "Guessed..... Content-Type `%s\'\n" _ HTAtom_name(HTResponse_format(response))); /* ** Set up the new stream stack with the type we figured out */ if ((me->target = HTStreamStack(HTResponse_format(response), me->output_format, me->output_stream, me->request, NO)) == NULL) { HTTRACE(STREAM_TRACE, "HTGuess..... Can't convert media type\n"); me->target = HTErrorStream(); } me->transparent = YES; return PUT_BLOCK(me->buffer, me->cnt); } return HT_OK; }
int main(void) { signal(SIGINT, sighandler); LIBXML_TEST_VERSION; curl_global_init(CURL_GLOBAL_DEFAULT); CURLM *multi_handle = curl_multi_init(); curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con); curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L); /* enables http/2 if available */ #ifdef CURLPIPE_MULTIPLEX curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX); #endif /* sets html start page */ curl_multi_add_handle(multi_handle, make_handle(start_page)); int msgs_left; int pending = 0; int complete = 0; int still_running = 1; while(still_running && !pending_interrupt) { int numfds; curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); curl_multi_perform(multi_handle, &still_running); /* See how the transfers went */ CURLMsg *m = NULL; while((m = curl_multi_info_read(multi_handle, &msgs_left))) { if(m->msg == CURLMSG_DONE) { CURL *handle = m->easy_handle; char *url; memory *mem; curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem); curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url); if(m->data.result == CURLE_OK) { long res_status; curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status); if(res_status == 200) { char *ctype; curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype); printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url); if(is_html(ctype) && mem->size > 100) { if(pending < max_requests && (complete + pending) < max_total) { pending += follow_links(multi_handle, mem, url); still_running = 1; } } } else { printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url); } } else { printf("[%d] Connection failure: %s\n", complete, url); } curl_multi_remove_handle(multi_handle, handle); curl_easy_cleanup(handle); free(mem->buf); free(mem); complete++; pending--; } } } curl_multi_cleanup(multi_handle); curl_global_cleanup(); return 0; }
void event_cb(struct bufferevent *bev, short events, void *ptr) { conn_t *cn = (conn_t *) ptr; url_t *u = cn->url; bool finished = false; if (events & BEV_EVENT_CONNECTED) { /* log_info("connected:%s", u->full_url); */ /*start writing*/ char buffer[BUFSIZE] = {0}; create_http_req(u, buffer); //struct evbuffer *output = bufferevent_get_output(bev); bufferevent_write(bev, buffer, strlen(buffer)); } if (events & BEV_EVENT_ERROR) { finished = true; log_error("bufferevent error:%s, %s", u->full_url, evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); statis.error_urls++; } if (events & BEV_EVENT_EOF) { finished = true; char buf[BUFSIZE]; int n; struct evbuffer *input = bufferevent_get_input(bev); while ((n = evbuffer_remove(input, buf, sizeof(buf))) > 0) { conn_append(cn, buf, n); } /* parse the buffer we read from the bufferevent and init the connection's resp field */ /* log_debug("http_resp_init: %s, %s", cn->url->full_url, cn->buf);*/ if (http_resp_init(cn->resp, cn->buf, cn->dsize) < 0) { log_error("http response parse error: %s", cn->url->full_url); } else { // /* is robots.txt */ // if (!cn->site->robots_gotten) // { // parse_robots(cn); // cn->site->robots_gotten = true; // } // else // { switch(cn->resp->status_code / 100) { case 3: { char *location = http_hdr_list_get_value(cn->resp->headers, "Location"); if (location) { put_url_str(location, 0); } } break; case 2: /* is a html? parse it and store all urls to url fifo*/ if (is_html(cn)) { fetch_all_urls(cn); } /* save the content to file*/ save(cn); /*update the statistics*/ statis.ok_urls++; log_ok_url("%s", cn->url->full_url); break; case 4: log_error_url("%s", cn->url->full_url); statis.error_urls++; break; default: break; } //} } } if (finished) { bufferevent_free(bev); stop_conn(cn); statis.conns--; statis.ram_urls--; /* fetch_urls(); */ /* fetch_dns(); */ /* fetch_pages(); */ } }
/* ** Output Wiki text while inserting the proper HTML control codes. ** The following formatting conventions are implemented: ** ** * Characters with special meaning to HTML are escaped. ** ** * Blank lines results in a paragraph break. ** ** * Paragraphs where the first line is indented by two or more ** spaces are shown verbatim. None of the following rules apply ** to verbatim text. ** ** * Lines beginning with "*: " begin a bullet in a bullet list. ** ** * Lines beginning with "1: " begin an item in an enumerated list. ** ** * Paragraphs beginning with "_: " are indented. ** ** * Multiple colons can be used in *:, 1:, and _: for multiple ** levels of indentation. ** ** * Text within _..._ is italic and text in *...* is bold. ** Text with in **...** or ***...*** bold with a larger font. ** ** * Wiki pages names (Words in initial caps) are enclosed in an ** appropriate hyperlink. ** ** * Words that begin with "http:", "https:", "ftp:", or "mailto:" ** are enclosed in an appropriate hyperlink. ** ** * Text of the form "#NNN" where NNN is a valid ticket number ** is converted into a hyperlink to the corresponding ticket. ** ** * Text of the form "[NNN]" where NNN is a valid check-in number ** becomes a hyperlink to the checkin. ** ** * {quote: XYZ} renders XYZ with all special meanings for XYZ escaped. ** ** * {link: URL TEXT} renders TEXT with a link to URL. URL can be ** relative. ** ** * {linebreak} renders a linebreak. ** ** * {image: URL ALT} renders an in-line image from URL. URL can be ** relative or it can be the name of an attachment to zPageId. ** {leftimage: URL ALT} and {rightimage: URL ALT} create wrap-around ** images at the left or right margin. ** ** * {clear} skips down the page far enough to clear any wrap-around ** images. ** ** * Text between <html>...</html> is interpreted as HTML. A restricted ** subset of tags are supported - things like forms and javascript are ** intentionally excluded. 
The initial <html> must occur at the ** beginning of a paragraph. */ void output_wiki( const char *zText, /* The text to be formatted */ const char *zLinkSuffix, /* Suffix added to hyperlinks to Wiki */ const char *zPageId /* Name of current page */ ){ int i, j, k; int aList[20]; /* See adjust_list_nesting for details */ int inPRE = 0; int inB = 0; int inI = 0; int v; int wordStart = 1; /* At the start of a word */ int lineStart = 1; /* At the start of a line */ int paraStart = 1; /* At the start of a paragraph */ const char *zEndB; /* Text used to end a run of bold */ char **azAttach; /* Attachments to zPageId */ static int once = 1; static int nTicket, nCommit; if( once ){ nTicket = atoi(db_short_query("SELECT max(tn) FROM ticket")); nCommit = atoi(db_short_query("SELECT max(cn) FROM chng")); once = 0; } i = 0; aList[0] = 0; azAttach = 0; zEndB = ""; while( zText[i] ){ char *z; int n; Markup sMarkup; int c = zText[i]; /* Text between <html>...</html> is interpreted as HTML. */ if( c=='<' && (n = is_html(&zText[i]))>0 ){ put_htmlized_text(&zText, i); zText += 6; output_restricted_html(zText, n-13); zText += n - 6; i = 0; continue; } /* Markup may consist of special strings contained in curly braces. ** Examples: "{linebreak}" or "{quote: *:}" */ if( c=='{' && is_markup(&zText[i], &sMarkup) ){ /* ** Markup of the form "{linebreak}" forces a line break. */ if( sMarkup.lenType==9 && strncmp(sMarkup.zType,"linebreak",9)==0 ){ put_htmlized_text(&zText, i); zText += sMarkup.lenTotal; i = 0; cgi_printf("<br>\n"); wordStart = lineStart = paraStart = 0; continue; } /* ** Markup of the form "{clear}" moves down past any left or right ** aligned images. */ if( sMarkup.lenType==5 && strncmp(sMarkup.zType,"clear",5)==0 ){ put_htmlized_text(&zText, i); zText += sMarkup.lenTotal; i = 0; cgi_printf("<br clear=\"both\">\n"); wordStart = lineStart = paraStart = 0; continue; } /* ** Markup of the form "{quote: ABC}" writes out the text ABC exactly ** as it appears. 
This can be used to escape special meanings ** associated with ABC. */ if( sMarkup.lenType==5 && strncmp(sMarkup.zType,"quote",5)==0 ){ int n; put_htmlized_text(&zText, i); if( sMarkup.zKey==sMarkup.zArgs ){ n = sMarkup.lenKey; }else{ n = &sMarkup.zArgs[sMarkup.lenArgs] - sMarkup.zKey; } put_htmlized_text(&sMarkup.zKey, n); zText += sMarkup.lenTotal; i = 0; wordStart = lineStart = paraStart = 0; continue; } /* ** Markup of the form "{link: TO TEXT}" creates a hyperlink to TO. ** The hyperlink appears on the screen as TEXT. TO can be a any URL, ** including a relative URL such as "chngview?cn=123". */ if( sMarkup.lenType==4 && strncmp(sMarkup.zType,"link",4)==0 ){ put_htmlized_text(&zText, i); cgi_printf("<a href=\"%.*s\">", sMarkup.lenKey, sMarkup.zKey); put_htmlized_text(&sMarkup.zArgs, sMarkup.lenArgs); cgi_printf("</a>"); zText += sMarkup.lenTotal; i = 0; wordStart = lineStart = paraStart = 0; continue; } /* ** Markup of the form "{image: URL ALT}" creates an in-line image to ** URL with ALT as the alternate text. URL can be relative (for example ** the URL of an attachment. ** ** If the URL is the name of an attachment, then automatically ** convert it to the correct URL for that attachment. 
*/ if( (sMarkup.lenType==5 && strncmp(sMarkup.zType,"image",5)==0) || (sMarkup.lenType==9 && strncmp(sMarkup.zType,"leftimage",9)==0) || (sMarkup.lenType==10 && strncmp(sMarkup.zType,"rightimage",10)==0) ){ char *zUrl = 0; const char *zAlign; char *zAlt = htmlize(sMarkup.zArgs, sMarkup.lenArgs); if( azAttach==0 && zPageId!=0 ){ azAttach = (char **) db_query("SELECT fname, atn FROM attachment " "WHERE tn='%q'", zPageId); } if( azAttach ){ int ix; for(ix=0; azAttach[ix]; ix+=2){ if( strncmp(azAttach[ix],sMarkup.zKey,sMarkup.lenKey)==0 ){ free(zUrl); zUrl = mprintf("attach_get/%s/%h", azAttach[ix+1], azAttach[ix]); break; } } } if( zUrl==0 ){ zUrl = htmlize(sMarkup.zKey, sMarkup.lenKey); } put_htmlized_text(&zText, i); switch( sMarkup.zType[0] ){ case 'l': case 'L': zAlign = " align=\"left\""; break; case 'r': case 'R': zAlign = " align=\"right\""; break; default: zAlign = ""; break; } cgi_printf("<img src=\"%s\" alt=\"%s\"%s>", zUrl, zAlt, zAlign); free(zUrl); free(zAlt); zText += sMarkup.lenTotal; i = 0; wordStart = lineStart = paraStart = 0; continue; } } if( paraStart ){ put_htmlized_text(&zText, i); /* Blank lines at the beginning of a paragraph are ignored. */ if( isspace(c) && (j = is_blank_line(&zText[i]))>0 ){ zText += j; continue; } /* If the first line of a paragraph begins with a tab or with two ** or more spaces, then that paragraph is printed verbatim. 
*/ if( c=='\t' || (c==' ' && (zText[i+1]==' ' || zText[i+1]=='\t')) ){ if( !inPRE ){ if( inB ){ cgi_printf(zEndB); inB=0; } if( inI ){ cgi_printf("</i>"); inI=0; } adjust_list_nesting(aList, 0); cgi_printf("<pre>\n"); inPRE = 1; } } } /* end if( paraStart ) */ if( lineStart ){ /* Blank lines in the middle of text cause a paragraph break */ if( isspace(c) && (j = is_blank_line(&zText[i]))>0 ){ put_htmlized_text(&zText, i); zText += j; if( inB ){ cgi_printf(zEndB); inB=0; } if( inI ){ cgi_printf("</i>"); inI=0; } if( inPRE ){ cgi_printf("</pre>\n"); inPRE = 0; } is_list_elem(zText, &k); if( abs(k)<aList[0] ) adjust_list_nesting(aList, k); if( zText[0]!=0 ){ cgi_printf("\n<p>"); } wordStart = lineStart = paraStart = 1; i = 0; continue; } } /* end if( lineStart ) */ if( lineStart && !inPRE ){ /* If we are not in verbatim text and a line begins with "*:", then ** generate a bullet. Or if the line begins with "NNN:" where NNN ** is a number, generate an enumeration item. */ if( (j = is_list_elem(&zText[i], &k))>0 ){ put_htmlized_text(&zText, i); adjust_list_nesting(aList, k); if( zText[0]!='_' ) cgi_printf("<li>"); zText += j; i = 0; wordStart = 1; lineStart = paraStart = 0; continue; } /* Four or more "-" characters on at the beginning of a line that ** contains no other text results in a horizontal rule. */ if( (c=='-' || c=='=') && (j = is_horizontal_rule(&zText[i]))>0 ){ put_htmlized_text(&zText, i); adjust_list_nesting(aList, 0); cgi_printf("<hr>\n"); zText += j; if( *zText ) zText++; i = 0; lineStart = wordStart = 1; paraStart = 1; continue; } } /* end if( lineStart && !inPre ) */ if( wordStart && !inPRE ){ /* A wiki name at the beginning of a word which is not in verbatim ** text generates a hyperlink to that wiki page. ** ** Special case: If the name is in CamelCase but ends with a "_", then ** suppress the "_" and do not generate the hyperlink. This allows ** CamelCase words that are not wiki page names to appear in text. 
*/ if( g.okRdWiki && isupper(c) && (j = is_wiki_name(&zText[i]))>0 ){ put_htmlized_text(&zText, i); cgi_printf("<a href=\"wiki?p=%.*s%s\">%.*s</a>", j, zText, zLinkSuffix, j, zText); zText += j; i = 0; wordStart = lineStart = paraStart = 0; continue; } /* A "_" at the beginning of a word puts us into an italic font. */ if( c=='_' && !inB && !inI && font_terminator(&zText[i+1],c,1) ){ put_htmlized_text(&zText, i); i = 0; zText++; cgi_printf("<i>"); inI = 1; continue; } /* A "*" at the beginning of a word puts us into a bold font. */ if( c=='*' && !inB && !inI && (j = count_stars(&zText[i]))>=1 && j<=3 && font_terminator(&zText[i+j],c,j) ){ const char *zBeginB = ""; put_htmlized_text(&zText, i); i = 0; zText += j; switch( j ){ case 1: zBeginB = "<b>"; zEndB = "</b>"; break; case 2: zBeginB = "<big><b>"; zEndB = "</b></big>"; break; case 3: zBeginB = "<big><big><b>"; zEndB = "</b></big></big>"; break; } cgi_printf(zBeginB); inB = j; continue; } /* Words that begin with "http:" or "https:" or "ftp:" or "mailto:" ** become hyperlinks. */ if( (c=='h' || c=='f' || c=='m') && (j=is_url(&zText[i]))>0 ){ put_htmlized_text(&zText, i); z = htmlize(zText, j); if( is_image(z, strlen(z)) ){ cgi_printf("<img src=\"%s\" alt=\"%s\">", z, z); }else{ cgi_printf("<a href=\"%s\">%s</a>", z, z); } free(z); zText += j; i = 0; wordStart = lineStart = paraStart = 0; continue; } /* If the user has read permission on tickets and a word is of the ** form "#NNN" where NNN is a sequence of digits, then generate a ** hyperlink to ticket number NNN. 
*/ if( c=='#' && g.okRead && (j = ndigit(&zText[i+1]))>0 && is_eow(&zText[i+1+j],0) && (v = atoi(&zText[i+1]))>0 && v<=nTicket ){ put_htmlized_text(&zText, i); cgi_printf("<a href=\"tktview?tn=%d\">#%d</a>", v, v); zText += j; if( *zText ) zText++; i = 0; wordStart = lineStart = paraStart = 0; continue; } /* If the user has checkout permissions and a word is of the form ** "[NNN]" where NNN is a checkin number, then generate a hyperlink ** to check-in NNN. */ if( c=='[' && g.okCheckout && (j = ndigit(&zText[i+1]))>0 && is_eow(&zText[i+j+2],0) && (v = atoi(&zText[i+1]))>0 && v<=nCommit && zText[i+j+1]==']' ){ put_htmlized_text(&zText, i); cgi_printf("<a href=\"chngview?cn=%d\">[%d]</a>", v, v); zText += j+1; if( *zText ) zText++; i = 0; wordStart = lineStart = paraStart = 0; continue; } } /* end if( wordStart && !inPre ) */ /* A "*" or a "_" at the end of a word takes us out of bold or ** italic mode. */ if( inB && c=='*' && !isspace(zText[i-1]) && zText[i-1]!='*' && (j = count_stars(&zText[i]))==inB && is_eow(&zText[i+j],0) ){ inB = 0; put_htmlized_text(&zText, i); i = 0; zText += j; cgi_printf(zEndB); continue; } if( inI && c=='_' && !isspace(zText[i-1]) && is_eow(&zText[i+1],0) ){ put_htmlized_text(&zText, i); i = 0; zText++; inI = 0; cgi_printf("</i>"); continue; } if( wordStart ){ wordStart = isspace(c) || c=='(' || c=='"'; }else{ wordStart = isspace(c); } lineStart = c=='\n'; paraStart = 0; i++; } if( zText[0] ) cgi_printf("%h", zText); if( inB ) cgi_printf("%s\n",zEndB); if( inI ) cgi_printf("</i>\n"); adjust_list_nesting(aList, 0); if( inPRE ) cgi_printf("</pre>\n"); }