/*
 * Scan `str` for every match of `re` and queue each link not seen before.
 *
 * Capture group 1 of `re` is taken as the raw link text. Each link is
 * skipped if is_bin_url() reports it as a binary file, then completed with
 * attach_domain() using ourl->domain, normalized with url_normalized(), and
 * pushed with push_surlqueue() unless iscrawled() says it is already known.
 * Queued entries get level ourl->level + 1 and type TYPE_HTML.
 *
 * Returns the offset in `str` just past the last match that was consumed,
 * or 0 if nothing matched.
 *
 * NOTE(review): the copied link text is handed to attach_domain() and its
 * result to url_normalized(); neither buffer is freed here afterwards.
 * Presumably those helpers take ownership of their argument -- confirm.
 */
int extract_url(regex_t *re, char *str, Url *ourl)
{
    regmatch_t matchptr[2];
    const size_t nmatch = sizeof matchptr / sizeof matchptr[0];
    char *p = str;

    /* Only 0 means "matched"; any other value (REG_NOMATCH or an error such
     * as REG_ESPACE) leaves matchptr unusable, so stop scanning. */
    while (regexec(re, p, nmatch, matchptr, 0) == 0) {
        /* An empty overall match would never advance p: bail out instead of
         * looping forever. */
        if (matchptr[0].rm_eo <= 0) {
            break;
        }
        char *next = p + matchptr[0].rm_eo;

        /* Group 1 did not participate in the match: nothing to extract. */
        if (matchptr[1].rm_so < 0) {
            p = next;
            continue;
        }

        size_t len = (size_t)(matchptr[1].rm_eo - matchptr[1].rm_so);
        char *tmp = calloc(len + 1, 1);
        if (tmp == NULL) {
            SPIDER_LOG(SPIDER_LEVEL_WARN, "Out of memory while extracting urls");
            break;
        }
        /* calloc zero-filled the buffer, so the copy is already terminated. */
        memcpy(tmp, p + matchptr[1].rm_so, len);
        p = next;

        /* exclude binary file */
        if (is_bin_url(tmp)) {
            free(tmp);
            continue;
        }

        /* Allocate the queue node before tmp is handed to attach_domain(),
         * so an allocation failure can still release tmp safely. */
        Surl *surl = malloc(sizeof *surl);
        if (surl == NULL) {
            SPIDER_LOG(SPIDER_LEVEL_WARN, "Out of memory while extracting urls");
            free(tmp);
            break;
        }

        char *url = attach_domain(tmp, ourl->domain);
        if (url == NULL) {
            free(surl);
            continue;
        }
        SPIDER_LOG(SPIDER_LEVEL_DEBUG, "I find a url: %s", url);

        surl->level = ourl->level + 1;
        surl->type = TYPE_HTML;

        /* normalize url */
        surl->url = url_normalized(url);
        if (surl->url == NULL) {
            SPIDER_LOG(SPIDER_LEVEL_WARN, "Normalize url fail");
            free(surl);
            continue;
        }

        if (iscrawled(surl->url)) {
            /* already crawled: drop it */
            SPIDER_LOG(SPIDER_LEVEL_DEBUG, "I seen this url: %s", surl->url);
            free(surl->url);
            free(surl);
            continue;
        }

        push_surlqueue(surl);
    }

    return (int)(p - str);
}
static int handler(void * data) { Response *r = (Response *)data; const size_t nmatch = 2; regmatch_t matchptr[nmatch]; int len; regex_t re; if (strstr(r->header->content_type, "text/html") != NULL) { if (regcomp(&re, IMG_PATTERN, 0) != 0) {/* compile error */ return MODULE_ERR; } char *p = r->body; while (regexec(&re, p, nmatch, matchptr, 0) != REG_NOMATCH) { len = (matchptr[1].rm_eo - matchptr[1].rm_so); p = p + matchptr[1].rm_so; char *tmp = (char *)calloc(len+1, 1); strncpy(tmp, p, len); tmp[len] = '\0'; p = p + len + (matchptr[0].rm_eo - matchptr[1].rm_eo); char *url = attach_domain(tmp, r->url->domain); if (url != NULL) { Surl * surl = (Surl *)malloc(sizeof(Surl)); surl->level = r->url->level; surl->type = TYPE_IMAGE; /* normalize url */ if ((surl->url = url_normalized(url)) == NULL) { free(surl); continue; } if (iscrawled(surl->url)) { /* if is crawled */ free(surl->url); free(surl); continue; } else { push_surlqueue(surl); } } } } else if (strstr(r->header->content_type, "image") != NULL) { char *fn = url2fn(r->url); int fd = -1; if ((fd = open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) { return MODULE_ERR; } // save image int left = r->body_len; int n = -1; while (left) { if ((n = write(fd, r->body, left)) < 0) { // error close(fd); unlink(fn); free(fn); return MODULE_ERR; } else { left -= n; } } close(fd); free(fn); } return MODULE_OK; }