int mget_cookie_load_public_suffixes(const char *fname) { PUBLIC_SUFFIX suffix, *suffixp; FILE *fp; int nsuffixes = 0; char *buf = NULL, *linep, *p; size_t bufsize = 0; ssize_t buflen; // as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules // and 40 exceptions. if (!suffixes) suffixes = mget_vector_create(6*1024, -2, (int(*)(const void *, const void *))suffix_compare); if (!suffix_exceptions) suffix_exceptions = mget_vector_create(64, -2, (int(*)(const void *, const void *))suffix_compare); if ((fp = fopen(fname, "r"))) { while ((buflen = mget_getline(&buf, &bufsize, fp)) >= 0) { linep = buf; while (isspace(*linep)) linep++; // ignore leading whitespace if (!*linep) continue; // skip empty lines if (*linep == '/' && linep[1] == '/') continue; // skip comments // parse suffix rule for (p = linep; *linep && !isspace(*linep);) linep++; *linep = 0; if (*p == '!') { // add to exceptions suffix_init(&suffix, p + 1, linep - p - 1); suffixp = mget_vector_get(suffix_exceptions, mget_vector_add(suffix_exceptions, &suffix, sizeof(suffix))); } else { suffix_init(&suffix, p, linep - p); suffixp = mget_vector_get(suffixes, mget_vector_add(suffixes, &suffix, sizeof(suffix))); } if (suffixp) suffixp->label = suffixp->label_buf; // set label to changed address nsuffixes++;; } xfree(buf); fclose(fp); mget_vector_sort(suffix_exceptions); mget_vector_sort(suffixes); } else error_printf(_("Failed to open public suffix file '%s'\n"), fname); return nsuffixes; }
static void _rss_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_MGET_UNUSED) { struct rss_context *ctx = context; mget_string_t url; if (!val || !len) return; url.p = NULL; if ((flags & XML_FLG_ATTRIBUTE)) { if (!mget_strcasecmp_ascii(attr, "url") || !mget_strcasecmp_ascii(attr, "href") || !mget_strcasecmp_ascii(attr, "src") || !mget_strcasecmp_ascii(attr, "domain") || !mget_strcasecmp_ascii(attr, "xmlns") || !mget_strncasecmp_ascii(attr, "xmlns:", 6)) { for (;len && isspace(*val); val++, len--); // skip leading spaces for (;len && isspace(val[len - 1]); len--); // skip trailing spaces url.p = val; url.len = len; if (!ctx->urls) ctx->urls = mget_vector_create(32, -2, NULL); mget_vector_add(ctx->urls, &url, sizeof(url)); } } else if ((flags & XML_FLG_CONTENT)) { const char *elem = strrchr(dir, '/'); if (elem) { elem++; if (!mget_strcasecmp_ascii(elem, "guid") || !mget_strcasecmp_ascii(elem, "link") || !mget_strcasecmp_ascii(elem, "comments") || !mget_strcasecmp_ascii(elem, "docs")) { for (;len && isspace(*val); val++, len--); // skip leading spaces for (;len && isspace(val[len - 1]); len--); // skip trailing spaces // debug_printf("#2 %02X %s %s '%.*s' %zd\n", flags, dir, attr, (int) len, val, len); url.p = val; url.len = len; if (!ctx->urls) ctx->urls = mget_vector_create(32, -2, NULL); mget_vector_add(ctx->urls, &url, sizeof(url)); } } } }
/*
 * (Re)build job->parts so that it holds one PART per metalink piece.
 *
 * Does nothing if 'job' is NULL or has no metalink. An existing parts vector
 * is cleared and refilled, otherwise a new one is created. Parts are numbered
 * from 1 and laid out back to back starting at position 0. A part's length is
 * the piece length, or the number of bytes still remaining if that is smaller.
 */
void job_create_parts(JOB *job)
{
	PART part;
	mget_metalink_t *metalink;
	// off_t (as in job_validate_file) rather than ssize_t, so that file sizes
	// beyond 2 GiB are not truncated where ssize_t is only 32 bits wide
	off_t fsize;
	int it;

	if (!job || !(metalink = job->metalink))
		return;

	memset(&part, 0, sizeof(PART));

	// create space to hold enough parts
	if (!job->parts)
		job->parts = mget_vector_create(mget_vector_size(metalink->pieces), 4, NULL);
	else
		mget_vector_clear(job->parts);

	fsize = metalink->size;

	for (it = 0; it < mget_vector_size(metalink->pieces); it++) {
		mget_metalink_piece_t *piece = mget_vector_get(metalink->pieces, it);

		// the last piece may be shorter than the nominal piece length
		if (fsize >= piece->length) {
			part.length = piece->length;
		} else {
			part.length = fsize;
		}

		part.id = it + 1;

		// the vector is handed sizeof(PART) bytes, so 'part' can be reused for the next piece
		mget_vector_add(job->parts, &part, sizeof(PART));

		part.position += part.length;
		fsize -= piece->length;
	}
}
static void test_stringmap(void) { mget_stringmap_t *m; char key[128], value[128], *val; int run, it; size_t valuesize; // the initial size of 16 forces the internal reshashing function to be called twice m = mget_stringmap_create(16); for (run = 0; run < 2; run++) { if (run) { mget_stringmap_clear(m); mget_stringmap_sethashfunc(m, hash_txt); } for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); valuesize = sprintf(value, "%d.html", it); if (mget_stringmap_put(m, key, value, valuesize + 1)) { failed++; info_printf("stringmap_put(%s) returns unexpected old value\n", key); } else ok++; } if ((it = mget_stringmap_size(m)) != 26) { failed++; info_printf("stringmap_size() returned %d (expected %d)\n", it, 26); } else ok++; // now, look up every single entry for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); sprintf(value, "%d.html", it); if (!(val = mget_stringmap_get(m, key))) { failed++; info_printf("stringmap_get(%s) didn't find entry\n", key); } else if (strcmp(val, value)) { failed++; info_printf("stringmap_get(%s) found '%s' (expected '%s')\n", key, val, value); } else ok++; } mget_stringmap_clear(m); if ((it = mget_stringmap_size(m)) != 0) { failed++; info_printf("stringmap_size() returned %d (expected 0)\n", it); } else ok++; for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); valuesize = sprintf(value, "%d.html", it); if (mget_stringmap_put(m, key, value, valuesize + 1)) { failed++; info_printf("stringmap_put(%s) returns unexpected old value\n", key); } else ok++; } if ((it = mget_stringmap_size(m)) != 26) { failed++; info_printf("stringmap_size() returned %d (expected %d)\n", it, 26); } else ok++; // now, remove every single entry for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); sprintf(value, "%d.html", it); mget_stringmap_remove(m, key); } if ((it = mget_stringmap_size(m)) != 0) { failed++; 
info_printf("stringmap_size() returned %d (expected 0)\n", it); } else ok++; for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); valuesize = sprintf(value, "%d.html", it); if (mget_stringmap_put(m, key, value, valuesize + 1)) { failed++; info_printf("stringmap_put(%s) returns unexpected old value\n", key); } else ok++; } if ((it = mget_stringmap_size(m)) != 26) { failed++; info_printf("stringmap_size() returned %d (expected %d)\n", it, 26); } else ok++; } // testing alloc/free in stringmap/hashmap mget_stringmap_clear(m); mget_stringmap_put(m, "thekey", NULL, 0) ? failed++ : ok++; mget_stringmap_put(m, "thekey", NULL, 0) ? ok++ : failed++; mget_stringmap_put(m, "thekey", "thevalue", 9) ? ok++ : failed++; mget_stringmap_put(m, "thekey", "thevalue", 9) ? ok++ : failed++; mget_stringmap_put(m, "thekey", NULL, 0) ? ok++ : failed++; // testing key/value identity alloc/free in stringmap/hashmap mget_stringmap_clear(m); mget_stringmap_put(m, "thekey", NULL, 0) ? failed++ : ok++; mget_stringmap_put(m, "thekey", NULL, 0) ? ok++ : failed++; mget_stringmap_put(m, "thekey", "thevalue", 9) ? ok++ : failed++; mget_stringmap_put(m, "thekey", NULL, 0) ? 
ok++ : failed++; mget_stringmap_free(&m); mget_http_challenge_t challenge; mget_http_parse_challenge("Basic realm=\"test realm\"", &challenge); mget_http_free_challenge(&challenge); mget_vector_t *challenges; challenges = mget_vector_create(2, 2, NULL); mget_vector_set_destructor(challenges, (void(*)(void *))mget_http_free_challenge); mget_http_parse_challenge("Basic realm=\"test realm\"", &challenge); mget_vector_add(challenges, &challenge, sizeof(challenge)); mget_http_free_challenges(&challenges); char *response_text = strdup( "HTTP/1.1 401 Authorization Required\r\n"\ "Date: Sun, 23 Dec 2012 21:03:45 GMT\r\n"\ "Server: Apache/2.2.22 (Debian)\r\n"\ "WWW-Authenticate: Digest realm=\"therealm\", nonce=\"Ip6MaovRBAA=c4af733c51270698260f5d357724c2cbce20fa3d\", algorithm=MD5, domain=\"/prot_digest_md5\", qop=\"auth\"\r\n"\ "Vary: Accept-Encoding\r\n"\ "Content-Length: 476\r\n"\ "Keep-Alive: timeout=5, max=99\r\n"\ "Connection: Keep-Alive\r\n"\ "Content-Type: text/html; charset=iso-8859-1\r\n\r\n"); mget_iri_t *iri = mget_iri_parse("http://localhost/prot_digest_md5/", NULL); mget_http_request_t *req = mget_http_create_request(iri, "GET"); mget_http_response_t *resp = mget_http_parse_response_header(response_text); mget_http_add_credentials(req, mget_vector_get(resp->challenges, 0), "tim", "123"); // for (it=0;it<vec_size(req->lines);it++) { // info_printf("%s\n", (char *)vec_get(req->lines, it)); // } mget_http_free_response(&resp); mget_http_free_request(&req); mget_iri_free(&iri); xfree(response_text); // Authorization: Digest username="******", realm="therealm", nonce="Ip6MaovRBAA=c4af733c51270698260f5d357724c2cbce20fa3d", uri="/prot_digest_md5/", response="a99e2012d507a73dd46eb044d3f4641c", qop=auth, nc=00000001, cnonce="3d20faa1" }
/*
 * Check an already existing local file against the job's metalink description
 * and fill job->parts with the pieces that still have to be downloaded.
 *
 * Steps:
 *  - If the metalink has no hashes and the file already has the expected size,
 *    the file is left alone.
 *  - A file larger than the expected size is truncated to that size.
 *  - If the file can be opened, the whole-file hashes are tried in order until
 *    check_file_fd() returns something other than -1. The pieces are only
 *    checked one by one if that result is neither 1 nor -1; each piece for
 *    which check_piece_hash() does not return 1 is queued.
 *  - If the file cannot be opened, every piece is queued.
 *
 * Returns 1 if the file is considered complete (nothing to download),
 * 0 otherwise (including the case of a NULL job or a job without metalink).
 */
int job_validate_file(JOB *job)
{
	PART part;
	mget_metalink_t *metalink;
	off_t fsize;
	int fd, rc = -1, it;
	struct stat st;

	if (!job || !(metalink = job->metalink))
		return 0;

	memset(&part, 0, sizeof(PART));

	// create space to hold enough parts
	if (!job->parts)
		job->parts = mget_vector_create(mget_vector_size(metalink->pieces), 4, NULL);
	else
		mget_vector_clear(job->parts);

	fsize = metalink->size;

	if (mget_vector_size(metalink->hashes) == 0) {
		// multipart non-metalink download: do not clobber if file has expected size
		if (stat(metalink->name, &st) == 0 && st.st_size == fsize) {
			return 1; // we are done
		}
	}

	// truncate file if needed
	if (stat(metalink->name, &st) == 0 && st.st_size > fsize) {
		if (truncate(metalink->name, fsize) == -1)
			error_printf(_("Failed to truncate %s\n from %llu to %llu bytes\n"),
				metalink->name, (unsigned long long)st.st_size, (unsigned long long)fsize);
	}

	if ((fd = open(metalink->name, O_RDONLY)) == -1) {
		// no readable file: every piece has to be downloaded
		for (it = 0; it < mget_vector_size(metalink->pieces); it++) {
			mget_metalink_piece_t *piece = mget_vector_get(metalink->pieces, it);

			if (fsize >= piece->length) {
				part.length = piece->length;
			} else {
				part.length = fsize;
			}

			part.id = it + 1;

			mget_vector_add(job->parts, &part, sizeof(PART));

			part.position += part.length;
			fsize -= piece->length;
		}

		return 0;
	}

	// The loops below stop when errno becomes EINTR. Clear errno first, so that
	// a stale EINTR left over from some earlier call does not skip the checks.
	errno = 0;

	// file exists, check the whole-file checksum with the first usable hash type
	for (it = 0; errno != EINTR && it < mget_vector_size(metalink->hashes); it++) {
		mget_metalink_hash_t *hash = mget_vector_get(metalink->hashes, it);

		if ((rc = check_file_fd(hash, fd)) == -1)
			continue; // hash type not available, try next

		break;
	}

	if (rc == 1) {
		info_printf(_("Checksum OK for '%s'\n"), metalink->name);
		close(fd); // do not leak the descriptor on the early return
		return 1; // we are done
	}

	if (rc == -1) {
		// failed to check file, continue as if file is ok
		info_printf(_("Failed to build checksum, assuming file to be OK\n"));
		close(fd); // do not leak the descriptor on the early return
		return 1; // we are done
	}

	info_printf(_("Bad checksum for '%s'\n"), metalink->name);

	// check which piece is invalid and requeue it
	for (it = 0; errno != EINTR && it < mget_vector_size(metalink->pieces); it++) {
		mget_metalink_piece_t *piece = mget_vector_get(metalink->pieces, it);
		mget_metalink_hash_t *hash = &piece->hash;

		// the last piece may be shorter than the nominal piece length
		if (fsize >= piece->length) {
			part.length = piece->length;
		} else {
			part.length = (size_t)fsize;
		}

		part.id = it + 1;

		if ((rc = check_piece_hash(hash, fd, part.position, part.length)) != 1) {
			info_printf(_("Piece %d/%d not OK - requeuing\n"), it + 1, mget_vector_size(metalink->pieces));
			mget_vector_add(job->parts, &part, sizeof(PART));
			debug_printf(" need to download %llu bytes from pos=%llu\n",
				(unsigned long long)part.length, (unsigned long long)part.position);
		}

		part.position += part.length;
		fsize -= piece->length;
	}

	close(fd);

	return 0;
}
ROBOTS *mget_robots_parse(const char *data) { ROBOTS *robots; ROBOTS_PATH path; int collect = 0; const char *p; if (!data || !*data) return NULL; robots = xcalloc(1, sizeof (ROBOTS)); do { if (collect < 2 && !strncasecmp(data, "User-agent:", 11)) { if (!collect) { for (data += 11; *data == ' ' || *data == '\t'; data++); if (!strncasecmp(data, "mget", 4)) { collect = 1; } else if (*data == '*') { collect = 1; } } else collect = 2; } else if (collect == 1 && !strncasecmp(data, "Disallow:", 9)) { for (data += 9; *data == ' ' || *data == '\t'; data++); if (*data == '\r' || *data == '\n' || !*data) { // all allowed mget_vector_free(&robots->paths); collect = 2; } else { if (!robots->paths) { robots->paths = mget_vector_create(32, -2, NULL); mget_vector_set_destructor(robots->paths, (void(*)(void *))_free_path); } for (p = data; !isspace(*p); p++); path.len = p - data; path.path = strndup(data, path.len); mget_vector_add(robots->paths, &path, sizeof(path)); } } else if (!strncasecmp(data, "Sitemap:", 8)) { for (data += 8; *data==' ' || *data == '\t'; data++); for (p = data; !isspace(*p); p++); if (!robots->sitemaps) robots->sitemaps = mget_vector_create(4, -2, NULL); mget_vector_add_noalloc(robots->sitemaps, strndup(data, p - data)); } if ((data = strchr(data, '\n'))) data++; // point to next line } while (data && *data); /* for (int it = 0; it < mget_vector_size(robots->paths); it++) { ROBOTS_PATH *path = mget_vector_get(robots->paths, it); info_printf("path '%s'\n", path->path); } for (int it = 0; it < mget_vector_size(robots->sitemaps); it++) { const char *sitemap = mget_vector_get(robots->sitemaps, it); info_printf("sitemap '%s'\n", sitemap); } */ return robots; }