示例#1
0
/*
 * Record the text content of a Metalink <url> element as a download mirror.
 *
 * Only values starting with "http:" or "https:" (ASCII case-insensitive) are
 * considered. The value is parsed with wget_iri_parse(); on success a mirror
 * entry holding the parsed IRI plus the location and priority currently
 * stored in ctx is appended to ctx->metalink->mirrors. The vector is created
 * on first use and gets _free_mirror as its element destructor.
 *
 * ctx->location and ctx->priority describe the <url> element that is being
 * processed right now. They are reset to their defaults on every path,
 * including rejected values, so that they can not be applied by accident to
 * a later <url> element that does not carry these attributes itself.
 *
 * ctx:   parser context, holds the metalink being built and the pending
 *        location/priority attribute values
 * value: NUL-terminated URL text
 */
static void _add_mirror(_metalink_context_t *ctx, const char *value)
{
	// wget_strncasecmp_ascii() returns 0 on a match
	if (!wget_strncasecmp_ascii(value, "http:", 5) || !wget_strncasecmp_ascii(value, "https:", 6)) {
		wget_metalink_t *metalink = ctx->metalink;
		wget_metalink_mirror_t mirror;

		memset(&mirror, 0, sizeof(mirror));
		strlcpy(mirror.location, ctx->location, sizeof(mirror.location));
		mirror.priority = ctx->priority;
		mirror.iri = wget_iri_parse(value, NULL);

		if (mirror.iri) {
			if (!metalink->mirrors) {
				metalink->mirrors = wget_vector_create(4, 4, NULL);
				wget_vector_set_destructor(metalink->mirrors, (void(*)(void *))_free_mirror);
			}

			// NOTE(review): the entry is handed over by address and size,
			// presumably the vector keeps its own copy - confirm
			wget_vector_add(metalink->mirrors, &mirror, sizeof(mirror));
		}
	}

	// forget the per-element attributes, no matter whether a mirror was added
	*ctx->location = 0;
	ctx->priority = 999999;
}
示例#2
0
/*
 * XML parser callback that collects URL candidates into ctx->urls.
 *
 * A value is collected when it is either
 *  - the value of an attribute named href, uri, src, scheme, xmlns or
 *    xmlns:<anything>, or
 *  - the content of an element whose name (the part of dir behind the last
 *    '/') is icon, id or logo.
 * All name comparisons are ASCII case-insensitive.
 *
 * Leading and trailing whitespace is stripped from the value before it is
 * appended as a wget_string_t (pointer + length, the text itself is not
 * copied here). NULL or empty values are ignored.
 *
 * context: struct atom_context of the running parse
 * flags:   XML_FLG_* bits describing what kind of item val is
 * dir:     path of the current element
 * attr:    attribute name (used for XML_FLG_ATTRIBUTE items)
 * val/len: the value and its length
 * pos:     unused
 */
static void _atom_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED)
{
	struct atom_context *ctx = context;
	wget_string_t url;
	int wanted = 0;

	if (!val || !len)
		return;

	if ((flags & XML_FLG_ATTRIBUTE)) {
		wanted = !wget_strcasecmp_ascii(attr, "href")
			|| !wget_strcasecmp_ascii(attr, "uri")
			|| !wget_strcasecmp_ascii(attr, "src")
			|| !wget_strcasecmp_ascii(attr, "scheme")
			|| !wget_strcasecmp_ascii(attr, "xmlns")
			|| !wget_strncasecmp_ascii(attr, "xmlns:", 6);
	} else if ((flags & XML_FLG_CONTENT)) {
		const char *slash = strrchr(dir, '/');

		if (slash) {
			const char *elem = slash + 1;

			wanted = !wget_strcasecmp_ascii(elem, "icon")
				|| !wget_strcasecmp_ascii(elem, "id")
				|| !wget_strcasecmp_ascii(elem, "logo");
		}
	}

	if (!wanted)
		return;

	// strip leading whitespace
	while (len && c_isspace(*val)) {
		val++;
		len--;
	}

	// strip trailing whitespace
	while (len && c_isspace(val[len - 1]))
		len--;

	url.p = val;
	url.len = len;

	// the result vector is created lazily, on the first hit
	if (!ctx->urls)
		ctx->urls = wget_vector_create(32, -2, NULL);

	wget_vector_add(ctx->urls, &url, sizeof(url));
}
示例#3
0
文件: css.c 项目: rockdaboot/wget2
/*
 * Tokenize a CSS text and report the URIs and the @charset name found in it.
 *
 * buf:               NUL-terminated CSS text
 * callback_uri:      called for each URI, i.e. for url(...) tokens and for
 *                    the string following @import; may be NULL.
 *                    It receives the URI text without quotes / url( ),
 *                    its length and its offset, where the offset is derived
 *                    from the summed up lengths of the tokens seen so far.
 * callback_encoding: called with the name following @charset (quotes
 *                    removed) and its length; may be NULL
 * user_ctx:          passed through unchanged to both callbacks
 *
 * The text handed to the callbacks is not NUL-terminated at the given
 * length. NOTE(review): it points into the scanner's current token text, so
 * callbacks presumably have to copy what they want to keep - confirm.
 */
void wget_css_parse_buffer(
	const char *buf,
	void(*callback_uri)(void *user_ctx, const char *url, size_t len, size_t pos),
	void(*callback_encoding)(void *user_ctx, const char *url, size_t len),
	void *user_ctx)
{
	int token;
	size_t length, pos = 0;
	char *text;
	yyscan_t scanner;

	// yylex_init() returns non-zero if the scanner could not be set up
	if (yylex_init(&scanner))
		return;

	// let flex operate on buf as a 0 terminated string
	// we could give buflen to this function and use yy_scan_bytes or yy_scan_buffer
	yy_scan_string(buf, scanner);

	while ((token = yylex(scanner)) != CSSEOF) {
		if (token == IMPORT_SYM) {
			// e.g. @import "http:example.com/index.html"
			pos += yyget_leng(scanner);

			// skip whitespace before URI/STRING
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// input ended right behind @import: stop here instead of
			// calling yylex() again after it reported the end of input
			if (token == CSSEOF)
				break;

			// now token should be STRING or URI
			if (token == STRING)
				token = URI;
		}

		if (token == URI && callback_uri) {
			// e.g. url(http:example.com/index.html)
			text = yyget_text(scanner);
			length = yyget_leng(scanner);

			if (*text == '\'' || *text == '\"') {
				// a string - remove the quotes
				callback_uri(user_ctx, text + 1, length - 2, pos + 1);
			} else {
				// extract URI from url(...)
				if (!wget_strncasecmp_ascii(text, "url(", 4)) {
					char *otext = text;

					// remove trailing ) and any spaces before
					for (length--; c_isspace(text[length - 1]); length--);

					// remove leading url( and any spaces after
					for (length -= 4, text += 4; c_isspace(*text); text++, length--);

					// remove quotes
					if (*text == '\'' || *text == '\"') {
						text++;
						length -= 2;
					}

					// text - otext is the number of bytes skipped at the start of the token
					callback_uri(user_ctx, text, length, pos + (text - otext));
				}
			}
		} else if (token == CHARSET_SYM && callback_encoding) {
			// e.g. @charset "UTF-8"
			pos += yyget_leng(scanner);

			// skip whitespace before charset name
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// now token should be STRING
			if (token == STRING) {
				text = yyget_text(scanner);
				length = yyget_leng(scanner);

				if (*text == '\'' || *text == '\"') {
					// a string - remove the quotes
					callback_encoding(user_ctx, text + 1, length - 2);
				} else {
					// a string without quotes
					callback_encoding(user_ctx, text, length);
				}
			} else {
				error_printf(_("Unknown token after @charset: %d\n"), token);

				// input ended right behind @charset: do not scan any further
				if (token == CSSEOF)
					break;
			}
		}

		// account for the token that has just been handled
		pos += yyget_leng(scanner);
	}

	yylex_destroy(scanner);
}
示例#4
0
/*
 * Read the file fname, parse it as HTML and print all URLs found to stdout.
 *
 * A byte order mark at the start of the file selects the encoding
 * (UTF-16BE, UTF-16LE or UTF-8) and is skipped. UTF-16 input is converted
 * to UTF-8 before parsing; if that conversion fails the file is skipped.
 * If the document itself states an encoding that differs from the one given
 * by the BOM, the BOM wins and a note about the ignored encoding is printed.
 *
 * Nothing happens if the file can not be read.
 */
static void html_parse_localfile(const char *fname)
{
	char *allocated; // start of the buffer we own, this is what gets freed
	char *data;      // start of the text to parse (behind a BOM, if any)
	const char *encoding = NULL;
	size_t len;

	if (!(allocated = wget_read_file(fname, &len)))
		return;

	data = allocated;

	if (len >= 2 && (unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) {
		// Big-endian UTF-16
		encoding = "UTF-16BE";

		// adjust behind BOM
		data += 2;
		len -= 2;
	} else if (len >= 2 && (unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) {
		// Little-endian UTF-16
		encoding = "UTF-16LE";

		// adjust behind BOM
		data += 2;
		len -= 2;
	} else if (len >= 3 && (unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) {
		// UTF-8
		encoding = "UTF-8";

		// adjust behind BOM
		data += 3;
		len -= 3;
	}

	if (encoding)
		printf("URI encoding '%s' set by BOM\n", encoding);

	// encoding may still be NULL here, the comparison then simply does not match
	if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
		size_t n;
		char *utf8;

		len -= len & 1; // ignore single trailing byte, else charset conversion fails

		if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) != 0) {
			printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding);
			wget_xfree(allocated);
			return;
		}

		printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding);

		// from here on we work on the converted text, the raw file content is not needed any more
		wget_xfree(allocated);
		allocated = data = utf8;
	}

	WGET_HTML_PARSED_RESULT *res = wget_html_get_urls_inline(data, NULL, NULL);

	if (encoding) {
		// report the encoding named in the document, that is the one being ignored
		if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding))
			printf("Encoding '%s' as stated in document has been ignored\n", res->encoding);
	}

	for (int it = 0; it < wget_vector_size(res->uris); it++) {
		WGET_HTML_PARSED_URL *html_url = wget_vector_get(res->uris, it);
		wget_string_t *url = &html_url->url;

		printf("  %s.%s '%.*s'\n", html_url->dir, html_url->attr, (int) url->len, url->p);
	}

	// release the parse result before the text buffer it was built from
	wget_html_free_urls_inline(&res);
	wget_xfree(allocated);
}
示例#5
0
/*
 * XML parser callback that fills a metalink description from a Metalink
 * document. Both layouts are handled:
 *  - metalink 3: items below /metalink/files/file
 *  - metalink 4: items below /metalink/file
 *
 * Attribute values (attr != NULL) are stored in the context or in the
 * metalink itself: file name, hash type, piece length and the location /
 * priority of the upcoming mirror URL. Element content (attr == NULL) is
 * passed on to _add_piece(), _add_file_hash() and _add_mirror() or stored
 * as the file size.
 *
 * context: the _metalink_context_t of the running parse
 * flags:   XML_FLG_* bits, anything that is neither content nor attribute
 *          is ignored
 * dir:     path of the current element
 * attr:    attribute name or NULL for element content
 * val/len: the value, not necessarily NUL-terminated at len
 * pos:     unused
 */
static void _metalink_parse(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED)
{
	_metalink_context_t *ctx = context;
	char *value;

	if (!(flags & (XML_FLG_CONTENT | XML_FLG_ATTRIBUTE))) return; // ignore comments

	if (wget_strncasecmp_ascii(dir, "/metalink/file", 14)) return;

	dir += 14;

	// NUL-terminated working copy of val.
	// It lives on the heap since len is dictated by the document being
	// parsed, a stack array of that size could exhaust the stack.
	if (!(value = wget_strmemdup(val ? val : "", val ? len : 0)))
		return;

	if (!wget_strncasecmp_ascii(dir, "s/file", 6)) {
		// metalink 3 XML format
		dir += 6;

		if (attr) {
			if (*dir == 0) { // /metalink/files/file
				if (!wget_strcasecmp_ascii(attr, "name")) {
					ctx->metalink->name = wget_strmemdup(val, len);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/verification/pieces")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				} else if (!wget_strcasecmp_ascii(attr, "length")) {
					ctx->length = atoll(value);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/verification/hash")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/resources/url")) {
				// other attributes of the url element are not evaluated
				if (!wget_strcasecmp_ascii(attr, "location")) {
					sscanf(value, " %2[a-zA-Z]", ctx->location); // ISO 3166-1 alpha-2 two letter country code
				} else if (!wget_strcasecmp_ascii(attr, "preference")) {
					sscanf(value, " %6d", &ctx->priority);
					if (ctx->priority < 1 || ctx->priority > 999999)
						ctx->priority = 999999;
				}
			}
		} else {
			if (!wget_strcasecmp_ascii(dir, "/verification/pieces/hash")) {
				_add_piece(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/verification/hash")) {
				_add_file_hash(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/size")) {
				ctx->metalink->size = atoll(value);
			} else if (!wget_strcasecmp_ascii(dir, "/resources/url")) {
				_add_mirror(ctx, value);
			}
		}
	} else {
		// metalink 4 XML format
		if (attr) {
			if (*dir == 0) { // /metalink/file
				if (!wget_strcasecmp_ascii(attr, "name")) {
					ctx->metalink->name = wget_strmemdup(val, len);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/pieces")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				} else if (!wget_strcasecmp_ascii(attr, "length")) {
					ctx->length = atoll(value);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/hash")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/url")) {
				if (!wget_strcasecmp_ascii(attr, "location")) {
					sscanf(value, " %2[a-zA-Z]", ctx->location); // ISO 3166-1 alpha-2 two letter country code
				} else if (!wget_strcasecmp_ascii(attr, "priority") || !wget_strcasecmp_ascii(attr, "preference")) {
					sscanf(value, " %6d", &ctx->priority);
					if (ctx->priority < 1 || ctx->priority > 999999)
						ctx->priority = 999999;
				}
			}
		} else {
			if (!wget_strcasecmp_ascii(dir, "/pieces/hash")) {
				_add_piece(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/hash")) {
				_add_file_hash(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/size")) {
				ctx->metalink->size = atoll(value);
			} else if (!wget_strcasecmp_ascii(dir, "/url")) {
				_add_mirror(ctx, value);
			}
		}
	}

	wget_xfree(value);
}
示例#6
0
文件: test.c 项目: armistace/wget2
/*
 * Unit test for wget_strcasecmp_ascii() and wget_strncasecmp_ascii().
 *
 * Runs both functions against tables of inputs with the exact expected
 * return value (including NULL arguments and strings of different length),
 * then checks for each of the 26 letters that the lower case and the upper
 * case form compare as equal. Each check increments either the 'ok' or the
 * 'failed' counter; failures are reported via info_printf().
 */
static void test_strcasecmp_ascii(void)
{
	static const struct test_data {
		const char *
			s1;
		const char *
			s2;
		int
			result;
	} test_data[] = {
		{ NULL, NULL, 0 },
		{ NULL, "x", -1 },
		{ "x", NULL, 1 },
		{ "Abc", "abc", 0 },
		{ "abc", "abc", 0 },
		{ "abc", "ab", 'c' },
		{ "ab", "abc", -'c' },
		{ "abc", "", 'a' },
		{ "", "abc", -'a' },
	};
	static const struct test_data2 {
		const char *
			s1;
		const char *
			s2;
		size_t
			n;
		int
			result;
	} test_data2[] = {
		{ NULL, NULL, 1, 0 },
		{ NULL, "x", 1, -1 },
		{ "x", NULL, 1, 1 },
		{ "Abc", "abc", 2, 0 },
		{ "abc", "abc", 3, 0 },
		{ "abc", "ab", 2, 0 },
		{ "abc", "ab", 3, 'c' },
		{ "ab", "abc", 2, 0 },
		{ "ab", "abc", 3, -'c' },
		{ "abc", "", 1, 'a' },
		{ "", "abc", 1, -'a' },
		{ "", "abc", 0, 0 },
	};

	for (unsigned it = 0; it < countof(test_data); it++) {
		const struct test_data *t = &test_data[it];

		int n = wget_strcasecmp_ascii(t->s1, t->s2);

		if (n == t->result)
			ok++;
		else {
			failed++;
			// the tables contain NULL strings, which must not be passed to %s
			info_printf("Failed [%u]: wget_strcasecmp_ascii(%s,%s) -> %d (expected %d)\n", it,
				t->s1 ? t->s1 : "(null)", t->s2 ? t->s2 : "(null)", n, t->result);
		}
	}

	for (unsigned it = 0; it < countof(test_data2); it++) {
		const struct test_data2 *t = &test_data2[it];

		int n = wget_strncasecmp_ascii(t->s1, t->s2, t->n);

		if (n == t->result)
			ok++;
		else {
			failed++;
			// the tables contain NULL strings, which must not be passed to %s
			info_printf("Failed [%u]: wget_strncasecmp_ascii(%s,%s,%zu) -> %d (expected %d)\n", it,
				t->s1 ? t->s1 : "(null)", t->s2 ? t->s2 : "(null)", t->n, n, t->result);
		}
	}

	// every letter has to be equal to its counterpart of the other case
	for (unsigned it = 0; it < 26; it++) {
		char s1[8], s2[8];

		s1[0] = 'a' + it; s1[1] = 0;
		s2[0] = 'A' + it; s2[1] = 0;

		if (wget_strcasecmp_ascii(s1, s2) == 0)
			ok++;
		else {
			failed++;
			info_printf("Failed: wget_strcasecmp_ascii(%s,%s) != 0\n", s1, s2);
		}

		if (wget_strncasecmp_ascii(s1, s2, 1) == 0)
			ok++;
		else {
			failed++;
			info_printf("Failed: wget_strncasecmp_ascii(%s,%s,1) != 0\n", s1, s2);
		}
	}
}