/* Build the urlinfo that addresses the robots file for the host of
   URL: the parsed directory is replaced by the empty string and the
   file name by ROBOTS_FILENAME, then the full URL string is rebuilt.
   URL must parse cleanly as an HTTP URL; this is enforced with
   assert().  The result is freshly allocated with newurl(); callers
   release it with freeurl().  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
{
  struct urlinfo *robots = newurl ();
  uerr_t status = parseurl (url, robots, 0);

  assert (status == URLOK && robots->proto == URLHTTP);

  /* Discard the path components and the cached URL string that
     parseurl() produced...  */
  xfree (robots->file);
  xfree (robots->dir);
  xfree (robots->url);

  /* ...and substitute the robots file, regenerating the URL string
     from the modified parts.  */
  robots->dir = xstrdup ("");
  robots->file = xstrdup (robots_filename);
  robots->url = str_url (robots, 0);

  return robots;
}
void PlayerWidget::AddURLToPlaylist(){ QInputDialog dlg(0, Qt::Dialog | Qt::WindowStaysOnTopHint ); dlg.setInputMode(QInputDialog::TextInput); dlg.setLabelText(tr("Enter a valid URL for a multimedia file or stream:")); dlg.setTextEchoMode(QLineEdit::Normal); dlg.setWindowTitle(tr("Multimedia URL")); dlg.setWindowIcon( LXDG::findIcon("download","") ); //ensure it is centered on the current screen QPoint center = QApplication::desktop()->screenGeometry(this).center(); dlg.move( center.x()-(dlg.width()/2), center.y()-(dlg.height()/2) ); dlg.show(); while( dlg.isVisible() ){ QApplication::processEvents(); } QString url = dlg.textValue(); if(url.isEmpty() || dlg.result()!=QDialog::Accepted){ return; } //cancelled //QString url = QInputDialog::getText(0, tr("Multimedia URL"), tr("Enter a valid URL for a multimedia file or stream"), QLineEdit::Normal); //if(url.isEmpty()){ return; } QUrl newurl(url); if(!newurl.isValid()){ return; } //invalid URL PLAYLIST->addMedia(newurl); playlistChanged(); }
void xml_compiler::url_loader::traverse(const extracted_ptree& pt) const { for (const auto& it : pt) { if (it.get_tag_name() != "vkopenurldef") { if (!it.children_empty()) { traverse(it.children_extracted_ptree()); } } else { std::shared_ptr<url> newurl(new url()); if (!newurl) continue; // ---------------------------------------- bool error = false; for (const auto& child : it.children_extracted_ptree()) { if (child.get_tag_name() == "name") { newurl->set_name(pqrs::string::remove_whitespaces_copy(child.get_data())); if (!boost::starts_with(*(newurl->get_name()), "KeyCode::VK_OPEN_URL_")) { error = true; xml_compiler_.error_information_.set(boost::format("<name> within <vkopenurldef> must start with \"KeyCode::VK_OPEN_URL_\":\n\n<name>%1%</name>") % *(newurl->get_name())); } } else if (child.get_tag_name() == "url") { newurl->set_url(boost::trim_copy(child.get_data())); auto type = child.get_optional("<xmlattr>.type"); if (type) { newurl->set_type(boost::trim_copy(*type)); } } else if (child.get_tag_name() == "background") { newurl->set_background(true); } } if (error) { continue; } // ---------------------------------------- // Validation // name if (!newurl->get_name()) { xml_compiler_.error_information_.set(boost::format("No <name> within <%1%>.") % it.get_tag_name()); continue; } if (newurl->get_name()->empty()) { xml_compiler_.error_information_.set(boost::format("Empty <name> within <%1%>.") % it.get_tag_name()); continue; } // url if (!newurl->get_url()) { xml_compiler_.error_information_.set(boost::format("No <url> within <%1%>.") % it.get_tag_name()); continue; } if (newurl->get_url()->empty()) { xml_compiler_.error_information_.set(boost::format("Empty <url> within <%1%>.") % it.get_tag_name()); continue; } // ---------------------------------------- // register to symbol_map_. 
if (!symbol_map_.get_optional(*(newurl->get_name()))) { auto keycode = symbol_map_.add("KeyCode", boost::replace_first_copy(*(newurl->get_name()), "KeyCode::", "")); vk_open_url_map_[keycode] = newurl; } } } }
int muse_tui::userinput () { int ch; while ((ch = getch ()) != 'q') { warning ("input: int %d", ch); if ((ch > 48) & (ch <= (48 + MAX_CHANNELS))) { setactivechan (ch - 49); } else switch (ch) { case KEY_TAB: tabchan(); break; case KEY_F1: // F1 case 80: // does keypad work? warning ("request change to win 1"); changewin (0); break; case KEY_F2: // F2 case 81: // does keypad work? warning ("request change to win 2"); changewin (1); break; case KEY_F3: // F3 case 82: // does keypad work? warning ("request change to win 3"); changewin (2); break; case KEY_UP: chan[activechan].selprevitem(); break; case KEY_DOWN: chan[activechan].selnextitem(); break; case '+': chan[activechan].incrvol(); break; case '-': chan[activechan].decrvol(); break; case 'u': case 'U': newurl(); refresh(); break; case 'f': case 'F': newfile(); refresh(); break; case 't': case 'T': /* talk */ break; case 'a': case 'A': showabout(); refresh(); break; case 's': case 'S': /* speaker */ break; case 'l': case 'L': /* line-in */ break; case 'c': case 'C': /* connect */ break; case 'h': case 'H': /* connect */ showhelp(); refresh(); break; default: continue; } } return 0; }
/* convert_links() is called from recursive_retrieve() after we're done with an HTML file. This call to convert_links is not complete because it converts only the downloaded files, and Wget cannot know which files will be downloaded afterwards. So, if we have file fileone.html with: <a href="/c/something.gif"> and /c/something.gif was not downloaded because it exceeded the recursion depth, the reference will *not* be changed. However, later we can encounter /c/something.gif from an "upper" level HTML (let's call it filetwo.html), and it gets downloaded. But now we have a problem because /c/something.gif will be correctly transformed in filetwo.html, but not in fileone.html, since Wget could not have known that /c/something.gif will be downloaded in the future. This is why Wget must, after the whole retrieval, call convert_all_links to go once more through the entire list of retrieved HTMLs, and re-convert them. All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs in urls_downloaded. From these two lists information is extracted. */ void convert_all_links (void) { slist *html; /* Destructively reverse downloaded_html_files to get it in the right order. recursive_retrieve() used slist_prepend() consistently. */ downloaded_html_files = slist_nreverse (downloaded_html_files); for (html = downloaded_html_files; html; html = html->next) { urlpos *urls, *cur_url; char *url; DEBUGP (("Rescanning %s\n", html->string)); /* Determine the URL of the HTML file. get_urls_html will need it. */ url = hash_table_get (dl_file_url_map, html->string); if (url) DEBUGP (("It should correspond to %s.\n", url)); else DEBUGP (("I cannot find the corresponding URL.\n")); /* Parse the HTML file... */ urls = get_urls_html (html->string, url, FALSE, NULL); /* We don't respect meta_disallow_follow here because, even if the file is not followed, we might still want to convert the links that have been followed from other files. 
*/ for (cur_url = urls; cur_url; cur_url = cur_url->next) { char *local_name; /* The URL must be in canonical form to be compared. */ struct urlinfo *u = newurl (); uerr_t res = parseurl (cur_url->url, u, 0); if (res != URLOK) { freeurl (u, 1); continue; } /* We decide the direction of conversion according to whether a URL was downloaded. Downloaded URLs will be converted ABS2REL, whereas non-downloaded will be converted REL2ABS. */ local_name = hash_table_get (dl_url_file_map, u->url); if (local_name) DEBUGP (("%s marked for conversion, local %s\n", u->url, local_name)); /* Decide on the conversion direction. */ if (local_name) { /* We've downloaded this URL. Convert it to relative form. We do this even if the URL already is in relative form, because our directory structure may not be identical to that on the server (think `-nd', `--cut-dirs', etc.) */ cur_url->convert = CO_CONVERT_TO_RELATIVE; cur_url->local_name = xstrdup (local_name); } else { /* We haven't downloaded this URL. If it's not already complete (including a full host name), convert it to that form, so it can be reached while browsing this HTML locally. */ if (!cur_url->link_complete_p) cur_url->convert = CO_CONVERT_TO_COMPLETE; cur_url->local_name = NULL; } freeurl (u, 1); } /* Convert the links in the file. */ convert_links (html->string, urls); /* Free the data. */ free_urlpos (urls); } }
/* The core of recursive retrieving. Endless recursion is avoided by having all URLs stored to a linked list of URLs, which is checked before loading any URL. That way no URL can get loaded twice. The function also supports specification of maximum recursion depth and a number of other goodies. */ uerr_t recursive_retrieve (const char *file, const char *this_url) { char *constr, *filename, *newloc; char *canon_this_url = NULL; int dt, inl, dash_p_leaf_HTML = FALSE; int meta_disallow_follow; int this_url_ftp; /* See below the explanation */ uerr_t err; struct urlinfo *rurl; urlpos *url_list, *cur_url; char *rfile; /* For robots */ struct urlinfo *u; assert (this_url != NULL); assert (file != NULL); /* If quota was exceeded earlier, bail out. */ if (downloaded_exceeds_quota ()) return QUOTEXC; /* Cache the current URL in the list. */ if (first_time) { /* These three operations need to be done only once per Wget run. They should probably be at a different location. */ if (!undesirable_urls) undesirable_urls = make_string_hash_table (0); hash_table_clear (undesirable_urls); string_set_add (undesirable_urls, this_url); /* Enter this_url to the hash table, in original and "enhanced" form. */ u = newurl (); err = parseurl (this_url, u, 0); if (err == URLOK) { string_set_add (undesirable_urls, u->url); if (opt.no_parent) base_dir = xstrdup (u->dir); /* Set the base dir. */ /* Set the canonical this_url to be sent as referer. This problem exists only when running the first time. */ canon_this_url = xstrdup (u->url); } else { DEBUGP (("Double yuck! The *base* URL is broken.\n")); base_dir = NULL; } freeurl (u, 1); depth = 1; robots_host = NULL; forbidden = NULL; first_time = 0; } else ++depth; if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel) /* We've exceeded the maximum recursion depth specified by the user. 
*/ { if (opt.page_requisites && depth <= opt.reclevel + 1) /* When -p is specified, we can do one more partial recursion from the "leaf nodes" on the HTML document tree. The recursion is partial in that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags except for <LINK REL="stylesheet">. */ dash_p_leaf_HTML = TRUE; else /* Either -p wasn't specified or it was and we've already gone the one extra (pseudo-)level that it affords us, so we need to bail out. */ { DEBUGP (("Recursion depth %d exceeded max. depth %d.\n", depth, opt.reclevel)); --depth; return RECLEVELEXC; } } /* Determine whether this_url is an FTP URL. If it is, it means that the retrieval is done through proxy. In that case, FTP links will be followed by default and recursion will not be turned off when following them. */ this_url_ftp = (urlproto (this_url) == URLFTP); /* Get the URL-s from an HTML file: */ url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, dash_p_leaf_HTML, &meta_disallow_follow); if (opt.use_robots && meta_disallow_follow) { /* The META tag says we are not to follow this file. Respect that. */ free_urlpos (url_list); url_list = NULL; } /* Decide what to do with each of the URLs. A URL will be loaded if it meets several requirements, discussed later. */ for (cur_url = url_list; cur_url; cur_url = cur_url->next) { /* If quota was exceeded earlier, bail out. */ if (downloaded_exceeds_quota ()) break; /* Parse the URL for convenient use in other functions, as well as to get the optimized form. It also checks URL integrity. */ u = newurl (); if (parseurl (cur_url->url, u, 0) != URLOK) { DEBUGP (("Yuck! A bad URL.\n")); freeurl (u, 1); continue; } if (u->proto == URLFILE) { DEBUGP (("Nothing to do with file:// around here.\n")); freeurl (u, 1); continue; } assert (u->url != NULL); constr = xstrdup (u->url); /* Several checkings whether a file is acceptable to load: 1. check if URL is ftp, and we don't load it 2. 
check for relative links (if relative_only is set) 3. check for domain 4. check for no-parent 5. check for excludes && includes 6. check for suffix 7. check for same host (if spanhost is unset), with possible gethostbyname baggage 8. check for robots.txt Addendum: If the URL is FTP, and it is to be loaded, only the domain and suffix settings are "stronger". Note that .html and (yuck) .htm will get loaded regardless of suffix rules (but that is remedied later with unlink) unless the depth equals the maximum depth. More time- and memory- consuming tests should be put later on the list. */ /* inl is set if the URL we are working on (constr) is stored in undesirable_urls. Using it is crucial to avoid unnecessary repeated continuous hits to the hash table. */ inl = string_set_contains (undesirable_urls, constr); /* If it is FTP, and FTP is not followed, chuck it out. */ if (!inl) if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp) { DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); string_set_add (undesirable_urls, constr); inl = 1; } /* If it is absolute link and they are not followed, chuck it out. */ if (!inl && u->proto != URLFTP) if (opt.relative_only && !cur_url->link_relative_p) { DEBUGP (("It doesn't really look like a relative link.\n")); string_set_add (undesirable_urls, constr); inl = 1; } /* If its domain is not to be accepted/looked-up, chuck it out. */ if (!inl) if (!accept_domain (u)) { DEBUGP (("I don't like the smell of that domain.\n")); string_set_add (undesirable_urls, constr); inl = 1; } /* Check for parent directory. */ if (!inl && opt.no_parent /* If the new URL is FTP and the old was not, ignore opt.no_parent. */ && !(!this_url_ftp && u->proto == URLFTP)) { /* Check for base_dir first. */ if (!(base_dir && frontcmp (base_dir, u->dir))) { /* Failing that, check for parent dir. */ struct urlinfo *ut = newurl (); if (parseurl (this_url, ut, 0) != URLOK) DEBUGP (("Double yuck! 
The *base* URL is broken.\n")); else if (!frontcmp (ut->dir, u->dir)) { /* Failing that too, kill the URL. */ DEBUGP (("Trying to escape parental guidance with no_parent on.\n")); string_set_add (undesirable_urls, constr); inl = 1; } freeurl (ut, 1); } } /* If the file does not match the acceptance list, or is on the rejection list, chuck it out. The same goes for the directory exclude- and include- lists. */ if (!inl && (opt.includes || opt.excludes)) { if (!accdir (u->dir, ALLABS)) { DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir)); string_set_add (undesirable_urls, constr); inl = 1; } } if (!inl) { char *suf = NULL; /* We check for acceptance/rejection rules only for non-HTML documents. Since we don't know whether they really are HTML, it will be deduced from (an OR-ed list): 1) u->file is "" (meaning it is a directory) 2) suffix exists, AND: a) it is "html", OR b) it is "htm" If the file *is* supposed to be HTML, it will *not* be subject to acc/rej rules, unless a finite maximum depth has been specified and the current depth is the maximum depth. */ if (! (!*u->file || (((suf = suffix (constr)) != NULL) && ((!strcmp (suf, "html") || !strcmp (suf, "htm")) && ((opt.reclevel != INFINITE_RECURSION) && (depth != opt.reclevel)))))) { if (!acceptable (u->file)) { DEBUGP (("%s (%s) does not match acc/rej rules.\n", constr, u->file)); string_set_add (undesirable_urls, constr); inl = 1; } } FREE_MAYBE (suf); } /* Optimize the URL (which includes possible DNS lookup) only after all other possibilities have been exhausted. */ if (!inl) { if (!opt.simple_check) opt_url (u); else { char *p; /* Just lowercase the hostname. 
*/ for (p = u->host; *p; p++) *p = TOLOWER (*p); xfree (u->url); u->url = str_url (u, 0); } xfree (constr); constr = xstrdup (u->url); string_set_add (undesirable_urls, constr); if (!inl && !((u->proto == URLFTP) && !this_url_ftp)) if (!opt.spanhost && this_url && !same_host (this_url, constr)) { DEBUGP (("This is not the same hostname as the parent's.\n")); string_set_add (undesirable_urls, constr); inl = 1; } } /* What about robots.txt? */ if (!inl && opt.use_robots && u->proto == URLHTTP) { /* Since Wget knows about only one set of robot rules at a time, /robots.txt must be reloaded whenever a new host is accessed. robots_host holds the host the current `forbid' variable is assigned to. */ if (!robots_host || !same_host (robots_host, u->host)) { FREE_MAYBE (robots_host); /* Now make robots_host the new host, no matter what the result will be. So if there is no /robots.txt on the site, Wget will not retry getting robots all the time. */ robots_host = xstrdup (u->host); free_vec (forbidden); forbidden = NULL; err = retrieve_robots (constr, ROBOTS_FILENAME); if (err == ROBOTSOK) { rurl = robots_url (constr, ROBOTS_FILENAME); rfile = url_filename (rurl); forbidden = parse_robots (rfile); freeurl (rurl, 1); xfree (rfile); } } /* Now that we have (or don't have) robots, we can check for them. */ if (!robots_match (u, forbidden)) { DEBUGP (("Stuffing %s because %s forbids it.\n", this_url, ROBOTS_FILENAME)); string_set_add (undesirable_urls, constr); inl = 1; } } filename = NULL; /* If it wasn't chucked out, do something with it. */ if (!inl) { DEBUGP (("I've decided to load it -> ")); /* Add it to the list of already-loaded URL-s. */ string_set_add (undesirable_urls, constr); /* Automatically followed FTPs will *not* be downloaded recursively. */ if (u->proto == URLFTP) { /* Don't you adore side-effects? */ opt.recursive = 0; } /* Reset its type. */ dt = 0; /* Retrieve it. */ retrieve_url (constr, &filename, &newloc, canon_this_url ? 
canon_this_url : this_url, &dt); if (u->proto == URLFTP) { /* Restore... */ opt.recursive = 1; } if (newloc) { xfree (constr); constr = newloc; } /* If there was no error, and the type is text/html, parse it recursively. */ if (dt & TEXTHTML) { if (dt & RETROKF) recursive_retrieve (filename, constr); } else DEBUGP (("%s is not text/html so we don't chase.\n", filename ? filename: "(null)")); if (opt.delete_after || (filename && !acceptable (filename))) /* Either --delete-after was specified, or we loaded this otherwise rejected (e.g. by -R) HTML file just so we could harvest its hyperlinks -- in either case, delete the local file. */ { DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : "recursive rejection criteria")); logprintf (LOG_VERBOSE, (opt.delete_after ? _("Removing %s.\n") : _("Removing %s since it should be rejected.\n")), filename); if (unlink (filename)) logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); dt &= ~RETROKF; } /* If everything was OK, and links are to be converted, let's store the local filename. */ if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) { cur_url->convert = CO_CONVERT_TO_RELATIVE; cur_url->local_name = xstrdup (filename); } } else DEBUGP (("%s already in list, so we don't load.\n", constr)); /* Free filename and constr. */ FREE_MAYBE (filename); FREE_MAYBE (constr); freeurl (u, 1); /* Increment the pbuf for the appropriate size. */ } if (opt.convert_links && !opt.delete_after) /* This is merely the first pass: the links that have been successfully downloaded are converted. In the second pass, convert_all_links() will also convert those links that have NOT been downloaded to their canonical form. */ convert_links (file, url_list); /* Free the linked list of URL-s. */ free_urlpos (url_list); /* Free the canonical this_url. */ FREE_MAYBE (canon_this_url); /* Decrement the recursion depth. 
*/ --depth; if (downloaded_exceeds_quota ()) return QUOTEXC; else return RETROK; }
/* Simple calls to convert_links will often fail because only the downloaded files are converted, and Wget cannot know which files will be converted in the future. So, if we have file fileone.html with: <a href=/c/something.gif> and /c/something.gif was not downloaded because it exceeded the recursion depth, the reference will *not* be changed. However, later we can encounter /c/something.gif from an "upper" level HTML (let's call it filetwo.html), and it gets downloaded. But now we have a problem because /c/something.gif will be correctly transformed in filetwo.html, but not in fileone.html, since Wget could not have known that /c/something.gif will be downloaded in the future. This is why Wget must, after the whole retrieval, call convert_all_links to go once more through the entire list of retrieved HTML-s, and re-convert them. All the downloaded HTMLs are kept in urls_html, and downloaded URLs in urls_downloaded. From these two lists information is extracted. */ void convert_all_links (void) { uerr_t res; urlpos *l1, *l2, *urls; struct urlinfo *u; slist *html; urlpos *urlhtml; for (html = urls_html; html; html = html->next) { DEBUGP (("Rescanning %s\n", html->string)); /* Determine the URL of the HTML file. get_urls_html will need it. */ for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next) if (!strcmp (urlhtml->local_name, html->string)) break; if (urlhtml) DEBUGP (("It should correspond to %s.\n", urlhtml->url)); else DEBUGP (("I cannot find the corresponding URL.\n")); /* Parse the HTML file... */ urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1); if (!urls) continue; for (l1 = urls; l1; l1 = l1->next) { /* The URL must be in canonical form to be compared. */ u = newurl (); res = parseurl (l1->url, u, 0); if (res != URLOK) { freeurl (u, 1); continue; } /* We decide the direction of conversion according to whether a URL was downloaded. Downloaded URLs will be converted ABS2REL, whereas non-downloaded will be converted REL2ABS. 
Note: not yet implemented; only ABS2REL works. */ for (l2 = urls_downloaded; l2; l2 = l2->next) if (!strcmp (l2->url, u->url)) { DEBUGP (("%s flagged for conversion, local %s\n", l2->url, l2->local_name)); break; } /* Clear the flags. */ l1->flags &= ~ (UABS2REL | UREL2ABS); /* Decide on the conversion direction. */ if (l2) { l1->flags |= UABS2REL; l1->local_name = xstrdup (l2->local_name); } else { l1->flags |= UREL2ABS; l1->local_name = NULL; } freeurl (u, 1); } /* Convert the links in the file. */ convert_links (html->string, urls); /* Free the data. */ free_urlpos (urls); } }
/* The core of recursive retrieving. Endless recursion is avoided by having all URL-s stored to a linked list of URL-s, which is checked before loading any URL. That way no URL can get loaded twice. The function also supports specification of maximum recursion depth and a number of other goodies. */ uerr_t recursive_retrieve (const char *file, const char *this_url) { char *constr, *filename, *newloc; char *canon_this_url = NULL; int dt, inl; int this_url_ftp; /* See below the explanation */ uerr_t err; struct urlinfo *rurl; urlpos *url_list, *cur_url; char *rfile; /* For robots */ struct urlinfo *u; assert (this_url != NULL); assert (file != NULL); /* If quota was exceeded earlier, bail out. */ if (opt.quota && (opt.downloaded > opt.quota)) return QUOTEXC; /* Cache the current URL in the list. */ if (first_time) { ulist = add_slist (ulist, this_url, 0); urls_downloaded = NULL; urls_html = NULL; /* Enter this_url to the slist, in original and "enhanced" form. */ u = newurl (); err = parseurl (this_url, u, 0); if (err == URLOK) { ulist = add_slist (ulist, u->url, 0); urls_downloaded = add_url (urls_downloaded, u->url, file); urls_html = add_slist (urls_html, file, NOSORT); if (opt.no_parent) base_dir = xstrdup (u->dir); /* Set the base dir. */ /* Set the canonical this_url to be sent as referer. This problem exists only when running the first time. */ canon_this_url = xstrdup (u->url); } else { DEBUGP (("Double yuck! The *base* URL is broken.\n")); base_dir = NULL; } freeurl (u, 1); depth = 1; robots_host = NULL; forbidden = NULL; first_time = 0; } else ++depth; /* Bail out if opt.reclevel is exceeded. */ if ((opt.reclevel != 0) && (depth > opt.reclevel)) { DEBUGP (("Recursion depth %d exceeded max. depth %d.\n", depth, opt.reclevel)); --depth; return RECLEVELEXC; } /* Determine whether this_url is an FTP URL. If it is, it means that the retrieval is done through proxy. 
In that case, FTP links will be followed by default and recursion will not be turned off when following them. */ this_url_ftp = (urlproto (this_url) == URLFTP); /* Get the URL-s from an HTML file: */ url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, 0); /* Decide what to do with each of the URLs. A URL will be loaded if it meets several requirements, discussed later. */ for (cur_url = url_list; cur_url; cur_url = cur_url->next) { /* If quota was exceeded earlier, bail out. */ if (opt.quota && (opt.downloaded > opt.quota)) break; /* Parse the URL for convenient use in other functions, as well as to get the optimized form. It also checks URL integrity. */ u = newurl (); if (parseurl (cur_url->url, u, 0) != URLOK) { DEBUGP (("Yuck! A bad URL.\n")); freeurl (u, 1); continue; } if (u->proto == URLFILE) { DEBUGP (("Nothing to do with file:// around here.\n")); freeurl (u, 1); continue; } assert (u->url != NULL); constr = xstrdup (u->url); /* Several checkings whether a file is acceptable to load: 1. check if URL is ftp, and we don't load it 2. check for relative links (if relative_only is set) 3. check for domain 4. check for no-parent 5. check for excludes && includes 6. check for suffix 7. check for same host (if spanhost is unset), with possible gethostbyname baggage 8. check for robots.txt Addendum: If the URL is FTP, and it is to be loaded, only the domain and suffix settings are "stronger". Note that .html and (yuck) .htm will get loaded regardless of suffix rules (but that is remedied later with unlink). More time- and memory- consuming tests should be put later on the list. */ /* inl is set if the URL we are working on (constr) is stored in ulist. Using it is crucial to avoid the incessant calls to in_slist, which is quite slow. */ inl = in_slist (ulist, constr); /* If it is FTP, and FTP is not followed, chuck it out. 
*/ if (!inl) if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp) { DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); ulist = add_slist (ulist, constr, 0); inl = 1; } /* If it is absolute link and they are not followed, chuck it out. */ if (!inl && u->proto != URLFTP) if (opt.relative_only && !(cur_url->flags & URELATIVE)) { DEBUGP (("It doesn't really look like a relative link.\n")); ulist = add_slist (ulist, constr, 0); inl = 1; } /* If its domain is not to be accepted/looked-up, chuck it out. */ if (!inl) if (!accept_domain (u)) { DEBUGP (("I don't like the smell of that domain.\n")); ulist = add_slist (ulist, constr, 0); inl = 1; } /* Check for parent directory. */ if (!inl && opt.no_parent /* If the new URL is FTP and the old was not, ignore opt.no_parent. */ && !(!this_url_ftp && u->proto == URLFTP)) { /* Check for base_dir first. */ if (!(base_dir && frontcmp (base_dir, u->dir))) { /* Failing that, check for parent dir. */ struct urlinfo *ut = newurl (); if (parseurl (this_url, ut, 0) != URLOK) DEBUGP (("Double yuck! The *base* URL is broken.\n")); else if (!frontcmp (ut->dir, u->dir)) { /* Failing that too, kill the URL. */ DEBUGP (("Trying to escape parental guidance with no_parent on.\n")); ulist = add_slist (ulist, constr, 0); inl = 1; } freeurl (ut, 1); } } /* If the file does not match the acceptance list, or is on the rejection list, chuck it out. The same goes for the directory exclude- and include- lists. */ if (!inl && (opt.includes || opt.excludes)) { if (!accdir (u->dir, ALLABS)) { DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir)); ulist = add_slist (ulist, constr, 0); inl = 1; } } if (!inl) { char *suf = NULL; /* We check for acceptance/rejection rules only for non-HTML documents. 
Since we don't know whether they really are HTML, it will be deduced from (an OR-ed list): 1) u->file is "" (meaning it is a directory) 2) suffix exists, AND: a) it is "html", OR b) it is "htm" If the file *is* supposed to be HTML, it will *not* be subject to acc/rej rules. That's why the `!'. */ if (! (!*u->file || (((suf = suffix (constr)) != NULL) && (!strcmp (suf, "html") || !strcmp (suf, "htm"))))) { if (!acceptable (u->file)) { DEBUGP (("%s (%s) does not match acc/rej rules.\n", constr, u->file)); ulist = add_slist (ulist, constr, 0); inl = 1; } } FREE_MAYBE (suf); } /* Optimize the URL (which includes possible DNS lookup) only after all other possibilities have been exhausted. */ if (!inl) { if (!opt.simple_check) opt_url (u); else { char *p; /* Just lowercase the hostname. */ for (p = u->host; *p; p++) *p = tolower (*p); free (u->url); u->url = str_url (u, 0); } free (constr); constr = xstrdup (u->url); inl = in_slist (ulist, constr); if (!inl && !((u->proto == URLFTP) && !this_url_ftp)) if (!opt.spanhost && this_url && !same_host (this_url, constr)) { DEBUGP (("This is not the same hostname as the parent's.\n")); ulist = add_slist (ulist, constr, 0); inl = 1; } } /* What about robots.txt? */ if (!inl && opt.use_robots && u->proto == URLHTTP) { /* Since Wget knows about only one set of robot rules at a time, /robots.txt must be reloaded whenever a new host is accessed. robots_host holds the host the current `forbid' variable is assigned to. */ if (!robots_host || !same_host (robots_host, u->host)) { FREE_MAYBE (robots_host); /* Now make robots_host the new host, no matter what the result will be. So if there is no /robots.txt on the site, Wget will not retry getting robots all the time. 
*/ robots_host = xstrdup (u->host); free_vec (forbidden); forbidden = NULL; err = retrieve_robots (constr, ROBOTS_FILENAME); if (err == ROBOTSOK) { rurl = robots_url (constr, ROBOTS_FILENAME); rfile = url_filename (rurl); forbidden = parse_robots (rfile); freeurl (rurl, 1); free (rfile); } } /* Now that we have (or don't have) robots, we can check for them. */ if (!robots_match (u, forbidden)) { DEBUGP (("Stuffing %s because %s forbids it.\n", this_url, ROBOTS_FILENAME)); ulist = add_slist (ulist, constr, 0); inl = 1; } } filename = NULL; /* If it wasn't chucked out, do something with it. */ if (!inl) { DEBUGP (("I've decided to load it -> ")); /* Add it to the list of already-loaded URL-s. */ ulist = add_slist (ulist, constr, 0); /* Automatically followed FTPs will *not* be downloaded recursively. */ if (u->proto == URLFTP) { /* Don't you adore side-effects? */ opt.recursive = 0; } /* Reset its type. */ dt = 0; /* Retrieve it. */ retrieve_url (constr, &filename, &newloc, canon_this_url ? canon_this_url : this_url, &dt); if (u->proto == URLFTP) { /* Restore... */ opt.recursive = 1; } if (newloc) { free (constr); constr = newloc; } /* In case of convert_links: If there was no error, add it to the list of downloaded URLs. We might need it for conversion. */ if (opt.convert_links && filename) { if (dt & RETROKF) { urls_downloaded = add_url (urls_downloaded, constr, filename); /* If the URL is HTML, note it. */ if (dt & TEXTHTML) urls_html = add_slist (urls_html, filename, NOSORT); } } /* If there was no error, and the type is text/html, parse it recursively. */ if (dt & TEXTHTML) { if (dt & RETROKF) recursive_retrieve (filename, constr); } else DEBUGP (("%s is not text/html so we don't chase.\n", filename ? filename: "(null)")); /* If an suffix-rejected file was loaded only because it was HTML, undo the error now */ if (opt.delete_after || (filename && !acceptable (filename))) { logprintf (LOG_VERBOSE, (opt.delete_after ? 
_("Removing %s.\n") : _("Removing %s since it should be rejected.\n")), filename); if (unlink (filename)) logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); dt &= ~RETROKF; } /* If everything was OK, and links are to be converted, let's store the local filename. */ if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) { cur_url->flags |= UABS2REL; cur_url->local_name = xstrdup (filename); } } DEBUGP (("%s already in list, so we don't load.\n", constr)); /* Free filename and constr. */ FREE_MAYBE (filename); FREE_MAYBE (constr); freeurl (u, 1); /* Increment the pbuf for the appropriate size. */ } if (opt.convert_links) convert_links (file, url_list); /* Free the linked list of URL-s. */ free_urlpos (url_list); /* Free the canonical this_url. */ FREE_MAYBE (canon_this_url); /* Decrement the recursion depth. */ --depth; if (opt.quota && (opt.downloaded > opt.quota)) return QUOTEXC; else return RETROK; }
/* Retrieve the given URL. Decides which loop to call -- HTTP, FTP, or simply copy it with file:// (#### the latter not yet implemented!). */ uerr_t retrieve_url (const char *origurl, char **file, char **newloc, const char *refurl, int *dt) { uerr_t result; char *url; int location_changed, already_redirected, dummy; int local_use_proxy; char *mynewloc, *proxy; struct urlinfo *u; /* If dt is NULL, just ignore it. */ if (!dt) dt = &dummy; url = xstrdup (origurl); if (newloc) *newloc = NULL; if (file) *file = NULL; already_redirected = 0; again: u = newurl (); /* Parse the URL. RFC2068 requires `Location' to contain an absoluteURI, but many sites break this requirement. #### We should be liberal and accept a relative location, too. */ result = parseurl (url, u, already_redirected); if (result != URLOK) { freeurl (u, 1); logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result)); return result; } /* Set the referer. */ if (refurl) u->referer = xstrdup (refurl); else u->referer = NULL; local_use_proxy = USE_PROXY_P (u); if (local_use_proxy) { struct urlinfo *pu = newurl (); /* Copy the original URL to new location. */ memcpy (pu, u, sizeof (*u)); pu->proxy = NULL; /* A minor correction :) */ /* Initialize u to nil. */ memset (u, 0, sizeof (*u)); u->proxy = pu; /* Get the appropriate proxy server, appropriate for the current protocol. */ proxy = getproxy (pu->proto); if (!proxy) { logputs (LOG_NOTQUIET, _("Could not find proxy host.\n")); freeurl (u, 1); return PROXERR; } /* Parse the proxy URL. */ result = parseurl (proxy, u, 0); if (result != URLOK || u->proto != URLHTTP) { if (u->proto == URLHTTP) logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg (result)); else logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy); freeurl (u, 1); return PROXERR; } u->proto = URLHTTP; } assert (u->proto != URLFILE); /* #### Implement me! 
*/ mynewloc = NULL; if (u->proto == URLHTTP) result = http_loop (u, &mynewloc, dt); else if (u->proto == URLFTP) { /* If this is a redirection, we must not allow recursive FTP retrieval, so we save recursion to oldrec, and restore it later. */ int oldrec = opt.recursive; if (already_redirected) opt.recursive = 0; result = ftp_loop (u, dt); opt.recursive = oldrec; /* There is a possibility of having HTTP being redirected to FTP. In these cases we must decide whether the text is HTML according to the suffix. The HTML suffixes are `.html' and `.htm', case-insensitive. #### All of this is, of course, crap. These types should be determined through mailcap. */ if (already_redirected && u->local && (u->proto == URLFTP )) { char *suf = suffix (u->local); if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm"))) *dt |= TEXTHTML; FREE_MAYBE (suf); } } location_changed = (result == NEWLOCATION); if (location_changed) { /* Check for redirection to oneself. */ if (url_equal (url, mynewloc)) { logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"), mynewloc); return WRONGCODE; } if (mynewloc) { free (url); url = mynewloc; } freeurl (u, 1); already_redirected = 1; goto again; } if (file) { if (u->local) *file = xstrdup (u->local); else *file = NULL; } freeurl (u, 1); if (newloc) *newloc = url; else free (url); return result; }
// fix bugs in certain web servers that don't obey standards. // actually, it's us that don't obey standards - HTTP RFC says header names // are case-insensitive. - Anonymous SF Poster, 2006-02-23 void HTTPHeader::checkheader(bool allowpersistent) { // are these headers outgoing, or incoming? bool outgoing = true; if (header.front().startsWith("HT")) outgoing = false; bool first = true; for (std::deque<String>::iterator i = header.begin(); i != header.end(); i++) { // check each line in the headers // HTTP 1.1 is persistent by default if (first) { if (i->after("HTTP/").startsWith("1.1")) { #ifdef DGDEBUG std::cout << "CheckHeader: HTTP/1.1, so assuming persistency" << std::endl; #endif waspersistent = true; ispersistent = true; } // Do not allow persistent connections on CONNECT requests - the browser thinks it has a tunnel // directly to the external server, not a connection to the proxy, so it won't be re-used in the // manner expected by DG and will result in waiting for time-outs. Bug identified by Jason Deasi. if ((*i)[0] == 'C') { #ifdef DGDEBUG std::cout << "CheckHeader: CONNECT request; disallowing persistency" << std::endl; #endif allowpersistent = false; } first = false; // force HTTP/1.0 - we don't support chunked transfer encoding, possibly amongst other things if (outgoing) (*i) = i->before(" HTTP/") + " HTTP/1.0\r"; } // index headers - try to perform the checks in the order the average browser sends the headers. // also only do the necessary checks for the header type (sent/received). 
else if (outgoing && (phost == NULL) && i->startsWithLower("host:")) { phost = &(*i); } // don't allow through multiple host headers else if (outgoing && (phost != NULL) && i->startsWithLower("host:")) { i->assign("X-DG-IgnoreMe: removed multiple host headers\r"); } else if (outgoing && (puseragent == NULL) && i->startsWithLower("user-agent:")) { puseragent = &(*i); } else if (outgoing && i->startsWithLower("accept-encoding:")) { (*i) = "Accept-Encoding:" + i->after(":"); (*i) = modifyEncodings(*i) + "\r"; } else if ((pcontenttype == NULL) && i->startsWithLower("content-type:")) { pcontenttype = &(*i); } else if ((pcontentlength == NULL) && i->startsWithLower("content-length:")) { pcontentlength = &(*i); } // is this ever sent outgoing? else if ((pcontentdisposition == NULL) && i->startsWithLower("content-disposition:")) { pcontentdisposition = &(*i); } else if ((!outgoing) && (pcontentencoding == NULL) && i->startsWithLower("content-encoding:")) { pcontentencoding = &(*i); } else if ((pproxyauthorization == NULL) && i->startsWithLower("proxy-authorization:")) { pproxyauthorization = &(*i); } else if ((pproxyconnection == NULL) && (i->startsWithLower("proxy-connection:") || i->startsWithLower("connection:"))) { #ifdef DGDEBUG std::cout << "CheckHeader: Found Proxy-Connection" << std::endl; #endif if (i->contains("live")) { #ifdef DGDEBUG std::cout << "CheckHeader: P-C says keep-alive" << std::endl; #endif waspersistent = true; if (!allowpersistent) { #ifdef DGDEBUG std::cout << "CheckHeader: ... 
but we aren't allowed to" << std::endl; #endif ispersistent = false; (*i) = i->before(":") + ": Close\r"; } else { ispersistent = true; } } else { #ifdef DGDEBUG std::cout << "CheckHeader: P-C says close" << std::endl; #endif ispersistent = false; waspersistent = false; } pproxyconnection = &(*i); } else if (outgoing && (pxforwardedfor == NULL) && i->startsWithLower("x-forwarded-for:")) { pxforwardedfor = &(*i); } // this one's non-standard, so check for it last else if (outgoing && (pport = NULL) && i->startsWithLower("port:")) { pport = &(*i); } #ifdef DGDEBUG std::cout << (*i) << std::endl; #endif } #ifdef DGDEBUG std::cout << "CheckHeader flags: AP=" << allowpersistent << " IP=" << ispersistent << " PPC=" << !(pproxyconnection == NULL) << std::endl; #endif // if a request was HTTP 1.1 and there was no proxy-connection header, we may need to add one if ((!allowpersistent) && ispersistent) { // we should only be in this state if pproxyconnection == NULL (otherwise ispersistent will have been falsified earlier) #ifdef DGDEBUG std::cout << "CheckHeader: Adding our own Proxy-Connection: Close" << std::endl; #endif header.push_back("Proxy-Connection: Close\r"); pproxyconnection = &(header.back()); ispersistent = false; } else if (allowpersistent && ispersistent && (pproxyconnection == NULL)) { #ifdef DGDEBUG std::cout << "CheckHeader: Adding our own Proxy-Connection: Keep-Alive" << std::endl; #endif // we should only be in this state if HTTP 1.1, persistency allowed, but persistency not explicitly asked for header.push_back("Proxy-Connection: Keep-Alive\r"); pproxyconnection = &(header.back()); } // Normalise request headers (fix host, port, first line of header, etc. to all be consistent) if (outgoing) { String newurl(url(true)); setURL(newurl); } }