C++ (Cpp) parseurl Examples

Example #1

0

Show file

File: perlvdbfunc.c Project: ZRouter/ZRouter

/*
 * Initialize database module
 * No function should be called before this
 */
db_con_t* perlvdb_db_init(const str* url) {
	db_con_t* res;
	str *cn;
	SV *obj = NULL;
	int consize = sizeof(db_con_t) + sizeof(SV);

	if (!url || !url->s | !url->len) {
		LM_ERR("invalid parameter value\n");
		return NULL;
	}

	cn = parseurl(url);
	if (!cn) {
		LM_ERR("invalid perl vdb url.\n");
		return NULL;
	}

	obj = newvdbobj(cn);
	if (!checkobj(obj)) {
		LM_ERR("could not initialize module. Not inheriting from %s?\n",
				PERL_VDB_BASECLASS);
		return NULL;
	}

	res = pkg_malloc(consize);
	if (!res) {
		LM_ERR("no pkg memory left\n");
		return NULL;
	}
	memset(res, 0, consize);
	CON_TAIL(res) = (unsigned int)(unsigned long)obj;

	return res;
}

Example #2

0

Show file

File: rumpsp_sock.c Project: gandro/rumpfiber-sp

static int
rumpsp_init_server(const char *url, struct rumpsp_handlers hndlrs)
{
	struct sockaddr *sap;
	unsigned int i;
	int err, sockfd, flags;

	err = parseurl(url, &sap, &protoidx, 1);

	if (err)
		return err;
	
	sockfd = socket(parsetab[protoidx].domain, SOCK_STREAM, 0);
	if (sockfd == -1) {
		return errno;
	}

	if (bind(sockfd, sap, parsetab[protoidx].slen) == -1) {
		fprintf(stderr, "rump_sp: failed to bind to URL %s\n", url);
		close(sockfd);
		return errno;
	}

	if (listen(sockfd, MAXFDS) == -1) {
		fprintf(stderr, "rump_sp: server listen failed\n");
		close(sockfd);
		return errno;
	}

	/* make sure accept() does not block */
	flags = fcntl(sockfd, F_GETFL, 0);
	if (fcntl(sockfd, F_SETFL, flags | O_NONBLOCK) == -1) {
		close(sockfd);
		return errno;
	}

	protosa = sap;
	handlers = hndlrs;

	for (i = 0; i < MAXFDS; i++) {
		chanfds[i].fd = -1;
	}

	chanfds[0].fd = sockfd;
	pollfds[0].fd = sockfd;
	pollfds[0].events = POLLIN;
	maxidx = 0;

	return 0;
}

Example #3

0

Show file

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
{
  struct urlinfo *u = newurl ();
  uerr_t err;

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  xfree (u->file);
  xfree (u->dir);
  xfree (u->url);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
  return u;
}

Example #4

0

Show file

File: main.c Project: WongTai/snippets

struct ProxyStruct parseuri(char *uri)
{
	struct ProxyStruct ret;
	char **atmp;
	char tmpfiletmp[64];
	struct growstring grow;
	int i;

	/* Verify we got something */
	assert(uri);

	/* Initialize the growstring */
	grow.size=1024*sizeof(char);
	grow.string=calloc(sizeof(char), grow.size);

	/* Copy the uri */
	ret.request_uri=strdup(uri);

	/* we prepend each part with a / in the for loop */
	str_append(&grow, "http:/");
	atmp=split('/', uri);
	for(i=3; atmp[i]!=NULL; i++) {
		str_append(&grow, "/");
		str_append(&grow, atmp[i]);
	}

	/* Get the URL stuff */
	ret.request_url=strdup(grow.string);
	ret.url=parseurl(ret.request_url);

	/* Get the file */
	grow.string[0]=0x00;
	str_append(&grow, GETENV("DOCUMENT_ROOT"));
	str_append(&grow, "/");
	str_append(&grow, ret.request_uri);

	ret.file=strdup(grow.string);

	sprintf(tmpfiletmp, ".tmp.%d", getpid());
	str_append(&grow, tmpfiletmp);
	ret.tmpfile=grow.string;

	freeptrlist(atmp);

	return(ret);
}

Example #5

0

Show file

File: server2.c Project: quasi/shakey

void service(char *buffer, int clientsocket)
{
  FILE *fp;
  char *url, *protocol, *parsedurl;
  int redirect,i;

  url=getNextparameter(buffer);
  protocol=getNextparameter(url);

  redirect = parseurl(url,&parsedurl);
  if(redirect)
    {
      senddata(url,clientsocket,redirect);
      printf("\nURL:%s\n",url);
    }
  else
    {
      senddata(parsedurl,clientsocket,redirect);
      printf("\nURL:%s\n",parsedurl);
    }
}

Example #6

0

Show file

File: main.c Project: egustafson/sandbox

int main() {

    char  url_string[] = "ftp://*****:*****@host:123/path/elem";
    url_t url_struct;

    printf("----------------------------------------\n");
    printf("url:     %s\n", url_string);

    parseurl(&url_struct, url_string);

    printf("----------------------------------------\n");
    printf("type:    %s\n", url_struct.type);
    printf("user:    %s\n", url_struct.user);
    printf("pass:    %s\n", url_struct.pass);
    printf("server:  %s\n", url_struct.server);
    printf("port:    %s\n", url_struct.port);
    printf("path:    %s\n", url_struct.file);
    printf("----------------------------------------\n");

    return 0;
}

Example #7

0

Show file

File: scep.c Project: xman1979/openscep

int	main(int argc, char *argv[]) {
	int		c, poll = 0, reqversion = 0, rc = -1;
	char		*cacertfile = NULL, *keyfile = NULL, *challenge = NULL,
			*savedrequestfile = NULL, *requestfile = NULL,
			*dn = NULL, *spkacfile = NULL, *endrequest = NULL;
	scep_t		scep;
	BIO		*repbio;
	char		*url = "http://localhost/cgi-bin";
	scepmsg_t	*msg;
	unsigned char	*checkNonce = NULL;

	/* initialize what you can					*/
	scepinit();
	scep_clear(&scep);

	/* we are a client 						*/
	scep.client = 1;

	/* parse command line						*/
	while (EOF != (c = getopt(argc, argv, "dc:e:r:s:k:w:pu:2a:q:")))
		switch (c) {
		case 'd':
			debug++;
			break;
		case 'e':
			endrequest = optarg;
			break;
		case 'c':
			cacertfile = optarg;
			break;
		case 's':
			savedrequestfile = optarg;
		case 'r':
			/* the request file will also contain the self	*/
			/* signed certificate				*/
			requestfile = optarg;
			break;
		case 'k':
			keyfile = optarg;
			break;
		case 'w':
			challenge = optarg;
			break;
		case 'p':
			poll = 1;
			break;
		case 'q':
			scep.community = optarg;
			break;
		case 'u':
			url = optarg;
			break;
		case '2':
			reqversion = 1;
			break;
		case 'a':
			spkacfile = optarg;
			break;
		}

	/* stop immediately if request or key is missing		*/
	/* (even in the case of a version 2 proxied request, we need	*/
	/* a request as the carrier of the proxy entities public key)	*/
	if (keyfile == NULL) {
		BIO_printf(bio_err, "%s:%d: key file is required argument\n",
			__FILE__, __LINE__);
		goto err;
	}
	if (requestfile == NULL) {
		BIO_printf(bio_err, "%s:%d: request file is required "
			"argument\n", __FILE__, __LINE__);
		goto err;
	}

	/* we are preparing the request message				*/
	msg = &scep.request;

	/* decode the URL						*/
	if (parseurl(&scep, url) < 0) {
		BIO_printf(bio_err, "%s:%d: cannot parse url\n", __FILE__,
			__LINE__);
		goto err;
	}
	if (debug)
		BIO_printf(bio_err, "%s:%d: decoded URL %s|%d|%s\n", __FILE__,
			__LINE__, scep.h.httphost, scep.h.httpport,
			scep.h.httppath);

	/* read the client key and request information			*/
	if (read_clientstuff(&scep, requestfile, keyfile) < 0) {
		BIO_printf(bio_err, "%s:%d: failed to read client stuff\n",
			__FILE__, __LINE__);
		goto err;
	}

	/* now we have to decide about the payload we want to have	*/
	/* with our scep request:					*/
	/*  - for a version 1 request, this will always be the original	*/
	/*    certificate signing request				*/
	/*  - for a version 2 request, it will be a payload structure	*/
	switch (reqversion) {
	case 0:
		/* for a version 1 client, client pubkey and client req	*/
		/* coincide						*/
		scep.requestorpubkey = scep.clientpubkey;
		scep.requestorreq = scep.clientreq;
		if (debug)
			BIO_printf(bio_err, "%s:%d: end request coincides "
				"with SCEP client\n", __FILE__, __LINE__);
		break;
	case 1:
		msg->rd.payload = payload_new();
		rc = -1;
		if (spkacfile) {
			if (debug)
				BIO_printf(bio_err, "%s:%d: reading spki "
					"from %s\n", __FILE__, __LINE__,
					spkacfile);
			rc = read_requestorstuff(&scep, 1, spkacfile);
		} else if (endrequest) {
			if (debug)
				BIO_printf(bio_err, "%s:%d: reading X509 req "
					"from %s\n", __FILE__, __LINE__,
					endrequest);
			rc = read_requestorstuff(&scep, 0, endrequest);
		}
		if (rc < 0) {
			BIO_printf(bio_err, "%s:%d: could not read end "
				"request data\n", __FILE__, __LINE__);
			goto err;
		}
		if (debug)
			BIO_printf(bio_err, "%s:%d: end request read\n",
				__FILE__, __LINE__);
		break;
	}

	/* set the transaction id value					*/
	scep.transId = key_fingerprint(scep.requestorpubkey);
	if (debug)
		BIO_printf(bio_err, "%s:%d: transaction ID is %s\n",
			__FILE__, __LINE__, scep.transId);

	/* read the CA certificate file					*/
	if (read_castuff(&scep, cacertfile) < 0) {
		BIO_printf(bio_err, "%s:%d: read CA certificate info\n",
			__FILE__, __LINE__);
	}
	if (debug)
		BIO_printf(bio_err, "%s:%d: CA certificate read\n",
			__FILE__, __LINE__);

	/* for SPKI requests, there should be exactly one more argument	*/
	/* namely the distinguished name				*/
	if (spkacfile) {
		if ((argc - optind) != 1) {
			BIO_printf(bio_err, "%s:%d: DN argument needed\n",
				__FILE__, __LINE__);
			goto err;
		}
		dn = argv[optind];
		if (debug)
			BIO_printf(bio_err, "%s:%d: DN argument is '%s'\n",
				__FILE__, __LINE__, dn);

		/* convert the DN into attributes and add them to the	*/
		/* payload						*/
		if (payload_dn_to_attrs(msg->rd.payload, dn) < 0) {
			BIO_printf(bio_err, "%s:%d: failed to add DN attrs\n",
				__FILE__, __LINE__);
			goto err;
		}
	}

	/* skip creation of a request message when polling		*/
	if (poll)
		goto pollinginit;

	/* pack the request as a PKSCReq message, of type PKCSReq 	*/
	switch (reqversion) {
	case 0:
		msg->messageType = SCEP_MESSAGE_TYPE_PKCSREQ;
		msg->rd.req = scep.clientreq;
		break;
	case 1:
		/* build a version 2 payload				*/
		if (debug)
			BIO_printf(bio_err, "%s:%d: building version 2 "
				"payload\n", __FILE__, __LINE__);
		if (scep.requestorreq)
			payload_set_req(msg->rd.payload, scep.requestorreq);
		if (scep.requestorspki)
			payload_set_spki(msg->rd.payload, scep.requestorspki);

		/* set the correct message type				*/
		if (scep.community) {
			/* compute the authenticator from the original 	*/
			/* request and the community			*/
			msg->messageType = SCEP_MESSAGE_TYPE_V2PROXY;
		} else {
			msg->messageType = SCEP_MESSAGE_TYPE_V2REQUEST;
		}
		break;
	}

	/* write the request to the request file, for later perusal	*/
	if (savedrequestfile) {
		BIO	*reqbio;
		reqbio = BIO_new(BIO_s_file());
		BIO_write_filename(reqbio, savedrequestfile);
		switch (reqversion) {
		case 0:
			/* version 1 request has a X509_REQ payload	*/
			PEM_write_bio_X509_REQ(reqbio, msg->rd.req);
			break;
		case 1:
			/* version 2 requests have a "real" payload	*/
			i2d_payload_bio(reqbio, msg->rd.payload);
			break;
		}
		BIO_free(reqbio);
	}

	goto common;

pollinginit:
	/* when polling, the request is a GetCertInitial message	*/
	msg->messageType = SCEP_MESSAGE_TYPE_GETCERTINITIAL;

	/* the contents is the pair issuer and subject			*/
	msg->rd.is = (issuer_and_subject_t *)malloc(
		sizeof(issuer_and_subject_t));
	msg->rd.is->issuer = X509_get_subject_name(scep.cacert);
	msg->rd.is->subject = NULL;

	/* when polling we should read the request from request file	*/
	/* (only needed for the distinguished name of the client)	*/
	if (debug)
		BIO_printf(bio_err, "%s:%d: getting subject X509_NAME\n",
			__FILE__, __LINE__);
	switch (reqversion) {
	case 0:
		msg->rd.is->subject = X509_REQ_get_subject_name(scep.clientreq);
		break;
	case 1:
		if (scep.requestorreq)
			msg->rd.is->subject
				= X509_REQ_get_subject_name(scep.requestorreq);
		if (scep.requestorspki) {
			if (debug)
				BIO_printf(bio_err, "%s:%d: converting DN '%s' "
					"to X509_NAME\n",
					__FILE__, __LINE__, dn);
			msg->rd.is->subject = ldap_to_x509(dn);
		}
		break;
	}
	if (msg->rd.is->subject == NULL) {
		BIO_printf(bio_err, "%s:%d: no subject found\n", 
			__FILE__, __LINE__);
		goto err;
	}
	if (debug)
		BIO_printf(bio_err, "%s:%d: issuer and subject found\n",
			__FILE__, __LINE__);

common:
	/* create a self signed certificate for use with SCEP		*/
	if (selfsigned(&scep) < 0) {
		BIO_printf(bio_err, "%s:%d: failed to create self signed "
			"certificate\n", __FILE__, __LINE__);
		goto err;
	}
	if (debug)
		BIO_printf(bio_err, "%s:%d: self signed certificate created\n",
			__FILE__, __LINE__);

	/* set the senderNonce						*/
	scep.senderNonceLength = 16;
	scep.senderNonce = (unsigned char *)malloc(scep.senderNonceLength);
	RAND_bytes(scep.senderNonce, scep.senderNonceLength);
	if (debug)
		BIO_printf(bio_err, "%s:%d: senderNonce set\n", __FILE__,
			__LINE__);
	checkNonce = scep.senderNonce;

	/* all messages sent from the client are base 64 encoded	*/
	msg->base64 = 1;

	/* encode							*/
	if (encode(&scep) < 0) {
		BIO_printf(bio_err, "%s:%d: encoding the request failed\n",
			__FILE__, __LINE__);
		goto err;
	}
	if (debug)
		BIO_printf(bio_err, "%s:%d: encoded bytes: %d\n", 
			__FILE__, __LINE__, scep.request.length);

	/* send the request to the server, read the reply		*/
	repbio = getrequest(&scep);
	if (repbio == NULL) {
		BIO_printf(bio_err, "%s:%d: failed to read correct reply\n",
			__FILE__, __LINE__);
		goto err;
	}

	/* analyze  the reply						*/
	if (decode(&scep, repbio) < 0) {
		BIO_printf(bio_err, "%s:%d: decoding the reply failed\n",
			__FILE__, __LINE__);
		goto err;
	}

	/* display some information about the reply			*/
	printf("transaction id: %s\n", scep.transId);
	printf("PKIstatus: %s\n", (scep.reply.pkiStatus)
		? scep.reply.pkiStatus : "(null)");
	printf("reply message type: %s\n", scep.reply.messageType);
	if (scep.reply.failinfo) {
		printf("failinfo: %s\n", scep.reply.failinfo);
	}

	/* make sure we get a CertRep message back			*/
	if (strcmp(scep.reply.messageType, SCEP_MESSAGE_TYPE_CERTREP)) {
		BIO_printf(bio_err, "%s:%d: only CertRep message acceptable "
			" in response to PKCSReq/GetCertInitial\n",
			__FILE__, __LINE__);
		goto err;
	}

	/* check for the Nonces						*/
	if (memcmp(checkNonce, scep.recipientNonce, 16)) {
		BIO_printf(bio_err, "%s:%d: recipientNonce != sent "
			"senderNonce\n", __FILE__, __LINE__);
		goto err;
	}
	if (debug)
		BIO_printf(bio_err, "%s:%d: Nonce check OK\n", __FILE__, 
			__LINE__);

	if (scep.reply.pkiStatus == NULL) {
		BIO_printf(bio_err, "no pkiStatus returned\n");
		exit(1);
	}

	switch (atoi(scep.reply.pkiStatus)) {
	case PKI_SUCCESS:
		/* Success						*/
		scep.clientcert = extract_cert(&scep);
		if (debug)
			BIO_printf(bio_err, "%s:%d: certificate returned %p\n",
				__FILE__, __LINE__, scep.clientcert);
		if (scep.clientcert) {
			BIO	*cb;
			cb = BIO_new(BIO_s_file());
			BIO_set_fp(cb, stdout, BIO_NOCLOSE);
			PEM_write_bio_X509(cb, scep.clientcert);
			BIO_free(cb);
		}
		exit(EXIT_SUCCESS);
		break;
	case PKI_FAILURE:	/* Failure				*/
		if (debug)
			BIO_printf(bio_err, "%s:%d: request failed: %s\n",
				__FILE__, __LINE__, scep.reply.failinfo);
		exit(1);
		break;
	case PKI_PENDING:	/* Pending				*/
		if (debug)
			BIO_printf(bio_err, "%s:%d: request still pending\n",
				__FILE__, __LINE__);
		exit(2);
		break;
	}

	/* error return							*/
err:
	ERR_print_errors(bio_err);
	exit(EXIT_FAILURE);
}

Example #8

0

Show file

File: wgetlite.c Project: bobrippling/wgetlite

int wget(const char *url, int redirect_no)
{
	extern struct cfg global_cfg;
	struct wgetfile finfo;
	int ret;
	char *outname = NULL;
	char *host, *file, *proto, *port;

	char *urlcpy = alloca(strlen(url) + 2);

	strcpy(urlcpy, url);
	memset(&finfo, 0, sizeof finfo);
	finfo.redirect_no = redirect_no;

	if(!strchr(urlcpy, '/'))
		strcat(urlcpy, "/");

	if(parseurl(url, &host, &file, &proto, &port))
		return 1;

	if(!host || !port){
		output_err(OUT_ERR, "invalid url \"%s\"", url);
		goto bail;
	}

	if(     !strcmp(proto, "http")) finfo.proto = HTTP;
	else if(!strcmp(proto, "ftp"))  finfo.proto = FTP;
	else if(!strcmp(proto, "gopher")) finfo.proto = GOPHER;
	else{
		ret = 1;
		output_err(OUT_ERR, "unknown protocol: %s", proto);
		goto bail;
	}

	free(proto);

	if(global_cfg.out_fname){
		if(!strcmp(global_cfg.out_fname, "-"))
			outname = NULL;
		else
			outname = xstrdup(global_cfg.out_fname);

		finfo.namemode = NAME_FORCE;
	}else{
		char *the_last_slash_rated_pg_for_parental_guidance;

		the_last_slash_rated_pg_for_parental_guidance = strrchr(file, '/');
		if(the_last_slash_rated_pg_for_parental_guidance)
			outname = xstrdup(the_last_slash_rated_pg_for_parental_guidance + 1);
		else
			outname = xstrdup(file);

		finfo.namemode = NAME_GUESS;

		/* TODO: urldecode outname (except for %3f) */
	}

	finfo.sock      = -1;
	finfo.host_file = file;
	finfo.host_name = host;
	finfo.host_port = port;
	finfo.outname   = outname;

	if(wget_connect(&finfo))
		goto bail;

	switch(finfo.proto) {
	default:
	case HTTP:
		ret = http_GET(&finfo);
		break;
	case FTP:
		ret = ftp_RETR(&finfo);
		break;
	case GOPHER:
		ret = gopher_retrieve(&finfo);
		break;
	}

fin:
	/* don't close the connection - let connection_* handle it */

	/* free the struct's members, since they might be changed */
	free(finfo.host_file);
	free(finfo.host_name);
	free(finfo.host_port);
	free(finfo.outname  );

	return ret;
bail:
	ret = 1;
	goto fin;
}

Example #9

0

Show file

int main(int argc, char *argv[])
{

	static struct option options[] = 
	{
		{"body", no_argument,NULL,'b'},
		{"version", no_argument,NULL,'V'},
		{"file", required_argument,NULL,'f'},
		{"help",    no_argument,NULL,'h'},
		{"ie", no_argument,NULL,'i'},
		{"mozilla", no_argument,NULL, 'm'},
	};

	FILE *in = 0;
	char opt;
	char *optstring = "imbVhf:";
	char *progname;
	int file_count, fileind, length;
	int file_open = 0;
	char protocol[1024];
	char port[20];
	char path[1024];
	char hostname[1024];
	char fileurl[1024];
	char method[5] = "HEAD";
	char useragent[100] = "Furl/2.1";

	/* Let's see if it works */
	unsigned long ipAddr;
	SOCKET sock;

#ifdef WIN32
	Win32Init();
#endif

	progname = basename(argv[0]);	
	strlwr(progname);

	while ((opt=getopt_long(argc,argv,optstring,options,NULL)) != EOF ) {
		switch (opt) {
			
			case 'i':
				strcpy(useragent, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;)");
				break;
			case 'm':
				strcpy(useragent, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7b) Gecko/20040421");
				break;
			case 'b':
				strcpy(method, "GET");
				break;
			case 'V':
				PrintVersion(progname);
				return 0;

			case 'h':
				PrintUsage(progname);
				return 0;

			case 'f':
				if((in=fopen(optarg, "r")) == NULL) {
					fprintf(stderr,"Bad Magic. Can\'t open specified file %s\n", optarg);
#ifdef WIN32
					Win32Cleanup();
#endif
					exit(1);
				} else {
					file_open = 1;
				}
				break;

			default:
				fprintf(stderr,"Try '%s --help' for more information.\n",progname);
				return 1;
		}
	}
	
	file_count = argc - optind;

	fileind = optind++;

	if((file_count == 0  || file_count > 1) && !file_open) {

		PrintUsage(progname);
#ifdef WIN32
		Win32Cleanup();
#endif
		exit(1);

	} else {

		while(1) {
			if (file_open) {
				if(fgets(fileurl,1022,in) == NULL) {
					if (feof(in)) {
						break;
					} else {
						perror("Bad Magic ");
#ifdef WIN32
						Win32Cleanup();
#endif
						exit(1);
					}
				} else {
					length = strlen(fileurl);
					fileurl[length-1] = '\0';
					fprintf(stdout,"\nURL= %s\n\n", fileurl);
				}	
			} else {
				length = -1;
				strcpy(fileurl, argv[fileind]);
			}
			if (length > 1 || length == -1) {
				if (!parseurl(fileurl,protocol,hostname,port,path,method,useragent)) {

					if (GetAddress(&ipAddr, hostname)) {
						fprintf(stderr,"%s doesn't exist\n",hostname);
						fprintf(stderr,"Try '%s --help' for more information.\n",progname);
					} else {
						sock = CreateSocket(&ipAddr,port);
						if (sock == INVALID_SOCKET) {
							fprintf(stderr,"Bad Magic. Couldn\'t get a socket connection\n");
						} else {
							if (ProcessSocket(sock, path) == SOCKET_ERROR) {
								fprintf(stderr,"Bad Magic. Encountered an error connecting to socket\n");
#ifdef WIN32
								Win32Cleanup();
#endif
								exit(1);
							}								
						}

					}
					
				} else {
					fprintf(stderr,"Protocol not supported\n");
					fprintf(stderr,"Try '%s --help' for more information.\n",progname);
				}
			}
			if (!file_open) {
				break;
			}
		}
	}
#ifdef WIN32
	Win32Cleanup();
#endif
	exit(0);
}

Example #10

0

Show file

File: proxy.c Project: mkazantsev/os

int main(int argc, char *argv[]) {
	int status, new_fd;
	int listensock;
	struct addrinfo hints, *res;
	struct sockaddr saddr;
	socklen_t saddr_size = sizeof(saddr);
	char buf[BUFSIZE];
	int ret;
	struct fdnode *fdl = NULL, *writefdl = NULL, *t;
	struct cnode *cache = NULL;
	fd_set readfds, writefds;
	struct timeval timeout;

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_INET;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_protocol = IPPROTO_TCP;

	if (argc != 2) {
		printf("Usage: %s PORT\n", argv[0]);
		exit(1);
	}

	if (ret = getaddrinfo(NULL, argv[1], &hints, &res)) {
		printf("getaddrinfo: %s\n", gai_strerror(ret));
		exit(1);
	}

	if ((listensock = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) {
		perror("socket");
		freeaddrinfo(res);
		exit(1);
	}

	if (bind(listensock, res->ai_addr, res->ai_addrlen) == -1) {
		perror("bind");
		freeaddrinfo(res);
		close(listensock);
		exit(1);
	}

	if (listen(listensock, MAX_CON_NUM) == -1) {
		perror("listen");
		freeaddrinfo(res);
		close(listensock);
		exit(1);
	}

	timeout.tv_sec = 60 * 3;
	timeout.tv_usec = 0;

	while (1) {
		struct cnode *c;
		int i = 0;
		ret = setfds(&readfds, &writefds, fdl, listensock);
		printf("wait for select\n");
		ret = select(ret, &readfds, &writefds, NULL, &timeout);
		printf("got selected %d fds\n", ret);

		if (ret < 0) {
			perror("select");
			break;
		}
		if (ret == 0) { /* timeout */
			printf(".\n");
			continue;
		}

		for (c = cache; c != NULL; c = c->next) {
			i++;
			printf("%s\n", c->url);
		}
		printf("Got %d cache pages\n", i);

		for (t = fdl; t != NULL; t = t->next) {
			if (FD_ISSET(t->fd, &readfds))
				printf("%d is ready to read\n", t->fd);
			if (FD_ISSET(t->fd, &writefds))
				printf("%d is ready to write\n", t->fd);
			if (FD_ISSET(t->sout, &readfds))
				printf("%d is ready to read\n", t->sout);
			if (FD_ISSET(t->sout, &writefds))
				printf("%d is ready to write\n", t->sout);
		}

		if (FD_ISSET(listensock, &readfds)) {
			new_fd = accept(listensock, &saddr, &saddr_size);
			if (new_fd == -1) {
				perror("accept");
				continue;
			}
			ret = fdladd(new_fd, &fdl);
			if (ret == -1) {
				perror("add socket");
				continue;
			}
			printf("Accepted new %d\n", new_fd);
		}

		for (t = fdl; t != NULL; t = t->next) {
			if (t->use && t->rw == 1 && !t->full && !t->eof && FD_ISSET(t->sout, &readfds)) {
				//memset(buf, 0, BUFSIZE);
				printf("to read from sout %d\n", t->sout);
				if (t->nread >= t->nwrote) {
					ret = read(t->sout, t->buf + t->nread, sizeof(t->buf)-t->nread);
					//cacheit(&cache, t->url, t->buf + t->nread, ret);
					if (ret == 0) {
						t->eof = 1;
						//completecache(cache, t->url);
						/*
						close(t->sout);
						fdlrem(t->fd, fdl);
						printf("Closed %d\n", t->sout);
						continue;
						*/
					} else {
						t->nread += ret;
						if (t->nread == sizeof(t->buf)) t->nread = 0;
						if (t->nread == t->nwrote) t->full = 1;
						t->empty = 0;
					}
				} else {
					ret = read(t->sout, t->buf + t->nread, t->nwrote-t->nread);
					//cacheit(&cache, t->url, t->buf + t->nread, ret);
					if (ret == 0) {
						t->eof = 1;
						//completecache(cache, t->url);
						/*
						close(t->sout);
						fdlrem(t->fd, fdl);
						printf("Closed %d\n", t->sout);
						continue;
						*/
					} else {
						t->nread += ret;
						if (t->nread == t->nwrote) t->full = 1;
						t->empty = 0;
					}
				}
			}
			if (t->use && t->rw == 0 && FD_ISSET(t->fd, &readfds)) {
				memset(buf, 0, BUFSIZE);
				printf("to read from sin %d\n", t->fd);
				ret = read(t->fd, buf, BUFSIZE);
				if (ret > 0) {
					int len;
					char *url;
					char *host;
					char *dir;

					write(1, buf, ret);

					/* Parse incoming request */
					if (parsereq(buf, ret, &url, &len) == -1) {
						close(t->fd);
						close(t->sout);
						t->use = 0;
						continue;
					}
					t->url = malloc((len+1)*sizeof(char));
					t->url = strncpy(t->url, url, len);
					t->url[len] = '\0';

					/* Parse given URL */
					parseurl(t->url, &len, &host, &dir);
					t->host = malloc((len+1)*sizeof(char));
					t->host = strncpy(t->host, host, len);
					t->host[len] = '\0';

					len = strlen(t->url) - len;
					t->uri = malloc((len+1)*sizeof(char));
					t->uri = strncpy(t->uri, dir, len);
					t->uri[len] = '\0';
					/*
					if (t->cache = cached(cache, t->url)) {
						t->cached = 1;
						t->rw = 1;
						continue;
					}
					*/
					if (ret = getaddrinfo(t->host, "http", &hints, &t->addr)) {
						printf("getaddrinfo: %s\n", gai_strerror(ret));
						write(t->fd, "HTTP/1.0 400\r\n\r\n", 17);
						close(t->fd);
						fdlrem(t->fd, fdl);
						continue;
					}

					if (connect(t->sout, t->addr->ai_addr, t->addr->ai_addrlen) == -1) {						
						if (errno == EINPROGRESS) {
							printf("wait for connection on %d\n", t->fd);
							continue;
						}
						close(t->fd);
						fdlrem(t->fd, fdl);
					}
					continue;
				}
				if (ret == 0) {
					close(t->fd);
					fdlrem(t->fd, fdl);
					printf("Closed %d\n", t->fd);
				}
			}
		//} /* readfds */
		//for (t = fdl; t != NULL; t = t->next) {
			if (t->use && t->rw == 0 && FD_ISSET(t->sout, &writefds)) {
				int len = strlen(t->uri) + 3 + 8 + 6;
				char *msg = malloc(len*sizeof(char));
				printf("to write to sout %d\n", t->sout);
				sprintf(msg, "GET %s HTTP/1.0\r\n\r\n", t->uri);
				if (write(t->sout, msg, len) <= 0) {
					perror("write sout");
					close(t->fd);
					fdlrem(t->fd, fdl);
					continue;
				}
				t->rw = 1;
				free(msg);
				printf("request on %d\n", t->sout);
			}
			if (t->use && t->rw == 1 && t->cached == 0 && !t->empty && FD_ISSET(t->fd, &writefds)) {
				char *l;
				if (t->nwrote < t->nread)
					l = t->buf + t->nread;
				else
					l = t->buf + sizeof(t->buf);
				//ret = write(t->fd, t->buf + t->nwrote, l - (t->buf + t->nwrote));
				ret = send(t->fd, t->buf + t->nwrote, l - (t->buf + t->nwrote), 0);
				if (ret <= 0) {
					perror("write sin");
					close(t->fd);
					fdlrem(t->fd, fdl);
					continue;
				}
				t->nwrote += ret;
				t->full = 0;
				if (t->nwrote == sizeof(t->buf)) t->nwrote = 0;
				if (t->nwrote == t->nread) t->empty = 1;
			}
			if (t->use && t->rw == 1 && t->cached == 1 && FD_ISSET(t->fd, &writefds)) {
				ret = write(t->fd, t->cache->content + t->nwrote, t->cache->size - t->nwrote);
				t->nwrote += ret;
				printf("work from cache for %s\n", t->url);
				if (t->nwrote == t->cache->size) {
					t->eof = 1;
					t->empty = 1;
				}
			}
			if (t->use && t->eof && t->empty) {
				fdlrem(t->fd, fdl);
				printf("Completed %d\n", t->fd);
				continue;
			}
		} /* fdl traverse */
	} /* while (1) */

	printf("gotcha");

	fdlfree(fdl);

	freeaddrinfo(res);
	close(listensock);
	return 0;
}

Example #11

0

Show file

/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html with:

   <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
   in urls_downloaded.  From these two lists information is
   extracted.  */
void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
	 it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
	DEBUGP (("It should correspond to %s.\n", url));
      else
	DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, url, FALSE, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */
      for (cur_url = urls; cur_url; cur_url = cur_url->next)
	{
	  char *local_name;

	  /* The URL must be in canonical form to be compared.  */
	  struct urlinfo *u = newurl ();
	  uerr_t res = parseurl (cur_url->url, u, 0);
	  if (res != URLOK)
	    {
	      freeurl (u, 1);
	      continue;
	    }
	  /* We decide the direction of conversion according to whether
	     a URL was downloaded.  Downloaded URLs will be converted
	     ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
	  local_name = hash_table_get (dl_url_file_map, u->url);
	  if (local_name)
	    DEBUGP (("%s marked for conversion, local %s\n",
		     u->url, local_name));
	  /* Decide on the conversion direction.  */
	  if (local_name)
	    {
	      /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (local_name);
	    }
	  else
	    {
	      /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
	      if (!cur_url->link_complete_p)
		cur_url->convert = CO_CONVERT_TO_COMPLETE;
	      cur_url->local_name = NULL;
	    }
	  freeurl (u, 1);
	}
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      /* Free the data.  */
      free_urlpos (urls);
    }
}

Example #12

0

Show file

/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;            /* See below the explanation */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
	undesirable_urls = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      /* Enter this_url to the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  string_set_add (undesirable_urls, u->url);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    /* We've exceeded the maximum recursion depth specified by the user. */
    {
      if (opt.page_requisites && depth <= opt.reclevel + 1)
	/* When -p is specified, we can do one more partial recursion from the
	   "leaf nodes" on the HTML document tree.  The recursion is partial in
	   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
	   except for <LINK REL="stylesheet">. */
	dash_p_leaf_HTML = TRUE;
      else
	/* Either -p wasn't specified or it was and we've already gone the one
	   extra (pseudo-)level that it affords us, so we need to bail out. */
	{
	  DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
		   depth, opt.reclevel));
	  --depth;
	  return RECLEVELEXC;
	}
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
			    dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded regardless of
	 suffix rules (but that is remedied later with unlink) unless
	 the depth equals the maximum depth.

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 undesirable_urls.  Using it is crucial to avoid unnecessary
	 repeated continuous hits to the hash table.  */
      inl = string_set_contains (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If it is absolute link and they are not followed, chuck it
	 out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !cur_url->link_relative_p)
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
            subject to acc/rej rules, unless a finite maximum depth has
            been specified and the current depth is the maximum depth. */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
                  && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                      && ((opt.reclevel != INFINITE_RECURSION) &&
			  (depth != opt.reclevel))))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = TOLOWER (*p);
	      xfree (u->url);
	      u->url = str_url (u, 0);
	    }
	  xfree (constr);
	  constr = xstrdup (u->url);
	  string_set_add (undesirable_urls, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		string_set_add (undesirable_urls, constr);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  xfree (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  string_set_add (undesirable_urls, constr);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      xfree (constr);
	      constr = newloc;
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));

	  if (opt.delete_after || (filename && !acceptable (filename)))
	    /* Either --delete-after was specified, or we loaded this otherwise
	       rejected (e.g. by -R) HTML file just so we could harvest its
	       hyperlinks -- in either case, delete the local file. */
	    {
	      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
		       opt.delete_after ? "--delete-after" :
		       "recursive rejection criteria"));
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }

	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      else
	DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}

Example #13

0

Show file

File: recur.c Project: KevinAdams05/VWGet

/* Simple calls to convert_links will often fail because only the
   downloaded files are converted, and Wget cannot know which files
   will be converted in the future.  So, if we have file fileone.html
   with:

   <a href=/c/something.gif>

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTML-s, and re-convert them.

   All the downloaded HTMLs are kept in urls_html, and downloaded URLs
   in urls_downloaded.  From these two lists information is
   extracted.  */
void
convert_all_links (void)
{
  uerr_t res;
  urlpos *l1, *l2, *urls;
  struct urlinfo *u;
  slist *html;
  urlpos *urlhtml;

  for (html = urls_html; html; html = html->next)
    {
      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
	 it.  */
      for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
	if (!strcmp (urlhtml->local_name, html->string))
	  break;
      if (urlhtml)
	DEBUGP (("It should correspond to %s.\n", urlhtml->url));
      else
	DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1);
      if (!urls)
	continue;
      for (l1 = urls; l1; l1 = l1->next)
	{
	  /* The URL must be in canonical form to be compared.  */
	  u = newurl ();
	  res = parseurl (l1->url, u, 0);
	  if (res != URLOK)
	    {
	      freeurl (u, 1);
	      continue;
	    }
	  /* We decide the direction of conversion according to whether
	     a URL was downloaded.  Downloaded URLs will be converted
	     ABS2REL, whereas non-downloaded will be converted REL2ABS.
	     Note: not yet implemented; only ABS2REL works.  */
	  for (l2 = urls_downloaded; l2; l2 = l2->next)
	    if (!strcmp (l2->url, u->url))
	      {
		DEBUGP (("%s flagged for conversion, local %s\n",
			 l2->url, l2->local_name));
		break;
	      }
	  /* Clear the flags.  */
	  l1->flags &= ~ (UABS2REL | UREL2ABS);
	  /* Decide on the conversion direction.  */
	  if (l2)
	    {
	      l1->flags |= UABS2REL;
	      l1->local_name = xstrdup (l2->local_name);
	    }
	  else
	    {
	      l1->flags |= UREL2ABS;
	      l1->local_name = NULL;
	    }
	  freeurl (u, 1);
	}
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      /* Free the data.  */
      free_urlpos (urls);
    }
}

Example #14

0

Show file

File: recur.c Project: KevinAdams05/VWGet

/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URL-s stored to a linked list of URL-s, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl;
  int this_url_ftp;            /* See below the explanation */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      ulist = add_slist (ulist, this_url, 0);
      urls_downloaded = NULL;
      urls_html = NULL;
      /* Enter this_url to the slist, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  ulist = add_slist (ulist, u->url, 0);
	  urls_downloaded = add_url (urls_downloaded, u->url, file);
	  urls_html = add_slist (urls_html, file, NOSORT);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  /* Bail out if opt.reclevel is exceeded.  */
  if ((opt.reclevel != 0) && (depth > opt.reclevel))
    {
      DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
	       depth, opt.reclevel));
      --depth;
      return RECLEVELEXC;
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file,
			    canon_this_url ? canon_this_url : this_url, 0);

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (opt.quota && (opt.downloaded > opt.quota))
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded
	 regardless of suffix rules (but that is remedied later with
	 unlink).

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 ulist.  Using it is crucial to avoid the incessant calls to
	 in_slist, which is quite slow.  */
      inl = in_slist (ulist, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If it is absolute link and they are not followed, chuck it
	 out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !(cur_url->flags & URELATIVE))
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
	     subject to acc/rej rules.  That's why the `!'.  */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
		   && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = tolower (*p);
	      free (u->url);
	      u->url = str_url (u, 0);
	    }
	  free (constr);
	  constr = xstrdup (u->url);
	  inl = in_slist (ulist, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		ulist = add_slist (ulist, constr, 0);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  free (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  ulist = add_slist (ulist, constr, 0);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      free (constr);
	      constr = newloc;
	    }
	  /* In case of convert_links: If there was no error, add it to
	     the list of downloaded URLs.  We might need it for
	     conversion.  */
	  if (opt.convert_links && filename)
	    {
	      if (dt & RETROKF)
		{
		  urls_downloaded = add_url (urls_downloaded, constr, filename);
		  /* If the URL is HTML, note it.  */
		  if (dt & TEXTHTML)
		    urls_html = add_slist (urls_html, filename, NOSORT);
		}
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));
	  /* If an suffix-rejected file was loaded only because it was HTML,
	     undo the error now */
	  if (opt.delete_after || (filename && !acceptable (filename)))
	    {
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }
	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->flags |= UABS2REL;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links)
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  else
    return RETROK;
}

Example #15

0

Show file

File: retr.c Project: aosm/wget

/* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
   or simply copy it with file:// (#### the latter not yet
   implemented!).  */
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
	      const char *refurl, int *dt)
{
  uerr_t result;
  char *url;
  int location_changed, already_redirected, dummy;
  int local_use_proxy;
  char *mynewloc, *proxy;
  struct urlinfo *u;


  /* If dt is NULL, just ignore it.  */
  if (!dt)
    dt = &dummy;
  url = xstrdup (origurl);
  if (newloc)
    *newloc = NULL;
  if (file)
    *file = NULL;
  already_redirected = 0;

 again:
  u = newurl ();
  /* Parse the URL.  RFC2068 requires `Location' to contain an
     absoluteURI, but many sites break this requirement.  #### We
     should be liberal and accept a relative location, too.  */
  result = parseurl (url, u, already_redirected);
  if (result != URLOK)
    {
      freeurl (u, 1);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result));
      return result;
    }

  /* Set the referer.  */
  if (refurl)
    u->referer = xstrdup (refurl);
  else
    u->referer = NULL;

  local_use_proxy = USE_PROXY_P (u);
  if (local_use_proxy)
    {
      struct urlinfo *pu = newurl ();

      /* Copy the original URL to new location.  */
      memcpy (pu, u, sizeof (*u));
      pu->proxy = NULL; /* A minor correction :) */
      /* Initialize u to nil.  */
      memset (u, 0, sizeof (*u));
      u->proxy = pu;
      /* Get the appropriate proxy server, appropriate for the
	 current protocol.  */
      proxy = getproxy (pu->proto);
      if (!proxy)
	{
	  logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
	  freeurl (u, 1);
	  return PROXERR;
	}
      /* Parse the proxy URL.  */
      result = parseurl (proxy, u, 0);
      if (result != URLOK || u->proto != URLHTTP)
	{
	  if (u->proto == URLHTTP)
	    logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg (result));
	  else
	    logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
	  freeurl (u, 1);
	  return PROXERR;
	}
      u->proto = URLHTTP;
    }

  assert (u->proto != URLFILE);	/* #### Implement me!  */
  mynewloc = NULL;

  if (u->proto == URLHTTP)
    result = http_loop (u, &mynewloc, dt);
  else if (u->proto == URLFTP)
    {
      /* If this is a redirection, we must not allow recursive FTP
	 retrieval, so we save recursion to oldrec, and restore it
	 later.  */
      int oldrec = opt.recursive;
      if (already_redirected)
	opt.recursive = 0;
      result = ftp_loop (u, dt);
      opt.recursive = oldrec;
      /* There is a possibility of having HTTP being redirected to
	 FTP.  In these cases we must decide whether the text is HTML
	 according to the suffix.  The HTML suffixes are `.html' and
	 `.htm', case-insensitive.

	 #### All of this is, of course, crap.  These types should be
	 determined through mailcap.  */
      if (already_redirected && u->local && (u->proto == URLFTP ))
	{
	  char *suf = suffix (u->local);
	  if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
	    *dt |= TEXTHTML;
	  FREE_MAYBE (suf);
	}
    }
  location_changed = (result == NEWLOCATION);
  if (location_changed)
    {
      /* Check for redirection to oneself.  */
      if (url_equal (url, mynewloc))
	{
	  logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"),
		     mynewloc);
	  return WRONGCODE;
	}
      if (mynewloc)
	{
	  free (url);
	  url = mynewloc;
	}
      freeurl (u, 1);
      already_redirected = 1;
      goto again;
    }
  if (file)
    {
      if (u->local)
	*file = xstrdup (u->local);
      else
	*file = NULL;
    }
  freeurl (u, 1);

  if (newloc)
    *newloc = url;
  else
    free (url);

  return result;
}

Example #16

0

Show file

File: checkweb.c Project: WongTai/snippets

/*
 * Accept a url string, return a struct status.
 *
 * A negative status indicates a problem either connecting to the
 * machine, or a url parse problem.  The message will tell you what,
 * specifically happened (although it doesn't distinguish between a
 * timeout, and a connection refused).
 */
struct status
getstatus(char *url)
{
    int     i;
    char    line[1024];
    char   *p, *q;
    struct url u;
    struct status st;
    struct host_ret conn;

    st.status = -1;
    st.message = NULL;
    st.bytesread = 0;

    u = parseurl(url);
    if (u.port == -1) {
        st.message = strdup("Invalid url request format");
        return (st);
    }
    conn = openhost(u.host, u.port, u.ssl);

    if (conn.s < 0) {
        st.message = strdup("Could not connect to host");
        return (st);
    }
    send_data(conn, u, "GET ");
    send_data(conn, u, u.req);
    send_data(conn, u, " HTTP/1.0\n\n");

    alarm(120);
    i = recv_data(conn, u, line, 1024);
    alarm(0);
    if (i < 1) {
        st.message = strdup("Timeout, or nothing returned.");
        return (st);
    }
    line[i] = NULL;

    /*
     * My keen parsing techniques, flip through it with a pointer
     * to get the status number
     */
    p = &line[0];
    while (*p++ && *p != ' ');
    st.status = atoi(p);

    /* Now we want the status message */
    while (*++p && *p != ' ');

    /* Kill Whitey */
    q = p;
    while (*++q && !iswhitey(*q));
    *q = NULL;
    st.message = strdup(p + 1);

    /* Eat the rest of the page */
    while (recv_data(conn, u, line, 1024));

#ifdef USE_SSLEAY
    if (u.ssl) {
        if (conn.ssl)
            SSL_free(conn.ssl);
        if (conn.ctx)
            SSL_CTX_free(conn.ctx);
    }
#endif
    close(conn.s);
    freeurl(u);
    return (st);
}