Example #1
/*
 * Find the best relational op for matching the two subtrees it has.
 * This is a sub-version of the function findops() above.
 * The instruction with the lowest grading is emitted.
 *
 * Level assignment for priority:
 *	left	right	prio
 *	-	-	-
 *	direct	direct	1
 *	direct	OREG	2	# make oreg
 *	OREG	direct	2	# make oreg
 *	OREG	OREG	2	# make both oreg
 *	direct	REG	3	# put in reg
 *	OREG	REG	3	# put in reg, make oreg
 *	REG	direct	3	# put in reg
 *	REG	OREG	3	# put in reg, make oreg
 *	REG	REG	4	# put both in reg
 */
int
relops(NODE *p)
{
	extern int *qtable[];
	struct optab *q;
	int i, shl = 0, shr = 0;
	NODE *l, *r;
	int *ixp, idx = 0;
	int lvl = 10, gol = 0, gor = 0;

	F2DEBUG(("relops tree:\n"));
	F2WALK(p);

	l = getlr(p, 'L');
	r = getlr(p, 'R');
	ixp = qtable[p->n_op];
	for (i = 0; ixp[i] >= 0; i++) {
		q = &table[ixp[i]];

		F2DEBUG(("relops: ixp %d\n", ixp[i]));
		if (!acceptable(q))		/* target-dependent filter */
			continue;

		if (ttype(l->n_type, q->ltype) == 0 ||
		    ttype(r->n_type, q->rtype) == 0)
			continue; /* Types must be correct */

		F2DEBUG(("relops got types\n"));
		if ((shl = chcheck(l, q->lshape, 0)) == SRNOPE)
			continue;
		F2DEBUG(("relops lshape %d\n", shl));
		F2WALK(p);
		if ((shr = chcheck(r, q->rshape, 0)) == SRNOPE)
			continue;
		F2DEBUG(("relops rshape %d\n", shr));
		F2WALK(p);
		if (q->needs & REWRITE)
			break;	/* Done here */

		if (lvl <= (shl + shr))
			continue;
		lvl = shl + shr;
		idx = ixp[i];
		gol = shl;
		gor = shr;
	}
	if (lvl == 10) {
		F2DEBUG(("relops failed\n"));
		if (setbin(p))
			return FRETRY;
		return FFAIL;
	}
	F2DEBUG(("relops entry %d(%s %s)\n", idx, srtyp[gol], srtyp[gor]));

	q = &table[idx];

	(void)shswitch(-1, p->n_left, q->lshape, FORCC,
	    q->rewrite & RLEFT, gol);

	(void)shswitch(-1, p->n_right, q->rshape, FORCC,
	    q->rewrite & RRIGHT, gor);
	
	F2DEBUG(("relops: node %p\n", p));
	p->n_su = MKIDX(idx, 0);
	SCLASS(p->n_su, CLASSA); /* XXX */
	return 0;
}
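
Below is a minimal, self-contained sketch of the "lowest grading wins" scan that relops() above performs over its qtable entries. The candidate names and per-operand costs are made-up stand-ins, not pcc's real SR* values; only the selection idiom is shown.

/*
 * Hypothetical sketch: scan a candidate table and keep the entry with
 * the lowest combined operand grading, mirroring the shl+shr loop above.
 */
#include <stdio.h>

struct candidate {
	const char *name;
	int lcost, rcost;	/* per-operand grading, lower is better */
};

static int
pick_lowest(const struct candidate *c, int n)
{
	int i, best = -1, lvl = 10;	/* 10 == "nothing found yet" */

	for (i = 0; i < n; i++) {
		if (lvl <= c[i].lcost + c[i].rcost)
			continue;	/* not better than the current best */
		lvl = c[i].lcost + c[i].rcost;
		best = i;
	}
	return best;		/* -1 means no candidate matched */
}

int
main(void)
{
	struct candidate tab[] = {
		{ "reg,reg", 2, 2 },
		{ "dir,dir", 0, 1 },
		{ "dir,oreg", 0, 2 },
	};
	int best = pick_lowest(tab, 3);

	printf("best entry: %s\n", best >= 0 ? tab[best].name : "none");
	return 0;
}
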
Example #2
/*
 * Find a matching assign op.
 *
 * Level assignment for priority:
 *	left	right	prio
 *	-	-	-
 *	direct	direct	1
 *	direct	REG	2
 *	direct	OREG	3
 *	OREG	direct	4
 *	OREG	REG	5
 *	OREG	OREG	6
 */
int
findasg(NODE *p, int cookie)
{
	extern int *qtable[];
	struct optab *q;
	int i, sh, shl, shr, lvl = 10;
	NODE *l, *r;
	int *ixp;
	struct optab *qq = NULL; /* XXX gcc */
	int idx = 0, gol = 0, gor = 0;

	shl = shr = 0;

	F2DEBUG(("findasg tree: %s\n", prcook(cookie)));
	F2WALK(p);

	ixp = qtable[p->n_op];
	l = getlr(p, 'L');
	r = getlr(p, 'R');
	for (i = 0; ixp[i] >= 0; i++) {
		q = &table[ixp[i]];

		F2DEBUG(("findasg: ixp %d\n", ixp[i]));
		if (!acceptable(q))		/* target-dependent filter */
			continue;

		if (ttype(l->n_type, q->ltype) == 0 ||
		    ttype(r->n_type, q->rtype) == 0)
			continue; /* Types must be correct */

		if ((cookie & q->visit) == 0)
			continue; /* must get a result */

		F2DEBUG(("findasg got types\n"));
#ifdef mach_pdp11 /* XXX - check for other targets too */
		if (p->n_op == STASG && ISPTR(l->n_type)) {
			/* Accept lvalue to be in register */
			/* if struct assignment is given a pointer */
			if ((shl = chcheck(l, q->lshape,
			    q->rewrite & RLEFT)) == SRNOPE)
				continue;
		} else
#endif
		{
			if ((shl = tshape(l, q->lshape)) == SRNOPE)
				continue;
			if (shl == SRREG)
				continue;
		}

		F2DEBUG(("findasg lshape %d\n", shl));
		F2WALK(l);

		if ((shr = chcheck(r, q->rshape, q->rewrite & RRIGHT)) == SRNOPE)
			continue;

		F2DEBUG(("findasg rshape %d\n", shr));
		F2WALK(r);
		if (q->needs & REWRITE)
			break;	/* Done here */

		if (lvl <= (shl + shr))
			continue;

		lvl = shl + shr;
		qq = q;
		idx = ixp[i];
		gol = shl;
		gor = shr;
	}

	if (lvl == 10) {
		F2DEBUG(("findasg failed\n"));
		if (setasg(p, cookie))
			return FRETRY;
		return FFAIL;
	}
	F2DEBUG(("findasg entry %d(%s,%s)\n", idx, srtyp[gol], srtyp[gor]));

	sh = -1;
	sh = shswitch(sh, p->n_left, qq->lshape, cookie,
	    qq->rewrite & RLEFT, gol);

	sh = shswitch(sh, p->n_right, qq->rshape, cookie,
	    qq->rewrite & RRIGHT, gor);

#ifdef mach_pdp11 /* XXX all targets? */
	lvl = 0;
	if (cookie == FOREFF)
		lvl = RVEFF, sh = 0;
	else if (cookie == FORCC)
		lvl = RVCC, sh = 0;
	else if (sh == -1) {
		sh = ffs(cookie & qq->visit & INREGS)-1;
#ifdef PCC_DEBUG
		if (sh == -1)
			comperr("findasg bad shape");
#endif
		SCLASS(lvl,sh);
	} else
		SCLASS(lvl,sh);
	p->n_su = MKIDX(idx, lvl);
#else
	if (sh == -1) {
		if (cookie == FOREFF)
			sh = 0;
		else
			sh = ffs(cookie & qq->visit & INREGS)-1;
	}
	F2DEBUG(("findasg: node %p class %d\n", p, sh));

	p->n_su = MKIDX(idx, 0);
	SCLASS(p->n_su, sh);
#endif /* mach_pdp11 */
#ifdef FINDMOPS
	p->n_flags &= ~1;
#endif
	return sh;
}
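
Both relops() and findasg() finish by storing a table index and a register class into p->n_su via MKIDX() and SCLASS(). As a rough illustration of what that kind of packing can look like, here is a hedged sketch that folds an index and a class into one int; the field widths and macro names are invented, and pcc's real layout is defined in its pass2 headers.

/* Hypothetical sketch of packing a table index and a class into one int. */
#include <stdio.h>

#define SK_MKIDX(idx, cls)	(((idx) << 4) | (cls))
#define SK_TBLIDX(su)		((su) >> 4)
#define SK_CLASS(su)		((su) & 0xf)

int
main(void)
{
	int su = SK_MKIDX(42, 3);

	printf("index %d, class %d\n", SK_TBLIDX(su), SK_CLASS(su));
	return 0;
}
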
Example #3
/*
 * Find the best instruction to evaluate the given tree.
 * Best is to match both subnodes directly; second-best is if the
 * subnodes must be evaluated into OREGs, and last if the nodes
 * must be put into registers.
 * Whether 2-op or 3-op instructions are preferred depends on the
 * order in which they are found in the table.
 * mtchno is set to the count of regs needed for its legs.
 */
int
findops(NODE *p, int cookie)
{
	extern int *qtable[];
	struct optab *q, *qq = NULL;
	int i, shl, shr, *ixp, sh;
	int lvl = 10, idx = 0, gol = 0, gor = 0;
	NODE *l, *r;

	F2DEBUG(("findops node %p (%s)\n", p, prcook(cookie)));
	F2WALK(p);

	ixp = qtable[p->n_op];
	l = getlr(p, 'L');
	r = getlr(p, 'R');
	for (i = 0; ixp[i] >= 0; i++) {
		q = &table[ixp[i]];

		F2DEBUG(("findop: ixp %d str %s\n", ixp[i], q->cstring));
		if (!acceptable(q))		/* target-dependent filter */
			continue;

		if (ttype(l->n_type, q->ltype) == 0 ||
		    ttype(r->n_type, q->rtype) == 0)
			continue; /* Types must be correct */

		if ((cookie & q->visit) == 0)
			continue; /* must get a result */

		F2DEBUG(("findop got types\n"));

		if ((shl = chcheck(l, q->lshape, q->rewrite & RLEFT)) == SRNOPE)
			continue;

		F2DEBUG(("findop lshape %s\n", srtyp[shl]));
		F2WALK(l);

		if ((shr = chcheck(r, q->rshape, q->rewrite & RRIGHT)) == SRNOPE)
			continue;

		F2DEBUG(("findop rshape %s\n", srtyp[shr]));
		F2WALK(r);

		/* Help register assignment after SSA by preferring */
		/* 2-op insns instead of 3-ops */
		if (xssa && (q->rewrite & RLEFT) == 0 && shl == SRDIR)
			shl = SRREG;

		if (q->needs & REWRITE)
			break;  /* Done here */

		if (lvl <= (shl + shr))
			continue;
		lvl = shl + shr;
		qq = q;
		idx = ixp[i];
		gol = shl;
		gor = shr;
	}
	if (lvl == 10) {
		F2DEBUG(("findops failed\n"));
		if (setbin(p))
			return FRETRY;
		return FFAIL;
	}

	F2DEBUG(("findops entry %d(%s,%s)\n", idx, srtyp[gol], srtyp[gor]));

	sh = -1;

#ifdef mach_pdp11
	if (cookie == FORCC && p->n_op != AND)	/* XXX - fix */
		cookie = INREGS;
#else
	if (cookie == FORCC)
		cookie = INREGS;
#endif

	sh = shswitch(sh, p->n_left, qq->lshape, cookie,
	    qq->rewrite & RLEFT, gol);
	sh = shswitch(sh, p->n_right, qq->rshape, cookie,
	    qq->rewrite & RRIGHT, gor);

	if (sh == -1) {
		if (cookie == FOREFF || cookie == FORCC)
			sh = 0;
		else
			sh = ffs(cookie & qq->visit & INREGS)-1;
	}
	F2DEBUG(("findops: node %p sh %d (%s)\n", p, sh, prcook(1 << sh)));
	p->n_su = MKIDX(idx, 0);
	SCLASS(p->n_su, sh);
	return sh;
}
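
findops() rejects a table entry when (cookie & q->visit) == 0, i.e. when the entry cannot deliver its result in any of the places the caller asked for. A small stand-alone sketch of that bitmask test follows; the flag values are hypothetical, as pcc's real FOREFF/FORCC/INREGS masks are defined elsewhere.

/* Hypothetical sketch of the cookie/visit bitmask test. */
#include <stdio.h>

#define SK_FOREFF	0x01	/* evaluate for side effects only */
#define SK_FORCC	0x02	/* evaluate for condition codes */
#define SK_INAREG	0x04	/* result in an A-class register */
#define SK_INBREG	0x08	/* result in a B-class register */

static int
entry_usable(int cookie, int visit)
{
	/* usable only if some requested result place is also offered */
	return (cookie & visit) != 0;
}

int
main(void)
{
	int cookie = SK_INAREG | SK_FORCC;

	printf("A-reg entry usable: %d\n", entry_usable(cookie, SK_INAREG));
	printf("effect-only entry usable: %d\n", entry_usable(cookie, SK_FOREFF));
	return 0;
}
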
Example #4
/*
 * Try to find constructs like "a = a + 1;" and match them together
 * with instructions like "incl a" or "addl $1,a".
 *
 * Level assignment for priority:
 *	left	right	prio
 *	-	-	-
 *	direct	direct	1
 *	direct	REG	2
 *	direct	OREG	3
 *	OREG	direct	4
 *	OREG	REG	5
 *	OREG	OREG	6
 */
int
findmops(NODE *p, int cookie)
{
	extern int *qtable[];
	struct optab *q;
	int i, sh, shl, shr, lvl = 10;
	NODE *l, *r;
	int *ixp;
	struct optab *qq = NULL; /* XXX gcc */
	int idx = 0, gol = 0, gor = 0;

	shl = shr = 0;

	F2DEBUG(("findmops tree: %s\n", prcook(cookie)));
	F2WALK(p);

	l = getlr(p, 'L');
	r = getlr(p, 'R');
	/* See if this is a usable tree to work with */
	/* Currently only check for leaves */
	if (optype(r->n_op) != BITYPE || treecmp(l, r->n_left) == 0)
		return FFAIL;

	F2DEBUG(("findmops is useable\n"));

	/* We can try to find a match.  Use right op */
	ixp = qtable[r->n_op];
	l = getlr(r, 'L');
	r = getlr(r, 'R');

	for (i = 0; ixp[i] >= 0; i++) {
		q = &table[ixp[i]];

		F2DEBUG(("findmops: ixp %d\n", ixp[i]));
		if (!acceptable(q))		/* target-dependent filter */
			continue;

		if (ttype(l->n_type, q->ltype) == 0 ||
		    ttype(r->n_type, q->rtype) == 0)
			continue; /* Types must be correct */

		F2DEBUG(("findmops got types\n"));

		switch (cookie) {
		case FOREFF:
			if ((q->visit & FOREFF) == 0)
				continue; /* Not only for side effects */
			break;
		case FORCC:
			if ((q->visit & FORCC) == 0)
				continue; /* Not only for side effects */
			break;
		default:
			if ((cookie & q->visit) == 0)
				continue; /* Won't match requested shape */
			if (((cookie & INREGS & q->lshape) == 0) || !isreg(l))
				continue; /* Bad return register */
			break;
		}
		F2DEBUG(("findmops cookie\n"));

		/*
		 * left shape must match left node.
		 */
		if ((shl = tshape(l, q->lshape)) != SRDIR && (shl != SROREG))
			continue;

		F2DEBUG(("findmops lshape %s\n", srtyp[shl]));
		F2WALK(l);

		if ((shr = chcheck(r, q->rshape, 0)) == SRNOPE)
			continue;

		F2DEBUG(("findmops rshape %s\n", srtyp[shr]));

		/*
		 * Only allow RLEFT. XXX
		 */
		if ((q->rewrite & (RLEFT|RRIGHT)) != RLEFT)
			continue;

		F2DEBUG(("rewrite OK\n"));

		F2WALK(r);
		if (q->needs & REWRITE)
			break;	/* Done here */

		if (lvl <= (shl + shr))
			continue;

		lvl = shl + shr;
		qq = q;
		idx = ixp[i];
		gol = shl;
		gor = shr;
	}

	if (lvl == 10)
		return FFAIL;
	F2DEBUG(("findmops entry %d(%s,%s)\n", idx, srtyp[gol], srtyp[gor]));

	/*
	 * Now we're here and have a match. left is semi-direct and 
	 * right may be anything.
	 */

	sh = -1;
	sh = shswitch(sh, p->n_left, qq->lshape, cookie,
	    qq->rewrite & RLEFT, gol);
	sh = shswitch(sh, r, qq->rshape, cookie, 0, gor);

	if (sh == -1) {
		if (cookie & (FOREFF|FORCC))
			sh = 0;
		else
			sh = ffs(cookie & qq->visit & INREGS)-1;
	}
	F2DEBUG(("findmops done: node %p class %d\n", p, sh));

	/* Trickery:  Set table index on assign to op instead */
	/* gencode() will remove useless nodes */
	p->n_su = MKIDX(idx, 0);
	p->n_flags |= 1; /* XXX tell gencode to reduce the right tree */
	SCLASS(p->n_su, sh);

	return sh;
}
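
findmops() only considers trees in which the assignment destination is structurally equal to the left leg of the right-hand side (the treecmp() call above); that is what makes "a = a + 1" eligible for an in-place instruction. A simplified sketch of that check, using a toy node type and a leaf-only comparison instead of pcc's NODE and treecmp():

/* Hypothetical sketch: recognize "a = a + 1" shaped trees. */
#include <stdio.h>
#include <string.h>

struct node {
	int op;			/* 0 == leaf (named variable), 1 == PLUS */
	const char *name;	/* leaf name, if a leaf */
	struct node *left, *right;
};

static int
leafeq(const struct node *a, const struct node *b)
{
	return a->op == 0 && b->op == 0 && strcmp(a->name, b->name) == 0;
}

int
main(void)
{
	struct node a    = { 0, "a", NULL, NULL };
	struct node one  = { 0, "1", NULL, NULL };
	struct node plus = { 1, NULL, &a, &one };	/* a + 1 */

	/* assignment destination is "a"; RHS left leg is also "a" */
	printf("matches in-place op: %d\n", leafeq(&a, plus.left));
	return 0;
}
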
Example #5
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored in a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;            /* See the explanation below */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
	undesirable_urls = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      /* Enter this_url to the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  string_set_add (undesirable_urls, u->url);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    /* We've exceeded the maximum recursion depth specified by the user. */
    {
      if (opt.page_requisites && depth <= opt.reclevel + 1)
	/* When -p is specified, we can do one more partial recursion from the
	   "leaf nodes" on the HTML document tree.  The recursion is partial in
	   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
	   except for <LINK REL="stylesheet">. */
	dash_p_leaf_HTML = TRUE;
      else
	/* Either -p wasn't specified or it was and we've already gone the one
	   extra (pseudo-)level that it affords us, so we need to bail out. */
	{
	  DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
		   depth, opt.reclevel));
	  --depth;
	  return RECLEVELEXC;
	}
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
			    dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checks of whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded regardless of
	 suffix rules (but that is remedied later with unlink) unless
	 the depth equals the maximum depth.

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 undesirable_urls.  Using it is crucial to avoid unnecessary
	 repeated hits to the hash table.  */
      inl = string_set_contains (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If it is an absolute link and absolute links are not followed,
	 chuck it out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !cur_url->link_relative_p)
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
            subject to acc/rej rules, unless a finite maximum depth has
            been specified and the current depth is the maximum depth. */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
                  && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                      && ((opt.reclevel != INFINITE_RECURSION) &&
			  (depth != opt.reclevel))))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = TOLOWER (*p);
	      xfree (u->url);
	      u->url = str_url (u, 0);
	    }
	  xfree (constr);
	  constr = xstrdup (u->url);
	  string_set_add (undesirable_urls, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		string_set_add (undesirable_urls, constr);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  xfree (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  string_set_add (undesirable_urls, constr);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      xfree (constr);
	      constr = newloc;
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));

	  if (opt.delete_after || (filename && !acceptable (filename)))
	    /* Either --delete-after was specified, or we loaded this otherwise
	       rejected (e.g. by -R) HTML file just so we could harvest its
	       hyperlinks -- in either case, delete the local file. */
	    {
	      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
		       opt.delete_after ? "--delete-after" :
		       "recursive rejection criteria"));
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }

	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      else
	DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}
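
The depth handling near the top of recursive_retrieve() normally bails out once depth exceeds opt.reclevel, but with -p (page requisites) it allows one extra "leaf" pass in which only inline requisites, not ordinary links, are followed. A compact sketch of that decision, using illustrative names rather than Wget's own:

/* Hypothetical sketch of the recursion-depth decision described above. */
#include <stdio.h>

enum sk_action { SK_RECURSE_FULL, SK_RECURSE_LEAF, SK_BAIL_OUT };

static enum sk_action
sk_depth_action(int depth, int reclevel, int infinite, int page_requisites)
{
	if (infinite || depth <= reclevel)
		return SK_RECURSE_FULL;
	if (page_requisites && depth <= reclevel + 1)
		return SK_RECURSE_LEAF;	/* the dash_p_leaf_HTML case */
	return SK_BAIL_OUT;		/* the RECLEVELEXC case */
}

int
main(void)
{
	printf("%d\n", sk_depth_action(3, 2, 0, 1));	/* leaf pass */
	printf("%d\n", sk_depth_action(4, 2, 0, 1));	/* bail out */
	return 0;
}
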
Example #6
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URL-s stored in a linked list of URL-s, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl;
  int this_url_ftp;            /* See the explanation below */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      ulist = add_slist (ulist, this_url, 0);
      urls_downloaded = NULL;
      urls_html = NULL;
      /* Enter this_url to the slist, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  ulist = add_slist (ulist, u->url, 0);
	  urls_downloaded = add_url (urls_downloaded, u->url, file);
	  urls_html = add_slist (urls_html, file, NOSORT);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  /* Bail out if opt.reclevel is exceeded.  */
  if ((opt.reclevel != 0) && (depth > opt.reclevel))
    {
      DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
	       depth, opt.reclevel));
      --depth;
      return RECLEVELEXC;
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file,
			    canon_this_url ? canon_this_url : this_url, 0);

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (opt.quota && (opt.downloaded > opt.quota))
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checks of whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded
	 regardless of suffix rules (but that is remedied later with
	 unlink).

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 ulist.  Using it is crucial to avoid the incessant calls to
	 in_slist, which is quite slow.  */
      inl = in_slist (ulist, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If it is an absolute link and absolute links are not followed,
	 chuck it out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !(cur_url->flags & URELATIVE))
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
	     subject to acc/rej rules.  That's why the `!'.  */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
		   && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = tolower (*p);
	      free (u->url);
	      u->url = str_url (u, 0);
	    }
	  free (constr);
	  constr = xstrdup (u->url);
	  inl = in_slist (ulist, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		ulist = add_slist (ulist, constr, 0);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  free (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  ulist = add_slist (ulist, constr, 0);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      free (constr);
	      constr = newloc;
	    }
	  /* In case of convert_links: If there was no error, add it to
	     the list of downloaded URLs.  We might need it for
	     conversion.  */
	  if (opt.convert_links && filename)
	    {
	      if (dt & RETROKF)
		{
		  urls_downloaded = add_url (urls_downloaded, constr, filename);
		  /* If the URL is HTML, note it.  */
		  if (dt & TEXTHTML)
		    urls_html = add_slist (urls_html, filename, NOSORT);
		}
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));
	  /* If a suffix-rejected file was loaded only because it was HTML,
	     undo the error now.  */
	  if (opt.delete_after || (filename && !acceptable (filename)))
	    {
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }
	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->flags |= UABS2REL;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      else
	DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links)
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  else
    return RETROK;
}
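
Both versions of recursive_retrieve() cache robots.txt rules per host: the forbidden list is refetched only when the current host differs from the one recorded in robots_host. A stripped-down sketch of that caching idiom, with the fetch/parse step stubbed out and names that are illustrative rather than Wget's:

/* Hypothetical sketch of the per-host robots.txt caching idiom. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *sk_robots_host;	/* host the cached rules belong to */

static void
sk_refresh_robots(const char *host)
{
	if (sk_robots_host && strcmp(sk_robots_host, host) == 0)
		return;		/* same host: keep the cached rules */
	free(sk_robots_host);
	sk_robots_host = strdup(host);
	printf("fetching /robots.txt from %s\n", host);
	/* ...retrieve and parse the rules here... */
}

int
main(void)
{
	sk_refresh_robots("www.example.org");
	sk_refresh_robots("www.example.org");	/* cached, no refetch */
	sk_refresh_robots("ftp.example.org");	/* host changed, refetch */
	return 0;
}
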
Example #7
int client::upload()
{
    char file_name[BUFFSIZE] = {0};
    if(m_cmd.size() < 2)
    {
        do
        {
            cout << "file name: ";
            cout.flush();

            cin.getline(file_name, BUFFSIZE - 1);

        }while(file_name[0] == '\0');
    }
    else
    {
        strncpy(file_name, m_cmd[1], BUFFSIZE - 1); // leave room for the terminating NUL
    }

    // Check that the file exists locally.
    if(-1 == access(file_name, F_OK) )
    {
        cout << "no such file" << endl;
        return -1;
    }

    int res = 0;

    if( -1 == send_file_info(file_name) )
    {
        ;// Sending the file info failed; the error is currently ignored.
    }

    char pack_type = acceptable();
    if( is_pack_type(PT_FEXIST, pack_type) )
    {
        while( 1 )
        {
            char choice[BUFFSIZE] = {0};
            cout << "file exist in remote, "
                << "do you want to cover it ? [yes | no]"
                << " >> ";
            cout.flush();

            cin.getline(choice, BUFFSIZE - 1);

            // Default choice is "yes".
            if('\0' == choice[0])
            {
                strcpy(choice, "yes");
            }

            if( is_no(choice) )
            {
                reply(PT_NOCOVER);
                return 0;
            }
            else if( is_yes(choice) )
            {
                reply(PT_COVER);
                break;
            }
            else
            {
                cout << "input error" << endl;
            }
        }
    }

    res = send_file(file_name);

    return res;
}