Example #1
0
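/* Initialize the Graphite infrastructure for the current function.  Bail
   out, freeing CTX, when the function has at most one loop or too many
   basic blocks to be worth analyzing; otherwise reset SCEV, recompute
   dominators, set up the original-copy tables and the CLooG state.  */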
static bool
graphite_initialize (isl_ctx *ctx)
{
  if (number_of_loops (cfun) <= 1
      /* FIXME: This limit on the number of basic blocks of a function
	 should be removed when the SCOP detection is faster.  */
      || n_basic_blocks > PARAM_VALUE (PARAM_GRAPHITE_MAX_BBS_PER_FUNCTION))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	print_global_statistics (dump_file);

      isl_ctx_free (ctx);
      return false;
    }

  scev_reset ();
  recompute_all_dominators ();
  initialize_original_copy_tables ();

  cloog_state = cloog_isl_state_malloc (ctx);

  if (dump_file && dump_flags)
    dump_function_to_file (current_function_decl, dump_file, dump_flags);

  return true;
}
Example #2
0
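/* Issue prefetch instructions for array references in LOOPS.  Returns the
   TODO flags for the pass manager (TODO_cleanup_cfg when some loop was
   unrolled), or 0 when the target has no usable prefetch instruction.  */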
unsigned int
tree_ssa_prefetch_arrays (struct loops *loops)
{
    unsigned i;
    struct loop *loop;
    bool unrolled = false;
    int todo_flags = 0;

    if (!HAVE_prefetch
            /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
               -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
               of processor costs and i486 does not have prefetch, but
               -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
            || PREFETCH_BLOCK == 0)
        return 0;

    initialize_original_copy_tables ();

    if (!built_in_decls[BUILT_IN_PREFETCH])
    {
        tree type = build_function_type (void_type_node,
                                         tree_cons (NULL_TREE,
                                                 const_ptr_type_node,
                                                 NULL_TREE));
        tree decl = lang_hooks.builtin_function ("__builtin_prefetch", type,
                    BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
                    NULL, NULL_TREE);
        DECL_IS_NOVOPS (decl) = true;
        built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

    /* We assume that size of cache line is a power of two, so verify this
       here.  */
    gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

    for (i = loops->num - 1; i > 0; i--)
    {
        loop = loops->parray[i];

        /* Removed loops leave a NULL entry in the array; skip those so
           the dump does not dereference a NULL pointer.  */
        if (loop)
        {
            if (dump_file && (dump_flags & TDF_DETAILS))
                fprintf (dump_file, "Processing loop %d:\n", loop->num);

            unrolled |= loop_prefetch_arrays (loops, loop);
        }

        if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "\n\n");
    }

    if (unrolled)
    {
        scev_reset ();
        todo_flags |= TODO_cleanup_cfg;
    }

    free_original_copy_tables ();
    return todo_flags;
}
Example #3
0
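/* Enter cfglayout mode: set up the original-copy tables, switch to the
   cfglayout RTL CFG hooks, record the effective endpoints of the insn
   chain and run an initial CFG cleanup with the given FLAGS.  */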
void
cfg_layout_initialize (unsigned int flags)
{
  initialize_original_copy_tables ();

  cfg_layout_rtl_register_cfg_hooks ();

  record_effective_endpoints ();

  cleanup_cfg (CLEANUP_CFGLAYOUT | flags);
}
Example #4
0
static bool
split_paths ()
{
  bool changed = false;
  loop_p loop;

  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
  initialize_original_copy_tables ();
  calculate_dominance_info (CDI_DOMINATORS);

  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* Only split paths if we are optimizing this loop for speed.  */
      if (!optimize_loop_for_speed_p (loop))
	continue;

      /* See if there is a block that we can duplicate to split the
	 path to the loop latch.  */
      basic_block bb
	= find_block_to_duplicate_for_splitting_paths (loop->latch);

      /* BB is the merge point for an IF-THEN-ELSE we want to transform.

	 Essentially we want to create a duplicate of bb and redirect the
	 first predecessor of BB to the duplicate (leaving the second
	 predecessor as is).  This will split the path leading to the latch,
	 re-using BB to avoid useless copying.  */
      if (bb && is_feasible_trace (bb))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Duplicating join block %d into predecessor paths\n",
		     bb->index);
	  basic_block pred0 = EDGE_PRED (bb, 0)->src;
	  transform_duplicate (pred0, bb);
	  changed = true;
	}
    }

  loop_optimizer_finalize ();
  free_original_copy_tables ();
  return changed;
}
Example #5
0
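/* Duplicate the tails of hot traces.  Basic blocks are kept in a
   Fibonacci heap keyed on their frequency so the hottest blocks are
   traced first; the code below sets up the per-block insn counts and the
   probability and coverage cutoffs.  */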
static bool
tail_duplicate (void)
{
  auto_vec<fibonacci_node<long, basic_block_def>*> blocks;
  blocks.safe_grow_cleared (last_basic_block_for_fn (cfun));

  basic_block *trace = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  int *counts = XNEWVEC (int, last_basic_block_for_fn (cfun));
  int ninsns = 0, nduplicated = 0;
  gcov_type weighted_insns = 0, traced_insns = 0;
  fibonacci_heap<long, basic_block_def> heap (LONG_MIN);
  gcov_type cover_insns;
  int max_dup_insns;
  basic_block bb;
  bool changed = false;

  /* Create an oversized sbitmap to reduce the chance that we need to
     resize it.  */
  bb_seen = sbitmap_alloc (last_basic_block_for_fn (cfun) * 2);
  bitmap_clear (bb_seen);
  initialize_original_copy_tables ();

  if (profile_info && flag_branch_probabilities)
    probability_cutoff = PARAM_VALUE (TRACER_MIN_BRANCH_PROBABILITY_FEEDBACK);
  else
    probability_cutoff = PARAM_VALUE (TRACER_MIN_BRANCH_PROBABILITY);
  probability_cutoff = REG_BR_PROB_BASE / 100 * probability_cutoff;

  branch_ratio_cutoff =
    (REG_BR_PROB_BASE / 100 * PARAM_VALUE (TRACER_MIN_BRANCH_RATIO));

  FOR_EACH_BB_FN (bb, cfun)
    {
      int n = count_insns (bb);
      if (!ignore_bb_p (bb))
	blocks[bb->index] = heap.insert (-bb->frequency, bb);

      counts [bb->index] = n;
      ninsns += n;
      weighted_insns += n * bb->frequency;
    }
Example #6
0
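/* Duplicate LOOP on its preheader edge.  Returns the new loop, or NULL
   if LOOP does not have a single exit or the duplication failed.  */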
static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  if (!single_exit (loop))
    return NULL;

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
  free_original_copy_tables ();

  if (!res)
    return NULL;

  update_phis_for_loop_copy (loop, res);
  rename_variables_in_loop (res);

  return res;
}
Example #7
0
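/* Try to unroll LOOP completely.  EXIT is the exit edge for which the
   iteration count NITER was computed, UL limits the allowed code growth,
   MAXITER is a recorded upper bound on the number of iterations (negative
   if unknown) and LOCUS is used for the optimization report.  Returns
   true if the loop was unrolled.  */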
static bool
try_unroll_loop_completely (struct loop *loop,
			    edge exit, tree niter,
			    enum unroll_level ul,
			    HOST_WIDE_INT maxiter,
			    location_t locus)
{
  unsigned HOST_WIDE_INT n_unroll = 0, ninsns, unr_insns;
  struct loop_size size;
  bool n_unroll_found = false;
  edge edge_to_cancel = NULL;
  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;

  /* See if we proved the number of iterations to be a low constant.

     EXIT is an edge that will be removed in all but the last iteration of
     the loop.

     EDGE_TO_CANCEL is an edge that will be removed from the last iteration
     of the unrolled sequence and is expected to make the final loop not
     rolling.

     If the number of executions of the loop is determined by a standard
     induction variable test, then EXIT and EDGE_TO_CANCEL are the two
     edges leaving from the IV test.  */
  if (tree_fits_uhwi_p (niter))
    {
      n_unroll = tree_to_uhwi (niter);
      n_unroll_found = true;
      edge_to_cancel = EDGE_SUCC (exit->src, 0);
      if (edge_to_cancel == exit)
	edge_to_cancel = EDGE_SUCC (exit->src, 1);
    }
  /* We do not know the number of iterations and thus we can not eliminate
     the EXIT edge.  */
  else
    exit = NULL;

  /* See if we can improve our estimate by using recorded loop bounds.  */
  if (maxiter >= 0
      && (!n_unroll_found || (unsigned HOST_WIDE_INT)maxiter < n_unroll))
    {
      n_unroll = maxiter;
      n_unroll_found = true;
      /* Loop terminates before the IV variable test, so we can not
	 remove it in the last iteration.  */
      edge_to_cancel = NULL;
    }

  if (!n_unroll_found)
    return false;

  if (n_unroll > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Not unrolling loop %d "
		 "(--param max-completely-peeled-times limit reached).\n",
		 loop->num);
      return false;
    }

  if (!edge_to_cancel)
    edge_to_cancel = loop_edge_to_cancel (loop);

  if (n_unroll)
    {
      sbitmap wont_exit;
      edge e;
      unsigned i;
      bool large;
      vec<edge> to_remove = vNULL;
      if (ul == UL_SINGLE_ITER)
	return false;

      large = tree_estimate_loop_size
		 (loop, exit, edge_to_cancel, &size,
		  PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS));
      ninsns = size.overall;
      if (large)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Not unrolling loop %d: it is too large.\n",
		     loop->num);
	  return false;
	}

      unr_insns = estimated_unrolled_size (&size, n_unroll);
      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
	  fprintf (dump_file, "  Estimated size after unrolling: %d\n",
		   (int) unr_insns);
	}

      /* If the code is going to shrink, we don't need to be extra cautious
	 on guessing if the unrolling is going to be profitable.  */
      if (unr_insns
          /* If there is an IV variable that will become constant, we save
             one instruction in the loop prologue that we do not account
             for otherwise.  */
	  <= ninsns + (size.constant_iv != false))
	;
      /* We unroll only inner loops, because we do not consider it profitable
	 otherwise.  We can still cancel the loopback edge of a non-rolling
	 loop; this is always a good idea.  */
      else if (ul == UL_NO_GROWTH)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Not unrolling loop %d: size would grow.\n",
		     loop->num);
	  return false;
	}
      /* Outer loops tend to be less interesting candidates for complete
	 unrolling unless we can do a lot of propagation into the inner loop
	 body.  For now we disable outer loop unrolling when the code would
	 grow.  */
      else if (loop->inner)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Not unrolling loop %d: "
		     "it is not innermost and code would grow.\n",
		     loop->num);
	  return false;
	}
      /* If there is a call on a hot path through the loop, then
	 there is most probably not much to optimize.  */
      else if (size.num_non_pure_calls_on_hot_path)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Not unrolling loop %d: "
		     "contains call and code would grow.\n",
		     loop->num);
	  return false;
	}
      /* If there is a pure/const call in the function, then we
	 can still optimize the unrolled loop body if it contains
	 some other interesting code than the calls and the code
	 storing or accumulating the return value.  */
      else if (size.num_pure_calls_on_hot_path
	       /* One IV increment, one test, one ivtmp store
		  and one useful stmt.  That is about the minimal loop
		  doing a pure call.  */
	       && (size.non_call_stmts_on_hot_path
		   <= 3 + size.num_pure_calls_on_hot_path))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Not unrolling loop %d: "
		     "contains just pure calls and code would grow.\n",
		     loop->num);
	  return false;
	}
      /* Complete unrolling is a major win when control flow is removed and
	 one big basic block is created.  If the loop contains control flow,
	 the optimization may still be a win because it eliminates the loop
	 overhead, but it also may blow the branch predictor tables.
	 Limit the number of branches on the hot path through the peeled
	 sequence.  */
      else if (size.num_branches_on_hot_path * (int)n_unroll
	       > PARAM_VALUE (PARAM_MAX_PEEL_BRANCHES))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Not unrolling loop %d: "
		     " number of branches on hot path in the unrolled sequence"
		     " reach --param max-peel-branches limit.\n",
		     loop->num);
	  return false;
	}
      else if (unr_insns
	       > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Not unrolling loop %d: "
		     "(--param max-completely-peeled-insns limit reached).\n",
		     loop->num);
	  return false;
	}
      dump_printf_loc (report_flags, locus,
                       "loop turned into non-loop; it never loops.\n");

      initialize_original_copy_tables ();
      wont_exit = sbitmap_alloc (n_unroll + 1);
      bitmap_ones (wont_exit);
      bitmap_clear_bit (wont_exit, 0);

      if (!gimple_duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
						 n_unroll, wont_exit,
						 exit, &to_remove,
						 DLTHE_FLAG_UPDATE_FREQ
						 | DLTHE_FLAG_COMPLETTE_PEEL))
	{
          free_original_copy_tables ();
	  free (wont_exit);
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Failed to duplicate the loop\n");
	  return false;
	}

      FOR_EACH_VEC_ELT (to_remove, i, e)
	{
	  bool ok = remove_path (e);
	  gcc_assert (ok);
	}

      to_remove.release ();
      free (wont_exit);
      free_original_copy_tables ();
    }
Example #8
0
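/* Trace-based tail duplication: repeatedly take the hottest unvisited
   basic block, grow a trace from it, and duplicate the blocks on the
   trace that have multiple predecessors so the trace becomes a
   single-entry sequence.  Returns true if any block was duplicated.  */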
static bool
tail_duplicate (void)
{
  fibnode_t *blocks = XCNEWVEC (fibnode_t, last_basic_block);
  basic_block *trace = XNEWVEC (basic_block, n_basic_blocks);
  int *counts = XNEWVEC (int, last_basic_block);
  int ninsns = 0, nduplicated = 0;
  gcov_type weighted_insns = 0, traced_insns = 0;
  fibheap_t heap = fibheap_new ();
  gcov_type cover_insns;
  int max_dup_insns;
  basic_block bb;
  bool changed = false;

  /* Create an oversized sbitmap to reduce the chance that we need to
     resize it.  */
  bb_seen = sbitmap_alloc (last_basic_block * 2);
  bitmap_clear (bb_seen);
  initialize_original_copy_tables ();

  if (profile_info && flag_branch_probabilities)
    probability_cutoff = PARAM_VALUE (TRACER_MIN_BRANCH_PROBABILITY_FEEDBACK);
  else
    probability_cutoff = PARAM_VALUE (TRACER_MIN_BRANCH_PROBABILITY);
  probability_cutoff = REG_BR_PROB_BASE / 100 * probability_cutoff;

  branch_ratio_cutoff =
    (REG_BR_PROB_BASE / 100 * PARAM_VALUE (TRACER_MIN_BRANCH_RATIO));

  FOR_EACH_BB (bb)
    {
      int n = count_insns (bb);
      if (!ignore_bb_p (bb))
	blocks[bb->index] = fibheap_insert (heap, -bb->frequency,
					    bb);

      counts [bb->index] = n;
      ninsns += n;
      weighted_insns += n * bb->frequency;
    }

  if (profile_info && flag_branch_probabilities)
    cover_insns = PARAM_VALUE (TRACER_DYNAMIC_COVERAGE_FEEDBACK);
  else
    cover_insns = PARAM_VALUE (TRACER_DYNAMIC_COVERAGE);
  cover_insns = (weighted_insns * cover_insns + 50) / 100;
  max_dup_insns = (ninsns * PARAM_VALUE (TRACER_MAX_CODE_GROWTH) + 50) / 100;

  while (traced_insns < cover_insns && nduplicated < max_dup_insns
         && !fibheap_empty (heap))
    {
      basic_block bb = (basic_block) fibheap_extract_min (heap);
      int n, pos;

      if (!bb)
	break;

      blocks[bb->index] = NULL;

      if (ignore_bb_p (bb))
	continue;
      gcc_assert (!bb_seen_p (bb));

      n = find_trace (bb, trace);

      bb = trace[0];
      traced_insns += bb->frequency * counts [bb->index];
      if (blocks[bb->index])
	{
	  fibheap_delete_node (heap, blocks[bb->index]);
	  blocks[bb->index] = NULL;
	}

      for (pos = 1; pos < n; pos++)
	{
	  basic_block bb2 = trace[pos];

	  if (blocks[bb2->index])
	    {
	      fibheap_delete_node (heap, blocks[bb2->index]);
	      blocks[bb2->index] = NULL;
	    }
	  traced_insns += bb2->frequency * counts [bb2->index];
	  if (EDGE_COUNT (bb2->preds) > 1
	      && can_duplicate_block_p (bb2)
	      /* We have the tendency to duplicate the loop header
	         of all do { } while loops.  Do not do that - it is
		 not profitable and it might create a loop with multiple
		 entries or at least rotate the loop.  */
	      && (!current_loops
		  || bb2->loop_father->header != bb2))
	    {
	      edge e;
	      basic_block copy;

	      nduplicated += counts [bb2->index];

	      e = find_edge (bb, bb2);

	      copy = duplicate_block (bb2, e, bb);
	      flush_pending_stmts (e);

	      add_phi_args_after_copy (&copy, 1, NULL);

	      /* Reconsider the original copy of the block we've duplicated.
	         Removing the most common predecessor may make it a trace
	         head again.  */
	      blocks[bb2->index] =
		fibheap_insert (heap, -bb2->frequency, bb2);

	      if (dump_file)
		fprintf (dump_file, "Duplicated %i as %i [%i]\n",
			 bb2->index, copy->index, copy->frequency);

	      bb2 = copy;
	      changed = true;
	    }
	  mark_bb_seen (bb2);
	  bb = bb2;
	  /* In case the trace became infrequent, stop duplicating.  */
	  if (ignore_bb_p (bb))
	    break;
	}
      if (dump_file)
	fprintf (dump_file, " covered now %.1f\n\n",
		 traced_insns * 100.0 / weighted_insns);
    }
  if (dump_file)
    fprintf (dump_file, "Duplicated %i insns (%i%%)\n", nduplicated,
	     nduplicated * 100 / ninsns);

  free_original_copy_tables ();
  sbitmap_free (bb_seen);
  free (blocks);
  free (trace);
  free (counts);
  fibheap_delete (heap);

  return changed;
}
Example #9
0
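/* Look for IF-THEN-ELSE join blocks on the path to a loop latch and
   duplicate them into their predecessors, splitting the paths through
   the diamond.  Returns true if the CFG changed.  */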
static bool
split_paths ()
{
    bool changed = false;
    loop_p loop;

    loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
    initialize_original_copy_tables ();
    calculate_dominance_info (CDI_DOMINATORS);

    FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
        /* Only split paths if we are optimizing this loop for speed.  */
        if (!optimize_loop_for_speed_p (loop))
            continue;

        /* See if there is a block that we can duplicate to split the
        path to the loop latch.  */
        basic_block bb
            = find_block_to_duplicate_for_splitting_paths (loop->latch);

        /* BB is the merge point for an IF-THEN-ELSE we want to transform.

        Essentially we want to create a duplicate of bb and redirect the
         first predecessor of BB to the duplicate (leaving the second
         predecessor as is).  This will split the path leading to the latch,
         re-using BB to avoid useless copying.  */
        if (bb && is_feasible_trace (bb))
        {
            if (dump_file && (dump_flags & TDF_DETAILS))
                fprintf (dump_file,
                         "Duplicating join block %d into predecessor paths\n",
                         bb->index);
            basic_block pred0 = EDGE_PRED (bb, 0)->src;
            transform_duplicate (pred0, bb);
            changed = true;

            /* If BB has an outgoing edge marked as IRREDUCIBLE, then
               duplicating BB may result in an irreducible region turning
               into a natural loop.

               Long term we might want to hook this into the block
               duplication code, but as we've seen with similar changes
               for edge removal, that can be somewhat risky.  */
            if (EDGE_SUCC (bb, 0)->flags & EDGE_IRREDUCIBLE_LOOP
                    || EDGE_SUCC (bb, 1)->flags & EDGE_IRREDUCIBLE_LOOP)
            {
                if (dump_file && (dump_flags & TDF_DETAILS))
                    fprintf (dump_file,
                             "Join block %d has EDGE_IRREDUCIBLE_LOOP set.  "
                             "Scheduling loop fixups.\n",
                             bb->index);
                loops_state_set (LOOPS_NEED_FIXUP);
            }
        }
    }

    loop_optimizer_finalize ();
    free_original_copy_tables ();
    return changed;
}
Example #10
0
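/* Issue prefetch instructions for array references in the loops of the
   current function.  Returns TODO_cleanup_cfg when prefetching caused a
   loop to be unrolled, 0 otherwise.  */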
unsigned int
tree_ssa_prefetch_arrays (void)
{
  loop_iterator li;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
	 of processor costs and i486 does not have prefetch, but
	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
	       SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
	       L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = add_builtin_function ("__builtin_prefetch", type,
					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
					NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that size of cache line is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}