예제 #1
0
		// Transforms a vector given in the local frame (sn, tn, _normal) into world space.
		// The frame is rebuilt from _normal and a fixed helper direction; world_to_local
		// uses the very same helper, which keeps both transforms consistent.
		glm::vec3 BSDF::local_to_world(glm::vec3 local) const {
			const glm::vec3 helper_dir(0.85577678, 0.503440032, 0.119139549);
			const glm::vec3 tangent = glm::normalize(glm::cross(_normal, helper_dir));
			const glm::vec3 bitangent = glm::normalize(glm::cross(_normal, tangent));

			// Weighted sum of the three basis vectors: x -> bitangent, y -> tangent, z -> normal.
			return bitangent * local.x + tangent * local.y + _normal * local.z;
		}
예제 #2
0
		// Expresses a world-space vector in the local frame (sn, tn, _normal).
		// The fixed helper direction below is only used to derive two tangent axes from
		// _normal; local_to_world must use the same constant for the mapping to round-trip.
		glm::vec3 BSDF::world_to_local(glm::vec3 world) const {
			const glm::vec3 helper_dir(0.85577678, 0.503440032, 0.119139549);
			const glm::vec3 tangent = glm::normalize(glm::cross(_normal, helper_dir));
			const glm::vec3 bitangent = glm::normalize(glm::cross(_normal, tangent));

			// Project the input onto each axis of the frame.
			const float along_bitangent = glm::dot(world, bitangent);
			const float along_tangent = glm::dot(world, tangent);
			const float along_normal = glm::dot(world, _normal);

			return glm::vec3(along_bitangent, along_tangent, along_normal);
		}
예제 #3
0
    /**
     * Builds randomly perturbed copies of the given configurations.
     *
     * The matrix rows of all configurations are stacked and tiled no_of_points times;
     * each of the six columns is then shifted by a random integer multiple of that
     * column's step, where a step is the corresponding net step divided by
     * delta_factor^level.
     */
    vector<MatchConfig> FAsTMatch::randomExpandConfigs( vector<MatchConfig>& configs, MatchNet& net, int level,
                                                int no_of_points, float delta_factor ) {

        const int config_count = static_cast<int>( configs.size() );
        const int total_rows   = no_of_points * config_count;

        /* Per-parameter step sizes, scaled down by delta_factor^level */
        const float shrink  = pow(delta_factor, level);
        const float step_tx = net.stepsTransX / shrink;
        const float step_ty = net.stepsTransY / shrink;
        const float step_r  = net.stepsRotate / shrink;
        const float step_s  = net.stepsScale / shrink;

        /* Random offsets: normally distributed (mean 0, sigma 0.5) values written into
           a 32-bit integer matrix, then converted to float for the arithmetic below */
        Mat offsets( total_rows, 6, CV_32SC1 );
        rng.fill( offsets, RNG::NORMAL, 0, 0.5 );
        offsets.convertTo( offsets, CV_32FC1 );

        /* Collect the matrix form of every configuration */
        vector<Mat> rows;
        rows.reserve( config_count );
        for( MatchConfig& cfg : configs )
            rows.push_back( cfg.asMatrix() );

        /* Stack them vertically and tile the stack no_of_points times */
        Mat stacked;
        vconcat( rows, stacked );
        stacked = repeat( stacked, no_of_points, 1 );

        /* One step per column, replicated for every row of the stacked matrix */
        vector<float> column_steps = {
            step_tx, step_ty, step_r, step_s, step_s, step_r
        };
        Mat step_grid = repeat( Mat(column_steps).t(), total_rows, 1 );

        /* Original values plus the scaled random offsets */
        Mat perturbed = stacked + offsets.mul( step_grid );

        return MatchConfig::fromMatrix( perturbed );
    }
/**
 * Generate, JIT-compile and run one src_type -> dst_type conversion test.
 *
 * Random source vectors are converted by the generated function and the
 * result is compared, within an epsilon, against a reference produced with
 * read_vec()/write_vec(). The cycle count of every call is sampled and an
 * outlier-filtered average is written to the TSV file.
 *
 * @param verbose   verbosity: >= 1 dumps the type pair up front, >= 3 also
 *                  dumps the vectors of passing samples
 * @param fp        TSV output file, or NULL to skip the TSV row
 * @param src_type  source vector type
 * @param dst_type  destination vector type
 * @return TRUE when the combination is skipped or every sample matched,
 *         FALSE as soon as one sample mismatches
 */
PIPE_ALIGN_STACK
static boolean
test_one(unsigned verbose,
         FILE *fp,
         struct lp_type src_type,
         struct lp_type dst_type)
{
   struct gallivm_state *gallivm;
   LLVMValueRef func = NULL;
   conv_test_ptr_t conv_test_ptr;
   boolean success;
   const unsigned n = LP_TEST_NUM_SAMPLES;
   int64_t cycles[LP_TEST_NUM_SAMPLES];
   double cycles_avg = 0.0;
   unsigned num_samples;
   unsigned num_srcs;
   unsigned num_dsts;
   double eps;
   unsigned i, j;

   /* Skip pairs where the side that is at least as wide also has more elements */
   if ((src_type.width >= dst_type.width && src_type.length > dst_type.length) ||
       (src_type.width <= dst_type.width && src_type.length < dst_type.length)) {
      return TRUE;
   }

   /* Known failures
    * - fixed point 32 -> float 32
    * - float 32 -> signed normalised integer 32
    */
   if ((src_type.floating && !dst_type.floating && dst_type.sign && dst_type.norm && src_type.width == dst_type.width) ||
       (!src_type.floating && dst_type.floating && src_type.fixed && src_type.width == dst_type.width)) {
      return TRUE;
   }

   if(verbose >= 1)
      dump_conv_types(stderr, src_type, dst_type);

   /* The shorter side needs several vectors to carry the same element count */
   if (src_type.length > dst_type.length) {
      num_srcs = 1;
      num_dsts = src_type.length/dst_type.length;
   }
   else if (src_type.length < dst_type.length) {
      num_dsts = 1;
      num_srcs = dst_type.length/src_type.length;
   }
   else  {
      num_dsts = 1;
      num_srcs = 1;
   }

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));

   gallivm = gallivm_create();

   func = add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts);

   gallivm_compile_module(gallivm);

   conv_test_ptr = (conv_test_ptr_t)gallivm_jit_function(gallivm, func);

   success = TRUE;
   for(i = 0; i < n && success; ++i) {
      unsigned src_stride = src_type.length*src_type.width/8;
      unsigned dst_stride = dst_type.length*dst_type.width/8;
      PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      int64_t start_counter = 0;
      int64_t end_counter = 0;

      /* Random inputs, read back as doubles to serve as the reference values */
      for(j = 0; j < num_srcs; ++j) {
         random_vec(src_type, src + j*src_stride);
         read_vec(src_type, src + j*src_stride, fref + j*src_type.length);
      }

      /* Expected output: the reference values written in the destination type */
      for(j = 0; j < num_dsts; ++j) {
         write_vec(dst_type, ref + j*dst_stride, fref + j*dst_type.length);
      }

      start_counter = rdtsc();
      conv_test_ptr(src, dst);
      end_counter = rdtsc();

      cycles[i] = end_counter - start_counter;

      for(j = 0; j < num_dsts; ++j) {
         if(!compare_vec_with_eps(dst_type, dst + j*dst_stride, ref + j*dst_stride, eps))
            success = FALSE;
      }

      if (!success || verbose >= 3) {
         if(verbose < 1)
            dump_conv_types(stderr, src_type, dst_type);
         if (success) {
            fprintf(stderr, "PASS\n");
         }
         else {
            fprintf(stderr, "MISMATCH\n");
         }

         for(j = 0; j < num_srcs; ++j) {
            fprintf(stderr, "  Src%u: ", j);
            dump_vec(stderr, src_type, src + j*src_stride);
            fprintf(stderr, "\n");
         }

#if 1
         fprintf(stderr, "  Ref: ");
         for(j = 0; j < src_type.length*num_srcs; ++j)
            fprintf(stderr, " %f", fref[j]);
         fprintf(stderr, "\n");
#endif

         for(j = 0; j < num_dsts; ++j) {
            fprintf(stderr, "  Dst%u: ", j);
            dump_vec(stderr, dst_type, dst + j*dst_stride);
            fprintf(stderr, "\n");

            fprintf(stderr, "  Ref%u: ", j);
            dump_vec(stderr, dst_type, ref + j*dst_stride);
            fprintf(stderr, "\n");
         }
      }
   }

   /*
    * The loop stops at the first mismatch, so only the first i entries of
    * cycles[] hold a measurement.
    */
   num_samples = i;

   /*
    * Unfortunately the output of cycle counter is not very reliable as it comes
    * -- sometimes we get outliers (due IRQs perhaps?) which are
    * better removed to avoid random or biased data.
    */
   {
      double sum = 0.0, sum2 = 0.0;
      unsigned m = 0;

      for(i = 0; i < num_samples; ++i) {
         /* Accumulate in double: squaring an int64_t could overflow */
         const double c = (double)cycles[i];
         sum += c;
         sum2 += c*c;
      }

      if (num_samples > 0) {
         const double avg = sum/num_samples;
         const double var = (sum2 - num_samples*avg*avg)/num_samples;
         /* Rounding can push a near-zero variance slightly below zero */
         const double std_dev = var > 0.0 ? sqrt(var) : 0.0;

         /* Average only the samples within four standard deviations */
         sum = 0.0;
         for(i = 0; i < num_samples; ++i) {
            const double c = (double)cycles[i];
            if(fabs(c - avg) <= 4.0*std_dev) {
               sum += c;
               ++m;
            }
         }

         /* Fall back to the plain mean if the filter rejected every sample */
         cycles_avg = m ? sum/m : avg;
      }
   }

   if(fp)
      write_tsv_row(fp, src_type, dst_type, cycles_avg, success);

   gallivm_free_function(gallivm, func, conv_test_ptr);

   gallivm_destroy(gallivm);

   return success;
}
예제 #5
0
/**
 * Generate, JIT-compile and run one blend test for the given state and type.
 *
 * Random source, destination and constant-colour vectors are blended by the
 * generated function and compared with a reference computed in doubles by
 * compute_blend_ref(). The cycle count of every call is sampled and an
 * outlier-filtered average is written to the TSV file.
 *
 * @param verbose  verbosity: >= 1 dumps the blend state and type up front
 * @param fp       TSV output file, or NULL to skip the TSV row
 * @param blend    blend state under test
 * @param type     vector type under test
 * @return TRUE when every sample matched, FALSE at the first mismatch
 */
PIPE_ALIGN_STACK
static boolean
test_one(unsigned verbose,
         FILE *fp,
         const struct pipe_blend_state *blend,
         struct lp_type type)
{
    struct gallivm_state *gallivm;
    LLVMValueRef func = NULL;
    blend_test_ptr_t blend_test_ptr;
    boolean success;
    const unsigned n = LP_TEST_NUM_SAMPLES;
    int64_t cycles[LP_TEST_NUM_SAMPLES];
    double cycles_avg = 0.0;
    unsigned num_samples;
    unsigned i, j;
    const unsigned stride = lp_type_width(type)/8;

    if(verbose >= 1)
        dump_blend_type(stdout, blend, type);

    gallivm = gallivm_create();

    func = add_blend_test(gallivm, blend, type);

    gallivm_compile_module(gallivm);

    blend_test_ptr = (blend_test_ptr_t)gallivm_jit_function(gallivm, func);

    success = TRUE;

    {
        /* One vector each, allocated with size and alignment equal to stride */
        uint8_t *src, *dst, *con, *res, *ref;
        src = align_malloc(stride, stride);
        dst = align_malloc(stride, stride);
        con = align_malloc(stride, stride);
        res = align_malloc(stride, stride);
        ref = align_malloc(stride, stride);

        for(i = 0; i < n && success; ++i) {
            int64_t start_counter = 0;
            int64_t end_counter = 0;

            random_vec(type, src);
            random_vec(type, dst);
            random_vec(type, con);

            {
                double fsrc[LP_MAX_VECTOR_LENGTH];
                double fdst[LP_MAX_VECTOR_LENGTH];
                double fcon[LP_MAX_VECTOR_LENGTH];
                double fref[LP_MAX_VECTOR_LENGTH];

                read_vec(type, src, fsrc);
                read_vec(type, dst, fdst);
                read_vec(type, con, fcon);

                /* The reference is computed on groups of four elements */
                for(j = 0; j < type.length; j += 4)
                    compute_blend_ref(blend, fsrc + j, fdst + j, fcon + j, fref + j);

                write_vec(type, ref, fref);
            }

            start_counter = rdtsc();
            blend_test_ptr(src, dst, con, res);
            end_counter = rdtsc();

            cycles[i] = end_counter - start_counter;

            if(!compare_vec(type, res, ref)) {
                success = FALSE;

                if(verbose < 1)
                    dump_blend_type(stderr, blend, type);
                fprintf(stderr, "MISMATCH\n");

                fprintf(stderr, "  Src: ");
                dump_vec(stderr, type, src);
                fprintf(stderr, "\n");

                fprintf(stderr, "  Dst: ");
                dump_vec(stderr, type, dst);
                fprintf(stderr, "\n");

                fprintf(stderr, "  Con: ");
                dump_vec(stderr, type, con);
                fprintf(stderr, "\n");

                fprintf(stderr, "  Res: ");
                dump_vec(stderr, type, res);
                fprintf(stderr, "\n");

                fprintf(stderr, "  Ref: ");
                dump_vec(stderr, type, ref);
                fprintf(stderr, "\n");
            }
        }

        /*
         * The loop stops at the first mismatch, so only the first i entries
         * of cycles[] hold a measurement.
         */
        num_samples = i;

        align_free(src);
        align_free(dst);
        align_free(con);
        align_free(res);
        align_free(ref);
    }

    /*
     * Unfortunately the output of cycle counter is not very reliable as it comes
     * -- sometimes we get outliers (due IRQs perhaps?) which are
     * better removed to avoid random or biased data.
     */
    {
        double sum = 0.0, sum2 = 0.0;
        unsigned m = 0;

        for(i = 0; i < num_samples; ++i) {
            /* Accumulate in double: squaring an int64_t could overflow */
            const double c = (double)cycles[i];
            sum += c;
            sum2 += c*c;
        }

        if (num_samples > 0) {
            const double avg = sum/num_samples;
            const double var = (sum2 - num_samples*avg*avg)/num_samples;
            /* Rounding can push a near-zero variance slightly below zero */
            const double std_dev = var > 0.0 ? sqrt(var) : 0.0;

            /* Average only the samples within four standard deviations */
            sum = 0.0;
            for(i = 0; i < num_samples; ++i) {
                const double c = (double)cycles[i];
                if(fabs(c - avg) <= 4.0*std_dev) {
                    sum += c;
                    ++m;
                }
            }

            /* Fall back to the plain mean if the filter rejected every sample */
            cycles_avg = m ? sum/m : avg;
        }
    }

    if(fp)
        write_tsv_row(fp, blend, type, cycles_avg, success);

    gallivm_free_function(gallivm, func, blend_test_ptr);

    gallivm_destroy(gallivm);

    return success;
}
예제 #6
0
ALIGN_STACK
static boolean
test_one(unsigned verbose,
         FILE *fp,
         const struct pipe_blend_state *blend,
         enum vector_mode mode,
         struct lp_type type)
{
   LLVMModuleRef module = NULL;
   LLVMValueRef func = NULL;
   LLVMExecutionEngineRef engine = NULL;
   LLVMModuleProviderRef provider = NULL;
   LLVMPassManagerRef pass = NULL;
   char *error = NULL;
   blend_test_ptr_t blend_test_ptr;
   boolean success;
   const unsigned n = LP_TEST_NUM_SAMPLES;
   int64_t cycles[LP_TEST_NUM_SAMPLES];
   double cycles_avg = 0.0;
   unsigned i, j;

   if(verbose >= 1)
      dump_blend_type(stdout, blend, mode, type);

   module = LLVMModuleCreateWithName("test");

   func = add_blend_test(module, blend, mode, type);

   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
      LLVMDumpModule(module);
      abort();
   }
   LLVMDisposeMessage(error);

   provider = LLVMCreateModuleProviderForExistingModule(module);
   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
      if(verbose < 1)
         dump_blend_type(stderr, blend, mode, type);
      fprintf(stderr, "%s\n", error);
      LLVMDisposeMessage(error);
      abort();
   }

#if 0
   pass = LLVMCreatePassManager();
   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
    * but there are more on SVN. */
   LLVMAddConstantPropagationPass(pass);
   LLVMAddInstructionCombiningPass(pass);
   LLVMAddPromoteMemoryToRegisterPass(pass);
   LLVMAddGVNPass(pass);
   LLVMAddCFGSimplificationPass(pass);
   LLVMRunPassManager(pass, module);
#else
   (void)pass;
#endif

   if(verbose >= 2)
      LLVMDumpModule(module);

   blend_test_ptr = (blend_test_ptr_t)LLVMGetPointerToGlobal(engine, func);

   if(verbose >= 2)
      lp_disassemble(blend_test_ptr);

   success = TRUE;
   for(i = 0; i < n && success; ++i) {
      if(mode == AoS) {
         ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
         int64_t start_counter = 0;
         int64_t end_counter = 0;

         random_vec(type, src);
         random_vec(type, dst);
         random_vec(type, con);

         {
            double fsrc[LP_MAX_VECTOR_LENGTH];
            double fdst[LP_MAX_VECTOR_LENGTH];
            double fcon[LP_MAX_VECTOR_LENGTH];
            double fref[LP_MAX_VECTOR_LENGTH];

            read_vec(type, src, fsrc);
            read_vec(type, dst, fdst);
            read_vec(type, con, fcon);

            for(j = 0; j < type.length; j += 4)
               compute_blend_ref(blend, fsrc + j, fdst + j, fcon + j, fref + j);

            write_vec(type, ref, fref);
         }

         start_counter = rdtsc();
         blend_test_ptr(src, dst, con, res);
         end_counter = rdtsc();

         cycles[i] = end_counter - start_counter;

         if(!compare_vec(type, res, ref)) {
            success = FALSE;

            if(verbose < 1)
               dump_blend_type(stderr, blend, mode, type);
            fprintf(stderr, "MISMATCH\n");

            fprintf(stderr, "  Src: ");
            dump_vec(stderr, type, src);
            fprintf(stderr, "\n");

            fprintf(stderr, "  Dst: ");
            dump_vec(stderr, type, dst);
            fprintf(stderr, "\n");

            fprintf(stderr, "  Con: ");
            dump_vec(stderr, type, con);
            fprintf(stderr, "\n");

            fprintf(stderr, "  Res: ");
            dump_vec(stderr, type, res);
            fprintf(stderr, "\n");

            fprintf(stderr, "  Ref: ");
            dump_vec(stderr, type, ref);
            fprintf(stderr, "\n");
         }
      }

      if(mode == SoA) {
         const unsigned stride = type.length*type.width/8;
         ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
         ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
         int64_t start_counter = 0;
         int64_t end_counter = 0;
         boolean mismatch;

         for(j = 0; j < 4; ++j) {
            random_vec(type, src + j*stride);
            random_vec(type, dst + j*stride);
            random_vec(type, con + j*stride);
         }

         {
            double fsrc[4];
            double fdst[4];
            double fcon[4];
            double fref[4];
            unsigned k;

            for(k = 0; k < type.length; ++k) {
               for(j = 0; j < 4; ++j) {
                  fsrc[j] = read_elem(type, src + j*stride, k);
                  fdst[j] = read_elem(type, dst + j*stride, k);
                  fcon[j] = read_elem(type, con + j*stride, k);
               }

               compute_blend_ref(blend, fsrc, fdst, fcon, fref);

               for(j = 0; j < 4; ++j)
                  write_elem(type, ref + j*stride, k, fref[j]);
            }
         }

         start_counter = rdtsc();
         blend_test_ptr(src, dst, con, res);
         end_counter = rdtsc();

         cycles[i] = end_counter - start_counter;

         mismatch = FALSE;
         for (j = 0; j < 4; ++j)
            if(!compare_vec(type, res + j*stride, ref + j*stride))
               mismatch = TRUE;

         if (mismatch) {
            success = FALSE;

            if(verbose < 1)
               dump_blend_type(stderr, blend, mode, type);
            fprintf(stderr, "MISMATCH\n");
            for(j = 0; j < 4; ++j) {
               char channel = "RGBA"[j];
               fprintf(stderr, "  Src%c: ", channel);
               dump_vec(stderr, type, src + j*stride);
               fprintf(stderr, "\n");

               fprintf(stderr, "  Dst%c: ", channel);
               dump_vec(stderr, type, dst + j*stride);
               fprintf(stderr, "\n");

               fprintf(stderr, "  Con%c: ", channel);
               dump_vec(stderr, type, con + j*stride);
               fprintf(stderr, "\n");

               fprintf(stderr, "  Res%c: ", channel);
               dump_vec(stderr, type, res + j*stride);
               fprintf(stderr, "\n");

               fprintf(stderr, "  Ref%c: ", channel);
               dump_vec(stderr, type, ref + j*stride);
               fprintf(stderr, "\n");
            }
         }
      }
   }

   /*
    * Unfortunately the output of cycle counter is not very reliable as it comes
    * -- sometimes we get outliers (due IRQs perhaps?) which are
    * better removed to avoid random or biased data.
    */
   {
      double sum = 0.0, sum2 = 0.0;
      double avg, std;
      unsigned m;

      for(i = 0; i < n; ++i) {
         sum += cycles[i];
         sum2 += cycles[i]*cycles[i];
      }

      avg = sum/n;
      std = sqrtf((sum2 - n*avg*avg)/n);

      m = 0;
      sum = 0.0;
      for(i = 0; i < n; ++i) {
         if(fabs(cycles[i] - avg) <= 4.0*std) {
            sum += cycles[i];
            ++m;
         }
      }

      cycles_avg = sum/m;

   }

   if(fp)
      write_tsv_row(fp, blend, mode, type, cycles_avg, success);

   if (!success) {
      if(verbose < 2)
         LLVMDumpModule(module);
      LLVMWriteBitcodeToFile(module, "blend.bc");
      fprintf(stderr, "blend.bc written\n");
      fprintf(stderr, "Invoke as \"llc -o - blend.bc\"\n");
      abort();
   }

   LLVMFreeMachineCodeForFunction(engine, func);

   LLVMDisposeExecutionEngine(engine);
   if(pass)
      LLVMDisposePassManager(pass);

   return success;
}
예제 #7
0
파일: axpy.c 프로젝트: g-koutsou/LAP2015
/*
 * Main
 */
/*
 * Main
 *
 * Expects two command-line arguments, both strictly positive decimal
 * integers: the vector length L and the number of timing repetitions nreps.
 * Times axpy() on random single-precision complex vectors, doubling the
 * number of calls per timing until the spread of the timings drops below
 * 1/25 of their mean, then prints time per call, flop rate and bandwidth.
 * Exits with 1 on a wrong argument count and 2 on an invalid argument.
 */
int
main(int argc, char *argv[])
{
  if(argc != 3) {
    usage(argv);
    exit(1);
  }

  /*
   * Reject empty input, trailing garbage, zero, negative values and values
   * that do not survive the conversion to int.
   */
  char *e;
  unsigned long parsed = strtoul(argv[1], &e, 10);
  int L = (int)parsed;
  if(e == argv[1] || *e != '\0' || L <= 0 || (unsigned long)L != parsed) {
    usage(argv);
    exit(2);
  }

  /* nreps is used as a divisor below, so it must be positive as well */
  parsed = strtoul(argv[2], &e, 10);
  int nreps = (int)parsed;
  if(e == argv[2] || *e != '\0' || nreps <= 0 || (unsigned long)nreps != parsed) {
    usage(argv);
    exit(2);
  }

  _Complex float *x = alloc(sizeof(_Complex float)*L);
  _Complex float *y = alloc(sizeof(_Complex float)*L);
  _Complex float a;

  random_vec(L, x);
  random_vec(L, y);
  random_vec(1, &a);

  /* One untimed call before the measurements start */
  axpy(L, a, x, y);
  int nreps_inner = 2;
  double tave = 0;
  double tvar = 0;
  for(;;) {
    tave = 0;
    tvar = 0;
    for(int i=0; i<nreps; i++)
      {
        double t0 = stop_watch(0);
        for(int j=0; j<nreps_inner; j++)
          axpy(L, a, x, y);
        /* Time of a single call, averaged over the inner repetitions */
        t0 = stop_watch(t0)/nreps_inner;
        tave += t0;
        tvar += t0*t0;
      }
    tave /= (double)nreps;
    tvar /= (double)nreps;
    /*
     * From here on tvar holds the standard deviation. Rounding can make the
     * variance slightly negative; clamp it so sqrt() never yields NaN, which
     * would make the comparison below false forever.
     */
    double variance = tvar - tave*tave;
    tvar = variance > 0 ? sqrt(variance) : 0;
    if(tvar < tave/25)
      break;
    nreps_inner = nreps_inner*2;
  }

  /*
     Print:
     1) Time per kernel call with error (usec)
     2) Sustained floating-point rate (GFlop/sec)
     3) Sustained bandwidth (GBytes/sec)

     The rates are computed in double so that large L cannot overflow int.
     NOTE(review): the factors assume 8 flops per element and three accesses
     of 8-byte elements per element (presumably y = a*x + y) -- confirm
     against axpy().
  */
  double beta_fp = (8.0*L/tave)*1e-9;
  double beta_io = (8.0*3.0*L/tave)*1e-9;
  printf(" L = %12d, %4.2e ± %4.2e usec/call, perf. = %6.4e GFlop/sec, bw = %6.4e GBytes/sec\n",
         L, tave*1e6, tvar*1e6, beta_fp, beta_io);

  free(x);
  free(y);
  return 0;
}
예제 #8
0
/**
 * Build an LLVM module with one conversion function, JIT it and test it.
 *
 * Random source vectors are converted by the generated function and the
 * result is compared, within an epsilon, against a reference produced with
 * read_vec()/write_vec(). The cycle count of every call is sampled and an
 * outlier-filtered average is written to the TSV file. The first time a
 * mismatch happens the module is also written to "conv.bc".
 *
 * @param verbose   verbosity: >= 1 dumps the type pair, >= 2 also dumps the
 *                  module and the disassembly
 * @param fp        TSV output file, or NULL to skip the TSV row
 * @param src_type  source vector type
 * @param dst_type  destination vector type
 * @return TRUE when every sample matched, FALSE at the first mismatch
 */
PIPE_ALIGN_STACK
static boolean
test_one(unsigned verbose,
         FILE *fp,
         struct lp_type src_type,
         struct lp_type dst_type)
{
   LLVMModuleRef module = NULL;
   LLVMValueRef func = NULL;
   LLVMExecutionEngineRef engine = NULL;
   LLVMModuleProviderRef provider = NULL;
   LLVMPassManagerRef pass = NULL;
   char *error = NULL;
   conv_test_ptr_t conv_test_ptr;
   boolean success;
   const unsigned n = LP_TEST_NUM_SAMPLES;
   int64_t cycles[LP_TEST_NUM_SAMPLES];
   double cycles_avg = 0.0;
   unsigned num_samples;
   unsigned num_srcs;
   unsigned num_dsts;
   double eps;
   unsigned i, j;

   if(verbose >= 1)
      dump_conv_types(stdout, src_type, dst_type);

   /* The shorter side needs several vectors to carry the same element count */
   if(src_type.length > dst_type.length) {
      num_srcs = 1;
      num_dsts = src_type.length/dst_type.length;
   }
   else  {
      num_dsts = 1;
      num_srcs = dst_type.length/src_type.length;
   }

   /* Both vector types must have the same total bit size */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));

   module = LLVMModuleCreateWithName("test");

   func = add_conv_test(module, src_type, num_srcs, dst_type, num_dsts);

   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
      LLVMDumpModule(module);
      abort();
   }
   LLVMDisposeMessage(error);

   provider = LLVMCreateModuleProviderForExistingModule(module);
   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
      if(verbose < 1)
         dump_conv_types(stderr, src_type, dst_type);
      fprintf(stderr, "%s\n", error);
      LLVMDisposeMessage(error);
      abort();
   }

#if 0
   pass = LLVMCreatePassManager();
   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
    * but there are more on SVN. */
   LLVMAddConstantPropagationPass(pass);
   LLVMAddInstructionCombiningPass(pass);
   LLVMAddPromoteMemoryToRegisterPass(pass);
   LLVMAddGVNPass(pass);
   LLVMAddCFGSimplificationPass(pass);
   LLVMRunPassManager(pass, module);
#else
   (void)pass;
#endif

   if(verbose >= 2)
      LLVMDumpModule(module);

   conv_test_ptr = (conv_test_ptr_t)LLVMGetPointerToGlobal(engine, func);

   if(verbose >= 2)
      lp_disassemble(conv_test_ptr);

   success = TRUE;
   for(i = 0; i < n && success; ++i) {
      unsigned src_stride = src_type.length*src_type.width/8;
      unsigned dst_stride = dst_type.length*dst_type.width/8;
      PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
      int64_t start_counter = 0;
      int64_t end_counter = 0;

      /* Random inputs, read back as doubles to serve as the reference values */
      for(j = 0; j < num_srcs; ++j) {
         random_vec(src_type, src + j*src_stride);
         read_vec(src_type, src + j*src_stride, fref + j*src_type.length);
      }

      /* Expected output: the reference values written in the destination type */
      for(j = 0; j < num_dsts; ++j) {
         write_vec(dst_type, ref + j*dst_stride, fref + j*dst_type.length);
      }

      start_counter = rdtsc();
      conv_test_ptr(src, dst);
      end_counter = rdtsc();

      cycles[i] = end_counter - start_counter;

      for(j = 0; j < num_dsts; ++j) {
         if(!compare_vec_with_eps(dst_type, dst + j*dst_stride, ref + j*dst_stride, eps))
            success = FALSE;
      }

      if (!success) {
         if(verbose < 1)
            dump_conv_types(stderr, src_type, dst_type);
         fprintf(stderr, "MISMATCH\n");

         for(j = 0; j < num_srcs; ++j) {
            fprintf(stderr, "  Src%u: ", j);
            dump_vec(stderr, src_type, src + j*src_stride);
            fprintf(stderr, "\n");
         }

#if 1
         fprintf(stderr, "  Ref: ");
         for(j = 0; j < src_type.length*num_srcs; ++j)
            fprintf(stderr, " %f", fref[j]);
         fprintf(stderr, "\n");
#endif

         for(j = 0; j < num_dsts; ++j) {
            fprintf(stderr, "  Dst%u: ", j);
            dump_vec(stderr, dst_type, dst + j*dst_stride);
            fprintf(stderr, "\n");

            fprintf(stderr, "  Ref%u: ", j);
            dump_vec(stderr, dst_type, ref + j*dst_stride);
            fprintf(stderr, "\n");
         }
      }
   }

   /*
    * The loop stops at the first mismatch, so only the first i entries of
    * cycles[] hold a measurement.
    */
   num_samples = i;

   /*
    * Unfortunately the output of cycle counter is not very reliable as it comes
    * -- sometimes we get outliers (due IRQs perhaps?) which are
    * better removed to avoid random or biased data.
    */
   {
      double sum = 0.0, sum2 = 0.0;
      unsigned m = 0;

      for(i = 0; i < num_samples; ++i) {
         /* Accumulate in double: squaring an int64_t could overflow */
         const double c = (double)cycles[i];
         sum += c;
         sum2 += c*c;
      }

      if (num_samples > 0) {
         const double avg = sum/num_samples;
         const double var = (sum2 - num_samples*avg*avg)/num_samples;
         /* Rounding can push a near-zero variance slightly below zero */
         const double std_dev = var > 0.0 ? sqrt(var) : 0.0;

         /* Average only the samples within four standard deviations */
         sum = 0.0;
         for(i = 0; i < num_samples; ++i) {
            const double c = (double)cycles[i];
            if(fabs(c - avg) <= 4.0*std_dev) {
               sum += c;
               ++m;
            }
         }

         /* Fall back to the plain mean if the filter rejected every sample */
         cycles_avg = m ? sum/m : avg;
      }
   }

   if(fp)
      write_tsv_row(fp, src_type, dst_type, cycles_avg, success);

   if (!success) {
      /* Only the first failing test of the run dumps its module */
      static boolean firsttime = TRUE;
      if(firsttime) {
         if(verbose < 2)
            LLVMDumpModule(module);
         LLVMWriteBitcodeToFile(module, "conv.bc");
         fprintf(stderr, "conv.bc written\n");
         fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n");
         firsttime = FALSE;
         /* abort(); */
      }
   }

   LLVMFreeMachineCodeForFunction(engine, func);

   LLVMDisposeExecutionEngine(engine);
   if(pass)
      LLVMDisposePassManager(pass);

   return success;
}