/*
 * Runs the "orc_memcpy_u32" Orc program over n elements, with d1 as the
 * destination array and s1 as the source array (both declared to Orc with
 * an element size of 4).
 *
 * The program is built and compiled on the first call, under the Orc "once"
 * mutex, and cached in function-local statics for later calls.
 * _backup_orc_memcpy_u32 is registered as the program's backup function.
 */
void
orc_memcpy_u32 (guint32 * d1, const guint32 * s1, int n)
{
  static OrcProgram *program = 0;
  static int ready = 0;
  OrcExecutor state;
  void (*run) (OrcExecutor *);

  if (!ready) {
    orc_once_mutex_lock ();
    /* Re-test under the lock: another caller may have finished the setup. */
    if (!ready) {
      program = orc_program_new ();
      orc_program_set_name (program, "orc_memcpy_u32");
      orc_program_set_backup_function (program, _backup_orc_memcpy_u32);
      orc_program_add_destination (program, 4, "d1");
      orc_program_add_source (program, 4, "s1");

      orc_program_append (program, "copyl", ORC_VAR_D1, ORC_VAR_S1,
          ORC_VAR_D1);

      /* The compile result is not inspected here. */
      (void) orc_program_compile (program);
    }
    ready = TRUE;
    orc_once_mutex_unlock ();
  }

  /* Describe this invocation to the executor and launch the program. */
  state.program = program;
  state.n = n;
  state.arrays[ORC_VAR_D1] = d1;
  state.arrays[ORC_VAR_S1] = (void *) s1;

  run = program->code_exec;
  run (&state);
}
/*
 * Runs the 2-D "orc_blend_u8" Orc program: m rows of n byte elements, with
 * d1/d1_stride as destination array and stride, s1/s1_stride as source array
 * and stride, and p1 as a 16-bit parameter.
 *
 * Per element the program performs, using two 16-bit temporaries:
 *   t1 = convubw (d1)
 *   t2 = convubw (s1)
 *   t2 = (t2 - t1) * p1
 *   t1 = t1 << 8
 *   t2 = (t1 + t2) >> 8        (shruw)
 *   d1 = convsuswb (t2)
 *
 * The program is built and compiled on the first call, under the Orc "once"
 * mutex, and cached in function-local statics for later calls.
 * _backup_orc_blend_u8 is registered as the program's backup function.
 */
void
orc_blend_u8 (guint8 * d1, int d1_stride, const guint8 * s1, int s1_stride,
    int p1, int n, int m)
{
  static OrcProgram *program = 0;
  static int ready = 0;
  OrcExecutor state;
  OrcExecutor *const run_state = &state;
  void (*run) (OrcExecutor *);

  if (!ready) {
    orc_once_mutex_lock ();
    /* Re-test under the lock: another caller may have finished the setup. */
    if (!ready) {
      program = orc_program_new ();
      orc_program_set_2d (program);
      orc_program_set_name (program, "orc_blend_u8");
      orc_program_set_backup_function (program, _backup_orc_blend_u8);

      /* Variables: byte arrays, the shift constant 8, and word scratch. */
      orc_program_add_destination (program, 1, "d1");
      orc_program_add_source (program, 1, "s1");
      orc_program_add_constant (program, 1, 8, "c1");
      orc_program_add_parameter (program, 2, "p1");
      orc_program_add_temporary (program, 2, "t1");
      orc_program_add_temporary (program, 2, "t2");

      /* Instruction stream; see the function comment for the arithmetic. */
      orc_program_append (program, "convubw", ORC_VAR_T1, ORC_VAR_D1,
          ORC_VAR_D1);
      orc_program_append (program, "convubw", ORC_VAR_T2, ORC_VAR_S1,
          ORC_VAR_D1);
      orc_program_append (program, "subw", ORC_VAR_T2, ORC_VAR_T2,
          ORC_VAR_T1);
      orc_program_append (program, "mullw", ORC_VAR_T2, ORC_VAR_T2,
          ORC_VAR_P1);
      orc_program_append (program, "shlw", ORC_VAR_T1, ORC_VAR_T1,
          ORC_VAR_C1);
      orc_program_append (program, "addw", ORC_VAR_T2, ORC_VAR_T1,
          ORC_VAR_T2);
      orc_program_append (program, "shruw", ORC_VAR_T2, ORC_VAR_T2,
          ORC_VAR_C1);
      orc_program_append (program, "convsuswb", ORC_VAR_D1, ORC_VAR_T2,
          ORC_VAR_D1);

      /* The compile result is not inspected here. */
      (void) orc_program_compile (program);
    }
    ready = TRUE;
    orc_once_mutex_unlock ();
  }

  /* Describe this invocation to the executor and launch the program. */
  run_state->program = program;
  run_state->n = n;
  ORC_EXECUTOR_M (run_state) = m;
  run_state->arrays[ORC_VAR_D1] = d1;
  run_state->params[ORC_VAR_D1] = d1_stride;
  run_state->arrays[ORC_VAR_S1] = (void *) s1;
  run_state->params[ORC_VAR_S1] = s1_stride;
  run_state->params[ORC_VAR_P1] = p1;

  run = program->code_exec;
  run (run_state);
}
int main(int argc, char *argv[]) { char *s, *d; orc_uint8 *src, *dest; OrcProfile prof; OrcProfile prof_libc; double ave, std; double ave_libc, std_libc; double null; int i,j; double cpufreq; int unalign; OrcProgram *p; int level1, level2, level3; int max; /* const uint8_t zero = 0; */ orc_init (); /* cpufreq = 2333e6; */ cpufreq = 1; if (argc > 1) { unalign = strtoul (argv[1], NULL, 0); } else { unalign = 0; } s = malloc(1024*1024*64+1024); d = malloc(1024*1024*64+1024); src = ORC_PTR_OFFSET(ALIGN(s,128),unalign); dest = ALIGN(d,128); orc_profile_init (&prof); for(j=0;j<10;j++){ orc_profile_start(&prof); orc_profile_stop(&prof); } orc_profile_get_ave_std (&prof, &null, &std); { OrcCompileResult result; p = orc_program_new (); orc_program_set_name (p, "orc_memcpy"); /* orc_program_set_name (p, "orc_memset"); */ orc_program_add_destination (p, 1, "d1"); orc_program_add_source (p, 1, "s1"); /* orc_program_add_parameter (p, 1, "p1"); */ orc_program_append (p, "copyb", ORC_VAR_D1, ORC_VAR_S1, ORC_VAR_D1); result = orc_program_compile (p); if (ORC_COMPILE_RESULT_IS_FATAL (result)) { fprintf (stderr, "Failed to compile orc_memcpy\n"); return -1; } } #ifndef M_LN2 #define M_LN2 0.69314718055994530942 #endif orc_get_data_cache_sizes (&level1, &level2, &level3); if (level3 > 0) { max = (log(level3)/M_LN2 - 6.0) * 10 + 20; } else if (level2 > 0) { max = (log(level2)/M_LN2 - 6.0) * 10 + 20; } else { max = 140; } for(i=0;i<max;i++){ double x = i*0.1 + 6.0; int size = pow(2.0, x); if (flush_cache) { touch (src, (1<<18)); } if (hot_src) { touch (src, size); } if (hot_dest) { touch (dest, size); } orc_profile_init (&prof); for(j=0;j<10;j++){ OrcExecutor _ex, *ex = &_ex; void (*func) (OrcExecutor *); orc_profile_start(&prof); /* orc_memcpy (dest, src, size); */ ex->program = p; ex->n = size; ex->arrays[ORC_VAR_D1] = dest; ex->arrays[ORC_VAR_S1] = (void *)src; func = p->code_exec; func (ex); orc_profile_stop(&prof); if (flush_cache) { touch (src, (1<<18)); } if (hot_src) { touch 
(src, size); } if (hot_dest) { touch (dest, size); } } orc_profile_init (&prof_libc); for(j=0;j<10;j++){ orc_profile_start(&prof_libc); memcpy (dest, src, size); orc_profile_stop(&prof_libc); if (flush_cache) { touch (src, (1<<18)); } if (hot_src) { touch (src, size); } if (hot_dest) { touch (dest, size); } } orc_profile_get_ave_std (&prof, &ave, &std); orc_profile_get_ave_std (&prof_libc, &ave_libc, &std_libc); ave -= null; ave_libc -= null; /* printf("%d: %10.4g %10.4g %10.4g %10.4g (libc %10.4g)\n", i, ave, std, */ /* ave/(1<<i), cpufreq/(ave/(1<<i)), */ /* cpufreq/(ave_libc/(1<<i))); */ printf("%g %10.4g %10.4g\n", x, cpufreq/(ave/size), cpufreq/(ave_libc/size)); /* printf("%g %10.4g %10.4g\n", x, */ /* 32*(ave/(size)), 32*(ave_libc/(size))); */ fflush (stdout); } orc_program_free (p); free (s); free (d); return 0; }
static void schro_motion_init_functions (SchroMotion * motion) { if (motion_funcs[motion->xblen >> 1].block_accumulate == NULL) { OrcProgram *p; OrcCompileResult result; p = orc_program_new (); orc_program_set_constant_n (p, motion->xblen); orc_program_set_2d (p); orc_program_set_name (p, "block_acc_Xxn"); orc_program_add_destination (p, 2, "d1"); orc_program_add_source (p, 2, "s1"); orc_program_add_source (p, 1, "s2"); orc_program_add_temporary (p, 2, "t1"); orc_program_append (p, "convubw", ORC_VAR_T1, ORC_VAR_S2, ORC_VAR_D1); orc_program_append (p, "mullw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_S1); orc_program_append (p, "addw", ORC_VAR_D1, ORC_VAR_D1, ORC_VAR_T1); result = orc_program_compile (p); if (!ORC_COMPILE_RESULT_IS_SUCCESSFUL (result)) { SCHRO_ERROR ("compile failed"); } motion_funcs[motion->xblen / 2].block_accumulate = p; } if (motion_funcs[motion->xblen >> 1].block_accumulate_scaled == NULL) { OrcProgram *p; OrcCompileResult result; p = orc_program_new (); orc_program_set_constant_n (p, motion->xblen); orc_program_set_2d (p); orc_program_set_name (p, "block_acc_scaled_Xxn"); orc_program_add_destination (p, 2, "d1"); orc_program_add_source (p, 2, "s1"); orc_program_add_source (p, 1, "s2"); orc_program_add_parameter (p, 2, "p1"); orc_program_add_constant (p, 2, 32, "c1"); orc_program_add_constant (p, 2, 6, "c2"); orc_program_add_temporary (p, 2, "t1"); orc_program_append (p, "convubw", ORC_VAR_T1, ORC_VAR_S2, ORC_VAR_D1); orc_program_append (p, "mullw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_P1); orc_program_append (p, "addw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_C1); orc_program_append (p, "shrsw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_C2); orc_program_append (p, "mullw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_S1); orc_program_append (p, "addw", ORC_VAR_D1, ORC_VAR_D1, ORC_VAR_T1); result = orc_program_compile (p); if (!ORC_COMPILE_RESULT_IS_SUCCESSFUL (result)) { SCHRO_ERROR ("compile failed"); } motion_funcs[motion->xblen / 2].block_accumulate_scaled = p; } if 
(motion_funcs[motion->xblen >> 1].block_accumulate_dc == NULL) { OrcProgram *p; OrcCompileResult result; p = orc_program_new (); orc_program_set_constant_n (p, motion->xblen); orc_program_set_2d (p); orc_program_set_name (p, "block_acc_dc_Xxn"); orc_program_add_destination (p, 2, "d1"); orc_program_add_source (p, 2, "s1"); orc_program_add_parameter (p, 2, "p1"); orc_program_add_temporary (p, 2, "t1"); orc_program_append (p, "mullw", ORC_VAR_T1, ORC_VAR_S1, ORC_VAR_P1); orc_program_append (p, "addw", ORC_VAR_D1, ORC_VAR_D1, ORC_VAR_T1); result = orc_program_compile (p); if (!ORC_COMPILE_RESULT_IS_SUCCESSFUL (result)) { SCHRO_ERROR ("compile failed"); } motion_funcs[motion->xblen / 2].block_accumulate_dc = p; } if (motion_funcs[motion->xblen >> 1].block_accumulate_avg == NULL) { OrcProgram *p; OrcCompileResult result; p = orc_program_new (); orc_program_set_constant_n (p, motion->xblen); orc_program_set_2d (p); orc_program_set_name (p, "block_acc_avg_Xxn"); orc_program_add_destination (p, 2, "d1"); orc_program_add_source (p, 2, "s1"); orc_program_add_source (p, 1, "s2"); orc_program_add_source (p, 1, "s3"); orc_program_add_temporary (p, 2, "t1"); orc_program_add_temporary (p, 1, "t2"); orc_program_append (p, "avgub", ORC_VAR_T2, ORC_VAR_S2, ORC_VAR_S3); orc_program_append (p, "convubw", ORC_VAR_T1, ORC_VAR_T2, 0); orc_program_append (p, "mullw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_S1); orc_program_append (p, "addw", ORC_VAR_D1, ORC_VAR_D1, ORC_VAR_T1); result = orc_program_compile (p); if (!ORC_COMPILE_RESULT_IS_SUCCESSFUL (result)) { SCHRO_ERROR ("compile failed"); } motion_funcs[motion->xblen / 2].block_accumulate_avg = p; } if (motion_funcs[motion->xblen >> 1].block_accumulate_biref == NULL) { OrcProgram *p; OrcCompileResult result; p = orc_program_new (); orc_program_set_constant_n (p, motion->xblen); orc_program_set_2d (p); orc_program_set_name (p, "block_acc_biref_Xxn"); orc_program_add_destination (p, 2, "d1"); orc_program_add_source (p, 2, "s1"); 
orc_program_add_source (p, 1, "s2"); orc_program_add_source (p, 1, "s3"); orc_program_add_parameter (p, 2, "p1"); orc_program_add_parameter (p, 2, "p2"); orc_program_add_constant (p, 2, 32, "c1"); orc_program_add_constant (p, 2, 6, "c2"); orc_program_add_temporary (p, 2, "t1"); orc_program_add_temporary (p, 2, "t2"); orc_program_append (p, "convubw", ORC_VAR_T1, ORC_VAR_S2, 0); orc_program_append (p, "mullw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_P1); orc_program_append (p, "convubw", ORC_VAR_T2, ORC_VAR_S3, 0); orc_program_append (p, "mullw", ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_P2); orc_program_append (p, "addw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_T2); orc_program_append (p, "addw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_C1); orc_program_append (p, "shrsw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_C2); orc_program_append (p, "mullw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_S1); orc_program_append (p, "addw", ORC_VAR_D1, ORC_VAR_D1, ORC_VAR_T1); result = orc_program_compile (p); if (!ORC_COMPILE_RESULT_IS_SUCCESSFUL (result)) { SCHRO_ERROR ("compile failed"); } motion_funcs[motion->xblen / 2].block_accumulate_biref = p; } }