/*
 * C fallback for the orc_blend_u8 kernel, driven by an OrcExecutor.
 *
 * Reads the row length from ex->n, the row count from
 * ex->params[ORC_VAR_A1], the blend factor from ex->params[24], and the
 * destination/source base pointers and strides from slots 0 and 4 of
 * ex->arrays / ex->params.  Each destination byte d is replaced by
 *
 *     ((d << 8) + (s - d) * factor) >> 8
 *
 * evaluated in 16-bit arithmetic and clamped with ORC_CLAMP_UB.
 */
static void
_backup_orc_blend_u8 (OrcExecutor * ex)
{
  const int width = ex->n;
  const int height = ex->params[ORC_VAR_A1];
  const orc_int8 shift = 8;
  const int factor = ex->params[24];
  int row;

  for (row = 0; row < height; row++) {
    orc_int8 *dst = ORC_PTR_OFFSET (ex->arrays[0], ex->params[0] * row);
    const orc_int8 *src = ORC_PTR_OFFSET (ex->arrays[4], ex->params[4] * row);
    int col = 0;

    while (col < width) {
      orc_int8 dst_byte = *dst;
      orc_int8 src_byte = *src;
      orc_int16 dst_wide;
      orc_int16 src_wide;
      orc_int16 delta;
      orc_int16 weighted;
      orc_int16 scaled_dst;
      orc_int16 total;
      orc_int16 result;

      src++;

      /* convubw: widen both bytes as unsigned values */
      dst_wide = (orc_uint8) dst_byte;
      src_wide = (orc_uint8) src_byte;
      /* subw */
      delta = src_wide - dst_wide;
      /* mullw: keep only the low 16 bits of the product */
      weighted = (delta * factor) & 0xffff;
      /* shlw */
      scaled_dst = dst_wide << shift;
      /* addw */
      total = scaled_dst + weighted;
      /* shruw: logical shift of the 16-bit sum */
      result = ((orc_uint16) total) >> shift;
      /* convsuswb */
      dst_byte = ORC_CLAMP_UB (result);

      *dst = dst_byte;
      dst++;
      col++;
    }
  }
}
/*
 * Blend a 2-D block of source bytes into a destination block, in place.
 *
 * d1, d1_stride: destination base pointer and byte stride between rows.
 * s1, s1_stride: source base pointer and byte stride between rows.
 * p1:            blend factor multiplied with (source - destination).
 * n, m:          bytes per row and number of rows.
 *
 * Every destination byte d becomes ((d << 8) + (s - d) * p1) >> 8, with all
 * intermediates held in 16-bit variables and the result passed through
 * ORC_CLAMP_UB.
 */
void
orc_blend_u8 (guint8 * d1, int d1_stride, const guint8 * s1, int s1_stride,
    int p1, int n, int m)
{
  const orc_int8 shift = 8;
  const int factor = p1;
  int row;

  for (row = 0; row < m; row++) {
    orc_int8 *dst = ORC_PTR_OFFSET (d1, d1_stride * row);
    const orc_int8 *src = ORC_PTR_OFFSET (s1, s1_stride * row);
    int col;

    for (col = 0; col < n; col++, dst++, src++) {
      /* convubw: widen both operands as unsigned bytes */
      orc_int16 dst_wide = (orc_uint8) *dst;
      orc_int16 src_wide = (orc_uint8) *src;
      /* subw */
      orc_int16 delta = src_wide - dst_wide;
      /* mullw: low 16 bits of the product */
      orc_int16 weighted = (delta * factor) & 0xffff;
      /* shlw */
      orc_int16 scaled_dst = dst_wide << shift;
      /* addw */
      orc_int16 total = scaled_dst + weighted;
      /* shruw */
      orc_int16 result = ((orc_uint16) total) >> shift;

      /* convsuswb */
      *dst = ORC_CLAMP_UB (result);
    }
  }
}
/*
 * Compare the contents of two test arrays.
 *
 * Without ORC_TEST_FLAGS_FLOAT in flags, the whole allocations
 * (array1->alloc_len bytes of alloc_data) are compared with memcmp.
 *
 * With ORC_TEST_FLAGS_FLOAT, elements are compared row by row as float
 * (element_size 4) or double (element_size 8).  Two elements match when
 * both are NaN, when they compare equal, or when their absolute difference
 * is below MIN_NONDENORMAL / MIN_NONDENORMAL_D respectively.  Any other
 * element size yields FALSE.
 *
 * The dimensions (n, m, element_size) are taken from array1 only.
 *
 * Returns TRUE when the arrays match, FALSE otherwise.
 */
int
orc_array_compare (OrcArray *array1, OrcArray *array2, int flags)
{
  int i;
  int j;

  if (!(flags & ORC_TEST_FLAGS_FLOAT)) {
    if (memcmp (array1->alloc_data, array2->alloc_data,
            array1->alloc_len) == 0) {
      return TRUE;
    }
    return FALSE;
  }

  if (array1->element_size == 4) {
    for (j = 0; j < array1->m; j++) {
      float *a = ORC_PTR_OFFSET (array1->data, j*array1->stride);
      float *b = ORC_PTR_OFFSET (array2->data, j*array2->stride);

      for (i = 0; i < array1->n; i++) {
        if (isnan(a[i]) && isnan(b[i])) continue;
        if (a[i] == b[i]) continue;
        if (fabs(a[i] - b[i]) < MIN_NONDENORMAL) continue;
        return FALSE;
      }
    }
    return TRUE;
  }

  if (array1->element_size == 8) {
    for (j = 0; j < array1->m; j++) {
      double *a = ORC_PTR_OFFSET (array1->data, j*array1->stride);
      double *b = ORC_PTR_OFFSET (array2->data, j*array2->stride);

      for (i = 0; i < array1->n; i++) {
        if (isnan(a[i]) && isnan(b[i])) continue;
        if (a[i] == b[i]) continue;
        /* fabs(), not abs(): the integer abs() would truncate the difference
         * to int first, so any two doubles less than 1.0 apart would be
         * reported as equal. */
        if (fabs(a[i] - b[i]) < MIN_NONDENORMAL_D) continue;
        return FALSE;
      }
    }
    return TRUE;
  }

  return FALSE;
}
OrcArray * orc_array_new (int n, int m, int element_size, int misalignment) { OrcArray *ar; void *data; #ifdef HAVE_POSIX_MEMALIGN int ret; #endif ar = malloc (sizeof(OrcArray)); memset (ar, 0, sizeof(OrcArray)); ar->n = n; ar->m = m; ar->element_size = element_size; ar->stride = (n*element_size + EXTEND_STRIDE); ar->stride = (ar->stride + (ALIGNMENT-1)) & (~(ALIGNMENT-1)); ar->alloc_len = ar->stride * (m+2*EXTEND_ROWS) + (ALIGNMENT * element_size); #ifdef HAVE_POSIX_MEMALIGN ret = posix_memalign (&data, ALIGNMENT, ar->alloc_len); #else data = malloc (ar->alloc_len); #endif ar->alloc_data = data; ar->data = ORC_PTR_OFFSET (ar->alloc_data, ar->stride * EXTEND_ROWS + element_size * misalignment); return ar; }
/*
 * Compare element (i, j) -- column i, row j -- of two arrays as floating
 * point values.
 *
 * The element width is taken from array1->element_size: 4 compares floats
 * with tolerance MIN_NONDENORMAL, 8 compares doubles with tolerance
 * MIN_NONDENORMAL_D.  Two values match when both are NaN, when they compare
 * equal, or when their absolute difference is below the tolerance.
 *
 * Returns TRUE on a match, FALSE otherwise (including for any other
 * element size).
 */
int
float_compare (OrcArray *array1, OrcArray *array2, int i, int j)
{
  void *ptr1 = ORC_PTR_OFFSET (array1->data,
      i*array1->element_size + j*array1->stride);
  void *ptr2 = ORC_PTR_OFFSET (array2->data,
      i*array2->element_size + j*array2->stride);

  switch (array1->element_size) {
    case 4:
      if (isnan(*(float *)ptr1) && isnan(*(float *)ptr2)) return TRUE;
      if (*(float *)ptr1 == *(float *)ptr2) return TRUE;
      if (fabs(*(float *)ptr1 - *(float *)ptr2) < MIN_NONDENORMAL) return TRUE;
      return FALSE;
    case 8:
      /* Same rule as the float case, applied to doubles. */
      if (isnan(*(double *)ptr1) && isnan(*(double *)ptr2)) return TRUE;
      if (*(double *)ptr1 == *(double *)ptr2) return TRUE;
      if (fabs(*(double *)ptr1 - *(double *)ptr2) < MIN_NONDENORMAL_D) return TRUE;
      return FALSE;
    default:
      break;
  }
  return FALSE;
}
/*
 * Reserve code memory for `code`.
 *
 * The request is rounded up to a multiple of 16 bytes and a free chunk of
 * that size is obtained from orc_code_region_get_free_chunk().  A chunk
 * larger than the rounded size is split first.  The chunk is then marked
 * used and recorded in `code`, together with the write and exec addresses
 * (the region's write_ptr / exec_ptr plus the chunk offset) and the
 * original, unrounded size.
 */
void
orc_code_allocate_codemem (OrcCode *code, int size)
{
  const int rounded = (size + 15) & (~15);
  OrcCodeChunk *slot = orc_code_region_get_free_chunk (rounded);
  OrcCodeRegion *owner = slot->region;

  if (slot->size > rounded) {
    orc_code_chunk_split (slot, rounded);
  }
  slot->used = TRUE;

  code->chunk = slot;
  code->code = ORC_PTR_OFFSET(owner->write_ptr, slot->offset);
  code->exec = ORC_PTR_OFFSET(owner->exec_ptr, slot->offset);
  code->code_size = size;
}
/*
 * Verify that the guard bytes around an array still hold ORC_OOB_VALUE.
 *
 * Three regions are inspected, in this order:
 *   1. the first stride*EXTEND_ROWS bytes of the allocation,
 *   2. for each of the m rows, the bytes from the end of the row's n
 *      elements up to the stride,
 *   3. stride*EXTEND_ROWS bytes following the last row.
 *
 * On the first mismatch a diagnostic is printed to stdout and FALSE is
 * returned; TRUE is returned when every guard byte is intact.
 */
int
orc_array_check_out_of_bounds (OrcArray *array)
{
  const int guard_len = array->stride * EXTEND_ROWS;
  const int payload_len = array->element_size * array->n;
  unsigned char *bytes;
  int row;
  int pos;

  /* Leading guard rows. */
  bytes = array->alloc_data;
  for (pos = 0; pos < guard_len; pos++) {
    if (bytes[pos] == ORC_OOB_VALUE) continue;
    printf("OOB check failed at start-%d\n", guard_len - pos);
    return FALSE;
  }

  /* Padding between the end of each row's elements and the stride. */
  for (row = 0; row < array->m; row++) {
    bytes = ORC_PTR_OFFSET(array->data, array->stride * row);
    for (pos = payload_len; pos < array->stride; pos++) {
      if (bytes[pos] == ORC_OOB_VALUE) continue;
      printf("OOB check failed on row %d, end+%d\n", row, pos - payload_len);
      return FALSE;
    }
  }

  /* Trailing guard rows. */
  bytes = ORC_PTR_OFFSET (array->data, array->stride * array->m);
  for (pos = 0; pos < guard_len; pos++) {
    if (bytes[pos] == ORC_OOB_VALUE) continue;
    printf("OOB check failed at end+%d\n", pos);
    return FALSE;
  }

  return TRUE;
}
/*
 * Print element (i, j) -- column i, row j -- of a floating point array to
 * stdout and return its raw bit pattern.
 *
 * 4-byte elements: a NaN is printed as " nan" followed by its hex bits and
 * returned with bit 22 cleared (mask 0xffbfffff) so signaling and
 * non-signaling NaNs give the same value; any other float is printed with
 * "%12.5g" and returned as its 32-bit pattern read as a signed integer.
 * 8-byte elements are printed with "%12.5g" and returned as the 64-bit
 * pattern.  Any other element size prints " ERROR" and returns -1.
 */
static orc_uint64
print_array_val_float (OrcArray *array, int i, int j)
{
  int byte_offset = i*array->element_size + j*array->stride;
  void *elem = ORC_PTR_OFFSET (array->data, byte_offset);

  if (array->element_size == 4) {
    if (!isnan(*(float *)elem)) {
      printf(" %12.5g", *(float *)elem);
      return *(orc_int32 *)elem;
    }
    printf(" nan %08x", *(orc_uint32 *)elem);
    /* This is to get around signaling/non-signaling nans in the output */
    return (*(orc_uint32 *)elem) & 0xffbfffff;
  }

  if (array->element_size == 8) {
    printf(" %12.5g", *(double *)elem);
    return *(orc_int64 *)elem;
  }

  printf(" ERROR");
  return -1;
}
/*
 * Print element (i, j) -- column i, row j -- of an integer array to stdout
 * in hexadecimal and return its value.
 *
 * Elements of 1, 2, 4 or 8 bytes are supported.  The printed form uses the
 * unsigned bit pattern, while the returned value is the element read as a
 * signed integer of that width, converted to orc_uint64.  An 8-byte element
 * is printed as two 32-bit halves after a "0x" prefix.  Any other element
 * size prints nothing and returns -1.
 */
static orc_uint64
print_array_val_hex (OrcArray *array, int i, int j)
{
  int byte_offset = i*array->element_size + j*array->stride;
  void *elem = ORC_PTR_OFFSET (array->data, byte_offset);

  if (array->element_size == 1) {
    printf(" %02x", *(orc_uint8 *)elem);
    return *(orc_int8 *)elem;
  }
  if (array->element_size == 2) {
    printf(" %04x", *(orc_uint16 *)elem);
    return *(orc_int16 *)elem;
  }
  if (array->element_size == 4) {
    printf(" %08x", *(orc_uint32 *)elem);
    return *(orc_int32 *)elem;
  }
  if (array->element_size == 8) {
    printf(" 0x%08x%08x",
        (orc_uint32)((*(orc_uint64 *)elem)>>32),
        (orc_uint32)((*(orc_uint64 *)elem)));
    return *(orc_int64 *)elem;
  }

  return -1;
}
/*
 * Fill an array with one of the test patterns selected by `type`.
 *
 * ORC_PATTERN_RANDOM          fills the whole allocation (guard regions
 *                             included) with random bits.
 * ORC_PATTERN_FLOAT_SMALL     random 32-bit values whose exponent field
 *                             (bits 23..30) is rewritten to lie in
 *                             122..137.
 * ORC_PATTERN_FLOAT_SPECIAL   cycles through the first 32 entries of the
 *                             special_floats table.
 * ORC_PATTERN_FLOAT_DENORMAL  random 32-bit values with the exponent field
 *                             cleared.
 *
 * The three float patterns apply only to arrays with 4-byte elements and
 * return without touching the array otherwise.  Unknown pattern types are
 * ignored.
 */
void
orc_array_set_pattern_2 (OrcArray *array, OrcRandomContext *context,
    int type)
{
  int i,j;

  switch (type) {
    case ORC_PATTERN_RANDOM:
      orc_random_bits (context, array->alloc_data, array->alloc_len);
      break;
    case ORC_PATTERN_FLOAT_SMALL:
      {
        if (array->element_size != 4) return;
        for(j=0;j<array->m;j++){
          orc_union32 *data;
          int exp;

          data = ORC_PTR_OFFSET(array->data, array->stride * j);

          for(i=0;i<array->n;i++){
            data[i].i = orc_random (context);
            /* Extract the exponent field.  The mask used to read 0x7f80000
             * (one zero short); the result was unaffected only because of
             * the "& 0xf" below.  It now matches the clearing mask. */
            exp = (data[i].i & 0x7f800000) >> 23;
            /* Keep four random bits and rebase: exponent becomes 122..137. */
            exp &= 0xf;
            exp += 122;
            data[i].i &= ~0x7f800000;
            data[i].i |= (exp&0xff) << 23;
          }
        }
      }
      break;
    case ORC_PATTERN_FLOAT_SPECIAL:
      {
        if (array->element_size != 4) return;
        for(j=0;j<array->m;j++){
          orc_union32 *data;
          int x;

          data = ORC_PTR_OFFSET(array->data, array->stride * j);

          for(i=0;i<array->n;i++){
            /* Column index modulo 32 selects the table entry. */
            x = i&0x1f;
            data[i].i = special_floats[x];
          }
        }
      }
      break;
    case ORC_PATTERN_FLOAT_DENORMAL:
      {
        if (array->element_size != 4) return;
        for(j=0;j<array->m;j++){
          orc_union32 *data;

          data = ORC_PTR_OFFSET(array->data, array->stride * j);

          for(i=0;i<array->n;i++){
            data[i].i = orc_random (context);
            /* Clear the exponent field, leaving sign and mantissa. */
            data[i].i &= ~0x7f800000;
          }
        }
      }
      break;
    default:
      break;
  }
}
int main(int argc, char *argv[]) { char *s, *d; orc_uint8 *src, *dest; OrcProfile prof; OrcProfile prof_libc; double ave, std; double ave_libc, std_libc; double null; int i,j; double cpufreq; int unalign; OrcProgram *p; int level1, level2, level3; int max; /* const uint8_t zero = 0; */ orc_init (); /* cpufreq = 2333e6; */ cpufreq = 1; if (argc > 1) { unalign = strtoul (argv[1], NULL, 0); } else { unalign = 0; } s = malloc(1024*1024*64+1024); d = malloc(1024*1024*64+1024); src = ORC_PTR_OFFSET(ALIGN(s,128),unalign); dest = ALIGN(d,128); orc_profile_init (&prof); for(j=0;j<10;j++){ orc_profile_start(&prof); orc_profile_stop(&prof); } orc_profile_get_ave_std (&prof, &null, &std); { OrcCompileResult result; p = orc_program_new (); orc_program_set_name (p, "orc_memcpy"); /* orc_program_set_name (p, "orc_memset"); */ orc_program_add_destination (p, 1, "d1"); orc_program_add_source (p, 1, "s1"); /* orc_program_add_parameter (p, 1, "p1"); */ orc_program_append (p, "copyb", ORC_VAR_D1, ORC_VAR_S1, ORC_VAR_D1); result = orc_program_compile (p); if (ORC_COMPILE_RESULT_IS_FATAL (result)) { fprintf (stderr, "Failed to compile orc_memcpy\n"); return -1; } } #ifndef M_LN2 #define M_LN2 0.69314718055994530942 #endif orc_get_data_cache_sizes (&level1, &level2, &level3); if (level3 > 0) { max = (log(level3)/M_LN2 - 6.0) * 10 + 20; } else if (level2 > 0) { max = (log(level2)/M_LN2 - 6.0) * 10 + 20; } else { max = 140; } for(i=0;i<max;i++){ double x = i*0.1 + 6.0; int size = pow(2.0, x); if (flush_cache) { touch (src, (1<<18)); } if (hot_src) { touch (src, size); } if (hot_dest) { touch (dest, size); } orc_profile_init (&prof); for(j=0;j<10;j++){ OrcExecutor _ex, *ex = &_ex; void (*func) (OrcExecutor *); orc_profile_start(&prof); /* orc_memcpy (dest, src, size); */ ex->program = p; ex->n = size; ex->arrays[ORC_VAR_D1] = dest; ex->arrays[ORC_VAR_S1] = (void *)src; func = p->code_exec; func (ex); orc_profile_stop(&prof); if (flush_cache) { touch (src, (1<<18)); } if (hot_src) { touch 
(src, size); } if (hot_dest) { touch (dest, size); } } orc_profile_init (&prof_libc); for(j=0;j<10;j++){ orc_profile_start(&prof_libc); memcpy (dest, src, size); orc_profile_stop(&prof_libc); if (flush_cache) { touch (src, (1<<18)); } if (hot_src) { touch (src, size); } if (hot_dest) { touch (dest, size); } } orc_profile_get_ave_std (&prof, &ave, &std); orc_profile_get_ave_std (&prof_libc, &ave_libc, &std_libc); ave -= null; ave_libc -= null; /* printf("%d: %10.4g %10.4g %10.4g %10.4g (libc %10.4g)\n", i, ave, std, */ /* ave/(1<<i), cpufreq/(ave/(1<<i)), */ /* cpufreq/(ave_libc/(1<<i))); */ printf("%g %10.4g %10.4g\n", x, cpufreq/(ave/size), cpufreq/(ave_libc/size)); /* printf("%g %10.4g %10.4g\n", x, */ /* 32*(ave/(size)), 32*(ave_libc/(size))); */ fflush (stdout); } orc_program_free (p); free (s); free (d); return 0; }
/*
 * Run a compiled program through the per-opcode emulation functions
 * instead of native code.
 *
 * The OrcCode is taken from ex->program->orccode, or from
 * ex->arrays[ORC_VAR_A2] when ex->program is NULL.  The four accumulators
 * in ex are reset to zero.  For each instruction an OrcOpcodeExecutor is
 * set up with source/destination pointers: constants, parameters and
 * temporaries live in per-variable scratch buffers of
 * ORC_MAX_VAR_SIZE * CHUNK_SIZE bytes, while source/destination arrays
 * point into ex->arrays.  The program is then executed for each of the m
 * rows (m is 1 unless code->is_2d) in pieces of at most CHUNK_SIZE
 * elements, calling every instruction's emulateN for each piece.
 *
 * An array pointer that is not aligned to its variable size is reported
 * with ORC_ERROR but execution continues.  If scratch memory cannot be
 * allocated, an error is logged and the function returns without running
 * the program.
 */
void
orc_executor_emulate (OrcExecutor *ex)
{
  int i;
  int j;
  int k;
  int m, m_index;
  OrcCode *code;
  OrcInstruction *insn;
  OrcStaticOpcode *opcode;
  OrcOpcodeExecutor *opcode_ex = NULL;
  void *tmpspace[ORC_N_COMPILER_VARIABLES] = { 0 };
  const char *program_name;

  if (ex->program) {
    code = ex->program->orccode;
    program_name = ex->program->name;
  } else {
    code = (OrcCode *)ex->arrays[ORC_VAR_A2];
    /* No OrcProgram on this path, so there is no name to report in the
     * alignment diagnostics below. */
    program_name = "(unknown)";
  }

  ex->accumulators[0] = 0;
  ex->accumulators[1] = 0;
  ex->accumulators[2] = 0;
  ex->accumulators[3] = 0;

  ORC_DEBUG("emulating");

  if (code == NULL) {
    ORC_ERROR("attempt to run program that failed to compile");
    ORC_ASSERT(0);
  }

  if (code->is_2d) {
    m = ORC_EXECUTOR_M(ex);
  } else {
    m = 1;
  }

  /* One scratch buffer per variable that has a size. */
  for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
    OrcCodeVariable *var = code->vars + i;

    if (var->size) {
      tmpspace[i] = malloc(ORC_MAX_VAR_SIZE * CHUNK_SIZE);
      if (tmpspace[i] == NULL) {
        ORC_ERROR("out of memory allocating emulation scratch space");
        goto out;
      }
    }
  }

  /* calloc so that pointer slots an instruction does not use are NULL
   * rather than indeterminate; they are read by the debug output below. */
  opcode_ex = calloc(code->n_insns, sizeof(OrcOpcodeExecutor));
  if (opcode_ex == NULL && code->n_insns > 0) {
    ORC_ERROR("out of memory allocating opcode executors");
    goto out;
  }

  for(j=0;j<code->n_insns;j++){
    insn = code->insns + j;
    opcode = insn->opcode;

    opcode_ex[j].emulateN = opcode->emulateN;
    /* X2/X4 instructions process 2 or 4 sub-elements per element; the
     * element count passed to emulateN is shifted accordingly. */
    opcode_ex[j].shift = 0;
    if (insn->flags & ORC_INSTRUCTION_FLAG_X2) {
      opcode_ex[j].shift = 1;
    } else if (insn->flags & ORC_INSTRUCTION_FLAG_X4) {
      opcode_ex[j].shift = 2;
    }

    for(k=0;k<ORC_STATIC_OPCODE_N_SRC;k++) {
      int arg = insn->src_args[k];
      OrcCodeVariable *var = code->vars + arg;

      if (opcode->src_size[k] == 0) continue;

      if (var->vartype == ORC_VAR_TYPE_CONST) {
        opcode_ex[j].src_ptrs[k] = tmpspace[arg];
        /* FIXME hack */
        load_constant (tmpspace[arg], 8, var->value.i);
      } else if (var->vartype == ORC_VAR_TYPE_PARAM) {
        opcode_ex[j].src_ptrs[k] = tmpspace[arg];
        /* FIXME hack */
        /* Low 32 bits from the parameter slot, high 32 bits from the slot
         * (ORC_VAR_T1 - ORC_VAR_P1) entries further on. */
        load_constant (tmpspace[arg], 8,
            (orc_uint64)(orc_uint32)ex->params[arg] |
            (((orc_uint64)(orc_uint32)ex->params[arg +
              (ORC_VAR_T1 - ORC_VAR_P1)])<<32));
      } else if (var->vartype == ORC_VAR_TYPE_TEMP) {
        opcode_ex[j].src_ptrs[k] = tmpspace[arg];
      } else if (var->vartype == ORC_VAR_TYPE_SRC) {
        if (ORC_PTR_TO_INT(ex->arrays[arg]) & (var->size - 1)) {
          ORC_ERROR("Unaligned array for src%d, program %s",
              (arg-ORC_VAR_S1), program_name);
        }
        opcode_ex[j].src_ptrs[k] = ex->arrays[arg];
      } else if (var->vartype == ORC_VAR_TYPE_DEST) {
        if (ORC_PTR_TO_INT(ex->arrays[arg]) & (var->size - 1)) {
          ORC_ERROR("Unaligned array for dest%d, program %s",
              (arg-ORC_VAR_D1), program_name);
        }
        opcode_ex[j].src_ptrs[k] = ex->arrays[arg];
      }
    }

    for(k=0;k<ORC_STATIC_OPCODE_N_DEST;k++) {
      int arg = insn->dest_args[k];
      OrcCodeVariable *var = code->vars + arg;

      if (opcode->dest_size[k] == 0) continue;

      if (var->vartype == ORC_VAR_TYPE_TEMP) {
        ORC_DEBUG("dest vartype tmp %d", insn->dest_args[k]);
        opcode_ex[j].dest_ptrs[k] = tmpspace[arg];
      } else if (var->vartype == ORC_VAR_TYPE_ACCUMULATOR) {
        opcode_ex[j].dest_ptrs[k] = &ex->accumulators[arg - ORC_VAR_A1];
      } else if (var->vartype == ORC_VAR_TYPE_DEST) {
        if (ORC_PTR_TO_INT(ex->arrays[arg]) & (var->size - 1)) {
          ORC_ERROR("Unaligned array for dest%d, program %s",
              (arg-ORC_VAR_D1), program_name);
        }
        opcode_ex[j].dest_ptrs[k] = ex->arrays[arg];
      }
    }

    ORC_DEBUG("opcode %s %p %p %p", opcode->name,
        opcode_ex[j].dest_ptrs[0], opcode_ex[j].src_ptrs[0],
        opcode_ex[j].src_ptrs[1]);
  }

  ORC_DEBUG("src ptr %p stride %d", ex->arrays[ORC_VAR_S1], ex->params[ORC_VAR_S1]);
  for(m_index=0;m_index<m;m_index++){
    ORC_DEBUG("m_index %d m %d", m_index, m);

    /* Re-point every array operand at the start of row m_index; the stride
     * of an array is stored in ex->params at the array's own index. */
    for(j=0;j<code->n_insns;j++){
      insn = code->insns + j;
      opcode = insn->opcode;

      for(k=0;k<ORC_STATIC_OPCODE_N_SRC;k++) {
        int arg = insn->src_args[k];
        OrcCodeVariable *var = code->vars + arg;

        if (opcode->src_size[k] == 0) continue;

        if (var->vartype == ORC_VAR_TYPE_SRC) {
          opcode_ex[j].src_ptrs[k] =
            ORC_PTR_OFFSET(ex->arrays[arg], ex->params[arg]*m_index);
        } else if (var->vartype == ORC_VAR_TYPE_DEST) {
          opcode_ex[j].src_ptrs[k] =
            ORC_PTR_OFFSET(ex->arrays[arg], ex->params[arg]*m_index);
        }
      }
      for(k=0;k<ORC_STATIC_OPCODE_N_DEST;k++) {
        int arg = insn->dest_args[k];
        OrcCodeVariable *var = code->vars + arg;

        if (opcode->dest_size[k] == 0) continue;

        if (var->vartype == ORC_VAR_TYPE_DEST) {
          opcode_ex[j].dest_ptrs[k] =
            ORC_PTR_OFFSET(ex->arrays[arg], ex->params[arg]*m_index);
        }
      }
    }

    /* Run the whole instruction list over the row, CHUNK_SIZE elements at
     * a time; the final piece may be shorter. */
    for(i=0;i<ex->n;i+=CHUNK_SIZE){
      for(j=0;j<code->n_insns;j++){
        if (ex->n - i >= CHUNK_SIZE) {
          opcode_ex[j].emulateN (opcode_ex + j, i, CHUNK_SIZE << opcode_ex[j].shift);
        } else {
          opcode_ex[j].emulateN (opcode_ex + j, i, (ex->n - i) << opcode_ex[j].shift);
        }
      }
    }
  }

out:
  free (opcode_ex);
  for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
    free (tmpspace[i]);
  }
}