bool example_test() { void *uc = NULL; int errors = 0; for (int channels = 1; channels <= 4; channels++) { errors += run_test(uc, channels, kCPU, kChunky); errors += run_test(uc, channels, kCPU, kPlanar); errors += run_test(uc, channels, kGLSL, kChunky); // GLSL+Planar is a silly combination; the conversion overhead is high. // But let's run it anyway, since it should work. errors += run_test(uc, channels, kGLSL, kPlanar); } // -------- Other stuff halide_print(uc, "Here is a random image.\n"); Image<uint8_t> randomness(300, 400, 3); (void) halide_randomize_buffer_host<uint8_t>(uc, 0, 0, 255, randomness); halide_buffer_display(randomness); halide_print(uc, "Here is a smooth image.\n"); Image<uint8_t> smoothness(300, 400, 3); (void) halide_smooth_buffer_host<uint8_t>(uc, 0, smoothness); halide_buffer_display(smoothness); return errors > 0; }
int initialize_kernels(const unsigned char *code, int codeLen, bool use_dlopenbuf, handle_t *module_ptr) { void *lib = NULL; if (use_dlopenbuf) { if (!dlopenbuf) { log_printf("dlopenbuf not available.\n"); return -1; } // We need a unique soname, or dlopenbuf will return a // previously opened library. static int unique_id = 0; char soname[256]; sprintf(soname, "libhalide_kernels%04d.so", __sync_fetch_and_add(&unique_id, 1)); // Open the library dllib_init(); // We need to use RTLD_NOW, the libraries we build for Hexagon // offloading do not support lazy bindin. lib = dlopenbuf(soname, (const char*)code, codeLen, RTLD_LOCAL | RTLD_NOW); if (!lib) { halide_print(NULL, "dlopenbuf failed\n"); halide_print(NULL, dlerror()); return -1; } } else { lib = mmap_dlopen(code, codeLen); if (!lib) { halide_print(NULL, "mmap_dlopen failed\n"); return -1; } } *module_ptr = reinterpret_cast<handle_t>(lib); return 0; }
// Debug consistency check for the cache data structures.
//
// Walks every hash bucket and both directions of the recency list, prints
// the counts, and traps (via __builtin_trap) if:
//   1. an entry with no more_recent link is not most_recently_used,
//   2. an entry with no less_recent link is not least_recently_used,
//   3. the hash-table count differs from the count walking from the MRU end,
//   4. the hash-table count differs from the count walking from the LRU end,
//   or current_cache_size is negative.
WEAK void validate_cache() {
    print(NULL) << "validating cache, "
                << "current size " << current_cache_size
                << " of maximum " << max_cache_size << "\n";

    // Pass 1: every entry reachable through the hash buckets.
    int hashed_count = 0;
    for (size_t bucket = 0; bucket < kHashTableSize; bucket++) {
        for (CacheEntry *node = cache_entries[bucket]; node != NULL; node = node->next) {
            hashed_count++;

            if (node->more_recent == NULL && node != most_recently_used) {
                halide_print(NULL, "cache invalid case 1\n");
                __builtin_trap();
            }

            if (node->less_recent == NULL && node != least_recently_used) {
                halide_print(NULL, "cache invalid case 2\n");
                __builtin_trap();
            }
        }
    }

    // Pass 2: walk from the most recent entry towards older ones.
    int mru_count = 0;
    for (CacheEntry *node = most_recently_used; node != NULL; node = node->less_recent) {
        mru_count++;
    }

    // Pass 3: walk from the least recent entry towards newer ones.
    int lru_count = 0;
    for (CacheEntry *node = least_recently_used; node != NULL; node = node->more_recent) {
        lru_count++;
    }

    print(NULL) << "hash entries " << hashed_count
                << ", mru entries " << mru_count
                << ", lru entries " << lru_count << "\n";

    if (hashed_count != mru_count) {
        halide_print(NULL, "cache invalid case 3\n");
        __builtin_trap();
    }

    if (hashed_count != lru_count) {
        halide_print(NULL, "cache invalid case 4\n");
        __builtin_trap();
    }

    if (current_cache_size < 0) {
        halide_print(NULL, "cache size is negative\n");
        __builtin_trap();
    }
}
// printf-style front end for halide_print().
//
// Formats `fmt` and its variadic arguments into a 4096-byte stack buffer and
// forwards the result to halide_print(user_context, ...). Output longer than
// the buffer is truncated by vsnprintf.
//
// Returns the value produced by vsnprintf: the length the fully formatted
// text would have had, or a negative value if formatting failed. When
// formatting fails nothing is printed, because the buffer contents are not
// specified in that case.
WEAK int halide_printf(void *user_context, const char * fmt, ...) {
    char buffer[4096];
    // Keep the buffer a valid (empty) string no matter what vsnprintf does.
    buffer[0] = '\0';

    va_list args;
    va_start(args, fmt);
    int ret = vsnprintf(buffer, sizeof(buffer), fmt, args);
    va_end(args);

    if (ret < 0) {
        // Formatting error: do not hand unspecified bytes to halide_print.
        return ret;
    }

    halide_print(user_context, buffer);
    return ret;
}
// Prints a plain-text profiling report through halide_print().
//
// For every pipeline in s->pipelines that has at least one run, one summary
// line is printed (total time, samples, runs, time per run). If the pipeline
// has a non-zero time, one line per func follows with the func's time per
// run and its integer percentage of the pipeline time.
//
// Times are divided by 1,000,000 before being labelled "ms".
// The name suggests the caller is expected to hold the relevant lock; no
// locking is done here.
WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
    char line_buf[160];
    Printer<StringStreamPrinter, sizeof(line_buf)> sstr(user_context, line_buf);

    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
        float t = p->time / 1000000.0f;
        // Pipelines that never ran have nothing to report (and would divide
        // by zero below).
        if (!p->runs) continue;

        sstr.clear();
        sstr << p->name
             << " total time: " << t << " ms"
             << " samples: " << p->samples
             << " runs: " << p->runs
             << " time per run: " << t / p->runs << " ms\n";
        halide_print(user_context, sstr.str());

        if (p->time) {
            for (int i = 0; i < p->num_funcs; i++) {
                sstr.clear();
                halide_profiler_func_stats *fs = p->funcs + i;

                // The first func is always a catch-all overhead
                // slot. Only report overhead time if it's non-zero
                if (i == 0 && fs->time == 0) continue;

                sstr << " " << fs->name << ": ";
                // Pad so the time column starts at character 25.
                while (sstr.size() < 25) sstr << " ";

                float ft = fs->time / (p->runs * 1000000.0f);
                sstr << ft << "ms";
                // Pad so the percentage column starts at character 40.
                while (sstr.size() < 40) sstr << " ";

                // p->time is known to be non-zero here. Multiplying first
                // avoids the division by zero that "p->time / 100" causes
                // when p->time is below 100.
                int percent = (100 * fs->time) / p->time;
                sstr << "(" << percent << "%)\n";

                halide_print(user_context, sstr.str());
            }
        }
    }
}
// Default error handler: prints "Error: " followed by `msg` (with a trailing
// newline appended when the text does not already end in one) through
// halide_print(), then terminates the process with exit(1).
//
// The message is assembled in a 4096-byte stack buffer; longer messages are
// cut off by halide_string_to_string at the supplied end pointer.
WEAK void default_error_handler(void *user_context, const char *msg) {
    char buf[4096];

    // Stop copying two bytes before the end of the buffer so that the
    // optional newline and the terminator written below (dst[0] and dst[1])
    // always land inside buf, even when the message was truncated and dst
    // comes back equal to the end pointer.
    // NOTE(review): assumes halide_string_to_string never returns a pointer
    // beyond `end` - confirm against its definition.
    char *const end = buf + sizeof(buf) - 2;

    char *dst = halide_string_to_string(buf, end, "Error: ");
    dst = halide_string_to_string(dst, end, msg);

    // Add a newline if there isn't one already. dst[-1] is always valid
    // because the "Error: " prefix has been written.
    if (dst[-1] != '\n') {
        dst[0] = '\n';
        dst[1] = 0;
    }

    halide_print(user_context, buf);
    exit(1);
}
int initialize_kernels(const unsigned char *code, int codeLen, handle_t *module_ptr) { elf_t *lib = obj_dlopen_mem(code, codeLen); if (!lib) { halide_print(NULL, "dlopen_mem failed\n"); return -1; } // Initialize the runtime. The Hexagon runtime can't call any // system functions (because we can't link them), so we put all // the implementations that need to do so here, and pass poiners // to them in here. set_runtime_t set_runtime = (set_runtime_t)obj_dlsym(lib, "halide_noos_set_runtime"); if (!set_runtime) { obj_dlclose(lib); halide_print(NULL, "halide_noos_set_runtime not found in shared object\n"); return -1; } int result = set_runtime(halide_malloc, halide_free, halide_print, halide_error, halide_do_par_for, halide_do_task, halide_get_symbol, halide_load_library, halide_get_library_symbol); if (result != 0) { obj_dlclose(lib); halide_print(NULL, "set_runtime failed\n"); return result; } *module_ptr = reinterpret_cast<handle_t>(lib); return 0; }
// Default error handler: builds "Error: <msg>" in a 4096-byte stack buffer,
// makes sure the text ends with a newline, marks the written bytes as
// initialized for MSan, prints the text with halide_print(), and finally
// calls halide_abort().
WEAK void halide_default_error(void *user_context, const char *msg) {
    char buf[4096];
    // Copying stops at buf + 4094, which keeps two bytes in reserve for the
    // newline and terminator that may be appended below.
    char *const limit = buf + 4094;

    char *dst = halide_string_to_string(buf, limit, "Error: ");
    dst = halide_string_to_string(dst, limit, msg);

    const bool ends_with_newline = (dst[-1] == '\n');
    if (!ends_with_newline) {
        *dst++ = '\n';
        *dst = 0;
    }

    // dst points at the terminator here, so the annotated length covers the
    // text plus its terminating zero byte.
    halide_msan_annotate_memory_is_initialized(user_context, buf, dst - buf + 1);
    halide_print(user_context, buf);
    Halide::Runtime::Internal::halide_abort();
}
// This checks if there are any log messages available on the remote // side. It should be called after every remote call. WEAK void poll_log(void *user_context) { if (!remote_poll_log) return; while (true) { char message[1024]; int read = 0; int result = remote_poll_log(&message[0], sizeof(message), &read); if (result != 0) { // Don't make this an error, otherwise we might obscure // more information about errors that would come later. print(user_context) << "Hexagon: remote_poll_log failed " << result << "\n"; return; } if (read > 0) { halide_print(user_context, message); } else { break; } } }
// Prints a plain-text profiling report through halide_print().
//
// For every pipeline in s->pipelines that has at least one run, a summary is
// printed (total time, samples, runs, time per run, heap allocation count
// and peak heap usage). Per-func lines follow when the pipeline has non-zero
// time or memory_total, or when any func has a non-zero stack_peak. Each
// func line shows time per run, integer percentage of the pipeline time,
// and - when present - heap peak / allocation count / average allocation
// size and stack peak.
//
// Times are divided by 1,000,000 before being labelled "ms".
// The name suggests the caller is expected to hold the relevant lock; no
// locking is done here.
WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
    char line_buf[1024];
    Printer<StringStreamPrinter, sizeof(line_buf)> sstr(user_context, line_buf);

    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
        float t = p->time / 1000000.0f;
        // Pipelines that never ran have nothing to report (and would divide
        // by zero below).
        if (!p->runs) continue;

        sstr.clear();
        sstr << p->name << "\n"
             << " total time: " << t << " ms"
             << " samples: " << p->samples
             << " runs: " << p->runs
             << " time/run: " << t / p->runs << " ms\n"
             << " heap allocations: " << p->num_allocs
             << " peak heap usage: " << p->memory_peak << " bytes\n";
        halide_print(user_context, sstr.str());

        // Per-func details are worth printing if the pipeline recorded any
        // time or heap usage, or if any func recorded stack usage.
        bool print_f_states = p->time || p->memory_total;
        if (!print_f_states) {
            for (int i = 0; i < p->num_funcs; i++) {
                halide_profiler_func_stats *fs = p->funcs + i;
                if (fs->stack_peak) {
                    print_f_states = true;
                    break;
                }
            }
        }

        if (print_f_states) {
            for (int i = 0; i < p->num_funcs; i++) {
                sstr.clear();
                halide_profiler_func_stats *fs = p->funcs + i;

                // The first func is always a catch-all overhead
                // slot. Only report overhead time if it's non-zero
                if (i == 0 && fs->time == 0) continue;

                sstr << " " << fs->name << ": ";
                // Pad so the time column starts at character 25.
                while (sstr.size() < 25) sstr << " ";

                float ft = fs->time / (p->runs * 1000000.0f);
                sstr << ft << "ms";
                // Pad so the percentage column starts at character 40.
                while (sstr.size() < 40) sstr << " ";

                // p->time may be zero here (the func list can be printed for
                // memory or stack usage alone), so guard the division.
                int percent = 0;
                if (p->time != 0) {
                    percent = (100*fs->time) / p->time;
                }
                sstr << "(" << percent << "%)";
                while (sstr.size() < 50) sstr << " ";

                int alloc_avg = 0;
                if (fs->num_allocs != 0) {
                    alloc_avg = fs->memory_total/fs->num_allocs;
                }

                if (fs->memory_peak) {
                    sstr << " peak: " << fs->memory_peak;
                    while (sstr.size() < 65) sstr << " ";
                    sstr << " num: " << fs->num_allocs;
                    while (sstr.size() < 80) sstr << " ";
                    sstr << " avg: " << alloc_avg;
                }
                if (fs->stack_peak > 0) {
                    sstr << " stack: " << fs->stack_peak;
                }
                sstr << "\n";

                halide_print(user_context, sstr.str());
            }
        }
    }
}
// Records one trace event.
//
// If halide_custom_trace is set, the event is forwarded to it and its return
// value is returned. Otherwise a fresh id is taken from an atomically
// incremented counter and the event is emitted in one of two forms:
//   - when halide_get_trace_file() yields a descriptor greater than zero, as
//     a binary packet written to that descriptor;
//   - otherwise as a single line of text passed to halide_print().
// In both of these cases the fresh id is returned.
WEAK int32_t halide_trace(void *user_context, const halide_trace_event *e) {
    static int32_t ids = 1;

    if (halide_custom_trace) {
        return (*halide_custom_trace)(user_context, e);
    }

    const int32_t my_id = __sync_fetch_and_add(&ids, 1);

    const int trace_fd = halide_get_trace_file(user_context);
    if (trace_fd > 0) {
        // ---- Binary packet ----
        // Layout: a 32-byte header, then the value bytes, then the
        // coordinates. Header bytes 0-7 hold this event's id and the parent
        // id, bytes 8-13 hold six single-byte fields, and bytes 14-31 hold
        // the zero-terminated (possibly truncated) func name.

        // Width and dimension count each have to fit in one byte.
        uint8_t lanes = e->vector_width < 256 ? e->vector_width : 255;
        uint8_t dims = e->dimensions < 256 ? e->dimensions : 255;

        // Element size in bytes: the bit count rounded up to a power of
        // two, because that is how the value is stored on the stack.
        int elem_bytes = 1;
        while (elem_bytes*8 < e->bits) elem_bytes <<= 1;

        // Sizes of the individual parts of the packet.
        size_t header_bytes = 32;
        size_t value_bytes = lanes * elem_bytes;
        size_t int_arg_bytes = dims * sizeof(int32_t);
        size_t total_bytes = header_bytes + value_bytes + int_arg_bytes;

        uint8_t buffer[4096];
        halide_assert(user_context, total_bytes <= 4096 && "Tracing packet too large");

        int32_t *id_fields = (int32_t *)buffer;
        id_fields[0] = my_id;
        id_fields[1] = e->parent_id;

        buffer[8] = e->event;
        buffer[9] = e->type_code;
        buffer[10] = e->bits;
        buffer[11] = lanes;
        buffer[12] = e->value_index;
        buffer[13] = dims;

        // Copy at most 17 characters of the func name, stopping after its
        // terminator if that comes first...
        size_t pos = 14;
        while (pos < header_bytes - 1) {
            const uint8_t ch = e->func[pos - 14];
            buffer[pos] = ch;
            if (ch == 0) break;
            pos++;
        }
        // ...then zero whatever is left of the header.
        while (pos < header_bytes) {
            buffer[pos++] = 0;
        }

        // The value bytes follow the header.
        const uint8_t *value_src = (const uint8_t *)(e->value);
        uint8_t *value_dst = buffer + header_bytes;
        for (size_t k = 0; k < value_bytes; k++) {
            value_dst[k] = value_src[k];
        }

        // The coordinates follow the value.
        const uint8_t *coord_src = (const uint8_t *)(e->coordinates);
        uint8_t *coord_dst = value_dst + value_bytes;
        for (size_t k = 0; k < int_arg_bytes; k++) {
            coord_dst[k] = coord_src[k];
        }

        size_t written = write(trace_fd, &buffer[0], total_bytes);
        halide_assert(user_context, written == total_bytes && "Can't write to trace file");
    } else {
        // ---- Plain-text line, emitted through halide_print ----
        stringstream ss(user_context);

        // Round up bits to 8, 16, 32, or 64
        int print_bits = 8;
        while (print_bits < e->bits) print_bits <<= 1;
        halide_assert(user_context, print_bits <= 64 && "Tracing bad type");

        const char *event_types[] = {"Load",
                                     "Store",
                                     "Begin realization",
                                     "End realization",
                                     "Produce",
                                     "Update",
                                     "Consume",
                                     "End consume"};

        // Values are only printed for the first two event kinds (loads and
        // stores).
        const bool print_value = (e->event < 2);
        const bool is_vector = (e->vector_width > 1);

        ss << event_types[e->event] << " " << e->func << "." << e->value_index << "[";

        // Coordinates; for vectors they are grouped in angle brackets, with
        // a new group starting every vector_width entries.
        if (is_vector) {
            ss << "<";
        }
        for (int d = 0; d < e->dimensions; d++) {
            if (d > 0) {
                if (is_vector && (d % e->vector_width) == 0) {
                    ss << ">, <";
                } else {
                    ss << ", ";
                }
            }
            ss << e->coordinates[d];
        }
        if (is_vector) {
            ss << ">]";
        } else {
            ss << "]";
        }

        if (print_value) {
            if (is_vector) {
                ss << " = <";
            } else {
                ss << " = ";
            }

            for (int lane = 0; lane < e->vector_width; lane++) {
                if (lane > 0) {
                    ss << ", ";
                }

                // Pick the element type from type_code (0, 1, 2, 3 select
                // signed, unsigned, floating-point and pointer reads
                // respectively) and the rounded bit width. Other type codes
                // print nothing.
                switch (e->type_code) {
                case 0:
                    switch (print_bits) {
                    case 8:  ss << ((int8_t *)(e->value))[lane]; break;
                    case 16: ss << ((int16_t *)(e->value))[lane]; break;
                    case 32: ss << ((int32_t *)(e->value))[lane]; break;
                    default: ss << ((int64_t *)(e->value))[lane]; break;
                    }
                    break;
                case 1:
                    switch (print_bits) {
                    case 8:  ss << ((uint8_t *)(e->value))[lane]; break;
                    case 16: ss << ((uint16_t *)(e->value))[lane]; break;
                    case 32: ss << ((uint32_t *)(e->value))[lane]; break;
                    default: ss << ((uint64_t *)(e->value))[lane]; break;
                    }
                    break;
                case 2:
                    halide_assert(user_context, print_bits >= 32 && "Tracing a bad type");
                    if (print_bits == 32) {
                        ss << ((float *)(e->value))[lane];
                    } else {
                        ss << ((double *)(e->value))[lane];
                    }
                    break;
                case 3:
                    ss << ((void **)(e->value))[lane];
                    break;
                default:
                    break;
                }
            }

            if (is_vector) {
                ss << ">";
            }
        }

        ss << "\n";
        halide_print(user_context, ss.str());
    }

    return my_id;
}
// Prints a plain-text profiling report through halide_print().
//
// For every pipeline in s->pipelines that has at least one run, a summary is
// printed (total time, samples, runs, time per run, optionally the average
// number of threads used, heap allocation count and peak heap usage).
// Per-func lines follow when the pipeline has non-zero time or memory_total,
// or when any func has a non-zero stack_peak. Each func line shows time per
// run, integer percentage of the pipeline time, optionally the average
// thread count, and - when present - heap peak / allocation count / average
// allocation size and stack peak.
//
// A pipeline is treated as serial, and thread figures are omitted, when its
// active_threads_numerator equals its active_threads_denominator.
//
// Times are divided by 1,000,000 before being labelled "ms".
// The name suggests the caller is expected to hold the relevant lock; no
// locking is done here.
WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
    char line_buf[1024];
    Printer<StringStreamPrinter, sizeof(line_buf)> sstr(user_context, line_buf);

    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
        float t = p->time / 1000000.0f;
        // Pipelines that never ran have nothing to report (and would divide
        // by zero below).
        if (!p->runs) continue;

        sstr.clear();

        bool serial = p->active_threads_numerator == p->active_threads_denominator;
        // The small constant keeps the division defined when the
        // denominator is zero.
        float threads = p->active_threads_numerator / (p->active_threads_denominator + 1e-10);

        sstr << p->name << "\n"
             << " total time: " << t << " ms"
             << " samples: " << p->samples
             << " runs: " << p->runs
             << " time/run: " << t / p->runs << " ms\n";
        if (!serial) {
            sstr << " average threads used: " << threads << "\n";
        }
        sstr << " heap allocations: " << p->num_allocs
             << " peak heap usage: " << p->memory_peak << " bytes\n";
        halide_print(user_context, sstr.str());

        // Per-func details are worth printing if the pipeline recorded any
        // time or heap usage, or if any func recorded stack usage.
        bool print_f_states = p->time || p->memory_total;
        if (!print_f_states) {
            for (int i = 0; i < p->num_funcs; i++) {
                halide_profiler_func_stats *fs = p->funcs + i;
                if (fs->stack_peak) {
                    print_f_states = true;
                    break;
                }
            }
        }

        if (print_f_states) {
            for (int i = 0; i < p->num_funcs; i++) {
                // Column position that the next field is padded out to.
                size_t cursor = 0;
                sstr.clear();
                halide_profiler_func_stats *fs = p->funcs + i;

                // The first func is always a catch-all overhead
                // slot. Only report overhead time if it's non-zero
                if (i == 0 && fs->time == 0) continue;

                sstr << " " << fs->name << ": ";
                cursor += 25;
                while (sstr.size() < cursor) sstr << " ";

                float ft = fs->time / (p->runs * 1000000.0f);
                sstr << ft;
                // We don't need 6 sig. figs.
                sstr.erase(3);
                sstr << "ms";
                cursor += 10;
                while (sstr.size() < cursor) sstr << " ";

                // p->time may be zero here (the func list can be printed for
                // memory or stack usage alone), so guard the division.
                int percent = 0;
                if (p->time != 0) {
                    percent = (100*fs->time) / p->time;
                }
                sstr << "(" << percent << "%)";
                cursor += 8;
                while (sstr.size() < cursor) sstr << " ";

                if (!serial) {
                    // Named differently from the pipeline-level `threads`
                    // above so that it does not shadow it.
                    float func_threads = fs->active_threads_numerator / (fs->active_threads_denominator + 1e-10);
                    sstr << "threads: " << func_threads;
                    sstr.erase(3);
                    cursor += 15;
                    while (sstr.size() < cursor) sstr << " ";
                }

                int alloc_avg = 0;
                if (fs->num_allocs != 0) {
                    alloc_avg = fs->memory_total/fs->num_allocs;
                }

                if (fs->memory_peak) {
                    cursor += 15;
                    sstr << " peak: " << fs->memory_peak;
                    while (sstr.size() < cursor) sstr << " ";
                    sstr << " num: " << fs->num_allocs;
                    cursor += 15;
                    while (sstr.size() < cursor) sstr << " ";
                    sstr << " avg: " << alloc_avg;
                }
                if (fs->stack_peak > 0) {
                    sstr << " stack: " << fs->stack_peak;
                }
                sstr << "\n";

                halide_print(user_context, sstr.str());
            }
        }
    }
}
// This is a basic implementation of the Halide runtime for Hexagon. void halide_error(void *user_context, const char *str) { halide_print(user_context, str); }