// Run f(user_context, i, closure) for the indices in [min, min + size),
// sharing the work between the calling thread and the worker thread pool.
//
// If halide_custom_do_par_for is set, the whole call is forwarded to it.
// Otherwise the first call creates the pool: the thread count is read from
// the HL_NUMTHREADS environment variable when it is set, else taken from
// halide_host_cpu_count(), and is clamped to [1, MAX_THREADS]. One fewer
// worker than that is spawned because the caller does work too.
//
// Returns zero if the job succeeded, otherwise the exit status of one of
// the failing jobs (whichever one failed last).
//
// NOTE(review): halide_thread_pool_initialized is tested without holding a
// lock, so two threads making the very first call at the same time could
// both run the initialization below. Confirm callers serialize the first call.
WEAK int halide_do_par_for(void *user_context, int (*f)(void *, int, uint8_t *),
                           int min, int size, uint8_t *closure) {
    if (halide_custom_do_par_for) {
        return (*halide_custom_do_par_for)(user_context, f, min, size, closure);
    }

    if (!halide_thread_pool_initialized) {
        halide_work_queue.shutdown = false;
        pthread_mutex_init(&halide_work_queue.mutex, NULL);
        pthread_cond_init(&halide_work_queue.state_change, NULL);
        halide_work_queue.jobs = NULL;

        char *threadStr = getenv("HL_NUMTHREADS");
        if (threadStr) {
            halide_threads = atoi(threadStr);
        } else {
            halide_threads = halide_host_cpu_count();
        }

        if (halide_threads > MAX_THREADS) {
            halide_threads = MAX_THREADS;
        } else if (halide_threads < 1) {
            halide_threads = 1;
        }

        for (int i = 0; i < halide_threads - 1; i++) {
            if (pthread_create(halide_work_queue.threads + i, NULL,
                               halide_worker_thread, NULL) != 0) {
                // threads[i] was never started. Shrink the pool to the
                // workers that really exist (plus the calling thread) so
                // halide_threads does not count an unstarted slot.
                halide_threads = i + 1;
                break;
            }
        }

        halide_thread_pool_initialized = true;
    }

    // Make the job.
    work job;
    job.f = f;                        // The job should call this function. It takes an index and a closure.
    job.user_context = user_context;
    job.next = min;                   // Start at this index.
    job.max = min + size;             // Keep going until one less than this index.
    job.closure = closure;            // Use this closure.
    job.exit_status = 0;              // The job hasn't failed yet
    job.active_workers = 0;           // Nobody is working on this yet

    // Push the job onto the stack.
    pthread_mutex_lock(&halide_work_queue.mutex);
    job.next_job = halide_work_queue.jobs;
    halide_work_queue.jobs = &job;
    pthread_mutex_unlock(&halide_work_queue.mutex);

    // Wake up any idle worker threads.
    pthread_cond_broadcast(&halide_work_queue.state_change);

    // Do some work myself.
    halide_worker_thread((void *)(&job));

    // Return zero if the job succeeded, otherwise return the exit
    // status of one of the failing jobs (whichever one failed last).
    return job.exit_status;
}
// JNI entry point: process one camera frame and post it to a Surface.
//
// Parameters:
//   jSrc      - Java byte array holding the frame. The first w*h bytes are
//               fed to the Halide pipeline; the (w*h)/2 bytes after them are
//               copied to the output unchanged as chrominance.
//   j_w, j_h  - frame width and height in pixels.
//   surf      - Java Surface whose native window receives the result.
//
// The call logs and returns early (after releasing everything it took) if
// the array cannot be pinned, the native window cannot be obtained, or the
// window buffer cannot be locked.
//
// NOTE(review): the output is addressed with a row stride of w, i.e. this
// assumes the locked buffer's stride equals w - confirm for target devices.
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame(
    JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jobject surf) {

    const int w = j_w, h = j_h;

    halide_set_error_handler(handler);

    unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL);
    if (!src) {
        LOGD("src is null\n");
        return;
    }

    // ANativeWindow_fromSurface hands back a reference that is already
    // acquired, so it is balanced by exactly one ANativeWindow_release below.
    // Acquiring again here would leak one reference per frame.
    ANativeWindow *win = ANativeWindow_fromSurface(env, surf);
    if (!win) {
        LOGD("ANativeWindow_fromSurface returned null\n");
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count());
        LOGD("Resetting buffer format");
        ANativeWindow_setBuffersGeometry(win, w, h, 0);
        first_call = false;
        for (int t = 0; t < 16; t++) {
            times[t] = 0;
        }
    }

    ANativeWindow_Buffer buf;
    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGD("ANativeWindow_lock failed with error code %d\n", err);
        // Nothing was locked, so there is nothing to unlock; but the window
        // reference and the pinned array must still be given back.
        ANativeWindow_release(win);
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    uint8_t *dst = (uint8_t *)buf.bits;

    // If we're using opencl, use the gpu backend for it.
    halide_set_ocl_device_type("gpu");

    // Make these static so that we can reuse device allocations across frames.
    static buffer_t srcBuf = {0};
    static buffer_t dstBuf = {0};

    if (dst) {
        srcBuf.host = (uint8_t *)src;
        srcBuf.host_dirty = true;
        srcBuf.extent[0] = w;
        srcBuf.extent[1] = h;
        srcBuf.extent[2] = 0;
        srcBuf.extent[3] = 0;
        srcBuf.stride[0] = 1;
        srcBuf.stride[1] = w;
        srcBuf.min[0] = 0;
        srcBuf.min[1] = 0;
        srcBuf.elem_size = 1;

        dstBuf.host = dst;
        dstBuf.extent[0] = w;
        dstBuf.extent[1] = h;
        dstBuf.extent[2] = 0;
        dstBuf.extent[3] = 0;
        dstBuf.stride[0] = 1;
        dstBuf.stride[1] = w;
        dstBuf.min[0] = 0;
        dstBuf.min[1] = 0;
        dstBuf.elem_size = 1;

        // Just copy over chrominance untouched
        memcpy(dst + w*h, src + w*h, (w*h)/2);

        int64_t t1 = halide_current_time_ns();
        halide_generated(&srcBuf, &dstBuf);

        if (dstBuf.dev) {
            halide_copy_to_host(NULL, &dstBuf);
        }

        int64_t t2 = halide_current_time_ns();
        unsigned elapsed_us = (t2 - t1)/1000;

        // Keep the last 16 timings in a ring and report the smallest.
        times[counter & 15] = elapsed_us;
        counter++;
        unsigned min = times[0];
        for (int i = 1; i < 16; i++) {
            if (times[i] < min) {
                min = times[i];
            }
        }
        // Both values are unsigned, so they are printed with %u.
        LOGD("Time taken: %u (%u)", elapsed_us, min);
    }

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
}
// Default parallel-for implementation: run task f for the indices in
// [min, min + size), sharing the work between the calling thread and the
// worker thread pool.
//
// The first call (made while holding the work queue mutex) creates the pool.
// If halide_num_threads is still zero, the thread count is read from the
// HL_NUM_THREADS environment variable, then from the legacy HL_NUMTHREADS,
// and otherwise taken from halide_host_cpu_count(). The count is clamped to
// [1, MAX_THREADS] and one fewer worker than that is spawned because the
// caller does work too.
//
// Returns zero if the job succeeded, otherwise the exit status of one of
// the failing jobs (whichever one failed last).
WEAK int default_do_par_for(void *user_context, halide_task f,
                            int min, int size, uint8_t *closure) {
    // Grab the lock. If it hasn't been initialized yet, then the
    // field will be zero-initialized because it's a static
    // global. pthreads helpfully interprets zero-valued mutex objects
    // as uninitialized and initializes them for you (see PTHREAD_MUTEX_INITIALIZER).
    pthread_mutex_lock(&halide_work_queue.mutex);

    if (!halide_thread_pool_initialized) {
        halide_work_queue.shutdown = false;
        pthread_cond_init(&halide_work_queue.wakeup_owners, NULL);
        pthread_cond_init(&halide_work_queue.wakeup_a_team, NULL);
        pthread_cond_init(&halide_work_queue.wakeup_b_team, NULL);
        halide_work_queue.jobs = NULL;

        if (!halide_num_threads) {
            char *threads_str = getenv("HL_NUM_THREADS");
            if (!threads_str) {
                // Legacy name for HL_NUM_THREADS
                threads_str = getenv("HL_NUMTHREADS");
            }
            if (threads_str) {
                halide_num_threads = atoi(threads_str);
            } else {
                halide_num_threads = halide_host_cpu_count();
            }
        }

        if (halide_num_threads > MAX_THREADS) {
            halide_num_threads = MAX_THREADS;
        } else if (halide_num_threads < 1) {
            halide_num_threads = 1;
        }

        for (int i = 0; i < halide_num_threads - 1; i++) {
            if (pthread_create(halide_work_queue.threads + i, NULL,
                               halide_worker_thread, NULL) != 0) {
                // threads[i] was never started. Shrink the pool to the
                // workers that really exist (plus the calling thread) so
                // the team sizes below do not count missing threads.
                halide_num_threads = i + 1;
                break;
            }
        }

        // Everyone starts on the a team.
        halide_work_queue.a_team_size = halide_num_threads;

        halide_thread_pool_initialized = true;
    }

    // Make the job.
    work job;
    job.f = f;                        // The job should call this function. It takes an index and a closure.
    job.user_context = user_context;
    job.next = min;                   // Start at this index.
    job.max = min + size;             // Keep going until one less than this index.
    job.closure = closure;            // Use this closure.
    job.exit_status = 0;              // The job hasn't failed yet
    job.active_workers = 0;           // Nobody is working on this yet

    if (!halide_work_queue.jobs && size < halide_num_threads) {
        // If there's no nested parallelism happening and there are
        // fewer tasks to do than threads, then set the target A team
        // size so that some threads will put themselves to sleep
        // until a larger job arrives.
        halide_work_queue.target_a_team_size = size;
    } else {
        halide_work_queue.target_a_team_size = halide_num_threads;
    }

    // If there are more tasks than threads in the A team, we should
    // wake up everyone.
    bool wake_b_team = size > halide_work_queue.a_team_size;

    // Push the job onto the stack.
    job.next_job = halide_work_queue.jobs;
    halide_work_queue.jobs = &job;

    pthread_mutex_unlock(&halide_work_queue.mutex);

    // Wake up our A team.
    pthread_cond_broadcast(&halide_work_queue.wakeup_a_team);

    if (wake_b_team) {
        // We need the B team too.
        pthread_cond_broadcast(&halide_work_queue.wakeup_b_team);
    }

    // Do some work myself.
    halide_worker_thread((void *)(&job));

    // Return zero if the job succeeded, otherwise return the exit
    // status of one of the failing jobs (whichever one failed last).
    return job.exit_status;
}
// JNI entry point: run the edge_detect pipeline on a luma plane and post the
// result to a Surface as a grayscale YV12 image.
//
// Parameters:
//   srcWidth, srcHeight    - size of the source luma plane in pixels.
//   srcLumaByteBuffer      - direct ByteBuffer holding the source luma plane.
//   srcLumaRowStrideBytes  - row stride of the source luma plane in bytes.
//   dstSurface             - Java Surface whose native window receives the
//                            output.
//
// Returns true if the frame was processed successfully. Returns false if the
// source buffer address or the native window is unavailable, the window
// buffer cannot be locked or has no memory, its format is not YV12, its size
// does not match the source, or edge_detect reports an error.
JNIEXPORT bool JNICALL Java_com_example_helloandroidcamera2_JNIUtils_edgeDetect(
    JNIEnv *env, jobject obj, jint srcWidth, jint srcHeight,
    jobject srcLumaByteBuffer, jint srcLumaRowStrideBytes, jobject dstSurface) {

    uint8_t *srcLumaPtr = reinterpret_cast<uint8_t *>(
        env->GetDirectBufferAddress(srcLumaByteBuffer));
    if (srcLumaPtr == NULL) {
        return false;
    }

    // ANativeWindow_fromSurface hands back a reference that is already
    // acquired, so every exit path below balances it with exactly one
    // ANativeWindow_release. Acquiring again would leak one reference per call.
    ANativeWindow *win = ANativeWindow_fromSurface(env, dstSurface);
    if (win == NULL) {
        LOGE("ANativeWindow_fromSurface returned NULL");
        return false;
    }

    ANativeWindow_Buffer buf;
    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGE("ANativeWindow_lock failed with error code %d\n", err);
        ANativeWindow_release(win);
        return false;
    }

    ANativeWindow_setBuffersGeometry(win, srcWidth, srcHeight, 0 /*format unchanged*/);

    uint8_t *dstLumaPtr = reinterpret_cast<uint8_t *>(buf.bits);
    if (dstLumaPtr == NULL) {
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    if (buf.format != IMAGE_FORMAT_YV12) {
        LOGE("ANativeWindow buffer locked but its format was not YV12.");
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    if (!checkBufferSizesMatch(srcWidth, srcHeight, &buf)) {
        LOGE("ANativeWindow buffer locked but its size was %d x %d, expected "
             "%d x %d", buf.width, buf.height, srcWidth, srcHeight);
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    uint32_t dstLumaSizeBytes = buf.stride * buf.height;
    uint32_t dstChromaRowStrideBytes = ALIGN(buf.stride / 2, 16);
    // Size of one chroma plane.
    uint32_t dstChromaSizeBytes = dstChromaRowStrideBytes * buf.height / 2;
    // The first chroma plane starts right after the luma plane; the second
    // one follows it directly.
    uint8_t *dstChromaVPtr = dstLumaPtr + dstLumaSizeBytes;

    // Make these static so that we can reuse device allocations across frames.
    // It doesn't matter now, but useful for GPU backends.
    static buffer_t srcBuf = { 0 };
    static buffer_t dstBuf = { 0 };

    srcBuf.host = srcLumaPtr;
    srcBuf.host_dirty = true;
    srcBuf.extent[0] = srcWidth;
    srcBuf.extent[1] = srcHeight;
    srcBuf.extent[2] = 0;
    srcBuf.extent[3] = 0;
    srcBuf.stride[0] = 1;
    srcBuf.stride[1] = srcLumaRowStrideBytes;
    srcBuf.min[0] = 0;
    srcBuf.min[1] = 0;
    srcBuf.elem_size = 1;

    dstBuf.host = dstLumaPtr;
    dstBuf.extent[0] = buf.width;  // src and dst width/height actually match.
    dstBuf.extent[1] = buf.height;
    dstBuf.extent[2] = 0;
    dstBuf.extent[3] = 0;
    dstBuf.stride[0] = 1;
    dstBuf.stride[1] = buf.stride;  // src and dst strides actually match.
    dstBuf.min[0] = 0;
    dstBuf.min[1] = 0;
    dstBuf.elem_size = 1;

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n",
             halide_host_cpu_count());
        first_call = false;
        for (int t = 0; t < 16; t++) {
            times[t] = 0;
        }
    }

    // Set chrominance to 128 to appear grayscale.
    // The dst chroma is guaranteed to be tightly packed since it's YV12,
    // so one memset covers both chroma planes.
    memset(dstChromaVPtr, 128, dstChromaSizeBytes * 2);

    int64_t t1 = halide_current_time_ns();
    int err = edge_detect(&srcBuf, &dstBuf);
    if (err != halide_error_code_success) {
        LOGE("edge_detect failed with error code: %d", err);
    }

    int64_t t2 = halide_current_time_ns();
    unsigned elapsed_us = (t2 - t1) / 1000;

    // Keep the last 16 timings in a ring and report the smallest.
    times[counter & 15] = elapsed_us;
    counter++;
    unsigned min = times[0];
    for (int i = 1; i < 16; i++) {
        if (times[i] < min) {
            min = times[i];
        }
    }
    // Both values are unsigned, so they are printed with %u.
    LOGD("Time taken: %u us (minimum: %u us)", elapsed_us, min);

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    // true means success, matching the `return false` on every error path
    // above.
    return (err == halide_error_code_success);
}
// JNI entry point: process one camera frame and post it to a Surface.
//
// Parameters:
//   jSrc           - Java byte array holding the frame; the first w*h bytes
//                    are fed to the Halide pipeline.
//   j_w, j_h       - frame width and height in pixels.
//   j_orientation  - values >= 180 make the source be read back-to-front
//                    (negative strides starting at the last of the w*h bytes).
//   surf           - Java Surface whose native window receives the result.
//
// The output chroma bytes are set to 128 (gray). The call logs and returns
// early (after releasing everything it took) if the array cannot be pinned,
// the native window cannot be obtained, or the window buffer cannot be locked.
//
// NOTE(review): the output is addressed with a row stride of w, i.e. this
// assumes the locked buffer's stride equals w - confirm for target devices.
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame(
    JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h,
    jint j_orientation, jobject surf) {

    const int w = j_w, h = j_h, orientation = j_orientation;

    halide_start_clock(NULL);
    halide_set_error_handler(handler);

    unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL);
    if (!src) {
        LOGD("src is null\n");
        return;
    }

    LOGD("[output window size] j_w = %d, j_h = %d", j_w, j_h);
    LOGD("[src array length] jSrc.length = %d", env->GetArrayLength(jSrc));

    // The reference returned here is already acquired; it is balanced by
    // exactly one ANativeWindow_release on every path below.
    ANativeWindow *win = ANativeWindow_fromSurface(env, surf);
    if (!win) {
        LOGD("ANativeWindow_fromSurface returned null\n");
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count());
        LOGD("Resetting buffer format");
        ANativeWindow_setBuffersGeometry(win, w, h, 0);
        first_call = false;
        for (int t = 0; t < 16; t++) {
            times[t] = 0;
        }
    }

    ANativeWindow_Buffer buf;
    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGD("ANativeWindow_lock failed with error code %d\n", err);
        // Nothing was locked, so there is nothing to unlock; but the window
        // reference and the pinned array must still be given back.
        ANativeWindow_release(win);
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    uint8_t *dst = (uint8_t *)buf.bits;

    // If we're using opencl, use the gpu backend for it.
#if COMPILING_FOR_OPENCL
    halide_opencl_set_device_type("gpu");
#endif

    // Make these static so that we can reuse device allocations across frames.
    static halide_buffer_t srcBuf = {0};
    static halide_dimension_t srcDim[2];
    static halide_buffer_t dstBuf = {0};
    static halide_dimension_t dstDim[2];

    if (dst) {
        srcBuf.host = (uint8_t *)src;
        srcBuf.set_host_dirty();
        srcBuf.dim = srcDim;
        srcBuf.dim[0].min = 0;
        srcBuf.dim[0].extent = w;
        srcBuf.dim[0].stride = 1;
        srcBuf.dim[1].min = 0;
        srcBuf.dim[1].extent = h;
        srcBuf.dim[1].stride = w;
        srcBuf.type = halide_type_of<uint8_t>();

        if (orientation >= 180) {
            // Camera sensor is probably upside down (e.g. Nexus 5x).
            // Start at the last of the w*h bytes and walk backwards in both
            // dimensions.
            srcBuf.host += w*h-1;
            srcBuf.dim[0].stride = -1;
            srcBuf.dim[1].stride = -w;
        }

        dstBuf.host = dst;
        dstBuf.dim = dstDim;
        dstBuf.dim[0].min = 0;
        dstBuf.dim[0].extent = w;
        dstBuf.dim[0].stride = 1;
        dstBuf.dim[1].min = 0;
        dstBuf.dim[1].extent = h;
        dstBuf.dim[1].stride = w;
        dstBuf.type = halide_type_of<uint8_t>();

        // Just set chroma to gray.
        memset(dst + w*h, 128, (w*h)/2);

        int64_t t1 = halide_current_time_ns();
        hello(&srcBuf, &dstBuf);

        halide_copy_to_host(NULL, &dstBuf);

        int64_t t2 = halide_current_time_ns();
        unsigned elapsed_us = (t2 - t1)/1000;

        // Keep the last 16 timings in a ring and report the smallest.
        times[counter & 15] = elapsed_us;
        counter++;
        unsigned min = times[0];
        for (int i = 1; i < 16; i++) {
            if (times[i] < min) {
                min = times[i];
            }
        }
        // Both values are unsigned, so they are printed with %u.
        LOGD("Time taken: %u (%u)", elapsed_us, min);
    }

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
}