// Parallel-for entry point: runs f over the indices [min, min + size).
//
// When a replacement has been installed in halide_custom_do_par_for the call
// is forwarded to it untouched. Otherwise the built-in pool is set up on the
// first call, the loop is published as a job on the shared work queue, and
// the calling thread joins in through halide_worker_thread.
//
// Returns zero if the job succeeded, otherwise the exit status recorded in
// the job (job.exit_status).
WEAK int halide_do_par_for(void *user_context, int (*f)(void *, int, uint8_t *),
                           int min, int size, uint8_t *closure) {
    // A user-installed handler replaces the built-in implementation entirely.
    if (halide_custom_do_par_for) {
        return (*halide_custom_do_par_for)(user_context, f, min, size, closure);
    }

    // First call: initialize the queue and spawn the worker threads.
    if (!halide_thread_pool_initialized) {
        halide_work_queue.shutdown = false;
        pthread_mutex_init(&halide_work_queue.mutex, NULL);
        pthread_cond_init(&halide_work_queue.state_change, NULL);
        halide_work_queue.jobs = NULL;

        // HL_NUMTHREADS, when set, overrides the detected CPU count.
        char *env_threads = getenv("HL_NUMTHREADS");
        halide_threads = env_threads ? atoi(env_threads) : halide_host_cpu_count();

        // Keep the thread count between 1 and MAX_THREADS.
        if (halide_threads > MAX_THREADS) {
            halide_threads = MAX_THREADS;
        } else if (halide_threads < 1) {
            halide_threads = 1;
        }

        // The calling thread also runs halide_worker_thread below, so only
        // halide_threads - 1 extra threads are created.
        int spawned = 0;
        while (spawned < halide_threads - 1) {
            pthread_create(&halide_work_queue.threads[spawned], NULL,
                           halide_worker_thread, NULL);
            ++spawned;
        }

        halide_thread_pool_initialized = true;
    }

    // Describe the loop as a job record.
    work task;
    task.user_context = user_context;
    task.f = f;                // Called with an index and the closure.
    task.closure = closure;
    task.next = min;           // First index to hand out.
    task.max = min + size;     // One past the last index.
    task.active_workers = 0;   // Nobody has picked it up yet.
    task.exit_status = 0;      // No failure recorded yet.

    // Link the record in at the head of the job stack while holding the lock.
    pthread_mutex_lock(&halide_work_queue.mutex);
    task.next_job = halide_work_queue.jobs;
    halide_work_queue.jobs = &task;
    pthread_mutex_unlock(&halide_work_queue.mutex);

    // Let idle workers know that the queue changed.
    pthread_cond_broadcast(&halide_work_queue.state_change);

    // Pitch in from the calling thread as well.
    halide_worker_thread((void *)(&task));

    return task.exit_status;
}
Beispiel #2
0
// JNI entry point for CameraPreview.processFrame: runs the Halide pipeline on
// one camera frame and posts the result to the given Surface.
//
// Parameters:
//   env, obj - JNI environment and receiver object.
//   jSrc     - frame bytes. The first w*h bytes are handed to the pipeline as
//              a w x h single-byte image; the following (w*h)/2 bytes are
//              copied to the output unchanged.
//   j_w, j_h - frame width and height.
//   surf     - Java Surface whose native window receives the output.
//
// Every exit path taken after a resource has been obtained gives it back: the
// pinned byte array is released and the native window reference is dropped.
//
// NOTE(review): the destination is addressed with a row stride of w and
// buf.stride is never consulted — confirm the window buffer is tightly packed
// and large enough for w*h*3/2 bytes.
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame(
    JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jobject surf) {

    const int w = j_w, h = j_h;

    halide_set_error_handler(handler);

    unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL);
    if (!src) {
        LOGD("src is null\n");
        return;
    }

    // ANativeWindow_fromSurface hands back an already-acquired reference, so
    // the single ANativeWindow_release on each exit path balances it. An
    // additional ANativeWindow_acquire here would leak one reference per frame.
    ANativeWindow *win = ANativeWindow_fromSurface(env, surf);
    if (!win) {
        LOGD("ANativeWindow_fromSurface returned null\n");
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count());
        LOGD("Resetting buffer format");
        ANativeWindow_setBuffersGeometry(win, w, h, 0);
        first_call = false;
        for (int t = 0; t < 16; t++) {
            times[t] = 0;
        }
    }

    ANativeWindow_Buffer buf;

    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGD("ANativeWindow_lock failed with error code %d\n", err);
        // Nothing was locked, but the window reference and the pinned array
        // still have to be given back.
        ANativeWindow_release(win);
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    uint8_t *dst = (uint8_t *)buf.bits;

    // If we're using opencl, use the gpu backend for it.
    halide_set_ocl_device_type("gpu");

    // Make these static so that we can reuse device allocations across frames.
    static buffer_t srcBuf = {0};
    static buffer_t dstBuf = {0};

    if (dst) {
        srcBuf.host = (uint8_t *)src;
        srcBuf.host_dirty = true;
        srcBuf.extent[0] = w;
        srcBuf.extent[1] = h;
        srcBuf.extent[2] = 0;
        srcBuf.extent[3] = 0;
        srcBuf.stride[0] = 1;
        srcBuf.stride[1] = w;
        srcBuf.min[0] = 0;
        srcBuf.min[1] = 0;
        srcBuf.elem_size = 1;

        dstBuf.host = dst;
        dstBuf.extent[0] = w;
        dstBuf.extent[1] = h;
        dstBuf.extent[2] = 0;
        dstBuf.extent[3] = 0;
        dstBuf.stride[0] = 1;
        dstBuf.stride[1] = w;
        dstBuf.min[0] = 0;
        dstBuf.min[1] = 0;
        dstBuf.elem_size = 1;

        // Just copy over chrominance untouched
        memcpy(dst + w*h, src + w*h, (w*h)/2);

        int64_t t1 = halide_current_time_ns();
        halide_generated(&srcBuf, &dstBuf);

        // Only pull the result back when a device allocation exists.
        if (dstBuf.dev) {
            halide_copy_to_host(NULL, &dstBuf);
        }

        int64_t t2 = halide_current_time_ns();
        unsigned elapsed_us = (t2 - t1)/1000;

        // Keep the last 16 timings in a ring and report the smallest entry
        // alongside the current frame's time.
        times[counter & 15] = elapsed_us;
        counter++;
        unsigned min = times[0];
        for (int i = 1; i < 16; i++) {
            if (times[i] < min) {
                min = times[i];
            }
        }
        // Both values are unsigned, so print them with %u.
        LOGD("Time taken: %u (%u)", elapsed_us, min);
    }

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
}
Beispiel #3
0
// Default parallel-for implementation: runs f over [min, min + size) using
// the shared work queue.
//
// On the first call (while holding halide_work_queue.mutex) it initializes
// the queue's condition variables, decides the thread count, and creates the
// worker threads. On every call it builds a `work` record on this function's
// stack, pushes it onto the front of halide_work_queue.jobs, signals the
// wakeup condition variables, and then calls halide_worker_thread on the
// calling thread with that record.
//
// Parameters:
//   user_context - stored in the job record and otherwise unused here.
//   f            - task function stored in the job record.
//   min, size    - the job covers indices from min up to, but not including,
//                  min + size.
//   closure      - stored in the job record and otherwise unused here.
//
// Returns job.exit_status, which starts out as 0.
//
// NOTE(review): the job record is a stack local whose address is published on
// the queue, so halide_worker_thread(&job) presumably does not return until
// no worker references it any more — confirm against halide_worker_thread.
WEAK int default_do_par_for(void *user_context, halide_task f,
                            int min, int size, uint8_t *closure) {
    // Grab the lock. If it hasn't been initialized yet, then the
    // field will be zero-initialized because it's a static
    // global. pthreads helpfully interprets zero-valued mutex objects
    // as uninitialized and initializes them for you (see PTHREAD_MUTEX_INITIALIZER).
    pthread_mutex_lock(&halide_work_queue.mutex);

    // One-time pool setup, performed under the lock taken above.
    if (!halide_thread_pool_initialized) {
        halide_work_queue.shutdown = false;
        pthread_cond_init(&halide_work_queue.wakeup_owners, NULL);
        pthread_cond_init(&halide_work_queue.wakeup_a_team, NULL);
        pthread_cond_init(&halide_work_queue.wakeup_b_team, NULL);
        halide_work_queue.jobs = NULL;

        // Only consult the environment (and then the CPU count) when
        // halide_num_threads is still zero at this point.
        if (!halide_num_threads) {
            char *threads_str = getenv("HL_NUM_THREADS");
            if (!threads_str) {
                // Legacy name for HL_NUM_THREADS
                threads_str = getenv("HL_NUMTHREADS");
            }
            if (threads_str) {
                halide_num_threads = atoi(threads_str);
            } else {
                halide_num_threads = halide_host_cpu_count();
                // halide_printf(user_context, "HL_NUM_THREADS not defined. Defaulting to %d threads.\n", halide_num_threads);
            }
        }
        // Clamp the thread count so it is at most MAX_THREADS and at least 1.
        if (halide_num_threads > MAX_THREADS) {
            halide_num_threads = MAX_THREADS;
        } else if (halide_num_threads < 1) {
            halide_num_threads = 1;
        }
        // Create one thread fewer than halide_num_threads; the calling thread
        // runs halide_worker_thread itself further down.
        for (int i = 0; i < halide_num_threads-1; i++) {
            //fprintf(stderr, "Creating thread %d\n", i);
            pthread_create(halide_work_queue.threads + i, NULL, halide_worker_thread, NULL);
        }
        // Everyone starts on the a team.
        halide_work_queue.a_team_size = halide_num_threads;

        halide_thread_pool_initialized = true;
    }

    // Make the job.
    work job;
    job.f = f;               // The job should call this function. It takes an index and a closure.
    job.user_context = user_context;
    job.next = min;          // Start at this index.
    job.max  = min + size;   // Keep going until one less than this index.
    job.closure = closure;   // Use this closure.
    job.exit_status = 0;     // The job hasn't failed yet
    job.active_workers = 0;  // Nobody is working on this yet

    if (!halide_work_queue.jobs && size < halide_num_threads) {
        // If there's no nested parallelism happening and there are
        // fewer tasks to do than threads, then set the target A team
        // size so that some threads will put themselves to sleep
        // until a larger job arrives.
        halide_work_queue.target_a_team_size = size;
    } else {
        halide_work_queue.target_a_team_size = halide_num_threads;
    }

    // If there are more tasks than threads in the A team, we should
    // wake up everyone.
    // The decision is captured while the lock is still held; the broadcast
    // itself happens after the unlock below.
    bool wake_b_team = size > halide_work_queue.a_team_size;

    // Push the job onto the stack.
    job.next_job = halide_work_queue.jobs;
    halide_work_queue.jobs = &job;

    pthread_mutex_unlock(&halide_work_queue.mutex);

    // Wake up our A team.
    pthread_cond_broadcast(&halide_work_queue.wakeup_a_team);

    if (wake_b_team) {
        // We need the B team too.
        pthread_cond_broadcast(&halide_work_queue.wakeup_b_team);
    }

    // Do some work myself.
    halide_worker_thread((void *)(&job));

    // Return zero if the job succeeded, otherwise return the exit
    // status of one of the failing jobs (whichever one failed last).
    return job.exit_status;
}
// JNI entry point for JNIUtils.edgeDetect: runs the edge_detect pipeline on
// the luma plane of a camera frame and posts the result to a Surface.
//
// Parameters:
//   env, obj              - JNI environment and receiver object.
//   srcWidth, srcHeight   - dimensions of the source luma plane.
//   srcLumaByteBuffer     - direct ByteBuffer holding the source luma bytes.
//   srcLumaRowStrideBytes - row stride of the source luma plane, in bytes.
//   dstSurface            - Java Surface that receives the output; its locked
//                           buffer must report IMAGE_FORMAT_YV12 and pass
//                           checkBufferSizesMatch.
//
// Returns true when the frame was processed and edge_detect reported
// halide_error_code_success; returns false on every failure path.
JNIEXPORT bool JNICALL Java_com_example_helloandroidcamera2_JNIUtils_edgeDetect(
    JNIEnv *env, jobject obj, jint srcWidth, jint srcHeight,
    jobject srcLumaByteBuffer, jint srcLumaRowStrideBytes, jobject dstSurface) {
    uint8_t *srcLumaPtr = reinterpret_cast<uint8_t *>(
        env->GetDirectBufferAddress(srcLumaByteBuffer));
    if (srcLumaPtr == NULL) {
        return false;
    }

    // ANativeWindow_fromSurface hands back an already-acquired reference, so
    // the single ANativeWindow_release on each exit path balances it. An
    // additional ANativeWindow_acquire here would leak one reference per frame.
    ANativeWindow *win = ANativeWindow_fromSurface(env, dstSurface);
    if (win == NULL) {
        LOGE("ANativeWindow_fromSurface returned null");
        return false;
    }

    ANativeWindow_Buffer buf;
    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGE("ANativeWindow_lock failed with error code %d\n", err);
        ANativeWindow_release(win);
        return false;
    }

    ANativeWindow_setBuffersGeometry(win, srcWidth, srcHeight, 0 /*format unchanged*/);

    uint8_t *dstLumaPtr = reinterpret_cast<uint8_t *>(buf.bits);
    if (dstLumaPtr == NULL) {
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    if (buf.format != IMAGE_FORMAT_YV12) {
        LOGE("ANativeWindow buffer locked but its format was not YV12.");
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    if (!checkBufferSizesMatch(srcWidth, srcHeight, &buf)) {
        LOGE("ANativeWindow buffer locked but its size was %d x %d, expected "
                "%d x %d", buf.width, buf.height, srcWidth, srcHeight);
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    // The two chroma planes sit directly after the luma plane; they are
    // filled together by the memset below.
    uint32_t dstLumaSizeBytes = buf.stride * buf.height;
    uint32_t dstChromaRowStrideBytes = ALIGN(buf.stride / 2, 16);
    // Size of one chroma plane.
    uint32_t dstChromaSizeBytes = dstChromaRowStrideBytes * buf.height / 2;
    uint8_t *dstChromaVPtr = dstLumaPtr + dstLumaSizeBytes;

    // Make these static so that we can reuse device allocations across frames.
    // It doesn't matter now, but useful for GPU backends.
    static buffer_t srcBuf = { 0 };
    static buffer_t dstBuf = { 0 };

    srcBuf.host = srcLumaPtr;
    srcBuf.host_dirty = true;
    srcBuf.extent[0] = srcWidth;
    srcBuf.extent[1] = srcHeight;
    srcBuf.extent[2] = 0;
    srcBuf.extent[3] = 0;
    srcBuf.stride[0] = 1;
    srcBuf.stride[1] = srcLumaRowStrideBytes;
    srcBuf.min[0] = 0;
    srcBuf.min[1] = 0;
    srcBuf.elem_size = 1;

    dstBuf.host = dstLumaPtr;
    dstBuf.extent[0] = buf.width;  // src and dst width/height actually match.
    dstBuf.extent[1] = buf.height;
    dstBuf.extent[2] = 0;
    dstBuf.extent[3] = 0;
    dstBuf.stride[0] = 1;
    dstBuf.stride[1] = buf.stride;  // src and dst strides actually match.
    dstBuf.min[0] = 0;
    dstBuf.min[1] = 0;
    dstBuf.elem_size = 1;

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n",
             halide_host_cpu_count());
        first_call = false;
        for (int t = 0; t < 16; t++) {
            times[t] = 0;
        }
    }

    // Set chrominance to 128 to appear grayscale.
    // The dst chroma is guaranteed to be tightly packed since it's YV12.
    memset(dstChromaVPtr, 128, dstChromaSizeBytes * 2);

    int64_t t1 = halide_current_time_ns();
    int err = edge_detect(&srcBuf, &dstBuf);
    if (err != halide_error_code_success) {
        LOGE("edge_detect failed with error code: %d", err);
    }

    int64_t t2 = halide_current_time_ns();
    unsigned elapsed_us = (t2 - t1) / 1000;

    // Keep the last 16 timings in a ring and report the smallest entry
    // alongside the current frame's time.
    times[counter & 15] = elapsed_us;
    counter++;
    unsigned min = times[0];
    for (int i = 1; i < 16; i++) {
        if (times[i] < min) {
            min = times[i];
        }
    }
    // Both values are unsigned, so print them with %u.
    LOGD("Time taken: %u us (minimum: %u us)", elapsed_us, min);

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    // Report success as true, matching the early exits above that return
    // false on failure.
    return (err == halide_error_code_success);
}
Beispiel #5
0
// JNI entry point for CameraPreview.processFrame: runs the `hello` Halide
// pipeline on one camera frame and posts the result to the given Surface.
//
// Parameters:
//   env, obj      - JNI environment and receiver object.
//   jSrc          - frame bytes; the first w*h bytes are handed to the
//                   pipeline as a w x h uint8 image.
//   j_w, j_h      - frame width and height.
//   j_orientation - values of 180 or more make the source be read back to
//                   front (negative strides starting at the last luma byte).
//   surf          - Java Surface whose native window receives the output.
//
// Every exit path taken after a resource has been obtained gives it back: the
// pinned byte array is released and the native window reference is dropped.
//
// NOTE(review): the destination is addressed with a row stride of w and
// buf.stride is never consulted — confirm the window buffer is tightly packed
// and large enough for w*h*3/2 bytes.
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame(
    JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jint j_orientation, jobject surf) {

    const int w = j_w, h = j_h, orientation = j_orientation;

    halide_start_clock(NULL);
    halide_set_error_handler(handler);

    unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL);
    if (!src) {
        LOGD("src is null\n");
        return;
    }

    LOGD("[output window size] j_w = %d, j_h = %d", j_w, j_h);
    LOGD("[src array length] jSrc.length = %d", env->GetArrayLength(jSrc));

    // The reference returned here is already acquired; it is dropped by the
    // ANativeWindow_release call on each exit path below.
    ANativeWindow *win = ANativeWindow_fromSurface(env, surf);
    if (!win) {
        LOGD("ANativeWindow_fromSurface returned null\n");
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count());
        LOGD("Resetting buffer format");
        ANativeWindow_setBuffersGeometry(win, w, h, 0);
        first_call = false;
        for (int t = 0; t < 16; t++) {
            times[t] = 0;
        }
    }

    ANativeWindow_Buffer buf;

    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGD("ANativeWindow_lock failed with error code %d\n", err);
        // Nothing was locked, but the window reference and the pinned array
        // still have to be given back.
        ANativeWindow_release(win);
        env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
        return;
    }

    uint8_t *dst = (uint8_t *)buf.bits;

    // If we're using opencl, use the gpu backend for it.
#if COMPILING_FOR_OPENCL
    halide_opencl_set_device_type("gpu");
#endif

    // Make these static so that we can reuse device allocations across frames.
    static halide_buffer_t srcBuf = {0};
    static halide_dimension_t srcDim[2];
    static halide_buffer_t dstBuf = {0};
    static halide_dimension_t dstDim[2];

    if (dst) {
        srcBuf.host = (uint8_t *)src;
        srcBuf.set_host_dirty();
        // Two entries of srcDim are filled in below, so declare the buffer
        // as two-dimensional instead of leaving the count at zero.
        srcBuf.dimensions = 2;
        srcBuf.dim = srcDim;
        srcBuf.dim[0].min = 0;
        srcBuf.dim[0].extent = w;
        srcBuf.dim[0].stride = 1;
        srcBuf.dim[1].min = 0;
        srcBuf.dim[1].extent = h;
        srcBuf.dim[1].stride = w;
        srcBuf.type = halide_type_of<uint8_t>();

        if (orientation >= 180) {
            // Camera sensor is probably upside down (e.g. Nexus 5x)
            // Start at the last luma byte and walk backwards in both axes.
            srcBuf.host += w*h-1;
            srcBuf.dim[0].stride = -1;
            srcBuf.dim[1].stride = -w;
        }

        dstBuf.host = dst;
        dstBuf.dimensions = 2;
        dstBuf.dim = dstDim;
        dstBuf.dim[0].min = 0;
        dstBuf.dim[0].extent = w;
        dstBuf.dim[0].stride = 1;
        dstBuf.dim[1].min = 0;
        dstBuf.dim[1].extent = h;
        dstBuf.dim[1].stride = w;
        dstBuf.type = halide_type_of<uint8_t>();

        // Just set chroma to gray.
        memset(dst + w*h, 128, (w*h)/2);

        int64_t t1 = halide_current_time_ns();
        int pipeline_err = hello(&srcBuf, &dstBuf);
        if (pipeline_err) {
            // Surface the failure instead of silently posting the frame.
            LOGD("hello failed with error code %d\n", pipeline_err);
        }

        halide_copy_to_host(NULL, &dstBuf);

        int64_t t2 = halide_current_time_ns();
        unsigned elapsed_us = (t2 - t1)/1000;

        // Keep the last 16 timings in a ring and report the smallest entry
        // alongside the current frame's time.
        times[counter & 15] = elapsed_us;
        counter++;
        unsigned min = times[0];
        for (int i = 1; i < 16; i++) {
            if (times[i] < min) {
                min = times[i];
            }
        }
        // Both values are unsigned, so print them with %u.
        LOGD("Time taken: %u (%u)", elapsed_us, min);
    }

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
}