/*
 * Build one feature row from a set of keypoint descriptors.
 *
 * For every descriptor row i in [0, desc_m):
 *   - the row is passed to nv_vector_normalize() (so the caller's desc_vec
 *     rows are modified -- presumably normalized in place);
 *   - the sign of key_vec[i][NV_KEYPOINT_RESPONSE_IDX] selects the POSI()
 *     codebook (> 0) or the NEGA() codebook (otherwise);
 *   - nv_nn() picks a row of that codebook (presumably the nearest
 *     neighbour -- confirm against nv_nn) and the element-wise residual
 *     "descriptor - codebook row" is accumulated into the slice of the
 *     output that belongs to that codebook row.
 *
 * Positive slices start at  label * NV_KEYPOINT_DESC_N, negative slices at
 * (KP + label) * NV_KEYPOINT_DESC_N.
 * NOTE(review): KP looks like the number of positive codebook rows -- confirm.
 *
 * Accumulation is done into one scratch row per thread so the OpenMP loop
 * needs no locking; the scratch rows are summed into vec[vec_j] afterwards
 * and the result is normalized.
 *
 * vec, vec_j  destination matrix and row (overwritten).
 * key_vec     keypoint rows; only column NV_KEYPOINT_RESPONSE_IDX is read.
 * desc_vec    descriptor rows (modified, see above).
 * desc_m      number of descriptor rows to process.
 */
void feature_vector(nv_matrix_t *vec, int vec_j,
	nv_matrix_t *key_vec,
	nv_matrix_t *desc_vec,
	int desc_m)
{
	int procs = nv_omp_procs();
	nv_matrix_t *partial = nv_matrix_alloc(vec->n, procs);
	const nv_matrix_t *posi = POSI();
	const nv_matrix_t *nega = NEGA();
	int i;
	int t;

	nv_matrix_zero(partial);

#ifdef _OPENMP
#pragma omp parallel for num_threads(procs)
#endif
	for (i = 0; i < desc_m; ++i) {
		int tid = nv_omp_thread_id();
		const nv_matrix_t *codebook;
		int nearest;
		int base;
		int d;

		nv_vector_normalize(desc_vec, i);

		/* Pick the codebook and the start of its slice in the output. */
		if (NV_MAT_V(key_vec, i, NV_KEYPOINT_RESPONSE_IDX) > 0.0f) {
			codebook = posi;
			nearest = nv_nn(posi, desc_vec, i);
			base = nearest * NV_KEYPOINT_DESC_N;
		} else {
			codebook = nega;
			nearest = nv_nn(nega, desc_vec, i);
			base = (KP + nearest) * NV_KEYPOINT_DESC_N;
		}

		/* Accumulate the residual into this thread's private row. */
		for (d = 0; d < codebook->n; ++d) {
			NV_MAT_V(partial, tid, base + d) +=
				NV_MAT_V(desc_vec, i, d) - NV_MAT_V(codebook, nearest, d);
		}
	}

	/* Reduce the per-thread rows into the destination row. */
	nv_vector_zero(vec, vec_j);
	for (t = 0; t < procs; ++t) {
		nv_vector_add(vec, vec_j, vec, vec_j, partial, t);
	}
	nv_vector_normalize(vec, vec_j);

	nv_matrix_free(&partial);
}
/*
 * Store the column-wise average of all rows of `mat` in row `mean_m` of
 * `mean`.
 *
 * The destination row is zeroed first, then every row of `mat` is added to
 * it pre-scaled by 1 / mat->m (each element is scaled before it is summed,
 * rather than dividing the total at the end).
 *
 * mean, mean_m  destination matrix and row (overwritten).
 * mat           source matrix; NV_ASSERT checks mat->n == mean->n.
 */
void nv_vector_avg(nv_matrix_t *mean, int mean_m, const nv_matrix_t *mat)
{
	const float inv_rows = 1.0f / mat->m;
	int row;
	int col;

	NV_ASSERT(mean->n == mat->n);

	nv_vector_zero(mean, mean_m);

	for (row = 0; row < mat->m; ++row) {
		for (col = 0; col < mat->n; ++col) {
			NV_MAT_V(mean, mean_m, col) += inv_rows * NV_MAT_V(mat, row, col);
		}
	}
}
/*
 * Four-corner box lookup on plane 0 of `sum`:
 *   S(bottom,right) - S(bottom,left) - (S(top,right) - S(top,left)).
 * NOTE(review): this is the usual summed-area-table box sum, so `sum` is
 * presumably an integral image -- confirm against the caller.
 */
static float nv_face_haarlike_box_sum(const nv_matrix_t *sum,
	int left, int top, int right, int bottom)
{
	return NV_MAT3D_V(sum, bottom, right, 0)
		- NV_MAT3D_V(sum, bottom, left, 0)
		- (NV_MAT3D_V(sum, top, right, 0) - NV_MAT3D_V(sum, top, left, 0));
}

/*
 * Extract Haar-like features for the window (x, y, width, height) into row
 * `feature_m` of `feature`.
 *
 * The window is mapped onto a virtual 32x32 grid. An 8x8 (grid units) cell
 * is placed at every second grid position, giving 12 x 12 cell positions,
 * and each position fills a group of 8 consecutive slots:
 *
 *   slot 0 / 1 : upper half vs. lower half of the cell; only the slot
 *                matching the sign of the difference is written
 *   slot 2 / 3 : left half vs. right half, same rule
 *   slot 4 / 5 : nv_face_haarlike_diagonal_filter(1, ...), positive / negated
 *   slot 6 / 7 : nv_face_haarlike_diagonal_filter(2, ...), positive / negated
 *
 * The highest slot written is 11 * 96 + 11 * 8 + 7 = 1151, so the feature
 * row needs at least 1152 columns. The row is zeroed first, so the slot of
 * each pair that is not written stays 0.
 *
 * Box sums are divided by (pixel count * 255) before being compared
 * (presumably 255 is the maximum pixel intensity -- confirm).
 * NOTE(review): if the half-cell size rounds to 0 for a very small window,
 * those divisors become 0.
 *
 * normalize_type:
 *   NV_NORMALIZE_MAX   non-zero entries become (v - vmin) / (vmax - vmin),
 *                      where vmin is the smallest non-zero entry and vmax
 *                      the largest entry; zero entries are left alone.
 *   NV_NORMALIZE_NORM  the row is scaled to unit Euclidean length
 *                      (skipped when the squared length is 0).
 *   anything else      the row is left as computed.
 */
void nv_face_haarlike(nv_face_haarlike_normalize_e normalize_type,
	nv_matrix_t *feature, int feature_m,
	const nv_matrix_t *sum,
	int x, int y, int width, int height)
{
	const float xscale = width / 32.0f;
	const float yscale = height / 32.0f;
	const int half_h = NV_ROUND_INT(4.0f * yscale);
	const int half_w = NV_ROUND_INT(4.0f * xscale);
	const int cells = (32 - 8) / 2;             /* cell positions per axis */
	const int row_stride = (32 - 8) / 2 * 8;    /* slots per row of cells  */
	int cell_x;
	int cell_y;
	int k;

	nv_vector_zero(feature, feature_m);

	/* level 1 (the original carried a disabled OpenMP pragma on this loop) */
	for (cell_y = 0; cell_y < cells; ++cell_y) {
		const int gy = cell_y * 2;
		const int top = y + NV_ROUND_INT(yscale * gy);
		const int bottom = top + NV_ROUND_INT(8.0f * yscale);
		const int mid_y = top + half_h;
		const float norm_y = (bottom - top) * 255.0f;

		for (cell_x = 0; cell_x < cells; ++cell_x) {
			const int gx = cell_x * 2;
			const int left = x + NV_ROUND_INT(xscale * gx);
			const int right = left + NV_ROUND_INT(8.0f * xscale);
			const int mid_x = left + half_w;
			const int slot = cell_y * row_stride + cell_x * 8;
			/* whole cell */
			const float whole = nv_face_haarlike_box_sum(sum, left, top, right, bottom);
			float norm_x;
			float first;
			float second;

			/*
			 * 1
			 * [+]
			 * [-]
			 */
			first = nv_face_haarlike_box_sum(sum, left, top, right, mid_y);
			second = whole - first;
			norm_x = (right - left) * 255.0f;
			first /= (mid_y - top) * norm_x;
			second /= (bottom - mid_y) * norm_x;
			if (first > second) {
				NV_MAT_V(feature, feature_m, slot + 0) = first - second;
			} else {
				NV_MAT_V(feature, feature_m, slot + 1) = second - first;
			}

			/*
			 * 2
			 * [+][-]
			 */
			first = nv_face_haarlike_box_sum(sum, left, top, mid_x, bottom);
			second = whole - first;
			first /= (mid_x - left) * norm_y;
			second /= (right - mid_x) * norm_y;
			if (first > second) {
				NV_MAT_V(feature, feature_m, slot + 2) = first - second;
			} else {
				NV_MAT_V(feature, feature_m, slot + 3) = second - first;
			}

			/* 3 */
			first = nv_face_haarlike_diagonal_filter(1, sum, left, top, xscale, yscale);
			if (first > 0.0f) {
				NV_MAT_V(feature, feature_m, slot + 4) = first;
			} else {
				NV_MAT_V(feature, feature_m, slot + 5) = -first;
			}

			/* 4 */
			first = nv_face_haarlike_diagonal_filter(2, sum, left, top, xscale, yscale);
			if (first > 0.0f) {
				NV_MAT_V(feature, feature_m, slot + 6) = first;
			} else {
				NV_MAT_V(feature, feature_m, slot + 7) = -first;
			}
		}
	}

	/* normalization */
	if (normalize_type == NV_NORMALIZE_MAX) {
		/* Maximum = 1.0 */
		float hi = 0.0f;
		float lo = FLT_MAX;

		for (k = 0; k < feature->n; ++k) {
			const float val = NV_MAT_V(feature, feature_m, k);

			if (val > hi) {
				hi = val;
			}
			if (val != 0.0f && val < lo) {
				lo = val;
			}
		}
		if (hi != 0.0f && hi > lo) {
			const float inv_range = 1.0f / (hi - lo);

			for (k = 0; k < feature->n; ++k) {
				const float val = NV_MAT_V(feature, feature_m, k);

				if (val != 0.0f) {
					NV_MAT_V(feature, feature_m, k) = (val - lo) * inv_range;
				}
			}
		}
	} else if (normalize_type == NV_NORMALIZE_NORM) {
		/* Vector norm = 1.0 */
		float sq_len = 0.0f;

		for (k = 0; k < feature->n; ++k) {
			const float val = NV_MAT_V(feature, feature_m, k);

			sq_len += val * val;
		}
		if (sq_len != 0.0f) {
			const float inv_len = 1.0f / sqrtf(sq_len);

			for (k = 0; k < feature->n; ++k) {
				NV_MAT_V(feature, feature_m, k) *= inv_len;
			}
		}
	}
	/* NV_NORMALIZE_NONE and any other value: leave the row untouched. */
}
void nv_lr_train(nv_lr_t *lr, const nv_matrix_t *data, const nv_matrix_t *label, nv_lr_param_t param) { int m, n, i, j, k, l; long tm, tm_all = nv_clock(); float oe = FLT_MAX, er = 1.0f, we; float sum_e = 0.0f; int epoch = 0; int pn = (data->m > 256) ? 128:1; int step = data->m / (pn); int threads = nv_omp_procs(); nv_matrix_t *y = nv_matrix_alloc(lr->k, threads); nv_matrix_t *t = nv_matrix_alloc(lr->k, threads); nv_matrix_t *dw = nv_matrix_list_alloc(lr->n, lr->k, threads); nv_matrix_t *count = nv_matrix_alloc(lr->k, 1); nv_matrix_t *label_weight = nv_matrix_alloc(lr->k, 1); float count_max_log; nv_matrix_zero(count); nv_matrix_fill(label_weight, 1.0f); if (param.auto_balance) { /* クラスごとに数が違う場合に更新重みをスケーリングする */ for (m = 0; m < data->m; ++m) { NV_MAT_V(count, 0, (int)NV_MAT_V(label, m, 0)) += 1.0f; } count_max_log = logf(3.0f + NV_MAT_V(count, 0, nv_vector_max_n(count, 0))); for (n = 0; n < count->n; ++n) { if (NV_MAT_V(count, 0, n) > 0.0f) { float count_log = logf(3.0f + NV_MAT_V(count, 0, n)); NV_MAT_V(label_weight, 0, n) = powf(count_max_log, NV_LR_CLASS_COUNT_PENALTY_EXP) / powf(count_log, NV_LR_CLASS_COUNT_PENALTY_EXP); } else { NV_MAT_V(label_weight, 0, n) = 1.0f; } } } do { we = 1.0f / er; tm = nv_clock(); sum_e = 0.0f; for (m = 0; m < step; ++m) { nv_matrix_zero(dw); #ifdef _OPENMP #pragma omp parallel for schedule(dynamic, 4) reduction(+:sum_e) num_threads(threads) #endif for (i = 0; i < pn; ++i) { int rand_m = NV_ROUND_INT((data->m - 1) * nv_rand()); int thread_num = nv_omp_thread_id(); int label_i = (int)NV_MAT_V(label, rand_m, 0); float weight = NV_MAT_V(label_weight, 0, label_i); float yp; nv_vector_zero(t, thread_num); NV_MAT_V(t, thread_num, label_i) = 1.0f; nv_lr_predict_vector(lr, y, thread_num, data, rand_m); yp = NV_MAT_V(y, thread_num, (int)NV_MAT_V(label, rand_m, 0)); if (yp < 1.0 - NV_LR_MARGIN) { nv_lr_dw(lr, weight, dw, thread_num, data, rand_m, t, thread_num, y, thread_num); sum_e += nv_lr_error(t, thread_num, y, thread_num); } } for (l = 
1; l < threads; ++l) { for (j = 0; j < dw->m; ++j) { for (i = 0; i < dw->n; ++i) { NV_MAT_LIST_V(dw, 0, j, i) += NV_MAT_LIST_V(dw, l, j, i); } } } #ifdef _OPENMP #pragma omp parallel for private(n) num_threads(threads) if (lr->k > 32) #endif for (k = 0; k < lr->k; ++k) { switch (param.reg_type) { case NV_LR_REG_NONE: for (n = 0; n < lr->n; ++n) { NV_MAT_V(lr->w, k, n) -= we * param.grad_w * NV_MAT_LIST_V(dw, 0, k, n); } break; case NV_LR_REG_L1: // FOBOS L1 for (n = 0; n < lr->n; ++n) { NV_MAT_V(lr->w, k, n) -= we * param.grad_w * NV_MAT_LIST_V(dw, 0, k, n); } for (n = 0; n < lr->n; ++n) { float w_i = NV_MAT_V(lr->w, k, n); float lambda = we * param.reg_w * (1.0f / (1.0f + epoch)); NV_MAT_V(lr->w, k, n) = nv_sign(w_i) * NV_MAX(0.0f, (fabsf(w_i) - lambda)); } break; case NV_LR_REG_L2: for (n = 0; n < lr->n; ++n) { NV_MAT_V(lr->w, k, n) -= we * (param.grad_w * (NV_MAT_LIST_V(dw, 0, k, n) + param.reg_w * NV_MAT_V(lr->w, k, n))); } break; } } } if (nv_lr_progress_flag) { printf("nv_lr:%d: E: %E, %ldms\n", epoch, sum_e / (pn * step), nv_clock() - tm); } if (nv_lr_progress_flag > 1) { int *ok = nv_alloc_type(int, lr->k); int *ng = nv_alloc_type(int, lr->k); memset(ok, 0, sizeof(int) * lr->k); memset(ng, 0, sizeof(int) * lr->k); for (i = 0; i < data->m; ++i) { int predict = nv_lr_predict_label(lr, data, i); int teach = (int)NV_MAT_V(label, i, 0); if (predict == teach) { ++ok[teach]; } else { ++ng[teach]; } } for (i = 0; i < lr->k; ++i) { printf("%d: ok: %d, ng: %d, %f\n", i, ok[i], ng[i], (float)ok[i] / (float)(ok[i] + ng[i])); } nv_free(ok); nv_free(ng); } if (nv_lr_progress_flag) { fflush(stdout); } if (sum_e > oe) { er += 1.0f; } if (er >= 20.0f) { break; } if (sum_e < FLT_EPSILON) { break; } oe = sum_e; } while (param.max_epoch > ++epoch);