/* Returns a shallow view of one slice of a list matrix.
 * The view borrows parent's memory (alias = 1); the caller owns
 * only the wrapper struct. */
nv_matrix_t *nv_matrix_list_get(const nv_matrix_t *parent, int list)
{
	nv_matrix_t *matrix = (nv_matrix_t *)malloc(sizeof(nv_matrix_t));

	matrix->list = 1;
	matrix->n = parent->n;
	matrix->m = parent->m;
	matrix->rows = parent->rows;
	matrix->cols = parent->cols;
	matrix->v = &NV_MAT_LIST_V(parent, list, 0, 0);
	matrix->step = parent->step;
	matrix->list_step = parent->list_step;
	matrix->alias = 1;

	return matrix;
}
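/*
 * Usage sketch (illustrative, not part of the library). It shows that the
 * returned view writes through to the parent's storage and that only the
 * wrapper may be freed; it assumes nv_matrix_free() honors the alias flag
 * and does not free borrowed data:
 *
 *   nv_matrix_t *lists = nv_matrix_list_alloc(16, 4, 8); // n=16, m=4, 8 slices
 *   nv_matrix_t *view = nv_matrix_list_get(lists, 2);    // view of slice #2
 *   NV_MAT_V(view, 0, 0) = 1.0f;                         // writes into lists
 *   nv_matrix_free(&view);                               // frees wrapper only
 *   nv_matrix_free(&lists);
 */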
/* Accumulates the stochastic gradient of the multinomial logistic loss
 * into slice `el` of dw: for each class j, dE/dw_j = (y_j - t_j) * x,
 * scaled by the per-class weight w. */
static void nv_lr_dw(const nv_lr_t *lr,
		     float w,
		     nv_matrix_t *dw, int el,
		     const nv_matrix_t *data, int dj,
		     const nv_matrix_t *t, int tj,
		     const nv_matrix_t *y, int yj)
{
	int j;

	for (j = 0; j < lr->k; ++j) {
		int i;
		float y_t = NV_MAT_V(y, yj, j) - NV_MAT_V(t, tj, j);

		for (i = 0; i < lr->n; ++i) {
			NV_MAT_LIST_V(dw, el, j, i) += w * y_t * NV_MAT_V(data, dj, i);
		}
	}
}
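/*
 * Reference for the update above (a derivation sketch, not library text):
 * with softmax outputs y_j = exp(w_j . x) / sum_l exp(w_l . x) and a
 * one-hot target vector t, the cross-entropy loss E = -sum_j t_j log(y_j)
 * has the gradient
 *
 *     dE/dw_{j,i} = (y_j - t_j) * x_i
 *
 * so nv_lr_dw() adds exactly this outer product per class, weighted by
 * the class-balance factor w that nv_lr_train() passes in.
 */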
int nv_klr_em(nv_lr_t *lr,          /* k */
	      nv_matrix_t *count,   /* k */
	      nv_matrix_t *labels,  /* data->m */
	      const nv_matrix_t *data,
	      const nv_lr_param_t param,
	      const int max_epoch)
{
	int j, l;
	int processing = 1, last_processing = 0;
	int converge, epoch;
	long t;
	int relabel_count;
	int empty_class;
	float relabel_per;
	int num_threads = nv_omp_procs();
	nv_matrix_t *old_labels = nv_matrix_alloc(1, data->m);
	nv_matrix_t *count_tmp = nv_matrix_list_alloc(1, lr->k, num_threads);

	NV_ASSERT(labels->m >= data->m);
	NV_ASSERT(count->m >= lr->k);

	nv_matrix_copy(old_labels, 0, labels, 0, old_labels->m);
	epoch = 0;
	do {
		if (last_processing) {
			processing = 0;
		}
		t = nv_clock();
		nv_matrix_zero(count);
		nv_matrix_zero(count_tmp);
#ifdef _OPENMP
#pragma omp parallel for num_threads(num_threads)
#endif
		for (j = 0; j < data->m; ++j) {
			int label = nv_lr_predict_label(lr, data, j);
			int thread_idx = nv_omp_thread_id();

			NV_ASSERT(label < lr->k);
			NV_MAT_V(labels, j, 0) = (float)label;
			NV_MAT_LIST_V(count_tmp, thread_idx, label, 0) += 1.0f;
		}
		/* merge the per-thread class counts */
		for (l = 0; l < num_threads; ++l) {
			for (j = 0; j < count->m; ++j) {
				NV_MAT_V(count, j, 0) += NV_MAT_LIST_V(count_tmp, l, j, 0);
			}
		}
		++epoch;

		/* convergence check */
		relabel_count = 0;
		for (j = 0; j < data->m; ++j) {
			if (NV_MAT_V(labels, j, 0) != NV_MAT_V(old_labels, j, 0)) {
				++relabel_count;
			}
		}
		empty_class = 0;
		for (j = 0; j < lr->k; ++j) {
			empty_class += (NV_MAT_V(count, j, 0) > 0.0f ? 0 : 1);
		}
		relabel_per = (float)relabel_count / data->m;
		if (epoch > 1) {
			converge = (relabel_per < 0.001f) ? 1 : 0;
		} else {
			converge = 0;
		}
		if (nv_klr_progress_flag) {
			printf("nv_klr: %d: relabel: %f, empty_class: %d, %ldms\n",
			       epoch, relabel_per, empty_class, nv_clock() - t);
			fflush(stdout);
		}
		t = nv_clock();
		if (converge) {
			/* converged: stop */
			if (nv_klr_progress_flag) {
				printf("nv_klr: %d: finish:\n", epoch);
				fflush(stdout);
			}
			processing = 0;
		} else {
			/* update labels */
			nv_matrix_copy(old_labels, 0, labels, 0, old_labels->m);
			/* retrain the logistic regression on the new labels */
			nv_lr_train(lr, data, labels, param);
			/* stop when the maximum number of epochs is reached */
			if (max_epoch != 0 && epoch >= max_epoch) {
				processing = 0;
			}
			if (nv_klr_progress_flag) {
				printf("nv_klr: %d: train: %ldms\n",
				       epoch, nv_clock() - t);
				fflush(stdout);
			}
		}
	} while (processing);

	nv_matrix_free(&old_labels);
	nv_matrix_free(&count_tmp);

	return converge;
}
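/*
 * Usage sketch for the EM-style loop above (illustrative; nv_lr_alloc()
 * is an assumed constructor not shown in this file, and the parameter
 * values are arbitrary examples, not recommendations):
 *
 *   nv_lr_t *lr = nv_lr_alloc(data->n, k);        // hypothetical
 *   nv_matrix_t *count = nv_matrix_alloc(1, k);
 *   nv_matrix_t *labels = nv_matrix_alloc(1, data->m);
 *   nv_lr_param_t param;
 *   param.grad_w = 0.1f;
 *   param.reg_type = NV_LR_REG_L2;
 *   param.reg_w = 0.0001f;
 *   param.max_epoch = 50;
 *   param.auto_balance = 0;
 *   // lr must hold some initial weights before the first E-step,
 *   // e.g. from a short nv_lr_train() run on random labels.
 *   int converged = nv_klr_em(lr, count, labels, data, param, 50);
 */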
void nv_lr_train(nv_lr_t *lr,
		 const nv_matrix_t *data, const nv_matrix_t *label,
		 nv_lr_param_t param)
{
	int m, n, i, j, k, l;
	long tm, tm_all = nv_clock();
	float oe = FLT_MAX, er = 1.0f, we;
	float sum_e = 0.0f;
	int epoch = 0;
	int pn = (data->m > 256) ? 128 : 1;
	int step = data->m / pn;
	int threads = nv_omp_procs();
	nv_matrix_t *y = nv_matrix_alloc(lr->k, threads);
	nv_matrix_t *t = nv_matrix_alloc(lr->k, threads);
	nv_matrix_t *dw = nv_matrix_list_alloc(lr->n, lr->k, threads);
	nv_matrix_t *count = nv_matrix_alloc(lr->k, 1);
	nv_matrix_t *label_weight = nv_matrix_alloc(lr->k, 1);
	float count_max_log;

	nv_matrix_zero(count);
	nv_matrix_fill(label_weight, 1.0f);
	if (param.auto_balance) {
		/* scale the update weight per class when class sizes are unbalanced */
		for (m = 0; m < data->m; ++m) {
			NV_MAT_V(count, 0, (int)NV_MAT_V(label, m, 0)) += 1.0f;
		}
		count_max_log = logf(3.0f + NV_MAT_V(count, 0, nv_vector_max_n(count, 0)));
		for (n = 0; n < count->n; ++n) {
			if (NV_MAT_V(count, 0, n) > 0.0f) {
				float count_log = logf(3.0f + NV_MAT_V(count, 0, n));
				NV_MAT_V(label_weight, 0, n) =
					powf(count_max_log, NV_LR_CLASS_COUNT_PENALTY_EXP)
					/ powf(count_log, NV_LR_CLASS_COUNT_PENALTY_EXP);
			} else {
				NV_MAT_V(label_weight, 0, n) = 1.0f;
			}
		}
	}
	do {
		we = 1.0f / er;
		tm = nv_clock();
		sum_e = 0.0f;
		for (m = 0; m < step; ++m) {
			nv_matrix_zero(dw);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 4) reduction(+:sum_e) num_threads(threads)
#endif
			for (i = 0; i < pn; ++i) {
				int rand_m = NV_ROUND_INT((data->m - 1) * nv_rand());
				int thread_num = nv_omp_thread_id();
				int label_i = (int)NV_MAT_V(label, rand_m, 0);
				float weight = NV_MAT_V(label_weight, 0, label_i);
				float yp;

				nv_vector_zero(t, thread_num);
				NV_MAT_V(t, thread_num, label_i) = 1.0f;
				nv_lr_predict_vector(lr, y, thread_num, data, rand_m);
				yp = NV_MAT_V(y, thread_num, label_i);
				if (yp < 1.0f - NV_LR_MARGIN) {
					nv_lr_dw(lr, weight, dw, thread_num,
						 data, rand_m, t, thread_num, y, thread_num);
					sum_e += nv_lr_error(t, thread_num, y, thread_num);
				}
			}
			/* merge the per-thread gradients into slice 0 */
			for (l = 1; l < threads; ++l) {
				for (j = 0; j < dw->m; ++j) {
					for (i = 0; i < dw->n; ++i) {
						NV_MAT_LIST_V(dw, 0, j, i) += NV_MAT_LIST_V(dw, l, j, i);
					}
				}
			}
#ifdef _OPENMP
#pragma omp parallel for private(n) num_threads(threads) if (lr->k > 32)
#endif
			for (k = 0; k < lr->k; ++k) {
				switch (param.reg_type) {
				case NV_LR_REG_NONE:
					for (n = 0; n < lr->n; ++n) {
						NV_MAT_V(lr->w, k, n) -= we * param.grad_w * NV_MAT_LIST_V(dw, 0, k, n);
					}
					break;
				case NV_LR_REG_L1:
					/* FOBOS L1: gradient step, then soft-thresholding */
					for (n = 0; n < lr->n; ++n) {
						NV_MAT_V(lr->w, k, n) -= we * param.grad_w * NV_MAT_LIST_V(dw, 0, k, n);
					}
					for (n = 0; n < lr->n; ++n) {
						float w_i = NV_MAT_V(lr->w, k, n);
						float lambda = we * param.reg_w * (1.0f / (1.0f + epoch));
						NV_MAT_V(lr->w, k, n) = nv_sign(w_i) * NV_MAX(0.0f, fabsf(w_i) - lambda);
					}
					break;
				case NV_LR_REG_L2:
					for (n = 0; n < lr->n; ++n) {
						NV_MAT_V(lr->w, k, n) -= we *
							(param.grad_w * (NV_MAT_LIST_V(dw, 0, k, n)
							 + param.reg_w * NV_MAT_V(lr->w, k, n)));
					}
					break;
				}
			}
		}
		if (nv_lr_progress_flag) {
			printf("nv_lr: %d: E: %E, %ldms\n",
			       epoch, sum_e / (pn * step), nv_clock() - tm);
		}
		if (nv_lr_progress_flag > 1) {
			/* per-class training accuracy */
			int *ok = nv_alloc_type(int, lr->k);
			int *ng = nv_alloc_type(int, lr->k);

			memset(ok, 0, sizeof(int) * lr->k);
			memset(ng, 0, sizeof(int) * lr->k);
			for (i = 0; i < data->m; ++i) {
				int predict = nv_lr_predict_label(lr, data, i);
				int teach = (int)NV_MAT_V(label, i, 0);
				if (predict == teach) {
					++ok[teach];
				} else {
					++ng[teach];
				}
			}
			for (i = 0; i < lr->k; ++i) {
				printf("%d: ok: %d, ng: %d, %f\n",
				       i, ok[i], ng[i],
				       (float)ok[i] / (float)(ok[i] + ng[i]));
			}
			nv_free(ok);
			nv_free(ng);
		}
		if (nv_lr_progress_flag) {
			fflush(stdout);
		}
		/* simple annealing: when the error rises, shrink the effective
		 * learning rate we = 1/er; give up once er reaches 20 */
		if (sum_e > oe) {
			er += 1.0f;
		}
		if (er >= 20.0f) {
			break;
		}
		if (sum_e < FLT_EPSILON) {
			break;
		}
		oe = sum_e;
	} while (param.max_epoch > ++epoch);

	if (nv_lr_progress_flag) {
		printf("nv_lr: %ldms\n", nv_clock() - tm_all);
	}
	/* release the work matrices allocated above */
	nv_matrix_free(&y);
	nv_matrix_free(&t);
	nv_matrix_free(&dw);
	nv_matrix_free(&count);
	nv_matrix_free(&label_weight);
}
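/*
 * Usage sketch for direct (supervised) training, without the k-LR EM loop
 * (illustrative; nv_lr_alloc() is an assumed constructor and the values
 * are arbitrary examples). Each epoch draws step * pn random samples, so
 * roughly one pass over the data in mini-batches of pn:
 *
 *   nv_lr_param_t param;
 *   param.grad_w = 0.1f;
 *   param.reg_type = NV_LR_REG_L1;   // sparse weights via FOBOS
 *   param.reg_w = 0.0001f;
 *   param.max_epoch = 50;
 *   param.auto_balance = 1;          // downweight over-represented classes
 *   nv_lr_t *lr = nv_lr_alloc(data->n, k);  // hypothetical
 *   nv_lr_train(lr, data, labels, param);
 */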