コード例 #1
0
/*
 * Mahalanobis distance of row `xm` of `x` with respect to `cov`.
 *
 * The row is centred by subtracting row 0 of cov->u, multiplied by
 * cov->eigen_vec (passed to nv_gemv with NV_MAT_TR), and each resulting
 * component is squared and divided by the matching entry of
 * cov->eigen_val. The square root of that sum is returned.
 *
 * No guard is applied for zero entries in cov->eigen_val.
 */
float nv_mahalanobis(const nv_cov_t *cov, const nv_matrix_t *x, int xm)
{
	int n;
	nv_matrix_t *y = nv_matrix_alloc(x->n, 1);
	nv_matrix_t *x2 = nv_matrix_alloc(x->n, 1);
	float distance;
	float delta2 = 0.0f;

	nv_matrix_zero(y);
	nv_matrix_zero(x2);
	for (n = 0; n < x2->n; ++n) {
		NV_MAT_V(x2, 0, n) = NV_MAT_V(x, xm, n) - NV_MAT_V(cov->u, 0, n);
	}
	/*
	 * x2 is a one-row temporary and the centred vector was written to its
	 * row 0 above, so row 0 is what must be multiplied. The previous code
	 * passed `xm` here, which addressed a row x2 does not have whenever
	 * xm != 0.
	 */
	nv_gemv(y, 0, NV_MAT_TR, cov->eigen_vec, x2, 0);
	for (n = 0; n < x->n; ++n) {
		float ev = NV_MAT_V(cov->eigen_val, n, 0);
		float xv = NV_MAT_V(y, 0, n);
		delta2 += (xv * xv) / ev;
	}

	distance = sqrtf(delta2);
	nv_matrix_free(&x2);
	nv_matrix_free(&y);

	return distance;
}
コード例 #2
0
/*
 * Walk one down-right diagonal for nv_integral_tilted().
 *
 * Starting at integral cell (start_row, start_col), steps row and column
 * together until either leaves the (img->rows + 1) x (img->cols + 1) grid.
 * `prev_tilted` carries, per column, the running diagonal sum written by
 * the previously processed diagonal and is overwritten with this one's.
 */
static void
nv_integral_tilted_diagonal(nv_matrix_t *integral,
							const nv_matrix_t *img, int channel,
							nv_matrix_t *prev_tilted,
							int start_row, int start_col)
{
	const int end_row = img->rows + 1;
	const int end_col = img->cols + 1;
	float running = 0.0f;
	int r = start_row;
	int c = start_col;

	while (r < end_row && c < end_col) {
		const float pixel = NV_MAT3D_V(img, r - 1, c - 1, channel);

		if (c + 1 == end_col) {
			/* last column: there is no upper-right neighbour to use */
			NV_MAT3D_V(integral, r, c, 0) =
				NV_MAT3D_V(integral, r - 1, c, 0)
				+ running + pixel;
		} else {
			NV_MAT3D_V(integral, r, c, 0) =
				NV_MAT3D_V(integral, r - 1, c + 1, 0)
				+ NV_MAT_V(prev_tilted, 0, c)
				+ running + pixel;
		}
		running += pixel;
		NV_MAT_V(prev_tilted, 0, c) = running;

		++r;
		++c;
	}
}

/*
 * Integral image rotated by 45 degrees.
 *
 * `integral` must be one row and one column larger than `img`; it is
 * zeroed and then filled in channel 0 from channel `channel` of `img`.
 * Diagonals are processed first from the top edge (right-most start
 * column to the left-most), then from the left edge (start rows 2 and up).
 */
void 
nv_integral_tilted(nv_matrix_t *integral,
				   const nv_matrix_t *img, int channel)
{
	int start;
	const int end_row = img->rows + 1;
	nv_matrix_t *prev_tilted = nv_matrix_alloc(img->cols + 1, 1);

	NV_ASSERT(
		integral->rows - 1 == img->rows 
		&& integral->cols - 1 == img->cols
	);

	nv_matrix_zero(prev_tilted);
	nv_matrix_zero(integral);

	/* diagonals that begin on the first image row */
	for (start = img->cols; start > 0; --start) {
		nv_integral_tilted_diagonal(integral, img, channel,
									prev_tilted, 1, start);
	}
	/* diagonals that begin on the first image column, below the first row */
	for (start = 2; start < end_row; ++start) {
		nv_integral_tilted_diagonal(integral, img, channel,
									prev_tilted, start, 1);
	}

	nv_matrix_free(&prev_tilted);
}
コード例 #3
0
ファイル: nv_vlad.hpp プロジェクト: kyama/otama
	/*
	 * Compute the feature for one image from densely sampled keypoints and
	 * store it in row `j` of `vlad`.
	 *
	 * The image is resized (longest side scaled to IMG_SIZE() when
	 * m_fit_area is 0, otherwise to roughly m_fit_area pixels keeping the
	 * aspect ratio), converted to gray, smoothed with nv_gaussian5x5, and
	 * handed to nv_keypoint_dense_ex. The resulting keypoints/descriptors
	 * are aggregated by feature_vector().
	 */
	void
	extract_dense(nv_matrix_t *vlad, int j,
				  const nv_matrix_t *image,
				  nv_keypoint_dense_t *dense,
				  int ndense
		)
	{
		NV_ASSERT(vlad->n == DIM);

		// Working resolution.
		int work_rows, work_cols;
		if (m_fit_area == 0) {
			float scale = IMG_SIZE() / (float)NV_MAX(image->rows, image->cols);
			work_rows = (int)(image->rows * scale);
			work_cols = (int)(image->cols * scale);
		} else {
			float axis_ratio = (float)image->rows / image->cols;
			work_cols = (int)sqrtf(m_fit_area / axis_ratio);
			work_rows = (int)((float)m_fit_area / work_cols);
		}

		nv_matrix_t *resize = nv_matrix3d_alloc(3, work_rows, work_cols);
		nv_matrix_t *gray = nv_matrix3d_alloc(1, resize->rows, resize->cols);
		nv_matrix_t *smooth = nv_matrix3d_alloc(1, resize->rows, resize->cols);

		// Buffer capacity: twice the total number of grid cells.
		int capacity = 0;
		for (int idx = 0; idx < ndense; ++idx) {
			capacity += dense[idx].rows * dense[idx].cols;
		}
		capacity *= 2;

		nv_matrix_t *key_vec = nv_matrix_alloc(NV_KEYPOINT_KEYPOINT_N, capacity);
		nv_matrix_t *desc_vec = nv_matrix_alloc(NV_KEYPOINT_DESC_N, capacity);

		// Preprocess: resize -> gray -> gaussian smoothing.
		nv_resize(resize, image);
		nv_gray(gray, resize);
		nv_gaussian5x5(smooth, 0, gray, 0);

		nv_matrix_zero(desc_vec);
		nv_matrix_zero(key_vec);

		const int desc_m = nv_keypoint_dense_ex(m_ctx, key_vec, desc_vec, smooth, 0,
												dense, ndense);
		feature_vector(vlad, j, key_vec, desc_vec, desc_m);

		nv_matrix_free(&gray);
		nv_matrix_free(&resize);
		nv_matrix_free(&smooth);
		nv_matrix_free(&key_vec);
		nv_matrix_free(&desc_vec);
	}
コード例 #4
0
ファイル: nv_ip_bgseg.c プロジェクト: andrew950468/otama_api
/*
 * Allocate and initialise a background-segmentation context.
 *
 * The working resolution (bg->rows x bg->cols) is the frame size scaled
 * so that its longer side becomes `size`, rounded with NV_ROUND_INT.
 * bg->av is allocated with rows * cols elements and zeroed; bg->sgm is a
 * duplicate of it. All init_* state flags start at 0.
 */
nv_bgseg_t *
nv_bgseg_alloc(int frame_rows, int frame_cols,
			   float zeta, float bg_v, float fg_v,
			   int size
	)
{
	const float scale = (float)size / (float)NV_MAX(frame_rows, frame_cols);
	nv_bgseg_t *bg = nv_alloc_type(nv_bgseg_t, 1);

	/* caller-supplied parameters */
	bg->zeta = zeta;
	bg->bg_v = bg_v;
	bg->fg_v = fg_v;
	bg->size = size;

	/* geometry */
	bg->frame_rows = frame_rows;
	bg->frame_cols = frame_cols;
	bg->rows = NV_ROUND_INT(frame_rows * scale);
	bg->cols = NV_ROUND_INT(frame_cols * scale);

	/* initialisation state */
	bg->init_1st = 0;
	bg->init_1st_finished = 0;
	bg->init_2nd = 0;
	bg->init_2nd_finished = 0;

	/* per-pixel model buffers */
	bg->av = nv_matrix_alloc(bg->rows * bg->cols, 1);
	nv_matrix_zero(bg->av);
	bg->sgm = nv_matrix_dup(bg->av);

	return bg;
}
コード例 #5
0
/*
 * Run nv_kmeans on `data` with CENTROIDS clusters and 50 as the final
 * argument, writing the result into `centroids`.
 *
 * The per-row labels and per-cluster counts produced by nv_kmeans are
 * scratch here and are discarded. Progress output is switched on via
 * nv_kmeans_progress(1).
 */
void
kmeans(nv_matrix_t *centroids,
	   const nv_matrix_t *data)
{
	nv_matrix_t *labels_scratch = nv_matrix_alloc(1, data->m);
	nv_matrix_t *count_scratch = nv_matrix_alloc(1, CENTROIDS);

	nv_matrix_zero(centroids);
	nv_matrix_zero(labels_scratch);
	nv_matrix_zero(count_scratch);

	nv_kmeans_progress(1);
	nv_kmeans(centroids, count_scratch, labels_scratch, data, CENTROIDS, 50);

	nv_matrix_free(&count_scratch);
	nv_matrix_free(&labels_scratch);
}
コード例 #6
0
ファイル: nv_ml_lr.c プロジェクト: nagadomi/nv-debian
/*
 * Allocate an nv_lr_t with fields n and k set from the arguments and a
 * zero-filled weight matrix created by nv_matrix_alloc(n, k).
 * The returned object is obtained from nv_malloc.
 */
nv_lr_t *
nv_lr_alloc(int n, int k)
{
	nv_lr_t *model = (nv_lr_t *)nv_malloc(sizeof(*model));

	model->n = n;
	model->k = k;

	model->w = nv_matrix_alloc(model->n, k);
	nv_matrix_zero(model->w);

	return model;
}
コード例 #7
0
ファイル: nv_ml_mlp.c プロジェクト: andrew950468/otama_api
/*
 * Initialize the MLP weights with random numbers.
 *
 * input_w is drawn from +/- 0.5 / (mean row norm of `data` *
 * sqrt(0.8 * (input_w->m + 1))), hidden_w from
 * +/- 0.5 / sqrt(0.8 * (hidden_w->m + 1)). Both bias matrices are zeroed.
 */
void
nv_mlp_init_rand(nv_mlp_t *mlp, const nv_matrix_t *data)
{
	const float per_row = 1.0f / data->m;
	const float input_norm = sqrtf(0.8f * (mlp->input_w->m + 1));
	const float hidden_norm = sqrtf(0.8f * (mlp->hidden_w->m + 1));
	float mean_norm = 0.0f;
	float in_scale;
	float hid_scale;
	int row = 0;

	/* biases do not depend on the data */
	nv_matrix_zero(mlp->input_bias);
	nv_matrix_zero(mlp->hidden_bias);

	/* average norm of the data rows */
	while (row < data->m) {
		mean_norm += nv_vector_norm(data, row) * per_row;
		++row;
	}

	in_scale = 1.0f / (mean_norm * input_norm);
	hid_scale = 1.0f / hidden_norm;

	nv_matrix_rand(mlp->input_w, -0.5f * in_scale, 0.5f * in_scale);
	nv_matrix_rand(mlp->hidden_w, -0.5f * hid_scale, 0.5f * hid_scale);
}
コード例 #8
0
ファイル: nv_ml_klr.c プロジェクト: andrew950468/otama_api
/*
 * First step of KLR: label `data` with nv_kmeans (lr->k clusters, 50 as
 * the final argument), then reset `lr` with nv_lr_init and train it on
 * those labels with nv_lr_train.
 *
 * `count` (k rows) and `labels` (at least data->m rows) are zeroed and
 * then filled by nv_kmeans. The k-means centroids are scratch and are
 * discarded. Timing lines are printed when nv_klr_progress_flag is set.
 */
void 
nv_klr_init(nv_lr_t *lr,         // k
			nv_matrix_t *count,  // k
			nv_matrix_t *labels, // data->m
			const nv_matrix_t *data,
			const nv_lr_param_t param)
{
	nv_matrix_t *centroids = nv_matrix_alloc(lr->n, lr->k);
	long started;

	NV_ASSERT(labels->m >= data->m);

	nv_matrix_zero(centroids);
	nv_matrix_zero(labels);
	nv_matrix_zero(count);

	if (nv_klr_progress_flag) {
		printf("nv_klr: 0: init++\n");
	}

	/* initial labelling */
	started = nv_clock();
	nv_kmeans(centroids, count, labels, data, lr->k, 50);
	nv_matrix_free(&centroids);

	if (nv_klr_progress_flag) {
		printf("nv_klr: 0: init end: %ldms\n", nv_clock() - started);
	}

	/* first supervised pass on the k-means labels */
	nv_lr_init(lr, data);
	nv_lr_train(lr, data, labels, param);

	if (nv_klr_progress_flag) {
		printf("nv_klr: 0: first step: %ldms\n", nv_clock() - started);
		fflush(stdout);
	}
}
コード例 #9
0
ファイル: nv_vlad.hpp プロジェクト: kyama/otama
	/*
	 * Aggregate the first `desc_m` descriptors into row `vec_j` of `vec`.
	 *
	 * Each descriptor row is normalized in place, assigned by nv_nn to a
	 * row of POSI() (keypoint response > 0) or NEGA() (otherwise), and its
	 * residual against that row is accumulated into the matching
	 * NV_KEYPOINT_DESC_N-wide slice; NEGA() slices start at slice index KP.
	 * Accumulation is done per thread and summed afterwards; the final
	 * row is normalized.
	 */
	void
	feature_vector(nv_matrix_t *vec,
				   int vec_j,
				   nv_matrix_t *key_vec,
				   nv_matrix_t *desc_vec,
				   int desc_m
		)
	{
		int i;
		int procs = nv_omp_procs();
		nv_matrix_t *partial = nv_matrix_alloc(vec->n, procs);
		const nv_matrix_t *posi = POSI();
		const nv_matrix_t *nega = NEGA();

		nv_matrix_zero(partial);

#ifdef _OPENMP
#pragma omp parallel for num_threads(procs)
#endif
		for (i = 0; i < desc_m; ++i) {
			const int slot = nv_omp_thread_id();
			const nv_matrix_t *codebook;
			int label;
			int offset;
			int j;

			nv_vector_normalize(desc_vec, i);

			// Pick the codebook by the sign of the keypoint response.
			if (NV_MAT_V(key_vec, i, NV_KEYPOINT_RESPONSE_IDX) > 0.0f) {
				codebook = posi;
				label = nv_nn(posi, desc_vec, i);
				offset = label * NV_KEYPOINT_DESC_N;
			} else {
				codebook = nega;
				label = nv_nn(nega, desc_vec, i);
				offset = (KP + label) * NV_KEYPOINT_DESC_N;
			}

			// Accumulate the residual into this thread's row.
			for (j = 0; j < codebook->n; ++j) {
				NV_MAT_V(partial, slot, offset + j) +=
					NV_MAT_V(desc_vec, i, j) - NV_MAT_V(codebook, label, j);
			}
		}

		// Reduce the per-thread rows into the output row.
		nv_vector_zero(vec, vec_j);
		for (i = 0; i < procs; ++i) {
			nv_vector_add(vec, vec_j, vec, vec_j, partial, i);
		}
		nv_vector_normalize(vec, vec_j);

		nv_matrix_free(&partial);
	}
コード例 #10
0
ファイル: nv_ml_lbgu.c プロジェクト: andrew950468/otama_api
/*
 * Accumulate a per-cluster score into `u` (one row per cluster, column 0).
 *
 * `u` is zeroed. Then, for every row m of `data` with label i, the
 * smallest nv_euclidean2 distance from that row to any centroid other
 * than i, minus its nv_euclidean2 distance to centroid i, is added to
 * u[i]. If `means` has no row other than i the first term stays FLT_MAX.
 *
 * `count` is not read: an earlier revision built a 1/count table from it
 * that was never consumed, so that allocation has been dropped. The
 * parameter is kept so callers need no change.
 */
static void 
nv_lbgu_u(nv_matrix_t *u,
		  const nv_matrix_t *means,
		  const nv_matrix_t *data,
		  const nv_matrix_t *labels,
		  const nv_matrix_t *count)
{
	int m;

	(void)count;

	nv_matrix_zero(u);

#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 1)
#endif
	for (m = 0; m < data->m; ++m) {
		int k;
		float diff, min_error = FLT_MAX;
		int i = NV_MAT_VI(labels, m, 0);

		/* nearest centroid other than the assigned one */
		for (k = 0; k < means->m; ++k) {
			float dist;
			if (k == i) {
				continue;
			}
			dist = nv_euclidean2(means, k, data, m);
			if (min_error > dist) {
				min_error = dist;
			}
		}
		diff = min_error - nv_euclidean2(means, i, data, m);

		/* several rows can share label i, so serialise the update */
#ifdef _OPENMP
#pragma omp critical (nv_lbgu_u)
#endif
		{
			NV_MAT_V(u, i, 0) += diff;
		}
	}
}
コード例 #11
0
/*
 * Collect image patches from every file in `list` and copy a random
 * subset into `samples`.
 *
 * Each image is loaded with nv_load_image, its patches are extracted with
 * nv_patch_extract, and the patch rows are appended to a shared buffer
 * sized (IMG_SIZE - PATCH_SIZE)^2 rows per file. The buffer is then
 * shuffled, truncated to min(samples->m, rows collected) and copied into
 * `samples`. The process exits with -1 if an image cannot be loaded.
 */
void
patch_sampling(nv_matrix_t *samples, std::vector<fileinfo_t> &list)
{
	nv_matrix_t *data = nv_matrix_alloc(PATCH_SIZE * PATCH_SIZE * 3,
										(int)((IMG_SIZE-PATCH_SIZE) * (IMG_SIZE-PATCH_SIZE) * list.size()));
	int data_index = 0;
	int i;
	
	nv_matrix_zero(data);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 1)
#endif
	for (i = 0; i < (int)list.size(); ++i) {
		nv_matrix_t *src;
		nv_matrix_t *patches;
		src = nv_load_image(list[i].file.c_str());
		if (!src) {
			fprintf(stderr, "open filed: %s\n", list[i].file.c_str());
			exit(-1);
		}
		
		patches = nv_patch_matrix_alloc(src, PATCH_SIZE);
		nv_patch_extract(patches, src, PATCH_SIZE);
		
		/* data_index and data are shared between threads */
#ifdef _OPENMP
#pragma omp critical (patch_sampling)
#endif
		{
			int j;
			/*
			 * Nothing here ties patches->m to the per-file budget used to
			 * size `data`, so stop appending once the buffer is full
			 * instead of writing past its last row.
			 */
			for (j = 0; j < patches->m && data_index < data->m; ++j) {
				nv_vector_copy(data, data_index, patches, j);
				data_index += 1;
			}
		}
		
		nv_matrix_free(&src);
		nv_matrix_free(&patches);
	}
	nv_vector_shuffle(data);
	nv_matrix_m(data, NV_MIN(samples->m, data_index));
	nv_matrix_copy_all(samples, data);
	nv_matrix_free(&data);
}
コード例 #12
0
ファイル: nv_ip_gray.c プロジェクト: andrew950468/otama_api
/*
 * Map a pixel value to a histogram bin in [0, 255].
 * Values below 0 or above 255 are clamped so they cannot index outside
 * the 256-entry table.
 */
static int
nv_histgram_bin(float v)
{
	v = NV_MIN(NV_MAX(v, 0.0f), 255.0f);
	return (int)v;
}

/*
 * Histogram equalization of one channel.
 *
 * Builds the cumulative frequency of the (integer-truncated) values in
 * column `channel` of `img`, normalised by img->m, and writes
 * (cdf[v] - cdf_min) * 255 / (1 - cdf_min), clamped to [0, 255], to the
 * same column of `eq`. `eq` and `img` must have the same number of rows.
 * When `img` has no rows, `eq` is zeroed.
 */
void 
nv_histgram_equalization(nv_matrix_t *eq, const nv_matrix_t *img, int channel)
{
	float freq[256] = {0};
	float fm;
	int m, i;
	float min_freq = FLT_MAX;

	NV_ASSERT(eq->m == img->m);
	if (img->m == 0) {
		nv_matrix_zero(eq);
		return ;
	}

	// histogram
	fm = 1.0f / (float )img->m;
	for (m = 0; m < img->m; ++m) {
		int idx = nv_histgram_bin(NV_MAT_V(img, m, channel));
		freq[idx] += 1.0f;
	}
	// cumulative sum
	for (i = 1; i < 256; ++i) {
		freq[i] = freq[i] + freq[i - 1];
	}
	// normalise to [0, 1] and track the smallest cumulative value
	for (i = 0; i < 256; ++i) {
		freq[i] *= fm;
		
		if (freq[i] < min_freq) {
			min_freq = freq[i];
		}
	}
	// avoid a zero denominator below when every pixel falls in bin 0
	if (min_freq == 1.0) {
		min_freq = 0.999999f;
	}

	// equalization
	for (m = 0; m < img->m; ++m) {
		int idx = nv_histgram_bin(NV_MAT_V(img, m, channel));
		float v = (freq[idx] - min_freq) * 255.0f / (1.0f - min_freq);
		v = NV_MIN(NV_MAX(v, 0.0f), 255.0f);
		NV_MAT_V(eq, m, channel) = v;
	}
}
コード例 #13
0
ファイル: nv_ml_lbgu.c プロジェクト: andrew950468/otama_api
/*
 * Re-seed two centroids from the members of one cluster.
 *
 * The rows of `data` labelled `max_error_class` are gathered, clustered
 * into two groups with nv_kmeans, and the two resulting centroids replace
 * rows `max_error_class` and `min_error_class` of `means`.
 * (As the original note says, this differs from the implementation in
 * Bernd Fritzke's paper.)
 *
 * count[max_error_class] must equal the number of rows carrying that
 * label; this is asserted.
 */
static void 
nv_lbgu_update(nv_matrix_t *means,
			   const nv_matrix_t *data,
			   const nv_matrix_t *labels,
			   const nv_matrix_t *count,
			   int max_error_class,
			   int min_error_class,
			   int kmeans_max_epoch)
{
	const int member_count = NV_MAT_VI(count, max_error_class, 0);
	nv_matrix_t *members = nv_matrix_alloc(means->n, member_count);
	nv_matrix_t *split_means = nv_matrix_alloc(means->n, 2);
	nv_matrix_t *split_labels = nv_matrix_alloc(1, member_count);
	nv_matrix_t *split_count = nv_matrix_alloc(1, 2);
	int row;
	int filled = 0;

	nv_matrix_zero(members);

	/* gather the rows that belong to the cluster being split */
	for (row = 0; row < data->m; ++row) {
		if (NV_MAT_VI(labels, row, 0) != max_error_class) {
			continue;
		}
		nv_vector_copy(members, filled, data, row);
		++filled;
	}
	NV_ASSERT(filled == member_count);

	/* two-way split, then overwrite both centroid slots */
	nv_kmeans(split_means, split_count, split_labels, members, 2, kmeans_max_epoch);
	nv_vector_copy(means, max_error_class, split_means, 0);
	nv_vector_copy(means, min_error_class, split_means, 1);

	nv_matrix_free(&members);
	nv_matrix_free(&split_means);
	nv_matrix_free(&split_labels);
	nv_matrix_free(&split_count);
}
コード例 #14
0
/*
 * Integral image.
 *
 * `integral` must be one row and one column larger than `img`. It is
 * zeroed, then cell (y + 1, x + 1) of channel 0 receives the sum of
 * channel `channel` of `img` over rows 0..y and columns 0..x.
 */
void 
nv_integral(nv_matrix_t *integral,
			const nv_matrix_t *img, int channel)
{
	int y, x;

	NV_ASSERT(
		integral->rows - 1 == img->rows 
		&& integral->cols - 1 == img->cols
	);

	nv_matrix_zero(integral);

	for (y = 0; y < img->rows; ++y) {
		/* sum of the current image row up to, but excluding, column x */
		float row_prefix = 0.0f;

		for (x = 0; x < img->cols; ++x) {
			const float pixel = NV_MAT3D_V(img, y, x, channel);

			NV_MAT3D_V(integral, y + 1, x + 1, 0) =
				NV_MAT3D_V(integral, y, x + 1, 0) + row_prefix + pixel;
			row_prefix += pixel;
		}
	}
}
コード例 #15
0
ファイル: nv_ml_lbgu.c プロジェクト: andrew950468/otama_api
/*
 * Per-cluster error accumulation.
 *
 * `e` is zeroed; then for each row of `data` the nv_euclidean2 distance
 * to the centroid named by its label is added to e[label] (column 0).
 * Returns the sum of those distances divided by data->m.
 * `count` is not read by this function.
 */
static float
nv_lbgu_e(nv_matrix_t *e,
		  const nv_matrix_t *means,
		  const nv_matrix_t *data,
		  const nv_matrix_t *labels,
		  const nv_matrix_t *count)
{
	float total = 0.0f;
	int row = 0;

	nv_matrix_zero(e);

	while (row < data->m) {
		const int cluster = NV_MAT_VI(labels, row, 0);
		const float dist = nv_euclidean2(means, cluster, data, row);

		NV_MAT_V(e, cluster, 0) += dist;
		total += dist;
		++row;
	}

	return total / data->m;
}
コード例 #16
0
ファイル: nv_ml_lr.c プロジェクト: nagadomi/nv-debian
/*
 * Reset the model by zeroing its weight matrix lr->w.
 * `data` is accepted but not read.
 */
void 
nv_lr_init(nv_lr_t *lr, const nv_matrix_t *data)
{
	nv_matrix_zero(lr->w);
}
コード例 #17
0
ファイル: nv_ml_pa.c プロジェクト: nagadomi/nv-debian
/*
 * Reset the model by zeroing its weight matrix pa->w.
 */
void
nv_pa_init(nv_pa_t *pa)
{
	nv_matrix_zero(pa->w);
}
コード例 #18
0
ファイル: nv_ml_klr.c プロジェクト: andrew950468/otama_api
/*
 * Iteratively relabel `data` with the current model and retrain it.
 *
 * Each epoch:
 *   1. every row of `data` is labelled with nv_lr_predict_label and the
 *      per-class counts are rebuilt in `count`;
 *   2. the fraction of rows whose label changed since the previous epoch
 *      is computed;
 *   3. from the second epoch on, a fraction below 0.001 counts as
 *      converged and stops the loop; otherwise the labels are saved,
 *      the model is retrained with nv_lr_train, and the loop stops when
 *      max_epoch is non-zero and has been reached.
 *
 * `labels` must have at least data->m rows and `count` at least lr->k.
 * Returns the `converge` flag of the last epoch (1 or 0).
 */
int 
nv_klr_em(nv_lr_t *lr,         // k
		  nv_matrix_t *count,  // k
		  nv_matrix_t *labels, // data->m
		  const nv_matrix_t *data,
		  const nv_lr_param_t param,
		  const int max_epoch)
{
	int j, l;
	int processing = 1, last_processing = 0;
	int converge, epoch;
	long t;
	int relabel_count;
	int empty_class;
	float relabel_per;
	int num_threads = nv_omp_procs();
	nv_matrix_t *old_labels = nv_matrix_alloc(1, data->m);
	/* one count table per thread so the parallel loop needs no locking */
	nv_matrix_t *count_tmp = nv_matrix_list_alloc(1, lr->k, num_threads);

	NV_ASSERT(labels->m >= data->m);
	NV_ASSERT(count->m >= lr->k);

	nv_matrix_copy(old_labels, 0, labels, 0, old_labels->m);

	epoch = 0;
	do {
		/* last_processing is never set in this function, so this is inert */
		if (last_processing) {
			processing = 0;
		}
		t = nv_clock();
		nv_matrix_zero(count);
		nv_matrix_zero(count_tmp);
#ifdef _OPENMP
#pragma omp parallel for num_threads(num_threads)
#endif
		for (j = 0; j < data->m; ++j) {
			int label = nv_lr_predict_label(lr, data, j);
			int thread_idx = nv_omp_thread_id();

			NV_ASSERT(label < lr->k);

			NV_MAT_V(labels, j, 0) = (float)label;
			NV_MAT_LIST_V(count_tmp, thread_idx, label, 0) += 1.0f;
		}

		/* sum the per-thread tables into count */
		for (l = 0; l < num_threads; ++l) {
			for (j = 0; j < count->m; ++j) {
				NV_MAT_V(count, j, 0) += NV_MAT_LIST_V(count_tmp, l, j, 0);
			}
		}
		++epoch;

		/* convergence check */
		relabel_count = 0;
		for (j = 0; j < data->m; ++j) {
			if (NV_MAT_V(labels, j, 0) != NV_MAT_V(old_labels, j, 0)) {
				++relabel_count;
			}
		}
		/* empty_class is only used for the progress line */
		empty_class = 0;
		for (j = 0; j < lr->k; ++j) {
			empty_class += (NV_MAT_V(count, j, 0) > 0.0f ? 0:1);
		}
		relabel_per = (float)relabel_count / data->m;
		/* the first epoch is never treated as converged */
		if (epoch > 1) {
			converge = (relabel_per < 0.001f) ? 1:0;
		} else {
			converge =0;
		}

		if (nv_klr_progress_flag) {
			printf("nv_klr: %d: relabel: %f, empty_class: %d, %ldms\n",
			epoch, relabel_per, empty_class, nv_clock() -t);
			fflush(stdout);
		}
		t = nv_clock();

		if (converge) {
			/* finished */ 
			if (nv_klr_progress_flag) {
				printf("nv_klr: %d: finish:\n", epoch);
				fflush(stdout);				
			}
			processing = 0;
		} else {
			/* remember the labels of this epoch */ 
			nv_matrix_copy(old_labels, 0, labels, 0, old_labels->m);

			/* retrain the LR model */ 
			nv_lr_train(lr, data, labels, param);

			/* maximum epoch check (max_epoch == 0 means no limit) */ 
			if (max_epoch != 0
				&& epoch >= max_epoch)
			{
				/* finished */
				processing = 0;
			}
			if (nv_klr_progress_flag) {
				printf("nv_klr: %d: train: %ldms\n", epoch, nv_clock() -t);
				fflush(stdout);				
			}
		}
	} while (processing);

	nv_matrix_free(&old_labels);
	nv_matrix_free(&count_tmp);

	return converge;
}
コード例 #19
0
/*
 * Build a spatially pooled k-means feature for `src` and store it in row
 * `fv_j` of `fv` (which must have DATA_N columns).
 *
 * Patches of size PATCH_SIZE are extracted, locally standardized and ZCA
 * whitened. The patch grid is divided into r x r cells with
 * r = (int)sqrtf(GRID); each patch contributes to the cell it falls in:
 *   - with TRIANGLE_DISTANCE: max(0, mean - dist_k) per centroid k, either
 *     summed or max-pooled (TRIANGLE_DISTANCE_MAX); the mean is pulled
 *     towards the minimum when TRIANGLE_DISTANCE_HALF is set;
 *   - otherwise: a count of 1 for the nearest centroid.
 * The GRID cell vectors are concatenated into the output row.
 *
 * NOTE(review): patches->cols / r and patches->rows / r are used as
 * divisors, so the patch grid is presumably expected to be at least r
 * wide and tall - confirm against callers.
 */
void
kmeans_feature(nv_matrix_t *fv, int fv_j,
			   const nv_matrix_t *src,
			   const nv_matrix_t *zca_m,
			   const nv_matrix_t *zca_u,
			   const nv_matrix_t *centroids)
{
	nv_matrix_t *patches;
	nv_matrix_t *conv;
#if TRIANGLE_DISTANCE
	nv_matrix_t *z;
#endif
	int y, i;
	int r;
	
	NV_ASSERT(fv->n == DATA_N);
	patches = nv_patch_matrix_alloc(src, PATCH_SIZE);
	nv_patch_extract(patches, src, PATCH_SIZE);
	nv_standardize_local_all(patches, 10.0f);
	nv_zca_whitening_all(patches, zca_m, 0, zca_u);
	
	conv = nv_matrix_alloc(centroids->m, GRID);
	nv_matrix_zero(conv);

#if TRIANGLE_DISTANCE
	/*
	 * Distance scratch: every entry is rewritten for each patch before it
	 * is read, so one buffer is shared by all patches instead of being
	 * allocated and freed per patch.
	 */
	z = nv_matrix_alloc(centroids->m, 1);
#endif
	/* cells per side of the pooling grid; does not change per patch */
	r = (int)sqrtf(GRID);
	
	for (y = 0; y < patches->rows; ++y) {
		int x;
		for (x = 0; x < patches->cols; ++x) {
			int conv_index;
			int x_idx = (x / (patches->cols / r));
			int y_idx = (y / (patches->rows / r));

			/* remainder columns/rows fall into the last cell */
			if (x_idx >= r) {
				x_idx = r -1;
			}
			if (y_idx >= r) {
				y_idx = r -1;
			}
			conv_index = y_idx * r + x_idx;
			if (conv_index >= GRID) {
				conv_index = GRID - 1;
			}
#if TRIANGLE_DISTANCE
			{
				float mean;
				float min_z = FLT_MAX;
				int k;
				
				for (k = 0; k < centroids->m; ++k) {
					NV_MAT_V(z, 0, k) = nv_euclidean(centroids, k, patches, NV_MAT_M(patches, y, x));
					if (NV_MAT_V(z, 0, k) < min_z) {
						min_z = NV_MAT_V(z, 0, k);
					}
				}
				mean = nv_vector_mean(z, 0);
#if TRIANGLE_DISTANCE_HALF
				mean = mean - (mean - min_z) / 4.0f;
#endif
				for (k = 0; k < centroids->m; ++k) {
					float v = mean - NV_MAT_V(z, 0, k);
					if (0.0f < v) {
#if TRIANGLE_DISTANCE_MAX						
						if (NV_MAT_V(conv, conv_index, k) < v) {
							NV_MAT_V(conv, conv_index, k) = v;
						}
#else
						NV_MAT_V(conv, conv_index, k) += v;
#endif
					}
				}
			}
#else
			{
				int nn = nv_nn(centroids, patches, NV_MAT_M(patches, y, x));
				NV_MAT_V(conv, conv_index, nn) += 1.0f;
			}
#endif
		}
	}
	/* concatenate the per-cell vectors into the output row */
	for (i = 0; i < GRID; ++i) {
		memmove(&NV_MAT_V(fv, fv_j, i * conv->n),
				&NV_MAT_V(conv, i, 0), conv->n * sizeof(float));
	}
#if TRIANGLE_DISTANCE
	nv_matrix_free(&z);
#endif
	nv_matrix_free(&patches);
	nv_matrix_free(&conv);
}
コード例 #20
0
ファイル: nv_ml_arow.c プロジェクト: andrew950468/otama_api
/*
 * Reset the model by zeroing arow->w and arow->bias.
 */
void
nv_arow_init(nv_arow_t *arow)
{
	nv_matrix_zero(arow->w);
	nv_matrix_zero(arow->bias);
}
コード例 #21
0
/*
 * Compute shape-context histograms for `img` and store them in `sctx`.
 *
 * Edge pixels (edge image value > 50) are collected as points, shuffled,
 * and exactly sctx->sctx->list of them are used; when there are fewer
 * edge pixels, the list is padded with randomly chosen repeats. For each
 * used point l a log-polar histogram (NV_SC_LOG_R_BIN x NV_SC_THETA_BIN)
 * of the other used points is accumulated in sctx->sctx, and the point's
 * coordinate and radius are recorded.
 *
 * While the `#if 1` section is enabled the `r` argument is overwritten
 * with img->rows / 2 before it is used.
 */
void nv_shapecontext_feature(nv_shapecontext_t *sctx,
							const nv_matrix_t *img,
							float r
)
{
	int m, row, col, pc, i, l;
	nv_matrix_t *edge = nv_matrix3d_alloc(1, img->rows, img->cols);
	nv_matrix_t *points = nv_matrix_alloc(2, img->m);
	/*
	 * rand_idx is written up to index sctx->sctx->list - 1 by the padding
	 * loop below, so it must be at least that long even when the image
	 * has fewer than `list` pixels.
	 */
	int rand_idx_len = NV_MAX(img->m, sctx->sctx->list);
	int *rand_idx = (int *)nv_malloc(sizeof(int) * rand_idx_len);
	float u_x, u_y, p_x, p_y, r_e;
	int pn;

	// collect edge points (thinning is done by the edge image helper)
	nv_matrix_zero(points);
	nv_shapecontext_edge_image(edge, img);
	pc = 0;
	u_x = 0.0f;
	u_y = 0.0f;
	for (row = 0; row < edge->rows; ++row) {
		for (col = 0; col < edge->cols; ++col) {
			if (NV_MAT3D_V(edge, row, col, 0) > 50.0f) {
				NV_MAT_V(points, pc, 0) = (float)row;
				NV_MAT_V(points, pc, 1) = (float)col;
				++pc;
				u_y += (float)row;
				u_x += (float)col;
			}
		}
	}
	u_x /= pc;
	u_y /= pc;
	// reduce to the requested number of features (random selection)
	pn = NV_MIN(pc, sctx->sctx->list);
	nv_shuffle_index(rand_idx, 0, pc);
#if 1
	{
		float max_x, max_y;

		if (pc < sctx->sctx->list) {
			// not enough points: pad with random repeats
			for (i = pc; i < sctx->sctx->list; ++i) {
				rand_idx[i] = (int)(nv_rand() * pn);
			}
		}
		pc = pn = sctx->sctx->list;

		// radius

		max_x = 0.0f;
		max_y = 0.0f;
		for (m = 0; m < pn; ++m) {
			float yd = fabsf(NV_MAT_V(points, rand_idx[m], 0) - u_y);
			float xd = fabsf(NV_MAT_V(points, rand_idx[m], 1) - u_x);
			max_x = NV_MAX(max_x, xd);
			max_y = NV_MAX(max_y, yd);
		}
		// the extent computed above is currently not used for r
		r = (float)img->rows/2.0f;//NV_MAX(max_x, max_y) * 1.0f;
	}
#endif

	// base of the logarithm such that log(r) == NV_SC_LOG_R_BIN
	r_e = powf(r, 1.0f / NV_SC_LOG_R_BIN);

	// compute the histograms
	sctx->n = pn;
	nv_matrix_zero(sctx->sctx);
	nv_matrix_zero(sctx->tan_angle);

	for (l = 0; l < pn; ++l) {
		// tangent angle
#if 0
		float max_bin = 0.0f, min_bin = FLT_MAX;
		float tan_angle = tangent_angle(
			r,
			NV_MAT_V(points, rand_idx[l], 0),
			NV_MAT_V(points, rand_idx[l], 1),
			points, pc);
#else
		float tan_angle = 0.0f;
#endif
		p_y = NV_MAT_V(points, rand_idx[l], 0);
		p_x = NV_MAT_V(points, rand_idx[l], 1);
		NV_MAT_V(sctx->tan_angle, l, 0) = tan_angle;
		NV_MAT_V(sctx->coodinate, l, 0) = p_y;
		NV_MAT_V(sctx->coodinate, l, 1) = p_x;
		NV_MAT_V(sctx->radius, l, 0) = r;

		// shape context
		for (i = 0; i < pn; ++i) {
			float xd = NV_MAT_V(points, rand_idx[i], 1) - p_x;
			float yd = NV_MAT_V(points, rand_idx[i], 0) - p_y;
			//int row = i / img->rows;
			//int col = i % img->rows;
			//float xd = col - p_x;
			//float yd = row - p_y;
			float theta;
			float log_r = logf(sqrtf(xd * xd + yd * yd)) / logf(r_e);
			float atan_r = atan2f(xd, yd);

			//if (NV_MAT3D_V(img, row, col, 0) == 0.0f) {
			//	continue;
			//}
			if (i == l) {
				continue;
			}

			// bring the angle into [0, 2*pi) and apply the tangent offset
			if (atan_r < 0.0f) {
				atan_r = 2.0f * NV_PI + atan_r;
			}
			if (tan_angle > 0.0f) {
				if (atan_r + tan_angle > 2.0f * NV_PI) {
					atan_r = atan_r + tan_angle - 2.0f * NV_PI;
				} else {
					atan_r += tan_angle;
				}
			} else {
				if (atan_r + tan_angle < 0.0f) {
					atan_r = 2.0f * NV_PI + (atan_r + tan_angle);
				} else {
					atan_r += tan_angle;
				}
			}

			theta = atan_r / (2.0f * NV_PI / NV_SC_THETA_BIN);
			/*
			 * log_r must also be >= 0 before it is used as a bin index.
			 * Padding can select the same point twice; such a pair has
			 * distance 0, so log_r is -inf, which passed the upper-bound
			 * test alone and was then converted to an int index. Those
			 * pairs are now skipped, like the i == l case.
			 */
			if (log_r >= 0.0f
				&& theta < NV_SC_THETA_BIN && log_r < NV_SC_LOG_R_BIN)
			{
				NV_MAT3D_LIST_V(sctx->sctx, l, (int)log_r, (int)theta, 0) += 1.0f;
			}
		}
#if 0
		for (row = 0; row < NV_SC_LOG_R_BIN; ++row) {
			for (col = 0; col < NV_SC_THETA_BIN; ++col) {
				max_bin = NV_MAX(max_bin, NV_MAT3D_LIST_V(sctx->sctx, l, row, col, 0));
				min_bin = NV_MIN(min_bin, NV_MAT3D_LIST_V(sctx->sctx, l, row, col, 0));
			}
		}
		if (max_bin > 0.0f) {
			for (row = 0; row < NV_SC_LOG_R_BIN; ++row) {
				for (col = 0; col < NV_SC_THETA_BIN; ++col) {
					NV_MAT3D_LIST_V(sctx->sctx, l, row, col, 0) 
						= (NV_MAT3D_LIST_V(sctx->sctx, l, row, col, 0) - min_bin) / (max_bin - min_bin);
				}
			}
		}
#endif
	}
	nv_matrix_free(&edge);
	nv_matrix_free(&points);
	nv_free(rand_idx);
}
コード例 #22
0
/*
 * Distance between two shape contexts.
 *
 * For the first min(sctx1->n, sctx2->n) points of each, a cost matrix is
 * built where cost(a, b) = 1.0 * (coordinate distance / |radius_a +
 * radius_b|) + 0.9 * x2_test(histogram_a, histogram_b). The total cost
 * returned by nv_munkres, divided by the number of points, is returned.
 *
 * With _DEBUG defined the inputs may be swapped so that sctx1 is the
 * shorter one, and point/assignment dumps are written to 1.dat, 2.dat
 * and d.dat.
 */
float nv_shapecontext_distance(const nv_shapecontext_t *sctx1,
							   const nv_shapecontext_t *sctx2)
{
	float distance = 0.0f;
	int pair_count = NV_MIN(sctx1->n, sctx2->n);
	int a, b;
	nv_matrix_t *costs = nv_matrix_alloc(pair_count, pair_count);
	nv_matrix_t *assignment = nv_matrix_alloc(pair_count, 1);

#ifdef _DEBUG
	FILE *dump1 = fopen("1.dat", "w");
	FILE *dump2 = fopen("2.dat", "w");
	FILE *dump_pairs = fopen("d.dat", "w");

	if (sctx1->n != pair_count) {
		const nv_shapecontext_t *swap_tmp = sctx1;
		sctx1 = sctx2;
		sctx2 = swap_tmp;
	}
#endif

	/* pairwise matching cost */
	nv_matrix_zero(costs);
	for (a = 0; a < pair_count; ++a) {
		for (b = 0; b < pair_count; ++b) {
			float hist_cost = x2_test(sctx1->sctx, a, sctx2->sctx, b);
			float dy = NV_MAT_V(sctx1->coodinate, a, 0) - NV_MAT_V(sctx2->coodinate, b, 0);
			float dx = NV_MAT_V(sctx1->coodinate, a, 1) - NV_MAT_V(sctx2->coodinate, b, 1);
			float rx2 = (NV_MAT_V(sctx1->radius, a, 0) + NV_MAT_V(sctx2->radius, b, 0));
			float pos_cost = sqrtf(dy * dy + dx * dx)/sqrtf(rx2*rx2);

			NV_MAT_V(costs, a, b) = 1.0f * pos_cost + 0.9f * hist_cost;
		}
	}
	/* optimal assignment cost, averaged over the points */
	distance += nv_munkres(assignment, costs) / pair_count;

#ifdef _DEBUG
	for (a = 0; a < sctx1->n; ++a) {
		fprintf(dump1, "%f %f\n", 
			NV_MAT_V(sctx1->coodinate, a, 1),
			NV_MAT_V(sctx1->coodinate, a, 0));
	}
	for (b = 0; b < sctx2->n; ++b) {
		fprintf(dump2, "%f %f\n", 
			NV_MAT_V(sctx2->coodinate, b, 1),
			NV_MAT_V(sctx2->coodinate, b, 0));
	}

	for (b = 0; b < sctx2->n;++b) {
		fprintf(dump_pairs, "%f %f\n", 
			NV_MAT_V(sctx2->coodinate, b, 1),
			NV_MAT_V(sctx2->coodinate, b, 0));

		fprintf(dump_pairs, "%f %f\n", 
			NV_MAT_V(sctx1->coodinate, NV_MAT_VI(assignment, 0, b), 1),
			NV_MAT_V(sctx1->coodinate, NV_MAT_VI(assignment, 0, b), 0));
		fprintf(dump_pairs, "\n\n");
	}
	fclose(dump1);
	fclose(dump2);
	fclose(dump_pairs);
#endif

	nv_matrix_free(&costs);
	nv_matrix_free(&assignment);

	return distance;
}
コード例 #23
0
ファイル: nv_ml_mlp.c プロジェクト: andrew950468/otama_api
/*
 * Backward pass and weight update for one mini-batch of NV_MLP_BATCH_SIZE
 * samples.
 *
 * mlp                   network whose input_w/input_bias/hidden_w/
 *                       hidden_bias are updated in place
 * *_momentum            running momentum terms, updated in place
 * output_y, input_y     output-layer and first-layer activations of the
 *                       batch (row j = batch sample j)
 * corrupted_data        network inputs of the batch (row j)
 * t                     target matrix, indexed by original row dj[j]
 * dj                    original data row index of each batch sample
 * ir, hr                learning rates for the input and hidden weights
 */
static void
nv_mlp_backward(
	nv_mlp_t *mlp,
	nv_matrix_t *input_w_momentum,
	nv_matrix_t *input_bias_momentum,
	nv_matrix_t *hidden_w_momentum,
	nv_matrix_t *hidden_bias_momentum,
	const nv_matrix_t *output_y,
	const nv_matrix_t *input_y,
	const nv_matrix_t *corrupted_data,
	const nv_matrix_t *t,
	int *dj,
	const float ir,
	const float hr)
{
	int n, m, j;
	nv_matrix_t *output_bp = nv_matrix_alloc(mlp->output, NV_MLP_BATCH_SIZE);
	nv_matrix_t *hidden_bp = nv_matrix_alloc(mlp->input_w->m, NV_MLP_BATCH_SIZE);
	nv_matrix_t *input_w_grad = nv_matrix_alloc(mlp->input_w->n, mlp->input_w->m);
	nv_matrix_t *input_bias_grad = nv_matrix_alloc(mlp->input_bias->n,
													   mlp->input_bias->m);
	nv_matrix_t *hidden_w_grad = nv_matrix_alloc(mlp->hidden_w->n, mlp->hidden_w->m);
	nv_matrix_t *hidden_bias_grad = nv_matrix_alloc(mlp->hidden_bias->n,
													mlp->hidden_bias->m);
	nv_matrix_zero(input_w_grad);
	nv_matrix_zero(hidden_w_grad);
	nv_matrix_zero(input_bias_grad);
	nv_matrix_zero(hidden_bias_grad);

	/* per-sample error terms: output_bp = y - t, then propagated through
	 * hidden_w and multiplied by y * (1 - y) of the first layer */
#ifdef _OPENMP
#pragma omp parallel for private(m, n)
#endif
	for (j = 0; j < NV_MLP_BATCH_SIZE; ++j) {
		for (n = 0; n < output_bp->n; ++n) {
			float y_t = NV_MAT_V(output_y, j, n) - NV_MAT_V(t, dj[j], n);
			float bp = y_t;
			NV_MAT_V(output_bp, j, n) = bp;
		}
		for (m = 0; m < mlp->hidden_w->n; ++m) {
			float y = 0.0f;
			for (n = 0; n < mlp->output; ++n) {
				y += NV_MAT_V(output_bp, j, n) * NV_MAT_V(mlp->hidden_w, n, m);
			}
			NV_MAT_V(hidden_bp, j, m) = 
				y * (1.0f - NV_MAT_V(input_y, j, m)) * NV_MAT_V(input_y, j, m);
		}
	}
	/* hidden-layer gradients, summed over the batch and scaled by hr;
	 * each thread owns whole rows n, so no two threads write the same cell */
#ifdef _OPENMP
#pragma omp parallel for private(m, j)
#endif
	for (n = 0; n < mlp->hidden_w->m; ++n) {
		for (j = 0; j < NV_MLP_BATCH_SIZE; ++j) {
			const float w = hr * NV_MAT_V(output_bp, j, n);
			for (m = 0; m < mlp->hidden_w->n; ++m) {
				NV_MAT_V(hidden_w_grad, n, m) += w * NV_MAT_V(input_y, j, m);
			}
			NV_MAT_V(hidden_bias_grad, n, 0) += w * NV_MLP_BIAS;
		}
	}
	/* input-layer gradients, summed over the batch and scaled by ir */
#ifdef _OPENMP
#pragma omp parallel for private(m, j)
#endif
	for (n = 0; n < mlp->input_w->m; ++n) {
		for (j = 0; j < NV_MLP_BATCH_SIZE; ++j) {
			const float w = ir * NV_MAT_V(hidden_bp, j, n);
			/* a zero term contributes nothing, so the inner loop is skipped */
			if (w != 0.0f) {
				for (m = 0; m < mlp->input_w->n; ++m) {
					NV_MAT_V(input_w_grad, n, m) += w * NV_MAT_V(corrupted_data, j, m);
				}
				NV_MAT_V(input_bias_grad, n, 0) += w * NV_MLP_BIAS;
			} // dropout
		}
	}

	/* hidden layer: momentum = MOMENTUM * momentum + weight decay + grad,
	 * then weight -= momentum * (1 - MOMENTUM); the bias gets no decay */
#ifdef _OPENMP
#pragma omp parallel for private(m)
#endif
	for (n = 0; n < mlp->hidden_w->m; ++n) {
		for (m = 0; m < mlp->hidden_w->n; ++m) {
			NV_MAT_V(hidden_w_momentum, n, m) =
				NV_MLP_MOMENTUM * NV_MAT_V(hidden_w_momentum, n, m)
				+ NV_MLP_WEIGHT_DECAY * hr * NV_MAT_V(mlp->hidden_w, n, m)
				+ NV_MAT_V(hidden_w_grad, n, m);
			NV_MAT_V(mlp->hidden_w, n, m) -= NV_MAT_V(hidden_w_momentum, n, m) * (1.0f - NV_MLP_MOMENTUM);
		}
		NV_MAT_V(hidden_bias_momentum, n, 0) =
			NV_MLP_MOMENTUM * NV_MAT_V(hidden_bias_momentum, n, 0)
			+ NV_MAT_V(hidden_bias_grad, n, 0);
		NV_MAT_V(mlp->hidden_bias, n, 0) -= NV_MAT_V(hidden_bias_momentum, n, 0) * (1.0f - NV_MLP_MOMENTUM);
	}
	/* input layer: same update, without a weight-decay term */
#ifdef _OPENMP
#pragma omp parallel for private(m)
#endif
	for (n = 0; n < mlp->input_w->m; ++n) {
		for (m = 0; m < mlp->input_w->n; ++m) {
			NV_MAT_V(input_w_momentum, n, m) =
				NV_MLP_MOMENTUM * NV_MAT_V(input_w_momentum, n, m)
				+ NV_MAT_V(input_w_grad, n, m);
			NV_MAT_V(mlp->input_w, n, m) -= NV_MAT_V(input_w_momentum, n, m) * (1.0f - NV_MLP_MOMENTUM);
		}
		NV_MAT_V(input_bias_momentum, n, 0) =
			NV_MLP_MOMENTUM * NV_MAT_V(input_bias_momentum, n, 0)
			+ NV_MAT_V(input_bias_grad, n, 0);
		NV_MAT_V(mlp->input_bias, n, 0) -= NV_MAT_V(input_bias_momentum, n, 0) * (1.0f - NV_MLP_MOMENTUM);
	}
	
	nv_matrix_free(&input_w_grad);
	nv_matrix_free(&hidden_w_grad);
	nv_matrix_free(&input_bias_grad);
	nv_matrix_free(&hidden_bias_grad);
	nv_matrix_free(&output_bp);
	nv_matrix_free(&hidden_bp);
}
コード例 #24
0
ファイル: nv_ml_mlp.c プロジェクト: andrew950468/otama_api
/*
 * Mini-batch training of `mlp` for epochs start_epoch + 1 .. end_epoch
 * (at least one epoch is always run).
 *
 * data, label   training rows and their class labels (label column 0)
 * t             target matrix passed to nv_mlp_error / nv_mlp_backward
 * ir, hr        learning rates forwarded to nv_mlp_backward
 * max_epoch     not read by this function
 *
 * Each epoch shuffles the row order, processes data->m / NV_MLP_BATCH_SIZE
 * full batches (remaining rows are skipped), and for every batch runs the
 * forward pass per sample followed by one nv_mlp_backward call.
 * data->m must be greater than NV_MLP_BATCH_SIZE.
 *
 * Returns the training accuracy (correct / processed) of the last epoch.
 */
float
nv_mlp_train_lex(nv_mlp_t *mlp,
				 const nv_matrix_t *data,
				 const nv_matrix_t *label,
				 const nv_matrix_t *t,
				 float ir, float hr, 
				 int start_epoch, int end_epoch, int max_epoch)
{
	int i;
	int epoch = 1;
	float p;
	nv_matrix_t *input_y = nv_matrix_alloc(mlp->input_w->m, NV_MLP_BATCH_SIZE);
	nv_matrix_t *hidden_y = nv_matrix_alloc(mlp->hidden_w->m, NV_MLP_BATCH_SIZE);
	nv_matrix_t *output_y = nv_matrix_alloc(mlp->output, NV_MLP_BATCH_SIZE);
	nv_matrix_t *corrupted_data = nv_matrix_alloc(mlp->input, NV_MLP_BATCH_SIZE);
	nv_matrix_t *input_w_momentum = nv_matrix_alloc(mlp->input_w->n, mlp->input_w->m);
	nv_matrix_t *input_bias_momentum = nv_matrix_alloc(mlp->input_bias->n,
													   mlp->input_bias->m);
	nv_matrix_t *hidden_w_momentum = nv_matrix_alloc(mlp->hidden_w->n, mlp->hidden_w->m);
	nv_matrix_t *hidden_bias_momentum = nv_matrix_alloc(mlp->hidden_bias->n,
														mlp->hidden_bias->m);
	
	/* djs: original row index of each sample in the current batch */
	int *djs = nv_alloc_type(int, NV_MLP_BATCH_SIZE);
	int *rand_idx = nv_alloc_type(int, data->m);
	
	NV_ASSERT(data->m > NV_MLP_BATCH_SIZE);

	/* momentum starts from zero on every call */
	nv_matrix_zero(input_w_momentum);
	nv_matrix_zero(hidden_w_momentum);
	nv_matrix_zero(input_bias_momentum);
	nv_matrix_zero(hidden_bias_momentum);

	epoch = start_epoch + 1;
	do {
		long tm;
		int correct = 0;
		float e = 0.0f;
		int count = 0;
		
		tm = nv_clock();
		nv_shuffle_index(rand_idx, 0, data->m);

		for (i = 0; i < data->m / NV_MLP_BATCH_SIZE; ++i) {
			int j;
			/* forward pass: each sample writes only its own row j of the
			 * batch buffers; statistics are combined by the reduction */
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 1) reduction(+:correct, count, e)
#endif
			for (j = 0; j < NV_MLP_BATCH_SIZE; ++j) {
				int label_correct;
				int dj = rand_idx[i * NV_MLP_BATCH_SIZE + j];
				djs[j] = dj;
				
				nv_mlp_corrupt(mlp, corrupted_data, j, data, dj);
				nv_mlp_forward(mlp, input_y, j, hidden_y, j,
							   corrupted_data, j);
				nv_mlp_softmax(output_y, j, hidden_y, j);
				e += nv_mlp_error(output_y, j, t, dj);
				label_correct = (int)NV_MAT_V(label, dj, 0);
				if (nv_vector_max_n(output_y, j) == label_correct) {
					++correct;
				}
				count += 1;
			}
			/* one weight update per batch */
			nv_mlp_backward(
				mlp,
				input_w_momentum, input_bias_momentum,
				hidden_w_momentum, hidden_bias_momentum,
				output_y, input_y, corrupted_data,
				t, djs,
				ir, hr);
		}
		p = (float)correct / count;
		if (nv_mlp_progress_flag) {
			printf("%d: E:%E, %f (%d/%d), %ldms\n",
				   epoch, e / count / mlp->output,
				   p, correct,
				   count, 
				nv_clock() - tm);
			if (nv_mlp_progress_flag >= 2) {
				nv_mlp_train_accuracy(mlp, data, label);
			}
			fflush(stdout);
		}
	} while (epoch++ < end_epoch);
	nv_free(rand_idx);
	nv_free(djs);
	nv_matrix_free(&input_y);
	nv_matrix_free(&hidden_y);
	nv_matrix_free(&output_y);
	nv_matrix_free(&corrupted_data);
	nv_matrix_free(&input_w_momentum);
	nv_matrix_free(&input_bias_momentum);
	nv_matrix_free(&hidden_w_momentum);
	nv_matrix_free(&hidden_bias_momentum);
	
	return p;
}
コード例 #25
0
ファイル: nv_num_eigen.c プロジェクト: nagadomi/nv-debian
/*
 * Compute the first `n` eigenvalue/eigenvector pairs of the square matrix
 * `mat` by power iteration with deflation.
 *
 * eigen_vec   output, row i receives the i-th eigenvector
 *             (needs at least n rows and mat->m columns)
 * eigen_val   output, row i column 0 receives the i-th eigenvalue
 *             estimate (the norm of A * v)
 * mat         input matrix; it is duplicated, the original is not modified
 * n           number of pairs to compute (1 .. mat->m)
 * max_epoch   maximum iterations per pair; iteration stops earlier once
 *             the estimate changes by less than FLT_EPSILON
 *
 * Always returns 0.
 */
int
nv_eigen(nv_matrix_t *eigen_vec, 
		 nv_matrix_t *eigen_val,
		 const nv_matrix_t *mat,
		 int n,
		 int max_epoch)
{
	int i;
	nv_matrix_t *a = nv_matrix_dup(mat);
	nv_matrix_t *vec_tmp = nv_matrix_alloc(a->m, 1);
#if NV_ENABLE_SSE2	
	/* number of columns covered by whole 4-float groups */
	const int pk_lp = (a->n & 0xfffffffc);
#endif
	
	NV_ASSERT(n > 0);
	NV_ASSERT(n <= mat->m);
	NV_ASSERT(n <= eigen_vec->m);
	NV_ASSERT(n <= eigen_val->m);
	NV_ASSERT(mat->m == mat->n);
	NV_ASSERT(mat->m == eigen_vec->n);

	/* every eigenvector starts as the normalized all-ones vector */
	nv_matrix_zero(eigen_val);
	nv_matrix_fill(eigen_vec, 1.0f);
	nv_vector_normalize_all(eigen_vec);
	
	for (i = 0; i < n; ++i) {
		int k, jj;
		float lambda_old;
		
		for (k = 0; k < max_epoch; ++k) {
			int j;
			float lambda;
			
			/* vec_tmp = a * eigen_vec[i] */
#ifdef _OPENMP
#pragma omp parallel for
#endif
			for (j = 0; j < a->m; ++j) {
				NV_MAT_V(vec_tmp, 0, j) = nv_vector_dot(a, j, eigen_vec, i);
			}
			lambda = nv_vector_norm(vec_tmp, 0);
			if (lambda > 0.0f) {
				nv_vector_muls(vec_tmp, 0, vec_tmp, 0, 1.0f / lambda);
			}
			NV_MAT_V(eigen_val, i, 0) = lambda;
			nv_vector_copy(eigen_vec, i, vec_tmp, 0);
			
			/* lambda_old is only read from the second iteration on */
			if (k > 0) {
				if (fabsf(lambda_old - lambda) < FLT_EPSILON) {
					break;
				}
			}
			lambda_old = NV_MAT_V(eigen_val, i, 0);
		}
		/* deflation: a -= lambda * v * v^T, so the next pass finds the
		 * next pair */
#if NV_ENABLE_SSE2
		{
			const __m128 val = _mm_set1_ps(NV_MAT_V(eigen_val, i, 0));
			
#ifdef _OPENMP
#pragma omp parallel for
#endif
			for (jj = 0; jj < a->m; ++jj) {
				int ii;
				const __m128 vjj = _mm_set1_ps(NV_MAT_V(eigen_vec, i, jj));
				/* four columns at a time */
				for (ii = 0; ii < pk_lp; ii += 4) {
					_mm_store_ps(&NV_MAT_V(a, jj, ii),
								 _mm_sub_ps(*(const __m128 *)&NV_MAT_V(a, jj, ii),
											_mm_mul_ps(val,_mm_mul_ps(vjj, *(const __m128 *)&NV_MAT_V(eigen_vec, i, ii)))));
				}
				/* remaining columns */
				for (; ii < a->n; ++ii) {
					NV_MAT_V(a, jj, ii) -=
						NV_MAT_V(eigen_val, i, 0)
						* NV_MAT_V(eigen_vec, i, ii)
						* NV_MAT_V(eigen_vec, i, jj);
				}
			}
		}
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
		for (jj = 0; jj < a->m; ++jj) {
			int ii;
			for (ii = 0; ii < a->n; ++ii) {
				NV_MAT_V(a, jj, ii) -=
					NV_MAT_V(eigen_val, i, 0)
					* NV_MAT_V(eigen_vec, i, ii)
					* NV_MAT_V(eigen_vec, i, jj);
			}
		}
#endif		
	}
	nv_matrix_free(&vec_tmp);
	nv_matrix_free(&a);
	
	return 0;
}
コード例 #26
0
ファイル: nv_ml_lr.c プロジェクト: nagadomi/nv-debian
void 
nv_lr_train(nv_lr_t *lr,
			const nv_matrix_t *data, const nv_matrix_t *label,
			nv_lr_param_t param)
{
	int m, n, i, j, k, l;
	long tm, tm_all = nv_clock();
	float oe = FLT_MAX, er = 1.0f, we;
	float sum_e = 0.0f;
	int epoch = 0;
	int pn = (data->m > 256) ? 128:1;
	int step = data->m / (pn);
	int threads = nv_omp_procs();
	nv_matrix_t *y = nv_matrix_alloc(lr->k, threads);
	nv_matrix_t *t = nv_matrix_alloc(lr->k, threads);
	nv_matrix_t *dw = nv_matrix_list_alloc(lr->n, lr->k, threads);
	nv_matrix_t *count = nv_matrix_alloc(lr->k, 1);
	nv_matrix_t *label_weight = nv_matrix_alloc(lr->k, 1);
	float count_max_log;
	
	nv_matrix_zero(count);
	nv_matrix_fill(label_weight, 1.0f);
	if (param.auto_balance) {
		/* クラスごとに数が違う場合に更新重みをスケーリングする */
		for (m = 0; m < data->m; ++m) {
			NV_MAT_V(count, 0, (int)NV_MAT_V(label, m, 0)) += 1.0f;
		}
		count_max_log = logf(3.0f + NV_MAT_V(count, 0, nv_vector_max_n(count, 0)));
		for (n = 0; n < count->n; ++n) {
			if (NV_MAT_V(count, 0, n) > 0.0f) {
				float count_log = logf(3.0f + NV_MAT_V(count, 0, n));
				NV_MAT_V(label_weight, 0, n) = 
					powf(count_max_log, NV_LR_CLASS_COUNT_PENALTY_EXP) 
					/ powf(count_log, NV_LR_CLASS_COUNT_PENALTY_EXP);
			} else {
				NV_MAT_V(label_weight, 0, n) = 1.0f;
			}
		}
	}
	do {
		we = 1.0f / er;
		tm = nv_clock();
		sum_e = 0.0f;

		for (m = 0; m < step; ++m) {
			nv_matrix_zero(dw);

#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 4) reduction(+:sum_e) num_threads(threads) 
#endif
			for (i = 0; i < pn; ++i) {
				int rand_m = NV_ROUND_INT((data->m - 1) * nv_rand());
				int thread_num = nv_omp_thread_id();
				int label_i = (int)NV_MAT_V(label, rand_m, 0);
				float weight = NV_MAT_V(label_weight, 0, label_i);
				float yp;

				nv_vector_zero(t, thread_num);
				NV_MAT_V(t, thread_num, label_i) = 1.0f;
				nv_lr_predict_vector(lr, y, thread_num, data, rand_m);
				yp = NV_MAT_V(y, thread_num, (int)NV_MAT_V(label, rand_m, 0));
				
				if (yp < 1.0 - NV_LR_MARGIN) {
					nv_lr_dw(lr, weight, dw, thread_num, data, rand_m, t, thread_num, y, thread_num);
					sum_e += nv_lr_error(t, thread_num, y, thread_num);
				}
			}

			for (l = 1; l < threads; ++l) {
				for (j = 0; j < dw->m; ++j) {
					for (i = 0; i < dw->n; ++i) {
						NV_MAT_LIST_V(dw, 0, j, i) += NV_MAT_LIST_V(dw, l, j, i);
					}
				}
			}
#ifdef _OPENMP
#pragma omp parallel for private(n)  num_threads(threads) if (lr->k > 32)
#endif
			for (k = 0; k < lr->k; ++k) {
				switch (param.reg_type) {
                case NV_LR_REG_NONE:
					for (n = 0; n < lr->n; ++n) {
						NV_MAT_V(lr->w, k, n) -= 
							we * param.grad_w * NV_MAT_LIST_V(dw, 0, k, n);
					}
					break;
				case NV_LR_REG_L1:
					// FOBOS L1
					for (n = 0; n < lr->n; ++n) {
						NV_MAT_V(lr->w, k, n) -= 
							we * param.grad_w * NV_MAT_LIST_V(dw, 0, k, n);
					}
					for (n = 0; n < lr->n; ++n) {
						float w_i = NV_MAT_V(lr->w, k, n);
						float lambda = we * param.reg_w * (1.0f / (1.0f + epoch));
						NV_MAT_V(lr->w, k, n) = nv_sign(w_i) * NV_MAX(0.0f, (fabsf(w_i) - lambda));
					}
					break;
				case NV_LR_REG_L2:
					for (n = 0; n < lr->n; ++n) {
						NV_MAT_V(lr->w, k, n) -= 
							we * (param.grad_w * (NV_MAT_LIST_V(dw, 0, k, n)
												  + param.reg_w * NV_MAT_V(lr->w, k, n)));
					}
					break;
				}
			}
		}
		if (nv_lr_progress_flag) {
			printf("nv_lr:%d: E: %E, %ldms\n",
				epoch, sum_e / (pn * step), nv_clock() - tm);
		}
		if (nv_lr_progress_flag > 1) {
			int *ok = nv_alloc_type(int, lr->k);
			int *ng = nv_alloc_type(int, lr->k);

			memset(ok, 0, sizeof(int) * lr->k);
			memset(ng, 0, sizeof(int) * lr->k);
			for (i = 0; i < data->m; ++i) {
				int predict = nv_lr_predict_label(lr, data, i);
				int teach = (int)NV_MAT_V(label, i, 0);
				if (predict == teach) {
					++ok[teach];
				} else {
					++ng[teach];
				}
			}
			for (i = 0; i < lr->k; ++i) {
				printf("%d: ok: %d, ng: %d, %f\n", i, ok[i], ng[i], (float)ok[i] / (float)(ok[i] + ng[i]));
			}
			nv_free(ok);
			nv_free(ng);
		}
		if (nv_lr_progress_flag) {
			fflush(stdout);
		}

		if (sum_e > oe) {
			er += 1.0f;
		}
		if (er >= 20.0f) {
			break;
		}
		
		if (sum_e < FLT_EPSILON) {
			break;
		}
		oe = sum_e;
	} while (param.max_epoch > ++epoch);