BNTree* train_bntree (char *treestr, char *modelstr, char *ss_filename, char *model_name, char *output_filename) { FILE *ss_fp, *out_fp; int model_type; double *background_probs; char **tuple_arr; double *count_arr; int num_tuples = 0; double ll; double last_ll; int iteration; int converged; char tag_str[STRLEN]; char val_str[STRLEN]; int u; int i, j, k; Node *node; int param_num; double row_sum; double param_val; CellListItem *cli; BNTree *bntree = NewBNTree(0); /* no cache for training (for now) */ strcpy (bntree->name, model_name); if (strcmp (modelstr, "TT0") == 0) { model_type = TT0; bntree->order = 0; bntree->num_states = 6; } else if (strcmp (modelstr, "TT1") == 0) { model_type = TT1; bntree->order = 1; bntree->num_states = 36; } else if (strcmp (modelstr, "TT2") == 0) { model_type = TT2; bntree->order = 2; bntree->num_states = 216; } else if (strcmp (modelstr, "R0") == 0) { model_type = R0; bntree->order = 0; bntree->num_states = 6; bntree->num_params = 18; } else if (strcmp (modelstr, "R1") == 0) { model_type = R1; bntree->order = 1; bntree->num_states = 36; bntree->num_params = 236; } else if (strcmp (modelstr, "R2") == 0) { model_type = R2; bntree->order = 2; bntree->num_states = 216; } else if (strcmp (modelstr, "G0") == 0) { model_type = G0; bntree->order = 0; bntree->num_states = 6; bntree->num_params = 36; } else if (strcmp (modelstr, "G1") == 0) { model_type = G1; bntree->order = 1; bntree->num_states = 36; bntree->num_params = 1296; } else if (strcmp (modelstr, "G2") == 0) { model_type = G2; bntree->order = 2; bntree->num_states = 216; bntree->num_params = 46656; } else { fprintf (stderr, "Unknown probability model type: %s\n", modelstr); exit(-1); } /* initialize tree structure */ parse_tree_string (bntree, treestr); /* parse the sufficient statistics file */ ss_fp = fopen (ss_filename, "r"); while (fscanf (ss_fp, "%s = %s\n", tag_str, val_str) == 2) { if (strcmp (tag_str, "NTUPLES") == 0) num_tuples = atoi(val_str); } fclose (ss_fp); tuple_arr = 
(char **) malloc (num_tuples * sizeof (char *)); for (i=0; i<num_tuples; i++) tuple_arr[i] = (char *) malloc (STRLEN * sizeof (char)); count_arr = (double *) malloc (num_tuples * sizeof (double)); background_probs = (double *) malloc (bntree->num_states * sizeof (double)); parse_ss (bntree, ss_filename, tuple_arr, count_arr, background_probs); /* initialize parameters */ bntree->param_map = (int **)malloc(bntree->num_states * sizeof(int *)); for (i=0; i<bntree->num_states; i++) bntree->param_map[i] = (int *)malloc(bntree->num_states * sizeof(int)); bntree->weight_idx = (int **)malloc(bntree->num_states * sizeof(int *)); for (i=0; i<bntree->num_states; i++) bntree->weight_idx[i] = (int *)malloc(bntree->num_states * sizeof(int)); if (model_type == R0) init_params_R0 (bntree); else if (model_type == G0) init_params_G0 (bntree); else if (model_type == R1) init_params_R1 (bntree); else if (model_type == G1) init_params_G1 (bntree); else if (model_type == G2) init_params_G2 (bntree); else { fprintf (stderr, "Model type %s not yet supported\n", modelstr); exit (-1); } /* create inverse parameter map */ bntree->inv_param_map = (CellListItem **)malloc(bntree->num_params * sizeof (CellListItem *)); for (i=0; i<bntree->num_params; i++) bntree->inv_param_map[i] = NewCellList(); for (i=0; i<bntree->num_states; i++) { for (j=0; j<bntree->num_states; j++) { param_num = bntree->param_map[i][j]; cell_list_append(bntree->inv_param_map[param_num], i, j); } } /* allocate local CPDs and expectations */ for (u=0; u<bntree->num_nodes; u++) { node = bntree->preorder[u]; if (node->parent == NULL) continue; node->E = (double **)malloc(bntree->num_states * sizeof(double *)); for (i=0; i<bntree->num_states; i++) node->E[i] = (double *)malloc(bntree->num_states * sizeof(double)); node->P = (float **)malloc(bntree->num_states * sizeof(float *)); for (i=0; i<bntree->num_states; i++) node->P[i] = (float *)malloc(bntree->num_states * sizeof(float)); node->W = (double **)malloc(bntree->num_states * 
sizeof(double *)); for (i=0; i<bntree->num_states; i++) node->W[i] = (double *)malloc(bntree->num_states * sizeof(double)); } /* randomly initialize the parameters in a way that guarantees all probabilities will be within 0 and 1, and all rows sum to 1 */ /* also, self substitution probs will be at least 0.5 */ for (u=0; u<bntree->num_nodes; u++) { node = bntree->preorder[u]; if (node->parent == NULL) continue; srand(time(NULL)); for (k=0; k<bntree->num_params - bntree->num_states; k++) { param_val = bounded_random (0.05, 0.50) / (bntree->num_states - 1); for (cli = bntree->inv_param_map[k]; cli != NULL; cli = cli->next) { i = cli->row; j = cli->col; node->P[i][j] = param_val; } } for (i=0; i<bntree->num_states; i++) { row_sum = 0; for (j=0; j<bntree->num_states; j++) { if (j != i) { row_sum += node->P[i][j]; } } node->P[i][i] = 1 - row_sum; } } /* run EM algorithm */ /* We'll output the current tree after every iteration */ iteration = 0; converged = 0; ll = calc_log_likelihood (bntree, tuple_arr, count_arr, num_tuples, 0); fprintf (stderr, "Initial log likelihood: %f\n", ll); while (! converged) { iteration++; /* improve likelihood */ Estep(bntree, tuple_arr, count_arr, num_tuples); Mstep(bntree); /* output new model */ out_fp = fopen (output_filename, "w"); print_tree (out_fp, bntree); fclose (out_fp); /* check for convergence */ last_ll = ll; ll = calc_log_likelihood (bntree, tuple_arr, count_arr, num_tuples, 0); fprintf (stderr, "EM iteration %d, log likelihood: %f\n", iteration, ll); if ( (last_ll - ll) / last_ll < EM_TOL || iteration >= MAX_EM_ITERS) converged = 1; } return bntree; }
/*
 * Append the cell (x, y) to the cell list that map_get_level() returns
 * for level t of the given map.
 */
void map_append(map *map, int t, int x, int y)
{
    cell_list_append(map_get_level(map, t), x, y);
}