/*===========================================================================*/ static int parseModelType(const char * data, unsigned int size) { #define CUR_PROC "parseModelType" int i, noValidMo, modelType=0; const char * end = data; char * str; while ((end = strchr(data, ' '))) { modelType += matchModelType(data, end-data); size -= (end-data)+1; data = end+1; } modelType += matchModelType(data, size); noValidMo = sizeof(validModelTypes)/sizeof(validModelTypes[0]); for (i=0; i<noValidMo; i++) { if (modelType == validModelTypes[i]) break; } if (i == noValidMo) { str = ighmm_mprintf(NULL, 0, "%d is no known valid model type", modelType); GHMM_LOG(LERROR, str); m_free(str); return -1; } return modelType; #undef CUR_PROC }
/*===========================================================================*/ static int parseCSVList(const char * data, unsigned int size, double * array, int reverse) { #define CUR_PROC "parseCSVList" int retval=0; int i; char * * next, * estr; double tmp; ARRAY_CALLOC(next, 1); for (i=0; i<size; i++) { array[i] = strtod(data, next); if (data == *next) { estr = ighmm_mprintf(NULL, 0, "error in parsing CSV. entry %d of %d. (%s)", i, size, *next); GHMM_LOG(LERROR, estr); m_free(estr); retval=-1; break; } if (next) data = *next+1; else break; } if (i != size) { retval=-1; estr = ighmm_mprintf(NULL, 0, "error in parsing CSV. sizes do not match (%d != %d)", i, size); GHMM_LOG(LERROR, estr); m_free(estr); } if (reverse) { for (i=0; i<size/2; i++) { tmp = array[i]; array[i] = array[size-i-1]; array[size-i-1] = tmp; } } STOP: m_free(next); return retval; #undef CUR_PROC }
/*===========================================================================*/
/**
 * Builds a ghmm_alphabet from the <symbol> children of an alphabet node.
 *
 * The "code" attributes of the symbols must be consecutive and start at 0;
 * the text content of the i-th symbol element becomes alfa->symbols[i].
 *
 * @param doc  XML document (not used by this function)
 * @param cur  alphabet node whose children are scanned
 * @param f    xml file structure (not used by this function)
 * @return newly allocated alphabet, or NULL on allocation failure or if the
 *         symbol codes are missing or not consecutive
 */
static ghmm_alphabet * parseAlphabet(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f) {
#define CUR_PROC "parseAlphabet"

  char * str;
  int M, code, error;

  xmlNodePtr symbol;
  ghmm_alphabet * alfa = NULL;

  ARRAY_CALLOC(alfa, 1);

  /* first pass: count the symbols and validate their codes */
  symbol = cur->children;
  M=0;
  while (symbol!=NULL) {
    if ((!xmlStrcmp(symbol->name, BAD_CAST "symbol"))) {
      code = getIntAttribute(symbol, "code", &error);
      if (error || code!=M) {
        str = ighmm_mprintf(NULL, 0, "non consecutive code %d == %d", code, M);
        GHMM_LOG(LERROR, str);
        m_free(str);
        goto STOP;
      } else
        M++;
    }
    symbol=symbol->next;
  }

  alfa->size = M;
  /*printf("Parsing alphabet with %d symbols\n", alfa->size);*/
  ARRAY_MALLOC(alfa->symbols, M);

  /* second pass: copy the symbol names */
  symbol = cur->children;
  M=0;
  while (symbol!=NULL) {
    if ((!xmlStrcmp(symbol->name, BAD_CAST "symbol"))) {
      alfa->symbols[M++] = (char *)xmlNodeGetContent(symbol);
      /*printf("%d. symbol: %s\n", M, alfa->symbols[M-1]);*/
    }
    symbol=symbol->next;
  }

  return alfa;
STOP:
  /* alfa is NULL when its own allocation failed, and alfa->symbols is NULL on
     every path that reaches this label; only release what actually exists */
  if (alfa) {
    if (alfa->symbols) {
      m_free(alfa->symbols);
    }
    m_free(alfa);
  }
  return NULL;
#undef CUR_PROC
}
/* PROBLEM: Entries can get very small and be rounded to 0 */
/**
 * Scales a vector in place so that its entries sum to one.
 *
 * @param v    vector to normalize
 * @param len  number of entries in v
 * @return 0 on success; -1 (after logging a warning) if the vector is not
 *         empty and its sum is smaller than GHMM_EPS_PREC, in which case v is
 *         left untouched
 */
int ighmm_cvector_normalize (double *v, int len)
{
#define CUR_PROC "ighmm_cvector_normalize"
  double total = 0.0;
  char * msg;
  int k = 0;

  /* accumulate front to back */
  while (k < len) {
    total += v[k];
    k++;
  }

  /* an empty vector never triggers the warning */
  if (len > 0 && total < GHMM_EPS_PREC) {
    msg = ighmm_mprintf(NULL, 0, "Can't normalize vector. Sum smaller than %g\n"
                        , GHMM_EPS_PREC);
    GHMM_LOG(LWARN, msg);
    m_free(msg);
    return (-1);
  }

  k = 0;
  while (k < len) {
    v[k] /= total;
    k++;
  }

  return 0;
#undef CUR_PROC
}                               /* ighmm_cvector_normalize */
/*===========================================================================*/
/**
 * Parses one <state> node and stores its content in model number `modelNo`
 * of the xml file structure `f`.
 *
 * Handled child elements: silent, discrete, mixture (with the densities
 * normal, normalLeftTail, normalRightTail, uniform and multinormal), pair
 * (ignored), backgroundKey, class, tiedTo and position.
 *
 * Ownership: the state description and the discrete emission array are
 * handed over to the model as soon as they are stored there. From then on
 * this function no longer frees them, not even on the error path, so that the
 * model never holds dangling pointers.
 *
 * @param doc        XML document (not used by this function)
 * @param cur        the state node
 * @param f          xml file structure that holds the models
 * @param inDegree   number of incoming transitions per state (continuous models)
 * @param outDegree  number of outgoing transitions per state (continuous models)
 * @param modelNo    index of the model inside f
 * @return 0 on success, -1 on error (an error message is logged)
 */
static int parseState(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f, int * inDegree, int * outDegree, int modelNo) {
#define CUR_PROC "parseState"

  int i, error, order=0, state=-1442, fixed=-985, tied=-9354, M, aprox, label;
  int curX=0, curY=0;
  double pi, prior;
  double *emissions = NULL;
  char *desc = NULL;
  int descOwned = 0;            /* set once desc is stored in the model */
  char *s = NULL, *estr;
  int rev, stateFixed=1;
  ghmm_cstate *newcstate;
  ghmm_c_emission *emission;

  xmlNodePtr elem, child, multichild;

  state = getIntAttribute(cur, "id", &error);
  pi = getDoubleAttribute(cur, "initial", &error);
  if (error) {
    estr = ighmm_mprintf(NULL, 0, "can't read required initial probability for"
                         " state %d", state);
    GHMM_LOG(LERROR, estr);
    m_free(estr);
    goto STOP;
  }

  desc = (char *)xmlGetProp(cur, BAD_CAST "desc");

  elem = cur->children;
  while (elem!=NULL) {

    /* ======== silent state ============================================== */
    if ((!xmlStrcmp(elem->name, BAD_CAST "silent"))) {
      switch (f->modelType & PTR_TYPE_MASK) {
      case (GHMM_kDiscreteHMM):
        f->model.d[modelNo]->silent[state] = 1;
        break;
      case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
        f->model.ds[modelNo]->silent[state] = 1;
        break;
      case (GHMM_kDiscreteHMM+GHMM_kPairHMM):
      case (GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses):
        f->model.dp[modelNo]->silent[state] = 1;
        break;
      default:
        GHMM_LOG(LERROR, "invalid modelType");
        goto STOP;
      }
    }

    /* ======== discrete state (possible higher order) ==================== */
    if ((!xmlStrcmp(elem->name, BAD_CAST "discrete"))) {
      assert((f->modelType & GHMM_kDiscreteHMM) && ((f->modelType & GHMM_kPairHMM) == 0));

      /* fixed is a property of the distribution and optional */
      fixed = getIntAttribute(elem, "fixed", &error);
      if (error)
        fixed = 0;

      /* order is optional for discrete */
      if (f->modelType & GHMM_kHigherOrderEmissions) {
        order = getIntAttribute(elem, "order", &error);
        if (error)
          order = 0;
      }

      /* NOTE(review): "rev" is read from the state node, not from the
         <discrete> element like "fixed" and "order" - confirm intended */
      rev = getIntAttribute(cur, "rev", &error);
      if (error)
        rev = 0;

      /* parsing emission probabilities */
      s = (char *)xmlNodeGetContent(elem);

      switch (f->modelType & PTR_TYPE_MASK) {

      case (GHMM_kDiscreteHMM):
        f->model.d[modelNo]->s[state].desc = desc;
        descOwned = 1;
        f->model.d[modelNo]->s[state].pi = pi;
        f->model.d[modelNo]->s[state].fix = fixed;
        if (f->modelType & GHMM_kHigherOrderEmissions) {
          f->model.d[modelNo]->order[state] = order;
          if (f->model.d[modelNo]->maxorder < order) {
            f->model.d[modelNo]->maxorder = order;
            estr = ighmm_mprintf(NULL, 0, "Updated maxorder to %d\n",
                                 f->model.d[modelNo]->maxorder);
            GHMM_LOG(LDEBUG, estr);
            m_free(estr);
          }
        }
        ARRAY_MALLOC(emissions, pow(f->model.d[modelNo]->M, order+1));
        /* NOTE(review): a parse failure is only logged by parseCSVList and
           otherwise ignored here */
        parseCSVList(s, pow(f->model.d[modelNo]->M, order+1), emissions, rev);
        free(f->model.d[modelNo]->s[state].b);
        f->model.d[modelNo]->s[state].b = emissions;
        emissions = NULL;       /* now owned by the model */
        break;

      case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
        f->model.ds[modelNo]->s[state].desc = desc;
        descOwned = 1;
        f->model.ds[modelNo]->s[state].pi = pi;
        f->model.ds[modelNo]->s[state].fix = fixed;
        if (f->modelType & GHMM_kHigherOrderEmissions)
          f->model.ds[modelNo]->order[state] = order;
        ARRAY_MALLOC(emissions, pow(f->model.ds[modelNo]->M, order+1));
        parseCSVList(s, pow(f->model.ds[modelNo]->M, order+1), emissions, rev);
        f->model.ds[modelNo]->s[state].b = emissions;
        emissions = NULL;       /* now owned by the model */
        break;

      default:
        GHMM_LOG(LERROR, "invalid modelType");
        goto STOP;
      }
      m_free(s);
      s = NULL;
    }

    /* ======== continuous state ========================================== */
    if ((!xmlStrcmp(elem->name, BAD_CAST "mixture"))) {
      assert(f->modelType & GHMM_kContinuousHMM);

      /* count the density components of the mixture */
      M = 0;
      child = elem->children;
      while (child != NULL) {
        if ((!xmlStrcmp(child->name, BAD_CAST "normal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalLeftTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalRightTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "multinormal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "uniform"))){
          M ++;
        }
        child = child->next;
      }

      ghmm_cstate_alloc(f->model.c[modelNo]->s + state, M, inDegree[state],
                        outDegree[state], f->model.c[modelNo]->cos);
      newcstate = f->model.c[modelNo]->s + state;

      newcstate->desc = desc;
      descOwned = 1;
      newcstate->M = M;
      newcstate->pi = pi;

      if( f->model.c[modelNo]->M < M)
        f->model.c[modelNo]->M = M;

      child = elem->children;
      i = 0;
      while (child != NULL) {

        emission = newcstate->e+i;

        /* common attributes */
        if ((!xmlStrcmp(child->name, BAD_CAST "normal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalLeftTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalRightTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "multinormal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "uniform"))){
          fixed = getIntAttribute(child, "fixed", &error);
          if (error)
            fixed = 0;
          /* the state is fixed only if all of its densities are fixed */
          stateFixed = fixed && stateFixed;
          /* allocate emission */
          emission->fixed = fixed;
          prior = getDoubleAttribute(child, "prior", &error);
          if (error)
            prior = 1.0;
          newcstate->c[i] = prior;
        }
        /* child is not a density, continue with the next child */
        else {
          child = child->next;
          continue;
        }

        /* density type dependent attributes */
        if ((!xmlStrcmp(child->name, BAD_CAST "normal"))) {
          emission->mean.val = getDoubleAttribute(child, "mean", &error);
          emission->variance.val = getDoubleAttribute(child, "variance", &error);
          /* should the normal distribution be approximated? */
          aprox = getIntAttribute(child, "approx", &error);
          if (error)
            aprox = 0;
          emission->type = aprox ? normal_approx : normal;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "normalLeftTail"))) {
          emission->mean.val = getDoubleAttribute(child, "mean", &error);
          emission->variance.val = getDoubleAttribute(child, "variance", &error);
          /* NOTE(review): the "max" attribute is stored in emission->min
             (and "min" in emission->max for the right tail below) - kept as
             found, confirm against the density implementation */
          emission->min = getDoubleAttribute(child, "max", &error);
          emission->type = normal_left;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "normalRightTail"))) {
          emission->mean.val = getDoubleAttribute(child, "mean", &error);
          emission->variance.val = getDoubleAttribute(child, "variance", &error);
          emission->max = getDoubleAttribute(child, "min", &error);
          emission->type = normal_right;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "uniform"))) {
          emission->max = getDoubleAttribute(child, "max", &error);
          emission->min = getDoubleAttribute(child, "min", &error);
          emission->type = uniform;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "multinormal"))) {
          emission->type = multinormal;
          emission->dimension = getIntAttribute(child, "dimension", &error);

          /* check that all emissions in all states have same dimension or
             set when first emission is read*/
          if (f->model.c[modelNo]->dim <= 1)
            f->model.c[modelNo]->dim = emission->dimension;
          else if (f->model.c[modelNo]->dim != emission->dimension) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }

          if (0 != ghmm_c_emission_alloc(emission, emission->dimension)) {
            GHMM_LOG(LERROR, "Can not allocate multinormal emission.");
            goto STOP;
          }
          multichild = child->children;
          while (multichild != NULL) {
            if ((!xmlStrcmp(multichild->name, BAD_CAST "mean"))) {
              s = (char *)xmlNodeGetContent(multichild);
              if (-1 == parseCSVList(s, emission->dimension, emission->mean.vec, 0)) {
                GHMM_LOG(LERROR, "Can not parse mean CSV list.");
                goto STOP;
              }
              /* the content string is no longer needed */
              m_free(s);
              s = NULL;
            }
            if ((!xmlStrcmp(multichild->name, BAD_CAST "variance"))) {
              s = (char *)xmlNodeGetContent(multichild);
              if (-1 == parseCSVList(s, emission->dimension * emission->dimension,
                                     emission->variance.mat, 0)) {
                GHMM_LOG(LERROR, "Can not parse variance CSV list.");
                goto STOP;
              }
              /* the content string is no longer needed */
              m_free(s);
              s = NULL;
              if (0 != ighmm_invert_det(emission->sigmainv, &emission->det,
                                        emission->dimension, emission->variance.mat)) {
                GHMM_LOG(LERROR, "Can not calculate inverse of covariance matrix.");
                goto STOP;
              }
              if (0 != ighmm_cholesky_decomposition(emission->sigmacd,
                                                    emission->dimension,
                                                    emission->variance.mat)) {
                GHMM_LOG(LERROR, "Can not calculate cholesky decomposition of covariance matrix.");
                goto STOP;
              }
            }
            multichild = multichild->next;
          }
        }
        i++;
        child = child->next;
      }
      newcstate->fix = stateFixed;
    }

    /* ======== pair hmm state ============================================ */
    if ((!xmlStrcmp(elem->name, BAD_CAST "pair"))) {
    }

    /* -------- background name  ------------------------------------------ */
    if ((!xmlStrcmp(elem->name, BAD_CAST "backgroundKey"))) {

      assert(f->modelType & GHMM_kBackgroundDistributions);

      s = (char *)xmlNodeGetContent(elem);

      /* look up the background distribution by name */
      for (i=0; i<f->model.d[modelNo]->bp->n; i++) {
        if (0 == strcmp(s, f->model.d[modelNo]->bp->name[i])) {
          if (order != f->model.d[modelNo]->bp->order[i]) {
            estr = ighmm_mprintf(NULL, 0, "order of background %s and state %d"
                                 " does not match",
                                 f->model.d[modelNo]->bp->name[i], state);
            GHMM_LOG(LERROR, estr);
            m_free(estr);
            goto STOP;
          } else {
            f->model.d[modelNo]->background_id[state] = i;
            break;
          }
        }
      }
      if (i == f->model.d[modelNo]->bp->n) {
        estr = ighmm_mprintf(NULL, 0, "can't find background with name %s in"
                             " state %d", s, state);
        GHMM_LOG(LERROR, estr);
        m_free(estr);
        goto STOP;
      }
      m_free(s);
      s = NULL;
    }

    /* -------- state label ----------------------------------------------- */
    if ((!xmlStrcmp(elem->name, BAD_CAST "class"))) {

      assert(f->modelType & GHMM_kLabeledStates);

      s = (char *)xmlNodeGetContent(elem);
      label = atoi(s);
      m_free(s);
      s = NULL;
      if ((f->modelType & PTR_TYPE_MASK) == GHMM_kDiscreteHMM) {
        if (f->model.d[modelNo]->label_alphabet->size > label)
          f->model.d[modelNo]->label[state] = label;
        else
          GHMM_LOG(LWARN, "Invalid label");
      }
    }

    /* -------- tied to --------------------------------------------------- */
    if ((!xmlStrcmp(elem->name, BAD_CAST "tiedTo"))) {

      assert(f->modelType & GHMM_kTiedEmissions);

      s = (char *)xmlNodeGetContent(elem);
      tied = atoi(s);
      /* the tie group leader must be a valid state index that is not larger
         than the state itself; a negative value would index tied_to[] out of
         bounds */
      if (tied >= 0 && state>=tied) {
        f->model.d[modelNo]->tied_to[state] = tied;
        if (f->model.d[modelNo]->tied_to[tied] != tied) {
          estr = ighmm_mprintf(NULL, 0, "state %d not tied to tie group leader", state);
          GHMM_LOG(LERROR, estr);
          m_free(estr);
          goto STOP;
        }
      } else {
        estr = ighmm_mprintf(NULL, 0, "state %d tiedTo (%d) is invalid", state, tied);
        GHMM_LOG(LERROR, estr);
        m_free(estr);
        goto STOP;
      }
      m_free(s);
      s = NULL;
    }

    /* -------- position for graphical editing ---------------------------- */
    if ((!xmlStrcmp(elem->name, BAD_CAST "position"))) {
      curX = getIntAttribute(elem, "x", &error);
      if (error)
        GHMM_LOG(LWARN, "failed to read x position");
      curY = getIntAttribute(elem, "y", &error);
      if (error)
        GHMM_LOG(LWARN, "failed to read y position");

      switch (f->modelType & PTR_TYPE_MASK) {
      case GHMM_kDiscreteHMM:
        f->model.d[modelNo]->s[state].xPosition = curX;
        f->model.d[modelNo]->s[state].yPosition = curY;
        break;
      case GHMM_kDiscreteHMM+GHMM_kTransitionClasses:
        f->model.ds[modelNo]->s[state].xPosition = curX;
        f->model.ds[modelNo]->s[state].yPosition = curY;
        break;
      case GHMM_kDiscreteHMM+GHMM_kPairHMM:
      case GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses:
        f->model.dp[modelNo]->s[state].xPosition = curX;
        f->model.dp[modelNo]->s[state].yPosition = curY;
        break;
      case GHMM_kContinuousHMM:
      case GHMM_kContinuousHMM+GHMM_kTransitionClasses:
      case (GHMM_kContinuousHMM+GHMM_kMultivariate):
      case (GHMM_kContinuousHMM+GHMM_kMultivariate+GHMM_kTransitionClasses):
        f->model.c[modelNo]->s[state].xPosition = curX;
        f->model.c[modelNo]->s[state].yPosition = curY;
        break;
      default:
        GHMM_LOG(LERROR, "invalid modelType");
        goto STOP;
      }
    }

    elem = elem->next;
  }

  /* a description that no branch stored in the model would leak otherwise */
  if (desc && !descOwned) {
    m_free(desc);
  }

  return 0;
STOP:
  /* release only what this function still owns; every pointer is checked
     because each of them may legitimately be NULL here */
  if (s) {
    m_free(s);
  }
  if (desc && !descOwned) {
    m_free(desc);
  }
  if (emissions) {
    m_free(emissions);
  }
  return -1;
#undef CUR_PROC
}
/**
   Trains the ghmm_dmodel with a set of annotated sequences till convergence using
   gradient descent.
   Model must not have silent states. (checked in Python wrapper)

   The learning rate is adapted while training: it is increased after a
   significant improvement and decreased whenever a step does not improve the
   performance or yields a degenerated model. In the latter cases the model
   passed in is freed and replaced by a copy of the last good model, so the
   caller must continue with the returned pointer and must not use the
   pointer it passed in.

   NOTE(review): when a training step fails NULL is returned and the current
   model is not released; this cannot be reported differently without changing
   the interface.

   @return            trained model on success (also when convergence is
                      detected), NULL pointer on error
   @param mo:         pointer to a ghmm_dmodel
   @param sq:         struct of annotated sequences
   @param eta:        initial parameter eta (learning rate)
   @param no_steps    number of training steps
 */
ghmm_dmodel* ghmm_dmodel_label_gradient_descent (ghmm_dmodel* mo, ghmm_dseq * sq, double eta, int no_steps)
{
#define CUR_PROC "ghmm_dmodel_label_gradient_descent"

  char * str;
  int runs = 0;
  double cur_perf, last_perf;
  ghmm_dmodel *last;

  /* "last" always holds the best model seen so far */
  last = ghmm_dmodel_copy(mo);
  /* assumes ghmm_dmodel_copy reports failure with a NULL pointer */
  if (!last)
    return NULL;
  last_perf = compute_performance (last, sq);

  while (eta > GHMM_EPS_PREC && runs < no_steps) {
    runs++;
    if (-1 == gradient_descent_onestep(mo, sq, eta)) {
      ghmm_dmodel_free(&last);
      return NULL;
    }
    cur_perf = compute_performance(mo, sq);

    if (last_perf < cur_perf) {
      /* if model is degenerated, lower eta and try again */
      if (cur_perf > 0.0) {
        str = ighmm_mprintf(NULL, 0, "current performance = %g", cur_perf);
        GHMM_LOG(LINFO, str);
        m_free(str);
        ghmm_dmodel_free(&mo);
        mo = ghmm_dmodel_copy(last);
        if (!mo) {
          ghmm_dmodel_free(&last);
          return NULL;
        }
        eta *= .5;
      }
      else {
        /* Improvement insignificant, assume convergence */
        if (fabs (last_perf - cur_perf) < cur_perf * (-1e-8)) {
          ghmm_dmodel_free(&last);
          str = ighmm_mprintf(NULL, 0, "convergence after %d steps.", runs);
          GHMM_LOG(LINFO, str);
          m_free(str);
          /* convergence is success: hand back the trained model instead of
             a NULL pointer, which callers interpret as an error */
          return mo;
        }

        if (runs < 175 || 0 == runs % 50) {
          str = ighmm_mprintf(NULL, 0, "Performance: %g\t improvement: %g\t step %d",
                              cur_perf, cur_perf - last_perf, runs);
          GHMM_LOG(LINFO, str);
          m_free(str);
        }

        /* significant improvement, next iteration */
        ghmm_dmodel_free(&last);
        last = ghmm_dmodel_copy(mo);
        if (!last) {
          /* no snapshot available to continue with; the current model is
             the best one known */
          return mo;
        }
        last_perf = cur_perf;
        eta *= 1.07;
      }
    }
    /* no improvement */
    else {

      if (runs < 175 || 0 == runs % 50) {
        str = ighmm_mprintf(NULL, 0, "Performance: %g\t !IMPROVEMENT: %g\t step %d",
                            cur_perf, cur_perf - last_perf, runs);
        GHMM_LOG(LINFO, str);
        m_free(str);
      }

      /* try another training step */
      runs++;
      eta *= .85;
      if (-1 == gradient_descent_onestep(mo, sq, eta)) {
        ghmm_dmodel_free(&last);
        return NULL;
      }
      cur_perf = compute_performance (mo, sq);
      str = ighmm_mprintf(NULL, 0, "Performance: %g\t ?Improvement: %g\t step %d",
                          cur_perf, cur_perf - last_perf, runs);
      GHMM_LOG(LINFO, str);
      m_free(str);

      /* improvement, save and proceed with next iteration */
      if (last_perf < cur_perf && cur_perf < 0.0) {
        ghmm_dmodel_free (&last);
        last = ghmm_dmodel_copy(mo);
        if (!last) {
          /* no snapshot available to continue with; the current model is
             the best one known */
          return mo;
        }
        last_perf = cur_perf;
      }
      /* still no improvement, revert to saved model */
      else {
        /* the extra step is not counted */
        runs--;
        ghmm_dmodel_free(&mo);
        mo = ghmm_dmodel_copy(last);
        if (!mo) {
          ghmm_dmodel_free(&last);
          return NULL;
        }
        eta *= .9;
      }
    }
  }

  ghmm_dmodel_free(&last);
  return mo;

#undef CUR_PROC
}
/*============================================================================*/
/**
   Computes a labeling for an observation sequence by propagating lists of
   label hypotheses through the sequence and keeping, per state, at most the
   k most probable ones (see the numbered steps in the body).

   Log probabilities are handled with the convention that 1.0 stands for
   log(0) (-INF); valid values are compared against KBEST_EPS.

   The function terminates the process with exit(1) when an allocation fails
   (label STOP), when a gamma value becomes positive, or when no hypothesis is
   chosen at some position.

   @param mo       model; its states, labels, orders and emission
                   probabilities are read. update_emission_history() is
                   called on it for every position, so it is presumably
                   modified - TODO confirm
   @param o_seq    observation sequence
   @param seq_len  length of o_seq
   @param k        number of hypotheses kept per state
   @param log_p    receives the summed log probability of the returned
                   hypothesis, or 1.0 if no valid hypothesis was found; it is
                   not written when seq_len <= 0 or k <= 0
   @return newly allocated array of seq_len labels, or NULL if seq_len <= 0,
           k <= 0 or no valid hypothesis was found
 */
int *ghmm_dmodel_label_kbest (ghmm_dmodel * mo, int *o_seq, int seq_len, int k, double *log_p)
{
#define CUR_PROC "ghmm_dl_kbest"
  int i, t, c, l, m;            /* counters */
  int no_oldHyps;               /* number of hypotheses until position t-1 */
  int b_index, i_id;            /* index for addressing states' b arrays */
  int no_labels = 0;
  int exists, g_nr;
  int *states_wlabel;
  int *label_max_out;
  char *str;

  /* logarithmized transition matrix A, log(a(i,j)) => log_a[i*N+j],
     1.0 for zero probability */
  double **log_a;

  /* matrix of hypotheses, holds for every position in the sequence a list
     of hypotheses */
  hypoList **h;
  hypoList *hP;

  /* vectors for rows in the matrices */
  int *hypothesis;

  /* pointer & prob. of the k most probable hypotheses for each state
     - matrices of dimensions #states x k:  argm(i,l) => argmaxs[i*k+l] */
  double *maxima;
  hypoList **argmaxs;

  /* pointer to & probability of most probable hypothesis in a certain state */
  hypoList *argmax;
  double sum;

  /* break if sequence empty or k<1: */
  if (seq_len <= 0 || k <= 0)
    return NULL;

  ARRAY_CALLOC (h, seq_len);

  /* 1. Initialization (extend empty hypothesis to #labels hypotheses of
     length 1): */

  /* get number of labels (= maximum label + 1)
     and number of states with those labels */
  /* NOTE(review): both arrays are allocated with mo->N entries and indexed by
     label, which assumes every label is smaller than mo->N - confirm */
  ARRAY_CALLOC (states_wlabel, mo->N);
  ARRAY_CALLOC (label_max_out, mo->N);
  for (i = 0; i < mo->N; i++) {
    c = mo->label[i];
    states_wlabel[c]++;
    if (c > no_labels)
      no_labels = c;
    if (mo->s[i].out_states > label_max_out[c])
      label_max_out[c] = mo->s[i].out_states;
  }
  /* add one to the maximum label to get the number of labels */
  no_labels++;
  ARRAY_REALLOC (states_wlabel, no_labels);
  ARRAY_REALLOC (label_max_out, no_labels);

  /* initialize h: */
  /* one hypothesis per label that has a state with initial probability above
     KBEST_EPS; states sharing a label are collected in that hypothesis'
     gamma list */
  hP = h[0];
  for (i = 0; i < mo->N; i++) {
    if (mo->s[i].pi > KBEST_EPS) {
      /* printf("Found State %d with initial probability %f\n", i, mo->s[i].pi); */
      exists = 0;
      while (hP != NULL) {
        if (hP->hyp_c == mo->label[i]) {
          /* add entry to the gamma list */
          g_nr = hP->gamma_states;
          hP->gamma_id[g_nr] = i;
          hP->gamma_a[g_nr] = log (mo->s[i].pi) + log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
          hP->gamma_states = g_nr + 1;
          exists = 1;
          break;
        }
        else
          hP = hP->next;
      }
      if (!exists) {
        ighmm_hlist_insert (&(h[0]), mo->label[i], NULL);
        /* initialize gamma-array with safe size (number of states with this
           label) and add the first entry */
        ARRAY_MALLOC (h[0]->gamma_a, states_wlabel[mo->label[i]]);
        ARRAY_MALLOC (h[0]->gamma_id, states_wlabel[mo->label[i]]);
        h[0]->gamma_id[0] = i;
        h[0]->gamma_a[0] = log (mo->s[i].pi) + log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
        h[0]->gamma_states = 1;
        h[0]->chosen = 1;
      }
      /* restart the search at the head of the list for the next state */
      hP = h[0];
    }
  }
  /* reallocating the gamma list to the real size */
  hP = h[0];
  while (hP != NULL) {
    ARRAY_REALLOC (hP->gamma_a, hP->gamma_states);
    ARRAY_REALLOC (hP->gamma_id, hP->gamma_states);
    hP = hP->next;
  }

  /* calculate transition matrix with logarithmic values: */
  log_a = kbest_buildLogMatrix (mo->s, mo->N);

  /* initialize temporary arrays: */
  ARRAY_MALLOC (maxima, mo->N * k);     /* for each state save k */
  ARRAY_MALLOC (argmaxs, mo->N * k);

  /*------ Main loop: Cycle through the sequence: ------*/
  for (t = 1; t < seq_len; t++) {

    /* put o_seq[t-1] in emission history: */
    update_emission_history (mo, o_seq[t - 1]);

    /* 2. Propagate hypotheses forward and update gamma: */
    no_oldHyps = ighmm_hlist_prop_forward (mo, h[t - 1], &(h[t]), no_labels, states_wlabel, label_max_out);
    /* printf("t = %d (%d), no of old hypotheses = %d\n", t, seq_len, no_oldHyps); */

    /*-- calculate new gamma: --*/
    hP = h[t];
    /* cycle through list of hypotheses */
    while (hP != NULL) {

      for (i = 0; i < hP->gamma_states; i++) {
        /* if hypothesis hP ends with label of state i:
           gamma(i,c):= log(sum(exp(a(j,i)*exp(oldgamma(j,old_c)))))
           + log(b[i](o_seq[t]))
           else: gamma(i,c):= -INF (represented by 1.0) */
        i_id = hP->gamma_id[i];
        hP->gamma_a[i] = ighmm_log_gamma_sum (log_a[i_id], &mo->s[i_id], hP->parent);
        b_index = get_emission_index (mo, i_id, o_seq[t], t);
        if (b_index < 0) {
          /* no valid emission index: mark the entry as impossible; this is
             only logged when the state's order does not exceed t */
          hP->gamma_a[i] = 1.0;
          if (mo->order[i_id] > t)
            continue;
          else {
            str = ighmm_mprintf (NULL, 0, "i_id: %d, o_seq[%d]=%d\ninvalid emission index!\n", i_id, t, o_seq[t]);
            GHMM_LOG(LCONVERTED, str);
            m_free (str);
          }
        }
        else
          hP->gamma_a[i] += log (mo->s[i_id].b[b_index]);
        /*printf("%g = %g\n", log(mo->s[i_id].b[b_index]), hP->gamma_a[i]); */
        if (hP->gamma_a[i] > 0.0) {
          GHMM_LOG(LCONVERTED, "gamma to large. ghmm_dl_kbest failed\n");
          exit (1);
        }
      }
      hP = hP->next;
    }

    /* 3. Choose the k most probable hypotheses for each state and discard all
       hypotheses that were not chosen: */

    /* initialize temporary arrays: */
    for (i = 0; i < mo->N * k; i++) {
      maxima[i] = 1.0;
      argmaxs[i] = NULL;
    }

    /* cycle through hypotheses & calculate the k most probable hypotheses for
       each state: */
    hP = h[t];
    while (hP != NULL) {
      for (i = 0; i < hP->gamma_states; i++) {
        i_id = hP->gamma_id[i];
        /* skip entries that do not hold a valid log probability */
        if (hP->gamma_a[i] > KBEST_EPS)
          continue;
        /* find first best hypothesis that is worse than current hypothesis: */
        for (l = 0; l < k && maxima[i_id * k + l] < KBEST_EPS && maxima[i_id * k + l] > hP->gamma_a[i]; l++);
        if (l < k) {
          /* for each m>l: m'th best hypothesis becomes (m+1)'th best */
          for (m = k - 1; m > l; m--) {
            argmaxs[i_id * k + m] = argmaxs[i_id * k + m - 1];
            maxima[i_id * k + m] = maxima[i_id * k + m - 1];
          }
          /* save new l'th best hypothesis: */
          maxima[i_id * k + l] = hP->gamma_a[i];
          argmaxs[i_id * k + l] = hP;
        }
      }
      hP = hP->next;
    }

    /* set 'chosen' for all hypotheses from argmaxs array: */
    /* NOTE(review): the entries above are stored as state * k + rank, so the
       best value of the state that entry i belongs to would be
       maxima[(i / k) * k]; the code compares against
       maxima[(i % mo->N) * k] instead - confirm which is intended */
    for (i = 0; i < mo->N * k; i++)
      /* only choose hypotheses whose prob. is at least threshold*max_prob */
      if (maxima[i] != 1.0 && maxima[i] >= KBEST_THRESHOLD + maxima[(i % mo->N) * k])
        argmaxs[i]->chosen = 1;

    /* remove hypotheses that were not chosen from the lists: */
    /* remove all hypotheses till the first chosen one */
    while (h[t] != NULL && 0 == h[t]->chosen)
      ighmm_hlist_remove (&(h[t]));
    /* remove all other not chosen hypotheses */
    if (!h[t]) {
      GHMM_LOG(LCONVERTED, "No chosen hypothesis. ghmm_dl_kbest failed\n");
      exit (1);
    }
    hP = h[t];
    while (hP->next != NULL) {
      if (1 == hP->next->chosen)
        hP = hP->next;
      else
        ighmm_hlist_remove (&(hP->next));
    }
  }
  /* dispose of temporary arrays: */
  m_free(states_wlabel);
  m_free(label_max_out);
  m_free(argmaxs);
  m_free(maxima);
  /* transition matrix is no longer needed from here on */
  for (i=0; i<mo->N; i++)
    m_free(log_a[i]);
  m_free(log_a);

  /* 4. Save the hypothesis with the highest probability over all states: */
  hP = h[seq_len - 1];
  argmax = NULL;
  *log_p = 1.0;                 /* log_p will store log of maximum summed probability */
  while (hP != NULL) {
    /* sum probabilities for each hypothesis over all states: */
    sum = ighmm_cvector_log_sum (hP->gamma_a, hP->gamma_states);
    /* and select maximum sum */
    if (sum < KBEST_EPS && (*log_p == 1.0 || sum > *log_p)) {
      *log_p = sum;
      argmax = hP;
    }
    hP = hP->next;
  }

  /* found a valid path? */
  if (*log_p < KBEST_EPS) {
    /* yes: extract chosen hypothesis: */
    /* labels are read back to front by following the parent pointers */
    ARRAY_MALLOC (hypothesis, seq_len);
    for (i = seq_len - 1; i >= 0; i--) {
      hypothesis[i] = argmax->hyp_c;
      argmax = argmax->parent;
    }
  }
  else
    /* no: return 1.0 representing -INF and an empty hypothesis */
    hypothesis = NULL;

  /* dispose of calculation matrices: */
  /* only the list of the last position is removed explicitly here;
     presumably ighmm_hlist_remove also takes care of the ancestors -
     TODO confirm */
  hP = h[seq_len - 1];
  while (hP != NULL)
    ighmm_hlist_remove (&hP);
  free (h);
  return hypothesis;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ghmm_dl_kbest failed\n");
  exit (1);
#undef CUR_PROC
}