/* ========================================================================= */ static char * doubleArrayToCSV(double * array, int size) { #define CUR_PROC "doubleArrayToCSV" int i, pos=0; char *csv=NULL; int singlelength = (2 + /* comma and space */ 8 + /* 8 signifcant digits */ 1 + /* sign */ 5 + /* 'E' and signed mantissa */ 3); /* safety */ int maxlength = size * singlelength; ARRAY_MALLOC(csv, maxlength); for (i=0; i < size-1 && pos + singlelength < maxlength; i++) { pos += sprintf(csv+pos, "%.8g, ", array[i]); } if (i < size-1 || pos + singlelength > maxlength) { GHMM_LOG(LERROR, "writing CSV failed"); goto STOP; } else { pos += sprintf(csv+pos, "%.8g", array[i]); } /*printf("%d bytes of %d written\n", pos, maxlength);*/ return csv; STOP: free(csv); return NULL; #undef CUR_PROC }
/**
 * Allocates the work matrices used by the gradient descent.
 *
 * @param matrix_b   receives an array of mo->N rows, row i holding
 *                   ghmm_ipow(mo, mo->M, mo->order[i] + 1) zeroed doubles
 * @param matrix_a   receives a zeroed array of mo->N * mo->N doubles,
 *                   matrix_a(i,j) = matrix_a[i*mo->N+j]
 * @param matrix_pi  receives a zeroed array of mo->N doubles
 * @param mo         model that determines the dimensions
 * @return 0 on success, -1 if an allocation failed; in the latter case
 *         all three output pointers are reset to NULL
 */
static int gradient_descent_galloc (double ***matrix_b, double **matrix_a,
                                    double **matrix_pi, ghmm_dmodel * mo)
{
#define CUR_PROC "gradient_descent_galloc"

  int i;

  /* the cleanup at STOP must never see indeterminate pointers */
  *matrix_b = NULL;
  *matrix_a = NULL;
  *matrix_pi = NULL;

  /* first allocate memory for matrix_b; the row pointers are zeroed so
     that rows which were not reached before a failure are NULL */
  ARRAY_CALLOC (*matrix_b, mo->N);
  for (i = 0; i < mo->N; i++)
    ARRAY_CALLOC ((*matrix_b)[i], ghmm_ipow (mo, mo->M, mo->order[i] + 1));

  /* matrix_a(i,j) = matrix_a[i*mo->N+j] */
  ARRAY_CALLOC (*matrix_a, mo->N * mo->N);

  /* allocate memory for matrix_pi */
  ARRAY_CALLOC (*matrix_pi, mo->N);

  return 0;

STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  /* NOTE(review): gradient_descent_gfree is now only handed NULL or valid
     pointers; presumably it tolerates NULL entries - confirm */
  gradient_descent_gfree (*matrix_b, *matrix_a, *matrix_pi, mo->N);
  /* do not hand released memory back to the caller */
  *matrix_b = NULL;
  *matrix_a = NULL;
  *matrix_pi = NULL;
  return -1;

#undef CUR_PROC
}
ghmm_dpseq * ghmm_dpseq_init(int length, int number_of_alphabets, int number_of_d_seqs) { #define CUR_PROC "ghmm_dpseq_init" ghmm_dpseq * seq; ARRAY_MALLOC (seq, 1); seq->length = length; seq->number_of_alphabets = number_of_alphabets; seq->number_of_d_seqs = number_of_d_seqs; seq->seq = NULL; seq->d_value = NULL; if (number_of_alphabets > 0) { seq->seq = ighmm_dmatrix_alloc(number_of_alphabets, length); if (!(seq->seq)) goto STOP; } if (number_of_d_seqs > 0) { seq->d_value = ighmm_cmatrix_alloc(number_of_d_seqs, length); if (!(seq->d_value)) goto STOP; } return seq; STOP: /* Label STOP from ARRAY_[CM]ALLOC */ ghmm_dpseq_free(seq); return NULL; #undef CUR_PROC }
/**
 * Allocates the containers of a ghmm_sample_data for the given model:
 * an mo->N x mo->N transition matrix and, for each of the mo->N states,
 * an array with mo->M[i] entries.
 *
 * @param mo    model that determines the dimensions
 * @param data  structure whose transition and state_data members are set
 * @return 0 on success, -1 if an allocation failed. On failure members
 *         that were already allocated are left in data.
 */
int ghmm_alloc_sample_data(ghmm_bayes_hmm *mo, ghmm_sample_data *data){
#define CUR_PROC "ghmm_alloc_sample_data"
  /* XXX must do alloc matrices for dim >1 */
  int i;

  data->transition = ighmm_cmatrix_alloc(mo->N, mo->N);
  /* the matrix allocation was previously used unchecked */
  if (!(data->transition))
    goto STOP;

  ARRAY_MALLOC(data->state_data, mo->N);
  for(i = 0; i < mo->N; i++){
    ARRAY_MALLOC(data->state_data[i], mo->M[i]);
    /* TODO: only needed for dim >1: allocate the emission data of every
       entry of data->state_data[i] (ghmm_alloc_emission_data) */
  }
  return 0;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  return -1;
#undef CUR_PROC
}
/**
   Computes log(sum_j exp(log_a[j] + gamma_j)) over the incoming states j of
   s, where gamma_j is the gamma value the parent hypothesis holds for the
   j-th incoming state. The value 1.0 is used as marker for "no value";
   incoming states without an entry in the parent's gamma list are skipped.
   @return logarithm of the sum, or 1.0 if no incoming state contributes
   @param log_a:  logarithmic transition values, indexed like s->in_id
   @param s:      ghmm_dstate whose gamma-value is calculated
   @param parent: a pointer to the parent hypothesis
   Terminates the program if the temporary array can not be allocated.
 */
static double ighmm_log_gamma_sum (double *log_a, ghmm_dstate * s,
                                   hypoList * parent)
{
#define CUR_PROC "ighmm_log_gamma_sum"
  double total;
  int in, g, wanted;
  double best = 1.0;
  int best_pos = 0;
  double *terms;

  /* a parent with a single gamma entry needs no summation */
  if (1 == parent->gamma_states) {
    for (in = 0; in < s->in_states; in++) {
      if (s->in_id[in] == parent->gamma_id[0])
        return parent->gamma_a[0] + log_a[in];
    }
  }

  ARRAY_MALLOC (terms, s->in_states);

  /* collect the logarithmic summands and remember the largest one */
  for (in = 0; in < s->in_states; in++) {
    wanted = s->in_id[in];

    /* position of the incoming state in the parent's gamma list */
    g = 0;
    while (g < parent->gamma_states && parent->gamma_id[g] != wanted)
      g++;

    if (g == parent->gamma_states) {
      /* not part of the parent hypothesis */
      terms[in] = 1.0;
      continue;
    }

    terms[in] = log_a[in] + parent->gamma_a[g];
    if (best == 1.0 || (terms[in] > best && terms[in] != 1.0)) {
      best = terms[in];
      best_pos = in;
    }
  }

  /* best + log(1 + sum over all other valid summands of exp(term - best)) */
  total = 1.0;
  for (in = 0; in < s->in_states; in++) {
    if (in == best_pos || terms[in] == 1.0)
      continue;
    total += exp (terms[in] - best);
  }
  total = log (total);
  total += best;

  free (terms);
  return total;

STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ighmm_log_gamma_sum failed\n");
  exit (1);
#undef CUR_PROC
}
/*===========================================================================*/
/**
 * Parses one background distribution node and appends it to the background
 * distributions of discrete model modelNo (bp->n is incremented).
 *
 * Reads the optional attributes "order" (default 0) and "rev" (default 0),
 * the attribute "key" as name and the node content as CSV list of
 * m^(order+1) probabilities.
 *
 * @param doc      not used
 * @param cur      node of the background distribution
 * @param f        file structure holding the models
 * @param modelNo  index of the model in f->model.d
 * @return 0 on success, -1 on error
 */
static int parseBackground(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f, int modelNo) {
#define CUR_PROC "parseBackground"

  int error, order;
  int bgNr, rev;
  int i, entries;
  double *b = NULL;
  char *s = NULL;

  assert(f->modelType & GHMM_kDiscreteHMM);

  bgNr = f->model.d[modelNo]->bp->n++;

  /* get order */
  order = getIntAttribute(cur, "order", &error);
  if (error)
    order=0;
  else if (order && !(f->modelType & GHMM_kHigherOrderEmissions)) {
    GHMM_LOG(LERROR, "background distribution has order > 0, but model is not higher order");
    goto STOP;
  }
  f->model.d[modelNo]->bp->order[bgNr] = order;

  /* get name; the string is stored in the model */
  s = (char *)getXMLCharAttribute(cur, "key", &error);
  f->model.d[modelNo]->bp->name[bgNr] = s;

  rev = getIntAttribute(cur, "rev", &error);
  if (error)
    rev = 0;

  /* number of probabilities is m^(order+1); computed with integers because
     the floating point result of pow() is not a reliable element count */
  entries = 1;
  for (i = 0; i <= order; i++)
    entries *= f->model.d[modelNo]->bp->m;

  /* get distribution */
  s = (char *)xmlNodeGetContent(cur);

  ARRAY_MALLOC(b, entries);
  if (-1 != parseCSVList(s, entries, b, rev))
    f->model.d[modelNo]->bp->b[bgNr] = b;
  else {
    GHMM_LOG(LERROR, "Can not parse background CSV list.");
    goto STOP;
  }
  free(s);

  return 0;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  /* b is still NULL if the error occurred before its allocation */
  if (b) {
    m_free(b);
  }
  free(s);
  return -1;
#undef CUR_PROC
}
/*===========================================================================*/
/**
 * Builds an alphabet from the "symbol" children of an alphabet node.
 *
 * The "code" attributes of the symbols have to be consecutive and start
 * with 0. The content of every symbol node becomes the symbol string.
 *
 * @param doc  not used
 * @param cur  alphabet node whose children are scanned
 * @param f    not used
 * @return newly allocated alphabet, or NULL if a code is missing or out of
 *         sequence or if an allocation failed
 */
static ghmm_alphabet * parseAlphabet(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f) {
#define CUR_PROC "parseAlphabet"

  char * str;
  int M, code, error;

  xmlNodePtr symbol;
  ghmm_alphabet * alfa = NULL;

  ARRAY_CALLOC(alfa, 1);

  /* first pass: count the symbols and verify their codes */
  symbol = cur->children;
  M=0;
  while (symbol!=NULL) {
    if ((!xmlStrcmp(symbol->name, BAD_CAST "symbol"))) {
      code = getIntAttribute(symbol, "code", &error);
      if (error || code!=M) {
        str = ighmm_mprintf(NULL, 0, "non consecutive code %d == %d", code, M);
        GHMM_LOG(LERROR, str);
        m_free(str);
        goto STOP;
      } else
        M++;
    }
    symbol=symbol->next;
  }

  alfa->size = M;
  /*printf("Parsing alphabet with %d symbols\n", alfa->size);*/
  ARRAY_MALLOC(alfa->symbols, M);

  /* second pass: fetch the symbol strings */
  symbol = cur->children;
  M=0;
  while (symbol!=NULL) {
    if ((!xmlStrcmp(symbol->name, BAD_CAST "symbol"))) {
      alfa->symbols[M++] = (char *)xmlNodeGetContent(symbol);
      /*printf("%d. symbol: %s\n", M, alfa->symbols[M-1]);*/
    }
    symbol=symbol->next;
  }

  return alfa;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  /* alfa is NULL if its own allocation failed, and its symbol array is
     NULL on every path that gets here before the array was allocated */
  if (alfa) {
    if (alfa->symbols) {
      m_free(alfa->symbols);
    }
    m_free(alfa);
  }
  return NULL;
#undef CUR_PROC
}
/*===========================================================================*/
/**
 * Parses one state node and stores its contents in model modelNo of f.
 *
 * The attributes "id" and "initial" (required) and "desc" are read from the
 * state node. The children "silent", "discrete", "mixture", "pair",
 * "backgroundKey", "class", "tiedTo" and "position" are processed according
 * to f->modelType.
 *
 * Ownership: the description string and the emission array are handed over
 * to the model as soon as they are stored in it. They are not released by
 * this function afterwards, not even on error.
 *
 * @param doc        not used
 * @param cur        state node
 * @param f          file structure holding the models
 * @param inDegree   number of incoming transitions per state, used when
 *                   allocating continuous states
 * @param outDegree  number of outgoing transitions per state, used when
 *                   allocating continuous states
 * @param modelNo    index of the model in f->model
 * @return 0 on success, -1 on error
 */
static int parseState(xmlDocPtr doc, xmlNodePtr cur, ghmm_xmlfile* f, int * inDegree, int * outDegree, int modelNo) {
#define CUR_PROC "parseState"

  int i, error, order=0, state=-1442, fixed=-985, tied=-9354, M, aprox, label;
  int curX=0, curY=0;
  double pi, prior;
  double *emissions = NULL;
  char *desc = NULL;
  int descStored = 0;   /* set once desc has been handed to the model */
  char *s = NULL, *estr;
  int rev, stateFixed=1;
  ghmm_cstate *newcstate;
  ghmm_c_emission *emission;

  xmlNodePtr elem, child, multichild;

  state = getIntAttribute(cur, "id", &error);
  pi = getDoubleAttribute(cur, "initial", &error);
  if (error) {
    estr = ighmm_mprintf(NULL, 0, "can't read required intial probability for"
                         "state %d", state);
    GHMM_LOG(LERROR, estr);
    m_free(estr);
    goto STOP;
  } else

  desc = (char *)xmlGetProp(cur, BAD_CAST "desc");

  elem = cur->children;
  while (elem!=NULL) {

    /* ======== silent state ============================================== */
    if ((!xmlStrcmp(elem->name, BAD_CAST "silent"))) {
      switch (f->modelType & PTR_TYPE_MASK) {
      case (GHMM_kDiscreteHMM):
        f->model.d[modelNo]->silent[state] = 1;
        break;
      case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
        f->model.ds[modelNo]->silent[state] = 1;
        break;
      case (GHMM_kDiscreteHMM+GHMM_kPairHMM):
      case (GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses):
        f->model.dp[modelNo]->silent[state] = 1;
        break;
      default:
        GHMM_LOG(LERROR, "invalid modelType");
        goto STOP;
      }
    }

    /* ======== discrete state (possible higher order) ==================== */
    if ((!xmlStrcmp(elem->name, BAD_CAST "discrete"))) {
      assert((f->modelType & GHMM_kDiscreteHMM) && ((f->modelType & GHMM_kPairHMM) == 0));

      /* fixed is a propety of the distribution and optional */
      fixed = getIntAttribute(elem, "fixed", &error);
      if (error)
        fixed = 0;

      /* order is optional for discrete */
      if (f->modelType & GHMM_kHigherOrderEmissions) {
        order = getIntAttribute(elem, "order", &error);
        if (error)
          order = 0;
      }

      /* NOTE(review): "rev" is read from the state node (cur), not from
         the discrete element - confirm that this is intended */
      rev = getIntAttribute(cur, "rev", &error);
      if (error)
        rev = 0;

      /* parsing emission probabilities */
      s = (char *)xmlNodeGetContent(elem);

      switch (f->modelType & PTR_TYPE_MASK) {

      case (GHMM_kDiscreteHMM):
        f->model.d[modelNo]->s[state].desc = desc;
        descStored = 1;
        f->model.d[modelNo]->s[state].pi = pi;
        f->model.d[modelNo]->s[state].fix = fixed;
        if (f->modelType & GHMM_kHigherOrderEmissions) {
          f->model.d[modelNo]->order[state] = order;
          if (f->model.d[modelNo]->maxorder < order) {
            f->model.d[modelNo]->maxorder = order;
            estr = ighmm_mprintf(NULL, 0, "Updated maxorder to %d\n",
                                 f->model.d[modelNo]->maxorder);
            GHMM_LOG(LDEBUG, estr);
            m_free(estr);
          }
        }
        ARRAY_MALLOC(emissions, pow(f->model.d[modelNo]->M, order+1));
        parseCSVList(s, pow(f->model.d[modelNo]->M, order+1), emissions, rev);
        free(f->model.d[modelNo]->s[state].b);
        f->model.d[modelNo]->s[state].b = emissions;
        /* the model owns the array now, STOP must not release it */
        emissions = NULL;
        break;

      case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
        f->model.ds[modelNo]->s[state].desc = desc;
        descStored = 1;
        f->model.ds[modelNo]->s[state].pi = pi;
        f->model.ds[modelNo]->s[state].fix = fixed;
        if (f->modelType & GHMM_kHigherOrderEmissions)
          f->model.ds[modelNo]->order[state] = order;
        ARRAY_MALLOC(emissions, pow(f->model.ds[modelNo]->M, order+1));
        parseCSVList(s, pow(f->model.ds[modelNo]->M, order+1), emissions, rev);
        f->model.ds[modelNo]->s[state].b = emissions;
        /* the model owns the array now, STOP must not release it */
        emissions = NULL;
        break;

      default:
        GHMM_LOG(LERROR, "invalid modelType");
        goto STOP;
      }
      m_free(s);
    }

    /* ======== continuous state ========================================== */
    if ((!xmlStrcmp(elem->name, BAD_CAST "mixture"))) {
      assert(f->modelType & GHMM_kContinuousHMM);

      /* count the densities of the mixture */
      M = 0;
      child = elem->children;
      while (child != NULL) {
        if ((!xmlStrcmp(child->name, BAD_CAST "normal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalLeftTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalRightTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "multinormal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "uniform"))){
          M ++;
        }
        child = child->next;
      }
      ghmm_cstate_alloc(f->model.c[modelNo]->s + state, M, inDegree[state],
                        outDegree[state], f->model.c[modelNo]->cos);
      newcstate = f->model.c[modelNo]->s + state;

      newcstate->desc = desc;
      descStored = 1;
      newcstate->M = M;
      newcstate->pi = pi;

      if( f->model.c[modelNo]->M < M)
        f->model.c[modelNo]->M = M;

      child = elem->children;

      i = 0;
      while (child != NULL) {

        emission = newcstate->e+i;

        /* common attributes */
        if ((!xmlStrcmp(child->name, BAD_CAST "normal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalLeftTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "normalRightTail")) ||
            (!xmlStrcmp(child->name, BAD_CAST "multinormal")) ||
            (!xmlStrcmp(child->name, BAD_CAST "uniform"))){
          fixed = getIntAttribute(child, "fixed", &error);
          if (error)
            fixed = 0;
          /* the state is fixed only if all of its densities are fixed */
          stateFixed = fixed && stateFixed;
          /* allocate emission */
          emission->fixed = fixed;

          prior = getDoubleAttribute(child, "prior", &error);
          if (error)
            prior = 1.0;
          newcstate->c[i] = prior;
        }
        /* child is not a density, continue with the next child */
        else {
          child = child->next;
          continue;
        }

        /* density type dependent attributes */
        if ((!xmlStrcmp(child->name, BAD_CAST "normal"))) {
          emission->mean.val = getDoubleAttribute(child, "mean", &error);
          emission->variance.val = getDoubleAttribute(child, "variance", &error);
          /* should the normal distribution be approximated? */
          aprox = getIntAttribute(child, "approx", &error);
          if (error)
            aprox = 0;
          emission->type = aprox ? normal_approx : normal;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "normalLeftTail"))) {
          emission->mean.val = getDoubleAttribute(child, "mean", &error);
          emission->variance.val = getDoubleAttribute(child, "variance", &error);
          /* the attribute "max" is stored in the member min */
          emission->min = getDoubleAttribute(child, "max", &error);
          emission->type = normal_left;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "normalRightTail"))) {
          emission->mean.val = getDoubleAttribute(child, "mean", &error);
          emission->variance.val = getDoubleAttribute(child, "variance", &error);
          /* the attribute "min" is stored in the member max */
          emission->max = getDoubleAttribute(child, "min", &error);
          emission->type = normal_right;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "uniform"))) {
          emission->max = getDoubleAttribute(child, "max", &error);
          emission->min = getDoubleAttribute(child, "min", &error);
          emission->type = uniform;
          emission->dimension = 1;
          if (f->model.c[modelNo]->dim > 1) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }
        }
        if ((!xmlStrcmp(child->name, BAD_CAST "multinormal"))) {
          emission->type = multinormal;
          emission->dimension = getIntAttribute(child, "dimension", &error);

          /* check that all emissions in all states have same dimension or
             set when first emission is read*/
          if (f->model.c[modelNo]->dim <= 1)
            f->model.c[modelNo]->dim = emission->dimension;
          else if (f->model.c[modelNo]->dim != emission->dimension) {
            GHMM_LOG(LERROR, "All emissions must have same dimension.");
            goto STOP;
          }

          if (0 != ghmm_c_emission_alloc(emission, emission->dimension)) {
            GHMM_LOG(LERROR, "Can not allocate multinormal emission.");
            goto STOP;
          }
          multichild = child->children;
          while (multichild != NULL) {
            if ((!xmlStrcmp(multichild->name, BAD_CAST "mean"))) {
              s = (char *)xmlNodeGetContent(multichild);
              if (-1 == parseCSVList(s, emission->dimension, emission->mean.vec, 0)) {
                GHMM_LOG(LERROR, "Can not parse mean CSV list.");
                goto STOP;
              }
              /* release the node content, it was leaked before */
              m_free(s);
            }
            if ((!xmlStrcmp(multichild->name, BAD_CAST "variance"))) {
              s = (char *)xmlNodeGetContent(multichild);
              if (-1 == parseCSVList(s, emission->dimension * emission->dimension,
                                     emission->variance.mat, 0)) {
                GHMM_LOG(LERROR, "Can not parse variance CSV list.");
                goto STOP;
              }
              /* release the node content, it was leaked before */
              m_free(s);
              if (0 != ighmm_invert_det(emission->sigmainv, &emission->det,
                                        emission->dimension, emission->variance.mat))
              {
                GHMM_LOG(LERROR, "Can not calculate inverse of covariance matrix.");
                goto STOP;
              }
              if (0 != ighmm_cholesky_decomposition(emission->sigmacd,
                                                    emission->dimension,
                                                    emission->variance.mat))
              {
                GHMM_LOG(LERROR, "Can not calculate cholesky decomposition of covariance matrix.");
                goto STOP;
              }
            }
            multichild = multichild->next;
          }
        }
        i++;
        child = child->next;
      }
      newcstate->fix = stateFixed;
    }

    /* ======== pair hmm state ============================================ */
    if ((!xmlStrcmp(elem->name, BAD_CAST "pair"))) {
    }

    /* -------- background name  ------------------------------------------ */
    if ((!xmlStrcmp(elem->name, BAD_CAST "backgroundKey"))) {

      assert(f->modelType & GHMM_kBackgroundDistributions);

      s = (char *)xmlNodeGetContent(elem);

      /* look up the background distribution by name */
      for (i=0; i<f->model.d[modelNo]->bp->n; i++) {
        if (0 == strcmp(s, f->model.d[modelNo]->bp->name[i])) {
          if (order != f->model.d[modelNo]->bp->order[i]) {
            estr = ighmm_mprintf(NULL, 0, "order of background %s and state %d"
                                 " does not match",
                                 f->model.d[modelNo]->bp->name[i], state);
            GHMM_LOG(LERROR, estr);
            m_free(estr);
            goto STOP;
          } else {
            f->model.d[modelNo]->background_id[state] = i;
            break;
          }
        }
      }
      if (i == f->model.d[modelNo]->bp->n) {
        estr = ighmm_mprintf(NULL, 0, "can't find background with name %s in"
                             " state %d", s, state);
        GHMM_LOG(LERROR, estr);
        m_free(estr);
        goto STOP;
      }
      m_free(s);
    }

    /* -------- state label (class) --------------------------------------- */
    if ((!xmlStrcmp(elem->name, BAD_CAST "class"))) {

      assert(f->modelType & GHMM_kLabeledStates);

      s = (char *)xmlNodeGetContent(elem);
      label = atoi(s);
      m_free(s);
      if ((f->modelType & PTR_TYPE_MASK) == GHMM_kDiscreteHMM) {
        if (f->model.d[modelNo]->label_alphabet->size > label)
          f->model.d[modelNo]->label[state] = label;
        else
          GHMM_LOG(LWARN, "Invalid label");
      }
    }

    /* -------- tied to --------------------------------------------------- */
    if ((!xmlStrcmp(elem->name, BAD_CAST "tiedTo"))) {

      assert(f->modelType & GHMM_kTiedEmissions);

      s = (char *)xmlNodeGetContent(elem);
      tied = atoi(s);
      /* a state may only be tied to itself or to a state with smaller id
         which is the leader of its tie group */
      if (state>=tied) {
        f->model.d[modelNo]->tied_to[state] = tied;
        if (f->model.d[modelNo]->tied_to[tied] != tied) {
          estr = ighmm_mprintf(NULL, 0, "state %d not tied to tie group leader", state);
          GHMM_LOG(LERROR, estr);
          m_free(estr);
          goto STOP;
        }
      } else {
        estr = ighmm_mprintf(NULL, 0, "state %d tiedTo (%d) is invalid", state, tied);
        GHMM_LOG(LERROR, estr);
        m_free(estr);
        goto STOP;
      }
      m_free(s);
    }

    /* -------- position for graphical editing ---------------------------- */
    if ((!xmlStrcmp(elem->name, BAD_CAST "position"))) {
      curX = getIntAttribute(elem, "x", &error);
      if (error)
        GHMM_LOG(LWARN, "failed to read x position");
      curY = getIntAttribute(elem, "y", &error);
      if (error)
        GHMM_LOG(LWARN, "failed to read y position");

      switch (f->modelType & PTR_TYPE_MASK) {
      case GHMM_kDiscreteHMM:
        f->model.d[modelNo]->s[state].xPosition = curX;
        f->model.d[modelNo]->s[state].yPosition = curY;
        break;
      case GHMM_kDiscreteHMM+GHMM_kTransitionClasses:
        f->model.ds[modelNo]->s[state].xPosition = curX;
        f->model.ds[modelNo]->s[state].yPosition = curY;
        break;
      case GHMM_kDiscreteHMM+GHMM_kPairHMM:
      case GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses:
        f->model.dp[modelNo]->s[state].xPosition = curX;
        f->model.dp[modelNo]->s[state].yPosition = curY;
        break;
      case GHMM_kContinuousHMM:
      case GHMM_kContinuousHMM+GHMM_kTransitionClasses:
      case (GHMM_kContinuousHMM+GHMM_kMultivariate):
      case (GHMM_kContinuousHMM+GHMM_kMultivariate+GHMM_kTransitionClasses):
        f->model.c[modelNo]->s[state].xPosition = curX;
        f->model.c[modelNo]->s[state].yPosition = curY;
        break;
      default:
        GHMM_LOG(LERROR, "invalid modelType");
        goto STOP;
      }
    }

    elem = elem->next;
  }

  /* a description that no model took over would be leaked otherwise */
  if (desc && !descStored) {
    m_free(desc);
  }

  return 0;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  /* Release only what this function still owns. The successful paths above
     release s with m_free and jump here later on, which presumably relies
     on m_free resetting the pointer - confirm against its definition. */
  if (s) {
    m_free(s);
  }
  if (desc && !descStored) {
    m_free(desc);
  }
  if (emissions) {
    m_free(emissions);
  }
  return -1;
#undef CUR_PROC
}
/**
 * Fills mo with a labeled example model of noStates states over an alphabet
 * of 4 symbols and prints it to stdout.
 *
 * State i has label i%3 and emission order i%2. It has two outgoing
 * transitions with probability 0.5 each: one to itself and one to state
 * i+1 (the last state goes back to state 0). No state is silent.
 *
 * If noStates is not positive nothing is done. If an allocation fails the
 * states built so far are released and mo is left partially initialised
 * (mo->s and mo->silent are not set).
 *
 * @param mo        model to fill
 * @param noStates  number of states
 */
void generateModel(model *mo, int noStates) {
# define CUR_PROC "generateModel"

  state *states = NULL;
  int i, j;
  int n_emissions;
  /* flags indicating whether a state is silent */
  int *silent_array = NULL;

  /* the look-up table below needs at least one entry */
  if (noStates <= 0) {
    fprintf(stderr, "generateModel: number of states must be positive\n");
    return;
  }

  /*allocate memory for states and array of silent flags*/
  ARRAY_MALLOC(states, noStates);
  /* mark all per state arrays as not allocated so that STOP can clean up */
  for (i=0; i < noStates; i++) {
    states[i].b      = NULL;
    states[i].out_id = NULL;
    states[i].in_id  = NULL;
    states[i].out_a  = NULL;
    states[i].in_a   = NULL;
  }
  ARRAY_MALLOC(silent_array, noStates);

  /* initialize all states as none silent*/
  for (i=0; i < noStates; i++) {
    silent_array[i] = 0;
  }

  mo->N = noStates;
  mo->M = 4;
  mo->maxorder = noStates-1;
  mo->prior = -1;
  /* Model has Higher order Emissions and labeled states*/
  mo->model_type =  kLabeledStates;
  if (mo->maxorder>0)
    mo->model_type += kHigherOrderEmissions;
  /* kHigherOrderEmissions + kHasBackgroundDistributions*/

  /* allocate memory for pow look-up table and fill it */
  ARRAY_MALLOC(mo->pow_lookup, mo->maxorder+1);

  mo->pow_lookup[0] = 1;
  for (i=1; i<mo->maxorder+1; i++)
    mo->pow_lookup[i] =  mo->pow_lookup[i-1] * mo->M;

  /*initialize states*/
  for (i=0; i < mo->N; i++) {
    states[i].pi = (0==i ? 1.0:0.0);
    states[i].fix = 0;
    states[i].label = i%3;
    states[i].order = i%2;
    states[i].out_states = 2;
    states[i].in_states = 2;

    /* number of emission probabilities: M^(order+1) */
    n_emissions = 1;
    for (j = 0; j <= states[i].order; j++)
      n_emissions *= mo->M;

    /* allocate memory for the a, the out- and incoming States and b array
       for higher emmission order states*/
    ARRAY_MALLOC(states[i].b, n_emissions);
    ARRAY_MALLOC(states[i].out_id, states[i].out_states);
    ARRAY_MALLOC(states[i].in_id, states[i].in_states);
    ARRAY_MALLOC(states[i].out_a, states[i].out_states);
    ARRAY_MALLOC(states[i].in_a, states[i].in_states);

    for (j = 0; j < n_emissions; j++){
      states[i].b[j] = ( (0==(i+j)%mo->M) ? .6 : .4 / (mo->M-1));
    }

    /* every state has a self transition and one to its successor;
       the last state goes back to the first one */
    if ((mo->N-1)==i) {
      states[i].out_id[0] = 0;
      states[i].out_id[1] = i;
    }
    else {
      states[i].out_id[0] = i;
      states[i].out_id[1] = i+1;
    }

    if (0==i) {
      states[i].in_id[0]  = i;
      states[i].in_id[1]  = mo->N-1;
    }
    else {
      states[i].in_id[1]  = i-1;
      states[i].in_id[0]  = i;
    }

    states[i].out_a[0] = 0.5;
    states[i].out_a[1] = 0.5;
    states[i].in_a[0]  = 0.5;
    states[i].in_a[1]  = 0.5;

#ifdef DEBUG
    printf("State %d goto    : %d, %d\n", i, states[i].out_id[0], states[i].out_id[1]);
    printf("State %d comefrom: %d, %d\n", i, states[i].in_id[0],  states[i].in_id[1]);
    printf("State %d goto    : %g, %g\n", i, states[i].out_a[0], states[i].out_a[1]);
    printf("State %d comefrom: %g, %g\n", i, states[i].in_a[0],  states[i].in_a[1]);
#endif
  }

  mo->s = states;
  mo->silent = silent_array;

#ifdef DEBUG
  for (i = 0; i < mo->N; i++) {
    printf("\n State %d:\n", i);
    for (j = 0; j < pow(mo->M,states[i].order+1); j++){
      printf("%g ",mo->s[i].b[j]);
    }
  }
#endif
  model_print(stdout, mo);

  printf("\n");
  return;

STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  /* release what was built before the failure; the states were never
     attached to mo. mo->pow_lookup stays with mo. */
  if (states) {
    for (i=0; i < noStates; i++) {
      free(states[i].b);
      free(states[i].out_id);
      free(states[i].in_id);
      free(states[i].out_a);
      free(states[i].in_a);
    }
    free(states);
  }
  free(silent_array);
  printf("\n");
# undef CUR_PROC
}
/* ========================================================================= */
/**
 * Writes one "transition" element for every outgoing transition of state
 * sNo of model moNo.
 *
 * Each element gets the attributes "source" and "target" and a child
 * "probability" holding the transition probabilities of all transition
 * classes as CSV list.
 *
 * Writing transitions of pair HMMs is not implemented: a warning is logged
 * and nothing is written.
 *
 * @param writer  XML writer
 * @param f       file structure holding the models
 * @param moNo    index of the model
 * @param sNo     index of the state
 * @return 0 on success, -1 on error
 */
static int writeTransition(xmlTextWriterPtr writer, ghmm_xmlfile* f, int moNo,
                           int sNo) {
#define CUR_PROC "writeTransition"

  int cos = 0, i, j;
  int out_states = 0, * out_id = NULL;
  double * * out_a = NULL;
  double * w_out_a = NULL;
  char * tmp;
  int result = -1;

  /* write state contents for different model types */
  switch (f->modelType & PTR_TYPE_MASK) {
  case GHMM_kDiscreteHMM:
    out_states = f->model.d[moNo]->s[sNo].out_states;
    out_id     = f->model.d[moNo]->s[sNo].out_id;
    /* only one transition class: use the address of the single array so
       that it can be indexed like the arrays of the other models */
    out_a      = &(f->model.d[moNo]->s[sNo].out_a);
    cos        = 1;
    break;
  case (GHMM_kDiscreteHMM+GHMM_kTransitionClasses):
    out_states = f->model.ds[moNo]->s[sNo].out_states;
    out_id     = f->model.ds[moNo]->s[sNo].out_id;
    out_a      = f->model.ds[moNo]->s[sNo].out_a;
    cos        = f->model.ds[moNo]->cos;
    break;
  case (GHMM_kDiscreteHMM+GHMM_kPairHMM):
  case (GHMM_kDiscreteHMM+GHMM_kPairHMM+GHMM_kTransitionClasses):
    /* Not implemented. Previously the code went on with uninitialised
       variables here.
       NOTE(review): confirm that skipping is preferred over failing */
    GHMM_LOG(LWARN, "writing transitions of pair HMMs is not implemented");
    return 0;
  case GHMM_kContinuousHMM:
  case (GHMM_kContinuousHMM+GHMM_kTransitionClasses):
  case (GHMM_kContinuousHMM+GHMM_kMultivariate):
  case (GHMM_kContinuousHMM+GHMM_kMultivariate+GHMM_kTransitionClasses):
    out_states = f->model.c[moNo]->s[sNo].out_states;
    out_id     = f->model.c[moNo]->s[sNo].out_id;
    out_a      = f->model.c[moNo]->s[sNo].out_a;
    cos        = f->model.c[moNo]->cos;
    break;
  default:
    GHMM_LOG(LCRITIC, "invalid modelType");
    goto STOP;
  }

  /* buffer for the probabilities of one transition in all classes */
  ARRAY_MALLOC(w_out_a, cos);

  for (i=0; i<out_states; i++) {
    if (0 > xmlTextWriterStartElement(writer, BAD_CAST "transition")) {
      GHMM_LOG(LERROR, "Error at xmlTextWriterStartElement (transition)");
      goto STOP;
    }

    /* write source id (current state attribute */
    if (0 > xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "source", "%d", sNo))
      GHMM_LOG(LERROR, "failed to write transition source attribute");

    /* write target id as attribute */
    if (0 > xmlTextWriterWriteFormatAttribute(writer, BAD_CAST "target", "%d", out_id[i]))
      GHMM_LOG(LERROR, "failed to write transition target attribute");

    for (j=0; j<cos; j++)
      w_out_a[j] = out_a[j][i];

    tmp = doubleArrayToCSV(w_out_a, cos);
    if (tmp) {
      if (0 > xmlTextWriterWriteElement(writer, BAD_CAST "probability", BAD_CAST tmp)) {
        GHMM_LOG(LERROR, "Error at xmlTextWriterWriteElement (transition probabilities)");
        m_free(tmp);
        goto STOP;
      }
      m_free(tmp);
    } else {
      GHMM_LOG(LERROR, "converting transition probabilities array to CSV failed");
      goto STOP;
    }

    /* end transition */
    if (0 > xmlTextWriterEndElement(writer)) {
      GHMM_LOG(LERROR, "Error at xmlTextWriterEndElement (transition)");
      goto STOP;
    }
  }

  result = 0;

STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  /* the buffer was leaked on every path before; NULL if never allocated */
  free(w_out_a);
  return result;
#undef CUR_PROC
}
/*============================================================================*/
/**
   Extends every hypothesis of the list h by one label.
   For each hypothesis and each label that can be reached through an
   outgoing transition of one of its possible states (gamma value != 1.0),
   a new hypothesis is inserted into *hplus. Its gamma_id list collects the
   reached states with that label and its gamma_a array is allocated zeroed
   with the same length.
   @return number of hypotheses in h
   @param mo:      model
   @param h:       list of hypotheses to extend
   @param hplus:   list that receives the new hypotheses
   @param labels:  number of labels
   @param nr_s:    number of states per label
   @param max_out: maximal number of outgoing transitions per label
   Terminates the program if an allocation fails.
 */
static int ighmm_hlist_prop_forward (ghmm_dmodel * mo, hypoList * h,
                                     hypoList ** hplus, int labels,
                                     int *nr_s, int *max_out)
{
#define CUR_PROC "ighmm_hlist_prop_forward"
  int g, t, lab, pos;
  int from_id, to_id, known;
  int visited = 0, spawned = 0;
  hypoList *cur;
  hypoList *ext;
  hypoList **by_label;

  ARRAY_MALLOC (by_label, labels);

  for (cur = h; cur != NULL; cur = cur->next) {

    /* by_label[lab] != NULL iff cur was already extended with label lab */
    for (lab = 0; lab < labels; lab++)
      by_label[lab] = NULL;

    for (g = 0; g < cur->gamma_states; g++) {
      /* skip impossible states */
      if (cur->gamma_a[g] == 1.0)
        continue;

      from_id = cur->gamma_id[g];
      for (t = 0; t < mo->s[from_id].out_states; t++) {
        to_id = mo->s[from_id].out_id[t];
        lab = mo->label[to_id];
        ext = by_label[lab];

        if (NULL == ext) {
          /* first state with this label: start a new hypothesis */
          ighmm_hlist_insert (hplus, lab, cur);
          by_label[lab] = *hplus;
          /* gamma list with a safe upper bound for its length */
          ARRAY_MALLOC ((*hplus)->gamma_id,
                        m_min (nr_s[lab], cur->gamma_states * max_out[cur->hyp_c]));
          (*hplus)->gamma_id[0] = to_id;
          (*hplus)->gamma_states = 1;
          spawned++;
          continue;
        }

        /* hypothesis exists: append the state unless it is already listed */
        known = ext->gamma_states;
        pos = 0;
        while (pos < known && ext->gamma_id[pos] != to_id)
          pos++;
        if (pos == known) {
          ext->gamma_id[known] = to_id;
          ext->gamma_states = known + 1;
        }
      }
    }

    /* give the gamma arrays of the new hypotheses their final size */
    for (lab = 0; lab < labels; lab++) {
      if (!by_label[lab])
        continue;
      ARRAY_CALLOC (by_label[lab]->gamma_a, by_label[lab]->gamma_states);
      ARRAY_REALLOC (by_label[lab]->gamma_id, by_label[lab]->gamma_states);
      by_label[lab] = NULL;
    }

    visited++;
  }

  /* printf("Created %d new Hypotheses.\n", spawned); */
  free (by_label);
  return (visited);
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ighmm_hlist_prop_forward failed\n");
  exit (1);
#undef CUR_PROC
}
/*============================================================================*/
/**
   Calculates the most probable labeling of an observation sequence with the
   k-best algorithm.
   For every position of the sequence a list of label hypotheses is kept.
   In each step all hypotheses are extended by one label, their gamma values
   are updated and for every state only the k most probable hypotheses are
   kept. The value 1.0 is used as marker for log(0).
   @return array of seq_len labels which the caller has to free, or NULL if
           seq_len or k is not positive or if no valid labeling was found
   @param mo:      model
   @param o_seq:   observation sequence
   @param seq_len: length of the observation sequence
   @param k:       number of hypotheses to keep for every state
   @param log_p:   receives the logarithmic probability of the returned
                   labeling, 1.0 if no valid labeling was found
   Terminates the program if an allocation fails or the calculation runs
   into an invalid state.
 */
int *ghmm_dmodel_label_kbest (ghmm_dmodel * mo, int *o_seq, int seq_len, int k,
                              double *log_p)
{
#define CUR_PROC "ghmm_dl_kbest"
  int i, t, c, l, m;            /* counters */
  int no_oldHyps;               /* number of hypotheses until position t-1 */
  int b_index, i_id;            /* index for addressing states' b arrays */
  int no_labels = 0;
  int exists, g_nr;
  int *states_wlabel;
  int *label_max_out;
  char *str;

  /* logarithmized transition matrix: log_a[i] is the row handed to
     ighmm_log_gamma_sum for state i, 1.0 for zero probability */
  double **log_a;

  /* matrix of hypotheses, holds for every position in the sequence a list
     of hypotheses */
  hypoList **h;
  hypoList *hP;

  /* vectors for rows in the matrices */
  int *hypothesis;

  /* pointer & prob. of the k most probable hypotheses for each state
     - matrices of dimensions #states x k:  argm(i,l) => argmaxs[i*k+l] */
  double *maxima;
  hypoList **argmaxs;

  /* pointer to & probability of most probable hypothesis in a certain state */
  hypoList *argmax;
  double sum;

  /* break if sequence empty or k<1: */
  if (seq_len <= 0 || k <= 0)
    return NULL;

  ARRAY_CALLOC (h, seq_len);

  /* 1. Initialization (extend empty hypothesis to #labels hypotheses of
     length 1): */

  /* get number of labels (= maximum label + 1)
     and number of states with those labels */
  ARRAY_CALLOC (states_wlabel, mo->N);
  ARRAY_CALLOC (label_max_out, mo->N);
  for (i = 0; i < mo->N; i++) {
    c = mo->label[i];
    states_wlabel[c]++;
    if (c > no_labels)
      no_labels = c;
    if (mo->s[i].out_states > label_max_out[c])
      label_max_out[c] = mo->s[i].out_states;
  }
  /* add one to the maximum label to get the number of labels */
  no_labels++;
  ARRAY_REALLOC (states_wlabel, no_labels);
  ARRAY_REALLOC (label_max_out, no_labels);

  /* initialize h: */
  hP = h[0];
  for (i = 0; i < mo->N; i++) {
    if (mo->s[i].pi > KBEST_EPS) {
      /* printf("Found State %d with initial probability %f\n", i, mo->s[i].pi); */
      exists = 0;
      while (hP != NULL) {
        if (hP->hyp_c == mo->label[i]) {
          /* add entry to the gamma list */
          g_nr = hP->gamma_states;
          hP->gamma_id[g_nr] = i;
          hP->gamma_a[g_nr] = log (mo->s[i].pi) +
            log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
          hP->gamma_states = g_nr + 1;
          exists = 1;
          break;
        }
        else
          hP = hP->next;
      }
      if (!exists) {
        ighmm_hlist_insert (&(h[0]), mo->label[i], NULL);
        /* initiallize gamma-array with safe size (number of states) and add
           the first entry */
        ARRAY_MALLOC (h[0]->gamma_a, states_wlabel[mo->label[i]]);
        ARRAY_MALLOC (h[0]->gamma_id, states_wlabel[mo->label[i]]);
        h[0]->gamma_id[0] = i;
        h[0]->gamma_a[0] = log (mo->s[i].pi) +
          log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);
        h[0]->gamma_states = 1;
        h[0]->chosen = 1;
      }
      hP = h[0];
    }
  }
  /* reallocating the gamma list to the real size */
  hP = h[0];
  while (hP != NULL) {
    ARRAY_REALLOC (hP->gamma_a, hP->gamma_states);
    ARRAY_REALLOC (hP->gamma_id, hP->gamma_states);
    hP = hP->next;
  }

  /* calculate transition matrix with logarithmic values: */
  log_a = kbest_buildLogMatrix (mo->s, mo->N);
  /* defensive: the rows are dereferenced below */
  if (!log_a)
    goto STOP;

  /* initialize temporary arrays: */
  ARRAY_MALLOC (maxima, mo->N * k);     /* for each state save k */
  ARRAY_MALLOC (argmaxs, mo->N * k);

  /*------ Main loop: Cycle through the sequence: ------*/
  for (t = 1; t < seq_len; t++) {

    /* put o_seq[t-1] in emission history: */
    update_emission_history (mo, o_seq[t - 1]);

    /* 2. Propagate hypotheses forward and update gamma: */
    no_oldHyps = ighmm_hlist_prop_forward (mo, h[t - 1], &(h[t]), no_labels,
                                           states_wlabel, label_max_out);
    /* printf("t = %d (%d), no of old hypotheses = %d\n", t, seq_len, no_oldHyps); */

    /*-- calculate new gamma: --*/
    hP = h[t];
    /* cycle through list of hypotheses */
    while (hP != NULL) {

      for (i = 0; i < hP->gamma_states; i++) {
        /* if hypothesis hP ends with label of state i:
           gamma(i,c):= log(sum(exp(a(j,i)*exp(oldgamma(j,old_c)))))
           + log(b[i](o_seq[t]))
           else: gamma(i,c):= -INF (represented by 1.0) */
        i_id = hP->gamma_id[i];
        hP->gamma_a[i] = ighmm_log_gamma_sum (log_a[i_id], &mo->s[i_id], hP->parent);
        b_index = get_emission_index (mo, i_id, o_seq[t], t);
        if (b_index < 0) {
          hP->gamma_a[i] = 1.0;
          if (mo->order[i_id] > t)
            continue;
          else {
            str = ighmm_mprintf (NULL, 0,
                                 "i_id: %d, o_seq[%d]=%d\ninvalid emission index!\n",
                                 i_id, t, o_seq[t]);
            GHMM_LOG(LCONVERTED, str);
            m_free (str);
          }
        }
        else
          hP->gamma_a[i] += log (mo->s[i_id].b[b_index]);
        /*printf("%g = %g\n", log(mo->s[i_id].b[b_index]), hP->gamma_a[i]); */
        if (hP->gamma_a[i] > 0.0) {
          GHMM_LOG(LCONVERTED, "gamma to large. ghmm_dl_kbest failed\n");
          exit (1);
        }
      }
      hP = hP->next;
    }

    /* 3. Choose the k most probable hypotheses for each state and discard all
       hypotheses that were not chosen: */

    /* initialize temporary arrays: */
    for (i = 0; i < mo->N * k; i++) {
      maxima[i] = 1.0;
      argmaxs[i] = NULL;
    }

    /* cycle through hypotheses & calculate the k most probable hypotheses for
       each state: */
    hP = h[t];
    while (hP != NULL) {
      for (i = 0; i < hP->gamma_states; i++) {
        i_id = hP->gamma_id[i];
        if (hP->gamma_a[i] > KBEST_EPS)
          continue;
        /* find first best hypothesis that is worse than current hypothesis: */
        for (l = 0;
             l < k && maxima[i_id * k + l] < KBEST_EPS
               && maxima[i_id * k + l] > hP->gamma_a[i];
             l++);
        if (l < k) {
          /* for each m>l: m'th best hypothesis becomes (m+1)'th best */
          for (m = k - 1; m > l; m--) {
            argmaxs[i_id * k + m] = argmaxs[i_id * k + m - 1];
            maxima[i_id * k + m] = maxima[i_id * k + m - 1];
          }
          /* save new l'th best hypothesis: */
          maxima[i_id * k + l] = hP->gamma_a[i];
          argmaxs[i_id * k + l] = hP;
        }
      }
      hP = hP->next;
    }

    /* set 'chosen' for all hypotheses from argmaxs array: */
    for (i = 0; i < mo->N * k; i++)
      /* only choose hypotheses whose prob. is at least threshold*max_prob.
         Entry i belongs to state i/k (layout state*k+rank), so the best
         value of that state is maxima[(i / k) * k]. The former index
         (i % mo->N) * k addressed a different state whenever k > 1. */
      if (maxima[i] != 1.0
          && maxima[i] >= KBEST_THRESHOLD + maxima[(i / k) * k])
        argmaxs[i]->chosen = 1;

    /* remove hypotheses that were not chosen from the lists: */
    /* remove all hypotheses till the first chosen one */
    while (h[t] != NULL && 0 == h[t]->chosen)
      ighmm_hlist_remove (&(h[t]));
    /* remove all other not chosen hypotheses */
    if (!h[t]) {
      GHMM_LOG(LCONVERTED, "No chosen hypothesis. ghmm_dl_kbest failed\n");
      exit (1);
    }
    hP = h[t];
    while (hP->next != NULL) {
      if (1 == hP->next->chosen)
        hP = hP->next;
      else
        ighmm_hlist_remove (&(hP->next));
    }
  }

  /* dispose of temporary arrays: */
  m_free(states_wlabel);
  m_free(label_max_out);
  m_free(argmaxs);
  m_free(maxima);
  /* transition matrix is no longer needed from here on */
  for (i=0; i<mo->N; i++)
    m_free(log_a[i]);
  m_free(log_a);

  /* 4. Save the hypothesis with the highest probability over all states: */
  hP = h[seq_len - 1];
  argmax = NULL;
  *log_p = 1.0;                 /* log_p will store log of maximum summed probability */
  while (hP != NULL) {
    /* sum probabilities for each hypothesis over all states: */
    sum = ighmm_cvector_log_sum (hP->gamma_a, hP->gamma_states);
    /* and select maximum sum */
    if (sum < KBEST_EPS && (*log_p == 1.0 || sum > *log_p)) {
      *log_p = sum;
      argmax = hP;
    }
    hP = hP->next;
  }

  /* found a valid path? */
  if (*log_p < KBEST_EPS) {
    /* yes: extract chosen hypothesis by following the parent pointers: */
    ARRAY_MALLOC (hypothesis, seq_len);
    for (i = seq_len - 1; i >= 0; i--) {
      hypothesis[i] = argmax->hyp_c;
      argmax = argmax->parent;
    }
  }
  else
    /* no: return 1.0 representing -INF and an empty hypothesis */
    hypothesis = NULL;

  /* dispose of calculation matrices: */
  hP = h[seq_len - 1];
  while (hP != NULL)
    ighmm_hlist_remove (&hP);
  free (h);
  return hypothesis;
STOP:     /* Label STOP from ARRAY_[CM]ALLOC */
  GHMM_LOG(LCONVERTED, "ghmm_dl_kbest failed\n");
  exit (1);
#undef CUR_PROC
}