예제 #1
0
/*
 * Replace every stored probability of `lm` by the log10 of a weighted sum
 * of the probabilities that `lm1` and `lm2` give to the same n-gram.
 *
 * For each order k = 1 .. lm->n the n-grams of `lm` are browsed one by one,
 * converted from ids to words through lm->vocab, scored with calc_prob()
 * against (lm1, fb1) and (lm2, fb2), and the result w1*p1 + w2*p2 is written
 * (as log10) into lm->probs at the slot the browser currently points to.
 *
 * Progress is reported on stdout: one header line per order plus whatever
 * show_dot() emits for the running n-gram count.
 */
void calc_interpolated_prob(arpa_lm_t *lm, arpa_lm_t *lm1, arpa_lm_t *lm2,fb_info* fb1,fb_info* fb2,double w1,double w2)
{
	TBROWSE browser;
	id__t ngram_ids[MAX_K];
	char **ngram_words;
	int order;

	/* scratch buffer that receives the word strings of one n-gram */
	ngram_words=(char**)NewArray(MAX_K,MAX_WORD,sizeof(char));

	order=1;
	while (order<=lm->n) {
		int seen=0;

		begin_browse(lm,order,&browser);
		printf("\nProcessing %d-gram\n",order);

		while (get_next_ngram(ngram_ids,&browser)) {
			double prob_a,prob_b,mixed;

			seen++;
			show_dot(seen);

			ids2words(ngram_words,ngram_ids,order,lm->vocab);
			prob_a=calc_prob(ngram_words,order,lm1,fb1);
			prob_b=calc_prob(ngram_words,order,lm2,fb2);
			mixed=w1*prob_a+w2*prob_b;

			/* browser.pos[] is 1-based here, hence the "-1" on the slot */
			lm->probs[order-1][browser.pos[order-1]-1]=log10(mixed);

			/* A former correction that replaced an exact 0.0
			   log-probability by -0.00001 is deliberately disabled. */
		}

		order++;
	}

	DeleteArray(ngram_words);
}
예제 #2
0
// Get soft-assignment probabilities from data and a GMM model:
// process_mode1(data, GMM, prob).
//
// mxfea     : feature matrix; dims[0] is stored in nDim and dims[1] in nSample.
// mxGMM     : struct whose element 0 provides the fields "Mu", "Sigma" and
//             "Priors"; the column count of "Mu" is stored in nBase.
// output[0] : receives a newly created nSample-by-nBase real double matrix
//             whose data pointer is handed to calc_prob().
//
// Raises a MEX error when a GMM field is missing or when the row count of
// "Mu" differs from the row count of the features.
void process_mode1(const mxArray *mxfea, const mxArray *mxGMM, mxArray *output[]) {
    double *fea = (double *)mxGetPr(mxfea);
    // mxGetDimensions returns mwSize elements; mwSize need not have the same
    // width as int, so the array must not be read through an int pointer.
    const mwSize *dims = mxGetDimensions(mxfea);
    nDim = dims[0];
    nSample = dims[1];

    // mxGetField returns NULL for a missing field (or a non-struct input);
    // fail with a readable message instead of dereferencing NULL below.
    const mxArray *mxMu = mxGetField(mxGMM, 0, "Mu");
    const mxArray *mxSigma = mxGetField(mxGMM, 0, "Sigma");
    const mxArray *mxPriors = mxGetField(mxGMM, 0, "Priors");
    if (mxMu == NULL || mxSigma == NULL || mxPriors == NULL)
        mexErrMsgTxt("the gmm model must be a struct with fields Mu, Sigma and Priors!");

    double *gmmMu = (double *)mxGetPr(mxMu);
    double *gmmSigma = (double *)mxGetPr(mxSigma);
    double *gmmPriors = (double *)mxGetPr(mxPriors);

    const mwSize *dims2 = mxGetDimensions(mxMu);
    nBase = dims2[1];
    int nDim2 = (int)dims2[0];
    if (nDim!=nDim2) mexErrMsgTxt("the dim of feature and gmm model is different!");

    // mxCreateNumericArray expects its dimension array as mwSize elements.
    mwSize out[2];
    out[0] = nSample;
    out[1] = nBase;
    mxArray *mxProb = mxCreateNumericArray(2, out, mxDOUBLE_CLASS, mxREAL);
    double *prob = (double *)mxGetPr(mxProb);

    calc_prob(fea, gmmMu, gmmSigma, gmmPriors, prob);
    output[0] = mxProb;
}
예제 #3
0
/*
 * Per-record hook that folds one word-list entry into the histogram passed
 * through `userdata` (an rhistogram_t) and into the ham/spam "only" and
 * "hapax" counters.
 *
 * The score of the entry is obtained from calc_prob() and mapped onto one
 * of INTERVALS buckets (values at or above the top are clamped into the
 * last bucket).  Entries whose text starts with '.' are skipped.
 *
 * Always returns EX_OK.
 */
static ex_t ds_histogram_hook(/*@unused@*/ word_t *key, dsv_t *data,
			     void *userdata)
{
    rhistogram_t *histogram = (rhistogram_t *)userdata;

    /* the score is computed before the meta-token test, as it always was */
    double weight = calc_prob(data->goodcount, data->spamcount, mgood, mbad);
    uint bucket = min(weight * INTERVALS, INTERVALS-1);

    if (*key->u.text != (byte) '.') {
	/* regular token: count it in its score bucket */
	histogram->count[bucket]++;

	/* never seen in spam */
	if (data->spamcount == 0) {
	    ham_only++;
	    if (data->goodcount == 1)
		ham_hapax++;
	}

	/* never seen in ham */
	if (data->goodcount == 0) {
	    spam_only++;
	    if (data->spamcount == 1)
		spam_hapax++;
	}
    }

    return EX_OK;
}
예제 #4
0
/*
 * Walk every n-gram of `lm` (orders 1 .. lm->n), score it with calc_prob()
 * against (lm1, fb1) and (lm2, fb2), and store log10(w1*p1 + w2*p2) in
 * lm->probs at the slot the browser currently points to.
 *
 * While browsing, two running sums of the component probabilities are kept
 * and reset whenever the (k-1)-level browser position moves away from the
 * position recorded at the start of the order, or, for unigrams, when the
 * last unigram is reached.  The sums are not read anywhere in this
 * function.
 *
 * Progress is reported on stdout: one header line per order plus whatever
 * show_dot() emits for the running n-gram count.
 */
void check_prob(arpa_lm_t *lm, arpa_lm_t *lm1, arpa_lm_t *lm2,fb_info* fb1,fb_info* fb2,double w1,double w2)
{
  TBROWSE browser;
  id__t ngram_ids[MAX_K];
  char **ngram_words;
  int order;
  int context_pos = 0; /* for a quiet compile */

  /* scratch buffer that receives the word strings of one n-gram */
  ngram_words=(char**)NewArray(MAX_K,MAX_WORD,sizeof(char));

  order=1;
  while (order<=lm->n) {
    double sum_a=0,sum_b=0;
    int seen=0;

    begin_browse(lm,order,&browser);
    if (order>=2) {
      /* remember where the (k-1)-level position starts for this order */
      context_pos=browser.pos[order-2];
    }

    printf("\nProcessing %d-gram\n",order);
    while (get_next_ngram(ngram_ids,&browser)) {
      double prob_a,prob_b,mixed;
      int at_boundary;

      ids2words(ngram_words,ngram_ids,order,lm->vocab);
      prob_a=calc_prob(ngram_words,order,lm1,fb1);
      prob_b=calc_prob(ngram_words,order,lm2,fb2);

      seen++;
      show_dot(seen);

      sum_a+=prob_a;
      sum_b+=prob_b;

      if (order>=2) {
	at_boundary=(browser.pos[order-2]!=context_pos);
      } else {
	at_boundary=(order==1 && browser.pos[0]==lm->num_kgrams[0]);
      }
      if (at_boundary) {
	sum_a=sum_b=0;
      }

      mixed=w1*prob_a+w2*prob_b;
      /* browser.pos[] is 1-based here, hence the "-1" on the slot */
      lm->probs[order-1][browser.pos[order-1]-1]=log10(mixed);
    }

    order++;
  }

  DeleteArray(ngram_words);
}
예제 #5
0
/*
 * Write the per-token statistics table to `fpo`.
 *
 * Output consists of a heading line, one line for every node that follows
 * `rstats_head` in the list, and the summary produced by
 * msg_print_summary().  When `Rtable` is set, two extra columns holding
 * log(1 - fw) and log(fw) are added.  Every line starts with a two-space
 * prefix when `stats_in_header` is set.
 *
 * The last column is a flag: '+' for a used token, '-' for an unused one
 * and 'i' for a token whose msgs_good and msgs_bad are both zero (its
 * pgood/pbad columns are then shown as dashes).
 */
static void rstats_print_rtable(rstats_t *rstats_head)
{
    const char *pfx = stats_in_header ? "  " : "";
    rstats_t *node;

    /* heading line */
    if (Rtable) {
	(void)fprintf(fpo, "%s%*s %6s    %-6s    %-6s    %-6s  %-6s    %-6s %s\n",
		      pfx, max_token_len+2, "", "n", "pgood", "pbad", "fw", "invfwlog", "fwlog", "U");
    } else {
	(void)fprintf(fpo, "%s%*s %6s    %-6s    %-6s    %-6s %s\n",
		      pfx, max_token_len+2, "", "n", "pgood", "pbad", "fw", "U");
    }

    /* one line per token; the head node itself carries no token line */
    node = rstats_head->next;
    while (node != NULL)
    {
	int pad = 0;
	double fw;
	char flag;
	int no_msgs;

	/* padding that right-aligns the numeric columns after the token */
	if (node->token->leng < max_token_len)
	    pad = max_token_len - node->token->leng;

	fw = calc_prob(node->good, node->bad, node->msgs_good, node->msgs_bad);

	no_msgs = (node->msgs_good == 0 && node->msgs_bad == 0);
	if (no_msgs)
	    flag = 'i';
	else
	    flag = node->used ? '+' : '-';

	/* the token is printed inside double quotes */
	(void)fprintf(fpo, "%s\"", pfx);
	(void)word_puts(node->token, 0, fpo);

	if (no_msgs)
	{
	    /* no message counts: ratios cannot be formed, show dashes */
	    (void)fprintf(fpo, "\"%*s %6lu  %8s  %8s  %8.6f",
			  pad, " ", (unsigned long)(node->good + node->bad),
			  "--------", "--------",
			  fw);
	}
	else
	{
	    (void)fprintf(fpo, "\"%*s %6lu  %8.6f  %8.6f  %8.6f",
			  pad, " ", (unsigned long)(node->good + node->bad),
			  (double)node->good / node->msgs_good,
			  (double)node->bad  / node->msgs_bad,
			  fw);
	}

	if (Rtable)
	{
	    (void)fprintf(fpo, "%s%10.5f%10.5f",
			  pfx, log(1.0 - fw), log(fw));
	}

	(void)fprintf(fpo, " %c\n", flag);

	node = node->next;
    }

    /* trailer */
    msg_print_summary(pfx);
}
예제 #6
0
// Get the Fisher coding from data and a GMM model:
// process_mode3(data, GMM, coding).
//
// mxfea     : feature matrix; dims[0] is stored in nDim and dims[1] in nSample.
// mxGMM     : struct whose element 0 provides the fields "Mu", "Sigma" and
//             "Priors"; the column count of "Mu" is stored in nBase.
// output[0] : receives a newly created real double column vector with
//             nBase*nDim elements when FIRST_ORDER is set and nBase*nDim*2
//             elements otherwise.  It is filled by fisher_coding() unless
//             nSample is 0, in which case it stays zero-initialised.
//
// Raises a MEX error when a GMM field is missing or when the row count of
// "Mu" differs from the row count of the features.
void process_mode3(const mxArray *mxfea, const mxArray *mxGMM, mxArray *output[]) {
    double *fea = (double *)mxGetPr(mxfea);
    // mxGetDimensions returns mwSize elements; mwSize need not have the same
    // width as int, so the array must not be read through an int pointer.
    const mwSize *dims = mxGetDimensions(mxfea);
    nDim = dims[0];
    nSample = dims[1];

    // mxGetField returns NULL for a missing field (or a non-struct input);
    // fail with a readable message instead of dereferencing NULL below.
    const mxArray *mxMu = mxGetField(mxGMM, 0, "Mu");
    const mxArray *mxSigma = mxGetField(mxGMM, 0, "Sigma");
    const mxArray *mxPriors = mxGetField(mxGMM, 0, "Priors");
    if (mxMu == NULL || mxSigma == NULL || mxPriors == NULL)
        mexErrMsgTxt("the gmm model must be a struct with fields Mu, Sigma and Priors!");

    double *gmmMu = (double *)mxGetPr(mxMu);
    double *gmmSigma = (double *)mxGetPr(mxSigma);
    double *gmmPriors = (double *)mxGetPr(mxPriors);

    const mwSize *dims2 = mxGetDimensions(mxMu);
    nBase = dims2[1];
    int nDim2 = (int)dims2[0];
    if (nDim!=nDim2) mexErrMsgTxt("the dim of feature and gmm model is different!");

    // Scratch buffer for the nSample*nBase probabilities; freed before return.
    double *prob = (double *)mxCalloc(nSample*nBase, sizeof(double));
    calc_prob(fea, gmmMu, gmmSigma, gmmPriors, prob);

    // The coding is returned as a single column.
    mwSize out[2];
    #if (FIRST_ORDER)
    out[0] = nBase*nDim;
    #else
    out[0] = nBase*nDim*2 ;
    #endif
    out[1] = 1;
    mxArray *mxCoding = mxCreateNumericArray(2, out, mxDOUBLE_CLASS, mxREAL);
    double *coding = (double *)mxGetPr(mxCoding);

    // With no samples there is nothing to encode; the zero vector is returned.
    if (nSample!=0){
        fisher_coding(fea, gmmMu, gmmSigma, gmmPriors, prob, coding);
    }
    output[0] = mxCoding;
    mxFree(prob);
}
예제 #7
0
// Get a confidence map from data, GMM, models, positions and map size.
//
// mxfea     : feature matrix; dims[0] is stored in nDim and dims[1] in nSample.
// mxGMM     : struct whose element 0 provides the fields "Mu", "Sigma" and
//             "Priors"; the column count of "Mu" is stored in nBase.
// mxModels  : cell array; its element count is stored in nModel and every
//             cell must be a struct with a field "w" holding at least
//             nModelDim values (nDim*nBase when FIRST_ORDER is set,
//             nDim*nBase*2 otherwise).
// mxPos     : sample positions, forwarded to map_conf() as an int array.
// mxMapSize : two ints {rows, cols} giving the size of the map.
// output[0] : receives a newly created rows-by-cols-by-nModel real double
//             array that map_conf() is given to fill in.
//
// Raises a MEX error when a GMM field or a model weight vector is missing,
// when a weight vector is too short, or when the row count of "Mu" differs
// from the row count of the features.
void process_mode6(const mxArray *mxfea, const mxArray *mxGMM, const mxArray *mxModels, const mxArray *mxPos, const mxArray *mxMapSize, mxArray *output[]) {
    double *fea = (double *)mxGetPr(mxfea);
    const mwSize *dims = mxGetDimensions(mxfea);
    nDim = dims[0];
    nSample = dims[1];

    // mxGetField returns NULL for a missing field (or a non-struct input);
    // fail with a readable message instead of dereferencing NULL below.
    const mxArray *mxMu = mxGetField(mxGMM, 0, "Mu");
    const mxArray *mxSigma = mxGetField(mxGMM, 0, "Sigma");
    const mxArray *mxPriors = mxGetField(mxGMM, 0, "Priors");
    if (mxMu == NULL || mxSigma == NULL || mxPriors == NULL)
        mexErrMsgTxt("the gmm model must be a struct with fields Mu, Sigma and Priors!");

    double *gmmMu = (double *)mxGetPr(mxMu);
    double *gmmSigma = (double *)mxGetPr(mxSigma);
    double *gmmPriors = (double *)mxGetPr(mxPriors);
    const mwSize *dims2 = mxGetDimensions(mxMu);
    nBase = dims2[1];
    int nDim2 = (int)dims2[0];
    if (nDim!=nDim2) mexErrMsgTxt("the dim of feature and gmm model is different!");

    // Scratch buffer for the nSample*nBase probabilities.
    double *prob = (double *)mxCalloc(nSample*nBase, sizeof(double));

    calc_prob(fea, gmmMu, gmmSigma, gmmPriors, prob);

    nModel = mxGetNumberOfElements(mxModels);

    // Scratch buffer for the nSample*nModel confidences.
    double *conf= (double *)mxCalloc(nSample*nModel, sizeof(double));

    #if (FIRST_ORDER)
    int nModelDim = nDim*nBase;
    #else
    int nModelDim = nDim*nBase*2;
    #endif

    // Pack the weight vectors of all models back to back.
    double *model_w = (double *)mxCalloc(nModelDim*nModel, sizeof(double));
    int i, j;
    for(i=0;i<nModel;i++){
        const mxArray *tempModel = mxGetCell(mxModels, i);
        const mxArray *mxW = (tempModel != NULL) ? mxGetField(tempModel, 0, "w") : NULL;
        // Reject a missing or too-short weight vector: copying nModelDim
        // values from it would otherwise read out of bounds.  Memory from
        // mxCalloc is reclaimed by MATLAB when the MEX function errors out.
        if (mxW == NULL || mxGetNumberOfElements(mxW) < (size_t)nModelDim)
            mexErrMsgTxt("every model must have a field w matching the gmm model size!");
        const double *w = mxGetPr(mxW);

        for (j=0;j<nModelDim;j++){
            model_w[i*nModelDim+j] = w[j];
        }
    }
    conf_model(fea, gmmMu, gmmSigma, gmmPriors, prob, model_w, conf);

    // NOTE(review): the data of mxMapSize and mxPos is reinterpreted as int,
    // which presumably requires the caller to pass int32 arrays - confirm.
    int *mapSize = (int*) mxGetPr(mxMapSize);

    int nRows =mapSize[0];
    int nCols = mapSize[1];

    int *pos = (int*)mxGetPr(mxPos);

    // mxCreateNumericArray expects its dimension array as mwSize elements.
    mwSize out[3];
    out[0] = nRows;
    out[1] = nCols;
    out[2] = nModel;

    mxArray *mxMap =  mxCreateNumericArray(3, out , mxDOUBLE_CLASS, mxREAL);
    double *map = mxGetPr(mxMap);

    map_conf(conf, pos, nRows, nCols, nModel, map);
    output[0] = mxMap;

    mxFree(conf);
    mxFree(model_w);
    mxFree(prob);
}
예제 #8
0
/*
 * Look up words in the word list described by `bfp` and print their spam
 * and good counts to `fpo`, one line per word found.
 *
 * bfp              - database location; bfp->filepath must be non-empty.
 * argc, argv       - words to look up.  With argc == 0 the words are read
 *                    as tokens from stdin until get_token() reports a
 *                    non-zero status.
 * show_probability - when true, an extra column holds the value returned
 *                    by calc_prob() for the word and the database's
 *                    message counts.
 *
 * Words for which ds_read() returns 1 are skipped silently; any other
 * non-zero ds_read() result stops the loop with an error.
 *
 * Returns EX_OK on success and EX_ERROR when the path is empty, the
 * transaction cannot be started or finished, or a database read fails.
 * All resources acquired here (token buffer, per-word tokens, database
 * handles) are released on every return path.
 */
static ex_t display_words(bfpath *bfp, int argc, char **argv, bool show_probability)
{
    byte buf[BUFSIZE];
    buff_t *buff;
    const byte *word;

    const char *path = bfp->filepath;

    const char *head_format = !show_probability ? "%-30s %6s %6s\n"   : "%-30s %6s  %6s  %6s\n";
    const char *data_format = !show_probability ? "%-30s %6lu %6lu\n" : "%-30s %6lu  %6lu  %f\n";

    void *dsh = NULL; /* initialize to silence bogus gcc warning */
    void *dbe;

    int rv = 0;		/* never changed below; selects commit at the end */
    ex_t ec = EX_OK;

    dsv_t msgcnts;

    /* protect against broken stat(2) that succeeds for empty names */
    if (path == NULL || *path == '\0') {
        fprintf(stderr, "Expecting non-empty directory or file name.\n");
        return EX_ERROR;
    }

    /* allocate only after the argument check so the early return above
     * has nothing to release */
    buff = buff_new(buf, 0, BUFSIZE);

    dbe = ds_init(bfp);
    dsh = ds_open(dbe, bfp, DS_READ);
    if (dsh == NULL)
	/* print error, cleanup, and exit */
	ds_open_failure(bfp, dbe);

    if (DST_OK != ds_txn_begin(dsh)) {
	ds_close(dsh);
	ds_cleanup(dbe);
	buff_free(buff);
	fprintf(stderr, "Cannot begin transaction.\n");
	return EX_ERROR;
    }

    if (show_probability)
    {
	ds_get_msgcounts(dsh, &msgcnts);
	robs = ROBS;
	robx = ROBX;
    }

    /* the surplus argument is ignored by the three-column heading format */
    fprintf(fpo, head_format, "", "spam", "good", "  Fisher");

    /* argc == 0: read tokens from stdin; argc > 0: consume argv and switch
     * to -1 after the last word so the loop ends */
    while (argc >= 0)
    {
	dsv_t val;
	word_t *token;
	int rc;

	unsigned long spam_count;
	unsigned long good_count;
	double rob_prob = 0.0;

	if (argc == 0)
	{
	    if (get_token(buff, stdin) != 0)
		break;
	    token = &buff->t;
	} else {
	    word = (const byte *) *argv++;
	    if (--argc == 0)
		argc = -1;
	    token = word_news((const char *)word);
	}

	rc = ds_read(dsh, token, &val);
	switch (rc) {
	    case 0:
		spam_count = val.spamcount;
		good_count = val.goodcount;

		if (!show_probability)
		    fprintf(fpo, data_format, token->u.text, spam_count, good_count);
		else
		{
		    rob_prob = calc_prob(good_count, spam_count, msgcnts.goodcount, msgcnts.spamcount);
		    fprintf(fpo, data_format, token->u.text, spam_count, good_count, rob_prob);
		}
		break;
	    case 1:
		/* word not in the database: print nothing */
		break;
	    default:
		fprintf(stderr, "Cannot read from database.\n");
		ec = EX_ERROR;
		break;
	}

	/* tokens created by word_news() are owned here; release them on
	 * the error path as well, before leaving the loop */
	if (token != &buff->t)
	    word_free(token);

	if (ec != EX_OK)
	    break;
    }

    if (DST_OK != rv ? ds_txn_abort(dsh) : ds_txn_commit(dsh)) {
	fprintf(stderr, "Cannot %s transaction.\n", rv ? "abort" : "commit");
	ec = EX_ERROR;
    }
    ds_close(dsh);
    ds_cleanup(dbe);

    buff_free(buff);

    return ec;
}