コード例 #1
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Recompute z from y, n, u and w (y,n,u,w -> z).  For every row:
 *
 *     z[row] = n[row] + (y[row] - u[row]) / w[row]
 *
 * Unless AMFAST is defined, each value is passed to am_isnum() and a
 * failure is reported through my_errorf() together with the inputs that
 * produced it.  The value is stored into z afterwards in either case.
 */
void lr_train_update_z( lr_train *lrt)
{
  int row;

  for (row = 0; row < lrt->numrows; ++row) {
    const double y_row = dyv_ref( lrt->y, row);
    const double n_row = dyv_ref( lrt_n_ref(lrt), row);
    const double u_row = dyv_ref( lrt_u_ref(lrt), row);
    const double w_row = dyv_ref( lrt_w_ref(lrt), row);
    const double z_row = n_row + (y_row - u_row) / w_row;

#ifndef AMFAST
    if (!am_isnum( z_row)) {
      my_errorf( "lr_train_update_z: NaN or Inf problem: val is %f.\n"
                 "Inputs: i=%d, yi=%f, ni=%f, ui=%f, wi=%f\n",
                 z_row, row, y_row, n_row, u_row, w_row);
    }
#endif

    dyv_set( lrt_z_ref(lrt), row, z_row);
  }
}
コード例 #2
0
ファイル: stats.c プロジェクト: yesyestian/BNB_Globlinear
/*
   Find x s.t. integ_cdf(it,x) = prob.

   - prob outside [0,1] is reported through my_error(); 0.0 is the value
     assigned to the result on that path.
   - prob == 0.0 yields it->xlo and prob == 1.0 yields it->xhi.
   - Otherwise the table it->integral is bisected for two adjacent
     entries bracketing prob, and the answer is interpolated linearly
     between them and mapped onto [it->xlo, it->xhi].
*/
double integ_cdf_inv(integ *it,double prob)
{
  double result;

  if ( prob < 0.0 || prob > 1.0 )
  {
    result = 0.0;
    printf("****** prob = %g (should be between 0 and 1)\n",prob);
    my_error("integ_cdf_inv: illegal prob");
  }
  else if ( prob == 0.0 )  /* == with doubles usually dodgy, but here
                                    harmless */
    result = it->xlo;
  else if ( prob == 1.0 )
    result = it->xhi;
  else
  {
    int size = dyv_size(it->integral);
    int lo = 0;
    /* Start the upper bracket on the LAST valid entry, not one past it.
       With hi == size the search could finish with hi still equal to
       size (when the final table entry is below prob, e.g. through
       rounding), and the interpolation below would then read
       integral[size], which is out of range.  For a table whose last
       entry is >= prob the final (lo,hi) pair is the same as before. */
    int hi = size - 1;
    double result_index;

    while ( lo < hi-1 )
    {
      int mid = (lo + hi)/2;
      double value = dyv_ref(it->integral,mid);
      if ( value < prob )
        lo = mid;
      else
        hi = mid;
    }

    /* Also trips for a table with fewer than two entries. */
    if ( hi - lo != 1 ) my_error("ouvbobvrlobfv");

      /* If c(x) is cdf of index x, then the local
         linear behaviour is c(x) = y1 + (x - x1) * (y2 - y1) / (x2 - x1)

         If x1 = (double) lo, x2 = (double) hi, then x2 - x1 = 1.

         If we want x such that c(x) = prob then we need

             x = x1 + (prob - y1) / (y2 - y1)

         The denominator is floored at 1e-10 so a flat segment cannot
         divide by zero.
      */

    result_index = ((double) lo) +
                   ( (prob - dyv_ref(it->integral,lo)) /
                     (real_max(1e-10,dyv_ref(it->integral,hi) -
                                     dyv_ref(it->integral,lo)
                              )
                     )
                   );

    result = it->xlo + (it->xhi - it->xlo) *
             result_index / ( size - 1 );
  }

  return(result);
}
コード例 #3
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Build an lr_predict from a dyv read from f with mk_dyv_read().
 * Element 0 of that dyv becomes b0; the remaining elements are copied,
 * in order, into a freshly made dyv stored as b.  The temporary dyv is
 * freed before returning.  Only the b0 and b fields are assigned here.
 */
lr_predict *mk_in_lr_predict( PFILE *f)
{
  lr_predict *model = AM_MALLOC( lr_predict);
  dyv *params = mk_dyv_read( f);
  int nparams = dyv_size( params);
  dyv *coeffs;
  int k;

  model->b0 = dyv_ref( params, 0);

  /* Everything after the first element goes into b, shifted down by one. */
  coeffs = mk_dyv( nparams - 1);
  for (k = 1; k < nparams; ++k) {
    dyv_set( coeffs, k - 1, dyv_ref( params, k));
  }
  model->b = coeffs;

  free_dyv( params);

  return model;
}
コード例 #4
0
ファイル: stats.c プロジェクト: yesyestian/BNB_Globlinear
/* Worker for chi_squared_prob(): same arguments, but with the
   pre-condition that no entry in hypothesized_dist has an expected
   value of zero.

   Both distributions are copied into temporary arrays at positions
   1..size (position 0 is allocated but never written) and handed to
   chsone() together with the constraint count size - dof.  The prob
   value filled in by chsone() is returned.
*/
double chi_squared_prob_helper(dyv *actual_dist,dyv *hypothesized_dist,int dof)
{
  int nbins = dyv_size(actual_dist);
  int alloc_len = nbins + 1;
  double *observed = AM_MALLOC_ARRAY(double,alloc_len);
  double *expected = AM_MALLOC_ARRAY(double,alloc_len);
  int constraints = nbins - dof;
  double df,chsq,prob;
  int k;

  if ( dyv_size(hypothesized_dist) != nbins )
    my_error("chi_squared_prob");

  for ( k = 1 ; k <= nbins ; k++ )
  {
    observed[k] = (double) dyv_ref(actual_dist,k-1);
    expected[k] = (double) dyv_ref(hypothesized_dist,k-1);
  }

  chsone(observed,expected,nbins,constraints,&df,&chsq,&prob);

  AM_FREE_ARRAY(observed,double,alloc_len);
  AM_FREE_ARRAY(expected,double,alloc_len);

  return prob;
}
コード例 #5
0
ファイル: stats.c プロジェクト: yesyestian/BNB_Globlinear
/*
   PRE: size of actual_dist is same as size of hypothesized_dist.
        Any entry in which hypothesized_dist has a value of
        zero must have an actual_dist value of zero, i.e.
          forall i, hy_dist[i]==0 => ac_dist[i] == 0

   Given two distributions represented as histograms
   (actual_dist and hypothesized_dist), how much evidence is there that they are
   from the same distribution?
   Note that these things must be counts. Each element of actual_dist must
   be an integer. Each element of hypothesized_dist may be non-integer
   because we're talking expected counts there.

   The prob returned by this function answers that question
   using a standard chi-squared test. If it is low (e.g. < 0.05), then it is
   unlikely that they are the same.

   The "dof" parameter is the mysterious "Degrees Of Freedom" that haunts
   any use of the word "Chi".

       If it is possible for any entry in the dist
       to take any value, then set dof==size.

       If the sum of values is constrained to a certain value
       then set dof==size-1.

       If there are more constraints than that, then subtract
       more from size.

   Entries whose hypothesized count is zero are dropped before the test
   is run, and dof is reduced by one for each dropped entry so that the
   number of constraints (size - dof) is unchanged for the shortened
   histograms.  On that path dof is then floored at 2.

   A negative hypothesized count, or a positive actual count paired with
   a zero hypothesized count, is reported through my_error().  The
   result is initialised to -1.0 and stays -1.0 on the negative-count
   path.
*/
double chi_squared_prob(dyv *actual_dist,dyv *hypothesized_dist,int dof)
{
  double result = -1.0;
  double min_hyp_dist = dyv_min(hypothesized_dist);
  if ( min_hyp_dist < 0.0 )
    my_error("chi_squared_prob: -ve count in hypothesized_dist");
  else if ( min_hyp_dist > 0.0 )
    result = chi_squared_prob_helper(actual_dist,hypothesized_dist,dof);
  else
  {
    dyv *copy_ad = mk_dyv(0);
    dyv *copy_hd = mk_dyv(0);
    int i;
    for ( i = 0 ; i < dyv_size(actual_dist) ; i++ )
    {
      if ( dyv_ref(hypothesized_dist,i) > 0.0 )
      {
        /* Keep this bin. */
        add_to_dyv(copy_ad,dyv_ref(actual_dist,i));
        add_to_dyv(copy_hd,dyv_ref(hypothesized_dist,i));
      }
      else
      {
        if ( dyv_ref(actual_dist,i) > 0.0 )
          my_error("chi_squared_prob: actual_dist value must be zero if hyp dist value is zero");

        /* This bin is dropped, so the histogram loses one free entry.
           (Previously dof was decremented for every KEPT bin instead,
           which drove it to the floor below in almost every case.) */
        dof -= 1;
      }
    }
    dof = int_max(2,dof);
    result = chi_squared_prob_helper(copy_ad,copy_hd,dof);
    free_dyv(copy_ad);
    free_dyv(copy_hd);
  }

  return result;
}
コード例 #6
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * X,w,z -> b.
 *
 *                  [1t]                [1t]
 *   Compute b = (( [--] W [1|X])^-1) * [--] W z, where W = diag(w).
 *                  [Xt]                [Xt]
 *
 * The solve is delegated to mk_lr_update_b_conjugate_gradient_helper().
 * When opts->cgbinit is set, the current (b0, b) is packed into one
 * vector and passed along as the starting point; otherwise NULL is
 * passed.  The returned vector is split back into b0 (element 0) and b
 * (the remaining elements).
 *
 * cgeps is forwarded unscaled: scaling by the initial CG residual is
 * left to the callee.
 *
 * Returns -2 when the reported iteration count exceeds opts->cgmax
 * (treated as failure), 1 otherwise.
 */
int lr_train_update_b( lr_train *lrt)
{
  const int nparams = lrt->numatts;
  const double eps = lrt->opts->cgeps;
  const double deveps = lrt->opts->cgdeveps;
  dyv *start = NULL;
  dyv *solution;
  int niters, k;

  /* Optional warm start: [ b0 | b ]. */
  if (lrt->opts->cgbinit) {
    start = mk_dyv( nparams);
    dyv_set( start, 0, lrt_b0_ref(lrt));
    for (k = 1; k < nparams; ++k) {
      dyv_set( start, k, dyv_ref( lrt_b_ref(lrt), k-1));
    }
  }

  solution = mk_lr_update_b_conjugate_gradient_helper( lrt, eps, deveps,
                                                       lrt->opts->cgmax,
                                                       &niters, start);

  if (start != NULL) free_dyv( start);

  /* Unpack the solution into ( b0, b ). */
  lrt_b0_set(lrt, dyv_ref( solution, 0));
  for (k = 1; k < nparams; ++k) {
    dyv_set( lrt_b_ref(lrt), k-1, dyv_ref( solution, k));
  }

  free_dyv( solution);

  return (niters > lrt->opts->cgmax) ? -2 : 1;
}
コード例 #7
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Deviance of the parameter vector currently held by cg.
 *
 * The vector from conjgrad_x_ref() is copied and split into its first
 * element and the rest.  n is computed from those (via the spardat
 * routine when lrt->X is non-NULL, the dym routine otherwise), u from
 * n, the log likelihood from lrt->y and u, and finally the deviance
 * from that likelihood and lrt->likesat.  All temporaries are freed.
 */
double lr_deviance_from_cg( lr_train *lrt, conjgrad *cg)
{
  const int nrows = lrt->numrows;
  dyv *beta, *eta, *mu;
  double beta0, loglik;

  /* Split a private copy of the parameters into intercept + rest. */
  beta = mk_copy_dyv( conjgrad_x_ref( cg));
  beta0 = dyv_ref( beta, 0);
  dyv_remove( beta, 0);

  /* n */
  eta = mk_dyv( nrows);
  if (lrt->X == NULL) lr_compute_n_from_dym( lrt->M, beta0, beta, eta);
  else lr_compute_n_from_spardat( lrt->X, beta0, beta, eta);
  free_dyv( beta);

  /* u */
  mu = mk_dyv( nrows);
  lr_compute_u_from_n( eta, mu);
  free_dyv( eta);

  /* likelihood -> deviance */
  loglik = lr_log_likelihood_basic( lrt->y, mu);
  free_dyv( mu);

  return lr_deviance_from_log_likelihood( loglik, lrt->likesat);
}
コード例 #8
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Compute [1t]
 *         [--] W [1|X] v
 *         [Xt]
 *
 * and return it as a newly made dyv (the caller frees it).  The data
 * comes from lrt->X when that is non-NULL, from lrt->M otherwise; the
 * weights come from lrt_w_ref(lrt).
 */
dyv *mk_lr_XtWXv_dyv( const lr_train *lrt, const dyv *v)
{
  const int sparse = (lrt->X != NULL);
  const double head = dyv_ref( v, 0);                 /* v[0]  */
  dyv *tail = mk_dyv_slice( v, 1, dyv_size( v));      /* v[1:] */
  dyv *work, *product;

  /* [1|X] v : data times tail, then add head to every entry. */
  if (sparse) work = mk_spardat_times_dyv( lrt->X, tail);
  else work = mk_dym_times_dyv( lrt->M, tail);
  dyv_scalar_add( work, head, work);
  free_dyv( tail);

  /* W [1|X] v, computed in place. */
  dyv_mult( work, lrt_w_ref(lrt), work);

  /* Xt W [1|X] v from the transpose product; 1t W [1|X] v is the plain
     sum of work and is inserted at position 0. */
  if (sparse) product = mk_spardat_transpose_times_dyv( lrt->X, work);
  else product = mk_dym_transpose_times_dyv( lrt->M, work);
  dyv_insert( product, 0, dyv_sum( work));

  free_dyv( work);

  return product;
}
コード例 #9
0
ファイル: amdyv.c プロジェクト: insilico/randomjungle
/* Return the sum of the elements of dv, accumulated from index 0 upward.
   The result is 0.0 when dv has no elements. */
double dyv_sum(const dyv *dv)
{
  double total = 0.0;
  int k = 0;
  while ( k < dyv_size(dv) )
  {
    total += dyv_ref(dv,k);
    k++;
  }
  return total;
}
コード例 #10
0
ファイル: lrutils.c プロジェクト: insilico/randomjungle
/* Write dv to f: first the element count on its own line, then one
   element per line with 16 digits after the decimal point. */
void dyv_write( PFILE *f, const dyv *dv)
{
  const int count = dyv_size( dv);
  int k;

  pfprintf( f, "%d\n", count);
  for (k = 0; k < count; ++k) {
    pfprintf( f, "%.16f\n", dyv_ref(dv,k));
  }
}
コード例 #11
0
ファイル: amdyv.c プロジェクト: insilico/randomjungle
/* Return the product of the elements of dv, multiplied together from
   index 0 upward.  The result is 1.0 when dv has no elements. */
double dyv_product(const dyv *dv)
{
  double acc = 1.0;
  int k = 0;
  while ( k < dyv_size(dv) )
  {
    acc *= dyv_ref(dv,k);
    k++;
  }
  return acc;
}
コード例 #12
0
ファイル: amdyv.c プロジェクト: insilico/randomjungle
/* Returns 1 when every element of dv compares equal to 0.0 or to 1.0,
   and 0 as soon as an element that is neither is found.  Exact
   comparison is fine here: 0.0 and 1.0 are perfectly representable in
   IEEE 754. */
int dyv_is_binary( const dyv *dv)
{
  const int count = dyv_size( dv);
  int k;

  for (k = 0; k < count; ++k) {
    const double x = dyv_ref( dv, k);
    if (!(x == 0.0 || x == 1.0)) return 0;
  }
  return 1;
}
コード例 #13
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/* u -> w: for every row, set w[row] = u[row] * (1 - u[row]). */
void lr_train_update_w( lr_train *lrt)
{
  int row = 0;

  while (row < lrt->numrows) {
    const double mu = dyv_ref( lrt_u_ref(lrt), row);
    const double weight = mu * (1-mu);
    dyv_set( lrt_w_ref(lrt), row, weight);
    ++row;
  }
}
コード例 #14
0
ファイル: main.c プロジェクト: yesyestian/BNB_Globlinear
/*
 * Render a surface graph of gauss_height_fn to "gauss.ps".
 *
 * A gauss_info is filled with mu, the inverse of cov and the
 * determinant of cov, and passed to mk_surgraph_from_2d_function() as
 * the function's opaque argument, with a 30 x 30 resolution, axis
 * labels "x1"/"x2" and the first two elements of lo and hi as ranges.
 * The title lists mu[0..1] and the four entries of cov[0..1][0..1].
 * The inverted matrix and the surgraph are freed before returning;
 * mu and cov themselves are not.
 */
void draw_2d_gaussian(dyv *mu,dym *cov,dyv *lo,dyv *hi)
{
  gauss_info info;
  gauss_info *params = &info;
  char title[1000];
  surgraph *plot;

  params->mu = mu;
  params->cov_inv = mk_invert_dym(cov);
  params->cov_determinant = dym_determinant(cov);

  sprintf(title,"mu = (%g,%g), cov=((%g,%g),(%g,%g))",
          dyv_ref(mu,0),dyv_ref(mu,1),
          dym_ref(cov,0,0),dym_ref(cov,0,1),
          dym_ref(cov,1,0),dym_ref(cov,1,1));

  plot = mk_surgraph_from_2d_function(gauss_height_fn,(char *)params,
               30,30,title,"x1","x2",dyv_ref(lo,0),dyv_ref(lo,1),
               dyv_ref(hi,0),dyv_ref(hi,1));

  ag_on("gauss.ps");
  render_surgraph(plot);
  ag_off();

  free_dym(params->cov_inv);
  free_surgraph(plot);
}
コード例 #15
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * deviance = -2*sum(yi*ln(yi/ui) + (1-yi)ln((1-yi)/(1-ui))), evaluated
 * conditionally for binary yi: a row with yi != 0 contributes log(ui),
 * any other row contributes log(1-ui).
 *
 * If the running sum drops below -FLT_MAX, summation stops and FLT_MAX
 * is returned.  Otherwise the result is -2 times the sum.
 */
double lr_deviance_basic( dyv *y, dyv *u)
{
  double loglik = 0.0;
  int row;

  for (row = 0; row < dyv_size(y); ++row) {
    const double target = dyv_ref( y, row);
    const double mu = dyv_ref( u, row);

    if (target != 0) {
      loglik += log(mu);
    } else {
      loglik += log(1-mu);
    }

    /* Overflow guard. */
    if (loglik < -FLT_MAX) return FLT_MAX;
  }

  return -2*loglik;
}
コード例 #16
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Log likelihood L(b) = Sum( yi*ln(ui) + (1-yi)*ln(1-ui) ) over the
 * rows of y.
 *
 * Note that this falls apart if u == 0.0 or u == 1.0, which it should
 * never be for the logit.
 */
double lr_log_likelihood_basic( dyv *y, dyv *u)
{
  const int nrows = dyv_size( y);
  double total = 0.0;
  int k;

  for (k = 0; k < nrows; ++k) {
    const double mu = dyv_ref( u, k);
    const double target = dyv_ref( y, k);
    const double term = target*log(mu) + (1.0 - target)*log(1.0 - mu);
    total += term;
  }

  return total;
}
コード例 #17
0
ファイル: stats.c プロジェクト: yesyestian/BNB_Globlinear
/*
 * Evaluate the tabulated cdf at x.
 *
 * Returns 0.0 for x <= it->xlo and 1.0 for x >= it->xhi.  In between,
 * get_index_and_fraction() locates x within the table it->integral and
 * the result is two_interpolate() applied to the table entries at that
 * index and the following one.
 */
double integ_cdf(integ *it,double x)
{
  int cell;
  double frac;

  if ( x <= it->xlo )
    return(0.0);
  if ( x >= it->xhi )
    return(1.0);

  get_index_and_fraction(x,it->xlo,it->xhi,
                         dyv_size(it->integral),&cell,&frac);

  return(two_interpolate(dyv_ref(it->integral,cell),
                         dyv_ref(it->integral,cell+1),
                         frac));
}
コード例 #18
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * n -> u: for every element of n, store the logistic function
 *
 *     u[i] = exp(n[i]) / (1 + exp(n[i]))
 *
 * into u.  u is written at indices 0 .. dyv_size(n)-1.
 *
 * The formula is evaluated in whichever of its two algebraically equal
 * forms keeps the argument of exp() non-positive.  Evaluating
 * exp(n[i]) directly overflows to infinity for large positive n[i], and
 * inf / (1 + inf) is NaN rather than the intended value of 1.
 */
void lr_compute_u_from_n( dyv *n, dyv *u)
{
  int numrows, i;
  double en, val, ni;
  numrows = dyv_size( n);

  for (i=0; i < numrows; ++i) {
    ni = dyv_ref( n, i);
    if (ni > 0.0) {
      /* exp(-ni) is in (0,1): cannot overflow. */
      en = exp(-ni);
      val = 1.0 / (1.0 + en);
    }
    else {
      /* exp(ni) is in (0,1] here; a NaN input also lands in this branch
         and still propagates as NaN. */
      en = exp(ni);
      val = en / (1.0 + en);
    }
    dyv_set( u, i, val);
  }

  return;
}
コード例 #19
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * For every row of M, store into n the value
 *
 *     n[row] = Sum_j( M[row][j] * b[j] ) + b0
 *
 * where j runs over the dyv_size(b) elements of b.
 */
void lr_compute_n_from_dym( const dym *M, double b0, dyv *b, dyv *n)
{
  const int nrows = dym_rows( M);
  const int ncoef = dyv_size(b);
  int r, c;

  for (r = 0; r < nrows; ++r) {
    double dot = 0.0;
    for (c = 0; c < ncoef; ++c) {
      dot += dym_ref( M, r, c) * dyv_ref( b, c);
    }
    dot += b0;
    dyv_set( n, r, dot);
  }
}
コード例 #20
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Diagonal preconditioner callback; userdata is the lr_train.
 *
 *   Get diagonal     ( [1t]         )
 *                diag( [--] W [1|X] )  = [ m_ii = Sum(x_ki^2 * w_k over k) ]
 *                    ( [Xt]         )
 *
 * In the sparse case, X is binary and x_ki^2 == x_ki, so the diagonal
 * is [ m_ii = Sum(w_k over posrows_i) ].  The preconditioning matrix is
 * that diagonal matrix; applying its inverse to v is an element-wise
 * division:
 *
 *   result[0] = v[0] / Sum(w)
 *   result[i] = v[i] / Sum(w over posrows of attribute i-1),  i >= 1
 *
 * Only the sparse case is handled: a NULL lrt->X is reported through
 * my_error().
 */
void diag_precond( const dyv *v, dyv *result, void *userdata)
{
  lr_train *lrt = (lr_train *) userdata;
  dyv *weights;
  double numer, denom;
  int att;

  if (lrt->X == NULL) {
    my_error( "diag_precond: dense problems not yet supported.");
  }

  weights = lrt_w_ref( lrt);

  /* Constant term. */
  numer = dyv_ref( v, 0);
  dyv_set( result, 0, numer / dyv_sum( weights));

  /* One entry per attribute; entry att belongs to attribute att-1. */
  for (att = 1; att < lrt->numatts; ++att) {
    ivec *rows = spardat_attnum_to_posrows( lrt->X, att-1);
    denom = dyv_partial_sum( weights, rows);
    numer = dyv_ref( v, att);
    dyv_set( result, att, numer / denom);
  }
}
コード例 #21
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Write lrp to f.  b0 followed by the elements of b are packed into one
 * temporary dyv, which is written with dyv_write() and then freed.
 */
void out_lr_predict( PFILE *f, lr_predict *lrp)
{
  const int total = dyv_size( lrp->b) + 1;
  dyv *packed = mk_dyv( total);
  int k;

  dyv_set( packed, 0, lrp->b0);
  for (k = 1; k < total; ++k) {
    dyv_set( packed, k, dyv_ref( lrp->b, k-1));
  }

  dyv_write( f, packed);

  free_dyv( packed);
}
コード例 #22
0
ファイル: lr.c プロジェクト: dineshmdh/randomjungle
/*
 * Train a logistic regression model and return the resulting lr_train.
 *
 * X selects the data source: when it is non-NULL the lr_train is built
 * from the spardat X; otherwise it is built from factors and outputs.
 * If initb is non-NULL it is applied with lr_train_overwrite_b() before
 * iterating.
 *
 * Up to opts->lrmax calls to lr_train_iterate() are made.  After each
 * one the current state is recorded in a state array and the deviance
 * in a history vector (pre-filled with FLT_MAX).  Iteration stops early
 * when lr_deviance_test() reports convergence, when the iterate call
 * returns -2, or when the deviance is NaN.
 *
 * Afterwards lrt->lrs is replaced by a copy of the recorded state with
 * the smallest deviance in the history, and its converged flag is set
 * when the last convergence test succeeded.  Progress is printed to
 * stdout according to the global Verbosity.
 */
lr_train *mk_lr_train( spardat *X, dym *factors, dyv *outputs,
                       dyv *initb, lr_options *opts)
{
  /* initb is copied into lr->b. */
  /* converge starts at 0: it is read after the loop, and the loop body
     (its only other assignment) never runs when opts->lrmax <= 0. */
  int converge = 0, rc;
  int numiters, bestiter;
  double dev, olddev;
  dyv *devhist;
  lr_train *lrt;
  lr_state *bestlrs;
  lr_statearr *lrsarr;

  /* Create lr_train struct. */
  if (X != NULL) lrt = mk_lr_train_from_spardat( X, opts);
  else lrt = mk_lr_train_from_dym( factors, outputs, opts);

  /* Set initial value of model parameters, if desired. */
  if (initb != NULL) lr_train_overwrite_b( lrt, initb);

  /* Initialize our loop state */
  dev = -1000.0;
  lrsarr = mk_array_of_null_lr_states( opts->lrmax);
  devhist = mk_constant_dyv( opts->lrmax, FLT_MAX);

  /* START OF IRLS ITERATIONS */
  /* Iterate until the change in deviance is relatively small. */
  for (numiters=0; numiters < opts->lrmax; ++numiters) {

    /* Update olddev and iterate. */
    olddev = dev;
    rc = lr_train_iterate(lrt);

    /* Record this iteration's state and deviance, and test for
       convergence. */
    lr_statearr_set( lrsarr, numiters, lrt->lrs);
    converge = lr_deviance_test( lrt, opts->lreps, olddev, &dev);
    dyv_set( devhist, numiters, dev);

    /* Print stuff. */
    if (Verbosity >= 1) printf( ".");
    if (Verbosity >= 3) {
      printf( "LR ITER %d: likesat: %g, likelihood: %g, deviance: %g\n",
              numiters, lrt->likesat,
              lr_log_likelihood_from_deviance( dev, lrt->likesat), dev);
    }
    if (Verbosity >= 5) {
      /* Print the current parameters. */
      printf( "  Params, b0: %g\n", lrt->lrs->b0);
      fprintf_oneline_dyv( stdout, "  Params, b:", lrt->lrs->b, "\n");
    }

    if (converge) break;
    else if (rc == -2) break; /* Exceeded cgmax. */
    else if (am_isnan(dev)) break;
  }
  /* END OF ITERATIONS */

  /* Pick the recorded state with the smallest deviance in the history
     and make a copy of it the current state. */
  bestiter = dyv_argmin( devhist);
  bestlrs  = lr_statearr_ref( lrsarr, bestiter);
  free_lr_state( lrt->lrs);
  lrt->lrs = mk_copy_lr_state( bestlrs);
  if (converge) lrt->lrs->converged = converge;
  if (Verbosity == 1) printf( "\n");
  if (Verbosity >= 2) {
    printf( "CHOOSING ITERATION %d WITH DEVIANCE %g\n",
            bestiter, dyv_ref( devhist, bestiter));
  }
  if (Verbosity >= 2) {
    fprintf_oneline_dyv( stdout, "  devhist:", devhist, "\n");
  }

  /* Free state history. */
  free_lr_statearr( lrsarr);
  free_dyv( devhist);

  /* Done. */
  return lrt;
}