/* PRE: size of actual_dist is same as size of hypothesized_dist.
        Any entry in which hypothesized_dist has a value of zero must
        have an actual_dist value of zero
        (i.e. forall i, hy_dist[i]==0 => ac_dist[i]==0).

   Given two distributions represented as histograms (actual_dist and
   hypothesized_dist), how much evidence is there that they are from
   the same distribution?

   Note that these things must be counts. Each element of actual_dist
   must be an integer. Each element of hypothesized_dist may be
   non-integer because we're talking expected counts there.

   The prob returned by this function answers that question using a
   standard chi-squared test. If it is low (e.g. < 0.05), then it is
   unlikely that they are the same.

   The "dof" parameter is the "Degrees Of Freedom" of the test.
   If it is possible for any entry in the dist to take any value, then
   set dof==size. If the sum of values is constrained to a certain value
   then set dof==size-1. If there are more constraints than that, then
   subtract more from size.

   Entries whose hypothesized count is zero are removed before the test
   is run. Each removed entry shrinks the effective size by one, so dof
   is reduced by one per removed entry (and afterwards floored at 2).

   The result is initialised to -1.0 and is returned unchanged only if
   my_error() returns after reporting a negative hypothesized count.
*/
double chi_squared_prob(dyv *actual_dist,dyv *hypothesized_dist,int dof)
{
  double result = -1.0;
  double min_hyp_dist = dyv_min(hypothesized_dist);

  if ( min_hyp_dist < 0.0 )
    my_error("chi_squared_prob: -ve count in hypothesized_dist");
  else if ( min_hyp_dist > 0.0 )
  {
    /* No empty cells: the inputs can be tested as they are. */
    result = chi_squared_prob_helper(actual_dist,hypothesized_dist,dof);
  }
  else
  {
    /* At least one hypothesized count is exactly zero. Build filtered
       copies that contain only the cells with a positive expectation. */
    dyv *copy_ad = mk_dyv(0);
    dyv *copy_hd = mk_dyv(0);
    int i;

    for ( i = 0 ; i < dyv_size(actual_dist) ; i++ )
    {
      if ( dyv_ref(hypothesized_dist,i) > 0.0 )
      {
        add_to_dyv(copy_ad,dyv_ref(actual_dist,i));
        add_to_dyv(copy_hd,dyv_ref(hypothesized_dist,i));
      }
      else
      {
        if ( dyv_ref(actual_dist,i) > 0.0 )
          my_error("chi_squared_prob: actual_dist value must be zero if hyp dist value is zero");

        /* This cell is dropped, so it no longer contributes a degree
           of freedom. (Previously dof was decremented for every KEPT
           cell, which drove dof to the floor below in nearly all
           cases.) */
        dof -= 1;
      }
    }

    /* NOTE(review): the floor of 2 is inherited unchanged; its
       rationale is not visible in this function. */
    dof = int_max(2,dof);
    result = chi_squared_prob_helper(copy_ad,copy_hd,dof);
    free_dyv(copy_ad);
    free_dyv(copy_hd);
  }

  return result;
}
/* Computes the deviance for the parameter vector currently held by the
   conjugate-gradient object cg.

   Element 0 of the vector returned by conjgrad_x_ref(cg) is used as b0
   and the remaining elements as b. The data source is lrt->X when it is
   non-NULL, otherwise lrt->M. The result is
   lr_deviance_from_log_likelihood() applied to the log-likelihood of
   lrt->y and lrt->likesat.

   All temporaries are freed before returning; cg and lrt are not
   modified by this function itself. */
double lr_deviance_from_cg( lr_train *lrt, conjgrad *cg)
{
  int nrows = lrt->numrows;
  double intercept, loglike;
  dyv *coefs, *nvec, *uvec;

  /* Work on a private copy so that removing element 0 does not disturb
     the vector owned by cg. */
  coefs = mk_copy_dyv( conjgrad_x_ref( cg));
  intercept = dyv_ref( coefs, 0);
  dyv_remove( coefs, 0);

  nvec = mk_dyv( nrows);
  uvec = mk_dyv( nrows);

  /* n comes from whichever data representation is present. */
  if (lrt->X == NULL) {
    lr_compute_n_from_dym( lrt->M, intercept, coefs, nvec);
  }
  else {
    lr_compute_n_from_spardat( lrt->X, intercept, coefs, nvec);
  }
  free_dyv( coefs);

  /* u is derived from n; n is not needed afterwards. */
  lr_compute_u_from_n( nvec, uvec);
  free_dyv( nvec);

  loglike = lr_log_likelihood_basic( lrt->y, uvec);
  free_dyv( uvec);

  return lr_deviance_from_log_likelihood( loglike, lrt->likesat);
}
/* Reads a parameter vector from f with mk_dyv_read() and builds an
   lr_predict from it: element 0 becomes lrp->b0 and elements
   1..size-1 are copied, in order, into a freshly made lrp->b.

   The vector must contain at least one element (b0). An empty vector is
   reported through my_error() instead of being indexed out of range and
   used to request a dyv of size -1.

   Only the b0 and b fields are assigned here.
   NOTE(review): if lr_predict has further fields they are left as
   AM_MALLOC returned them -- confirm against the struct definition.

   The caller owns the returned lr_predict. */
lr_predict *mk_in_lr_predict( PFILE *f)
{
  int i, size;
  double val;
  dyv *dv, *b;
  lr_predict *lrp;

  dv = mk_dyv_read( f);
  size = dyv_size( dv);
  if (size < 1) {
    my_error( "mk_in_lr_predict: parameter vector is empty, expected at least b0");
  }

  lrp = AM_MALLOC( lr_predict);
  lrp->b0 = dyv_ref( dv, 0);

  /* Shift the remaining entries down by one into b. */
  b = mk_dyv( size-1);
  for (i=1; i<size; ++i) {
    val = dyv_ref( dv, i);
    dyv_set( b, i-1, val);
  }
  lrp->b = b;

  free_dyv( dv);
  return lrp;
}
/* Deviance of the parameters (b0, b) on the dense data M with outcomes y.

   Makes two scratch vectors with one entry per row of M, fills the
   first via lr_compute_n_from_dym(), derives the second from it via
   lr_compute_u_from_n(), and returns lr_deviance_basic(y, u).
   Both scratch vectors are freed before returning. */
double lr_deviance_from_dym_b( const dym *M, dyv *y, double b0, dyv *b)
{
  int nrows = dym_rows( M);
  dyv *nvec = mk_dyv( nrows);
  dyv *uvec = mk_dyv( nrows);
  double deviance;

  lr_compute_n_from_dym( M, b0, b, nvec);
  lr_compute_u_from_n( nvec, uvec);
  deviance = lr_deviance_basic( y, uvec);

  free_dyv( nvec);
  free_dyv( uvec);
  return deviance;
}
/* Deviance of the parameters (b0, b) on the sparse data X with outcomes y.

   Makes two scratch vectors with one entry per row of X, fills the
   first via lr_compute_n_from_spardat(), derives the second from it via
   lr_compute_u_from_n(), and returns lr_deviance_basic(y, u).
   Both scratch vectors are freed before returning. */
double lr_deviance_from_spardat_b( const spardat *X, dyv *y, double b0, dyv *b)
{
  int nrows = spardat_num_rows( X);
  dyv *nvec = mk_dyv( nrows);
  dyv *uvec = mk_dyv( nrows);
  double deviance;

  lr_compute_n_from_spardat( X, b0, b, nvec);
  lr_compute_u_from_n( nvec, uvec);
  deviance = lr_deviance_basic( y, uvec);

  free_dyv( nvec);
  free_dyv( uvec);
  return deviance;
}
/* X,w,z -> b

   Per the original author's note, this step computes

       b = ( [1|X]^T W [1|X] )^-1  [1|X]^T W z,    where W = diag(w),

   by delegating to mk_lr_update_b_conjugate_gradient_helper().

   When lrt->opts->cgbinit is set, the current (b0, b) is packed into a
   single vector of length numatts and handed to the helper as its
   starting point; otherwise NULL is passed.

   On return, element 0 of the helper's result has been stored as b0 and
   elements 1..numatts-1 have been stored into b.

   cgeps is passed through unscaled: the original note says scaling by
   the initial CG residual is now done elsewhere (inside
   mk_lr_cgresult()).

   Returns -2 if the iteration count reported by the helper exceeds
   lrt->opts->cgmax (treated as failure), 1 otherwise. */
int lr_train_update_b( lr_train *lrt)
{
  int natts = lrt->numatts;
  int k, iters;
  double eps = lrt->opts->cgeps;
  double deveps = lrt->opts->cgdeveps;
  double tmp;
  dyv *start = NULL;
  dyv *solution;

  /* Optional warm start: start = ( b0, b[0], ..., b[natts-2] ). */
  if (lrt->opts->cgbinit) {
    start = mk_dyv( natts);
    dyv_set( start, 0, lrt_b0_ref(lrt));
    for (k=0; k<natts-1; ++k) {
      tmp = dyv_ref( lrt_b_ref(lrt), k);
      dyv_set( start, k+1, tmp);
    }
  }

  solution = mk_lr_update_b_conjugate_gradient_helper( lrt, eps, deveps,
                                                       lrt->opts->cgmax,
                                                       &iters, start);
  if (start != NULL) free_dyv( start);

  /* Unpack the solution back into ( b0, b ). */
  lrt_b0_set(lrt, dyv_ref( solution, 0));
  for (k=0; k<natts-1; ++k) {
    tmp = dyv_ref( solution, k+1);
    dyv_set( lrt_b_ref(lrt), k, tmp);
  }
  free_dyv( solution);

  /* Exceeding cgmax is considered a failure. */
  return ( iters > lrt->opts->cgmax) ? -2 : 1;
}
integ *mk_integ( double (*h)(double parameter,double constant,double x), double xlo, double xhi, double parameter, double constant, int size ) /* Returns an it in which it->integral[i] = integal_from_xlo_to(x_lo + h*i) of h(parameter,x) dx ------------------------------------------------ integal_from_xlo_to_x_hi of h(parameter,x) dx */ { integ *it = AM_MALLOC(integ); dyv *dig = mk_dyv(size); int i; double sum = 0.0; double last_pdf = 0.0; double delta = (xhi - xlo) / (size-1); if ( h(parameter,constant,xhi) > 1e-6 ) my_error("Hmm... I was really hoping h(parameter,xhi) == 0"); dyv_set(dig,0,0.0); for ( i = 1 ; i < size ; i++ ) { double x = xlo + i * delta; double this_pdf = h(parameter,constant,x); if (i == 1) sum += delta * this_pdf; else sum += delta * (this_pdf + last_pdf) / 2.0; dyv_set(dig,i,sum); last_pdf = this_pdf; /* added 2/26/97 JGS */ } dyv_scalar_mult(dig,1.0 / sum,dig); it -> integral = dig; it -> xlo = xlo; it -> xhi = xhi; it -> parameter = parameter; it -> constant = constant; return(it); }
/* Reads a dyv from f. The expected layout is one number per line: the
   first line holds the element count, followed by that many lines each
   holding one value.

   Lines are read with pfgets() into a 100-character buffer.

   Errors are reported through my_errorf():
     - end-of-file or a failed read before all expected lines were seen;
     - a size line that does not start with an integer, or whose value
       is negative or does not fit in an int;
     - a value line that does not start with a number.
   The last two were previously accepted silently (atoi/atof turn
   unparsable text into 0 / 0.0, and a negative size went straight to
   mk_dyv). As with atoi/atof, trailing text after a valid number is
   ignored.

   The caller owns the returned dyv. */
dyv *mk_dyv_read( PFILE *f)
{
  int i, size, lineno;
  long parsed_size;
  double val;
  char line[101];
  char *end;
  dyv *dv;

  lineno = 1;
  line[0] = '\0';   /* keeps the parse below well-defined if a read fails */
  line[100] = '\0';

  /* Read size and make dyv. */
  if (pfeof(f)) {
    my_errorf( "mk_dyv_read: unexpected end-of-file while reading size,\n"
               "after line %d of file", lineno);
  }
  if (pfgets( line, 100, f) == NULL) {
    my_errorf( "mk_dyv_read: failed to read line %d from the passed stream.",
               lineno);
  }
  else lineno++;

  parsed_size = strtol( line, &end, 10);
  if (end == line || parsed_size < 0
      || (long) (int) parsed_size != parsed_size) {
    my_errorf( "mk_dyv_read: invalid size on line %d of file: %s",
               lineno-1, line);
  }
  size = (int) parsed_size;
  dv = mk_dyv( size);

  /* Read values. */
  for (i=0; i<size; ++i) {
    if (pfeof(f)) {
      my_errorf( "mk_dyv_read: unexpected end-of-file while reading %d vals,\n"
                 "after line %d of file (after the %dth value)",
                 size, lineno, lineno-1);
    }
    line[0] = '\0';
    if (pfgets( line, 100, f) == NULL) {
      my_errorf( "mk_dyv_read: failed to read line %d from the passed stream.",
                 lineno);
    }
    else lineno++;

    val = strtod( line, &end);
    if (end == line) {
      my_errorf( "mk_dyv_read: could not parse a number on line %d of file: %s",
                 lineno-1, line);
    }
    dyv_set( dv, i, val);
  }

  return dv;
}
/* Builds a dyv of `size` elements from the variadic arguments, in order.

   Warning: the compiler cannot type-check the variadic arguments. Every
   value after `size` *must* be passed as a double (write 1.0, not 1),
   and exactly `size` of them must be supplied, otherwise va_arg reads
   garbage.

   The caller owns the returned dyv. */
dyv *mk_dyv_x( int size, ...)
{
  va_list ap;
  dyv *result;
  double next;
  int k;

  result = mk_dyv( size);

  va_start( ap, size);
  k = 0;
  while (k < size) {
    /* Fetch into a temporary so va_arg is evaluated exactly once per
       element regardless of how dyv_set is implemented. */
    next = va_arg( ap, double);
    dyv_set( result, k, next);
    k++;
  }
  va_end( ap);

  return result;
}
/* Writes the parameters of lrp to f as a single vector.

   A temporary dyv of length dyv_size(lrp->b)+1 is filled with lrp->b0
   in element 0 followed by the elements of lrp->b, written with
   dyv_write(), and then freed. lrp itself is not modified. */
void out_lr_predict( PFILE *f, lr_predict *lrp)
{
  int nb = dyv_size( lrp->b);
  int k;
  double coef;
  dyv *packed = mk_dyv( nb + 1);

  /* packed = ( b0, b[0], ..., b[nb-1] ) */
  dyv_set( packed, 0, lrp->b0);
  for (k=0; k<nb; ++k) {
    coef = dyv_ref( lrp->b, k);
    dyv_set( packed, k+1, coef);
  }

  dyv_write( f, packed);
  free_dyv( packed);
}