void test_content() { rng_type * rng = rng_alloc(MZRAN , INIT_DEFAULT); matrix_type * PC = matrix_alloc( 3 , 10); matrix_type * PC_obs = matrix_alloc( 3 , 1 ); double_vector_type * singular_values = double_vector_alloc(3 , 1); matrix_random_init( PC , rng ); matrix_random_init( PC_obs , rng ); { pca_plot_data_type * data = pca_plot_data_alloc("KEY" , PC , PC_obs, singular_values); for (int i=0; i < matrix_get_rows( PC ); i++) { const pca_plot_vector_type * vector = pca_plot_data_iget_vector( data , i ); test_assert_double_equal( matrix_iget( PC_obs , i , 0) , pca_plot_vector_get_obs_value( vector ) ); test_assert_double_equal( double_vector_iget( singular_values , i), pca_plot_vector_get_singular_value( vector ) ); for (int j=0; j < matrix_get_columns( PC ); j++) test_assert_double_equal( matrix_iget( PC , i , j ) , pca_plot_vector_iget_sim_value( vector , j )); test_assert_int_equal( matrix_get_columns( PC ) , pca_plot_vector_get_size( vector )); } pca_plot_data_free( data ); } double_vector_free( singular_values ); matrix_free( PC ); matrix_free( PC_obs ); }
void matrix_fprintf( const matrix_type * matrix , const char * fmt , FILE * stream ) { int i,j; for (i=0; i < matrix->rows; i++) { for (j=0; j < matrix->columns; j++) fprintf(stream , fmt , matrix_iget( matrix , i , j)); fprintf(stream , "\n"); } }
void matrix_pretty_fprint_submat(const matrix_type * matrix , const char * name , const char * fmt , FILE * stream, int m, int M, int n, int N) { int i,j; if (m<0 || m>M || M >= matrix->rows || n<0 || n>N || N >= matrix->columns) util_abort("%s: matrix:%s not compatible with print subdimensions. \n",__func__ , matrix->name); fprintf(stream , "%s =" , name); for (i=m; i < M; i++) { fprintf(stream , " ["); for (j=n; j < N; j++) fprintf(stream , fmt , matrix_iget(matrix , i,j)); fprintf(stream , "]\n"); } }
void matrix_pretty_fprint(const matrix_type * matrix , const char * name , const char * fmt , FILE * stream) { int i,j; for (i=0; i < matrix->rows; i++) { if (i == (matrix->rows / 2)) fprintf(stream , "%s =" , name); else { int l; for (l = 0; l < strlen(name) + 2; l++) fprintf(stream , " "); } fprintf(stream , " ["); for (j=0; j < matrix->columns; j++) fprintf(stream , fmt , matrix_iget(matrix , i,j)); fprintf(stream , "]\n"); } }
double matrix_det( matrix_type *A ) { matrix_lapack_assert_square( A ); { int dgetrf_info; double det = 1; double det_scale = 0; int n = matrix_get_columns( A ); int * ipiv = util_malloc( n * sizeof * ipiv ); matrix_dgetrf__( A , ipiv , &dgetrf_info ); { int i; for (i=0; i < n; i++) { det *= matrix_iget(A , i , i); if (det == 0) return 0; /* Holy f**k - a float == comparison ?? */ if (ipiv[i] != (i + 1)) /* A permutation has taken place. */ det *= -1; /* Try to avoid overflow/underflow by factoring out the order of magnitude. */ while (fabs(det) > 10.0) { det /= 10; det_scale += 1; } while (fabs(det) < 1.0) { det *= 10; det_scale -= 1; } } } free( ipiv ); return det * pow(10 , det_scale ); } }
double matrix_iget_safe(const matrix_type * matrix , int i , int j) { matrix_assert_ij( matrix , i , j ); return matrix_iget( matrix , i , j ); }
double stepwise_iget_beta(const stepwise_type * stepwise, const int index ) { return matrix_iget( stepwise->beta, index, 0); }
static double stepwise_estimate__( stepwise_type * stepwise , bool_vector_type * active_rows) { matrix_type * X; matrix_type * E; matrix_type * Y; double y_mean = 0; int nvar = matrix_get_columns( stepwise->X0 ); int nsample = matrix_get_rows( stepwise->X0 ); nsample = bool_vector_count_equal( active_rows , true ); nvar = bool_vector_count_equal( stepwise->active_set , true ); matrix_set( stepwise->beta , 0 ); // It is essential to make sure that old finite values in the beta0 vector do not hang around. /* Extracting the data used for regression, and storing them in the temporary local matrices X and Y. Selecting data is based both on which varibles are active (stepwise->active_set) and which rows should be used for regression, versus which should be used for validation (@active_rows). */ if ((nsample < matrix_get_rows( stepwise->X0 )) || (nvar < matrix_get_columns( stepwise->X0 ))) { X = matrix_alloc( nsample , nvar ); E = matrix_alloc( nsample , nvar ); Y = matrix_alloc( nsample , 1); { int icol,irow; // Running over all values. int arow,acol; // Running over active values. arow = 0; for (irow = 0; irow < matrix_get_rows( stepwise->X0 ); irow++) { if (bool_vector_iget( active_rows , irow )) { acol = 0; for (icol = 0; icol < matrix_get_columns( stepwise->X0 ); icol++) { if (bool_vector_iget( stepwise->active_set , icol )) { matrix_iset( X , arow , acol , matrix_iget( stepwise->X0 , irow , icol )); matrix_iset( E , arow , acol , matrix_iget( stepwise->E0 , irow , icol )); acol++; } } matrix_iset( Y , arow , 0 , matrix_iget( stepwise->Y0 , irow , 0 )); arow++; } } } } else { X = matrix_alloc_copy( stepwise->X0 ); E = matrix_alloc_copy( stepwise->E0 ); Y = matrix_alloc_copy( stepwise->Y0 ); } { if (stepwise->X_mean != NULL) matrix_free( stepwise->X_mean); stepwise->X_mean = matrix_alloc( 1 , nvar ); if (stepwise->X_norm != NULL) matrix_free( stepwise->X_norm); stepwise->X_norm = matrix_alloc( 1 , nvar ); matrix_type * beta = matrix_alloc( nvar , 1); /* This is the beta vector as estimated from the OLS estimator. */ regression_augmented_OLS( X , Y , E, beta ); /* In this code block the beta/tmp_beta vector which is dense with fewer elements than the full model is scattered into the beta0 vector which has full size and @nvar elements. */ { int ivar,avar; avar = 0; for (ivar = 0; ivar < matrix_get_columns( stepwise->X0 ); ivar++) { if (bool_vector_iget( stepwise->active_set , ivar )) { matrix_iset( stepwise->beta , ivar , 0 , matrix_iget( beta , avar , 0)); avar++; } } } matrix_free( beta ); } matrix_free( X ); matrix_free( E ); matrix_free( Y ); return y_mean; }
static double stepwise_test_var( stepwise_type * stepwise , int test_var , int blocks) { double prediction_error = 0; bool_vector_iset( stepwise->active_set , test_var , true ); // Temporarily activate this variable { int nvar = matrix_get_columns( stepwise->X0 ); int nsample = matrix_get_rows( stepwise->X0 ); int block_size = nsample / blocks; bool_vector_type * active_rows = bool_vector_alloc( nsample, true ); /*True Cross-Validation: */ int * randperms = util_calloc( nsample , sizeof * randperms ); for (int i=0; i < nsample; i++) randperms[i] = i; /* Randomly perturb ensemble indices */ rng_shuffle_int( stepwise->rng , randperms , nsample ); for (int iblock = 0; iblock < blocks; iblock++) { int validation_start = iblock * block_size; int validation_end = validation_start + block_size - 1; if (iblock == (blocks - 1)) validation_end = nsample - 1; /* Ensure that the active_rows vector has a block consisting of the interval [validation_start : validation_end] which is set to false, and the remaining part of the vector is set to true. */ { bool_vector_set_all(active_rows, true); /* If blocks == 1 that means all datapoint are used in the regression, and then subsequently reused in the R2 calculation. */ if (blocks > 1) { for (int i = validation_start; i <= validation_end; i++) { bool_vector_iset( active_rows , randperms[i] , false ); } } } /* Evaluate the prediction error on the validation part of the dataset. */ { stepwise_estimate__( stepwise , active_rows ); { int irow; matrix_type * x_vector = matrix_alloc( 1 , nvar ); //matrix_type * e_vector = matrix_alloc( 1 , nvar ); for (irow=validation_start; irow <= validation_end; irow++) { matrix_copy_row( x_vector , stepwise->X0 , 0 , randperms[irow]); //matrix_copy_row( e_vector , stepwise->E0 , 0 , randperms[irow]); { double true_value = matrix_iget( stepwise->Y0 , randperms[irow] , 0 ); double estimated_value = stepwise_eval__( stepwise , x_vector ); prediction_error += (true_value - estimated_value) * (true_value - estimated_value); //double e_estimated_value = stepwise_eval__( stepwise , e_vector ); //prediction_error += e_estimated_value*e_estimated_value; } } matrix_free( x_vector ); } } } free( randperms ); bool_vector_free( active_rows ); } /*inactivate the test_var-variable after completion*/ bool_vector_iset( stepwise->active_set , test_var , false ); return prediction_error; }
void lars_estimate(lars_type * lars , int max_vars , double max_beta , bool verbose) { int nvars = matrix_get_columns( lars->X ); int nsample = matrix_get_rows( lars->X ); matrix_type * X = matrix_alloc( nsample, nvars ); // Allocate local X and Y variables matrix_type * Y = matrix_alloc( nsample, 1 ); // which will hold the normalized data lars_estimate_init( lars , X , Y); // during the estimation process. { matrix_type * G = matrix_alloc_gram( X , true ); matrix_type * mu = matrix_alloc( nsample , 1 ); matrix_type * C = matrix_alloc( nvars , 1 ); matrix_type * Y_mu = matrix_alloc_copy( Y ); int_vector_type * active_set = int_vector_alloc(0,0); int_vector_type * inactive_set = int_vector_alloc(0,0); int active_size; if ((max_vars <= 0) || (max_vars > nvars)) max_vars = nvars; { int i; for (i=0; i < nvars; i++) int_vector_iset( inactive_set , i , i ); } matrix_set( mu , 0 ); while (true) { double maxC = 0; /* The first step is to calculate the correlations between the covariates, and the current residual. All the currently inactive covariates are searched; the covariate with the greatest correlation with (Y - mu) is selected and added to the active set. */ matrix_sub( Y_mu , Y , mu ); // Y_mu = Y - mu matrix_dgemm( C , X , Y_mu , true , false , 1.0 , 0); // C = X' * Y_mu { int i; int max_set_index = 0; for (i=0; i < int_vector_size( inactive_set ); i++) { int set_index = i; int var_index = int_vector_iget( inactive_set , set_index ); double value = fabs( matrix_iget(C , var_index , 0) ); if (value > maxC) { maxC = value; max_set_index = set_index; } } /* Remove element corresponding to max_set_index from the inactive set and add it to the active set: */ int_vector_append( active_set , int_vector_idel( inactive_set , max_set_index )); } active_size = int_vector_size( active_set ); /* Now we have calculated the correlations between all the covariates and the current residual @Y_mu. The correlations are stored in the matrix @C. The value of the maximum correlation is stored in @maxC. Based on the value of @maxC we have added one new covariate to the model, technically by moving the index from @inactive_set to @active_set. */ /*****************************************************************/ { matrix_type * weights = matrix_alloc( active_size , 1); double scale; /*****************************************************************/ /* This scope should compute and initialize the variables @weights and @scale. */ { matrix_type * subG = matrix_alloc( active_size , active_size ); matrix_type * STS = matrix_alloc( active_size , active_size ); matrix_type * sign_vector = matrix_alloc( active_size , 1); int i , j; /* STS = S' o S where 'o' is the Schur product and S is given by: [ s1 s2 s3 s4 ] S = [ s1 s2 s3 s4 ] [ s1 s2 s3 s4 ] [ s1 s2 s3 s4 ] Where si is the sign of the correlation between (active) variable 'i' and Y_mu. */ for (i=0; i < active_size ; i++) { int vari = int_vector_iget( active_set , i ); double signi = sgn( matrix_iget( C , vari , 0)); matrix_iset( sign_vector , i , 0 , signi ); for (j=0; j < active_size; j++) { int varj = int_vector_iget( active_set , j ); double signj = sgn( matrix_iget( C , varj , 0)); matrix_iset( STS , i , j , signi * signj ); } } // Extract the elements from G corresponding to active indices and // copy to the matrix subG: for (i=0; i < active_size ; i++) { int ii = int_vector_iget( active_set , i ); for (j=0; j < active_size; j++) { int jj = int_vector_iget( active_set , j ); matrix_iset( subG , i , j , matrix_iget(G , ii , jj)); } } // Weights matrix_inplace_mul( subG , STS ); matrix_inv( subG ); { matrix_type * ones = matrix_alloc( active_size , 1 ); matrix_type * GA1 = matrix_alloc( active_size , 1 ); matrix_set( ones , 1.0 ); matrix_matmul( GA1 , subG , ones ); scale = 1.0 / sqrt( matrix_get_column_sum( GA1 , 0 )); matrix_mul( weights , GA1 , sign_vector ); matrix_scale( weights , scale ); matrix_free( GA1 ); matrix_free( ones ); } matrix_free( sign_vector ); matrix_free( subG ); matrix_free( STS ); } /******************************************************************/ /* The variables weight and scale have been calculated, proceed to calculate the step length @gamma. */ { int i; double gamma; { matrix_type * u = matrix_alloc( nsample , 1 ); int j; for (i=0; i < nsample; i++) { double row_sum = 0; for (j =0; j < active_size; j++) row_sum += matrix_iget( X , i , int_vector_iget( active_set , j)) * matrix_iget(weights , j , 0 ); matrix_iset( u , i , 0 , row_sum ); } gamma = maxC / scale; if (active_size < matrix_get_columns( X )) { matrix_type * equi_corr = matrix_alloc( nvars , 1 ); matrix_dgemm( equi_corr , X , u , true , false , 1.0 , 0); // equi_corr = X'·u for (i=0; i < (nvars - active_size); i++) { int var_index = int_vector_iget( inactive_set , i ); double gamma1 = (maxC - matrix_iget(C , var_index , 0 )) / ( scale - matrix_iget( equi_corr , var_index , 0)); double gamma2 = (maxC + matrix_iget(C , var_index , 0 )) / ( scale + matrix_iget( equi_corr , var_index , 0)); if ((gamma1 > 0) && (gamma1 < gamma)) gamma = gamma1; if ((gamma2 > 0) && (gamma2 < gamma)) gamma = gamma2; } matrix_free( equi_corr ); } /* Update the current estimated 'location' mu. */ matrix_scale( u , gamma ); matrix_inplace_add( mu , u ); matrix_free( u ); } /* We have calculated the step length @gamma, and the @weights. Update the @beta matrix. */ for (i=0; i < active_size; i++) matrix_iset( lars->beta , int_vector_iget( active_set , i ) , active_size - 1 , gamma * matrix_iget( weights , i , 0)); if (active_size > 1) for (i=0; i < nvars; i++) matrix_iadd( lars->beta , i , active_size - 1 , matrix_iget( lars->beta , i , active_size - 2)); matrix_free( weights ); } } if (active_size == max_vars) break; if (max_beta > 0) { double beta_norm2 = matrix_get_column_abssum( lars->beta , active_size - 1 ); if (beta_norm2 > max_beta) { // We stop - we will use an interpolation between this beta estimate and // the previous, to ensure that the |beta| = max_beta criteria is satisfied. if (active_size >= 2) { double beta_norm1 = matrix_get_column_abssum( lars->beta , active_size - 2 ); double s = (max_beta - beta_norm1)/(beta_norm2 - beta_norm1); { int j; for (j=0; j < nvars; j++) { double beta1 = matrix_iget( lars->beta , j , active_size - 2 ); double beta2 = matrix_iget( lars->beta , j , active_size - 1 ); matrix_iset( lars->beta , j , active_size - 1 , beta1 + s*(beta2 - beta1)); } } } break; } } } matrix_free( G ); matrix_free( mu ); matrix_free( C ); matrix_free( Y_mu ); int_vector_free( active_set ); int_vector_free( inactive_set ); matrix_resize( lars->beta , nvars , active_size , true ); if (verbose) matrix_pretty_fprint( lars->beta , "beta" , "%12.5f" , stdout ); lars_select_beta( lars , active_size - 1); } matrix_free( X ); matrix_free( Y ); }
double lars_iget_beta( const lars_type * lars , int index) { return matrix_iget( lars->beta0 , index , 0 ); }