/**
 * Make sure the diagonal position of row k does not hold a zero entry.
 *
 * If the entry at (k, k) tests as zero according to iszero(), row k is
 * exchanged with the row index reported by find_nonzero(k). Otherwise the
 * matrix is left untouched.
 *
 * NOTE(review): find_nonzero() is defined elsewhere; presumably it yields a
 * row whose entry in column k is nonzero -- confirm, including what it
 * returns when no such row exists.
 *
 * @param k  index of the row/column whose diagonal entry is examined.
 */
void GaussJordan::find_pivot (const int k)
{
    // A diagonal entry that does not test as zero needs no row exchange.
    if (!iszero (get_entry (k, k)))
        return;

    swap_rows (k, find_nonzero (k));
}
/** * Having finished scanning a putatively tabular file, apply heuristics * to the column statistics to finally determine the statistical class * for each column (quantitative, categorical or (maybe) ordinal). * * The determination depends primarily on the presence or lack of unanimity * in the observed types (integer, float, string) in the column, then * devolves to consideration of the many other statistics in the column * struct. */ void analyze_column( struct column *c ) { const int OBSERVED_TYPE_COUNT = count_nonzero( c->type_vote+1 /* exclude FTY_EMPTY */, FTY_COUNT-1 ); c->stat_class = STC_UNK; // ...unless overridden below! if( OBSERVED_TYPE_COUNT == 0 /* all must be empty */ ) { assert( c->type_vote[ FTY_EMPTY ] > 0 ); } else if( OBSERVED_TYPE_COUNT == 1 /* Unanimity... */ ) { // ...doesn't necessarily determine the stat class because... switch( find_nonzero( c->type_vote, FTY_COUNT ) ) { case FTY_INTEGER: // ...integers can be used in many ways! c->stat_class = _integer_inference( c ); break; case FTY_STRING: if( ( ! c->excess_values ) && set_count( & c->value_set ) < c->type_vote[ FTY_STRING ] && c->long_field_count == 0 ) c->stat_class = STC_CAT; break; case FTY_FLOAT: c->stat_class = STC_QUA; } } else { /** * If more than two types are observed and STRING is one of them, * then everything hinges on the cardinality of observed strings * (the contents of value_set EXCLUDING any integers it contains). * 1. If exactly one string value is observed AND it's a potential * missing data indicator, then inference devolves to that for * numeric types. * 2. If more than one string value is observed all bets are off; * the column remains STC_UNKnown. 
*/ const char *sval[2]; const bool UNIQUE_STRING = c->type_vote[ FTY_STRING ] > 0 && _fetch_string_values( & c->value_set, sval, 2 ) == 1; const bool HAS_CANDIDATE_MISSING_DATA_PLACEHOLDER = UNIQUE_STRING && regexec( &_compiled_re_NA, sval[0], 0, NULL, 0 ) == 0; if( OBSERVED_TYPE_COUNT == 2 ) { if( c->type_vote[ FTY_STRING ] > 0 ) { if( HAS_CANDIDATE_MISSING_DATA_PLACEHOLDER ) { if( c->type_vote[ FTY_INTEGER ] > 0 ) { c->stat_class = _integer_inference( c ); } else { assert( c->type_vote[ FTY_FLOAT ] > 0 ); c->stat_class = STC_QUA; } } } else { // no string, just ints and floats assert( c->type_vote[ FTY_INTEGER ] > 0 && c->type_vote[ FTY_FLOAT ] > 0 ); c->stat_class = STC_QUA; } } else { // Column contains int(s), float(s) AND string(s). if( HAS_CANDIDATE_MISSING_DATA_PLACEHOLDER ) c->stat_class = STC_QUA; } // 3 types observed } // > 1 type observed. }