/* * Estimate selectivity of "column <@ const" based on most common element * statistics. * * mcelem (of length nmcelem) and numbers (of length nnumbers) are from * the array column's MCELEM statistics slot, or are NULL/0 if stats are * not available. array_data (of length nitems) is the constant's elements. * hist (of length nhist) is from the array column's DECHIST statistics slot, * or is NULL/0 if those stats are not available. * * Both the mcelem and array_data arrays are assumed presorted according * to the element type's cmpfunc. Null elements are not present. * * Independent element occurrence would imply a particular distribution of * distinct element counts among matching rows. Real data usually falsifies * that assumption. For example, in a set of 11-element integer arrays having * elements in the range [0..10], element occurrences are typically not * independent. If they were, a sufficiently-large set would include all * distinct element counts 0 through 11. We correct for this using the * histogram of distinct element counts. * * In the "column @> const" and "column && const" cases, we usually have a * "const" with low number of elements (otherwise we have selectivity close * to 0 or 1 respectively). That's why the effect of dependence related * to distinct element count distribution is negligible there. In the * "column <@ const" case, number of elements is usually high (otherwise we * have selectivity close to 0). That's why we should do a correction with * the array distinct element count distribution here. * * Using the histogram of distinct element counts produces a different * distribution law than independent occurrences of elements. This * distribution law can be described as follows: * * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * * (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m] * * where: * o1, o2, ..., on - occurrences of elements 1, 2, ..., n * (1 - occurrence, 0 - no occurrence) in row * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n * (scalar values in [0..1]) according to collected statistics * m = o1 + o2 + ... + on = total number of distinct elements in row * hist[m] - histogram data for occurrence of m elements. * ind[m] - probability of m occurrences from n events assuming their * probabilities to be equal to frequencies of array elements. * * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) * * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m */ static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, float4 *hist, int nhist, Oid operator, FmgrInfo *cmpfunc) { int mcelem_index, i, unique_nitems = 0; float selec, minfreq, nullelem_freq; float *dist, *mcelem_dist, *hist_part; float avg_count, mult, rest; float *elem_selec; /* * There should be three more Numbers than Values in the MCELEM slot, * because the last three cells should hold minimal and maximal frequency * among the non-null elements, and then the frequency of null elements. * Punt if not right, because we can't do much without the element freqs. */ if (numbers == NULL || nnumbers != nmcelem + 3) return DEFAULT_CONTAIN_SEL; /* Can't do much without a count histogram, either */ if (hist == NULL || nhist < 3) return DEFAULT_CONTAIN_SEL; /* * Grab some of the summary statistics that compute_array_stats() stores: * lowest frequency, frequency of null elements, and average distinct * element count. */ minfreq = numbers[nmcelem]; nullelem_freq = numbers[nmcelem + 2]; avg_count = hist[nhist - 1]; /* * "rest" will be the sum of the frequencies of all elements not * represented in MCELEM. The average distinct element count is the sum * of the frequencies of *all* elements. Begin with that; we will proceed * to subtract the MCELEM frequencies. */ rest = avg_count; /* * mult is a multiplier representing estimate of probability that each * mcelem that is not present in constant doesn't occur. */ mult = 1.0f; /* * elem_selec is array of estimated frequencies for elements in the * constant. */ elem_selec = (float *) palloc(sizeof(float) * nitems); /* Scan mcelem and array in parallel. */ mcelem_index = 0; for (i = 0; i < nitems; i++) { bool match = false; /* Ignore any duplicates in the array data. */ if (i > 0 && element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0) continue; /* * Iterate over MCELEM until we find an entry greater than or equal to * this element of the constant. Update "rest" and "mult" for mcelem * entries skipped over. */ while (mcelem_index < nmcelem) { int cmp = element_compare(&mcelem[mcelem_index], &array_data[i], cmpfunc); if (cmp < 0) { mult *= (1.0f - numbers[mcelem_index]); rest -= numbers[mcelem_index]; mcelem_index++; } else { if (cmp == 0) match = true; /* mcelem is found */ break; } } if (match) { /* MCELEM matches the array item. */ elem_selec[unique_nitems] = numbers[mcelem_index]; /* "rest" is decremented for all mcelems, matched or not */ rest -= numbers[mcelem_index]; mcelem_index++; } else { /* * The element is not in MCELEM. Punt, but assume that the * selectivity cannot be more than minfreq / 2. */ elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); } unique_nitems++; } /* * If we handled all constant elements without exhausting the MCELEM * array, finish walking it to complete calculation of "rest" and "mult". */ while (mcelem_index < nmcelem) { mult *= (1.0f - numbers[mcelem_index]); rest -= numbers[mcelem_index]; mcelem_index++; } /* * The presence of many distinct rare elements materially decreases * selectivity. Use the Poisson distribution to estimate the probability * of a column value having zero occurrences of such elements. See above * for the definition of "rest". */ mult *= exp(-rest); /*---------- * Using the distinct element count histogram requires * O(unique_nitems * (nmcelem + unique_nitems)) * operations. Beyond a certain computational cost threshold, it's * reasonable to sacrifice accuracy for decreased planning time. We limit * the number of operations to EFFORT * nmcelem; since nmcelem is limited * by the column's statistics target, the work done is user-controllable. * * If the number of operations would be too large, we can reduce it * without losing all accuracy by reducing unique_nitems and considering * only the most-common elements of the constant array. To make the * results exactly match what we would have gotten with only those * elements to start with, we'd have to remove any discarded elements' * frequencies from "mult", but since this is only an approximation * anyway, we don't bother with that. Therefore it's sufficient to qsort * elem_selec[] and take the largest elements. (They will no longer match * up with the elements of array_data[], but we don't care.) *---------- */ #define EFFORT 100 if ((nmcelem + unique_nitems) > 0 && unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems)) { /* * Use the quadratic formula to solve for largest allowable N. We * have A = 1, B = nmcelem, C = - EFFORT * nmcelem. */ double b = (double) nmcelem; int n; n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2); /* Sort, then take just the first n elements */ qsort(elem_selec, unique_nitems, sizeof(float), float_compare_desc); unique_nitems = n; } /* * Calculate probabilities of each distinct element count for both mcelems * and constant elements. At this point, assume independent element * occurrence. */ dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f); mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest); /* ignore hist[nhist-1], which is the average not a histogram member */ hist_part = calc_hist(hist, nhist - 1, unique_nitems); selec = 0.0f; for (i = 0; i <= unique_nitems; i++) { /* * mult * dist[i] / mcelem_dist[i] gives us probability of qual * matching from assumption of independent element occurrence with the * condition that distinct element count = i. */ if (mcelem_dist[i] > 0) selec += hist_part[i] * mult * dist[i] / mcelem_dist[i]; } pfree(dist); pfree(mcelem_dist); pfree(hist_part); pfree(elem_selec); /* Take into account occurrence of NULL element. */ selec *= (1.0f - nullelem_freq); CLAMP_PROBABILITY(selec); return selec; }
int main(int argc, char **argv) { double min, max; /* Minimum & maximum sample values */ double sum_of_samples=0.0; /* Sum of all samples accounted for */ double sum_of_squared_samples=0.0; /* Sum of all squared samples accounted for*/ double trim_fraction; /* Fraction used to trim the histogram */ int ii; /* Loop index */ long samples_counted=0; /* Number of all samples accounted for */ float *data_line; /* Buffer for a line of samples */ long line, sample; /* Line and sample indices */ long num_lines, num_samples; /* Number of lines and samples */ int percent_complete=0; /* Percent of data sweep completed */ int overmeta_flag=FALSE; /* If TRUE write over current .meta file */ int overstat_flag=FALSE; /* If TRUE write over current .stat file */ int nometa_flag=FALSE; /* If TRUE do not write .meta file */ int nostat_flag=FALSE; /* If TRUE do not write .stat file */ int mask_flag=FALSE; /* TRUE if user specifies a mask value */ int trim_flag=FALSE; /* If TRUE trim histogram */ double mask=NAN; /* Value to ignore while caculating stats*/ char meta_name[261]; /* Meta file name */ meta_parameters *meta; /* SAR meta data structure */ char sar_name[256]; /* SAR file name WITH extention */ FILE *sar_file; /* SAR data file pointer to take stats on*/ stat_parameters *stats; /* Statistics structure */ char stat_name[261]; /* Stats file name */ extern int currArg; /* Pre-initialized to 1 */ /* We initialize these to a magic number for checking. */ long start_line = -1; /* Window starting line. */ long start_sample = -1; /* Window starting sample. */ long window_height = -1; /* Window height in lines. */ long window_width = -1; /* Window width in samples. */ /* parse command line */ handle_license_and_version_args(argc, argv, "stats"); logflag=quietflag=FALSE; while (currArg < (argc-1)) { char *key = argv[currArg++]; if (strmatch(key,"-quiet")) { quietflag=TRUE; } else if (strmatch(key,"-log")) { CHECK_ARG(1); strcpy(logFile,GET_ARG(1)); fLog = FOPEN(logFile, "a"); logflag=TRUE; } else if (strmatch(key,"-mask")) { CHECK_ARG(1); mask = atof(GET_ARG(1)); mask_flag=TRUE; } else if (strmatch(key,"-overmeta")) { overmeta_flag=TRUE; } else if (strmatch(key,"-overstat")) { overstat_flag=TRUE; } else if (strmatch(key,"-nometa")) { nometa_flag=TRUE; } else if (strmatch(key,"-nostat")) { nostat_flag=TRUE; } else if (strmatch(key,"-startline")) { CHECK_ARG(1); nometa_flag=TRUE; /* Implied. */ start_line = atol(GET_ARG(1)); if ( start_line < 0 ) { printf("error: -startline argument must be greater than or equal to zero\n"); usage(argv[0]); } } else if (strmatch(key,"-startsample")) { CHECK_ARG(1); nometa_flag=TRUE; /* Implied. */ start_sample = atol(GET_ARG(1)); if ( start_sample < 0 ) { printf("error: -startsample argument must be greater than or equal to zero\n"); usage(argv[0]); } } else if (strmatch(key,"-width")) { CHECK_ARG(1); nometa_flag=TRUE; /* Implied. */ window_width = atol(GET_ARG(1)); if ( window_width < 0 ) { printf("error: -width argument must be greater than or equal to zero\n"); usage(argv[0]); } } else if (strmatch(key,"-height")) { CHECK_ARG(1); nometa_flag=TRUE; /* Implied. */ window_height = atol(GET_ARG(1)); if ( window_height < 0 ) { printf("error: -height argument must be greater than or equal to zero\n"); usage(argv[0]); } } else if (strmatch(key,"-trim")) { CHECK_ARG(1); trim_flag=TRUE; /* Implied. */ trim_fraction = atof(GET_ARG(1)); } else {printf( "\n**Invalid option: %s\n",argv[currArg-1]); usage(argv[0]);} } if ((argc-currArg)<1) {printf("Insufficient arguments.\n"); usage(argv[0]);} strcpy (sar_name, argv[currArg]); char *ext = findExt(sar_name); if (ext == NULL || strcmp("IMG", uc(ext)) != 0) { strcpy(sar_name, appendExt(sar_name, ".img")); } create_name(meta_name, sar_name, ".meta"); create_name(stat_name, sar_name, ".stat"); printf("\nProgram: stats\n\n"); if (logflag) { fprintf(fLog, "\nProgram: stats\n\n"); } printf("\nCalculating statistics for %s\n\n", sar_name); if (logflag) { fprintf(fLog,"\nCalculating statistics for %s\n\n", sar_name); } meta = meta_read(meta_name); num_lines = meta->general->line_count; num_samples = meta->general->sample_count; if ( start_line == -1 ) start_line = 0; if ( start_line > num_lines ) { printf("error: -startline argument is larger than index of last line in image\n"); exit(EXIT_FAILURE); } if ( start_sample == -1 ) start_sample = 0; if ( start_sample > num_samples ) { printf("error: -startsample argument is larger than index of last sample in image\n"); exit(EXIT_FAILURE); } if ( window_height == -1 ) window_height = num_lines; if ( start_line + window_height > num_lines ) { printf("warning: window specified with -startline, -height options doesn't fit in image\n"); } if ( window_width == -1 ) window_width = num_samples; if ( start_sample + window_width > num_samples ) { printf("warning: window specified with -startsample, -width options doesn't fit in image\n"); } /* Make sure we don't over write any files that we don't want to */ if (meta->stats && !overmeta_flag && !nometa_flag) { printf(" ** The meta file already has a populated statistics structure.\n" " ** If you want to run this program and replace that structure,\n" " ** then use the -overmeta option to do so. If you want to run\n" " ** this program, but don't want to replace the structure, use\n" " ** the -nometa option.\n"); if (logflag) { fprintf(fLog, " ** The meta file already has a populated statistics structure.\n" " ** If you want to run this program and replace that structure,\n" " ** then use the -overmeta option to do so. If you want to run\n" " ** this program, but don't want to replace the structure, use\n" " ** the -nometa option.\n"); } exit(EXIT_FAILURE); } if (fileExists(stat_name) && !overstat_flag && !nostat_flag) { printf(" ** The file, %s, already exists. If you want to\n" " ** overwrite it, then use the -overstat option to do so.\n" " ** If you want to run the progam but don't want to write\n" " ** over the current file, then use the -nostat option.\n", stat_name); if (logflag) { fprintf(fLog, " ** The file, %s, already exists. If you want to\n" " ** overwrite it, then use the -overstat option to do so.\n" " ** If you want to run the progam but don't want to write\n" " ** over the current file, then use the -nostat option.\n", stat_name); } exit(EXIT_FAILURE); } /* Let user know the window in which the stats will be taken */ if ((start_line!=0) || (start_sample!=0) || (window_height!=num_lines) || (window_width!=num_samples)) { if (!quietflag) { printf("Taking statistics on a window with upper left corner (%ld,%ld)\n" " and lower right corner (%ld,%ld)\n", start_sample, start_line, window_width+start_sample, window_height+start_line); } if (logflag && !quietflag) { fprintf(fLog, "Taking statistics on a window with upper left corner (%ld,%ld)\n" " and lower right corner (%ld,%ld)\n", start_sample, start_line, window_width+start_sample, window_height+start_line); } } /* Allocate line buffer */ data_line = (float *)MALLOC(sizeof(float)*num_samples); if (meta->stats) FREE(meta->stats); if (meta->general->band_count <= 0) { printf(" ** Band count in the existing data is missing or less than zero.\nDefaulting to one band.\n"); if (logflag) { fprintf(fLog, " ** Band count in the existing data is missing or less than zero.\nDefaulting to one band.\n"); } meta->general->band_count = 1; } meta->stats = meta_statistics_init(meta->general->band_count); if (!meta->stats) { printf(" ** Cannot allocate memory for statistics data structures.\n"); if (logflag) { fprintf(fLog, " ** Cannot allocate memory for statistics data structures.\n"); } exit(EXIT_FAILURE); } stats = (stat_parameters *)MALLOC(sizeof(stat_parameters) * meta->stats->band_count); if (!stats) { printf(" ** Cannot allocate memory for statistics data structures.\n"); if (logflag) { fprintf(fLog, " ** Cannot allocate memory for statistics data structures.\n"); } exit(EXIT_FAILURE); } int band; long band_offset; for (band = 0; band < meta->stats->band_count; band++) { /* Find min, max, and mean values */ if (!quietflag) printf("\n"); if (logflag && !quietflag) fprintf(fLog,"\n"); min = 100000000; max = -100000000; sum_of_samples=0.0; sum_of_squared_samples=0.0; percent_complete=0; band_offset = band * meta->general->line_count; sar_file = FOPEN(sar_name, "r"); for (line=start_line+band_offset; line<start_line+window_height+band_offset; line++) { if (!quietflag) asfPercentMeter((float)(line-start_line-band_offset)/(float)(window_height-start_line)); get_float_line(sar_file, meta, line, data_line); for (sample=start_sample; sample<start_sample+window_width; sample++) { if ( mask_flag && FLOAT_EQUIVALENT(data_line[sample],mask) ) continue; if (data_line[sample] < min) min=data_line[sample]; if (data_line[sample] > max) max=data_line[sample]; sum_of_samples += data_line[sample]; sum_of_squared_samples += SQR(data_line[sample]); samples_counted++; } } if (!quietflag) asfPercentMeter(1.0); // if (!quietflag) printf("\rFirst data sweep: 100%% complete.\n"); FCLOSE(sar_file); stats[band].min = min; stats[band].max = max; stats[band].upper_left_line = start_line; stats[band].upper_left_samp = start_sample; stats[band].lower_right_line = start_line + window_height; stats[band].lower_right_samp = start_sample + window_width; stats[band].mask = mask; stats[band] = calc_hist(stats[band], sar_name, band, meta, sum_of_samples, samples_counted, mask_flag); /* Remove outliers and trim the histogram by resetting the minimum and and maximum */ if (trim_flag) { register int sum=0, num_pixels, minDex=0, maxDex=255; double overshoot, width; num_pixels = (int)(samples_counted*trim_fraction); minDex = 0; while (sum < num_pixels) sum += stats[band].histogram[minDex++]; if (minDex-1>=0) overshoot = (double)(num_pixels-sum)/stats[band].histogram[minDex-1]; else overshoot = 0; stats[band].min = (minDex-overshoot-stats[band].offset)/stats[band].slope; sum=0; while (sum < num_pixels) sum += stats[band].histogram[maxDex--]; if (maxDex+1<256) overshoot = (double)(num_pixels-sum)/stats[band].histogram[maxDex+1]; else overshoot = 0; stats[band].max = (maxDex+1+overshoot-stats[band].offset)/stats[band].slope; /* Widening the range for better visual effect */ width = (stats[band].max-stats[band].min)*(1/(1.0-2*trim_fraction)-1); stats[band].min -= width/2; stats[band].max += width/2; /* Couple useful corrections borrowed from SARview */ if ((stats[band].max-stats[band].min) < 0.01*(max-min)) { stats[band].max = max; stats[band].min = min; } if (min == 0.0) stats[band].min=0.0; if (stats[band].min == stats[band].max) stats[band].max = stats[band].min + MICRON; stats[band].slope = 255.0/(stats[band].max-stats[band].min); stats[band].offset = -stats[band].slope*stats[band].min; stats[band] = calc_hist(stats[band], sar_name, band, meta, sum_of_samples, samples_counted, mask_flag); } } if(data_line)FREE(data_line); /* Populate meta->stats structure */ char **band_names = NULL; if (meta_is_valid_string(meta->general->bands) && strlen(meta->general->bands) && meta->general->band_count > 0) { band_names = extract_band_names(meta->general->bands, meta->general->band_count); } else { if (meta->general->band_count <= 0) meta->general->band_count = 1; band_names = (char **) MALLOC (meta->general->band_count * sizeof(char *)); int i; for (i=0; i<meta->general->band_count; i++) { band_names[i] = (char *) MALLOC (64 * sizeof(char)); sprintf(band_names[i], "%02d", i); } } int band_no; for (band_no = 0; band_no < meta->stats->band_count; band_no++) { strcpy(meta->stats->band_stats[band_no].band_id, band_names[band_no]); meta->stats->band_stats[band_no].min = stats[band_no].min; meta->stats->band_stats[band_no].max = stats[band_no].max; meta->stats->band_stats[band_no].mean = stats[band_no].mean; meta->stats->band_stats[band_no].rmse = stats[band_no].rmse; meta->stats->band_stats[band_no].std_deviation = stats[band_no].std_deviation; meta->stats->band_stats[band_no].mask = stats[band_no].mask; } if (band_names) { int i; for (i=0; i<meta->general->band_count; i++) { if (band_names[i]) FREE (band_names[i]); } FREE(band_names); } /* Print findings to the screen (and log file if applicable)*/ if (!quietflag) { printf("\n"); printf("Statistics found:\n"); if (mask_flag) { printf("Used mask %-16.11g\n",mask); } printf("Number of bands: %d\n", meta->stats->band_count); for (band=0; band<meta->stats->band_count; band++) { printf("\n\nBand name = \"%s\"\n", meta->stats->band_stats[band].band_id); printf("Minimum = %-16.11g\n",stats[band].min); printf("Maximum = %-16.11g\n",stats[band].max); printf("Mean = %-16.11g\n",stats[band].mean); printf("Root mean squared error = %-16.11g\n", stats[band].rmse); printf("Standard deviation = %-16.11g\n", stats[band].std_deviation); printf("\n"); printf("Data fit to [0..255] using equation: byte = %g * sample + %g\n", stats[band].slope, stats[band].offset); if (trim_flag) printf("Trimming fraction = %.3g\n", trim_fraction); printf("\n"); printf("Histogram:\n"); for (ii=0; ii<256; ii++) { if (ii%8 == 0) { printf("%s%3i-%3i:", (ii==0) ? "" : "\n", ii, ii+7); } printf(" %8i", stats[band].histogram[ii]); } printf("\n"); } } if (logflag && !quietflag) { fprintf(fLog,"Statistics found:\n"); if (mask_flag) { fprintf(fLog,"Used mask %-16.11g\n",mask); } fprintf(fLog,"Number of bands: %d\n", meta->stats->band_count); for (band=0; band<meta->stats->band_count; band++) { fprintf(fLog,"\n\nBand name = \"%s\"\n", meta->stats->band_stats[band].band_id); fprintf(fLog,"Minimum = %-16.11g\n",stats[band].min); fprintf(fLog,"Maximum = %-16.11g\n",stats[band].max); fprintf(fLog,"Mean = %-16.11g\n",stats[band].mean); fprintf(fLog,"Root mean squared error = %-16.11g\n", stats[band].rmse); fprintf(fLog,"Standard deviation = %-16.11g\n", stats[band].std_deviation); fprintf(fLog,"\n"); fprintf(fLog,"Data fit to [0..255] using equation: byte = %g * sample + %g\n", stats[band].slope, stats[band].offset); if (trim_flag) fprintf(fLog,"Trimming fraction = %.3g\n", trim_fraction); fprintf(fLog,"\n"); fprintf(fLog,"Histogram:\n"); for (ii=0; ii<256; ii++) { if (ii%8 == 0) { fprintf(fLog,"%s%3i-%3i:", (ii==0) ? "" : "\n", ii, ii+7); } fprintf(fLog," %8i", stats[band].histogram[ii]); } fprintf(fLog,"\n"); } } /* Write out .meta and .stat files */ if (!nometa_flag) meta_write(meta, meta_name); if (!nostat_flag) stat_write(stats, stat_name, meta->stats->band_count); /* Free the metadata structure */ meta_free(meta); /* Report */ if (!quietflag) { printf("\n"); printf("Statistics taken on image file %s.\n",sar_name); if (!nometa_flag) printf("Statistics written to the stats block in %s.\n", meta_name); if (!nostat_flag) printf("Statistics plus histogram written to %s.\n", stat_name); printf("\n"); } if (logflag && !quietflag) { fprintf(fLog,"\n"); fprintf(fLog,"Statistics taken on image file '%s'\n",sar_name); if (!nometa_flag) fprintf(fLog,"Statistics written to the stats block in %s\n", meta_name); if (!nostat_flag) fprintf(fLog,"Statistics plus histogram written to %s\n", stat_name); fprintf(fLog,"\n"); } if (fLog) FCLOSE(fLog); return 0; }