/** * \ingroup nongraph * \function igraph_power_law_fit * \brief Fits a power-law distribution to a vector of numbers * * This function fits a power-law distribution to a vector containing samples * from a distribution (that is assumed to follow a power-law of course). In * a power-law distribution, it is generally assumed that P(X=x) is * proportional to x<superscript>-alpha</superscript>, where x is a positive number and alpha * is greater than 1. In many real-world cases, the power-law behaviour kicks * in only above a threshold value \em xmin. The goal of this functions is to * determine \em alpha if \em xmin is given, or to determine \em xmin and the * corresponding value of \em alpha. * * </para><para> * The function uses the maximum likelihood principle to determine \em alpha * for a given \em xmin; in other words, the function will return the \em alpha * value for which the probability of drawing the given sample is the highest. * When \em xmin is not given in advance, the algorithm will attempt to find * the optimal \em xmin value for which the p-value of a Kolmogorov-Smirnov * test between the fitted distribution and the original sample is the largest. * The function uses the method of Clauset, Shalizi and Newman to calculate the * parameters of the fitted distribution. See the following reference for * details: * * </para><para> * Aaron Clauset, Cosma R .Shalizi and Mark E.J. Newman: Power-law * distributions in empirical data. SIAM Review 51(4):661-703, 2009. * * \param data vector containing the samples for which a power-law distribution * is to be fitted. Note that you have to provide the \em samples, * not the probability density function or the cumulative * distribution function. For example, if you wish to fit * a power-law to the degrees of a graph, you can use the output of * \ref igraph_degree directly as an input argument to * \ref igraph_power_law_fit * \param result the result of the fitting algorithm. See \ref igraph_plfit_result_t * for more details. * \param xmin the minimum value in the sample vector where the power-law * behaviour is expected to kick in. Samples smaller than \c xmin * will be ignored by the algoritm. Pass zero here if you want to * include all the samples. If \c xmin is negative, the algorithm * will attempt to determine its best value automatically. * \param force_continuous assume that the samples in the \c data argument come * from a continuous distribution even if the sample vector * contains integer values only (by chance). If this argument is * false, igraph will assume a continuous distribution if at least * one sample is non-integer and assume a discrete distribution * otherwise. * \return Error code: * \c IGRAPH_ENOMEM: not enough memory * \c IGRAPH_EINVAL: one of the arguments is invalid * \c IGRAPH_EOVERFLOW: overflow during the fitting process * \c IGRAPH_EUNDERFLOW: underflow during the fitting process * \c IGRAPH_FAILURE: the underlying algorithm signaled a failure * without returning a more specific error code * * Time complexity: in the continuous case, O(n log(n)) if \c xmin is given. * In the discrete case, the time complexity is dominated by the complexity of * the underlying L-BFGS algorithm that is used to optimize alpha. If \c xmin * is not given, the time complexity is multiplied by the number of unique * samples in the input vector (although it should be faster in practice). * * \example examples/simple/igraph_power_law_fit.c */ int igraph_power_law_fit(const igraph_vector_t* data, igraph_plfit_result_t* result, igraph_real_t xmin, igraph_bool_t force_continuous) { plfit_error_handler_t* plfit_stored_error_handler; plfit_result_t plfit_result; plfit_continuous_options_t cont_options; plfit_discrete_options_t disc_options; igraph_bool_t discrete = force_continuous ? 0 : 1; igraph_bool_t finite_size_correction; int retval; size_t i, n; n = igraph_vector_size(data); finite_size_correction = (n < 50); if (discrete) { /* Does the vector contain discrete values only? */ for (i = 0; i < n; i++) { if ((long int)(VECTOR(*data)[i]) != VECTOR(*data)[i]) { discrete = 0; break; } } } plfit_stored_error_handler = plfit_set_error_handler(igraph_i_plfit_error_handler_store); if (discrete) { plfit_discrete_options_init(&disc_options); disc_options.finite_size_correction = finite_size_correction; if (xmin >= 0) { retval = plfit_estimate_alpha_discrete(VECTOR(*data), n, xmin, &disc_options, &plfit_result); } else { retval = plfit_discrete(VECTOR(*data), n, &disc_options, &plfit_result); } } else { plfit_continuous_options_init(&cont_options); cont_options.finite_size_correction = finite_size_correction; if (xmin >= 0) { retval = plfit_estimate_alpha_continuous(VECTOR(*data), n, xmin, &cont_options, &plfit_result); } else { retval = plfit_continuous(VECTOR(*data), n, &cont_options, &plfit_result); } } plfit_set_error_handler(plfit_stored_error_handler); switch (retval) { case PLFIT_FAILURE: IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_FAILURE); break; case PLFIT_EINVAL: IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_EINVAL); break; case PLFIT_UNDRFLOW: IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_EUNDERFLOW); break; case PLFIT_OVERFLOW: IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_EOVERFLOW); break; case PLFIT_ENOMEM: IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_ENOMEM); break; default: break; } if (result) { result->continuous = !discrete; result->alpha = plfit_result.alpha; result->xmin = plfit_result.xmin; result->L = plfit_result.L; result->D = plfit_result.D; result->p = plfit_result.p; } return 0; }
void process_file(FILE* f, const char* fname) { double* data; size_t i, n = 0, nalloc = 100; unsigned short int warned = 0, discrete = opts.force_continuous ? 0 : 1; plfit_continuous_options_t plfit_continuous_options; plfit_discrete_options_t plfit_discrete_options; plfit_result_t result; struct { double mean; double variance; double skewness; double kurtosis; } moments = { 0, 0, 0, 0 }; /* allocate memory for 100 samples */ data = (double*)calloc(nalloc, sizeof(double)); if (data == 0) { perror(fname); return; } /* read the input file */ for (;;) { int nparsed = fscanf(f, "%lf", data+n); if (nparsed == EOF) /* reached the end of file */ break; if (nparsed < 1) { /* parse error */ int c = '\0'; if ((c = fgetc(f)) == '#') { do { c = fgetc(f); } while (c != '\n'); } else { if (warned++ < 16) { fprintf(stderr, "%s: parse error at byte %ld\n", fname, (long)ftell(f)); } } continue; } if (discrete && (floor(data[n]) != data[n])) discrete = 0; n++; if (n == nalloc) { /* allocate twice as much memory */ nalloc *= 2; data = (double*)realloc(data, sizeof(double) * nalloc); if (data == 0) { perror(fname); return; } } } if (warned) { fprintf(stderr, "%s: corrupted data points in file\n", fname); exit(EX_DATAERR); return; } if (n == 0) { fprintf(stderr, "%s: no data points in file\n", fname); exit(EX_DATAERR); return; } /* apply the divisor if needed */ if (opts.divisor != 1) { #ifdef _OPENMP #pragma omp parallel for private(i) #endif for (i = 0; i < n; i++) { data[i] /= opts.divisor; } if (discrete) { #ifdef _OPENMP #pragma omp parallel for private(i) #endif for (i = 0; i < n; i++) { data[i] = round(data[i]); } } } /* construct the plfit options */ plfit_continuous_options_init(&plfit_continuous_options); plfit_discrete_options_init(&plfit_discrete_options); plfit_continuous_options.finite_size_correction = opts.finite_size_correction; plfit_discrete_options.finite_size_correction = opts.finite_size_correction; plfit_continuous_options.p_value_method = opts.p_value_method; plfit_discrete_options.p_value_method = opts.p_value_method; plfit_continuous_options.p_value_precision = opts.p_value_precision; plfit_discrete_options.p_value_precision = opts.p_value_precision; plfit_continuous_options.rng = &rng; plfit_discrete_options.rng = &rng; /* fit the power-law distribution */ if (discrete) { if (opts.alpha_step > 0) { /* Old estimation based on brute-force search */ plfit_discrete_options.alpha_method = PLFIT_LINEAR_SCAN; plfit_discrete_options.alpha.min = opts.alpha_min; plfit_discrete_options.alpha.max = opts.alpha_max; plfit_discrete_options.alpha.step = opts.alpha_step; } else { plfit_discrete_options.alpha_method = PLFIT_LBFGS; } if (opts.xmin < 0) { /* Estimate xmin and alpha */ plfit_discrete(data, n, &plfit_discrete_options, &result); } else { /* Estimate alpha only */ plfit_estimate_alpha_discrete(data, n, opts.xmin, &plfit_discrete_options, &result); } } else { if (opts.xmin < 0) { /* Estimate xmin and alpha */ plfit_continuous(data, n, &plfit_continuous_options, &result); } else { /* Estimate alpha only */ plfit_estimate_alpha_continuous(data, n, opts.xmin, &plfit_continuous_options, &result); } } /* calculate the moments if needed */ if (opts.print_moments) { plfit_moments(data, n, &moments.mean, &moments.variance, &moments.skewness, &moments.kurtosis); } /* print the results */ if (opts.brief_mode) { if (opts.print_moments) { printf("%s: S %lg %lg %lg %lg\n", fname, moments.mean, moments.variance, moments.skewness, moments.kurtosis); } printf("%s: %c %lg %lg %lg %lg %lg\n", fname, discrete ? 'D' : 'C', result.alpha, result.xmin, result.L, result.D, result.p); } else { printf("%s:\n", fname); if (!opts.finite_size_correction && n < 50) printf("\tWARNING: finite size bias may be present\n\n"); if (opts.print_moments) { printf("\tCentral moments\n"); printf("\tmean = %12.5lf\n", moments.mean); printf("\tvariance = %12.5lf\n", moments.variance); printf("\tstd.dev. = %12.5lf\n", sqrt(moments.variance)); printf("\tskewness = %12.5lf\n", moments.skewness); printf("\tkurtosis = %12.5lf\n", moments.kurtosis); printf("\tex.kurt. = %12.5lf\n", moments.kurtosis-3); printf("\n"); } printf("\t%s MLE", discrete ? "Discrete" : "Continuous"); if (opts.finite_size_correction) printf(" with finite size correction"); printf("\n"); printf("\talpha = %12.5lf\n", result.alpha); printf("\txmin = %12.5lf\n", result.xmin ); printf("\tL = %12.5lf\n", result.L ); printf("\tD = %12.5lf\n", result.D ); if (!isnan(result.p)) { printf("\tp = %12.5lf%s\n", result.p, opts.p_value_method == PLFIT_P_VALUE_APPROXIMATE ? " (approximation)" : ""); } printf("\n"); } /* free the stored data */ free(data); }