Example #1
0
int test_celegans() {
	size_t n;
	plfit_discrete_options_t options;
	plfit_result_t result;

	/*
	 * Discrete fits of the three C. elegans degree sequences. The p-value
	 * method is set to PLFIT_P_VALUE_SKIP, so only alpha, xmin, L and D are
	 * checked against the reference values.
	 */
	plfit_discrete_options_init(&options);
	options.p_value_method = PLFIT_P_VALUE_SKIP;

	/* ---- in-degree sequence ---- */
	n = test_read_file("celegans-indegree.dat", data, 41000);
	ASSERT_NONZERO(n);

	/* clear the checked fields so stale values cannot satisfy the asserts */
	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, &options, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 2.9967, 1e-1);
	ASSERT_EQUAL(result.xmin, 10);
	ASSERT_ALMOST_EQUAL(result.L, -245.14869, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.04448, 1e-3);

	/* ---- out-degree sequence ---- */
	n = test_read_file("celegans-outdegree.dat", data, 41000);
	ASSERT_NONZERO(n);

	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, &options, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 3.3778, 1e-1);
	ASSERT_EQUAL(result.xmin, 11);
	ASSERT_ALMOST_EQUAL(result.L, -232.80207, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.08615, 1e-3);

	/* ---- total degree sequence ---- */
	n = test_read_file("celegans-totaldegree.dat", data, 41000);
	ASSERT_NONZERO(n);

	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, &options, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 3.29264, 1e-1);
	ASSERT_EQUAL(result.xmin, 18);
	ASSERT_ALMOST_EQUAL(result.L, -315.78214, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.04760, 1e-3);

	return 0;
}
Example #2
0
int test_celegans() {
	size_t n;
	plfit_result_t result;

	/*
	 * Discrete fits of the three C. elegans degree sequences, this time
	 * including the p-value. A null options pointer is passed to
	 * plfit_discrete(); presumably the library then uses its defaults
	 * (TODO confirm against the plfit sources).
	 */

	/* ---- in-degree sequence ---- */
	n = test_read_file("celegans-indegree.dat", data, 41000);
	ASSERT_NONZERO(n);

	/* clear the fields so stale values cannot satisfy the asserts */
	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, 0, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 2.9967, 1e-1);
	ASSERT_EQUAL(result.xmin, 10);
	ASSERT_ALMOST_EQUAL(result.L, -245.14869, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.04448, 1e-3);
	ASSERT_ALMOST_EQUAL(result.p, 0.9974, 1e-3);

	/* ---- out-degree sequence ---- */
	n = test_read_file("celegans-outdegree.dat", data, 41000);
	ASSERT_NONZERO(n);

	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, 0, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 3.3778, 1e-1);
	ASSERT_EQUAL(result.xmin, 11);
	ASSERT_ALMOST_EQUAL(result.L, -232.80207, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.08615, 1e-3);
	ASSERT_ALMOST_EQUAL(result.p, 0.6076, 1e-3);

	/* ---- total degree sequence ---- */
	n = test_read_file("celegans-totaldegree.dat", data, 41000);
	ASSERT_NONZERO(n);

	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, 0, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 3.29264, 1e-1);
	ASSERT_EQUAL(result.xmin, 18);
	ASSERT_ALMOST_EQUAL(result.L, -315.78214, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.04760, 1e-3);
	ASSERT_ALMOST_EQUAL(result.p, 0.98610, 1e-3);

	return 0;
}
Example #3
0
int test_condmat() {
	size_t n;
	plfit_result_t result;

	/*
	 * Discrete fit of the condmat2005 degree sequence, including the
	 * p-value. A null options pointer is passed to plfit_discrete();
	 * presumably the library then uses its defaults (TODO confirm).
	 */
	n = test_read_file("condmat2005-degree.dat", data, 41000);
	ASSERT_NONZERO(n);

	/* clear the fields so stale values cannot satisfy the asserts */
	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, 0, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 3.68612, 1e-2);
	ASSERT_EQUAL(result.xmin, 49);
	ASSERT_ALMOST_EQUAL(result.L, -3152.48302, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.02393, 1e-3);
	ASSERT_ALMOST_EQUAL(result.p, 0.79117, 1e-3);

	return 0;
}
Example #4
0
int test_condmat() {
	size_t n;
	plfit_discrete_options_t options;
	plfit_result_t result;

	/*
	 * Discrete fit of the condmat2005 degree sequence. The p-value method
	 * is set to PLFIT_P_VALUE_SKIP, so only alpha, xmin, L and D are
	 * checked against the reference values.
	 */
	plfit_discrete_options_init(&options);
	options.p_value_method = PLFIT_P_VALUE_SKIP;

	n = test_read_file("condmat2005-degree.dat", data, 41000);
	ASSERT_NONZERO(n);

	/* clear the checked fields so stale values cannot satisfy the asserts */
	result.L = 0;
	result.xmin = 0;
	result.alpha = 0;
	plfit_discrete(data, n, &options, &result);
	ASSERT_ALMOST_EQUAL(result.alpha, 3.68612, 1e-2);
	ASSERT_EQUAL(result.xmin, 49);
	ASSERT_ALMOST_EQUAL(result.L, -3152.48302, 1e-4);
	ASSERT_ALMOST_EQUAL(result.D, 0.02393, 1e-3);

	return 0;
}
Example #5
0
/**
 * \ingroup nongraph
 * \function igraph_power_law_fit
 * \brief Fits a power-law distribution to a vector of numbers
 *
 * This function fits a power-law distribution to a vector containing samples
 * from a distribution (that is assumed to follow a power-law of course). In
 * a power-law distribution, it is generally assumed that P(X=x) is
 * proportional to x<superscript>-alpha</superscript>, where x is a positive number and alpha
 * is greater than 1. In many real-world cases, the power-law behaviour kicks
 * in only above a threshold value \em xmin. The goal of this function is to
 * determine \em alpha if \em xmin is given, or to determine \em xmin and the
 * corresponding value of \em alpha.
 *
 * </para><para>
 * The function uses the maximum likelihood principle to determine \em alpha
 * for a given \em xmin; in other words, the function will return the \em alpha
 * value for which the probability of drawing the given sample is the highest.
 * When \em xmin is not given in advance, the algorithm will attempt to find
 * the optimal \em xmin value for which the p-value of a Kolmogorov-Smirnov
 * test between the fitted distribution and the original sample is the largest.
 * The function uses the method of Clauset, Shalizi and Newman to calculate the
 * parameters of the fitted distribution. See the following reference for
 * details:
 *
 * </para><para>
 * Aaron Clauset, Cosma R. Shalizi and Mark E.J. Newman: Power-law
 * distributions in empirical data. SIAM Review 51(4):661-703, 2009.
 *
 * </para><para>
 * The finite size correction of the underlying fitting routine is switched
 * on automatically when the sample vector has fewer than 50 elements.
 *
 * \param data vector containing the samples for which a power-law distribution
 *             is to be fitted. Note that you have to provide the \em samples,
 *             not the probability density function or the cumulative
 *             distribution function. For example, if you wish to fit
 *             a power-law to the degrees of a graph, you can use the output of
 *             \ref igraph_degree directly as an input argument to
 *             \ref igraph_power_law_fit
 * \param result the result of the fitting algorithm. See \ref igraph_plfit_result_t
 *             for more details. May be a null pointer, in which case the
 *             fit is still performed but its result is discarded.
 * \param xmin the minimum value in the sample vector where the power-law
 *             behaviour is expected to kick in. Samples smaller than \c xmin
 *             will be ignored by the algorithm. Pass zero here if you want to
 *             include all the samples. If \c xmin is negative, the algorithm
 *             will attempt to determine its best value automatically.
 * \param force_continuous assume that the samples in the \c data argument come
 *             from a continuous distribution even if the sample vector
 *             contains integer values only (by chance). If this argument is
 *             false, igraph will assume a continuous distribution if at least
 *             one sample is non-integer and assume a discrete distribution
 *             otherwise.
 * \return Error code:
 *         \c IGRAPH_ENOMEM: not enough memory
 *         \c IGRAPH_EINVAL: one of the arguments is invalid
 *         \c IGRAPH_EOVERFLOW: overflow during the fitting process
 *         \c IGRAPH_EUNDERFLOW: underflow during the fitting process
 *         \c IGRAPH_FAILURE: the underlying algorithm signaled a failure
 *         without returning a more specific error code, or it returned an
 *         error code that this function does not recognize
 * 
 * Time complexity: in the continuous case, O(n log(n)) if \c xmin is given.
 * In the discrete case, the time complexity is dominated by the complexity of
 * the underlying L-BFGS algorithm that is used to optimize alpha. If \c xmin
 * is not given, the time complexity is multiplied by the number of unique
 * samples in the input vector (although it should be faster in practice).
 * 
 * \example examples/simple/igraph_power_law_fit.c
 */
int igraph_power_law_fit(const igraph_vector_t* data, igraph_plfit_result_t* result,
	igraph_real_t xmin, igraph_bool_t force_continuous) {
  plfit_error_handler_t* plfit_stored_error_handler;
  plfit_result_t plfit_result;
  plfit_continuous_options_t cont_options;
  plfit_discrete_options_t disc_options;
  igraph_bool_t discrete = force_continuous ? 0 : 1;
  igraph_bool_t finite_size_correction;
  int retval;
  size_t i, n;

  n = igraph_vector_size(data);
  finite_size_correction = (n < 50);

  if (discrete) {
    /* Does the vector contain discrete values only? */
    /* NOTE(review): the cast to long int is undefined for NaN, infinite or
     * out-of-range samples; an integrality test based on trunc()/floor()
     * would avoid that but needs <math.h> -- confirm it is included. */
    for (i = 0; i < n; i++) {
      if ((long int)(VECTOR(*data)[i]) != VECTOR(*data)[i]) {
        discrete = 0;
        break;
      }
    }
  }

  /* Route plfit errors to igraph's storing handler for the duration of the
   * fit; the previously installed handler is restored right afterwards. */
  plfit_stored_error_handler = plfit_set_error_handler(igraph_i_plfit_error_handler_store);
  if (discrete) {
    plfit_discrete_options_init(&disc_options);
    disc_options.finite_size_correction = finite_size_correction;

    if (xmin >= 0) {
      /* xmin is given: estimate alpha only */
      retval = plfit_estimate_alpha_discrete(VECTOR(*data), n, xmin,
          &disc_options, &plfit_result);
    } else {
      /* negative xmin: estimate both xmin and alpha */
      retval = plfit_discrete(VECTOR(*data), n, &disc_options, &plfit_result);
    }
  } else {
    plfit_continuous_options_init(&cont_options);
    cont_options.finite_size_correction = finite_size_correction;

    if (xmin >= 0) {
      /* xmin is given: estimate alpha only */
      retval = plfit_estimate_alpha_continuous(VECTOR(*data), n, xmin,
          &cont_options, &plfit_result);
    } else {
      /* negative xmin: estimate both xmin and alpha */
      retval = plfit_continuous(VECTOR(*data), n, &cont_options, &plfit_result);
    }
  }
  plfit_set_error_handler(plfit_stored_error_handler);

  /* Translate plfit error codes into igraph error codes */
  switch (retval) {
    case PLFIT_FAILURE:
      IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_FAILURE);
      break;

    case PLFIT_EINVAL:
      IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_EINVAL);
      break;

    case PLFIT_UNDRFLOW:
      IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_EUNDERFLOW);
      break;

    case PLFIT_OVERFLOW:
      IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_EOVERFLOW);
      break;

    case PLFIT_ENOMEM:
      IGRAPH_ERROR(igraph_i_plfit_error_message, IGRAPH_ENOMEM);
      break;

    default:
      /* plfit reports success as zero. Any other code not handled above is
       * an error this wrapper does not know about; plfit_result may not have
       * been filled in, so do not copy it to the caller as if the fit had
       * succeeded. */
      if (retval != 0) {
        IGRAPH_ERROR("unknown error code returned from plfit", IGRAPH_FAILURE);
      }
      break;
  }

  if (result) {
    result->continuous = !discrete;
    result->alpha = plfit_result.alpha;
    result->xmin = plfit_result.xmin;
    result->L = plfit_result.L;
    result->D = plfit_result.D;
    result->p = plfit_result.p;
  }

  return 0;
}
Example #6
0
File: main.c Project: ntamas/plfit
void process_file(FILE* f, const char* fname) {
    double* data;
    size_t i, n = 0, nalloc = 100;
    unsigned short int warned = 0, discrete = opts.force_continuous ? 0 : 1;
	plfit_continuous_options_t plfit_continuous_options;
	plfit_discrete_options_t plfit_discrete_options;
    plfit_result_t result;
    struct {
        double mean;
        double variance;
        double skewness;
        double kurtosis;
    } moments = { 0, 0, 0, 0 };

    /* allocate memory for 100 samples */
    data = (double*)calloc(nalloc, sizeof(double));
    if (data == 0) {
        perror(fname);
        return;
    }

    /* read the input file */
    for (;;) {
        int nparsed = fscanf(f, "%lf", data+n);
        if (nparsed == EOF)  /* reached the end of file */
            break;

        if (nparsed < 1) {   /* parse error */
            int c = '\0';
            if ((c = fgetc(f)) == '#') {
                do { c = fgetc(f); } while (c != '\n');
            } else {
                if (warned++ < 16) {
                    fprintf(stderr, "%s: parse error at byte %ld\n", fname, (long)ftell(f));
                }
            }
            continue;
        }

        if (discrete && (floor(data[n]) != data[n]))
            discrete = 0;

        n++;
        if (n == nalloc) {
            /* allocate twice as much memory */
            nalloc *= 2;
            data = (double*)realloc(data, sizeof(double) * nalloc);
            if (data == 0) {
                perror(fname);
                return;
            }
        }
    }

    if (warned) {
        fprintf(stderr, "%s: corrupted data points in file\n", fname);
        exit(EX_DATAERR);
        return;
    }
    if (n == 0) {
        fprintf(stderr, "%s: no data points in file\n", fname);
        exit(EX_DATAERR);
        return;
    }

    /* apply the divisor if needed */
    if (opts.divisor != 1) {
#ifdef _OPENMP
#pragma omp parallel for private(i)
#endif
        for (i = 0; i < n; i++) {
            data[i] /= opts.divisor;
        }
        if (discrete) {
#ifdef _OPENMP
#pragma omp parallel for private(i)
#endif
            for (i = 0; i < n; i++) {
                data[i] = round(data[i]);
            }
        }
    }

	/* construct the plfit options */
	plfit_continuous_options_init(&plfit_continuous_options);
	plfit_discrete_options_init(&plfit_discrete_options);
	plfit_continuous_options.finite_size_correction = opts.finite_size_correction;
	plfit_discrete_options.finite_size_correction = opts.finite_size_correction;
    plfit_continuous_options.p_value_method = opts.p_value_method;
    plfit_discrete_options.p_value_method = opts.p_value_method;
    plfit_continuous_options.p_value_precision = opts.p_value_precision;
    plfit_discrete_options.p_value_precision = opts.p_value_precision;
    plfit_continuous_options.rng = &rng;
    plfit_discrete_options.rng = &rng;

    /* fit the power-law distribution */
    if (discrete) {
        if (opts.alpha_step > 0) {
            /* Old estimation based on brute-force search */
			plfit_discrete_options.alpha_method = PLFIT_LINEAR_SCAN;
			plfit_discrete_options.alpha.min = opts.alpha_min;
			plfit_discrete_options.alpha.max = opts.alpha_max;
			plfit_discrete_options.alpha.step = opts.alpha_step;
        } else {
			plfit_discrete_options.alpha_method = PLFIT_LBFGS;
		}
		if (opts.xmin < 0) {
			/* Estimate xmin and alpha */
			plfit_discrete(data, n, &plfit_discrete_options, &result);
		} else {
			/* Estimate alpha only */
			plfit_estimate_alpha_discrete(data, n, opts.xmin,
					&plfit_discrete_options, &result);
		}
    } else {
        if (opts.xmin < 0) {
            /* Estimate xmin and alpha */
            plfit_continuous(data, n, &plfit_continuous_options, &result);
        } else {
            /* Estimate alpha only */
            plfit_estimate_alpha_continuous(data, n, opts.xmin,
                    &plfit_continuous_options, &result);
        }
    }

    /* calculate the moments if needed */
    if (opts.print_moments) {
        plfit_moments(data, n, &moments.mean, &moments.variance,
                &moments.skewness, &moments.kurtosis);
    }

    /* print the results */
    if (opts.brief_mode) {
        if (opts.print_moments) {
            printf("%s: S %lg %lg %lg %lg\n", fname, moments.mean, moments.variance,
                    moments.skewness, moments.kurtosis);
        }
        printf("%s: %c %lg %lg %lg %lg %lg\n", fname, discrete ? 'D' : 'C',
                result.alpha, result.xmin, result.L, result.D, result.p);
    } else {
        printf("%s:\n", fname);
        if (!opts.finite_size_correction && n < 50)
            printf("\tWARNING: finite size bias may be present\n\n");

        if (opts.print_moments) {
            printf("\tCentral moments\n");
            printf("\tmean     = %12.5lf\n", moments.mean);
            printf("\tvariance = %12.5lf\n", moments.variance);
            printf("\tstd.dev. = %12.5lf\n", sqrt(moments.variance));
            printf("\tskewness = %12.5lf\n", moments.skewness);
            printf("\tkurtosis = %12.5lf\n", moments.kurtosis);
            printf("\tex.kurt. = %12.5lf\n", moments.kurtosis-3);
            printf("\n");
        }

        printf("\t%s MLE", discrete ? "Discrete" : "Continuous");
        if (opts.finite_size_correction)
            printf(" with finite size correction");
        printf("\n");
        printf("\talpha = %12.5lf\n", result.alpha);
        printf("\txmin  = %12.5lf\n", result.xmin );
        printf("\tL     = %12.5lf\n", result.L    );
        printf("\tD     = %12.5lf\n", result.D    );
        if (!isnan(result.p)) {
            printf("\tp     = %12.5lf%s\n", result.p,
                    opts.p_value_method == PLFIT_P_VALUE_APPROXIMATE ?
                    " (approximation)" : "");
        }
        printf("\n");
    }

    /* free the stored data */
    free(data);
}