// Example program: run the mixed-radix FFT routine fftmr_cycle() on a
// deterministic test vector of length nfft and compare the result against
// the output of dft_run() on the same input.
//
//  options:
//    -h      : call usage() and exit with status 0
//    -n <n>  : transform size (default: 30, must be at least 2)
//
// Returns 0 once the comparison has been printed (whether it reports "pass"
// or "FAIL"); exits with status 1 on an unrecognized option, an invalid
// transform size, a factoring failure, or a memory allocation failure.
int main(int argc, char*argv[]) {
    // transform size
    unsigned int nfft = 30;

    int dopt;
    while ((dopt = getopt(argc,argv,"uhn:")) != EOF) {
        switch (dopt) {
        case 'h':
            usage();
            return 0;
        case 'n': {
            // a negative value would wrap around to a huge unsigned size;
            // map it to 0 so the validation below rejects it
            int value = atoi(optarg);
            nfft = value < 0 ? 0 : (unsigned int)value;
            break;
        }
        default:
            exit(1);
        }
    }

    // validate input: a size of 0 or 1 yields no factors at all, which
    // would leave p[] and m[] uninitialized for the transform below
    if ( nfft < 2 ) {
        fprintf(stderr,"error: input transform size must be at least 2\n");
        exit(1);
    }

    unsigned int i;
    unsigned int k;

    // find 'prime' factors: on each pass the smallest divisor k >= 2 of the
    // remaining value n is split off; p[] stores that divisor and m[] the
    // quotient left over after dividing it out
    unsigned int n = nfft;
    unsigned int p[MAX_FACTORS];
    unsigned int m[MAX_FACTORS];
    unsigned int num_factors = 0;

    do {
        for (k=2; k<=n; k++) {
            if ( (n%k)==0 ) {
                n /= k;
                p[num_factors] = k;
                m[num_factors] = n;
                num_factors++;
                break;
            }
        }
    } while (n > 1 && num_factors < MAX_FACTORS);

    // NOTE: this is extremely unlikely as the worst case is
    //       nfft=2^MAX_FACTORS in which case we will probably run out
    //       of memory first
    if (num_factors == MAX_FACTORS) {
        fprintf(stderr,"error: could not factor %u with %u factors\n", nfft, MAX_FACTORS);
        exit(1);
    }

    printf("factors of %u:\n", nfft);
    for (i=0; i<num_factors; i++)
        printf("  p=%3u, m=%3u\n", p[i], m[i]);

    // create and initialize data arrays
    //  x       : input sequence
    //  y       : output of the mixed-radix transform
    //  y_test  : reference output from dft_run()
    float complex * x      = (float complex *) malloc(nfft * sizeof(float complex));
    float complex * y      = (float complex *) malloc(nfft * sizeof(float complex));
    float complex * y_test = (float complex *) malloc(nfft * sizeof(float complex));
    if (x == NULL || y == NULL || y_test == NULL) {
        fprintf(stderr,"error: %s, not enough memory for allocation\n", argv[0]);
        exit(1);
    }
    for (i=0; i<nfft; i++) {
        // deterministic ramp: real part i, imaginary part 3-i
        x[i] = (float)i + _Complex_I*(3 - (float)i);
        y[i] = 0.0f;
    }

    // compute output for testing
    // NOTE(review): meaning of the trailing 0 argument is defined by
    //               dft_run(), which is not visible here
    dft_run(nfft, x, y_test, DFT_FORWARD, 0);

    // compute twiddle factors (roots of unity)
    float complex * twiddle = (float complex *) malloc(nfft * sizeof(float complex));
    if (twiddle == NULL) {
        fprintf(stderr,"error: %s, not enough memory for twiddle factors\n", argv[0]);
        exit(1);
    }
    for (i=0; i<nfft; i++)
        twiddle[i] = cexpf(-_Complex_I*2*M_PI*(float)i / (float)nfft);

    // call mixed-radix function
    // NOTE(review): the arguments 0 and 1 are presumably an initial offset
    //               and stride; confirm against fftmr_cycle()
    fftmr_cycle(x, y, twiddle, nfft, 0, 1, m, p);

    //
    // print results
    //
    for (i=0; i<nfft; i++) {
        printf("  y[%3u] = %12.6f + j*%12.6f (expected %12.6f + j%12.6f)\n",
            i,
            crealf(y[i]),      cimagf(y[i]),
            crealf(y_test[i]), cimagf(y_test[i]));
    }

    // compute root-mean-square error between the two outputs
    float rmse = 0.0f;
    for (i=0; i<nfft; i++) {
        float e = cabsf(y[i] - y_test[i]);
        rmse += e*e;
    }
    rmse = sqrtf(rmse / (float)nfft);
    printf("RMS error : %12.4e (%s)\n", rmse, rmse < 1e-3 ? "pass" : "FAIL");

    // free allocated memory
    free(x);
    free(y);
    free(y_test);
    free(twiddle);

    return 0;
}
// Example program: compute a prime-length DFT using Rader's algorithm and
// compare the result against the output of dft_run() on the same input.
//
//  options:
//    -h      : call usage() and exit with status 0
//    -n <n>  : transform size (default: 17, must be prime and greater
//              than two)
//
// Returns 0 once the comparison has been printed (whether it reports "pass"
// or "FAIL"); exits with status 1 on an unrecognized option, an invalid
// transform size, or a memory allocation failure.
int main(int argc, char*argv[]) {
    // transform size (must be prime)
    unsigned int nfft = 17;

    int dopt;
    while ((dopt = getopt(argc,argv,"uhn:")) != EOF) {
        switch (dopt) {
        case 'h':
            usage();
            return 0;
        case 'n': {
            // a negative value would wrap around to a huge unsigned size;
            // map it to 0 so the validation below rejects it
            int value = atoi(optarg);
            nfft = value < 0 ? 0 : (unsigned int)value;
            break;
        }
        default:
            exit(1);
        }
    }

    // validate input
    if ( nfft <= 2 || !is_prime(nfft)) {
        fprintf(stderr,"error: %s, input transform size must be prime and greater than two\n", argv[0]);
        exit(1);
    }

    unsigned int i;

    // create and initialize data arrays
    //  x       : input sequence
    //  y       : output of Rader's algorithm
    //  y_test  : reference output from dft_run()
    float complex * x      = (float complex *) malloc(nfft * sizeof(float complex));
    float complex * y      = (float complex *) malloc(nfft * sizeof(float complex));
    float complex * y_test = (float complex *) malloc(nfft * sizeof(float complex));
    if (x == NULL || y == NULL || y_test == NULL) {
        fprintf(stderr,"error: %s, not enough memory for allocation\n", argv[0]);
        exit(1);
    }
    for (i=0; i<nfft; i++) {
        // deterministic ramp: real part i, imaginary part 3-i
        x[i] = (float)i + _Complex_I*(3 - (float)i);
        y[i] = 0.0f;
    }

    // compute output for testing
    // NOTE(review): meaning of the trailing 0 argument is defined by
    //               dft_run(), which is not visible here
    dft_run(nfft, x, y_test, DFT_FORWARD, 0);

    //
    // run Rader's algorithm
    //

    // compute primitive root of nfft
    unsigned int g = primitive_root(nfft);

    // allocate internal work buffers, each of length nfft-1
    //  s       : index sequence, s[i] = modpow(g, i+1, nfft)
    //  r, R    : complex exponential sequence and its DFT
    //  xp, Xp  : permuted input sequence and its DFT
    unsigned int  * s  = (unsigned int  *) malloc((nfft-1)*sizeof(unsigned int));
    float complex * r  = (float complex *) malloc((nfft-1)*sizeof(float complex));
    float complex * R  = (float complex *) malloc((nfft-1)*sizeof(float complex));
    float complex * xp = (float complex *) malloc((nfft-1)*sizeof(float complex));
    float complex * Xp = (float complex *) malloc((nfft-1)*sizeof(float complex));
    if (s == NULL || r == NULL || R == NULL || xp == NULL || Xp == NULL) {
        fprintf(stderr,"error: %s, not enough memory for allocation\n", argv[0]);
        exit(1);
    }

    // create and initialize sequence
    for (i=0; i<nfft-1; i++)
        s[i] = modpow(g, i+1, nfft);

#if DEBUG
    printf("computed primitive root of %u as %u\n", nfft, g);
    // generate sequence (sanity check)
    printf("s = [");
    for (i=0; i<nfft-1; i++)
        printf("%4u", s[i]);
    printf("]\n");
#endif

    // compute DFT of sequence { exp(-j*2*pi*s[i]/nfft) }, size: nfft-1
    // NOTE: R[0] = -1, |R[k]| = sqrt(nfft) for k != 0
    // NOTE: R can be pre-computed
    for (i=0; i<nfft-1; i++)
        r[i] = cexpf(-_Complex_I*2*M_PI*s[i]/(float)(nfft));
    dft_run(nfft-1, r, R, DFT_FORWARD, 0);

    // compute DFT of permuted sequence, size: nfft-1
    for (i=0; i<nfft-1; i++) {
        // reverse sequence: read s[] from its last element down to its first
        unsigned int k = s[nfft-1-i-1];
        xp[i] = x[k];
    }
    dft_run(nfft-1, xp, Xp, DFT_FORWARD, 0);

    // compute inverse FFT of product; the result is written back into xp
    for (i=0; i<nfft-1; i++)
        Xp[i] *= R[i];
    dft_run(nfft-1, Xp, xp, DFT_REVERSE, 0);

    // set DC value: y[0] is the sum of all input samples
    y[0] = 0.0f;
    for (i=0; i<nfft; i++)
        y[0] += x[i];

    // reverse permute result, scale, and add offset x[0]
    for (i=0; i<nfft-1; i++) {
        unsigned int k = s[i];

        // dividing by the transform length (nfft-1) normalizes the
        // forward/reverse dft_run() pair
        y[k] = xp[i] / (float)(nfft-1) + x[0];
    }

    // free internal memory
    free(r);
    free(R);
    free(xp);
    free(Xp);
    free(s);

    //
    // print results
    //
    for (i=0; i<nfft; i++) {
        printf("  y[%3u] = %12.6f + j*%12.6f (expected %12.6f + j%12.6f)\n",
            i,
            crealf(y[i]),      cimagf(y[i]),
            crealf(y_test[i]), cimagf(y_test[i]));
    }

    // compute root-mean-square error between the two outputs
    float rmse = 0.0f;
    for (i=0; i<nfft; i++) {
        float e = cabsf(y[i] - y_test[i]);
        rmse += e*e;
    }
    rmse = sqrtf(rmse / (float)nfft);
    printf("RMS error : %12.4e (%s)\n", rmse, rmse < 1e-3 ? "pass" : "FAIL");

    // free allocated memory
    free(x);
    free(y);
    free(y_test);

    return 0;
}