Esempio n. 1
0
/*
 * Recursive multigrid step for variable `var` on multigrid level `level`.
 *
 * On the last level (precon->nlevels - 1) the system is handed directly to
 * conjugate_gradient() with up to 500 iterations.  On every other level the
 * routine:
 *   1. builds a work array (the residual on level 0, a copy of f otherwise),
 *   2. restricts it to level + 1 and recurses there,
 *   3. prolongs the result back, adds it to x, and
 *   4. runs 15 conjugate-gradient iterations on the current level.
 *
 * Returns the value reported by the conjugate_gradient() call that ran on
 * this level.  The value returned by the recursive call is not used.
 */
static double multigrid_recurse(struct element **sel, double ****f,
		double ****x, int var, int level, double eps,
		void (*opeval) ())
{
	if (level == (*sel)->precon->nlevels - 1) {
		return conjugate_gradient(sel, f, x, var, opeval,
				jacobi_preconditioner, eps, 500);
	}

	const int post_iters = 15;
	const int n_here = (*sel)->basis->n;
	const int n_next = (*sel)->precon->nmg[level + 1];

	/* scratch arrays: one sized for this level, two sized for level + 1 */
	double ****work = new_4d_array(n_here, n_here, n_here, (*sel)->nel);
	double ****rhs_next = new_4d_array(n_next, n_next, n_next, (*sel)->nel);
	double ****sol_next = new_4d_array(n_next, n_next, n_next, (*sel)->nel);

	if (level != 0) {
		copy_array(***f, ***work,
				n_here * n_here * n_here * (*sel)->nel);
	} else {
		compute_residual(sel, x, f, work, var, opeval);
	}

	/* descend to level + 1, solve there, then come back */
	switch_multigrid_level(sel, level + 1);
	multigrid_restriction(sel, work, rhs_next, level + 1);
	multigrid_recurse(sel, rhs_next, sol_next, var, level + 1, eps, opeval);
	switch_multigrid_level(sel, level);

	/* work is reused to hold the prolonged result before it is added to x */
	multigrid_prolongation(sel, sol_next, work, level);
	add_array(***work, ***x, n_here * n_here * n_here * (*sel)->nel, 1.0);

	const double result = conjugate_gradient(sel, f, x, var, opeval,
			jacobi_preconditioner, eps, post_iters);

	free_4d_array(work);
	free_4d_array(rhs_next);
	free_4d_array(sol_next);

	return result;
}
Esempio n. 2
0
// Single-MultiFab convenience overload of compute_residual.
//
// Each of phi, rhs and res is passed through Copy() into a local
// PArray<MultiFab>, and the three arrays are then forwarded to the
// PArray-based compute_residual overload.
//
// NOTE(review): what Copy() does (alias the MultiFab or duplicate its data)
// is not visible here.  If it duplicates, anything written into res_p would
// not reach `res` -- confirm against Copy()'s definition.
void 
FMultiGrid::compute_residual (MultiFab & phi,
			      MultiFab & rhs,
			      MultiFab & res)
{
    PArray<MultiFab> phi_p;
    PArray<MultiFab> rhs_p;
    PArray<MultiFab> res_p;
    Copy(phi_p, phi);
    Copy(rhs_p, rhs);
    Copy(res_p, res);
    // delegate to the PArray overload
    compute_residual(phi_p, rhs_p, res_p);
}
Esempio n. 3
0
/*
 * Two-phase iteration on graph g using caller-supplied work vectors.
 *
 * Phase 1 (outer/inner): while the outer residual odelta exceeds tol and
 * fewer than maxit multiplications have been counted, build f with
 * compute_f() and run gauss_seidel_sweep() on f with parameter beta until
 * the sweep's reported change drops to itol or below (or the iteration
 * budget is reached).  Each outer step ends with dangling_mult() and a fresh
 * compute_outer_residual().  The phase stops early when an outer step needed
 * fewer than two inner sweeps or when odelta < itol.
 *
 * Phase 2: gauss_seidel_sweep() on v with parameter alpha until odelta <= tol
 * or the sweep budget (maxit, not counting residual evaluations) is used up.
 * With resid set, the residual is recomputed after every sweep; otherwise
 * only when the sweep's change is below tol and iter has passed nresit.
 *
 * g      graph operated on
 * alpha  parameter of the outer problem (also used for reporting)
 * tol    stopping tolerance on the outer residual
 * maxit  multiplication / sweep budget
 * v,x,y,f caller-owned vectors of length size(g); x is updated in place
 * beta   parameter passed to the inner sweeps and compute_f()
 * itol   stopping tolerance for the inner sweeps
 * resid  recompute the residual after every phase-2 sweep
 * normed rescale x by 1/nx after every sweep
 *
 * Returns y, or NULL as soon as dangling_mult() or gauss_seidel_sweep()
 * reports a non-zero status.  Progress and the final outcome are printed to
 * stdout.
 */
double* inner_outer_gauss_seidel_alg_vecs(
                      Graph g, double alpha, double tol, int maxit, 
                      double *v, double *x, double *y, double *f, 
                      double beta, double itol, bool resid, bool normed)
{
    size_t n = (size_t)size(g);
    stime_struct start; simple_time_clock(&start);
    
    printf("inoutgs(%6.4f,%6.4f,%8e,%1i) with tol=%8e and maxit=%6i iterations\n", 
            alpha, beta, itol, resid, tol, maxit); fflush(stdout);

    double odelta, dtx, nx, ndiff, ltol=log(tol), la=log(alpha), dt;
    if (dangling_mult(g, x, y, v, 1.0, &dtx, NULL)) { return (NULL); }
    odelta = compute_outer_residual(x, y, v, alpha, n);
    int iter = 0, rval, nresit = 0, nmult = 1, nresids = 0;
#ifdef BVALGS_VERBOSE
        printf(" iogs (outer) : iter = %6i ; odelta = %10e ; dt = %7.1f\n", 
                    iter, odelta, elapsed_time(&start));
#endif     
    /* phase 1: inner-outer iteration */
    while (odelta > tol && nmult < maxit) {
        int iiter=0; 
        double idelta = odelta;
        compute_f(f, y, v, alpha, beta, n); 
        while (iter+iiter < maxit && idelta > itol) {
            rval = gauss_seidel_sweep(g, x, NULL, f, 
                    beta, 1.0, dtx, false, &nx, &idelta, &dtx); nmult++;
            /* a failed sweep is fatal here exactly as in phase 2; check it
               before nx is consumed by the normalisation below */
            if (rval) { return (NULL); }
            if (normed) {
                shift_and_scale(x,0.0,1./nx,n); dtx=dtx/nx; nx=1.0;
            }
            /* idelta is the sweep's reported change (a difference, not a
               residual) and is used as-is for the inner stopping test */
            iiter++;
        }
        if (dangling_mult(g, x, y, v, 1.0, &dtx, &dtx)) { return (NULL); }
        iter++; nmult++;
        odelta = compute_outer_residual(x,y,v,alpha,n);
#ifdef BVALGS_VERBOSE
        printf(" iogs (outer) : iter = %6i ; odelta = %10e ; dt = %7.1f ; nmult = %9i\n", 
                    iter, odelta, elapsed_time(&start), nmult);
#endif 
        /* hand over to phase 2 once the inner solve has become trivial */
        if (iiter < 2 || odelta < itol) {
            break; 
        }
    }
    dtx = sum_dtx(g,x); dt = elapsed_time(&start);
    simple_time_clock(&start);
    /* phase 2: plain Gauss-Seidel sweeps; residual evaluations are excluded
       from the sweep budget via nresids */
    while (odelta > tol && nmult-nresids < maxit) {
        rval = gauss_seidel_sweep(g, x, NULL, v, 
                    alpha, (1.0-alpha), dtx, false, &nx, &ndiff, &dtx);
        nmult++; iter++;
        /* bail out before touching nx/dtx from a failed sweep */
        if (rval) { return (NULL); }
        if (normed) {
            shift_and_scale(x,0.0,1./nx,n); dtx=dtx/nx; nx=1.0;
        }
        dt += elapsed_time(&start);
        
        /* compute the residual */
        if (resid) {
            odelta = compute_residual(g,x,y,v,alpha,dtx,nx); nmult++; nresids++;
            simple_time_clock(&start);
        } else {
            simple_time_clock(&start);
            if (ndiff < tol && iter>nresit) {
                odelta = compute_residual(g,x,y,v,alpha,dtx,nx); nmult++; nresids++;
                /* schedule the next residual check from the current gap
                   between log(tol) and log(odelta) */
                nresit=iter+(int)((ltol - log(odelta))/(2.0*la));
            }
        }
#ifdef BVALGS_VERBOSE
        printf(" iogs (   gs) : iter = %6i ; delta = %10e ; diff = %10e ; dt = %7.1f sec ; nmult = %6i\n", 
            iter, odelta, ndiff, dt, nmult );
#endif                
    }
    if (odelta > tol) { 
        printf("iogs(%6.4f) did not converge to %8e in %6i sweeps\n", 
            alpha, tol, maxit); fflush(stdout);
    } else {
        printf("iogs : solved pagerank(a=%6.4f) in %5i its, %5i sweeps, and %5i mults to %8e tol\n",
            alpha, iter, nmult-nresids, nmult, tol); fflush(stdout);
    }
    return y;
}
Esempio n. 4
0
File: tracker.c Progetto: bion/ats
/* ATS_SOUND *tracker (ANARGS *anargs, char *soundfile, char *resfile)
 * partial tracking function
 * anargs: pointer to analysis parameters; out-of-range fields are replaced
 *         by their defaults (with a warning on stderr) and the derived
 *         fields (srate, first_smp, total_samps, cycle_smp, win_size,
 *         hop_smp, frames, fft_size, fft_mag, lowest_bin, highest_bin)
 *         are filled in here
 * soundfile: path to input file, must be mono
 * resfile: path handed to compute_residual() when anargs->type is 3 or 4
 * returns an ATS_SOUND with data issued from analysis, or NULL when the
 * file cannot be opened, has more than one channel, or yields fewer than
 * ATSA_MFRAMES frames
 *
 * NOTE(review): fd is never closed in this function, neither on the early
 * error returns nor after the samples have been read; the matching sndlib
 * close call should be added.
 */
ATS_SOUND *tracker (ANARGS *anargs, char *soundfile, char *resfile)
{
  int fd, M_2, first_point, filptr, n_partials = 0;
  int frame_n, k, sflen, *win_samps, peaks_size, tracks_size = 0;
  int i, frame, i_tmp;
  float *window, norm, sfdur, f_tmp;
  /* declare structures and buffers */
  ATS_SOUND *sound = NULL;
  ATS_PEAK *peaks, *tracks = NULL, cpy_peak;
  ATS_FRAME *ana_frames = NULL, *unmatched_peaks = NULL;
  mus_sample_t **bufs;
  ATS_FFT fft;
#ifdef FFTW
  fftw_plan plan;
  FILE *fftw_wisdom_file;
#endif

  /* open input file
     we get srate and total_samps in file in anargs */
  if ((fd = mus_sound_open_input(soundfile))== -1) {
    fprintf(stderr, "%s: %s\n", soundfile, strerror(errno));
    return(NULL);
  }
  /* reject multi-channel sound files */
  if (mus_sound_chans(soundfile) > 1) {
    fprintf(stderr, "Error: file has %d channels, must be mono!\n",
            mus_sound_chans(soundfile));
    return(NULL);
  }

  fprintf(stderr, "tracking...\n");

  /* get sample rate and # of frames from file header */
  anargs->srate = mus_sound_srate(soundfile);
  sflen = mus_sound_frames(soundfile);
  sfdur = (float)sflen/anargs->srate;
  /* check analysis parameters */
  /* check start time */
  if( !(anargs->start >= 0.0 && anargs->start < sfdur) ){
    fprintf(stderr, "Warning: start %f out of bounds, corrected to 0.0\n", anargs->start);
    anargs->start = (float)0.0;
  }
  /* check duration */
  if(anargs->duration == ATSA_DUR) {
    anargs->duration = sfdur - anargs->start;
  }
  f_tmp = anargs->duration + anargs->start;
  if( !(anargs->duration > 0.0 && f_tmp <= sfdur) ){
    fprintf(stderr, "Warning: duration %f out of bounds, limited to file duration\n", anargs->duration);
    anargs->duration = sfdur - anargs->start;
  }
  /* print time bounds */
  fprintf(stderr, "start: %f duration: %f file dur: %f\n", anargs->start, anargs->duration , sfdur);
  /* check lowest frequency */
  if( !(anargs->lowest_freq > 0.0 && anargs->lowest_freq < anargs->highest_freq)){
    fprintf(stderr, "Warning: lowest freq. %f out of bounds, forced to default: %f\n", anargs->lowest_freq, ATSA_LFREQ);
    anargs->lowest_freq = ATSA_LFREQ;
  }
  /* check highest frequency */
  if( !(anargs->highest_freq > anargs->lowest_freq && anargs->highest_freq <= anargs->srate * 0.5 )){
    fprintf(stderr, "Warning: highest freq. %f out of bounds, forced to default: %f\n", anargs->highest_freq, ATSA_HFREQ);
    anargs->highest_freq = ATSA_HFREQ;
  }
  /* frequency deviation (message now states the range actually enforced) */
  if( !(anargs->freq_dev > 0.0 && anargs->freq_dev < 1.0) ){
    fprintf(stderr, "Warning: freq. dev. %f out of bounds, should be > 0.0 and < 1.0,  forced to default: %f\n", anargs->freq_dev, ATSA_FREQDEV);
    anargs->freq_dev = ATSA_FREQDEV;
  }
  /* window cycles */
  if( !(anargs->win_cycles >= 1 && anargs->win_cycles <= 8) ){
    fprintf(stderr, "Warning: windows cycles %d out of bounds, should be between 1 and 8, forced to default: %d\n", anargs->win_cycles, ATSA_WCYCLES);
    anargs->win_cycles = ATSA_WCYCLES;
  }
  /* window type */
  if( !(anargs->win_type >= 0 && anargs->win_type <= 3) ){
    fprintf(stderr, "Warning: window type %d out of bounds, should be between 0 and 3, forced to default: %d\n", anargs->win_type, ATSA_WTYPE);
    anargs->win_type = ATSA_WTYPE;
  }
  /* hop size */
  if( !(anargs->hop_size > 0.0 && anargs->hop_size <= 1.0) ){
    fprintf(stderr, "Warning: hop size %f out of bounds, should be > 0.0 and <= 1.0, forced to default: %f\n", anargs->hop_size, ATSA_HSIZE);
    anargs->hop_size = ATSA_HSIZE;
  }
  /* lowest mag: only values <= 0.0 are accepted (message now says so) */
  if( !(anargs->lowest_mag <= 0.0) ){
    fprintf(stderr, "Warning: lowest magnitude %f out of bounds, should be <= 0.0, forced to default: %f\n", anargs->lowest_mag, ATSA_LMAG);
    anargs->lowest_mag = ATSA_LMAG;
  }
  /* set some values before checking next set of parameters */
  anargs->first_smp = (int)floor(anargs->start * (float)anargs->srate);
  anargs->total_samps = (int)floor(anargs->duration * (float)anargs->srate);
  /* fundamental cycles */
  anargs->cycle_smp = (int)floor((double)anargs->win_cycles * (double)anargs->srate / (double)anargs->lowest_freq);
  /* window size: always odd */
  anargs->win_size = (anargs->cycle_smp % 2 == 0) ? anargs->cycle_smp+1 : anargs->cycle_smp;
  /* calculate hop samples */
  anargs->hop_smp = floor( (float)anargs->win_size * anargs->hop_size );
  /* compute total number of frames */
  anargs->frames = compute_frames(anargs);
  /* check that we have enough frames for the analysis */
  if( !(anargs->frames >= ATSA_MFRAMES) ){
    fprintf(stderr, "Error: %d frames are not enough for analysis, nead at least %d\n", anargs->frames , ATSA_MFRAMES);
    return(NULL);
  }
  /* check other user parameters */
  /* track length */
  if( !(anargs->track_len >= 1 && anargs->track_len < anargs->frames) ){
    i_tmp = (ATSA_TRKLEN < anargs->frames) ? ATSA_TRKLEN : anargs->frames-1;
    fprintf(stderr, "Warning: track length %d out of bounds, forced to: %d\n", anargs->track_len , i_tmp);
    anargs->track_len = i_tmp;
  }
  /* min. segment length */
  if( !(anargs->min_seg_len >= 1 && anargs->min_seg_len < anargs->frames) ){
    i_tmp = (ATSA_MSEGLEN < anargs->frames) ? ATSA_MSEGLEN : anargs->frames-1;
    fprintf(stderr, "Warning: min. segment length %d out of bounds, forced to: %d\n", anargs->min_seg_len, i_tmp);
    anargs->min_seg_len = i_tmp;
  }
  /* min. gap length */
  if( !(anargs->min_gap_len >= 0 && anargs->min_gap_len < anargs->frames) ){
    i_tmp = (ATSA_MGAPLEN < anargs->frames) ? ATSA_MGAPLEN : anargs->frames-1;
    fprintf(stderr, "Warning: min. gap length %d out of bounds, forced to: %d\n", anargs->min_gap_len, i_tmp);
    anargs->min_gap_len = i_tmp;
  }
  /* SMR threshold */
  if( !(anargs->SMR_thres >= 0.0 && anargs->SMR_thres < ATSA_MAX_DB_SPL) ){
    fprintf(stderr, "Warning: SMR threshold %f out of bounds, shoul be >= 0.0 and < %f dB SPL, forced to default: %f\n", anargs->SMR_thres, ATSA_MAX_DB_SPL, ATSA_SMRTHRES);
    anargs->SMR_thres = ATSA_SMRTHRES;
  }
  /* min. seg. SMR */
  if( !(anargs->min_seg_SMR >= anargs->SMR_thres && anargs->min_seg_SMR < ATSA_MAX_DB_SPL) ){
    fprintf(stderr, "Warning: min. seg. SMR  %f out of bounds, shoul be >= %f and < %f dB SPL, forced to default: %f\n", anargs->min_seg_SMR, anargs->SMR_thres, ATSA_MAX_DB_SPL, ATSA_MSEGSMR);
    anargs->min_seg_SMR = ATSA_MSEGSMR;
  }
  /* last peak contribution */
  if( !(anargs->last_peak_cont >= 0.0 && anargs->last_peak_cont <= 1.0) ){
    fprintf(stderr, "Warning: last peak contibution %f out of bounds, should be >= 0.0 and <= 1.0, forced to default: %f\n", anargs->last_peak_cont, ATSA_LPKCONT);
    anargs->last_peak_cont = ATSA_LPKCONT;
  }
  /* SMR cont. */
  if( !(anargs->SMR_cont >= 0.0 && anargs->SMR_cont <= 1.0) ){
    fprintf(stderr, "Warning: SMR contibution %f out of bounds, should be >= 0.0 and <= 1.0, forced to default: %f\n", anargs->SMR_cont, ATSA_SMRCONT);
    anargs->SMR_cont = ATSA_SMRCONT;
  }
  /* continue computing parameters */
  /* fft size */
  anargs->fft_size = ppp2(2*anargs->win_size);

  /* allocate memory for sound, we read the whole sound in memory */
  bufs = (mus_sample_t **)malloc(sizeof(mus_sample_t*));
  bufs[0] = (mus_sample_t *)malloc(sflen * sizeof(mus_sample_t));
  /* make our window */
  window = make_window(anargs->win_type, anargs->win_size);
  /* get window norm */
  norm = window_norm(window, anargs->win_size);
  /* fft mag for computing frequencies */
  anargs->fft_mag = (double)anargs->srate / (double)anargs->fft_size;
  /* lowest fft bin for analysis */
  anargs->lowest_bin = floor( anargs->lowest_freq / anargs->fft_mag );
  /* highest fft bin for analysis */
  anargs->highest_bin = floor( anargs->highest_freq / anargs->fft_mag );
  /* allocate an array of analysis frames in memory */
  ana_frames = (ATS_FRAME *)malloc(anargs->frames * sizeof(ATS_FRAME));
  /* allocate memory to store mid-point window sample numbers */
  win_samps = (int *)malloc(anargs->frames * sizeof(int));
  /* center point of window */
  M_2 = floor((anargs->win_size - 1) / 2);
  /* first point in fft buffer to write */
  first_point = anargs->fft_size - M_2;
  /* half a window from first sample */
  filptr = anargs->first_smp - M_2;
  /* read sound into memory */
  mus_sound_read(fd, 0, sflen-1, 1, bufs);

  /* make our fft-struct */
  fft.size = anargs->fft_size;
  fft.rate = anargs->srate;
#ifdef FFTW
  fft.data = fftw_malloc(sizeof(fftw_complex) * fft.size);
  if(fftw_import_system_wisdom()) fprintf(stderr, "system wisdom loaded!\n");
  else fprintf(stderr, "cannot locate system wisdom!\n");
  if((fftw_wisdom_file = fopen("ats-wisdom", "r")) != NULL) {
    fftw_import_wisdom_from_file(fftw_wisdom_file);
    fprintf(stderr, "ats-wisdom loaded!\n");
    fclose(fftw_wisdom_file);
  } else fprintf(stderr, "cannot locate ats-wisdom!\n");
  plan = fftw_plan_dft_1d(fft.size, fft.data, fft.data, FFTW_FORWARD, FFTW_PATIENT);
#else
  fft.fdr = (double *)malloc(anargs->fft_size * sizeof(double));
  fft.fdi = (double *)malloc(anargs->fft_size * sizeof(double));
#endif

  /* main loop */
  for (frame_n=0; frame_n<anargs->frames; frame_n++) {
    /* clear fft arrays */
#ifdef FFTW
    for(k=0; k<fft.size; k++) fft.data[k][0] = fft.data[k][1] = 0.0f;
#else
    for(k=0; k<fft.size; k++) fft.fdr[k] = fft.fdi[k] = 0.0f;
#endif
    /* multiply by window; samples outside the file are left at zero */
    for (k=0; k<anargs->win_size; k++) {
      if ((filptr >= 0) && (filptr < sflen))
#ifdef FFTW
        fft.data[(k+first_point)%fft.size][0] = window[k] * MUS_SAMPLE_TO_FLOAT(bufs[0][filptr]);
#else
        fft.fdr[(k+first_point)%anargs->fft_size] = window[k] * MUS_SAMPLE_TO_FLOAT(bufs[0][filptr]);
#endif
      filptr++;
    }
    /* we keep sample numbers of window midpoints in win_samps array */
    win_samps[frame_n] = filptr - M_2 - 1;
    /* move file pointer back */
    filptr = filptr - anargs->win_size + anargs->hop_smp;
    /* take the fft */
#ifdef FFTW
    fftw_execute(plan);
#else
    fft_slow(fft.fdr, fft.fdi, fft.size, 1);
#endif
    /* peak detection */
    peaks_size = 0;
    peaks = peak_detection(&fft, anargs->lowest_bin, anargs->highest_bin, anargs->lowest_mag, norm, &peaks_size);
    /* peak tracking */
    if (peaks != NULL) {
      /* evaluate peaks SMR (masking curves) */
      evaluate_smr(peaks, peaks_size);
      if (frame_n) {
        /* initialize or update tracks */
        if ((tracks = update_tracks(tracks, &tracks_size, anargs->track_len, frame_n, ana_frames, anargs->last_peak_cont)) != NULL) {
          /* do peak matching */
          unmatched_peaks = peak_tracking(tracks, &tracks_size, peaks, &peaks_size,  anargs->freq_dev, 2.0 * anargs->SMR_cont, &n_partials);
          /* kill unmatched peaks from previous frame */
          if(unmatched_peaks[0].peaks != NULL) {
            for(k=0; k<unmatched_peaks[0].n_peaks; k++) {
              cpy_peak = unmatched_peaks[0].peaks[k];
              cpy_peak.amp = cpy_peak.smr = 0.0;
              peaks = push_peak(&cpy_peak, peaks, &peaks_size);
            }
            free(unmatched_peaks[0].peaks);
          }
          /* give birth to peaks from new frame */
          if(unmatched_peaks[1].peaks != NULL) {
            for(k=0; k<unmatched_peaks[1].n_peaks; k++) {
              tracks = push_peak(&unmatched_peaks[1].peaks[k], tracks, &tracks_size);
              unmatched_peaks[1].peaks[k].amp = unmatched_peaks[1].peaks[k].smr = 0.0;
              ana_frames[frame_n-1].peaks = push_peak(&unmatched_peaks[1].peaks[k], ana_frames[frame_n-1].peaks, &ana_frames[frame_n-1].n_peaks);
            }
            free(unmatched_peaks[1].peaks);
          }
        } else {
          /* give number to all peaks */
          qsort(peaks, peaks_size, sizeof(ATS_PEAK), peak_frq_inc);
          for(k=0; k<peaks_size; k++) peaks[k].track = n_partials++;
        }
      } else {
        /* give number to all peaks */
        qsort(peaks, peaks_size, sizeof(ATS_PEAK), peak_frq_inc);
        for(k=0; k<peaks_size; k++) peaks[k].track = n_partials++;
      }
      /* attach peaks to ana_frames */
      ana_frames[frame_n].peaks = peaks;
      ana_frames[frame_n].n_peaks = n_partials;
      ana_frames[frame_n].time = (double)(win_samps[frame_n] - anargs->first_smp) / (double)anargs->srate;
      /* free memory; reset the pointer because it is only reassigned on
         frames where update_tracks() succeeds, and a stale value would be
         freed a second time on any later frame that takes another branch */
      free(unmatched_peaks);
      unmatched_peaks = NULL;
    } else {
      /* if no peaks found, initialize empty frame */
      ana_frames[frame_n].peaks = NULL;
      ana_frames[frame_n].n_peaks = 0;
      ana_frames[frame_n].time = (double)(win_samps[frame_n] - anargs->first_smp) / (double)anargs->srate;
    }
  }
  /* free up some memory */
  free(window);
  free(tracks);
#ifdef FFTW
  fftw_destroy_plan(plan);
  fftw_free(fft.data);
#else
  free(fft.fdr);
  free(fft.fdi);
#endif
  /* init sound */
  fprintf(stderr, "Initializing ATS data...");
  sound = (ATS_SOUND *)malloc(sizeof(ATS_SOUND));
  init_sound(sound, anargs->srate, (int)(anargs->hop_size * anargs->win_size),
             anargs->win_size, anargs->frames, anargs->duration, n_partials,
             ((anargs->type == 3 || anargs->type == 4) ? 1 : 0));
  /* store values from frames into the arrays */
  for(k=0; k<n_partials; k++) {
    for(frame=0; frame<sound->frames; frame++) {
      sound->time[k][frame] = ana_frames[frame].time;
      for(i=0; i<ana_frames[frame].n_peaks; i++)
        if(ana_frames[frame].peaks[i].track == k) {
          sound->amp[k][frame] = ana_frames[frame].peaks[i].amp;
          sound->frq[k][frame] = ana_frames[frame].peaks[i].frq;
          sound->pha[k][frame] = ana_frames[frame].peaks[i].pha;
          sound->smr[k][frame] = ana_frames[frame].peaks[i].smr;
        }
    }
  }
  fprintf(stderr, "done!\n");
  /* free up ana_frames memory */
  /* first, free all peaks in each slot of ana_frames... */
  for (k=0; k<anargs->frames; k++) free(ana_frames[k].peaks);
  /* ...then free ana_frames */
  free(ana_frames);
  /* optimize sound */
  optimize_sound(anargs, sound);
  /* compute  residual */
  if( anargs->type == 3 || anargs->type == 4 ) {
    fprintf(stderr, "Computing residual...");
    compute_residual(bufs, sflen, resfile, sound, win_samps, anargs->srate);
    fprintf(stderr, "done!\n");
  }
  /* free the rest of the memory */
  free(win_samps);
  free(bufs[0]);
  free(bufs);
  /* analyze residual */
  /* NOTE(review): the residual is written to resfile above but analysed
     from ATSA_RES_FILE here -- confirm the two always name the same file */
  if( anargs->type == 3 || anargs->type == 4 ) {
    fprintf(stderr, "Analyzing residual...");
    residual_analysis(ATSA_RES_FILE, sound);
    fprintf(stderr, "done!\n");
  }
#ifdef FFTW
  /* save accumulated wisdom; the file may not be writable, so do not hand
     a NULL stream to FFTW or fclose */
  if((fftw_wisdom_file = fopen("ats-wisdom", "w")) != NULL) {
    fftw_export_wisdom_to_file(fftw_wisdom_file);
    fclose(fftw_wisdom_file);
  } else fprintf(stderr, "cannot write ats-wisdom!\n");
#endif
  fprintf(stderr, "tracking completed.\n");
  return(sound);
}
/*
 * Driver for the distributed Jacobi solver.
 *
 * The problem size n defaults to JACOBI_DEF_SIZE and can be overridden by a
 * single command-line argument.  Every rank works on ln = n / mpi_size rows:
 * it generates its slice of A and b, runs jacobi(), gathers the local
 * solutions into X on all ranks, computes its local residual and the
 * maximum absolute residual is reduced onto ROOT_NODE, which prints the
 * summary through display_info().
 *
 * Returns 0 on success, EXIT_FAILURE if any buffer cannot be allocated.
 * All buffers are released on both paths.
 */
int
main (int argc, char **argv)
{
  int mpi_rank, mpi_size;
  int status = EXIT_FAILURE;

  int n, ln;
  double *A = NULL, *b = NULL, *xA = NULL, *xB = NULL, *xC = NULL;
  double *r = NULL, *X = NULL, *x;
  double lmax, max;
  struct timeval before, after;

  MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank (MPI_COMM_WORLD, &mpi_rank);
  
  /* Get the argument that indicates the problem size */
  n = JACOBI_DEF_SIZE;
  if (argc == 2)
    n = atoi (argv[1]);
  if (mpi_rank == ROOT_NODE)
    fprintf (stdout, "n = %d\n", n);
  /* rows handled by each rank; any remainder of n / mpi_size is dropped */
  ln = n / mpi_size;
  
  /* Initialize the random seed */
  /*srandom((unsigned int) getpid ());*/

  /* Allocate memory.  The element count of A is widened before the
     multiplication so it is not computed in int. */
  A = (double *) calloc ((size_t) ln * (size_t) n, sizeof(double));
  if (A == NULL)
    goto alloc_failed;

  b = (double *) calloc (ln, sizeof(double));
  if (b == NULL)
    goto alloc_failed;

  xA = (double *) calloc (ln, sizeof(double));
  if (xA == NULL)
    goto alloc_failed;

  xB = (double *) calloc (ln, sizeof(double));
  if (xB == NULL)
    goto alloc_failed;

  xC = (double *) calloc (ln, sizeof(double));
  if (xC == NULL)
    goto alloc_failed;

  X = (double *) calloc (n, sizeof(double));
  if (X == NULL)
    goto alloc_failed;

  r = (double *) calloc (ln, sizeof(double));
  if (r == NULL)
    goto alloc_failed;

  generate_matrix (A, ln, n, mpi_rank);
  generate_vector (b, ln);
  
  gettimeofday(&before, NULL);
  x = jacobi (xA, xB, xC, A, b, ln, n, mpi_rank, mpi_size);
  gettimeofday(&after, NULL);

  MPI_Allgather (x, ln, MPI_DOUBLE,
		 X, ln, MPI_DOUBLE, MPI_COMM_WORLD);

  /* Compute the residual */
  compute_residual (r, A, X, b, ln, n);

  /* Compute the maximum absolute value of the residual */
  lmax = find_max_abs (r, ln);
  MPI_Reduce (&lmax, &max, 1, MPI_DOUBLE, MPI_MAX, ROOT_NODE, MPI_COMM_WORLD);
  
  if (mpi_rank == ROOT_NODE)
    display_info (A, x, b, r, n, max, &before, &after);

  status = 0;
  goto cleanup;

 alloc_failed:
  perror ("calloc");

 cleanup:
  /* Free the memory; pointers never allocated are still NULL, and
     free (NULL) is a no-op */
  free (A);
  free (b);
  free (xA);
  free (xB);
  free (xC);
  free (X);
  free (r);

  MPI_Finalize ();
  return status;
}
// Command-line driver that searches for low-rank decompositions of the
// matrix-multiplication tensor with dimensions given by --m/--k/--n.
//
// Flow, as implemented below:
//   1. Print usage and exit 0 when no arguments are given or --help is set.
//   2. Parse options into local variables and the shared `parameters` struct.
//   3. Build the three matricizations prm.X[0..2] of the tensor and allocate
//      factor matrices and workspaces.
//   4. For each of --numseeds consecutive seeds (starting at --seed):
//      initialise the factors randomly (or from --input), run the selected
//      --method ("als", "sparsify" or "round"; anything else calls die()),
//      optionally round the result (--rndfin), report, and write --output.
//   5. Print summary statistics (method "als" only) and free everything.
//
// Always returns 0.
int main(int argc, char* argv[])
{

	// Print help if necessary
	bool help = read_bool(argc, argv, "--help", false);
	if ((argc < 2) || (help)) {
		usage(argv);
		return 0;
	}

	// Use parameters struct for passing parameters to kernels efficiently
	parameters prm;

	// Parse inputs
	prm.matDims[0] = read_int(argc, argv, "--m", 2);
	prm.matDims[1] = read_int(argc, argv, "--k", 2);
	prm.matDims[2] = read_int(argc, argv, "--n", 2);
	prm.rank = read_int(argc, argv, "--rank", 7);
	prm.method = read_string(argc, argv, "--method", (char *)"als");
	int maxIters = read_int(argc, argv, "--maxiters", 1000);
	int maxSecs = read_int(argc, argv, "--maxsecs", 1000);
	double tol = read_double(argc, argv, "--tol", 1e-8);
	int printItn = read_int(argc, argv, "--printitn", 0);
	double printTol = read_double(argc, argv, "--printtol", 1.0);
	int seed = read_int(argc, argv, "--seed", 0);
	int numSeeds = read_int(argc, argv, "--numseeds", 1);
	bool verbose = read_bool(argc, argv, "--verbose", false);
	prm.rnd_maxVal = read_double(argc,argv,"--maxval",1.0);
	prm.rnd_pwrOfTwo = read_int(argc,argv,"--pwrof2",0);
	bool roundFinal = read_bool(argc, argv, "--rndfin",false);
	prm.alpha = read_double(argc,argv, "--alpha", 0.1);
	// A non-zero --M sets all three prm.M entries at once; otherwise each
	// entry is read from its own option and defaults to -1.
	int M = read_int(argc,argv, "--M", 0);
	if (M)
	{
		prm.M[0] = M;
		prm.M[1] = M;
		prm.M[2] = M;
	} else {	    
		prm.M[0] = read_int(argc, argv, "--M0", -1);
		prm.M[1] = read_int(argc, argv, "--M1", -1);
		prm.M[2] = read_int(argc, argv, "--M2", -1);
	}
	char * infile = read_string(argc, argv, "--input", NULL);
	char * outfile = read_string(argc, argv, "--output", NULL);

	// Echo the effective configuration (stdout is made unbuffered first)
	if (verbose) {
		setbuf(stdout, NULL);
		printf("\n\n---------------------------------------------------------\n");
		printf("PARAMETERS\n");
		printf("dimensions = %d %d %d\n",prm.matDims[0],prm.matDims[1],prm.matDims[2]);
		printf("rank       = %d\n",prm.rank);
		printf("method     = %s\n",prm.method);
		if (infile)
			printf("input      = %s\n",infile);
		else
		{
			if (numSeeds == 1)
				printf("input      = seed %d\n",seed); 
			else
				printf("inputs     = seeds %d-%d\n",seed,seed+numSeeds-1);
		}
		if (outfile)
			printf("output     = %s\n",outfile);
		else
			printf("output     = none\n"); 
		if (!strcmp(prm.method,"als"))
		{
			printf("tol        = %1.2e\n",tol);
			printf("alpha      = %1.2e\n",prm.alpha);
			printf("maval      = %1.2e\n",prm.rnd_maxVal);
			printf("M's        = (%d,%d,%d)\n",prm.M[0],prm.M[1],prm.M[2]);
			printf("maxiters   = %d\n",maxIters);
			printf("maxsecs    = %d\n",maxSecs);
			printf("printitn   = %d\n",printItn);
			printf("printtol   = %1.2e\n",printTol);
		}
		printf("---------------------------------------------------------\n");
	}

	// Initialize other variables
	int i, j, k, numIters, mkn, tidx[3];
	double err, errOld, errChange = 0.0, start_als, start_search, elapsed, threshold;

	// Compute tensor dimensions
	prm.dims[0] = prm.matDims[0]*prm.matDims[1];
	prm.dims[1] = prm.matDims[1]*prm.matDims[2];
	prm.dims[2] = prm.matDims[0]*prm.matDims[2];

	// Compute tensor's nnz, total number of entries, and Frobenius norm
	mkn = prm.matDims[0]*prm.matDims[1]*prm.matDims[2];
	prm.mkn2 = mkn*mkn;
	prm.xNorm = sqrt(mkn);

	// Compute number of columns in matricized tensors
	for (i = 0; i < 3; i++)
		prm.mtCols[i] = prm.mkn2 / prm.dims[i];

	// Construct three matricizations of matmul tensor
	// (each is a zero-initialised buffer of mkn2 doubles; one entry per
	// (mm,kk,nn) triple is set to 1, with the index order permuted so that
	// a different tidx component is the fastest-varying one in each copy)
	prm.X = (double**) malloc( 3 * sizeof(double*) );
	for (i = 0; i < 3; i++)
		prm.X[i] = (double*) calloc( prm.mkn2, sizeof(double) );
	for (int mm = 0; mm < prm.matDims[0]; mm++)
		for (int kk = 0; kk < prm.matDims[1]; kk++)
			for (int nn = 0; nn < prm.matDims[2]; nn++)
			{
				tidx[0] = mm + kk*prm.matDims[0];
				tidx[1] = kk + nn*prm.matDims[1];
				tidx[2] = mm + nn*prm.matDims[0];
				prm.X[0][tidx[0]+prm.dims[0]*(tidx[1]+prm.dims[1]*tidx[2])] = 1;
				prm.X[1][tidx[1]+prm.dims[1]*(tidx[0]+prm.dims[0]*tidx[2])] = 1;
				prm.X[2][tidx[2]+prm.dims[2]*(tidx[0]+prm.dims[0]*tidx[1])] = 1;
			}

	// Allocate factor weights and matrices: working, initial, and model
	// (prm.U[i] is sized mkn2, larger than the dims[i]*rank entries copied
	// into it below -- NOTE(review): presumably reused as scratch elsewhere;
	// confirm before shrinking)
	prm.lambda = (double*) malloc( prm.rank * sizeof(double) );
	prm.U  = (double**) malloc( 3 * sizeof(double*) );
	double** U0 = (double**) malloc( 3 * sizeof(double*) );
	prm.model = (double**) malloc( 3 * sizeof(double*) );
	for (i = 0; i < 3; i++)
	{
		prm.U[i] =  (double*) calloc( prm.mkn2, sizeof(double) );
		U0[i] = (double*) calloc( prm.dims[i]*prm.rank, sizeof(double) );
		prm.model[i] = (double*) calloc( prm.dims[i]*prm.rank, sizeof(double) );
	}

	// Allocate coefficient matrix within ALS (Khatri-Rao product) 
	int maxMatDim = prm.matDims[0];
	if (maxMatDim < prm.matDims[1]) maxMatDim = prm.matDims[1];
	if (maxMatDim < prm.matDims[2]) maxMatDim = prm.matDims[2];
	prm.A = (double*) malloc( maxMatDim*mkn*prm.rank * sizeof(double) );

	// Allocate workspaces
	prm.tau = (double*) malloc( mkn * sizeof(double) );
	prm.lwork = maxMatDim*mkn*prm.rank;
	prm.work = (double*) malloc( prm.lwork * sizeof(double) );
	prm.iwork = (int*) malloc( prm.mkn2 * sizeof(int) );    

	// Allocate matrices for normal equations 
	int maxDim = prm.dims[0];
	if (maxDim < prm.dims[1]) maxDim = prm.dims[1];
	if (maxDim < prm.dims[2]) maxDim = prm.dims[2];
	prm.NE_coeff = (double*) malloc( prm.rank*prm.rank * sizeof(double) );
	prm.NE_rhs = (double*) malloc( maxDim*prm.rank * sizeof(double) );
	prm.residual = (double*) malloc( prm.mkn2 * sizeof(double) );

	//--------------------------------------------------
	// Search Loop
	//--------------------------------------------------
	// mySeed advances by one per pass; numGoodSeeds counts runs whose final
	// err is below printTol; statusCnt/status drive the 10% progress lines.
	int mySeed = seed, numGoodSeeds = 0, statusCnt = 0, status = 1;
	start_search = wall_time(); 
	for (int seed_cnt = 0; seed_cnt < numSeeds; ++seed_cnt)
	{
		// Set starting point from random seed (match Matlab Tensor Toolbox)
		RandomMT cRMT(mySeed);
		for (i = 0; i < 3; i++)
			for (j = 0; j < prm.dims[i]; j++)
				for (k = 0; k < prm.rank; k++)
					U0[i][j+k*prm.dims[i]] = cRMT.genMatlabMT();
		for (i = 0; i < prm.rank; i++)
			prm.lambda[i] = 1.0;  

		// Copy starting point
		for (i = 0; i < 3; i++)
			cblas_dcopy(prm.dims[i]*prm.rank,U0[i],1,prm.U[i],1); 

		// read from file if input is given    
		// (done on every pass, after the random initialisation above)
		if( infile )
			read_input( infile, prm ); 

		if (verbose)
		{ 
			printf("\nSTARTING POINT...\n");
			for (i = 0; i < 3; i++)
			{
				printf("Factor matrix %d:\n",i);
				print_matrix(prm.U[i],prm.dims[i],prm.rank,prm.dims[i]);
			}
			printf("\n");
		}   

		//--------------------------------------------------
		// Main ALS Loop
		//--------------------------------------------------
		// Runs until maxIters iterations or maxSecs seconds have elapsed, or
		// until the change in err between iterations falls below tol.
		start_als = wall_time();
		err = 1.0; 
		threshold = 1e-4;
		for (numIters = 0; numIters < maxIters && (wall_time()-start_als) < maxSecs; numIters++)
		{
			errOld = err;

			if (!strcmp(prm.method,"als"))
			{
				// Perform an iteration of ALS using NE with Smirnov's penalty term
				err = als( prm );
			}
			else if (!strcmp(prm.method,"sparsify"))
			{   
				// print stats before sparsifying
				printf("Old residual: %1.2e\n",compute_residual(prm,2,true));
				printf("Old nnz (larger than %1.1e): %d %d %d\n", threshold, nnz(prm.U[0],prm.dims[0]*prm.rank,threshold), nnz(prm.U[1],prm.dims[1]*prm.rank,threshold), nnz(prm.U[2],prm.dims[2]*prm.rank,threshold) );

				// sparsify and return
				// (setting numIters to maxIters makes this the only pass;
				// err itself is not updated in this branch)
				printf("\nSparsifying...\n\n");
				sparsify( prm );
				numIters = maxIters;

				// print stats after sparsifying
				printf("New residual: %1.2e\n",compute_residual(prm,2,true));
				printf("New nnz (larger than %1.1e): %d %d %d\n", threshold, nnz(prm.U[0],prm.dims[0]*prm.rank,threshold), nnz(prm.U[1],prm.dims[1]*prm.rank,threshold), nnz(prm.U[2],prm.dims[2]*prm.rank,threshold) );
			}
			else if (!strcmp(prm.method,"round"))
			{
				// print stats before rounding
				printf("Old residual: %1.2e\n",compute_residual(prm,2,true));
				printf("Old nnz (larger than %1.1e): %d %d %d\n", threshold, nnz(prm.U[0],prm.dims[0]*prm.rank,threshold), nnz(prm.U[1],prm.dims[1]*prm.rank,threshold), nnz(prm.U[2],prm.dims[2]*prm.rank,threshold) );
				// round and return
				// (single pass, as in the sparsify branch; err is not updated)
				for (i = 0; i < 3; i++)
				{
					capping(prm.U[i],prm.dims[i]*prm.rank,prm.rnd_maxVal);
					rounding(prm.U[i],prm.dims[i]*prm.rank,prm.rnd_pwrOfTwo);
				}
				numIters = maxIters;

				// print stats after rounding
				printf("New residual: %1.2e\n",compute_residual(prm,2,true));
				printf("New nnz (larger than %1.1e): %d %d %d\n", threshold, nnz(prm.U[0],prm.dims[0]*prm.rank,threshold), nnz(prm.U[1],prm.dims[1]*prm.rank,threshold), nnz(prm.U[2],prm.dims[2]*prm.rank,threshold) );
			}
			else
				die("Invalid method\n");   

			// Compute change in relative residual norm
			errChange = fabs(err - errOld);          

			// Print info at current iteration
			if ((printItn > 0) && (((numIters + 1) % printItn) == 0))
			{                
				// print info                    
				printf ("Iter %d: residual = %1.5e change = %1.5e\n", numIters + 1, err, errChange);
			} 

			// Check for convergence 
			// (never on the very first iteration, where errOld is the
			// initial placeholder 1.0)
			if ( numIters > 0 && errChange < tol )
				break;

		}

		// If rounding, round final solution and re-compute residual
		if(roundFinal)
		{
			// normalize columns in A and B factors, put arbitrary weights into C
			normalize_model( prm, 2 );

			// cap large values and round to nearest power of 2
			for (i = 0; i < 3; i++)
			{
				capping(prm.U[i],prm.dims[i]*prm.rank,prm.rnd_maxVal);
				rounding(prm.U[i],prm.dims[i]*prm.rank,prm.rnd_pwrOfTwo);
			}

			err = compute_residual(prm,0,true);
		}    

		// Print status if searching over many seeds
		// (one line every numSeeds/10 seeds, only when numSeeds > 1000)
		statusCnt++;
		if (numSeeds > 1000 && statusCnt == numSeeds/10)
		{
			printf("...%d%% complete...\n",10*status);
			status++;
			statusCnt = 0;
		}

		// Print final info
		// NOTE(review): numIters is 0 here when the loop body never ran
		// (e.g. --maxiters 0), which makes elapsed/numIters a division by
		// zero in floating point -- confirm whether that case needs a guard.
		elapsed = wall_time() - start_als;
		if ((printItn > 0 || verbose) && !strcmp(prm.method,"als"))
		{
			if (infile)
				printf("\nInput %s ",infile);
			else
				printf("\nInitial seed %d ",mySeed);
			printf("achieved residual %1.3e in %d iterations and %1.3e seconds\n \t final residual change: %1.3e\n \t average time per iteration: %1.3e s\n", err, numIters, elapsed, errChange, elapsed/numIters);
		}

		// In verbose mode the solution is always printed; otherwise only
		// runs with err < printTol are reported.  Both paths count the run
		// as a good seed when err < printTol.
		if (verbose)
		{
			printf("\nSOLUTION...\n");
			for (i = 0; i < 3; i++)
			{
				printf("Factor matrix %d:\n",i);
				if (roundFinal || !strcmp(prm.method,"round"))
					print_int_matrix(prm.U[i], prm.dims[i], prm.rank, prm.dims[i], prm.rnd_pwrOfTwo);
				else
					print_matrix(prm.U[i],prm.dims[i],prm.rank,prm.dims[i]);
			}
			
			if (err < printTol)
				numGoodSeeds++;
		}
		else if (err < printTol)
		{
			numGoodSeeds++;

			printf("\n\n***************************************\n");
			if (infile)
				printf("Input %s: ",infile);
			else
				printf("Initial seed %d: ",mySeed);
			printf("after %d iterations, achieved residual %1.3e with final residual change of %1.3e\n", numIters, err, errChange);
			if (roundFinal)
			{

				for (i = 0; i < 3; i++)
				{
					printf("Factor matrix %d:\n",i);
					print_int_matrix(prm.U[i], prm.dims[i], prm.rank, prm.dims[i], prm.rnd_pwrOfTwo);
				}

				// total nonzeros over the three factor matrices
				int count = 0;
				for (i = 0; i < 3; i++)
					count += nnz(prm.U[i],prm.dims[i]*prm.rank);
				printf("\ttotal nnz in solution: %d\n",count);
				printf("\tnaive adds/subs:       %d\n",count - prm.dims[2] - 2*prm.rank);
			}
			printf("***************************************\n\n\n");
		}

		// write to output
		// (same file name on every pass)
		if( outfile )
			write_output( outfile, prm ); 

		mySeed++;
	}      

	// Final report of processor statistics
	elapsed = wall_time()-start_search;

	// Print stats
	if (!strcmp(prm.method,"als"))
	{
		printf("\n\n------------------------------------------------------------\n");
		printf("Time elapsed:                \t%1.1e\tseconds\n",elapsed);
		printf("Total number of seeds tried: \t%d\n",numSeeds);
		printf("Total number of good seeds:  \t%d",numGoodSeeds);
		printf("\t(residual < %2.1e)\n",printTol);   
		printf("------------------------------------------------------------\n");
	}


	// free allocated memory
	for (i = 0; i < 3; i++)
	{
		free( prm.X[i] );
		free( prm.U[i] );
		free( U0[i] );
		free( prm.model[i] );
	} 
	free( prm.X );
	free( prm.U );
	free( U0 );
	free( prm.model );
	free( prm.lambda );
	free( prm.A );
	free( prm.NE_coeff );
	free( prm.NE_rhs );
	free( prm.residual );
	free( prm.tau );
	free( prm.work );
	free( prm.iwork );

	return 0;

}