/*
 * OCaml stub: attach the band-block-diagonal preconditioner to a backward
 * problem by forwarding the unpacked arguments to CVBBDPrecInitB.
 *
 *   vparentwhich - block whose field 0 yields the CVODES memory (through
 *                  CVODE_MEM_FROM_ML) and whose field 1 is the integer
 *                  identifier passed as the second argument
 *   vlocaln      - local problem length (OCaml integer)
 *   vbandwidths  - record holding the mudq/mldq/mukeep/mlkeep bandwidths
 *   vdqrely      - relative increment for the difference quotients (float)
 *   vhascomm     - when true, bbbdcomm is registered as the communication
 *                  callback; otherwise NULL is passed
 *
 * Returns unit. A result flag from CVBBDPrecInitB is handed to CHECK_FLAG.
 */
CAMLprim value sunml_cvodes_bbd_prec_initb (value vparentwhich, value vlocaln,
                                            value vbandwidths, value vdqrely,
                                            value vhascomm)
{
    CAMLparam5(vparentwhich, vlocaln, vbandwidths, vdqrely, vhascomm);

    void *cvode_mem = CVODE_MEM_FROM_ML (Field(vparentwhich, 0));
    const int which = Int_val(Field(vparentwhich, 1));
    const double dqrely = Double_val(vdqrely);
    const int has_comm = Bool_val(vhascomm);

    /* The local function is always bbbdlocal; only the communication
       callback is optional. */
    const int status = CVBBDPrecInitB (
        cvode_mem,
        which,
        Long_val(vlocaln),
        Long_val(Field(vbandwidths, RECORD_CVODE_BANDBLOCK_BANDWIDTHS_MUDQ)),
        Long_val(Field(vbandwidths, RECORD_CVODE_BANDBLOCK_BANDWIDTHS_MLDQ)),
        Long_val(Field(vbandwidths, RECORD_CVODE_BANDBLOCK_BANDWIDTHS_MUKEEP)),
        Long_val(Field(vbandwidths, RECORD_CVODE_BANDBLOCK_BANDWIDTHS_MLKEEP)),
        dqrely,
        bbbdlocal,
        has_comm ? bbbdcomm : NULL);

    CHECK_FLAG ("CVBBDPrecInitB", status);

    CAMLreturn (Val_unit);
}
int main(int argc, char *argv[])
{
  ProblemData d;

  MPI_Comm comm;
  int npes, npes_needed;
  int myId;
 
  long int neq, l_neq;

  void *cvode_mem;
  N_Vector y, q;
  realtype abstol, reltol, abstolQ, reltolQ;
  long int mudq, mldq, mukeep, mlkeep;

  int indexB;
  N_Vector yB, qB;
  realtype abstolB, reltolB, abstolQB, reltolQB;
  long int mudqB, mldqB, mukeepB, mlkeepB;

  realtype tret, *qdata, G;

  int ncheckpnt, flag;

  booleantype output;

  /* Initialize MPI and set Ids */
  MPI_Init(&argc, &argv);
  comm = MPI_COMM_WORLD;
  MPI_Comm_rank(comm, &myId);

  /* Check number of processes */
  npes_needed = NPX * NPY;
#ifdef USE3D
  npes_needed *= NPZ;
#endif
  MPI_Comm_size(comm, &npes);
  if (npes_needed != npes) {
    if (myId == 0)
      fprintf(stderr,"I need %d processes but I only got %d\n",
              npes_needed, npes);
    MPI_Abort(comm, EXIT_FAILURE);
  }

  /* Test if matlab output is requested */
  if (argc > 1) output = TRUE;
  else          output = FALSE;

  /* Allocate and set problem data structure */
  d = (ProblemData) malloc(sizeof *d);
  SetData(d, comm, npes, myId, &neq, &l_neq);
  
  if (myId == 0) PrintHeader();

  /*-------------------------- 
    Forward integration phase
    --------------------------*/

  /* Allocate space for y and set it with the I.C. */
  y = N_VNew_Parallel(comm, l_neq, neq);
  N_VConst(ZERO, y);
  
  /* Allocate and initialize qB (local contribution to cost) */
  q = N_VNew_Parallel(comm, 1, npes); 
  N_VConst(ZERO, q);

  /* Create CVODES object, attach user data, and allocate space */
  cvode_mem = CVodeCreate(CV_BDF, CV_NEWTON);
  flag = CVodeSetUserData(cvode_mem, d);
  flag = CVodeInit(cvode_mem, f, ti, y);
  abstol = ATOL;  
  reltol = RTOL;   
  flag = CVodeSStolerances(cvode_mem, reltol, abstol);

  /* attach linear solver */
  flag = CVSpgmr(cvode_mem, PREC_LEFT, 0);
  
  /* Attach preconditioner and linear solver modules */
  mudq = mldq = d->l_m[0]+1;
  mukeep = mlkeep = 2;  
  flag = CVBBDPrecInit(cvode_mem, l_neq, mudq, mldq, 
                       mukeep, mlkeep, ZERO,
                       f_local, NULL);
  
  /* Initialize quadrature calculations */
  abstolQ = ATOL_Q;
  reltolQ = RTOL_Q;
  flag = CVodeQuadInit(cvode_mem, fQ, q);
  flag = CVodeQuadSStolerances(cvode_mem, reltolQ, abstolQ);
  flag = CVodeSetQuadErrCon(cvode_mem, TRUE);

  /* Allocate space for the adjoint calculation */
  flag = CVodeAdjInit(cvode_mem, STEPS, CV_HERMITE);

  /* Integrate forward in time while storing check points */
  if (myId == 0) printf("Begin forward integration... ");
  flag = CVodeF(cvode_mem, tf, y, &tret, CV_NORMAL, &ncheckpnt);
  if (myId == 0) printf("done. ");

   /* Extract quadratures */
  flag = CVodeGetQuad(cvode_mem, &tret, q);
  qdata = NV_DATA_P(q);
  MPI_Allreduce(&qdata[0], &G, 1, PVEC_REAL_MPI_TYPE, MPI_SUM, comm);
#if defined(SUNDIALS_EXTENDED_PRECISION)
  if (myId == 0) printf("  G = %Le\n",G);
#elif defined(SUNDIALS_DOUBLE_PRECISION)
  if (myId == 0) printf("  G = %e\n",G);
#else
  if (myId == 0) printf("  G = %e\n",G);
#endif

  /* Print statistics for forward run */
  if (myId == 0) PrintFinalStats(cvode_mem);

  /*-------------------------- 
    Backward integration phase
    --------------------------*/
 
  /* Allocate and initialize yB */
  yB = N_VNew_Parallel(comm, l_neq, neq); 
  N_VConst(ZERO, yB);

  /* Allocate and initialize qB (gradient) */
  qB = N_VNew_Parallel(comm, l_neq, neq); 
  N_VConst(ZERO, qB);

  /* Create and allocate backward CVODE memory */
  flag = CVodeCreateB(cvode_mem, CV_BDF, CV_NEWTON, &indexB);
  flag = CVodeSetUserDataB(cvode_mem, indexB, d);
  flag = CVodeInitB(cvode_mem, indexB, fB, tf, yB);
  abstolB = ATOL_B;  
  reltolB = RTOL_B; 
  flag = CVodeSStolerancesB(cvode_mem, indexB, reltolB, abstolB);

  /* Attach preconditioner and linear solver modules */
  flag = CVSpgmrB(cvode_mem, indexB, PREC_LEFT, 0); 
  mudqB = mldqB = d->l_m[0]+1;
  mukeepB = mlkeepB = 2;  
  flag = CVBBDPrecInitB(cvode_mem, indexB, l_neq, mudqB, mldqB, 
                        mukeepB, mlkeepB, ZERO, fB_local, NULL);

  /* Initialize quadrature calculations */
  abstolQB = ATOL_QB;
  reltolQB = RTOL_QB;
  flag = CVodeQuadInitB(cvode_mem, indexB, fQB, qB);
  flag = CVodeQuadSStolerancesB(cvode_mem, indexB, reltolQB, abstolQB);
  flag = CVodeSetQuadErrConB(cvode_mem, indexB, TRUE);

  /* Integrate backwards */
  if (myId == 0) printf("Begin backward integration... ");
  flag = CVodeB(cvode_mem, ti, CV_NORMAL);
  if (myId == 0) printf("done.\n");
  
  /* Extract solution */
  flag = CVodeGetB(cvode_mem, indexB, &tret, yB);

  /* Extract quadratures */
  flag = CVodeGetQuadB(cvode_mem, indexB, &tret, qB);

  /* Print statistics for backward run */
  if (myId == 0) {
    PrintFinalStats(CVodeGetAdjCVodeBmem(cvode_mem, indexB));
  }

  /* Process 0 collects the gradient components and prints them */
  if (output) {
    OutputGradient(myId, qB, d);
    if (myId == 0) printf("Wrote matlab file 'grad.m'.\n");
  }

  /* Free memory */

  N_VDestroy_Parallel(y);
  N_VDestroy_Parallel(q);
  N_VDestroy_Parallel(qB);
  N_VDestroy_Parallel(yB);

  CVodeFree(&cvode_mem);

  MPI_Finalize();

  return(0);
}