Exemple #1
0
THREADABLE_FUNCTION_END

// Advance the lx gauge configuration by one step of size dt along the momenta H.
// For every local site and direction the momentum is scaled by dt, passed through
// safe_hermitian_exact_i_exponentiate, and the result left-multiplies the link.
// The borders of lx_conf are flagged invalid afterwards.
THREADABLE_FUNCTION_3ARG(evolve_lx_conf_with_momenta, quad_su3*,lx_conf, quad_su3*,H, double,dt)
{
    GET_THREAD_ID();

    verbosity_lv2_master_printf("Evolving conf with momenta, dt=%lg\n",dt);

    START_TIMING(conf_evolve_time,nconf_evolve);

    // loop over local sites (parallelised by the macro) and over directions
    NISSA_PARALLEL_LOOP(ivol,0,loc_vol)
    for(int mu=0; mu<NDIM; mu++)
    {
        // momentum of this link multiplied by the step size
        su3 scaled_mom;
        su3_prod_double(scaled_mom,H[ivol][mu],dt);

        // matrix by which the link gets multiplied
        su3 link_rot;
        safe_hermitian_exact_i_exponentiate(link_rot,scaled_mom);

        // in-place update: link <- link_rot * link
        safe_su3_prod_su3(lx_conf[ivol][mu],link_rot,lx_conf[ivol][mu]);
    }

    // the links changed, so any previously communicated border is stale
    set_borders_invalid(lx_conf);

    STOP_TIMING(conf_evolve_time);
}
/*
 * Timing driver for PLASMA_zgetrf_nopiv_Tile on an M x N tile matrix.
 *
 * iparam : integer run parameters, unpacked by PASTE_CODE_IPARAM_LOCALS
 *          (the names M, N, NRHS, LDA, LDB, MT and check used below come
 *          from that macro).
 * dparam : output slots; when the check runs, the residual and the norms
 *          are stored in IPARAM_RES / IPARAM_ANORM / IPARAM_BNORM /
 *          IPARAM_XNORM.
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros -- TODO confirm.
 *
 * Returns 0.
 */
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_)
{
    PASTE_CODE_IPARAM_LOCALS( iparam );

    /* The solution check solves a square system, so it is disabled otherwise */
    if ( M != N && check ) {
        fprintf(stderr, "Check cannot be perfomed with M != N\n");
        check = 0;
    }

    /* Allocate Data */
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDA, M, N );

    PLASMA_zplrnt_Tile(descA, 3456);

    /* Add max(M,N) to the diagonal entries of every diagonal tile */
    {
        PLASMA_Complex64_t *Amat;
        int m, i, ldam;
        for(m=0; m<MT; m++) {
            ldam = BLKLDD( *descA, m );
            Amat = (PLASMA_Complex64_t*)plasma_getaddr(*descA, m, m);
            for(i=0; i<ldam; i++) {
                Amat[i*ldam+i] += max(M,N);
            }
        }
    }

    /* Save AT in lapack layout for check */
    PASTE_TILE_TO_LAPACK( descA, A, check, PLASMA_Complex64_t, LDA, N );

    START_TIMING();
    PLASMA_zgetrf_nopiv_Tile( descA );
    STOP_TIMING();

    /* Check the solution */
    if ( check )
    {
        PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDB, N, NRHS );
        PLASMA_zplrnt_Tile( descB, 7732 );
        PASTE_TILE_TO_LAPACK( descB, b, check, PLASMA_Complex64_t, LDB, NRHS );

        /* Two triangular solves with the factored descA: unit lower, then upper */
        PLASMA_ztrsm_Tile( PlasmaLeft, PlasmaLower, PlasmaNoTrans, PlasmaUnit,
                           1.0, descA, descB );
        PLASMA_ztrsm_Tile( PlasmaLeft, PlasmaUpper, PlasmaNoTrans, PlasmaNonUnit,
                           1.0, descA, descB );

        PASTE_TILE_TO_LAPACK( descB, x, check, PLASMA_Complex64_t, LDB, NRHS );
        dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, A, LDA, b, x, LDB,
                                              &(dparam[IPARAM_ANORM]),
                                              &(dparam[IPARAM_BNORM]),
                                              &(dparam[IPARAM_XNORM]));

        /* Release the right-hand-side tile matrix as well; it was previously
         * leaked on every checked run. */
        PASTE_CODE_FREE_MATRIX( descB );
        free(A); free(b); free(x);
    }

    PASTE_CODE_FREE_MATRIX( descA );

    return 0;
}
/*
 * Timing driver for PLASMA_zgeqrf_Tile on an M x N tile matrix.
 *
 * iparam : integer run parameters, unpacked by PASTE_CODE_IPARAM_LOCALS
 *          (M, N, NRHS, LDA, LDB and check used below come from that macro).
 * dparam : output slots; when the check runs, the residual and the norms are
 *          stored in IPARAM_RES / IPARAM_ANORM / IPARAM_BNORM / IPARAM_XNORM.
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros -- TODO confirm.
 *
 * The solution check only runs when check is set and M == N.
 * Returns 0.
 */
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    PLASMA_desc *descT;
    PASTE_CODE_IPARAM_LOCALS( iparam );

    /* Allocate Data */
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDA, M, N );
    PLASMA_zplrnt_Tile( descA, 5373 );

    /* Save A for check */
    PASTE_TILE_TO_LAPACK( descA, A, ( check && M == N ), PLASMA_Complex64_t, LDA, N );
    
    /* Allocate B for check */
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, (check && M == N), PLASMA_Complex64_t, PlasmaComplexDouble, LDB, M, NRHS );
     
    /* Allocate Workspace */
    PLASMA_Alloc_Workspace_zgels_Tile(M, N, &descT);

    /* Do the computations */
    /* Only the factorization below is inside the timed region */
    START_TIMING();
    PLASMA_zgeqrf_Tile( descA, descT );
    STOP_TIMING();
    
    /* Check the solution */
    if ( check && M == N )
    {
        /* Initialize and save B */
        PLASMA_zplrnt_Tile( descB, 2264 );
        PASTE_TILE_TO_LAPACK( descB, B, 1, PLASMA_Complex64_t, LDB, NRHS );

        /* Compute the solution */
        PLASMA_zgeqrs_Tile( descA, descT, descB );

        /* Copy solution to X */
        PASTE_TILE_TO_LAPACK( descB, X, 1, PLASMA_Complex64_t, LDB, NRHS );

        dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, A, LDA, B, X, LDB,
                                              &(dparam[IPARAM_ANORM]), 
                                              &(dparam[IPARAM_BNORM]), 
                                              &(dparam[IPARAM_XNORM]));

        /* Free checking structures */
        PASTE_CODE_FREE_MATRIX( descB );
        free( A ); 
        free( B ); 
        free( X );
    }

    /* Free data */
    PLASMA_Dealloc_Handle_Tile(&descT);
    PASTE_CODE_FREE_MATRIX( descA );

    return 0;
}
	/**
	 * Advances the world by one time step.
	 *
	 * Runs, in this order, the physics world (when one is attached), entities,
	 * bodies, particles, lights and sound entities. Each stage is bracketed by
	 * START_TIMING / STOP_TIMING under its own label.
	 *
	 * \param afTimeStep step forwarded unchanged to every stage.
	 */
	void cWorld3D::Update(float afTimeStep)
	{
		// Physics is optional: skipped when no physics world is set.
		START_TIMING(Physics);
		if(mpPhysicsWorld)
		{
			mpPhysicsWorld->Update(afTimeStep);
		}
		STOP_TIMING(Physics);

		// Entities
		START_TIMING(Entities);
		UpdateEntities(afTimeStep);
		STOP_TIMING(Entities);

		// Bodies
		START_TIMING(Bodies);
		UpdateBodies(afTimeStep);
		STOP_TIMING(Bodies);

		// Particles
		START_TIMING(Particles);
		UpdateParticles(afTimeStep);
		STOP_TIMING(Particles);

		// Lights
		START_TIMING(Lights);
		UpdateLights(afTimeStep);
		STOP_TIMING(Lights);

		// Sound entities
		START_TIMING(SoundEntities);
		UpdateSoundEntities(afTimeStep);
		STOP_TIMING(SoundEntities);
	}
Exemple #5
0
/*
 * Timing driver for PLASMA_zgeqrf on an M x N matrix held in a plain
 * (non-tile) array.
 *
 * iparam : integer run parameters, unpacked by PASTE_CODE_IPARAM_LOCALS
 *          (M, N, NRHS, LDA, LDB and check used below come from that macro).
 * dparam : output slots; when the check runs, the residual and the norms are
 *          stored in IPARAM_RES / IPARAM_ANORM / IPARAM_BNORM / IPARAM_XNORM.
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros -- TODO confirm.
 *
 * Returns 0.
 */
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    PLASMA_desc *T;
    PASTE_CODE_IPARAM_LOCALS( iparam );

    /* The check is switched off for non-square problems */
    if ( M != N && check ) {
        fprintf(stderr, "Check cannot be perfomed with M != N\n");
        check = 0;
    }

    /* Allocate Data */
    PASTE_CODE_ALLOCATE_MATRIX( A, 1, PLASMA_Complex64_t, LDA, N );

    /* Initialize Data */
    PLASMA_zplrnt(M, N, A, LDA, 3456);

    /* Allocate Workspace */
    PLASMA_Alloc_Workspace_zgels(M, N, &T);

    /* Save AT in lapack layout for check */
    PASTE_CODE_ALLOCATE_COPY( Acpy, check, PLASMA_Complex64_t, A, LDA, N );

    /* Only the factorization below is inside the timed region */
    START_TIMING();
    PLASMA_zgeqrf( M, N, A, LDA, T );
    STOP_TIMING();
    
    /* Check the solution */
    if ( check )
    {
        /* X starts as the right-hand side and is overwritten by the solve;
         * B keeps a copy of the original right-hand side. */
        PASTE_CODE_ALLOCATE_MATRIX( X, 1, PLASMA_Complex64_t, LDB, NRHS );
        PLASMA_zplrnt( N, NRHS, X, LDB, 5673 );
        PASTE_CODE_ALLOCATE_COPY( B, 1, PLASMA_Complex64_t, X, LDB, NRHS );
        
        PLASMA_zgeqrs(M, N, NRHS, A, LDA, T, X, LDB);

        dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, Acpy, LDA, B, X, LDB,
                                              &(dparam[IPARAM_ANORM]), 
                                              &(dparam[IPARAM_BNORM]), 
                                              &(dparam[IPARAM_XNORM]));

        free( Acpy ); 
        free( B ); 
        free( X );
      }

    /* Free Workspace */
    PLASMA_Dealloc_Handle_Tile( &T );
    free( A );

    return 0;
}
/*
 * Timing driver for PLASMA_zgemm_Tile: C <- alpha*A*B + beta*C with
 * A (M x K), B (K x N) and C (M x N), none of them transposed.
 *
 * iparam : integer run parameters, unpacked by PASTE_CODE_IPARAM_LOCALS
 *          (M, N, K, LDA, LDB, LDC, ISEED and check used below come from that
 *          macro); IPARAM_LDB and IPARAM_LDC are additionally read directly.
 * dparam : output slots; when the check runs, the residual and the norms are
 *          stored in IPARAM_RES / IPARAM_ANORM / IPARAM_BNORM / IPARAM_XNORM.
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros -- TODO confirm.
 *
 * Returns 0.
 */
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    PLASMA_Complex64_t alpha, beta;
    PASTE_CODE_IPARAM_LOCALS( iparam );
    
    /* Make the leading dimensions at least as large as the row counts of B and C */
    LDB = max(K, iparam[IPARAM_LDB]);
    LDC = max(M, iparam[IPARAM_LDC]);

    /* Allocate Data */
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDA, M, K );
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDB, K, N );
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descC, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDC, M, N );

    /* Initialiaze Data */
    PLASMA_zplrnt_Tile( descA, 5373 );
    PLASMA_zplrnt_Tile( descB, 7672 );
    PLASMA_zplrnt_Tile( descC, 6387 );
    
    /* Random scalars alpha and beta */
    LAPACKE_zlarnv_work(1, ISEED, 1, &alpha);
    LAPACKE_zlarnv_work(1, ISEED, 1, &beta);
    
    /* Save C for check */
    PASTE_TILE_TO_LAPACK( descC, C2, check, PLASMA_Complex64_t, LDC, N );

    /* Only the multiplication below is inside the timed region */
    START_TIMING();
    PLASMA_zgemm_Tile( PlasmaNoTrans, PlasmaNoTrans, alpha, descA, descB, beta, descC );
    STOP_TIMING();
    
    /* Check the solution */
    if (check)
    {
        /* C is the computed result, C2 the copy of C taken before the call */
        PASTE_TILE_TO_LAPACK( descA, A, check, PLASMA_Complex64_t, LDA, K );
        PASTE_TILE_TO_LAPACK( descB, B, check, PLASMA_Complex64_t, LDB, N );
        PASTE_TILE_TO_LAPACK( descC, C, check, PLASMA_Complex64_t, LDC, N );

        dparam[IPARAM_RES] = z_check_gemm( PlasmaNoTrans, PlasmaNoTrans, M, N, K,
                                           alpha, A, LDA, B, LDB, beta, C, C2, LDC,
                                           &(dparam[IPARAM_ANORM]), 
                                           &(dparam[IPARAM_BNORM]), 
                                           &(dparam[IPARAM_XNORM]));
        free(A); free(B); free(C); free(C2);
    }

    PASTE_CODE_FREE_MATRIX( descA );
    PASTE_CODE_FREE_MATRIX( descB );
    PASTE_CODE_FREE_MATRIX( descC );
    return 0;
}
/*
 * Timing driver for PLASMA_zgesv_incpiv on an N x N system with NRHS
 * right-hand sides, using plain (non-tile) arrays.
 *
 * iparam : integer run parameters, unpacked by PASTE_CODE_IPARAM_LOCALS
 *          (M, N, NRHS, LDA, LDB and check used below come from that macro).
 * dparam : output slots; when the check runs, the residual and the norms are
 *          stored in IPARAM_RES / IPARAM_ANORM / IPARAM_BNORM / IPARAM_XNORM.
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros -- TODO confirm.
 *
 * Returns -1 (before allocating anything) when M != N, 0 otherwise.
 */
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    PLASMA_desc *L;
    int *piv;
    PASTE_CODE_IPARAM_LOCALS( iparam );
    
    if ( M != N ) {
        fprintf(stderr, "This timing works only with M == N\n");
        return -1;
    }
    
    /* Allocate Data */
    PASTE_CODE_ALLOCATE_MATRIX( A, 1, PLASMA_Complex64_t, LDA, N    );
    PASTE_CODE_ALLOCATE_MATRIX( X, 1, PLASMA_Complex64_t, LDB, NRHS );
    
    /* Initialiaze Data */
    PLASMA_zplrnt( N, N,    A, LDA,   51 );
    PLASMA_zplrnt( N, NRHS, X, LDB, 5673 );

    /* Workspace L and pivot array piv for the solver */
    PLASMA_Alloc_Workspace_zgesv_incpiv(N, &L, &piv);

    /* Save A and b  */
    PASTE_CODE_ALLOCATE_COPY( Acpy, check, PLASMA_Complex64_t, A, LDA, N    );
    PASTE_CODE_ALLOCATE_COPY( B,    check, PLASMA_Complex64_t, X, LDB, NRHS );

    /* Only the solve below is inside the timed region; X is overwritten */
    START_TIMING();
    PLASMA_zgesv_incpiv( N, NRHS, A, LDA, L, piv, X, LDB );
    STOP_TIMING();
    
    /* Check the solution */
    if (check)
    {
        dparam[IPARAM_RES] = z_check_solution(N, N, NRHS, Acpy, LDA, B, X, LDB,
                                              &(dparam[IPARAM_ANORM]), 
                                              &(dparam[IPARAM_BNORM]), 
                                              &(dparam[IPARAM_XNORM]));
        free(Acpy); free(B);
    }

    PLASMA_Dealloc_Handle_Tile( &L );
    free( piv );
    free( X );
    free( A );


    return 0;
}
/*
 * Timing driver for PLASMA_zgetrf_tntpiv_Tile on an M x N tile matrix.
 *
 * iparam : integer run parameters, unpacked by PASTE_CODE_IPARAM_LOCALS
 *          (M, N, NRHS, LDA, LDB and check used below come from that macro).
 * dparam : output slots; when the check runs, the residual and the norms are
 *          stored in IPARAM_RES / IPARAM_ANORM / IPARAM_BNORM / IPARAM_XNORM.
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros -- TODO confirm.
 *
 * Returns 0.
 */
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_)
{
    PASTE_CODE_IPARAM_LOCALS( iparam );

    /* The check is switched off for non-square problems */
    if ( M != N && check ) {
        fprintf(stderr, "Check cannot be perfomed with M != N\n");
        check = 0;
    }

    /* Allocate Data */
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDA, M, N );
    /* Pivot array with min(M, N) entries */
    PASTE_CODE_ALLOCATE_MATRIX( piv, 1, int, min(M, N), 1 );

    PLASMA_zplrnt_Tile(descA, 3456);

    /* Save AT in lapack layout for check */
    PASTE_TILE_TO_LAPACK( descA, A, check, PLASMA_Complex64_t, LDA, N );

    /* Only the factorization below is inside the timed region */
    START_TIMING();
    PLASMA_zgetrf_tntpiv_Tile( descA, piv );
    STOP_TIMING();

    /* Check the solution */
    if ( check )
    {
        PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDB, N, NRHS );
        PLASMA_zplrnt_Tile( descB, 7732 );
        /* b: right-hand side before the solve */
        PASTE_TILE_TO_LAPACK( descB, b, check, PLASMA_Complex64_t, LDB, NRHS );

        PLASMA_zgetrs_Tile( PlasmaNoTrans, descA, piv, descB );

        /* x: content of descB after the solve */
        PASTE_TILE_TO_LAPACK( descB, x, check, PLASMA_Complex64_t, LDB, NRHS );
        dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, A, LDA, b, x, LDB,
                                              &(dparam[IPARAM_ANORM]),
                                              &(dparam[IPARAM_BNORM]),
                                              &(dparam[IPARAM_XNORM]));

        PASTE_CODE_FREE_MATRIX( descB );
        free(A); free(b); free(x);
    }

    PASTE_CODE_FREE_MATRIX( descA );
    free( piv );

    return 0;
}
Exemple #9
0
/*
 * Timing driver for plasma_pdgeqrf_quark on an N x N real double matrix
 * stored in NB x NB tiles, launched from inside an OpenMP parallel/single
 * region.
 *
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros (as is the local t) -- TODO confirm.
 * params : supplies matrix_size (N), iblocksize (IB), blocksize (NB) and the
 *          check flag.
 *
 * Returns the value of d_check_solution when params->check is set, 0 otherwise.
 */
static double
RunTest(real_Double_t *t_, struct user_parameters* params)
{
    double t;
    PLASMA_desc *descT;
    int64_t N     = params->matrix_size;
    int64_t IB    = params->iblocksize;
    int64_t NB    = params->blocksize;
    int check     = params->check;
    double check_res = 0;

    /* Allocate Data */
    PLASMA_desc *descA = NULL;
    double *ptr = (double*)malloc(N * N * sizeof(double));
    PLASMA_Desc_Create(&descA, ptr, PlasmaRealDouble, NB, NB, NB*NB, N, N, 0, 0, N, N);

    /* Fill A; the call is issued by a single thread of the parallel region */
#pragma omp parallel
    {
#pragma omp single
        {
    plasma_pdpltmg_quark(*descA, 5373 );
        }
    }

    /* Save A for check */
    double *A = NULL;
    if ( check ) {
        A = (double*)malloc(N * N * sizeof(double));
        plasma_pdtile_to_lapack_quark(*descA, (void*)A, N);
    }

    /* Allocate Workspace */
    plasma_alloc_ibnb_tile(N, N, PlasmaRealDouble, &descT, IB, NB);

    /* Do the computations */
    START_TIMING();
#pragma omp parallel
    {
#pragma omp single
        {
    plasma_pdgeqrf_quark( *descA, *descT , IB);
        }
    }
    STOP_TIMING();

    /* Check the solution */
    if ( check )
    {
        /* Allocate B for check */
        PLASMA_desc *descB = NULL;
        /* NOTE: this ptr shadows the outer ptr that backs descA */
        double* ptr = (double*)malloc(N * sizeof(double));
        PLASMA_Desc_Create(&descB, ptr, PlasmaRealDouble, NB, NB, NB*NB, N, 1, 0, 0, N, 1);

        /* Initialize and save B */
        plasma_pdpltmg_seq(*descB, 2264 );
        double *B = (double*)malloc(N * sizeof(double));
        plasma_pdtile_to_lapack_quark(*descB, (void*)B, N);

        /* Compute the solution */
        PLASMA_dgeqrs_Tile( descA, descT, descB , IB);

        /* Copy solution to X */
        double *X = (double*)malloc(N * sizeof(double));
        plasma_pdtile_to_lapack_quark(*descB, (void*)X, N);

        check_res = d_check_solution(N, N, 1, A, N, B, X, N);

        /* Free checking structures */
        PASTE_CODE_FREE_MATRIX( descB );
        free( A );
        free( B );
        free( X );
    }

    /* Free data */
    PLASMA_Dealloc_Handle_Tile(&descT);
    PASTE_CODE_FREE_MATRIX( descA );

    return check_res;
}
Exemple #10
0
/*
 * Timing driver for plasma_pdpotrf_quark (upper variant) on an N x N real
 * double matrix stored in NB x NB tiles, launched from inside an OpenMP
 * parallel/single region.
 *
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros (as is the local t) -- TODO confirm.
 * params : supplies matrix_size (N), blocksize (NB) and the check flag.
 *
 * Returns the value of d_check_solution when params->check is set, 0 otherwise.
 */
static double
RunTest(real_Double_t *t_, struct user_parameters* params)
{
    double  t;
    int64_t N     = params->matrix_size;
    int64_t NB    = params->blocksize;
    int check     = params->check;
    int uplo = PlasmaUpper;
    double check_res = 0;

    /* Allocate Data */
    PLASMA_desc *descA = NULL;
    /* Explicit cast, as in the other allocations of this function; it also
     * keeps the line valid when the file is compiled as C++. */
    double* ptr = (double*)malloc(N * N * sizeof(double));
    PLASMA_Desc_Create(&descA, ptr, PlasmaRealDouble, NB, NB, NB*NB, N, N, 0, 0, N, N);

    /* Fill A; the call is issued by a single thread of the parallel region */
#pragma omp parallel
    {
#pragma omp single
        {
            plasma_pdplgsy_quark( (double)N, *descA, 51 );
        }
    }

    /* Save A for check */
    double *A = NULL;
    if(check) {
        A = (double*)malloc(N * N * sizeof(double));
        plasma_pdtile_to_lapack_quark(*descA, (void*)A, N);
    }

    /* Timed region: the factorization only */
    START_TIMING();
#pragma omp parallel
    {
#pragma omp single
        {
            plasma_pdpotrf_quark(uplo, *descA);
        }
    }
    STOP_TIMING();

    /* Check the solution */
    if ( check )
    {
        /* Single right-hand-side column of length N */
        PLASMA_desc *descB = NULL;
        /* NOTE: this ptr shadows the outer ptr that backs descA */
        double* ptr = (double*)malloc(N * sizeof(double));
        PLASMA_Desc_Create(&descB, ptr, PlasmaRealDouble, NB, NB, NB*NB, N, 1, 0, 0, N, 1);

        /* Initialize and save B */
        plasma_pdpltmg_seq(* descB, 7672 );
        double* B = (double*)malloc(N * sizeof(double));
        plasma_pdtile_to_lapack_quark(*descB, (void*)B, N);

        /* Solve using the factored descA; descB is overwritten */
        PLASMA_dpotrs_Tile( uplo, descA, descB );

        /* Copy solution to X */
        double* X = (double*)malloc(N * sizeof(double));
        plasma_pdtile_to_lapack_quark(*descB, (void*)X, N);

        check_res = d_check_solution(N, N, 1, A, N, B, X, N);

        /* Free checking structures */
        PASTE_CODE_FREE_MATRIX( descB );
        free( A );
        free( B );
        free( X );
    }

    PASTE_CODE_FREE_MATRIX( descA );

    return check_res;
}
/*
 * Timing driver for PLASMA_zgesvd_Tile on an M x N tile matrix. Both jobu and
 * jobvt are fixed to PlasmaNoVec, so U and VT are not allocated (their
 * allocation condition is false) and no check is implemented: both
 * `if ( check )` bodies are empty.
 *
 * iparam : integer run parameters, unpacked by PASTE_CODE_IPARAM_LOCALS
 *          (M, N, LDA and check used below come from that macro).
 * dparam : not written by this function.
 * t_     : not referenced directly here; presumably consumed by the
 *          START_TIMING / STOP_TIMING macros -- TODO confirm.
 *
 * Returns 0, also when PLASMA_zgesvd_Tile reports a non-zero INFO (that case
 * is only printed).
 */
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    PASTE_CODE_IPARAM_LOCALS( iparam );
    PLASMA_desc *descT;
    int jobu  = PlasmaNoVec;
    int jobvt = PlasmaNoVec;
    int INFO;

    /* Allocate Data */
    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, PLASMA_Complex64_t, PlasmaComplexDouble, LDA, M, N );
    PASTE_CODE_ALLOCATE_MATRIX( VT, (jobvt == PlasmaVec), PLASMA_Complex64_t, N, N );
    PASTE_CODE_ALLOCATE_MATRIX( U, (jobu == PlasmaVec), PLASMA_Complex64_t, M, M );
    PASTE_CODE_ALLOCATE_MATRIX( S, 1, double, N, 1 );

    /* Initialiaze Data */
    PLASMA_zplrnt_Tile(descA, 51 );

    /* Save AT and bT in lapack layout for check */
    if ( check ) {
    }

    /* Allocate Workspace */
    /* NOTE(review): sized with (N, N) although descA is M x N -- confirm this
     * is intended for M != N. */
    PLASMA_Alloc_Workspace_zgesvd(N, N, &descT);

    /* U and VT are set up via LAPACKE_zlaset_work (0. off-diagonal, 1. diagonal)
     * only when vectors are requested */
    if ( jobu == PlasmaVec ) {
        LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', M, M, 0., 1., U, M);
    }
    if ( jobvt == PlasmaVec ) {
        LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', N, N, 0., 1., VT, N);
    }


    START_TIMING(); 
    INFO = PLASMA_zgesvd_Tile(jobu, jobvt, descA, S, descT, U, M, VT, N);
    STOP_TIMING();

    if(INFO!=0){
            printf(" ERROR OCCURED INFO %d\n",INFO);
    }


    /* Check the solution */
    if ( check )
      {
      }

    /* DeAllocate Workspace */
    PLASMA_Dealloc_Handle_Tile(&descT);

    if (jobu == PlasmaVec) {
      free( U );
    }
    if (jobvt == PlasmaVec) {
      free( VT );
    }
    PASTE_CODE_FREE_MATRIX( descA );
    free( S );

    return 0;
}
Exemple #12
0
  //take also the TA
  //
  //Compute the gluonic force F for the lx configuration conf: calls
  //compute_gluonic_force_lx_conf_do_not_finish followed by
  //gluonic_force_finish_computation, timing the whole under gluon_force_time.
  //
  //When DEBUG is defined the force on link conf[0][0] is also estimated
  //numerically from finite differences of the action and printed next to the
  //analytic result F[0][0] for comparison.
  //
  //F       : output force
  //conf    : gauge configuration (conf[0][0] is temporarily modified and then
  //          restored in the DEBUG branch)
  //physics : provides gauge_action_name and beta, which are passed to
  //          gluonic_action in the DEBUG branch
  THREADABLE_FUNCTION_3ARG(compute_gluonic_force_lx_conf, quad_su3*,F, quad_su3*,conf, theory_pars_t*,physics)
  {
    GET_THREAD_ID();
    
    START_TIMING(gluon_force_time,ngluon_force);
    
#ifdef DEBUG
    vector_reset(F);
    //finite-difference step
    double eps=1e-5;
    
    //store initial link and compute action
    su3 sto;
    su3_copy(sto,conf[0][0]);
    double act_ori;
    gluonic_action(&act_ori,conf,physics->gauge_action_name,physics->beta);
    
    //store derivative
    su3 nu_plus,nu_minus;
    su3_put_to_zero(nu_plus);
    su3_put_to_zero(nu_minus);
    
    //loop over the NCOL*NCOL-1 matrices of gell_mann_matr
    for(int igen=0;igen<NCOL*NCOL-1;igen++)
      {
	//prepare increment and change
	//NOTE(review): the matrix is scaled by eps/2 here, while the differences
	//below are divided by eps - confirm this matches the intended normalization
	su3 ba;
	su3_prod_double(ba,gell_mann_matr[igen],eps/2);
	
	su3 exp_mod;
	safe_hermitian_exact_i_exponentiate(exp_mod,ba);
	
	//change -, compute action
	unsafe_su3_dag_prod_su3(conf[0][0],exp_mod,sto);
	double act_minus;
	gluonic_action(&act_minus,conf,physics->gauge_action_name,physics->beta);
	
	//change +, compute action
	unsafe_su3_prod_su3(conf[0][0],exp_mod,sto);
	double act_plus;
	gluonic_action(&act_plus,conf,physics->gauge_action_name,physics->beta);
	
	//set back everything
	su3_copy(conf[0][0],sto);
	
	//printf("plus: %+016.016le, ori: %+016.016le, minus: %+016.016le, eps: %lg\n",act_plus,act_ori,act_minus,eps);
	//forward and backward difference quotients (with a minus sign), accumulated
	//along the current matrix
	double gr_plus=-(act_plus-act_ori)/eps;
	double gr_minus=-(act_ori-act_minus)/eps;
	su3_summ_the_prod_idouble(nu_plus,gell_mann_matr[igen],gr_plus);
	su3_summ_the_prod_idouble(nu_minus,gell_mann_matr[igen],gr_minus);
      }
    
    //take the average
    su3 nu;
    su3_summ(nu,nu_plus,nu_minus);
    su3_prodassign_double(nu,0.5);
    
    vector_reset(F);
#endif
    
    compute_gluonic_force_lx_conf_do_not_finish(F,conf,physics);
    
    //finish the calculation
    gluonic_force_finish_computation(F,conf);
    
#ifdef DEBUG
    //print the analytic force ("an") and the numerical estimates for comparison
    master_printf("checking pure gauge force\n");
    master_printf("an\n");
    su3_print(F[0][0]);
    master_printf("nu\n");
    su3_print(nu);
    master_printf("nu_plus\n");
    su3_print(nu_plus);
    master_printf("nu_minus\n");
    su3_print(nu_minus);
    //crash("anna");
#endif
    
    //print the intensity of the force
    if(VERBOSITY_LV2)
      {
	double norm=0;
	norm+=double_vector_glb_norm2(F,loc_vol);
	master_printf("  Gluonic force average norm: %lg\n",sqrt(norm/glb_vol));
      }
    
    STOP_TIMING(gluon_force_time);
  }
Exemple #13
0
/////////////////////////////////////////////////////////
// snapMess
//
// Reads the rectangle (m_x, m_y, m_width, m_height) with glReadPixels into
// m_originalImage, (re)allocating the image when its size changed.
// When pixel buffer objects are enabled (m_numPbo>0) and supported, the read
// goes into one PBO while the image is filled from the next PBO in the ring;
// otherwise the pixels are read directly into the image after glFinish().
// Finally flags the cache (if any) so that the image gets resent.
//
// Returns early, without snapping, when the object is still in INIT state,
// when neither GLEW_VERSION_1_1 nor GLEW_EXT_texture_object is available,
// or when the requested size is not positive.
//
/////////////////////////////////////////////////////////
void pix_snap :: snapMess(void)
{
  if(getState()==INIT) {
    verbose(0, "not initialized yet with a valid context");
    return;
  }
  if(!GLEW_VERSION_1_1 && !GLEW_EXT_texture_object) {
    return;
  }

  // drop a cache pointer whose magic number does not match
  if (m_cache&&m_cache->m_magic!=GEMCACHE_MAGIC) {
    m_cache=NULL;
  }

  if (m_width <= 0 || m_height <= 0) {
    error("Illegal size");
    return;
  }
  // do we need to remake the data?
  bool makeNew = false;
  bool makePbo = false;

  // release previous data
  if (m_originalImage)  {
    if (m_originalImage->xsize != m_width ||
        m_originalImage->ysize != m_height) {
      m_originalImage->clear();
      delete m_originalImage;
      m_originalImage = NULL;
      makeNew = true;
    }
  }       else {
    makeNew = true;
  }
  if (makeNew) {
    m_originalImage = new imageStruct;
    m_originalImage->xsize = m_width;
    m_originalImage->ysize = m_height;
    m_originalImage->setCsizeByFormat(GL_RGBA_GEM);
    // FIXXXME: upsidedown should default be 'true'
    m_originalImage->upsidedown = false;

    m_originalImage->allocate(m_originalImage->xsize * m_originalImage->ysize *
                              m_originalImage->csize);

    // a new image size also requires PBOs of the new size
    makePbo=true;
  }


  // PBOs are (re)built when they are requested but missing,
  // and never when they are disabled
  if(m_numPbo>0 && !m_pbo) {
    makePbo=true;
  } else if(m_numPbo<=0) {
    makePbo=false;
  }

  /* FIXXME */
  if(makePbo) {
    // NOTE(review): only the array of buffer names is freed here; no
    // glDeleteBuffersARB call is visible in this function - confirm the GL
    // buffers are released elsewhere
    if(m_pbo) {
      delete[]m_pbo;
      m_pbo=NULL;
    }
    if(GLEW_ARB_pixel_buffer_object) {
      // create m_numPbo pack buffers, each sized for one full image
      m_pbo=new GLuint[m_numPbo];
      glGenBuffersARB(m_numPbo, m_pbo);
      int i=0;
      for(i=0; i<m_numPbo; i++) {
        glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, m_pbo[i]);
        glBufferDataARB(GL_PIXEL_PACK_BUFFER_ARB,
                        m_originalImage->xsize*m_originalImage->ysize*m_originalImage->csize,
                        0, GL_STREAM_READ_ARB);
      }
      glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0);
    } else {
      verbose(1, "PBOs not supported! disabling");
      m_numPbo=0;
    }
  }

  if(m_pbo) {
    START_TIMING();
    // advance the ring: read into 'index', fetch the data from 'nextIndex'
    // NOTE(review): assumes m_numPbo>0 whenever m_pbo is set (modulo below) -
    // verify against the code that changes m_numPbo
    m_curPbo=(m_curPbo+1)%m_numPbo;
    int index=m_curPbo;
    int nextIndex=(m_curPbo+1)%m_numPbo;

    glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, m_pbo[index]);

    // with a pack buffer bound, the last argument (0) is an offset into it
    glReadPixels(m_x, m_y, m_width, m_height,
                 m_originalImage->format, m_originalImage->type, 0);


    glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, m_pbo[nextIndex]);
    GLubyte* src = (GLubyte*)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB,
                                            GL_READ_ONLY_ARB);
    // the image is left untouched when the buffer could not be mapped
    if(src) {
      m_originalImage->fromRGBA(src);
      glUnmapBufferARB(
        GL_PIXEL_PACK_BUFFER_ARB);     // release pointer to the mapped buffer
    }
    glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0);
    STOP_TIMING(m_numPbo);
  } else {
    START_TIMING();
    // no PBOs: read straight into the image data
    glFinish();
    glPixelStorei(GL_PACK_ALIGNMENT, 4);
    glPixelStorei(GL_PACK_ROW_LENGTH, 0);
    glPixelStorei(GL_PACK_SKIP_ROWS, 0);
    glPixelStorei(GL_PACK_SKIP_PIXELS, 0);

    glReadPixels(m_x, m_y, m_width, m_height,
                 m_originalImage->format, m_originalImage->type, m_originalImage->data);
    STOP_TIMING(-1);
  }

  if (m_cache) {
    m_cache->resendImage = 1;
  }
}
/**
 * Mean-shift segmentation of an image.
 *
 * Stages:
 *   1. Filtering: every pixel is replaced by the mean colour of the pixels in
 *      its spatial window (spatial_radius) that lie within color_radius of the
 *      current estimate, iterated until the shift is small or 100 iterations.
 *   2. Connect: flood fill (8-neighbourhood) groups pixels whose colour is
 *      close to the seed pixel of the region.
 *   3. TransitiveClosure: up to 5 passes merging adjacent regions whose mean
 *      colours are closer than color_radius.
 *   4. Prune: regions with fewer than minRegion (50) pixels are merged into
 *      their closest neighbour until none is left.
 *
 * Side effects: writes the filtered image to "filtered.png" and prints
 * progress and the elapsed time to std::cout.
 *
 * @param img    input image; the pixel data is accessed as three 8-bit
 *               channels per pixel and converted with CV_RGB2Lab.
 * @param labels caller-allocated array indexed as labels[row][col] with
 *               img->height rows of img->width entries; on return it holds
 *               the region label of every pixel, in [0, return value).
 * @return the number of regions.
 *
 * NOTE(review): spatial_radius, color_radius, color_distance and RAList are
 * defined outside this function.
 * NOTE(review): the filtering window stays centred on (i, j) rather than on
 * the shifted position (ic, jc) - left as is, confirm it is intended.
 */
int MeanShift(const IplImage* img, int **labels)
{
	DECLARE_TIMING(timer);
	START_TIMING(timer);

	double color_radius2 = color_radius*color_radius;
	int minRegion = 50;

	// use Lab rather than L*u*v!
	// since Luv may produce noise points
	IplImage *result = cvCreateImage(cvGetSize(img), img->depth, img->nChannels);
	cvCvtColor(img, result, CV_RGB2Lab);

	// Step One. Filtering stage of meanshift segmentation
	// http://rsbweb.nih.gov/ij/plugins/download/Mean_Shift.java
	for (int i = 0; i<img->height; i++)
	for (int j = 0; j<img->width; j++)
	{
		int ic = i;
		int jc = j;
		int icOld, jcOld;
		float LOld, UOld, VOld;
		float L = (float)((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 0];
		float U = (float)((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 1];
		float V = (float)((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 2];
		// in the case of 8-bit and 16-bit images R, G and B are converted to floating-point format and scaled to fit 0 to 1 range
		// http://opencv.willowgarage.com/documentation/c/miscellaneous_image_transformations.html
		L = L * 100 / 255;
		U = U - 128;
		V = V - 128;
		// start above the threshold so that at least one iteration runs
		double shift = 5;
		for (int iters = 0; shift > 3 && iters < 100; iters++)
		{
			icOld = ic;
			jcOld = jc;
			LOld = L;
			UOld = U;
			VOld = V;

			// accumulators for position and colour of the accepted pixels
			float mi = 0;
			float mj = 0;
			float mL = 0;
			float mU = 0;
			float mV = 0;
			int num = 0;

			// spatial window, clipped to the image
			int i2from = max(0, i - spatial_radius), i2to = min(img->height, i + spatial_radius + 1);
			int j2from = max(0, j - spatial_radius), j2to = min(img->width, j + spatial_radius + 1);
			for (int i2 = i2from; i2 < i2to; i2++) {
				for (int j2 = j2from; j2 < j2to; j2++) {
					float L2 = (float)((uchar *)(result->imageData + i2*img->widthStep))[j2*result->nChannels + 0],
						U2 = (float)((uchar *)(result->imageData + i2*img->widthStep))[j2*result->nChannels + 1],
						V2 = (float)((uchar *)(result->imageData + i2*img->widthStep))[j2*result->nChannels + 2];
					L2 = L2 * 100 / 255;
					U2 = U2 - 128;
					V2 = V2 - 128;

					// keep only the pixels within the colour radius of the current estimate
					double dL = L2 - L;
					double dU = U2 - U;
					double dV = V2 - V;
					if (dL*dL + dU*dU + dV*dV <= color_radius2) {
						mi += i2;
						mj += j2;
						mL += L2;
						mU += U2;
						mV += V2;
						num++;
					}
				}
			}
			// new estimate = mean of the accepted pixels
			float num_ = 1.f / num;
			L = mL*num_;
			U = mU*num_;
			V = mV*num_;
			ic = (int)(mi*num_ + 0.5);
			jc = (int)(mj*num_ + 0.5);
			int di = ic - icOld;
			int dj = jc - jcOld;
			double dL = L - LOld;
			double dU = U - UOld;
			double dV = V - VOld;

			// squared displacement in the joint position/colour space
			shift = di*di + dj*dj + dL*dL + dU*dU + dV*dV;
		}

		// write the converged colour back in the 8-bit encoding
		L = L * 255 / 100;
		U = U + 128;
		V = V + 128;
		((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 0] = (uchar)L;
		((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 1] = (uchar)U;
		((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 2] = (uchar)V;
	}

	// dump the filtered image for inspection
	IplImage *tobeshow = cvCreateImage(cvGetSize(img), img->depth, img->nChannels);
	cvCvtColor(result, tobeshow, CV_Lab2RGB);
	cvSaveImage("filtered.png", tobeshow);
	cvReleaseImage(&tobeshow);

	// Step Two. Cluster
	// Connect
	int regionCount = 0;
	// per-region pixel count and mean colour (3 floats per region), indexed by label
	int *modePointCounts = new int[img->height*img->width];
	memset(modePointCounts, 0, img->width*img->height*sizeof(int));
	float *mode = new float[img->height*img->width * 3];
	{
		int label = -1;
		for (int i = 0; i<img->height; i++)
		for (int j = 0; j<img->width; j++)
			labels[i][j] = -1;
		for (int i = 0; i<img->height; i++)
		for (int j = 0; j<img->width; j++)
		if (labels[i][j]<0)
		{
			// unlabelled pixel: it becomes the seed of a new region
			labels[i][j] = ++label;
			// The seed belongs to the region: count it, so that the colour sum
			// below (which includes the seed) is divided by the true number of
			// pixels and single-pixel regions do not divide by zero.
			modePointCounts[label]++;
			float L = (float)((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 0],
				U = (float)((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 1],
				V = (float)((uchar *)(result->imageData + i*img->widthStep))[j*result->nChannels + 2];
			mode[label * 3 + 0] = L * 100 / 255;
			mode[label * 3 + 1] = 354 * U / 255 - 134;
			mode[label * 3 + 2] = 256 * V / 255 - 140;
			// Fill
			// (points are stored as x = row, y = column)
			std::stack<CvPoint> neighStack;
			neighStack.push(cvPoint(i, j));
			const int dxdy[][2] = { { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 }, { 0, 1 }, { 1, -1 }, { 1, 0 }, { 1, 1 } };
			while (!neighStack.empty())
			{
				CvPoint p = neighStack.top();
				neighStack.pop();
				for (int k = 0; k<8; k++)
				{
					int i2 = p.x + dxdy[k][0], j2 = p.y + dxdy[k][1];
					// the colour is always compared against the seed pixel (i, j)
					if (i2 >= 0 && j2 >= 0 && i2<img->height && j2<img->width && labels[i2][j2]<0 && color_distance(result, i, j, i2, j2)<color_radius2)
					{
						labels[i2][j2] = label;
						neighStack.push(cvPoint(i2, j2));
						modePointCounts[label]++;
						L = (float)((uchar *)(result->imageData + i2*img->widthStep))[j2*result->nChannels + 0];
						U = (float)((uchar *)(result->imageData + i2*img->widthStep))[j2*result->nChannels + 1];
						V = (float)((uchar *)(result->imageData + i2*img->widthStep))[j2*result->nChannels + 2];
						mode[label * 3 + 0] += L * 100 / 255;
						mode[label * 3 + 1] += 354 * U / 255 - 134;
						mode[label * 3 + 2] += 256 * V / 255 - 140;
					}
				}
			}
			// turn the colour sums into means
			mode[label * 3 + 0] /= modePointCounts[label];
			mode[label * 3 + 1] /= modePointCounts[label];
			mode[label * 3 + 2] /= modePointCounts[label];
		}
		//current Region count
		regionCount = label + 1;
	}
	std::cout << "Mean Shift(Connect):" << regionCount << std::endl;
	int oldRegionCount = regionCount;

	// TransitiveClosure
	// at most 5 passes, stopping as soon as a pass merges nothing
	for (int counter = 0, deltaRegionCount = 1; counter<5 && deltaRegionCount>0; counter++)
	{
		// 1.Build RAM using classifiction structure
		// raList[i] heads the neighbour list of region i; raPool provides the list nodes
		RAList *raList = new RAList[regionCount], *raPool = new RAList[10 * regionCount];	//10 is hard coded!
		for (int i = 0; i < regionCount; i++)
		{
			raList[i].label = i;
			raList[i].next = NULL;
		}
		for (int i = 0; i < regionCount * 10 - 1; i++)
		{
			raPool[i].next = &raPool[i + 1];
		}
		raPool[10 * regionCount - 1].next = NULL;
		RAList	*raNode1, *raNode2, *oldRAFreeList, *freeRAList = raPool;
		for (int i = 0; i<img->height; i++)
		for (int j = 0; j<img->width; j++)
		{
			// label change between this pixel and the one above
			if (i>0 && labels[i][j] != labels[i - 1][j])
			{
				// Get 2 free node
				raNode1 = freeRAList;
				raNode2 = freeRAList->next;
				oldRAFreeList = freeRAList;
				freeRAList = freeRAList->next->next;
				// connect the two region
				raNode1->label = labels[i][j];
				raNode2->label = labels[i - 1][j];
				if (raList[labels[i][j]].Insert(raNode2))	//already exists!
					freeRAList = oldRAFreeList;
				else
					raList[labels[i - 1][j]].Insert(raNode1);
			}
			// label change between this pixel and the one to the left
			if (j>0 && labels[i][j] != labels[i][j - 1])
			{
				// Get 2 free node
				raNode1 = freeRAList;
				raNode2 = freeRAList->next;
				oldRAFreeList = freeRAList;
				freeRAList = freeRAList->next->next;
				// connect the two region
				raNode1->label = labels[i][j];
				raNode2->label = labels[i][j - 1];
				if (raList[labels[i][j]].Insert(raNode2))
					freeRAList = oldRAFreeList;
				else
					raList[labels[i][j - 1]].Insert(raNode1);
			}
		}

		// 2.Treat each region Ri as a disjoint set
		// raList[x].label doubles as the parent link of the disjoint-set forest
		for (int i = 0; i < regionCount; i++)
		{
			RAList	*neighbor = raList[i].next;
			while (neighbor)
			{
				if (color_distance(&mode[3 * i], &mode[3 * neighbor->label])<color_radius2)
				{
					// find both roots and attach the larger index to the smaller one
					int iCanEl = i, neighCanEl = neighbor->label;
					while (raList[iCanEl].label != iCanEl) iCanEl = raList[iCanEl].label;
					while (raList[neighCanEl].label != neighCanEl) neighCanEl = raList[neighCanEl].label;
					if (iCanEl<neighCanEl)
						raList[neighCanEl].label = iCanEl;
					else
					{
						//raList[raList[iCanEl].label].label = iCanEl;
						raList[iCanEl].label = neighCanEl;
					}
				}
				neighbor = neighbor->next;
			}
		}
		// 3. Union Find
		// make every region point directly at its root
		for (int i = 0; i < regionCount; i++)
		{
			int iCanEl = i;
			while (raList[iCanEl].label != iCanEl) iCanEl = raList[iCanEl].label;
			raList[i].label = iCanEl;
		}
		// 4. Traverse joint sets, relabeling image.
		int *modePointCounts_buffer = new int[regionCount];
		memset(modePointCounts_buffer, 0, regionCount*sizeof(int));
		float *mode_buffer = new float[regionCount * 3];
		int	*label_buffer = new int[regionCount];

		for (int i = 0; i<regionCount; i++)
		{
			label_buffer[i] = -1;
			mode_buffer[i * 3 + 0] = 0;
			mode_buffer[i * 3 + 1] = 0;
			mode_buffer[i * 3 + 2] = 0;
		}
		// accumulate pixel counts and count-weighted colours per root
		for (int i = 0; i<regionCount; i++)
		{
			int iCanEl = raList[i].label;
			modePointCounts_buffer[iCanEl] += modePointCounts[i];
			for (int k = 0; k<3; k++)
				mode_buffer[iCanEl * 3 + k] += mode[i * 3 + k] * modePointCounts[i];
		}
		// give the roots consecutive new labels and store their merged mode
		int	label = -1;
		for (int i = 0; i < regionCount; i++)
		{
			int iCanEl = raList[i].label;
			if (label_buffer[iCanEl] < 0)
			{
				label_buffer[iCanEl] = ++label;

				for (int k = 0; k < 3; k++)
					mode[label * 3 + k] = (mode_buffer[iCanEl * 3 + k]) / (modePointCounts_buffer[iCanEl]);

				modePointCounts[label] = modePointCounts_buffer[iCanEl];
			}
		}
		regionCount = label + 1;
		for (int i = 0; i < img->height; i++)
		for (int j = 0; j < img->width; j++)
			labels[i][j] = label_buffer[raList[labels[i][j]].label];

		delete[] mode_buffer;
		delete[] modePointCounts_buffer;
		delete[] label_buffer;

		//Destroy RAM
		delete[] raList;
		delete[] raPool;

		deltaRegionCount = oldRegionCount - regionCount;
		oldRegionCount = regionCount;
		std::cout << "Mean Shift(TransitiveClosure):" << regionCount << std::endl;
	}

	// Prune
	{
		// sized once: regionCount can only shrink inside the loop below
		int *modePointCounts_buffer = new int[regionCount];
		float *mode_buffer = new float[regionCount * 3];
		int	*label_buffer = new int[regionCount];
		int minRegionCount;

		do{
			minRegionCount = 0;
			// Build RAM again
			RAList *raList = new RAList[regionCount], *raPool = new RAList[10 * regionCount];	//10 is hard coded!
			for (int i = 0; i < regionCount; i++)
			{
				raList[i].label = i;
				raList[i].next = NULL;
			}
			for (int i = 0; i < regionCount * 10 - 1; i++)
			{
				raPool[i].next = &raPool[i + 1];
			}
			raPool[10 * regionCount - 1].next = NULL;
			RAList	*raNode1, *raNode2, *oldRAFreeList, *freeRAList = raPool;
			for (int i = 0; i<img->height; i++)
			for (int j = 0; j<img->width; j++)
			{
				if (i>0 && labels[i][j] != labels[i - 1][j])
				{
					// Get 2 free node
					raNode1 = freeRAList;
					raNode2 = freeRAList->next;
					oldRAFreeList = freeRAList;
					freeRAList = freeRAList->next->next;
					// connect the two region
					raNode1->label = labels[i][j];
					raNode2->label = labels[i - 1][j];
					if (raList[labels[i][j]].Insert(raNode2))	//already exists!
						freeRAList = oldRAFreeList;
					else
						raList[labels[i - 1][j]].Insert(raNode1);
				}
				if (j>0 && labels[i][j] != labels[i][j - 1])
				{
					// Get 2 free node
					raNode1 = freeRAList;
					raNode2 = freeRAList->next;
					oldRAFreeList = freeRAList;
					freeRAList = freeRAList->next->next;
					// connect the two region
					raNode1->label = labels[i][j];
					raNode2->label = labels[i][j - 1];
					if (raList[labels[i][j]].Insert(raNode2))
						freeRAList = oldRAFreeList;
					else
						raList[labels[i][j - 1]].Insert(raNode1);
				}
			}
			// Find small regions
			for (int i = 0; i < regionCount; i++)
			if (modePointCounts[i] < minRegion)
			{
				RAList *neighbor = raList[i].next;
				// A region without any neighbour (e.g. the image is one single
				// region) has nothing to merge into: skip it instead of
				// dereferencing a null list head, and do not count it, otherwise
				// the do/while loop could never terminate.
				if (!neighbor)
					continue;
				minRegionCount++;
				// pick the neighbour with the closest mean colour
				int candidate = neighbor->label;
				float minDistance = color_distance(&mode[3 * i], &mode[3 * candidate]);
				neighbor = neighbor->next;
				while (neighbor)
				{
					float minDistance2 = color_distance(&mode[3 * i], &mode[3 * neighbor->label]);
					if (minDistance2<minDistance)
					{
						minDistance = minDistance2;
						candidate = neighbor->label;
					}
					neighbor = neighbor->next;
				}
				// union of the two sets, smaller root index wins
				int iCanEl = i, neighCanEl = candidate;
				while (raList[iCanEl].label != iCanEl) iCanEl = raList[iCanEl].label;
				while (raList[neighCanEl].label != neighCanEl) neighCanEl = raList[neighCanEl].label;
				if (iCanEl < neighCanEl)
					raList[neighCanEl].label = iCanEl;
				else
				{
					//raList[raList[iCanEl].label].label	= neighCanEl;
					raList[iCanEl].label = neighCanEl;
				}
			}
			// make every region point directly at its root
			for (int i = 0; i < regionCount; i++)
			{
				int iCanEl = i;
				while (raList[iCanEl].label != iCanEl)
					iCanEl = raList[iCanEl].label;
				raList[i].label = iCanEl;
			}
			memset(modePointCounts_buffer, 0, regionCount*sizeof(int));
			for (int i = 0; i < regionCount; i++)
			{
				label_buffer[i] = -1;
				mode_buffer[3 * i + 0] = 0;
				mode_buffer[3 * i + 1] = 0;
				mode_buffer[3 * i + 2] = 0;
			}
			// accumulate pixel counts and count-weighted colours per root
			for (int i = 0; i<regionCount; i++)
			{
				int iCanEl = raList[i].label;
				modePointCounts_buffer[iCanEl] += modePointCounts[i];
				for (int k = 0; k<3; k++)
					mode_buffer[iCanEl * 3 + k] += mode[i * 3 + k] * modePointCounts[i];
			}
			// give the roots consecutive new labels and store their merged mode
			int	label = -1;
			for (int i = 0; i < regionCount; i++)
			{
				int iCanEl = raList[i].label;
				if (label_buffer[iCanEl] < 0)
				{
					label_buffer[iCanEl] = ++label;

					for (int k = 0; k < 3; k++)
						mode[label * 3 + k] = (mode_buffer[iCanEl * 3 + k]) / (modePointCounts_buffer[iCanEl]);

					modePointCounts[label] = modePointCounts_buffer[iCanEl];
				}
			}
			regionCount = label + 1;
			for (int i = 0; i < img->height; i++)
			for (int j = 0; j < img->width; j++)
				labels[i][j] = label_buffer[raList[labels[i][j]].label];
			//Destroy RAM
			delete[] raList;
			delete[] raPool;
			std::cout << "Mean Shift(Prune):" << regionCount << std::endl;
		} while (minRegionCount > 0);

		delete[] mode_buffer;
		delete[] modePointCounts_buffer;
		delete[] label_buffer;
	}

	// Output
	STOP_TIMING(timer);
	std::cout << "Mean Shift(ms):" << GET_TIMING(timer) << std::endl;

	cvReleaseImage(&result);
	delete[]mode;
	delete[]modePointCounts;
	return regionCount;
}