/*
 * Builds the list of candidate unit vectors (x, y) from the aggregate sums
 * H, I, J, K, L of the point set, derives the matching offset
 * d = -h*h*(J*x + L*y) for each, and passes every (d, x, y) triple to
 * compute_error() in a fixed order.
 *
 *   h   - scale factor used in A0, A1 and d
 *   p   - point array forwarded to the compute_* helpers
 *   w   - weight array forwarded to compute_Aj()
 *   num - number of points; the call is rejected (message on stderr, no other
 *         effect) when it exceeds MAX_POINTS_NUM
 *
 * The candidate set depends on which of the two coefficients A0, A1 is
 * exactly zero. NOTE(review): compute_error() presumably keeps track of the
 * best candidate seen since is_set_ret was cleared - confirm in its definition.
 */
void _calculate_parameters(double h,my_point p[],double w[],int num) {
    double H,I,J,K,L,A0,A1;
    double cand_x[4],cand_y[4];   /* at most four candidates in any case */
    int n_cand = 0;
    int c;

    if (num > MAX_POINTS_NUM) {
        fprintf(stderr,"Point number is larger than previous set!\n");
        return;
    }

    is_set_ret = false;
    compute_Aj(h,w,num);

    H = compute_H(p,num);
    I = compute_I(p,num);
    J = compute_J(p,num);
    K = compute_K(p,num);
    L = compute_L(p,num);
    A0 = H -h*h*J*J-K+h*h*L*L;
    A1 = 2*(I-h*h*J*L);

    if (A0 == 0 && A1 == 0) {
        /* Both coefficients vanish: any (x, y) would do, so one
           representative on the diagonal is used. */
        printf("The distribution of the given points is a circle.\n");
        cand_x[0] = cand_y[0] = sqrt(2.0) / 2;
        n_cand = 1;
    }
    else if (A0 == 0) {
        /* Only A0 vanishes: x*x = 1/2 with x*x + y*y = 1, so both
           coordinates are +-sqrt(2)/2; all four sign combinations are tried. */
        const double root[2] = {sqrt(2.0)/2,-sqrt(2.0)/2};
        int a,b;
        for (a=0;a<2;a++) {
            for (b=0;b<2;b++) {
                cand_x[n_cand] = root[a];
                cand_y[n_cand] = root[b];
                n_cand++;
            }
        }
    }
    else if (A1 == 0) {
        /* Only A1 vanishes: the candidates are the four axis directions. */
        cand_x[0] = 0;  cand_y[0] = 1;
        cand_x[1] = 0;  cand_y[1] = -1;
        cand_x[2] = 1;  cand_y[2] = 0;
        cand_x[3] = -1; cand_y[3] = 0;
        n_cand = 4;
    }
    else {
        /* General case: t lies strictly between -1 and 1 and is non-zero,
           so every candidate x satisfies 0 < x*x < 1 and 0.5/x is finite. */
        const double t = A0 / sqrt (A1*A1+A0*A0);
        const double x_plus = sqrt (0.5*(1+t));
        const double x_minus = sqrt (0.5*(1-t));
        const double slope = A1/A0;

        cand_x[0] = x_plus;
        cand_x[1] = x_minus;
        cand_x[2] = -x_plus;
        cand_x[3] = -x_minus;
        for (c=0;c<4;c++) {
            cand_y[c] = slope* (cand_x[c] - 0.5/cand_x[c]);
        }
        n_cand = 4;
    }

    /* Evaluate the candidates in the order they were generated. */
    for (c=0;c<n_cand;c++) {
        const double x = cand_x[c];
        const double y = cand_y[c];
        const double d = -(h*h*(J*x+L*y));
        compute_error(d,x,y,p,num);
    }
}
/*
 * pthread start routine for one training worker.
 *
 * th_data points to a struct ThreadData. Its fields are copied into locals and
 * thread_data_mutex is then unlocked so the structure can be reused for the
 * next thread (NOTE(review): presumably the spawning thread locks
 * thread_data_mutex before pthread_create - confirm against the caller).
 *
 * Each iteration selects a random chunk, computes its kernel submatrix, and
 * then, while holding model->mutex, solves the LP, checks optimality, takes a
 * step (updating alpha and the gradient, optionally with helper threads),
 * periodically evaluates the model and increments model->iter.
 *
 * The loop ends when the iteration limit is exceeded, when evaluation is
 * enabled (accuracy != 0) and either the target ratio is reached or the ratio
 * stopped improving, when STOP is set, or when switch_thread() asks this
 * thread to stop.
 *
 * The thread terminates through pthread_exit() with value 0 if evaluation is
 * enabled and the optimum was reached / no more improvement was observed at
 * its last iteration, and -1 otherwise.
 */
void *LLW_train_thread(void *th_data) {

	// Recover data
	struct ThreadData *data =  (struct ThreadData *)th_data;
	const int thread_id = data->thread_id;
	const int nprocs = data->nprocs;
	struct Model *model = data->model;
	struct KernelCache *kernelcache = data->kernelcache;
	long chunk_size = data->chunk_size;
	const double accuracy = data->accuracy;
	double **gradient = data->gradient;
	double **H_alpha = data->H_alpha;
	double *best_primal_upper_bound = data->best_primal_upper_bound;
	int *activeset = data->activeset;
	long *nb_SV = data->nb_SV;
	double *lp_rhs = data->lp_rhs;
	FILE *fp = data->logfile_ptr;

	pthread_mutex_unlock(&thread_data_mutex);	// Release thread_data for next thread

	// Local variables
	int do_eval;
	char yesno;
	long long return_status = -1;

	// Prepare the cache
	struct TrainingCache cache;
	cache.chunk_size =  chunk_size;
	LLW_alloc_memory(&cache, model->Q, model->nb_data, chunk_size);
	cache.kc = kernelcache;
	cache.activeset = activeset;
	cache.lp_rhs = lp_rhs;

	double **delta = matrix(chunk_size, model->Q);
	double previous_ratio = 0.0;
	double improvement = 1.0;
	double theta_opt;
	int jump = false;

	if(accuracy == 0)
		do_eval = 0;
	else
		do_eval = 1;

	/*
		Prepare parallel gradient computations:
		- the gradient vector is split into NUMTHREADS_GRAD parts (along i)
		- each part is updated by a different thread
	*/
	// max number of threads for gradient updates is nprocs
	pthread_t *grad_threads = (pthread_t *)malloc(sizeof(pthread_t) * nprocs);

	// start with 1 thread (main load on kernel evaluations)
	int numthreads_grad = 1;

	void *status;
	int rc;
	long k;
	struct ThreadGradient_data *grad_data = (struct ThreadGradient_data *)malloc(sizeof(struct ThreadGradient_data) * nprocs);


	// Disable parallel gradient computation for small data sets
	int parallel_gradient_update = 1;
	if(model->nb_data < 5000 || nprocs == 1)
		parallel_gradient_update = 0;

	if(parallel_gradient_update) {
		for(k=0;k<nprocs;k++) {
			grad_data[k].gradient = gradient;
			grad_data[k].H_alpha = H_alpha;
			grad_data[k].cache = &cache;
			grad_data[k].model = model;
		}
		// Index ranges are 1-based and inclusive: [start_i, end_i]
		grad_data[0].start_i = 1;
		grad_data[0].end_i = model->nb_data / numthreads_grad;
		for(k=1;k<numthreads_grad-1;k++) {
			grad_data[k].start_i = grad_data[k-1].end_i + 1;
			grad_data[k].end_i = grad_data[k].start_i + model->nb_data / numthreads_grad -1;
		}
		if(numthreads_grad>1) {
			// Last part takes whatever remains up to nb_data
			grad_data[numthreads_grad-1].start_i = grad_data[numthreads_grad-2].end_i + 1;
			grad_data[numthreads_grad-1].end_i = model->nb_data;
		}
	}
#ifdef _WIN32
	// Init POOL
	TP_WORK ** work = NULL;

	if(parallel_gradient_update) {

		work = malloc(sizeof(TP_WORK *) * nprocs);
		for(k=0;k<nprocs;k++)
			work[k] = CreateThreadpoolWork(LLW_update_gradient_thread2, (void *) &grad_data[k], NULL);
	}
#endif

	// Switch to nprocs/4 threads for gradient update when 25% of the kernel matrix is cached
	int percentage_step = 1;
	long percentage = model->nb_data / 4;
	int next_numthreads_grad = nprocs/4;
	if(next_numthreads_grad == 0)
		next_numthreads_grad = 1;

	// Main loop
	int thread_stop = 0;
	int keep_going;			// loop condition, evaluated while the model mutex is held
	int reached_optimum = 0;	// exit-status condition, evaluated while the model mutex is held
	do {
		if((TRAIN_SMALL_STEP < TRAIN_STEP) && (model->iter%TRAIN_SMALL_STEP) == 0) {
			printf(".");
			fflush(stdout);
		}

		// Select a random chunk of data to optimize
		select_random_chunk(&cache,model);

		// Compute the kernel submatrix for this chunk
		compute_K(&cache,model);

		// Enter Critical Section (using and modifying the model)
		pthread_mutex_lock(&(model->mutex));

		jump = LLW_solve_lp(gradient, &cache, model);

		if(jump == false)
			jump = LLW_check_opt_sol(gradient,&cache,model);

		if(jump == false) {

			LLW_compute_delta(delta,&cache,model);
			theta_opt = LLW_compute_theta_opt(delta, &cache, model);

			if (theta_opt > 0.0) {

				*nb_SV += LLW_compute_new_alpha(theta_opt,&cache,model);

				if(parallel_gradient_update) {

					// Update gradient in parallel
					for(k=0;k<numthreads_grad;k++) {
					#ifdef _WIN32
						SubmitThreadpoolWork(work[k]);
					#else
						rc = pthread_create(&grad_threads[k], NULL, LLW_update_gradient_thread, (void *) &grad_data[k]);
					#endif
					}
					// Wait for gradient computations to terminate
					for(k=0;k<numthreads_grad;k++) {
					#ifdef _WIN32
						WaitForThreadpoolWorkCallbacks(work[k], FALSE);
					#else
						rc = pthread_join(grad_threads[k],&status);
					#endif
					}
				}
				else {
					// old-style non-threaded gradient update (for small data sets)
					LLW_update_gradient(gradient,H_alpha, &cache,model);
				}
			}
		}

		if((do_eval && (model->iter%TRAIN_STEP) == 0) || EVAL || STOP || (do_eval && model->ratio >= accuracy) )
		    {
			if(fp != NULL)
				fprintf(fp,"%ld ",model->iter);

			if(EVAL)
				printf("\n\n*** Evaluating the model at iteration %ld...\n",model->iter);

			// Evaluate how far we are in the optimization
			// (prints more info if interrupted by user)
			previous_ratio = model->ratio;
			model->ratio = MSVM_eval(best_primal_upper_bound, gradient, H_alpha, NULL, model, EVAL, fp);

			print_training_info(*nb_SV, model);

			improvement = model->ratio - previous_ratio;

			if(EVAL) // if interrupted by user (otherwise let the ratio decide if we go on training)
			  {
				printf("\n *** Do you want to continue training ([y]/n)? ");
				yesno = getchar();
				if(yesno=='n') {
					STOP = 1;
				}
				EVAL = 0; // reset interruption trigger
			  }
		    }

		// Release kernel submatrix in cache
		release_K(&cache);

		// Check if a sufficient % of the kernel matrix is cached
		if( parallel_gradient_update && cache.kc->max_idx >= percentage ) {
			// and switch thread to compute gradient updates instead of kernel rows if it is
			thread_stop = switch_thread(nprocs, &numthreads_grad, &next_numthreads_grad, &percentage,  &percentage_step, grad_data, thread_id, model->nb_data);
			// (threads are actually stopped to leave the CPUs
			//  to other threads that will compute gradient updates)
		}

		model->iter++;

		// model->iter and model->ratio are shared between training threads:
		// decide whether to continue, and what the exit status would be,
		// before leaving the critical section instead of re-reading them unlocked.
		reached_optimum = do_eval && (model->ratio >= accuracy || improvement==0.0);
		keep_going = model->iter <= MSVM_TRAIN_MAXIT && (!do_eval || (model->ratio < accuracy && improvement != 0.0)) && !STOP && !thread_stop;

		// Release mutex: End of critical section
		pthread_mutex_unlock(&(model->mutex));

	} while(keep_going);

	// The model mutex is not held here (it is released at the end of every
	// iteration), so it must not be unlocked a second time.

#ifdef _WIN32
	if(parallel_gradient_update){
		// Work objects were created for all nprocs slots, so close all of them
		for(k=0;k<nprocs;k++) {
			if(work[k] != NULL)
				CloseThreadpoolWork(work[k]);
		}
		free(work);
	}
#endif
	// compute return_status
	if(reached_optimum)
		return_status = 0; // optimum reached or no more improvement.

	// Free memory
	LLW_free_memory(&cache);
	free(delta[1]);free(delta);
	free(grad_threads);
	free(grad_data);

	pthread_exit((void*)return_status);
}
// Extracts one fixed-length clip per event and writes them, grouped by cluster
// label, to a 3-D float32 MDA file; also writes a 1 x K index array.
//
//   input_path     - time-series array (first dimension is used as the channel count)
//   cluster_path   - event array; row 1 is read as the event time and row 2 as
//                    the cluster label, one column per event
//   output_path    - destination for the clips, header dims = channels x clip_size x events
//   index_out_path - destination for the index array; entry k-1 holds the number
//                    of clips written before label k's clips begin
//   clip_size      - number of timepoints per clip; a clip covers
//                    [time - clip_size/2, time - clip_size/2 + clip_size)
//
// Returns false (after printing a message) if either input looks unreadable
// (total size <= 1), the output file cannot be opened, or the index array
// cannot be written; true otherwise.
// NOTE(review): compute_K(C) presumably returns the largest cluster label - confirm.
bool extract_clips(const char *input_path,const char *cluster_path,const char *output_path,const char *index_out_path,int clip_size) {
    DiskReadMda X(input_path);
    DiskReadMda C(cluster_path);

    if (X.totalSize()<=1) {
        printf("Problem reading input file: %s\n",input_path);
        return false;
    }
    if (C.totalSize()<=1) {
        printf("Problem reading input file: %s\n",cluster_path);
        return false;
    }

    const int n_channels=X.N1();
    const int clip_len=clip_size;
    const int n_events=C.N2();
    const int n_labels=compute_K(C);
    printf("K=%d\n",n_labels);

    Mda index_out;
    index_out.allocate(1,n_labels);

    // Header describing the clip array that follows
    MDAIO_HEADER H_out;
    H_out.data_type=MDAIO_TYPE_FLOAT32;
    H_out.num_bytes_per_entry=4;
    H_out.num_dims=3;
    H_out.dims[0]=n_channels;
    H_out.dims[1]=clip_len;
    H_out.dims[2]=n_events;

    FILE *outf=fopen(output_path,"wb");
    if (!outf) {
        printf("Unable to open output file: %s\n",output_path);
        return false;
    }

    mda_write_header(&H_out,outf);

    float *clip_buf=(float *)malloc(sizeof(float)*n_channels*clip_len);
    int clips_written=0;
    for (int label=1; label<=n_labels; label++) {
        // Record where this label's clips start in the output
        index_out.setValue(clips_written,0,label-1);

        for (int ev=0; ev<n_events; ev++) {
            const int time0=(int)C.value(1,ev);
            const int ev_label=(int)C.value(2,ev);
            if (ev_label!=label) continue;

            // Fill the buffer channel-fastest, one timepoint after another
            float *dst=clip_buf;
            for (int t=0; t<clip_len; t++) {
                for (int m=0; m<n_channels; m++) {
                    *dst++=X.value(m,t+time0-clip_len/2);
                }
            }
            mda_write_float32(clip_buf,&H_out,n_channels*clip_len,outf);
            clips_written++;
        }
    }
    free(clip_buf);

    fclose(outf);

    if (!index_out.write(index_out_path)) {
        printf("Unable to write output file: %s\n",index_out_path);
        return false;
    }

    return true;
}