void rcb_rec2D(vector<T> *p_coord,
               vector<int> *p_map,
               vector<int> *p_part,
               int start,
               int end,
               int dims, 
               int cur_depth,
               int ttl_depth){

	// end of partitioning
	if(cur_depth == 0) return;

	//cout << "-------------------------------------------------------------" << endl;
	//cout << "    start=" << start << " end=" << end << " cur_depth=" << cur_depth << endl;

tmr_span.start();
	// calculate max distance on each axis
	double x_span = calc_span(p_coord, start, end, dims, 0);
	double y_span = calc_span(p_coord, start, end, dims, 1);
tmr_span.stop_and_add_to_total();;

	// choose axis
	int axis = -1;
	if(x_span >= y_span)
		axis = 0;
	else
		axis = 1;

	//cout << "    x_span=" << x_span << " y_span=" << y_span << " axis=" << axis << endl;

	// find mid-point
tmr_pivot1.start();
	T pivot = find_pivot(p_coord, start, end, dims, axis);
tmr_pivot1.stop_and_add_to_total();;
	//cout << "    pivot=" << pivot <<endl;

	// partition into two
	int level= cur_depth-1;
	int part_index = partition(p_coord, p_map, p_part, start, end, dims, axis, pivot, level);

	//cout << "    part_index=" << part_index << endl;

	// next partitioning
	rcb_rec2D(p_coord, p_map, p_part, start, start+part_index, dims, cur_depth-1, ttl_depth);
	rcb_rec2D(p_coord, p_map, p_part, start+part_index, end, dims, cur_depth-1, ttl_depth);

}
/**
 * Driver for the recursive coordinate bisection partitioner.
 *
 * Builds one coordinate tuple per cell via calc_cellcentre(), sets up an
 * identity map and an all-zero partition array, runs rcb_rec2D() over all
 * cells with numoflevel levels and finally prints the accumulated timers.
 *
 * The stage that validates and exports the partition data is currently
 * commented out, so nnode, nedge, nbedge, partnode, partedge, partbedge,
 * partcell, ecell, becell and edge are not read here.
 *
 * @param numoflevel  number of bisection levels
 * @param dims        number of coordinate values per cell
 * @param ncell       number of cells
 * @param cell        cell data handed to calc_cellcentre()
 * @param x           coordinate data handed to calc_cellcentre()
 */
void part_rcb(int numoflevel, int dims,
              int nnode, int nedge, int nbedge, int ncell,
              point *partnode, point *partedge, point *partbedge, point *partcell,
              int *cell, int *ecell, int *becell, int *edge,
              float *x){

	// one coordinate tuple (dims values) per cell, filled by calc_cellcentre
	vector<float> centre_coord(ncell*dims);
	calc_cellcentre(ncell, dims, cell, x, &centre_coord);

	// identity map and all-zero partition id for every cell
	// (a non-positive ncell leaves both vectors empty)
	const int cell_total = (ncell > 0) ? ncell : 0;
	vector<int> cell_map(cell_total);
	vector<int> cell_part(cell_total, 0);
	for(int c=0; c<cell_total; ++c)
		cell_map[c] = c;

	// run the recursive coordinate bisection over the whole cell range
	rcb_rec2D(&centre_coord, &cell_map, &cell_part, 0, ncell, dims, numoflevel, numoflevel);

	// Debug Print
	//part_rcb_DebugPrint(ncell, dims, &centre_coord, &cell_map, &cell_part);

	// output stage (disabled); the timer is still started and stopped
tmr_out.start();
/*
	if(check_partdata(ncell, &cell_map, &cell_part, numoflevel)){
		generate_partdata(nnode, nedge, nbedge, ncell,
                          &cell_map, &cell_part, 
                          partnode, partedge, partbedge, partcell, 
                          cell, ecell, becell);
	}else{
		cout << "partition data is invalid" << endl;
		exit(-1);
	}
*/
tmr_out.stop_and_add_to_total();

	// report the accumulated time of each phase
printf("span  =%lf\n", tmr_span.total_time());
printf("pivot =%lf\n", tmr_pivot1.total_time());
printf("part1 =%lf\n", tmr_part1.total_time());
printf("part2 =%lf\n", tmr_part2.total_time());
printf("part3 =%lf\n", tmr_part3.total_time());
printf("out   =%lf\n", tmr_out.total_time());

}
int partition(vector<T> *p_in, 
              vector<int> *p_map,
              vector<int> *p_part,
              int start, 
              int end, 
              int dims, 
              int axis, 
              T pivot,
              int level){

/*
tmr_part1.start();
	// Buffer
	vector<T> out1, out2;
	vector<int> map1, map2;
	int cnt1=0, cnt2=0;
	for(int i=start; i < end; i++){
		if(p_in->at(i*dims+axis)> pivot){
			for(int d_index=0; d_index<dims; d_index++)
				out1.push_back(p_in->at(i*dims+d_index));
			map1.push_back(p_map->at(i));
			cnt1+=1;
		}else{
			for(int d_index=0; d_index<dims; d_index++)
				out2.push_back(p_in->at(i*dims+d_index));
			map2.push_back(p_map->at(i));
			cnt2+=1;
		}
	}

tmr_part1.stop_and_add_to_total();
*/


tmr_part1.start();

	// Buffer
	vector<T> out1, out2;
	vector<int> map1, map2;
	int cnt1=0, cnt2=0;
	int halfcnt = (end-start)/2;
	map1.reserve(halfcnt);
	map2.reserve(halfcnt);
	out1.reserve(halfcnt*dims);
	out2.reserve(halfcnt*dims);
	T tmpcoord;
	for(int i=start; i < end; i++){
	
		tmpcoord = p_in->at(i*dims+axis);

		// - balance load when multiple node has same coordinate value
		if(tmpcoord == pivot){
			// if cnt1 is less than half size, data belong to map1
			if(halfcnt>cnt1){
				for(int d_index=0; d_index<dims; d_index++)
					out1.push_back(p_in->at(i*dims+d_index));
				map1.push_back(p_map->at(i));
				cnt1+=1;
			// if cnt1 is more than half size, data belong to map2
			}else{
				for(int d_index=0; d_index<dims; d_index++)
					out2.push_back(p_in->at(i*dims+d_index));
				map2.push_back(p_map->at(i));
				cnt2+=1;
			}
		}else if(tmpcoord > pivot){
			for(int d_index=0; d_index<dims; d_index++)
				out1.push_back(p_in->at(i*dims+d_index));
			map1.push_back(p_map->at(i));
			cnt1+=1;
		}else{
			for(int d_index=0; d_index<dims; d_index++)
				out2.push_back(p_in->at(i*dims+d_index));
			map2.push_back(p_map->at(i));
			cnt2+=1;
		}
	}
tmr_part1.stop_and_add_to_total();;

	//cout << "    out1.size =" << out1.size()/dims << " out2.size =" << out2.size()/dims << endl;
	//cout << "    level =" << level  << endl;

	// Replace to original coord data <--- can be replaced with this loop to memcpy but may not safe..

tmr_part2.start();

	for(int i=0; i < (int)out1.size(); i++){
		p_in->at(start*dims+i) = out1.at(i);
	}

	for(int i=0; i < (int)out2.size(); i++){
		p_in->at(start*dims+out1.size()+i) = out2.at(i);
	}
tmr_part2.stop_and_add_to_total();;


tmr_part3.start();
	for(int i=0; i<cnt2; i++){
		p_part->at(start+cnt1+i) |= (1 << level);
	}

	// Replace to original map/part data
	for(int i=0; i < (int)map1.size(); i++)
		p_map->at(start+i) = map1.at(i);

	for(int i=0; i < (int)map2.size(); i++)
		p_map->at(start+map1.size()+i) = map2.at(i);
tmr_part3.stop_and_add_to_total();;

	return cnt1;
}