/**
 * Work-item body for one node of the benchmark's work-item tree.
 *
 * While params->count is positive, NUM_ITER child work items are created from
 * g_insieme_impl_table[1] (presumably this very implementation -- confirm
 * against the table definition) with count reduced by one, assigned to the
 * current worker, and joined. Every invocation, spawning or not, finally
 * increments the shared counter that params->check points to.
 *
 * @param wi  work item being executed; wi->parameters is interpreted as an
 *            insieme_wi_bench_params record.
 */
void insieme_wi_bench_implementation(irt_work_item* wi) {
	insieme_wi_bench_params* params = (insieme_wi_bench_params*)wi->parameters;

	if(params->count > 0) {
		// Same leading field and check pointer as the parent, one level less to go.
		insieme_wi_bench_params child_params = {1, params->count - 1, params->check};

		// Buffer for the child ids so that all children can be spawned before
		// the first join. If the allocation fails we must not write through the
		// NULL pointer; instead each child is joined right after it has been
		// scheduled, which needs no buffer (at the cost of serialising them).
		irt_work_item_id* child_ids = malloc(NUM_ITER * sizeof *child_ids);

		for(int i = 0; i < NUM_ITER; ++i) {
			irt_work_item* child = irt_wi_create(irt_g_wi_range_one_elem, &g_insieme_impl_table[1], (irt_lw_data_item*)&child_params);
			// Read the id before handing the work item to the scheduler, as the
			// original code did; the work item is not touched afterwards.
			irt_work_item_id child_id = child->id;
			irt_scheduling_assign_wi(irt_worker_get_current(), child);
			if(child_ids != NULL) {
				child_ids[i] = child_id;
			} else {
				irt_wi_join(child_id);
			}
		}

		// Joined one by one; a multi-join (irt_wi_multi_join) was considered
		// here at some point but is not used.
		if(child_ids != NULL) {
			for(int i = 0; i < NUM_ITER; ++i) {
				irt_wi_join(child_ids[i]);
			}
		}

		free(child_ids);
	}

	// Count this work item in the shared tally.
	irt_atomic_inc(params->check, uint64);
}
/**
 * Entry work item of the manual work-item benchmark.
 *
 * Runs NUM_REPEATS root work items (g_insieme_impl_table[1]) one after the
 * other, each started with a level count of NUM_LEVELS and joined before the
 * next one is created, then prints the value of the shared counter, the
 * elapsed time reported by irt_time_ms(), and a throughput estimate.
 *
 * @param wi  the startup work item itself; not used by this function.
 */
void insieme_wi_startup_implementation(irt_work_item* wi) {
	(void)wi;

	uint64 start_time = irt_time_ms();
	uint64 check_val = 0; // incremented by the benchmark work items through bench_params
	insieme_wi_bench_params bench_params = {1, NUM_LEVELS, &check_val};

	for(int i = 0; i < NUM_REPEATS; ++i) {
		irt_work_item* bench_wi = irt_wi_create(irt_g_wi_range_one_elem, &g_insieme_impl_table[1], (irt_lw_data_item*)&bench_params);
		// Capture the id before scheduling; only the id is used for the join.
		irt_work_item_id wi_id = bench_wi->id;
		irt_scheduling_assign_wi(irt_worker_get_current(), bench_wi);
		irt_wi_join(wi_id);
	}

	uint64 total_time = irt_time_ms() - start_time;

	// Throughput estimate based on NUM_ITER^NUM_LEVELS work items.
	// NOTE(review): this figure does not scale with NUM_REPEATS and differs
	// from the measured check_val -- confirm which count is intended.
	uint64 total_wis = (uint64)pow((double)NUM_ITER, (double)NUM_LEVELS);

	// With an elapsed time of 0 the division would produce inf/NaN, and
	// converting that to an integer is undefined; report 0 instead.
	uint64 wis_per_sec = 0;
	if(total_time > 0) {
		wis_per_sec = (uint64)(total_wis / ((double)total_time / 1000.0));
	}

	// Values are cast to unsigned long long so the conversion specifier is
	// correct regardless of how uint64 is defined on the target platform.
	printf("======================\n= manual irt wi benchmark done\n");
	printf("= number of wis executed: %llu\n", (unsigned long long)check_val);
	printf("= time taken: %llu\n", (unsigned long long)total_time);
	printf("= wis/s: %llu\n======================\n", (unsigned long long)wis_per_sec);

	// A second, "optional wi" variant of this measurement used to live here as
	// commented-out code written against an older irt_wi_create/irt_wi_join
	// signature; it has been removed.
}
/**
 * Entry work item of the manual matrix-multiplication test.
 *
 * Creates three N x N data items A, B and C, runs the initialization work
 * item (INSIEME_WI_INIT_INDEX) on A and B and then the multiplication work
 * item (INSIEME_WI_MUL_INDEX) on A, B and C, joining each before continuing.
 * The elapsed time and tick count are printed, every element C[i][j] is
 * compared against i*j, and "OK" or "FAIL" is reported. All data items
 * created here are released again before returning.
 *
 * @param wi  the startup work item itself; not used by this function.
 */
void insieme_wi_startup_implementation(irt_work_item* wi) {
	(void)wi;

	// create data arrays
	irt_data_range range[] = {{0,N,1},{0,N,1}};
	irt_data_item* A = irt_di_create(INSIEME_DOUBLE_T_INDEX, 2, range);
	irt_data_item* B = irt_di_create(INSIEME_DOUBLE_T_INDEX, 2, range);
	irt_data_item* C = irt_di_create(INSIEME_DOUBLE_T_INDEX, 2, range);

	// measure the time
	uint64 start_time = irt_time_ms();
	uint64 start_ticks = irt_time_ticks();

	// create and run initialization job
	insieme_wi_init_params init_params = { INSIEME_WI_INIT_PARAM_T_INDEX, A->id, B->id };
	irt_work_item* init_wi = irt_wi_create((irt_work_item_range){ 0, N, 1 }, &g_insieme_impl_table[INSIEME_WI_INIT_INDEX], (irt_lw_data_item*)&init_params);
	// the id is read before scheduling; only the id is used for the join
	irt_work_item_id init_id = init_wi->id;
	irt_scheduling_assign_wi(irt_worker_get_current(), init_wi);

	// wait until finished
	irt_wi_join(init_id);

	// conduct the multiplication
	insieme_wi_mul_params mul_params = { INSIEME_WI_MUL_PARAM_T_INDEX, A->id, B->id, C->id };
	irt_work_item* mul_wi = irt_wi_create((irt_work_item_range){ 0, N, 1 }, &g_insieme_impl_table[INSIEME_WI_MUL_INDEX], (irt_lw_data_item*)&mul_params);
	irt_work_item_id mul_id = mul_wi->id;
	irt_scheduling_assign_wi(irt_worker_get_current(), mul_wi);

	// wait until finished
	irt_wi_join(mul_id);

	// stop the time
	uint64 end_ticks = irt_time_ticks();
	uint64 end_time = irt_time_ms();

	// check correctness: acquire the whole of C read-only
	irt_data_range subrange[] = {{0,N,1},{0,N,1}};
	irt_data_item* itemR = irt_di_create_sub(irt_data_item_table_lookup(C->id), subrange);
	irt_data_block* blockR = irt_di_acquire(itemR, IRT_DMODE_READ_ONLY);
	double** R = (double**)blockR->data;

	printf("======================\n= manual irt test matrix multiplication\n");
	// cast so the conversion specifier is correct however uint64 is defined
	printf("= time taken: %llu ms, %llu clock ticks\n",
	       (unsigned long long)(end_time - start_time),
	       (unsigned long long)(end_ticks - start_ticks));

	bool check = true;
	for (int i=0; i<N; i++) {
		for (int j=0; j<N; j++) {
			// expected value is formed in double so that i*j cannot overflow int for large N
			if (R[i][j] != (double)i * j) {
				check = false;
				//printf("= fail at (%d,%d) - expected %d / actual %f\n", i, j, i*j, R[i][j]);
			}
		}
	}
	printf("= result check: %s\n======================\n", check ? "OK" : "FAIL");

	// release the read-only view of C
	irt_di_free(blockR);
	irt_di_destroy(itemR);

	// cleanup
	irt_di_destroy(A);
	irt_di_destroy(B);
	irt_di_destroy(C);
}