int main(int argc, const char * argv[])
{
	Remotery* rmt;
	rmt_CreateGlobalInstance(&rmt);

    // Set the callbacks BEFORE initialize or we will get no threadstart nor first waitStart calls
    g_TS.GetProfilerCallbacks()->threadStart    = threadStartCallback;
    g_TS.GetProfilerCallbacks()->waitStart      = waitStartCallback;
    g_TS.GetProfilerCallbacks()->waitStop       = waitStopCallback;

	g_TS.Initialize();


	rmt_SetCurrentThreadName("Main");

	double avSpeedUp = 0.0;
	for( int run = 0; run< RUNS; ++run )
	{
		rmt_ScopedCPUSample(Run);

		printf("Run %d.....\n", run);
		ParallelReductionSumTaskSet m_ParallelReductionSumTaskSet( SUMS );
		{
			rmt_ScopedCPUSample(Parallel);

			m_ParallelReductionSumTaskSet.Init();

			g_TS.AddTaskSetToPipe(&m_ParallelReductionSumTaskSet);

			g_TS.WaitforTaskSet(&m_ParallelReductionSumTaskSet);
		}



		volatile uint64_t sum = 0;
		{
			rmt_ScopedCPUSample(Serial);
			for (uint64_t i = 0; i < (uint64_t)m_ParallelReductionSumTaskSet.m_ParallelSumTaskSet.m_SetSize; ++i)
			{
				sum += i + 1;
			}
		}

	}
	rmt_DestroyGlobalInstance(rmt);
	return 0;
}