void AdlPrimitivesDemo::testFill2( Buffer<int2>& buf, int size, Stopwatch& sw )
{
	MyFill::Data* sortData = MyFill::allocate( m_deviceData );
	
	sw.start();

	MyFill::execute( sortData, buf, make_int2(12, 13), size );

	sw.stop();

	MyFill::deallocate( sortData );

	{
		float t = sw.getMs();
		sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "Fill int2: %3.2fGB/s (%3.2fms)", size/t/1000/1000*8, t);		
	}
}
void AdlPrimitivesDemo::testSort( Buffer<SortData>& buf, int size, Stopwatch& sw )
{
	MySort::Data* sortData = MySort::allocate( m_deviceData, size, RadixSortBase::SORT_ADVANCED );
	
	sw.start();

	MySort::execute( sortData, buf, size );

	sw.stop();

	MySort::deallocate( sortData );

	{
		m_nTxtLines = 0;
		float t = sw.getMs();
		sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "%d: %3.2fms, %3.2fMKeys/s", size, t, size/t/1000);		
	}
}
void AdlPrimitivesDemo::test( Buffer<int2>& buf, int size, Stopwatch& sw )
{
	Kernel* kernel = KernelManager::query( m_deviceData, "..\\..\\AdlDemos\\TestBed\\Demos\\AdlPrimitivesDemoKernel", "FillInt4Kernel" );
	Buffer<int4> constBuffer( m_deviceData, 1, BufferBase::BUFFER_CONST );


	int numGroups = (size+128*4-1)/(128*4);
	Buffer<u32> workBuffer0( m_deviceData, numGroups*(16) );
	Buffer<u32> workBuffer1( m_deviceData, numGroups*(16) );

	Buffer<int2> sortBuffer( m_deviceData, size );
	{
		int2* host = new int2[size];
		for(int i=0; i<size; i++)
		{
			host[i] = make_int2( getRandom(0, 0xf), i );
		}
		sortBuffer.write( host, size );
		DeviceUtils::waitForCompletion( m_deviceData );
		delete [] host;
	}

	int4 constData;
	{
		constData.x = size;
		constData.y = 0;
		constData.z = numGroups;
		constData.w = 0;
	}

	sw.start();

	int nThreads = size/4;
	{
		BufferInfo bInfo[] = { BufferInfo( &buf ), BufferInfo( &workBuffer0 ), BufferInfo( &workBuffer1 ) };
		Launcher launcher( m_deviceData, kernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.setConst( constBuffer, constData );
		launcher.launch1D( nThreads, 128 );
	}

	sw.split();

	{
		constData.w = 1;
		int nThreads = size/4;
		BufferInfo bInfo[] = { BufferInfo( &buf ), BufferInfo( &workBuffer0 ), BufferInfo( &workBuffer1 ) };
		Launcher launcher( m_deviceData, kernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.setConst( constBuffer, constData );
		launcher.launch1D( nThreads, 128 );
	}

	sw.split();

	{
		constData.w = 2;
		int nThreads = size/4;
		BufferInfo bInfo[] = { BufferInfo( &sortBuffer ), BufferInfo( &workBuffer0 ), BufferInfo( &workBuffer1 ) };
		Launcher launcher( m_deviceData, kernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.setConst( constBuffer, constData );
		launcher.launch1D( nThreads, 128 );
	}

	sw.stop();

	{
		int2* host = new int2[size];
		buf.read( host, size );
		DeviceUtils::waitForCompletion( m_deviceData );

		for(int i=0; i<128*4-1; i++)
		{
			ADLASSERT( host[i].x <= host[i+1].x );
		}

		delete [] host;
	}

	{
		float t[3];
		sw.getMs(t, 3);
		//	(byte * nElems)
		sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "LoadStore: %3.2fGB/s (%3.2fns)", (4*8*2)*nThreads/t[0]/1000/1000, t[0]*1000.f);		
		sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "GenHistog: %3.2fGB/s (%3.2fns)", (4*(8*2+2))*nThreads/t[1]/1000/1000, t[1]*1000.f);		
		sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "FullSort: %3.2fGB/s (%3.2fns)", (4*(8*2+2))*nThreads/t[2]/1000/1000, t[2]*1000.f);		
	}
}