void HashStreamTest::hmacstream_test01()
{
	{
		std::string key = "1234557890123456";
		ibrcommon::HMacStream hstream((unsigned char*)key.c_str(), key.length());
		hstream << "Hello World" << std::flush;
		if ("b3:7a:c5:9f:6d:2:60:bd:da:51:cc:d3:95:2:11:7:c0:f1:7b:f9:" != getHex(hstream))
		{
			throw ibrcommon::Exception("unexpected hash value");
		}
	}

	{
		std::string key = "1234557890123456";
		ibrcommon::HMacStream hstream((unsigned char*)key.c_str(), key.length());
		hstream << "Hello again my World!" << std::flush;
		if ("a0:1d:99:f2:99:a4:b9:dc:b7:44:df:7b:b5:75:19:c6:20:8:bc:da:" != getHex(hstream))
		{
			throw ibrcommon::Exception("unexpected hash value");
		}
	}

}
	/*!
		\brief Print SIMD and MIMD parallelism statistics to stdout for every
			kernel trace in _kernels, followed by per-program averages and an
			aggregate over all programs.

		For each kernel the header archive is read first (its format is
		asserted to be ParallelismTraceFormat), then header.dimensions events
		are deserialized from the kernel trace file.

		 - SIMD parallelism is the instruction-weighted mean of the events'
		   activity.
		 - MIMD parallelism is the sum of all events' instruction counts
		   divided by the largest single count, accumulated slice by slice
		   over the sorted counts.

		DBL_EPSILON is added to every divisor so that empty inputs yield 0
		instead of dividing by zero.

		\throws hydrazine::Exception if a header or trace file cannot be opened
	*/
	void ParallelismTraceAnalyzer::parallelism() const
	{
	
		typedef std::vector< ParallelismTraceGenerator::Event > EventVector;
		// A multiset (not a set) so that events with equal instruction counts
		// are each represented; the MIMD loop below decrements its running
		// count once per element and therefore needs one element per event.
		typedef std::multiset< long long unsigned int > InstructionSet;
	
		double averageSIMD = 0;
		double averageMIMD = 0;
		double averageCTAs = 0;
		
		for( KernelMap::const_iterator vector = _kernels.begin(); 
			vector != _kernels.end(); ++vector ) 
		{
			std::cout << " From program \"" << vector->first << "\".\n";

			double localSIMD = 0;
			double localMIMD = 0;
			double localCTAs = 0;

			for( KernelVector::const_iterator kernel = vector->second.begin(); 
				kernel != vector->second.end(); ++kernel )	
			{

				std::ifstream hstream( kernel->header.c_str() );

				// fail with a descriptive error, as is done for the trace file
				if( !hstream.is_open() )
				{
					throw hydrazine::Exception(
						"Failed to open ParallelismTrace kernel header file " 
						+ kernel->header );
				}

				boost::archive::text_iarchive harchive( hstream );
			
				ParallelismTraceGenerator::Header header;
			
				harchive >> header;
				assert( header.format 
					== TraceGenerator::ParallelismTraceFormat );
			
				hstream.close();

				std::ifstream stream( kernel->path.c_str() );
			
				if( !stream.is_open() )
				{
					throw hydrazine::Exception(
						"Failed to open ParallelismTrace kernel trace file " 
						+ kernel->path );
				}
			
				boost::archive::text_iarchive archive( stream );
			
				// the trace holds header.dimensions serialized events
				EventVector events( header.dimensions );
			
				for( EventVector::iterator event = events.begin(); 
					event != events.end(); ++event )
				{
					archive >> *event;
				}

				std::cout << " From file " << kernel->path << "\n";
				std::cout << "  kernel: " << kernel->name << "\n";
				std::cout << "  module: " << kernel->module << "\n";
				std::cout << "  statistics:\n";
				std::cout << "   ctas: " << header.dimensions << "\n";
				std::cout << "   threads: " << header.threads << "\n";
			
				InstructionSet instructions;
				long long unsigned int totalInstructions = 0;
				double activity = 0;
			
				for( EventVector::const_iterator event = events.begin(); 
					event != events.end(); ++event )
				{
					totalInstructions += event->instructions;
					instructions.insert( event->instructions );
					// weight each event's activity by its instruction count
					activity += event->activity * event->instructions;
				}
			
				activity /= totalInstructions + DBL_EPSILON;
			
				// Walk the instruction counts in ascending order. Between two
				// consecutive counts, 'count' events are still running, so
				// that slice contributes (slice length * count). 'previous'
				// has the same width as the stored counts to avoid truncation.
				long long unsigned int previous = 0;
				long long unsigned int count = header.dimensions;
				double mimd = 0;
			
				for( InstructionSet::const_iterator 
					instruction = instructions.begin();
					instruction != instructions.end(); ++instruction )
				{
					mimd += (*instruction - previous) * count;
					previous = *instruction;
					--count;
				}
			
				if( !instructions.empty() )
				{
					// normalize by the longest-running event
					mimd /= *instructions.rbegin() + DBL_EPSILON;
				}
				
				std::cout << "   SIMD parallelism: " << activity 
					<< "\n";		
				std::cout << "   MIMD parallelism: " << mimd << "\n";
			
				localSIMD += activity;
				localMIMD += mimd;
				localCTAs += header.dimensions;
			
			}

			// per-program averages over its kernels
			localSIMD /= vector->second.size() + DBL_EPSILON;
			localMIMD /= vector->second.size() + DBL_EPSILON;
			localCTAs /= vector->second.size() + DBL_EPSILON;

			std::cout << " Kernel " << vector->first << " statistics:\n";
			std::cout << "  average CTAs: " << localCTAs << "\n";
			std::cout << "  average SIMD parallelism: " << localSIMD << "\n";
			std::cout << "  average MIMD parallelism: " << localMIMD << "\n";
			
			averageSIMD += localSIMD;
			averageMIMD += localMIMD;
			averageCTAs += localCTAs;
		}
				
		// aggregate: mean of the per-program averages
		averageSIMD /= _kernels.size() + DBL_EPSILON;
		averageMIMD /= _kernels.size() + DBL_EPSILON;
		averageCTAs /= _kernels.size() + DBL_EPSILON;
		
		std::cout << "Aggregate statistics:\n";
		std::cout << " average CTAs: " << averageCTAs << "\n";
		std::cout << " average SIMD parallelism: " << averageSIMD << "\n";
		std::cout << " average MIMD parallelism: " << averageMIMD << "\n";
	}
void trace::InstructionTraceAnalyzer::instructions_by_kernel(bool pyList) const {


	// sequence of functional units
	trace::InstructionTraceGenerator::FunctionalUnit funcUnits[] = {
		InstructionTraceGenerator::Integer_arithmetic,	//! integer arithmetic
		InstructionTraceGenerator::Integer_logical,		//! itneger logical
		InstructionTraceGenerator::Integer_comparison,	//! comparison
		InstructionTraceGenerator::Float_single,				//! floating-point single-precision
		InstructionTraceGenerator::Float_double,				//! floating-point, double-precision
		InstructionTraceGenerator::Float_comparison,		//! floating-point comparison
		InstructionTraceGenerator::Memory_offchip,			//! off-chip: {global, local}
		InstructionTraceGenerator::Memory_onchip,			//! cached or scratchpad: {texture, shared, constant}
		InstructionTraceGenerator::Control,						//! control-flow instructions
		InstructionTraceGenerator::Parallelism,				//! parallelism: sync, reduction, vote
		InstructionTraceGenerator::Special,						//! transcendental and special functions
		InstructionTraceGenerator::Other,							//! not categorized
		InstructionTraceGenerator::FunctionalUnit_invalid
	};

	std::cout << "# InstructionTraceAnalyzer - by kernel\n#" << std::endl;

	std::cout << "\n";
	std::cout << "# Python object notation:\n";
	std::cout << "#  \n";
	std::cout << "#  map<program name.kernel name, \n";
	std::cout << "#    map<functional unit, pair<dynamic count, static count, activity> > >\n\n";
	
	std::cout << "kernels = [\n";
	for( KernelMap::const_iterator vector = _kernels.begin(); 
		vector != _kernels.end(); ++vector ) {
		
		std::string program = vector->first;
		const KernelVector & kernels = vector->second;

		std::map< std::string, int > kernelCount;
		
		for (KernelVector::const_iterator k_it = kernels.begin(); k_it != kernels.end(); ++k_it) {
			if (kernelCount.find(k_it->name) == kernelCount.end()) {
				kernelCount[k_it->name] = 0;
				std::cout << "  '" << program << ":" << k_it->name << "',\n";
			}
			kernelCount[k_it->name] ++;
		}
	}
	std::cout << "]\n\n";
	
	std::cout << "functional_units = [\n";
	for (int n = 0; funcUnits[n] != InstructionTraceGenerator::FunctionalUnit_invalid; n++) {
	
		std::cout << "  '" << trace::InstructionTraceGenerator::toString(funcUnits[n]) << "',\n";
	}
	std::cout << "]\n\n";
	
	std::cout << "results = {\n";

	for( KernelMap::const_iterator vector = _kernels.begin(); 
		vector != _kernels.end(); ++vector ) {
		
		std::string program = vector->first;
		const KernelVector & kernels = vector->second;
		InstructionTraceGenerator::FunctionalUnitCountMap appCounter;
		std::map< std::string, int > kernelCount;

		// loop over the kernels
		for (KernelVector::const_iterator k_it = kernels.begin(); k_it != kernels.end(); ++k_it) {
			InstructionTraceGenerator::Header header;
			std::ifstream hstream( k_it->header.c_str() );
			boost::archive::text_iarchive harchive( hstream );

			if (kernelCount.find(k_it->name) == kernelCount.end()) {
				kernelCount[k_it->name] = 0;
			}

			InstructionTraceGenerator::FunctionalUnitCountMap counter;
	
			try {
				harchive >> header;
				assert(header.format == TraceGenerator::InstructionTraceFormat);
			
				harchive >> counter;
			}
			catch (boost::archive::archive_exception &exp) {
				std::cout << "### boost::archive::archive_exception: application '" << program << "' kernel '" << k_it->name << "' ";
				std::cout << " (exception code: " << toString(exp.code) << ") - stream: '" << k_it->header.c_str() << "'\n";
				continue;
			}

			// print the program name
			std::cout << "  '" << program << ":" << k_it->name << ":" << kernelCount[k_it->name] << "': {" << std::endl;
			
			std::cout << "    'blockDim': [" << header.blockDim.x << ", " << header.blockDim.y << ", " << header.blockDim.z << "],\n";
			std::cout << "    'gridDim': [" << header.gridDim.x << ", " << header.gridDim.y << ", " << header.gridDim.z << "],\n";
		
			kernelCount[k_it->name] ++;
			size_t totalDynamicCount = 0;
			size_t totalStaticCount = 0;
			double totalActivity = 0;

			// print out one bar per functional unit
			for (int n = 0; funcUnits[n] != InstructionTraceGenerator::FunctionalUnit_invalid; n++) {		

				size_t dynamicCount = 0;
				size_t staticCount = 0;
				double activity = 0;
//				int activeFU = 0;
			
				typedef trace::InstructionTraceGenerator::OpcodeCountMap OC;
				for (OC::iterator op_it = counter[funcUnits[n]].begin(); op_it != counter[funcUnits[n]].end();
					++op_it) {
				
					dynamicCount += op_it->second.dynamic_count;
					staticCount += op_it->second.static_count;
					
					if (op_it->second.dynamic_count) {
						activity += op_it->second.activity;
//						activeFU++;
					}
				}

				totalDynamicCount += dynamicCount;
				totalStaticCount += staticCount;
				totalActivity += activity;
			
//				if (activeFU) {
//					activity /= (double)activeFU;
//				}
				if(dynamicCount)
					activity /= (double)dynamicCount;
			
				// write to stdout
				std::cout << "    '" << trace::InstructionTraceGenerator::toString(funcUnits[n]) << "': ( " 
					<< dynamicCount << ", " << staticCount << ", " << activity << " )," << std::endl;
			}
			if(totalDynamicCount)
				totalActivity /= (double)totalDynamicCount;
			std::cout << "    '" << "total" << "': ( " 
				<< totalDynamicCount << ", " << totalStaticCount << ", " << totalActivity << " )," << std::endl;
		
			std::cout << "  },\n";
		}
		

	}
	std::cout << "}\n";

}
/*!
	Produces: histogram of dynamic instruction counts for each application
		x-axis: functional units
		y-axis: number of dynamic instructions
*/
void trace::InstructionTraceAnalyzer::instructions_by_application(bool pyList) const {

	// sequence of functional units
	trace::InstructionTraceGenerator::FunctionalUnit funcUnits[] = {
		InstructionTraceGenerator::Integer_arithmetic,	//! integer arithmetic
		InstructionTraceGenerator::Integer_logical,		//! itneger logical
		InstructionTraceGenerator::Integer_comparison,	//! comparison
		InstructionTraceGenerator::Float_single,				//! floating-point single-precision
		InstructionTraceGenerator::Float_double,				//! floating-point, double-precision
		InstructionTraceGenerator::Float_comparison,		//! floating-point comparison
		InstructionTraceGenerator::Memory_offchip,			//! off-chip: {global, local}
		InstructionTraceGenerator::Memory_onchip,			//! cached or scratchpad: {texture, shared, constant}
		InstructionTraceGenerator::Control,						//! control-flow instructions
		InstructionTraceGenerator::Parallelism,				//! parallelism: sync, reduction, vote
		InstructionTraceGenerator::Special,						//! transcendental and special functions
		InstructionTraceGenerator::Other,							//! not categorized
		InstructionTraceGenerator::FunctionalUnit_invalid
	};

	std::cout << "# InstructionTraceAnalyzer - by application\n#" << std::endl;

	std::cout << "\n";
	std::cout << "# Python object notation:\n";
	std::cout << "#  \n";
	std::cout << "#  map<program name, \n";
	std::cout << "#    map<functional unit, tuple<dynamic count, static count, activity> > >\n\n";
	
	std::cout << "applications = [\n";
	for( KernelMap::const_iterator vector = _kernels.begin(); 
		vector != _kernels.end(); ++vector ) {
		std::string program = vector->first;
		std::cout << "  '" << program << "',\n";
	}
	std::cout << "]\n\n";
	
	std::cout << "functional_units = [\n";
	for (int n = 0; funcUnits[n] != InstructionTraceGenerator::FunctionalUnit_invalid; n++) {
		std::cout << "  '" << trace::InstructionTraceGenerator::toString(funcUnits[n]) << "',\n";
	}
	std::cout << "]\n\n";
	
	std::cout << "results = {\n";

	for( KernelMap::const_iterator vector = _kernels.begin(); 
		vector != _kernels.end(); ++vector ) {

		std::set< std::string > visitedKernels;
		
		std::string program = vector->first;
		const KernelVector & kernels = vector->second;
		InstructionTraceGenerator::FunctionalUnitCountMap appCounter;

		// loop over the kernels
		for (KernelVector::const_iterator k_it = kernels.begin(); k_it != kernels.end(); ++k_it) {
			InstructionTraceGenerator::Header header;
			report("Open header file " << k_it->header.c_str());
			std::ifstream hstream( k_it->header.c_str() );
			boost::archive::text_iarchive harchive( hstream );
			InstructionTraceGenerator::FunctionalUnitCountMap counter;

			try {
				harchive >> header;
				assert(header.format == TraceGenerator::InstructionTraceFormat);
				harchive >> counter;
			}
			catch (boost::archive::archive_exception &exp) {
				std::cout << "### Exception: application '" << program << "' kernel '" << k_it->name << "' ";
				std::cout << " (exception code: " << toString(exp.code) << ") - stream: '" << k_it->header.c_str() << "'\n";
				continue;
			}

	
			// aggregate counts
			append(appCounter, counter, visitedKernels.find(k_it->name) == visitedKernels.end());
			visitedKernels.insert(k_it->name);
		}

		std::stringstream ssDynamic, ssStatic, ssDynamicComments, ssStaticComments;
		
		if (pyList) {
			ssDynamic << " '" << program << "': [";
			ssStatic <<  " '" << program << "': [";
		}
		else {
			// print the program name
			std::cout << "  '" << program << "': {" << std::endl;
		}

		size_t totalDynamicCount = 0;
		size_t totalStaticCount = 0;
		double totalActiveCount = 0;

		// print out one bar per functional unit
		for (int n = 0; funcUnits[n] != InstructionTraceGenerator::FunctionalUnit_invalid; n++) {		

			size_t dynamicCount = 0;
			size_t staticCount = 0;
			double activity = 0;
			int activeFU = 0;
			
			typedef trace::InstructionTraceGenerator::OpcodeCountMap OC;
			for (OC::iterator op_it = appCounter[funcUnits[n]].begin(); op_it != appCounter[funcUnits[n]].end();
				++op_it) {
				
				dynamicCount += op_it->second.dynamic_count;
				staticCount += op_it->second.static_count;
				if (op_it->second.dynamic_count) {
					activity += op_it->second.activity;
					activeFU ++;
				}

			}

			totalDynamicCount += dynamicCount;
			totalStaticCount += staticCount;
			totalActiveCount += activity;

			if(dynamicCount != 0)
				activity /= (double)dynamicCount;
			
			if (pyList) {
				ssDynamic << (n ? ", " : " ") << dynamicCount;
				ssStatic << (n ? ", " : " ") << staticCount;
				ssDynamicComments << " " << trace::InstructionTraceGenerator::toString(funcUnits[n]);
				ssStaticComments << " " << trace::InstructionTraceGenerator::toString(funcUnits[n]);
			}
			else {
				// write to stdout
				std::cout << "    '" << trace::InstructionTraceGenerator::toString(funcUnits[n]) << "': ( " 
					<< dynamicCount << ", " << staticCount << ", " << activity << " )," << std::endl;
			}
		}

		if (pyList) {
			std::cout << "# " << program << " sequence: " << ssDynamicComments.str() << "\n";
			std::cout << ssDynamic.str() << " ]  # dynamic\n";
			std::cout << ssStatic.str() << " ]  # static\n";
		}

		if(totalDynamicCount != 0)
			totalActiveCount /= (double)totalDynamicCount; 
		std::cout << "    '" << "total" << "': ( " 
			<< totalDynamicCount << ", " << totalStaticCount << ", " << totalActiveCount << " )," << std::endl;

		std::cout << "  },\n";
	}
	std::cout << "}\n";
}