// Removes cycles from the reference graph by walking upward from `address`.
//
// Each address is processed at most once per `visited` set. For every parent
// of `address`, the edge is removed from both `parentMap` and `childMap` when
// the parent is `address` itself (self reference) or when the parent's
// allocation uid is greater than the uid of `address`. Parents whose edge is
// kept are processed recursively in the same way.
void cleanUpGroupingRec(UINT address, ParentRefMap& parentMap, ChildRefMap& childMap, MapAddressToAllocation& addressesToAllocations, AddressSet& visited)
{
   if (visited.count(address) != 0)
   {
      return;
   }
   visited.insert(address);

   auto ownUid = addressesToAllocations[address].uid;

   // Work on a snapshot: the live parent set of `address` is edited below.
   auto parentSnapshot = parentMap[address];
   for (auto candidate : parentSnapshot)
   {
      auto candidateUid = addressesToAllocations[candidate].uid;

      const bool isSelfReference = (address == candidate);
      const bool keepEdge = !(candidateUid > ownUid) && !isSelfReference;

      if (keepEdge)
      {
         cleanUpGroupingRec(candidate, parentMap, childMap, addressesToAllocations, visited);
         continue;
      }

      // Drop the edge in both directions so the two maps stay in agreement.
      parentMap[address].erase(candidate);
      childMap[candidate].erase(address);
   }
}
// Scans a snapshot of one tracked allocation for values equal to other tracked
// addresses and records the resulting parent/child references.
//
//   ta        - allocation to scan; ta.mem and ta.bytes give the location and
//               size that are read from the process global::hProcess.
//   addresses - set of tracked addresses to look for.
//   parentMap - for each match, ta.mem is added to the parents of the match.
//   childMap  - for each match, the match is added to the children of ta.mem.
//
// The memory is copied with ReadProcessMemory and examined one UINT-sized word
// at a time from the start of the block; trailing bytes that do not fill a
// whole UINT are ignored. Nothing is recorded when the read fails.
// NOTE(review): addresses are handled as UINT, so this presumably targets
// 32-bit processes - confirm before relying on it for 64-bit targets.
void checkMemoryGrouping(const MemoryTracker::TrackedAllocation& ta, AddressSet& addresses, ParentRefMap& parentMap, ChildRefMap& childMap)
{
   const SIZE_T byteCount = ta.bytes;
   if (byteCount == 0)
   {
      return; // nothing to scan
   }

   // Size the buffer in UINT elements, rounded up so a byteCount-byte read
   // always fits. Allocating byteCount *elements* would reserve sizeof(UINT)
   // times more memory than the read can ever fill.
   const SIZE_T wordCapacity = (byteCount + sizeof(UINT) - 1) / sizeof(UINT);
   UINT * buf = new UINT[wordCapacity](); // () zero-initialises the buffer

   SIZE_T numberOfBytesRead = 0;

   if (ReadProcessMemory(global::hProcess, ta.mem, buf, byteCount, &numberOfBytesRead))
   {
      // Only look at complete words that were actually transferred.
      const SIZE_T usableBytes = (numberOfBytesRead < byteCount) ? numberOfBytesRead : byteCount;
      const UINT * const endMemory = buf + (usableBytes / sizeof(UINT));

      for (const UINT * memory = buf; memory < endMemory; ++memory)
      {
         auto it = addresses.find(*memory);
         if (it != addresses.end())
         {
            // The scanned block holds a value equal to a tracked address:
            // record the scanned block as parent and the match as child.
            parentMap[*it].insert((UINT)ta.mem);
            childMap[(UINT)ta.mem].insert(*it);
         }
      }
   }

   delete[] buf;
}
// Prints one aggregated allocation group at indentation `level`, followed by
// the aggregated groups of its children one level deeper.
//
// The header line reports rc.totalSize, rc.instances and `sumResponsible`;
// every entry of callstackPtr->callstackVerbose is then printed prefixed
// with "|". `visited` is taken by value, so the address added here is seen by
// the nested printAggregateGroupings call but not by the caller. Nothing is
// printed when `address` is already in `visited`.
void printSingleAggregateGrouping(std::ostream& out, int level, UINT address, TrackedCallstack::ptr callstackPtr, ReferenceCount& rc, MapAddressToAllocation& addressToAllocationMap, ChildRefMap& childMap, AddressSet visited, UINT sumResponsible)
{
   if (visited.count(address) != 0)
   {
      return;
   }
   visited.insert(address);

   // The allocation record itself is not printed here; the lookup is retained
   // only so the map is touched exactly as before.
   (void)addressToAllocationMap[address];

   indent(out, level)
      << "Allocations " << dec << rc.totalSize
      << " total bytes for " << rc.instances
      << " instances; Responsible for " << sumResponsible
      << " nested bytes. Callstack:" << endl;

   for (const auto& frame : callstackPtr->callstackVerbose)
   {
      indent(out, level) << "|" << frame << endl;
   }

   printAggregateGroupings(out, level + 1, childMap[address], addressToAllocationMap, childMap, visited);
}
// Returns the byte size of the allocation at `address` plus the sizes of all
// allocations reachable from it through `childMap`.
//
// `visitedInThisSearch` ensures each address contributes at most once per
// search; an address that is already in the set contributes 0.
UINT findResponsibleBytes(UINT address, MapAddressToAllocation& addressToAllocationMap, ChildRefMap& childMap, AddressSet& visitedInThisSearch)
{
   if (visitedInThisSearch.count(address) != 0)
   {
      return 0;
   }
   visitedInThisSearch.insert(address);

   // Start with this allocation's own size, then add each child's subtree.
   UINT total = addressToAllocationMap[address].bytes;
   for (UINT child : childMap[address])
   {
      total += findResponsibleBytes(child, addressToAllocationMap, childMap, visitedInThisSearch);
   }

   return total;
}
void printGroupings(std::ostream& out, int level, UINT address, MapAddressToAllocation& addressToAllocationMap, ChildRefMap& childMap, AddressSet& visited)
{
   if (visited.find(address) != visited.end()) return;
   visited.insert(address);
   auto& alloc = addressToAllocationMap[address];
   auto p = alloc.callStack;

   indent(out, level) << "Allocation at " << hex << address << " " << dec << alloc.bytes << " bytes; Callstack:" << endl;
   for (auto s: p->callstackVerbose)
   {
      indent(out, level) << s << endl;
   }

   for (auto a: childMap[address])
   {
      printGroupings(out, level + 1, a, addressToAllocationMap, childMap, visited);
   }
}
// Recursively walks from `address` towards its ancestors in `parentMap` and
// collects every reached address that has no parents into `outAddresses`.
//
// `visited` records the addresses already walked so that each parent is
// followed at most once.
void walkUpGrouping(UINT address, ParentRefMap& parentMap, AddressSet& outAddresses, AddressSet& visited)
{
   visited.insert(address);

   // Base case: no parents, so this address is a root of the grouping.
   if (parentMap[address].empty())
   {
      outAddresses.insert(address);
      return;
   }

   for (UINT parent : parentMap[address])
   {
      if (visited.count(parent) != 0)
      {
         continue; // already walked through this parent
      }
      walkUpGrouping(parent, parentMap, outAddresses, visited);
   }
}
// Computes the memory extent of a kernel: the sum of its const, parameter,
// total shared and local memory sizes, plus the size of every distinct device
// allocation that one of the kernel's texture references points into.
//
// The scan of kernel parameters for pointers is compiled out (#if 0).
static ir::PTXU64 extent(const executive::ExecutableKernel& kernel) {
	typedef std::unordered_set<ir::PTXU64> AddressSet;
	report("Computing extent for kernel " << kernel.name);

	// Candidate addresses to resolve to device allocations.
	AddressSet pointers;
	// Base pointers of allocations that were already added to the total.
	AddressSet countedAllocations;

	ir::PTXU64 totalBytes = kernel.constMemorySize();
	totalBytes += kernel.parameterMemorySize();
	totalBytes += kernel.totalSharedMemorySize();
	totalBytes += kernel.localMemorySize();

	executive::ExecutableKernel::TextureVector textures = kernel.textureReferences();

	for (const auto& texture : textures) {
		report(" Checking texture mapped address " << texture->data);
		pointers.insert((ir::PTXU64)texture->data);
	}
#if 0
	for (ir::Kernel::ParameterVector::const_iterator 
		parameter = kernel.parameters.begin();
		parameter != kernel.parameters.end(); ++parameter) {
		ir::PTXU64 address = 0;
		
		for (ir::Parameter::ValueVector::const_iterator 
			element = parameter->arrayValues.begin();
			element != parameter->arrayValues.end(); ++element) {
			switch (parameter->type) {
				case ir::PTXOperand::b8:
				case ir::PTXOperand::s8:
				case ir::PTXOperand::u8:
				{
					address <<= 8;
					address |= element->val_u8;
					break;
				}
				case ir::PTXOperand::b16:
				case ir::PTXOperand::s16:
				case ir::PTXOperand::u16:
				{
					address <<= 16;
					address |= element->val_u16;
					break;
				}
				case ir::PTXOperand::b32:
				case ir::PTXOperand::s32:
				case ir::PTXOperand::u32:
				{
					address <<= 32;
					address |= element->val_u32;
					break;
				}
				case ir::PTXOperand::b64:
				case ir::PTXOperand::s64:
				case ir::PTXOperand::u64:
				{
					address = element->val_u64;
					break;
				}
				default: break;
			}
			
			report(" Checking address " << (void*)address);
			pointers.insert(address);
		}
	}
#endif

	for (const ir::PTXU64 pointer : pointers) {
		executive::Device::MemoryAllocation* allocation =
			kernel.device->getMemoryAllocation((void*)pointer);
		if (allocation == nullptr) {
			continue;	// address does not belong to a device allocation
		}

		// Count each allocation once, even if several pointers land in it.
		const bool firstEncounter =
			countedAllocations.insert((ir::PTXU64)allocation->pointer()).second;
		if (firstEncounter) {
			totalBytes += allocation->size();
		}
	}

	return totalBytes;

}