Beispiel #1
0
/**
 * Activity-buffer completion handler: print every record CUPTI yields for
 * 'buffer'/'validSize', report how many records were dropped for
 * (ctx, streamId), then print the stall summary from the file-level
 * stall_name[] / val[] tables.
 *
 * 'size' is not used by this function.
 *
 * NOTE(review): 'buffer' is neither released nor handed back here --
 * confirm ownership against the matching buffer-request callback.
 */
static void CUPTIAPI
bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
{
    CUpti_Activity *activity = NULL;

    for (;;) {
        CUptiResult stepRc = cuptiActivityGetNextRecord(buffer, validSize, &activity);

        // MAX_LIMIT_REACHED is treated as "no more records" and ends the walk
        if (stepRc == CUPTI_ERROR_MAX_LIMIT_REACHED) {
            break;
        }

        if (stepRc == CUPTI_SUCCESS) {
            printActivity(activity);
        } else {
            // any other status goes through the common error macro
            CUPTI_CALL(stepRc);
        }
    }

    size_t numDropped;
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &numDropped));
    if (numDropped != 0) {
        printf("Dropped %u activity records\n", (unsigned int)numDropped);
    }

    printf("\n\n\n\n\n\n");
    printf("************* STALL SUMMARY ********************\n");

    // only the first 12 slots are inspected; unnamed slots are skipped
    int slot = 0;
    while (slot < 12) {
        if (stall_name[slot] != NULL) {
            printf("%s = %d \n", stall_name[slot], val[slot]);
        }
        ++slot;
    }

    printf("*************************************************\n\n");
}
Beispiel #2
0
/**
 * Synchronize-domain callback helper. On every call the global queue's top
 * buffer is dumped if full; then, depending on 'cbid', the same is done for
 * the synchronized context's queue or the synchronized stream's queue.
 * Any buffer that was dumped is put straight back on the queue it came from.
 */
static void
handleSync(CUpti_CallbackId cbid, const CUpti_SynchronizeData *syncData)
{
  // global queue first, regardless of which sync event fired
  uint8_t *dumped = dumpIfFull(NULL, 0);
  if (dumped != NULL) {
    CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, dumped, BUF_SIZE));
  }

  switch (cbid) {
  case CUPTI_CBID_SYNCHRONIZE_CONTEXT_SYNCHRONIZED:
    // context sync: service the context-level queue (stream id 0)
    dumped = dumpIfFull(syncData->context, 0);
    if (dumped != NULL) {
      CUPTI_CALL(cuptiActivityEnqueueBuffer(syncData->context, 0, dumped, BUF_SIZE));
    }
    break;

  case CUPTI_CBID_SYNCHRONIZE_STREAM_SYNCHRONIZED: {
    // stream sync: look up the stream's id, then service that stream's queue
    uint32_t syncedStreamId;
    CUPTI_CALL(cuptiGetStreamId(syncData->context, syncData->stream, &syncedStreamId));
    dumped = dumpIfFull(syncData->context, syncedStreamId);
    if (dumped != NULL) {
      CUPTI_CALL(cuptiActivityEnqueueBuffer(syncData->context, syncedStreamId, dumped, BUF_SIZE));
    }
    break;
  }

  default:
    // other synchronize callbacks need no per-queue work
    break;
  }
}
Beispiel #3
0
/**
 * Resource-domain callback helper. Creation events get two fresh buffers
 * queued for the new context or stream; destroy-starting events dump that
 * queue repeatedly until dump() reports nothing left.
 */
static void
handleResource(CUpti_CallbackId cbid, const CUpti_ResourceData *resourceData)
{
  switch (cbid) {
  case CUPTI_CBID_RESOURCE_CONTEXT_CREATED:
    // new context: give its queue (stream id 0) two buffers
    queueNewBuffer(resourceData->context, 0);
    queueNewBuffer(resourceData->context, 0);
    break;

  case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING:
    // context going away: drain everything still queued for it
    for (;;) {
      if (dump(resourceData->context, 0) == NULL) {
        break;
      }
    }
    break;

  case CUPTI_CBID_RESOURCE_STREAM_CREATED: {
    // new stream: resolve its id and give that queue two buffers
    uint32_t createdStreamId;
    CUPTI_CALL(cuptiGetStreamId(resourceData->context, resourceData->resourceHandle.stream, &createdStreamId));
    queueNewBuffer(resourceData->context, createdStreamId);
    queueNewBuffer(resourceData->context, createdStreamId);
    break;
  }

  case CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING: {
    // stream going away: resolve its id and drain that queue
    uint32_t dyingStreamId;
    CUPTI_CALL(cuptiGetStreamId(resourceData->context, resourceData->resourceHandle.stream, &dyingStreamId));
    for (;;) {
      if (dump(resourceData->context, dyingStreamId) == NULL) {
        break;
      }
    }
    break;
  }

  default:
    // every other resource event is ignored
    break;
  }
}
Beispiel #4
0
/**
 * Set up activity tracing: seed the global buffer queue, enable the
 * activity kinds of interest, subscribe traceCallback to the resource and
 * synchronize callback domains, and record the starting timestamp in
 * startTimestamp.
 */
void
initTrace()
{
  CUpti_SubscriberHandle traceSubscriber;

  // two buffers for the global queue
  queueNewBuffer(NULL, 0);
  queueNewBuffer(NULL, 0);

  // The device activity record is produced when CUDA initializes, so this
  // kind has to be switched on before cuInit() or any CUDA runtime call.
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));

  // The remaining kinds are enabled without checking the result.
  // (DRIVER and RUNTIME kinds are intentionally left off.)
  (void)cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT);
  (void)cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY);
  (void)cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET);
  (void)cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL);

  // Route resource and synchronize callbacks to traceCallback.
  // (The runtime-API callback domain is not enabled.)
  CUPTI_CALL(cuptiSubscribe(&traceSubscriber, (CUpti_CallbackFunc)traceCallback, NULL));
  CUPTI_CALL(cuptiEnableDomain(1, traceSubscriber, CUPTI_CB_DOMAIN_RESOURCE));
  CUPTI_CALL(cuptiEnableDomain(1, traceSubscriber, CUPTI_CB_DOMAIN_SYNCHRONIZE));

  CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
}
Beispiel #5
0
/**
 * Dequeue the top buffer of the queue identified by 'context' and
 * 'streamId', print every activity record in it, report the number of
 * dropped records for that queue, and hand the buffer back to the caller.
 * Returns NULL when the queue has no buffer to dequeue.
 */
static uint8_t *
dump(CUcontext context, uint32_t streamId)
{
  uint8_t *topBuffer = NULL;
  size_t validBytes;
  CUptiResult dumpRc = cuptiActivityDequeueBuffer(context, streamId, &topBuffer, &validBytes);

  // an empty queue is the normal "nothing to dump" outcome
  if (dumpRc == CUPTI_ERROR_QUEUE_EMPTY) {
    return NULL;
  }
  CUPTI_CALL(dumpRc);

  // banner names the queue: stream-level, context-level, or global
  if (context != NULL && streamId != 0) {
    printf("[CUPTI] Starting dump for context %p, stream %u\n", context, streamId);
  } else if (context != NULL) {
    printf("[CUPTI] Starting dump for context %p\n", context);
  } else {
    printf("[CUPTI] Starting dump for global\n");
  }

  CUpti_Activity *activity = NULL;
  for (;;) {
    dumpRc = cuptiActivityGetNextRecord(topBuffer, validBytes, &activity);

    // MAX_LIMIT_REACHED is treated as the end of the buffer's records
    if (dumpRc == CUPTI_ERROR_MAX_LIMIT_REACHED) {
      printf ("[CUPTI] CUPTI_ERROR_MAX_LIMIT_REACHED\n");
      break;
    }

    if (dumpRc == CUPTI_SUCCESS) {
      printActivity(activity);
    } else {
      CUPTI_CALL(dumpRc);
    }
  }

  // tell the user if this queue lost any records
  size_t numDropped;
  CUPTI_CALL(cuptiActivityGetNumDroppedRecords(context, streamId, &numDropped));
  if (numDropped != 0) {
    printf("Dropped %u activity records\n", (unsigned int)numDropped);
  }

  if (context != NULL && streamId != 0) {
    printf("[CUPTI] Finished dump for context %p, stream %u\n", context, streamId);
  } else if (context != NULL) {
    printf("[CUPTI] Finished dump for context %p \n", context);
  } else {
    printf("[CUPTI] Finished dump for global\n");
  }

  return topBuffer;
}
Beispiel #6
0
/**
 * Allocate a new BUF_SIZE buffer and add it to the queue specified by
 * 'context' and 'streamId'.
 *
 * BUF_SIZE + ALIGN_SIZE bytes are requested so that ALIGN_BUFFER has room
 * to adjust the start address. If the allocation fails, an error is
 * printed and the process exits: passing a pointer derived from NULL to
 * CUPTI would let it write through an invalid address.
 *
 * NOTE(review): only the aligned pointer is handed to CUPTI and the raw
 * malloc() result is not kept, so the buffer cannot be free()d later if
 * ALIGN_BUFFER moved the address -- confirm this is acceptable.
 */
static void
queueNewBuffer(CUcontext context, uint32_t streamId)
{
  size_t size = BUF_SIZE;
  uint8_t *buffer = (uint8_t *)malloc(size + ALIGN_SIZE);

  if (buffer == NULL) {
    fprintf(stderr, "[CUPTI] out of memory allocating activity buffer\n");
    exit(EXIT_FAILURE);
  }

  CUPTI_CALL(cuptiActivityEnqueueBuffer(context, streamId, ALIGN_BUFFER(buffer, ALIGN_SIZE), size));
}
Beispiel #7
0
/**
 * Query the top buffer of the queue identified by 'context' and
 * 'streamId'. When the query reports CUPTI_ERROR_MAX_LIMIT_REACHED (the
 * buffer is full) the buffer is dumped and returned; in every other case
 * the result is NULL.
 */
static uint8_t *
dumpIfFull(CUcontext context, uint32_t streamId)
{
  size_t validBytes;
  CUptiResult queryRc = cuptiActivityQueryBuffer(context, streamId, &validBytes);

  switch (queryRc) {
  case CUPTI_ERROR_MAX_LIMIT_REACHED:
    // full buffer: dump it and give it to the caller
    return dump(context, streamId);

  case CUPTI_SUCCESS:
  case CUPTI_ERROR_QUEUE_EMPTY:
    // not full, or nothing queued: nothing to do
    break;

  default:
    // unexpected status: report through the common error macro
    CUPTI_CALL(queryRc);
    break;
  }

  return NULL;
}
/**
 * Arm CUPTI callbacks for metric collection.
 *
 * Does nothing when the module is not initialised or no metrics are
 * configured. Otherwise, if no subscription exists yet, subscribes
 * cupti_callback and enables the context-created / context-destroy
 * resource callbacks plus the driver-API callback(s) used to hook kernel
 * launches (all driver calls when LOG_ALL_DRIVER_CALLS is defined).
 *
 * Returns 0 on every path. The original fell off the end of a non-void
 * function, which is undefined behaviour if the caller reads the result;
 * 0 is used because it is the only value the function ever returned
 * explicitly.
 * NOTE(review): if callers need to distinguish "enabled" from "skipped",
 * pick a distinct success value and update them together.
 */
int cupti_metrics_enable()
{
    if (!initialised || !metricCount) {
        return 0;
    }

    // We cannot initialise anything here - we only hook CUDA context
    // creation in order to do the actual work later.
    if (subscriber == NULL) {

        // Create subscription
        CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)cupti_callback,
                                  NULL));

        // Bind callbacks
        CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE,
                                       CUPTI_CBID_RESOURCE_CONTEXT_CREATED));
        CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE,
                                       CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING));

        // Hook kernel launch. This feels hacky, but is the approach
        // callback_metric.cu takes.
#ifndef LOG_ALL_DRIVER_CALLS
        CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
                                       CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
#else
        int i;
        for (i = CUPTI_DRIVER_TRACE_CBID_cuInit; i < CUPTI_DRIVER_TRACE_CBID_SIZE; i++) {
            CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, i));
        }
#endif
    }

    return 0;
}
Beispiel #9
0
/**
 * Library constructor: create a CUDA context on the device selected by the
 * GPU_DEVICE_ID environment variable, register the activity buffer
 * callbacks, enable PC-sampling and kernel activity records, subscribe
 * traceCallback to the resource callback domain, and configure the
 * PC-sampling period from the PC_SAMPLING_RATE environment variable.
 *
 * An unset environment variable is read as 0 -- the same value atoi()
 * yields for a non-numeric string -- instead of passing NULL to atoi(),
 * which is undefined behaviour.
 *
 * NOTE(review): the results of cuInit() and cuCtxCreate() are not checked
 * here; cuCtx starts as NULL so a failed context creation does not leave
 * an indeterminate handle to be passed on.
 */
__attribute__((constructor)) void
initTrace()
{
	// get the arguments from the environment variables
	const char *deviceEnv = getenv("GPU_DEVICE_ID");
	int deviceId = (deviceEnv != NULL) ? atoi(deviceEnv) : 0;

	CUcontext cuCtx = NULL;
	cuInit(0);
	cuCtxCreate(&cuCtx, 0, deviceId);

	CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
	CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
	CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));

	CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL));
	CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE));

	// zero the whole config so no field reaches CUPTI uninitialised
	CUpti_ActivityPCSamplingConfig config = {0};
	const char *rateEnv = getenv("PC_SAMPLING_RATE");
	int sampRate = (rateEnv != NULL) ? atoi(rateEnv) : 0;
	config.samplingPeriod = sampRate;
	CUPTI_CALL(cuptiActivityConfigurePCSampling(cuCtx, &config));
}
Beispiel #10
0
/**
 * Library destructor: ask CUPTI to flush all outstanding activity data,
 * passing 0 as the flag argument.
 */
__attribute__((destructor)) void
finiTrace()
{
	const uint32_t flushFlags = 0;

	CUPTI_CALL(cuptiActivityFlushAll(flushFlags));
}