void bw(uint64_t size) { uint8_t *H, *D; int i; printf("%"PRIu64" ", size); cudaInit(); printf("0 "); // reg func H = (uint8_t*)malloc(sizeof(uint8_t)*size); time_begin(); cudaMalloc((void**)&D, sizeof(uint8_t)*size); printf("%u ", time_end()); for(i=0; i<size; i++) { H[i]=i%255; } time_begin(); cudaMemcpy(D, H, size*sizeof(uint8_t), cudaMemcpyHostToDevice); printf("%u ", time_end()); printf("0 "); // exec kernel for(i=0; i<size; i++) { H[i]=0; } time_begin(); cudaMemcpy(H, D, size*sizeof(uint8_t), cudaMemcpyDeviceToHost); printf("%u ", time_end()); for(i=0; i<size; i++) { if(H[i]!=i%255) printf("error %d\n", i); } free(H); time_begin(); cudaFree(D); printf("%u ", time_end()); cudaFini(); printf("\n"); }
/*------------------------------------------------------------------*/ bool AzRgforest::growForest() { clock_t b_time; time_begin(&b_time); /*--- find the best split ---*/ AzTrTsplit best_split; searchBestSplit(&best_split); if (shouldExit(&best_split)) { /* exit if no more split */ return true; /* exit */ } /*--- split the node ---*/ double w_inc; int leaf_nx[2] = {-1,-1}; const AzRgfTree *tree = splitNode(&best_split, &w_inc, leaf_nx); if (lmax_timer.reachedMax(l_num, "AzRgforest: #leaf", out)) { return true; /* #leaf reached max; exit */ } /*--- update target ---*/ updateTarget(tree, leaf_nx, w_inc); time_end(b_time, &search_time); return false; /* don't exit */ }
cudaError_t cudaConfigureCall( dim3 gridDim, dim3 blockDim, size_t sharedMem, cudaStream_t stream) { pfunc(); time_begin(); ptrace("gridDim= %d %d %d\n", gridDim.x, gridDim.y, gridDim.z); ptrace("blockDim= %d %d %d\n", blockDim.x, blockDim.y, blockDim.z); ptrace("sharedMem= %lu\n", sharedMem); ptrace("stream= %p\n", (void*)stream); //ptrace("size= %lu\n", sizeof(cudaStream_t)); cudaKernelConf[0] = gridDim.x; cudaKernelConf[1] = gridDim.y; cudaKernelConf[2] = gridDim.z; cudaKernelConf[3] = blockDim.x; cudaKernelConf[4] = blockDim.y; cudaKernelConf[5] = blockDim.z; cudaKernelConf[6] = sharedMem; cudaKernelConf[7] = (stream==NULL)?(uint64_t)-1:(uint64_t)stream; memset(cudaKernelPara, 0, cudaKernelParaMaxSize); cudaParaSize = sizeof(uint32_t); time_end(t_ConfigCall); return cudaSuccess; }
/*------------------------------------------------------------------*/ void AzRgforest::optimize_resetTarget() { clock_t b_time; time_begin(&b_time); int t_num = ens->size(); AzBytArr s("Calling optimizer with "); s.cn(t_num); s.c(" trees and "); s.cn(l_num); s.c(" leaves"); AzTimeLog::print(s, out); opt->update(data, ens, &v_p); resetTarget(); int tx; for (tx = 0; tx < t_num; ++tx) { ens->tree_u(tx)->removeSplitAssessment(); /* since weights changed */ } isOpt = true; time_end(b_time, &opt_time); }
int cc_sweep_phase(char *buffer, int bufsize, struct cc **tokens) { struct cc **pp = tokens; int i, n; #ifdef STATS int nn, ii; #endif #ifdef STATS if (verbose >= 0) time_begin(); if (verbose > 0) printf("Sweep:"); #endif cc_sweep0(buffer, bufsize, tt.tt_token_min - 1); #ifdef STATS ntoken_stat = 0; nn = 0; ii = 0; #endif for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) { #ifdef STATS if (verbose > 0) { if (ii > 7) { printf("\n "); ii = 0; } ii++; printf(" (%d", i); (void) fflush(stdout); } #endif n = cc_sweep(buffer, bufsize, pp, i); pp += n; #ifdef STATS if (verbose > 0) { if (--n > 0) { printf(" %d", n); nn += n; } putchar(')'); } #endif } qinsertq(&cc_q1b, &cc_q1a); #ifdef STATS if (verbose > 0) printf("\n %d tokens, %d candidates\n", ntoken_stat, nn); if (verbose >= 0) time_end(); #endif return pp - tokens; }
void __cudaUnregisterFatBinary(void **fatCubinHandle) { pfunc(); time_begin(); ptrace("fatCubinHandle= %p, value= %p\n", fatCubinHandle, *fatCubinHandle); send_cmd_to_device( VIRTQC_cudaUnregisterFatBinary, NULL); free(fatCubinHandle); time_end(t_UnregFatbin); close_device(); }
void __cudaRegisterFunction( void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize ) { VirtioQCArg arg; computeFatBinaryFormat_t fatBinHeader; pfunc(); time_begin(); ptrace("fatCubinHandle= %p, value= %p\n", fatCubinHandle, *fatCubinHandle); ptrace("hostFun= %s (%p)\n", hostFun, hostFun); ptrace("deviceFun= %s (%p)\n", deviceFun, deviceFun); ptrace("deviceName= %s\n", deviceName); ptrace("thread_limit= %d\n", thread_limit); if(tid) ptrace("tid= %u %u %u\n", tid->x, tid->y, tid->z); else ptrace("tid is NULL\n"); if(bid) ptrace("bid= %u %u %u\n", bid->x, bid->y, bid->z); else ptrace("bid is NULL\n"); if(bDim)ptrace("bDim= %u %u %u\n", bDim->x, bDim->y, bDim->z); else ptrace("bDim is NULL\n"); if(gDim)ptrace("gDim= %u %u %u\n", gDim->x, gDim->y, gDim->z); else ptrace("gDim is NULL\n"); if(wSize)ptrace("wSize= %d\n", *wSize); else ptrace("wSize is NULL\n"); memset(&arg, 0, sizeof(VirtioQCArg)); fatBinHeader = (computeFatBinaryFormat_t)(*fatCubinHandle); ptr( arg.pA , fatBinHeader, fatBinHeader->fatSize); ptr( arg.pB , deviceName , strlen(deviceName)+1 ); arg.flag = (uint32_t)(uint64_t)hostFun; ptrace("pA= %p, pASize= %u, pB= %p, pBSize= %u\n", (void*)arg.pA, arg.pASize, (void*)arg.pB, arg.pBSize); send_cmd_to_device( VIRTQC_cudaRegisterFunction, &arg); time_end(t_RegFunc); }
cudaError_t cudaDeviceReset(void) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); send_cmd_to_device( VIRTQC_cudaDeviceReset, &arg); time_end(t_DevReset); return (cudaError_t)arg.cmd; }
cudaError_t cudaGetLastError(void) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); send_cmd_to_device( VIRTQC_cudaGetLastError, &arg); time_end(t_GetLastError); return (cudaError_t)arg.cmd; }
cudaError_t cudaSetDevice(int device) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); ptr( arg.pA, device, 0); send_cmd_to_device( VIRTQC_cudaSetDevice, &arg); time_end(t_SetDev); return (cudaError_t)arg.cmd; }
cudaError_t cudaGetDeviceCount(int *count) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); send_cmd_to_device( VIRTQC_cudaGetDeviceCount, &arg); *count = (int)arg.pA; time_end(t_GetDevCount); return (cudaError_t)arg.cmd; }
cudaError_t cudaDriverGetVersion(int *driverVersion) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); send_cmd_to_device( VIRTQC_cudaDriverGetVersion, &arg); *driverVersion = (int)arg.pA; time_end(t_DriverGetVersion); return (cudaError_t)arg.cmd; }
cudaError_t cudaEventDestroy(cudaEvent_t event) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); ptr( arg.pA, event, 0); send_cmd_to_device( VIRTQC_cudaEventDestroy, &arg); time_end(t_EventDestroy); return (cudaError_t)arg.cmd; }
cudaError_t cudaRuntimeGetVersion(int *runtimeVersion) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); send_cmd_to_device( VIRTQC_cudaRuntimeGetVersion, &arg); *runtimeVersion = (uint64_t)arg.pA; time_end(t_RuntimeGetVersion); return (cudaError_t)arg.cmd; }
void cc_compress_phase1(struct cc **output, struct cc **tokens, int ntoken, int flag) { struct cc **pp; #ifdef STATS int i = 0; int nt = 0, cc = 0, nc = 0; #endif #ifdef STATS if (verbose >= 0) time_begin(); if (verbose > 0) printf("Compress:"); #endif pp = tokens; while (pp < tokens + ntoken) { #ifdef STATS if (verbose > 0) { ntoken_stat = 0; ccount_stat = 0; ncover_stat = 0; if (i > 2) { printf("\n "); i = 0; } i++; printf(" (%d", (*pp)->length); (void) fflush(stdout); } #endif pp += cc_compress(output, pp, flag); #ifdef STATS if (verbose > 0) { printf(" %dt %du %dc)", ntoken_stat, ccount_stat, ncover_stat); nt += ntoken_stat; cc += ccount_stat; nc += ncover_stat; } #endif } #ifdef STATS if (verbose > 0) printf("\n total: (%dt %du %dc)\n", nt, cc, nc); if (verbose >= 0) time_end(); #endif }
cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); ptr( arg.pA, prop, sizeof(struct cudaDeviceProp)); ptr( arg.pB, device, 0); send_cmd_to_device( VIRTQC_cudaGetDeviceProperties, &arg); time_end(t_GetDevProp); return (cudaError_t)arg.cmd; }
cudaError_t cudaEventCreate(cudaEvent_t *event) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); send_cmd_to_device( VIRTQC_cudaEventCreate, &arg); *event = (void*)arg.pA; time_end(t_EventCreate); return (cudaError_t)arg.cmd; }
cudaError_t cudaFree(void* devPtr) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); ptr( arg.pA, devPtr, 0); send_cmd_to_device( VIRTQC_cudaFree, &arg); ptrace("devPtr= %p\n", (void*)arg.pA); time_end(t_Free); return (cudaError_t)arg.cmd; }
cudaError_t cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); ptr( arg.pA, start, 0); ptr( arg.pB, end, 0); send_cmd_to_device( VIRTQC_cudaEventElapsedTime, &arg); memcpy(ms, &arg.flag, sizeof(float)); time_end(t_EventElapsedTime); return (cudaError_t)arg.cmd; }
cudaError_t cudaLaunch(const void *func) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); // ptr( arg.pA, cudaKernelConf, 7*sizeof(uint32_t)); ptr( arg.pA, cudaKernelConf, 8*sizeof(uint64_t)); ptr( arg.pB, cudaKernelPara, cudaParaSize); arg.flag = (uint32_t)(uint64_t)func; send_cmd_to_device( VIRTQC_cudaLaunch, &arg); time_end(t_Launch); return cudaSuccess; }
cudaError_t cudaMalloc(void** devPtr, size_t size) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); ptr( arg.pA, 0, 0); arg.flag = size; send_cmd_to_device( VIRTQC_cudaMalloc, &arg); *devPtr = (void*)arg.pA; ptrace("devPtr= %p\n", (void*)arg.pA); time_end(t_Malloc); return (cudaError_t)arg.cmd; }
cudaError_t cudaMemcpy( void* dst, const void* src, size_t count, enum cudaMemcpyKind kind) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); ptrace("dst= %p , src= %p ,size= %lu\n", (void*)dst, (void*)src, count); if( kind == cudaMemcpyHostToDevice) { ptr( arg.pA, dst, 0); ptr( arg.pB, src, count); arg.flag = 1; } else if( kind == cudaMemcpyDeviceToHost ) { ptr( arg.pA, dst, count); ptr( arg.pB, src, 0); arg.flag = 2; } else if( kind == cudaMemcpyDeviceToDevice ) { ptr( arg.pA, dst, 0); ptr( arg.pB, src, count); arg.flag = 3; } else { error("Not impletment cudaMemcpyKind %d\n", kind); return cudaErrorInvalidValue; } send_cmd_to_device( VIRTQC_cudaMemcpy, &arg); if(kind==1){ time_end(t_MemcpyH2D); }else if(kind==2){ time_end(t_MemcpyD2H); } return (cudaError_t)arg.cmd; }
cudaError_t cudaEventRecord (cudaEvent_t event, cudaStream_t stream) { VirtioQCArg arg; pfunc(); time_begin(); memset(&arg, 0, sizeof(VirtioQCArg)); uint64_t mystream = (stream==NULL)?(uint64_t)-1:(uint64_t)stream; ptr( arg.pA, event, 0); //ptr( arg.pB, stream, 0); ptr( arg.pB, mystream, 0); send_cmd_to_device( VIRTQC_cudaEventRecord, &arg); time_end(t_EventRecord); return (cudaError_t)arg.cmd; }
extern void init_timer(void) { get_min_period(); time_begin(); }