Beispiel #1
0
Datei: bw.c Projekt: CJacky/qCUDA
void bw(uint64_t size)
{
	uint8_t *H, *D;
	int i;

	printf("%"PRIu64" ", size);
	
	cudaInit();

	printf("0 "); // reg func

	H = (uint8_t*)malloc(sizeof(uint8_t)*size);
	
	time_begin();
	cudaMalloc((void**)&D, sizeof(uint8_t)*size);
	printf("%u ", time_end());

	for(i=0; i<size; i++)
	{
		H[i]=i%255;
	}

	time_begin();
	cudaMemcpy(D, H, size*sizeof(uint8_t), cudaMemcpyHostToDevice);
	printf("%u ", time_end());

	printf("0 "); // exec kernel

	for(i=0; i<size; i++)
	{
		H[i]=0;
	}

	time_begin();
	cudaMemcpy(H, D, size*sizeof(uint8_t), cudaMemcpyDeviceToHost);
	printf("%u ", time_end());


	for(i=0; i<size; i++)
	{
		if(H[i]!=i%255)
			printf("error %d\n", i);
	}

	free(H);
	
	time_begin();
	cudaFree(D);
	printf("%u ", time_end());
	
	cudaFini();
	printf("\n");
}
Beispiel #2
0
/*------------------------------------------------------------------*/
bool AzRgforest::growForest()
{
    clock_t b_time;
    time_begin(&b_time);

    /*---  find the best split  ---*/
    AzTrTsplit best_split;
    searchBestSplit(&best_split);
    if (shouldExit(&best_split)) { /* exit if no more split */
        return true; /* exit */
    }

    /*---  split the node  ---*/
    double w_inc;
    int leaf_nx[2] = {-1,-1};
    const AzRgfTree *tree = splitNode(&best_split, &w_inc, leaf_nx);

    if (lmax_timer.reachedMax(l_num, "AzRgforest: #leaf", out)) {
        return true; /* #leaf reached max; exit */
    }

    /*---  update target  ---*/
    updateTarget(tree, leaf_nx, w_inc);

    time_end(b_time, &search_time);
    return false; /* don't exit */
}
Beispiel #3
0
cudaError_t cudaConfigureCall(
		dim3 gridDim,
		dim3 blockDim,
		size_t sharedMem,
		cudaStream_t stream)
{
	pfunc();
	time_begin();

	ptrace("gridDim= %d %d %d\n", gridDim.x, gridDim.y, gridDim.z);
	ptrace("blockDim= %d %d %d\n", blockDim.x, blockDim.y, blockDim.z);
	ptrace("sharedMem= %lu\n", sharedMem);
	ptrace("stream= %p\n", (void*)stream);
	//ptrace("size= %lu\n", sizeof(cudaStream_t));

	cudaKernelConf[0] = gridDim.x;
	cudaKernelConf[1] = gridDim.y;
	cudaKernelConf[2] = gridDim.z;

	cudaKernelConf[3] = blockDim.x;
	cudaKernelConf[4] = blockDim.y;
	cudaKernelConf[5] = blockDim.z;

	cudaKernelConf[6] = sharedMem;

	cudaKernelConf[7] = (stream==NULL)?(uint64_t)-1:(uint64_t)stream;


	memset(cudaKernelPara, 0, cudaKernelParaMaxSize);
	cudaParaSize = sizeof(uint32_t);

	time_end(t_ConfigCall);
	return cudaSuccess;
}
Beispiel #4
0
/*------------------------------------------------------------------*/
void AzRgforest::optimize_resetTarget()
{
    clock_t b_time;
    time_begin(&b_time);

    int t_num = ens->size();

    AzBytArr s("Calling optimizer with ");
    s.cn(t_num);
    s.c(" trees and ");
    s.cn(l_num);
    s.c(" leaves");
    AzTimeLog::print(s, out);

    opt->update(data, ens, &v_p);
    resetTarget();

    int tx;
    for (tx = 0; tx < t_num; ++tx) {
        ens->tree_u(tx)->removeSplitAssessment(); /* since weights changed */
    }

    isOpt = true;

    time_end(b_time, &opt_time);
}
Beispiel #5
0
int
cc_sweep_phase(char *buffer, int bufsize, struct cc **tokens)
{
    struct cc **pp = tokens;
    int i, n;
#ifdef STATS
    int nn, ii;
#endif

#ifdef STATS
    if (verbose >= 0)
        time_begin();
    if (verbose > 0)
        printf("Sweep:");
#endif
    cc_sweep0(buffer, bufsize, tt.tt_token_min - 1);
#ifdef STATS
    ntoken_stat = 0;
    nn = 0;
    ii = 0;
#endif
    for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) {
#ifdef STATS
        if (verbose > 0) {
            if (ii > 7) {
                printf("\n      ");
                ii = 0;
            }
            ii++;
            printf(" (%d", i);
            (void) fflush(stdout);
        }
#endif
        n = cc_sweep(buffer, bufsize, pp, i);
        pp += n;
#ifdef STATS
        if (verbose > 0) {
            if (--n > 0) {
                printf(" %d", n);
                nn += n;
            }
            putchar(')');
        }
#endif
    }
    qinsertq(&cc_q1b, &cc_q1a);
#ifdef STATS
    if (verbose > 0)
        printf("\n       %d tokens, %d candidates\n",
               ntoken_stat, nn);
    if (verbose >= 0)
        time_end();
#endif
    return pp - tokens;
}
Beispiel #6
0
void __cudaUnregisterFatBinary(void **fatCubinHandle)
{
	pfunc();
	time_begin();

	ptrace("fatCubinHandle= %p, value= %p\n", fatCubinHandle, *fatCubinHandle);
	send_cmd_to_device( VIRTQC_cudaUnregisterFatBinary, NULL);

	free(fatCubinHandle);
	time_end(t_UnregFatbin);
	close_device();
}
Beispiel #7
0
void __cudaRegisterFunction(
		void   **fatCubinHandle,
		const char    *hostFun,
		char    *deviceFun,
		const char    *deviceName,
		int      thread_limit,
		uint3   *tid,
		uint3   *bid,
		dim3    *bDim,
		dim3    *gDim,
		int     *wSize
		)
{
	VirtioQCArg arg;
	computeFatBinaryFormat_t fatBinHeader;
	pfunc();
	time_begin();

	ptrace("fatCubinHandle= %p, value= %p\n", fatCubinHandle, *fatCubinHandle);
	ptrace("hostFun= %s (%p)\n", hostFun, hostFun);
	ptrace("deviceFun= %s (%p)\n", deviceFun, deviceFun);
	ptrace("deviceName= %s\n", deviceName);
	ptrace("thread_limit= %d\n", thread_limit);

	if(tid) ptrace("tid= %u %u %u\n", tid->x, tid->y, tid->z);
	else	ptrace("tid is NULL\n");

	if(bid)	ptrace("bid= %u %u %u\n", bid->x, bid->y, bid->z);
	else	ptrace("bid is NULL\n");

	if(bDim)ptrace("bDim= %u %u %u\n", bDim->x, bDim->y, bDim->z);
	else	ptrace("bDim is NULL\n");

	if(gDim)ptrace("gDim= %u %u %u\n", gDim->x, gDim->y, gDim->z);
	else	ptrace("gDim is NULL\n");

	if(wSize)ptrace("wSize= %d\n", *wSize);
	else	 ptrace("wSize is NULL\n");

	memset(&arg, 0, sizeof(VirtioQCArg));
	fatBinHeader = (computeFatBinaryFormat_t)(*fatCubinHandle);

	ptr( arg.pA , fatBinHeader, fatBinHeader->fatSize);
	ptr( arg.pB , deviceName  , strlen(deviceName)+1 );
	arg.flag = (uint32_t)(uint64_t)hostFun;

		ptrace("pA= %p, pASize= %u, pB= %p, pBSize= %u\n",
				(void*)arg.pA, arg.pASize, (void*)arg.pB, arg.pBSize);

	send_cmd_to_device( VIRTQC_cudaRegisterFunction, &arg);
	time_end(t_RegFunc);
}
Beispiel #8
0
cudaError_t cudaDeviceReset(void)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	send_cmd_to_device( VIRTQC_cudaDeviceReset, &arg);

	time_end(t_DevReset);
	return (cudaError_t)arg.cmd;
}
Beispiel #9
0
cudaError_t cudaGetLastError(void)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	send_cmd_to_device( VIRTQC_cudaGetLastError, &arg);

	time_end(t_GetLastError);
	return (cudaError_t)arg.cmd;
}
Beispiel #10
0
cudaError_t cudaSetDevice(int device)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();
	memset(&arg, 0, sizeof(VirtioQCArg));

	ptr( arg.pA, device, 0);
	send_cmd_to_device( VIRTQC_cudaSetDevice, &arg);

	time_end(t_SetDev);
	return (cudaError_t)arg.cmd;
}
Beispiel #11
0
cudaError_t cudaGetDeviceCount(int *count)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	send_cmd_to_device( VIRTQC_cudaGetDeviceCount, &arg);
	*count = (int)arg.pA;

	time_end(t_GetDevCount);
	return (cudaError_t)arg.cmd;
}
Beispiel #12
0
cudaError_t cudaDriverGetVersion(int *driverVersion)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	send_cmd_to_device( VIRTQC_cudaDriverGetVersion, &arg);
	*driverVersion = (int)arg.pA;

	time_end(t_DriverGetVersion);
	return (cudaError_t)arg.cmd;
}
Beispiel #13
0
cudaError_t cudaEventDestroy(cudaEvent_t event)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	ptr( arg.pA, event, 0);
	send_cmd_to_device( VIRTQC_cudaEventDestroy, &arg);

	time_end(t_EventDestroy);
	return (cudaError_t)arg.cmd;
}
Beispiel #14
0
cudaError_t cudaRuntimeGetVersion(int *runtimeVersion)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	send_cmd_to_device( VIRTQC_cudaRuntimeGetVersion, &arg);
	*runtimeVersion = (uint64_t)arg.pA;

	time_end(t_RuntimeGetVersion);
	return (cudaError_t)arg.cmd;
}
Beispiel #15
0
void
cc_compress_phase1(struct cc **output, struct cc **tokens, int ntoken, int flag)
{
    struct cc **pp;
#ifdef STATS
    int i = 0;
    int nt = 0, cc = 0, nc = 0;
#endif

#ifdef STATS
    if (verbose >= 0)
        time_begin();
    if (verbose > 0)
        printf("Compress:");
#endif
    pp = tokens;
    while (pp < tokens + ntoken) {
#ifdef STATS
        if (verbose > 0) {
            ntoken_stat = 0;
            ccount_stat = 0;
            ncover_stat = 0;
            if (i > 2) {
                printf("\n         ");
                i = 0;
            }
            i++;
            printf(" (%d", (*pp)->length);
            (void) fflush(stdout);
        }
#endif
        pp += cc_compress(output, pp, flag);
#ifdef STATS
        if (verbose > 0) {
            printf(" %dt %du %dc)", ntoken_stat, ccount_stat,
                   ncover_stat);
            nt += ntoken_stat;
            cc += ccount_stat;
            nc += ncover_stat;
        }
#endif
    }
#ifdef STATS
    if (verbose > 0)
        printf("\n   total: (%dt %du %dc)\n", nt, cc, nc);
    if (verbose >= 0)
        time_end();
#endif
}
Beispiel #16
0
cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	ptr( arg.pA, prop, sizeof(struct cudaDeviceProp));
	ptr( arg.pB, device, 0);
	send_cmd_to_device( VIRTQC_cudaGetDeviceProperties, &arg);

	time_end(t_GetDevProp);
	return (cudaError_t)arg.cmd;
}
Beispiel #17
0
cudaError_t cudaEventCreate(cudaEvent_t *event)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	send_cmd_to_device( VIRTQC_cudaEventCreate, &arg);

	*event = (void*)arg.pA;

	time_end(t_EventCreate);
	return (cudaError_t)arg.cmd;
}
Beispiel #18
0
cudaError_t cudaFree(void* devPtr)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));
	ptr( arg.pA, devPtr, 0);

	send_cmd_to_device( VIRTQC_cudaFree, &arg);
	ptrace("devPtr= %p\n", (void*)arg.pA);

	time_end(t_Free);
	return (cudaError_t)arg.cmd;
}
Beispiel #19
0
cudaError_t cudaEventElapsedTime(float *ms,	cudaEvent_t start, cudaEvent_t end)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	ptr( arg.pA, start, 0);
	ptr( arg.pB, end, 0);
	send_cmd_to_device( VIRTQC_cudaEventElapsedTime, &arg);

	memcpy(ms, &arg.flag, sizeof(float));

	time_end(t_EventElapsedTime);
	return (cudaError_t)arg.cmd;
}
Beispiel #20
0
cudaError_t cudaLaunch(const void *func)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));
//	ptr( arg.pA, cudaKernelConf, 7*sizeof(uint32_t));
	ptr( arg.pA, cudaKernelConf, 8*sizeof(uint64_t));
	ptr( arg.pB, cudaKernelPara, cudaParaSize);
	arg.flag = (uint32_t)(uint64_t)func;

	send_cmd_to_device( VIRTQC_cudaLaunch, &arg);

	time_end(t_Launch);
	return cudaSuccess;
}
Beispiel #21
0
cudaError_t cudaMalloc(void** devPtr, size_t size)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));
	ptr( arg.pA, 0,  0);
	arg.flag = size;

	send_cmd_to_device( VIRTQC_cudaMalloc, &arg);
	*devPtr = (void*)arg.pA;
	ptrace("devPtr= %p\n", (void*)arg.pA);

	time_end(t_Malloc);
	return (cudaError_t)arg.cmd;
}
Beispiel #22
0
cudaError_t cudaMemcpy(
		void* dst,
		const void* src,
		size_t count,
		enum cudaMemcpyKind kind)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));
	ptrace("dst= %p , src= %p ,size= %lu\n", (void*)dst, (void*)src, count);

	if( kind == cudaMemcpyHostToDevice)
	{
		ptr( arg.pA, dst, 0);
		ptr( arg.pB, src, count);
		arg.flag   = 1;
	}
	else if( kind == cudaMemcpyDeviceToHost )
	{
		ptr( arg.pA, dst, count);
		ptr( arg.pB, src, 0);
		arg.flag   = 2;
	}
	else if( kind == cudaMemcpyDeviceToDevice )
	{
		ptr( arg.pA, dst, 0);
		ptr( arg.pB, src, count);
		arg.flag   = 3;
	}
	else
	{
		error("Not impletment cudaMemcpyKind %d\n", kind);
		return cudaErrorInvalidValue;
	}

	send_cmd_to_device( VIRTQC_cudaMemcpy, &arg);

	if(kind==1){
		time_end(t_MemcpyH2D);
	}else if(kind==2){
		time_end(t_MemcpyD2H);
	}
	return (cudaError_t)arg.cmd;
}
Beispiel #23
0
cudaError_t cudaEventRecord	(cudaEvent_t event,	cudaStream_t stream)
{
	VirtioQCArg arg;
	pfunc();
	time_begin();

	memset(&arg, 0, sizeof(VirtioQCArg));

	uint64_t mystream = (stream==NULL)?(uint64_t)-1:(uint64_t)stream;

	ptr( arg.pA, event, 0);
	//ptr( arg.pB, stream, 0);
	ptr( arg.pB, mystream, 0);
	send_cmd_to_device( VIRTQC_cudaEventRecord, &arg);

	time_end(t_EventRecord);
	return (cudaError_t)arg.cmd;
}
Beispiel #24
0
extern void init_timer(void)
{
  get_min_period();
  time_begin();
}