Ejemplo n.º 1
bool runRalfFunction(std::string name, scalarFnType fun, CUmodule* hModule, CUdeviceptr d_data,
		     DataStruct *h_data,DataStruct* h_data_reference, unsigned int memSize)
  const unsigned inputNr = 10;
  const float scalarInputs[4][inputNr] = {{ 0.f, 3.f, 2.f, 8.f, 10.2f, -1.f, 0.f, 1000.23f, 0.02f, -0.02f },
					   { 1.f, 2.f, 4.f, 6.f, -14.13f, -13.f, 0.f, 0.02f, 420.001f, -420.001f },
					   { 2.f, 1.f, 6.f, 4.f, 999.f, -5.f, 0.f, 420.001f, 0.01f, 0.01f },
					   { 3.f, 0.f, 8.f, 2.f, 0.f, -420.001f, 0.f, 0.01f, 1000.23f, 0.01f }};

  std::cout << "====================== " << name << "===============================\n";
  for (unsigned i=0; i<inputNr; ++i) {
    for (unsigned j=0; j<inputNr; ++j) 
    for (unsigned k=0; k<4; ++k) 
      h_data->fa[0] = h_data_reference->fa[0] = scalarInputs[k][i];
      h_data->fa[1] = h_data_reference->fa[1] = scalarInputs[k][j];

      //run device function
      loadAndRunTestFunction(hModule, name, d_data, h_data, memSize);   

      if(!compareData(h_data, h_data_reference))                      //compare Data
	std::cout << "\n Error in Ralf: fa0=" << scalarInputs[k][i]  
                  << ", fa1=" << scalarInputs[k][j] << " (" << name << ")\n";
	return false;
  std::cout << " => Test passed!!!\n";
  return true;
Ejemplo n.º 2
/*  Classifies one measured period (in micro-seconds) as a bit value.
 *  Returns 0 for a 1T period (450..650 us, exclusive), 1 for a 3T period
 *  (1500..1700 us, exclusive) and 2 when the period is in neither window.
 */
static uint8_t classifyPeriod(uint32_t period_us)
{
    if ( 450 < period_us && period_us < 650 )
        return 0;
    if ( 1500 < period_us && period_us < 1700 )
        return 1;
    return 2;
}

/*  Performs post processing of the collected timeStamp values.
 *
 *  Entries 2..66 of edge_timeStamp are converted in place from timer counts
 *  to time in micro-seconds (count * 625 / 10000, i.e. 0.0625 us per count).
 *  Every second converted entry is then classified as a 0 bit, a 1 bit, or 2
 *  (neither):
 *    - entries 3, 5, ..., 33   -> 16 address bits
 *    - entries 35, 37, ..., 65 -> 16 data bits
 *
 *  edge_timeStamp must point to at least 67 elements.
 */
void postProcess(uint32_t *edge_timeStamp)
{
    uint8_t address[16] = {0};
    /* 16 entries: the data loop below writes indices 0..15 (was 8 -> overflow). */
    uint8_t data[16] = {0};
    uint16_t i, temp_i;

    for (i = 2; i <= 66; i++) {
        // convert to time in microseconds; the 64-bit intermediate keeps
        // 625 * count from wrapping for counts above ~6.8 million
        edge_timeStamp[i] = (uint32_t)(((uint64_t)625 * edge_timeStamp[i]) / 10000);
    }

    // Get the address bit (0 or 1?)
    for (i = 3, temp_i = 0; i <= 33; i+=2, temp_i++) {
        // If time spent on second period is 1T, then it is a 0 bit. If 3T, then it is 1 bit
        address[temp_i] = classifyPeriod(edge_timeStamp[i]);
    }

    // Get the data bit (0 or 1?)
    for (i = 35, temp_i = 0; i <= 65; i+=2, temp_i++) {
        // If time spent on second period is 1T, then it is a 0 bit. If 3T, then it is 1 bit
        data[temp_i] = classifyPeriod(edge_timeStamp[i]);
    }

    /* NOTE(review): nothing visible in this function consumes the decoded
     * bits; the casts only silence set-but-unused warnings. */
    (void)address;
    (void)data;
}
Ejemplo n.º 3
/*
 * Sorts the entry table in place by repeatedly exchanging adjacent entries.
 *
 * Each outer pass walks from the last entry towards the front and swaps
 * entries innerIdx and innerIdx - 1 whenever compareData(innerIdx,
 * innerIdx - 1) is positive. The outer loop stops early as soon as a pass
 * performs no swap.
 *
 * NOTE(review): the inner loop stops at innerIdx == outerIdx + 2, so the
 * pair (1, 0) is never compared and entry 0 is never moved. If entry 0 is
 * meant to take part in the sort, the bound should be `innerIdx > outerIdx`
 * - confirm the intent before changing it.
 */
void ICACHE_FLASH_ATTR sortData(void) {
	int outerIdx, innerIdx, swapped = true;
	for (outerIdx = 0; (outerIdx < MAX_ENTRY) && swapped; outerIdx++) {
		swapped = false;
		for (innerIdx = MAX_ENTRY - 1; innerIdx > outerIdx + 1; innerIdx--) {
			if (compareData(innerIdx, innerIdx - 1) > 0) {
				swapData(innerIdx, innerIdx - 1);
				// os_printf("%d<->%d\n", innerIdx, innerIdx - 1);
				swapped = true;
			}
		}
	}
}
Ejemplo n.º 4
/*
 * Reports whether the list is in non-decreasing order according to
 * compareData().
 *
 * An empty list and a single-node list are ordered. Otherwise every node
 * must compare less than or equal to its successor, i.e.
 * compareData(node->data, node->nextPtr->data) <= 0 for each adjacent pair.
 *
 * The list is walked with a loop rather than recursively so that the stack
 * depth does not grow with the length of the list.
 */
bool isOrdered(List l)
{
	List cur = l;

	while (cur != 0 && cur->nextPtr != 0) {
		Data d1 = cur->data;
		Data d2 = cur->nextPtr->data;

		if (compareData(d1, d2) > 0) {
			return false;
		}
		cur = cur->nextPtr;
	}
	return true;
}
Ejemplo n.º 5
// Return a pointer to the node that contains the smallest data.
//
// The list must not be empty (asserted). Nodes are compared with
// compareData(); when several nodes compare equal to the minimum, the one
// closest to the head is returned.
//
// The list is scanned once with a loop instead of recursing once per node,
// so the stack depth no longer depends on the length of the list.
NodePtr getMin(List l)
{
	assert(l != 0);

	List best = l;
	List cur;

	for (cur = l->nextPtr; cur != 0; cur = cur->nextPtr) {
		// Replace only on a strictly smaller node so that ties keep the
		// earliest node.
		if (compareData(best->data, cur->data) > 0) {
			best = cur;
		}
	}
	return best;
}
Ejemplo n.º 6
int searchLinkedList(struct linkedList *ll,struct data *dta)
//case 1: Linked list is empty
        if(ll->head == NULL) return 0;
	struct node* temp = ll->head;
		if(compareData(dta,temp->dta) == 1)
			return 1;
		temp = temp->next;
	return 0;

Ejemplo n.º 7
//! Perform the wavelet decomposition
//
// Host-side driver for a 1D Haar wavelet decomposition of a signal with a
// fixed length of 262144 elements. It allocates the input, output and
// approximation buffers with malloc(), copies `signal` into the input
// buffer, zeroes the output buffer and then processes the decomposition in
// steps of up to 10 levels until no threads are left. When HW is defined an
// XFcuda core is initialised and programmed through its XFcuda_Set* calls;
// the `#ifndef HW` path calls dwtHaar1D() directly. With VERIFY defined the
// result is printed and compared against `reference`.
//
// NOTE(review): `signal` and `reference` are not declared in this function;
// presumably they are file-scope buffers - confirm.
int runTest( int argc, char** argv)
  int i;
#ifdef HW
  XFcuda xcore;
  int Status;
  Status = XFcuda_Initialize(&xcore, 0);
  if (Status != XST_SUCCESS) {
    printf("Initialization failed %d\r\n", Status);
    return XST_FAILURE;

  unsigned int slength = 262144;
  // get the number of decompositions necessary to perform a full decomposition
  unsigned int dlevels_complete = 0;
  if (1 != getLevels( slength, &dlevels_complete))
    // error message
    fprintf( stderr, "Signal length not supported.\n");

  // device in data
  float* d_idata = NULL;
  // device out data
  float* d_odata = NULL;
  // device approx_final data
  float* approx_final = NULL;

  // The very final approximation coefficient has to be written to the output
  // data, all others are reused as input data in the next global step and
  // therefore have to be written to the input data again.
  // The following flag indicates where to copy approx_final data
  //   - 0 is input, 1 is output
  // NOTE(review): the code below copies to d_idata when the flag is non-zero
  // and to d_odata when it is zero, which is the opposite of the legend above.
  int approx_is_input;

  // allocate device mem
  // NOTE(review): the malloc() results are used without a NULL check.
  const unsigned int smem_size = sizeof(float) * slength;
  d_idata = (float*) malloc(smem_size);
  d_odata = (float*) malloc(smem_size);
  approx_final = (float*) malloc(smem_size);

  // copy input data to device
  memcpy(d_idata, signal, smem_size);

  // clear result memory
  // NOTE(review): no free() of tmp or of the three buffers above is visible
  // in this function.
  float* tmp = (float*) malloc( smem_size);
  for (i = 0; i < slength; ++i)
    tmp[i] = 0.0;
  memcpy(d_odata, tmp, smem_size);

  // total number of threads
  // in the first decomposition step always one thread computes the average and
  // detail signal for one pair of adjacent values
  unsigned int num_threads_total_left = slength / 2;
  // decomposition levels performed in the current / next step
  unsigned int  dlevels_step = dlevels_complete;

  // 1D signal so the arrangement of elements is also 1D
  dim3  block_size;
  dim3  grid_size;

  // number of decomposition levels left after one iteration on the device
  unsigned int dlevels_left = dlevels_complete;

  // if less or equal 1k elements, then the data can be processed in one block,
  // this avoids the Wait-For-Idle (WFI) on host side which is necessary if the
  // computation is split accross multiple SM's if enough input data
  if( dlevels_complete <= 10) {
    // decomposition can be performed at once
    // NOTE(review): grid_size.x is not assigned on this path.
    block_size.x = num_threads_total_left;
    approx_is_input = 0;
  } else {
    // 512 threads per block
    grid_size.x = (num_threads_total_left / 512);
    block_size.x = 512;

    // 512 threads corresponds to 10 decomposition steps
    dlevels_step = 10;
    dlevels_left -= 10;
    approx_is_input = 1;
  grid_size.y = 1;
  grid_size.z = 1;
  block_size.y = 1;
  block_size.z = 1;

#ifdef HW
  // Program the loop-invariant core registers. Buffer addresses are passed
  // divided by sizeof(float), i.e. in units of float words.
  // NOTE(review): the (int) pointer casts truncate the address on platforms
  // where pointers are wider than int.
  XFcuda_SetGriddim_y(&xcore, grid_size.y);
  //XFcuda_SetGriddim_z(&xcore, grid_size.z);
  //XFcuda_SetBlockdim_y(&xcore, block_size.y);
  //XFcuda_SetBlockdim_z(&xcore, block_size.z);
  XFcuda_SetId_addr(&xcore, (int)d_idata / sizeof(float));
  XFcuda_SetOd_addr(&xcore, (int)d_odata / sizeof(float));
  XFcuda_SetApprox_final_addr(&xcore, (int)approx_final / sizeof(float));

  // one iteration per global step, until no threads are left
  while( 0 != num_threads_total_left) {

#ifndef HW
    //PS execution
    dwtHaar1D(d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left, block_size.x, grid_size, block_size, 1, 0);

    // per-step core parameters, then start the core
    XFcuda_SetDlevels(&xcore, dlevels_step);
    XFcuda_SetSlength_step_half(&xcore, num_threads_total_left);
    XFcuda_SetBdim(&xcore, block_size.x);
    XFcuda_SetGriddim_x(&xcore, grid_size.x);
    XFcuda_SetBlockdim_x(&xcore, block_size.x);

    XFcuda_SetEn_fcuda1(&xcore, 1);

    // busy-wait until the core reports completion
    while (!XFcuda_IsDone(&xcore));

    // Copy approx_final to appropriate location
    // The byte count is grid_size.x*4; the 4 presumably stands for
    // sizeof(float) - confirm.
    if (approx_is_input) {
      memcpy(d_idata, approx_final, grid_size.x*4);
    else {
      memcpy(d_odata, approx_final, grid_size.x*4);

    // update level variables
    if( dlevels_left < 10) {
      // approx_final = d_odata;
      approx_is_input = 0;

    // more global steps necessary
    // NOTE(review): when dlevels_left > 10 the next step performs
    // dlevels_left - 10 levels rather than 10 - confirm this is intended.
    // dlevels_left is unsigned, so the subtraction below wraps around once
    // fewer than 10 levels remain.
    dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;
    dlevels_left -= 10;

    // after each step only half the threads are used any longer
    // therefore after 10 steps 2^10 less threads
    num_threads_total_left = num_threads_total_left >> 10;

    // update block and grid size
    // NOTE(review): `?:` binds weaker than `+`, so the expression below is
    // evaluated as ((n / 512) + (0 != n % 512)) ? 1 : 0 and therefore yields
    // only 0 or 1, not a rounded-up block count.
    grid_size.x = (num_threads_total_left / 512)
      + (0 != (num_threads_total_left % 512)) ? 1 : 0;
    if( grid_size.x <= 1) {
      block_size.x = num_threads_total_left;

#ifdef VERIFY
  // print the first 10 values, then compare the whole result against the
  // reference with a tolerance of 0.1
  for (i = 0; i < 10; i++)
    printf("index=%d, ref=%f, fpga=%f\n", i, reference[i], d_odata[i]);
  int res = compareData(reference, d_odata, slength, 0.1f);
  printf("%s\n", (1 == res) ? "PASSED." : "FAILED.");
// Runs the FDTD test end to end and returns whether every stage succeeded.
//
// Stages, each executed only while `ok` is still true:
//   1. derive a default cubic volume edge from the target device memory size
//   2. read dimx/dimy/dimz/radius/timesteps from the command line, using the
//      defaults for missing arguments
//   3. allocate the host output, input and coefficient buffers
//   4. fill the coefficients with 0.1f and generate random input data
//   5. run fdtdReference() on the host and fdtdGPU() on the device
//   6. compare both outputs with compareData() (tolerance 0.0001)
//
// NOTE(review): no free() of host_output, device_output, input or coeff is
// visible in this function.
bool runTest(int argc, const char **argv)
    bool ok = true;

    float *host_output;
    float *device_output;
    float *input;
    float *coeff;

    int defaultDim;
    int dimx;
    int dimy;
    int dimz;
    int outerDimx;
    int outerDimy;
    int outerDimz;
    int radius;
    int timesteps;
    size_t volumeSize;
    memsize_t memsize;

    // value range passed to generateRandomData()
    const float lowerBound = 0.0f;
    const float upperBound = 1.0f;

    // Determine default dimensions
    shrLog("Set-up, based upon target device GMEM size...\n");
    if (ok)
        // Get the memory size of the target device
        shrLog(" getTargetDeviceGlobalMemSize\n");
        ok = getTargetDeviceGlobalMemSize(&memsize, argc, argv);
    if (ok)
        // We can never use all the memory so to keep things simple we aim to
        // use around half the total memory
        memsize /= 2;
        // Most of our memory use is taken up by the input and output buffers -
        // two buffers of equal size - and for simplicity the volume is a cube:
        //   dim = floor( (N/2)^(1/3) )
        defaultDim = (int)floor(pow((memsize / (2.0 * sizeof(float))), 1.0/3.0));

        // By default, make the volume edge size an integer multiple of 128B to
        // improve performance by coalescing memory accesses, in a real
        // application it would make sense to pad the lines accordingly
        int roundTarget = 128 / sizeof(float);
        defaultDim = defaultDim / roundTarget * roundTarget;
        // leave room for the halo of k_radius_default cells on both sides
        defaultDim -= k_radius_default * 2;

        // Check dimension is valid
        if (defaultDim < k_dim_min)
            shrLogEx(LOGBOTH | ERRORMSG, -1000, STDERROR);
            shrLog("\tinsufficient device memory (maximum volume on device is %d, must be between %d and %d).\n", defaultDim, k_dim_min, k_dim_max);
            ok = false;

        else if (defaultDim > k_dim_max)
            defaultDim = k_dim_max;

    // For QA testing, override default volume size
    if (ok)
        if (shrCheckCmdLineFlag(argc, argv, "qatest"))
            defaultDim = MIN(defaultDim, k_dim_qa);

    // Parse command line arguments
    // Each argument is converted with atoi() and range-checked; an
    // out-of-range value logs an error and clears `ok`.
    if (ok)
        char *dim = 0;
        if (shrGetCmdLineArgumentstr(argc, argv, "dimx", &dim))
            dimx = (int)atoi(dim);
            if (dimx < k_dim_min || dimx > k_dim_max)
                shrLogEx(LOGBOTH | ERRORMSG, -1001, STDERROR);
                shrLog("\tdimx out of range (%d requested, must be between %d and %d), see header files for details.\n", dimx, k_dim_min, k_dim_max);
                ok = false;
            dimx = defaultDim;
        if (shrGetCmdLineArgumentstr(argc, argv, "dimy", &dim))
            dimy = (int)atoi(dim);
            if (dimy < k_dim_min || dimy > k_dim_max)
                shrLogEx(LOGBOTH | ERRORMSG, -1002, STDERROR);
                shrLog("\tdimy out of range (%d requested, must be between %d and %d), see header files for details.\n", dimy, k_dim_min, k_dim_max);
                ok = false;
            dimy = defaultDim;
        if (shrGetCmdLineArgumentstr(argc, argv, "dimz", &dim))
            dimz = (int)atoi(dim);
            if (dimz < k_dim_min || dimz > k_dim_max)
                shrLogEx(LOGBOTH | ERRORMSG, -1003, STDERROR);
                shrLog("\tdimz out of range (%d requested, must be between %d and %d), see header files for details.\n", dimz, k_dim_min, k_dim_max);
                ok = false;
            dimz = defaultDim;
        if (shrGetCmdLineArgumentstr(argc, argv, "radius", &dim))
            radius = (int)atoi(dim);
            if (radius < k_radius_min || radius >= k_radius_max)
                shrLogEx(LOGBOTH | ERRORMSG, -1004, STDERROR);
                shrLog("\tradius out of range (%d requested, must be between %d and %d), see header files for details.\n", radius, k_radius_min, k_radius_max);
                ok = false;
            radius = k_radius_default;
        if (shrGetCmdLineArgumentstr(argc, argv, "timesteps", &dim))
            timesteps = (int)atoi(dim);
            // NOTE(review): the upper bound tests `radius` against
            // k_timesteps_max; this looks like a copy-paste slip for
            // `timesteps` - confirm.
            if (timesteps < k_timesteps_min || radius >= k_timesteps_max)
                shrLogEx(LOGBOTH | ERRORMSG, -1005, STDERROR);
                shrLog("\ttimesteps out of range (%d requested, must be between %d and %d), see header files for details.\n", timesteps, k_timesteps_min, k_timesteps_max);
                ok = false;
            timesteps = k_timesteps_default;
        // NOTE(review): no statement follows this condition here; presumably
        // the argument string is released at this point - confirm.
        if (dim)

    // Determine volume size
    // The outer dimensions add a halo of `radius` cells on both sides of
    // each axis.
    // NOTE(review): the product is computed in int before being stored in
    // size_t, so it can overflow for large volumes.
    if (ok)
        outerDimx = dimx + 2 * radius;
        outerDimy = dimy + 2 * radius;
        outerDimz = dimz + 2 * radius;
        volumeSize = outerDimx * outerDimy * outerDimz;
    // Allocate memory
    if (ok)
        shrLog(" calloc host_output\n");
        if ((host_output = (float *)calloc(volumeSize, sizeof(float))) == NULL)
            shrLogEx(LOGBOTH | ERRORMSG, -1006, STDERROR);
            shrLog("\tInsufficient memory for host_output calloc, please try a smaller volume (use --help for syntax).\n");
            ok = false;
    if (ok)
        shrLog(" malloc input\n");
        if ((input = (float *)malloc(volumeSize * sizeof(float))) == NULL)
            shrLogEx(LOGBOTH | ERRORMSG, -1007, STDERROR);
            shrLog("\tInsufficient memory for input malloc, please try a smaller volume (use --help for syntax).\n");
            ok = false;
    if (ok)
        shrLog(" malloc coeff\n");
        if ((coeff = (float *)malloc((radius + 1) * sizeof(float))) == NULL)
            shrLogEx(LOGBOTH | ERRORMSG, -1008, STDERROR);
            shrLog("\tInsufficient memory for coeff malloc, please try a smaller volume (use --help for syntax).\n");
            ok = false;

    // Create coefficients
    // radius + 1 entries, all set to the same value
    if (ok)
        for (int i = 0 ; i <= radius ; i++)
            coeff[i] = 0.1f;

    // Generate data
    if (ok)
        shrLog(" generateRandomData\n\n");
        generateRandomData(input, outerDimx, outerDimy, outerDimz, lowerBound, upperBound);

    if (ok)
        shrLog("FDTD on %d x %d x %d volume with symmetric filter radius %d for %d timesteps...\n\n", dimx, dimy, dimz, radius, timesteps);

    // Execute on the host
    if (ok)
        ok = fdtdReference(host_output, input, coeff, dimx, dimy, dimz, radius, timesteps);
        shrLog("fdtdReference complete\n");

    // Allocate memory
    if (ok)
        shrLog(" calloc device_output\n");
        if ((device_output = (float *)calloc(volumeSize, sizeof(float))) == NULL)
            shrLogEx(LOGBOTH | ERRORMSG, -1009, STDERROR);
            shrLog("\tInsufficient memory for device output calloc, please try a smaller volume (use --help for syntax).\n");
            ok = false;

    // Execute on the device
    if (ok)
        ok = fdtdGPU(device_output, input, coeff, dimx, dimy, dimz, radius, timesteps, argc, argv);
        shrLog("fdtdGPU complete\n");

    // Compare the results
    if (ok)
        float tolerance = 0.0001f;
        shrLog("\nCompareData (tolerance %f)...\n", tolerance);
        ok = compareData(device_output, host_output, dimx, dimy, dimz, radius, tolerance);

    return ok;
Ejemplo n.º 9
// Advances the scan towards the next record that satisfies the scan
// condition (op / comparevalue on column attrcol).
//
// Return codes visible in this function:
//   2 - the scan has been closed (isclose is set)
//   1 - op is one of LT/GT/LE/GE and comparevalue is NULL, or the page
//       counter has reached filehandle->GetPsum()
//   0 - returned at the end, after currentRid's slot id has been advanced
//
// NOTE(review): neither `rec` nor `rid` is written in the code shown here.
// NOTE(review): the Bits objects created with `new` have no matching delete
// in the code shown here.
int RM_FileScan::NextRec(RM_Record &rec, RID &rid) {
	if (isclose) {
		//cout << "[RM_FileScan]GetNextRec error: This filescan has been closed!" << endl;
		return 2;
	// An ordering comparison (LT..GE) against a NULL value can never match.
	if(op <= GE && op >= LT && comparevalue == NULL)	return 1;

	bool get_flag = false;
	int index;
	// Fetch the page the scan currently points at; the slot bitmap is built
	// from the start of the page.
	Bytes head = (Bytes)filehandle->bpm->getPage(currentRid->GetFileid(), currentRid->GetPageid(), index);
	Bits* slots = new Bits(head, filehandle->GetPnum());
	Bytes record_head;
	int current_slotid = currentRid->GetSlotid();
	int current_pageid = currentRid->GetPageid();

	int offset = PAGE_HEAD_BYTE;
	while (!get_flag) {
		// Look for the next slot whose bit is set in the slot bitmap.
		while ( current_slotid < filehandle->GetPnum() && slots->bit_get(current_slotid) == false) {
		if (current_slotid == filehandle->GetPnum()) {		// all records of this page have been visited
			current_slotid = 0;
			// All records of all pages of this file have been visited.
			// NOTE(review): the original comment said 0 is returned to signal
			// the end of the scan, but the code returns 1.
			if (current_pageid == filehandle->GetPsum()) {
				return 1;
			head = (Bytes)filehandle->bpm->getPage(currentRid->GetFileid(), current_pageid, index);
			offset = PAGE_HEAD_BYTE;
		} else {	// found a record
			// Records are laid out after the page header, GetRsize() bytes each.
			offset = PAGE_HEAD_BYTE;
			offset = offset + filehandle->GetRsize() * current_slotid;
			record_head = head + offset;
			int nullbits_offset = RECORD_NULLBITS_OFFSET_BYTE;
			Bits* nullbits = new Bits(record_head + nullbits_offset, attrcol+1);		// null bitmap of this record (attrcol+1 bits)
			if(nullbits->bit_get(attrcol) == 0) {// the record's attribute (left operand) is NULL
				if(op == NO) {
					get_flag = true;
				if(op <= GE && op >= LT) {		// written out for readability; has no practical effect
					get_flag = false;
				if(op == EQ) {
					get_flag = (comparevalue == NULL);
				if(op == NE) {
					get_flag = !(comparevalue == NULL);
			} else /*if (comparevalue != 0)*/{// otherwise, perform the actual comparison
				char* value_head = record_head + attroffset;
//				enum AttrType{INTEGER,FLOAT,STRING};
//				enum CompOp{EQ,LT,GT,LE,GE,NE,NO};
				const char* cmp1 = value_head;
				const char* cmp2 = (char*)comparevalue;
				const CompOp cmpop = op;
				const AttrType cmptype = type;
				// Compare only when there is a value to compare against, or
				// when no comparison operator is set.
				if  (comparevalue != 0 || op == NO){
					get_flag = compareData(cmp1, cmpop, cmp2, cmptype);
					get_flag = false;

			if (get_flag) {
//				cout << "get_data: " << &get_data << " " << "head: " << &record_head << endl;
			} else {
	// Remember where to resume on the next call.
	currentRid->SetSlotid(current_slotid + 1);
	return 0;
Ejemplo n.º 10
//! Perform the wavelet decomposition
//
// Host-only driver for a 1D Haar wavelet decomposition of a signal with a
// fixed length of 262144 elements. It reads the signal from a hard-coded
// file name, runs dwtHaar1D() in steps of up to 10 decomposition levels
// until no threads are left, loads the reference ("gold") result and prints
// PASSED or FAILED depending on compareData() (tolerance 0.001).
//
// argc and argv are not used in the code shown here; the file names come
// from the local arrays below.
void runTest( int argc, char** argv)
  char* s_fname ;
  char* r_fname ;
  char* r_gold_fname ;
  const char usage[] =
      "  dwtHaar1D --signal=<signal_file> --result=<result_file> --gold=<gold_file>\n\n"
      "  <signal_file> Input file containing the signal\n"
      "  <result_file> Output file storing the result of the wavelet decomposition\n"
      "  <gold_file>   Input file containing the reference result of the wavelet decomposition\n"
      "  bin\\win32\\release\\dwtHaar1D\n"
      "       --signal=projects\\dwtHaar1D\\data\\signal.dat\n"
      "       --result=projects\\dwtHaar1D\\data\\regression.dat\n"
      "       --gold=projects\\dwtHaar1D\\data\\regression.gold.dat\n"

  // hard-coded input and reference files
  char s_fname_arr[] = "data/signal_2_18.dat";
  char r_gold_fname_arr[] = "data/regression_2_18.gold.dat";

  char r_fname_arr[] = "regression.dat";
  s_fname = s_fname_arr;
  r_fname = r_fname_arr;
  r_gold_fname = r_gold_fname_arr;

  // read in signal
  // NOTE(review): s_fname always points at the local array above, so the
  // `== 0` check below cannot trigger as written.
  unsigned int slength = 262144;
  DATATYPE* signal = NULL;
  if (s_fname == 0)
    fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage);
  if (readFile(s_fname, &signal) == 1) {
    printf("Reading signal from %s\n", s_fname);
  } else {

  // get the number of decompositions necessary to perform a full decomposition
  unsigned int dlevels_complete = 0;
  if (1 != getLevels( slength, &dlevels_complete))
    // error message
    fprintf( stderr, "Signal length not supported.\n");

  // device in data
  DATATYPE* d_idata = NULL;
  // device out data
  DATATYPE* d_odata = NULL;
  // device approx_final data
  DATATYPE* approx_final = NULL;

  // The very final approximation coefficient has to be written to the output
  // data, all others are reused as input data in the next global step and
  // therefore have to be written to the input data again.
  // The following flag indicates where to copy approx_final data
  //   - 0 is input, 1 is output
  // NOTE(review): the code below copies to d_idata when the flag is non-zero
  // and to d_odata when it is zero, which is the opposite of the legend above.
  int approx_is_input;

  // allocate device mem
  // NOTE(review): the malloc() results are used without a NULL check, and no
  // free() of these three buffers is visible in this function.
  const unsigned int smem_size = sizeof(DATATYPE) * slength;
  d_idata = (DATATYPE*) malloc(smem_size);
  d_odata = (DATATYPE*) malloc(smem_size);
  approx_final = (DATATYPE*) malloc(smem_size);
  memcpy(d_idata, signal, smem_size);

  // clear result memory
  DATATYPE* tmp = (DATATYPE*) malloc( smem_size);
  int i;
  for (i = 0; i < slength; ++i)
    tmp[i] = 0.0;
  memcpy(d_odata, tmp, smem_size);
  free( tmp);

  // total number of threads
  // in the first decomposition step always one thread computes the average and
  // detail signal for one pair of adjacent values
  unsigned int num_threads_total_left = slength / 2;
  // decomposition levels performed in the current / next step
  unsigned int  dlevels_step = dlevels_complete;

  // 1D signal so the arrangement of elements is also 1D
  dim3  block_size;
  dim3  grid_size;

  // number of decomposition levels left after one iteration on the device
  unsigned int dlevels_left = dlevels_complete;

  // if less or equal 1k elements, then the data can be processed in one block,
  // this avoids the Wait-For-Idle (WFI) on host side which is necessary if the
  // computation is split accross multiple SM's if enough input data
  if( dlevels_complete <= 10) {
    // decomposition can be performed at once
    // NOTE(review): grid_size.x is not assigned on this path.
    block_size.x = num_threads_total_left;
    approx_is_input = 0;
  } else {
    // 512 threads per block
    grid_size.x = (num_threads_total_left / 512);
    block_size.x = 512;

    // 512 threads corresponds to 10 decomposition steps
    dlevels_step = 10;
    dlevels_left -= 10;
    approx_is_input = 1;
  grid_size.y = 1;
  grid_size.z = 1;
  block_size.y = 1;
  block_size.z = 1;

  // do until full decomposition is accomplished
  while( 0 != num_threads_total_left) {
    // run kernel
    dwtHaar1D(d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left, block_size.x, grid_size, block_size, 1, 0);

    // Copy approx_final to appropriate location
    // NOTE(review): the byte count uses a literal 4 although the element
    // type is DATATYPE; presumably sizeof(DATATYPE) is meant - confirm.
    if (approx_is_input) {
      memcpy(d_idata, approx_final, grid_size.x*4);
    } else {
      memcpy(d_odata, approx_final, grid_size.x*4);

    // update level variables
    if( dlevels_left < 10) {
      // approx_final = d_odata;
      approx_is_input = 0;

    // more global steps necessary
    // NOTE(review): when dlevels_left > 10 the next step performs
    // dlevels_left - 10 levels rather than 10 - confirm this is intended.
    // dlevels_left is unsigned, so the subtraction below wraps around once
    // fewer than 10 levels remain.
    dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;
    dlevels_left -= 10;

    // after each step only half the threads are used any longer
    // therefore after 10 steps 2^10 less threads
    num_threads_total_left = num_threads_total_left >> 10;

    // update block and grid size
    // NOTE(review): `?:` binds weaker than `+`, so the expression below is
    // evaluated as ((n / 512) + (0 != n % 512)) ? 1 : 0 and therefore yields
    // only 0 or 1, not a rounded-up block count.
    grid_size.x = (num_threads_total_left / 512)
      + (0 != (num_threads_total_left % 512)) ? 1 : 0;
    if( grid_size.x <= 1) {
      block_size.x = num_threads_total_left;

  // load the reference solution
  unsigned int len_reference = 262144;
  DATATYPE* reference = NULL;
  if (r_gold_fname == 0) {
    fprintf(stderr, "Cannot read the file containing the reference result of the wavelet decomposition.\n%s", usage);
  if (readFile(r_gold_fname, &reference) == 1)
    printf("Reading reference result from %s\n", r_gold_fname);
  else {
  // both lengths are the same hard-coded constant in this function
  assert(slength == len_reference);

  //compare the computed solution and the reference
  int res = compareData(reference, d_odata, slength, 0.001f);
  printf("%s\n", (1 == res) ? "PASSED." : "FAILED.");
Ejemplo n.º 11
int main(int argc, char **argv)
  CUdeviceptr  d_data0   = 0;
  CUdeviceptr  d_data1   = 0;
  DataStruct *h_data0  = 0;
  DataStruct *h_data1  = 0;
  DataStruct h_data_reference0;
  DataStruct h_data_reference1;
  unsigned int memSize = sizeof(DataStruct);
  //device references
  CUcontext    hContext = 0;
  CUdevice     hDevice  = 0;
  CUmodule     hModule  = 0;
  CUstream     hStream  = 0;

  // Initialize the device and get a handle to the kernel
  CUresult status = initialize(0, &hContext, &hDevice, &hModule, &hStream);
  // Allocate memory on host and device
  if ((h_data0 = (DataStruct *)malloc(memSize)) == NULL)
      std::cerr << "Could not allocate host memory" << std::endl;
  status = cuMemAlloc(&d_data0, memSize);

  if ((h_data1 = (DataStruct *)malloc(memSize)) == NULL)
      std::cerr << "Could not allocate host memory" << std::endl;
  status = cuMemAlloc(&d_data1, memSize);
  if (status != CUDA_SUCCESS)
    printf("ERROR: during cuMemAlloc\n");

  //======================= test cases ========================================//
  std::string name = "";
  unsigned int testnum=0;
  unsigned int passed=0;

  /////////////////////// Ralf ///////////////////////////////////////////////////

  if(runRalfFunction("test_phi_scalar", test_phi_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi2_scalar", test_phi2_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi3_scalar", test_phi3_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi4_scalar", test_phi4_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi5_scalar", test_phi5_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi6_scalar", test_phi6_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi7_scalar", test_phi7_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi8_scalar", test_phi8_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_phi9_scalar", test_phi9_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))

  if(runRalfFunction("test_loopbad_scalar", test_loopbad_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_loop23_scalar", test_loop23_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
  if(runRalfFunction("test_loop13_scalar", test_loop13_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))

  /*///////////////*/ name = "test_GetElementPointer_constant"; /////////////////////

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_constant(&h_data_reference0);                //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  /*///////////////*/ name = "test_calculate"; /////////////////////
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = 3.2;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_calculate(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_parquetShader"; /////////////////////
  h_data0->f = h_data_reference0.f = 1;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_parquetShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_GetElementPointer_dyn"; /////////////////////
  h_data0->i = h_data_reference0.i = 3;
  h_data0->u = h_data_reference0.u = 7;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_dyn(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_branch_simple"; // Branch 1 /////////////////
  h_data0->f = h_data_reference0.f = -4;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_branch_simple"; // Branch 2 /////////////////
  h_data0->f = h_data_reference0.f = 8;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  /*///////////////*/ name = "test_branch_simplePHI"; // Branch 1 /////////////////
  h_data0->f = h_data_reference0.f = -10;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simplePHI(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_branch_loop"; //////////////////////////////////
  h_data0->i = h_data_reference0.i = 100;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_math"; //////////////////////////////////////////
  h_data0->f = h_data_reference0.f = 1.4;
  h_data0->i = h_data_reference0.i = 3;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_math(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_signedOperands"; //////////////////////////////////////////
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -7;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_signedOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_constantOperands"; //////////////////////////////////////////
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -1.44;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  /*///////////////*/ name = "test_branch_loop_semihard"; /////////////////////////
  h_data0->i = h_data_reference0.i = 10;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_semihard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_branch_loop_hard"; // Branch 1 /////////////////
  h_data0->i = h_data_reference0.i = 1;
  h_data0->u = h_data_reference0.u = 3;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  /*////////////*/ name = "test_branch_loop_hard"; // Branch 2 /////////////////
  h_data0->i = h_data_reference0.i = 7;
  h_data0->u = h_data_reference0.u = 10;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  /*///////////////*/ name = "test_binaryInst"; /////////////////////////
  h_data0->i = h_data_reference0.i = 5;
  h_data0->f = h_data_reference0.f = -121.23;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_binaryInst(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_selp"; /////////////////////////
  h_data0->i = h_data_reference0.i = -15;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_selp(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_GetElementPointer_complicated"; /////////////////////////
  h_data0->i = h_data_reference0.i = 1;
  h_data_reference0.s.s.f = h_data0->s.s.f = 3.11;
  h_data_reference0.s.sa[2].f = h_data0->s.sa[2].f = -4.32;
  h_data_reference0.s.sa[h_data0->i].f = h_data0->s.sa[h_data0->i].f = 111.3;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_call"; /////////////////////////
  h_data0->i = h_data_reference0.i = 10;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_call(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*/////////////*/ name = "test_alloca"; /////////////////////////
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = -3.23;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_alloca_complicated"; /////////////////////////
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = 23.213;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_globalVariables"; /////////////////////////
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_globalVariables(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_specialRegisters_x"; /////////////////////////
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_specialRegisters_y"; /////////////////////////
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_dualArgument"; /////////////////////////
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunDualTestFunction(&hModule, name, d_data0, d_data1, h_data0, h_data1, memSize);   //run device function

  test_dualArgument(&h_data_reference0,&h_data_reference1);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  if(compareData(h_data1,&h_data_reference1))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;  testnum++;

  /*///////////////*/ name = "test_vector"; /////////////////////////

  h_data0->fa[0] = h_data_reference0.fa[0] = 0.43f;
  h_data0->fa[1] = h_data_reference0.fa[1] = 0.234f;
  h_data0->fa[2] = h_data_reference0.fa[2] = 12893.f;
  h_data0->fa[3] = h_data_reference0.fa[3] = 13.33f;
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_vector(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_reg2Const"; /////////////////////////

  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_reg2Const(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_constantMemory"; /////////////////////////

  h_data0->fa[0] = __ptx_constant_data_global.fa[0] = 0.2348f;
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);


  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantMemory(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_sharedMemory"; /////////////////////////

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2] = i;

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2+1] = -i;

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 32,1,1, 1,1);   //run device function

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i] = i;
  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i+32] = -i;
  //  runHostTestFunction(test_sharedMemory, &h_data_reference0, 16,1,1, 1,1);                                 //run host reference

  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}

  /*///////////////*/ name = "test_lightShader"; /////////////////////////

  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function

  test_lightShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}

  //======================= test cases END ====================================//

  // Check the result
  std::cout << "\nPASSED " << passed << " tests" << std::endl;
  std::cout << "FAILED " << (testnum-passed) << " tests" << std::endl;

  // Cleanup
  if (d_data0)
      d_data0 = 0;
  if (d_data1)
      d_data1 = 0;
  if (h_data0)
      h_data0 = 0;
  if (h_data1)
      h_data1 = 0;
  if (hModule)
      hModule = 0;
  if (hStream)
      hStream = 0;
  if (hContext)
      hContext = 0;
  return 0;
Ejemplo n.º 12
 /// Compares this node with @p other by key and by data.
 ///
 /// @param other node to compare against.
 /// @return 1 when both compareKey(other) and compareData(other) evaluate
 ///         to true, 0 otherwise. compareData() is only evaluated when the
 ///         key comparison already succeeded (short-circuit &&).
 inline int compareNode(const BasicPtree & other) const{
     return (compareKey(other) && compareData(other));
 }
Ejemplo n.º 13
inline bool
sdkComparePPM(const char *src_file, const char *ref_file,
              const float epsilon, const float threshold, bool verboseErrors)
    unsigned char *src_data, *ref_data;
    unsigned long error_count = 0;
    unsigned int ref_width, ref_height;
    unsigned int src_width, src_height;

    if (src_file == NULL || ref_file == NULL)
        if (verboseErrors)
            std::cerr << "PPMvsPPM: src_file or ref_file is NULL.  Aborting comparison\n";

        return false;

    if (verboseErrors)
        std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
        std::cerr << ">         (b)reference: <" << ref_file << ">\n";

    if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true)
        if (verboseErrors)
            std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n";

        return false;

    if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true)
        std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n";
        return false;

    if (src_height != ref_height || src_width != ref_width)
        if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width <<
                                         "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";

    if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width <<
                                     "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";

    if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false)

    if (error_count == 0)
        if (verboseErrors)
            std::cerr << "    OK\n\n";
        if (verboseErrors)
            std::cerr << "    FAILURE!  "<<error_count<<" errors...\n\n";

    return (error_count == 0)? true : false;  // returns true if all pixels pass