Example #1
void* run_tests(void* arg)
{
  memtest_control_t* mc = (memtest_control_t*)arg;
  struct timeval t0, t1;
  unsigned int pass = 0;
  int i;

  while(1){
    for (i = 0; i < DIM(cuda_memtests); i++){
      if (cuda_memtests[i].enabled){
        PRINTF("%s\n", cuda_memtests[i].desc);
        gettimeofday(&t0, NULL);
        cuda_memtests[i].func(mc);
        gettimeofday(&t1, NULL);
        PRINTF("Test%d finished in %.1f seconds\n", i, TDIFF(t1, t0));
      }
    }

    /* num_passes <= 0 means run until interrupted */
    if (num_passes <= 0){
      continue;
    }

    pass++;
    if (pass >= num_passes){
      break;
    }
  }
  return NULL;
}
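
The gettimeofday()-based examples on this page rely on a TDIFF macro whose definition is not shown. A minimal sketch consistent with Example #1 (elapsed seconds as a double, first argument minus second) is given below; note that the argument order is not consistent across the codebases quoted here, so treat this as an illustration, not the original macro.

#include <sys/time.h>

/* Hypothetical definition, not from the original sources: elapsed
 * seconds between two gettimeofday() samples, computed as a - b. */
#define TDIFF(a, b) \
  ((double)((a).tv_sec - (b).tv_sec) + ((a).tv_usec - (b).tv_usec) / 1e6)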
Example #2
/* Measure insertions. */
static void ins(void) {
    skiplist *sl = skiplist_new(intptr_cmp, NULL, NULL);

    TIME(pre);
    for (intptr_t i=0; i < lim; i++) {
        skiplist_add(sl, (void *) i, (void *) i);
    }
    TIME(post);

    TDIFF();
    skiplist_free(sl, NULL, NULL);
}
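
Examples #2 through #5 come from a skiplist micro-benchmark and use a different pair of macros: TIME(name) samples the clock into a local variable, and TDIFF() reports the span between the pre and post samples. The real definitions are not shown; a plausible sketch, assuming that naming scheme:

#include <stdio.h>
#include <sys/time.h>

/* Hypothetical sketch: TIME(pre)/TIME(post) declare and fill local
 * timestamps; TDIFF() prints the elapsed seconds between them. */
#define TIME(name) \
  struct timeval timer_##name; gettimeofday(&timer_##name, NULL)
#define TDIFF() \
  printf("%.3f sec\n", \
         (double)(timer_post.tv_sec - timer_pre.tv_sec) + \
         (timer_post.tv_usec - timer_pre.tv_usec) / 1e6)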
Example #3
/* Measure overwrites: keys repeat mod (lim / 2), so later sets replace earlier ones. */
static void set(void) {
    skiplist *sl = skiplist_new(intptr_cmp, NULL, NULL);

    TIME(pre);
    for (intptr_t i=0; i < lim; i++) {
        intptr_t k = i % (lim / 2);
        skiplist_set(sl, (void *) k, (void *) k, NULL);
    }
    TIME(post);

    TDIFF();
    skiplist_free(sl, NULL, NULL);
}
Example #4
/* Measure getting _nonexistent_ values (lookup failure). */
static void get_nonexistent(void) {
    skiplist *sl = skiplist_new(intptr_cmp, NULL, NULL);

    for (intptr_t i=0; i < lim; i++) {
        skiplist_add(sl, (void *) i, (void *) i);
    }

    TIME(pre);
    for (intptr_t i=0; i < lim; i++) {
        intptr_t k = (i * largeish_prime) + lim;   /* outside the inserted 0..lim-1 range */
        intptr_t v = 0;
        skiplist_get(sl, (void *) k, (void **)&v);
        assert(v == 0);
        if (0) { printf("%ld %ld\n", (long)k, (long)v); }
    }
    TIME(post);

    TDIFF();
    skiplist_free(sl, NULL, NULL);
}
Example #5
/* Measure a set phase followed by a get phase under a single timer. */
static void set_and_get(void) {
    skiplist *sl = skiplist_new(intptr_cmp, NULL, NULL);

    TIME(pre);
    for (intptr_t i=0; i < lim; i++) {
        intptr_t k = i % (lim / 2);
        skiplist_set(sl, (void *) k, (void *) k, NULL);
    }

    for (intptr_t i=0; i < lim; i++) {
        intptr_t k = (i * largeish_prime) % (lim / 2);
        intptr_t v = (intptr_t)0;
        skiplist_get(sl, (void *) k, (void **)&v);
        if (0) { printf("%ld %ld\n", (long)k, (long)v); }
        assert(v == k);
    }
    TIME(post);

    TDIFF();
    skiplist_free(sl, NULL, NULL);
}
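
A hypothetical driver for the four benchmarks above, to show how they fit together. The globals lim, largeish_prime, and the intptr_cmp comparator are assumptions of the snippets; the values here are illustrative only:

#include <stdint.h>

static intptr_t lim = 1000000;            /* number of operations */
static intptr_t largeish_prime = 104729;  /* scrambles key order */

int main(void) {
    ins();             /* Example #2: insertions */
    set();             /* Example #3: overwrites */
    get_nonexistent(); /* Example #4: failed lookups */
    set_and_get();     /* Example #5: set phase + get phase */
    return 0;
}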
Example #6
int hip_xmlrpc_rm(struct dht_val *v)
{
  struct timeval now;
  struct sockaddr_storage server;
  int mode = XMLRPC_MODE_RM | XMLRPC_MODE_RETRY_OFF;
  int err, key_len, value_len, secret_len, rm_ttl;

  if (!v)
    {
      return(-1);
    }
  if (hip_dht_select_server(SA(&server)) < 0)
    {
      return(-1);
    }

  gettimeofday(&now, NULL);
  rm_ttl = TDIFF(v->expire_time, now);
  if (rm_ttl <= 0)
    {
      return(0);
    }

  key_len = sizeof(v->key);
  value_len = v->value_hash_len;
  secret_len = v->secret_len;

  err = hip_xmlrpc_getput(mode, v->app, SA(&server),
                          (char *)v->key, key_len,
                          (char *)v->value_hash,  &value_len,
                          (char *)v->secret, secret_len, rm_ttl);
  if (err == 0)
    {
      log_(NORM, "Removed a %s value from the DHT.\n", v->app);
    }
  return(err);
}
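
Examples #6 and #7 come from a HIP DHT client. Here TDIFF evidently returns whole seconds (it feeds an integer TTL), and the mode word packs the operation into the low nibble and the retry behavior into the next, as the mode & 0x000F / mode & 0x00F0 masks in Example #7 show. The real constants live in the project's headers; hypothetical values consistent with that masking:

/* Hypothetical values, chosen only to match the nibble masks used in
 * hip_xmlrpc_getput(); the project's headers define the real ones. */
#define XMLRPC_MODE_GET       0x0001
#define XMLRPC_MODE_PUT       0x0002
#define XMLRPC_MODE_RM        0x0003
#define XMLRPC_MODE_RETRY_OFF 0x0010
#define XMLRPC_MODE_RETRY_ON  0x0020

/* Hypothetical integer-seconds TDIFF matching its use as a TTL here. */
#define TDIFF(a, b) ((int)((a).tv_sec - (b).tv_sec))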
Example #7
/**
 * \fn hip_xmlrpc_getput()
 *
 * \param mode		selects the operation (get, put, or rm) and the
 *			retry behavior; if retry is off only one attempt
 *			is made, if on the connect() keeps retrying
 * \param app		string to use in the XML RPC application field
 * \param server	server address and port to connect to
 * \param key           DHT key used for get or put
 * \param key_len	length of DHT key in bytes
 * \param value		DHT value used for put, ptr for storing value for get
 * \param value_len	ptr to length of value buffer, length of get is returned
 * \param secret	secret value used to make put removable
 * \param secret_len	length of secret value
 * \param ttl		time to live in seconds
 *
 * \brief Perform the XML RPC GET, PUT, and RM operations.
 */
int hip_xmlrpc_getput(int mode, char *app, struct sockaddr *server,
                      char *key, int key_len, char *value, int *value_len,
                      char *secret, int secret_len, int ttl)
{
  xmlDocPtr doc = NULL;
  xmlNodePtr root_node = NULL, node;
  int len = 0, s, retval = 0;
  char buff[2048], oper[14];
  unsigned char key64[2 * DHT_KEY_SIZE], val64[2 * DHT_VAL_SIZE];
  unsigned char tmp[2 * DHT_VAL_SIZE], *xmlbuff = NULL;
  fd_set read_fdset;
  struct timeval timeout, now;
  char *p;
  unsigned int retry_attempts = 0;
  struct sockaddr_in src_addr;
  struct dht_val *dv, rm;
  SHA_CTX c;
  __u8 secret_hash[SHA_DIGEST_LENGTH], value_hash[SHA_DIGEST_LENGTH];
  int rm_ttl = 0, value_hash_len;

  int retry = ((mode & 0x00F0) == XMLRPC_MODE_RETRY_ON);

  if ((key_len > (2 * DHT_KEY_SIZE)) ||
      (*value_len > (2 * DHT_VAL_SIZE)))
    {
      return(-1);
    }

  /*
   * support for removable puts
   */
  memset(&rm, 0, sizeof(struct dht_val));
  if ((mode & 0x000F) == XMLRPC_MODE_PUT)
    {
      /*
       * produce hashes of the secret and the value, for later removal
       */
      SHA1_Init(&c);
      SHA1_Update(&c, value, *value_len);
      SHA1_Final(value_hash, &c);
      SHA1_Init(&c);
      SHA1_Update(&c, secret, secret_len);
      SHA1_Final(secret_hash, &c);

      /*
       * check if we already published a record with this key; record
       * this new secret value and value_hash
       */
      pthread_mutex_lock(&dht_vals_lock);
      gettimeofday(&now, NULL);
      dv = lookup_dht_val(key);
      if (dv)
        {
          /* save old secret so we can remove it later below */
          memcpy(&rm, dv, sizeof(struct dht_val));
          /* any time left for removing the old record? */
          rm_ttl = TDIFF(rm.expire_time, now);
        }
      else
        {
          dv = insert_dht_val(key);
        }
      strncpy(dv->app, app, sizeof(dv->app) - 1);
      dv->app[sizeof(dv->app) - 1] = '\0';      /* strncpy may not terminate */
      dv->value_hash_len = SHA_DIGEST_LENGTH;
      memcpy(dv->value_hash, value_hash, SHA_DIGEST_LENGTH);
      dv->secret_len = secret_len;
      memcpy(dv->secret, secret, secret_len);
      dv->expire_time.tv_usec = now.tv_usec;
      dv->expire_time.tv_sec = now.tv_sec + ttl;
      pthread_mutex_unlock(&dht_vals_lock);
    }

  switch (mode & 0x000F)
    {
    case XMLRPC_MODE_PUT:
      sprintf(oper, "put_removable");
      break;
    case XMLRPC_MODE_GET:
      sprintf(oper, "get");
      break;
    case XMLRPC_MODE_RM:
      sprintf(oper, "rm");
      break;
    default:
      log_(WARN, "Invalid XMLRPC mode given to DHT.\n");
      return(-1);
    }

  /*
   * create a new XML document
   */
  doc = xmlNewDoc(BAD_CAST "1.0");
  root_node = xmlNewNode(NULL, BAD_CAST "methodCall");
  xmlDocSetRootElement(doc, root_node);
  node = xmlNewChild(root_node, NULL, BAD_CAST "methodName",
                     BAD_CAST oper);
  node = xmlNewChild(root_node, NULL, BAD_CAST "params", NULL);
  memset(tmp, 0, sizeof(tmp));
  memcpy(tmp, key, key_len);
  EVP_EncodeBlock(key64, tmp, key_len);
  xml_new_param(node, "base64", (char *)key64);                 /* key */
  /* log_(NORM, "Doing %s using key(%d)=",
   *    ((mode & 0x000F)==XMLRPC_MODE_PUT) ? "PUT":"GET", key_len);
   *  print_hex(key, key_len);
   *  log_(NORM, " [%s]\n", key64); // */
  switch (mode & 0x000F)
    {
    case XMLRPC_MODE_PUT:
      memset(tmp, 0, sizeof(tmp));
      memcpy(tmp, value, *value_len);
      EVP_EncodeBlock(val64, tmp, *value_len);
      xml_new_param(node, "base64", (char *)val64);             /* value */
      xml_new_param(node, "string", "SHA");                     /* hash type */
      memset(tmp, 0, sizeof(tmp));
      memcpy(tmp, secret_hash, SHA_DIGEST_LENGTH);
      EVP_EncodeBlock(val64, tmp, SHA_DIGEST_LENGTH);
      xml_new_param(node, "base64", (char *)val64);            /* secret_hash */
      sprintf((char *)tmp, "%d", ttl);
      xml_new_param(node, "int", (char *)tmp);                  /* lifetime */
      break;
    case XMLRPC_MODE_GET:
      xml_new_param(node, "int", "10");                 /* maxvals */
      xml_new_param(node, "base64", "");                /* placemark */
      memset(value, 0, *value_len);
      break;
    case XMLRPC_MODE_RM:
      memset(tmp, 0, sizeof(tmp));
      memcpy(tmp, value_hash, SHA_DIGEST_LENGTH);
      EVP_EncodeBlock(val64, tmp, SHA_DIGEST_LENGTH);
      xml_new_param(node, "base64", (char *)val64);             /* value_hash */
      xml_new_param(node, "string", "SHA");                     /* hash type */
      memset(tmp, 0, sizeof(tmp));
      memcpy(tmp, secret, secret_len);
      EVP_EncodeBlock(val64, tmp, secret_len);
      xml_new_param(node, "base64", (char *)val64);             /* secret */
      sprintf((char *)tmp, "%d", ttl);
      xml_new_param(node, "int", (char *)tmp);                  /* lifetime */
    }
  xml_new_param(node, "string", app);                   /* app */
  xmlDocDumpFormatMemory(doc, &xmlbuff, &len, 0);

  /*
   * Build an HTTP POST and transmit to server
   */
  memset(buff, 0, sizeof(buff));
  build_http_post_header(buff, len, server);       /* len is XML length above */
  memcpy(&buff[strlen(buff)], xmlbuff, len);
  xmlFree(xmlbuff);
  len = strlen(buff) + 1;
connect_retry:
  /* Connect and send the XML RPC */
  if ((s = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0)
    {
      log_(WARN, "DHT connect - socket error: %s\n", strerror(errno));
      retval = -1;
      goto putget_exit;
    }
  /* Use the preferred address as source */
  memset(&src_addr, 0, sizeof(src_addr));
  src_addr.sin_family = AF_INET;
  src_addr.sin_addr.s_addr = get_preferred_addr();
  if (!src_addr.sin_addr.s_addr)
    {
      log_(NORM, "No preferred address, deferring DHT!\n");
      retval = -1;
      goto putget_exit;          /* close the socket before returning */
    }
  log_(NORM, "Using source address of %s for DHT %s.\n",
       logaddr(SA(&src_addr)), oper);
  fflush(stdout);
  if (bind(s, SA(&src_addr), SALEN(&src_addr)) < 0)
    {
      log_(WARN, "DHT connect - bind error: %s\n", strerror(errno));
    }

  if (g_state != 0)
    {
      retval = -1;
      goto putget_exit;          /* close the socket before returning */
    }
  if (retry && (retry_attempts > 0))
    {
      /* quit after a certain number of retries */
      if (retry_attempts >= HCNF.max_retries)
        {
          retval = -2;
          goto putget_exit;
        }
      /* wait packet_timeout seconds before retrying */
      hip_sleep(HCNF.packet_timeout);
    }
  retry_attempts++;

  if (connect(s, server, SALEN(server)) < 0)
    {
      log_(WARN, "DHT server connect error: %s\n", strerror(errno));
      closesocket(s);
#ifdef __WIN32__
      errno = WSAGetLastError();
      if (retry && ((errno == WSAETIMEDOUT) ||
                    (errno == WSAENETUNREACH)))
        {
          goto connect_retry;
        }
#else
      if (retry &&
          ((errno == ETIMEDOUT) || (errno == EHOSTUNREACH)))
        {
          goto connect_retry;
        }
#endif
      retval = -3;
      goto putget_exit;
    }

  if (send(s, buff, len, 0) != len)
    {
      log_(WARN, "DHT sent incorrect number of bytes\n");
      retval = -4;
      goto putget_exit;
    }
  xmlFreeDoc(doc);
  doc = NULL;

  /*
   * Receive XML RPC response from server
   */
  FD_ZERO(&read_fdset);
  FD_SET((unsigned int)s, &read_fdset);
  /* use longer timeout when retry==TRUE, because we have own thread */
  if (retry)
    {
      timeout.tv_sec = 3;
      timeout.tv_usec = 0;
    }
  else
    {
      timeout.tv_sec = 0;
      timeout.tv_usec = 300000;           /* 300ms */
    }
  if (select(s + 1, &read_fdset, NULL, NULL, &timeout) < 0)
    {
      log_(WARN, "DHT select error: %s\n", strerror(errno));
      retval = -5;
      goto putget_exit;
    }
  else if (FD_ISSET(s, &read_fdset))
    {
      if ((len = recv(s, buff, sizeof(buff) - 1, 0)) <= 0)
        {
          log_(WARN, "DHT error receiving from server: %s\n",
               strerror(errno));
          retval = -6;
          goto putget_exit;
        }
      buff[len] = '\0';           /* terminate before string parsing */
      if (strncmp(buff, "HTTP", 4) != 0)
        {
          retval = -7;            /* exit path closes the socket */
          goto putget_exit;
        }
      if ((p = strstr(buff, "Content-Length: ")) == NULL)
        {
          retval = -8;
          goto putget_exit;
        }
      /* advance ptr past "Content-Length: " */
      p += 16;
      sscanf(p, "%d", &len);
      p = strchr(p, '\n') + 3;           /* advance to end of line */
      retval = hip_xmlrpc_parse_response(mode, p, len,
                                         value, value_len);
      log_(NORM, "DHT server responded with return code %d (%s).\n",
           retval, hip_xmlrpc_resp_to_str(retval));
    }
  else
    {
      /* select timeout */
      if (retry)             /* XXX testme: retry select instead? */
        {
          goto connect_retry;
        }
      retval = -9;
    }

putget_exit:
#ifdef __WIN32__
  closesocket(s);
#else
  close(s);
#endif
  if (doc != NULL)
    {
      xmlFreeDoc(doc);
    }
  if (rm_ttl > 0)
    {
      value_hash_len = sizeof(rm.value_hash);
      hip_xmlrpc_getput(((mode & 0x00F0) | XMLRPC_MODE_RM),
                        app, server, key, key_len,
                        (char *)rm.value_hash, &value_hash_len,
                        (char *)rm.secret, rm.secret_len, rm_ttl);
    }
  return(retval);
}
Example #8
static int
unitarize_link_test()
{

  QudaGaugeParam qudaGaugeParam = newQudaGaugeParam();

  initQuda(0);

  cpu_prec = prec;
  gSize = cpu_prec;   /* QudaPrecision enum values double as byte sizes */
  qudaGaugeParam.anisotropy = 1.0;
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);
  
  QudaPrecision link_prec = QUDA_SINGLE_PRECISION;
  QudaReconstructType link_recon = QUDA_RECONSTRUCT_NO;

  qudaGaugeParam.cpu_prec  = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.type = QUDA_WILSON_LINKS;


  hisq::fermion_force::hisqForceInitCuda(&qudaGaugeParam);
  
  qudaGaugeParam.t_boundary  	   = QUDA_PERIODIC_T;
  qudaGaugeParam.anisotropy  	   = 1.0;
  qudaGaugeParam.cuda_prec_sloppy   = prec;
  qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  qudaGaugeParam.gauge_fix   	   = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad      	   = 0;
  qudaGaugeParam.packed_size 	   = 0;
  qudaGaugeParam.gaugeGiB    	   = 0;
  qudaGaugeParam.flag              = false;

   
  qudaGaugeParam.cpu_prec = cpu_prec;
  qudaGaugeParam.cuda_prec = prec;
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.type=QUDA_WILSON_LINKS;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
    | QUDA_FAT_PRESERVE_GPU_GAUGE
    | QUDA_FAT_PRESERVE_COMM_MEM;

  setFatLinkPadding(QUDA_COMPUTE_FAT_STANDARD, &qudaGaugeParam);
 
  GaugeFieldParam gParam(0, qudaGaugeParam);
  gParam.pad = 0;
  gParam.create    = QUDA_REFERENCE_FIELD_CREATE;
  gParam.link_type = QUDA_WILSON_LINKS;
  gParam.order     = QUDA_MILC_GAUGE_ORDER;
  cpuGaugeField *cpuOutLink  = new cpuGaugeField(gParam);

  gParam.pad         = 0;
  gParam.create      = QUDA_NULL_FIELD_CREATE;
  gParam.link_type   = QUDA_WILSON_LINKS;
  gParam.order       = QUDA_QDP_GAUGE_ORDER;
  gParam.reconstruct = QUDA_RECONSTRUCT_NO;
  cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam);
  cudaGaugeField *cudaULink   = new cudaGaugeField(gParam);  

  initCommonConstants(*cudaFatLink);

  void* fatlink = (void*)malloc(4*V*gaugeSiteSize*gSize);
  if(fatlink == NULL){
    errorQuda("ERROR: allocating fatlink failed\n");
  }
  
  void* sitelink[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) != cudaSuccess){
      errorQuda("ERROR: cudaMallocHost failed for sitelink[%d]\n", i);
    }
  }
  
  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);

  double act_path_coeff[6];
  act_path_coeff[0] = 0.625000;
  act_path_coeff[1] = -0.058479;
  act_path_coeff[2] = -0.087719;
  act_path_coeff[3] = 0.030778;
  act_path_coeff[4] = -0.007200;
  act_path_coeff[5] = -0.123113;


  //only record the last call's performance
  //the first one is for creating the cpu/cuda data structures
  
  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
    computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam,
			   QUDA_COMPUTE_FAT_STANDARD);
  } // gauge order is QDP_GAUGE_ORDER

  cpuOutLink->setGauge((void**)fatlink);
  cudaFatLink->loadCPUField(*cpuOutLink, QUDA_CPU_FIELD_LOCATION);

  hisq::setUnitarizeLinksConstants(unitarize_eps,
				   max_allowed_error,
				   reunit_allow_svd,
				   reunit_svd_only,
				   svd_rel_error,
				   svd_abs_error);
 
  hisq::setUnitarizeLinksPadding(0,0);

  int* num_failures_dev;
  cudaMalloc(&num_failures_dev, sizeof(int));
  cudaMemset(num_failures_dev, 0, sizeof(int));

  struct timeval t0, t1;

  gettimeofday(&t0,NULL);
  hisq::unitarizeLinksCuda(qudaGaugeParam,*cudaFatLink, cudaULink, num_failures_dev);
  cudaThreadSynchronize();
  gettimeofday(&t1,NULL);

  int num_failures=0;
  cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);

  delete cpuOutLink;
  delete cudaFatLink;
  delete cudaULink;
  for(int dir=0; dir<4; ++dir) cudaFreeHost(sitelink[dir]);
  free(fatlink);          /* matches the malloc() above */
  cudaFree(num_failures_dev);
#ifdef MULTI_GPU
  exchange_llfat_cleanup();
#endif
  endQuda();
   
  printfQuda("Unitarization time: %g ms\n", TDIFF(t0,t1)*1000); 
  return num_failures;
}
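
The gettimeofday()/TDIFF pattern above times from the host, which is why the code calls cudaThreadSynchronize() before taking the second sample. A standard alternative is CUDA events, which timestamp on the device itself; this sketch uses only stock CUDA runtime calls and is not part of the original test:

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
/* ... kernel launches to be timed ... */
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);             /* wait for the stop event */
cudaEventElapsedTime(&ms, start, stop); /* elapsed milliseconds */
cudaEventDestroy(start);
cudaEventDestroy(stop);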
Example #9
static void
llfat_test(int test)
{

  QudaGaugeParam qudaGaugeParam;
#ifdef MULTI_GPU
  void* ghost_sitelink[4];
  void* ghost_sitelink_diag[16];
#endif


  initQuda(device);

  cpu_prec = prec;
  gSize = cpu_prec;  
  qudaGaugeParam = newQudaGaugeParam();

  qudaGaugeParam.anisotropy = 1.0;

  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);

  qudaGaugeParam.cpu_prec = cpu_prec;
  qudaGaugeParam.cuda_prec = prec;
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.type=QUDA_WILSON_LINKS;
  qudaGaugeParam.reconstruct = link_recon;
  /*
     qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
     | QUDA_FAT_PRESERVE_GPU_GAUGE
     | QUDA_FAT_PRESERVE_COMM_MEM;
     */
  qudaGaugeParam.preserve_gauge =0;
  void* fatlink;
  if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for fatlink\n");
  }

  void* longlink;
  if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for longlink\n");
  } // page-locked memory

  void* sitelink[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink\n");
    }
  }

  void* sitelink_ex[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) ==  cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n");
    }
  }


  void* milc_sitelink;
  milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize);
  if(milc_sitelink == NULL){
    errorQuda("ERROR: allocating milc_sitelink failed\n");
  }

  void* milc_sitelink_ex;
  milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize);
  if(milc_sitelink_ex == NULL){
    errorQuda("Error: allocating milc_sitelink failed\n");
  }



  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);

  if(gauge_order == QUDA_MILC_GAUGE_ORDER){
    for(int i=0; i<V; ++i){
      for(int dir=0; dir<4; ++dir){
        char* src = (char*)sitelink[dir];
        memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
      }	
    }
  }

  int X1=Z[0];
  int X2=Z[1];
  int X3=Z[2];
  int X4=Z[3];

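  /* Walk the extended (halo-padded) checkerboarded volume: recover the
   * full lexicographic coordinates x1..x4 from the even/odd index, skip
   * the 2-deep halo sites in the multi-GPU build, fold interior extended
   * coordinates back onto the unextended lattice, and copy the matching
   * links in both QDP and MILC orderings. */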
  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }

    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;


    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
#ifdef MULTI_GPU
      continue;
#endif
    }



    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;

    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink[dir];
      char* dst = (char*)sitelink_ex[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);

      // milc ordering 
      memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i


  double act_path_coeff[6];
  for(int i=0;i < 6;i++){
    act_path_coeff[i]= 0.1*i;
  }


  //only record the last call's performance
  //the first one is for creating the cpu/cuda data structures
  struct timeval t0, t1;


  void** sitelink_ptr; 
  QudaComputeFatMethod method = (test) ? QUDA_COMPUTE_FAT_EXTENDED_VOLUME : QUDA_COMPUTE_FAT_STANDARD;
  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
    sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink;
  }else{
    sitelink_ptr = (test) ? (void**)milc_sitelink_ex : (void**)milc_sitelink;
  }
  void* longlink_ptr = longlink;
#ifdef MULTI_GPU
  if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation
#endif

  gettimeofday(&t0, NULL);
  /* pass the input links selected above for this gauge order / test mode */
  computeKSLinkQuda(fatlink, longlink_ptr, NULL, (void*)sitelink_ptr, act_path_coeff, &qudaGaugeParam, method);
  gettimeofday(&t1, NULL);


  double secs = TDIFF(t0,t1);

  void* fat_reflink[4];
  void* long_reflink[4];
  for(int i=0;i < 4;i++){
    fat_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(fat_reflink[i] == NULL){
      errorQuda("ERROR: allocate fat_reflink[%d] failed\n", i);
    }
    long_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(long_reflink[i] == NULL) errorQuda("ERROR: allocate long_reflink[%d] failed\n", i);
  }

  if (verify_results){

    //FIXME: we have this complication because the reference code takes coeff as float/double
    //       depending on the precision, while the GPU code always takes coeff as double
    void* coeff;
    double coeff_dp[6];
    float  coeff_sp[6];
    for(int i=0;i < 6;i++){
      coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
    }
    if(prec == QUDA_DOUBLE_PRECISION){
      coeff = coeff_dp;
    }else{
      coeff = coeff_sp;
    }
#ifdef MULTI_GPU
    int optflag = 0;
    //we need x,y,z site links in the back and forward T slice
    // so it is 3*2*Vs_t
    int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
    for(int i=0;i < 4; i++){
      ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize);
      if (ghost_sitelink[i] == NULL){
        printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i);
        exit(1);
      }
    }

    /*
       nu |     |
       |_____|
       mu
       */

    for(int nu=0;nu < 4;nu++){
      for(int mu=0; mu < 4;mu++){
        if(nu == mu){
          ghost_sitelink_diag[nu*4+mu] = NULL;
        }else{
          //the other directions
          int dir1, dir2;
          for(dir1= 0; dir1 < 4; dir1++){
            if(dir1 !=nu && dir1 != mu){
              break;
            }
          }
          for(dir2=0; dir2 < 4; dir2++){
            if(dir2 != nu && dir2 != mu && dir2 != dir1){
              break;
            }
          }
          ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
          if(ghost_sitelink_diag[nu*4+mu] == NULL){
            errorQuda("malloc failed for ghost_sitelink_diag\n");
          }

          memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
        }

      }
    }

    exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
    llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
  

    {
      int R[4] = {2,2,2,2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
      computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff);
    }
#else
    llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
    computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
#endif

  }//verify_results

  //format change for fatlink and longlink
  void* myfatlink[4];
  void* mylonglink[4];
  for(int i=0;i < 4;i++){
    myfatlink[i] = malloc(V*gaugeSiteSize*gSize);
    if(myfatlink[i] == NULL){
      printf("Error: malloc failed for myfatlink[%d]\n", i);
      exit(1);
    }
    mylonglink[i] = malloc(V*gaugeSiteSize*gSize);
    if(mylonglink[i] == NULL){
      printf("Error: malloc failed for mylonglink[%d]\n", i);
      exit(1);
    }
    memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
    memset(mylonglink[i], 0, V*gaugeSiteSize*gSize);
  }

  for(int i=0;i < V; i++){
    for(int dir=0; dir< 4; dir++){
      char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);

      src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);
    }
  }

  if (verify_results) {
    printfQuda("Checking fat links...\n");
    int res=1;
    for(int dir=0; dir<4; dir++){
      res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    }
    
    strong_check_link(myfatlink, "GPU results: ",
		      fat_reflink, "CPU reference results:",
		      V, qudaGaugeParam.cpu_prec);
    
    printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
#ifdef MULTI_GPU
    if(test){
#endif
      printfQuda("Checking long links...\n");
      res = 1;
      for(int dir=0; dir<4; ++dir){
	res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
      }
      
      strong_check_link(mylonglink, "GPU results: ",
			long_reflink, "CPU reference results:",
			V, qudaGaugeParam.cpu_prec);
      
      printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
      
#ifdef MULTI_GPU
    }else{ // !test
      printfQuda("Extended volume is required for multi-GPU long-link construction\n");
    }
#endif
  }

  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
  int flops= 61632;
#ifdef MULTI_GPU
  if(test) flops += (252*4); // long-link contribution 
#else
  flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale)
#endif

  double perf = 1.0* flops*volume/(secs*1024*1024*1024);
  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);


  for(int i=0;i < 4;i++){
    free(myfatlink[i]);
    free(mylonglink[i]);
  }

#ifdef MULTI_GPU
  if (verify_results){
    int i;
    for(i=0;i < 4;i++){
      free(ghost_sitelink[i]);
    }
    for(i=0;i <4; i++){
      for(int j=0;j <4; j++){
        if (i==j){
          continue;
        }
        free(ghost_sitelink_diag[i*4+j]);
      }
    }
  }
#endif

  for(int i=0;i < 4; i++){
    cudaFreeHost(sitelink[i]);
    cudaFreeHost(sitelink_ex[i]);
    free(fat_reflink[i]);
    free(long_reflink[i]);
  }
  cudaFreeHost(fatlink);
  cudaFreeHost(longlink);
  if(milc_sitelink) free(milc_sitelink);
  if(milc_sitelink_ex) free(milc_sitelink_ex);
#ifdef MULTI_GPU
  exchange_llfat_cleanup();
#endif
  endQuda();
}
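
Example #9 also leans on a compare_floats() helper for the verification step. Its definition is not shown; a self-contained sketch, under the assumption that it compares two link fields element-wise within a tolerance, honoring the CPU precision:

#include <math.h>

/* Hypothetical sketch of compare_floats(): returns 1 if every element
 * of a and b agrees within eps, 0 otherwise. */
static int compare_floats(const void *a, const void *b, int n,
                          double eps, QudaPrecision prec)
{
  if (prec == QUDA_DOUBLE_PRECISION) {
    const double *x = (const double*)a, *y = (const double*)b;
    for (int i = 0; i < n; i++)
      if (fabs(x[i] - y[i]) > eps) return 0;
  } else {
    const float *x = (const float*)a, *y = (const float*)b;
    for (int i = 0; i < n; i++)
      if (fabsf(x[i] - y[i]) > eps) return 0;
  }
  return 1;
}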
Example #10
/*
 * A worker thread.
 *
 * Each thread waits for the pool to be non-empty.
 * As soon as this applies, one of them succeeds in getting the lock
 * and then executes the job.
 */
static void *t_pool_worker(void *arg) {
    t_pool_worker_t *w = (t_pool_worker_t *)arg;
    t_pool *p = w->p;
    t_pool_job *j;
#ifdef DEBUG_TIME
    struct timeval t1, t2, t3;
#endif

    for (;;) {
	// Pop an item off the pool queue
#ifdef DEBUG_TIME
	gettimeofday(&t1, NULL);
#endif

	pthread_mutex_lock(&p->pool_m);

#ifdef DEBUG_TIME
	gettimeofday(&t2, NULL);
	p->wait_time += TDIFF(t2,t1);
	w->wait_time += TDIFF(t2,t1);
#endif

	// If there is something on the job list and a higher priority
	// thread waiting, let it handle this instead.
//	while (p->head && p->t_stack_top != -1 && p->t_stack_top < w->idx) {
//	    pthread_mutex_unlock(&p->pool_m);
//	    pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
//	    pthread_mutex_lock(&p->pool_m);
//	}

	while (!p->head && !p->shutdown) {
	    p->nwaiting++;

	    if (p->njobs == 0)
		pthread_cond_signal(&p->empty_c);
#ifdef DEBUG_TIME
	    gettimeofday(&t2, NULL);
#endif

#ifdef IN_ORDER
	    // Push this thread to the top of the waiting stack
	    if (p->t_stack_top == -1 || p->t_stack_top > w->idx)
		p->t_stack_top = w->idx;

	    p->t_stack[w->idx] = 1;
	    pthread_cond_wait(&w->pending_c, &p->pool_m);
	    p->t_stack[w->idx] = 0;

	    /* Find new t_stack_top */
	    {
		int i;
		p->t_stack_top = -1;
		for (i = 0; i < p->tsize; i++) {
		    if (p->t_stack[i]) {
			p->t_stack_top = i;
			break;
		    }
		}
	    }
#else
	    pthread_cond_wait(&p->pending_c, &p->pool_m);
#endif

#ifdef DEBUG_TIME
	    gettimeofday(&t3, NULL);
	    p->wait_time += TDIFF(t3,t2);
	    w->wait_time += TDIFF(t3,t2);
#endif
	    p->nwaiting--;
	}

	if (p->shutdown) {
#ifdef DEBUG_TIME
	    gettimeofday(&t3, NULL);   /* t3 is unset if we never waited */
	    p->total_time += TDIFF(t3,t1);
#endif
#ifdef DEBUG
	    fprintf(stderr, "%d: Shutting down\n", worker_id(p));
#endif
	    pthread_mutex_unlock(&p->pool_m);
	    pthread_exit(NULL);
	}

	j = p->head;
	if (!(p->head = j->next))
	    p->tail = NULL;

	if (p->njobs-- >= p->qsize)
	    pthread_cond_signal(&p->full_c);

	if (p->njobs == 0)
	    pthread_cond_signal(&p->empty_c);

	pthread_mutex_unlock(&p->pool_m);
	    
	// We have job 'j' - now execute it.
	t_pool_add_result(j, j->func(j->arg));	
#ifdef DEBUG_TIME
	pthread_mutex_lock(&p->pool_m);
	gettimeofday(&t3, NULL);
	p->total_time += TDIFF(t3,t1);
	pthread_mutex_unlock(&p->pool_m);
#endif
	memset(j, 0xbb, sizeof(*j));
	free(j);
    }

    return NULL;
}
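
For context, here is a hypothetical producer counterpart showing the handshake the worker loop assumes: enqueue under pool_m, then wake one waiter. The field and type names follow the snippet above; the real t_pool API may differ.

/* Hypothetical enqueue, mirroring the locking/signaling protocol the
 * worker expects (head/tail list, njobs count, pending_c wakeups). */
static void t_pool_enqueue(t_pool *p, t_pool_job *j) {
    pthread_mutex_lock(&p->pool_m);
    j->next = NULL;
    if (p->tail)
	p->tail->next = j;
    else
	p->head = j;
    p->tail = j;
    p->njobs++;
#ifdef IN_ORDER
    /* Wake the lowest-index waiting thread, if any */
    if (p->t_stack_top != -1)
	pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
#else
    pthread_cond_signal(&p->pending_c);
#endif
    pthread_mutex_unlock(&p->pool_m);
}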