void* run_tests(void* arg) { memtest_control_t* mc = (memtest_control_t*)arg; struct timeval t0, t1; unsigned int pass = 0; int i; while(1){ for (i = 0;i < DIM(cuda_memtests); i++){ if (cuda_memtests[i].enabled){ PRINTF("%s\n", cuda_memtests[i].desc); gettimeofday(&t0, NULL); cuda_memtests[i].func(mc); gettimeofday(&t1, NULL); PRINTF("Test%d finished in %.1f seconds\n", i, TDIFF(t1, t0)); }//if }//for if (num_passes <=0){ continue; } pass++; if (pass >= num_passes){ break; } } return NULL; }
/* Benchmark: time `lim` consecutive insertions into a fresh skiplist. */
static void ins(void)
{
    skiplist *list = skiplist_new(intptr_cmp, NULL, NULL);
    TIME(pre);
    intptr_t n = 0;
    while (n < lim) {
        skiplist_add(list, (void *)n, (void *)n);
        n++;
    }
    TIME(post);
    TDIFF();
    skiplist_free(list, NULL, NULL);
}
/* Benchmark: time `lim` skiplist_set calls over only lim/2 distinct
 * keys, so the second half of the operations overwrite existing entries. */
static void set(void)
{
    skiplist *list = skiplist_new(intptr_cmp, NULL, NULL);
    TIME(pre);
    const intptr_t distinct = lim / 2;
    for (intptr_t n = 0; n < lim; n++) {
        intptr_t key = n % distinct;
        skiplist_set(list, (void *)key, (void *)key, NULL);
    }
    TIME(post);
    TDIFF();
    skiplist_free(list, NULL, NULL);
}
/* Measure getting _nonexistent_ values (lookup failure). */ static void get_nonexistent(void) { skiplist *sl = skiplist_new(intptr_cmp, NULL, NULL); for (intptr_t i=0; i < lim; i++) { skiplist_add(sl, (void *) i, (void *) i); } TIME(pre); for (intptr_t i=0; i < lim; i++) { intptr_t k = (i * largeish_prime) + lim; intptr_t v = 0; skiplist_get(sl, (void *) k, (void **)&v); assert(v == 0); if (0) { printf("%lu %lu\n", k, v); } } TIME(post); TDIFF(); skiplist_free(sl, NULL, NULL); }
/* Measure interleaved set-then-get: populate lim/2 distinct keys (each
 * written twice), then look every key back up and check the stored value. */
static void set_and_get(void)
{
    skiplist *sl = skiplist_new(intptr_cmp, NULL, NULL);
    TIME(pre);
    for (intptr_t i = 0; i < lim; i++) {
        intptr_t k = i % (lim / 2);
        skiplist_set(sl, (void *) k, (void *) k, NULL);
    }
    for (intptr_t i = 0; i < lim; i++) {
        intptr_t k = (i * largeish_prime) % (lim / 2);
        intptr_t v = (intptr_t)0;
        skiplist_get(sl, (void *) k, (void **)&v);
        if (0) {
            /* FIX: intptr_t is signed, so "%lu" was a format-specifier
             * mismatch (UB if this debug branch is ever enabled). */
            printf("%ld %ld\n", (long) k, (long) v);
        }
        assert(v == k);   /* every key was stored with value == key */
    }
    TIME(post);
    TDIFF();
    skiplist_free(sl, NULL, NULL);
}
int hip_xmlrpc_rm(struct dht_val *v) { struct timeval now; struct sockaddr_storage server; int mode = XMLRPC_MODE_RM | XMLRPC_MODE_RETRY_OFF; int err, key_len, value_len, secret_len, rm_ttl; if (!v) { return(-1); } if (hip_dht_select_server(SA(&server)) < 0) { return(-1); } gettimeofday(&now, NULL); rm_ttl = TDIFF(v->expire_time, now); if (rm_ttl <= 0) { return(0); } key_len = sizeof(v->key); value_len = v->value_hash_len; secret_len = v->secret_len; err = hip_xmlrpc_getput(mode, v->app, SA(&server), (char *)v->key, key_len, (char *)v->value_hash, &value_len, (char *)v->secret, secret_len, rm_ttl); if (err == 0) { log_(NORM, "Removed a %s value from the DHT.\n", v->app); } return(err); }
/* * \fn hip_xmlrpc_getput() * * \param mode determines get or put, app, retry on/off * If retry is off only one attempt should be made, * on means the connect() should keep retrying * \param app string to use in the XML RPC application field * \param server server address and port to connect to * \param key DHT key used for get or put * \param key_len length of DHT key in bytes * \param value DHT value used for put, ptr for storing value for get * \param value_len ptr to length of value buffer, length of get is returned * \param secret secret value used to make put removable * \param secret_len length of secret value * \param ttl time to live in seconds * * \brief Perform the XML RPC GET, PUT, and RM operations. */ int hip_xmlrpc_getput(int mode, char *app, struct sockaddr *server, char *key, int key_len, char *value, int *value_len, char *secret, int secret_len, int ttl) { xmlDocPtr doc = NULL; xmlNodePtr root_node = NULL, node; int len = 0, s, retval = 0; char buff[2048], oper[14]; unsigned char key64[2 * DHT_KEY_SIZE], val64[2 * DHT_VAL_SIZE]; unsigned char tmp[2 * DHT_VAL_SIZE], *xmlbuff = NULL; fd_set read_fdset; struct timeval timeout, now; char *p; unsigned int retry_attempts = 0; struct sockaddr_in src_addr; struct dht_val *dv, rm; SHA_CTX c; __u8 secret_hash[SHA_DIGEST_LENGTH], value_hash[SHA_DIGEST_LENGTH]; int rm_ttl = 0, value_hash_len; int retry = ((mode & 0x00F0) == XMLRPC_MODE_RETRY_ON); if ((key_len > (2 * DHT_KEY_SIZE)) || (*value_len > (2 * DHT_VAL_SIZE))) { return(-1); } /* * support for removable puts */ memset(&rm, 0, sizeof(struct dht_val)); if ((mode & 0x000F) == XMLRPC_MODE_PUT) { /* * produce hashes of the secret and the value, for later removal */ SHA1_Init(&c); SHA1_Update(&c, value, *value_len); SHA1_Final(value_hash, &c); SHA1_Init(&c); SHA1_Update(&c, secret, secret_len); SHA1_Final(secret_hash, &c); /* * check if we already published a record with this key; record * this new secret value and value_hash */ 
pthread_mutex_lock(&dht_vals_lock); gettimeofday(&now, NULL); dv = lookup_dht_val(key); if (dv) { /* save old secret so we can remove it later below */ memcpy(&rm, &dv, sizeof(struct dht_val)); /* any time left for removing the old record? */ rm_ttl = TDIFF(rm.expire_time, now); } else { dv = insert_dht_val(key); } strncpy(dv->app, app, sizeof(dv->app)); dv->value_hash_len = SHA_DIGEST_LENGTH; memcpy(dv->value_hash, value_hash, SHA_DIGEST_LENGTH); dv->secret_len = secret_len; memcpy(dv->secret, secret, secret_len); dv->expire_time.tv_usec = now.tv_usec; dv->expire_time.tv_sec = now.tv_sec + ttl; pthread_mutex_unlock(&dht_vals_lock); } switch (mode & 0x000F) { case XMLRPC_MODE_PUT: sprintf(oper, "put_removable"); break; case XMLRPC_MODE_GET: sprintf(oper, "get"); break; case XMLRPC_MODE_RM: sprintf(oper, "rm"); break; default: log_(WARN, "Invalid XMLRPC mode given to DHT.\n"); return(-1); } /* * create a new XML document */ doc = xmlNewDoc(BAD_CAST "1.0"); root_node = xmlNewNode(NULL, BAD_CAST "methodCall"); xmlDocSetRootElement(doc, root_node); node = xmlNewChild(root_node, NULL, BAD_CAST "methodName", BAD_CAST oper); node = xmlNewChild(root_node, NULL, BAD_CAST "params", NULL); memset(tmp, 0, sizeof(tmp)); memcpy(tmp, key, key_len); EVP_EncodeBlock(key64, tmp, key_len); xml_new_param(node, "base64", (char *)key64); /* key */ /* log_(NORM, "Doing %s using key(%d)=", * ((mode & 0x000F)==XMLRPC_MODE_PUT) ? 
"PUT":"GET", key_len); * print_hex(key, key_len); * log_(NORM, " [%s]\n", key64); // */ switch (mode & 0x000F) { case XMLRPC_MODE_PUT: memset(tmp, 0, sizeof(tmp)); memcpy(tmp, value, *value_len); EVP_EncodeBlock(val64, tmp, *value_len); xml_new_param(node, "base64", (char *)val64); /* value */ xml_new_param(node, "string", "SHA"); /* hash type */ memset(tmp, 0, sizeof(tmp)); memcpy(tmp, secret_hash, SHA_DIGEST_LENGTH); EVP_EncodeBlock(val64, tmp, SHA_DIGEST_LENGTH); xml_new_param(node, "base64", (char *)val64); /* secret_hash */ sprintf((char *)tmp, "%d", ttl); xml_new_param(node, "int", (char *)tmp); /* lifetime */ break; case XMLRPC_MODE_GET: xml_new_param(node, "int", "10"); /* maxvals */ xml_new_param(node, "base64", ""); /* placemark */ memset(value, 0, *value_len); break; case XMLRPC_MODE_RM: memset(tmp, 0, sizeof(tmp)); memcpy(tmp, value_hash, SHA_DIGEST_LENGTH); EVP_EncodeBlock(val64, tmp, SHA_DIGEST_LENGTH); xml_new_param(node, "base64", (char *)val64); /* value_hash */ xml_new_param(node, "string", "SHA"); /* hash type */ memset(tmp, 0, sizeof(tmp)); memcpy(tmp, secret, secret_len); EVP_EncodeBlock(val64, tmp, secret_len); xml_new_param(node, "base64", (char *)val64); /* secret */ sprintf((char *)tmp, "%d", ttl); xml_new_param(node, "int", (char *)tmp); /* lifetime */ } xml_new_param(node, "string", app); /* app */ xmlDocDumpFormatMemory(doc, &xmlbuff, &len, 0); /* * Build an HTTP POST and transmit to server */ memset(buff, 0, sizeof(buff)); build_http_post_header(buff, len, server); /* len is XML length above */ memcpy(&buff[strlen(buff)], xmlbuff, len); xmlFree(xmlbuff); len = strlen(buff) + 1; connect_retry: /* Connect and send the XML RPC */ if ((s = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { log_(WARN, "DHT connect - socket error: %s\n", strerror(errno)); retval = -1; goto putget_exit; } /* Use the preferred address as source */ memset(&src_addr, 0, sizeof(src_addr)); src_addr.sin_family = AF_INET; src_addr.sin_addr.s_addr = 
get_preferred_addr(); if (!src_addr.sin_addr.s_addr) { log_(NORM, "No preferred address, deferring DHT!\n"); return(-1); } log_(NORM, "Using source address of %s for DHT %s.\n", logaddr(SA(&src_addr)), oper); fflush(stdout); if (bind(s, SA(&src_addr), SALEN(&src_addr)) < 0) { log_(WARN, "DHT connect - bind error: %s\n", strerror(errno)); } if (g_state != 0) { return(-1); } if (retry && (retry_attempts > 0)) { /* quit after a certain number of retries */ if (retry_attempts >= HCNF.max_retries) { retval = -2; goto putget_exit; } /* wait packet_timeout seconds before retrying */ hip_sleep(HCNF.packet_timeout); } retry_attempts++; if (connect(s, server, SALEN(server)) < 0) { log_(WARN, "DHT server connect error: %s\n", strerror(errno)); closesocket(s); #ifdef __WIN32__ errno = WSAGetLastError(); if (retry && ((errno == WSAETIMEDOUT) || (errno == WSAENETUNREACH))) { goto connect_retry; } #else if (retry && ((errno == ETIMEDOUT) || (errno == EHOSTUNREACH))) { goto connect_retry; } #endif retval = -3; goto putget_exit; } if (send(s, buff, len, 0) != len) { log_(WARN, "DHT sent incorrect number of bytes\n"); retval = -4; goto putget_exit; } xmlFreeDoc(doc); doc = NULL; /* * Receive XML RPC response from server */ FD_ZERO(&read_fdset); FD_SET((unsigned int)s, &read_fdset); /* use longer timeout when retry==TRUE, because we have own thread */ if (retry) { timeout.tv_sec = 3; timeout.tv_usec = 0; } else { timeout.tv_sec = 0; timeout.tv_usec = 300000; /* 300ms */ } if (select(s + 1, &read_fdset, NULL, NULL, &timeout) < 0) { log_(WARN, "DHT select error: %s\n", strerror(errno)); retval = -5; goto putget_exit; } else if (FD_ISSET(s, &read_fdset)) { if ((len = recv(s, buff, sizeof(buff) - 1, 0)) <= 0) { log_(WARN, "DHT error receiving from server: %s\n", strerror(errno)); retval = -6; goto putget_exit; } if (strncmp(buff, "HTTP", 4) != 0) { return(-7); } if ((p = strstr(buff, "Content-Length: ")) == NULL) { return(-8); } else /* advance ptr to Content-Length */ { p += 16; } 
sscanf(p, "%d", &len); p = strchr(p, '\n') + 3; /* advance to end of line */ retval = hip_xmlrpc_parse_response(mode, p, len, value, value_len); log_(NORM, "DHT server responded with return code %d (%s).\n", retval, hip_xmlrpc_resp_to_str(retval)); } else { /* select timeout */ if (retry) /* XXX testme: retry select instead? */ { goto connect_retry; } retval = -9; } putget_exit: #ifdef __WIN32__ closesocket(s); #else close(s); #endif if (doc != NULL) { xmlFreeDoc(doc); } if (rm_ttl > 0) { value_hash_len = sizeof(rm.value_hash); hip_xmlrpc_getput(((mode & 0x00F0) | XMLRPC_MODE_RM), app, server, key, key_len, (char *)rm.value_hash, &value_hash_len, (char *)rm.secret, secret_len, rm_ttl); } return(retval); }
static int unitarize_link_test() { QudaGaugeParam qudaGaugeParam = newQudaGaugeParam(); initQuda(0); cpu_prec = prec; gSize = cpu_prec; qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.X[0] = xdim; qudaGaugeParam.X[1] = ydim; qudaGaugeParam.X[2] = zdim; qudaGaugeParam.X[3] = tdim; setDims(qudaGaugeParam.X); QudaPrecision link_prec = QUDA_SINGLE_PRECISION; QudaReconstructType link_recon = QUDA_RECONSTRUCT_NO; qudaGaugeParam.cpu_prec = link_prec; qudaGaugeParam.cuda_prec = link_prec; qudaGaugeParam.reconstruct = link_recon; qudaGaugeParam.type = QUDA_WILSON_LINKS; hisq::fermion_force::hisqForceInitCuda(&qudaGaugeParam); qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.cuda_prec_sloppy = prec; qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; qudaGaugeParam.ga_pad = 0; qudaGaugeParam.packed_size = 0; qudaGaugeParam.gaugeGiB = 0; qudaGaugeParam.flag = false; qudaGaugeParam.cpu_prec = cpu_prec; qudaGaugeParam.cuda_prec = prec; qudaGaugeParam.gauge_order = gauge_order; qudaGaugeParam.type=QUDA_WILSON_LINKS; qudaGaugeParam.reconstruct = link_recon; qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE | QUDA_FAT_PRESERVE_GPU_GAUGE | QUDA_FAT_PRESERVE_COMM_MEM; setFatLinkPadding(QUDA_COMPUTE_FAT_STANDARD, &qudaGaugeParam); GaugeFieldParam gParam(0, qudaGaugeParam); gParam.pad = 0; gParam.create = QUDA_REFERENCE_FIELD_CREATE; gParam.link_type = QUDA_WILSON_LINKS; gParam.order = QUDA_MILC_GAUGE_ORDER; cpuGaugeField *cpuOutLink = new cpuGaugeField(gParam); gParam.pad = 0; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.link_type = QUDA_WILSON_LINKS; gParam.order = QUDA_QDP_GAUGE_ORDER; gParam.reconstruct = QUDA_RECONSTRUCT_NO; cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam); cudaGaugeField *cudaULink = new cudaGaugeField(gParam); initCommonConstants(*cudaFatLink); void* fatlink = (void*)malloc(4*V*gaugeSiteSize*gSize); if(fatlink == NULL){ errorQuda("ERROR: allocating fatlink 
failed\n"); } void* sitelink[4]; for(int i=0;i < 4;i++){ cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize); if(sitelink[i] == NULL){ errorQuda("ERROR; allocate sitelink[%d] failed\n", i); } } createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1); double act_path_coeff[6]; act_path_coeff[0] = 0.625000; act_path_coeff[1] = -0.058479; act_path_coeff[2] = -0.087719; act_path_coeff[3] = 0.030778; act_path_coeff[4] = -0.007200; act_path_coeff[5] = -0.123113; //only record the last call's performance //the first one is for creating the cpu/cuda data structures if(gauge_order == QUDA_QDP_GAUGE_ORDER){ computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam, QUDA_COMPUTE_FAT_STANDARD); } // gauge order is QDP_GAUGE_ORDER cpuOutLink->setGauge((void**)fatlink); cudaFatLink->loadCPUField(*cpuOutLink, QUDA_CPU_FIELD_LOCATION); hisq::setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); hisq::setUnitarizeLinksPadding(0,0); int* num_failures_dev; cudaMalloc(&num_failures_dev, sizeof(int)); cudaMemset(num_failures_dev, 0, sizeof(int)); struct timeval t0, t1; gettimeofday(&t0,NULL); hisq::unitarizeLinksCuda(qudaGaugeParam,*cudaFatLink, cudaULink, num_failures_dev); cudaThreadSynchronize(); gettimeofday(&t1,NULL); int num_failures=0; cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost); delete cudaFatLink; delete cudaULink; for(int dir=0; dir<4; ++dir) cudaFreeHost(sitelink[dir]); cudaFree(num_failures_dev); #ifdef MULTI_GPU exchange_llfat_cleanup(); #endif endQuda(); printfQuda("Unitarization time: %g ms\n", TDIFF(t0,t1)*1000); return num_failures; }
static void llfat_test(int test) { QudaGaugeParam qudaGaugeParam; #ifdef MULTI_GPU void* ghost_sitelink[4]; void* ghost_sitelink_diag[16]; #endif initQuda(device); cpu_prec = prec; gSize = cpu_prec; qudaGaugeParam = newQudaGaugeParam(); qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.X[0] = xdim; qudaGaugeParam.X[1] = ydim; qudaGaugeParam.X[2] = zdim; qudaGaugeParam.X[3] = tdim; setDims(qudaGaugeParam.X); qudaGaugeParam.cpu_prec = cpu_prec; qudaGaugeParam.cuda_prec = prec; qudaGaugeParam.gauge_order = gauge_order; qudaGaugeParam.type=QUDA_WILSON_LINKS; qudaGaugeParam.reconstruct = link_recon; /* qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE | QUDA_FAT_PRESERVE_GPU_GAUGE | QUDA_FAT_PRESERVE_COMM_MEM; */ qudaGaugeParam.preserve_gauge =0; void* fatlink; if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for fatlink\n"); } void* longlink; if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for longlink\n"); } // page-locked memory void* sitelink[4]; for(int i=0;i < 4;i++){ if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for sitelink\n"); } } void* sitelink_ex[4]; for(int i=0;i < 4;i++){ if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n"); } } void* milc_sitelink; milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize); if(milc_sitelink == NULL){ errorQuda("ERROR: allocating milc_sitelink failed\n"); } void* milc_sitelink_ex; milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize); if(milc_sitelink_ex == NULL){ errorQuda("Error: allocating milc_sitelink failed\n"); } createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1); if(gauge_order == QUDA_MILC_GAUGE_ORDER){ for(int i=0; i<V; ++i){ for(int dir=0; 
dir<4; ++dir){ char* src = (char*)sitelink[dir]; memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize); } } } int X1=Z[0]; int X2=Z[1]; int X3=Z[2]; int X4=Z[3]; for(int i=0; i < V_ex; i++){ int sid = i; int oddBit=0; if(i >= Vh_ex){ sid = i - Vh_ex; oddBit = 1; } int za = sid/E1h; int x1h = sid - za*E1h; int zb = za/E2; int x2 = za - zb*E2; int x4 = zb/E3; int x3 = zb - x4*E3; int x1odd = (x2 + x3 + x4 + oddBit) & 1; int x1 = 2*x1h + x1odd; if( x1< 2 || x1 >= X1 +2 || x2< 2 || x2 >= X2 +2 || x3< 2 || x3 >= X3 +2 || x4< 2 || x4 >= X4 +2){ #ifdef MULTI_GPU continue; #endif } x1 = (x1 - 2 + X1) % X1; x2 = (x2 - 2 + X2) % X2; x3 = (x3 - 2 + X3) % X3; x4 = (x4 - 2 + X4) % X4; int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1; if(oddBit){ idx += Vh; } for(int dir= 0; dir < 4; dir++){ char* src = (char*)sitelink[dir]; char* dst = (char*)sitelink_ex[dir]; memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); // milc ordering memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); }//dir }//i double act_path_coeff[6]; for(int i=0;i < 6;i++){ act_path_coeff[i]= 0.1*i; } //only record the last call's performance //the first one is for creating the cpu/cuda data structures struct timeval t0, t1; void** sitelink_ptr; QudaComputeFatMethod method = (test) ? QUDA_COMPUTE_FAT_EXTENDED_VOLUME : QUDA_COMPUTE_FAT_STANDARD; if(gauge_order == QUDA_QDP_GAUGE_ORDER){ sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink; }else{ sitelink_ptr = (test) ? 
(void**)milc_sitelink_ex : (void**)milc_sitelink; } void* longlink_ptr = longlink; #ifdef MULTI_GPU if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation #endif gettimeofday(&t0, NULL); computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam, method); gettimeofday(&t1, NULL); double secs = TDIFF(t0,t1); void* fat_reflink[4]; void* long_reflink[4]; for(int i=0;i < 4;i++){ fat_reflink[i] = malloc(V*gaugeSiteSize*gSize); if(fat_reflink[i] == NULL){ errorQuda("ERROR; allocate fat_reflink[%d] failed\n", i); } long_reflink[i] = malloc(V*gaugeSiteSize*gSize); if(long_reflink[i] == NULL) errorQuda("ERROR; allocate long_reflink[%d] failed\n", i); } if (verify_results){ //FIXME: we have this compplication because references takes coeff as float/double // depending on the precision while the GPU code aways take coeff as double void* coeff; double coeff_dp[6]; float coeff_sp[6]; for(int i=0;i < 6;i++){ coeff_sp[i] = coeff_dp[i] = act_path_coeff[i]; } if(prec == QUDA_DOUBLE_PRECISION){ coeff = coeff_dp; }else{ coeff = coeff_sp; } #ifdef MULTI_GPU int optflag = 0; //we need x,y,z site links in the back and forward T slice // so it is 3*2*Vs_t int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t}; for(int i=0;i < 4; i++){ ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize); if (ghost_sitelink[i] == NULL){ printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i); exit(1); } } /* nu | | |_____| mu */ for(int nu=0;nu < 4;nu++){ for(int mu=0; mu < 4;mu++){ if(nu == mu){ ghost_sitelink_diag[nu*4+mu] = NULL; }else{ //the other directions int dir1, dir2; for(dir1= 0; dir1 < 4; dir1++){ if(dir1 !=nu && dir1 != mu){ break; } } for(dir2=0; dir2 < 4; dir2++){ if(dir2 != nu && dir2 != mu && dir2 != dir1){ break; } } ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); if(ghost_sitelink_diag[nu*4+mu] == NULL){ errorQuda("malloc failed for ghost_sitelink_diag\n"); } memset(ghost_sitelink_diag[nu*4+mu], 
0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); } } } exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag); llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff); { int R[4] = {2,2,2,2}; exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4); computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff); } #else llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff); computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff); #endif }//verify_results //format change for fatlink and longlink void* myfatlink[4]; void* mylonglink[4]; for(int i=0;i < 4;i++){ myfatlink[i] = malloc(V*gaugeSiteSize*gSize); if(myfatlink[i] == NULL){ printf("Error: malloc failed for myfatlink[%d]\n", i); exit(1); } mylonglink[i] = malloc(V*gaugeSiteSize*gSize); if(mylonglink[i] == NULL){ printf("Error: malloc failed for mylonglink[%d]\n", i); exit(1); } memset(myfatlink[i], 0, V*gaugeSiteSize*gSize); memset(mylonglink[i], 0, V*gaugeSiteSize*gSize); } for(int i=0;i < V; i++){ for(int dir=0; dir< 4; dir++){ char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize; char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize; memcpy(dst, src, gaugeSiteSize*gSize); src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize; dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize; memcpy(dst, src, gaugeSiteSize*gSize); } } if (verify_results) { printfQuda("Checking fat links...\n"); int res=1; for(int dir=0; dir<4; dir++){ res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec); } strong_check_link(myfatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); printfQuda("Fat-link test %s\n\n",(1 == res) ? 
"PASSED" : "FAILED"); #ifdef MULTI_GPU if(test){ #endif printfQuda("Checking long links...\n"); res = 1; for(int dir=0; dir<4; ++dir){ res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec); } strong_check_link(mylonglink, "GPU results: ", long_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED"); #ifdef MULTI_GPU }else{ // !test printfQuda("Extended volume is required for multi-GPU long-link construction\n"); } #endif } int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3]; int flops= 61632; #ifdef MULTI_GPU if(test) flops += (252*4); // long-link contribution #else flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale) #endif double perf = 1.0* flops*volume/(secs*1024*1024*1024); printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf); for(int i=0;i < 4;i++){ free(myfatlink[i]); } #ifdef MULTI_GPU if (verify_results){ int i; for(i=0;i < 4;i++){ free(ghost_sitelink[i]); } for(i=0;i <4; i++){ for(int j=0;j <4; j++){ if (i==j){ continue; } free(ghost_sitelink_diag[i*4+j]); } } } #endif for(int i=0;i < 4; i++){ cudaFreeHost(sitelink[i]); cudaFreeHost(sitelink_ex[i]); free(fat_reflink[i]); } cudaFreeHost(fatlink); cudaFreeHost(longlink); if(milc_sitelink) free(milc_sitelink); if(milc_sitelink_ex) free(milc_sitelink_ex); #ifdef MULTI_GPU exchange_llfat_cleanup(); #endif endQuda(); }
/* * A worker thread. * * Each thread waits for the pool to be non-empty. * As soon as this applies, one of them succeeds in getting the lock * and then executes the job. */ static void *t_pool_worker(void *arg) { t_pool_worker_t *w = (t_pool_worker_t *)arg; t_pool *p = w->p; t_pool_job *j; #ifdef DEBUG_TIME struct timeval t1, t2, t3; #endif for (;;) { // Pop an item off the pool queue #ifdef DEBUG_TIME gettimeofday(&t1, NULL); #endif pthread_mutex_lock(&p->pool_m); #ifdef DEBUG_TIME gettimeofday(&t2, NULL); p->wait_time += TDIFF(t2,t1); w->wait_time += TDIFF(t2,t1); #endif // If there is something on the job list and a higher priority // thread waiting, let it handle this instead. // while (p->head && p->t_stack_top != -1 && p->t_stack_top < w->idx) { // pthread_mutex_unlock(&p->pool_m); // pthread_cond_signal(&p->t[p->t_stack_top].pending_c); // pthread_mutex_lock(&p->pool_m); // } while (!p->head && !p->shutdown) { p->nwaiting++; if (p->njobs == 0) pthread_cond_signal(&p->empty_c); #ifdef DEBUG_TIME gettimeofday(&t2, NULL); #endif #ifdef IN_ORDER // Push this thread to the top of the waiting stack if (p->t_stack_top == -1 || p->t_stack_top > w->idx) p->t_stack_top = w->idx; p->t_stack[w->idx] = 1; pthread_cond_wait(&w->pending_c, &p->pool_m); p->t_stack[w->idx] = 0; /* Find new t_stack_top */ { int i; p->t_stack_top = -1; for (i = 0; i < p->tsize; i++) { if (p->t_stack[i]) { p->t_stack_top = i; break; } } } #else pthread_cond_wait(&p->pending_c, &p->pool_m); #endif #ifdef DEBUG_TIME gettimeofday(&t3, NULL); p->wait_time += TDIFF(t3,t2); w->wait_time += TDIFF(t3,t2); #endif p->nwaiting--; } if (p->shutdown) { #ifdef DEBUG_TIME p->total_time += TDIFF(t3,t1); #endif #ifdef DEBUG fprintf(stderr, "%d: Shutting down\n", worker_id(p)); #endif pthread_mutex_unlock(&p->pool_m); pthread_exit(NULL); } j = p->head; if (!(p->head = j->next)) p->tail = NULL; if (p->njobs-- >= p->qsize) pthread_cond_signal(&p->full_c); if (p->njobs == 0) pthread_cond_signal(&p->empty_c); 
pthread_mutex_unlock(&p->pool_m); // We have job 'j' - now execute it. t_pool_add_result(j, j->func(j->arg)); #ifdef DEBUG_TIME pthread_mutex_lock(&p->pool_m); gettimeofday(&t3, NULL); p->total_time += TDIFF(t3,t1); pthread_mutex_unlock(&p->pool_m); #endif memset(j, 0xbb, sizeof(*j)); free(j); } return NULL; }