/*
 * Aligned allocation wrapper around posix_memalign().
 *
 * memptr    - receives the address of the new allocation
 * alignment - requested alignment, forwarded unchanged
 * size      - number of bytes requested, forwarded unchanged
 *
 * Returns true when posix_memalign() reports success (0), false otherwise.
 */
bool PlatformRPI::MyMalloc(void **memptr, size_t alignment, size_t size)
{
    const int status = posix_memalign(memptr, alignment, size);

    if (status != 0)
        return false;

    return true;
}
int main(int argc, char **argv) { // Check arguments if (argc < 2) { // Default operation } else if (argc == 2) { // Check arg unsigned long arg = strtol(argv[1], NULL, 0); if (arg == 0) usage(); else loopCnt = arg; } // Error else { usage(); } printf("\nHtBcm Test\n\n"); int nau = HtBcmPrepare(&BcmReportNonce, &BcmFreeTask); printf(" Running with %d Units\n", nau); printf(" Looping %d Times\n", loopCnt); // Alloc g_pBcmTask g_pBcmTask = (CHtBcmTask **)calloc(loopCnt * nau, sizeof(CHtBcmTask)); // For each loop for (int j = 0; j < loopCnt; j += 1) { printf("\nLoop %d\n", j + 1); g_bReportNonce = false; g_BcmFreeTaskCnt = 0; // Create Task for each Unit for (int i = 0; i < nau; i++) { int cnt = i + j * nau + 1; int total = loopCnt * nau; printf("HtBcm Hash Instance %d of %d created\n", cnt, total); block_header header; BcmHex2bin(header.version, "01000000"); BcmHex2bin(header.prev_block, "81cd02ab7e569e8bcd9317e2fe99f2de44d49ab2b8851ba4a308000000000000"); BcmHex2bin(header.merkle_root, "e320b6c2fffc8d750423db8b1eb942ae710e951ed797f7affc8892b0f1fc122b"); BcmHex2bin(header.timestamp, "c7f5d74d"); BcmHex2bin(header.bits, "f2b9441a"); BcmHex2bin(header.nonce, "42a14695"); // put header in big endian format BcmEndianFlip((uint8_t *)&header, (uint8_t *)&header, 80); //RegenHash(&header); posix_memalign((void **)&g_pBcmTask[cnt - 1], 64, sizeof(CHtBcmTask) * 1); BcmCalcMidState(g_pBcmTask[cnt - 1]->m_midState, (uint8_t *)&header); memcpy(g_pBcmTask[cnt - 1]->m_data, ((uint8_t *)&header) + 64, 12); uint32_t target[8]; uint32_t bits = BcmByteSwap32(*(uint32_t *)header.bits); BcmInitTarget(target, bits); memcpy(g_pBcmTask[cnt - 1]->m_target, target + 5, 12); g_pBcmTask[cnt - 1]->m_initNonce = init_nonce; g_pBcmTask[cnt - 1]->m_lastNonce = last_nonce; HtBcmAddNewTask(g_pBcmTask[cnt - 1]); } // Force ScanHash call until all units have returned while (!g_bReportNonce || (g_BcmFreeTaskCnt < nau)) HtBcmScanHash((void *)1); for (int i = 0; i < nau; i++) { int cnt = i + j * nau + 1; 
free(g_pBcmTask[cnt - 1]); } } // All loops complete, finish up printf("%s\n", g_bError ? "FAILED" : "PASSED"); free(g_pBcmTask); HtBcmShutdown(); }
int main(int argc, char *argv[]) { int op, ret; struct iovec s_iov[IOV_CNT], r_iov[IOV_CNT]; char *s_buf, *r_buf; int align_size; int pairs, print_rate; int window_varied; int c, j; int curr_size; enum send_recv_type_e type; ctpm_Init(&argc, &argv); ctpm_Rank(&myid); ctpm_Job_size(&numprocs); /* default values */ pairs = numprocs / 2; window_size = DEFAULT_WINDOW; window_varied = 0; print_rate = 1; hints = fi_allocinfo(); if (!hints) return -1; while ((op = getopt(argc, argv, "hp:w:vr:" CT_STD_OPTS)) != -1) { switch (op) { default: ct_parse_std_opts(op, optarg, hints); break; case 'p': pairs = atoi(optarg); if (pairs > (numprocs / 2)) { print_usage(); return EXIT_FAILURE; } break; case 'w': window_size = atoi(optarg); break; case 'v': window_varied = 1; break; case 'r': print_rate = atoi(optarg); if (0 != print_rate && 1 != print_rate) { print_usage(); return EXIT_FAILURE; } break; case '?': case 'h': print_usage(); return EXIT_FAILURE; } } hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_DIRECTED_RECV; hints->mode = FI_CONTEXT | FI_LOCAL_MR; if (numprocs < 2) { if (!myid) { fprintf(stderr, "This test requires at least two processes\n"); } ctpm_Finalize(); return -1; } /* Fabric initialization */ ret = init_fabric(); if (ret) { fprintf(stderr, "Problem in fabric initialization\n"); return ret; } ret = init_av(); if (ret) { fprintf(stderr, "Problem in AV initialization\n"); return ret; } /* Data initialization */ align_size = getpagesize(); assert(align_size <= MAX_ALIGNMENT); /* Allocate page aligned buffers */ for (c = 0; c < IOV_CNT; c++) { assert(!posix_memalign(&s_iov[c].iov_base, align_size, MAX_MSG_SIZE)); assert(!posix_memalign(&r_iov[c].iov_base, align_size, MAX_MSG_SIZE)); } assert(!posix_memalign((void **)&s_buf, align_size, MAX_MSG_SIZE * IOV_CNT)); assert(!posix_memalign((void **)&r_buf, align_size, MAX_MSG_SIZE * IOV_CNT)); for (type = 0; type < FIN; type++) { if (!myid) { fprintf(stdout, HEADER); switch (type) { case SEND_RECV: 
fprintf(stdout, SEND_RECV_DESC); break; case SENDV_RECVV: fprintf(stdout, SENDV_RECVV_DESC); break; case SEND_RECVV: fprintf(stdout, SEND_RECVV_DESC); break; case SENDV_RECV: fprintf(stdout, SENDV_RECV_DESC); break; default: abort(); } if (window_varied) { fprintf(stdout, "# [ pairs: %d ] [ window size: varied ]\n", pairs); fprintf(stdout, "\n# Uni-directional Bandwidth (MB/sec)\n"); } else { fprintf(stdout, "# [ pairs: %d ] [ window size: %d ]\n", pairs, window_size); if (print_rate) { fprintf(stdout, "%-*s%*s%*s%*s\n", 10, "# Size", FIELD_WIDTH, "Iov count", FIELD_WIDTH, "MB/s", FIELD_WIDTH, "Messages/s"); } else { fprintf(stdout, "%-*s%*s%*s\n", 10, "# Size", FIELD_WIDTH, "Iov count", FIELD_WIDTH, "MB/s"); } } fflush(stdout); } if (window_varied) { int window_array[] = WINDOW_SIZES; double **bandwidth_results; int log_val = 1, tmp_message_size = MAX_MSG_SIZE; int i, j; for (i = 0; i < WINDOW_SIZES_COUNT; i++) { if (window_array[i] > window_size) { window_size = window_array[i]; } } while (tmp_message_size >>= 1) { log_val++; } bandwidth_results = (double **)malloc(sizeof(double *) * log_val); for (i = 0; i < log_val; i++) { bandwidth_results[i] = (double *)malloc(sizeof(double) * WINDOW_SIZES_COUNT); } if (!myid) { fprintf(stdout, "# "); for (i = 0; i < WINDOW_SIZES_COUNT; i++) { fprintf(stdout, " %10d", window_array[i]); } fprintf(stdout, "\n"); fflush(stdout); } for (j = 0, curr_size = 1; curr_size <= MAX_MSG_SIZE; curr_size *= 2, j++) { if (!myid) { fprintf(stdout, "%-7d", curr_size); } for (i = 0; i < WINDOW_SIZES_COUNT; i++) { for (c = 0; c < IOV_CNT; c++) { r_iov[c].iov_len = s_iov[c].iov_len = curr_size; bandwidth_results[j][i] = calc_bw(myid, pairs, window_array[i], s_iov, r_iov, c + 1, s_buf, (c + 1) * curr_size, r_buf, (c + 1) * curr_size, type); if (!myid) { fprintf(stdout, "%*d %10.*f", FIELD_WIDTH, c + 1, FLOAT_PRECISION, bandwidth_results[j][i]); } fprintf(stdout, c == IOV_CNT - 1 ? 
"\n" : ""); } } if (!myid) { fprintf(stdout, "\n"); fflush(stdout); } } if (!myid && print_rate) { fprintf(stdout, "\n# Message Rate Profile\n"); fprintf(stdout, "# "); for (i = 0; i < WINDOW_SIZES_COUNT; i++) { fprintf(stdout, " %10d", window_array[i]); } fprintf(stdout, "\n"); fflush(stdout); for (c = 0; c < IOV_CNT; c++) { for (j = 0, curr_size = 1; curr_size <= MAX_MSG_SIZE; curr_size *= 2) { fprintf(stdout, "%-7d,%*d", curr_size * (c + 1), FIELD_WIDTH, c + 1); for (i = 0; i < WINDOW_SIZES_COUNT; i++) { double rate = 1e6 * bandwidth_results[j][i] / (curr_size * (c + 1)); fprintf(stdout, " %10.2f", rate); } fprintf(stdout, "\n"); fflush(stdout); j++; } } } } else { /* Just one window size */ for (curr_size = 1; curr_size <= MAX_MSG_SIZE; curr_size *= 2) { double bw, rate; for (c = 0; c < IOV_CNT; c++) { r_iov[c].iov_len = s_iov[c].iov_len = curr_size; bw = calc_bw(myid, pairs, window_size, s_iov, r_iov, c + 1, s_buf, (c + 1) * curr_size, r_buf, (c + 1) * curr_size, type); if (!myid) { rate = 1e6 * bw / (curr_size * (c + 1)); if (print_rate) { fprintf(stdout, "%-*d%*d%*.*f%*.*f\n", 10, curr_size * (c + 1), FIELD_WIDTH, c + 1, FIELD_WIDTH, FLOAT_PRECISION, bw, FIELD_WIDTH, FLOAT_PRECISION, rate); fflush(stdout); } else { fprintf(stdout, "%-*d%*d%*.*f\n", 10, curr_size * (c + 1), FIELD_WIDTH, FIELD_WIDTH, c + 1, FLOAT_PRECISION, bw); fflush(stdout); } fprintf(stdout, c == IOV_CNT - 1 ? "\n" : ""); } } } } }
void *av_malloc(size_t size) { void *ptr = NULL; #if CONFIG_MEMALIGN_HACK long diff; #endif /* let's disallow possibly ambiguous cases */ if (size > (max_alloc_size - 32)) return NULL; #if CONFIG_MEMALIGN_HACK ptr = malloc(size + ALIGN); if (!ptr) return ptr; diff = ((~(long)ptr)&(ALIGN - 1)) + 1; ptr = (char *)ptr + diff; ((char *)ptr)[-1] = diff; #elif HAVE_POSIX_MEMALIGN if (size) //OS X on SDK 10.6 has a broken posix_memalign implementation if (posix_memalign(&ptr, ALIGN, size)) ptr = NULL; #elif HAVE_ALIGNED_MALLOC ptr = _aligned_malloc(size, ALIGN); #elif HAVE_MEMALIGN #ifndef __DJGPP__ ptr = memalign(ALIGN, size); #else ptr = memalign(size, ALIGN); #endif /* Why 64? * Indeed, we should align it: * on 4 for 386 * on 16 for 486 * on 32 for 586, PPro - K6-III * on 64 for K7 (maybe for P3 too). * Because L1 and L2 caches are aligned on those values. * But I don't want to code such logic here! */ /* Why 32? * For AVX ASM. SSE / NEON needs only 16. * Why not larger? Because I did not see a difference in benchmarks ... */ /* benchmarks with P3 * memalign(64) + 1 3071, 3051, 3032 * memalign(64) + 2 3051, 3032, 3041 * memalign(64) + 4 2911, 2896, 2915 * memalign(64) + 8 2545, 2554, 2550 * memalign(64) + 16 2543, 2572, 2563 * memalign(64) + 32 2546, 2545, 2571 * memalign(64) + 64 2570, 2533, 2558 * * BTW, malloc seems to do 8-byte alignment by default here. */ #else ptr = malloc(size); #ifdef USE_MEM_STATS printf("malloc(%ld) -> %p\n", size, ptr); if (ptr) { mem_cur += malloc_usable_size(ptr); if (mem_cur > mem_max) { mem_max = mem_cur; printf("mem_max=%d\n", mem_max); } } #endif #endif if(!ptr && !size) { size = 1; ptr= av_malloc(1); } #if CONFIG_MEMORY_POISONING if (ptr) memset(ptr, FF_MEMORY_POISON, size); #endif return ptr; }
void *VSIMalloc( size_t nSize ) { if (nMaxPeakAllocSize < 0) { char* pszMaxPeakAllocSize = getenv("CPL_MAX_PEAK_ALLOC_SIZE"); nMaxPeakAllocSize = (pszMaxPeakAllocSize) ? atoi(pszMaxPeakAllocSize) : 0; char* pszMaxCumulAllocSize = getenv("CPL_MAX_CUMUL_ALLOC_SIZE"); nMaxCumulAllocSize = (pszMaxCumulAllocSize) ? atoi(pszMaxCumulAllocSize) : 0; } if (nMaxPeakAllocSize > 0 && (GIntBig)nSize > nMaxPeakAllocSize) return NULL; #ifdef DEBUG_VSIMALLOC_STATS if (nMaxCumulAllocSize > 0 && (GIntBig)nCurrentTotalAllocs + (GIntBig)nSize > nMaxCumulAllocSize) return NULL; #endif // DEBUG_VSIMALLOC_STATS #ifdef DEBUG_VSIMALLOC_MPROTECT char* ptr = NULL; size_t nPageSize = getpagesize(); posix_memalign((void**)&ptr, nPageSize, (3 * sizeof(void*) + nSize + nPageSize - 1) & ~(nPageSize - 1)); #else char* ptr = (char*) malloc(3 * sizeof(void*) + nSize); #endif // DEBUG_VSIMALLOC_MPROTECT if (ptr == NULL) return NULL; ptr[0] = 'V'; ptr[1] = 'S'; ptr[2] = 'I'; ptr[3] = 'M'; memcpy(ptr + sizeof(void*), &nSize, sizeof(void*)); ptr[2 * sizeof(void*) + nSize + 0] = 'E'; ptr[2 * sizeof(void*) + nSize + 1] = 'V'; ptr[2 * sizeof(void*) + nSize + 2] = 'S'; ptr[2 * sizeof(void*) + nSize + 3] = 'I'; #if defined(DEBUG_VSIMALLOC_STATS) || defined(DEBUG_VSIMALLOC_VERBOSE) { CPLMutexHolderD(&hMemStatMutex); #ifdef DEBUG_VSIMALLOC_VERBOSE if( nSize > THRESHOLD_PRINT ) { fprintf(stderr, "Thread[%p] VSIMalloc(%d) = %p" #ifdef DEBUG_VSIMALLOC_STATS ", current_cumul = " CPL_FRMT_GUIB #ifdef DEBUG_BLOCK_CACHE_USE ", block_cache_used = " CPL_FRMT_GIB #endif ", mal+cal-free = %d" #endif "\n", (void*)CPLGetPID(), (int)nSize, ptr + 2 * sizeof(void*) #ifdef DEBUG_VSIMALLOC_STATS , (GUIntBig)(nCurrentTotalAllocs + nSize), #ifdef DEBUG_BLOCK_CACHE_USE , GDALGetCacheUsed64() #endif ,(int)(nVSIMallocs + nVSICallocs - nVSIFrees) #endif ); } #endif // DEBUG_VSIMALLOC_VERBOSE #ifdef DEBUG_VSIMALLOC_STATS nVSIMallocs ++; if (nMaxTotalAllocs == 0) atexit(VSIShowMemStats); nCurrentTotalAllocs += nSize; if 
(nCurrentTotalAllocs > nMaxTotalAllocs) nMaxTotalAllocs = nCurrentTotalAllocs; #endif // DEBUG_VSIMALLOC_STATS }
int main(int argc, char *argv[]) { int i, j, rtest, m, k, nerrs, r; void *buf; u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES]; u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX]; u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES]; u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES]; struct perf start, stop; // Pick test parameters m = 14; k = 10; nerrs = 4; const u8 err_list[] = {2, 4, 5, 7}; printf("erasure_code_base_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs); if (m > MMAX || k > KMAX || nerrs > (m - k)){ printf(" Input test parameter error\n"); return -1; } memcpy(src_err_list, err_list, nerrs); memset(src_in_err, 0, TEST_SOURCES); for (i = 0; i < nerrs; i++) src_in_err[src_err_list[i]] = 1; // Allocate the arrays for (i = 0; i < m; i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { printf("alloc error: Fail\n"); return -1; } buffs[i] = buf; } for (i = 0; i < (m - k); i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { printf("alloc error: Fail\n"); return -1; } temp_buffs[i] = buf; } // Make random data for (i = 0; i < k; i++) for (j = 0; j < TEST_LEN(m); j++) buffs[i][j] = rand(); gf_gen_rs_matrix(a, m, k); ec_init_tables(k, m - k, &a[k * k], g_tbls); ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]); // Start encode test perf_start(&start); for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) { // Make parity vects ec_init_tables(k, m - k, &a[k * k], g_tbls); ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]); } perf_stop(&stop); printf("erasure_code_base_encode" TEST_TYPE_STR ": "); perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest); // Start decode test perf_start(&start); for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) { // Construct b by removing error rows for (i = 0, r = 0; i < k; i++, r++) { while (src_in_err[r]) r++; recov[i] = buffs[r]; for (j = 0; j < k; j++) b[k * i + j] = a[k * r + j]; } if (gf_invert_matrix(b, d, k) < 0) { printf("BAD MATRIX\n"); return -1; } for (i = 0; i 
< nerrs; i++) for (j = 0; j < k; j++) c[k * i + j] = d[k * src_err_list[i] + j]; // Recover data ec_init_tables(k, nerrs, c, g_tbls); ec_encode_data_base(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs); } perf_stop(&stop); for (i = 0; i < nerrs; i++) { if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) { printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs); return -1; } } printf("erasure_code_base_decode" TEST_TYPE_STR ": "); perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest); printf("done all: Pass\n"); return 0; }
int main(int argc, char *argv[]) { if(argc < 3) { printf("Usage: $0 dfe_ip cpu_ip\n"); return 1; } struct in_addr dfe_ip; inet_aton(argv[1], &dfe_ip); struct in_addr cpu_ip; inet_aton(argv[2], &cpu_ip); struct in_addr netmask; inet_aton("255.255.255.0", &netmask); const int port = 5007; max_file_t *maxfile = Tracker_init(); max_engine_t * engine = max_load(maxfile, "*"); max_config_set_bool(MAX_CONFIG_PRINTF_TO_STDOUT, true); max_actions_t *actions = max_actions_init(maxfile, NULL); char regName[32]; for (int i=0; i < 1024; i++) { sprintf(regName, "filter_%d", i); if (i == 150) { max_set_uint64t(actions, "filteringKernel", regName, 0xCC /* a value to match... */); } else { max_set_uint64t(actions, "filteringKernel", regName, 0x4D1B /* or any value you want */); } } max_run(engine, actions); max_actions_free(actions); void *buffer; size_t bufferSize = 4096 * 512; posix_memalign(&buffer, 4096, bufferSize); max_framed_stream_t *toCpu = max_framed_stream_setup(engine, "toCPU", buffer, bufferSize, -1); /* * This executable both creates a normal Linux UDP socket as well as a DFE UDP Socket. * We then exchange data between the two. */ // DFE Socket max_ip_config(engine, MAX_NET_CONNECTION_QSFP_TOP_10G_PORT1, &dfe_ip, &netmask); max_udp_socket_t *dfe_socket = max_udp_create_socket(engine, "udpTopPort1"); max_udp_bind(dfe_socket, port); max_udp_connect(dfe_socket, &cpu_ip, port); // Linux Socket int cpu_socket = create_cpu_udp_socket(&cpu_ip, &dfe_ip, port); printf("Sending test frame...\n"); sendTestFrame(cpu_socket); printf("Waiting for kernel response...\n"); fflush(stdout); void *f; size_t fsz; size_t numMessageRx = 0; uint8_t received_data[512]; while (numMessageRx < NUM_MESSAGES_EXPECTED) { if (max_framed_stream_read(toCpu, 1, &f, &fsz) == 1) { printf("CPU: Got output frame - size %zd - NumMsg = %zd!\n", fsz, numMessageRx); // Frame size would be rounded up to the next 8 bytes. 
memcpy(received_data, f, fsz); numMessageRx++; max_framed_stream_discard(toCpu, 1); } else usleep(10); } max_udp_close(dfe_socket); max_unload(engine); max_file_free(maxfile); printf("Done.\n"); fflush(stdout); return 0; }
static void init_problem_data(void) { unsigned i,j; #ifdef STARPU_USE_CUDA if (pin) { starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(float)); starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(float)); starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(float)); } else #endif { #ifdef STARPU_HAVE_POSIX_MEMALIGN posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(float)); posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(float)); posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(float)); #else A = malloc(zdim*ydim*sizeof(float)); B = malloc(xdim*zdim*sizeof(float)); C = malloc(xdim*ydim*sizeof(float)); #endif } /* fill the A and B matrices */ if (norandom) { for (j=0; j < ydim; j++) { for (i=0; i < zdim; i++) { A[j+i*ydim] = (float)(i); } } for (j=0; j < zdim; j++) { for (i=0; i < xdim; i++) { B[j+i*zdim] = (float)(j); } } } else { #ifdef NORANDOM srand(2008); STARPU_ABORT(); #endif for (j=0; j < ydim; j++) { for (i=0; i < zdim; i++) { A[j+i*ydim] = (float)(starpu_drand48()); } } for (j=0; j < zdim; j++) { for (i=0; i < xdim; i++) { B[j+i*zdim] = (float)(starpu_drand48()); } } } for (j=0; j < ydim; j++) { for (i=0; i < xdim; i++) { C[j+i*ydim] = (float)(0); } } display_memory_consumption(); }
/*
 * Consistency check between blackscholes() and blackscholes_sc().
 *
 * Both variants get identical 40x40x4 float inputs; afterwards the input
 * and both output arrays of the two runs are compared element by element
 * with a tolerance of 0.01. Any mismatch prints "ERROR" and exits with 1;
 * an allocation failure exits with 1 silently. Returns 0 otherwise.
 */
int main (int argc, char* argv[])
{
    const int width = 40;
    const int height = 40;
    float *input, *output1, *output2;
    float *input_sc, *output1_sc, *output2_sc;

    /* every buffer has the same size and 64-byte alignment; allocate them
     * in declaration order */
    float **slots[] = {
        &input, &output1, &output2,
        &input_sc, &output1_sc, &output2_sc
    };
    const size_t nslots = sizeof(slots) / sizeof(slots[0]);
    size_t s;

    for (s = 0; s < nslots; s++) {
        if (posix_memalign((void **) slots[s], 64, 4 * width*height * sizeof(float)) != 0)
            exit(1);
    }

    /* same synthetic input for both implementations */
    int idx;
    for (idx = 0; idx < (width*height*4); idx++) {
        const float sample = (idx*0.9f)/(idx+1);

        input[idx] = sample;
        input_sc[idx] = sample;
    }

    blackscholes(width, height, input, output1, output2);
    blackscholes_sc(width, height, input_sc, output1_sc, output2_sc);

    /* inputs must be untouched and both outputs must agree */
    for (idx = 0; idx < (width*height*4); idx++) {
        if (fabsf(input_sc[idx] - input[idx]) > 0.01f ||
            fabsf(output1_sc[idx] - output1[idx]) > 0.01f ||
            fabsf(output2_sc[idx] - output2[idx]) > 0.01f) {
            printf("ERROR\n");
            exit(1);
        }
    }

    return 0;
}
/* * pmem_memcpy_init -- benchmark initialization * * Parses command line arguments, allocates persistent memory, and maps it. */ static int pmem_memcpy_init(struct benchmark *bench, struct benchmark_args *args) { assert(bench != NULL); assert(args != NULL); int ret = 0; struct pmem_bench *pmb = malloc(sizeof (struct pmem_bench)); assert(pmb != NULL); pmb->pargs = args->opts; assert(pmb->pargs != NULL); pmb->pargs->chunk_size = args->dsize; enum operation_type op_type; /* * Assign file and buffer size depending on the operation type * (READ from PMEM or WRITE to PMEM) */ if (assign_size(pmb, args, &op_type) != 0) { ret = -1; goto err_free_pmb; } if ((errno = posix_memalign( (void **) &pmb->buf, FLUSH_ALIGN, pmb->bsize)) != 0) { perror("posix_memalign"); ret = -1; goto err_free_pmb; } pmb->n_rand_offsets = args->n_ops_per_thread * args->n_threads; pmb->rand_offsets = malloc(pmb->n_rand_offsets * sizeof (*pmb->rand_offsets)); if (pmb->rand_offsets == NULL) { perror("malloc"); ret = -1; goto err_free_pmb; } for (size_t i = 0; i < pmb->n_rand_offsets; ++i) pmb->rand_offsets[i] = rand() % args->n_ops_per_thread; /* create a pmem file and memory map it */ if ((pmb->pmem_addr = pmem_map_file(args->fname, pmb->fsize, PMEM_FILE_CREATE|PMEM_FILE_EXCL, args->fmode, NULL, NULL)) == NULL) { perror(args->fname); ret = -1; goto err_free_buf; } if (op_type == OP_TYPE_READ) { pmb->src_addr = pmb->pmem_addr; pmb->dest_addr = pmb->buf; } else { pmb->src_addr = pmb->buf; pmb->dest_addr = pmb->pmem_addr; } /* set proper func_src() and func_dest() depending on benchmark args */ if ((pmb->func_src = assign_mode_func(pmb->pargs->src_mode)) == NULL) { fprintf(stderr, "wrong src_mode parameter -- '%s'", pmb->pargs->src_mode); ret = -1; goto err_unmap; } if ((pmb->func_dest = assign_mode_func(pmb->pargs->dest_mode)) == NULL) { fprintf(stderr, "wrong dest_mode parameter -- '%s'", pmb->pargs->dest_mode); ret = -1; goto err_unmap; } if (pmb->pargs->memcpy) { pmb->func_op = pmb->pargs->persist ? 
libc_memcpy_persist : libc_memcpy; } else { pmb->func_op = pmb->pargs->persist ? libpmem_memcpy_persist : libpmem_memcpy_nodrain; } pmembench_set_priv(bench, pmb); return 0; err_unmap: pmem_unmap(pmb->pmem_addr, pmb->fsize); err_free_buf: free(pmb->buf); err_free_pmb: free(pmb); return ret; }
static void thread(int id) { struct timeval start, stop, diff; int fd, i, ret; size_t n; void *buf; int flags = O_CREAT | O_RDWR | O_LARGEFILE; char filename[32]; ret = posix_memalign(&buf, PAGE_SIZE, chunk_size); if (ret < 0) { fprintf(stderr, "ERROR: task %d couldn't allocate %lu bytes (%s)\n", id, chunk_size, strerror(errno)); exit(1); } memset(buf, 0xaa, chunk_size); snprintf(filename, sizeof(filename), "%s-%d-iobw.tmp", mygroup, id); if (directio) flags |= O_DIRECT; fd = open(filename, flags, 0600); if (fd < 0) { fprintf(stderr, "ERROR: task %d couldn't open %s (%s)\n", id, filename, strerror(errno)); free(buf); exit(1); } /* Write */ lseek(fd, 0, SEEK_SET); n = 0; gettimeofday(&start, NULL); while (n < data_size) { i = write(fd, buf, chunk_size); if (i < 0) { fprintf(stderr, "ERROR: task %d writing to %s (%s)\n", id, filename, strerror(errno)); ret = 1; goto out; } n += i; } gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); print_results(id + 1, OP_WRITE, data_size, &diff); /* Read */ lseek(fd, 0, SEEK_SET); n = 0; gettimeofday(&start, NULL); while (n < data_size) { i = read(fd, buf, chunk_size); if (i < 0) { fprintf(stderr, "ERROR: task %d reading to %s (%s)\n", id, filename, strerror(errno)); ret = 1; goto out; } n += i; } gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); print_results(id + 1, OP_READ, data_size, &diff); out: close(fd); unlink(filename); free(buf); exit(ret); }
/*
 * Allocate `size` bytes aligned to 8192 via posix_memalign().
 *
 * base - receives the allocation address; this is the pointer to hand to
 *        free() later.
 *
 * Returns *base on success, NULL when posix_memalign() fails.
 */
static void *aligned_malloc(void **base, int size)
{
	if (posix_memalign(base, 8192, size) != 0)
		return NULL;

	return *base;
}
int main (int argc, char **argv) { ssize_t ret_size; struct stat st; int ret, flags; int part_request; long long this_time; double part_min, part_max, time_min, time_max; double time_sum, time_sum2, time_mdev, time_avg; double part_sum, part_sum2, part_mdev, part_avg; long long time_now, time_next, period_deadline; setvbuf(stdout, NULL, _IOLBF, 0); parse_options(argc, argv); interval_ts.tv_sec = interval / 1000000; interval_ts.tv_nsec = (interval % 1000000) * 1000; if (!size) size = default_size; if (size <= 0) errx(1, "request size must be greather than zero"); #ifdef MAX_RW_COUNT if (size > MAX_RW_COUNT) warnx("this platform supports requests %u bytes at most", MAX_RW_COUNT); #endif if (wsize) temp_wsize = wsize; else if (size > temp_wsize) temp_wsize = size; flags = O_RDONLY; #if !defined(HAVE_POSIX_FADVICE) && !defined(HAVE_NOCACHE_IO) # if defined(HAVE_DIRECT_IO) direct |= !cached; # else if (!cached && !write_test) { warnx("non-cached read I/O not supported by this platform"); warnx("you can use write I/O to get reliable results"); cached = 1; } # endif #endif if (write_test) { flags = O_RDWR; make_request = do_pwrite; } if (async) aio_setup(); if (direct) #ifdef HAVE_DIRECT_IO flags |= O_DIRECT; #else errx(1, "direct I/O not supported by this platform"); #endif #ifdef __MINGW32__ flags |= O_BINARY; #endif if (stat(path, &st)) err(2, "stat \"%s\" failed", path); if (!S_ISDIR(st.st_mode) && write_test && write_test < 3) errx(2, "think twice, then use -WWW to shred this target"); if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode)) { if (S_ISDIR(st.st_mode)) st.st_size = offset + temp_wsize; parse_device(st.st_dev); } else if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { fd = open(path, flags); if (fd < 0) err(2, "failed to open \"%s\"", path); if (get_device_size(fd, &st)) { if (!S_ISCHR(st.st_mode)) err(2, "block get size ioctl failed"); st.st_size = offset + temp_wsize; fstype = "character"; device = "device"; } else { device_size = st.st_size; fstype = "block"; 
device = "device "; } if (!cached && write_test && fdatasync(fd)) { warnx("fdatasync not supported by \"%s\", " "enable cached requests", path); cached = 1; } } else { errx(2, "unsupported destination: \"%s\"", path); } if (wsize > st.st_size || offset > st.st_size - wsize) errx(2, "target is too small for this"); if (!wsize) wsize = st.st_size - offset; if (size > wsize) errx(2, "request size is too big for this target"); ret = posix_memalign(&buf, 0x1000, size); if (ret) errx(2, "buffer allocation failed"); random_memory(buf, size); if (S_ISDIR(st.st_mode)) { fd = create_temp(path, "ioping.tmp"); if (fd < 0) err(2, "failed to create temporary file at \"%s\"", path); if (keep_file) { if (fstat(fd, &st)) err(2, "fstat at \"%s\" failed", path); if (st.st_size >= offset + wsize) #ifndef __MINGW32__ if (st.st_blocks >= (st.st_size + 511) / 512) #endif goto skip_preparation; } for (woffset = 0 ; woffset < wsize ; woffset += ret_size) { ret_size = size; if (woffset + ret_size > wsize) ret_size = wsize - woffset; if (woffset) random_memory(buf, ret_size); ret_size = pwrite(fd, buf, ret_size, offset + woffset); if (ret_size <= 0) err(2, "preparation write failed"); } skip_preparation: if (fsync(fd)) err(2, "fsync failed"); } else if (S_ISREG(st.st_mode)) { fd = open(path, flags); if (fd < 0) err(2, "failed to open \"%s\"", path); } if (!cached) { #ifdef HAVE_POSIX_FADVICE ret = posix_fadvise(fd, offset, wsize, POSIX_FADV_RANDOM); if (ret) err(2, "fadvise failed"); #endif #ifdef HAVE_NOCACHE_IO ret = fcntl(fd, F_NOCACHE, 1); if (ret) err(2, "fcntl nocache failed"); #endif } srandom(now()); if (deadline) deadline += now(); set_signal(); request = 0; woffset = 0; part_request = 0; part_min = time_min = LLONG_MAX; part_max = time_max = LLONG_MIN; part_sum = time_sum = 0; part_sum2 = time_sum2 = 0; time_now = now(); period_deadline = time_now + period_time; while (!exiting) { request++; part_request++; if (randomize) woffset = random() % (wsize / size) * size; #ifdef 
HAVE_POSIX_FADVICE if (!cached) { ret = posix_fadvise(fd, offset + woffset, size, POSIX_FADV_DONTNEED); if (ret) err(3, "fadvise failed"); } #endif if (write_test) shake_memory(buf, size); this_time = now(); ret_size = make_request(fd, buf, size, offset + woffset); if (ret_size < 0) { if (errno != EINTR) err(3, "request failed"); } else if (ret_size < size) warnx("request returned less than expected: %zu", ret_size); else if (ret_size > size) errx(3, "request returned more than expected: %zu", ret_size); time_now = now(); this_time = time_now - this_time; time_next = time_now + interval; part_sum += this_time; part_sum2 += this_time * this_time; if (this_time < part_min) part_min = this_time; if (this_time > part_max) part_max = this_time; if (!quiet) { print_size(ret_size); printf(" %s %s (%s %s", write_test ? "to" : "from", path, fstype, device); if (device_size) print_size(device_size); printf("): request=%d time=", request); print_time(this_time); printf("\n"); } if ((period_request && (part_request >= period_request)) || (period_time && (time_next >= period_deadline))) { part_avg = part_sum / part_request; part_mdev = sqrt(part_sum2 / part_request - part_avg * part_avg); printf("%d %.0f %.0f %.0f %.0f %.0f %.0f %.0f\n", part_request, part_sum, 1000000. * part_request / part_sum, 1000000. 
* part_request * size / part_sum, part_min, part_avg, part_max, part_mdev); time_sum += part_sum; time_sum2 += part_sum2; if (part_min < time_min) time_min = part_min; if (part_max > time_max) time_max = part_max; part_min = LLONG_MAX; part_max = LLONG_MIN; part_sum = part_sum2 = 0; part_request = 0; period_deadline = time_now + period_time; } if (!randomize) { woffset += size; if (woffset + size > wsize) woffset = 0; } if (exiting) break; if (stop_at_request && request >= stop_at_request) break; if (deadline && time_next >= deadline) break; if (interval) nanosleep(&interval_ts, NULL); } time_sum += part_sum; time_sum2 += part_sum2; if (part_min < time_min) time_min = part_min; if (part_max > time_max) time_max = part_max; time_avg = time_sum / request; time_mdev = sqrt(time_sum2 / request - time_avg * time_avg); if (batch_mode) { printf("%d %.0f %.0f %.0f %.0f %.0f %.0f %.0f\n", request, time_sum, 1000000. * request / time_sum, 1000000. * request * size / time_sum, time_min, time_avg, time_max, time_mdev); } else if (!quiet || (!period_time && !period_request)) { printf("\n--- %s (%s %s", path, fstype, device); if (device_size) print_size(device_size); printf(") ioping statistics ---\n"); print_int(request); printf(" requests completed in "); print_time(time_sum); printf(", "); print_size((long long)request * size); printf(" %s, ", write_test ? "written" : "read"); print_int(1000000. * request / time_sum); printf(" iops, "); print_size(1000000. * request * size / time_sum); printf("/s\n"); printf("min/avg/max/mdev = "); print_time(time_min); printf(" / "); print_time(time_avg); printf(" / "); print_time(time_max); printf(" / "); print_time(time_mdev); printf("\n"); } return 0; }
/*
 * Build the per-model FIR tables of `pFIR` for the given sample rate.
 *
 * The 44.1 kHz coefficient set is used for 22050/44100/88200 Hz, the 48 kHz
 * set for every other rate. For each model the history ring and the
 * coefficient table are allocated, and the coefficients are stored
 * reversed in 8 rotated permutations of 8-float tuples.
 *
 * If a buffer cannot be allocated for a model, an error is printed, the
 * model's pointers are set to NULL and its tuple count to 0.
 *
 * pFIR         - state to initialise
 * uiSampleRate - output sample rate in samples per second
 */
void instantiateFIR(FIR* pFIR, const uint uiSampleRate)
{
    printf("FIR: instantiating FIR with %u samples / s\n", uiSampleRate);

    //check which base frequency to use
    float** ppfDataSource = NULL;
    uint nSourceRate = 0;
    uint* pnSourceBufferLengths = NULL;

    switch(uiSampleRate)
    {
    case 44100:
    case 22050:
    case 88200:
        ppfDataSource = g_aafFIRs44k1;
        nSourceRate = 44100;
        pnSourceBufferLengths = g_anSamples44k1;
        break;
    default:
    //case 48000:
    //case 96000:
    //case 192000:
        ppfDataSource = g_aafFIRs48k;
        nSourceRate = 48000;
        pnSourceBufferLengths = g_anSamples48k;
        break;
    }

    double fRateScaler = (float)nSourceRate / (double)uiSampleRate;

    printf("FIR: chosen rate %u, scaler %f\n", nSourceRate, fRateScaler);

    //reserve buffer
    uint uiModel = 0;
    for(; uiModel < NUM_MODELS; uiModel++)
    {
        uint nSourceSamples = pnSourceBufferLengths[uiModel];
        uint n8Tuples = (uint)(fRateScaler * (float)nSourceSamples - 1.0f) / 8 + 1;
        uint nDestSamples = n8Tuples * 8;

        v8f_t* pHistory = NULL;
        v8f_t* pCoeffs = NULL;

#ifdef _MSC_VER
        pHistory = (v8f_t*) _aligned_malloc(n8Tuples * sizeof(v8f_t), 16);
        pCoeffs = (v8f_t*) _aligned_malloc(nDestSamples * sizeof(v8f_t), 16);
#else
        //posix_memalign reports failure through its return value and leaves
        //the output pointer unspecified in that case
        void* pRaw = NULL;
        if(posix_memalign(&pRaw, sizeof(v8f_t), n8Tuples * sizeof(v8f_t)) == 0)
            pHistory = (v8f_t*) pRaw;
        pRaw = NULL;
        if(posix_memalign(&pRaw, sizeof(v8f_t), nDestSamples * sizeof(v8f_t)) == 0)
            pCoeffs = (v8f_t*) pRaw;
#endif

        if(pHistory == NULL || pCoeffs == NULL)
        {
            fprintf(stderr, "FIR: out of memory for model %u\n", uiModel);
#ifdef _MSC_VER
            _aligned_free(pHistory);
            _aligned_free(pCoeffs);
#else
            free(pHistory);
            free(pCoeffs);
#endif
            pFIR->m_apfHistory[uiModel] = NULL;
            pFIR->m_apfFIR[uiModel] = NULL;
            pFIR->m_anHistory8Tuples[uiModel] = 0;
            continue;
        }

        //the allocators return uninitialised memory; start with an all-zero
        //history. NOTE(review): assumes no other code fills the history
        //before first use - confirm against the processing routine.
        memset(pHistory, 0, n8Tuples * sizeof(v8f_t));

        pFIR->m_anHistory8Tuples[uiModel] = n8Tuples;
        pFIR->m_apfHistory[uiModel] = pHistory;
        pFIR->m_apfFIR[uiModel] = pCoeffs;

        //fill fir coefficients
        //the fir is reversed here, so multiplication with history data can be carried out sequentially
        uint uiPermutation = 0;
        for(; uiPermutation < 8; uiPermutation++)
        {
            uint uiFIRSample = 0;
            while (uiFIRSample < nDestSamples)
            {
                float afCoeffs[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
                const uint uiStartSample = uiFIRSample;
                for (; uiFIRSample < uiStartSample + 8 && uiFIRSample < nDestSamples; uiFIRSample++)
                {
                    //reversed, rotated position in the destination, mapped
                    //back into the source table by the rate scaler
                    uint uiSourcePosUnscaled = (uiPermutation - uiFIRSample + nDestSamples) % nDestSamples;
                    uint uiSourceBufferPos = (uint)((double)uiSourcePosUnscaled * fRateScaler);
                    //positions beyond the source table keep coefficient 0
                    if(uiSourceBufferPos < nSourceSamples)
                        afCoeffs[uiFIRSample & 0x7] = ppfDataSource[uiModel][uiSourceBufferPos];
                }
                const uint uiDestIndex = uiPermutation * n8Tuples + (uiStartSample >> 3);
                pFIR->m_apfFIR [uiModel][uiDestIndex] = v8f_create(afCoeffs);
            }
        }
    }

    memset(pFIR->m_auiBufferPos, 0, NUM_MODELS * sizeof(uint));
}
/**
 * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
 * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
 * p, buflen) and write the result into buf.  The parameters r, p, and buflen
 * must satisfy r >= 1, p >= 1, r * p < 2^30 and buflen <= (2^32 - 1) * 32.
 * The parameter N must be a power of 2 greater than 1.
 *
 * Return 0 on success; or -1 on error (errno is set to EFBIG, EINVAL or
 * ENOMEM for rejected parameters, or to the allocator's error).
 */
int
crypto_scrypt(const uint8_t * passwd, size_t passwdlen,
    const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
    uint8_t * buf, size_t buflen)
{
	void * B0, * V0, * XY0;
	uint8_t * B;
	uint32_t * V;
	uint32_t * XY;
	uint32_t i;
	/* Byte counts, computed in size_t so they cannot wrap at 32 bits. */
	size_t blocklen, Blen, XYlen, Vlen;

	/* Sanity-check parameters. */
#if SIZE_MAX > UINT32_MAX
	if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
		errno = EFBIG;
		goto err0;
	}
#endif
	/* r and p are used as divisors below; zero is never valid. */
	if ((r == 0) || (p == 0)) {
		errno = EINVAL;
		goto err0;
	}
	if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
		errno = EFBIG;
		goto err0;
	}
	/* N must be a power of 2 greater than 1. */
	if (((N & (N - 1)) != 0) || (N < 2)) {
		errno = EINVAL;
		goto err0;
	}
	if ((r > SIZE_MAX / 128 / p) ||
#if SIZE_MAX / 256 <= UINT32_MAX
	    (r > (SIZE_MAX - 64) / 256) ||
#endif
	    (N > SIZE_MAX / 128 / r)) {
		errno = ENOMEM;
		goto err0;
	}

	/* The checks above guarantee that none of these products overflow. */
	blocklen = (size_t)(128) * r;
	Blen = blocklen * p;
	XYlen = (size_t)(256) * r + 64;
	Vlen = blocklen * (size_t)(N);

	/* Allocate memory. */
#ifdef _WIN32
#undef HAVE_POSIX_MEMALIGN
#endif
#ifdef HAVE_POSIX_MEMALIGN
	if ((errno = posix_memalign(&B0, 64, Blen)) != 0)
		goto err0;
	B = (uint8_t *)(B0);
	if ((errno = posix_memalign(&XY0, 64, XYlen)) != 0)
		goto err1;
	XY = (uint32_t *)(XY0);
#ifndef MAP_ANON
	if ((errno = posix_memalign(&V0, 64, Vlen)) != 0)
		goto err2;
	V = (uint32_t *)(V0);
#endif
#else
	/* No aligned allocator: over-allocate and round up to 64 bytes. */
	if ((B0 = malloc(Blen + 63)) == NULL)
		goto err0;
	B = (uint8_t *)(((uintptr_t)(B0) + 63) & ~ (uintptr_t)(63));
	if ((XY0 = malloc(XYlen + 63)) == NULL)
		goto err1;
	XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63));
#ifndef MAP_ANON
	if ((V0 = malloc(Vlen + 63)) == NULL)
		goto err2;
	V = (uint32_t *)(((uintptr_t)(V0) + 63) & ~ (uintptr_t)(63));
#endif
#endif
#ifdef MAP_ANON
	if ((V0 = mmap(NULL, Vlen, PROT_READ | PROT_WRITE,
#ifdef MAP_NOCORE
	    MAP_ANON | MAP_PRIVATE | MAP_NOCORE,
#else
	    MAP_ANON | MAP_PRIVATE,
#endif
	    -1, 0)) == MAP_FAILED)
		goto err2;
	V = (uint32_t *)(V0);
#endif

	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
	PBKDF2_scrypt_SHA256(passwd, passwdlen, salt, saltlen, 1, B, Blen);

	/* 2: for i = 0 to p - 1 do */
	for (i = 0; i < p; i++) {
		/* 3: B_i <-- MF(B_i, N) */
		smix(&B[(size_t)(i) * blocklen], r, N, V, XY);
	}

	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
	PBKDF2_scrypt_SHA256(passwd, passwdlen, B, Blen, 1, buf, buflen);

	/* Free memory. */
#ifdef MAP_ANON
	if (munmap(V0, Vlen))
		goto err2;
#else
	free(V0);
#endif
	free(XY0);
	free(B0);

	/* Success! */
	return (0);

err2:
	free(XY0);
err1:
	free(B0);
err0:
	/* Failure! */
	return (-1);
}
/*
 * Alltoallv bandwidth/latency benchmark driver.
 *
 * For every context and every alltoallv algorithm reported by
 * query_geometry_world() (first the "always works" list, then the "must
 * query" list), runs the collective over a doubling sequence of per-task
 * element counts, times it, validates the receive buffer with
 * check_rcvbuf() and prints one result line per size on task 0.
 *
 * Returns the OR of the failure codes collected in rc (0 when nothing
 * failed), or 1 directly when an initialisation step fails.
 */
int main(int argc, char*argv[])
{
  pami_client_t        client;
  pami_context_t      *context;
  pami_task_t          task_id;
  size_t               num_tasks;
  pami_geometry_t      world_geometry;

  /* Barrier variables */
  size_t               barrier_num_algorithm[2];
  pami_algorithm_t    *bar_always_works_algo = NULL;
  pami_metadata_t     *bar_always_works_md   = NULL;
  pami_algorithm_t    *bar_must_query_algo   = NULL;
  pami_metadata_t     *bar_must_query_md     = NULL;
  pami_xfer_type_t     barrier_xfer = PAMI_XFER_BARRIER;
  volatile unsigned    bar_poll_flag = 0;

  /* Alltoallv variables */
  size_t               alltoallv_num_algorithm[2];
  pami_algorithm_t    *alltoallv_always_works_algo = NULL;
  pami_metadata_t     *alltoallv_always_works_md = NULL;
  pami_algorithm_t    *next_algo = NULL;
  pami_metadata_t     *next_md= NULL;
  pami_algorithm_t    *alltoallv_must_query_algo = NULL;
  pami_metadata_t     *alltoallv_must_query_md = NULL;
  pami_xfer_type_t     alltoallv_xfer = PAMI_XFER_ALLTOALLV;
  volatile unsigned    alltoallv_poll_flag = 0;

  int                  nalg= 0, total_alg;
  double               ti, tf, usec;
  pami_xfer_t          barrier;
  pami_xfer_t          alltoallv;
  pami_type_t          pami_stype = 0;
  pami_type_t          pami_rtype = 0;
  pami_result_t        ret;

  /* Process environment variables and setup globals */
  setup_env();

  assert(gNum_contexts > 0);
  context = (pami_context_t*)malloc(sizeof(pami_context_t) * gNum_contexts);

  /* Initialize PAMI */
  int rc = pami_init(&client,        /* Client             */
                     context,        /* Context            */
                     NULL,           /* Clientname=default */
                     &gNum_contexts, /* gNum_contexts      */
                     NULL,           /* null configuration */
                     0,              /* no configuration   */
                     &task_id,       /* task id            */
                     &num_tasks);    /* number of tasks    */

  if (rc == 1)
    return 1;

  /* Allocate buffer(s): 128-byte aligned, then shifted by gBuffer_offset.
     The shift is undone before free() at the end of main. */
  int err = 0;
  void* sbuf = NULL;
  err = posix_memalign((void*) & sbuf, 128, (gMax_byte_count * num_tasks) + gBuffer_offset);
  assert(err == 0);
  sbuf = (char*)sbuf + gBuffer_offset;

  void* rbuf = NULL;
  err = posix_memalign((void*) & rbuf, 128, (gMax_byte_count * num_tasks) + gBuffer_offset);
  assert(err == 0);
  rbuf = (char*)rbuf + gBuffer_offset;

  /* Per-task count and displacement arrays handed to the alltoallv call. */
  sndlens = (size_t*) malloc(num_tasks * sizeof(size_t));
  assert(sndlens);
  sdispls = (size_t*) malloc(num_tasks * sizeof(size_t));
  assert(sdispls);
  rcvlens = (size_t*) malloc(num_tasks * sizeof(size_t));
  assert(rcvlens);
  rdispls = (size_t*) malloc(num_tasks * sizeof(size_t));
  assert(rdispls);

  /* Build the send and receive datatypes used by the transfer. */
  ret = PAMI_Type_create(&pami_stype);
  if(ret != PAMI_SUCCESS)
    return 1;

  ret = PAMI_Type_create(&pami_rtype);
  if(ret != PAMI_SUCCESS)
    return 1;

  PAMI_Type_add_simple(pami_stype, sizeof(double), 0, 1, sizeof(double)*2);
  PAMI_Type_add_simple(pami_rtype, sizeof(double), sizeof(double), 1, sizeof(double));

  ret = PAMI_Type_complete(pami_stype, sizeof(double));
  if(ret != PAMI_SUCCESS){
    printf("Invalid atom size for stype\n");
    return 1;
  }

  ret = PAMI_Type_complete(pami_rtype, sizeof(double));
  if(ret != PAMI_SUCCESS){
    printf("Invalid atom size for rtype\n");
    return 1;
  }

  unsigned iContext = 0;

  for (; iContext < gNum_contexts; ++iContext)
  {

    if (task_id == 0)
      printf("# Context: %u\n", iContext);

    /*  Query the world geometry for barrier algorithms */
    rc |= query_geometry_world(client,
                               context[iContext],
                               &world_geometry,
                               barrier_xfer,
                               barrier_num_algorithm,
                               &bar_always_works_algo,
                               &bar_always_works_md,
                               &bar_must_query_algo,
                               &bar_must_query_md);

    if (rc == 1)
      return 1;

    /*  Query the world geometry for alltoallv algorithms */
    rc |= query_geometry_world(client,
                               context[iContext],
                               &world_geometry,
                               alltoallv_xfer,
                               alltoallv_num_algorithm,
                               &alltoallv_always_works_algo,
                               &alltoallv_always_works_md,
                               &alltoallv_must_query_algo,
                               &alltoallv_must_query_md);

    if (rc == 1)
      return 1;

    /* The first "always works" barrier algorithm brackets every timed run. */
    barrier.cb_done   = cb_done;
    barrier.cookie    = (void*) & bar_poll_flag;
    barrier.algorithm = bar_always_works_algo[0];

    /* Walk both algorithm lists as one index space: [0, num[0]) is the
       "always works" list, the remainder is the "must query" list. */
    total_alg = alltoallv_num_algorithm[0]+alltoallv_num_algorithm[1];
    for (nalg = 0; nalg < total_alg; nalg++)
    {
      metadata_result_t result = {0};
      unsigned query_protocol;
      if(nalg < alltoallv_num_algorithm[0])
      {
        query_protocol = 0;
        next_algo = &alltoallv_always_works_algo[nalg];
        next_md  = &alltoallv_always_works_md[nalg];
      }
      else
      {
        query_protocol = 1;
        next_algo = &alltoallv_must_query_algo[nalg-alltoallv_num_algorithm[0]];
        next_md  = &alltoallv_must_query_md[nalg-alltoallv_num_algorithm[0]];
      }

      gProtocolName = next_md->name;

      alltoallv.cb_done    = cb_done;
      alltoallv.cookie     = (void*) & alltoallv_poll_flag;
      alltoallv.algorithm  = *next_algo;
      alltoallv.cmd.xfer_alltoallv.sndbuf        = sbuf;
      alltoallv.cmd.xfer_alltoallv.stype         = pami_stype;
      alltoallv.cmd.xfer_alltoallv.stypecounts   = sndlens;
      alltoallv.cmd.xfer_alltoallv.sdispls       = sdispls;
      alltoallv.cmd.xfer_alltoallv.rcvbuf        = rbuf;
      alltoallv.cmd.xfer_alltoallv.rtype         = pami_rtype;
      alltoallv.cmd.xfer_alltoallv.rtypecounts   = rcvlens;
      alltoallv.cmd.xfer_alltoallv.rdispls       = rdispls;

      gProtocolName = next_md->name;

      if (task_id == 0)
      {
        printf("# Alltoallv Bandwidth Test(size:%zu) -- context = %d, protocol: %s, Metadata: range %zu <-> %zd, mask %#X\n",num_tasks,
               iContext, gProtocolName,
               next_md->range_lo,(ssize_t)next_md->range_hi,
               next_md->check_correct.bitmask_correct);
        printf("# Size(bytes) iterations bytes/sec usec\n");
        printf("# ----------- ----------- ----------- ---------\n");
      }

      /* Skip this algorithm when its name does not pass the gSelected /
         gSelector filter (note: the header above has already been printed). */
      if (((strstr(next_md->name, gSelected) == NULL) && gSelector) ||
          ((strstr(next_md->name, gSelected) != NULL) && !gSelector))  continue;

      int i, j;

      unsigned checkrequired = next_md->check_correct.values.checkrequired; /*must query every time */
      assert(!checkrequired || next_md->check_fn); /* must have function if checkrequired. */

      /* i is the per-task element count: 0, 1, 2, 4, ...  The loop step
         doubles i; each "if(!i)i++" below turns the initial 0 into 1 so
         the doubling makes progress. */
      for (i = 0; i <= (gMax_byte_count/(sizeof(double)*2)); i *= 2)
      {
        size_t  dataSent = i;
        int          niter;

        if (dataSent < CUTOFF)
          niter = gNiterlat;
        else
          niter = NITERBW;

        for (j = 0; j < num_tasks; j++)
        {
          sndlens[j] = rcvlens[j] = i;
          sdispls[j] = rdispls[j] = i * j;

          initialize_sndbuf( j, (double*)sbuf, (double*)rbuf );

        }

        if(query_protocol)
        {
          size_t sz=get_type_size(pami_stype)*i;
          size_t rsz=get_type_size(pami_rtype)*i;
          result = check_metadata(*next_md,
                                  alltoallv,
                                  pami_stype,
                                  sz, /* metadata uses bytes i, */
                                  alltoallv.cmd.xfer_alltoallv.sndbuf,
                                  pami_rtype,
                                  rsz,
                                  alltoallv.cmd.xfer_alltoallv.rcvbuf);
          if (next_md->check_correct.values.nonlocal)
          {
            /* \note We currently ignore check_correct.values.nonlocal
               because these tests should not have nonlocal differences (so far). */
            result.check.nonlocal = 0;
          }

          /* Metadata rejected this size: move on to the next size. */
          if (result.bitmask)
          {
            if(!i)i++;
            continue;
          }
        }

        /* Synchronise all tasks, then time niter back-to-back collectives. */
        blocking_coll(context[iContext], &barrier, &bar_poll_flag);
        ti = timer();

        for (j = 0; j < niter; j++)
        {
          if (checkrequired) /* must query every time */
          {
            result = next_md->check_fn(&alltoallv);
            /* NOTE(review): this `continue` applies to the inner j loop, so
               the iteration is skipped but still counted in niter, and i may
               be bumped from 0 to 1 in the middle of the measurement --
               confirm this is intended. */
            if (result.bitmask)
            {
              if(!i)i++;
              continue;
            }
          }

          blocking_coll(context[iContext], &alltoallv, &alltoallv_poll_flag);
        }

        tf = timer();
        blocking_coll(context[iContext], &barrier, &bar_poll_flag);

        /* Validate the result; a failure is recorded in rc and reported. */
        int rc_check;
        rc |= rc_check = check_rcvbuf(num_tasks, task_id, (double*)rbuf, (double*)sbuf);

        if (rc_check) fprintf(stderr, "%s FAILED validation\n", gProtocolName);

        /* Average time per iteration, as reported by timer(). */
        usec = (tf - ti) / (double)niter;

        if (task_id == 0)
        {
          printf(" %11lld %16d %14.1f %12.2f\n",
                 (long long)dataSent,
                 niter,
                 (double)1e6*(double)dataSent / (double)usec,
                 usec);
          fflush(stdout);
        }
        if(!i)i++;
      }
    }

    /* Release the algorithm/metadata arrays returned by the queries. */
    free(bar_always_works_algo);
    free(bar_always_works_md);
    free(bar_must_query_algo);
    free(bar_must_query_md);
    free(alltoallv_always_works_algo);
    free(alltoallv_always_works_md);
    free(alltoallv_must_query_algo);
    free(alltoallv_must_query_md);
  } /* end of the per-context loop */

  /* Undo the gBuffer_offset shift so free() sees the allocated pointers. */
  sbuf = (char*)sbuf - gBuffer_offset;
  free(sbuf);

  rbuf = (char*)rbuf - gBuffer_offset;
  free(rbuf);

  free(sndlens);
  free(sdispls);
  free(rcvlens);
  free(rdispls);

  rc |= pami_shutdown(&client, context, &gNum_contexts);
  return rc;
}
/*
 * Allocate `size` bytes whose address is a multiple of `alignment`.
 *
 * Thin wrapper around posix_memalign(), so `alignment` must be a power of
 * two and a multiple of sizeof(void *).
 *
 * Returns the new block (release it with free()), or NULL when
 * posix_memalign() reports an error.
 */
inline void* aligned_alloc(size_t alignment, size_t size)
{
    void *p = NULL;

    /* posix_memalign() signals failure through its return value only; the
     * output pointer must not be trusted in that case. */
    if (posix_memalign(&p, alignment, size) != 0)
        return NULL;

    return p;
}
void* FLA_malloc( size_t size ) { void* ptr = NULL; FLA_Error e_val; #ifdef FLA_ENABLE_MEMORY_ALIGNMENT int r_val; #endif // In practice, the size argument should very rarely be zero. However, if the // calling code does request a memory region of zero length, we short-circut // the actual allocation request and just return NULL. Hopefully, the calling // code is written such that the pointer is never dereferenced. At free()-time // everything will be fine, as calling free() with a NULL pointer is safe. // Also note that we do NOT increment the memory leak counter before returning. // (Likewise, we will not decrement the counter when a NULL pointer is freed.) if ( size == 0 ) return NULL; #ifdef FLA_ENABLE_MEMORY_ALIGNMENT // Allocate size bytes of memory. Here, we call posix_memalign() if // memory alignment was requested at configure-time, providing the // alignment boundary value given by the user. posix_memalign() also // returns an error code, which is how it signals that something // went wrong. Compare to malloc(), which does this by simply returning // a NULL pointer. r_val = posix_memalign( &ptr, ( size_t ) FLA_MEMORY_ALIGNMENT_BOUNDARY, size ); // Check the return value of posix_memalign() for evidence that the // request failed. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) { e_val = FLA_Check_posix_memalign_failure( r_val ); FLA_Check_error_code( e_val ); } #else // Allocate size bytes of memory. Note that malloc() only guarantees 8-byte // alignment. ptr = malloc( size ); // It may not seem useful to have a check for a null pointer here, given // that such an occurance would cause the file and line of the error to // be reported as the below line of the current file instead of the file // and line number of the calling code. However, consider that in the // unlikely event that malloc() does return a null pointer, the user will // have much bigger problems on his hands (e.g. 
an exhausted memory heap) // than needing to know exactly what line in the library triggered error. // Note that such a line in the application code is likely not the root // source of the problem anyway (ie: not the reason why the heap is full). if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) { e_val = FLA_Check_malloc_pointer( ptr ); FLA_Check_error_code( e_val ); } #endif // Update the memory leak counter if it is enabled, and do so thread-safely // if multithreading is enabled. if ( FLA_Memory_leak_counter_status() == TRUE ) { #ifdef FLA_ENABLE_MULTITHREADING FLA_Lock_acquire( &fla_mem_leak_counter_lock ); fla_mem_leak_counter += 1; FLA_Lock_release( &fla_mem_leak_counter_lock ); #else fla_mem_leak_counter += 1; #endif } // Return the pointer to the new memory region returned by malloc(). return ptr; }
int main(int argc, char **argv) { struct timespec t1, t2; int c, d, k, sum = 0; int size, opt, i; char *fname; while((opt = getopt(argc, argv, "f:s:"))!= -1) { switch (opt){ case 's': size = atoi(optarg); break; case 'f': fname = optarg; break; default: size = MEDIUM; break; } } FILE *fp; fp = fopen(fname,"a"); int edge; int *first; posix_memalign((void**)&first,16,sizeof(int)*size*size); //use posix_memalign to get 16byte alignment int *multiply; posix_memalign((void**)&multiply,16,sizeof(int)*size*size); __m128i m1, m2,m3; for ( c = 0 ; c < size ; c++ ) for ( d = 0 ; d < size ; d++ ) first[c*size+d] = ((c+d) % 2) - 1; multiply[c*size+d] = 0; printf("multiplying the %d-size matrices\n You should try to time this part.\n",size); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t1); for ( c = 0 ; c < size ; c++ ) { for ( k = 0 ; k < size ; k++ ) { m2 = _mm_set1_epi32(first[c*size+k]); //first[c][k] for (d = 0 ; d < size ; d+=4) { edge = size - d; if (edge < 4){ //account for non-div by 4 matrices for (i = d; i < size; i++) multiply[c*size+i] += first[c*size+k]*first[k*size+i]; } else{ m1 = _mm_loadu_si128(&first[k*size+d]); //first[k][d] m1 = _mm_mullo_epi32(m1,m2); // first[k][d] * first[c][k] m3 = _mm_loadu_si128(&multiply[c*size+d]);//load up old values of multiply[c][d] m1 = _mm_add_epi32(m3,m1); //[+= to mult] _mm_storeu_si128(&multiply[c*size+d],m1); } } } } clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t2); double nanos = (diff(t1,t2).tv_nsec) * pow(10,-9); double secs = (diff(t1,t2).tv_sec); double dif = secs + nanos; fprintf(fp,"%.10f\n", dif); fclose(fp); printf("test first %d\n",first[size]); printf("test mult %d\n",multiply[size]); free(first); //free SSE aligned array with _aligned_free free(multiply); return 0; }
int main(int argc, char **argv) { int fd, i; unsigned long long FILE_SIZE; size_t len; char c; char unit; int size; unsigned long long count; void *buf1 = NULL; char *buf, *buf2; char fs_type[20]; char quill_enabled[40]; char file_size_num[20]; char filename[60]; int req_size; if (argc < 6) { printf("Usage: ./integrity_test2_write $FS $SCNEARIO $REQ_SIZE $FILE_SIZE $filename\n"); return 0; } strcpy(fs_type, argv[1]); strcpy(quill_enabled, argv[2]); strcpy(file_size_num, argv[4]); len = strlen(file_size_num); unit = file_size_num[len - 1]; file_size_num[len - 1] = '\0'; FILE_SIZE = atoll(file_size_num); switch (unit) { case 'K': case 'k': FILE_SIZE *= 1024; break; case 'M': case 'm': FILE_SIZE *= 1048576; break; case 'G': case 'g': FILE_SIZE *= 1073741824; break; default: printf("ERROR: FILE_SIZE should be #K/M/G format.\n"); return 0; break; } if (FILE_SIZE < 4096) FILE_SIZE = 4096; if (FILE_SIZE > 2147483648) // RAM disk size FILE_SIZE = 2147483648; strcpy(filename, argv[5]); c = filename[0]; if (posix_memalign(&buf1, END_SIZE, END_SIZE)) { // up to 64MB printf("ERROR - POSIX NOMEM!\n"); return 0; } buf = (char *)buf1; buf2 = malloc(END_SIZE); fd = open("/mnt/ramdisk/test1", O_CREAT | O_RDWR | O_DIRECT, 0640); // fd = open("/dev/null", O_WRONLY, 0640); // fd = open("/dev/zero", O_RDONLY, 0640); printf("fd: %d\n", fd); req_size = atoi(argv[3]); if (req_size > END_SIZE) req_size = END_SIZE; size = req_size; memset(buf, c, size); lseek(fd, 0, SEEK_SET); count = FILE_SIZE / size; printf("Start c: %c\n", c); for (i = 0; i < count; i++) { if (write(fd, buf, size) != size) printf("ERROR\n"); c++; if (c > 'z') c = 'A'; memset(buf, c, size); } // fsync(fd); close(fd); free(buf1); free(buf2); return 0; }
/*
 * Insert the NUL-terminated key `u` with payload `data` into crit-bit tree `t`.
 *
 * If `old` is non-NULL, *old is set to NULL up front and, when the key was
 * already present, to the payload that was stored for it before the call.
 *
 * Pointers with the low bit set denote internal nodes (critbit_node *, stored
 * as address + 1); pointers with the low bit clear denote leaf key strings.
 *
 * Returns:
 *   0  out of memory (tree unchanged)
 *   1  key already present; its payload has been replaced by `data`
 *   2  key newly inserted
 */
int critbitInsert(critbit_t *t, const char *u, void *data, void **old)
{
	const uint8_t *const ubytes = (const void*)u;
	const size_t ulen = strlen(u);
	uint8_t *p = t->root;
	void **dslot = &t->data;    /* payload slot belonging to p */

	if(old != NULL) {
		(*old) = NULL;
	}

	/* Empty tree: the new key becomes the root leaf. */
	if(!p) {
		char *x;
		int a = posix_memalign((void **)&x, sizeof(void *), ulen + 1);
		if(a) {
			return 0;
		}
		memcpy(x, u, ulen + 1);
		t->root = x;
		t->data = data;
		return 2;
	}

	/* Walk down to the leaf this key would match, remembering where that
	 * leaf's payload is stored. */
	while(1 & (intptr_t)p) {
		critbit_node *q = (void*)(p - 1);
		uint8_t c = 0;
		if(q->byte < ulen) {
			c = ubytes[q->byte];
		}
		const int direction = (1 + (q->otherbits | c)) >> 8;
		p = q->child[direction];
		dslot = q->data + direction;
	}

	/* Find the first byte at which the key and the leaf differ. */
	uint32_t newbyte;
	uint32_t newotherbits;
	for(newbyte = 0; newbyte < ulen; newbyte++) {
		if(p[newbyte] != ubytes[newbyte]) {
			newotherbits = p[newbyte] ^ ubytes[newbyte];
			goto different_byte_found;
		}
	}
	if(p[newbyte] != 0) {
		newotherbits = p[newbyte];
		goto different_byte_found;
	}

	/* Exact match: hand back the previous payload and store the new one in
	 * the tree (assigning to a local copy would silently drop `data`). */
	if(old != NULL) {
		(*old) = *dslot;
	}
	*dslot = data;
	return 1;

different_byte_found:
	/* Reduce newotherbits to its most significant differing bit, then
	 * invert it into the mask form stored in nodes. */
	while(newotherbits & (newotherbits - 1)) {
		newotherbits &= newotherbits - 1;
	}
	newotherbits ^= 255;
	uint8_t leafbyte = p[newbyte];
	int newdirection = (1 + (newotherbits | leafbyte)) >> 8;

	/* Allocate the new internal node and the copy of the key. Pointer-size
	 * alignment keeps the low address bit clear for the tagging scheme. */
	critbit_node *newnode;
	if(posix_memalign((void **)&newnode, sizeof(void *), sizeof(critbit_node))) {
		return 0;
	}
	char *x;
	if(posix_memalign((void **)&x, sizeof(void *), ulen + 1)) {
		free(newnode);
		return 0;
	}
	memcpy(x, ubytes, ulen + 1);
	newnode->byte = newbyte;
	newnode->otherbits = newotherbits;
	newnode->child[1 - newdirection] = x;
	newnode->data[1 - newdirection] = data;

	/* Find the link at which the new node has to be spliced in: descend
	 * while the existing nodes test an earlier (byte, bit) position. */
	void **wherep = &t->root;
	void **datap = &t->data;
	for(;;) {
		uint8_t *cur = *wherep;
		if(!(1 & (intptr_t)cur)) {
			break;
		}
		critbit_node *q = (void*)(cur - 1);
		if(q->byte > newbyte) {
			break;
		}
		if((q->byte == newbyte) && (q->otherbits > newotherbits)) {
			break;
		}
		uint8_t c = 0;
		if(q->byte < ulen) {
			c = ubytes[q->byte];
		}
		const int direction = (1 + (q->otherbits | c)) >> 8;
		wherep = q->child + direction;
		datap = q->data + direction;
	}

	/* The displaced subtree and its payload move under the new node; the
	 * link now points at the (tagged) new node and carries no payload. */
	newnode->child[newdirection] = *wherep;
	newnode->data[newdirection] = *datap;
	*wherep = (void*)(1 + (char*)newnode);
	*datap = NULL;
	return 2;
}
//------------------------------------------------
// Allocate `size` bytes on a 4096-byte boundary.
// Yields a null pointer when posix_memalign()
// reports failure.
//
static inline uint8_t* cf_valloc(size_t size)
{
	void* block;

	if (posix_memalign(&block, 4096, size) != 0) {
		return 0;
	}

	return (uint8_t*)block;
}
/*
 * Allocate zero-filled memory for nCount elements of nSize bytes each.
 *
 * Without DEBUG_VSIMALLOC this is a direct call to calloc().
 *
 * With DEBUG_VSIMALLOC the user block is wrapped in a guarded layout:
 *
 *   [ "VSIM" in the first pointer-sized slot ]
 *   [ nMul (payload size) in the second slot ]
 *   [ nMul payload bytes - the returned pointer refers to this ]
 *   [ "EVSI" trailer ]
 *
 * and the request can be refused (NULL) on multiplication overflow or when
 * it exceeds the limits read from the CPL_MAX_PEAK_ALLOC_SIZE /
 * CPL_MAX_CUMUL_ALLOC_SIZE environment variables.
 *
 * Returns a pointer to the zeroed payload, or NULL on failure.
 */
void *VSICalloc( size_t nCount, size_t nSize )
{
#ifdef DEBUG_VSIMALLOC
    /* Detect overflow of nCount * nSize by dividing back. */
    size_t nMul = nCount * nSize;
    if (nCount != 0 && nMul / nCount != nSize)
    {
        fprintf(stderr, "Overflow in VSICalloc(%d, %d)\n",
                (int)nCount, (int)nSize);
        return NULL;
    }
    /* A negative nMaxPeakAllocSize means the limits have not been read
       from the environment yet; 0 means "no limit". */
    if (nMaxPeakAllocSize < 0)
    {
        char* pszMaxPeakAllocSize = getenv("CPL_MAX_PEAK_ALLOC_SIZE");
        nMaxPeakAllocSize = (pszMaxPeakAllocSize) ? atoi(pszMaxPeakAllocSize) : 0;
        char* pszMaxCumulAllocSize = getenv("CPL_MAX_CUMUL_ALLOC_SIZE");
        nMaxCumulAllocSize = (pszMaxCumulAllocSize) ? atoi(pszMaxCumulAllocSize) : 0;
    }
    /* Refuse single requests larger than the configured peak size. */
    if (nMaxPeakAllocSize > 0 && (GIntBig)nMul > nMaxPeakAllocSize)
        return NULL;
#ifdef DEBUG_VSIMALLOC_STATS
    /* Refuse requests that would push the running total over the limit. */
    if (nMaxCumulAllocSize > 0 && (GIntBig)nCurrentTotalAllocs + (GIntBig)nMul > nMaxCumulAllocSize)
        return NULL;
#endif

#ifdef DEBUG_VSIMALLOC_MPROTECT
    /* Page-aligned allocation rounded up to a whole number of pages.
       The return value of posix_memalign() is not inspected; failure is
       detected through ptr still being NULL. The memory is not zeroed by
       the allocator, hence the explicit memset of the payload. */
    char* ptr = NULL;
    size_t nPageSize = getpagesize();
    posix_memalign((void**)&ptr, nPageSize, (3 * sizeof(void*) + nMul + nPageSize - 1) & ~(nPageSize - 1));
    if (ptr == NULL)
        return NULL;
    memset(ptr + 2 * sizeof(void*), 0, nMul);
#else
    /* Header (2 slots) + payload + trailer (1 slot), zero-filled. */
    char* ptr = (char*) calloc(1, 3 * sizeof(void*) + nMul);
    if (ptr == NULL)
        return NULL;
#endif

    /* Leading marker, stored payload size, and trailing marker. */
    ptr[0] = 'V';
    ptr[1] = 'S';
    ptr[2] = 'I';
    ptr[3] = 'M';
    memcpy(ptr + sizeof(void*), &nMul, sizeof(void*));
    ptr[2 * sizeof(void*) + nMul + 0] = 'E';
    ptr[2 * sizeof(void*) + nMul + 1] = 'V';
    ptr[2 * sizeof(void*) + nMul + 2] = 'S';
    ptr[2 * sizeof(void*) + nMul + 3] = 'I';
#if defined(DEBUG_VSIMALLOC_STATS) || defined(DEBUG_VSIMALLOC_VERBOSE)
    {
        /* Statistics and logging are updated under hMemStatMutex. */
        CPLMutexHolderD(&hMemStatMutex);
#ifdef DEBUG_VSIMALLOC_VERBOSE
        if( nMul > THRESHOLD_PRINT )
        {
            fprintf(stderr, "Thread[%p] VSICalloc(%d,%d) = %p\n",
                    (void*)CPLGetPID(), (int)nCount, (int)nSize, ptr + 2 * sizeof(void*));
        }
#endif
#ifdef DEBUG_VSIMALLOC_STATS
        nVSICallocs ++;
        /* nMaxTotalAllocs == 0 is used as "first allocation": register
           the statistics report to run at exit. */
        if (nMaxTotalAllocs == 0)
            atexit(VSIShowMemStats);
        nCurrentTotalAllocs += nMul;
        if (nCurrentTotalAllocs > nMaxTotalAllocs)
            nMaxTotalAllocs = nCurrentTotalAllocs;
#endif
    }
#endif
    /* Hand out the payload, which starts after the two header slots. */
    return ptr + 2 * sizeof(void*);
#else
    return calloc( nCount, nSize );
#endif
}
// Allocate page-aligned memory. void *malloc_aligned(size_t size) { void *mem; errno = posix_memalign(&mem, BCM2835_PAGE_SIZE, size); return (errno ? NULL : mem); }
/* * A function that splits the parent node and creates 4 new children nodes in the array of nodes named "tree". */ void split(Node* parent, Node* tree, int depth, unsigned int* index, value_type* xsorted, value_type* ysorted, value_type* mass_sorted, int k, int* newNodeIndex){ // Capture and update the newNodeIndex atomically to avoid race condition /* In case of omp tasking make this atomic to avoid race conditions * #pragma omp atomic capture {} */ parent->child_id = *newNodeIndex; *newNodeIndex += 4; // Compute the level of the children unsigned int children_level = parent->level +1; // Compute the indexvalue of this level so we can easily compute the morton-id's of the children int indexValue_level = pow(2,2*(depth - children_level)); // Allocate pointers for the expansions arrays value_type * rxps0; value_type * ixps0; value_type * rxps1; value_type * ixps1; value_type * rxps2; value_type * ixps2; value_type * rxps3; value_type * ixps3; // Initialize the children nodes Node child_0 = Node {children_level, // level parent->morton_id, // morton index -1, // child_id -1, // part_start -1, // part_end 1, // node mass 1, // x center of mass 1, // y center of mass NAN, // radius of node rxps0, // real part of multipole expansion ixps0 // imaginary part of multipole expansion }; Node child_1 = Node {children_level, // level parent->morton_id + indexValue_level, // morton index -1, // child_id -1, // part_start -1, // part_end 1, // node mass 1, // x center of mass 1, // y center of mass NAN, // radius of node rxps1, // real part of multipole expansion ixps1 // imaginary part of multipole expansion }; Node child_2 = Node {children_level, // level parent->morton_id + 2*indexValue_level, // morton index -1, // child_id -1, // part_start -1, // part_end 1, // node mass 1, // x center of mass 1, // y center of mass NAN, // radius of node rxps2, // real part of multipole expansion ixps2 // imaginary part of multipole expansion }; Node child_3 = Node {children_level, // 
level parent->morton_id + 3*indexValue_level, // morton index -1, // child_id -1, // part_start -1, // part_end 1, // node mass 1, // x center of mass 1, // y center of mass NAN, // radius of node rxps3, // real part of multipole expansion ixps3 // imaginary part of multipole expansion }; if(parent->level == 0){ // Print the maximum index unsigned int max = index[0]; int min = 0; for (int l = 0; l < parent->part_end - parent->part_start; ++l) { if(index[l]>max){ max = index[l]; } if(index[l] < min){ min = index[l]; } } // std::cout << "Biggest Morton Index = " << max << std::endl; // std::cout << "Smallest Morton Index = " << min << std::endl; // std::cout << "Morton limit tree = " << child_3.morton_id - 1 + indexValue_level << std::endl; // std::cout << "IndexValue at level = " << children_level << " is " << indexValue_level << std::endl; } // Set a pointer to the first child node Node* children = tree + parent->child_id; // Put the children nodes into the array at indices [ children[0], ... , children[3] ]. children[0] = child_0; children[1] = child_1; children[2] = child_2; children[3] = child_3; // Assign the particles to the children assignParticles(parent, children, depth, index); // Assign the total and center of mass to the children, as well as their radius r centerOfMass(children, xsorted, ysorted, mass_sorted); radius(children, xsorted, ysorted); // Compute the multipole expansions for the children nodes, only if the level is 2 or deeper and if the node is not empty if(children_level >= 2){ int nParticlesChild = 0; for (int c = 0; c < 4; ++c) { // Check if this child node is empty or contains only 1 particle. 
if(children[c].part_start == children[c].part_end){ // Do nothing } else { // Compute the number of particles in this child node nParticlesChild = children[c].part_end - children[c].part_start + 1; // Align expansion arrays posix_memalign((void **) &children[c].rxps, 32, sizeof(value_type) * exp_order); posix_memalign((void **) &children[c].ixps, 32, sizeof(value_type) * exp_order); // Compute and set the expansion in this child node p2e(xsorted + children[c].part_start, // x values of child's particles ysorted + children[c].part_start, // y values of child's particles mass_sorted + children[c].part_start, // mass values of child's particles nParticlesChild, // number of particles in child exp_order, // order of expansion children[c].xcom, // x value center of mass children[c].ycom, // y value center of mass children[c].rxps, // real parts of expansion children[c].ixps); // imaginary parts of expansion /* Print info about the computed expansion std::cout << "Child node " << c << " has expansion:" << std::endl; for (int i = 0; i < exp_order; ++i) { std::cout << "rxps [" << i << "] = " << children[c].rxps[i] << std::endl; std::cout << "ixps [" << i << "] = " << children[c].ixps[i] << "\n" << std::endl; } */ } } } // Check number of particles in the children nodes for (int i = 0; i < 4; ++i) { if (children[i].part_end - children[i].part_start + 1 > k && children[i].level < depth) { // There are more than k particles in the node and we haven't reached the maximum depth so split it in four split(children + i, tree, depth, index, xsorted, ysorted, mass_sorted, k, newNodeIndex); } else { // There are less than or equal to k particles in the node // or we reached the maximum depth => the node is a leaf } } }
/* Allocate nbytes of 32-byte aligned memory; abort the process on failure. */
static void *pagerank_alloc32(size_t nbytes)
{
    void *mem = NULL;

    if (posix_memalign(&mem, 32, nbytes) != 0)
        abort();
    return mem;
}

/*
 * Compute PageRank scores for the pages in plist and print them with
 * displayPageRank().
 *
 *   plist     list of pages; each page carries its inlinks list, its index
 *             and its number of outlinks
 *   ncores    requested number of threads (see the note below)
 *   npages    number of pages in plist
 *   nedges    total number of inlink edges
 *   dampener  damping factor
 *
 * The calling thread acts as the last worker and as the manager: after each
 * round it swaps the score arrays, sums the squared differences and stops
 * once the sum drops below EPSILON * EPSILON. The other threads run worker()
 * and are synchronised through score_barrier and conv_barrier.
 */
void pagerank(list* plist, int ncores, int npages, int nedges, double dampener)
{
    /* Deliberate hack kept from the original author (Roger, 4/Jun/2014):
       using one thread fewer than requested was measured to be faster. */
    if (ncores > 1) ncores --;
    /* Guard the modulo/division by ncores below. */
    if (ncores < 1) ncores = 1;

    /*************************/
    /* Padding for AVX later */
    /*************************/
    /* Pad the page count to a multiple of ncores * 4 doubles. */
    while ((npages + g_padding) % (ncores * 4) != 0)
    {
        g_padding++;
    }

    /***************/
    /* Declaration */
    /***************/
    double constant = (1.0 - dampener) / npages;
    g_npages = npages;
    g_nedges = nedges;

    /* Arrays indexed by page:
         curr_scores   scores for this round (padded)
         prev_scores   scores of the previous round (padded)
         page_inlinks  number of inlinks of page i
         index_edge    first edge index belonging to page i
       Arrays indexed by edge:
         inlinks       page index of the edge's source
         outlinks      1 / noutlinks of the edge's source
       Array indexed by thread:
         g_sumDiff     per-thread squared-difference sums */
    curr_scores  = pagerank_alloc32((npages + g_padding) * sizeof(double));
    prev_scores  = pagerank_alloc32((npages + g_padding) * sizeof(double));
    page_inlinks = pagerank_alloc32(npages * sizeof(int));
    inlinks      = pagerank_alloc32(nedges * sizeof (int));
    /* index_edge is indexed by page, so it needs at least npages entries;
       the larger of the two counts keeps the previous capacity as well. */
    index_edge   = pagerank_alloc32((npages > nedges ? npages : nedges) * sizeof (int));
    outlinks     = pagerank_alloc32(nedges * sizeof (double));
    g_sumDiff    = pagerank_alloc32(ncores * sizeof (double));

    /* The manager sums all ncores entries; the freshly allocated memory is
       uninitialised, so clear it before anything reads it. */
    for (int i = 0; i < ncores; i++)
    {
        g_sumDiff[i] = 0.0;
    }

    /*****************************/
    /* Setting up data structure */
    /*****************************/
    node* curr = plist->head;
    unsigned int edge = 0;

    for (int i = 0; i < npages; ++i) /* For each node */
    {
        /* Every page gets a defined inlink count and start edge, including
           pages without an inlinks list. */
        page_inlinks[i] = 0;
        index_edge[i] = edge;

        if (curr->page->inlinks != NULL) /* If this page has a inlinks */
        {
            page_inlinks[i] = curr->page->inlinks->length;

            node* list_node = curr->page->inlinks->head; /* Setting the first node in the inlinks */
            for (int j = 0; j < curr->page->inlinks->length; ++j)
            {
                inlinks[edge] = list_node->page->index;
                outlinks[edge] = 1.0 / list_node->page->noutlinks;
                list_node = list_node -> next;
                ++edge;
            }
        }
        curr = curr->next;
    }

    /************************/
    /* First iteration (P0) */
    /************************/
    for (int i = 0; i < g_padding; i++)
    {
        curr_scores[npages + i] = 0.0;
        prev_scores[npages + i] = 0.0; /* padding */
    }

    /* Writing the initial scores straight into prev_scores saves one swap. */
    double p0 = 1.0 / npages;
    for (int i = 0; i < npages; i++)
    {
        prev_scores[i] = p0;
    }

    /**************************************/
    /* Setting up arguments for parallel  */
    /**************************************/
    pthread_t tids[ncores];
    workerargs wargs[ncores];

    for (int i = 0; i < ncores; i++)
    {
        wargs[i].i = i;
        wargs[i].start = i * ((npages + g_padding) / ncores);
        wargs[i].end = (i + 1) * ((npages + g_padding) / ncores);
        wargs[i].dampener = dampener;
        wargs[i].constant = constant;
    }
    wargs[ncores-1].end = npages + g_padding;

    pthread_barrier_init(&score_barrier, NULL, ncores);
    pthread_barrier_init(&conv_barrier, NULL, ncores);

    /* ncores - 1 worker threads; the calling thread takes the last slice. */
    for (int i = 0; i < ncores - 1; i++)
        pthread_create(&tids[i], NULL, worker, &wargs[i]);

    int start = wargs[ncores-1].start;
    int end = wargs[ncores-1].end;

    /*************************/
    /* Manager thread memory */
    /*************************/
    for (;;)
    {
        if ((start < g_npages) && (g_nedges > 0)) /* padding test */
            edge = index_edge[start]; /* for edge count */

        double conv = 0.0;

        /* WORKER: Calculating curr_score */
        for (int i = start; i < end; ++i) /* Calculation for each page */
        {
            if (i >= g_npages)
                break;

            double sum = 0.0;
            for (int j = 0; j < page_inlinks[i]; ++j) /* Calculation for each inlink */
            {
                sum += prev_scores[inlinks[edge]] * outlinks[edge];
                ++edge;
            }
            curr_scores[i] = constant + dampener * sum;
        }

        /* WORKER: Calculating conv, four doubles at a time */
        int nblocks = (end - start) * 0.25;
        for (int i = 0; i < nblocks; i++)
        {
            __m256d* curr_block = (__m256d*) &curr_scores[start];
            __m256d* prev_block = (__m256d*) &prev_scores[start];

            __m256d m1 = _mm256_sub_pd(curr_block[i], prev_block[i]);
            __m256d m2 = _mm256_mul_pd(m1, m1);

            conv += m2[0] + m2[1] + m2[2] + m2[3];
        }

        pthread_barrier_wait(&score_barrier); /* Wait until all workers are done */
        /* WORKER COMPLETE */

        /* MANAGER BEGIN */
        double* tmp = prev_scores; /* For each iteration: move curr to prev, and replace the old prev */
        prev_scores = curr_scores;
        curr_scores = tmp;

        /* MANAGER: Summing */
        for (int i = 0; i < ncores; i++)
        {
            conv += g_sumDiff[i];
        }

        /* MANAGER: Checking */
        if (conv < EPSILON*EPSILON)
        {
            g_hasConverged = true; /* It has converged */
            pthread_barrier_wait(&conv_barrier);
            tmp = prev_scores; /* Reverse the swap then */
            prev_scores = curr_scores;
            curr_scores = tmp;
            break;
        }

        pthread_barrier_wait(&conv_barrier); /* Tell the worker they may begin again */
    }

    /********************/
    /* Printing results */
    /********************/
    displayPageRank(plist, curr_scores);

    /******************/
    /*  Cleaning up   */
    /******************/
    for (int i = 0; i < ncores - 1; i++)
        pthread_join(tids[i], NULL);

    pthread_barrier_destroy(&score_barrier);
    pthread_barrier_destroy(&conv_barrier);

    free(curr_scores);
    free(prev_scores);
    free(index_edge);
    free(page_inlinks);
    free(outlinks);
    free(inlinks);
    free(g_sumDiff);
}
void initialize_parameters() { long long a, b; vector_size++; // Temporarily increment to allocate space for bias /* Allocate space for word vectors and context word vectors, and correspodning gradsq */ a = posix_memalign((void **)&W, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc if (W == NULL) { fprintf(stderr, "Error allocating memory for W\n"); exit(1); } a = posix_memalign((void **)&gradsq, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc if (gradsq == NULL) { fprintf(stderr, "Error allocating memory for gradsq\n"); exit(1); } if(initialize_from_file){ /* char context_weight[MAX_STRING_LENGTH],word_weight[MAX_STRING_LENGTH]; FILE *fin; sprintf(word_weight,"%s_word.txt",initial_weight_file); sprintf(context_weight,"%s_context.txt",initial_weight_file); fin = fopen(word_weight, "r"); fscanf (fin, "%*d %*d\n"); //Initialize word weight from file for (a = 0; a < vocab_size; a++){ fscanf (fin, "%*s"); for (b = 0; b < (vector_size - 1); b++) { float w; fscanf (fin, "%g", &w); W[a * vector_size + b] = w; } } fclose(fin); fin = fopen(context_weight, "r"); fscanf (fin, "%*d %*d\n"); //Initialize context weight from file for (a = vocab_size; a < 2 * vocab_size; a++){ fscanf (fin, "%*s"); for (b = 0; b < (vector_size - 1); b++) { float w; fscanf (fin, "%g", &w); W[a * vector_size + b] = w; } } fclose(fin); */ char weight_file[MAX_STRING_LENGTH]; FILE *fin; sprintf(weight_file,"%s.bin",initial_weight_file); fin = fopen(weight_file, "r"); long long file_s = ftello(fin); long long term_vocab_s = file_s/(sizeof(real))/2/vector_size; fread(W,sizeof(real),term_vocab_s*vector_size,fin); fread(&W[vocab_size*vector_size],sizeof(real),term_vocab_s*vector_size,fin); fclose(fin); //Initialize the rest for (b = 0; b < vector_size; b++) for (a = term_vocab_s; a < vocab_size; a++) W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size; for (b = 0; b < vector_size; b++) for (a = 
vocab_size+term_vocab_s; a < vocab_size; a++) W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size; //Initialize bias randomly for (a = 0; a < 2 * vocab_size; a++) W[a * vector_size + vector_size - 1] = (rand() / (real)RAND_MAX - 0.5) / vector_size; }else{ for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size; } for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) gradsq[a * vector_size + b] = 1.0; // So initial value of eta is equal to initial learning rate vector_size--; }
static void setup(void) { char filename[PATH_MAX]; int n, j, fd, directflag = 1; long type; if (align_str) { align = atoi(align_str); if (align < 0 || align > PAGE_SIZE) tst_brkm(TCONF, NULL, "Bad alignment %d.", align); } tst_resm(TINFO, "using alignment %d", align); if (workers_str) { workers = atoi(workers_str); if (workers < MIN_WORKERS || workers > MAX_WORKERS) { tst_brkm(TCONF, NULL, "Worker count %d not between " "%d and %d, inclusive", workers, MIN_WORKERS, MAX_WORKERS); } } tst_resm(TINFO, "using %d workers.", workers); tst_sig(FORK, DEF_HANDLER, NULL); tst_require_root(NULL); TEST_PAUSE; tst_tmpdir(); /* * Some file systems may not implement the O_DIRECT flag and open() will * fail with EINVAL if it is used. So add this check for current * filesystem current directory is in, if not supported, we choose to * have this test in LTP_BIG_DEV and mkfs it as ext3. */ fd = open("testfile", O_CREAT | O_DIRECT, 0644); if (fd < 0 && errno == EINVAL) { type = tst_fs_type(NULL, "."); tst_resm(TINFO, "O_DIRECT flag is not supported on %s " "filesystem", tst_fs_type_name(type)); directflag = 0; } else if (fd > 0) { SAFE_CLOSE(NULL, fd); } SAFE_MKDIR(cleanup, MNT_POINT, DIR_MODE); /* * verify whether the current directory has enough free space, * if it is not satisfied, we will use the LTP_BIG_DEV, which * will be exported by runltp with "-z" option. 
*/ if (!directflag || !tst_fs_has_free(NULL, ".", 1300, TST_MB)) { device = getenv("LTP_BIG_DEV"); if (device == NULL) { tst_brkm(TCONF, NULL, "you must specify a big blockdevice(>1.3G)"); } else { tst_mkfs(NULL, device, "ext3", NULL); } if (mount(device, MNT_POINT, "ext3", 0, NULL) < 0) { tst_brkm(TBROK | TERRNO, NULL, "mount device:%s failed", device); } mount_flag = 1; } worker = SAFE_MALLOC(cleanup, workers * sizeof(worker_t)); for (j = 0; j < workers; j++) worker[j].worker_number = j; for (n = 1; n <= FILECOUNT; n++) { snprintf(filename, sizeof(filename), FILE_BASEPATH, n); if (tst_fill_file(filename, n, FILESIZE, 1)) { tst_brkm(TBROK, cleanup, "failed to create file: %s", filename); } } if (posix_memalign((void **)&buffer, PAGE_SIZE, READSIZE + align) != 0) tst_brkm(TBROK, cleanup, "call posix_memalign failed"); }
Tree* build_index(size_t num_levels, size_t fanout[], size_t num_keys, int32_t key[]) { // return null pointer for invalid tree configuration size_t min_num_keys = 1; for (size_t i = 0; i < num_levels - 1; ++i) { min_num_keys *= fanout[i]; } size_t max_num_keys = min_num_keys * fanout[num_levels - 1] - 1; if (num_keys < min_num_keys || num_keys > max_num_keys) { fprintf(stderr, "Error: incorrect number of keys, min %zu, max %zu\n", min_num_keys, max_num_keys); return NULL; } // initialize the tree index Tree* tree = malloc(sizeof(Tree)); assert(tree != NULL); tree->num_levels = num_levels; tree->node_capacity = malloc(sizeof(size_t) * num_levels); assert(tree->node_capacity != NULL); for (size_t i = 0; i < num_levels; ++i) { tree->node_capacity[i] = fanout[i] - 1; } tree->key_array = malloc(sizeof(int32_t*) * num_levels); assert(tree->key_array != NULL); size_t* key_count = malloc(sizeof(size_t) * num_levels); assert(key_count != NULL); size_t* array_capacity = malloc(sizeof(size_t) * num_levels); assert(array_capacity != NULL); for (size_t i = 0; i < num_levels; ++i) { size_t size = sizeof(int32_t) * tree->node_capacity[i]; // allocate one node per level int error = posix_memalign((void**) &(tree->key_array[i]), alignment, size); assert(error == 0); key_count[i] = 0; array_capacity[i] = tree->node_capacity[i]; // array_capacity[i] is always a multiple of node_capacity[i] } // insert sorted keys into index for (size_t i = 1; i < num_keys; ++i) { assert(key[i - 1] < key[i]); } for (size_t i = 0; i < num_keys; ++i) { size_t level = num_levels - 1; while (key_count[level] == array_capacity[level]) level -= 1; tree->key_array[level][key_count[level]] = key[i]; key_count[level] += 1; while (level < num_levels - 1) { level += 1; size_t new_capacity = array_capacity[level] + tree->node_capacity[level]; size_t size = sizeof(int32_t) * new_capacity; // allocate one more node int32_t* new_array = NULL; int error = posix_memalign((void**) &new_array, alignment, size); 
assert(error == 0); memcpy(new_array, tree->key_array[level], sizeof(int32_t) * key_count[level]); free(tree->key_array[level]); tree->key_array[level] = new_array; array_capacity[level] = new_capacity; } } // pad with INT32_MAXs for (size_t i = 0; i < num_levels; ++i) { for (size_t j = key_count[i]; j < array_capacity[i]; ++j) tree->key_array[i][j] = INT32_MAX; key_count[i] = array_capacity[i]; } // print the tree // for (size_t i = 0; i < num_levels; ++i) { // printf("Level %zu:", i); // for (size_t j = 0; j < key_count[i]; ++j) // printf(" %d", tree->key_array[i][j]); // printf("\n"); // } free(array_capacity); free(key_count); return tree; }
int main(int argc, char *argv[]) { unsigned char *buffer = NULL; char filename[1024]; int fd; bool dowrite = true; pthread_t fork_tid; int c, n, j; worker_t *worker; int align = 0; int offset, rc; workers = sysconf(_SC_NPROCESSORS_ONLN); while ((c = getopt(argc, argv, "a:hw:")) != -1) { switch (c) { case 'a': align = atoi(optarg); if (align < 0 || align > PAGE_SIZE) { printf("Bad alignment %d.\n", align); exit(1); } dowrite = false; break; case 'h': usage(); exit(0); break; case 'w': workers = atoi(optarg); if (workers < MIN_WORKERS || workers > MAX_WORKERS) { fprintf(stderr, "Worker count %d not between " "%d and %d, inclusive.\n", workers, MIN_WORKERS, MAX_WORKERS); usage(); exit(1); } dowrite = false; break; default: usage(); exit(1); } } if (argc > 1 && (optind < argc)) { fprintf(stderr, "Bad command line.\n"); usage(); exit(1); } if (dowrite) { buffer = malloc(FILESIZE); if (buffer == NULL) { fprintf(stderr, "Failed to malloc write buffer.\n"); exit(1); } for (n = 1; n <= FILECOUNT; n++) { sprintf(filename, FILENAME, n); fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, 0666); if (fd < 0) { printf("create failed(%s): %s.\n", filename, strerror(errno)); exit(1); } memset(buffer, n, FILESIZE); printf("Writing file %s.\n", filename); if (write(fd, buffer, FILESIZE) != FILESIZE) { printf("write failed (%s)\n", filename); } close(fd); fd = -1; } free(buffer); buffer = NULL; printf("done\n"); exit(0); } printf("Using %d workers.\n", workers); worker = malloc(workers * sizeof(worker_t)); if (worker == NULL) { fprintf(stderr, "Failed to malloc worker array.\n"); exit(1); } for (j = 0; j < workers; j++) { worker[j].worker_number = j; } printf("Using alignment %d.\n", align); posix_memalign((void *)&buffer, PAGE_SIZE, READSIZE+ align); printf("Read buffer: %p.\n", buffer); for (n = 1; n <= FILECOUNT; n++) { sprintf(filename, FILENAME, n); for (j = 0; j < workers; j++) { if ((worker[j].fd = open(filename, O_RDONLY|O_DIRECT)) < 0) { fprintf(stderr, "Failed to open %s: %s.\n", 
filename, strerror(errno)); exit(1); } worker[j].pattern = n; } printf("Reading file %d.\n", n); for (offset = 0; offset < FILESIZE; offset += READSIZE) { memset(buffer, PATTERN, READSIZE + align); for (j = 0; j < workers; j++) { worker[j].offset = offset + j * PAGE_SIZE; worker[j].buffer = buffer + align + j * PAGE_SIZE; worker[j].length = PAGE_SIZE; } /* The final worker reads whatever is left over. */ worker[workers - 1].length = READSIZE - PAGE_SIZE * (workers - 1); done = 0; rc = pthread_create(&fork_tid, NULL, fork_thread, NULL); if (rc != 0) { fprintf(stderr, "Can't create fork thread: %s.\n", strerror(rc)); exit(1); } for (j = 0; j < workers; j++) { rc = pthread_create(&worker[j].tid, NULL, worker_thread, worker + j); if (rc != 0) { fprintf(stderr, "Can't create worker thread %d: %s.\n", j, strerror(rc)); exit(1); } } for (j = 0; j < workers; j++) { rc = pthread_join(worker[j].tid, NULL); if (rc != 0) { fprintf(stderr, "Failed to join worker thread %d: %s.\n", j, strerror(rc)); exit(1); } } /* Let the fork thread know it's ok to exit */ done = 1; rc = pthread_join(fork_tid, NULL); if (rc != 0) { fprintf(stderr, "Failed to join fork thread: %s.\n", strerror(rc)); exit(1); } } /* Close the fd's for the next file. */ for (j = 0; j < workers; j++) { close(worker[j].fd); } } return 0; }