/*
 * Allocate cpu_pda pointer table and array via alloc_bootmem.
 */
static void __init setup_cpu_pda_map(void)
{
    char *pda;
    struct x8664_pda **new_cpu_pda;
    unsigned long size;
    int cpu;

    size = roundup(sizeof(struct x8664_pda), cache_line_size());

    /* allocate cpu_pda array and pointer table */
    {
        unsigned long tsize = nr_cpu_ids * sizeof(void *);
        unsigned long asize = size * (nr_cpu_ids - 1);

        tsize = roundup(tsize, cache_line_size());
        new_cpu_pda = alloc_bootmem(tsize + asize);
        pda = (char *)new_cpu_pda + tsize;
    }

    /* initialize pointer table to static pda's */
    for_each_possible_cpu(cpu) {
        if (cpu == 0) {
            /* leave boot cpu pda in place */
            new_cpu_pda[0] = cpu_pda(0);
            continue;
        }
        new_cpu_pda[cpu] = (struct x8664_pda *)pda;
        new_cpu_pda[cpu]->in_bootmem = 1;
        pda += size;
    }

    /* point to new pointer table */
    _cpu_pda = new_cpu_pda;
}
static int __init arm64_dma_init(void)
{
    WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
               TAINT_CPU_OUT_OF_SPEC,
               "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
               ARCH_DMA_MINALIGN, cache_line_size());

    return atomic_pool_init();
}
struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
                                 int *num_elem,
                                 unsigned int elem_size)
{
    struct rxe_queue *q;
    size_t buf_size;
    unsigned int num_slots;

    /* num_elem == 0 is allowed, but uninteresting */
    if (*num_elem < 0)
        goto err1;

    q = kmalloc(sizeof(*q), GFP_KERNEL);
    if (!q)
        goto err1;

    q->rxe = rxe;

    /* used in resize, only need to copy used part of queue */
    q->elem_size = elem_size;

    /* pad element up to at least a cacheline and always a power of 2 */
    if (elem_size < cache_line_size())
        elem_size = cache_line_size();
    elem_size = roundup_pow_of_two(elem_size);

    q->log2_elem_size = order_base_2(elem_size);

    num_slots = *num_elem + 1;
    num_slots = roundup_pow_of_two(num_slots);
    q->index_mask = num_slots - 1;

    buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size;

    q->buf = vmalloc_user(buf_size);
    if (!q->buf)
        goto err2;

    q->buf->log2_elem_size = q->log2_elem_size;
    q->buf->index_mask = q->index_mask;

    q->buf_size = buf_size;

    *num_elem = num_slots - 1;
    return q;

err2:
    kfree(q);
err1:
    return NULL;
}
char *ring_client(ring_t *ring, char *title)
{
    char buf[32] = {0};
    int i = 0;
    int fd = -1;

    // set up shm
    while (fd < 0) {
        snprintf(buf, 32, "/%s.%d", title, i++);
        fd = shm_open(buf, O_RDWR | O_CREAT, 0700);
        if (i > 65535) {
            fprintf(stderr, "panic: failed to shm_open() 65535 times, giving up.\n");
            abort();
        }
    }

    // map it
    int size = RING_SIZE + cache_line_size() * 8;
    char *name = strdup(buf);
    ftruncate(fd, size);
    void *addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        return NULL;
    }

    ring_set_pointers(ring, addr);
    ring->size = RING_SIZE;
    ring->me = 1;
    memset(addr, 0, size);

    return name;
}
static void ring_set_pointers(ring_t *ring, void *addr)
{
    size_t cache_line = cache_line_size();
    int i = 0;

#define next_line (addr + cache_line * i++)
    ring->read = next_line;
    ring->write = next_line;
    ring->mark = next_line;
    ring->wrap = next_line;
    ring->dir = next_line;
#undef next_line

    ring->buf = addr + cache_line * 8;
}
static int ag71xx_ring_alloc(struct ag71xx_ring *ring, unsigned int size)
{
    int err;
    int i;

    ring->desc_size = sizeof(struct ag71xx_desc);
    if (ring->desc_size % cache_line_size()) {
        DBG("ag71xx: ring %p, desc size %u rounded to %u\n",
            ring, ring->desc_size,
            roundup(ring->desc_size, cache_line_size()));
        ring->desc_size = roundup(ring->desc_size, cache_line_size());
    }

    ring->descs_cpu = dma_alloc_coherent(NULL, size * ring->desc_size,
                                         &ring->descs_dma, GFP_ATOMIC);
    if (!ring->descs_cpu) {
        err = -ENOMEM;
        goto err;
    }

    ring->size = size;

    ring->buf = kzalloc(size * sizeof(*ring->buf), GFP_KERNEL);
    if (!ring->buf) {
        err = -ENOMEM;
        goto err;
    }

    for (i = 0; i < size; i++) {
        int idx = i * ring->desc_size;
        ring->buf[i].desc = (struct ag71xx_desc *)&ring->descs_cpu[idx];
        DBG("ag71xx: ring %p, desc %d at %p\n",
            ring, i, ring->buf[i].desc);
    }

    return 0;

err:
    return err;
}
int ring_server(ring_t *ring, char *name)
{
    // set up shm
    int fd = shm_open(name, O_RDWR, 0700);
    int size = RING_SIZE + cache_line_size() * 8;
    void *addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        return -1;
    }
    shm_unlink(name);

    ring_set_pointers(ring, addr);
    ring->size = RING_SIZE;
    ring->me = 0;

    return 0;
}
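The ring_client()/ring_server()/ring_set_pointers() snippets above call a user-space cache_line_size() that is not shown here. A minimal sketch (an assumption, not the original helper) could query glibc via sysconf() and fall back to a common line size, so that each ring control word gets its own cache line and false sharing between the two processes is avoided:

/*
 * Hypothetical cache_line_size() for the ring_* examples above --
 * a sketch assuming Linux/glibc, not the original implementation.
 */
#include <unistd.h>
#include <stddef.h>

static size_t cache_line_size(void)
{
    /* L1 data cache line size; sysconf() reports 0 or -1 when unknown */
    long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);

    /* fall back to a commonly used line size if the value is unavailable */
    return sz > 0 ? (size_t)sz : 64;
}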
/**
 * percpu_alloc_mask - initial setup of per-cpu data
 * @size: size of per-cpu object
 * @gfp: may sleep or not etc.
 * @mask: populate per-data for cpu's selected through mask bits
 *
 * Populating per-cpu data for all online cpu's would be a typical use case,
 * which is simplified by the percpu_alloc() wrapper.
 * Per-cpu objects are populated with zeroed buffers.
 */
void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
{
    /*
     * We allocate whole cache lines to avoid false sharing
     */
    size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
    void *pdata = kzalloc(sz, gfp);
    void *__pdata = __percpu_disguise(pdata);

    if (unlikely(!pdata))
        return NULL;

    if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
        return __pdata;

    kfree(pdata);
    return NULL;
}
/**
 * percpu_populate - populate per-cpu data for given cpu
 * @__pdata: per-cpu data to populate further
 * @size: size of per-cpu object
 * @gfp: may sleep or not etc.
 * @cpu: populate per-data for this cpu
 *
 * Populating per-cpu data for a cpu coming online would be a typical
 * use case. You need to register a cpu hotplug handler for that purpose.
 * Per-cpu object is populated with zeroed buffer.
 */
void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
{
    struct percpu_data *pdata = __percpu_disguise(__pdata);
    int node = cpu_to_node(cpu);

    /*
     * We should make sure each CPU gets private memory.
     */
    size = roundup(size, cache_line_size());

    BUG_ON(pdata->ptrs[cpu]);
    if (node_online(node))
        pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
    else
        pdata->ptrs[cpu] = kzalloc(size, gfp);
    return pdata->ptrs[cpu];
}
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    size_t size = cachep->size;

    if (cachep->flags & SLAB_HWCACHE_ALIGN)
        size = max(cachep->size, (size_t)cache_line_size());

    void *objp = kmem_alloc(size);
    if (!objp) {
        if (cachep->flags & SLAB_PANIC)
            panic("kmem_cache_alloc() failed.");
        else
            return NULL;
    }

    if (cachep->ctor)
        cachep->ctor(objp);

    return objp;
}
/*
 * Entry point of Multi-core Insense runtime.
 */
int main()
{
    PRINTFMC("Cache line size: %dB\n", cache_line_size());
    PRINTFMC("Main thread: %u\n", (unsigned) pthread_self());

#if HEAPS // Small heaps
    // Initialize mutex
    if (pthread_mutex_init(&thread_lock, NULL) != 0) {
        PRINTF("Mutex initialization failed.\n");
        return 1;
    }
#else // Big heap
    // Initialize mutex
    if (pthread_mutex_init(&alloc_lock, NULL) != 0) {
        PRINTF("Mutex initialization failed.\n");
        return 1;
    }
#endif

    mainThread = pthread_self(); // Note the ID of the main thread.

    // Create a list for storing references to p-threads
    threadList = listCreate();

    // Create map used to store memory locations of small heaps (using Thread safe list)
    SHList = listCreate();

    // Create map used to store memory locations of what is allocated using malloc
    mallocList = listCreate();

    // Start recording execution time
#if TIMING
    // CPU time
    struct timespec start, finish;
    double elapsed;
    //clock_gettime(CLOCK_MONOTONIC, &start);

    // User time
    time_t start_t, end_t;
    double diff_t;
    time(&start_t);
#endif

    // Call primordial_main.
    primordial_main(NULL);

    // Join all p-threads
    if (threadList != NULL) {
        listJoinThreads(threadList);
    }

    // Stop recording execution time
#if TIMING
    // CPU time
    //clock_gettime(CLOCK_MONOTONIC, &finish);
    elapsed = (finish.tv_sec - start.tv_sec);
    elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
    printf("CPU: %f seconds elapsed\n", elapsed);
#endif

    // Destroy lists and free memory
    listDestroy(threadList);
    listDestroy(SHList);
    listDestroy(mallocList);

    pthread_mutex_destroy(&thread_lock); // Destroy mutex lock used with pthreads
    pthread_mutex_destroy(&alloc_lock);  // Destroy mutex lock used with alloc and free in the big heap scheme

    return 1;
}
int main(int argc, const char **argv)
{
    int err;
    const char *cmd;
    char sbuf[STRERR_BUFSIZE];
    int value;

    /* libsubcmd init */
    exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT);
    pager_init(PERF_PAGER_ENVIRONMENT);

    /* The page_size is placed in util object. */
    page_size = sysconf(_SC_PAGE_SIZE);
    cache_line_size(&cacheline_size);

    if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
        sysctl_perf_event_max_stack = value;

    if (sysctl__read_int("kernel/perf_event_max_contexts_per_stack", &value) == 0)
        sysctl_perf_event_max_contexts_per_stack = value;

    cmd = extract_argv0_path(argv[0]);
    if (!cmd)
        cmd = "perf-help";

    srandom(time(NULL));

    perf_config__init();
    err = perf_config(perf_default_config, NULL);
    if (err)
        return err;
    set_buildid_dir(NULL);

    /* get debugfs/tracefs mount point from /proc/mounts */
    tracing_path_mount();

    /*
     * "perf-xxxx" is the same as "perf xxxx", but we obviously:
     *
     *  - cannot take flags in between the "perf" and the "xxxx".
     *  - cannot execute it externally (since it would just do
     *    the same thing over again)
     *
     * So we just directly call the internal command handler. If that one
     * fails to handle this, then maybe we just run a renamed perf binary
     * that contains a dash in its name. To handle this scenario, we just
     * fall through and ignore the "xxxx" part of the command string.
     */
    if (strstarts(cmd, "perf-")) {
        cmd += 5;
        argv[0] = cmd;
        handle_internal_command(argc, argv);
        /*
         * If the command is handled, the above function does not return;
         * undo changes and fall through in such a case.
         */
        cmd -= 5;
        argv[0] = cmd;
    }
    if (strstarts(cmd, "trace")) {
#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)
        setup_path();
        argv[0] = "trace";
        return cmd_trace(argc, argv);
#else
        fprintf(stderr,
                "trace command not available: missing audit-libs devel package at build time.\n");
        goto out;
#endif
    }
    /* Look for flags.. */
    argv++;
    argc--;
    handle_options(&argv, &argc, NULL);
    commit_pager_choice();

    if (argc > 0) {
        if (strstarts(argv[0], "--"))
            argv[0] += 2;
    } else {
        /* The user didn't specify a command; give them help */
        printf("\n usage: %s\n\n", perf_usage_string);
        list_common_cmds_help();
        printf("\n %s\n\n", perf_more_info_string);
        goto out;
    }
    cmd = argv[0];

    test_attr__init();

    /*
     * We use PATH to find perf commands, but we prepend some higher
     * precedence paths: the "--exec-path" option, the PERF_EXEC_PATH
     * environment, and the $(perfexecdir) from the Makefile at build
     * time.
     */
    setup_path();
    /*
     * Block SIGWINCH notifications so that the thread that wants it can
     * unblock and get syscalls like select interrupted instead of waiting
     * forever while the signal goes to some other non interested thread.
     */
    pthread__block_sigwinch();

    perf_debug_setup();

    while (1) {
        static int done_help;

        run_argv(&argc, &argv);

        if (errno != ENOENT)
            break;

        if (!done_help) {
            cmd = argv[0] = help_unknown_cmd(cmd);
            done_help = 1;
        } else
            break;
    }

    fprintf(stderr, "Failed to run command '%s': %s\n",
            cmd, str_error_r(errno, sbuf, sizeof(sbuf)));
out:
    return 1;
}
TCA emitFreeLocalsHelpers(CodeBlock& cb, DataBlock& data, UniqueStubs& us) {
  // The address of the first local is passed in the second argument register.
  // We use the third and fourth as scratch registers.
  auto const local = rarg(1);
  auto const last = rarg(2);
  auto const type = rarg(3);

  CGMeta fixups;

  // This stub is very hot; keep it cache-aligned.
  align(cb, &fixups, Alignment::CacheLine, AlignContext::Dead);
  auto const release =
    emitDecRefHelper(cb, data, fixups, local, type, local | last);

  auto const decref_local = [&] (Vout& v) {
    auto const sf = v.makeReg();

    // We can't do a byte load here---we have to sign-extend since we use
    // `type' as a 32-bit array index to the destructor table.
    v << loadzbl{local[TVOFF(m_type)], type};
    emitCmpTVType(v, sf, KindOfRefCountThreshold, type);

    ifThen(v, CC_G, sf, [&] (Vout& v) {
      auto const dword_size = sizeof(int64_t);

      // saving return value on the stack, but keeping it 16-byte aligned
      v << mflr{rfuncln()};
      v << lea {rsp()[-2 * dword_size], rsp()};
      v << store{rfuncln(), rsp()[0]};
      v << call{release, arg_regs(3)};
      // restore the return value from the stack
      v << load{rsp()[0], rfuncln()};
      v << lea {rsp()[2 * dword_size], rsp()};
      v << mtlr{rfuncln()};
    });
  };

  auto const next_local = [&] (Vout& v) {
    v << addqi{static_cast<int>(sizeof(TypedValue)), local, local, v.makeReg()};
  };

  alignJmpTarget(cb);

  us.freeManyLocalsHelper = vwrap(cb, data, fixups, [&] (Vout& v) {
    // We always unroll the final `kNumFreeLocalsHelpers' decrefs, so only loop
    // until we hit that point.
    v << lea{rvmfp()[localOffset(kNumFreeLocalsHelpers - 1)], last};

    doWhile(v, CC_NZ, {}, [&] (const VregList& in, const VregList& out) {
      auto const sf = v.makeReg();

      decref_local(v);
      next_local(v);
      v << cmpq{local, last, sf};
      return sf;
    });
  });

  for (auto i = kNumFreeLocalsHelpers - 1; i >= 0; --i) {
    us.freeLocalsHelpers[i] = vwrap(cb, data, [&] (Vout& v) {
      decref_local(v);
      if (i != 0) next_local(v);
    });
  }

  // All the stub entrypoints share the same ret.
  vwrap(cb, data, fixups, [] (Vout& v) { v << ret{}; });

  // This stub is hot, so make sure to keep it small.
#if 0
  // TODO(gut): Currently this assert fails.
  // Take a closer look when looking at performance
  always_assert(Stats::enabled() ||
                (cb.frontier() - release <= 4 * cache_line_size()));
#endif

  fixups.process(nullptr);
  return release;
}
int main(int argc, char **argv)
{
    int i;
    int cache_line = cache_line_size();
    int level;

    discover_caches();

    printf("cache line size: %d\n", cache_line);

    if (argc < 2) {
        printf("Usage: <prog> <narrays> [sfence]\n");
        return 0;
    }

    if (argc > 2 && !strcmp(argv[2], "sfence")) {
        want_sfence = 1;
    }

    narrays = atoi(argv[1]);
    data = calloc(narrays, sizeof(*data));

    for (level = 0; level < cache_level_cnt; level++) {
        uint64_t result;

        niters = iters[level];
        printf("Fit data to the level %d of memory hierarchy (%zdB)\n",
               level + 1, cache_sizes[level]);

        nitems = cache_sizes[level] / narrays;
        for (i = 0; i < narrays; i++) {
            data[i] = calloc(nitems + cache_sizes[level], sizeof(*data[0]));
        }

        flush_array_sz = cache_sizes[level] * 2;
        flush_array = calloc(flush_array_sz, sizeof(char));

        // printf("\t#1 WOUT cache flush:\n");
        want_cache_flush = 0;
        result = testloop1();
        printf("\tseq:\tstride=1\t%lu cycles/B\n",
               result / niters / nitems / narrays);
        result = testloop2();
        printf("\tsplit2:\tstride=1\t%lu cycles/B\n",
               result / niters / nitems / narrays);
        for (i = 2; i <= cache_line; i *= 2) {
            result = testloop3(i);
            printf("\tsplit2:\tstride=%d\t%lu cycles/B\n", i,
                   result / niters / nitems / narrays);
        }

        // printf("\t#2 WITH cache flush:\n");
        // want_cache_flush = 1;
        // result = testloop1();
        // printf("\t\tseq:\tstride=1\t%lu cycles/B\n", result / niters / nitems / narrays);
        // result = testloop2();
        // printf("\t\tsplit2:\tstride=1\t%lu cycles/B\n", result / niters / nitems / narrays);
        // for (i = 2; i <= cache_line; i *= 2) {
        //     result = testloop3(i);
        //     printf("\t\tsplit2:\tstride=%d\t%lu cycles/B\n", i, result / niters / nitems / narrays);
        // }

        for (i = 0; i < narrays; i++) {
            free(data[i]);
        }
        free(flush_array);
    }

    return 0;
}
static void __init setup_processor(void)
{
    u64 features;
    s64 block;
    u32 cwg;
    int cls;

    printk("CPU: AArch64 Processor [%08x] revision %d\n",
           read_cpuid_id(), read_cpuid_id() & 15);

    sprintf(init_utsname()->machine, ELF_PLATFORM);
    elf_hwcap = 0;

    cpuinfo_store_boot_cpu();

    /*
     * Check for sane CTR_EL0.CWG value.
     */
    cwg = cache_type_cwg();
    cls = cache_line_size();
    if (!cwg)
        pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",
                cls);
    if (L1_CACHE_BYTES < cls)
        pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
                L1_CACHE_BYTES, cls);

    /*
     * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks.
     * The blocks we test below represent incremental functionality
     * for non-negative values. Negative values are reserved.
     */
    features = read_cpuid(ID_AA64ISAR0_EL1);
    block = cpuid_feature_extract_field(features, 4);
    if (block > 0) {
        switch (block) {
        default:
        case 2:
            elf_hwcap |= HWCAP_PMULL;
        case 1:
            elf_hwcap |= HWCAP_AES;
        case 0:
            break;
        }
    }

    if (cpuid_feature_extract_field(features, 8) > 0)
        elf_hwcap |= HWCAP_SHA1;

    if (cpuid_feature_extract_field(features, 12) > 0)
        elf_hwcap |= HWCAP_SHA2;

    if (cpuid_feature_extract_field(features, 16) > 0)
        elf_hwcap |= HWCAP_CRC32;

    block = cpuid_feature_extract_field(features, 20);
    if (block > 0) {
        switch (block) {
        default:
        case 2:
            elf_hwcap |= HWCAP_ATOMICS;
        case 1:
            /* RESERVED */
        case 0:
            break;
        }
    }

#ifdef CONFIG_COMPAT
    /*
     * ID_ISAR5_EL1 carries similar information as above, but pertaining to
     * the AArch32 32-bit execution state.
     */
    features = read_cpuid(ID_ISAR5_EL1);
    block = cpuid_feature_extract_field(features, 4);
    if (block > 0) {
        switch (block) {
        default:
        case 2:
            compat_elf_hwcap2 |= COMPAT_HWCAP2_PMULL;
        case 1:
            compat_elf_hwcap2 |= COMPAT_HWCAP2_AES;
        case 0:
            break;
        }
    }

    if (cpuid_feature_extract_field(features, 8) > 0)
        compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA1;

    if (cpuid_feature_extract_field(features, 12) > 0)
        compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA2;

    if (cpuid_feature_extract_field(features, 16) > 0)
        compat_elf_hwcap2 |= COMPAT_HWCAP2_CRC32;
#endif
}
/*
 * Entry point of Multi-core Insense runtime.
 */
int main(int argc, char* argv[])
{
    PRINTFMC("Cache line size: %dB\n", cache_line_size());
    PRINTFMC("Main thread: %u\n", (unsigned) pthread_self());

    errval_t err;
    coreid_t mycore = disp_get_core_id();

    if (argc == 2) {
        num_to_span = atoi(argv[1]);
        if (num_to_span == 0)
            all_spanned = true;
        debug_printf("Spanning onto %d cores\n", num_to_span);
        for (int i = 1; i < num_to_span; i++) {
            err = domain_new_dispatcher(mycore + i, span_cb, NULL);
            if (err_is_fail(err)) {
                DEBUG_ERR(err, "failed span %d", i);
            }
        }
    } else {
        debug_printf("ERROR: Must specify number of cores to span\n");
        return EXIT_FAILURE;
    }

    posixcompat_pthread_set_placement_fn(rrPlacement);

    while (!all_spanned) {
        thread_yield();
    }

    my_mutex_init(&shared_heap_mutex);

#if HEAPS == HEAP_PRIVATE // Private heaps
    // Initialize mutex
    if (pthread_mutex_init(&thread_lock, NULL) != 0) {
        PRINTF("Mutex initialization failed.\n");
        return -1;
    }
#endif

    mainThread = pthread_self(); // Note the ID of the main thread.

    // Create a list for storing references to p-threads
    threadList = listCreate();

    // Create map used to store memory locations of small heaps (using Thread safe list)
    SHList = listCreate();

    // Create map used to store memory locations of what is allocated using malloc
    mallocList = listCreate();

    // Start recording execution time
#if TIMING
    // CPU time
    uint64_t start, end;
    uint64_t tsc_per_ms = 0;
    sys_debug_get_tsc_per_ms(&tsc_per_ms);
    start = rdtsc();
#endif

    // Call primordial_main.
    primordial_main(NULL);

    // Join all p-threads
    if (threadList != NULL) {
        listJoinThreads(threadList);
    }

    // Stop recording execution time
#if TIMING
    end = rdtsc();
    uint64_t diff = (end - start) / tsc_per_ms;
    float elapsed = (diff / 1000) + ((diff % 1000) / 1000.0);
    printf("CPU: %f seconds elapsed\n", elapsed);
#endif

    // Destroy lists and free memory
    listDestroy(threadList);
    listDestroy(SHList);
    listDestroy(mallocList);

#if HEAPS == HEAP_PRIVATE
    pthread_mutex_destroy(&thread_lock); // Destroy mutex lock used with pthreads
#endif

    return 0;
}