void kbase_gpuprops_set(kbase_device *kbdev)
{
        kbase_gpu_props *gpu_props;
        struct midg_raw_gpu_props *raw;

        KBASE_DEBUG_ASSERT(NULL != kbdev);
        gpu_props = &kbdev->gpu_props;
        raw = &gpu_props->props.raw_props;

        /* Initialize the base_gpu_props structure from the hardware */
        kbase_gpuprops_get_props(&gpu_props->props, kbdev);

        /* Populate the derived properties */
        kbase_gpuprops_calculate_props(&gpu_props->props, kbdev);

        /* Populate kbase-only fields */
        gpu_props->l2_props.associativity = KBASE_UBFX32(raw->l2_features, 8U, 8);
        gpu_props->l2_props.external_bus_width = KBASE_UBFX32(raw->l2_features, 24U, 8);

        gpu_props->l3_props.associativity = KBASE_UBFX32(raw->l3_features, 8U, 8);
        gpu_props->l3_props.external_bus_width = KBASE_UBFX32(raw->l3_features, 24U, 8);

        gpu_props->mem.core_group = KBASE_UBFX32(raw->mem_features, 0U, 1);
        gpu_props->mem.supergroup = KBASE_UBFX32(raw->mem_features, 1U, 1);

        gpu_props->mmu.va_bits = KBASE_UBFX32(raw->mmu_features, 0U, 8);
        gpu_props->mmu.pa_bits = KBASE_UBFX32(raw->mmu_features, 8U, 8);

        gpu_props->num_cores = hweight64(raw->shader_present);
        gpu_props->num_core_groups = hweight64(raw->l2_present);
        gpu_props->num_supergroups = hweight64(raw->l3_present);
        gpu_props->num_address_spaces = hweight32(raw->as_present);
        gpu_props->num_job_slots = hweight32(raw->js_present);
}
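KBASE_UBFX32() is used above to pull unsigned bitfields out of the raw feature registers but is not defined in this snippet; a minimal shift-and-mask sketch with the same (value, offset, size) argument order, offered as an assumption rather than the driver's actual definition:

/*
 * Hypothetical unsigned bitfield extract with the same
 * (value, offset, size) argument order as KBASE_UBFX32() above;
 * an assumed stand-in for illustration, not the kbase macro.
 */
#define UBFX32_SKETCH(value, offset, size) \
        (((u32)(value) >> (u32)(offset)) & (u32)((1ULL << (size)) - 1))

/* e.g. associativity = UBFX32_SKETCH(raw->l2_features, 8U, 8); */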
/* Count the number of free inodes. */
static unsigned int
xchk_iallocbt_freecount(
        xfs_inofree_t           freemask)
{
        BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
        return hweight64(freemask);
}
static int init_cgr(struct device *qidev)
{
        int ret;
        struct qm_mcc_initcgr opts;
        const u64 cpus = *(u64 *)qman_affine_cpus();
        const int num_cpus = hweight64(cpus);
        const u64 val = num_cpus * MAX_RSP_FQ_BACKLOG_PER_CPU;

        ret = qman_alloc_cgrid(&qipriv.cgr.cgrid);
        if (ret) {
                dev_err(qidev, "CGR alloc failed for rsp FQs: %d\n", ret);
                return ret;
        }

        qipriv.cgr.cb = cgr_cb;
        memset(&opts, 0, sizeof(opts));
        opts.we_mask = cpu_to_be16(QM_CGR_WE_CSCN_EN | QM_CGR_WE_CS_THRES |
                                   QM_CGR_WE_MODE);
        opts.cgr.cscn_en = QM_CGR_EN;
        opts.cgr.mode = QMAN_CGR_MODE_FRAME;
        qm_cgr_cs_thres_set64(&opts.cgr.cs_thres, val, 1);

        ret = qman_create_cgr(&qipriv.cgr, QMAN_CGR_FLAG_USE_INIT, &opts);
        if (ret) {
                dev_err(qidev, "Error %d creating CAAM CGRID: %u\n", ret,
                        qipriv.cgr.cgrid);
                return ret;
        }

        dev_info(qidev, "Congestion threshold set to %llu\n", val);
        return 0;
}
void hweight64_test()
{
        for (int i = 0; i < 100000; ++i) {
                uint64_t r = RAND_NR_NEXT(u, v, w);
                assert(__builtin_popcountll(r) == hweight64(r));
        }
}
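The test above validates hweight64() against the compiler builtin; for experimenting outside the kernel, a minimal portable population-count sketch (an assumed stand-in, not the hweight64() actually under test) can be written with the classic SWAR reduction:

#include <stdint.h>

/*
 * Hypothetical stand-alone population count for experimentation only.
 * Classic SWAR: sum adjacent bit pairs, then nibbles, then fold the
 * byte sums together by multiplying with 0x0101010101010101.
 */
static inline unsigned int hweight64_sketch(uint64_t w)
{
        w = w - ((w >> 1) & 0x5555555555555555ULL);
        w = (w & 0x3333333333333333ULL) + ((w >> 2) & 0x3333333333333333ULL);
        w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
        return (unsigned int)((w * 0x0101010101010101ULL) >> 56);
}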
int main()
{
        uint64_t u, v, w;
        uint64_t seed = time(0);

        RAND_NR_INIT(u, v, w, seed);
        for (int i = 0; i < 100; i++)
                printf("rand number = %llx\n",
                       (unsigned long long)RAND_NR_NEXT(u, v, w));

        uint64_t r = RAND_NR_NEXT(u, v, w);
        uint64_t s = BIN_TO_GRAYCODE(r);
        uint64_t t = BIN_TO_GRAYCODE(r + 1);
        assert(hweight64(t ^ s) == 1);
        GRAYCODE_TO_BIN64(s);

        assert(rev8(5) == 160);
        assert(rev8_hakmem(5) == 160);

        uint64_t hi, lo;
        MULQ(0x1234567887654321ul, 0x77665544332211fful, lo, hi);
        assert(hi == 611815671993850618UL);
        assert(lo == 14353276178066116319UL);

        if (s != r)
                fprintf(stderr, "expected %016llx, actual %016llx\n",
                        (unsigned long long)r, (unsigned long long)s);
        else
                printf("passed\n");
        return 0;
}
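The assertion hweight64(t ^ s) == 1 relies on the Gray-code property that consecutive codes differ in exactly one bit. A minimal sketch of the conversion pair, assuming the usual g = b ^ (b >> 1) encoding and an in-place decode (which is how main() can compare s against r afterwards); these macros are illustrative stand-ins for BIN_TO_GRAYCODE/GRAYCODE_TO_BIN64, not their actual definitions:

/*
 * Assumed definitions: encode with g = b ^ (b >> 1); decode by folding
 * the prefix XOR back in with shifts of 32, 16, 8, 4, 2, 1.
 * The decode macro rewrites its argument in place.
 */
#define BIN_TO_GRAYCODE_SKETCH(b)   ((b) ^ ((b) >> 1))

#define GRAYCODE_TO_BIN64_SKETCH(g) do { \
        (g) ^= (g) >> 32; \
        (g) ^= (g) >> 16; \
        (g) ^= (g) >> 8;  \
        (g) ^= (g) >> 4;  \
        (g) ^= (g) >> 2;  \
        (g) ^= (g) >> 1;  \
} while (0)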
unsigned int hweight64_time()
{
        unsigned int r = 0;

        for (uint64_t i = 0; i < 100000000; ++i)
                r += hweight64(i);
        return r;
}
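A hypothetical user-space harness for the micro-benchmark above, assuming the hweight64_time() definition is in scope; clock() granularity is coarse, and an optimizing compiler may fold the loop away unless the result is actually used:

#include <stdio.h>
#include <time.h>

int main(void)
{
        clock_t start = clock();
        unsigned int sink = hweight64_time();   /* keep the result live */
        double secs = (double)(clock() - start) / CLOCKS_PER_SEC;

        printf("hweight64_time() = %u in %.3f s\n", sink, secs);
        return 0;
}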
static int gru_seq_show(struct seq_file *file, void *data)
{
        long gid = *(long *)data, ctxfree, cbrfree, dsrfree;
        struct gru_state *gru = GID_TO_GRU(gid);

        if (gid == 0) {
                seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "gid", "nid",
                           "ctx", "cbr", "dsr", "ctx", "cbr", "dsr");
                seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "", "", "busy",
                           "busy", "busy", "free", "free", "free");
        }
        if (gru) {
                ctxfree = GRU_NUM_CCH - gru->gs_active_contexts;
                cbrfree = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
                dsrfree = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
                seq_printf(file, " %5d%5d%7ld%6ld%6ld%8ld%6ld%6ld\n",
                           gru->gs_gid, gru->gs_blade_id,
                           GRU_NUM_CCH - ctxfree, GRU_NUM_CBE - cbrfree,
                           GRU_NUM_DSR_BYTES - dsrfree, ctxfree, cbrfree,
                           dsrfree);
        }
        return 0;
}
static int alloc_cgrs(struct device *qidev)
{
        struct qm_mcc_initcgr opts;
        int ret;
        const u64 cpus = *(u64 *)qman_affine_cpus();
        const int num_cpus = hweight64(cpus);
        u64 val;

        /* Allocate response CGR */
        ret = qman_alloc_cgrid(&qipriv.rsp_cgr.cgrid);
        if (ret) {
                dev_err(qidev, "CGR alloc failed for rsp FQs");
                return ret;
        }

        qipriv.rsp_cgr.cb = rsp_cgr_cb;
        memset(&opts, 0, sizeof(opts));
        opts.we_mask = QM_CGR_WE_CSCN_EN | QM_CGR_WE_CS_THRES | QM_CGR_WE_MODE;
        opts.cgr.cscn_en = QM_CGR_EN;
        opts.cgr.mode = QMAN_CGR_MODE_FRAME;

#ifdef CONFIG_FSL_DPAA_ETH
        /*
         * This effectively sets the to-CPU threshold equal to half of the
         * number of buffers available to the dpa_eth driver. It means that
         * at most half of the buffers can be in the queues from SEC, waiting
         * to be transmitted to the core (and then on the TX queues).
         * NOTE: This is an arbitrary division; the factor '2' below could
         * also be '3' or '4'. It also depends on the number of devices
         * using the dpa_eth buffers (which can be >1 if f.i. PME/DCE are
         * also used).
         */
        val = num_cpus * CONFIG_FSL_DPAA_ETH_MAX_BUF_COUNT / 2;
#else
        val = num_cpus * MAX_RSP_FQ_BACKLOG_PER_CPU;
#endif
        qm_cgr_cs_thres_set64(&opts.cgr.cs_thres, val, 1);

        ret = qman_create_cgr(&qipriv.rsp_cgr, QMAN_CGR_FLAG_USE_INIT, &opts);
        if (ret) {
                dev_err(qidev, "Error %d creating CAAM rsp CGRID: %u\n", ret,
                        qipriv.rsp_cgr.cgrid);
                return ret;
        }

#ifdef DEBUG
        dev_info(qidev, "CAAM to CPU threshold set to %llu\n", val);
#endif
        return 0;
}
/*
 * Advance the clean counter. When the clean period has expired,
 * clean an entry.
 *
 * This is implemented in atomics to avoid locking. Because multiple
 * variables are involved, it can be racy which can lead to slightly
 * inaccurate information. Since this is only a heuristic, this is
 * OK. Any inaccuracies will clean themselves out as the counter
 * advances. That said, it is unlikely the entry clean operation will
 * race - the next possible racer will not start until the next clean
 * period.
 *
 * The clean counter is implemented as a decrement to zero. When zero
 * is reached an entry is cleaned.
 */
static void wss_advance_clean_counter(void)
{
        int entry;
        int weight;
        unsigned long bits;

        /* become the cleaner if we decrement the counter to zero */
        if (atomic_dec_and_test(&wss.clean_counter)) {
                /*
                 * Set, not add, the clean period. This avoids an issue
                 * where the counter could decrement below the clean period.
                 * Doing a set can result in lost decrements, slowing the
                 * clean advance. Since this is a heuristic, this possible
                 * slowdown is OK.
                 *
                 * An alternative is to loop, advancing the counter by a
                 * clean period until the result is > 0. However, this could
                 * lead to several threads keeping another in the clean loop.
                 * This could be mitigated by limiting the number of times
                 * we stay in the loop.
                 */
                atomic_set(&wss.clean_counter, wss_clean_period);

                /*
                 * Uniquely grab the entry to clean and move to next.
                 * The current entry is always the lower bits of
                 * wss.clean_entry. The table size, wss.num_entries,
                 * is always a power-of-2.
                 */
                entry = (atomic_inc_return(&wss.clean_entry) - 1) &
                        (wss.num_entries - 1);

                /* clear the entry and count the bits */
                bits = xchg(&wss.entries[entry], 0);
                weight = hweight64((u64)bits);
                /* only adjust the contended total count if needed */
                if (weight)
                        atomic_sub(weight, &wss.total_count);
        }
}
static int default_handler(struct task_struct *task, void *buf,
                           pfm_ovfl_arg_t *arg, struct pt_regs *regs,
                           unsigned long stamp)
{
        pfm_default_smpl_hdr_t *hdr;
        pfm_default_smpl_entry_t *ent;
        void *cur, *last;
        unsigned long *e, entry_size;
        unsigned int npmds, i;
        unsigned char ovfl_pmd;
        unsigned char ovfl_notify;

        if (unlikely(buf == NULL || arg == NULL || regs == NULL || task == NULL)) {
                DPRINT(("[%d] invalid arguments buf=%p arg=%p\n",
                        task->pid, buf, arg));
                return -EINVAL;
        }

        hdr = (pfm_default_smpl_hdr_t *)buf;
        cur = buf + hdr->hdr_cur_offs;
        last = buf + hdr->hdr_buf_size;

        ovfl_pmd = arg->ovfl_pmd;
        ovfl_notify = arg->ovfl_notify;

        /*
         * precheck for sanity
         */
        if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE)
                goto full;

        npmds = hweight64(arg->smpl_pmds[0]);

        ent = (pfm_default_smpl_entry_t *)cur;
        prefetch(arg->smpl_pmds_values);

        entry_size = sizeof(*ent) + (npmds << 3);

        /* position for first pmd */
        e = (unsigned long *)(ent + 1);

        hdr->hdr_count++;

        DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n",
                     task->pid, hdr->hdr_count, cur, last, last - cur,
                     ovfl_pmd, ovfl_notify, npmds));

        /*
         * current = task running at the time of the overflow.
         *
         * per-task mode:
         *      - this is usually the task being monitored.
         *        Under certain conditions, it might be a different task
         *
         * system-wide:
         *      - this is not necessarily the task controlling the session
         */
        ent->pid = current->pid;
        ent->ovfl_pmd = ovfl_pmd;
        ent->last_reset_val = arg->pmd_last_reset; /* pmd[0].reg_last_reset_val */

        /*
         * where did the fault happen (includes slot number)
         */
        ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3);

        ent->tstamp = stamp;
        ent->cpu = smp_processor_id();
        ent->set = arg->active_set;
        ent->tgid = current->tgid;

        /*
         * selectively store PMDs in increasing index number
         */
        if (npmds) {
                unsigned long *val = arg->smpl_pmds_values;

                for (i = 0; i < npmds; i++)
                        *e++ = *val++;
        }

        /*
         * update position for next entry
         */
        hdr->hdr_cur_offs += entry_size;
        cur += entry_size;

        /*
         * post check to avoid losing the last sample
         */
        if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE)
                goto full;

        /*
         * keep same ovfl_pmds, ovfl_notify
         */
        arg->ovfl_ctrl.bits.notify_user = 0;
        arg->ovfl_ctrl.bits.block_task = 0;
        arg->ovfl_ctrl.bits.mask_monitoring = 0;
        arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */

        return 0;

full:
        DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n",
                     last - cur, hdr->hdr_count, ovfl_notify));

        /*
         * increment number of buffer overflow.
         * important to detect duplicate set of samples.
         */
        hdr->hdr_overflows++;

        /*
         * if no notification requested, then we saturate the buffer
         */
        if (ovfl_notify == 0) {
                arg->ovfl_ctrl.bits.notify_user = 0;
                arg->ovfl_ctrl.bits.block_task = 0;
                arg->ovfl_ctrl.bits.mask_monitoring = 1;
                arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0;
        } else {
                arg->ovfl_ctrl.bits.notify_user = 1;
                arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */
                arg->ovfl_ctrl.bits.mask_monitoring = 1;
                arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */
        }
        return -1; /* we are full, sorry */
}
STATIC void kbase_gpuprops_construct_coherent_groups(base_gpu_props * const props)
{
        struct mali_base_gpu_coherent_group *current_group;
        u64 group_present;
        u64 group_mask;
        u64 first_set, first_set_prev;
        u32 num_groups = 0;

        KBASE_DEBUG_ASSERT(NULL != props);

        props->coherency_info.coherency = props->raw_props.mem_features;
        props->coherency_info.num_core_groups = hweight64(props->raw_props.l2_present);

        if (props->coherency_info.coherency & GROUPS_L3_COHERENT) {
                /* Group is l3 coherent */
                group_present = props->raw_props.l3_present;
        } else if (props->coherency_info.coherency & GROUPS_L2_COHERENT) {
                /* Group is l2 coherent */
                group_present = props->raw_props.l2_present;
        } else {
                /* Group is l1 coherent */
                group_present = props->raw_props.shader_present;
        }

        /*
         * The coherent group mask can be computed from the l2/l3 present
         * register.
         *
         * For the coherent group n:
         * group_mask[n] = (first_set[n] - 1) & ~(first_set[n-1] - 1)
         * where first_set is group_present with only its nth set-bit kept
         * (i.e. the position from where a new group starts).
         *
         * For instance if the groups are l2 coherent and l2_present=0x0..01111:
         * The first mask is:
         * group_mask[1] = (first_set[1] - 1) & ~(first_set[0] - 1)
         *               = (0x0..010 - 1) & ~(0x0..01 - 1)
         *               = 0x0..00f
         * The second mask is:
         * group_mask[2] = (first_set[2] - 1) & ~(first_set[1] - 1)
         *               = (0x0..100 - 1) & ~(0x0..010 - 1)
         *               = 0x0..0f0
         * And so on until all the bits from group_present have been cleared
         * (i.e. there is no group left).
         */
        current_group = props->coherency_info.group;
        first_set = group_present & ~(group_present - 1);

        while (group_present != 0 && num_groups < BASE_MAX_COHERENT_GROUPS) {
                group_present -= first_set;     /* Clear the current group bit */
                first_set_prev = first_set;

                first_set = group_present & ~(group_present - 1);
                group_mask = (first_set - 1) & ~(first_set_prev - 1);

                /* Populate the coherent_group structure for each group */
                current_group->core_mask = group_mask & props->raw_props.shader_present;
                current_group->num_cores = hweight64(current_group->core_mask);

                num_groups++;
                current_group++;
        }

        if (group_present != 0)
                KBASE_DEBUG_PRINT_WARN(KBASE_CORE,
                                       "Too many coherent groups (keeping only %d groups).",
                                       BASE_MAX_COHERENT_GROUPS);

        props->coherency_info.num_groups = num_groups;
}
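A minimal user-space sketch of the same mask derivation, using assumed example values (l2 bits at positions 0, 4 and 8, and twelve shader cores) chosen only to show how each iteration isolates one coherent group; it mirrors the loop above but is not the driver code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t group_present = 0x111;   /* assumed l2_present example */
        uint64_t shader_present = 0xfff;  /* assumed shader_present example */
        uint64_t first_set = group_present & ~(group_present - 1);

        while (group_present != 0) {
                uint64_t first_set_prev, group_mask;

                group_present -= first_set;     /* clear the current group bit */
                first_set_prev = first_set;
                first_set = group_present & ~(group_present - 1);
                group_mask = (first_set - 1) & ~(first_set_prev - 1);

                printf("core_mask = 0x%llx, num_cores = %d\n",
                       (unsigned long long)(group_mask & shader_present),
                       __builtin_popcountll(group_mask & shader_present));
        }
        return 0;
}

With the assumed values this prints masks 0xf, 0xf0 and 0xf00, each with four cores, matching the per-group split the driver computes.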