void __devinit early_intel_workaround(struct cpuinfo_x86 *c)
{
    if (c->x86_vendor != X86_VENDOR_INTEL)
        return;

    /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
    if (c->x86 == 15 && c->x86_cache_alignment == 64)
        c->x86_cache_alignment = 128;

    /* Unmask CPUID levels if masked: */
    if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
        u64 misc_enable;

        rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
            misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
            wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
            c->cpuid_level = cpuid_eax(0);
            printk("revised cpuid_level = %d\n", c->cpuid_level);
        }
    }
}
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
    /* Unmask CPUID levels if masked: */
    if (c->x86 == 6 && c->x86_model >= 15) {
        u64 misc_enable;

        rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
            misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
            wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
            c->cpuid_level = cpuid_eax(0);
        }
    }

    /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
    if (c->x86 == 15 && c->x86_cache_alignment == 64)
        c->x86_cache_alignment = 128;

    if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
        (c->x86 == 0x6 && c->x86_model >= 0x0e))
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
static void do_drv_write(void *drvcmd)
{
    struct drv_cmd *cmd;
    uint64_t msr_content;

    cmd = (struct drv_cmd *)drvcmd;

    switch (cmd->type) {
    case SYSTEM_INTEL_MSR_CAPABLE:
        /* Read-modify-write: replace only the P-state request bits. */
        rdmsrl(cmd->addr.msr.reg, msr_content);
        msr_content = (msr_content & ~INTEL_MSR_RANGE) |
                      (cmd->val & INTEL_MSR_RANGE);
        wrmsrl(cmd->addr.msr.reg, msr_content);
        break;
    case SYSTEM_IO_CAPABLE:
        acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
                           cmd->val, (u32)cmd->addr.io.bit_width);
        break;
    default:
        break;
    }
}
static void intel_thermal_interrupt(struct cpu_user_regs *regs)
{
    uint64_t msr_content;
    unsigned int cpu = smp_processor_id();
    static DEFINE_PER_CPU(s_time_t, next);

    ack_APIC_irq();

    /* Rate-limit the messages to one per five seconds per CPU. */
    if (NOW() < per_cpu(next, cpu))
        return;

    per_cpu(next, cpu) = NOW() + MILLISECS(5000);
    rdmsrl(MSR_IA32_THERM_STATUS, msr_content);
    if (msr_content & 0x1) {
        printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
        printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", cpu);
        add_taint(TAINT_MACHINE_CHECK);
    } else {
        printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
    }
}
static int apply_microcode(int cpu)
{
    unsigned long flags;
    struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
    uint32_t rev;
    struct microcode_amd *mc_amd = uci->mc.mc_amd;

    /* We should bind the task to the CPU */
    BUG_ON(raw_smp_processor_id() != cpu);

    if ( mc_amd == NULL )
        return -EINVAL;

    spin_lock_irqsave(&microcode_update_lock, flags);

    wrmsrl(MSR_AMD_PATCHLOADER, (unsigned long)&mc_amd->hdr.data_code);

    /* get patch id after patching */
    rdmsrl(MSR_AMD_PATCHLEVEL, rev);

    spin_unlock_irqrestore(&microcode_update_lock, flags);

    /* check current patch id and patch's id for match */
    if ( rev != mc_amd->hdr.patch_id )
    {
        printk(KERN_ERR "microcode: CPU%d update from revision "
               "0x%x to 0x%x failed\n", cpu, mc_amd->hdr.patch_id, rev);
        return -EIO;
    }

    printk(KERN_INFO "microcode: CPU%d updated from revision %#x to %#x\n",
           cpu, uci->cpu_sig.rev, mc_amd->hdr.patch_id);

    uci->cpu_sig.rev = rev;

    return 0;
}
static int __pminit setup_p4_watchdog(void)
{
    uint64_t misc_enable;

    rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL))
        return 0;

    nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0;
    nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
    if ( boot_cpu_data.x86_num_siblings == 2 )
        nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;

    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
        clear_msr_range(0x3F1, 2);
    /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
       docs don't fully define it, so leave it alone for now. */
    if (boot_cpu_data.x86_model >= 0x3) {
        /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
        clear_msr_range(0x3A0, 26);
        clear_msr_range(0x3BC, 3);
    } else {
        clear_msr_range(0x3A0, 31);
    }
    clear_msr_range(0x3C0, 6);
    clear_msr_range(0x3C8, 6);
    clear_msr_range(0x3E0, 2);
    clear_msr_range(MSR_P4_BPU_CCCR0, 18);
    clear_msr_range(MSR_P4_BPU_PERFCTR0, 18);

    wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0);
    wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE);
    write_watchdog_counter("P4_IQ_COUNTER0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
    return 1;
}
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
    /* Unmask CPUID levels if masked: */
    if (c->x86 == 6 && c->x86_model >= 15) {
        u64 misc_enable;

        rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
            misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
            wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
            c->cpuid_level = cpuid_eax(0);
        }
    }

    if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
        (c->x86 == 0x6 && c->x86_model >= 0x0e))
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);

#ifdef CONFIG_X86_64
    set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#else
    /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
    if (c->x86 == 15 && c->x86_cache_alignment == 64)
        c->x86_cache_alignment = 128;
#endif

    /*
     * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
     * with P/T states and does not stop in deep C-states
     */
    if (c->x86_power & (1 << 8)) {
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
        set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
    }
}
static int ppro_check_ctrs(struct pt_regs * const regs,
                           struct op_msrs const * const msrs)
{
    u64 val;
    int i;

    /*
     * This can happen if perf counters are in use when
     * we steal the die notifier NMI.
     */
    if (unlikely(!reset_value))
        goto out;

    for (i = 0; i < num_counters; ++i) {
        if (!reset_value[i])
            continue;
        rdmsrl(msrs->counters[i].addr, val);
        if (val & (1ULL << (counter_width - 1)))
            continue;
        oprofile_add_sample(regs, i);
        wrmsrl(msrs->counters[i].addr, -reset_value[i]);
    }

out:
    /* Only P6 based Pentium M needs to re-unmask the apic vector, but it
     * doesn't hurt other P6 variants. */
    apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);

    /* We can't work out if we really handled an interrupt. We
     * might have caught a *second* counter just after it overflowed;
     * the interrupt for this counter then arrives
     * and we don't find a counter that's overflowed, so we
     * would return 0 and get dazed + confused. Instead we always
     * assume we found an overflow. This sucks.
     */
    return 1;
}
static void intel_epb_restore(void)
{
    u64 val = this_cpu_read(saved_epb);
    u64 epb;

    rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
    if (val) {
        val &= EPB_MASK;
    } else {
        /*
         * Because intel_epb_save() has not run for the current CPU yet,
         * it is going online for the first time, so if its EPB value is
         * 0 ('performance') at this point, assume that it has not been
         * initialized by the platform firmware and set it to 6
         * ('normal').
         */
        val = epb & EPB_MASK;
        if (val == ENERGY_PERF_BIAS_PERFORMANCE) {
            val = ENERGY_PERF_BIAS_NORMAL;
            pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
        }
    }
    wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
}
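/*
 * Illustrative companion sketch, not taken from this source: the restore
 * path above only trusts saved_epb when it is nonzero, which implies the
 * matching save handler must mark the saved value as valid. A minimal
 * version of that save path is sketched below under the assumption that
 * EPB_SAVED is a flag bit lying outside EPB_MASK (saved_epb and EPB_MASK
 * come from the snippet above; EPB_SAVED is an assumed name).
 */
static void intel_epb_save(void)
{
    u64 epb;

    rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
    /* Keep saved_epb nonzero even when the hardware EPB field reads 0,
     * so that the "if (val)" test in intel_epb_restore() can key off it. */
    this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED);
}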
static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
    unsigned long mask = x86_pmu.lbr_nr - 1;
    u64 tos = intel_pmu_lbr_tos();
    int i;

    for (i = 0; i < x86_pmu.lbr_nr; i++) {
        unsigned long lbr_idx = (tos - i) & mask;
        union {
            struct {
                u32 from;
                u32 to;
            };
            u64 lbr;
        } msr_lastbranch;

        rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);

        cpuc->lbr_entries[i].from  = msr_lastbranch.from;
        cpuc->lbr_entries[i].to    = msr_lastbranch.to;
        cpuc->lbr_entries[i].flags = 0;
    }
    cpuc->lbr_stack.nr = i;
}
/* AMD K8 machine check */
void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
{
    uint64_t value;
    uint32_t i;
    int cpu_nr;

    machine_check_vector = k8_machine_check;
    cpu_nr = smp_processor_id();
    wmb();

    rdmsrl(MSR_IA32_MCG_CAP, value);
    if (value & MCG_CTL_P) /* Control register present? */
        wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
    nr_mce_banks = value & MCG_CAP_COUNT;

    for (i = 0; i < nr_mce_banks; i++) {
        switch (i) {
        case 4: /* Northbridge */
            /* Enable error reporting of all errors,
             * enable error checking and
             * disable sync flooding */
            wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
            wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
            break;

        default:
            /* Enable error reporting of all errors */
            wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
            wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
            break;
        }
    }

    set_in_cr4(X86_CR4_MCE);
    printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
}
static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
    struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
    struct perf_event *event = pcpu->event;
    struct hw_perf_event *hwc = &event->hw;
    struct perf_sample_data data;
    struct perf_raw_record raw;
    struct pt_regs regs;
    struct perf_ibs_data ibs_data;
    int offset, size, check_rip, offset_max, throttle = 0;
    unsigned int msr;
    u64 *buf, *config, period;

    if (!test_bit(IBS_STARTED, pcpu->state)) {
        /*
         * Catch spurious interrupts after stopping IBS: After
         * disabling IBS there could still be incoming NMIs
         * with samples that even have the valid bit cleared.
         * Mark all these NMIs as handled.
         */
        return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
    }

    msr = hwc->config_base;
    buf = ibs_data.regs;
    rdmsrl(msr, *buf);
    if (!(*buf++ & perf_ibs->valid_mask))
        return 0;

    config = &ibs_data.regs[0];
    perf_ibs_event_update(perf_ibs, event, config);
    perf_sample_data_init(&data, 0, hwc->last_period);
    if (!perf_ibs_set_period(perf_ibs, hwc, &period))
        goto out; /* no sw counter overflow */

    ibs_data.caps = ibs_caps;
    size = 1;
    offset = 1;
    check_rip = (perf_ibs == &perf_ibs_op &&
                 (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
    if (event->attr.sample_type & PERF_SAMPLE_RAW)
        offset_max = perf_ibs->offset_max;
    else if (check_rip)
        offset_max = 2;
    else
        offset_max = 1;
    do {
        rdmsrl(msr + offset, *buf++);
        size++;
        offset = find_next_bit(perf_ibs->offset_mask,
                               perf_ibs->offset_max,
                               offset + 1);
    } while (offset < offset_max);
    if (event->attr.sample_type & PERF_SAMPLE_RAW) {
        /*
         * Read IbsBrTarget and IbsOpData4 separately
         * depending on their availability.
         * Can't add to offset_max as they are staggered
         */
        if (ibs_caps & IBS_CAPS_BRNTRGT) {
            rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
            size++;
        }
        if (ibs_caps & IBS_CAPS_OPDATA4) {
            rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
            size++;
        }
    }
    ibs_data.size = sizeof(u64) * size;

    regs = *iregs;
    if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
        regs.flags &= ~PERF_EFLAGS_EXACT;
    } else {
        set_linear_ip(&regs, ibs_data.regs[1]);
        regs.flags |= PERF_EFLAGS_EXACT;
    }

    if (event->attr.sample_type & PERF_SAMPLE_RAW) {
        raw.size = sizeof(u32) + ibs_data.size;
        raw.data = ibs_data.data;
        data.raw = &raw;
    }

    throttle = perf_event_overflow(event, &data, &regs);
out:
    if (throttle)
        perf_ibs_disable_event(perf_ibs, hwc, *config);
    else
        perf_ibs_enable_event(perf_ibs, hwc, period >> 4);

    perf_event_update_userpage(event);

    return 1;
}
static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
    unsigned int i;
    unsigned int valid_states = 0;
    unsigned int cpu = policy->cpu;
    struct acpi_cpufreq_data *data;
    unsigned int result = 0;
    struct processor_performance *perf;
    u32 max_hw_pstate;
    uint64_t msr_content;
    struct cpuinfo_x86 *c = &cpu_data[policy->cpu];

    data = xzalloc(struct acpi_cpufreq_data);
    if (!data)
        return -ENOMEM;

    cpufreq_drv_data[cpu] = data;

    data->acpi_data = &processor_pminfo[cpu]->perf;

    perf = data->acpi_data;
    policy->shared_type = perf->shared_type;

    if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
        policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
        cpumask_set_cpu(cpu, policy->cpus);
        if (cpumask_weight(policy->cpus) != 1) {
            printk(XENLOG_WARNING "Unsupported sharing type %d (%u CPUs)\n",
                   policy->shared_type, cpumask_weight(policy->cpus));
            result = -ENODEV;
            goto err_unreg;
        }
    } else {
        cpumask_copy(policy->cpus, cpumask_of(cpu));
    }

    /* capability check */
    if (perf->state_count <= 1) {
        printk("No P-States\n");
        result = -ENODEV;
        goto err_unreg;
    }
    rdmsrl(MSR_PSTATE_CUR_LIMIT, msr_content);
    max_hw_pstate = (msr_content & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;

    if (perf->control_register.space_id != perf->status_register.space_id) {
        result = -ENODEV;
        goto err_unreg;
    }

    data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
                                     (perf->state_count+1));
    if (!data->freq_table) {
        result = -ENOMEM;
        goto err_unreg;
    }

    /* detect transition latency */
    policy->cpuinfo.transition_latency = 0;
    for (i = 0; i < perf->state_count; i++) {
        if ((perf->states[i].transition_latency * 1000) >
            policy->cpuinfo.transition_latency)
            policy->cpuinfo.transition_latency =
                perf->states[i].transition_latency * 1000;
    }

    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;

    /* table init */
    for (i = 0; i < perf->state_count && i <= max_hw_pstate; i++) {
        if (i > 0 && perf->states[i].core_frequency >=
            data->freq_table[valid_states-1].frequency / 1000)
            continue;

        data->freq_table[valid_states].index =
            perf->states[i].control & HW_PSTATE_MASK;
        data->freq_table[valid_states].frequency =
            perf->states[i].core_frequency * 1000;
        valid_states++;
    }
    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
    perf->state = 0;

    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
    if (result)
        goto err_freqfree;

    if (c->cpuid_level >= 6)
        on_selected_cpus(cpumask_of(cpu), feature_detect, policy, 1);

    /*
     * the first call to ->target() should result in us actually
     * writing something to the appropriate registers.
     */
    data->arch_cpu_flags |= ARCH_CPU_FLAG_RESUME;

    policy->cur = data->freq_table[i].frequency;
    return result;

err_freqfree:
    xfree(data->freq_table);
err_unreg:
    xfree(data);
    cpufreq_drv_data[cpu] = NULL;

    return result;
}
/* P4/Xeon Thermal regulation detect and init */
static void intel_init_thermal(struct cpuinfo_x86 *c)
{
    uint64_t msr_content;
    uint32_t val;
    int tm2 = 0;
    unsigned int cpu = smp_processor_id();
    static uint8_t thermal_apic_vector;

    if (!intel_thermal_supported(c))
        return; /* -ENODEV */

    /* first check if it's enabled already, in which case there might
     * be some SMM goo which handles it, so we can't even put a handler
     * since it might be delivered via SMI already -zwanem. */
    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    val = lvtthmr_init;
    /*
     * The initial value of thermal LVT entries on all APs always reads
     * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
     * sequence to them and LVT registers are reset to 0s except for
     * the mask bits which are set to 1s when APs receive INIT IPI.
     * If BIOS takes over the thermal interrupt and sets its interrupt
     * delivery mode to SMI (not fixed), it restores the value that the
     * BIOS has programmed on AP based on BSP's info we saved (since BIOS
     * is required to set the same value for all threads/cores).
     */
    if ((val & APIC_MODE_MASK) != APIC_DM_FIXED
        || (val & APIC_VECTOR_MASK) > 0xf)
        apic_write(APIC_LVTTHMR, val);

    if ((msr_content & (1ULL<<3))
        && (val & APIC_MODE_MASK) == APIC_DM_SMI) {
        if (c == &boot_cpu_data)
            printk(KERN_DEBUG "Thermal monitoring handled by SMI\n");
        return; /* -EBUSY */
    }

    if (cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13)))
        tm2 = 1;

    /* check whether a vector already exists, temporarily masked? */
    if (val & APIC_VECTOR_MASK) {
        if (c == &boot_cpu_data)
            printk(KERN_DEBUG "Thermal LVT vector (%#x) already installed\n",
                   val & APIC_VECTOR_MASK);
        return; /* -EBUSY */
    }

    alloc_direct_apic_vector(&thermal_apic_vector, intel_thermal_interrupt);

    /* The temperature transition interrupt handler setup */
    val = thermal_apic_vector;    /* our delivery vector */
    val |= (APIC_DM_FIXED | APIC_LVT_MASKED);  /* we'll mask till we're ready */
    apic_write_around(APIC_LVTTHMR, val);

    rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content);
    wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03);

    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL<<3));

    apic_write_around(APIC_LVTTHMR, val & ~APIC_LVT_MASKED);
    if (opt_cpu_info)
        printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n",
               cpu, tm2 ? "TM2" : "TM1");
    return;
}
/* Machine Check Handler for AMD K8 family series */
void k8_machine_check(struct cpu_user_regs *regs, long error_code)
{
    struct vcpu *vcpu = current;
    struct domain *curdom;
    struct mc_info *mc_data;
    struct mcinfo_global mc_global;
    struct mcinfo_bank mc_info;
    uint64_t status, addrv, miscv, uc;
    uint32_t i;
    unsigned int cpu_nr;
    uint32_t xen_impacted = 0;
#define DOM_NORMAL  0
#define DOM0_TRAP   1
#define DOMU_TRAP   2
#define DOMU_KILLED 4
    uint32_t dom_state = DOM_NORMAL;

    /* This handler runs as interrupt gate. So IPIs from the
     * polling service routine are deferred until we have finished.
     */

    /* Disable interrupts for the _vcpu_. Otherwise it may be
     * re-scheduled to another physical CPU, or the impacted process
     * in the guest may continue running with corrupted data.
     */
    vcpu_schedule_lock_irq(vcpu);

    mc_data = x86_mcinfo_getptr();
    cpu_nr = smp_processor_id();
    curdom = vcpu->domain;

    memset(&mc_global, 0, sizeof(mc_global));
    mc_global.common.type = MC_TYPE_GLOBAL;
    mc_global.common.size = sizeof(mc_global);

    mc_global.mc_domid = curdom->domain_id; /* impacted domain */
    mc_global.mc_coreid = vcpu->processor;  /* impacted physical cpu */
    BUG_ON(cpu_nr != vcpu->processor);
    mc_global.mc_core_threadid = 0;
    mc_global.mc_vcpuid = vcpu->vcpu_id;    /* impacted vcpu */
#if 0 /* TODO: on which socket is this physical core?
         It's not clear to me how to figure this out. */
    mc_global.mc_socketid = ???;
#endif
    mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
    rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);

    /* Quick check, who is impacted */
    xen_impacted = is_idle_domain(curdom);

    /* Dom0 */
    x86_mcinfo_clear(mc_data);
    x86_mcinfo_add(mc_data, &mc_global);

    for (i = 0; i < nr_mce_banks; i++) {
        struct domain *d;

        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);

        if (!(status & MCi_STATUS_VAL))
            continue;

        /* An error happened in this bank.
         * This is expected to be an uncorrectable error,
         * since correctable errors get polled.
         */
        uc = status & MCi_STATUS_UC;

        memset(&mc_info, 0, sizeof(mc_info));
        mc_info.common.type = MC_TYPE_BANK;
        mc_info.common.size = sizeof(mc_info);
        mc_info.mc_bank = i;
        mc_info.mc_status = status;

        addrv = 0;
        if (status & MCi_STATUS_ADDRV) {
            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
            d = maddr_get_owner(addrv);
            if (d != NULL)
                mc_info.mc_domid = d->domain_id;
        }

        miscv = 0;
        if (status & MCi_STATUS_MISCV)
            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);

        mc_info.mc_addr = addrv;
        mc_info.mc_misc = miscv;

        x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */

        if (mc_callback_bank_extended)
            mc_callback_bank_extended(mc_data, i, status);

        /* clear status */
        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
        wmb();
        add_taint(TAINT_MACHINE_CHECK);
    }

    status = mc_global.mc_gstatus;

    /* clear MCIP or the cpu enters shutdown state
     * in case another MCE occurs. */
    status &= ~MCG_STATUS_MCIP;
    wrmsrl(MSR_IA32_MCG_STATUS, status);
    wmb();

    /* For the details see the discussion "MCE/MCA concept" on xen-devel.
     * The thread started here:
     * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
     */

    /* MCG_STATUS_RIPV:
     * When this bit is not set, then the instruction pointer on the stack
     * to resume at is not valid. If xen is interrupted, then we panic anyway
     * right below. Otherwise it is up to the guest to figure out if
     * guest kernel or guest userland is affected and should kill either
     * itself or the affected process.
     */

    /* MCG_STATUS_EIPV:
     * Evaluation of EIPV is the job of the guest.
     */

    if (xen_impacted) {
        /* Now we are going to panic anyway. Allow interrupts, so that
         * printk on serial console can work. */
        vcpu_schedule_unlock_irq(vcpu);

        /* Uh, that means a machine check exception
         * occurred inside Xen. */
        printk("Machine check exception occurred in Xen.\n");

        /* if MCG_STATUS_EIPV indicates that the IP on the stack is related
         * to the error, then it makes sense to print a stack trace.
         * That can be useful for more detailed error analysis and/or
         * error case studies to figure out if we can clear
         * xen_impacted and kill a DomU instead
         * (i.e. if a guest-only control structure is affected, but then
         * we must ensure the bad pages are not re-used again).
         */
        if (status & MCG_STATUS_EIPV) {
            printk("MCE: Instruction Pointer is related to the error. "
                   "Therefore, print the execution state.\n");
            show_execution_state(regs);
        }
        x86_mcinfo_dump(mc_data);
        mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
    }

    /* If Dom0 registered a machine check handler, which is only possible
     * with a PV MCA driver, then ... */
    if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
        dom_state = DOM0_TRAP;

        /* ... deliver machine check trap to Dom0. */
        send_guest_trap(dom0, 0, TRAP_machine_check);

        /* Xen may tell Dom0 now to notify the DomU.
         * But this will happen through a hypercall. */
    } else
        /* Dom0 did not register a machine check handler, but if DomU
         * did so, then... */
        if ( guest_has_trap_callback(curdom, vcpu->vcpu_id,
                                     TRAP_machine_check) ) {
            dom_state = DOMU_TRAP;

            /* ... deliver machine check trap to DomU */
            send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
        } else {
            /* hmm... no one feels responsible to handle the error.
             * So, do a quick check if a DomU is impacted or not.
             */
            if (curdom == dom0) {
                /* Dom0 is impacted. Since no one can handle
                 * this error, panic! */
                x86_mcinfo_dump(mc_data);
                mc_panic("MCE occurred in Dom0, which it can't handle\n");

                /* UNREACHED */
            } else {
                dom_state = DOMU_KILLED;

                /* Enable interrupts. This basically results in
                 * calling sti on the *physical* cpu. But after
                 * domain_crash() the vcpu pointer is invalid.
                 * Therefore, we must unlock the irqs before killing
                 * it. */
                vcpu_schedule_unlock_irq(vcpu);

                /* DomU is impacted. Kill it and continue. */
                domain_crash(curdom);
            }
        }

    switch (dom_state) {
    case DOM0_TRAP:
    case DOMU_TRAP:
        /* Enable interrupts. */
        vcpu_schedule_unlock_irq(vcpu);

        /* guest softirqs and event callbacks are scheduled
         * immediately after this handler exits. */
        break;
    case DOMU_KILLED:
        /* Nothing to do here. */
        break;
    default:
        BUG();
    }
}
static inline s64 read_mperf(void)
{
    u64 u;

    rdmsrl(MSR_IA32_MPERF, u);
    return u;
}
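/*
 * Illustrative sketch only, not part of this source: read_mperf() above is
 * one half of the APERF/MPERF pair, which is normally consumed as deltas.
 * The ratio of the APERF delta to the MPERF delta scales the base frequency
 * to the effective frequency over the sampling interval. This sketch assumes
 * Linux-style helpers (rdmsrl, div64_u64) and a caller-supplied base_khz;
 * the statics are kept non-per-CPU purely for brevity.
 */
static unsigned int effective_freq_khz(unsigned int base_khz)
{
    static u64 last_aperf, last_mperf;
    u64 aperf, mperf, delta_a, delta_m;

    rdmsrl(MSR_IA32_APERF, aperf);
    rdmsrl(MSR_IA32_MPERF, mperf);

    delta_a = aperf - last_aperf;
    delta_m = mperf - last_mperf;
    last_aperf = aperf;
    last_mperf = mperf;

    /* freq = base * (delta APERF / delta MPERF); guard against div-by-0 */
    return delta_m ? (unsigned int)div64_u64(delta_a * base_khz, delta_m)
                   : base_khz;
}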
int vmx_cpu_up(void)
{
    u32 eax, edx;
    int bios_locked, cpu = smp_processor_id();
    u64 cr0, vmx_cr0_fixed0, vmx_cr0_fixed1;

    BUG_ON(!(read_cr4() & X86_CR4_VMXE));

    /*
     * Ensure the current processor operating mode meets
     * the required CR0 fixed bits in VMX operation.
     */
    cr0 = read_cr0();
    rdmsrl(MSR_IA32_VMX_CR0_FIXED0, vmx_cr0_fixed0);
    rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx_cr0_fixed1);
    if ( (~cr0 & vmx_cr0_fixed0) || (cr0 & ~vmx_cr0_fixed1) )
    {
        printk("CPU%d: some settings of host CR0 are "
               "not allowed in VMX operation.\n", cpu);
        return 0;
    }

    rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);

    bios_locked = !!(eax & IA32_FEATURE_CONTROL_MSR_LOCK);
    if ( bios_locked )
    {
        if ( !(eax & (IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX |
                      IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX)) )
        {
            printk("CPU%d: VMX disabled by BIOS.\n", cpu);
            return 0;
        }
    }
    else
    {
        eax  = IA32_FEATURE_CONTROL_MSR_LOCK;
        eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX;
        if ( test_bit(X86_FEATURE_SMXE, &boot_cpu_data.x86_capability) )
            eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX;
        wrmsr(IA32_FEATURE_CONTROL_MSR, eax, 0);
    }

    vmx_init_vmcs_config();

    INIT_LIST_HEAD(&this_cpu(active_vmcs_list));

    if ( this_cpu(host_vmcs) == NULL )
    {
        this_cpu(host_vmcs) = vmx_alloc_vmcs();
        if ( this_cpu(host_vmcs) == NULL )
        {
            printk("CPU%d: Could not allocate host VMCS\n", cpu);
            return 0;
        }
    }

    switch ( __vmxon(virt_to_maddr(this_cpu(host_vmcs))) )
    {
    case -2: /* #UD or #GP */
        if ( bios_locked &&
             test_bit(X86_FEATURE_SMXE, &boot_cpu_data.x86_capability) &&
             (!(eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX) ||
              !(eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX)) )
        {
            printk("CPU%d: VMXON failed: perhaps because of TXT settings "
                   "in your BIOS configuration?\n", cpu);
            printk(" --> Disable TXT in your BIOS unless using a secure "
                   "bootloader.\n");
            return 0;
        }
        /* fall through */
    case -1: /* CF==1 or ZF==1 */
        printk("CPU%d: unexpected VMXON failure\n", cpu);
        return 0;
    case 0: /* success */
        break;
    default:
        BUG();
    }

    return 1;
}
static void __init setup_real_mode(void)
{
    u16 real_mode_seg;
    const u32 *rel;
    u32 count;
    unsigned char *base;
    unsigned long phys_base;
    struct trampoline_header *trampoline_header;
    size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
#ifdef CONFIG_X86_64
    u64 *trampoline_pgd;
    u64 efer;
#endif

    base = (unsigned char *)real_mode_header;
    memcpy(base, real_mode_blob, size);

    phys_base = __pa(base);
    real_mode_seg = phys_base >> 4;

    rel = (u32 *) real_mode_relocs;

    /* 16-bit segment relocations. */
    count = *rel++;
    while (count--) {
        u16 *seg = (u16 *) (base + *rel++);
        *seg = real_mode_seg;
    }

    /* 32-bit linear relocations. */
    count = *rel++;
    while (count--) {
        u32 *ptr = (u32 *) (base + *rel++);
        *ptr += phys_base;
    }

    /* Must be performed *after* relocation. */
    trampoline_header = (struct trampoline_header *)
        __va(real_mode_header->trampoline_header);

#ifdef CONFIG_X86_32
    trampoline_header->start = __pa_symbol(startup_32_smp);
    trampoline_header->gdt_limit = __BOOT_DS + 7;
    trampoline_header->gdt_base = __pa_symbol(boot_gdt);
#else
    /*
     * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR,
     * so we need to mask it out.
     */
    rdmsrl(MSR_EFER, efer);
    trampoline_header->efer = efer & ~EFER_LMA;

    trampoline_header->start = (u64) secondary_startup_64;
    trampoline_cr4_features = &trampoline_header->cr4;
    *trampoline_cr4_features = mmu_cr4_features;

    trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
    trampoline_pgd[0] = trampoline_pgd_entry.pgd;
    trampoline_pgd[511] = init_level4_pgt[511].pgd;
#endif
}
void __init check_bugs(void)
{
    identify_boot_cpu();

    /*
     * identify_boot_cpu() initialized SMT support information, let the
     * core code know.
     */
    cpu_smt_check_topology();

    if (!IS_ENABLED(CONFIG_SMP)) {
        pr_info("CPU: ");
        print_cpu_info(&boot_cpu_data);
    }

    /*
     * Read the SPEC_CTRL MSR to account for reserved bits which may
     * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
     * init code as it is not enumerated and depends on the family.
     */
    if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
        rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);

    /* Allow STIBP in MSR_SPEC_CTRL if supported */
    if (boot_cpu_has(X86_FEATURE_STIBP))
        x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;

    /* Select the proper spectre mitigation before patching alternatives */
    spectre_v2_select_mitigation();

    /*
     * Select proper mitigation for any exposure to the Speculative Store
     * Bypass vulnerability.
     */
    ssb_select_mitigation();

    l1tf_select_mitigation();

#ifdef CONFIG_X86_32
    /*
     * Check whether we are able to run this kernel safely on SMP.
     *
     * - i386 is no longer supported.
     * - In order to run on anything without a TSC, we need to be
     *   compiled for a i486.
     */
    if (boot_cpu_data.x86 < 4)
        panic("Kernel requires i486+ for 'invlpg' and other features");

    init_utsname()->machine[1] =
        '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
    alternative_instructions();

    fpu__init_check_bugs();
#else /* CONFIG_X86_64 */
    alternative_instructions();

    /*
     * Make sure the first 2MB area is not mapped by huge pages.
     * There are typically fixed size MTRRs in there and overlapping
     * MTRRs into large pages causes slow downs.
     *
     * Right now we don't do that with gbpages because there seems
     * very little benefit for that case.
     */
    if (!direct_gbpages)
        set_memory_4k((unsigned long)__va(0), 1);
#endif
}
static inline s64 read_pctr0(void)
{
    s64 v;

    rdmsrl(MSR_P6_PERFCTR0, v);
    return v;
}
static void early_init_intel(struct cpuinfo_x86 *c)
{
    u64 misc_enable;

    /* Unmask CPUID levels if masked: */
    if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
        if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
                          MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
            c->cpuid_level = cpuid_eax(0);
            get_cpu_cap(c);
        }
    }

    if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
        (c->x86 == 0x6 && c->x86_model >= 0x0e))
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);

    if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
        unsigned lower_word;

        wrmsr(MSR_IA32_UCODE_REV, 0, 0);
        /* Required by the SDM */
        sync_core();
        rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
    }

    /*
     * Atom erratum AAE44/AAF40/AAG38/AAH41:
     *
     * A race condition between speculative fetches and invalidating
     * a large page.  This is worked around in microcode, but we
     * need the microcode to have already been loaded... so if it is
     * not, recommend a BIOS update and disable large pages.
     */
    if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
        c->microcode < 0x20e) {
        pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
        clear_cpu_cap(c, X86_FEATURE_PSE);
    }

#ifdef CONFIG_X86_64
    set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#else
    /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
    if (c->x86 == 15 && c->x86_cache_alignment == 64)
        c->x86_cache_alignment = 128;
#endif

    /* CPUID workaround for 0F33/0F34 CPU */
    if (c->x86 == 0xF && c->x86_model == 0x3 &&
        (c->x86_mask == 0x3 || c->x86_mask == 0x4))
        c->x86_phys_bits = 36;

    /*
     * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
     * with P/T states and does not stop in deep C-states.
     *
     * It is also reliable across cores and sockets. (but not across
     * cabinets - we turn it off in that case explicitly.)
     */
    if (c->x86_power & (1 << 8)) {
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
        set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
        if (!check_tsc_unstable())
            set_sched_clock_stable();
    }

    /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
    if (c->x86 == 6) {
        switch (c->x86_model) {
        case 0x27: /* Penwell */
        case 0x35: /* Cloverview */
        case 0x4a: /* Merrifield */
            set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
            break;
        default:
            break;
        }
    }

    /*
     * There is a known erratum on Pentium III and Core Solo
     * and Core Duo CPUs.
     * " Page with PAT set to WC while associated MTRR is UC
     * may consolidate to UC "
     * Because of this erratum, it is better to stick with
     * setting WC in MTRR rather than using PAT on these CPUs.
     *
     * Enable PAT WC only on P4, Core 2 or later CPUs.
     */
    if (c->x86 == 6 && c->x86_model < 15)
        clear_cpu_cap(c, X86_FEATURE_PAT);

#ifdef CONFIG_KMEMCHECK
    /*
     * P4s have a "fast strings" feature which causes single-
     * stepping REP instructions to only generate a #DB on
     * cache-line boundaries.
     *
     * Ingo Molnar reported a Pentium D (model 6) and a Xeon
     * (model 2) with the same problem.
     */
    if (c->x86 == 15)
        if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
                          MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
            pr_info("kmemcheck: Disabling fast string operations\n");
#endif

    /*
     * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
     * clear the fast string and enhanced fast string CPU capabilities.
     */
    if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
        rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
            pr_info("Disabled fast string operations\n");
            setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
            setup_clear_cpu_cap(X86_FEATURE_ERMS);
        }
    }

    /*
     * Intel Quark Core DevMan_001.pdf section 6.4.11
     * "The operating system also is required to invalidate (i.e., flush)
     * the TLB when any changes are made to any of the page table entries.
     * The operating system must reload CR3 to cause the TLB to be flushed"
     *
     * As a result, boot_cpu_has(X86_FEATURE_PGE) in
     * arch/x86/include/asm/tlbflush.h should be false so that
     * __flush_tlb_all() causes CR3 instead of CR4.PGE to be modified.
     */
    if (c->x86 == 5 && c->x86_model == 9) {
        pr_info("Disabling PGE capability bit\n");
        setup_clear_cpu_cap(X86_FEATURE_PGE);
    }

    if (c->cpuid_level >= 0x00000001) {
        u32 eax, ebx, ecx, edx;

        cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
        /*
         * If HTT (EDX[28]) is set, EBX[16:23] contains the number of
         * apicids which are reserved per package. Store the resulting
         * shift value for the package management code.
         */
        if (edx & (1U << 28))
            c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
    }
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
    u32 l, h;
    int mbytes = num_physpages >> (20-PAGE_SHIFT);
    int r;

#ifdef CONFIG_SMP
    unsigned long long value;

    /* Disable TLB flush filter by setting HWCR.FFDIS on K8
     * bit 6 of msr C001_0015
     *
     * Errata 63 for SH-B3 steppings
     * Errata 122 for all steppings (F+ have it disabled by default)
     */
    if (c->x86 == 15) {
        rdmsrl(MSR_K7_HWCR, value);
        value |= 1 << 6;
        wrmsrl(MSR_K7_HWCR, value);
    }
#endif

    early_init_amd(c);

    /*
     * FIXME: We should handle the K5 here. Set up the write
     * range and also turn on MSR 83 bits 4 and 31 (write alloc,
     * no bus pipeline)
     */

    /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
       3DNow is IDed by bit 31 in extended CPUID (1*32+31) anyway */
    clear_bit(0*32+31, c->x86_capability);

    r = get_model_name(c);

    switch (c->x86) {
    case 4:
        /*
         * General Systems BIOSen alias the cpu frequency registers
         * of the Elan at 0x000df000. Unfortunately, one of the Linux
         * drivers subsequently pokes it, and changes the CPU speed.
         * Workaround : Remove the unneeded alias.
         */
#define CBAR     (0xfffc)       /* Configuration Base Address (32-bit) */
#define CBAR_ENB (0x80000000)
#define CBAR_KEY (0x000000CB)
        if (c->x86_model == 9 || c->x86_model == 10) {
            if (inl(CBAR) & CBAR_ENB)
                outl(0 | CBAR_KEY, CBAR);
        }
        break;

    case 5:
        if (c->x86_model < 6) {
            /* Based on AMD doc 20734R - June 2000 */
            if (c->x86_model == 0) {
                clear_bit(X86_FEATURE_APIC, c->x86_capability);
                set_bit(X86_FEATURE_PGE, c->x86_capability);
            }
            break;
        }

        if (c->x86_model == 6 && c->x86_mask == 1) {
            const int K6_BUG_LOOP = 1000000;
            int n;
            void (*f_vide)(void);
            unsigned long d, d2;

            printk(KERN_INFO "AMD K6 stepping B detected - ");

            /*
             * It looks like AMD fixed the 2.6.2 bug and improved indirect
             * calls at the same time.
             */
            n = K6_BUG_LOOP;
            f_vide = vide;
            rdtscl(d);
            while (n--)
                f_vide();
            rdtscl(d2);
            d = d2 - d;

            if (d > 20*K6_BUG_LOOP)
                printk("system stability may be impaired when more than 32 MB are used.\n");
            else
                printk("probably OK (after B9730xxxx).\n");
            printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
        }

        /* K6 with old style WHCR */
        if (c->x86_model < 8 ||
            (c->x86_model == 8 && c->x86_mask < 8)) {
            /* We can only write allocate on the low 508Mb */
            if (mbytes > 508)
                mbytes = 508;

            rdmsr(MSR_K6_WHCR, l, h);
            if ((l & 0x0000FFFF) == 0) {
                unsigned long flags;
                l = (1<<0)|((mbytes/4)<<1);
                local_irq_save(flags);
                wbinvd();
                wrmsr(MSR_K6_WHCR, l, h);
                local_irq_restore(flags);
                printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
                       mbytes);
            }
            break;
        }

        if ((c->x86_model == 8 && c->x86_mask > 7) ||
            c->x86_model == 9 || c->x86_model == 13) {
            /* The more serious chips .. */

            if (mbytes > 4092)
                mbytes = 4092;

            rdmsr(MSR_K6_WHCR, l, h);
            if ((l & 0xFFFF0000) == 0) {
                unsigned long flags;
                l = ((mbytes>>2)<<22)|(1<<16);
                local_irq_save(flags);
                wbinvd();
                wrmsr(MSR_K6_WHCR, l, h);
                local_irq_restore(flags);
                printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
                       mbytes);
            }

            /* Set MTRR capability flag if appropriate */
            if (c->x86_model == 13 || c->x86_model == 9 ||
                (c->x86_model == 8 && c->x86_mask >= 8))
                set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
            break;
        }

        if (c->x86_model == 10) {
            /* AMD Geode LX is model 10 */
            /* placeholder for any needed mods */
            break;
        }
        break;

    case 6: /* An Athlon/Duron */
        /* Bit 15 of Athlon specific MSR 15, needs to be 0
         * to enable SSE on Palomino/Morgan/Barton CPUs.
         * If the BIOS didn't enable it already, enable it here.
         */
        if (c->x86_model >= 6 && c->x86_model <= 10) {
            if (!cpu_has(c, X86_FEATURE_XMM)) {
                printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
                rdmsr(MSR_K7_HWCR, l, h);
                l &= ~0x00008000;
                wrmsr(MSR_K7_HWCR, l, h);
                set_bit(X86_FEATURE_XMM, c->x86_capability);
            }
        }

        /* It's been determined by AMD that Athlons since model 8 stepping 1
         * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
         * As per AMD technical note 27212 0.2
         */
        if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
            rdmsr(MSR_K7_CLK_CTL, l, h);
            if ((l & 0xfff00000) != 0x20000000) {
                printk("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
                       l, ((l & 0x000fffff)|0x20000000));
                wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
            }
        }
        break;
    }
}
static int __init detect_init_APIC(void)
{
    uint64_t msr_content;
    u32 features;

    /* Disabled by kernel option? */
    if (enable_local_apic < 0)
        return -1;

    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_AMD:
        if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
            (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x17))
            break;
        goto no_apic;
    case X86_VENDOR_INTEL:
        if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
            (boot_cpu_data.x86 == 5 && cpu_has_apic))
            break;
        goto no_apic;
    default:
        goto no_apic;
    }

    if (!cpu_has_apic) {
        /*
         * Over-ride BIOS and try to enable the local
         * APIC only if "lapic" specified.
         */
        if (enable_local_apic <= 0) {
            printk("Local APIC disabled by BIOS -- "
                   "you can enable it with \"lapic\"\n");
            return -1;
        }
        /*
         * Some BIOSes disable the local APIC in the
         * APIC_BASE MSR. This can only be done in
         * software for Intel P6 or later and AMD K7
         * (Model > 1) or later.
         */
        rdmsrl(MSR_IA32_APICBASE, msr_content);
        if (!(msr_content & MSR_IA32_APICBASE_ENABLE)) {
            printk("Local APIC disabled by BIOS -- reenabling.\n");
            msr_content &= ~MSR_IA32_APICBASE_BASE;
            msr_content |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
            wrmsrl(MSR_IA32_APICBASE, msr_content);
            enabled_via_apicbase = 1;
        }
    }

    /*
     * The APIC feature bit should now be enabled
     * in `cpuid'
     */
    features = cpuid_edx(1);
    if (!(features & (1 << X86_FEATURE_APIC))) {
        printk("Could not enable APIC!\n");
        return -1;
    }

    set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
    mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;

    /* The BIOS may have set up the APIC at some other address */
    rdmsrl(MSR_IA32_APICBASE, msr_content);
    if (msr_content & MSR_IA32_APICBASE_ENABLE)
        mp_lapic_addr = msr_content & MSR_IA32_APICBASE_BASE;

    if (nmi_watchdog != NMI_NONE)
        nmi_watchdog = NMI_LOCAL_APIC;

    printk("Found and enabled local APIC!\n");

    apic_pm_activate();

    return 0;

no_apic:
    printk("No local APIC present or hardware disabled\n");
    return -1;
}
/*
 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
 * on this CPU. Use the algorithm recommended in the SDM to discover shared
 * banks.
 */
static void cmci_discover(int banks)
{
    unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
    unsigned long flags;
    int i;
    int bios_wrong_thresh = 0;

    raw_spin_lock_irqsave(&cmci_discover_lock, flags);
    for (i = 0; i < banks; i++) {
        u64 val;
        int bios_zero_thresh = 0;

        if (test_bit(i, owned))
            continue;

        /* Skip banks in firmware first mode */
        if (test_bit(i, mce_banks_ce_disabled))
            continue;

        rdmsrl(MSR_IA32_MCx_CTL2(i), val);

        /* Already owned by someone else? */
        if (val & MCI_CTL2_CMCI_EN) {
            clear_bit(i, owned);
            __clear_bit(i, __get_cpu_var(mce_poll_banks));
            continue;
        }

        if (!mca_cfg.bios_cmci_threshold) {
            val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
            val |= CMCI_THRESHOLD;
        } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
            /*
             * If bios_cmci_threshold boot option was specified
             * but the threshold is zero, we'll try to initialize
             * it to 1.
             */
            bios_zero_thresh = 1;
            val |= CMCI_THRESHOLD;
        }

        val |= MCI_CTL2_CMCI_EN;
        wrmsrl(MSR_IA32_MCx_CTL2(i), val);
        rdmsrl(MSR_IA32_MCx_CTL2(i), val);

        /* Did the enable bit stick? -- the bank supports CMCI */
        if (val & MCI_CTL2_CMCI_EN) {
            set_bit(i, owned);
            __clear_bit(i, __get_cpu_var(mce_poll_banks));
            /*
             * We are able to set thresholds for some banks that
             * had a threshold of 0. This means the BIOS has not
             * set the thresholds properly or does not work with
             * this boot option. Note down now and report later.
             */
            if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
                (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
                bios_wrong_thresh = 1;
        } else {
            WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
        }
    }
    raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
    if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
        pr_info_once(
            "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
        pr_info_once(
            "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
    }
}
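/*
 * Hypothetical mirror of the enable path in cmci_discover() above, sketched
 * for illustration only: giving up ownership of a bank clears CMCI_EN and
 * moves the bank back into the per-CPU polling set. The identifiers are
 * reused from cmci_discover(); the function name itself is made up and the
 * locking that a real implementation would need is omitted.
 */
static void cmci_release_bank(int bank)
{
    u64 val;

    if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
        return;

    rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
    val &= ~MCI_CTL2_CMCI_EN;
    wrmsrl(MSR_IA32_MCx_CTL2(bank), val);

    __clear_bit(bank, __get_cpu_var(mce_banks_owned));
    __set_bit(bank, __get_cpu_var(mce_poll_banks));
}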
static int construct_vmcs(struct vcpu *v)
{
    uint16_t sysenter_cs;
    unsigned long sysenter_eip;

    vmx_vmcs_enter(v);

    /* VMCS controls. */
    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
    __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
    __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
    if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
        __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control);

    /* MSR access bitmap. */
    if ( cpu_has_vmx_msr_bitmap )
    {
        char *msr_bitmap = alloc_xenheap_page();

        if ( msr_bitmap == NULL )
            return -ENOMEM;

        memset(msr_bitmap, ~0, PAGE_SIZE);
        v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
        __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));

        vmx_disable_intercept_for_msr(v, MSR_FS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_GS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP);
    }

    /* I/O access bitmap. */
    __vmwrite(IO_BITMAP_A, virt_to_maddr(hvm_io_bitmap));
    __vmwrite(IO_BITMAP_B, virt_to_maddr(hvm_io_bitmap + PAGE_SIZE));

    /* Host GDTR base. */
    __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v));

    /* Host data selectors. */
    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_FS_SELECTOR, 0);
    __vmwrite(HOST_GS_SELECTOR, 0);
    __vmwrite(HOST_FS_BASE, 0);
    __vmwrite(HOST_GS_BASE, 0);

    /* Host control registers. */
    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
    __vmwrite(HOST_CR4, mmu_cr4_features);

    /* Host CS:RIP. */
    __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
    __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);

    /* Host SYSENTER CS:RIP. */
    rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
    __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
    rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
    __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);

    /* MSR intercepts. */
    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);

    __vmwrite(VM_ENTRY_INTR_INFO, 0);

    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);

    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);

    __vmwrite(CR3_TARGET_COUNT, 0);

    __vmwrite(GUEST_ACTIVITY_STATE, 0);

    /* Guest segment bases. */
    __vmwrite(GUEST_ES_BASE, 0);
    __vmwrite(GUEST_SS_BASE, 0);
    __vmwrite(GUEST_DS_BASE, 0);
    __vmwrite(GUEST_FS_BASE, 0);
    __vmwrite(GUEST_GS_BASE, 0);
    __vmwrite(GUEST_CS_BASE, 0);

    /* Guest segment limits. */
    __vmwrite(GUEST_ES_LIMIT, ~0u);
    __vmwrite(GUEST_SS_LIMIT, ~0u);
    __vmwrite(GUEST_DS_LIMIT, ~0u);
    __vmwrite(GUEST_FS_LIMIT, ~0u);
    __vmwrite(GUEST_GS_LIMIT, ~0u);
    __vmwrite(GUEST_CS_LIMIT, ~0u);

    /* Guest segment AR bytes. */
    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
    __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_CS_AR_BYTES, 0xc09b); /* exec/read, accessed */

    /* Guest IDT. */
    __vmwrite(GUEST_IDTR_BASE, 0);
    __vmwrite(GUEST_IDTR_LIMIT, 0);

    /* Guest GDT. */
    __vmwrite(GUEST_GDTR_BASE, 0);
    __vmwrite(GUEST_GDTR_LIMIT, 0);

    /* Guest LDT. */
    __vmwrite(GUEST_LDTR_AR_BYTES, 0x0082); /* LDT */
    __vmwrite(GUEST_LDTR_SELECTOR, 0);
    __vmwrite(GUEST_LDTR_BASE, 0);
    __vmwrite(GUEST_LDTR_LIMIT, 0);

    /* Guest TSS. */
    __vmwrite(GUEST_TR_AR_BYTES, 0x008b); /* 32-bit TSS (busy) */
    __vmwrite(GUEST_TR_BASE, 0);
    __vmwrite(GUEST_TR_LIMIT, 0xff);

    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
    __vmwrite(GUEST_DR7, 0);
    __vmwrite(VMCS_LINK_POINTER, ~0UL);
#if defined(__i386__)
    __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
#endif

    __vmwrite(EXCEPTION_BITMAP, (HVM_TRAP_MASK |
                                 (1U << TRAP_page_fault) |
                                 (1U << TRAP_no_device)));

    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
    hvm_update_guest_cr(v, 0);

    v->arch.hvm_vcpu.guest_cr[4] = 0;
    hvm_update_guest_cr(v, 4);

    if ( cpu_has_vmx_tpr_shadow )
    {
        __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
                  page_to_maddr(vcpu_vlapic(v)->regs_page));
        __vmwrite(TPR_THRESHOLD, 0);
    }

    vmx_vmcs_exit(v);

    paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */

    vmx_vlapic_msr_changed(v);

    return 0;
}
static int __init extlog_init(void)
{
    struct extlog_l1_head *l1_head;
    void __iomem *extlog_l1_hdr;
    size_t l1_hdr_size;
    struct resource *r;
    u64 cap;
    int rc;

    rc = -ENODEV;

    rdmsrl(MSR_IA32_MCG_CAP, cap);
    if (!(cap & MCG_ELOG_P))
        return rc;

    if (!extlog_get_l1addr())
        return rc;

    rc = -EINVAL;
    /* get L1 header to fetch necessary information */
    l1_hdr_size = sizeof(struct extlog_l1_head);
    r = request_mem_region(l1_dirbase, l1_hdr_size, "L1 DIR HDR");
    if (!r) {
        pr_warn(FW_BUG EMCA_BUG,
                (unsigned long long)l1_dirbase,
                (unsigned long long)l1_dirbase + l1_hdr_size);
        goto err;
    }

    extlog_l1_hdr = acpi_os_map_memory(l1_dirbase, l1_hdr_size);
    l1_head = (struct extlog_l1_head *)extlog_l1_hdr;
    l1_size = l1_head->total_len;
    l1_percpu_entry = l1_head->entries;
    elog_base = l1_head->elog_base;
    elog_size = l1_head->elog_len;
    acpi_os_unmap_memory(extlog_l1_hdr, l1_hdr_size);
    release_mem_region(l1_dirbase, l1_hdr_size);

    /* remap L1 header again based on completed information */
    r = request_mem_region(l1_dirbase, l1_size, "L1 Table");
    if (!r) {
        pr_warn(FW_BUG EMCA_BUG,
                (unsigned long long)l1_dirbase,
                (unsigned long long)l1_dirbase + l1_size);
        goto err;
    }
    extlog_l1_addr = acpi_os_map_memory(l1_dirbase, l1_size);
    l1_entry_base = (u64 *)((u8 *)extlog_l1_addr + l1_hdr_size);

    /* remap elog table */
    r = request_mem_region(elog_base, elog_size, "Elog Table");
    if (!r) {
        pr_warn(FW_BUG EMCA_BUG,
                (unsigned long long)elog_base,
                (unsigned long long)elog_base + elog_size);
        goto err_release_l1_dir;
    }
    elog_addr = acpi_os_map_memory(elog_base, elog_size);

    rc = -ENOMEM;
    /* allocate buffer to save elog record */
    elog_buf = kmalloc(ELOG_ENTRY_LEN, GFP_KERNEL);
    if (elog_buf == NULL)
        goto err_release_elog;

    mce_register_decode_chain(&extlog_mce_dec);
    /* enable OS to be involved to take over management from BIOS */
    ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;

    return 0;

err_release_elog:
    if (elog_addr)
        acpi_os_unmap_memory(elog_addr, elog_size);
    release_mem_region(elog_base, elog_size);
err_release_l1_dir:
    if (extlog_l1_addr)
        acpi_os_unmap_memory(extlog_l1_addr, l1_size);
    release_mem_region(l1_dirbase, l1_size);
err:
    pr_warn(FW_BUG "Extended error log disabled because of problems parsing f/w tables\n");
    return rc;
}
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
    u64 misc_enable;

    /* Unmask CPUID levels if masked: */
    if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
        rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
            misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
            wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
            c->cpuid_level = cpuid_eax(0);
            get_cpu_cap(c);
        }
    }

    if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
        (c->x86 == 0x6 && c->x86_model >= 0x0e))
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);

    if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
        unsigned lower_word;

        wrmsr(MSR_IA32_UCODE_REV, 0, 0);
        /* Required by the SDM */
        sync_core();
        rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
    }

    /*
     * Atom erratum AAE44/AAF40/AAG38/AAH41:
     *
     * A race condition between speculative fetches and invalidating
     * a large page.  This is worked around in microcode, but we
     * need the microcode to have already been loaded... so if it is
     * not, recommend a BIOS update and disable large pages.
     */
    if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
        c->microcode < 0x20e) {
        printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
        clear_cpu_cap(c, X86_FEATURE_PSE);
    }

#ifdef CONFIG_X86_64
    set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#else
    /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
    if (c->x86 == 15 && c->x86_cache_alignment == 64)
        c->x86_cache_alignment = 128;
#endif

    /* CPUID workaround for 0F33/0F34 CPU */
    if (c->x86 == 0xF && c->x86_model == 0x3 &&
        (c->x86_mask == 0x3 || c->x86_mask == 0x4))
        c->x86_phys_bits = 36;

    /*
     * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
     * with P/T states and does not stop in deep C-states.
     *
     * It is also reliable across cores and sockets. (but not across
     * cabinets - we turn it off in that case explicitly.)
     */
    if (c->x86_power & (1 << 8)) {
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
        set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
        if (!check_tsc_unstable())
            sched_clock_stable = 1;
    }

    /*
     * There is a known erratum on Pentium III and Core Solo
     * and Core Duo CPUs.
     * " Page with PAT set to WC while associated MTRR is UC
     * may consolidate to UC "
     * Because of this erratum, it is better to stick with
     * setting WC in MTRR rather than using PAT on these CPUs.
     *
     * Enable PAT WC only on P4, Core 2 or later CPUs.
     */
    if (c->x86 == 6 && c->x86_model < 15)
        clear_cpu_cap(c, X86_FEATURE_PAT);

#ifdef CONFIG_KMEMCHECK
    /*
     * P4s have a "fast strings" feature which causes single-
     * stepping REP instructions to only generate a #DB on
     * cache-line boundaries.
     *
     * Ingo Molnar reported a Pentium D (model 6) and a Xeon
     * (model 2) with the same problem.
     */
    if (c->x86 == 15) {
        rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
            printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
            misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
            wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        }
    }
#endif

    /*
     * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
     * clear the fast string and enhanced fast string CPU capabilities.
     */
    if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
        rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
        if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
            printk(KERN_INFO "Disabled fast string operations\n");
            setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
            setup_clear_cpu_cap(X86_FEATURE_ERMS);
        }
    }
}
static void vtss_dsa_on_each_cpu_init(void *ctx)
{
    if (hardcfg.family == 0x06 || hardcfg.family == 0x0f) {
        /* Save this CPU's current DS area pointer for later restore. */
        rdmsrl(DS_AREA_MSR, __get_cpu_var(vtss_dsa_cpu_msr));
    }
}