Merge branch 'perf'
author     Avi Kivity <avi@redhat.com>
           Mon, 19 Apr 2010 09:52:53 +0000 (12:52 +0300)
committer  Avi Kivity <avi@redhat.com>
           Mon, 17 May 2010 09:17:58 +0000 (12:17 +0300)
Signed-off-by: Avi Kivity <avi@redhat.com>
MAINTAINERS
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
kernel/sched.c

diff --combined MAINTAINERS
index a0e3c3a47a51afebf9835b3e9ce3865aab2bc13c,c3e9c3633b75ca89bee21c126339be1053717770..0716c65c05c9c895ec93ee3f66262bd688a4ad09
@@@ -485,8 -485,8 +485,8 @@@ S: Maintaine
  F:    drivers/input/mouse/bcm5974.c
  
  APPLE SMC DRIVER
 -M:    Nicolas Boichat <nicolas@boichat.ch>
 -L:    mactel-linux-devel@lists.sourceforge.net
 +M:    Henrik Rydberg <rydberg@euromail.se>
 +L:    lm-sensors@lm-sensors.org
  S:    Maintained
  F:    drivers/hwmon/applesmc.c
  
@@@ -971,16 -971,6 +971,16 @@@ L:       linux-arm-kernel@lists.infradead.or
  W:    http://www.mcuos.com
  S:    Maintained
  
 +ARM/U300 MACHINE SUPPORT
 +M:    Linus Walleij <linus.walleij@stericsson.com>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +S:    Supported
 +F:    arch/arm/mach-u300/
 +F:    drivers/i2c/busses/i2c-stu300.c
 +F:    drivers/rtc/rtc-coh901331.c
 +F:    drivers/watchdog/coh901327_wdt.c
 +F:    drivers/dma/coh901318*
 +
  ARM/U8500 ARM ARCHITECTURE
  M:    Srinidhi Kasagar <srinidhi.kasagar@stericsson.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -4353,13 -4343,13 +4353,13 @@@ M:   Paul Mackerras <paulus@samba.org
  M:    Ingo Molnar <mingo@elte.hu>
  M:    Arnaldo Carvalho de Melo <acme@redhat.com>
  S:    Supported
- F:    kernel/perf_event.c
+ F:    kernel/perf_event*.c
  F:    include/linux/perf_event.h
- F:    arch/*/kernel/perf_event.c
- F:    arch/*/kernel/*/perf_event.c
- F:    arch/*/kernel/*/*/perf_event.c
+ F:    arch/*/kernel/perf_event*.c
+ F:    arch/*/kernel/*/perf_event*.c
+ F:    arch/*/kernel/*/*/perf_event*.c
  F:    arch/*/include/asm/perf_event.h
- F:    arch/*/lib/perf_event.c
+ F:    arch/*/lib/perf_event*.c
  F:    arch/*/kernel/perf_callchain.c
  F:    tools/perf/
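The PERFORMANCE EVENTS entry now lists glob patterns (perf_event*.c) so newly split per-feature files under the same prefixes stay covered. Below is a minimal illustration of how such F: globs match, using POSIX fnmatch(3) with hypothetical file names; the kernel's scripts/get_maintainer.pl does its own pattern matching, so this is only an analogy.

```c
/* Illustration only: the new F: globs also match split perf_event files.
 * File names below are hypothetical examples. */
#include <fnmatch.h>
#include <stdio.h>

int main(void)
{
	const char *pattern = "arch/*/kernel/perf_event*.c";
	const char *files[] = {
		"arch/x86/kernel/perf_event.c",
		"arch/x86/kernel/perf_event_foo.c",  /* hypothetical split file */
	};

	for (int i = 0; i < 2; i++)
		printf("%-40s %s\n", files[i],
		       fnmatch(pattern, files[i], 0) == 0 ? "matched" : "no match");
	return 0;
}
```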
  
diff --combined arch/x86/kvm/vmx.c
index 6e5e75e0d7d3eabf0d930d1d79587acd8e05a44c,82be6dac3d25f034b3c829af7d0fb7560af70b50..0b896ac7e4bb45186072032602782404209be7e5
@@@ -77,8 -77,6 +77,8 @@@ module_param(emulate_invalid_guest_stat
  #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
  #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
  
 +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 +
  /*
   * These 2 parameters are used to config the controls for Pause-Loop Exiting:
   * ple_gap:    upper bound on the amount of time between two successive
@@@ -133,7 -131,7 +133,7 @@@ struct vcpu_vmx 
        } host_state;
        struct {
                int vm86_active;
 -              u8 save_iopl;
 +              ulong save_rflags;
                struct kvm_save_segment {
                        u16 selector;
                        unsigned long base;
@@@ -234,56 -232,56 +234,56 @@@ static const u32 vmx_msr_index[] = 
  };
  #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
  
 -static inline int is_page_fault(u32 intr_info)
 +static inline bool is_page_fault(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
                (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
  }
  
 -static inline int is_no_device(u32 intr_info)
 +static inline bool is_no_device(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
                (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
  }
  
 -static inline int is_invalid_opcode(u32 intr_info)
 +static inline bool is_invalid_opcode(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
                (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
  }
  
 -static inline int is_external_interrupt(u32 intr_info)
 +static inline bool is_external_interrupt(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
                == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
  }
  
 -static inline int is_machine_check(u32 intr_info)
 +static inline bool is_machine_check(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
                (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
  }
  
 -static inline int cpu_has_vmx_msr_bitmap(void)
 +static inline bool cpu_has_vmx_msr_bitmap(void)
  {
        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
  }
  
 -static inline int cpu_has_vmx_tpr_shadow(void)
 +static inline bool cpu_has_vmx_tpr_shadow(void)
  {
        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
  }
  
 -static inline int vm_need_tpr_shadow(struct kvm *kvm)
 +static inline bool vm_need_tpr_shadow(struct kvm *kvm)
  {
        return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
  }
  
 -static inline int cpu_has_secondary_exec_ctrls(void)
 +static inline bool cpu_has_secondary_exec_ctrls(void)
  {
        return vmcs_config.cpu_based_exec_ctrl &
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@@ -303,80 -301,80 +303,80 @@@ static inline bool cpu_has_vmx_flexprio
  
  static inline bool cpu_has_vmx_ept_execute_only(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
 +      return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
  }
  
  static inline bool cpu_has_vmx_eptp_uncacheable(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
 +      return vmx_capability.ept & VMX_EPTP_UC_BIT;
  }
  
  static inline bool cpu_has_vmx_eptp_writeback(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
 +      return vmx_capability.ept & VMX_EPTP_WB_BIT;
  }
  
  static inline bool cpu_has_vmx_ept_2m_page(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
 +      return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
  }
  
  static inline bool cpu_has_vmx_ept_1g_page(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
 +      return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
  }
  
 -static inline int cpu_has_vmx_invept_individual_addr(void)
 +static inline bool cpu_has_vmx_invept_individual_addr(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
 +      return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
  }
  
 -static inline int cpu_has_vmx_invept_context(void)
 +static inline bool cpu_has_vmx_invept_context(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
 +      return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
  }
  
 -static inline int cpu_has_vmx_invept_global(void)
 +static inline bool cpu_has_vmx_invept_global(void)
  {
 -      return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
 +      return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
  }
  
 -static inline int cpu_has_vmx_ept(void)
 +static inline bool cpu_has_vmx_ept(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_ENABLE_EPT;
  }
  
 -static inline int cpu_has_vmx_unrestricted_guest(void)
 +static inline bool cpu_has_vmx_unrestricted_guest(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_UNRESTRICTED_GUEST;
  }
  
 -static inline int cpu_has_vmx_ple(void)
 +static inline bool cpu_has_vmx_ple(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
  }
  
 -static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 +static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
  {
        return flexpriority_enabled && irqchip_in_kernel(kvm);
  }
  
 -static inline int cpu_has_vmx_vpid(void)
 +static inline bool cpu_has_vmx_vpid(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_ENABLE_VPID;
  }
  
 -static inline int cpu_has_vmx_rdtscp(void)
 +static inline bool cpu_has_vmx_rdtscp(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_RDTSCP;
  }
  
 -static inline int cpu_has_virtual_nmis(void)
 +static inline bool cpu_has_virtual_nmis(void)
  {
        return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
  }
@@@ -600,11 -598,11 +600,11 @@@ static void reload_tss(void
        /*
         * VT restores TR but not its size.  Useless.
         */
 -      struct descriptor_table gdt;
 +      struct desc_ptr gdt;
        struct desc_struct *descs;
  
 -      kvm_get_gdt(&gdt);
 -      descs = (void *)gdt.base;
 +      native_store_gdt(&gdt);
 +      descs = (void *)gdt.address;
        descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
        load_TR_desc();
  }
@@@ -634,43 -632,6 +634,43 @@@ static bool update_transition_efer(stru
        return true;
  }
  
 +static unsigned long segment_base(u16 selector)
 +{
 +      struct desc_ptr gdt;
 +      struct desc_struct *d;
 +      unsigned long table_base;
 +      unsigned long v;
 +
 +      if (!(selector & ~3))
 +              return 0;
 +
 +      native_store_gdt(&gdt);
 +      table_base = gdt.address;
 +
 +      if (selector & 4) {           /* from ldt */
 +              u16 ldt_selector = kvm_read_ldt();
 +
 +              if (!(ldt_selector & ~3))
 +                      return 0;
 +
 +              table_base = segment_base(ldt_selector);
 +      }
 +      d = (struct desc_struct *)(table_base + (selector & ~7));
 +      v = get_desc_base(d);
 +#ifdef CONFIG_X86_64
 +       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 +               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 +#endif
 +      return v;
 +}
 +
 +static inline unsigned long kvm_read_tr_base(void)
 +{
 +      u16 tr;
 +      asm("str %0" : "=g"(tr));
 +      return segment_base(tr);
 +}
 +
  static void vmx_save_host_state(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -795,7 -756,7 +795,7 @@@ static void vmx_vcpu_load(struct kvm_vc
        }
  
        if (vcpu->cpu != cpu) {
 -              struct descriptor_table dt;
 +              struct desc_ptr dt;
                unsigned long sysenter_esp;
  
                vcpu->cpu = cpu;
                 * processors.
                 */
                vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
 -              kvm_get_gdt(&dt);
 -              vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
 +              native_store_gdt(&dt);
 +              vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
  
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@@ -857,23 -818,18 +857,23 @@@ static void vmx_fpu_deactivate(struct k
  
  static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
  {
 -      unsigned long rflags;
 +      unsigned long rflags, save_rflags;
  
        rflags = vmcs_readl(GUEST_RFLAGS);
 -      if (to_vmx(vcpu)->rmode.vm86_active)
 -              rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
 +      if (to_vmx(vcpu)->rmode.vm86_active) {
 +              rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
 +              save_rflags = to_vmx(vcpu)->rmode.save_rflags;
 +              rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
 +      }
        return rflags;
  }
  
  static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  {
 -      if (to_vmx(vcpu)->rmode.vm86_active)
 +      if (to_vmx(vcpu)->rmode.vm86_active) {
 +              to_vmx(vcpu)->rmode.save_rflags = rflags;
                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 +      }
        vmcs_writel(GUEST_RFLAGS, rflags);
  }
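The vm86_active path above replaces the old save_iopl byte with a full saved RFLAGS copy: vmx_set_rflags() stashes the guest's view and forces IOPL/VM in the hardware field, while vmx_get_rflags() merges the hardware value with the saved, KVM-owned bits. A minimal userspace sketch of that round trip, using the architectural EFLAGS bit values for IOPL and VM:

```c
/* Minimal sketch of the rmode RFLAGS save/restore: while the guest runs in
 * vm86-based real mode, IOPL and VM in the hardware GUEST_RFLAGS belong to
 * KVM; the guest-visible copies live in rmode.save_rflags. */
#include <assert.h>
#include <stdio.h>

#define X86_EFLAGS_IOPL 0x00003000UL
#define X86_EFLAGS_VM   0x00020000UL
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

static unsigned long hw_rflags;    /* stands in for the VMCS GUEST_RFLAGS field */
static unsigned long save_rflags;  /* stands in for vmx->rmode.save_rflags */

static void rmode_set_rflags(unsigned long rflags)
{
	save_rflags = rflags;                       /* remember the guest's view */
	hw_rflags = rflags | X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}

static unsigned long rmode_get_rflags(void)
{
	unsigned long rflags = hw_rflags & RMODE_GUEST_OWNED_EFLAGS_BITS;
	return rflags | (save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS);
}

int main(void)
{
	unsigned long guest = 0x00000246;           /* IF|ZF|PF, IOPL = 0 */
	rmode_set_rflags(guest);
	assert(rmode_get_rflags() == guest);        /* forced IOPL/VM not leaked back */
	printf("guest rflags preserved: %#lx\n", rmode_get_rflags());
	return 0;
}
```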
  
@@@ -883,9 -839,9 +883,9 @@@ static u32 vmx_get_interrupt_shadow(str
        int ret = 0;
  
        if (interruptibility & GUEST_INTR_STATE_STI)
 -              ret |= X86_SHADOW_INT_STI;
 +              ret |= KVM_X86_SHADOW_INT_STI;
        if (interruptibility & GUEST_INTR_STATE_MOV_SS)
 -              ret |= X86_SHADOW_INT_MOV_SS;
 +              ret |= KVM_X86_SHADOW_INT_MOV_SS;
  
        return ret & mask;
  }
@@@ -897,9 -853,9 +897,9 @@@ static void vmx_set_interrupt_shadow(st
  
        interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
  
 -      if (mask & X86_SHADOW_INT_MOV_SS)
 +      if (mask & KVM_X86_SHADOW_INT_MOV_SS)
                interruptibility |= GUEST_INTR_STATE_MOV_SS;
 -      if (mask & X86_SHADOW_INT_STI)
 +      else if (mask & KVM_X86_SHADOW_INT_STI)
                interruptibility |= GUEST_INTR_STATE_STI;
  
        if ((interruptibility != interruptibility_old))
@@@ -1527,8 -1483,8 +1527,8 @@@ static void enter_pmode(struct kvm_vcp
        vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
  
        flags = vmcs_readl(GUEST_RFLAGS);
 -      flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
 -      flags |= (vmx->rmode.save_iopl << IOPL_SHIFT);
 +      flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
 +      flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
        vmcs_writel(GUEST_RFLAGS, flags);
  
        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@@ -1601,7 -1557,8 +1601,7 @@@ static void enter_rmode(struct kvm_vcp
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
  
        flags = vmcs_readl(GUEST_RFLAGS);
 -      vmx->rmode.save_iopl
 -              = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
 +      vmx->rmode.save_rflags = flags;
  
        flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
  
@@@ -1971,28 -1928,28 +1971,28 @@@ static void vmx_get_cs_db_l_bits(struc
        *l = (ar >> 13) & 1;
  }
  
 -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
  {
 -      dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
 -      dt->base = vmcs_readl(GUEST_IDTR_BASE);
 +      dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
 +      dt->address = vmcs_readl(GUEST_IDTR_BASE);
  }
  
 -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
  {
 -      vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
 -      vmcs_writel(GUEST_IDTR_BASE, dt->base);
 +      vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
 +      vmcs_writel(GUEST_IDTR_BASE, dt->address);
  }
  
 -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
  {
 -      dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
 -      dt->base = vmcs_readl(GUEST_GDTR_BASE);
 +      dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
 +      dt->address = vmcs_readl(GUEST_GDTR_BASE);
  }
  
 -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
  {
 -      vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
 -      vmcs_writel(GUEST_GDTR_BASE, dt->base);
 +      vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
 +      vmcs_writel(GUEST_GDTR_BASE, dt->address);
  }
  
  static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@@ -2371,7 -2328,7 +2371,7 @@@ static int vmx_vcpu_setup(struct vcpu_v
        u32 junk;
        u64 host_pat, tsc_this, tsc_base;
        unsigned long a;
 -      struct descriptor_table dt;
 +      struct desc_ptr dt;
        int i;
        unsigned long kvm_vmx_return;
        u32 exec_control;
  
        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
  
 -      kvm_get_idt(&dt);
 -      vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
 +      native_store_idt(&dt);
 +      vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
  
        asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
        vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
@@@ -2985,20 -2942,22 +2985,20 @@@ static int handle_io(struct kvm_vcpu *v
        int size, in, string;
        unsigned port;
  
 -      ++vcpu->stat.io_exits;
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        string = (exit_qualification & 16) != 0;
 +      in = (exit_qualification & 8) != 0;
  
 -      if (string) {
 -              if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
 -                      return 0;
 -              return 1;
 -      }
 +      ++vcpu->stat.io_exits;
  
 -      size = (exit_qualification & 7) + 1;
 -      in = (exit_qualification & 8) != 0;
 -      port = exit_qualification >> 16;
 +      if (string || in)
 +              return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
  
 +      port = exit_qualification >> 16;
 +      size = (exit_qualification & 7) + 1;
        skip_emulated_instruction(vcpu);
 -      return kvm_emulate_pio(vcpu, in, size, port);
 +
 +      return kvm_fast_pio_out(vcpu, size, port);
  }
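handle_io() now decodes the exit qualification once and routes string and IN accesses through the emulator, while a plain OUT takes the new kvm_fast_pio_out() path. A small sketch of the same field decoding, with a made-up qualification value:

```c
/* Decode an I/O-instruction VM-exit qualification the same way the rewritten
 * handle_io() does.  The sample value below is made up for illustration. */
#include <stdbool.h>
#include <stdio.h>

struct io_exit {
	bool string;    /* bit 4: string instruction (INS/OUTS) */
	bool in;        /* bit 3: direction, 1 = IN */
	int size;       /* bits 2:0 hold access size - 1 */
	unsigned port;  /* bits 31:16 */
};

static struct io_exit decode_io_exit(unsigned long exit_qualification)
{
	struct io_exit e = {
		.string = (exit_qualification & 16) != 0,
		.in     = (exit_qualification & 8) != 0,
		.size   = (exit_qualification & 7) + 1,
		.port   = exit_qualification >> 16,
	};
	return e;
}

int main(void)
{
	/* hypothetical exit: 2-byte OUT to port 0x71 */
	struct io_exit e = decode_io_exit(0x00710001UL);
	printf("port=%#x size=%d %s%s\n", e.port, e.size,
	       e.in ? "in" : "out", e.string ? " string" : "");
	return 0;
}
```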
  
  static void
@@@ -3089,9 -3048,19 +3089,9 @@@ static int handle_cr(struct kvm_vcpu *v
        return 0;
  }
  
 -static int check_dr_alias(struct kvm_vcpu *vcpu)
 -{
 -      if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 -              kvm_queue_exception(vcpu, UD_VECTOR);
 -              return -1;
 -      }
 -      return 0;
 -}
 -
  static int handle_dr(struct kvm_vcpu *vcpu)
  {
        unsigned long exit_qualification;
 -      unsigned long val;
        int dr, reg;
  
        /* Do not handle if the CPL > 0, will trigger GP on re-entry */
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
        if (exit_qualification & TYPE_MOV_FROM_DR) {
 -              switch (dr) {
 -              case 0 ... 3:
 -                      val = vcpu->arch.db[dr];
 -                      break;
 -              case 4:
 -                      if (check_dr_alias(vcpu) < 0)
 -                              return 1;
 -                      /* fall through */
 -              case 6:
 -                      val = vcpu->arch.dr6;
 -                      break;
 -              case 5:
 -                      if (check_dr_alias(vcpu) < 0)
 -                              return 1;
 -                      /* fall through */
 -              default: /* 7 */
 -                      val = vcpu->arch.dr7;
 -                      break;
 -              }
 -              kvm_register_write(vcpu, reg, val);
 -      } else {
 -              val = vcpu->arch.regs[reg];
 -              switch (dr) {
 -              case 0 ... 3:
 -                      vcpu->arch.db[dr] = val;
 -                      if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 -                              vcpu->arch.eff_db[dr] = val;
 -                      break;
 -              case 4:
 -                      if (check_dr_alias(vcpu) < 0)
 -                              return 1;
 -                      /* fall through */
 -              case 6:
 -                      if (val & 0xffffffff00000000ULL) {
 -                              kvm_inject_gp(vcpu, 0);
 -                              return 1;
 -                      }
 -                      vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 -                      break;
 -              case 5:
 -                      if (check_dr_alias(vcpu) < 0)
 -                              return 1;
 -                      /* fall through */
 -              default: /* 7 */
 -                      if (val & 0xffffffff00000000ULL) {
 -                              kvm_inject_gp(vcpu, 0);
 -                              return 1;
 -                      }
 -                      vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 -                      if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 -                              vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
 -                              vcpu->arch.switch_db_regs =
 -                                      (val & DR7_BP_EN_MASK);
 -                      }
 -                      break;
 -              }
 -      }
 +              unsigned long val;
 +              if (!kvm_get_dr(vcpu, dr, &val))
 +                      kvm_register_write(vcpu, reg, val);
 +      } else
 +              kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
        skip_emulated_instruction(vcpu);
        return 1;
  }
  
 +static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 +{
 +      vmcs_writel(GUEST_DR7, val);
 +}
 +
  static int handle_cpuid(struct kvm_vcpu *vcpu)
  {
        kvm_emulate_cpuid(vcpu);
@@@ -3271,8 -3287,6 +3271,8 @@@ static int handle_task_switch(struct kv
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification;
 +      bool has_error_code = false;
 +      u32 error_code = 0;
        u16 tss_selector;
        int reason, type, idt_v;
  
                        kvm_clear_interrupt_queue(vcpu);
                        break;
                case INTR_TYPE_HARD_EXCEPTION:
 +                      if (vmx->idt_vectoring_info &
 +                          VECTORING_INFO_DELIVER_CODE_MASK) {
 +                              has_error_code = true;
 +                              error_code =
 +                                      vmcs_read32(IDT_VECTORING_ERROR_CODE);
 +                      }
 +                      /* fall through */
                case INTR_TYPE_SOFT_EXCEPTION:
                        kvm_clear_exception_queue(vcpu);
                        break;
                       type != INTR_TYPE_NMI_INTR))
                skip_emulated_instruction(vcpu);
  
 -      if (!kvm_task_switch(vcpu, tss_selector, reason))
 +      if (kvm_task_switch(vcpu, tss_selector, reason,
 +                              has_error_code, error_code) == EMULATE_FAIL) {
 +              vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 +              vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 +              vcpu->run->internal.ndata = 0;
                return 0;
 +      }
  
        /* clear all local breakpoint enable flags */
        vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@@ -3567,7 -3569,7 +3567,7 @@@ static int vmx_handle_exit(struct kvm_v
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
  
 -      trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
 +      trace_kvm_exit(exit_reason, vcpu);
  
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required && emulate_invalid_guest_state)
@@@ -3652,8 -3654,11 +3652,11 @@@ static void vmx_complete_interrupts(str
  
        /* We need to handle NMIs before interrupts are enabled */
        if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-           (exit_intr_info & INTR_INFO_VALID_MASK))
+           (exit_intr_info & INTR_INFO_VALID_MASK)) {
+               kvm_before_handle_nmi(&vmx->vcpu);
                asm("int $2");
+               kvm_after_handle_nmi(&vmx->vcpu);
+       }
  
        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
  
@@@ -4144,7 -4149,6 +4147,7 @@@ static struct kvm_x86_ops vmx_x86_ops 
        .set_idt = vmx_set_idt,
        .get_gdt = vmx_get_gdt,
        .set_gdt = vmx_set_gdt,
 +      .set_dr7 = vmx_set_dr7,
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
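The NMI change merged from the perf branch brackets the reinjected NMI (the `asm("int $2")`) with kvm_before_handle_nmi()/kvm_after_handle_nmi(), so a profiling NMI that lands there can be attributed to guest context. The sketch below shows one plausible shape for such helpers, a per-CPU "current vcpu" marker consulted by profiler callbacks; it is an assumption for illustration, not the code added by this merge.

```c
/* Hedged sketch: one plausible shape for the NMI bracketing helpers.  A
 * per-CPU marker records which vcpu (if any) just exited, so an NMI profiler
 * callback can attribute the sample to guest context.  Assumption only. */
struct kvm_vcpu;

static __thread struct kvm_vcpu *current_vcpu;  /* stand-in for a per-CPU variable */

void sketch_before_handle_nmi(struct kvm_vcpu *vcpu)
{
	current_vcpu = vcpu;   /* an NMI arriving now is on behalf of this guest */
}

void sketch_after_handle_nmi(struct kvm_vcpu *vcpu)
{
	(void)vcpu;
	current_vcpu = 0;      /* back to attributing samples to the host */
}

int sketch_is_in_guest(void)
{
	return current_vcpu != 0;  /* what a perf guest callback would test */
}
```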
diff --combined arch/x86/kvm/x86.c
index 30efeead4511f6bda8b8f4466bbe76cb14d39333,c3a33b2bb169733b822692437f36c590637eab12..58a96e6a234cdae8c2ac9744eb4028087f3fcd10
@@@ -40,8 -40,9 +40,9 @@@
  #include <linux/user-return-notifier.h>
  #include <linux/srcu.h>
  #include <linux/slab.h>
+ #include <linux/perf_event.h>
  #include <trace/events/kvm.h>
 -#undef TRACE_INCLUDE_FILE
 +
  #define CREATE_TRACE_POINTS
  #include "trace.h"
  
@@@ -223,6 -224,34 +224,6 @@@ static void drop_user_return_notifiers(
                kvm_on_user_return(&smsr->urn);
  }
  
 -unsigned long segment_base(u16 selector)
 -{
 -      struct descriptor_table gdt;
 -      struct desc_struct *d;
 -      unsigned long table_base;
 -      unsigned long v;
 -
 -      if (selector == 0)
 -              return 0;
 -
 -      kvm_get_gdt(&gdt);
 -      table_base = gdt.base;
 -
 -      if (selector & 4) {           /* from ldt */
 -              u16 ldt_selector = kvm_read_ldt();
 -
 -              table_base = segment_base(ldt_selector);
 -      }
 -      d = (struct desc_struct *)(table_base + (selector & ~7));
 -      v = get_desc_base(d);
 -#ifdef CONFIG_X86_64
 -      if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 -              v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 -#endif
 -      return v;
 -}
 -EXPORT_SYMBOL_GPL(segment_base);
 -
  u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
  {
        if (irqchip_in_kernel(vcpu->kvm))
@@@ -405,6 -434,8 +406,6 @@@ void kvm_set_cr0(struct kvm_vcpu *vcpu
  
  #ifdef CONFIG_X86_64
        if (cr0 & 0xffffffff00000000UL) {
 -              printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 -                     cr0, kvm_read_cr0(vcpu));
                kvm_inject_gp(vcpu, 0);
                return;
        }
        cr0 &= ~CR0_RESERVED_BITS;
  
        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 -              printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
  
        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 -              printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 -                     "and a clear PE flag\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
                        int cs_db, cs_l;
  
                        if (!is_pae(vcpu)) {
 -                              printk(KERN_DEBUG "set_cr0: #GP, start paging "
 -                                     "in long mode while PAE is disabled\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
                        if (cs_l) {
 -                              printk(KERN_DEBUG "set_cr0: #GP, start paging "
 -                                     "in long mode while CS.L == 1\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
  
                } else
  #endif
                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 -                      printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 -                             "reserved bits\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
        }
  
        kvm_x86_ops->set_cr0(vcpu, cr0);
 -      vcpu->arch.cr0 = cr0;
  
        kvm_mmu_reset_context(vcpu);
        return;
@@@ -465,23 -506,28 +466,23 @@@ void kvm_set_cr4(struct kvm_vcpu *vcpu
        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
  
        if (cr4 & CR4_RESERVED_BITS) {
 -              printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
  
        if (is_long_mode(vcpu)) {
                if (!(cr4 & X86_CR4_PAE)) {
 -                      printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 -                             "in long mode\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
                   && ((cr4 ^ old_cr4) & pdptr_bits)
                   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 -              printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
  
        if (cr4 & X86_CR4_VMXE) {
 -              printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
@@@ -502,16 -548,21 +503,16 @@@ void kvm_set_cr3(struct kvm_vcpu *vcpu
  
        if (is_long_mode(vcpu)) {
                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 -                      printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else {
                if (is_pae(vcpu)) {
                        if (cr3 & CR3_PAE_RESERVED_BITS) {
 -                              printk(KERN_DEBUG
 -                                     "set_cr3: #GP, reserved bits\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 -                              printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 -                                     "reserved bits\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
                        }
@@@ -543,6 -594,7 +544,6 @@@ EXPORT_SYMBOL_GPL(kvm_set_cr3)
  void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
  {
        if (cr8 & CR8_RESERVED_BITS) {
 -              printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
                kvm_inject_gp(vcpu, 0);
                return;
        }
@@@ -562,80 -614,6 +563,80 @@@ unsigned long kvm_get_cr8(struct kvm_vc
  }
  EXPORT_SYMBOL_GPL(kvm_get_cr8);
  
 +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 +{
 +      switch (dr) {
 +      case 0 ... 3:
 +              vcpu->arch.db[dr] = val;
 +              if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 +                      vcpu->arch.eff_db[dr] = val;
 +              break;
 +      case 4:
 +              if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 +                      kvm_queue_exception(vcpu, UD_VECTOR);
 +                      return 1;
 +              }
 +              /* fall through */
 +      case 6:
 +              if (val & 0xffffffff00000000ULL) {
 +                      kvm_inject_gp(vcpu, 0);
 +                      return 1;
 +              }
 +              vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 +              break;
 +      case 5:
 +              if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 +                      kvm_queue_exception(vcpu, UD_VECTOR);
 +                      return 1;
 +              }
 +              /* fall through */
 +      default: /* 7 */
 +              if (val & 0xffffffff00000000ULL) {
 +                      kvm_inject_gp(vcpu, 0);
 +                      return 1;
 +              }
 +              vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 +              if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 +                      kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
 +                      vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
 +              }
 +              break;
 +      }
 +
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(kvm_set_dr);
 +
 +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 +{
 +      switch (dr) {
 +      case 0 ... 3:
 +              *val = vcpu->arch.db[dr];
 +              break;
 +      case 4:
 +              if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 +                      kvm_queue_exception(vcpu, UD_VECTOR);
 +                      return 1;
 +              }
 +              /* fall through */
 +      case 6:
 +              *val = vcpu->arch.dr6;
 +              break;
 +      case 5:
 +              if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 +                      kvm_queue_exception(vcpu, UD_VECTOR);
 +                      return 1;
 +              }
 +              /* fall through */
 +      default: /* 7 */
 +              *val = vcpu->arch.dr7;
 +              break;
 +      }
 +
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(kvm_get_dr);
 +
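kvm_set_dr()/kvm_get_dr() centralize the debug-register rules the VMX exit handler used to open-code: DR4 and DR5 alias DR6 and DR7 unless CR4.DE is set (in which case the access raises #UD), and DR6/DR7 writes with any of the upper 32 bits set take #GP. A minimal sketch of just the aliasing rule, as a pure function for illustration:

```c
/* Sketch of the DR4/DR5 aliasing rule applied by kvm_set_dr()/kvm_get_dr():
 * with CR4.DE clear, DR4 and DR5 behave as DR6 and DR7; with CR4.DE set,
 * accessing them is an invalid opcode.  Illustration only. */
#include <stdbool.h>
#include <stdio.h>

/* Returns the effective register number, or -1 when the access must #UD. */
static int effective_dr(int dr, bool cr4_de)
{
	if (dr == 4 || dr == 5) {
		if (cr4_de)
			return -1;   /* #UD: debug extensions forbid the alias */
		return dr + 2;       /* DR4 -> DR6, DR5 -> DR7 */
	}
	return dr;
}

int main(void)
{
	printf("DR4, CR4.DE=0 -> DR%d\n", effective_dr(4, false));
	printf("DR5, CR4.DE=1 -> %d (#UD)\n", effective_dr(5, true));
	return 0;
}
```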
  static inline u32 bit(int bitno)
  {
        return 1 << (bitno & 31);
@@@ -672,12 -650,15 +673,12 @@@ static u32 emulated_msrs[] = 
  static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
  {
        if (efer & efer_reserved_bits) {
 -              printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 -                     efer);
                kvm_inject_gp(vcpu, 0);
                return;
        }
  
        if (is_paging(vcpu)
            && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
 -              printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
  
                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
 -                      printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
  
                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
 -                      printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
@@@ -985,13 -968,9 +986,13 @@@ static int set_msr_mce(struct kvm_vcpu 
                if (msr >= MSR_IA32_MC0_CTL &&
                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
                        u32 offset = msr - MSR_IA32_MC0_CTL;
 -                      /* only 0 or all 1s can be written to IA32_MCi_CTL */
 +                      /* only 0 or all 1s can be written to IA32_MCi_CTL
 +                       * some Linux kernels though clear bit 10 in bank 4 to
 +                       * workaround a BIOS/GART TBL issue on AMD K8s, ignore
 +                       * this to avoid an uncatched #GP in the guest
 +                       */
                        if ((offset & 0x3) == 0 &&
 -                          data != 0 && data != ~(u64)0)
 +                          data != 0 && (data | (1 << 10)) != ~(u64)0)
                                return -1;
                        vcpu->arch.mce_banks[offset] = data;
                        break;
@@@ -1135,7 -1114,6 +1136,7 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                break;
        case MSR_K7_HWCR:
                data &= ~(u64)0x40;     /* ignore flush filter disable */
 +              data &= ~(u64)0x100;    /* ignore ignne emulation enable */
                if (data != 0) {
                        pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
                                data);
@@@ -1594,7 -1572,6 +1595,7 @@@ int kvm_dev_ioctl_check_extension(long 
        case KVM_CAP_HYPERV_VAPIC:
        case KVM_CAP_HYPERV_SPIN:
        case KVM_CAP_PCI_SEGMENT:
 +      case KVM_CAP_DEBUGREGS:
        case KVM_CAP_X86_ROBUST_SINGLESTEP:
                r = 1;
                break;
@@@ -2147,20 -2124,14 +2148,20 @@@ static void kvm_vcpu_ioctl_x86_get_vcpu
  {
        vcpu_load(vcpu);
  
 -      events->exception.injected = vcpu->arch.exception.pending;
 +      events->exception.injected =
 +              vcpu->arch.exception.pending &&
 +              !kvm_exception_is_soft(vcpu->arch.exception.nr);
        events->exception.nr = vcpu->arch.exception.nr;
        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
        events->exception.error_code = vcpu->arch.exception.error_code;
  
 -      events->interrupt.injected = vcpu->arch.interrupt.pending;
 +      events->interrupt.injected =
 +              vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
        events->interrupt.nr = vcpu->arch.interrupt.nr;
 -      events->interrupt.soft = vcpu->arch.interrupt.soft;
 +      events->interrupt.soft = 0;
 +      events->interrupt.shadow =
 +              kvm_x86_ops->get_interrupt_shadow(vcpu,
 +                      KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
  
        events->nmi.injected = vcpu->arch.nmi_injected;
        events->nmi.pending = vcpu->arch.nmi_pending;
        events->sipi_vector = vcpu->arch.sipi_vector;
  
        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 -                       | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
 +                       | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 +                       | KVM_VCPUEVENT_VALID_SHADOW);
  
        vcpu_put(vcpu);
  }
@@@ -2179,8 -2149,7 +2180,8 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
                                              struct kvm_vcpu_events *events)
  {
        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
 -                            | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
 +                            | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 +                            | KVM_VCPUEVENT_VALID_SHADOW))
                return -EINVAL;
  
        vcpu_load(vcpu);
        vcpu->arch.interrupt.soft = events->interrupt.soft;
        if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
                kvm_pic_clear_isr_ack(vcpu->kvm);
 +      if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
 +              kvm_x86_ops->set_interrupt_shadow(vcpu,
 +                                                events->interrupt.shadow);
  
        vcpu->arch.nmi_injected = events->nmi.injected;
        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
        return 0;
  }
  
 +static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 +                                           struct kvm_debugregs *dbgregs)
 +{
 +      vcpu_load(vcpu);
 +
 +      memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 +      dbgregs->dr6 = vcpu->arch.dr6;
 +      dbgregs->dr7 = vcpu->arch.dr7;
 +      dbgregs->flags = 0;
 +
 +      vcpu_put(vcpu);
 +}
 +
 +static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 +                                          struct kvm_debugregs *dbgregs)
 +{
 +      if (dbgregs->flags)
 +              return -EINVAL;
 +
 +      vcpu_load(vcpu);
 +
 +      memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 +      vcpu->arch.dr6 = dbgregs->dr6;
 +      vcpu->arch.dr7 = dbgregs->dr7;
 +
 +      vcpu_put(vcpu);
 +
 +      return 0;
 +}
 +
  long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
  {
                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
                break;
        }
 +      case KVM_GET_DEBUGREGS: {
 +              struct kvm_debugregs dbgregs;
 +
 +              kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
 +
 +              r = -EFAULT;
 +              if (copy_to_user(argp, &dbgregs,
 +                               sizeof(struct kvm_debugregs)))
 +                      break;
 +              r = 0;
 +              break;
 +      }
 +      case KVM_SET_DEBUGREGS: {
 +              struct kvm_debugregs dbgregs;
 +
 +              r = -EFAULT;
 +              if (copy_from_user(&dbgregs, argp,
 +                                 sizeof(struct kvm_debugregs)))
 +                      break;
 +
 +              r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
 +              break;
 +      }
        default:
                r = -EINVAL;
        }
@@@ -2723,9 -2636,8 +2724,9 @@@ static int kvm_vm_ioctl_reinject(struc
  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                                      struct kvm_dirty_log *log)
  {
 -      int r, n, i;
 +      int r, i;
        struct kvm_memory_slot *memslot;
 +      unsigned long n;
        unsigned long is_dirty = 0;
        unsigned long *dirty_bitmap = NULL;
  
        if (!memslot->dirty_bitmap)
                goto out;
  
 -      n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 +      n = kvm_dirty_bitmap_bytes(memslot);
  
        r = -ENOMEM;
        dirty_bitmap = vmalloc(n);
@@@ -2910,13 -2822,11 +2911,13 @@@ long kvm_arch_vm_ioctl(struct file *fil
                r = -EFAULT;
                if (copy_from_user(&irq_event, argp, sizeof irq_event))
                        goto out;
 +              r = -ENXIO;
                if (irqchip_in_kernel(kvm)) {
                        __s32 status;
                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                        irq_event.irq, irq_event.level);
                        if (ioctl == KVM_IRQ_LINE_STATUS) {
 +                              r = -EFAULT;
                                irq_event.status = status;
                                if (copy_to_user(argp, &irq_event,
                                                        sizeof irq_event))
@@@ -3132,18 -3042,6 +3133,18 @@@ static int vcpu_mmio_read(struct kvm_vc
        return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
  }
  
 +static void kvm_set_segment(struct kvm_vcpu *vcpu,
 +                      struct kvm_segment *var, int seg)
 +{
 +      kvm_x86_ops->set_segment(vcpu, var, seg);
 +}
 +
 +void kvm_get_segment(struct kvm_vcpu *vcpu,
 +                   struct kvm_segment *var, int seg)
 +{
 +      kvm_x86_ops->get_segment(vcpu, var, seg);
 +}
 +
  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
  {
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@@ -3224,17 -3122,14 +3225,17 @@@ static int kvm_read_guest_virt_system(g
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
  }
  
 -static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
 -                              struct kvm_vcpu *vcpu, u32 *error)
 +static int kvm_write_guest_virt_system(gva_t addr, void *val,
 +                                     unsigned int bytes,
 +                                     struct kvm_vcpu *vcpu,
 +                                     u32 *error)
  {
        void *data = val;
        int r = X86EMUL_CONTINUE;
  
        while (bytes) {
 -              gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
 +              gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
 +                                                     PFERR_WRITE_MASK, error);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
@@@ -3257,6 -3152,7 +3258,6 @@@ out
        return r;
  }
  
 -
  static int emulator_read_emulated(unsigned long addr,
                                  void *val,
                                  unsigned int bytes,
@@@ -3359,9 -3255,9 +3360,9 @@@ mmio
  }
  
  int emulator_write_emulated(unsigned long addr,
 -                                 const void *val,
 -                                 unsigned int bytes,
 -                                 struct kvm_vcpu *vcpu)
 +                          const void *val,
 +                          unsigned int bytes,
 +                          struct kvm_vcpu *vcpu)
  {
        /* Crossing a page boundary? */
        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
  }
  EXPORT_SYMBOL_GPL(emulator_write_emulated);
  
 +#define CMPXCHG_TYPE(t, ptr, old, new) \
 +      (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
 +
 +#ifdef CONFIG_X86_64
 +#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
 +#else
 +#  define CMPXCHG64(ptr, old, new) \
 +      (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
 +#endif
 +
  static int emulator_cmpxchg_emulated(unsigned long addr,
                                     const void *old,
                                     const void *new,
                                     unsigned int bytes,
                                     struct kvm_vcpu *vcpu)
  {
 -      printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 -#ifndef CONFIG_X86_64
 -      /* guests cmpxchg8b have to be emulated atomically */
 -      if (bytes == 8) {
 -              gpa_t gpa;
 -              struct page *page;
 -              char *kaddr;
 -              u64 val;
 +      gpa_t gpa;
 +      struct page *page;
 +      char *kaddr;
 +      bool exchanged;
  
 -              gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
 +      /* guests cmpxchg8b have to be emulated atomically */
 +      if (bytes > 8 || (bytes & (bytes - 1)))
 +              goto emul_write;
  
 -              if (gpa == UNMAPPED_GVA ||
 -                 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 -                      goto emul_write;
 +      gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
  
 -              if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
 -                      goto emul_write;
 +      if (gpa == UNMAPPED_GVA ||
 +          (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 +              goto emul_write;
  
 -              val = *(u64 *)new;
 +      if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
 +              goto emul_write;
  
 -              page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 +      page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
  
 -              kaddr = kmap_atomic(page, KM_USER0);
 -              set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
 -              kunmap_atomic(kaddr, KM_USER0);
 -              kvm_release_page_dirty(page);
 +      kaddr = kmap_atomic(page, KM_USER0);
 +      kaddr += offset_in_page(gpa);
 +      switch (bytes) {
 +      case 1:
 +              exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
 +              break;
 +      case 2:
 +              exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
 +              break;
 +      case 4:
 +              exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
 +              break;
 +      case 8:
 +              exchanged = CMPXCHG64(kaddr, old, new);
 +              break;
 +      default:
 +              BUG();
        }
 +      kunmap_atomic(kaddr, KM_USER0);
 +      kvm_release_page_dirty(page);
 +
 +      if (!exchanged)
 +              return X86EMUL_CMPXCHG_FAILED;
 +
 +      kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
 +
 +      return X86EMUL_CONTINUE;
 +
  emul_write:
 -#endif
 +      printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
  
        return emulator_write_emulated(addr, new, bytes, vcpu);
  }
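emulator_cmpxchg_emulated() now performs the compare-and-exchange on the mapped guest page for power-of-two sizes up to 8 bytes, falling back to a plain emulated write only for unsupported sizes. CMPXCHG_TYPE reports success by comparing cmpxchg()'s return value with the expected old value; a minimal userspace sketch of that check follows, with GCC's __sync_val_compare_and_swap standing in for the kernel's cmpxchg():

```c
/* Userspace sketch of the CMPXCHG_TYPE check: the exchange succeeded iff the
 * value returned by the atomic compare-and-swap equals the expected old
 * value.  __sync_val_compare_and_swap stands in for cmpxchg(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CMPXCHG_TYPE(t, ptr, old, new) \
	(__sync_val_compare_and_swap((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))

int main(void)
{
	uint32_t guest_word = 0x1234;   /* stands in for the mapped guest page */
	uint32_t old = 0x1234, new = 0x5678;

	bool exchanged = CMPXCHG_TYPE(uint32_t, &guest_word, &old, &new);
	printf("exchanged=%d value=%#x\n", exchanged, guest_word);  /* 1, 0x5678 */

	old = 0xdead;                   /* mismatch: the exchange must fail */
	exchanged = CMPXCHG_TYPE(uint32_t, &guest_word, &old, &new);
	printf("exchanged=%d value=%#x\n", exchanged, guest_word);  /* 0, 0x5678 */
	return 0;
}
```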
  
 +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 +{
 +      /* TODO: String I/O for in kernel device */
 +      int r;
 +
 +      if (vcpu->arch.pio.in)
 +              r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
 +                                  vcpu->arch.pio.size, pd);
 +      else
 +              r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
 +                                   vcpu->arch.pio.port, vcpu->arch.pio.size,
 +                                   pd);
 +      return r;
 +}
 +
 +
 +static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
 +                           unsigned int count, struct kvm_vcpu *vcpu)
 +{
 +      if (vcpu->arch.pio.count)
 +              goto data_avail;
 +
 +      trace_kvm_pio(1, port, size, 1);
 +
 +      vcpu->arch.pio.port = port;
 +      vcpu->arch.pio.in = 1;
 +      vcpu->arch.pio.count  = count;
 +      vcpu->arch.pio.size = size;
 +
 +      if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 +      data_avail:
 +              memcpy(val, vcpu->arch.pio_data, size * count);
 +              vcpu->arch.pio.count = 0;
 +              return 1;
 +      }
 +
 +      vcpu->run->exit_reason = KVM_EXIT_IO;
 +      vcpu->run->io.direction = KVM_EXIT_IO_IN;
 +      vcpu->run->io.size = size;
 +      vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 +      vcpu->run->io.count = count;
 +      vcpu->run->io.port = port;
 +
 +      return 0;
 +}
 +
 +static int emulator_pio_out_emulated(int size, unsigned short port,
 +                            const void *val, unsigned int count,
 +                            struct kvm_vcpu *vcpu)
 +{
 +      trace_kvm_pio(0, port, size, 1);
 +
 +      vcpu->arch.pio.port = port;
 +      vcpu->arch.pio.in = 0;
 +      vcpu->arch.pio.count = count;
 +      vcpu->arch.pio.size = size;
 +
 +      memcpy(vcpu->arch.pio_data, val, size * count);
 +
 +      if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 +              vcpu->arch.pio.count = 0;
 +              return 1;
 +      }
 +
 +      vcpu->run->exit_reason = KVM_EXIT_IO;
 +      vcpu->run->io.direction = KVM_EXIT_IO_OUT;
 +      vcpu->run->io.size = size;
 +      vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 +      vcpu->run->io.count = count;
 +      vcpu->run->io.port = port;
 +
 +      return 0;
 +}
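When no in-kernel device claims the port, emulator_pio_out_emulated() fills vcpu->run with a KVM_EXIT_IO record and returns 0, leaving the guest's data in the shared run area at io.data_offset. A hedged sketch of how a userspace VMM commonly consumes such an exit; handle_port_write() is a hypothetical device backend, not part of KVM:

```c
/* Hedged sketch: consuming a KVM_EXIT_IO produced by the PIO emulation
 * above.  handle_port_write() is a hypothetical device backend. */
#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>

static void handle_port_write(uint16_t port, const void *data, int size)
{
	printf("OUT port %#x size %d first byte %#x\n",
	       port, size, *(const uint8_t *)data);
}

static void handle_kvm_exit_io(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_IO || run->io.direction != KVM_EXIT_IO_OUT)
		return;

	/* The guest's data sits inside the mmap'ed run area at io.data_offset. */
	const uint8_t *data = (const uint8_t *)run + run->io.data_offset;

	for (uint32_t i = 0; i < run->io.count; i++)
		handle_port_write(run->io.port, data + i * run->io.size, run->io.size);
}
```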
 +
  static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
  {
        return kvm_x86_ops->get_segment_base(vcpu, seg);
@@@ -3543,14 -3334,14 +3544,14 @@@ int emulate_clts(struct kvm_vcpu *vcpu
  
  int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
  {
 -      return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
 +      return kvm_get_dr(ctxt->vcpu, dr, dest);
  }
  
  int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
  {
        unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
  
 -      return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
 +      return kvm_set_dr(ctxt->vcpu, dr, value & mask);
  }
  
  void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
  }
  EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
  
 -static struct x86_emulate_ops emulate_ops = {
 -      .read_std            = kvm_read_guest_virt_system,
 -      .fetch               = kvm_fetch_guest_virt,
 -      .read_emulated       = emulator_read_emulated,
 -      .write_emulated      = emulator_write_emulated,
 -      .cmpxchg_emulated    = emulator_cmpxchg_emulated,
 -};
 -
 -static void cache_all_regs(struct kvm_vcpu *vcpu)
 +static u64 mk_cr_64(u64 curr_cr, u32 new_val)
  {
 -      kvm_register_read(vcpu, VCPU_REGS_RAX);
 -      kvm_register_read(vcpu, VCPU_REGS_RSP);
 -      kvm_register_read(vcpu, VCPU_REGS_RIP);
 -      vcpu->arch.regs_dirty = ~0;
 +      return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
  }
  
 -int emulate_instruction(struct kvm_vcpu *vcpu,
 -                      unsigned long cr2,
 -                      u16 error_code,
 -                      int emulation_type)
 +static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
  {
 -      int r, shadow_mask;
 -      struct decode_cache *c;
 -      struct kvm_run *run = vcpu->run;
 +      unsigned long value;
  
 -      kvm_clear_exception_queue(vcpu);
 -      vcpu->arch.mmio_fault_cr2 = cr2;
 -      /*
 -       * TODO: fix emulate.c to use guest_read/write_register
 -       * instead of direct ->regs accesses, can save hundred cycles
 -       * on Intel for instructions that don't read/change RSP, for
 -       * for example.
 -       */
 -      cache_all_regs(vcpu);
 +      switch (cr) {
 +      case 0:
 +              value = kvm_read_cr0(vcpu);
 +              break;
 +      case 2:
 +              value = vcpu->arch.cr2;
 +              break;
 +      case 3:
 +              value = vcpu->arch.cr3;
 +              break;
 +      case 4:
 +              value = kvm_read_cr4(vcpu);
 +              break;
 +      case 8:
 +              value = kvm_get_cr8(vcpu);
 +              break;
 +      default:
 +              vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 +              return 0;
 +      }
  
 -      vcpu->mmio_is_write = 0;
 -      vcpu->arch.pio.string = 0;
 +      return value;
 +}
  
 -      if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 -              int cs_db, cs_l;
 -              kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 +static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 +{
 +      switch (cr) {
 +      case 0:
 +              kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
 +              break;
 +      case 2:
 +              vcpu->arch.cr2 = val;
 +              break;
 +      case 3:
 +              kvm_set_cr3(vcpu, val);
 +              break;
 +      case 4:
 +              kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 +              break;
 +      case 8:
 +              kvm_set_cr8(vcpu, val & 0xfUL);
 +              break;
 +      default:
 +              vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 +      }
 +}
  
 -              vcpu->arch.emulate_ctxt.vcpu = vcpu;
 -              vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
 -              vcpu->arch.emulate_ctxt.mode =
 -                      (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 -                      (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
 -                      ? X86EMUL_MODE_VM86 : cs_l
 -                      ? X86EMUL_MODE_PROT64 : cs_db
 -                      ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 +static int emulator_get_cpl(struct kvm_vcpu *vcpu)
 +{
 +      return kvm_x86_ops->get_cpl(vcpu);
 +}
  
 -              r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 +static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
 +{
 +      kvm_x86_ops->get_gdt(vcpu, dt);
 +}
 +
 +static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
 +                                         struct kvm_vcpu *vcpu)
 +{
 +      struct kvm_segment var;
 +
 +      kvm_get_segment(vcpu, &var, seg);
 +
 +      if (var.unusable)
 +              return false;
 +
 +      if (var.g)
 +              var.limit >>= 12;
 +      set_desc_limit(desc, var.limit);
 +      set_desc_base(desc, (unsigned long)var.base);
 +      desc->type = var.type;
 +      desc->s = var.s;
 +      desc->dpl = var.dpl;
 +      desc->p = var.present;
 +      desc->avl = var.avl;
 +      desc->l = var.l;
 +      desc->d = var.db;
 +      desc->g = var.g;
 +
 +      return true;
 +}
 +
 +static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
 +                                         struct kvm_vcpu *vcpu)
 +{
 +      struct kvm_segment var;
 +
 +      /* needed to preserve selector */
 +      kvm_get_segment(vcpu, &var, seg);
 +
 +      var.base = get_desc_base(desc);
 +      var.limit = get_desc_limit(desc);
 +      if (desc->g)
 +              var.limit = (var.limit << 12) | 0xfff;
 +      var.type = desc->type;
 +      var.present = desc->p;
 +      var.dpl = desc->dpl;
 +      var.db = desc->d;
 +      var.s = desc->s;
 +      var.l = desc->l;
 +      var.g = desc->g;
 +      var.avl = desc->avl;
 +      var.present = desc->p;
 +      var.unusable = !var.present;
 +      var.padding = 0;
 +
 +      kvm_set_segment(vcpu, &var, seg);
 +      return;
 +}
 +
 +static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
 +{
 +      struct kvm_segment kvm_seg;
 +
 +      kvm_get_segment(vcpu, &kvm_seg, seg);
 +      return kvm_seg.selector;
 +}
 +
 +static void emulator_set_segment_selector(u16 sel, int seg,
 +                                        struct kvm_vcpu *vcpu)
 +{
 +      struct kvm_segment kvm_seg;
 +
 +      kvm_get_segment(vcpu, &kvm_seg, seg);
 +      kvm_seg.selector = sel;
 +      kvm_set_segment(vcpu, &kvm_seg, seg);
 +}
 +
 +static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 +{
 +      kvm_x86_ops->set_rflags(vcpu, rflags);
 +}
 +
 +static struct x86_emulate_ops emulate_ops = {
 +      .read_std            = kvm_read_guest_virt_system,
 +      .write_std           = kvm_write_guest_virt_system,
 +      .fetch               = kvm_fetch_guest_virt,
 +      .read_emulated       = emulator_read_emulated,
 +      .write_emulated      = emulator_write_emulated,
 +      .cmpxchg_emulated    = emulator_cmpxchg_emulated,
 +      .pio_in_emulated     = emulator_pio_in_emulated,
 +      .pio_out_emulated    = emulator_pio_out_emulated,
 +      .get_cached_descriptor = emulator_get_cached_descriptor,
 +      .set_cached_descriptor = emulator_set_cached_descriptor,
 +      .get_segment_selector = emulator_get_segment_selector,
 +      .set_segment_selector = emulator_set_segment_selector,
 +      .get_gdt             = emulator_get_gdt,
 +      .get_cr              = emulator_get_cr,
 +      .set_cr              = emulator_set_cr,
 +      .cpl                 = emulator_get_cpl,
 +      .set_rflags          = emulator_set_rflags,
 +};
 +
 +static void cache_all_regs(struct kvm_vcpu *vcpu)
 +{
 +      kvm_register_read(vcpu, VCPU_REGS_RAX);
 +      kvm_register_read(vcpu, VCPU_REGS_RSP);
 +      kvm_register_read(vcpu, VCPU_REGS_RIP);
 +      vcpu->arch.regs_dirty = ~0;
 +}
 +
 +int emulate_instruction(struct kvm_vcpu *vcpu,
 +                      unsigned long cr2,
 +                      u16 error_code,
 +                      int emulation_type)
 +{
 +      int r, shadow_mask;
 +      struct decode_cache *c;
 +      struct kvm_run *run = vcpu->run;
 +
 +      kvm_clear_exception_queue(vcpu);
 +      vcpu->arch.mmio_fault_cr2 = cr2;
 +      /*
 +       * TODO: fix emulate.c to use guest_read/write_register
 +       * instead of direct ->regs accesses; this can save hundreds of
 +       * cycles on Intel for instructions that don't read/change RSP,
 +       * for example.
 +       */
 +      cache_all_regs(vcpu);
 +
 +      vcpu->mmio_is_write = 0;
 +
 +      if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 +              int cs_db, cs_l;
 +              kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 +
 +              vcpu->arch.emulate_ctxt.vcpu = vcpu;
 +              vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
 +              vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
 +              vcpu->arch.emulate_ctxt.mode =
 +                      (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 +                      (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
 +                      ? X86EMUL_MODE_VM86 : cs_l
 +                      ? X86EMUL_MODE_PROT64 : cs_db
 +                      ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 +
 +              r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 +              trace_kvm_emulate_insn_start(vcpu);
  
                /* Only allow emulation of specific instructions on #UD
                 * (namely VMMCALL, sysenter, sysexit, syscall)*/
                ++vcpu->stat.insn_emulation;
                if (r)  {
                        ++vcpu->stat.insn_emulation_fail;
 +                      trace_kvm_emulate_insn_failed(vcpu);
                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
                                return EMULATE_DONE;
                        return EMULATE_FAIL;
                return EMULATE_DONE;
        }
  
 +restart:
        r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
        shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
  
        if (r == 0)
                kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
  
 -      if (vcpu->arch.pio.string)
 +      if (vcpu->arch.pio.count) {
 +              if (!vcpu->arch.pio.in)
 +                      vcpu->arch.pio.count = 0;
                return EMULATE_DO_MMIO;
 +      }
  
 -      if ((r || vcpu->mmio_is_write) && run) {
 +      if (r || vcpu->mmio_is_write) {
                run->exit_reason = KVM_EXIT_MMIO;
                run->mmio.phys_addr = vcpu->mmio_phys_addr;
                memcpy(run->mmio.data, vcpu->mmio_data, 8);
  
        if (r) {
                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 -                      return EMULATE_DONE;
 +                      goto done;
                if (!vcpu->mmio_needed) {
 +                      ++vcpu->stat.insn_emulation_fail;
 +                      trace_kvm_emulate_insn_failed(vcpu);
                        kvm_report_emulation_failure(vcpu, "mmio");
                        return EMULATE_FAIL;
                }
                return EMULATE_DO_MMIO;
        }
  
 -      kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 -
        if (vcpu->mmio_is_write) {
                vcpu->mmio_needed = 0;
                return EMULATE_DO_MMIO;
        }
  
 -      return EMULATE_DONE;
 -}
 -EXPORT_SYMBOL_GPL(emulate_instruction);
 -
 -static int pio_copy_data(struct kvm_vcpu *vcpu)
 -{
 -      void *p = vcpu->arch.pio_data;
 -      gva_t q = vcpu->arch.pio.guest_gva;
 -      unsigned bytes;
 -      int ret;
 -      u32 error_code;
 -
 -      bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
 -      if (vcpu->arch.pio.in)
 -              ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
 -      else
 -              ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
 -
 -      if (ret == X86EMUL_PROPAGATE_FAULT)
 -              kvm_inject_page_fault(vcpu, q, error_code);
 -
 -      return ret;
 -}
 -
 -int complete_pio(struct kvm_vcpu *vcpu)
 -{
 -      struct kvm_pio_request *io = &vcpu->arch.pio;
 -      long delta;
 -      int r;
 -      unsigned long val;
 -
 -      if (!io->string) {
 -              if (io->in) {
 -                      val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 -                      memcpy(&val, vcpu->arch.pio_data, io->size);
 -                      kvm_register_write(vcpu, VCPU_REGS_RAX, val);
 -              }
 -      } else {
 -              if (io->in) {
 -                      r = pio_copy_data(vcpu);
 -                      if (r)
 -                              goto out;
 -              }
 -
 -              delta = 1;
 -              if (io->rep) {
 -                      delta *= io->cur_count;
 -                      /*
 -                       * The size of the register should really depend on
 -                       * current address size.
 -                       */
 -                      val = kvm_register_read(vcpu, VCPU_REGS_RCX);
 -                      val -= delta;
 -                      kvm_register_write(vcpu, VCPU_REGS_RCX, val);
 -              }
 -              if (io->down)
 -                      delta = -delta;
 -              delta *= io->size;
 -              if (io->in) {
 -                      val = kvm_register_read(vcpu, VCPU_REGS_RDI);
 -                      val += delta;
 -                      kvm_register_write(vcpu, VCPU_REGS_RDI, val);
 -              } else {
 -                      val = kvm_register_read(vcpu, VCPU_REGS_RSI);
 -                      val += delta;
 -                      kvm_register_write(vcpu, VCPU_REGS_RSI, val);
 -              }
 -      }
 -out:
 -      io->count -= io->cur_count;
 -      io->cur_count = 0;
 -
 -      return 0;
 -}
 +done:
 +      if (vcpu->arch.exception.pending)
 +              vcpu->arch.emulate_ctxt.restart = false;
  
 -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 -{
 -      /* TODO: String I/O for in kernel device */
 -      int r;
 -
 -      if (vcpu->arch.pio.in)
 -              r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
 -                                  vcpu->arch.pio.size, pd);
 -      else
 -              r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
 -                                   vcpu->arch.pio.port, vcpu->arch.pio.size,
 -                                   pd);
 -      return r;
 -}
 -
 -static int pio_string_write(struct kvm_vcpu *vcpu)
 -{
 -      struct kvm_pio_request *io = &vcpu->arch.pio;
 -      void *pd = vcpu->arch.pio_data;
 -      int i, r = 0;
 -
 -      for (i = 0; i < io->cur_count; i++) {
 -              if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
 -                                   io->port, io->size, pd)) {
 -                      r = -EOPNOTSUPP;
 -                      break;
 -              }
 -              pd += io->size;
 -      }
 -      return r;
 -}
 -
 -int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
 -{
 -      unsigned long val;
 +      if (vcpu->arch.emulate_ctxt.restart)
 +              goto restart;
  
 -      trace_kvm_pio(!in, port, size, 1);
 -
 -      vcpu->run->exit_reason = KVM_EXIT_IO;
 -      vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
 -      vcpu->run->io.size = vcpu->arch.pio.size = size;
 -      vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 -      vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
 -      vcpu->run->io.port = vcpu->arch.pio.port = port;
 -      vcpu->arch.pio.in = in;
 -      vcpu->arch.pio.string = 0;
 -      vcpu->arch.pio.down = 0;
 -      vcpu->arch.pio.rep = 0;
 -
 -      if (!vcpu->arch.pio.in) {
 -              val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 -              memcpy(vcpu->arch.pio_data, &val, 4);
 -      }
 -
 -      if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 -              complete_pio(vcpu);
 -              return 1;
 -      }
 -      return 0;
 +      return EMULATE_DONE;
  }
 -EXPORT_SYMBOL_GPL(kvm_emulate_pio);
 +EXPORT_SYMBOL_GPL(emulate_instruction);
  
 -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 -                int size, unsigned long count, int down,
 -                gva_t address, int rep, unsigned port)
 +int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
  {
 -      unsigned now, in_page;
 -      int ret = 0;
 -
 -      trace_kvm_pio(!in, port, size, count);
 -
 -      vcpu->run->exit_reason = KVM_EXIT_IO;
 -      vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
 -      vcpu->run->io.size = vcpu->arch.pio.size = size;
 -      vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 -      vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
 -      vcpu->run->io.port = vcpu->arch.pio.port = port;
 -      vcpu->arch.pio.in = in;
 -      vcpu->arch.pio.string = 1;
 -      vcpu->arch.pio.down = down;
 -      vcpu->arch.pio.rep = rep;
 -
 -      if (!count) {
 -              kvm_x86_ops->skip_emulated_instruction(vcpu);
 -              return 1;
 -      }
 -
 -      if (!down)
 -              in_page = PAGE_SIZE - offset_in_page(address);
 -      else
 -              in_page = offset_in_page(address) + size;
 -      now = min(count, (unsigned long)in_page / size);
 -      if (!now)
 -              now = 1;
 -      if (down) {
 -              /*
 -               * String I/O in reverse.  Yuck.  Kill the guest, fix later.
 -               */
 -              pr_unimpl(vcpu, "guest string pio down\n");
 -              kvm_inject_gp(vcpu, 0);
 -              return 1;
 -      }
 -      vcpu->run->io.count = now;
 -      vcpu->arch.pio.cur_count = now;
 -
 -      if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
 -              kvm_x86_ops->skip_emulated_instruction(vcpu);
 -
 -      vcpu->arch.pio.guest_gva = address;
 -
 -      if (!vcpu->arch.pio.in) {
 -              /* string PIO write */
 -              ret = pio_copy_data(vcpu);
 -              if (ret == X86EMUL_PROPAGATE_FAULT)
 -                      return 1;
 -              if (ret == 0 && !pio_string_write(vcpu)) {
 -                      complete_pio(vcpu);
 -                      if (vcpu->arch.pio.count == 0)
 -                              ret = 1;
 -              }
 -      }
 -      /* no string PIO read support yet */
 -
 +      unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 +      int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
 +      /* do not return to emulator after return from userspace */
 +      vcpu->arch.pio.count = 0;
        return ret;
  }
 -EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 +EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
  
  static void bounce_off(void *info)
  {
@@@ -3955,6 -3766,47 +3956,47 @@@ static void kvm_timer_init(void
        }
  }
  
+ static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+ static int kvm_is_in_guest(void)
+ {
+       return percpu_read(current_vcpu) != NULL;
+ }
+ static int kvm_is_user_mode(void)
+ {
+       int user_mode = 3;
+       if (percpu_read(current_vcpu))
+               user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+       return user_mode != 0;
+ }
+ static unsigned long kvm_get_guest_ip(void)
+ {
+       unsigned long ip = 0;
+       if (percpu_read(current_vcpu))
+               ip = kvm_rip_read(percpu_read(current_vcpu));
+       return ip;
+ }
+ static struct perf_guest_info_callbacks kvm_guest_cbs = {
+       .is_in_guest            = kvm_is_in_guest,
+       .is_user_mode           = kvm_is_user_mode,
+       .get_guest_ip           = kvm_get_guest_ip,
+ };
+ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
+ {
+       percpu_write(current_vcpu, vcpu);
+ }
+ EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
+ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
+ {
+       percpu_write(current_vcpu, NULL);
+ }
+ EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
  int kvm_arch_init(void *opaque)
  {
        int r;
  
        kvm_timer_init();
  
+       perf_register_guest_info_callbacks(&kvm_guest_cbs);
        return 0;
  
  out:
  
  void kvm_arch_exit(void)
  {
+       perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
                                            CPUFREQ_TRANSITION_NOTIFIER);
@@@ -4154,20 -4010,85 +4200,20 @@@ int kvm_fix_hypercall(struct kvm_vcpu *
        return emulator_write_emulated(rip, instruction, 3, vcpu);
  }
  
 -static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 -{
 -      return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
 -}
 -
  void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
  {
 -      struct descriptor_table dt = { limit, base };
 +      struct desc_ptr dt = { limit, base };
  
        kvm_x86_ops->set_gdt(vcpu, &dt);
  }
  
  void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
  {
 -      struct descriptor_table dt = { limit, base };
 +      struct desc_ptr dt = { limit, base };
  
        kvm_x86_ops->set_idt(vcpu, &dt);
  }
  
 -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 -                 unsigned long *rflags)
 -{
 -      kvm_lmsw(vcpu, msw);
 -      *rflags = kvm_get_rflags(vcpu);
 -}
 -
 -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 -{
 -      unsigned long value;
 -
 -      switch (cr) {
 -      case 0:
 -              value = kvm_read_cr0(vcpu);
 -              break;
 -      case 2:
 -              value = vcpu->arch.cr2;
 -              break;
 -      case 3:
 -              value = vcpu->arch.cr3;
 -              break;
 -      case 4:
 -              value = kvm_read_cr4(vcpu);
 -              break;
 -      case 8:
 -              value = kvm_get_cr8(vcpu);
 -              break;
 -      default:
 -              vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 -              return 0;
 -      }
 -
 -      return value;
 -}
 -
 -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
 -                   unsigned long *rflags)
 -{
 -      switch (cr) {
 -      case 0:
 -              kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
 -              *rflags = kvm_get_rflags(vcpu);
 -              break;
 -      case 2:
 -              vcpu->arch.cr2 = val;
 -              break;
 -      case 3:
 -              kvm_set_cr3(vcpu, val);
 -              break;
 -      case 4:
 -              kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 -              break;
 -      case 8:
 -              kvm_set_cr8(vcpu, val & 0xfUL);
 -              break;
 -      default:
 -              vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 -      }
 -}
 -
  static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
  {
        struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@@ -4231,13 -4152,9 +4277,13 @@@ int cpuid_maxphyaddr(struct kvm_vcpu *v
  {
        struct kvm_cpuid_entry2 *best;
  
 +      best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
 +      if (!best || best->eax < 0x80000008)
 +              goto not_found;
        best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
        if (best)
                return best->eax & 0xff;
 +not_found:
        return 36;
  }
  
@@@ -4351,9 -4268,6 +4397,9 @@@ static void inject_pending_event(struc
  {
        /* try to reinject previous events if any */
        if (vcpu->arch.exception.pending) {
 +              trace_kvm_inj_exception(vcpu->arch.exception.nr,
 +                                      vcpu->arch.exception.has_error_code,
 +                                      vcpu->arch.exception.error_code);
                kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
                                          vcpu->arch.exception.has_error_code,
                                          vcpu->arch.exception.error_code);
@@@ -4614,17 -4528,24 +4660,17 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
        if (!irqchip_in_kernel(vcpu->kvm))
                kvm_set_cr8(vcpu, kvm_run->cr8);
  
 -      if (vcpu->arch.pio.cur_count) {
 -              r = complete_pio(vcpu);
 -              if (r)
 -                      goto out;
 -      }
 -      if (vcpu->mmio_needed) {
 -              memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 -              vcpu->mmio_read_completed = 1;
 -              vcpu->mmio_needed = 0;
 -
 +      if (vcpu->arch.pio.count || vcpu->mmio_needed ||
 +          vcpu->arch.emulate_ctxt.restart) {
 +              if (vcpu->mmio_needed) {
 +                      memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 +                      vcpu->mmio_read_completed = 1;
 +                      vcpu->mmio_needed = 0;
 +              }
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 -              r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
 -                                      EMULTYPE_NO_DECODE);
 +              r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
                srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
                if (r == EMULATE_DO_MMIO) {
 -                      /*
 -                       * Read-modify-write.  Back to userspace.
 -                       */
                        r = 0;
                        goto out;
                }
@@@ -4707,6 -4628,12 +4753,6 @@@ int kvm_arch_vcpu_ioctl_set_regs(struc
        return 0;
  }
  
 -void kvm_get_segment(struct kvm_vcpu *vcpu,
 -                   struct kvm_segment *var, int seg)
 -{
 -      kvm_x86_ops->get_segment(vcpu, var, seg);
 -}
 -
  void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
  {
        struct kvm_segment cs;
@@@ -4720,7 -4647,7 +4766,7 @@@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits)
  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
  {
 -      struct descriptor_table dt;
 +      struct desc_ptr dt;
  
        vcpu_load(vcpu);
  
        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
  
        kvm_x86_ops->get_idt(vcpu, &dt);
 -      sregs->idt.limit = dt.limit;
 -      sregs->idt.base = dt.base;
 +      sregs->idt.limit = dt.size;
 +      sregs->idt.base = dt.address;
        kvm_x86_ops->get_gdt(vcpu, &dt);
 -      sregs->gdt.limit = dt.limit;
 -      sregs->gdt.base = dt.base;
 +      sregs->gdt.limit = dt.size;
 +      sregs->gdt.base = dt.address;
  
        sregs->cr0 = kvm_read_cr0(vcpu);
        sregs->cr2 = vcpu->arch.cr2;
@@@ -4778,33 -4705,559 +4824,33 @@@ int kvm_arch_vcpu_ioctl_set_mpstate(str
        return 0;
  }
  
 -static void kvm_set_segment(struct kvm_vcpu *vcpu,
 -                      struct kvm_segment *var, int seg)
 -{
 -      kvm_x86_ops->set_segment(vcpu, var, seg);
 -}
 -
 -static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 -                                 struct kvm_segment *kvm_desct)
 -{
 -      kvm_desct->base = get_desc_base(seg_desc);
 -      kvm_desct->limit = get_desc_limit(seg_desc);
 -      if (seg_desc->g) {
 -              kvm_desct->limit <<= 12;
 -              kvm_desct->limit |= 0xfff;
 -      }
 -      kvm_desct->selector = selector;
 -      kvm_desct->type = seg_desc->type;
 -      kvm_desct->present = seg_desc->p;
 -      kvm_desct->dpl = seg_desc->dpl;
 -      kvm_desct->db = seg_desc->d;
 -      kvm_desct->s = seg_desc->s;
 -      kvm_desct->l = seg_desc->l;
 -      kvm_desct->g = seg_desc->g;
 -      kvm_desct->avl = seg_desc->avl;
 -      if (!selector)
 -              kvm_desct->unusable = 1;
 -      else
 -              kvm_desct->unusable = 0;
 -      kvm_desct->padding = 0;
 -}
 -
 -static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 -                                        u16 selector,
 -                                        struct descriptor_table *dtable)
 +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 +                  bool has_error_code, u32 error_code)
  {
 -      if (selector & 1 << 2) {
 -              struct kvm_segment kvm_seg;
 -
 -              kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
 -
 -              if (kvm_seg.unusable)
 -                      dtable->limit = 0;
 -              else
 -                      dtable->limit = kvm_seg.limit;
 -              dtable->base = kvm_seg.base;
 -      }
 -      else
 -              kvm_x86_ops->get_gdt(vcpu, dtable);
 -}
 -
 -/* allowed just for 8-byte segments */
 -static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 -                                       struct desc_struct *seg_desc)
 -{
 -      struct descriptor_table dtable;
 -      u16 index = selector >> 3;
 -      int ret;
 -      u32 err;
 -      gva_t addr;
 -
 -      get_segment_descriptor_dtable(vcpu, selector, &dtable);
 -
 -      if (dtable.limit < index * 8 + 7) {
 -              kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
 -              return X86EMUL_PROPAGATE_FAULT;
 -      }
 -      addr = dtable.base + index * 8;
 -      ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
 -                                       vcpu,  &err);
 -      if (ret == X86EMUL_PROPAGATE_FAULT)
 -              kvm_inject_page_fault(vcpu, addr, err);
 -
 -       return ret;
 -}
 -
 -/* allowed just for 8-byte segments */
 -static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 -                                       struct desc_struct *seg_desc)
 -{
 -      struct descriptor_table dtable;
 -      u16 index = selector >> 3;
 -
 -      get_segment_descriptor_dtable(vcpu, selector, &dtable);
 -
 -      if (dtable.limit < index * 8 + 7)
 -              return 1;
 -      return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
 -}
 -
 -static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
 -                             struct desc_struct *seg_desc)
 -{
 -      u32 base_addr = get_desc_base(seg_desc);
 -
 -      return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
 -}
 -
 -static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
 -                           struct desc_struct *seg_desc)
 -{
 -      u32 base_addr = get_desc_base(seg_desc);
 -
 -      return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
 -}
 -
 -static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
 -{
 -      struct kvm_segment kvm_seg;
 -
 -      kvm_get_segment(vcpu, &kvm_seg, seg);
 -      return kvm_seg.selector;
 -}
 -
 -static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
 -{
 -      struct kvm_segment segvar = {
 -              .base = selector << 4,
 -              .limit = 0xffff,
 -              .selector = selector,
 -              .type = 3,
 -              .present = 1,
 -              .dpl = 3,
 -              .db = 0,
 -              .s = 1,
 -              .l = 0,
 -              .g = 0,
 -              .avl = 0,
 -              .unusable = 0,
 -      };
 -      kvm_x86_ops->set_segment(vcpu, &segvar, seg);
 -      return X86EMUL_CONTINUE;
 -}
 -
 -static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
 -{
 -      return (seg != VCPU_SREG_LDTR) &&
 -              (seg != VCPU_SREG_TR) &&
 -              (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
 -}
 -
 -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
 -{
 -      struct kvm_segment kvm_seg;
 -      struct desc_struct seg_desc;
 -      u8 dpl, rpl, cpl;
 -      unsigned err_vec = GP_VECTOR;
 -      u32 err_code = 0;
 -      bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
 -      int ret;
 +      int cs_db, cs_l, ret;
 +      cache_all_regs(vcpu);
  
 -      if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
 -              return kvm_load_realmode_segment(vcpu, selector, seg);
 +      kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
  
 -      /* NULL selector is not valid for TR, CS and SS */
 -      if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
 -          && null_selector)
 -              goto exception;
 +      vcpu->arch.emulate_ctxt.vcpu = vcpu;
 +      vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
 +      vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
 +      vcpu->arch.emulate_ctxt.mode =
 +              (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 +              (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
 +              ? X86EMUL_MODE_VM86 : cs_l
 +              ? X86EMUL_MODE_PROT64 : cs_db
 +              ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
  
 -      /* TR should be in GDT only */
 -      if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
 -              goto exception;
 +      ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
 +                                 tss_selector, reason, has_error_code,
 +                                 error_code);
  
 -      ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
        if (ret)
 -              return ret;
 -
 -      seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
 -
 -      if (null_selector) { /* for NULL selector skip all following checks */
 -              kvm_seg.unusable = 1;
 -              goto load;
 -      }
 -
 -      err_code = selector & 0xfffc;
 -      err_vec = GP_VECTOR;
 -
 -      /* can't load system descriptor into segment selector */
 -      if (seg <= VCPU_SREG_GS && !kvm_seg.s)
 -              goto exception;
 -
 -      if (!kvm_seg.present) {
 -              err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
 -              goto exception;
 -      }
 -
 -      rpl = selector & 3;
 -      dpl = kvm_seg.dpl;
 -      cpl = kvm_x86_ops->get_cpl(vcpu);
 -
 -      switch (seg) {
 -      case VCPU_SREG_SS:
 -              /*
 -               * segment is not a writable data segment or segment
 -               * selector's RPL != CPL or segment descriptor's DPL != CPL
 -               */
 -              if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
 -                      goto exception;
 -              break;
 -      case VCPU_SREG_CS:
 -              if (!(kvm_seg.type & 8))
 -                      goto exception;
 -
 -              if (kvm_seg.type & 4) {
 -                      /* conforming */
 -                      if (dpl > cpl)
 -                              goto exception;
 -              } else {
 -                      /* nonconforming */
 -                      if (rpl > cpl || dpl != cpl)
 -                              goto exception;
 -              }
 -              /* CS(RPL) <- CPL */
 -              selector = (selector & 0xfffc) | cpl;
 -            break;
 -      case VCPU_SREG_TR:
 -              if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
 -                      goto exception;
 -              break;
 -      case VCPU_SREG_LDTR:
 -              if (kvm_seg.s || kvm_seg.type != 2)
 -                      goto exception;
 -              break;
 -      default: /*  DS, ES, FS, or GS */
 -              /*
 -               * segment is not a data or readable code segment or
 -               * ((segment is a data or nonconforming code segment)
 -               * and (both RPL and CPL > DPL))
 -               */
 -              if ((kvm_seg.type & 0xa) == 0x8 ||
 -                  (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
 -                      goto exception;
 -              break;
 -      }
 -
 -      if (!kvm_seg.unusable && kvm_seg.s) {
 -              /* mark segment as accessed */
 -              kvm_seg.type |= 1;
 -              seg_desc.type |= 1;
 -              save_guest_segment_descriptor(vcpu, selector, &seg_desc);
 -      }
 -load:
 -      kvm_set_segment(vcpu, &kvm_seg, seg);
 -      return X86EMUL_CONTINUE;
 -exception:
 -      kvm_queue_exception_e(vcpu, err_vec, err_code);
 -      return X86EMUL_PROPAGATE_FAULT;
 -}
 -
 -static void save_state_to_tss32(struct kvm_vcpu *vcpu,
 -                              struct tss_segment_32 *tss)
 -{
 -      tss->cr3 = vcpu->arch.cr3;
 -      tss->eip = kvm_rip_read(vcpu);
 -      tss->eflags = kvm_get_rflags(vcpu);
 -      tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 -      tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
 -      tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
 -      tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
 -      tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
 -      tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
 -      tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
 -      tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
 -      tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
 -      tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
 -      tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
 -      tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
 -      tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
 -      tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
 -      tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
 -}
 -
 -static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
 -{
 -      struct kvm_segment kvm_seg;
 -      kvm_get_segment(vcpu, &kvm_seg, seg);
 -      kvm_seg.selector = sel;
 -      kvm_set_segment(vcpu, &kvm_seg, seg);
 -}
 -
 -static int load_state_from_tss32(struct kvm_vcpu *vcpu,
 -                                struct tss_segment_32 *tss)
 -{
 -      kvm_set_cr3(vcpu, tss->cr3);
 -
 -      kvm_rip_write(vcpu, tss->eip);
 -      kvm_set_rflags(vcpu, tss->eflags | 2);
 -
 -      kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
 -      kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
 -      kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
 -      kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
 -      kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
 -      kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
 -      kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
 -      kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
 -
 -      /*
 -       * SDM says that segment selectors are loaded before segment
 -       * descriptors
 -       */
 -      kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
 -      kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
 -      kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
 -      kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
 -      kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
 -      kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
 -      kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
 +              return EMULATE_FAIL;
  
 -      /*
 -       * Now load segment descriptors. If a fault happens at this stage,
 -       * it is handled in the context of the new task.
 -       */
 -      if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
 -              return 1;
 -      return 0;
 -}
 -
 -static void save_state_to_tss16(struct kvm_vcpu *vcpu,
 -                              struct tss_segment_16 *tss)
 -{
 -      tss->ip = kvm_rip_read(vcpu);
 -      tss->flag = kvm_get_rflags(vcpu);
 -      tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 -      tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
 -      tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
 -      tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
 -      tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
 -      tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
 -      tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
 -      tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
 -
 -      tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
 -      tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
 -      tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
 -      tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
 -      tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
 -}
 -
 -static int load_state_from_tss16(struct kvm_vcpu *vcpu,
 -                               struct tss_segment_16 *tss)
 -{
 -      kvm_rip_write(vcpu, tss->ip);
 -      kvm_set_rflags(vcpu, tss->flag | 2);
 -      kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
 -      kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
 -      kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
 -      kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
 -      kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
 -      kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
 -      kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
 -      kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
 -
 -      /*
 -       * SDM says that segment selectors are loaded before segment
 -       * descriptors
 -       */
 -      kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
 -      kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
 -      kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
 -      kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
 -      kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
 -
 -      /*
 -       * Now load segment descriptors. If a fault happens at this stage,
 -       * it is handled in the context of the new task.
 -       */
 -      if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
 -              return 1;
 -
 -      if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
 -              return 1;
 -      return 0;
 -}
 -
 -static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
 -                            u16 old_tss_sel, u32 old_tss_base,
 -                            struct desc_struct *nseg_desc)
 -{
 -      struct tss_segment_16 tss_segment_16;
 -      int ret = 0;
 -
 -      if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
 -                         sizeof tss_segment_16))
 -              goto out;
 -
 -      save_state_to_tss16(vcpu, &tss_segment_16);
 -
 -      if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
 -                          sizeof tss_segment_16))
 -              goto out;
 -
 -      if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
 -                         &tss_segment_16, sizeof tss_segment_16))
 -              goto out;
 -
 -      if (old_tss_sel != 0xffff) {
 -              tss_segment_16.prev_task_link = old_tss_sel;
 -
 -              if (kvm_write_guest(vcpu->kvm,
 -                                  get_tss_base_addr_write(vcpu, nseg_desc),
 -                                  &tss_segment_16.prev_task_link,
 -                                  sizeof tss_segment_16.prev_task_link))
 -                      goto out;
 -      }
 -
 -      if (load_state_from_tss16(vcpu, &tss_segment_16))
 -              goto out;
 -
 -      ret = 1;
 -out:
 -      return ret;
 -}
 -
 -static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
 -                     u16 old_tss_sel, u32 old_tss_base,
 -                     struct desc_struct *nseg_desc)
 -{
 -      struct tss_segment_32 tss_segment_32;
 -      int ret = 0;
 -
 -      if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
 -                         sizeof tss_segment_32))
 -              goto out;
 -
 -      save_state_to_tss32(vcpu, &tss_segment_32);
 -
 -      if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
 -                          sizeof tss_segment_32))
 -              goto out;
 -
 -      if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
 -                         &tss_segment_32, sizeof tss_segment_32))
 -              goto out;
 -
 -      if (old_tss_sel != 0xffff) {
 -              tss_segment_32.prev_task_link = old_tss_sel;
 -
 -              if (kvm_write_guest(vcpu->kvm,
 -                                  get_tss_base_addr_write(vcpu, nseg_desc),
 -                                  &tss_segment_32.prev_task_link,
 -                                  sizeof tss_segment_32.prev_task_link))
 -                      goto out;
 -      }
 -
 -      if (load_state_from_tss32(vcpu, &tss_segment_32))
 -              goto out;
 -
 -      ret = 1;
 -out:
 -      return ret;
 -}
 -
 -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 -{
 -      struct kvm_segment tr_seg;
 -      struct desc_struct cseg_desc;
 -      struct desc_struct nseg_desc;
 -      int ret = 0;
 -      u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
 -      u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
 -
 -      old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
 -
 -      /* FIXME: Handle errors. Failure to read either TSS or their
 -       * descriptors should generate a pagefault.
 -       */
 -      if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
 -              goto out;
 -
 -      if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
 -              goto out;
 -
 -      if (reason != TASK_SWITCH_IRET) {
 -              int cpl;
 -
 -              cpl = kvm_x86_ops->get_cpl(vcpu);
 -              if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
 -                      kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 -                      return 1;
 -              }
 -      }
 -
 -      if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
 -              kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
 -              return 1;
 -      }
 -
 -      if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
 -              cseg_desc.type &= ~(1 << 1); //clear the B flag
 -              save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
 -      }
 -
 -      if (reason == TASK_SWITCH_IRET) {
 -              u32 eflags = kvm_get_rflags(vcpu);
 -              kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
 -      }
 -
 -      /* set back link to prev task only if NT bit is set in eflags;
 -         note that old_tss_sel is not used after this point */
 -      if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
 -              old_tss_sel = 0xffff;
 -
 -      if (nseg_desc.type & 8)
 -              ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
 -                                       old_tss_base, &nseg_desc);
 -      else
 -              ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
 -                                       old_tss_base, &nseg_desc);
 -
 -      if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
 -              u32 eflags = kvm_get_rflags(vcpu);
 -              kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
 -      }
 -
 -      if (reason != TASK_SWITCH_IRET) {
 -              nseg_desc.type |= (1 << 1);
 -              save_guest_segment_descriptor(vcpu, tss_selector,
 -                                            &nseg_desc);
 -      }
 -
 -      kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
 -      seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
 -      tr_seg.type = 11;
 -      kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
 -out:
 -      return ret;
 +      kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 +      return EMULATE_DONE;
  }
  EXPORT_SYMBOL_GPL(kvm_task_switch);
  
@@@ -4813,15 -5266,15 +4859,15 @@@ int kvm_arch_vcpu_ioctl_set_sregs(struc
  {
        int mmu_reset_needed = 0;
        int pending_vec, max_bits;
 -      struct descriptor_table dt;
 +      struct desc_ptr dt;
  
        vcpu_load(vcpu);
  
 -      dt.limit = sregs->idt.limit;
 -      dt.base = sregs->idt.base;
 +      dt.size = sregs->idt.limit;
 +      dt.address = sregs->idt.base;
        kvm_x86_ops->set_idt(vcpu, &dt);
 -      dt.limit = sregs->gdt.limit;
 -      dt.base = sregs->gdt.base;
 +      dt.size = sregs->gdt.limit;
 +      dt.address = sregs->gdt.base;
        kvm_x86_ops->set_gdt(vcpu, &dt);
  
        vcpu->arch.cr2 = sregs->cr2;
@@@ -4920,9 -5373,11 +4966,9 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
                vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
        }
  
 -      if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
 -              vcpu->arch.singlestep_cs =
 -                      get_segment_selector(vcpu, VCPU_SREG_CS);
 -              vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
 -      }
 +      if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 +              vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
 +                      get_segment_base(vcpu, VCPU_SREG_CS);
  
        /*
         * Trigger an rflags update that will inject or remove the trace
@@@ -5413,22 -5868,13 +5459,22 @@@ int kvm_arch_interrupt_allowed(struct k
        return kvm_x86_ops->interrupt_allowed(vcpu);
  }
  
 +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
 +{
 +      unsigned long current_rip = kvm_rip_read(vcpu) +
 +              get_segment_base(vcpu, VCPU_SREG_CS);
 +
 +      return current_rip == linear_rip;
 +}
 +EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
 +
  unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
  {
        unsigned long rflags;
  
        rflags = kvm_x86_ops->get_rflags(vcpu);
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 -              rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
 +              rflags &= ~X86_EFLAGS_TF;
        return rflags;
  }
  EXPORT_SYMBOL_GPL(kvm_get_rflags);
  void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  {
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
 -          vcpu->arch.singlestep_cs ==
 -                      get_segment_selector(vcpu, VCPU_SREG_CS) &&
 -          vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
 -              rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
 +          kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 +              rflags |= X86_EFLAGS_TF;
        kvm_x86_ops->set_rflags(vcpu, rflags);
  }
  EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@@ -5453,4 -5901,3 +5499,4 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --combined kernel/sched.c
index 6af210a7de70d394015617863ee303059c9dd4d6,8cafe3ff558fec69a0c3c4676aa61191bb6e84da..b0bbadc2495506d874846636eaf16588f3760d6d
@@@ -2077,49 -2077,6 +2077,6 @@@ migrate_task(struct task_struct *p, in
        return 1;
  }
  
- /*
-  * wait_task_context_switch - wait for a thread to complete at least one
-  *                            context switch.
-  *
-  * @p must not be current.
-  */
- void wait_task_context_switch(struct task_struct *p)
- {
-       unsigned long nvcsw, nivcsw, flags;
-       int running;
-       struct rq *rq;
-       nvcsw   = p->nvcsw;
-       nivcsw  = p->nivcsw;
-       for (;;) {
-               /*
-                * The runqueue is assigned before the actual context
-                * switch. We need to take the runqueue lock.
-                *
-                * We could check initially without the lock but it is
-                * very likely that we need to take the lock in every
-                * iteration.
-                */
-               rq = task_rq_lock(p, &flags);
-               running = task_running(rq, p);
-               task_rq_unlock(rq, &flags);
-               if (likely(!running))
-                       break;
-               /*
-                * The switch count is incremented before the actual
-                * context switch. We thus wait for two switches to be
-                * sure at least one completed.
-                */
-               if ((p->nvcsw - nvcsw) > 1)
-                       break;
-               if ((p->nivcsw - nivcsw) > 1)
-                       break;
-               cpu_relax();
-       }
- }
  /*
   * wait_task_inactive - wait for a thread to unschedule.
   *
@@@ -4903,7 -4860,7 +4860,7 @@@ SYSCALL_DEFINE3(sched_getaffinity, pid_
        int ret;
        cpumask_var_t mask;
  
 -      if (len < nr_cpu_ids)
 +      if ((len * BITS_PER_BYTE) < nr_cpu_ids)
                return -EINVAL;
        if (len & (sizeof(unsigned long)-1))
                return -EINVAL;