// SPDX-License-Identifier: GPL-2.0-only
/*
 * AMD Memory Encryption Support
 *
 * Copyright (C) 2019 SUSE
 *
 * Author: Joerg Roedel <jroedel@suse.de>
 */
10 #define pr_fmt(fmt) "SEV-ES: " fmt
12 #include <linux/sched/debug.h> /* For show_regs() */
13 #include <linux/percpu-defs.h>
14 #include <linux/mem_encrypt.h>
15 #include <linux/lockdep.h>
16 #include <linux/printk.h>
17 #include <linux/mm_types.h>
18 #include <linux/set_memory.h>
19 #include <linux/memblock.h>
20 #include <linux/kernel.h>
23 #include <asm/cpu_entry_area.h>
24 #include <asm/sev-es.h>
25 #include <asm/insn-eval.h>
26 #include <asm/fpu/internal.h>
27 #include <asm/processor.h>
28 #include <asm/realmode.h>
29 #include <asm/traps.h>
/* For early boot hypervisor communication in SEV-ES enabled guests */
static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);

/*
 * Needs to be in the .data section because we need it NULL before bss is
 * cleared
 */
static struct ghcb __initdata *boot_ghcb;
41 /* #VC handler runtime per-CPU data */
42 struct sev_es_runtime_data {
43 struct ghcb ghcb_page;
45 /* Physical storage for the per-CPU IST stack of the #VC handler */
46 char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
49 * Physical storage for the per-CPU fall-back stack of the #VC handler.
50 * The fall-back stack is used when it is not safe to switch back to the
51 * interrupted stack in the #VC entry code.
53 char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
56 * Reserve one page per CPU as backup storage for the unencrypted GHCB.
57 * It is needed when an NMI happens while the #VC handler uses the real
58 * GHCB, and the NMI handler itself is causing another #VC exception. In
59 * that case the GHCB content of the first handler needs to be backed up
62 struct ghcb backup_ghcb;
65 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
66 * There is no need for it to be atomic, because nothing is written to
67 * the GHCB between the read and the write of ghcb_active. So it is safe
68 * to use it when a nested #VC exception happens before the write.
70 * This is necessary for example in the #VC->NMI->#VC case when the NMI
71 * happens while the first #VC handler uses the GHCB. When the NMI code
72 * raises a second #VC handler it might overwrite the contents of the
73 * GHCB written by the first handler. To avoid this the content of the
74 * GHCB is saved and restored when the GHCB is detected to be in use
78 bool backup_ghcb_active;
85 static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
86 DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
88 /* Needed in vc_early_forward_exception */
89 void do_early_exception(struct pt_regs *regs, int trapnr);
91 static void __init setup_vc_stacks(int cpu)
93 struct sev_es_runtime_data *data;
94 struct cpu_entry_area *cea;
98 data = per_cpu(runtime_data, cpu);
99 cea = get_cpu_entry_area(cpu);
101 /* Map #VC IST stack */
102 vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
103 pa = __pa(data->ist_stack);
104 cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
106 /* Map VC fall-back stack */
107 vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
108 pa = __pa(data->fallback_stack);
109 cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
112 static __always_inline bool on_vc_stack(unsigned long sp)
114 return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
118 * This function handles the case when an NMI is raised in the #VC exception
119 * handler entry code. In this case, the IST entry for #VC must be adjusted, so
120 * that any subsequent #VC exception will not overwrite the stack contents of the
121 * interrupted #VC handler.
123 * The IST entry is adjusted unconditionally so that it can be also be
124 * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
125 * sev_es_ist_exit() call may adjust back the IST entry too early.
127 void noinstr __sev_es_ist_enter(struct pt_regs *regs)
129 unsigned long old_ist, new_ist;
131 /* Read old IST entry */
132 old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
134 /* Make room on the IST stack */
135 if (on_vc_stack(regs->sp))
136 new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
138 new_ist = old_ist - sizeof(old_ist);
140 /* Store old IST entry */
141 *(unsigned long *)new_ist = old_ist;
143 /* Set new IST entry */
144 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
147 void noinstr __sev_es_ist_exit(void)
152 ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
154 if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
157 /* Read back old IST entry and write it to the TSS */
158 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
161 static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
163 struct sev_es_runtime_data *data;
166 data = this_cpu_read(runtime_data);
167 ghcb = &data->ghcb_page;
169 if (unlikely(data->ghcb_active)) {
170 /* GHCB is already in use - save its contents */
172 if (unlikely(data->backup_ghcb_active))
175 /* Mark backup_ghcb active before writing to it */
176 data->backup_ghcb_active = true;
178 state->ghcb = &data->backup_ghcb;
180 /* Backup GHCB content */
181 *state->ghcb = *ghcb;
184 data->ghcb_active = true;
190 static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
192 struct sev_es_runtime_data *data;
195 data = this_cpu_read(runtime_data);
196 ghcb = &data->ghcb_page;
199 /* Restore GHCB from Backup */
200 *ghcb = *state->ghcb;
201 data->backup_ghcb_active = false;
204 data->ghcb_active = false;
208 static inline u64 sev_es_rd_ghcb_msr(void)
210 return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
213 static inline void sev_es_wr_ghcb_msr(u64 val)
218 high = (u32)(val >> 32);
220 native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
223 static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
224 unsigned char *buffer)
226 return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
229 static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
231 char buffer[MAX_INSN_SIZE];
235 res = vc_fetch_insn_kernel(ctxt, buffer);
236 if (unlikely(res == -EFAULT)) {
237 ctxt->fi.vector = X86_TRAP_PF;
238 ctxt->fi.error_code = 0;
239 ctxt->fi.cr2 = ctxt->regs->ip;
243 insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
244 insn_get_length(&ctxt->insn);
246 ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
251 static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
252 char *dst, char *buf, size_t size)
254 unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
255 char __user *target = (char __user *)dst;
264 if (put_user(d1, target))
269 if (put_user(d2, target))
274 if (put_user(d4, target))
279 if (put_user(d8, target))
283 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
284 return ES_UNSUPPORTED;
290 if (user_mode(ctxt->regs))
291 error_code |= X86_PF_USER;
293 ctxt->fi.vector = X86_TRAP_PF;
294 ctxt->fi.error_code = error_code;
295 ctxt->fi.cr2 = (unsigned long)dst;
300 static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
301 char *src, char *buf, size_t size)
303 unsigned long error_code = X86_PF_PROT;
304 char __user *s = (char __user *)src;
332 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
333 return ES_UNSUPPORTED;
339 if (user_mode(ctxt->regs))
340 error_code |= X86_PF_USER;
342 ctxt->fi.vector = X86_TRAP_PF;
343 ctxt->fi.error_code = error_code;
344 ctxt->fi.cr2 = (unsigned long)src;
349 /* Include code shared with pre-decompression boot stage */
350 #include "sev-es-shared.c"
353 * This function runs on the first #VC exception after the kernel
354 * switched to virtual addresses.
356 static bool __init sev_es_setup_ghcb(void)
358 /* First make sure the hypervisor talks a supported protocol. */
359 if (!sev_es_negotiate_protocol())
363 * Clear the boot_ghcb. The first exception comes in before the bss
364 * section is cleared.
366 memset(&boot_ghcb_page, 0, PAGE_SIZE);
368 /* Alright - Make the boot-ghcb public */
369 boot_ghcb = &boot_ghcb_page;
374 static void __init alloc_runtime_data(int cpu)
376 struct sev_es_runtime_data *data;
378 data = memblock_alloc(sizeof(*data), PAGE_SIZE);
380 panic("Can't allocate SEV-ES runtime data");
382 per_cpu(runtime_data, cpu) = data;
385 static void __init init_ghcb(int cpu)
387 struct sev_es_runtime_data *data;
390 data = per_cpu(runtime_data, cpu);
392 err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
393 sizeof(data->ghcb_page));
395 panic("Can't map GHCBs unencrypted");
397 memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
399 data->ghcb_active = false;
400 data->backup_ghcb_active = false;
403 void __init sev_es_init_vc_handling(void)
407 BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
409 if (!sev_es_active())
412 /* Enable SEV-ES special handling */
413 static_branch_enable(&sev_es_enable_key);
415 /* Initialize per-cpu GHCB pages */
416 for_each_possible_cpu(cpu) {
417 alloc_runtime_data(cpu);
419 setup_vc_stacks(cpu);
422 /* Secondary CPUs use the runtime #VC handler */
423 initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
426 static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
428 int trapnr = ctxt->fi.vector;
430 if (trapnr == X86_TRAP_PF)
431 native_write_cr2(ctxt->fi.cr2);
433 ctxt->regs->orig_ax = ctxt->fi.error_code;
434 do_early_exception(ctxt->regs, trapnr);
437 static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
439 unsigned long exit_code)
441 enum es_result result;
446 * Unexpected #VC exception
448 result = ES_UNSUPPORTED;
454 static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
456 long error_code = ctxt->fi.error_code;
457 int trapnr = ctxt->fi.vector;
459 ctxt->regs->orig_ax = ctxt->fi.error_code;
463 exc_general_protection(ctxt->regs, error_code);
466 exc_invalid_op(ctxt->regs);
469 pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
474 static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
476 unsigned long sp = (unsigned long)regs;
478 return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
482 * Main #VC exception handler. It is called when the entry code was able to
483 * switch off the IST to a safe kernel stack.
485 * With the current implementation it is always possible to switch to a safe
486 * stack because #VC exceptions only happen at known places, like intercepted
487 * instructions or accesses to MMIO areas/IO ports. They can also happen with
488 * code instrumentation when the hypervisor intercepts #DB, but the critical
489 * paths are forbidden to be instrumented, so #DB exceptions currently also
490 * only happen in safe places.
492 DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
494 struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
495 struct ghcb_state state;
496 struct es_em_ctxt ctxt;
497 enum es_result result;
500 lockdep_assert_irqs_disabled();
501 instrumentation_begin();
504 * This is invoked through an interrupt gate, so IRQs are disabled. The
505 * code below might walk page-tables for user or kernel addresses, so
506 * keep the IRQs disabled to protect us against concurrent TLB flushes.
509 ghcb = sev_es_get_ghcb(&state);
512 * Mark GHCBs inactive so that panic() is able to print the
515 data->ghcb_active = false;
516 data->backup_ghcb_active = false;
518 panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
521 vc_ghcb_invalidate(ghcb);
522 result = vc_init_em_ctxt(&ctxt, regs, error_code);
525 result = vc_handle_exitcode(&ctxt, ghcb, error_code);
527 sev_es_put_ghcb(&state);
529 /* Done - now check the result */
532 vc_finish_insn(&ctxt);
535 pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
536 error_code, regs->ip);
539 pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
540 error_code, regs->ip);
542 case ES_DECODE_FAILED:
543 pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
544 error_code, regs->ip);
547 vc_forward_exception(&ctxt);
553 pr_emerg("Unknown result in %s():%d\n", __func__, result);
555 * Emulating the instruction which caused the #VC exception
556 * failed - can't continue so print debug information
562 instrumentation_end();
567 if (user_mode(regs)) {
569 * Do not kill the machine if user-space triggered the
570 * exception. Send SIGBUS instead and let user-space deal with
573 force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
575 pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
578 /* Show some debug info */
581 /* Ask hypervisor to sev_es_terminate */
582 sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
584 /* If that fails and we get here - just panic */
585 panic("Returned from Terminate-Request to Hypervisor\n");
/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
{
	instrumentation_begin();
	panic("Can't handle #VC exception from unsupported context\n");
	instrumentation_end();
}
599 DEFINE_IDTENTRY_VC(exc_vmm_communication)
601 if (likely(!on_vc_fallback_stack(regs)))
602 safe_stack_exc_vmm_communication(regs, error_code);
604 ist_exc_vmm_communication(regs, error_code);
607 bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
609 unsigned long exit_code = regs->orig_ax;
610 struct es_em_ctxt ctxt;
611 enum es_result result;
613 /* Do initial setup or terminate the guest */
614 if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
615 sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
617 vc_ghcb_invalidate(boot_ghcb);
619 result = vc_init_em_ctxt(&ctxt, regs, exit_code);
621 result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
623 /* Done - now check the result */
626 vc_finish_insn(&ctxt);
629 early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
630 exit_code, regs->ip);
633 early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
634 exit_code, regs->ip);
636 case ES_DECODE_FAILED:
637 early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
638 exit_code, regs->ip);
641 vc_early_forward_exception(&ctxt);