// SPDX-License-Identifier: GPL-2.0-only
/*
 * AMD Memory Encryption Support
 *
 * Copyright (C) 2019 SUSE
 *
 * Author: Joerg Roedel <jroedel@suse.de>
 */
10 #define pr_fmt(fmt) "SEV-ES: " fmt
12 #include <linux/sched/debug.h> /* For show_regs() */
13 #include <linux/percpu-defs.h>
14 #include <linux/mem_encrypt.h>
15 #include <linux/lockdep.h>
16 #include <linux/printk.h>
17 #include <linux/mm_types.h>
18 #include <linux/set_memory.h>
19 #include <linux/memblock.h>
20 #include <linux/kernel.h>
23 #include <asm/cpu_entry_area.h>
24 #include <asm/sev-es.h>
25 #include <asm/insn-eval.h>
26 #include <asm/fpu/internal.h>
27 #include <asm/processor.h>
28 #include <asm/realmode.h>
29 #include <asm/traps.h>
/* For early boot hypervisor communication in SEV-ES enabled guests */
static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);

/*
 * Needs to be in the .data section because we need it NULL before bss is
 * cleared
 */
static struct ghcb __initdata *boot_ghcb;
41 /* #VC handler runtime per-CPU data */
42 struct sev_es_runtime_data {
43 struct ghcb ghcb_page;
45 /* Physical storage for the per-CPU IST stack of the #VC handler */
46 char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
49 * Physical storage for the per-CPU fall-back stack of the #VC handler.
50 * The fall-back stack is used when it is not safe to switch back to the
51 * interrupted stack in the #VC entry code.
53 char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
56 * Reserve one page per CPU as backup storage for the unencrypted GHCB.
57 * It is needed when an NMI happens while the #VC handler uses the real
58 * GHCB, and the NMI handler itself is causing another #VC exception. In
59 * that case the GHCB content of the first handler needs to be backed up
62 struct ghcb backup_ghcb;
65 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
66 * There is no need for it to be atomic, because nothing is written to
67 * the GHCB between the read and the write of ghcb_active. So it is safe
68 * to use it when a nested #VC exception happens before the write.
70 * This is necessary for example in the #VC->NMI->#VC case when the NMI
71 * happens while the first #VC handler uses the GHCB. When the NMI code
72 * raises a second #VC handler it might overwrite the contents of the
73 * GHCB written by the first handler. To avoid this the content of the
74 * GHCB is saved and restored when the GHCB is detected to be in use
78 bool backup_ghcb_active;
85 static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
86 DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
88 /* Needed in vc_early_forward_exception */
89 void do_early_exception(struct pt_regs *regs, int trapnr);
91 static void __init setup_vc_stacks(int cpu)
93 struct sev_es_runtime_data *data;
94 struct cpu_entry_area *cea;
98 data = per_cpu(runtime_data, cpu);
99 cea = get_cpu_entry_area(cpu);
101 /* Map #VC IST stack */
102 vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
103 pa = __pa(data->ist_stack);
104 cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
106 /* Map VC fall-back stack */
107 vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
108 pa = __pa(data->fallback_stack);
109 cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
112 static __always_inline bool on_vc_stack(unsigned long sp)
114 return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
118 * This function handles the case when an NMI is raised in the #VC exception
119 * handler entry code. In this case, the IST entry for #VC must be adjusted, so
120 * that any subsequent #VC exception will not overwrite the stack contents of the
121 * interrupted #VC handler.
123 * The IST entry is adjusted unconditionally so that it can be also be
124 * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
125 * sev_es_ist_exit() call may adjust back the IST entry too early.
127 void noinstr __sev_es_ist_enter(struct pt_regs *regs)
129 unsigned long old_ist, new_ist;
131 /* Read old IST entry */
132 old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
134 /* Make room on the IST stack */
135 if (on_vc_stack(regs->sp))
136 new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
138 new_ist = old_ist - sizeof(old_ist);
140 /* Store old IST entry */
141 *(unsigned long *)new_ist = old_ist;
143 /* Set new IST entry */
144 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
147 void noinstr __sev_es_ist_exit(void)
152 ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
154 if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
157 /* Read back old IST entry and write it to the TSS */
158 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
161 static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
163 struct sev_es_runtime_data *data;
166 data = this_cpu_read(runtime_data);
167 ghcb = &data->ghcb_page;
169 if (unlikely(data->ghcb_active)) {
170 /* GHCB is already in use - save its contents */
172 if (unlikely(data->backup_ghcb_active))
175 /* Mark backup_ghcb active before writing to it */
176 data->backup_ghcb_active = true;
178 state->ghcb = &data->backup_ghcb;
180 /* Backup GHCB content */
181 *state->ghcb = *ghcb;
184 data->ghcb_active = true;
190 static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
192 struct sev_es_runtime_data *data;
195 data = this_cpu_read(runtime_data);
196 ghcb = &data->ghcb_page;
199 /* Restore GHCB from Backup */
200 *ghcb = *state->ghcb;
201 data->backup_ghcb_active = false;
204 data->ghcb_active = false;
208 static inline u64 sev_es_rd_ghcb_msr(void)
210 return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
213 static inline void sev_es_wr_ghcb_msr(u64 val)
218 high = (u32)(val >> 32);
220 native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
223 static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
224 unsigned char *buffer)
226 return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
229 static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
231 char buffer[MAX_INSN_SIZE];
235 res = vc_fetch_insn_kernel(ctxt, buffer);
236 if (unlikely(res == -EFAULT)) {
237 ctxt->fi.vector = X86_TRAP_PF;
238 ctxt->fi.error_code = 0;
239 ctxt->fi.cr2 = ctxt->regs->ip;
243 insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
244 insn_get_length(&ctxt->insn);
246 ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
251 static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
252 char *dst, char *buf, size_t size)
254 unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
255 char __user *target = (char __user *)dst;
264 if (put_user(d1, target))
269 if (put_user(d2, target))
274 if (put_user(d4, target))
279 if (put_user(d8, target))
283 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
284 return ES_UNSUPPORTED;
290 if (user_mode(ctxt->regs))
291 error_code |= X86_PF_USER;
293 ctxt->fi.vector = X86_TRAP_PF;
294 ctxt->fi.error_code = error_code;
295 ctxt->fi.cr2 = (unsigned long)dst;
300 static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
301 char *src, char *buf, size_t size)
303 unsigned long error_code = X86_PF_PROT;
304 char __user *s = (char __user *)src;
332 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
333 return ES_UNSUPPORTED;
339 if (user_mode(ctxt->regs))
340 error_code |= X86_PF_USER;
342 ctxt->fi.vector = X86_TRAP_PF;
343 ctxt->fi.error_code = error_code;
344 ctxt->fi.cr2 = (unsigned long)src;
349 /* Include code shared with pre-decompression boot stage */
350 #include "sev-es-shared.c"
353 * This function runs on the first #VC exception after the kernel
354 * switched to virtual addresses.
356 static bool __init sev_es_setup_ghcb(void)
358 /* First make sure the hypervisor talks a supported protocol. */
359 if (!sev_es_negotiate_protocol())
363 * Clear the boot_ghcb. The first exception comes in before the bss
364 * section is cleared.
366 memset(&boot_ghcb_page, 0, PAGE_SIZE);
368 /* Alright - Make the boot-ghcb public */
369 boot_ghcb = &boot_ghcb_page;
374 static void __init alloc_runtime_data(int cpu)
376 struct sev_es_runtime_data *data;
378 data = memblock_alloc(sizeof(*data), PAGE_SIZE);
380 panic("Can't allocate SEV-ES runtime data");
382 per_cpu(runtime_data, cpu) = data;
385 static void __init init_ghcb(int cpu)
387 struct sev_es_runtime_data *data;
390 data = per_cpu(runtime_data, cpu);
392 err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
393 sizeof(data->ghcb_page));
395 panic("Can't map GHCBs unencrypted");
397 memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
399 data->ghcb_active = false;
400 data->backup_ghcb_active = false;
403 void __init sev_es_init_vc_handling(void)
407 BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
409 if (!sev_es_active())
412 /* Enable SEV-ES special handling */
413 static_branch_enable(&sev_es_enable_key);
415 /* Initialize per-cpu GHCB pages */
416 for_each_possible_cpu(cpu) {
417 alloc_runtime_data(cpu);
419 setup_vc_stacks(cpu);
422 /* Secondary CPUs use the runtime #VC handler */
423 initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
426 static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
428 int trapnr = ctxt->fi.vector;
430 if (trapnr == X86_TRAP_PF)
431 native_write_cr2(ctxt->fi.cr2);
433 ctxt->regs->orig_ax = ctxt->fi.error_code;
434 do_early_exception(ctxt->regs, trapnr);
437 static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
439 unsigned long exit_code)
441 enum es_result result;
446 * Unexpected #VC exception
448 result = ES_UNSUPPORTED;
454 static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
456 long error_code = ctxt->fi.error_code;
457 int trapnr = ctxt->fi.vector;
459 ctxt->regs->orig_ax = ctxt->fi.error_code;
463 exc_general_protection(ctxt->regs, error_code);
466 exc_invalid_op(ctxt->regs);
469 pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
474 static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
476 unsigned long sp = (unsigned long)regs;
478 return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
482 * Main #VC exception handler. It is called when the entry code was able to
483 * switch off the IST to a safe kernel stack.
485 * With the current implementation it is always possible to switch to a safe
486 * stack because #VC exceptions only happen at known places, like intercepted
487 * instructions or accesses to MMIO areas/IO ports. They can also happen with
488 * code instrumentation when the hypervisor intercepts #DB, but the critical
489 * paths are forbidden to be instrumented, so #DB exceptions currently also
490 * only happen in safe places.
492 DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
494 struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
495 struct ghcb_state state;
496 struct es_em_ctxt ctxt;
497 enum es_result result;
500 lockdep_assert_irqs_disabled();
501 instrumentation_begin();
504 * This is invoked through an interrupt gate, so IRQs are disabled. The
505 * code below might walk page-tables for user or kernel addresses, so
506 * keep the IRQs disabled to protect us against concurrent TLB flushes.
509 ghcb = sev_es_get_ghcb(&state);
512 * Mark GHCBs inactive so that panic() is able to print the
515 data->ghcb_active = false;
516 data->backup_ghcb_active = false;
518 panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
521 vc_ghcb_invalidate(ghcb);
522 result = vc_init_em_ctxt(&ctxt, regs, error_code);
525 result = vc_handle_exitcode(&ctxt, ghcb, error_code);
527 sev_es_put_ghcb(&state);
529 /* Done - now check the result */
532 vc_finish_insn(&ctxt);
535 pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
536 error_code, regs->ip);
539 pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
540 error_code, regs->ip);
542 case ES_DECODE_FAILED:
543 pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
544 error_code, regs->ip);
547 vc_forward_exception(&ctxt);
553 pr_emerg("Unknown result in %s():%d\n", __func__, result);
555 * Emulating the instruction which caused the #VC exception
556 * failed - can't continue so print debug information
562 instrumentation_end();
567 if (user_mode(regs)) {
569 * Do not kill the machine if user-space triggered the
570 * exception. Send SIGBUS instead and let user-space deal with
573 force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
575 pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
578 /* Show some debug info */
581 /* Ask hypervisor to sev_es_terminate */
582 sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
584 /* If that fails and we get here - just panic */
585 panic("Returned from Terminate-Request to Hypervisor\n");
/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
{
	instrumentation_begin();
	panic("Can't handle #VC exception from unsupported context\n");
	instrumentation_end();
}
599 DEFINE_IDTENTRY_VC(exc_vmm_communication)
601 if (likely(!on_vc_fallback_stack(regs)))
602 safe_stack_exc_vmm_communication(regs, error_code);
604 ist_exc_vmm_communication(regs, error_code);
607 bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
609 unsigned long exit_code = regs->orig_ax;
610 struct es_em_ctxt ctxt;
611 enum es_result result;
613 /* Do initial setup or terminate the guest */
614 if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
615 sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
617 vc_ghcb_invalidate(boot_ghcb);
619 result = vc_init_em_ctxt(&ctxt, regs, exit_code);
621 result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
623 /* Done - now check the result */
626 vc_finish_insn(&ctxt);
629 early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
630 exit_code, regs->ip);
633 early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
634 exit_code, regs->ip);
636 case ES_DECODE_FAILED:
637 early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
638 exit_code, regs->ip);
641 vc_early_forward_exception(&ctxt);