arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10
  11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13 #include <linux/thread_info.h>
  14 #include <linux/capability.h>
  15 #include <linux/miscdevice.h>
  16 #include <linux/ratelimit.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/rcupdate.h>
  19 #include <linux/kobject.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kernel.h>
  23 #include <linux/percpu.h>
  24 #include <linux/string.h>
  25 #include <linux/device.h>
  26 #include <linux/syscore_ops.h>
  27 #include <linux/delay.h>
  28 #include <linux/ctype.h>
  29 #include <linux/sched.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/types.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/kmod.h>
  35 #include <linux/poll.h>
  36 #include <linux/nmi.h>
  37 #include <linux/cpu.h>
  38 #include <linux/smp.h>
  39 #include <linux/fs.h>
  40 #include <linux/mm.h>
  41 #include <linux/debugfs.h>
  42 #include <linux/irq_work.h>
  43 #include <linux/export.h>
  44
  45 #include <asm/processor.h>
  46 #include <asm/traps.h>
  47 #include <asm/tlbflush.h>
  48 #include <asm/mce.h>
  49 #include <asm/msr.h>
  50
  51 #include "mce-internal.h"
  52
     /*
      * Taken by mce_inject_log() around its call to mce_log(). Holding it is
      * one of the two conditions accepted by the lockdep check in
      * mce_log_get_idx_check() below.
      */
  53 static DEFINE_MUTEX(mce_chrdev_read_mutex);
  54
     /*
      * mce_log_get_idx_check(p) - read the log index @p with acquire ordering.
      *
      * Emits an RCU lockdep warning unless the caller is either inside an
      * RCU-sched read-side section or holds mce_chrdev_read_mutex, then
      * evaluates to smp_load_acquire(&(p)).
      */
  55 #define mce_log_get_idx_check(p) \
  56 ({ \
  57         RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
  58                          !lockdep_is_held(&mce_chrdev_read_mutex), \
  59                          "suspicious mce_log_get_idx_check() usage"); \
  60         smp_load_acquire(&(p)); \
  61 })
  62
  63 /* sysfs synchronization */
  64 static DEFINE_MUTEX(mce_sysfs_mutex);
  65
  66 #define CREATE_TRACE_POINTS
  67 #include <trace/events/mce.h>
  68
  69 #define SPINUNIT                100     /* 100ns */
  70
  71 DEFINE_PER_CPU(unsigned, mce_exception_count);
  72
  73 struct mce_bank *mce_banks __read_mostly;
  74 struct mce_vendor_flags mce_flags __read_mostly;
  75
  76 struct mca_config mca_cfg __read_mostly = {
  77         .bootlog  = -1,
  78         /*
  79          * Tolerant levels:
  80          * 0: always panic on uncorrected errors, log corrected errors
  81          * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  82          * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  83          * 3: never panic or SIGBUS, log all errors (for testing only)
  84          */
  85         .tolerant = 1,
  86         .monarch_timeout = -1
  87 };
  88
  89 /* User mode helper program triggered by machine check event */
  90 static unsigned long            mce_need_notify;
  91 static char                     mce_helper[128];
  92 static char                     *mce_helper_argv[2] = { mce_helper, NULL };
  93
  94 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  95
  96 static DEFINE_PER_CPU(struct mce, mces_seen);
  97 static int                      cpu_missing;
  98
  99 /*
 100  * MCA banks polled by the period polling timer for corrected events.
 101  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 102  */
 103 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 104         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 105 };
 106
 107 /*
 108  * MCA banks controlled through firmware first for corrected errors.
 109  * This is a global list of banks for which we won't enable CMCI and we
 110  * won't poll. Firmware controls these banks and is responsible for
 111  * reporting corrected errors through GHES. Uncorrected/recoverable
 112  * errors are still notified through a machine check.
 113  */
 114 mce_banks_t mce_banks_ce_disabled;
 115
 116 static struct work_struct mce_work;
 117 static struct irq_work mce_irq_work;
 118
 119 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 120 static int mce_usable_address(struct mce *m);
 121
 122 /*
 123  * CPU/chipset specific EDAC code can register a notifier call here to print
 124  * MCE errors in a human-readable form.
 125  */
 126 ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 127
 128 /* Do initial initialization of a struct mce */
 129 void mce_setup(struct mce *m)
 130 {
 131         memset(m, 0, sizeof(struct mce));
 132         m->cpu = m->extcpu = smp_processor_id();
 133         m->tsc = rdtsc();
 134         /* We hope get_seconds stays lockless */
 135         m->time = get_seconds();
 136         m->cpuvendor = boot_cpu_data.x86_vendor;
 137         m->cpuid = cpuid_eax(1);
 138         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 139         m->apicid = cpu_data(m->extcpu).initial_apicid;
 140         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 141
 142         m->microcode = boot_cpu_data.microcode;
 143 }
 144
 145 DEFINE_PER_CPU(struct mce, injectm);
 146 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 147
 148 /*
 149  * Lockless MCE logging infrastructure.
 150  * This avoids deadlocks on printk locks without having to break locks. Also
 151  * separate MCEs from kernel messages to avoid bogus bug reports.
 152  */
 153
 154 static struct mce_log mcelog = {
 155         .signature      = MCE_LOG_SIGNATURE,
 156         .len            = MCE_LOG_LEN,
 157         .recordlen      = sizeof(struct mce),
 158 };
 159
     /*
      * mce_log - record one machine check event.
      *
      * Emits the mce_record tracepoint and hands @mce to mce_gen_pool_add(),
      * queueing mce_irq_work when that call returns 0.  The record is then
      * copied into a free slot of the global mcelog buffer.  A slot is claimed
      * with cmpxchg() on mcelog.next instead of taking a lock.
      *
      * If no slot index below MCE_LOG_LEN is free, MCE_OVERFLOW is set in
      * mcelog.flags and the record is not stored in mcelog.
      *
      * @mce->finished is cleared on entry and set to 1 only after the copy has
      * been marked finished in the buffer; on the overflow path it stays 0.
      */
 160 void mce_log(struct mce *mce)
 161 {
 162         unsigned next, entry;
 163
 164         /* Emit the trace record: */
 165         trace_mce_record(mce);
 166
 167         if (!mce_gen_pool_add(mce))
 168                 irq_work_queue(&mce_irq_work);
 169
 170         mce->finished = 0;
 171         wmb();
             /* Outer loop: retry until the cmpxchg() below claims a slot. */
 172         for (;;) {
 173                 entry = mce_log_get_idx_check(mcelog.next);
                     /* Inner loop: advance past slots still marked finished. */
 174                 for (;;) {
 175
 176                         /*
 177                          * When the buffer fills up discard new entries.
 178                          * Assume that the earlier errors are the more
 179                          * interesting ones:
 180                          */
 181                         if (entry >= MCE_LOG_LEN) {
 182                                 set_bit(MCE_OVERFLOW,
 183                                         (unsigned long *)&mcelog.flags);
 184                                 return;
 185                         }
 186                         /* Old left over entry. Skip: */
 187                         if (mcelog.entry[entry].finished) {
 188                                 entry++;
 189                                 continue;
 190                         }
 191                         break;
 192                 }
 193                 smp_rmb();
 194                 next = entry + 1;
                     /* Claim slot 'entry' only if mcelog.next still equals it. */
 195                 if (cmpxchg(&mcelog.next, entry, next) == entry)
 196                         break;
 197         }
             /* The copy carries finished == 0; the flag is set after the wmb(). */
 198         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 199         wmb();
 200         mcelog.entry[entry].finished = 1;
 201         wmb();
 202
 203         mce->finished = 1;
 204         set_bit(0, &mce_need_notify);
 205 }
 206
     /*
      * mce_inject_log - log @m via mce_log() while holding
      * mce_chrdev_read_mutex.  Holding the mutex satisfies the lockdep check
      * in mce_log_get_idx_check().  Exported for GPL modules.
      */
 207 void mce_inject_log(struct mce *m)
 208 {
 209         mutex_lock(&mce_chrdev_read_mutex);
 210         mce_log(m);
 211         mutex_unlock(&mce_chrdev_read_mutex);
 212 }
 213 EXPORT_SYMBOL_GPL(mce_inject_log);
 214
 215 static struct notifier_block mce_srao_nb;
 216
 217 void mce_register_decode_chain(struct notifier_block *nb)
 218 {
 219         /* Ensure SRAO notifier has the highest priority in the decode chain. */
 220         if (nb != &mce_srao_nb && nb->priority == INT_MAX)
 221                 nb->priority -= 1;
 222
 223         atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
 224 }
 225 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 226
     /*
      * mce_unregister_decode_chain - remove @nb from x86_mce_decoder_chain.
      * Exported for GPL modules.
      */
 227 void mce_unregister_decode_chain(struct notifier_block *nb)
 228 {
 229         atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 230 }
 231 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 232
 233 static void print_mce(struct mce *m)
 234 {
 235         int ret = 0;
 236
 237         pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 238                m->extcpu, m->mcgstatus, m->bank, m->status);
 239
 240         if (m->ip) {
 241                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 242                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 243                                 m->cs, m->ip);
 244
 245                 if (m->cs == __KERNEL_CS)
 246                         print_symbol("{%s}", m->ip);
 247                 pr_cont("\n");
 248         }
 249
 250         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 251         if (m->addr)
 252                 pr_cont("ADDR %llx ", m->addr);
 253         if (m->misc)
 254                 pr_cont("MISC %llx ", m->misc);
 255
 256         pr_cont("\n");
 257         /*
 258          * Note this output is parsed by external tools and old fields
 259          * should not be changed.
 260          */
 261         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 262                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 263                 m->microcode);
 264
 265         /*
 266          * Print out human-readable details about the MCE error,
 267          * (if the CPU has an implementation for that)
 268          */
 269         ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 270         if (ret == NOTIFY_STOP)
 271                 return;
 272
 273         pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 274 }
 275
 276 #define PANIC_TIMEOUT 5 /* 5 seconds */
 277
 278 static atomic_t mce_panicked;
 279
 280 static int fake_panic;
 281 static atomic_t mce_fake_panicked;
 282
 283 /* Panic in progress. Enable interrupts and wait for final IPI */
 284 static void wait_for_panic(void)
 285 {
 286         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 287
 288         preempt_disable();
 289         local_irq_enable();
 290         while (timeout-- > 0)
 291                 udelay(1);
 292         if (panic_timeout == 0)
 293                 panic_timeout = mca_cfg.panic_timeout;
 294         panic("Panicing machine check CPU died");
 295 }
 296
 297 static void mce_panic(const char *msg, struct mce *final, char *exp)
 298 {
 299         int i, apei_err = 0;
 300
 301         if (!fake_panic) {
 302                 /*
 303                  * Make sure only one CPU runs in machine check panic
 304                  */
 305                 if (atomic_inc_return(&mce_panicked) > 1)
 306                         wait_for_panic();
 307                 barrier();
 308
 309                 bust_spinlocks(1);
 310                 console_verbose();
 311         } else {
 312                 /* Don't log too much for fake panic */
 313                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 314                         return;
 315         }
 316         /* First print corrected ones that are still unlogged */
 317         for (i = 0; i < MCE_LOG_LEN; i++) {
 318                 struct mce *m = &mcelog.entry[i];
 319                 if (!(m->status & MCI_STATUS_VAL))
 320                         continue;
 321                 if (!(m->status & MCI_STATUS_UC)) {
 322                         print_mce(m);
 323                         if (!apei_err)
 324                                 apei_err = apei_write_mce(m);
 325                 }
 326         }
 327         /* Now print uncorrected but with the final one last */
 328         for (i = 0; i < MCE_LOG_LEN; i++) {
 329                 struct mce *m = &mcelog.entry[i];
 330                 if (!(m->status & MCI_STATUS_VAL))
 331                         continue;
 332                 if (!(m->status & MCI_STATUS_UC))
 333                         continue;
 334                 if (!final || memcmp(m, final, sizeof(struct mce))) {
 335                         print_mce(m);
 336                         if (!apei_err)
 337                                 apei_err = apei_write_mce(m);
 338                 }
 339         }
 340         if (final) {
 341                 print_mce(final);
 342                 if (!apei_err)
 343                         apei_err = apei_write_mce(final);
 344         }
 345         if (cpu_missing)
 346                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 347         if (exp)
 348                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 349         if (!fake_panic) {
 350                 if (panic_timeout == 0)
 351                         panic_timeout = mca_cfg.panic_timeout;
 352                 panic(msg);
 353         } else
 354                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 355 }
 356
 357 /* Support code for software error injection */
 358
 359 static int msr_to_offset(u32 msr)
 360 {
 361         unsigned bank = __this_cpu_read(injectm.bank);
 362
 363         if (msr == mca_cfg.rip_msr)
 364                 return offsetof(struct mce, ip);
 365         if (msr == MSR_IA32_MCx_STATUS(bank))
 366                 return offsetof(struct mce, status);
 367         if (msr == MSR_IA32_MCx_ADDR(bank))
 368                 return offsetof(struct mce, addr);
 369         if (msr == MSR_IA32_MCx_MISC(bank))
 370                 return offsetof(struct mce, misc);
 371         if (msr == MSR_IA32_MCG_STATUS)
 372                 return offsetof(struct mce, mcgstatus);
 373         return -1;
 374 }
 375
 376 /* MSR access wrappers used for error injection */
 377 static u64 mce_rdmsrl(u32 msr)
 378 {
 379         u64 v;
 380
 381         if (__this_cpu_read(injectm.finished)) {
 382                 int offset = msr_to_offset(msr);
 383
 384                 if (offset < 0)
 385                         return 0;
 386                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 387         }
 388
 389         if (rdmsrl_safe(msr, &v)) {
 390                 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 391                 /*
 392                  * Return zero in case the access faulted. This should
 393                  * not happen normally but can happen if the CPU does
 394                  * something weird, or if the code is buggy.
 395                  */
 396                 v = 0;
 397         }
 398
 399         return v;
 400 }
 401
 402 static void mce_wrmsrl(u32 msr, u64 v)
 403 {
 404         if (__this_cpu_read(injectm.finished)) {
 405                 int offset = msr_to_offset(msr);
 406
 407                 if (offset >= 0)
 408                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 409                 return;
 410         }
 411         wrmsrl(msr, v);
 412 }
 413
 414 /*
 415  * Collect all global (w.r.t. this processor) status about this machine
 416  * check into our "mce" struct so that we can use it later to assess
 417  * the severity of the problem as we read per-bank specific details.
 418  */
 419 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 420 {
 421         mce_setup(m);
 422
 423         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 424         if (regs) {
 425                 /*
 426                  * Get the address of the instruction at the time of
 427                  * the machine check error.
 428                  */
 429                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 430                         m->ip = regs->ip;
 431                         m->cs = regs->cs;
 432
 433                         /*
 434                          * When in VM86 mode make the cs look like ring 3
 435                          * always. This is a lie, but it's better than passing
 436                          * the additional vm86 bit around everywhere.
 437                          */
 438                         if (v8086_mode(regs))
 439                                 m->cs |= 3;
 440                 }
 441                 /* Use accurate RIP reporting if available. */
 442                 if (mca_cfg.rip_msr)
 443                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 444         }
 445 }
 446
 447 int mce_available(struct cpuinfo_x86 *c)
 448 {
 449         if (mca_cfg.disabled)
 450                 return 0;
 451         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 452 }
 453
 454 static void mce_schedule_work(void)
 455 {
 456         if (!mce_gen_pool_empty() && keventd_up())
 457                 schedule_work(&mce_work);
 458 }
 459
     /*
      * irq_work callback: calls mce_notify_irq() and then mce_schedule_work().
      * @entry is not used.
      */
 460 static void mce_irq_work_cb(struct irq_work *entry)
 461 {
 462         mce_notify_irq();
 463         mce_schedule_work();
 464 }
 465
 466 static void mce_report_event(struct pt_regs *regs)
 467 {
 468         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 469                 mce_notify_irq();
 470                 /*
 471                  * Triggering the work queue here is just an insurance
 472                  * policy in case the syscall exit notify handler
 473                  * doesn't run soon enough or ends up running on the
 474                  * wrong CPU (can happen when audit sleeps)
 475                  */
 476                 mce_schedule_work();
 477                 return;
 478         }
 479
 480         irq_work_queue(&mce_irq_work);
 481 }
 482
 483 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 484                                 void *data)
 485 {
 486         struct mce *mce = (struct mce *)data;
 487         unsigned long pfn;
 488
 489         if (!mce)
 490                 return NOTIFY_DONE;
 491
 492         if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
 493                 pfn = mce->addr >> PAGE_SHIFT;
 494                 memory_failure(pfn, MCE_VECTOR, 0);
 495         }
 496
 497         return NOTIFY_OK;
 498 }
 499 static struct notifier_block mce_srao_nb = {
 500         .notifier_call  = srao_decode_notifier,
 501         .priority = INT_MAX,
 502 };
 503
 504 /*
 505  * Read ADDR and MISC registers.
 506  */
 507 static void mce_read_aux(struct mce *m, int i)
 508 {
 509         if (m->status & MCI_STATUS_MISCV)
 510                 m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 511         if (m->status & MCI_STATUS_ADDRV) {
 512                 m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 513
 514                 /*
 515                  * Mask the reported address by the reported granularity.
 516                  */
 517                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 518                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 519                         m->addr >>= shift;
 520                         m->addr <<= shift;
 521                 }
 522         }
 523 }
 524
 525 static bool memory_error(struct mce *m)
 526 {
 527         struct cpuinfo_x86 *c = &boot_cpu_data;
 528
 529         if (c->x86_vendor == X86_VENDOR_AMD) {
 530                 /*
 531                  * coming soon
 532                  */
 533                 return false;
 534         } else if (c->x86_vendor == X86_VENDOR_INTEL) {
 535                 /*
 536                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 537                  *
 538                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 539                  * indicating a memory error. Bit 8 is used for indicating a
 540                  * cache hierarchy error. The combination of bit 2 and bit 3
 541                  * is used for indicating a `generic' cache hierarchy error
 542                  * But we can't just blindly check the above bits, because if
 543                  * bit 11 is set, then it is a bus/interconnect error - and
 544                  * either way the above bits just gives more detail on what
 545                  * bus/interconnect error happened. Note that bit 12 can be
 546                  * ignored, as it's the "filter" bit.
 547                  */
 548                 return (m->status & 0xef80) == BIT(7) ||
 549                        (m->status & 0xef00) == BIT(8) ||
 550                        (m->status & 0xeffc) == 0xc;
 551         }
 552
 553         return false;
 554 }
 555
 556 DEFINE_PER_CPU(unsigned, mce_poll_count);
 557
 558 /*
 559  * Poll for corrected events or events that happened before reset.
 560  * Those are just logged through /dev/mcelog.
 561  *
 562  * This is executed in standard interrupt context.
 563  *
 564  * Note: spec recommends to panic for fatal unsignalled
 565  * errors here. However this would be quite problematic --
 566  * we would need to reimplement the Monarch handling and
 567  * it would mess up the exclusion between exception handler
 568  * and poll hander -- * so we skip this for now.
 569  * These cases should not happen anyways, or only when the CPU
 570  * is already totally * confused. In this case it's likely it will
 571  * not fully execute the machine check handler either.
 572  */
 573 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 574 {
 575         bool error_logged = false;
 576         struct mce m;
 577         int severity;
 578         int i;
 579
 580         this_cpu_inc(mce_poll_count);
 581
 582         mce_gather_info(&m, NULL);
 583
 584         for (i = 0; i < mca_cfg.banks; i++) {
 585                 if (!mce_banks[i].ctl || !test_bit(i, *b))
 586                         continue;
 587
 588                 m.misc = 0;
 589                 m.addr = 0;
 590                 m.bank = i;
 591                 m.tsc = 0;
 592
 593                 barrier();
 594                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 595                 if (!(m.status & MCI_STATUS_VAL))
 596                         continue;
 597
 598
 599                 /*
 600                  * Uncorrected or signalled events are handled by the exception
 601                  * handler when it is enabled, so don't process those here.
 602                  *
 603                  * TBD do the same check for MCI_STATUS_EN here?
 604                  */
 605                 if (!(flags & MCP_UC) &&
 606                     (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 607                         continue;
 608
 609                 mce_read_aux(&m, i);
 610
 611                 if (!(flags & MCP_TIMESTAMP))
 612                         m.tsc = 0;
 613
 614                 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 615
 616                 /*
 617                  * In the cases where we don't have a valid address after all,
 618                  * do not add it into the ring buffer.
 619                  */
 620                 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
 621                         if (m.status & MCI_STATUS_ADDRV) {
 622                                 m.severity = severity;
 623                                 m.usable_addr = mce_usable_address(&m);
 624
 625                                 if (!mce_gen_pool_add(&m))
 626                                         mce_schedule_work();
 627                         }
 628                 }
 629
 630                 /*
 631                  * Don't get the IP here because it's unlikely to
 632                  * have anything to do with the actual error location.
 633                  */
 634                 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
 635                         error_logged = true;
 636                         mce_log(&m);
 637                 }
 638
 639                 /*
 640                  * Clear state for this bank.
 641                  */
 642                 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 643         }
 644
 645         /*
 646          * Don't clear MCG_STATUS here because it's only defined for
 647          * exceptions.
 648          */
 649
 650         sync_core();
 651
 652         return error_logged;
 653 }
 654 EXPORT_SYMBOL_GPL(machine_check_poll);
 655
 656 /*
 657  * Do a quick check if any of the events requires a panic.
 658  * This decides if we keep the events around or clear them.
 659  */
 660 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 661                           struct pt_regs *regs)
 662 {
 663         int i, ret = 0;
 664         char *tmp;
 665
 666         for (i = 0; i < mca_cfg.banks; i++) {
 667                 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 668                 if (m->status & MCI_STATUS_VAL) {
 669                         __set_bit(i, validp);
 670                         if (quirk_no_way_out)
 671                                 quirk_no_way_out(i, m, regs);
 672                 }
 673
 674                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 675                         m->bank = i;
 676                         *msg = tmp;
 677                         ret = 1;
 678                 }
 679         }
 680         return ret;
 681 }
 682
 683 /*
 684  * Variable to establish order between CPUs while scanning.
 685  * Each CPU spins until mce_executing reaches its own call-in order number.
 686  */
 687 static atomic_t mce_executing;
 688
 689 /*
 690  * Defines order of CPUs on entry. First CPU becomes Monarch.
 691  */
 692 static atomic_t mce_callin;
 693
 694 /*
 695  * Check if a timeout waiting for other CPUs happened.
 696  */
 697 static int mce_timed_out(u64 *t, const char *msg)
 698 {
 699         /*
 700          * The others already did panic for some reason.
 701          * Bail out like in a timeout.
 702          * rmb() to tell the compiler that system_state
 703          * might have been modified by someone else.
 704          */
 705         rmb();
 706         if (atomic_read(&mce_panicked))
 707                 wait_for_panic();
 708         if (!mca_cfg.monarch_timeout)
 709                 goto out;
 710         if ((s64)*t < SPINUNIT) {
 711                 if (mca_cfg.tolerant <= 1)
 712                         mce_panic(msg, NULL, NULL);
 713                 cpu_missing = 1;
 714                 return 1;
 715         }
 716         *t -= SPINUNIT;
 717 out:
 718         touch_nmi_watchdog();
 719         return 0;
 720 }
 721
 722 /*
 723  * The Monarch's reign.  The Monarch is the CPU who entered
 724  * the machine check handler first. It waits for the others to
 725  * raise the exception too and then grades them. When any
 726  * error is fatal panic. Only then let the others continue.
 727  *
 728  * The other CPUs entering the MCE handler will be controlled by the
 729  * Monarch. They are called Subjects.
 730  *
 731  * This way we prevent any potential data corruption in a unrecoverable case
 732  * and also makes sure always all CPU's errors are examined.
 733  *
 734  * Also this detects the case of a machine check event coming from outer
 735  * space (not detected by any CPUs) In this case some external agent wants
 736  * us to shut down, so panic too.
 737  *
 738  * The other CPUs might still decide to panic if the handler happens
 739  * in a unrecoverable place, but in this case the system is in a semi-stable
 740  * state and won't corrupt anything by itself. It's ok to let the others
 741  * continue for a bit first.
 742  *
 743  * All the spin loops have timeouts; when a timeout happens a CPU
 744  * typically elects itself to be Monarch.
 745  */
 746 static void mce_reign(void)
 747 {
 748         int cpu;
 749         struct mce *m = NULL;
 750         int global_worst = 0;
 751         char *msg = NULL;
 752         char *nmsg = NULL;
 753
 754         /*
 755          * This CPU is the Monarch and the other CPUs have run
 756          * through their handlers.
 757          * Grade the severity of the errors of all the CPUs.
 758          */
 759         for_each_possible_cpu(cpu) {
 760                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
 761                                             mca_cfg.tolerant,
 762                                             &nmsg, true);
 763                 if (severity > global_worst) {
 764                         msg = nmsg;
 765                         global_worst = severity;
 766                         m = &per_cpu(mces_seen, cpu);
 767                 }
 768         }
 769
 770         /*
 771          * Cannot recover? Panic here then.
 772          * This dumps all the mces in the log buffer and stops the
 773          * other CPUs.
 774          */
 775         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 776                 mce_panic("Fatal machine check", m, msg);
 777
 778         /*
 779          * For UC somewhere we let the CPU who detects it handle it.
 780          * Also must let continue the others, otherwise the handling
 781          * CPU could deadlock on a lock.
 782          */
 783
 784         /*
 785          * No machine check event found. Must be some external
 786          * source or one CPU is hung. Panic.
 787          */
 788         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 789                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
 790
 791         /*
 792          * Now clear all the mces_seen so that they don't reappear on
 793          * the next mce.
 794          */
 795         for_each_possible_cpu(cpu)
 796                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 797 }
 798
 799 static atomic_t global_nwo;
 800
 801 /*
 802  * Start of Monarch synchronization. This waits until all CPUs have
 803  * entered the exception handler and then determines if any of them
 804  * saw a fatal event that requires panic. Then it executes them
 805  * in the entry order.
 806  * TBD double check parallel CPU hotunplug
      *
      * @no_way_out: on entry, this CPU's own verdict, which is added to
      *              global_nwo.  On a successful return it is overwritten
      *              with the value of global_nwo read after all CPUs have
      *              called in.  It is not written on the -1 paths.
      *
      * Returns -1 when monarch_timeout is 0 or when one of the two wait
      * loops times out (global_nwo is reset to 0 in the timeout case).
      * Otherwise returns this CPU's call-in order, where 1 is the Monarch.
 807  */
 808 static int mce_start(int *no_way_out)
 809 {
 810         int order;
 811         int cpus = num_online_cpus();
 812         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 813
 814         if (!timeout)
 815                 return -1;
 816
 817         atomic_add(*no_way_out, &global_nwo);
 818         /*
 819          * global_nwo should be updated before mce_callin
 820          */
 821         smp_wmb();
 822         order = atomic_inc_return(&mce_callin);
 823
 824         /*
 825          * Wait for everyone.
 826          */
 827         while (atomic_read(&mce_callin) != cpus) {
 828                 if (mce_timed_out(&timeout,
 829                                   "Timeout: Not all CPUs entered broadcast exception handler")) {
 830                         atomic_set(&global_nwo, 0);
 831                         return -1;
 832                 }
 833                 ndelay(SPINUNIT);
 834         }
 835
 836         /*
 837          * mce_callin should be read before global_nwo
 838          */
 839         smp_rmb();
 840
 841         if (order == 1) {
 842                 /*
 843                  * Monarch: Starts executing now, the others wait.
 844                  */
 845                 atomic_set(&mce_executing, 1);
 846         } else {
 847                 /*
 848                  * Subject: Now start the scanning loop one by one in
 849                  * the original callin order.
 850                  * This way when there are any shared banks it will be
 851                  * only seen by one CPU before cleared, avoiding duplicates.
 852                  */
                     /* The same timeout budget as the call-in wait above is used. */
 853                 while (atomic_read(&mce_executing) < order) {
 854                         if (mce_timed_out(&timeout,
 855                                           "Timeout: Subject CPUs unable to finish machine check processing")) {
 856                                 atomic_set(&global_nwo, 0);
 857                                 return -1;
 858                         }
 859                         ndelay(SPINUNIT);
 860                 }
 861         }
 862
 863         /*
 864          * Cache the global no_way_out state.
 865          */
 866         *no_way_out = atomic_read(&global_nwo);
 867
 868         return order;
 869 }
 870
 871 /*
 872  * Synchronize between CPUs after main scanning loop.
 873  * This invokes the bulk of the Monarch processing.
 874  */
 875 static int mce_end(int order)
 876 {
 877         int ret = -1;
 878         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 879
 880         if (!timeout)
 881                 goto reset;
 882         if (order < 0)
 883                 goto reset;
 884
 885         /*
 886          * Allow others to run.
 887          */
 888         atomic_inc(&mce_executing);
 889
 890         if (order == 1) {
 891                 /* CHECKME: Can this race with a parallel hotplug? */
 892                 int cpus = num_online_cpus();
 893
 894                 /*
 895                  * Monarch: Wait for everyone to go through their scanning
 896                  * loops.
 897                  */
 898                 while (atomic_read(&mce_executing) <= cpus) {
 899                         if (mce_timed_out(&timeout,
 900                                           "Timeout: Monarch CPU unable to finish machine check processing"))
 901                                 goto reset;
 902                         ndelay(SPINUNIT);
 903                 }
 904
 905                 mce_reign();
 906                 barrier();
 907                 ret = 0;
 908         } else {
 909                 /*
 910                  * Subject: Wait for Monarch to finish.
 911                  */
 912                 while (atomic_read(&mce_executing) != 0) {
 913                         if (mce_timed_out(&timeout,
 914                                           "Timeout: Monarch CPU did not finish machine check processing"))
 915                                 goto reset;
 916                         ndelay(SPINUNIT);
 917                 }
 918
 919                 /*
 920                  * Don't reset anything. That's done by the Monarch.
 921                  */
 922                 return 0;
 923         }
 924
 925         /*
 926          * Reset all global state.
 927          */
 928 reset:
 929         atomic_set(&global_nwo, 0);
 930         atomic_set(&mce_callin, 0);
 931         barrier();
 932
 933         /*
 934          * Let others run again.
 935          */
 936         atomic_set(&mce_executing, 0);
 937         return ret;
 938 }
 939
 940 /*
 941  * Check if the address reported by the CPU is in a format we can parse.
 942  * It would be possible to add code for most other cases, but all would
 943  * be somewhat complicated (e.g. segment offset would require an instruction
 944  * parser). So only support physical addresses up to page granuality for now.
 945  */
 946 static int mce_usable_address(struct mce *m)
 947 {
 948         if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 949                 return 0;
 950         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 951                 return 0;
 952         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 953                 return 0;
 954         return 1;
 955 }
 956
 957 static void mce_clear_state(unsigned long *toclear)
 958 {
 959         int i;
 960
 961         for (i = 0; i < mca_cfg.banks; i++) {
 962                 if (test_bit(i, toclear))
 963                         mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 964         }
 965 }
 966
 967 /*
 968  * The actual machine check handler. This only handles real
 969  * exceptions when something got corrupted coming in through int 18.
 970  *
 971  * This is executed in NMI context not subject to normal locking rules. This
 972  * implies that most kernel services cannot be safely used. Don't even
 973  * think about putting a printk in there!
 974  *
 975  * On Intel systems this is entered on all CPUs in parallel through
 976  * MCE broadcast. However some CPUs might be broken beyond repair,
 977  * so be always careful when synchronizing with others.
 978  */
 979 void do_machine_check(struct pt_regs *regs, long error_code)
 980 {
 981         struct mca_config *cfg = &mca_cfg;
 982         struct mce m, *final;
 983         int i;
 984         int worst = 0;
 985         int severity;
 986
 987         /*
 988          * Establish sequential order between the CPUs entering the machine
 989          * check handler.
 990          */
 991         int order = -1;
 992         /*
 993          * If no_way_out gets set, there is no safe way to recover from this
 994          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
 995          */
 996         int no_way_out = 0;
 997         /*
 998          * If kill_it gets set, there might be a way to recover from this
 999          * error.
1000          */
1001         int kill_it = 0;
1002         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1003         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1004         char *msg = "Unknown";
1005         u64 recover_paddr = ~0ull;
1006         int flags = MF_ACTION_REQUIRED;
1007
1008         /*
1009          * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1010          * on Intel.
1011          */
1012         int lmce = 1;
1013
1014         /* If this CPU is offline, just bail out. */
1015         if (cpu_is_offline(smp_processor_id())) {
1016                 u64 mcgstatus;
1017
1018                 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1019                 if (mcgstatus & MCG_STATUS_RIPV) {
1020                         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1021                         return;
1022                 }
1023         }
1024
1025         ist_enter(regs);
1026
1027         this_cpu_inc(mce_exception_count);
1028
1029         if (!cfg->banks)
1030                 goto out;
1031
1032         mce_gather_info(&m, regs);
1033
1034         final = this_cpu_ptr(&mces_seen);
1035         *final = m;
1036
1037         memset(valid_banks, 0, sizeof(valid_banks));
1038         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1039
1040         barrier();
1041
1042         /*
1043          * When no restart IP might need to kill or panic.
1044          * Assume the worst for now, but if we find the
1045          * severity is MCE_AR_SEVERITY we have other options.
1046          */
1047         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1048                 kill_it = 1;
1049
1050         /*
1051          * Check if this MCE is signaled to only this logical processor,
1052          * on Intel only.
1053          */
1054         if (m.cpuvendor == X86_VENDOR_INTEL)
1055                 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1056
1057         /*
1058          * Local machine check may already know that we have to panic.
1059          * Broadcast machine check begins rendezvous in mce_start()
1060          * Go through all banks in exclusion of the other CPUs. This way we
1061          * don't report duplicated events on shared banks because the first one
1062          * to see it will clear it.
1063          */
1064         if (lmce) {
1065                 if (no_way_out)
1066                         mce_panic("Fatal local machine check", &m, msg);
1067         } else {
1068                 order = mce_start(&no_way_out);
1069         }
1070
1071         for (i = 0; i < cfg->banks; i++) {
1072                 __clear_bit(i, toclear);
1073                 if (!test_bit(i, valid_banks))
1074                         continue;
1075                 if (!mce_banks[i].ctl)
1076                         continue;
1077
1078                 m.misc = 0;
1079                 m.addr = 0;
1080                 m.bank = i;
1081
1082                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1083                 if ((m.status & MCI_STATUS_VAL) == 0)
1084                         continue;
1085
1086                 /*
1087                  * Non uncorrected or non signaled errors are handled by
1088                  * machine_check_poll. Leave them alone, unless this panics.
1089                  */
1090                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1091                         !no_way_out)
1092                         continue;
1093
1094                 /*
1095                  * Set taint even when machine check was not enabled.
1096                  */
1097                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1098
1099                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1100
1101                 /*
1102                  * When machine check was for corrected/deferred handler don't
1103                  * touch, unless we're panicing.
1104                  */
1105                 if ((severity == MCE_KEEP_SEVERITY ||
1106                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1107                         continue;
1108                 __set_bit(i, toclear);
1109                 if (severity == MCE_NO_SEVERITY) {
1110                         /*
1111                          * Machine check event was not enabled. Clear, but
1112                          * ignore.
1113                          */
1114                         continue;
1115                 }
1116
1117                 mce_read_aux(&m, i);
1118
1119                 /* assuming valid severity level != 0 */
1120                 m.severity = severity;
1121                 m.usable_addr = mce_usable_address(&m);
1122
1123                 mce_log(&m);
1124
1125                 if (severity > worst) {
1126                         *final = m;
1127                         worst = severity;
1128                 }
1129         }
1130
1131         /* mce_clear_state will clear *final, save locally for use later */
1132         m = *final;
1133
1134         if (!no_way_out)
1135                 mce_clear_state(toclear);
1136
1137         /*
1138          * Do most of the synchronization with other CPUs.
1139          * When there's any problem use only local no_way_out state.
1140          */
1141         if (!lmce) {
1142                 if (mce_end(order) < 0)
1143                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1144         } else {
1145                 /*
1146                  * If there was a fatal machine check we should have
1147                  * already called mce_panic earlier in this function.
1148                  * Since we re-read the banks, we might have found
1149                  * something new. Check again to see if we found a
1150                  * fatal error. We call "mce_severity()" again to
1151                  * make sure we have the right "msg".
1152                  */
1153                 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
1154                         mce_severity(&m, cfg->tolerant, &msg, true);
1155                         mce_panic("Local fatal machine check!", &m, msg);
1156                 }
1157         }
1158
1159         /*
1160          * At insane "tolerant" levels we take no action. Otherwise
1161          * we only die if we have no other choice. For less serious
1162          * issues we try to recover, or limit damage to the current
1163          * process.
1164          */
1165         if (cfg->tolerant < 3) {
1166                 if (no_way_out)
1167                         mce_panic("Fatal machine check on current CPU", &m, msg);
1168                 if (worst == MCE_AR_SEVERITY) {
1169                         recover_paddr = m.addr;
1170                         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1171                                 flags |= MF_MUST_KILL;
1172                 } else if (kill_it) {
1173                         force_sig(SIGBUS, current);
1174                 }
1175         }
1176
1177         if (worst > 0)
1178                 mce_report_event(regs);
1179         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1180 out:
1181         sync_core();
1182
1183         if (recover_paddr == ~0ull)
1184                 goto done;
1185
1186         pr_err("Uncorrected hardware memory error in user-access at %llx",
1187                  recover_paddr);
1188         /*
1189          * We must call memory_failure() here even if the current process is
1190          * doomed. We still need to mark the page as poisoned and alert any
1191          * other users of the page.
1192          */
1193         ist_begin_non_atomic(regs);
1194         local_irq_enable();
1195         if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1196                 pr_err("Memory error not recovered");
1197                 force_sig(SIGBUS, current);
1198         }
1199         local_irq_disable();
1200         ist_end_non_atomic();
1201 done:
1202         ist_exit(regs);
1203 }
1204 EXPORT_SYMBOL_GPL(do_machine_check);
1205
1206 #ifndef CONFIG_MEMORY_FAILURE
1207 int memory_failure(unsigned long pfn, int vector, int flags)
1208 {
1209         /* mce_severity() should not hand us an ACTION_REQUIRED error */
1210         BUG_ON(flags & MF_ACTION_REQUIRED);
1211         pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1212                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1213                pfn);
1214
1215         return 0;
1216 }
1217 #endif
1218
/*
 * Work handler for action-optional processing: picks up the list of
 * faulting pages that do_machine_check() placed into the genpool by
 * calling mce_gen_pool_process(). The work_struct argument is not used.
 */
static void mce_process_work(struct work_struct *unused)
{
	mce_gen_pool_process();
}
1228
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr. The record is filed under the
 * pseudo bank MCE_THERMAL_BANK.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce rec;

	mce_setup(&rec);

	rec.bank   = MCE_THERMAL_BANK;
	rec.status = status;

	mce_log(&rec);
}
#endif /* CONFIG_X86_MCE_INTEL */
1253
1254 /*
1255  * Periodic polling timer for "silent" machine check errors.  If the
1256  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1257  * errors, poll 2x slower (up to check_interval seconds).
1258  */
1259 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1260
1261 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1262 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1263
1264 static unsigned long mce_adjust_timer_default(unsigned long interval)
1265 {
1266         return interval;
1267 }
1268
1269 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1270
1271 static void __restart_timer(struct timer_list *t, unsigned long interval)
1272 {
1273         unsigned long when = jiffies + interval;
1274         unsigned long flags;
1275
1276         local_irq_save(flags);
1277
1278         if (timer_pending(t)) {
1279                 if (time_before(when, t->expires))
1280                         mod_timer_pinned(t, when);
1281         } else {
1282                 t->expires = round_jiffies(when);
1283                 add_timer_on(t, smp_processor_id());
1284         }
1285
1286         local_irq_restore(flags);
1287 }
1288
1289 static void mce_timer_fn(unsigned long data)
1290 {
1291         struct timer_list *t = this_cpu_ptr(&mce_timer);
1292         int cpu = smp_processor_id();
1293         unsigned long iv;
1294
1295         WARN_ON(cpu != data);
1296
1297         iv = __this_cpu_read(mce_next_interval);
1298
1299         if (mce_available(this_cpu_ptr(&cpu_info))) {
1300                 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1301
1302                 if (mce_intel_cmci_poll()) {
1303                         iv = mce_adjust_timer(iv);
1304                         goto done;
1305                 }
1306         }
1307
1308         /*
1309          * Alert userspace if needed. If we logged an MCE, reduce the polling
1310          * interval, otherwise increase the polling interval.
1311          */
1312         if (mce_notify_irq())
1313                 iv = max(iv / 2, (unsigned long) HZ/100);
1314         else
1315                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1316
1317 done:
1318         __this_cpu_write(mce_next_interval, iv);
1319         __restart_timer(t, iv);
1320 }
1321
1322 /*
1323  * Ensure that the timer is firing in @interval from now.
1324  */
1325 void mce_timer_kick(unsigned long interval)
1326 {
1327         struct timer_list *t = this_cpu_ptr(&mce_timer);
1328         unsigned long iv = __this_cpu_read(mce_next_interval);
1329
1330         __restart_timer(t, interval);
1331
1332         if (interval < iv)
1333                 __this_cpu_write(mce_next_interval, interval);
1334 }
1335
1336 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1337 static void mce_timer_delete_all(void)
1338 {
1339         int cpu;
1340
1341         for_each_online_cpu(cpu)
1342                 del_timer_sync(&per_cpu(mce_timer, cpu));
1343 }
1344
1345 static void mce_do_trigger(struct work_struct *work)
1346 {
1347         call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1348 }
1349
1350 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1351
1352 /*
1353  * Notify the user(s) about new machine check events.
1354  * Can be called from interrupt context, but not from machine check/NMI
1355  * context.
1356  */
1357 int mce_notify_irq(void)
1358 {
1359         /* Not more than two messages every minute */
1360         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1361
1362         if (test_and_clear_bit(0, &mce_need_notify)) {
1363                 /* wake processes polling /dev/mcelog */
1364                 wake_up_interruptible(&mce_chrdev_wait);
1365
1366                 if (mce_helper[0])
1367                         schedule_work(&mce_trigger_work);
1368
1369                 if (__ratelimit(&ratelimit))
1370                         pr_info(HW_ERR "Machine check events logged\n");
1371
1372                 return 1;
1373         }
1374         return 0;
1375 }
1376 EXPORT_SYMBOL_GPL(mce_notify_irq);
1377
1378 static int __mcheck_cpu_mce_banks_init(void)
1379 {
1380         int i;
1381         u8 num_banks = mca_cfg.banks;
1382
1383         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1384         if (!mce_banks)
1385                 return -ENOMEM;
1386
1387         for (i = 0; i < num_banks; i++) {
1388                 struct mce_bank *b = &mce_banks[i];
1389
1390                 b->ctl = -1ULL;
1391                 b->init = 1;
1392         }
1393         return 0;
1394 }
1395
1396 /*
1397  * Initialize Machine Checks for a CPU.
1398  */
1399 static int __mcheck_cpu_cap_init(void)
1400 {
1401         unsigned b;
1402         u64 cap;
1403
1404         rdmsrl(MSR_IA32_MCG_CAP, cap);
1405
1406         b = cap & MCG_BANKCNT_MASK;
1407         if (!mca_cfg.banks)
1408                 pr_info("CPU supports %d MCE banks\n", b);
1409
1410         if (b > MAX_NR_BANKS) {
1411                 pr_warn("Using only %u machine check banks out of %u\n",
1412                         MAX_NR_BANKS, b);
1413                 b = MAX_NR_BANKS;
1414         }
1415
1416         /* Don't support asymmetric configurations today */
1417         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1418         mca_cfg.banks = b;
1419
1420         if (!mce_banks) {
1421                 int err = __mcheck_cpu_mce_banks_init();
1422
1423                 if (err)
1424                         return err;
1425         }
1426
1427         /* Use accurate RIP reporting if available. */
1428         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1429                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1430
1431         if (cap & MCG_SER_P)
1432                 mca_cfg.ser = true;
1433
1434         return 0;
1435 }
1436
1437 static void __mcheck_cpu_init_generic(void)
1438 {
1439         enum mcp_flags m_fl = 0;
1440         mce_banks_t all_banks;
1441         u64 cap;
1442         int i;
1443
1444         if (!mca_cfg.bootlog)
1445                 m_fl = MCP_DONTLOG;
1446
1447         /*
1448          * Log the machine checks left over from the previous reset.
1449          */
1450         bitmap_fill(all_banks, MAX_NR_BANKS);
1451         machine_check_poll(MCP_UC | m_fl, &all_banks);
1452
1453         cr4_set_bits(X86_CR4_MCE);
1454
1455         rdmsrl(MSR_IA32_MCG_CAP, cap);
1456         if (cap & MCG_CTL_P)
1457                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1458
1459         for (i = 0; i < mca_cfg.banks; i++) {
1460                 struct mce_bank *b = &mce_banks[i];
1461
1462                 if (!b->init)
1463                         continue;
1464                 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1465                 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1466         }
1467 }
1468
1469 /*
1470  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1471  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1472  * Vol 3B Table 15-20). But this confuses both the code that determines
1473  * whether the machine check occurred in kernel or user mode, and also
1474  * the severity assessment code. Pretend that EIPV was set, and take the
1475  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1476  */
1477 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1478 {
1479         if (bank != 0)
1480                 return;
1481         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1482                 return;
1483         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1484                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1485                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1486                           MCACOD)) !=
1487                          (MCI_STATUS_UC|MCI_STATUS_EN|
1488                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1489                           MCI_STATUS_AR|MCACOD_INSTR))
1490                 return;
1491
1492         m->mcgstatus |= MCG_STATUS_EIPV;
1493         m->ip = regs->ip;
1494         m->cs = regs->cs;
1495 }
1496
1497 /* Add per CPU specific workarounds here */
1498 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1499 {
1500         struct mca_config *cfg = &mca_cfg;
1501
1502         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1503                 pr_info("unknown CPU type - not enabling MCE support\n");
1504                 return -EOPNOTSUPP;
1505         }
1506
1507         /* This should be disabled by the BIOS, but isn't always */
1508         if (c->x86_vendor == X86_VENDOR_AMD) {
1509                 if (c->x86 == 15 && cfg->banks > 4) {
1510                         /*
1511                          * disable GART TBL walk error reporting, which
1512                          * trips off incorrectly with the IOMMU & 3ware
1513                          * & Cerberus:
1514                          */
1515                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1516                 }
1517                 if (c->x86 <= 17 && cfg->bootlog < 0) {
1518                         /*
1519                          * Lots of broken BIOS around that don't clear them
1520                          * by default and leave crap in there. Don't log:
1521                          */
1522                         cfg->bootlog = 0;
1523                 }
1524                 /*
1525                  * Various K7s with broken bank 0 around. Always disable
1526                  * by default.
1527                  */
1528                 if (c->x86 == 6 && cfg->banks > 0)
1529                         mce_banks[0].ctl = 0;
1530
1531                 /*
1532                  * overflow_recov is supported for F15h Models 00h-0fh
1533                  * even though we don't have a CPUID bit for it.
1534                  */
1535                 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1536                         mce_flags.overflow_recov = 1;
1537
1538                 /*
1539                  * Turn off MC4_MISC thresholding banks on all models since
1540                  * they're not supported there.
1541                  */
1542                 if (c->x86 == 0x15) {
1543                         int i;
1544                         u64 hwcr;
1545                         bool need_toggle;
1546                         u32 msrs[] = {
1547                                 0x00000413, /* MC4_MISC0 */
1548                                 0xc0000408, /* MC4_MISC1 */
1549                         };
1550
1551                         rdmsrl(MSR_K7_HWCR, hwcr);
1552
1553                         /* McStatusWrEn has to be set */
1554                         need_toggle = !(hwcr & BIT(18));
1555
1556                         if (need_toggle)
1557                                 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1558
1559                         /* Clear CntP bit safely */
1560                         for (i = 0; i < ARRAY_SIZE(msrs); i++)
1561                                 msr_clear_bit(msrs[i], 62);
1562
1563                         /* restore old settings */
1564                         if (need_toggle)
1565                                 wrmsrl(MSR_K7_HWCR, hwcr);
1566                 }
1567         }
1568
1569         if (c->x86_vendor == X86_VENDOR_INTEL) {
1570                 /*
1571                  * SDM documents that on family 6 bank 0 should not be written
1572                  * because it aliases to another special BIOS controlled
1573                  * register.
1574                  * But it's not aliased anymore on model 0x1a+
1575                  * Don't ignore bank 0 completely because there could be a
1576                  * valid event later, merely don't write CTL0.
1577                  */
1578
1579                 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1580                         mce_banks[0].init = 0;
1581
1582                 /*
1583                  * All newer Intel systems support MCE broadcasting. Enable
1584                  * synchronization with a one second timeout.
1585                  */
1586                 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1587                         cfg->monarch_timeout < 0)
1588                         cfg->monarch_timeout = USEC_PER_SEC;
1589
1590                 /*
1591                  * There are also broken BIOSes on some Pentium M and
1592                  * earlier systems:
1593                  */
1594                 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1595                         cfg->bootlog = 0;
1596
1597                 if (c->x86 == 6 && c->x86_model == 45)
1598                         quirk_no_way_out = quirk_sandybridge_ifu;
1599         }
1600         if (cfg->monarch_timeout < 0)
1601                 cfg->monarch_timeout = 0;
1602         if (cfg->bootlog != 0)
1603                 cfg->panic_timeout = 30;
1604
1605         return 0;
1606 }
1607
1608 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1609 {
1610         if (c->x86 != 5)
1611                 return 0;
1612
1613         switch (c->x86_vendor) {
1614         case X86_VENDOR_INTEL:
1615                 intel_p5_mcheck_init(c);
1616                 return 1;
1617                 break;
1618         case X86_VENDOR_CENTAUR:
1619                 winchip_mcheck_init(c);
1620                 return 1;
1621                 break;
1622         default:
1623                 return 0;
1624         }
1625
1626         return 0;
1627 }
1628
1629 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1630 {
1631         switch (c->x86_vendor) {
1632         case X86_VENDOR_INTEL:
1633                 mce_intel_feature_init(c);
1634                 mce_adjust_timer = cmci_intel_adjust_timer;
1635                 break;
1636
1637         case X86_VENDOR_AMD: {
1638                 u32 ebx = cpuid_ebx(0x80000007);
1639
1640                 mce_amd_feature_init(c);
1641                 mce_flags.overflow_recov = !!(ebx & BIT(0));
1642                 mce_flags.succor         = !!(ebx & BIT(1));
1643                 mce_flags.smca           = !!(ebx & BIT(3));
1644
1645                 break;
1646                 }
1647
1648         default:
1649                 break;
1650         }
1651 }
1652
1653 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1654 {
1655         switch (c->x86_vendor) {
1656         case X86_VENDOR_INTEL:
1657                 mce_intel_feature_clear(c);
1658                 break;
1659         default:
1660                 break;
1661         }
1662 }
1663
1664 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1665 {
1666         unsigned long iv = check_interval * HZ;
1667
1668         if (mca_cfg.ignore_ce || !iv)
1669                 return;
1670
1671         per_cpu(mce_next_interval, cpu) = iv;
1672
1673         t->expires = round_jiffies(jiffies + iv);
1674         add_timer_on(t, cpu);
1675 }
1676
1677 static void __mcheck_cpu_init_timer(void)
1678 {
1679         struct timer_list *t = this_cpu_ptr(&mce_timer);
1680         unsigned int cpu = smp_processor_id();
1681
1682         setup_timer(t, mce_timer_fn, cpu);
1683         mce_start_timer(cpu, t);
1684 }
1685
/*
 * Handler for an unconfigured int18 (should never happen): only logs
 * the CPU number. Neither argument is used.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	int cpu = smp_processor_id();

	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", cpu);
}
1692
1693 /* Call the installed machine check handler for this CPU setup. */
1694 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1695                                                 unexpected_machine_check;
1696
1697 dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1698 {
1699         machine_check_vector(regs, error_code);
1700 }
1701
1702 /*
1703  * Called for each booted CPU to set up machine checks.
1704  * Must be called with preempt off:
1705  */
1706 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1707 {
1708         if (mca_cfg.disabled)
1709                 return;
1710
1711         if (__mcheck_cpu_ancient_init(c))
1712                 return;
1713
1714         if (!mce_available(c))
1715                 return;
1716
1717         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1718                 mca_cfg.disabled = true;
1719                 return;
1720         }
1721
1722         if (mce_gen_pool_init()) {
1723                 mca_cfg.disabled = true;
1724                 pr_emerg("Couldn't allocate MCE records pool!\n");
1725                 return;
1726         }
1727
1728         machine_check_vector = do_machine_check;
1729
1730         __mcheck_cpu_init_generic();
1731         __mcheck_cpu_init_vendor(c);
1732         __mcheck_cpu_init_timer();
1733 }
1734
1735 /*
1736  * Called for each booted CPU to clear some machine checks opt-ins
1737  */
1738 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1739 {
1740         if (mca_cfg.disabled)
1741                 return;
1742
1743         if (!mce_available(c))
1744                 return;
1745
1746         /*
1747          * Possibly to clear general settings generic to x86
1748          * __mcheck_cpu_clear_generic(c);
1749          */
1750         __mcheck_cpu_clear_vendor(c);
1751
1752 }
1753
1754 /*
1755  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1756  */
1757
/*
 * Open bookkeeping for /dev/mcelog. mce_chrdev_open() and
 * mce_chrdev_release() take the spinlock around every access to the two
 * counters below.
 */
1758 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1759 static int mce_chrdev_open_count;       /* number of current opens */
1760 static int mce_chrdev_open_exclu;       /* non-zero while held open with O_EXCL */
1761
1762 static int mce_chrdev_open(struct inode *inode, struct file *file)
1763 {
1764         spin_lock(&mce_chrdev_state_lock);
1765
1766         if (mce_chrdev_open_exclu ||
1767             (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1768                 spin_unlock(&mce_chrdev_state_lock);
1769
1770                 return -EBUSY;
1771         }
1772
1773         if (file->f_flags & O_EXCL)
1774                 mce_chrdev_open_exclu = 1;
1775         mce_chrdev_open_count++;
1776
1777         spin_unlock(&mce_chrdev_state_lock);
1778
1779         return nonseekable_open(inode, file);
1780 }
1781
1782 static int mce_chrdev_release(struct inode *inode, struct file *file)
1783 {
1784         spin_lock(&mce_chrdev_state_lock);
1785
1786         mce_chrdev_open_count--;
1787         mce_chrdev_open_exclu = 0;
1788
1789         spin_unlock(&mce_chrdev_state_lock);
1790
1791         return 0;
1792 }
1793
/*
 * Cross-CPU callback: stores the executing CPU's current TSC value in its
 * own slot of the array passed through @data (indexed by CPU number).
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;
	int me = smp_processor_id();

	tsc_by_cpu[me] = rdtsc();
}
1800
/*
 * Set to 1 by __mce_read_apei() when reading or clearing an APEI record
 * fails or no record is left; mce_chrdev_read() and mce_chrdev_poll() stop
 * consulting APEI once it is set.
 */
1801 static int mce_apei_read_done;
1802
1803 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1804 static int __mce_read_apei(char __user **ubuf, size_t usize)
1805 {
1806         int rc;
1807         u64 record_id;
1808         struct mce m;
1809
1810         if (usize < sizeof(struct mce))
1811                 return -EINVAL;
1812
1813         rc = apei_read_mce(&m, &record_id);
1814         /* Error or no more MCE record */
1815         if (rc <= 0) {
1816                 mce_apei_read_done = 1;
1817                 /*
1818                  * When ERST is disabled, mce_chrdev_read() should return
1819                  * "no record" instead of "no device."
1820                  */
1821                 if (rc == -ENODEV)
1822                         return 0;
1823                 return rc;
1824         }
1825         rc = -EFAULT;
1826         if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1827                 return rc;
1828         /*
1829          * In fact, we should have cleared the record after that has
1830          * been flushed to the disk or sent to network in
1831          * /sbin/mcelog, but we have no interface to support that now,
1832          * so just clear it to avoid duplication.
1833          */
1834         rc = apei_clear_mce(record_id);
1835         if (rc) {
1836                 mce_apei_read_done = 1;
1837                 return rc;
1838         }
1839         *ubuf += sizeof(struct mce);
1840
1841         return 0;
1842 }
1843
1844 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1845                                 size_t usize, loff_t *off)
1846 {
1847         char __user *buf = ubuf;
1848         unsigned long *cpu_tsc;
1849         unsigned prev, next;
1850         int i, err;
1851
1852         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1853         if (!cpu_tsc)
1854                 return -ENOMEM;
1855
1856         mutex_lock(&mce_chrdev_read_mutex);
1857
1858         if (!mce_apei_read_done) {
1859                 err = __mce_read_apei(&buf, usize);
1860                 if (err || buf != ubuf)
1861                         goto out;
1862         }
1863
1864         next = mce_log_get_idx_check(mcelog.next);
1865
1866         /* Only supports full reads right now */
1867         err = -EINVAL;
1868         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1869                 goto out;
1870
1871         err = 0;
1872         prev = 0;
1873         do {
1874                 for (i = prev; i < next; i++) {
1875                         unsigned long start = jiffies;
1876                         struct mce *m = &mcelog.entry[i];
1877
1878                         while (!m->finished) {
1879                                 if (time_after_eq(jiffies, start + 2)) {
1880                                         memset(m, 0, sizeof(*m));
1881                                         goto timeout;
1882                                 }
1883                                 cpu_relax();
1884                         }
1885                         smp_rmb();
1886                         err |= copy_to_user(buf, m, sizeof(*m));
1887                         buf += sizeof(*m);
1888 timeout:
1889                         ;
1890                 }
1891
1892                 memset(mcelog.entry + prev, 0,
1893                        (next - prev) * sizeof(struct mce));
1894                 prev = next;
1895                 next = cmpxchg(&mcelog.next, prev, 0);
1896         } while (next != prev);
1897
1898         synchronize_sched();
1899
1900         /*
1901          * Collect entries that were still getting written before the
1902          * synchronize.
1903          */
1904         on_each_cpu(collect_tscs, cpu_tsc, 1);
1905
1906         for (i = next; i < MCE_LOG_LEN; i++) {
1907                 struct mce *m = &mcelog.entry[i];
1908
1909                 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1910                         err |= copy_to_user(buf, m, sizeof(*m));
1911                         smp_rmb();
1912                         buf += sizeof(*m);
1913                         memset(m, 0, sizeof(*m));
1914                 }
1915         }
1916
1917         if (err)
1918                 err = -EFAULT;
1919
1920 out:
1921         mutex_unlock(&mce_chrdev_read_mutex);
1922         kfree(cpu_tsc);
1923
1924         return err ? err : buf - ubuf;
1925 }
1926
1927 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1928 {
1929         poll_wait(file, &mce_chrdev_wait, wait);
1930         if (READ_ONCE(mcelog.next))
1931                 return POLLIN | POLLRDNORM;
1932         if (!mce_apei_read_done && apei_check_mce())
1933                 return POLLIN | POLLRDNORM;
1934         return 0;
1935 }
1936
1937 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1938                                 unsigned long arg)
1939 {
1940         int __user *p = (int __user *)arg;
1941
1942         if (!capable(CAP_SYS_ADMIN))
1943                 return -EPERM;
1944
1945         switch (cmd) {
1946         case MCE_GET_RECORD_LEN:
1947                 return put_user(sizeof(struct mce), p);
1948         case MCE_GET_LOG_LEN:
1949                 return put_user(MCE_LOG_LEN, p);
1950         case MCE_GETCLEAR_FLAGS: {
1951                 unsigned flags;
1952
1953                 do {
1954                         flags = mcelog.flags;
1955                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1956
1957                 return put_user(flags, p);
1958         }
1959         default:
1960                 return -ENOTTY;
1961         }
1962 }
1963
/*
 * Optional write() implementation for /dev/mcelog. Installed through
 * register_mce_write_callback(); while it is NULL, mce_chrdev_write()
 * fails with -EINVAL.
 */
1964 static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1965                             size_t usize, loff_t *off);
1966
1967 void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1968                              const char __user *ubuf,
1969                              size_t usize, loff_t *off))
1970 {
1971         mce_write = fn;
1972 }
1973 EXPORT_SYMBOL_GPL(register_mce_write_callback);
1974
1975 static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1976                                 size_t usize, loff_t *off)
1977 {
1978         if (mce_write)
1979                 return mce_write(filp, ubuf, usize, off);
1980         else
1981                 return -EINVAL;
1982 }
1983
/* File operations of the "mcelog" misc device (see mce_chrdev_device). */
1984 static const struct file_operations mce_chrdev_ops = {
1985         .open                   = mce_chrdev_open,
1986         .release                = mce_chrdev_release,
1987         .read                   = mce_chrdev_read,
1988         .write                  = mce_chrdev_write,
1989         .poll                   = mce_chrdev_poll,
1990         .unlocked_ioctl         = mce_chrdev_ioctl,
1991         .llseek                 = no_llseek,
1992 };
1993
1994 static struct miscdevice mce_chrdev_device = {
1995         MISC_MCELOG_MINOR,
1996         "mcelog",
1997         &mce_chrdev_ops,
1998 };
1999
2000 static void __mce_disable_bank(void *arg)
2001 {
2002         int bank = *((int *)arg);
2003         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2004         cmci_disable_bank(bank);
2005 }
2006
2007 void mce_disable_bank(int bank)
2008 {
2009         if (bank >= mca_cfg.banks) {
2010                 pr_warn(FW_BUG
2011                         "Ignoring request to disable invalid MCA bank %d.\n",
2012                         bank);
2013                 return;
2014         }
2015         set_bit(bank, mce_banks_ce_disabled);
2016         on_each_cpu(__mce_disable_bank, &bank, 1);
2017 }
2018
2019 /*
2020  * mce=off Disables machine check
2021  * mce=no_cmci Disables CMCI
2022  * mce=no_lmce Disables LMCE
2023  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2024  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2025  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2026  *      monarchtimeout is how long to wait for other CPUs on machine
2027  *      check, or 0 to not wait
2028  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2029  * mce=nobootlog Don't log MCEs from before booting.
2030  * mce=bios_cmci_threshold Don't program the CMCI threshold
2031  */
2032 static int __init mcheck_enable(char *str)
2033 {
2034         struct mca_config *cfg = &mca_cfg;
2035
2036         if (*str == 0) {
2037                 enable_p5_mce();
2038                 return 1;
2039         }
2040         if (*str == '=')
2041                 str++;
2042         if (!strcmp(str, "off"))
2043                 cfg->disabled = true;
2044         else if (!strcmp(str, "no_cmci"))
2045                 cfg->cmci_disabled = true;
2046         else if (!strcmp(str, "no_lmce"))
2047                 cfg->lmce_disabled = true;
2048         else if (!strcmp(str, "dont_log_ce"))
2049                 cfg->dont_log_ce = true;
2050         else if (!strcmp(str, "ignore_ce"))
2051                 cfg->ignore_ce = true;
2052         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2053                 cfg->bootlog = (str[0] == 'b');
2054         else if (!strcmp(str, "bios_cmci_threshold"))
2055                 cfg->bios_cmci_threshold = true;
2056         else if (isdigit(str[0])) {
2057                 if (get_option(&str, &cfg->tolerant) == 2)
2058                         get_option(&str, &(cfg->monarch_timeout));
2059         } else {
2060                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2061                 return 0;
2062         }
2063         return 1;
2064 }
2065 __setup("mce", mcheck_enable);
2066
2067 int __init mcheck_init(void)
2068 {
2069         mcheck_intel_therm_init();
2070         mce_register_decode_chain(&mce_srao_nb);
2071         mcheck_vendor_init_severity();
2072
2073         INIT_WORK(&mce_work, mce_process_work);
2074         init_irq_work(&mce_irq_work, mce_irq_work_cb);
2075
2076         return 0;
2077 }
2078
2079 /*
2080  * mce_syscore: PM support
2081  */
2082
2083 /*
2084  * Disable machine checks on suspend and shutdown. We can't really handle
2085  * them later.
2086  */
2087 static void mce_disable_error_reporting(void)
2088 {
2089         int i;
2090
2091         for (i = 0; i < mca_cfg.banks; i++) {
2092                 struct mce_bank *b = &mce_banks[i];
2093
2094                 if (b->init)
2095                         wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2096         }
2097         return;
2098 }
2099
2100 static void vendor_disable_error_reporting(void)
2101 {
2102         /*
2103          * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
2104          * Disabling them for just a single offlined CPU is bad, since it will
2105          * inhibit reporting for all shared resources on the socket like the
2106          * last level cache (LLC), the integrated memory controller (iMC), etc.
2107          */
2108         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2109                 return;
2110
2111         mce_disable_error_reporting();
2112 }
2113
/* syscore suspend hook: turn off error reporting; always returns 0. */
static int mce_syscore_suspend(void)
{
	vendor_disable_error_reporting();

	return 0;
}
2119
/* syscore shutdown hook: same action as on suspend. */
static void mce_syscore_shutdown(void)
{
	vendor_disable_error_reporting();
}
2124
2125 /*
2126  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2127  * Only one CPU is active at this time, the others get re-added later using
2128  * CPU hotplug:
2129  */
2130 static void mce_syscore_resume(void)
2131 {
2132         __mcheck_cpu_init_generic();
2133         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2134 }
2135
/* PM callbacks; registered by mcheck_init_device(). */
2136 static struct syscore_ops mce_syscore_ops = {
2137         .suspend        = mce_syscore_suspend,
2138         .shutdown       = mce_syscore_shutdown,
2139         .resume         = mce_syscore_resume,
2140 };
2141
2142 /*
2143  * mce_device: Sysfs support
2144  */
2145
2146 static void mce_cpu_restart(void *data)
2147 {
2148         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2149                 return;
2150         __mcheck_cpu_init_generic();
2151         __mcheck_cpu_init_timer();
2152 }
2153
2154 /* Reinit MCEs after user configuration changes */
2155 static void mce_restart(void)
2156 {
2157         mce_timer_delete_all();
2158         on_each_cpu(mce_cpu_restart, NULL, 1);
2159 }
2160
2161 /* Toggle features for corrected errors */
2162 static void mce_disable_cmci(void *data)
2163 {
2164         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2165                 return;
2166         cmci_clear();
2167 }
2168
2169 static void mce_enable_ce(void *all)
2170 {
2171         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2172                 return;
2173         cmci_reenable();
2174         cmci_recheck();
2175         if (all)
2176                 __mcheck_cpu_init_timer();
2177 }
2178
/*
 * Bus the per-CPU MCE devices hang off: mce_device_create() assigns it to
 * each device, mcheck_init_device() registers it as a system subsystem.
 */
2179 static struct bus_type mce_subsys = {
2180         .name           = "machinecheck",
2181         .dev_name       = "machinecheck",
2182 };
2183
/* Per-CPU device pointer; set by mce_device_create(), reset to NULL by mce_device_remove(). */
2184 DEFINE_PER_CPU(struct device *, mce_device);
2185
/* Optional hook; mce_cpu_callback() calls it on CPU_ONLINE and CPU_DEAD when it is non-NULL. */
2186 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2187
2188 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2189 {
2190         return container_of(attr, struct mce_bank, attr);
2191 }
2192
2193 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2194                          char *buf)
2195 {
2196         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2197 }
2198
2199 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2200                         const char *buf, size_t size)
2201 {
2202         u64 new;
2203
2204         if (kstrtou64(buf, 0, &new) < 0)
2205                 return -EINVAL;
2206
2207         attr_to_bank(attr)->ctl = new;
2208         mce_restart();
2209
2210         return size;
2211 }
2212
2213 static ssize_t
2214 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2215 {
2216         strcpy(buf, mce_helper);
2217         strcat(buf, "\n");
2218         return strlen(mce_helper) + 1;
2219 }
2220
2221 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2222                                 const char *buf, size_t siz)
2223 {
2224         char *p;
2225
2226         strncpy(mce_helper, buf, sizeof(mce_helper));
2227         mce_helper[sizeof(mce_helper)-1] = 0;
2228         p = strchr(mce_helper, '\n');
2229
2230         if (p)
2231                 *p = 0;
2232
2233         return strlen(mce_helper) + !!p;
2234 }
2235
2236 static ssize_t set_ignore_ce(struct device *s,
2237                              struct device_attribute *attr,
2238                              const char *buf, size_t size)
2239 {
2240         u64 new;
2241
2242         if (kstrtou64(buf, 0, &new) < 0)
2243                 return -EINVAL;
2244
2245         mutex_lock(&mce_sysfs_mutex);
2246         if (mca_cfg.ignore_ce ^ !!new) {
2247                 if (new) {
2248                         /* disable ce features */
2249                         mce_timer_delete_all();
2250                         on_each_cpu(mce_disable_cmci, NULL, 1);
2251                         mca_cfg.ignore_ce = true;
2252                 } else {
2253                         /* enable ce features */
2254                         mca_cfg.ignore_ce = false;
2255                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2256                 }
2257         }
2258         mutex_unlock(&mce_sysfs_mutex);
2259
2260         return size;
2261 }
2262
2263 static ssize_t set_cmci_disabled(struct device *s,
2264                                  struct device_attribute *attr,
2265                                  const char *buf, size_t size)
2266 {
2267         u64 new;
2268
2269         if (kstrtou64(buf, 0, &new) < 0)
2270                 return -EINVAL;
2271
2272         mutex_lock(&mce_sysfs_mutex);
2273         if (mca_cfg.cmci_disabled ^ !!new) {
2274                 if (new) {
2275                         /* disable cmci */
2276                         on_each_cpu(mce_disable_cmci, NULL, 1);
2277                         mca_cfg.cmci_disabled = true;
2278                 } else {
2279                         /* enable cmci */
2280                         mca_cfg.cmci_disabled = false;
2281                         on_each_cpu(mce_enable_ce, NULL, 1);
2282                 }
2283         }
2284         mutex_unlock(&mce_sysfs_mutex);
2285
2286         return size;
2287 }
2288
2289 static ssize_t store_int_with_restart(struct device *s,
2290                                       struct device_attribute *attr,
2291                                       const char *buf, size_t size)
2292 {
2293         unsigned long old_check_interval = check_interval;
2294         ssize_t ret = device_store_ulong(s, attr, buf, size);
2295
2296         if (check_interval == old_check_interval)
2297                 return ret;
2298
2299         mutex_lock(&mce_sysfs_mutex);
2300         mce_restart();
2301         mutex_unlock(&mce_sysfs_mutex);
2302
2303         return ret;
2304 }
2305
2306 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2307 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2308 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2309 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2310
2311 static struct dev_ext_attribute dev_attr_check_interval = {
2312         __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2313         &check_interval
2314 };
2315
2316 static struct dev_ext_attribute dev_attr_ignore_ce = {
2317         __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2318         &mca_cfg.ignore_ce
2319 };
2320
2321 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2322         __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2323         &mca_cfg.cmci_disabled
2324 };
2325
/*
 * NULL-terminated list of the attribute files that mce_device_create()
 * creates on, and mce_device_remove() removes from, every per-CPU device.
 */
2326 static struct device_attribute *mce_device_attrs[] = {
2327         &dev_attr_tolerant.attr,
2328         &dev_attr_check_interval.attr,
2329         &dev_attr_trigger,
2330         &dev_attr_monarch_timeout.attr,
2331         &dev_attr_dont_log_ce.attr,
2332         &dev_attr_ignore_ce.attr,
2333         &dev_attr_cmci_disabled.attr,
2334         NULL
2335 };
2336
/* CPUs whose MCE device was fully created; mce_device_remove() is a no-op for the others. */
2337 static cpumask_var_t mce_device_initialized;
2338
/*
 * Device release callback: frees the struct device that
 * mce_device_create() allocated with kzalloc().
 */
static void mce_device_release(struct device *dev)
{
	kfree(dev);
}
2343
2344 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2345 static int mce_device_create(unsigned int cpu)
2346 {
2347         struct device *dev;
2348         int err;
2349         int i, j;
2350
2351         if (!mce_available(&boot_cpu_data))
2352                 return -EIO;
2353
2354         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2355         if (!dev)
2356                 return -ENOMEM;
2357         dev->id  = cpu;
2358         dev->bus = &mce_subsys;
2359         dev->release = &mce_device_release;
2360
2361         err = device_register(dev);
2362         if (err) {
2363                 put_device(dev);
2364                 return err;
2365         }
2366
2367         for (i = 0; mce_device_attrs[i]; i++) {
2368                 err = device_create_file(dev, mce_device_attrs[i]);
2369                 if (err)
2370                         goto error;
2371         }
2372         for (j = 0; j < mca_cfg.banks; j++) {
2373                 err = device_create_file(dev, &mce_banks[j].attr);
2374                 if (err)
2375                         goto error2;
2376         }
2377         cpumask_set_cpu(cpu, mce_device_initialized);
2378         per_cpu(mce_device, cpu) = dev;
2379
2380         return 0;
2381 error2:
2382         while (--j >= 0)
2383                 device_remove_file(dev, &mce_banks[j].attr);
2384 error:
2385         while (--i >= 0)
2386                 device_remove_file(dev, mce_device_attrs[i]);
2387
2388         device_unregister(dev);
2389
2390         return err;
2391 }
2392
2393 static void mce_device_remove(unsigned int cpu)
2394 {
2395         struct device *dev = per_cpu(mce_device, cpu);
2396         int i;
2397
2398         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2399                 return;
2400
2401         for (i = 0; mce_device_attrs[i]; i++)
2402                 device_remove_file(dev, mce_device_attrs[i]);
2403
2404         for (i = 0; i < mca_cfg.banks; i++)
2405                 device_remove_file(dev, &mce_banks[i].attr);
2406
2407         device_unregister(dev);
2408         cpumask_clear_cpu(cpu, mce_device_initialized);
2409         per_cpu(mce_device, cpu) = NULL;
2410 }
2411
2412 /* Make sure there are no machine checks on offlined CPUs. */
2413 static void mce_disable_cpu(void *h)
2414 {
2415         unsigned long action = *(unsigned long *)h;
2416
2417         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2418                 return;
2419
2420         if (!(action & CPU_TASKS_FROZEN))
2421                 cmci_clear();
2422
2423         vendor_disable_error_reporting();
2424 }
2425
2426 static void mce_reenable_cpu(void *h)
2427 {
2428         unsigned long action = *(unsigned long *)h;
2429         int i;
2430
2431         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2432                 return;
2433
2434         if (!(action & CPU_TASKS_FROZEN))
2435                 cmci_reenable();
2436         for (i = 0; i < mca_cfg.banks; i++) {
2437                 struct mce_bank *b = &mce_banks[i];
2438
2439                 if (b->init)
2440                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2441         }
2442 }
2443
2444 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2445 static int
2446 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2447 {
2448         unsigned int cpu = (unsigned long)hcpu;
2449         struct timer_list *t = &per_cpu(mce_timer, cpu);
2450
2451         switch (action & ~CPU_TASKS_FROZEN) {
2452         case CPU_ONLINE:
2453                 mce_device_create(cpu);
2454                 if (threshold_cpu_callback)
2455                         threshold_cpu_callback(action, cpu);
2456                 break;
2457         case CPU_DEAD:
2458                 if (threshold_cpu_callback)
2459                         threshold_cpu_callback(action, cpu);
2460                 mce_device_remove(cpu);
2461                 mce_intel_hcpu_update(cpu);
2462
2463                 /* intentionally ignoring frozen here */
2464                 if (!(action & CPU_TASKS_FROZEN))
2465                         cmci_rediscover();
2466                 break;
2467         case CPU_DOWN_PREPARE:
2468                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2469                 del_timer_sync(t);
2470                 break;
2471         case CPU_DOWN_FAILED:
2472                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2473                 mce_start_timer(cpu, t);
2474                 break;
2475         }
2476
2477         return NOTIFY_OK;
2478 }
2479
/* CPU hotplug notifier; registered by mcheck_init_device(). */
2480 static struct notifier_block mce_cpu_notifier = {
2481         .notifier_call = mce_cpu_callback,
2482 };
2483
2484 static __init void mce_init_banks(void)
2485 {
2486         int i;
2487
2488         for (i = 0; i < mca_cfg.banks; i++) {
2489                 struct mce_bank *b = &mce_banks[i];
2490                 struct device_attribute *a = &b->attr;
2491
2492                 sysfs_attr_init(&a->attr);
2493                 a->attr.name    = b->attrname;
2494                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2495
2496                 a->attr.mode    = 0644;
2497                 a->show         = show_bank;
2498                 a->store        = set_bank;
2499         }
2500 }
2501
2502 static __init int mcheck_init_device(void)
2503 {
2504         int err;
2505         int i = 0;
2506
2507         if (!mce_available(&boot_cpu_data)) {
2508                 err = -EIO;
2509                 goto err_out;
2510         }
2511
2512         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2513                 err = -ENOMEM;
2514                 goto err_out;
2515         }
2516
2517         mce_init_banks();
2518
2519         err = subsys_system_register(&mce_subsys, NULL);
2520         if (err)
2521                 goto err_out_mem;
2522
2523         cpu_notifier_register_begin();
2524         for_each_online_cpu(i) {
2525                 err = mce_device_create(i);
2526                 if (err) {
2527                         /*
2528                          * Register notifier anyway (and do not unreg it) so
2529                          * that we don't leave undeleted timers, see notifier
2530                          * callback above.
2531                          */
2532                         __register_hotcpu_notifier(&mce_cpu_notifier);
2533                         cpu_notifier_register_done();
2534                         goto err_device_create;
2535                 }
2536         }
2537
2538         __register_hotcpu_notifier(&mce_cpu_notifier);
2539         cpu_notifier_register_done();
2540
2541         register_syscore_ops(&mce_syscore_ops);
2542
2543         /* register character device /dev/mcelog */
2544         err = misc_register(&mce_chrdev_device);
2545         if (err)
2546                 goto err_register;
2547
2548         return 0;
2549
2550 err_register:
2551         unregister_syscore_ops(&mce_syscore_ops);
2552
2553 err_device_create:
2554         /*
2555          * We didn't keep track of which devices were created above, but
2556          * even if we had, the set of online cpus might have changed.
2557          * Play safe and remove for every possible cpu, since
2558          * mce_device_remove() will do the right thing.
2559          */
2560         for_each_possible_cpu(i)
2561                 mce_device_remove(i);
2562
2563 err_out_mem:
2564         free_cpumask_var(mce_device_initialized);
2565
2566 err_out:
2567         pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2568
2569         return err;
2570 }
2571 device_initcall_sync(mcheck_init_device);
2572
2573 /*
2574  * Old style boot options parsing. Only for compatibility.
2575  */
2576 static int __init mcheck_disable(char *str)
2577 {
2578         mca_cfg.disabled = true;
2579         return 1;
2580 }
2581 __setup("nomce", mcheck_disable);
2582
2583 #ifdef CONFIG_DEBUG_FS
2584 struct dentry *mce_get_debugfs_dir(void)
2585 {
2586         static struct dentry *dmce;
2587
2588         if (!dmce)
2589                 dmce = debugfs_create_dir("mce", NULL);
2590
2591         return dmce;
2592 }
2593
2594 static void mce_reset(void)
2595 {
2596         cpu_missing = 0;
2597         atomic_set(&mce_fake_panicked, 0);
2598         atomic_set(&mce_executing, 0);
2599         atomic_set(&mce_callin, 0);
2600         atomic_set(&global_nwo, 0);
2601 }
2602
2603 static int fake_panic_get(void *data, u64 *val)
2604 {
2605         *val = fake_panic;
2606         return 0;
2607 }
2608
2609 static int fake_panic_set(void *data, u64 val)
2610 {
2611         mce_reset();
2612         fake_panic = val;
2613         return 0;
2614 }
2615
/* File operations for the "fake_panic" debugfs file; the value is formatted as "%llu\n". */
2616 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2617                         fake_panic_set, "%llu\n");
2618
2619 static int __init mcheck_debugfs_init(void)
2620 {
2621         struct dentry *dmce, *ffake_panic;
2622
2623         dmce = mce_get_debugfs_dir();
2624         if (!dmce)
2625                 return -ENOMEM;
2626         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2627                                           &fake_panic_fops);
2628         if (!ffake_panic)
2629                 return -ENOMEM;
2630
2631         return 0;
2632 }
2633 #else
2634 static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2635 #endif
2636
2637 static int __init mcheck_late_init(void)
2638 {
2639         mcheck_debugfs_init();
2640
2641         /*
2642          * Flush out everything that has been logged during early boot, now that
2643          * everything has been initialized (workqueues, decoders, ...).
2644          */
2645         mce_schedule_work();
2646
2647         return 0;
2648 }
2649 late_initcall(mcheck_late_init);