arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10
  11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13 #include <linux/thread_info.h>
  14 #include <linux/capability.h>
  15 #include <linux/miscdevice.h>
  16 #include <linux/ratelimit.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/rcupdate.h>
  19 #include <linux/kobject.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kernel.h>
  23 #include <linux/percpu.h>
  24 #include <linux/string.h>
  25 #include <linux/device.h>
  26 #include <linux/syscore_ops.h>
  27 #include <linux/delay.h>
  28 #include <linux/ctype.h>
  29 #include <linux/sched.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/types.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/kmod.h>
  35 #include <linux/poll.h>
  36 #include <linux/nmi.h>
  37 #include <linux/cpu.h>
  38 #include <linux/smp.h>
  39 #include <linux/fs.h>
  40 #include <linux/mm.h>
  41 #include <linux/debugfs.h>
  42 #include <linux/irq_work.h>
  43 #include <linux/export.h>
  44
  45 #include <asm/processor.h>
  46 #include <asm/traps.h>
  47 #include <asm/tlbflush.h>
  48 #include <asm/mce.h>
  49 #include <asm/msr.h>
  50
  51 #include "mce-internal.h"
  52
  53 static DEFINE_MUTEX(mce_chrdev_read_mutex);
  54
/*
 * Load the log index @p with acquire semantics (smp_load_acquire()).
 *
 * Triggers an RCU lockdep warning unless the caller is either inside an
 * RCU-sched read-side section or holds mce_chrdev_read_mutex.
 */
#define mce_log_get_idx_check(p) \
({ \
        RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
                         !lockdep_is_held(&mce_chrdev_read_mutex), \
                         "suspicious mce_log_get_idx_check() usage"); \
        smp_load_acquire(&(p)); \
})
  62
  63 /* sysfs synchronization */
  64 static DEFINE_MUTEX(mce_sysfs_mutex);
  65
  66 #define CREATE_TRACE_POINTS
  67 #include <trace/events/mce.h>
  68
  69 #define SPINUNIT                100     /* 100ns */
  70
  71 DEFINE_PER_CPU(unsigned, mce_exception_count);
  72
  73 struct mce_bank *mce_banks __read_mostly;
  74 struct mce_vendor_flags mce_flags __read_mostly;
  75
/*
 * Global machine check configuration. Only the fields listed here start
 * with a non-zero value.
 * NOTE(review): -1 for bootlog/monarch_timeout presumably means "not
 * chosen yet" and is resolved during init -- confirm against the init code.
 */
struct mca_config mca_cfg __read_mostly = {
        .bootlog  = -1,
        /*
         * Tolerant levels:
         * 0: always panic on uncorrected errors, log corrected errors
         * 1: panic or SIGBUS on uncorrected errors, log corrected errors
         * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
         * 3: never panic or SIGBUS, log all errors (for testing only)
         */
        .tolerant = 1,
        .monarch_timeout = -1
};
  88
  89 /* User mode helper program triggered by machine check event */
  90 static unsigned long            mce_need_notify;
  91 static char                     mce_helper[128];
  92 static char                     *mce_helper_argv[2] = { mce_helper, NULL };
  93
  94 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  95
  96 static DEFINE_PER_CPU(struct mce, mces_seen);
  97 static int                      cpu_missing;
  98
  99 /*
 100  * MCA banks polled by the period polling timer for corrected events.
 101  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 102  */
 103 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 104         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 105 };
 106
 107 /*
 108  * MCA banks controlled through firmware first for corrected errors.
 109  * This is a global list of banks for which we won't enable CMCI and we
 110  * won't poll. Firmware controls these banks and is responsible for
 111  * reporting corrected errors through GHES. Uncorrected/recoverable
 112  * errors are still notified through a machine check.
 113  */
 114 mce_banks_t mce_banks_ce_disabled;
 115
 116 static struct work_struct mce_work;
 117 static struct irq_work mce_irq_work;
 118
 119 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 120 static int mce_usable_address(struct mce *m);
 121
 122 /*
 123  * CPU/chipset specific EDAC code can register a notifier call here to print
 124  * MCE errors in a human-readable form.
 125  */
 126 ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 127
 128 /* Do initial initialization of a struct mce */
 129 void mce_setup(struct mce *m)
 130 {
 131         memset(m, 0, sizeof(struct mce));
 132         m->cpu = m->extcpu = smp_processor_id();
 133         m->tsc = rdtsc();
 134         /* We hope get_seconds stays lockless */
 135         m->time = get_seconds();
 136         m->cpuvendor = boot_cpu_data.x86_vendor;
 137         m->cpuid = cpuid_eax(1);
 138         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 139         m->apicid = cpu_data(m->extcpu).initial_apicid;
 140         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 141
 142         m->microcode = boot_cpu_data.microcode;
 143 }
 144
 145 DEFINE_PER_CPU(struct mce, injectm);
 146 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 147
 148 /*
 149  * Lockless MCE logging infrastructure.
 150  * This avoids deadlocks on printk locks without having to break locks. Also
 151  * separate MCEs from kernel messages to avoid bogus bug reports.
 152  */
 153
/*
 * Buffer that mce_log() appends records to. The header fields describe
 * the layout: a signature, the number of entry slots and the size of one
 * record.
 */
static struct mce_log mcelog = {
        .signature      = MCE_LOG_SIGNATURE,
        .len            = MCE_LOG_LEN,
        .recordlen      = sizeof(struct mce),
};
 159
/*
 * Record @mce: emit the tracepoint, hand the record to the gen pool and
 * append a copy to the lockless mcelog buffer.
 *
 * Side effects on the caller's record: @mce->finished is cleared before
 * the copy and set to 1 after the copy has been published. When the
 * buffer is full the record is not stored in mcelog, MCE_OVERFLOW is set
 * in mcelog.flags, and the function returns with @mce->finished still 0
 * and without raising mce_need_notify.
 */
void mce_log(struct mce *mce)
{
        unsigned next, entry;

        /* Emit the trace record: */
        trace_mce_record(mce);

        /* A zero return from mce_gen_pool_add() triggers the irq work. */
        if (!mce_gen_pool_add(mce))
                irq_work_queue(&mce_irq_work);

        /*
         * Clear the flag first so the slot copied below is not seen as
         * finished until the explicit store after the memcpy().
         */
        mce->finished = 0;
        wmb();
        for (;;) {
                /* Start from the current write index ... */
                entry = mce_log_get_idx_check(mcelog.next);
                for (;;) {

                        /*
                         * When the buffer fills up discard new entries.
                         * Assume that the earlier errors are the more
                         * interesting ones:
                         */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW,
                                        (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip: */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                /*
                 * ... and claim the slot by advancing mcelog.next. If the
                 * index changed underneath us, start over.
                 */
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        /* Fill the claimed slot, then publish it by setting ->finished. */
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        mce->finished = 1;
        /* Flag that a notification is pending. */
        set_bit(0, &mce_need_notify);
}
 206
/*
 * Log @m through mce_log() while holding mce_chrdev_read_mutex, which is
 * one of the two contexts mce_log_get_idx_check() accepts.
 */
void mce_inject_log(struct mce *m)
{
        mutex_lock(&mce_chrdev_read_mutex);
        mce_log(m);
        mutex_unlock(&mce_chrdev_read_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);
 214
 215 static struct notifier_block mce_srao_nb;
 216
 217 void mce_register_decode_chain(struct notifier_block *nb)
 218 {
 219         /* Ensure SRAO notifier has the highest priority in the decode chain. */
 220         if (nb != &mce_srao_nb && nb->priority == INT_MAX)
 221                 nb->priority -= 1;
 222
 223         atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
 224 }
 225 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 226
/* Remove @nb from the MCE decoder chain (x86_mce_decoder_chain). */
void mce_unregister_decode_chain(struct notifier_block *nb)
{
        atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 232
 233 static void print_mce(struct mce *m)
 234 {
 235         int ret = 0;
 236
 237         pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 238                m->extcpu, m->mcgstatus, m->bank, m->status);
 239
 240         if (m->ip) {
 241                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 242                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 243                                 m->cs, m->ip);
 244
 245                 if (m->cs == __KERNEL_CS)
 246                         print_symbol("{%s}", m->ip);
 247                 pr_cont("\n");
 248         }
 249
 250         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 251         if (m->addr)
 252                 pr_cont("ADDR %llx ", m->addr);
 253         if (m->misc)
 254                 pr_cont("MISC %llx ", m->misc);
 255
 256         pr_cont("\n");
 257         /*
 258          * Note this output is parsed by external tools and old fields
 259          * should not be changed.
 260          */
 261         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 262                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 263                 m->microcode);
 264
 265         /*
 266          * Print out human-readable details about the MCE error,
 267          * (if the CPU has an implementation for that)
 268          */
 269         ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 270         if (ret == NOTIFY_STOP)
 271                 return;
 272
 273         pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 274 }
 275
 276 #define PANIC_TIMEOUT 5 /* 5 seconds */
 277
 278 static atomic_t mce_panicked;
 279
 280 static int fake_panic;
 281 static atomic_t mce_fake_panicked;
 282
 283 /* Panic in progress. Enable interrupts and wait for final IPI */
 284 static void wait_for_panic(void)
 285 {
 286         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 287
 288         preempt_disable();
 289         local_irq_enable();
 290         while (timeout-- > 0)
 291                 udelay(1);
 292         if (panic_timeout == 0)
 293                 panic_timeout = mca_cfg.panic_timeout;
 294         panic("Panicing machine check CPU died");
 295 }
 296
 297 static void mce_panic(const char *msg, struct mce *final, char *exp)
 298 {
 299         int i, apei_err = 0;
 300
 301         if (!fake_panic) {
 302                 /*
 303                  * Make sure only one CPU runs in machine check panic
 304                  */
 305                 if (atomic_inc_return(&mce_panicked) > 1)
 306                         wait_for_panic();
 307                 barrier();
 308
 309                 bust_spinlocks(1);
 310                 console_verbose();
 311         } else {
 312                 /* Don't log too much for fake panic */
 313                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 314                         return;
 315         }
 316         /* First print corrected ones that are still unlogged */
 317         for (i = 0; i < MCE_LOG_LEN; i++) {
 318                 struct mce *m = &mcelog.entry[i];
 319                 if (!(m->status & MCI_STATUS_VAL))
 320                         continue;
 321                 if (!(m->status & MCI_STATUS_UC)) {
 322                         print_mce(m);
 323                         if (!apei_err)
 324                                 apei_err = apei_write_mce(m);
 325                 }
 326         }
 327         /* Now print uncorrected but with the final one last */
 328         for (i = 0; i < MCE_LOG_LEN; i++) {
 329                 struct mce *m = &mcelog.entry[i];
 330                 if (!(m->status & MCI_STATUS_VAL))
 331                         continue;
 332                 if (!(m->status & MCI_STATUS_UC))
 333                         continue;
 334                 if (!final || memcmp(m, final, sizeof(struct mce))) {
 335                         print_mce(m);
 336                         if (!apei_err)
 337                                 apei_err = apei_write_mce(m);
 338                 }
 339         }
 340         if (final) {
 341                 print_mce(final);
 342                 if (!apei_err)
 343                         apei_err = apei_write_mce(final);
 344         }
 345         if (cpu_missing)
 346                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 347         if (exp)
 348                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 349         if (!fake_panic) {
 350                 if (panic_timeout == 0)
 351                         panic_timeout = mca_cfg.panic_timeout;
 352                 panic(msg);
 353         } else
 354                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 355 }
 356
 357 /* Support code for software error injection */
 358
 359 static int msr_to_offset(u32 msr)
 360 {
 361         unsigned bank = __this_cpu_read(injectm.bank);
 362
 363         if (msr == mca_cfg.rip_msr)
 364                 return offsetof(struct mce, ip);
 365         if (msr == MSR_IA32_MCx_STATUS(bank))
 366                 return offsetof(struct mce, status);
 367         if (msr == MSR_IA32_MCx_ADDR(bank))
 368                 return offsetof(struct mce, addr);
 369         if (msr == MSR_IA32_MCx_MISC(bank))
 370                 return offsetof(struct mce, misc);
 371         if (msr == MSR_IA32_MCG_STATUS)
 372                 return offsetof(struct mce, mcgstatus);
 373         return -1;
 374 }
 375
 376 /* MSR access wrappers used for error injection */
 377 static u64 mce_rdmsrl(u32 msr)
 378 {
 379         u64 v;
 380
 381         if (__this_cpu_read(injectm.finished)) {
 382                 int offset = msr_to_offset(msr);
 383
 384                 if (offset < 0)
 385                         return 0;
 386                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 387         }
 388
 389         if (rdmsrl_safe(msr, &v)) {
 390                 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 391                 /*
 392                  * Return zero in case the access faulted. This should
 393                  * not happen normally but can happen if the CPU does
 394                  * something weird, or if the code is buggy.
 395                  */
 396                 v = 0;
 397         }
 398
 399         return v;
 400 }
 401
 402 static void mce_wrmsrl(u32 msr, u64 v)
 403 {
 404         if (__this_cpu_read(injectm.finished)) {
 405                 int offset = msr_to_offset(msr);
 406
 407                 if (offset >= 0)
 408                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 409                 return;
 410         }
 411         wrmsrl(msr, v);
 412 }
 413
 414 /*
 415  * Collect all global (w.r.t. this processor) status about this machine
 416  * check into our "mce" struct so that we can use it later to assess
 417  * the severity of the problem as we read per-bank specific details.
 418  */
 419 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 420 {
 421         mce_setup(m);
 422
 423         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 424         if (regs) {
 425                 /*
 426                  * Get the address of the instruction at the time of
 427                  * the machine check error.
 428                  */
 429                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 430                         m->ip = regs->ip;
 431                         m->cs = regs->cs;
 432
 433                         /*
 434                          * When in VM86 mode make the cs look like ring 3
 435                          * always. This is a lie, but it's better than passing
 436                          * the additional vm86 bit around everywhere.
 437                          */
 438                         if (v8086_mode(regs))
 439                                 m->cs |= 3;
 440                 }
 441                 /* Use accurate RIP reporting if available. */
 442                 if (mca_cfg.rip_msr)
 443                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 444         }
 445 }
 446
 447 int mce_available(struct cpuinfo_x86 *c)
 448 {
 449         if (mca_cfg.disabled)
 450                 return 0;
 451         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 452 }
 453
 454 static void mce_schedule_work(void)
 455 {
 456         if (!mce_gen_pool_empty() && keventd_up())
 457                 schedule_work(&mce_work);
 458 }
 459
/*
 * irq_work callback: run the notification step and then schedule the
 * work item for any records waiting in the gen pool. @entry is unused.
 */
static void mce_irq_work_cb(struct irq_work *entry)
{
        mce_notify_irq();
        mce_schedule_work();
}
 465
 466 static void mce_report_event(struct pt_regs *regs)
 467 {
 468         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 469                 mce_notify_irq();
 470                 /*
 471                  * Triggering the work queue here is just an insurance
 472                  * policy in case the syscall exit notify handler
 473                  * doesn't run soon enough or ends up running on the
 474                  * wrong CPU (can happen when audit sleeps)
 475                  */
 476                 mce_schedule_work();
 477                 return;
 478         }
 479
 480         irq_work_queue(&mce_irq_work);
 481 }
 482
 483 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 484                                 void *data)
 485 {
 486         struct mce *mce = (struct mce *)data;
 487         unsigned long pfn;
 488
 489         if (!mce)
 490                 return NOTIFY_DONE;
 491
 492         if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
 493                 pfn = mce->addr >> PAGE_SHIFT;
 494                 memory_failure(pfn, MCE_VECTOR, 0);
 495         }
 496
 497         return NOTIFY_OK;
 498 }
/*
 * Notifier block for srao_decode_notifier(). It carries INT_MAX priority,
 * which mce_register_decode_chain() denies to every other notifier.
 */
static struct notifier_block mce_srao_nb = {
        .notifier_call  = srao_decode_notifier,
        .priority = INT_MAX,
};
 503
 504 /*
 505  * Read ADDR and MISC registers.
 506  */
 507 static void mce_read_aux(struct mce *m, int i)
 508 {
 509         if (m->status & MCI_STATUS_MISCV)
 510                 m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 511         if (m->status & MCI_STATUS_ADDRV) {
 512                 m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 513
 514                 /*
 515                  * Mask the reported address by the reported granularity.
 516                  */
 517                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 518                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 519                         m->addr >>= shift;
 520                         m->addr <<= shift;
 521                 }
 522         }
 523 }
 524
 525 static bool memory_error(struct mce *m)
 526 {
 527         struct cpuinfo_x86 *c = &boot_cpu_data;
 528
 529         if (c->x86_vendor == X86_VENDOR_AMD) {
 530                 /*
 531                  * coming soon
 532                  */
 533                 return false;
 534         } else if (c->x86_vendor == X86_VENDOR_INTEL) {
 535                 /*
 536                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 537                  *
 538                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 539                  * indicating a memory error. Bit 8 is used for indicating a
 540                  * cache hierarchy error. The combination of bit 2 and bit 3
 541                  * is used for indicating a `generic' cache hierarchy error
 542                  * But we can't just blindly check the above bits, because if
 543                  * bit 11 is set, then it is a bus/interconnect error - and
 544                  * either way the above bits just gives more detail on what
 545                  * bus/interconnect error happened. Note that bit 12 can be
 546                  * ignored, as it's the "filter" bit.
 547                  */
 548                 return (m->status & 0xef80) == BIT(7) ||
 549                        (m->status & 0xef00) == BIT(8) ||
 550                        (m->status & 0xeffc) == 0xc;
 551         }
 552
 553         return false;
 554 }
 555
 556 DEFINE_PER_CPU(unsigned, mce_poll_count);
 557
 558 /*
 559  * Poll for corrected events or events that happened before reset.
 560  * Those are just logged through /dev/mcelog.
 561  *
 562  * This is executed in standard interrupt context.
 563  *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 572  */
 573 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 574 {
 575         bool error_logged = false;
 576         struct mce m;
 577         int severity;
 578         int i;
 579
 580         this_cpu_inc(mce_poll_count);
 581
 582         mce_gather_info(&m, NULL);
 583
 584         for (i = 0; i < mca_cfg.banks; i++) {
 585                 if (!mce_banks[i].ctl || !test_bit(i, *b))
 586                         continue;
 587
 588                 m.misc = 0;
 589                 m.addr = 0;
 590                 m.bank = i;
 591                 m.tsc = 0;
 592
 593                 barrier();
 594                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 595                 if (!(m.status & MCI_STATUS_VAL))
 596                         continue;
 597
 598
 599                 /*
 600                  * Uncorrected or signalled events are handled by the exception
 601                  * handler when it is enabled, so don't process those here.
 602                  *
 603                  * TBD do the same check for MCI_STATUS_EN here?
 604                  */
 605                 if (!(flags & MCP_UC) &&
 606                     (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 607                         continue;
 608
 609                 mce_read_aux(&m, i);
 610
 611                 if (!(flags & MCP_TIMESTAMP))
 612                         m.tsc = 0;
 613
 614                 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 615
 616                 /*
 617                  * In the cases where we don't have a valid address after all,
 618                  * do not add it into the ring buffer.
 619                  */
 620                 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
 621                         if (m.status & MCI_STATUS_ADDRV) {
 622                                 m.severity = severity;
 623                                 m.usable_addr = mce_usable_address(&m);
 624
 625                                 if (!mce_gen_pool_add(&m))
 626                                         mce_schedule_work();
 627                         }
 628                 }
 629
 630                 /*
 631                  * Don't get the IP here because it's unlikely to
 632                  * have anything to do with the actual error location.
 633                  */
 634                 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
 635                         error_logged = true;
 636                         mce_log(&m);
 637                 }
 638
 639                 /*
 640                  * Clear state for this bank.
 641                  */
 642                 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 643         }
 644
 645         /*
 646          * Don't clear MCG_STATUS here because it's only defined for
 647          * exceptions.
 648          */
 649
 650         sync_core();
 651
 652         return error_logged;
 653 }
 654 EXPORT_SYMBOL_GPL(machine_check_poll);
 655
 656 /*
 657  * Do a quick check if any of the events requires a panic.
 658  * This decides if we keep the events around or clear them.
 659  */
 660 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 661                           struct pt_regs *regs)
 662 {
 663         int i, ret = 0;
 664         char *tmp;
 665
 666         for (i = 0; i < mca_cfg.banks; i++) {
 667                 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 668                 if (m->status & MCI_STATUS_VAL) {
 669                         __set_bit(i, validp);
 670                         if (quirk_no_way_out)
 671                                 quirk_no_way_out(i, m, regs);
 672                 }
 673
 674                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 675                         m->bank = i;
 676                         *msg = tmp;
 677                         ret = 1;
 678                 }
 679         }
 680         return ret;
 681 }
 682
 683 /*
 * Variable to establish order between CPUs while scanning.
 * Each CPU initially spins until mce_executing reaches its call-in order.
 686  */
 687 static atomic_t mce_executing;
 688
 689 /*
 690  * Defines order of CPUs on entry. First CPU becomes Monarch.
 691  */
 692 static atomic_t mce_callin;
 693
 694 /*
 695  * Check if a timeout waiting for other CPUs happened.
 696  */
 697 static int mce_timed_out(u64 *t, const char *msg)
 698 {
 699         /*
 700          * The others already did panic for some reason.
 701          * Bail out like in a timeout.
 702          * rmb() to tell the compiler that system_state
 703          * might have been modified by someone else.
 704          */
 705         rmb();
 706         if (atomic_read(&mce_panicked))
 707                 wait_for_panic();
 708         if (!mca_cfg.monarch_timeout)
 709                 goto out;
 710         if ((s64)*t < SPINUNIT) {
 711                 if (mca_cfg.tolerant <= 1)
 712                         mce_panic(msg, NULL, NULL);
 713                 cpu_missing = 1;
 714                 return 1;
 715         }
 716         *t -= SPINUNIT;
 717 out:
 718         touch_nmi_watchdog();
 719         return 0;
 720 }
 721
 722 /*
 723  * The Monarch's reign.  The Monarch is the CPU who entered
 724  * the machine check handler first. It waits for the others to
 725  * raise the exception too and then grades them. When any
 726  * error is fatal panic. Only then let the others continue.
 727  *
 728  * The other CPUs entering the MCE handler will be controlled by the
 729  * Monarch. They are called Subjects.
 730  *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that the errors of all CPUs are always examined.
 733  *
 734  * Also this detects the case of a machine check event coming from outer
 735  * space (not detected by any CPUs) In this case some external agent wants
 736  * us to shut down, so panic too.
 737  *
 738  * The other CPUs might still decide to panic if the handler happens
 739  * in a unrecoverable place, but in this case the system is in a semi-stable
 740  * state and won't corrupt anything by itself. It's ok to let the others
 741  * continue for a bit first.
 742  *
 743  * All the spin loops have timeouts; when a timeout happens a CPU
 744  * typically elects itself to be Monarch.
 745  */
 746 static void mce_reign(void)
 747 {
 748         int cpu;
 749         struct mce *m = NULL;
 750         int global_worst = 0;
 751         char *msg = NULL;
 752         char *nmsg = NULL;
 753
 754         /*
 755          * This CPU is the Monarch and the other CPUs have run
 756          * through their handlers.
 757          * Grade the severity of the errors of all the CPUs.
 758          */
 759         for_each_possible_cpu(cpu) {
 760                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
 761                                             mca_cfg.tolerant,
 762                                             &nmsg, true);
 763                 if (severity > global_worst) {
 764                         msg = nmsg;
 765                         global_worst = severity;
 766                         m = &per_cpu(mces_seen, cpu);
 767                 }
 768         }
 769
 770         /*
 771          * Cannot recover? Panic here then.
 772          * This dumps all the mces in the log buffer and stops the
 773          * other CPUs.
 774          */
 775         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 776                 mce_panic("Fatal machine check", m, msg);
 777
 778         /*
 779          * For UC somewhere we let the CPU who detects it handle it.
 780          * Also must let continue the others, otherwise the handling
 781          * CPU could deadlock on a lock.
 782          */
 783
 784         /*
 785          * No machine check event found. Must be some external
 786          * source or one CPU is hung. Panic.
 787          */
 788         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 789                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
 790
 791         /*
 792          * Now clear all the mces_seen so that they don't reappear on
 793          * the next mce.
 794          */
 795         for_each_possible_cpu(cpu)
 796                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 797 }
 798
 799 static atomic_t global_nwo;
 800
 801 /*
 802  * Start of Monarch synchronization. This waits until all CPUs have
 803  * entered the exception handler and then determines if any of them
 804  * saw a fatal event that requires panic. Then it executes them
 805  * in the entry order.
 806  * TBD double check parallel CPU hotunplug
 807  */
/*
 * @no_way_out: in: this CPU's local verdict, added to global_nwo;
 *              out: the value of global_nwo once this CPU may proceed
 *
 * Returns this CPU's call-in order (1 is the Monarch), or -1 when
 * synchronisation is disabled (monarch_timeout == 0) or a wait timed out.
 * On a timeout global_nwo is reset to 0 and *no_way_out is left untouched.
 */
static int mce_start(int *no_way_out)
{
        int order;
        int cpus = num_online_cpus();
        /* Spin budget, consumed in SPINUNIT steps by mce_timed_out(). */
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

        if (!timeout)
                return -1;

        atomic_add(*no_way_out, &global_nwo);
        /*
         * global_nwo should be updated before mce_callin
         */
        smp_wmb();
        order = atomic_inc_return(&mce_callin);

        /*
         * Wait for everyone.
         */
        while (atomic_read(&mce_callin) != cpus) {
                if (mce_timed_out(&timeout,
                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
                        atomic_set(&global_nwo, 0);
                        return -1;
                }
                ndelay(SPINUNIT);
        }

        /*
         * mce_callin should be read before global_nwo
         */
        smp_rmb();

        if (order == 1) {
                /*
                 * Monarch: Starts executing now, the others wait.
                 */
                atomic_set(&mce_executing, 1);
        } else {
                /*
                 * Subject: Now start the scanning loop one by one in
                 * the original callin order.
                 * This way when there are any shared banks it will be
                 * only seen by one CPU before cleared, avoiding duplicates.
                 */
                /* The same timeout budget as the call-in wait keeps counting down. */
                while (atomic_read(&mce_executing) < order) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
                                atomic_set(&global_nwo, 0);
                                return -1;
                        }
                        ndelay(SPINUNIT);
                }
        }

        /*
         * Cache the global no_way_out state.
         */
        *no_way_out = atomic_read(&global_nwo);

        return order;
}
 870
 871 /*
 872  * Synchronize between CPUs after main scanning loop.
 873  * This invokes the bulk of the Monarch processing.
 874  */
 875 static int mce_end(int order)
 876 {
 877         int ret = -1;
 878         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 879
 880         if (!timeout)
 881                 goto reset;
 882         if (order < 0)
 883                 goto reset;
 884
 885         /*
 886          * Allow others to run.
 887          */
 888         atomic_inc(&mce_executing);
 889
 890         if (order == 1) {
 891                 /* CHECKME: Can this race with a parallel hotplug? */
 892                 int cpus = num_online_cpus();
 893
 894                 /*
 895                  * Monarch: Wait for everyone to go through their scanning
 896                  * loops.
 897                  */
 898                 while (atomic_read(&mce_executing) <= cpus) {
 899                         if (mce_timed_out(&timeout,
 900                                           "Timeout: Monarch CPU unable to finish machine check processing"))
 901                                 goto reset;
 902                         ndelay(SPINUNIT);
 903                 }
 904
 905                 mce_reign();
 906                 barrier();
 907                 ret = 0;
 908         } else {
 909                 /*
 910                  * Subject: Wait for Monarch to finish.
 911                  */
 912                 while (atomic_read(&mce_executing) != 0) {
 913                         if (mce_timed_out(&timeout,
 914                                           "Timeout: Monarch CPU did not finish machine check processing"))
 915                                 goto reset;
 916                         ndelay(SPINUNIT);
 917                 }
 918
 919                 /*
 920                  * Don't reset anything. That's done by the Monarch.
 921                  */
 922                 return 0;
 923         }
 924
 925         /*
 926          * Reset all global state.
 927          */
 928 reset:
 929         atomic_set(&global_nwo, 0);
 930         atomic_set(&mce_callin, 0);
 931         barrier();
 932
 933         /*
 934          * Let others run again.
 935          */
 936         atomic_set(&mce_executing, 0);
 937         return ret;
 938 }
 939
 940 /*
 941  * Check if the address reported by the CPU is in a format we can parse.
 942  * It would be possible to add code for most other cases, but all would
 943  * be somewhat complicated (e.g. segment offset would require an instruction
 944  * parser). So only support physical addresses up to page granuality for now.
 945  */
 946 static int mce_usable_address(struct mce *m)
 947 {
 948         if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 949                 return 0;
 950         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 951                 return 0;
 952         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 953                 return 0;
 954         return 1;
 955 }
 956
 957 static void mce_clear_state(unsigned long *toclear)
 958 {
 959         int i;
 960
 961         for (i = 0; i < mca_cfg.banks; i++) {
 962                 if (test_bit(i, toclear))
 963                         mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 964         }
 965 }
 966
 967 /*
 968  * The actual machine check handler. This only handles real
 969  * exceptions when something got corrupted coming in through int 18.
 970  *
 971  * This is executed in NMI context not subject to normal locking rules. This
 972  * implies that most kernel services cannot be safely used. Don't even
 973  * think about putting a printk in there!
 974  *
 975  * On Intel systems this is entered on all CPUs in parallel through
 976  * MCE broadcast. However some CPUs might be broken beyond repair,
 977  * so be always careful when synchronizing with others.
 978  */
 979 void do_machine_check(struct pt_regs *regs, long error_code)
 980 {
 981         struct mca_config *cfg = &mca_cfg;
 982         struct mce m, *final;
 983         int i;
 984         int worst = 0;
 985         int severity;
 986
 987         /*
 988          * Establish sequential order between the CPUs entering the machine
 989          * check handler.
 990          */
 991         int order = -1;
 992         /*
 993          * If no_way_out gets set, there is no safe way to recover from this
 994          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
 995          */
 996         int no_way_out = 0;
 997         /*
 998          * If kill_it gets set, there might be a way to recover from this
 999          * error.
1000          */
1001         int kill_it = 0;
1002         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1003         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1004         char *msg = "Unknown";
1005         u64 recover_paddr = ~0ull;
1006         int flags = MF_ACTION_REQUIRED;
1007
1008         /*
1009          * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1010          * on Intel.
1011          */
1012         int lmce = 1;
1013
1014         /* If this CPU is offline, just bail out. */
1015         if (cpu_is_offline(smp_processor_id())) {
1016                 u64 mcgstatus;
1017
1018                 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1019                 if (mcgstatus & MCG_STATUS_RIPV) {
1020                         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1021                         return;
1022                 }
1023         }
1024
1025         ist_enter(regs);
1026
1027         this_cpu_inc(mce_exception_count);
1028
1029         if (!cfg->banks)
1030                 goto out;
1031
1032         mce_gather_info(&m, regs);
1033
1034         final = this_cpu_ptr(&mces_seen);
1035         *final = m;
1036
1037         memset(valid_banks, 0, sizeof(valid_banks));
1038         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1039
1040         barrier();
1041
1042         /*
1043          * When no restart IP might need to kill or panic.
1044          * Assume the worst for now, but if we find the
1045          * severity is MCE_AR_SEVERITY we have other options.
1046          */
1047         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1048                 kill_it = 1;
1049
1050         /*
1051          * Check if this MCE is signaled to only this logical processor,
1052          * on Intel only.
1053          */
1054         if (m.cpuvendor == X86_VENDOR_INTEL)
1055                 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1056
1057         /*
1058          * Local machine check may already know that we have to panic.
1059          * Broadcast machine check begins rendezvous in mce_start()
1060          * Go through all banks in exclusion of the other CPUs. This way we
1061          * don't report duplicated events on shared banks because the first one
1062          * to see it will clear it.
1063          */
1064         if (lmce) {
1065                 if (no_way_out)
1066                         mce_panic("Fatal local machine check", &m, msg);
1067         } else {
1068                 order = mce_start(&no_way_out);
1069         }
1070
1071         for (i = 0; i < cfg->banks; i++) {
1072                 __clear_bit(i, toclear);
1073                 if (!test_bit(i, valid_banks))
1074                         continue;
1075                 if (!mce_banks[i].ctl)
1076                         continue;
1077
1078                 m.misc = 0;
1079                 m.addr = 0;
1080                 m.bank = i;
1081
1082                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1083                 if ((m.status & MCI_STATUS_VAL) == 0)
1084                         continue;
1085
1086                 /*
1087                  * Non uncorrected or non signaled errors are handled by
1088                  * machine_check_poll. Leave them alone, unless this panics.
1089                  */
1090                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1091                         !no_way_out)
1092                         continue;
1093
1094                 /*
1095                  * Set taint even when machine check was not enabled.
1096                  */
1097                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1098
1099                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1100
1101                 /*
1102                  * When machine check was for corrected/deferred handler don't
1103                  * touch, unless we're panicing.
1104                  */
1105                 if ((severity == MCE_KEEP_SEVERITY ||
1106                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1107                         continue;
1108                 __set_bit(i, toclear);
1109                 if (severity == MCE_NO_SEVERITY) {
1110                         /*
1111                          * Machine check event was not enabled. Clear, but
1112                          * ignore.
1113                          */
1114                         continue;
1115                 }
1116
1117                 mce_read_aux(&m, i);
1118
1119                 /* assuming valid severity level != 0 */
1120                 m.severity = severity;
1121                 m.usable_addr = mce_usable_address(&m);
1122
1123                 mce_log(&m);
1124
1125                 if (severity > worst) {
1126                         *final = m;
1127                         worst = severity;
1128                 }
1129         }
1130
1131         /* mce_clear_state will clear *final, save locally for use later */
1132         m = *final;
1133
1134         if (!no_way_out)
1135                 mce_clear_state(toclear);
1136
1137         /*
1138          * Do most of the synchronization with other CPUs.
1139          * When there's any problem use only local no_way_out state.
1140          */
1141         if (!lmce) {
1142                 if (mce_end(order) < 0)
1143                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1144         } else {
1145                 /*
1146                  * If there was a fatal machine check we should have
1147                  * already called mce_panic earlier in this function.
1148                  * Since we re-read the banks, we might have found
1149                  * something new. Check again to see if we found a
1150                  * fatal error. We call "mce_severity()" again to
1151                  * make sure we have the right "msg".
1152                  */
1153                 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
1154                         mce_severity(&m, cfg->tolerant, &msg, true);
1155                         mce_panic("Local fatal machine check!", &m, msg);
1156                 }
1157         }
1158
1159         /*
1160          * At insane "tolerant" levels we take no action. Otherwise
1161          * we only die if we have no other choice. For less serious
1162          * issues we try to recover, or limit damage to the current
1163          * process.
1164          */
1165         if (cfg->tolerant < 3) {
1166                 if (no_way_out)
1167                         mce_panic("Fatal machine check on current CPU", &m, msg);
1168                 if (worst == MCE_AR_SEVERITY) {
1169                         recover_paddr = m.addr;
1170                         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1171                                 flags |= MF_MUST_KILL;
1172                 } else if (kill_it) {
1173                         force_sig(SIGBUS, current);
1174                 }
1175         }
1176
1177         if (worst > 0)
1178                 mce_report_event(regs);
1179         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1180 out:
1181         sync_core();
1182
1183         if (recover_paddr == ~0ull)
1184                 goto done;
1185
1186         pr_err("Uncorrected hardware memory error in user-access at %llx",
1187                  recover_paddr);
1188         /*
1189          * We must call memory_failure() here even if the current process is
1190          * doomed. We still need to mark the page as poisoned and alert any
1191          * other users of the page.
1192          */
1193         ist_begin_non_atomic(regs);
1194         local_irq_enable();
1195         if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1196                 pr_err("Memory error not recovered");
1197                 force_sig(SIGBUS, current);
1198         }
1199         local_irq_disable();
1200         ist_end_non_atomic();
1201 done:
1202         ist_exit(regs);
1203 }
1204 EXPORT_SYMBOL_GPL(do_machine_check);
1205
1206 #ifndef CONFIG_MEMORY_FAILURE
1207 int memory_failure(unsigned long pfn, int vector, int flags)
1208 {
1209         /* mce_severity() should not hand us an ACTION_REQUIRED error */
1210         BUG_ON(flags & MF_ACTION_REQUIRED);
1211         pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1212                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1213                pfn);
1214
1215         return 0;
1216 }
1217 #endif
1218
/*
 * Work handler for action-optional errors: hands control to
 * mce_gen_pool_process(), which works through the records that
 * do_machine_check() queued in the genpool. The work_struct argument
 * is not used.
 */
static void mce_process_work(struct work_struct *dummy)
{
	mce_gen_pool_process();
}
1228
1229 #ifdef CONFIG_X86_MCE_INTEL
1230 /***
1231  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1232  * @cpu: The CPU on which the event occurred.
1233  * @status: Event status information
1234  *
1235  * This function should be called by the thermal interrupt after the
1236  * event has been processed and the decision was made to log the event
1237  * further.
1238  *
1239  * The status parameter will be saved to the 'status' field of 'struct mce'
1240  * and historically has been the register value of the
1241  * MSR_IA32_THERMAL_STATUS (Intel) msr.
1242  */
1243 void mce_log_therm_throt_event(__u64 status)
1244 {
1245         struct mce m;
1246
1247         mce_setup(&m);
1248         m.bank = MCE_THERMAL_BANK;
1249         m.status = status;
1250         mce_log(&m);
1251 }
1252 #endif /* CONFIG_X86_MCE_INTEL */
1253
1254 /*
1255  * Periodic polling timer for "silent" machine check errors.  If the
1256  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1257  * errors, poll 2x slower (up to check_interval seconds).
1258  */
1259 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1260
1261 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1262 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1263
/* Default interval adjustment: hand the interval back unchanged. */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	return interval;
}

/*
 * Hook used by the polling timer to adjust its next interval. Starts out
 * as the identity function above.
 */
static unsigned long (*mce_adjust_timer)(unsigned long interval) =
	mce_adjust_timer_default;
1270
1271 static void __restart_timer(struct timer_list *t, unsigned long interval)
1272 {
1273         unsigned long when = jiffies + interval;
1274         unsigned long flags;
1275
1276         local_irq_save(flags);
1277
1278         if (timer_pending(t)) {
1279                 if (time_before(when, t->expires))
1280                         mod_timer_pinned(t, when);
1281         } else {
1282                 t->expires = round_jiffies(when);
1283                 add_timer_on(t, smp_processor_id());
1284         }
1285
1286         local_irq_restore(flags);
1287 }
1288
1289 static void mce_timer_fn(unsigned long data)
1290 {
1291         struct timer_list *t = this_cpu_ptr(&mce_timer);
1292         int cpu = smp_processor_id();
1293         unsigned long iv;
1294
1295         WARN_ON(cpu != data);
1296
1297         iv = __this_cpu_read(mce_next_interval);
1298
1299         if (mce_available(this_cpu_ptr(&cpu_info))) {
1300                 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1301
1302                 if (mce_intel_cmci_poll()) {
1303                         iv = mce_adjust_timer(iv);
1304                         goto done;
1305                 }
1306         }
1307
1308         /*
1309          * Alert userspace if needed. If we logged an MCE, reduce the polling
1310          * interval, otherwise increase the polling interval.
1311          */
1312         if (mce_notify_irq())
1313                 iv = max(iv / 2, (unsigned long) HZ/100);
1314         else
1315                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1316
1317 done:
1318         __this_cpu_write(mce_next_interval, iv);
1319         __restart_timer(t, iv);
1320 }
1321
1322 /*
1323  * Ensure that the timer is firing in @interval from now.
1324  */
1325 void mce_timer_kick(unsigned long interval)
1326 {
1327         struct timer_list *t = this_cpu_ptr(&mce_timer);
1328         unsigned long iv = __this_cpu_read(mce_next_interval);
1329
1330         __restart_timer(t, interval);
1331
1332         if (interval < iv)
1333                 __this_cpu_write(mce_next_interval, interval);
1334 }
1335
1336 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1337 static void mce_timer_delete_all(void)
1338 {
1339         int cpu;
1340
1341         for_each_online_cpu(cpu)
1342                 del_timer_sync(&per_cpu(mce_timer, cpu));
1343 }
1344
1345 static void mce_do_trigger(struct work_struct *work)
1346 {
1347         call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1348 }
1349
1350 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1351
1352 /*
1353  * Notify the user(s) about new machine check events.
1354  * Can be called from interrupt context, but not from machine check/NMI
1355  * context.
1356  */
1357 int mce_notify_irq(void)
1358 {
1359         /* Not more than two messages every minute */
1360         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1361
1362         if (test_and_clear_bit(0, &mce_need_notify)) {
1363                 /* wake processes polling /dev/mcelog */
1364                 wake_up_interruptible(&mce_chrdev_wait);
1365
1366                 if (mce_helper[0])
1367                         schedule_work(&mce_trigger_work);
1368
1369                 if (__ratelimit(&ratelimit))
1370                         pr_info(HW_ERR "Machine check events logged\n");
1371
1372                 return 1;
1373         }
1374         return 0;
1375 }
1376 EXPORT_SYMBOL_GPL(mce_notify_irq);
1377
1378 static int __mcheck_cpu_mce_banks_init(void)
1379 {
1380         int i;
1381         u8 num_banks = mca_cfg.banks;
1382
1383         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1384         if (!mce_banks)
1385                 return -ENOMEM;
1386
1387         for (i = 0; i < num_banks; i++) {
1388                 struct mce_bank *b = &mce_banks[i];
1389
1390                 b->ctl = -1ULL;
1391                 b->init = 1;
1392         }
1393         return 0;
1394 }
1395
1396 /*
1397  * Initialize Machine Checks for a CPU.
1398  */
1399 static int __mcheck_cpu_cap_init(void)
1400 {
1401         unsigned b;
1402         u64 cap;
1403
1404         rdmsrl(MSR_IA32_MCG_CAP, cap);
1405
1406         b = cap & MCG_BANKCNT_MASK;
1407         if (!mca_cfg.banks)
1408                 pr_info("CPU supports %d MCE banks\n", b);
1409
1410         if (b > MAX_NR_BANKS) {
1411                 pr_warn("Using only %u machine check banks out of %u\n",
1412                         MAX_NR_BANKS, b);
1413                 b = MAX_NR_BANKS;
1414         }
1415
1416         /* Don't support asymmetric configurations today */
1417         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1418         mca_cfg.banks = b;
1419
1420         if (!mce_banks) {
1421                 int err = __mcheck_cpu_mce_banks_init();
1422
1423                 if (err)
1424                         return err;
1425         }
1426
1427         /* Use accurate RIP reporting if available. */
1428         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1429                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1430
1431         if (cap & MCG_SER_P)
1432                 mca_cfg.ser = true;
1433
1434         return 0;
1435 }
1436
1437 static void __mcheck_cpu_init_generic(void)
1438 {
1439         enum mcp_flags m_fl = 0;
1440         mce_banks_t all_banks;
1441         u64 cap;
1442         int i;
1443
1444         if (!mca_cfg.bootlog)
1445                 m_fl = MCP_DONTLOG;
1446
1447         /*
1448          * Log the machine checks left over from the previous reset.
1449          */
1450         bitmap_fill(all_banks, MAX_NR_BANKS);
1451         machine_check_poll(MCP_UC | m_fl, &all_banks);
1452
1453         cr4_set_bits(X86_CR4_MCE);
1454
1455         rdmsrl(MSR_IA32_MCG_CAP, cap);
1456         if (cap & MCG_CTL_P)
1457                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1458
1459         for (i = 0; i < mca_cfg.banks; i++) {
1460                 struct mce_bank *b = &mce_banks[i];
1461
1462                 if (!b->init)
1463                         continue;
1464                 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1465                 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1466         }
1467 }
1468
1469 /*
1470  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1471  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1472  * Vol 3B Table 15-20). But this confuses both the code that determines
1473  * whether the machine check occurred in kernel or user mode, and also
1474  * the severity assessment code. Pretend that EIPV was set, and take the
1475  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1476  */
1477 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1478 {
1479         if (bank != 0)
1480                 return;
1481         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1482                 return;
1483         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1484                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1485                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1486                           MCACOD)) !=
1487                          (MCI_STATUS_UC|MCI_STATUS_EN|
1488                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1489                           MCI_STATUS_AR|MCACOD_INSTR))
1490                 return;
1491
1492         m->mcgstatus |= MCG_STATUS_EIPV;
1493         m->ip = regs->ip;
1494         m->cs = regs->cs;
1495 }
1496
1497 /* Add per CPU specific workarounds here */
1498 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1499 {
1500         struct mca_config *cfg = &mca_cfg;
1501
1502         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1503                 pr_info("unknown CPU type - not enabling MCE support\n");
1504                 return -EOPNOTSUPP;
1505         }
1506
1507         /* This should be disabled by the BIOS, but isn't always */
1508         if (c->x86_vendor == X86_VENDOR_AMD) {
1509                 if (c->x86 == 15 && cfg->banks > 4) {
1510                         /*
1511                          * disable GART TBL walk error reporting, which
1512                          * trips off incorrectly with the IOMMU & 3ware
1513                          * & Cerberus:
1514                          */
1515                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1516                 }
1517                 if (c->x86 <= 17 && cfg->bootlog < 0) {
1518                         /*
1519                          * Lots of broken BIOS around that don't clear them
1520                          * by default and leave crap in there. Don't log:
1521                          */
1522                         cfg->bootlog = 0;
1523                 }
1524                 /*
1525                  * Various K7s with broken bank 0 around. Always disable
1526                  * by default.
1527                  */
1528                 if (c->x86 == 6 && cfg->banks > 0)
1529                         mce_banks[0].ctl = 0;
1530
1531                 /*
1532                  * overflow_recov is supported for F15h Models 00h-0fh
1533                  * even though we don't have a CPUID bit for it.
1534                  */
1535                 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1536                         mce_flags.overflow_recov = 1;
1537
1538                 /*
1539                  * Turn off MC4_MISC thresholding banks on those models since
1540                  * they're not supported there.
1541                  */
1542                 if (c->x86 == 0x15 &&
1543                     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1544                         int i;
1545                         u64 hwcr;
1546                         bool need_toggle;
1547                         u32 msrs[] = {
1548                                 0x00000413, /* MC4_MISC0 */
1549                                 0xc0000408, /* MC4_MISC1 */
1550                         };
1551
1552                         rdmsrl(MSR_K7_HWCR, hwcr);
1553
1554                         /* McStatusWrEn has to be set */
1555                         need_toggle = !(hwcr & BIT(18));
1556
1557                         if (need_toggle)
1558                                 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1559
1560                         /* Clear CntP bit safely */
1561                         for (i = 0; i < ARRAY_SIZE(msrs); i++)
1562                                 msr_clear_bit(msrs[i], 62);
1563
1564                         /* restore old settings */
1565                         if (need_toggle)
1566                                 wrmsrl(MSR_K7_HWCR, hwcr);
1567                 }
1568         }
1569
1570         if (c->x86_vendor == X86_VENDOR_INTEL) {
1571                 /*
1572                  * SDM documents that on family 6 bank 0 should not be written
1573                  * because it aliases to another special BIOS controlled
1574                  * register.
1575                  * But it's not aliased anymore on model 0x1a+
1576                  * Don't ignore bank 0 completely because there could be a
1577                  * valid event later, merely don't write CTL0.
1578                  */
1579
1580                 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1581                         mce_banks[0].init = 0;
1582
1583                 /*
1584                  * All newer Intel systems support MCE broadcasting. Enable
1585                  * synchronization with a one second timeout.
1586                  */
1587                 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1588                         cfg->monarch_timeout < 0)
1589                         cfg->monarch_timeout = USEC_PER_SEC;
1590
1591                 /*
1592                  * There are also broken BIOSes on some Pentium M and
1593                  * earlier systems:
1594                  */
1595                 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1596                         cfg->bootlog = 0;
1597
1598                 if (c->x86 == 6 && c->x86_model == 45)
1599                         quirk_no_way_out = quirk_sandybridge_ifu;
1600         }
1601         if (cfg->monarch_timeout < 0)
1602                 cfg->monarch_timeout = 0;
1603         if (cfg->bootlog != 0)
1604                 cfg->panic_timeout = 30;
1605
1606         return 0;
1607 }
1608
1609 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1610 {
1611         if (c->x86 != 5)
1612                 return 0;
1613
1614         switch (c->x86_vendor) {
1615         case X86_VENDOR_INTEL:
1616                 intel_p5_mcheck_init(c);
1617                 return 1;
1618                 break;
1619         case X86_VENDOR_CENTAUR:
1620                 winchip_mcheck_init(c);
1621                 return 1;
1622                 break;
1623         default:
1624                 return 0;
1625         }
1626
1627         return 0;
1628 }
1629
1630 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1631 {
1632         switch (c->x86_vendor) {
1633         case X86_VENDOR_INTEL:
1634                 mce_intel_feature_init(c);
1635                 mce_adjust_timer = cmci_intel_adjust_timer;
1636                 break;
1637
1638         case X86_VENDOR_AMD: {
1639                 u32 ebx = cpuid_ebx(0x80000007);
1640
1641                 mce_amd_feature_init(c);
1642                 mce_flags.overflow_recov = !!(ebx & BIT(0));
1643                 mce_flags.succor         = !!(ebx & BIT(1));
1644                 mce_flags.smca           = !!(ebx & BIT(3));
1645
1646                 break;
1647                 }
1648
1649         default:
1650                 break;
1651         }
1652 }
1653
1654 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1655 {
1656         switch (c->x86_vendor) {
1657         case X86_VENDOR_INTEL:
1658                 mce_intel_feature_clear(c);
1659                 break;
1660         default:
1661                 break;
1662         }
1663 }
1664
1665 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1666 {
1667         unsigned long iv = check_interval * HZ;
1668
1669         if (mca_cfg.ignore_ce || !iv)
1670                 return;
1671
1672         per_cpu(mce_next_interval, cpu) = iv;
1673
1674         t->expires = round_jiffies(jiffies + iv);
1675         add_timer_on(t, cpu);
1676 }
1677
1678 static void __mcheck_cpu_init_timer(void)
1679 {
1680         struct timer_list *t = this_cpu_ptr(&mce_timer);
1681         unsigned int cpu = smp_processor_id();
1682
1683         setup_timer(t, mce_timer_fn, cpu);
1684         mce_start_timer(cpu, t);
1685 }
1686
/*
 * Handle unconfigured int18 (should never happen): just report which CPU
 * received it. Neither argument is used.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	int cpu = smp_processor_id();

	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", cpu);
}
1693
1694 /* Call the installed machine check handler for this CPU setup. */
1695 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1696                                                 unexpected_machine_check;
1697
1698 dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1699 {
1700         machine_check_vector(regs, error_code);
1701 }
1702
1703 /*
1704  * Called for each booted CPU to set up machine checks.
1705  * Must be called with preempt off:
1706  */
1707 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1708 {
1709         if (mca_cfg.disabled)
1710                 return;
1711
1712         if (__mcheck_cpu_ancient_init(c))
1713                 return;
1714
1715         if (!mce_available(c))
1716                 return;
1717
1718         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1719                 mca_cfg.disabled = true;
1720                 return;
1721         }
1722
1723         if (mce_gen_pool_init()) {
1724                 mca_cfg.disabled = true;
1725                 pr_emerg("Couldn't allocate MCE records pool!\n");
1726                 return;
1727         }
1728
1729         machine_check_vector = do_machine_check;
1730
1731         __mcheck_cpu_init_generic();
1732         __mcheck_cpu_init_vendor(c);
1733         __mcheck_cpu_init_timer();
1734 }
1735
1736 /*
1737  * Called for each booted CPU to clear some machine checks opt-ins
1738  */
1739 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1740 {
1741         if (mca_cfg.disabled)
1742                 return;
1743
1744         if (!mce_available(c))
1745                 return;
1746
1747         /*
1748          * Possibly to clear general settings generic to x86
1749          * __mcheck_cpu_clear_generic(c);
1750          */
1751         __mcheck_cpu_clear_vendor(c);
1752
1753 }
1754
/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

/*
 * Open-state bookkeeping for /dev/mcelog. Both counters are only touched
 * with mce_chrdev_state_lock held (see mce_chrdev_open()/_release()).
 */
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;       /* #times opened */
static int mce_chrdev_open_exclu;       /* already open exclusive? */
1762
1763 static int mce_chrdev_open(struct inode *inode, struct file *file)
1764 {
1765         spin_lock(&mce_chrdev_state_lock);
1766
1767         if (mce_chrdev_open_exclu ||
1768             (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1769                 spin_unlock(&mce_chrdev_state_lock);
1770
1771                 return -EBUSY;
1772         }
1773
1774         if (file->f_flags & O_EXCL)
1775                 mce_chrdev_open_exclu = 1;
1776         mce_chrdev_open_count++;
1777
1778         spin_unlock(&mce_chrdev_state_lock);
1779
1780         return nonseekable_open(inode, file);
1781 }
1782
1783 static int mce_chrdev_release(struct inode *inode, struct file *file)
1784 {
1785         spin_lock(&mce_chrdev_state_lock);
1786
1787         mce_chrdev_open_count--;
1788         mce_chrdev_open_exclu = 0;
1789
1790         spin_unlock(&mce_chrdev_state_lock);
1791
1792         return 0;
1793 }
1794
/*
 * Cross-CPU callback: store the calling CPU's current TSC value in the
 * slot of the array at @data that is indexed by its CPU number.
 */
static void collect_tscs(void *data)
{
        unsigned long *tsc_by_cpu = data;
        int cpu = smp_processor_id();

        tsc_by_cpu[cpu] = rdtsc();
}
1801
/*
 * Set by __mce_read_apei() once APEI reports an error or no more records,
 * or once clearing a record fails; readers then skip the APEI step.
 */
static int mce_apei_read_done;
1803
1804 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1805 static int __mce_read_apei(char __user **ubuf, size_t usize)
1806 {
1807         int rc;
1808         u64 record_id;
1809         struct mce m;
1810
1811         if (usize < sizeof(struct mce))
1812                 return -EINVAL;
1813
1814         rc = apei_read_mce(&m, &record_id);
1815         /* Error or no more MCE record */
1816         if (rc <= 0) {
1817                 mce_apei_read_done = 1;
1818                 /*
1819                  * When ERST is disabled, mce_chrdev_read() should return
1820                  * "no record" instead of "no device."
1821                  */
1822                 if (rc == -ENODEV)
1823                         return 0;
1824                 return rc;
1825         }
1826         rc = -EFAULT;
1827         if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1828                 return rc;
1829         /*
1830          * In fact, we should have cleared the record after that has
1831          * been flushed to the disk or sent to network in
1832          * /sbin/mcelog, but we have no interface to support that now,
1833          * so just clear it to avoid duplication.
1834          */
1835         rc = apei_clear_mce(record_id);
1836         if (rc) {
1837                 mce_apei_read_done = 1;
1838                 return rc;
1839         }
1840         *ubuf += sizeof(struct mce);
1841
1842         return 0;
1843 }
1844
1845 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1846                                 size_t usize, loff_t *off)
1847 {
1848         char __user *buf = ubuf;
1849         unsigned long *cpu_tsc;
1850         unsigned prev, next;
1851         int i, err;
1852
1853         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1854         if (!cpu_tsc)
1855                 return -ENOMEM;
1856
1857         mutex_lock(&mce_chrdev_read_mutex);
1858
1859         if (!mce_apei_read_done) {
1860                 err = __mce_read_apei(&buf, usize);
1861                 if (err || buf != ubuf)
1862                         goto out;
1863         }
1864
1865         next = mce_log_get_idx_check(mcelog.next);
1866
1867         /* Only supports full reads right now */
1868         err = -EINVAL;
1869         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1870                 goto out;
1871
1872         err = 0;
1873         prev = 0;
1874         do {
1875                 for (i = prev; i < next; i++) {
1876                         unsigned long start = jiffies;
1877                         struct mce *m = &mcelog.entry[i];
1878
1879                         while (!m->finished) {
1880                                 if (time_after_eq(jiffies, start + 2)) {
1881                                         memset(m, 0, sizeof(*m));
1882                                         goto timeout;
1883                                 }
1884                                 cpu_relax();
1885                         }
1886                         smp_rmb();
1887                         err |= copy_to_user(buf, m, sizeof(*m));
1888                         buf += sizeof(*m);
1889 timeout:
1890                         ;
1891                 }
1892
1893                 memset(mcelog.entry + prev, 0,
1894                        (next - prev) * sizeof(struct mce));
1895                 prev = next;
1896                 next = cmpxchg(&mcelog.next, prev, 0);
1897         } while (next != prev);
1898
1899         synchronize_sched();
1900
1901         /*
1902          * Collect entries that were still getting written before the
1903          * synchronize.
1904          */
1905         on_each_cpu(collect_tscs, cpu_tsc, 1);
1906
1907         for (i = next; i < MCE_LOG_LEN; i++) {
1908                 struct mce *m = &mcelog.entry[i];
1909
1910                 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1911                         err |= copy_to_user(buf, m, sizeof(*m));
1912                         smp_rmb();
1913                         buf += sizeof(*m);
1914                         memset(m, 0, sizeof(*m));
1915                 }
1916         }
1917
1918         if (err)
1919                 err = -EFAULT;
1920
1921 out:
1922         mutex_unlock(&mce_chrdev_read_mutex);
1923         kfree(cpu_tsc);
1924
1925         return err ? err : buf - ubuf;
1926 }
1927
1928 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1929 {
1930         poll_wait(file, &mce_chrdev_wait, wait);
1931         if (READ_ONCE(mcelog.next))
1932                 return POLLIN | POLLRDNORM;
1933         if (!mce_apei_read_done && apei_check_mce())
1934                 return POLLIN | POLLRDNORM;
1935         return 0;
1936 }
1937
1938 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1939                                 unsigned long arg)
1940 {
1941         int __user *p = (int __user *)arg;
1942
1943         if (!capable(CAP_SYS_ADMIN))
1944                 return -EPERM;
1945
1946         switch (cmd) {
1947         case MCE_GET_RECORD_LEN:
1948                 return put_user(sizeof(struct mce), p);
1949         case MCE_GET_LOG_LEN:
1950                 return put_user(MCE_LOG_LEN, p);
1951         case MCE_GETCLEAR_FLAGS: {
1952                 unsigned flags;
1953
1954                 do {
1955                         flags = mcelog.flags;
1956                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1957
1958                 return put_user(flags, p);
1959         }
1960         default:
1961                 return -ENOTTY;
1962         }
1963 }
1964
/*
 * Optional write handler for /dev/mcelog, installed through
 * register_mce_write_callback(). While NULL, writes fail with -EINVAL.
 */
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
                            size_t usize, loff_t *off);
1967
/*
 * Install @fn as the function mce_chrdev_write() forwards writes to.
 * The pointer is stored without any locking; a later call simply replaces
 * the previous handler.
 */
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
                             const char __user *ubuf,
                             size_t usize, loff_t *off))
{
        mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
1975
1976 static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1977                                 size_t usize, loff_t *off)
1978 {
1979         if (mce_write)
1980                 return mce_write(filp, ubuf, usize, off);
1981         else
1982                 return -EINVAL;
1983 }
1984
1985 static const struct file_operations mce_chrdev_ops = {
1986         .open                   = mce_chrdev_open,
1987         .release                = mce_chrdev_release,
1988         .read                   = mce_chrdev_read,
1989         .write                  = mce_chrdev_write,
1990         .poll                   = mce_chrdev_poll,
1991         .unlocked_ioctl         = mce_chrdev_ioctl,
1992         .llseek                 = no_llseek,
1993 };
1994
1995 static struct miscdevice mce_chrdev_device = {
1996         MISC_MCELOG_MINOR,
1997         "mcelog",
1998         &mce_chrdev_ops,
1999 };
2000
2001 static void __mce_disable_bank(void *arg)
2002 {
2003         int bank = *((int *)arg);
2004         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2005         cmci_disable_bank(bank);
2006 }
2007
2008 void mce_disable_bank(int bank)
2009 {
2010         if (bank >= mca_cfg.banks) {
2011                 pr_warn(FW_BUG
2012                         "Ignoring request to disable invalid MCA bank %d.\n",
2013                         bank);
2014                 return;
2015         }
2016         set_bit(bank, mce_banks_ce_disabled);
2017         on_each_cpu(__mce_disable_bank, &bank, 1);
2018 }
2019
2020 /*
2021  * mce=off Disables machine check
2022  * mce=no_cmci Disables CMCI
2023  * mce=no_lmce Disables LMCE
2024  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2025  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2026  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2027  *      monarchtimeout is how long to wait for other CPUs on machine
2028  *      check, or 0 to not wait
2029  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2030  * mce=nobootlog Don't log MCEs from before booting.
2031  * mce=bios_cmci_threshold Don't program the CMCI threshold
2032  */
2033 static int __init mcheck_enable(char *str)
2034 {
2035         struct mca_config *cfg = &mca_cfg;
2036
2037         if (*str == 0) {
2038                 enable_p5_mce();
2039                 return 1;
2040         }
2041         if (*str == '=')
2042                 str++;
2043         if (!strcmp(str, "off"))
2044                 cfg->disabled = true;
2045         else if (!strcmp(str, "no_cmci"))
2046                 cfg->cmci_disabled = true;
2047         else if (!strcmp(str, "no_lmce"))
2048                 cfg->lmce_disabled = true;
2049         else if (!strcmp(str, "dont_log_ce"))
2050                 cfg->dont_log_ce = true;
2051         else if (!strcmp(str, "ignore_ce"))
2052                 cfg->ignore_ce = true;
2053         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2054                 cfg->bootlog = (str[0] == 'b');
2055         else if (!strcmp(str, "bios_cmci_threshold"))
2056                 cfg->bios_cmci_threshold = true;
2057         else if (isdigit(str[0])) {
2058                 if (get_option(&str, &cfg->tolerant) == 2)
2059                         get_option(&str, &(cfg->monarch_timeout));
2060         } else {
2061                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2062                 return 0;
2063         }
2064         return 1;
2065 }
2066 __setup("mce", mcheck_enable);
2067
/*
 * Boot-time setup: runs the Intel thermal and vendor severity init hooks,
 * registers mce_srao_nb on the decode chain and initialises mce_work and
 * mce_irq_work with their handlers. Always returns 0.
 */
int __init mcheck_init(void)
{
        mcheck_intel_therm_init();
        mce_register_decode_chain(&mce_srao_nb);
        mcheck_vendor_init_severity();

        INIT_WORK(&mce_work, mce_process_work);
        init_irq_work(&mce_irq_work, mce_irq_work_cb);

        return 0;
}
2079
2080 /*
2081  * mce_syscore: PM support
2082  */
2083
2084 /*
2085  * Disable machine checks on suspend and shutdown. We can't really handle
2086  * them later.
2087  */
2088 static void mce_disable_error_reporting(void)
2089 {
2090         int i;
2091
2092         for (i = 0; i < mca_cfg.banks; i++) {
2093                 struct mce_bank *b = &mce_banks[i];
2094
2095                 if (b->init)
2096                         wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2097         }
2098         return;
2099 }
2100
2101 static void vendor_disable_error_reporting(void)
2102 {
2103         /*
2104          * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
2105          * Disabling them for just a single offlined CPU is bad, since it will
2106          * inhibit reporting for all shared resources on the socket like the
2107          * last level cache (LLC), the integrated memory controller (iMC), etc.
2108          */
2109         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2110                 return;
2111
2112         mce_disable_error_reporting();
2113 }
2114
/*
 * syscore suspend hook: turn error reporting off through
 * vendor_disable_error_reporting(). Always returns 0.
 */
static int mce_syscore_suspend(void)
{
        vendor_disable_error_reporting();
        return 0;
}
2120
/* syscore shutdown hook: same action as the suspend hook. */
static void mce_syscore_shutdown(void)
{
        vendor_disable_error_reporting();
}
2125
2126 /*
2127  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2128  * Only one CPU is active at this time, the others get re-added later using
2129  * CPU hotplug:
2130  */
2131 static void mce_syscore_resume(void)
2132 {
2133         __mcheck_cpu_init_generic();
2134         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2135 }
2136
/* PM hooks; registered from mcheck_init_device(). */
static struct syscore_ops mce_syscore_ops = {
        .suspend        = mce_syscore_suspend,
        .shutdown       = mce_syscore_shutdown,
        .resume         = mce_syscore_resume,
};
2142
2143 /*
2144  * mce_device: Sysfs support
2145  */
2146
2147 static void mce_cpu_restart(void *data)
2148 {
2149         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2150                 return;
2151         __mcheck_cpu_init_generic();
2152         __mcheck_cpu_init_timer();
2153 }
2154
/*
 * Reinit MCEs after user configuration changes.
 *
 * Deletes all MCE timers, then runs mce_cpu_restart() on every CPU.
 * The sysfs store handlers are the callers in this file.
 */
static void mce_restart(void)
{
        mce_timer_delete_all();
        on_each_cpu(mce_cpu_restart, NULL, 1);
}
2161
2162 /* Toggle features for corrected errors */
2163 static void mce_disable_cmci(void *data)
2164 {
2165         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2166                 return;
2167         cmci_clear();
2168 }
2169
2170 static void mce_enable_ce(void *all)
2171 {
2172         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2173                 return;
2174         cmci_reenable();
2175         cmci_recheck();
2176         if (all)
2177                 __mcheck_cpu_init_timer();
2178 }
2179
/*
 * Subsystem the per-CPU MCE devices hang off: mce_device_create() sets it
 * as each device's bus and mcheck_init_device() registers it.
 */
static struct bus_type mce_subsys = {
        .name           = "machinecheck",
        .dev_name       = "machinecheck",
};
2184
/*
 * Per-CPU MCE device; set by mce_device_create() and reset to NULL by
 * mce_device_remove().
 */
DEFINE_PER_CPU(struct device *, mce_device);

/*
 * Optional hook, called from mce_cpu_callback() on CPU_ONLINE and
 * CPU_DEAD when it is non-NULL.
 */
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2188
/* Map a bank's embedded device_attribute back to its struct mce_bank. */
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
        return container_of(attr, struct mce_bank, attr);
}
2193
2194 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2195                          char *buf)
2196 {
2197         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2198 }
2199
2200 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2201                         const char *buf, size_t size)
2202 {
2203         u64 new;
2204
2205         if (kstrtou64(buf, 0, &new) < 0)
2206                 return -EINVAL;
2207
2208         attr_to_bank(attr)->ctl = new;
2209         mce_restart();
2210
2211         return size;
2212 }
2213
2214 static ssize_t
2215 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2216 {
2217         strcpy(buf, mce_helper);
2218         strcat(buf, "\n");
2219         return strlen(mce_helper) + 1;
2220 }
2221
2222 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2223                                 const char *buf, size_t siz)
2224 {
2225         char *p;
2226
2227         strncpy(mce_helper, buf, sizeof(mce_helper));
2228         mce_helper[sizeof(mce_helper)-1] = 0;
2229         p = strchr(mce_helper, '\n');
2230
2231         if (p)
2232                 *p = 0;
2233
2234         return strlen(mce_helper) + !!p;
2235 }
2236
2237 static ssize_t set_ignore_ce(struct device *s,
2238                              struct device_attribute *attr,
2239                              const char *buf, size_t size)
2240 {
2241         u64 new;
2242
2243         if (kstrtou64(buf, 0, &new) < 0)
2244                 return -EINVAL;
2245
2246         mutex_lock(&mce_sysfs_mutex);
2247         if (mca_cfg.ignore_ce ^ !!new) {
2248                 if (new) {
2249                         /* disable ce features */
2250                         mce_timer_delete_all();
2251                         on_each_cpu(mce_disable_cmci, NULL, 1);
2252                         mca_cfg.ignore_ce = true;
2253                 } else {
2254                         /* enable ce features */
2255                         mca_cfg.ignore_ce = false;
2256                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2257                 }
2258         }
2259         mutex_unlock(&mce_sysfs_mutex);
2260
2261         return size;
2262 }
2263
2264 static ssize_t set_cmci_disabled(struct device *s,
2265                                  struct device_attribute *attr,
2266                                  const char *buf, size_t size)
2267 {
2268         u64 new;
2269
2270         if (kstrtou64(buf, 0, &new) < 0)
2271                 return -EINVAL;
2272
2273         mutex_lock(&mce_sysfs_mutex);
2274         if (mca_cfg.cmci_disabled ^ !!new) {
2275                 if (new) {
2276                         /* disable cmci */
2277                         on_each_cpu(mce_disable_cmci, NULL, 1);
2278                         mca_cfg.cmci_disabled = true;
2279                 } else {
2280                         /* enable cmci */
2281                         mca_cfg.cmci_disabled = false;
2282                         on_each_cpu(mce_enable_ce, NULL, 1);
2283                 }
2284         }
2285         mutex_unlock(&mce_sysfs_mutex);
2286
2287         return size;
2288 }
2289
2290 static ssize_t store_int_with_restart(struct device *s,
2291                                       struct device_attribute *attr,
2292                                       const char *buf, size_t size)
2293 {
2294         unsigned long old_check_interval = check_interval;
2295         ssize_t ret = device_store_ulong(s, attr, buf, size);
2296
2297         if (check_interval == old_check_interval)
2298                 return ret;
2299
2300         mutex_lock(&mce_sysfs_mutex);
2301         mce_restart();
2302         mutex_unlock(&mce_sysfs_mutex);
2303
2304         return ret;
2305 }
2306
2307 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2308 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2309 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2310 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2311
2312 static struct dev_ext_attribute dev_attr_check_interval = {
2313         __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2314         &check_interval
2315 };
2316
2317 static struct dev_ext_attribute dev_attr_ignore_ce = {
2318         __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2319         &mca_cfg.ignore_ce
2320 };
2321
2322 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2323         __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2324         &mca_cfg.cmci_disabled
2325 };
2326
/*
 * NULL-terminated list of the attributes that mce_device_create() adds to
 * every per-CPU device (the per-bank files are added separately).
 */
static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
        &dev_attr_trigger,
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};
2337
/*
 * CPUs whose MCE device was created successfully: set at the end of
 * mce_device_create(), tested and cleared in mce_device_remove().
 */
static cpumask_var_t mce_device_initialized;
2339
/*
 * Release callback installed on every device by mce_device_create():
 * frees the struct device that was allocated there.
 */
static void mce_device_release(struct device *dev)
{
        kfree(dev);
}
2344
2345 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2346 static int mce_device_create(unsigned int cpu)
2347 {
2348         struct device *dev;
2349         int err;
2350         int i, j;
2351
2352         if (!mce_available(&boot_cpu_data))
2353                 return -EIO;
2354
2355         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2356         if (!dev)
2357                 return -ENOMEM;
2358         dev->id  = cpu;
2359         dev->bus = &mce_subsys;
2360         dev->release = &mce_device_release;
2361
2362         err = device_register(dev);
2363         if (err) {
2364                 put_device(dev);
2365                 return err;
2366         }
2367
2368         for (i = 0; mce_device_attrs[i]; i++) {
2369                 err = device_create_file(dev, mce_device_attrs[i]);
2370                 if (err)
2371                         goto error;
2372         }
2373         for (j = 0; j < mca_cfg.banks; j++) {
2374                 err = device_create_file(dev, &mce_banks[j].attr);
2375                 if (err)
2376                         goto error2;
2377         }
2378         cpumask_set_cpu(cpu, mce_device_initialized);
2379         per_cpu(mce_device, cpu) = dev;
2380
2381         return 0;
2382 error2:
2383         while (--j >= 0)
2384                 device_remove_file(dev, &mce_banks[j].attr);
2385 error:
2386         while (--i >= 0)
2387                 device_remove_file(dev, mce_device_attrs[i]);
2388
2389         device_unregister(dev);
2390
2391         return err;
2392 }
2393
2394 static void mce_device_remove(unsigned int cpu)
2395 {
2396         struct device *dev = per_cpu(mce_device, cpu);
2397         int i;
2398
2399         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2400                 return;
2401
2402         for (i = 0; mce_device_attrs[i]; i++)
2403                 device_remove_file(dev, mce_device_attrs[i]);
2404
2405         for (i = 0; i < mca_cfg.banks; i++)
2406                 device_remove_file(dev, &mce_banks[i].attr);
2407
2408         device_unregister(dev);
2409         cpumask_clear_cpu(cpu, mce_device_initialized);
2410         per_cpu(mce_device, cpu) = NULL;
2411 }
2412
2413 /* Make sure there are no machine checks on offlined CPUs. */
2414 static void mce_disable_cpu(void *h)
2415 {
2416         unsigned long action = *(unsigned long *)h;
2417
2418         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2419                 return;
2420
2421         if (!(action & CPU_TASKS_FROZEN))
2422                 cmci_clear();
2423
2424         vendor_disable_error_reporting();
2425 }
2426
2427 static void mce_reenable_cpu(void *h)
2428 {
2429         unsigned long action = *(unsigned long *)h;
2430         int i;
2431
2432         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2433                 return;
2434
2435         if (!(action & CPU_TASKS_FROZEN))
2436                 cmci_reenable();
2437         for (i = 0; i < mca_cfg.banks; i++) {
2438                 struct mce_bank *b = &mce_banks[i];
2439
2440                 if (b->init)
2441                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2442         }
2443 }
2444
2445 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2446 static int
2447 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2448 {
2449         unsigned int cpu = (unsigned long)hcpu;
2450         struct timer_list *t = &per_cpu(mce_timer, cpu);
2451
2452         switch (action & ~CPU_TASKS_FROZEN) {
2453         case CPU_ONLINE:
2454                 mce_device_create(cpu);
2455                 if (threshold_cpu_callback)
2456                         threshold_cpu_callback(action, cpu);
2457                 break;
2458         case CPU_DEAD:
2459                 if (threshold_cpu_callback)
2460                         threshold_cpu_callback(action, cpu);
2461                 mce_device_remove(cpu);
2462                 mce_intel_hcpu_update(cpu);
2463
2464                 /* intentionally ignoring frozen here */
2465                 if (!(action & CPU_TASKS_FROZEN))
2466                         cmci_rediscover();
2467                 break;
2468         case CPU_DOWN_PREPARE:
2469                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2470                 del_timer_sync(t);
2471                 break;
2472         case CPU_DOWN_FAILED:
2473                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2474                 mce_start_timer(cpu, t);
2475                 break;
2476         }
2477
2478         return NOTIFY_OK;
2479 }
2480
/* Hotplug notifier block that routes CPU events to mce_cpu_callback(). */
static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};
2484
2485 static __init void mce_init_banks(void)
2486 {
2487         int i;
2488
2489         for (i = 0; i < mca_cfg.banks; i++) {
2490                 struct mce_bank *b = &mce_banks[i];
2491                 struct device_attribute *a = &b->attr;
2492
2493                 sysfs_attr_init(&a->attr);
2494                 a->attr.name    = b->attrname;
2495                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2496
2497                 a->attr.mode    = 0644;
2498                 a->show         = show_bank;
2499                 a->store        = set_bank;
2500         }
2501 }
2502
2503 static __init int mcheck_init_device(void)
2504 {
2505         int err;
2506         int i = 0;
2507
2508         if (!mce_available(&boot_cpu_data)) {
2509                 err = -EIO;
2510                 goto err_out;
2511         }
2512
2513         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2514                 err = -ENOMEM;
2515                 goto err_out;
2516         }
2517
2518         mce_init_banks();
2519
2520         err = subsys_system_register(&mce_subsys, NULL);
2521         if (err)
2522                 goto err_out_mem;
2523
2524         cpu_notifier_register_begin();
2525         for_each_online_cpu(i) {
2526                 err = mce_device_create(i);
2527                 if (err) {
2528                         /*
2529                          * Register notifier anyway (and do not unreg it) so
2530                          * that we don't leave undeleted timers, see notifier
2531                          * callback above.
2532                          */
2533                         __register_hotcpu_notifier(&mce_cpu_notifier);
2534                         cpu_notifier_register_done();
2535                         goto err_device_create;
2536                 }
2537         }
2538
2539         __register_hotcpu_notifier(&mce_cpu_notifier);
2540         cpu_notifier_register_done();
2541
2542         register_syscore_ops(&mce_syscore_ops);
2543
2544         /* register character device /dev/mcelog */
2545         err = misc_register(&mce_chrdev_device);
2546         if (err)
2547                 goto err_register;
2548
2549         return 0;
2550
2551 err_register:
2552         unregister_syscore_ops(&mce_syscore_ops);
2553
2554 err_device_create:
2555         /*
2556          * We didn't keep track of which devices were created above, but
2557          * even if we had, the set of online cpus might have changed.
2558          * Play safe and remove for every possible cpu, since
2559          * mce_device_remove() will do the right thing.
2560          */
2561         for_each_possible_cpu(i)
2562                 mce_device_remove(i);
2563
2564 err_out_mem:
2565         free_cpumask_var(mce_device_initialized);
2566
2567 err_out:
2568         pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2569
2570         return err;
2571 }
2572 device_initcall_sync(mcheck_init_device);
2573
/*
 * Old style boot options parsing. Only for compatibility.
 *
 * "nomce" sets mca_cfg.disabled, the same flag that "mce=off" sets in
 * mcheck_enable(). @str is ignored; always returns 1.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = true;
        return 1;
}
__setup("nomce", mcheck_disable);
2583
2584 #ifdef CONFIG_DEBUG_FS
2585 struct dentry *mce_get_debugfs_dir(void)
2586 {
2587         static struct dentry *dmce;
2588
2589         if (!dmce)
2590                 dmce = debugfs_create_dir("mce", NULL);
2591
2592         return dmce;
2593 }
2594
/*
 * Zero cpu_missing and the mce_fake_panicked, mce_executing, mce_callin
 * and global_nwo counters. Called from fake_panic_set() before the new
 * fake_panic value is stored.
 */
static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_panicked, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}
2603
/* debugfs getter: report the current fake_panic value in *val. */
static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}
2609
/*
 * debugfs setter: reset the MCE bookkeeping via mce_reset(), then store
 * the new fake_panic value. Always returns 0.
 */
static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}
2616
/* File operations for the "fake_panic" debugfs file. */
DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");
2619
2620 static int __init mcheck_debugfs_init(void)
2621 {
2622         struct dentry *dmce, *ffake_panic;
2623
2624         dmce = mce_get_debugfs_dir();
2625         if (!dmce)
2626                 return -ENOMEM;
2627         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2628                                           &fake_panic_fops);
2629         if (!ffake_panic)
2630                 return -ENOMEM;
2631
2632         return 0;
2633 }
2634 #else
/* No debugfs: nothing to create. The caller ignores the return value. */
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2636 #endif
2637
/*
 * Late initcall: set up the debugfs entries (the result is deliberately
 * not checked) and schedule the MCE work. Always returns 0.
 */
static int __init mcheck_late_init(void)
{
        mcheck_debugfs_init();

        /*
         * Flush out everything that has been logged during early boot, now that
         * everything has been initialized (workqueues, decoders, ...).
         */
        mce_schedule_work();

        return 0;
}
late_initcall(mcheck_late_init);