OSDN Git Service

x86/mce: Add support for deferred errors on AMD
author: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Wed, 6 May 2015 11:58:55 +0000 (06:58 -0500)
committer: Borislav Petkov <bp@suse.de>
Wed, 6 May 2015 18:34:31 +0000 (20:34 +0200)
Deferred errors indicate error conditions that were not corrected, but
those errors have not been consumed yet. They require no action from
S/W (or action is optional). These errors provide info about a latent
uncorrectable MCE that can occur when poisoned data is consumed by the
processor.

Newer AMD processors can generate deferred errors and can be configured
to generate APIC interrupts on such events.

SUCCOR stands for S/W UnCorrectable error COntainment and Recovery.
It indicates support for data poisoning in HW and deferred error
interrupts.

Add a new bitfield to mce_vendor_flags for this. We use this to verify
presence of deferred error interrupts before we enable them in mce_amd.c.

While at it, clarify comments in mce_vendor_flags to indicate how the
bitfields are used.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86-ml <x86@kernel.org>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1430913538-1415-4-git-send-email-Aravind.Gopalakrishnan@amd.com
[ beef up commit message, do CPUID(8000_0007) only once. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
arch/x86/include/asm/mce.h
arch/x86/kernel/cpu/mcheck/mce.c

index 1f5a86d..407ced6 100644 (file)
@@ -117,8 +117,19 @@ struct mca_config {
 };
 
 struct mce_vendor_flags {
-       __u64           overflow_recov  : 1, /* cpuid_ebx(80000007) */
-                       __reserved_0    : 63;
+                       /*
+                        * overflow recovery cpuid bit indicates that overflow
+                        * conditions are not fatal
+                        */
+       __u64           overflow_recov  : 1,
+
+                       /*
+                        * SUCCOR stands for S/W UnCorrectable error COntainment
+                        * and Recovery. It indicates support for data poisoning
+                        * in HW and deferred error interrupts.
+                        */
+                       succor          : 1,
+                       __reserved_0    : 62;
 };
 extern struct mce_vendor_flags mce_flags;
 
index e535533..521e501 100644 (file)
@@ -1637,10 +1637,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
                mce_intel_feature_init(c);
                mce_adjust_timer = cmci_intel_adjust_timer;
                break;
-       case X86_VENDOR_AMD:
+
+       case X86_VENDOR_AMD: {
+               u32 ebx = cpuid_ebx(0x80000007);
+
                mce_amd_feature_init(c);
-               mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
+               mce_flags.overflow_recov = !!(ebx & BIT(0));
+               mce_flags.succor         = !!(ebx & BIT(1));
                break;
+               }
+
        default:
                break;
        }