2 * Copyright 2017 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
24 #include "CUnit/Basic.h"
26 #include "amdgpu_test.h"
27 #include "amdgpu_drm.h"
28 #include "amdgpu_internal.h"
/*
 * Table of RAS block name strings, indexed by RAS block number. The names
 * are used further down to build per-block file names such as
 * "<name>_err_count" and "<name>_err_inject".
 */
34 const char *ras_block_string[] = {
/* Look up the name string for RAS block index i (no bounds checking). */
51 #define ras_block_str(i) (ras_block_string[i])
/*
 * RAS block identifiers. The numeric value of each enumerator is used as a
 * bit position in the capability and test masks (see the "1 << block"
 * tests later in this file).
 */
53 enum amdgpu_ras_block {
54 AMDGPU_RAS_BLOCK__UMC = 0,
55 AMDGPU_RAS_BLOCK__SDMA,
56 AMDGPU_RAS_BLOCK__GFX,
57 AMDGPU_RAS_BLOCK__MMHUB,
58 AMDGPU_RAS_BLOCK__ATHUB,
59 AMDGPU_RAS_BLOCK__PCIE_BIF,
60 AMDGPU_RAS_BLOCK__HDP,
61 AMDGPU_RAS_BLOCK__XGMI_WAFL,
63 AMDGPU_RAS_BLOCK__SMN,
64 AMDGPU_RAS_BLOCK__SEM,
65 AMDGPU_RAS_BLOCK__MP0,
66 AMDGPU_RAS_BLOCK__MP1,
67 AMDGPU_RAS_BLOCK__FUSE,
/* Sentinel: one past the last block; used as the loop bound over blocks. */
69 AMDGPU_RAS_BLOCK__LAST
/* Number of RAS blocks, and a mask with one bit set for every block. */
72 #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST
73 #define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
/*
 * RAS error kinds. The non-zero values visible here (2, 4, 8) are distinct
 * single bits; the tests in this file only use MULTI_UNCORRECTABLE.
 */
75 enum amdgpu_ras_error_type {
76 AMDGPU_RAS_ERROR__NONE = 0,
77 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2,
78 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4,
79 AMDGPU_RAS_ERROR__POISON = 8,
/*
 * Common request header: identifies the RAS block, the error type and the
 * sub-block index a request applies to. Embedded as "head" in the other
 * request structures below.
 */
82 struct ras_common_if {
83 enum amdgpu_ras_block block;
84 enum amdgpu_ras_error_type type;
85 uint32_t sub_block_index;
/* Error-injection request; begins with the common request header. */
89 struct ras_inject_if {
90 struct ras_common_if head;
/*
 * NOTE(review): the two members below appear to belong to a different
 * aggregate whose opening line is not visible here -- presumably
 * struct ras_debug_if, since the tests assign "data.inject" on a
 * struct ras_debug_if. Confirm against the complete file.
 */
97 struct ras_common_if head;
98 struct ras_inject_if inject;
/*
 * Default per-test block masks; bit N corresponds to RAS block N.
 * For now, only the UMC, GFX and SDMA blocks are implemented.
 *
 * The constants are unsigned because they are stored in uint32_t masks,
 * and every term is parenthesized the same way.
 */
#define DEFAULT_RAS_BLOCK_MASK_INJECT (1U << AMDGPU_RAS_BLOCK__UMC)
#define DEFAULT_RAS_BLOCK_MASK_QUERY (1U << AMDGPU_RAS_BLOCK__UMC)
#define DEFAULT_RAS_BLOCK_MASK_BASIC ((1U << AMDGPU_RAS_BLOCK__UMC) |\
		(1U << AMDGPU_RAS_BLOCK__SDMA) |\
		(1U << AMDGPU_RAS_BLOCK__GFX))
109 static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT;
110 static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_INJECT;
111 static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC;
/*
 * Per-device test masks, one bit per RAS block. Besides inject_mask,
 * set_test_card() also reads query_mask and basic_mask from this structure
 * (those member declarations are not visible in this listing).
 */
113 struct ras_test_mask {
114 uint32_t inject_mask;
/*
 * State recorded for each RAS-capable card by suite_ras_tests_init(): the
 * device handle, the debugfs/sysfs id, the capability mask and the test
 * masks (id and capability declarations are not visible in this listing).
 */
119 struct amdgpu_ras_data {
120 amdgpu_device_handle device_handle;
123 struct ras_test_mask test_mask;
126 /* All devices that support RAS. */
127 static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED];
/* Number of valid entries at the front of devices[]. */
128 static int devices_count;
/*
 * Associates a PCI device id / revision id pair with the test masks to use
 * for that device (the device_id member, compared in
 * amdgpu_ras_get_test_mask(), is not visible in this listing).
 */
130 struct ras_DID_test_mask{
132 uint16_t revision_id;
133 struct ras_test_mask test_mask;
136 /*
 * Allow-list support for the inject test: brace-enclosed initializers for
 * struct ras_test_mask.
 *
 * RAS_BLOCK_MASK_ALL lists the inject, query and basic defaults and is used
 * for the entries of ras_DID_array. RAS_BLOCK_MASK_QUERY_BASIC lists the
 * query and basic defaults and is used for devices not in that array.
 * NOTE(review): the first element of RAS_BLOCK_MASK_QUERY_BASIC is not
 * visible here -- presumably an empty inject mask; confirm.
 */
137 #define RAS_BLOCK_MASK_ALL {\
138 DEFAULT_RAS_BLOCK_MASK_INJECT,\
139 DEFAULT_RAS_BLOCK_MASK_QUERY,\
140 DEFAULT_RAS_BLOCK_MASK_BASIC\
143 #define RAS_BLOCK_MASK_QUERY_BASIC {\
145 DEFAULT_RAS_BLOCK_MASK_QUERY,\
146 DEFAULT_RAS_BLOCK_MASK_BASIC\
/*
 * Devices, keyed by PCI device id and revision id, that get the full
 * RAS_BLOCK_MASK_ALL test masks (i.e. including the inject mask).
 */
149 static const struct ras_DID_test_mask ras_DID_array[] = {
150 {0x66a1, 0x00, RAS_BLOCK_MASK_ALL},
151 {0x66a1, 0x01, RAS_BLOCK_MASK_ALL},
152 {0x66a1, 0x04, RAS_BLOCK_MASK_ALL},
/*
 * Select the test masks for a device.
 *
 * Scans ras_DID_array for an entry whose device_id and revision_id both
 * match the PCI device info of `device` and returns that entry's masks.
 * If no entry matches, the RAS_BLOCK_MASK_QUERY_BASIC defaults are returned.
 */
155 static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device)
/* Fallback masks for devices that are not in the allow-list. */
158 static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC;
160 for (i = 0; i < sizeof(ras_DID_array) / sizeof(ras_DID_array[0]); i++) {
161 if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id &&
162 ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id)
163 return ras_DID_array[i].test_mask;
165 return default_test_mask;
/*
 * Query AMDGPU_INFO_RAS_ENABLED_FEATURES for the given device and return
 * the supported_features field of the result.
 *
 * NOTE(review): the declaration of `features` and the handling of a failed
 * query are not visible in this listing; callers treat a return value of 0
 * as "no RAS capability".
 */
168 static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle)
171 uint64_t feature_mask;
173 uint32_t enabled_features;
174 uint32_t supported_features;
179 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
180 sizeof(features), &features);
184 return features.supported_features;
/* Forward declaration; the definition appears later in this file. */
187 static int get_file_contents(char *file, char *buf, int size);
/*
 * Find the DRM debugfs index that belongs to `device`.
 *
 * For each index below MAX_CARDS_SUPPORTED, reads
 * /sys/kernel/debug/dri/<index>/name, parses the PCI address out of a line
 * of the form "amdgpu dev=<domain>:<bus>:<dev>.<func>" and compares it with
 * device->businfo.pci.
 *
 * NOTE(review): the return statements are not visible here; the caller
 * stores the result as the card id used in the sysfs/debugfs paths, so
 * presumably the matching index is returned and a negative value on no
 * match -- confirm.
 */
189 static int amdgpu_ras_lookup_id(drmDevicePtr device)
197 for (i = 0; i < MAX_CARDS_SUPPORTED; i++) {
/* Start from zeroed buffers so stale data from a previous index cannot match. */
198 memset(str, 0, sizeof(str));
199 memset(&info, 0, sizeof(info));
200 sprintf(path, "/sys/kernel/debug/dri/%d/name", i);
201 if (get_file_contents(path, str, sizeof(str)) <= 0)
204 ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx",
205 &info.domain, &info.bus, &info.dev, &info.func);
/* Byte-wise comparison of the parsed address with the device's bus info. */
209 if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0)
/*
 * Decide whether the RAS suite should be enabled.
 *
 * Walks the opened amdgpu file descriptors in drm_amdgpu[], initializes each
 * device, fetches its DRM device info and checks for a PCI device with a
 * non-zero RAS capability mask. The device handle is deinitialized again
 * before moving on.
 *
 * NOTE(review): the return statements are not visible here (presumably
 * CU_TRUE as soon as one capable device is found, CU_FALSE otherwise).
 * Also, no release of the drmGetDevice2() result is visible -- confirm.
 */
215 CU_BOOL suite_ras_tests_enable(void)
217 amdgpu_device_handle device_handle;
218 uint32_t major_version;
219 uint32_t minor_version;
223 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
224 if (amdgpu_device_initialize(drm_amdgpu[i], &major_version,
225 &minor_version, &device_handle))
228 if (drmGetDevice2(drm_amdgpu[i],
229 DRM_DEVICE_GET_PCI_REVISION,
/* Only PCI devices that report some RAS capability qualify. */
233 if (device->bustype == DRM_BUS_PCI &&
234 amdgpu_ras_lookup_capability(device_handle)) {
235 amdgpu_device_deinitialize(device_handle);
239 if (amdgpu_device_deinitialize(device_handle))
/*
 * Suite initializer: builds the devices[] table.
 *
 * For every opened amdgpu file descriptor the device is initialized and its
 * DRM device info fetched. Devices that are not on the PCI bus, report no
 * RAS capability, or (per the visible cleanup call after the id lookup)
 * cannot be used are deinitialized again. Every remaining device is
 * appended to devices[] together with its id, capability mask and test
 * masks; these handles stay open until suite_ras_tests_clean().
 *
 * Returns CUE_SINIT_FAILED when no device was recorded. The success return
 * is not visible in this listing.
 */
246 int suite_ras_tests_init(void)
249 amdgpu_device_handle device_handle;
250 uint32_t major_version;
251 uint32_t minor_version;
253 struct ras_test_mask test_mask;
258 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
259 r = amdgpu_device_initialize(drm_amdgpu[i], &major_version,
260 &minor_version, &device_handle);
264 if (drmGetDevice2(drm_amdgpu[i],
265 DRM_DEVICE_GET_PCI_REVISION,
267 amdgpu_device_deinitialize(device_handle);
/* RAS tests only apply to PCI devices. */
271 if (device->bustype != DRM_BUS_PCI) {
272 amdgpu_device_deinitialize(device_handle);
/* A zero capability mask means the device has no RAS support. */
276 capability = amdgpu_ras_lookup_capability(device_handle);
277 if (capability == 0) {
278 amdgpu_device_deinitialize(device_handle);
/* The id selects the per-card sysfs/debugfs directories (see set_test_card()). */
283 id = amdgpu_ras_lookup_id(device);
285 amdgpu_device_deinitialize(device_handle);
289 test_mask = amdgpu_ras_get_test_mask(device);
/* Positional initializer: device_handle, id, capability, test_mask. */
291 devices[devices_count++] = (struct amdgpu_ras_data) {
292 device_handle, id, capability, test_mask,
296 if (devices_count == 0)
297 return CUE_SINIT_FAILED;
/*
 * Suite cleanup: deinitializes the handle of every device recorded in
 * devices[]. The result starts as CUE_SUCCESS and is set to
 * CUE_SCLEAN_FAILED when a deinitialization reports an error (the check on
 * `r` itself is not visible in this listing).
 */
302 int suite_ras_tests_clean(void)
306 int ret = CUE_SUCCESS;
308 for (i = 0; i < devices_count; i++) {
309 r = amdgpu_device_deinitialize(devices[i].device_handle);
311 ret = CUE_SCLEAN_FAILED;
/* Forward declarations of the test cases registered in ras_tests[]. */
316 static void amdgpu_ras_disable_test(void);
317 static void amdgpu_ras_enable_test(void);
318 static void amdgpu_ras_inject_test(void);
319 static void amdgpu_ras_query_test(void);
320 static void amdgpu_ras_basic_test(void);
/*
 * CUnit test table for the RAS suite: display name and test function for
 * each case. Note that the disable test is listed before the enable test.
 */
322 CU_TestInfo ras_tests[] = {
323 { "ras basic test", amdgpu_ras_basic_test },
324 { "ras query test", amdgpu_ras_query_test },
325 { "ras inject test", amdgpu_ras_inject_test },
326 { "ras disable test", amdgpu_ras_disable_test },
328 { "ras enable test", amdgpu_ras_enable_test },
/*
 * State of the currently selected card. sysfs_path, debugfs_path, ras_mask
 * and device_handle are filled in by set_test_card() from devices[card].
 */
335 static int test_card;
/* Root directories of the card's RAS sysfs and debugfs files. */
336 static char sysfs_path[1024];
337 static char debugfs_path[1024];
/* RAS capability mask of the selected card, one bit per block. */
338 static uint32_t ras_mask;
339 static amdgpu_device_handle device_handle;
/*
 * Make devices[card] the card used by the helpers below: builds the RAS
 * sysfs and debugfs root paths from the card's id and copies its capability
 * mask, device handle and inject/query/basic test masks into the file-level
 * state. `card` is used as an index into devices[] without a range check in
 * the visible code; the return value is not visible in this listing.
 */
341 static int set_test_card(int card)
346 sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id);
347 sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id);
348 ras_mask = devices[card].capability;
349 device_handle = devices[card].device_handle;
350 ras_block_mask_inject = devices[card].test_mask.inject_mask;
351 ras_block_mask_query = devices[card].test_mask.query_mask;
352 ras_block_mask_basic = devices[card].test_mask.basic_mask;
/*
 * Root directory prefix for the RAS sysfs files; callers prepend it to a
 * file name. NOTE(review): the body is not visible here -- presumably it
 * returns sysfs_path; confirm.
 */
357 static const char *get_ras_sysfs_root(void)
/*
 * Root directory prefix for the RAS debugfs files; callers prepend it to a
 * file name. NOTE(review): the body is not visible here -- presumably it
 * returns debugfs_path; confirm.
 */
362 static const char *get_ras_debugfs_root(void)
/*
 * Open `file` write-only and write `size` bytes from `buf` with a single
 * write() call. NOTE(review): error handling, closing of the descriptor and
 * the return value are not visible here; amdgpu_ras_invoke() uses the
 * result as a byte count, so presumably the write() result is returned.
 */
367 static int set_file_contents(char *file, char *buf, int size)
370 fd = open(file, O_WRONLY);
373 n = write(fd, buf, size);
/*
 * Open `file` read-only and read up to `size` bytes into `buf` with a single
 * read() call; the buffer is not NUL-terminated by the visible code.
 * NOTE(review): error handling, closing of the descriptor and the return
 * value are not visible here; callers treat a result <= 0 as failure.
 */
378 static int get_file_contents(char *file, char *buf, int size)
381 fd = open(file, O_RDONLY);
384 n = read(fd, buf, size);
/*
 * Check that `file` can be opened with the given open(2) flags.
 * NOTE(review): the rest of the body is not visible here; callers expect 0
 * when the file is usable.
 */
389 static int is_file_ok(char *file, int flags)
393 fd = open(file, flags);
400 static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)
402 uint32_t feature_mask;
405 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
406 sizeof(feature_mask), &feature_mask);
410 return (1 << block) & feature_mask;
/*
 * Test whether the selected card's capability mask (ras_mask) has the bit
 * for `block` set. Returns the masked bit itself (non-zero when supported,
 * 0 otherwise), not a normalized 0/1.
 */
413 static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)
415 return (1 << block) & ras_mask;
/*
 * Submit a RAS request by writing the raw bytes of *data to the "ras_ctrl"
 * file under the RAS debugfs root.
 *
 * NOTE(review): the end of the assignment below and the return statement
 * are not visible in this listing; callers assert that the result is 0 on
 * success.
 */
418 static int amdgpu_ras_invoke(struct ras_debug_if *data)
423 sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
425 ret = set_file_contents(path, (char *)data, sizeof(*data))
/*
 * Read the error counters of one RAS block from sysfs.
 *
 * The file is "<sysfs root><block name>_err_count" and is expected to
 * contain "ue: <count>\nce: <count>"; the two values are stored through
 * `ue` (uncorrectable) and `ce` (correctable).
 *
 * NOTE(review): the return statements are not visible here; callers assert
 * that the result is 0 on success. Each visible check (block unsupported,
 * file not openable, read failure, parse failure) guards a failure path.
 */
430 static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block,
431 unsigned long *ue, unsigned long *ce)
439 if (amdgpu_ras_is_feature_supported(block) <= 0)
442 sprintf(name, "%s%s%s", get_ras_sysfs_root(), ras_block_str(block), "_err_count");
444 if (is_file_ok(name, O_RDONLY))
447 if (get_file_contents(name, buf, sizeof(buf)) <= 0)
/* Both counters must be parsed for the read to count as successful. */
450 if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2)
/*
 * Enable (enable != 0) or disable (enable == 0) RAS on every block the
 * selected card supports, then verify the resulting state.
 *
 * For each supported block a request is submitted through
 * amdgpu_ras_invoke() and asserted to succeed; afterwards the enabled state
 * reported by amdgpu_ras_is_feature_enabled() is compared with `enable`.
 * How `data` is filled in is not visible in this listing.
 */
457 static void amdgpu_ras_features_test(int enable)
459 struct ras_debug_if data;
464 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
465 struct ras_common_if head = {
467 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
468 .sub_block_index = 0,
/* Blocks the card does not support are skipped. */
472 if (amdgpu_ras_is_feature_supported(i) <= 0)
477 ret = amdgpu_ras_invoke(&data);
478 CU_ASSERT_EQUAL(ret, 0);
/*
 * NOTE(review): this XOR only yields 0 when amdgpu_ras_is_feature_enabled()
 * returns exactly 0 or 1. Verify that it does not return the raw masked bit
 * (1 << block), which would make the check fail for every block but 0.
 */
483 ret = enable ^ amdgpu_ras_is_feature_enabled(i);
484 CU_ASSERT_EQUAL(ret, 0);
/*
 * Test case: run the feature test with enable = 0 once per recorded device.
 * NOTE(review): the per-iteration card selection (presumably
 * set_test_card(i)) is not visible in this listing.
 */
488 static void amdgpu_ras_disable_test(void)
491 for (i = 0; i < devices_count; i++) {
493 amdgpu_ras_features_test(0);
/*
 * Test case: run the feature test with enable = 1 once per recorded device.
 * NOTE(review): the per-iteration card selection (presumably
 * set_test_card(i)) is not visible in this listing.
 */
497 static void amdgpu_ras_enable_test(void)
500 for (i = 0; i < devices_count; i++) {
502 amdgpu_ras_features_test(1);
/*
 * Inject an uncorrectable error into each eligible block of the selected
 * card and check the error counters.
 *
 * A block is eligible when RAS is enabled for it and its bit is set in
 * ras_block_mask_inject. For each such block the counters are read, the
 * injection request is submitted through amdgpu_ras_invoke(), the counters
 * are polled in a timeout loop, and finally the uncorrectable count must
 * have grown by exactly one while the correctable count is unchanged.
 * The loop exit condition and timeout handling are not visible here.
 */
506 static void __amdgpu_ras_inject_test(void)
508 struct ras_debug_if data;
511 unsigned long ue, ce, ue_old, ce_old;
514 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
516 struct ras_inject_if inject = {
519 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
520 .sub_block_index = 0,
/* Skip blocks that are not enabled or not selected for injection. */
527 if (amdgpu_ras_is_feature_enabled(i) <= 0)
530 if (!((1 << i) & ras_block_mask_inject))
533 data.inject = inject;
/* Baseline counters before the injection. */
535 ret = amdgpu_ras_query_err_count(i, &ue_old, &ce_old);
536 CU_ASSERT_EQUAL(ret, 0);
541 ret = amdgpu_ras_invoke(&data);
542 CU_ASSERT_EQUAL(ret, 0);
/* Poll the counters until the loop's exit condition or the timeout. */
548 while (timeout > 0) {
549 ret = amdgpu_ras_query_err_count(i, &ue, &ce);
550 CU_ASSERT_EQUAL(ret, 0);
555 /* recovery takes ~10s */
/* Exactly one new uncorrectable error; correctable count untouched. */
564 CU_ASSERT_EQUAL(ue_old + 1, ue);
565 CU_ASSERT_EQUAL(ce_old, ce);
/*
 * Test case: run the inject test once per recorded device.
 * NOTE(review): the per-iteration card selection (presumably
 * set_test_card(i)) is not visible in this listing.
 */
569 static void amdgpu_ras_inject_test(void)
572 for (i = 0; i < devices_count; i++) {
574 __amdgpu_ras_inject_test();
/*
 * For every block that the selected card supports and that is selected in
 * ras_block_mask_query, read the error counters and assert that the read
 * succeeds. The counter values themselves are not checked.
 */
578 static void __amdgpu_ras_query_test(void)
580 unsigned long ue, ce;
584 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
585 if (amdgpu_ras_is_feature_supported(i) <= 0)
588 if (!((1 << i) & ras_block_mask_query))
591 ret = amdgpu_ras_query_err_count(i, &ue, &ce);
592 CU_ASSERT_EQUAL(ret, 0);
/*
 * Test case: run the query test once per recorded device.
 * NOTE(review): the per-iteration card selection (presumably
 * set_test_card(i)) is not visible in this listing.
 */
596 static void amdgpu_ras_query_test(void)
599 for (i = 0; i < devices_count; i++) {
601 __amdgpu_ras_query_test();
/*
 * Test case: check that the basic RAS interfaces exist.
 *
 * First the amdgpu module parameter file "ras_mask" must be readable. Then,
 * for each recorded device, the RAS feature query must succeed, the debugfs
 * "ras_ctrl" file must be writable and the sysfs "features" file readable.
 * Finally, for each block selected in ras_block_mask_basic, the sysfs
 * "<block>_err_count" file must be readable and the debugfs
 * "<block>_err_inject" file writable.
 *
 * NOTE(review): the per-device card selection and the handling of the
 * amdgpu_ras_is_feature_supported() result are not visible in this listing.
 */
605 static void amdgpu_ras_basic_test(void)
607 unsigned long ue, ce;
615 ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY);
616 CU_ASSERT_EQUAL(ret, 0);
618 for (i = 0; i < devices_count; i++) {
621 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
622 sizeof(features), &features);
623 CU_ASSERT_EQUAL(ret, 0);
/* Control file used to submit RAS requests. */
625 sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
626 ret = is_file_ok(path, O_WRONLY);
627 CU_ASSERT_EQUAL(ret, 0);
629 sprintf(path, "%s%s", get_ras_sysfs_root(), "features");
630 ret = is_file_ok(path, O_RDONLY);
631 CU_ASSERT_EQUAL(ret, 0);
/* Per-block files, limited to the blocks selected for the basic test. */
633 for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) {
634 ret = amdgpu_ras_is_feature_supported(j);
638 if (!((1 << j) & ras_block_mask_basic))
641 sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count");
642 ret = is_file_ok(path, O_RDONLY);
643 CU_ASSERT_EQUAL(ret, 0);
645 sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject");
646 ret = is_file_ok(path, O_WRONLY);
647 CU_ASSERT_EQUAL(ret, 0);