OSDN Git Service

amdgpu: Fix a structure initialization issue
[android-x86/external-libdrm.git] / tests / amdgpu / ras_tests.c
1 /*
2  * Copyright 2017 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22 */
23
#include "CUnit/Basic.h"

#include "amdgpu_test.h"
#include "amdgpu_drm.h"
#include "amdgpu_internal.h"
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include "xf86drm.h"
33
/*
 * Names of the RAS blocks, in the same order as enum amdgpu_ras_block so
 * that a block value can be used directly as an index.  The names are used
 * to build the per-block file names "<name>_err_count" and
 * "<name>_err_inject".
 */
const char *ras_block_string[] = {
        "umc",
        "sdma",
        "gfx",
        "mmhub",
        "athub",
        "pcie_bif",
        "hdp",
        "xgmi_wafl",
        "df",
        "smn",
        "sem",
        "mp0",
        "mp1",
        "fuse",
};

/* Name of RAS block i; the index is not range-checked. */
#define ras_block_str(i) (ras_block_string[i])
52
/*
 * RAS block identifiers.  Throughout this file a block's value is used as
 * the bit position of that block in the feature and test masks
 * ((1 << block) & mask) and as the index into ras_block_string[].
 */
enum amdgpu_ras_block {
        AMDGPU_RAS_BLOCK__UMC = 0,
        AMDGPU_RAS_BLOCK__SDMA,
        AMDGPU_RAS_BLOCK__GFX,
        AMDGPU_RAS_BLOCK__MMHUB,
        AMDGPU_RAS_BLOCK__ATHUB,
        AMDGPU_RAS_BLOCK__PCIE_BIF,
        AMDGPU_RAS_BLOCK__HDP,
        AMDGPU_RAS_BLOCK__XGMI_WAFL,
        AMDGPU_RAS_BLOCK__DF,
        AMDGPU_RAS_BLOCK__SMN,
        AMDGPU_RAS_BLOCK__SEM,
        AMDGPU_RAS_BLOCK__MP0,
        AMDGPU_RAS_BLOCK__MP1,
        AMDGPU_RAS_BLOCK__FUSE,

        /* Number of blocks; not a block itself. */
        AMDGPU_RAS_BLOCK__LAST
};

/* Number of RAS blocks and a mask with one bit set per block. */
#define AMDGPU_RAS_BLOCK_COUNT  AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK   ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
74
/*
 * RAS error types placed in ras_common_if.type.  The tests in this file only
 * ever use AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE.
 * NOTE(review): values presumably mirror the kernel's definitions - confirm.
 */
enum amdgpu_ras_error_type {
        AMDGPU_RAS_ERROR__NONE                          = 0,
        AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE            = 2,
        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE           = 4,
        AMDGPU_RAS_ERROR__POISON                        = 8,
};
81
/*
 * Header shared by every RAS control request: which block and error type
 * the request refers to.  The tests always set sub_block_index to 0 and
 * name to the empty string.
 */
struct ras_common_if {
        enum amdgpu_ras_block block;
        enum amdgpu_ras_error_type type;
        uint32_t sub_block_index;
        char name[32];
};
88
/*
 * Error injection request: the common header followed by an address and a
 * value.  The inject test sets both address and value to 0.
 */
struct ras_inject_if {
        struct ras_common_if head;
        uint64_t address;
        uint64_t value;
};
94
/*
 * Request written as raw bytes to the debugfs "ras_ctrl" file by
 * amdgpu_ras_invoke().  The tests use op 0 and 1 for the disable/enable
 * feature tests (with the head member) and op 2 for error injection (with
 * the inject member).
 */
struct ras_debug_if {
        union {
                struct ras_common_if head;
                struct ras_inject_if inject;
        };
        int op;
};
102 /* for now, only umc, gfx, sdma has implemented. */
103 #define DEFAULT_RAS_BLOCK_MASK_INJECT (1 << AMDGPU_RAS_BLOCK__UMC)
104 #define DEFAULT_RAS_BLOCK_MASK_QUERY (1 << AMDGPU_RAS_BLOCK__UMC)
105 #define DEFAULT_RAS_BLOCK_MASK_BASIC (1 << AMDGPU_RAS_BLOCK__UMC |\
106                 (1 << AMDGPU_RAS_BLOCK__SDMA) |\
107                 (1 << AMDGPU_RAS_BLOCK__GFX))
108
109 static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT;
110 static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_INJECT;
111 static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC;
112
/*
 * Per-device set of block masks, one per test: a block is exercised by the
 * inject/query/basic test only if its bit is set in the matching mask.
 */
struct ras_test_mask {
        uint32_t inject_mask;
        uint32_t query_mask;
        uint32_t basic_mask;
};
118
/*
 * State kept for each device under test.
 *   device_handle - handle obtained from amdgpu_device_initialize()
 *   id            - index N of the matching /sys/kernel/debug/dri/N entry
 *   capability    - supported-features mask from amdgpu_ras_lookup_capability()
 *   test_mask     - block masks chosen by amdgpu_ras_get_test_mask()
 */
struct amdgpu_ras_data {
        amdgpu_device_handle device_handle;
        uint32_t  id;
        uint32_t  capability;
        struct ras_test_mask test_mask;
};

/* All devices that support RAS, filled in by suite_ras_tests_init(). */
static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED];
static int devices_count;
129
/* Associates a PCI device id / revision id pair with the test masks to use for it. */
struct ras_DID_test_mask{
        uint16_t device_id;
        uint16_t revision_id;
        struct ras_test_mask test_mask;
};
135
/*
 * Initializers for struct ras_test_mask, in field order
 * { inject_mask, query_mask, basic_mask }.
 */

/* All three tests enabled with their default block masks. */
#define RAS_BLOCK_MASK_ALL {\
        DEFAULT_RAS_BLOCK_MASK_INJECT,\
        DEFAULT_RAS_BLOCK_MASK_QUERY,\
        DEFAULT_RAS_BLOCK_MASK_BASIC\
}

/* Query and basic tests only; the inject mask is empty. */
#define RAS_BLOCK_MASK_QUERY_BASIC {\
        0,\
        DEFAULT_RAS_BLOCK_MASK_QUERY,\
        DEFAULT_RAS_BLOCK_MASK_BASIC\
}

/*
 * Whitelist for the inject test: device id / revision id pairs that get
 * RAS_BLOCK_MASK_ALL.  Devices not listed here fall back to
 * RAS_BLOCK_MASK_QUERY_BASIC in amdgpu_ras_get_test_mask().
 */
static const struct ras_DID_test_mask ras_DID_array[] = {
        {0x66a1, 0x00, RAS_BLOCK_MASK_ALL},
        {0x66a1, 0x01, RAS_BLOCK_MASK_ALL},
        {0x66a1, 0x04, RAS_BLOCK_MASK_ALL},
};
154
155 static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device)
156 {
157         int i;
158         static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC;
159
160         for (i = 0; i < sizeof(ras_DID_array) / sizeof(ras_DID_array[0]); i++) {
161                 if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id &&
162                                 ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id)
163                         return ras_DID_array[i].test_mask;
164         }
165         return default_test_mask;
166 }
167
168 static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle)
169 {
170         union {
171                 uint64_t feature_mask;
172                 struct {
173                         uint32_t enabled_features;
174                         uint32_t supported_features;
175                 };
176         } features = { 0 };
177         int ret;
178
179         ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
180                         sizeof(features), &features);
181         if (ret)
182                 return 0;
183
184         return features.supported_features;
185 }
186
187 static int get_file_contents(char *file, char *buf, int size);
188
189 static int amdgpu_ras_lookup_id(drmDevicePtr device)
190 {
191         char path[1024];
192         char str[128];
193         drmPciBusInfo info;
194         int i;
195         int ret;
196
197         for (i = 0; i < MAX_CARDS_SUPPORTED; i++) {
198                 memset(str, 0, sizeof(str));
199                 memset(&info, 0, sizeof(info));
200                 sprintf(path, "/sys/kernel/debug/dri/%d/name", i);
201                 if (get_file_contents(path, str, sizeof(str)) <= 0)
202                         continue;
203
204                 ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx",
205                                 &info.domain, &info.bus, &info.dev, &info.func);
206                 if (ret != 4)
207                         continue;
208
209                 if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0)
210                                 return i;
211         }
212         return -1;
213 }
214
215 CU_BOOL suite_ras_tests_enable(void)
216 {
217         amdgpu_device_handle device_handle;
218         uint32_t  major_version;
219         uint32_t  minor_version;
220         int i;
221         drmDevicePtr device;
222
223         for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
224                 if (amdgpu_device_initialize(drm_amdgpu[i], &major_version,
225                                         &minor_version, &device_handle))
226                         continue;
227
228                 if (drmGetDevice2(drm_amdgpu[i],
229                                         DRM_DEVICE_GET_PCI_REVISION,
230                                         &device))
231                         continue;
232
233                 if (device->bustype == DRM_BUS_PCI &&
234                                 amdgpu_ras_lookup_capability(device_handle)) {
235                         amdgpu_device_deinitialize(device_handle);
236                         return CU_TRUE;
237                 }
238
239                 if (amdgpu_device_deinitialize(device_handle))
240                         continue;
241         }
242
243         return CU_FALSE;
244 }
245
246 int suite_ras_tests_init(void)
247 {
248         drmDevicePtr device;
249         amdgpu_device_handle device_handle;
250         uint32_t  major_version;
251         uint32_t  minor_version;
252         uint32_t  capability;
253         struct ras_test_mask test_mask;
254         int id;
255         int i;
256         int r;
257
258         for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
259                 r = amdgpu_device_initialize(drm_amdgpu[i], &major_version,
260                                 &minor_version, &device_handle);
261                 if (r)
262                         continue;
263
264                 if (drmGetDevice2(drm_amdgpu[i],
265                                         DRM_DEVICE_GET_PCI_REVISION,
266                                         &device)) {
267                         amdgpu_device_deinitialize(device_handle);
268                         continue;
269                 }
270
271                 if (device->bustype != DRM_BUS_PCI) {
272                         amdgpu_device_deinitialize(device_handle);
273                         continue;
274                 }
275
276                 capability = amdgpu_ras_lookup_capability(device_handle);
277                 if (capability == 0) {
278                         amdgpu_device_deinitialize(device_handle);
279                         continue;
280
281                 }
282
283                 id = amdgpu_ras_lookup_id(device);
284                 if (id == -1) {
285                         amdgpu_device_deinitialize(device_handle);
286                         continue;
287                 }
288
289                 test_mask = amdgpu_ras_get_test_mask(device);
290
291                 devices[devices_count++] = (struct amdgpu_ras_data) {
292                         device_handle, id, capability, test_mask,
293                 };
294         }
295
296         if (devices_count == 0)
297                 return CUE_SINIT_FAILED;
298
299         return CUE_SUCCESS;
300 }
301
302 int suite_ras_tests_clean(void)
303 {
304         int r;
305         int i;
306         int ret = CUE_SUCCESS;
307
308         for (i = 0; i < devices_count; i++) {
309                 r = amdgpu_device_deinitialize(devices[i].device_handle);
310                 if (r)
311                         ret = CUE_SCLEAN_FAILED;
312         }
313         return ret;
314 }
315
/* Test entry points, defined further down in this file. */
static void amdgpu_ras_disable_test(void);
static void amdgpu_ras_enable_test(void);
static void amdgpu_ras_inject_test(void);
static void amdgpu_ras_query_test(void);
static void amdgpu_ras_basic_test(void);

/*
 * Test table of the RAS suite, terminated by CU_TEST_INFO_NULL.  The enable
 * test is compiled out with #if 0.
 */
CU_TestInfo ras_tests[] = {
        { "ras basic test",     amdgpu_ras_basic_test },
        { "ras query test",     amdgpu_ras_query_test },
        { "ras inject test",    amdgpu_ras_inject_test },
        { "ras disable test",   amdgpu_ras_disable_test },
#if 0
        { "ras enable test",    amdgpu_ras_enable_test },
#endif
        CU_TEST_INFO_NULL,
};
332
//helpers

/*
 * State of the currently selected card; all of it is written by
 * set_test_card() and read by the helpers below.
 */
static int test_card;                      /* index into devices[] */
static char sysfs_path[1024];              /* sysfs "ras/" directory of the card */
static char debugfs_path[1024];            /* debugfs "ras/" directory of the card */
static uint32_t ras_mask;                  /* capability mask of the card */
static amdgpu_device_handle device_handle; /* device handle of the card */
340
341 static int set_test_card(int card)
342 {
343         int i;
344
345         test_card = card;
346         sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id);
347         sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id);
348         ras_mask = devices[card].capability;
349         device_handle = devices[card].device_handle;
350         ras_block_mask_inject = devices[card].test_mask.inject_mask;
351         ras_block_mask_query = devices[card].test_mask.query_mask;
352         ras_block_mask_basic = devices[card].test_mask.basic_mask;
353
354         return 0;
355 }
356
357 static const char *get_ras_sysfs_root(void)
358 {
359         return sysfs_path;
360 }
361
362 static const char *get_ras_debugfs_root(void)
363 {
364         return debugfs_path;
365 }
366
/*
 * Write size bytes from buf to an existing file with a single write() call.
 *
 * The file is opened write-only and is not created if it is missing.
 * Returns the value returned by write(), or -1 when the file cannot be
 * opened.
 */
static int set_file_contents(char *file, char *buf, int size)
{
        int written;
        int fd = open(file, O_WRONLY);

        if (fd == -1)
                return -1;

        written = write(fd, buf, size);
        close(fd);

        return written;
}
377
/*
 * Read up to size bytes of a file into buf with a single read() call.
 *
 * The data is not NUL-terminated.  Returns the value returned by read(), or
 * -1 when the file cannot be opened.
 */
static int get_file_contents(char *file, char *buf, int size)
{
        int got;
        int fd = open(file, O_RDONLY);

        if (fd == -1)
                return -1;

        got = read(fd, buf, size);
        close(fd);

        return got;
}
388
/*
 * Check that a file can be opened with the given open() flags.
 *
 * Returns 0 when open() succeeds (the descriptor is closed again), -1
 * otherwise.
 */
static int is_file_ok(char *file, int flags)
{
        int fd = open(file, flags);

        if (fd != -1) {
                close(fd);
                return 0;
        }

        return -1;
}
399
400 static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)
401 {
402         uint32_t feature_mask;
403         int ret;
404
405         ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
406                         sizeof(feature_mask), &feature_mask);
407         if (ret)
408                 return -1;
409
410         return (1 << block) & feature_mask;
411 }
412
413 static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)
414 {
415         return (1 << block) & ras_mask;
416 }
417
418 static int amdgpu_ras_invoke(struct ras_debug_if *data)
419 {
420         char path[1024];
421         int ret;
422
423         sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
424
425         ret = set_file_contents(path, (char *)data, sizeof(*data))
426                 - sizeof(*data);
427         return ret;
428 }
429
430 static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block,
431                 unsigned long *ue, unsigned long *ce)
432 {
433         char buf[64];
434         char name[1024];
435         int ret;
436
437         *ue = *ce = 0;
438
439         if (amdgpu_ras_is_feature_supported(block) <= 0)
440                 return -1;
441
442         sprintf(name, "%s%s%s", get_ras_sysfs_root(), ras_block_str(block), "_err_count");
443
444         if (is_file_ok(name, O_RDONLY))
445                 return 0;
446
447         if (get_file_contents(name, buf, sizeof(buf)) <= 0)
448                 return -1;
449
450         if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2)
451                 return -1;
452
453         return 0;
454 }
455
456 //tests
457 static void amdgpu_ras_features_test(int enable)
458 {
459         struct ras_debug_if data;
460         int ret;
461         int i;
462
463         data.op = enable;
464         for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
465                 struct ras_common_if head = {
466                         .block = i,
467                         .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
468                         .sub_block_index = 0,
469                         .name = "",
470                 };
471
472                 if (amdgpu_ras_is_feature_supported(i) <= 0)
473                         continue;
474
475                 data.head = head;
476
477                 ret = amdgpu_ras_invoke(&data);
478                 CU_ASSERT_EQUAL(ret, 0);
479
480                 if (ret)
481                         continue;
482
483                 ret = enable ^ amdgpu_ras_is_feature_enabled(i);
484                 CU_ASSERT_EQUAL(ret, 0);
485         }
486 }
487
488 static void amdgpu_ras_disable_test(void)
489 {
490         int i;
491         for (i = 0; i < devices_count; i++) {
492                 set_test_card(i);
493                 amdgpu_ras_features_test(0);
494         }
495 }
496
497 static void amdgpu_ras_enable_test(void)
498 {
499         int i;
500         for (i = 0; i < devices_count; i++) {
501                 set_test_card(i);
502                 amdgpu_ras_features_test(1);
503         }
504 }
505
506 static void __amdgpu_ras_inject_test(void)
507 {
508         struct ras_debug_if data;
509         int ret;
510         int i;
511         unsigned long ue, ce, ue_old, ce_old;
512
513         data.op = 2;
514         for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
515                 int timeout = 3;
516                 struct ras_inject_if inject = {
517                         .head = {
518                                 .block = i,
519                                 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
520                                 .sub_block_index = 0,
521                                 .name = "",
522                         },
523                         .address = 0,
524                         .value = 0,
525                 };
526
527                 if (amdgpu_ras_is_feature_enabled(i) <= 0)
528                         continue;
529
530                 if (!((1 << i) & ras_block_mask_inject))
531                         continue;
532
533                 data.inject = inject;
534
535                 ret = amdgpu_ras_query_err_count(i, &ue_old, &ce_old);
536                 CU_ASSERT_EQUAL(ret, 0);
537
538                 if (ret)
539                         continue;
540
541                 ret = amdgpu_ras_invoke(&data);
542                 CU_ASSERT_EQUAL(ret, 0);
543
544                 if (ret)
545                         continue;
546
547 loop:
548                 while (timeout > 0) {
549                         ret = amdgpu_ras_query_err_count(i, &ue, &ce);
550                         CU_ASSERT_EQUAL(ret, 0);
551
552                         if (ret)
553                                 continue;
554                         if (ue_old != ue) {
555                                 /*recovery takes ~10s*/
556                                 sleep(10);
557                                 break;
558                         }
559
560                         sleep(1);
561                         timeout -= 1;
562                 }
563
564                 CU_ASSERT_EQUAL(ue_old + 1, ue);
565                 CU_ASSERT_EQUAL(ce_old, ce);
566         }
567 }
568
569 static void amdgpu_ras_inject_test(void)
570 {
571         int i;
572         for (i = 0; i < devices_count; i++) {
573                 set_test_card(i);
574                 __amdgpu_ras_inject_test();
575         }
576 }
577
578 static void __amdgpu_ras_query_test(void)
579 {
580         unsigned long ue, ce;
581         int ret;
582         int i;
583
584         for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
585                 if (amdgpu_ras_is_feature_supported(i) <= 0)
586                         continue;
587
588                 if (!((1 << i) & ras_block_mask_query))
589                         continue;
590
591                 ret = amdgpu_ras_query_err_count(i, &ue, &ce);
592                 CU_ASSERT_EQUAL(ret, 0);
593         }
594 }
595
596 static void amdgpu_ras_query_test(void)
597 {
598         int i;
599         for (i = 0; i < devices_count; i++) {
600                 set_test_card(i);
601                 __amdgpu_ras_query_test();
602         }
603 }
604
605 static void amdgpu_ras_basic_test(void)
606 {
607         unsigned long ue, ce;
608         char name[1024];
609         int ret;
610         int i;
611         int j;
612         uint32_t features;
613         char path[1024];
614
615         ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY);
616         CU_ASSERT_EQUAL(ret, 0);
617
618         for (i = 0; i < devices_count; i++) {
619                 set_test_card(i);
620
621                 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
622                                 sizeof(features), &features);
623                 CU_ASSERT_EQUAL(ret, 0);
624
625                 sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
626                 ret = is_file_ok(path, O_WRONLY);
627                 CU_ASSERT_EQUAL(ret, 0);
628
629                 sprintf(path, "%s%s", get_ras_sysfs_root(), "features");
630                 ret = is_file_ok(path, O_RDONLY);
631                 CU_ASSERT_EQUAL(ret, 0);
632
633                 for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) {
634                         ret = amdgpu_ras_is_feature_supported(j);
635                         if (ret <= 0)
636                                 continue;
637
638                         if (!((1 << j) & ras_block_mask_basic))
639                                 continue;
640
641                         sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count");
642                         ret = is_file_ok(path, O_RDONLY);
643                         CU_ASSERT_EQUAL(ret, 0);
644
645                         sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject");
646                         ret = is_file_ok(path, O_WRONLY);
647                         CU_ASSERT_EQUAL(ret, 0);
648                 }
649         }
650 }