return 0;
}
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data = &con->eh_data;
int ret;
- *data = kmalloc(sizeof(**data),
- GFP_KERNEL|__GFP_ZERO);
- if (!*data)
- return -ENOMEM;
+ *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+ if (!*data) {
+ ret = -ENOMEM;
+ goto out;
+ }
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
if (ret)
- return ret;
+ goto free;
if (adev->psp.ras.ras->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
if (ret)
- return ret;
+ goto free;
ret = amdgpu_ras_reserve_bad_pages(adev);
if (ret)
- return ret;
+ goto release;
}
return 0;
+
+release:
+ amdgpu_ras_release_bad_pages(adev);
+free:
+ con->eh_data = NULL;
+ kfree((*data)->bps);
+ kfree((*data)->bps_bo);
+ kfree(*data);
+out:
+ DRM_WARN("Failed to initialize ras recovery!\n");
+
+ return ret;
}
static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data = con->eh_data;
+ /* recovery_init failed to init it, fini is useless */
+ if (!data)
+ return 0;
+
cancel_work_sync(&con->recovery_work);
amdgpu_ras_release_bad_pages(adev);
mutex_lock(&con->recovery_lock);
con->eh_data = NULL;
kfree(data->bps);
+ kfree(data->bps_bo);
kfree(data);
mutex_unlock(&con->recovery_lock);
return r;
}
- if (amdgpu_ras_recovery_init(adev))
- goto recovery_out;
-
amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
if (amdgpu_ras_fs_init(adev))
con->hw_supported, con->supported);
return 0;
fs_out:
- amdgpu_ras_recovery_fini(adev);
-recovery_out:
amdgpu_ras_set_context(adev, NULL);
kfree(con);
return ras && (ras->supported & (1 << block));
}
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
unsigned int block);
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ /* save bad page to eeprom before gpu reset,
+ * i2c may be unstable in gpu reset
+ */
+ amdgpu_ras_reserve_bad_pages(adev);
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
schedule_work(&ras->recovery_work);
return 0;
#include "amdgpu_trace.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
#include "bif/bif_4_1_d.h"
static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
#endif
/*
+ * retired pages will be loaded from eeprom and reserved here,
+ * it should be called after ttm init since new bo may be created,
+ * recovery_init may fail, but it can free all resources allocated by
+ * itself and its failure should not stop amdgpu init process.
+ *
+ * Note: theoretically, this should be called before all vram allocations
+ * to protect retired page from abusing
+ */
+ amdgpu_ras_recovery_init(adev);
+
+ /*
*The reserved vram for firmware must be pinned to the specified
*place on the VRAM, so reserve it early.
*/