Skip to content

Commit

Permalink
drm/amdgpu: revert "reserve backup pages for bad page retirment"
Browse files Browse the repository at this point in the history
As noted during the review this approach doesn't make sense at all.

We should not apply any limitation on the VRAM applications can use inside the kernel.

If an application or end user wants to reserve a certain amount of VRAM for bad pages handling we should do this in the upper layer.

This reverts commit f89b881.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Christian König authored and Alex Deucher committed Mar 24, 2021
1 parent 6b44b66 commit e5c04ed
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 118 deletions.
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ struct amdgpu_mgpu_info mgpu_info = {
};
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
int amdgpu_bad_page_threshold = 100;
int amdgpu_bad_page_threshold = -1;
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
.timeout_fatal_disable = false,
.period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
Expand Down Expand Up @@ -854,7 +854,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
* faulty pages by ECC exceed threshold value and leave it for user's further
* check.
*/
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value");
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);

MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
Expand Down
29 changes: 11 additions & 18 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -1790,14 +1790,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
return ret;
}

static uint32_t
amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
uint32_t max_length)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
uint32_t max_length = 0;

max_length = amdgpu_ras_eeprom_get_record_max_length();
/*
* Justification of value bad_page_cnt_threshold in ras structure
*
Expand All @@ -1823,18 +1822,20 @@ amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
tmp_threshold = max_length;

if (tmp_threshold == -1) {
val = adev->gmc.real_vram_size;
val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
tmp_threshold = min(lower_32_bits(val), max_length);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_length);
} else {
con->bad_page_cnt_threshold = tmp_threshold;
}

return tmp_threshold;
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret;

Expand All @@ -1854,16 +1855,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0);
con->adev = adev;

if (!con->bad_page_cnt_threshold) {
con->bad_page_cnt_threshold =
amdgpu_ras_calculate_badpags_threshold(adev);

ret = amdgpu_vram_mgr_reserve_backup_pages(
ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
con->bad_page_cnt_threshold);
if (ret)
goto out;
}
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);

/* Todo: During test the SMU might fail to read the eeprom through I2C
* when the GPU is pending on XGMI reset during probe time
Expand Down
4 changes: 0 additions & 4 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ struct amdgpu_vram_mgr {
spinlock_t lock;
struct list_head reservations_pending;
struct list_head reserved_pages;
struct list_head backup_pages;
uint32_t num_backup_pages;
atomic64_t usage;
atomic64_t vis_usage;
};
Expand Down Expand Up @@ -124,8 +122,6 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man);
uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
uint64_t start, uint64_t size);
int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
uint32_t num_pages);
int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
uint64_t start);

Expand Down
96 changes: 2 additions & 94 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@
#include "amdgpu_atomfirmware.h"
#include "atom.h"

static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
uint32_t num_pages);

static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager *man)
{
return container_of(man, struct amdgpu_vram_mgr, manager);
Expand Down Expand Up @@ -189,7 +186,6 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
spin_lock_init(&mgr->lock);
INIT_LIST_HEAD(&mgr->reservations_pending);
INIT_LIST_HEAD(&mgr->reserved_pages);
INIT_LIST_HEAD(&mgr->backup_pages);

/* Add the two VRAM-related sysfs files */
ret = sysfs_create_files(&adev->dev->kobj, amdgpu_vram_mgr_attributes);
Expand Down Expand Up @@ -230,11 +226,6 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
drm_mm_remove_node(&rsv->mm_node);
kfree(rsv);
}

list_for_each_entry_safe(rsv, temp, &mgr->backup_pages, node) {
drm_mm_remove_node(&rsv->mm_node);
kfree(rsv);
}
drm_mm_takedown(&mgr->mm);
spin_unlock(&mgr->lock);

Expand Down Expand Up @@ -306,14 +297,12 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
continue;

dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
rsv->mm_node.start << PAGE_SHIFT, rsv->mm_node.size);
rsv->mm_node.start, rsv->mm_node.size);

vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
atomic64_add(vis_usage, &mgr->vis_usage);
atomic64_add(rsv->mm_node.size << PAGE_SHIFT, &mgr->usage);
list_move(&rsv->node, &mgr->reserved_pages);

amdgpu_vram_mgr_free_backup_pages(mgr, rsv->mm_node.size);
}
}

Expand All @@ -330,7 +319,6 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
uint64_t start, uint64_t size)
{
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_vram_reservation *rsv;

rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
Expand All @@ -341,94 +329,14 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
rsv->mm_node.start = start >> PAGE_SHIFT;
rsv->mm_node.size = size >> PAGE_SHIFT;

dev_dbg(adev->dev, "Pending Reservation: 0x%llx\n", start);

spin_lock(&mgr->lock);
list_add_tail(&rsv->node, &mgr->reservations_pending);
list_add_tail(&mgr->reservations_pending, &rsv->node);
amdgpu_vram_mgr_do_reserve(man);
spin_unlock(&mgr->lock);

return 0;
}

static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
uint32_t num_pages)
{
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_vram_reservation *rsv;
uint32_t i;
uint64_t vis_usage = 0, total_usage = 0;

if (num_pages > mgr->num_backup_pages) {
dev_warn(adev->dev, "No enough backup pages\n");
return -EINVAL;
}

for (i = 0; i < num_pages; i++) {
rsv = list_first_entry(&mgr->backup_pages,
struct amdgpu_vram_reservation, node);
vis_usage += amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
total_usage += (rsv->mm_node.size << PAGE_SHIFT);
drm_mm_remove_node(&rsv->mm_node);
list_del(&rsv->node);
kfree(rsv);
mgr->num_backup_pages--;
}

atomic64_sub(total_usage, &mgr->usage);
atomic64_sub(vis_usage, &mgr->vis_usage);

return 0;
}

int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
uint32_t num_pages)
{
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_vram_reservation *rsv;
struct drm_mm *mm = &mgr->mm;
uint32_t i;
int ret = 0;
uint64_t vis_usage, total_usage;

for (i = 0; i < num_pages; i++) {
rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
if (!rsv) {
ret = -ENOMEM;
goto pro_end;
}

INIT_LIST_HEAD(&rsv->node);

ret = drm_mm_insert_node(mm, &rsv->mm_node, 1);
if (ret) {
dev_err(adev->dev, "failed to reserve backup page %d, ret 0x%x\n", i, ret);
kfree(rsv);
goto pro_end;
}

vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
total_usage = (rsv->mm_node.size << PAGE_SHIFT);

spin_lock(&mgr->lock);
atomic64_add(vis_usage, &mgr->vis_usage);
atomic64_add(total_usage, &mgr->usage);
list_add_tail(&rsv->node, &mgr->backup_pages);
mgr->num_backup_pages++;
spin_unlock(&mgr->lock);
}

pro_end:
if (ret) {
spin_lock(&mgr->lock);
amdgpu_vram_mgr_free_backup_pages(mgr, mgr->num_backup_pages);
spin_unlock(&mgr->lock);
}

return ret;
}

/**
* amdgpu_vram_mgr_query_page_status - query the reservation status
*
Expand Down

0 comments on commit e5c04ed

Please sign in to comment.