Skip to content

Commit

Permalink
Merge tag 'drm-next-2023-03-03-1' of git://anongit.freedesktop.org/dr…
Browse files Browse the repository at this point in the history
…m/drm

Pull drm fixes from Dave Airlie:
 "fbdev:
   - fix uninit var in error path

  shmem:
   - revert unGPLing an export

  i915:
   - Don't use stolen memory or BAR mappings for ring buffers with LLC
   - Add inverted backlight quirk for HP 14-r206nv
   - Fix GSI offset for MCR lookups
   - GVT fixes (memleak, debugfs attributes, kconfig, typos)

  amdgpu:
   - SMU 13 fixes
   - Enable TMZ for GC 10.3.6
   - Misc display fixes
   - Buddy allocator fixes
   - GC 11 fixes
   - S0ix fix
   - INFO IOCTL queries for GC 11
   - VCN harvest fixes for SR-IOV
   - UMC 8.10 RAS fixes
   - Don't restrict bpc to 8
   - NBIO 7.5 fix
   - Allow freesync on PCon for more devices

  amdkfd:
   - SDMA fix
   - Illegal memory access fix"

* tag 'drm-next-2023-03-03-1' of git://anongit.freedesktop.org/drm/drm: (45 commits)
  drm/amdgpu/vcn: fix compilation issue with legacy gcc
  drm/amd/display: Extend Freesync over PCon support for more devices
  Revert "drm/amd/display: Do not set DRR on pipe commit"
  drm/amd/display: fix shift-out-of-bounds in CalculateVMAndRowBytes
  drm/amd/display: Ext displays with dock can't recognized after resume
  drm/amdgpu: fix ttm_bo calltrace warning in psp_hw_fini
  drm/amdgpu: remove unused variable ring
  drm/amd/display: fix dm irq error message in gpu recover
  drm/amd: Fix initialization for nbio 7.5.1
  drm/amd/display: Don't restrict bpc to 8 bpc
  drm/amdgpu: Make umc_v8_10_convert_error_address static and remove unused variable
  drm/radeon: Fix eDP for single-display iMac11,2
  drm/shmem-helper: Revert accidental non-GPL export
  drm: omapdrm: Do not use helper unininitialized in omap_fbdev_init()
  drm/amd/pm: downgrade log level upon SMU IF version mismatch
  drm/amdgpu: Add ecc info query interface for umc v8_10
  drm/amdgpu: Add convert_error_address function for umc v8_10
  drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err
  drm/amdgpu: change default behavior of bad_page_threshold parameter
  drm/amdgpu: exclude duplicate pages from UMC RAS UE count
  ...
  • Loading branch information
Linus Torvalds committed Mar 2, 2023
2 parents 39ce439 + 54ceb92 commit 2eb29d5
Show file tree
Hide file tree
Showing 50 changed files with 446 additions and 149 deletions.
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ config DRM_AMDGPU
select FW_LOADER
select DRM_DISPLAY_DP_HELPER
select DRM_DISPLAY_HDMI_HELPER
select DRM_DISPLAY_HDCP_HELPER
select DRM_DISPLAY_HELPER
select DRM_KMS_HELPER
select DRM_SCHED
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,9 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev)
(pm_suspend_target_state != PM_SUSPEND_TO_IDLE))
return false;

if (adev->asic_type < CHIP_RAVEN)
return false;

/*
* If ACPI_FADT_LOW_POWER_S0 is not set in the FADT, it is generally
* risky to do any special firmware-related preparations for entering
Expand Down
14 changes: 11 additions & 3 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,12 @@
* - 3.50.0 - Update AMDGPU_INFO_DEV_INFO IOCTL for minimum engine and memory clock
* Update AMDGPU_INFO_SENSOR IOCTL for PEAK_PSTATE engine and memory clock
* 3.51.0 - Return the PCIe gen and lanes from the INFO ioctl
* 3.52.0 - Add AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD, add device_info fields:
* tcp_cache_size, num_sqc_per_wgp, sqc_data_cache_size, sqc_inst_cache_size,
* gl1c_cache_size, gl2c_cache_size, mall_size, enabled_rb_pipes_mask_hi
*/
#define KMS_DRIVER_MAJOR 3
#define KMS_DRIVER_MINOR 51
#define KMS_DRIVER_MINOR 52
#define KMS_DRIVER_PATCHLEVEL 0

unsigned int amdgpu_vram_limit = UINT_MAX;
Expand Down Expand Up @@ -921,7 +924,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
* result in the GPU entering bad status when the number of total
* faulty pages by ECC exceeds the threshold value.
*/
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement, -2 = ignore bad page threshold)");
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = driver sets threshold)");
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);

MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
Expand Down Expand Up @@ -2414,8 +2417,10 @@ static int amdgpu_pmops_suspend(struct device *dev)

if (amdgpu_acpi_is_s0ix_active(adev))
adev->in_s0ix = true;
else
else if (amdgpu_acpi_is_s3_active(adev))
adev->in_s3 = true;
if (!adev->in_s0ix && !adev->in_s3)
return 0;
return amdgpu_device_suspend(drm_dev, true);
}

Expand All @@ -2436,6 +2441,9 @@ static int amdgpu_pmops_resume(struct device *dev)
struct amdgpu_device *adev = drm_to_adev(drm_dev);
int r;

if (!adev->in_s0ix && !adev->in_s3)
return 0;

/* Avoids registers access if device is physically gone */
if (!pci_device_is_present(adev->pdev))
adev->no_hw_access = true;
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ struct amdgpu_gfx_config {
uint32_t num_sc_per_sh;
uint32_t num_packer_per_sc;
uint32_t pa_sc_tile_steering_override;
/* Whether texture coordinate truncation is conformant. */
bool ta_cntl2_truncate_coord_mode;
uint64_t tcc_disabled_mask;
uint32_t gc_num_tcp_per_sa;
uint32_t gc_num_sdp_interface;
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ void amdgpu_gmc_tmz_set(struct amdgpu_device *adev)
case IP_VERSION(10, 3, 2):
case IP_VERSION(10, 3, 4):
case IP_VERSION(10, 3, 5):
case IP_VERSION(10, 3, 6):
/* VANGOGH */
case IP_VERSION(10, 3, 1):
/* YELLOW_CARP*/
Expand Down
11 changes: 11 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,8 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
dev_info->ids_flags |= AMDGPU_IDS_FLAGS_PREEMPTION;
if (amdgpu_is_tmz(adev))
dev_info->ids_flags |= AMDGPU_IDS_FLAGS_TMZ;
if (adev->gfx.config.ta_cntl2_truncate_coord_mode)
dev_info->ids_flags |= AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD;

vm_size = adev->vm_manager.max_pfn * AMDGPU_GPU_PAGE_SIZE;
vm_size -= AMDGPU_VA_RESERVED_SIZE;
Expand Down Expand Up @@ -865,6 +867,15 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 ? 4 :
adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 ? 2 : 1;

dev_info->tcp_cache_size = adev->gfx.config.gc_tcp_l1_size;
dev_info->num_sqc_per_wgp = adev->gfx.config.gc_num_sqc_per_wgp;
dev_info->sqc_data_cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
dev_info->sqc_inst_cache_size = adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
dev_info->gl1c_cache_size = adev->gfx.config.gc_gl1c_size_per_instance *
adev->gfx.config.gc_gl1c_per_sa;
dev_info->gl2c_cache_size = adev->gfx.config.gc_gl2c_per_gpu;
dev_info->mall_size = adev->gmc.mall_size;

ret = copy_to_user(out, dev_info,
min((size_t)size, sizeof(*dev_info))) ? -EFAULT : 0;
kfree(dev_info);
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)

if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
places[c].lpfn = visible_pfn;
else
else if (adev->gmc.real_vram_size != adev->gmc.visible_vram_size)
places[c].flags |= TTM_PL_FLAG_TOPDOWN;

if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
Expand Down
6 changes: 3 additions & 3 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1683,7 +1683,7 @@ static int psp_hdcp_initialize(struct psp_context *psp)
psp->hdcp_context.context.mem_context.shared_mem_size = PSP_HDCP_SHARED_MEM_SIZE;
psp->hdcp_context.context.ta_load_type = GFX_CMD_ID_LOAD_TA;

if (!psp->hdcp_context.context.initialized) {
if (!psp->hdcp_context.context.mem_context.shared_buf) {
ret = psp_ta_init_shared_buf(psp, &psp->hdcp_context.context.mem_context);
if (ret)
return ret;
Expand Down Expand Up @@ -1750,7 +1750,7 @@ static int psp_dtm_initialize(struct psp_context *psp)
psp->dtm_context.context.mem_context.shared_mem_size = PSP_DTM_SHARED_MEM_SIZE;
psp->dtm_context.context.ta_load_type = GFX_CMD_ID_LOAD_TA;

if (!psp->dtm_context.context.initialized) {
if (!psp->dtm_context.context.mem_context.shared_buf) {
ret = psp_ta_init_shared_buf(psp, &psp->dtm_context.context.mem_context);
if (ret)
return ret;
Expand Down Expand Up @@ -1818,7 +1818,7 @@ static int psp_rap_initialize(struct psp_context *psp)
psp->rap_context.context.mem_context.shared_mem_size = PSP_RAP_SHARED_MEM_SIZE;
psp->rap_context.context.ta_load_type = GFX_CMD_ID_LOAD_TA;

if (!psp->rap_context.context.initialized) {
if (!psp->rap_context.context.mem_context.shared_buf) {
ret = psp_ta_init_shared_buf(psp, &psp->rap_context.context.mem_context);
if (ret)
return ret;
Expand Down
23 changes: 17 additions & 6 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
err_data.err_addr_cnt);
amdgpu_ras_save_bad_pages(adev);
amdgpu_ras_save_bad_pages(adev, NULL);
}

dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
Expand Down Expand Up @@ -2084,22 +2084,32 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
/*
* write error record array to eeprom, the function should be
* protected by recovery_lock
* new_cnt: new added UE count, excluding reserved bad pages, can be NULL
*/
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
unsigned long *new_cnt)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control;
int save_count;

if (!con || !con->eh_data)
if (!con || !con->eh_data) {
if (new_cnt)
*new_cnt = 0;

return 0;
}

mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
save_count = data->count - control->ras_num_recs;
mutex_unlock(&con->recovery_lock);

if (new_cnt)
*new_cnt = save_count / adev->umc.retire_unit;

/* only new entries are saved */
if (save_count > 0) {
if (amdgpu_ras_eeprom_append(control,
Expand Down Expand Up @@ -2186,11 +2196,12 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
/*
* Justification of value bad_page_cnt_threshold in ras structure
*
* Generally, -1 <= amdgpu_bad_page_threshold <= max record length
* in eeprom, and introduce two scenarios accordingly.
* Generally, 0 <= amdgpu_bad_page_threshold <= max record length
* in eeprom or amdgpu_bad_page_threshold == -2, introduce two
* scenarios accordingly.
*
* Bad page retirement enablement:
* - If amdgpu_bad_page_threshold = -1,
* - If amdgpu_bad_page_threshold = -2,
* bad_page_cnt_threshold = typical value by formula.
*
* - When the value from user is 0 < amdgpu_bad_page_threshold <
Expand Down
3 changes: 2 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,8 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);

int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
unsigned long *new_cnt);

static inline enum ta_ras_block
amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
Expand Down
23 changes: 16 additions & 7 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

if (!__is_ras_eeprom_supported(adev))
if (!__is_ras_eeprom_supported(adev) ||
!amdgpu_bad_page_threshold)
return false;

/* skip check eeprom table for VEGA20 Gaming */
Expand All @@ -428,10 +429,18 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
return false;

if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
dev_warn(adev->dev, "This GPU is in BAD status.");
dev_warn(adev->dev, "Please retire it or set a larger "
"threshold value when reloading driver.\n");
return true;
if (amdgpu_bad_page_threshold == -1) {
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
dev_warn(adev->dev,
"But GPU can be operated due to bad_page_threshold = -1.\n");
return false;
} else {
dev_warn(adev->dev, "This GPU is in BAD status.");
dev_warn(adev->dev, "Please retire it or set a larger "
"threshold value when reloading driver.\n");
return true;
}
}

return false;
Expand Down Expand Up @@ -1191,8 +1200,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
} else {
dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
control->ras_num_recs, ras->bad_page_cnt_threshold);
if (amdgpu_bad_page_threshold == -2) {
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2.");
if (amdgpu_bad_page_threshold == -1) {
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
res = 0;
} else {
*exceed_err_limit = true;
Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
err_data.err_addr_cnt);
amdgpu_ras_save_bad_pages(adev);
amdgpu_ras_save_bad_pages(adev, NULL);
}

out:
Expand Down Expand Up @@ -147,7 +147,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
err_data->err_addr_cnt) {
amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
err_data->err_addr_cnt);
amdgpu_ras_save_bad_pages(adev);
amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));

amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ struct amdgpu_umc {

/* UMC regiser per channel offset */
uint32_t channel_offs;
/* how many pages are retired in one UE */
uint32_t retire_unit;
/* channel index table of interleaved memory */
const uint32_t *channel_idx_tbl;
struct ras_common_if *ras_if;
Expand Down
3 changes: 2 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
/* Limit maximum size to 2GiB due to SG table limitations */
size = min(remaining_size, 2ULL << 30);

if (size >= (u64)pages_per_block << PAGE_SHIFT)
if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;

cur_size = size;
Expand Down
Loading

0 comments on commit 2eb29d5

Please sign in to comment.