Skip to content

Commit

Permalink
Merge tag 'amd-drm-next-6.10-2024-04-26' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
Browse files Browse the repository at this point in the history

amd-drm-next-6.10-2024-04-26:

amdgpu:
- Misc code cleanups and refactors
- Support setting reset method at runtime
- Report OD status
- SMU 14.0.1 fixes
- SDMA 4.4.2 fixes
- VPE fixes
- MES fixes
- Update BO eviction priorities
- UMSCH fixes
- Reset fixes
- Freesync fixes
- GFXIP 9.4.3 fixes
- SDMA 5.2 fixes
- MES UAF fix
- RAS updates
- Devcoredump updates for dumping IP state
- DSC fixes
- JPEG fix
- Fix VRAM memory accounting
- VCN 5.0 fixes
- MES fixes
- UMC 12.0 updates
- Modify contiguous flags handling
- Initial support for mapping kernel queues via MES

amdkfd:
- Fix rescheduling of restore worker
- VRAM accounting for SVM migrations
- mGPU fix
- Enable SQ watchpoint for gfx10

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240426221245.1613332-1-alexander.deucher@amd.com
  • Loading branch information
Dave Airlie committed Apr 30, 2024
2 parents 68b89e2 + b77bef3 commit 4a56c0e
Show file tree
Hide file tree
Showing 185 changed files with 1,650 additions and 286 deletions.
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/aldebaran.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
}

return r;
return 0;
}

static int
Expand Down
9 changes: 9 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,14 @@ enum amdgpu_ss {
AMDGPU_SS_DRV_UNLOAD
};

/*
 * One table entry identifying a hardware-IP register by location and name.
 * NOTE(review): presumably consumed by the per-IP register dump tables
 * (see "IP reg dump" in struct amdgpu_gfx) -- confirm against the users.
 */
struct amdgpu_hwip_reg_entry {
u32 hwip; /* presumably the hardware IP block identifier -- TODO confirm */
u32 inst; /* presumably the instance index within that IP -- TODO confirm */
u32 seg; /* presumably the register base segment index -- TODO confirm */
u32 reg_offset; /* register offset; base it is relative to is not shown here */
const char *reg_name; /* human-readable register name; string is not owned by the entry as far as the const qualifier suggests */
};

struct amdgpu_watchdog_timer {
bool timeout_fatal_disable;
uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
Expand Down Expand Up @@ -494,6 +502,7 @@ struct amdgpu_wb {
uint64_t gpu_addr;
u32 num_wb; /* Number of wb slots actually reserved for amdgpu. */
unsigned long used[DIV_ROUND_UP(AMDGPU_MAX_WB, BITS_PER_LONG)];
spinlock_t lock;
};

int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb);
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,8 @@ static const struct amd_ip_funcs acp_ip_funcs = {
.soft_reset = acp_soft_reset,
.set_clockgating_state = acp_set_clockgating_state,
.set_powergating_state = acp_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};

const struct amdgpu_ip_block_version acp_ip_block = {
Expand Down
9 changes: 8 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -747,10 +747,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
return amdgpu_ras_get_fed_status(adev);
}

/*
 * Thin wrapper: forwards every argument, unchanged and in the same order,
 * to amdgpu_umc_pasid_poison_handler(). Nothing is returned to the caller;
 * any result of the UMC handler is discarded here.
 *
 * @adev:     device the poison event belongs to
 * @block:    RAS block reporting the poison consumption
 * @pasid:    PASID passed through to the UMC handler
 * @pasid_fn: callback passed through to the UMC handler
 * @data:     opaque pointer passed through alongside @pasid_fn
 * @reset:    reset selector passed through to the UMC handler
 */
void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset)
{
amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data, reset);
}

void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset)
{
amdgpu_umc_poison_handler(adev, block, reset);
amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
}

int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
Expand Down
5 changes: 5 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset);

void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);

bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p);
Expand Down
71 changes: 58 additions & 13 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,7 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
}

#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
#define SQ_WATCH_STRIDE (mmSQ_WATCH1_ADDR_H - mmSQ_WATCH0_ADDR_H)
uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
uint64_t watch_address,
uint32_t watch_address_mask,
Expand All @@ -889,55 +890,93 @@ uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
uint32_t debug_vmid,
uint32_t inst)
{
/* SQ_WATCH?_ADDR_* and TCP_WATCH?_ADDR_* are programmed with the
* same values.
*/
uint32_t watch_address_high;
uint32_t watch_address_low;
uint32_t watch_address_cntl;

watch_address_cntl = 0;
uint32_t tcp_watch_address_cntl;
uint32_t sq_watch_address_cntl;

watch_address_low = lower_32_bits(watch_address);
watch_address_high = upper_32_bits(watch_address) & 0xffff;

watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = 0;
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VMID,
debug_vmid);
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
MODE,
watch_mode);
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
MASK,
watch_address_mask >> 7);

sq_watch_address_cntl = 0;
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
VMID,
debug_vmid);
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
MODE,
watch_mode);
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
MASK,
watch_address_mask >> 6);

/* Turning off this watch point until we set all the registers */
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VALID,
0);

WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_cntl);
tcp_watch_address_cntl);

sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
VALID,
0);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
(watch_id * SQ_WATCH_STRIDE)),
sq_watch_address_cntl);

/* Program {TCP,SQ}_WATCH?_ADDR* */
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_high);

WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_low);

WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_H) +
(watch_id * SQ_WATCH_STRIDE)),
watch_address_high);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_L) +
(watch_id * SQ_WATCH_STRIDE)),
watch_address_low);

/* Enable the watch point */
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VALID,
1);

WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_cntl);
tcp_watch_address_cntl);

sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
VALID,
1);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
(watch_id * SQ_WATCH_STRIDE)),
sq_watch_address_cntl);

return 0;
}
Expand All @@ -953,8 +992,14 @@ uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
(watch_id * TCP_WATCH_STRIDE)),
watch_address_cntl);

WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
(watch_id * SQ_WATCH_STRIDE)),
watch_address_cntl);

return 0;
}
#undef TCP_WATCH_STRIDE
#undef SQ_WATCH_STRIDE


/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
vram_size - reserved_for_pt)) {
vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) {
ret = -ENOMEM;
goto release;
}
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_benchmark.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
for (i = 0; i < n; i++) {
struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
r = amdgpu_copy_buffer(ring, saddr, daddr, size, NULL, &fence,
false, false, false);
false, false, 0);
if (r)
goto exit_do_move;
r = dma_fence_wait(fence, false);
Expand Down
5 changes: 3 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -2065,12 +2065,13 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
char reg_offset[11];
uint32_t *new = NULL, *tmp = NULL;
int ret, i = 0, len = 0;
unsigned int len = 0;
int ret, i = 0;

do {
memset(reg_offset, 0, 11);
if (copy_from_user(reg_offset, buf + len,
min(10, ((int)size-len)))) {
min(10, (size-len)))) {
ret = -EFAULT;
goto error_free;
}
Expand Down
14 changes: 14 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);

/* dump the ip state for each ip */
drm_printf(&p, "IP Dump\n");
for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) {
drm_printf(&p, "IP: %s\n",
coredump->adev->ip_blocks[i]
.version->funcs->name);
coredump->adev->ip_blocks[i]
.version->funcs->print_ip_state(
(void *)coredump->adev, &p);
drm_printf(&p, "\n");
}
}

/* Add ring buffer information */
drm_printf(&p, "Ring buffer information\n");
for (int i = 0; i < coredump->adev->num_rings; i++) {
Expand Down
21 changes: 19 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -1482,13 +1482,17 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev)
*/
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
unsigned long flags, offset;

spin_lock_irqsave(&adev->wb.lock, flags);
offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
if (offset < adev->wb.num_wb) {
__set_bit(offset, adev->wb.used);
spin_unlock_irqrestore(&adev->wb.lock, flags);
*wb = offset << 3; /* convert to dw offset */
return 0;
} else {
spin_unlock_irqrestore(&adev->wb.lock, flags);
return -EINVAL;
}
}
Expand All @@ -1503,9 +1507,13 @@ int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
*/
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long irq_state;
	u32 slot = wb >> 3; /* dw offset back to a slot index in wb.used */

	/* The used bitmap is only modified with wb.lock held. */
	spin_lock_irqsave(&adev->wb.lock, irq_state);
	if (slot < adev->wb.num_wb) {
		/* Out-of-range slots are silently ignored. */
		__clear_bit(slot, adev->wb.used);
	}
	spin_unlock_irqrestore(&adev->wb.lock, irq_state);
}

/**
Expand Down Expand Up @@ -4061,6 +4069,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
spin_lock_init(&adev->se_cac_idx_lock);
spin_lock_init(&adev->audio_endpt_idx_lock);
spin_lock_init(&adev->mm_stats.lock);
spin_lock_init(&adev->wb.lock);

INIT_LIST_HEAD(&adev->shadow_list);
mutex_init(&adev->shadow_list_lock);
Expand Down Expand Up @@ -5353,14 +5362,22 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
uint32_t i;

/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);

if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
amdgpu_reset_reg_dumps(tmp_adev);

/* Trigger ip dump before we reset the asic */
for (i = 0; i < tmp_adev->num_ip_blocks; i++)
if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
tmp_adev->ip_blocks[i].version->funcs
->dump_ip_state((void *)tmp_adev);
}

reset_context->reset_device_list = device_list_handle;
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -925,7 +925,7 @@ module_param_named(freesync_video, amdgpu_freesync_vid_mode, uint, 0444);
* GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco/bamaco)
*/
MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco/bamaco)");
module_param_named(reset_method, amdgpu_reset_method, int, 0444);
module_param_named(reset_method, amdgpu_reset_method, int, 0644);

/**
* DOC: bad_page_threshold (int) Bad page threshold specifies the
Expand Down
3 changes: 2 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
Original file line number Diff line number Diff line change
Expand Up @@ -1206,7 +1206,8 @@ void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev,
fw_size = le32_to_cpu(cp_hdr_v2_0->data_size_bytes);
break;
default:
break;
dev_err(adev->dev, "Invalid ucode id %u\n", ucode_id);
return;
}

if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
Expand Down
4 changes: 4 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,10 @@ struct amdgpu_gfx {
uint32_t num_xcc_per_xcp;
struct mutex partition_mutex;
bool mcbp; /* mid command buffer preemption */

/* IP reg dump */
uint32_t *ip_dump;
uint32_t reg_count;
};

struct amdgpu_gfx_ras_reg_entry {
Expand Down
Loading

0 comments on commit 4a56c0e

Please sign in to comment.