Skip to content

Commit

Permalink
Merge tag 'amd-drm-next-5.8-2020-05-19' of git://people.freedesktop.org/~agd5f/linux into drm-next
Browse files Browse the repository at this point in the history

amd-drm-next-5.8-2020-05-19:

amdgpu:
- Improved handling for CTF (Critical Thermal Fault) situations
- Clarify AC/DC mode switches
- SR-IOV fixes
- XGMI fixes for RAS
- Misc cleanups
- Add autodump debugfs node to aid in GPU hang debugging

UAPI:
- Add a MEM_SYNC IB flag for handling proper acquire memory semantics if UMDs expect the kernel to handle this
  Used by AMDVLK: https://github.com/GPUOpen-Drivers/pal/blob/dev/src/core/os/amdgpu/amdgpuQueue.cpp#L1262

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200519202505.4126-1-alexander.deucher@amd.com
  • Loading branch information
Dave Airlie committed May 20, 2020
2 parents 1493bdd + 43c8546 commit bfbe174
Show file tree
Hide file tree
Showing 32 changed files with 775 additions and 313 deletions.
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,8 @@ struct amdgpu_device {
char product_number[16];
char product_name[32];
char serial[16];

struct amdgpu_autodump autodump;
};

static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
Expand Down
78 changes: 77 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>

#include <linux/poll.h>
#include <drm/drm_debugfs.h>

#include "amdgpu.h"
Expand Down Expand Up @@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
}

/**
 * amdgpu_debugfs_wait_dump - let a userspace dumper run before GPU recovery
 *
 * @adev: amdgpu device whose autodump state is used
 *
 * Wakes any poller sleeping on the autodump gpu_hang wait queue, then blocks
 * until the autodump "dumping" completion is signalled or 600 seconds have
 * elapsed.  The completion is kept in the completed state whenever the
 * autodump debugfs node is not held open, so the wait returns right away in
 * that case.
 *
 * Without CONFIG_DEBUG_FS this function does nothing.
 *
 * Returns -ETIMEDOUT if the timeout elapsed.  Every other outcome, including
 * a wait that was interrupted by a signal, is reported as 0.
 */
int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
{
#if defined(CONFIG_DEBUG_FS)
	unsigned long timeout = 600 * HZ;
	/* wait_for_completion_interruptible_timeout() returns a long */
	long ret;

	wake_up_interruptible(&adev->autodump.gpu_hang);

	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
	if (ret == 0) {
		pr_err("autodump: timeout, move on to gpu recovery\n");
		return -ETIMEDOUT;
	}
#endif
	return 0;
}

#if defined(CONFIG_DEBUG_FS)

/*
 * open() handler for the autodump debugfs node.
 *
 * Stores the device in file->private_data and, under adev->lock_reset,
 * re-arms the "dumping" completion if it is currently in the completed
 * state.  If the completion is not completed the open is refused.
 *
 * Returns 0 on success, -EBUSY otherwise.
 */
static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
{
	struct amdgpu_device *adev = inode->i_private;
	int ret = -EBUSY;

	file->private_data = adev;

	mutex_lock(&adev->lock_reset);
	if (adev->autodump.dumping.done) {
		/* re-arm so that amdgpu_debugfs_wait_dump() blocks on us */
		reinit_completion(&adev->autodump.dumping);
		ret = 0;
	}
	mutex_unlock(&adev->lock_reset);

	return ret;
}

/*
 * release() handler for the autodump debugfs node.
 *
 * Signals the "dumping" completion to all waiters, which also leaves it in
 * the completed state.  Always returns 0.
 */
static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
{
	struct amdgpu_device *adev = file->private_data;
	struct completion *dumping = &adev->autodump.dumping;

	complete_all(dumping);

	return 0;
}

/*
 * poll() handler for the autodump debugfs node.
 *
 * Registers the caller on the autodump gpu_hang wait queue and reports the
 * file as readable and writable while adev->in_gpu_reset is set; otherwise
 * no events are reported.
 */
static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
{
	struct amdgpu_device *adev = file->private_data;
	unsigned int events = 0;

	poll_wait(file, &adev->autodump.gpu_hang, poll_table);

	if (adev->in_gpu_reset)
		events = POLLIN | POLLRDNORM | POLLWRNORM;

	return events;
}

/* File operations backing the "amdgpu_autodump" debugfs node. */
static const struct file_operations autodump_debug_fops = {
	.owner		= THIS_MODULE,
	.open		= amdgpu_debugfs_autodump_open,
	.release	= amdgpu_debugfs_autodump_release,
	.poll		= amdgpu_debugfs_autodump_poll,
};

static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
{
init_completion(&adev->autodump.dumping);
complete_all(&adev->autodump.dumping);
init_waitqueue_head(&adev->autodump.gpu_hang);

debugfs_create_file("amdgpu_autodump", 0600,
adev->ddev->primary->debugfs_root,
adev, &autodump_debug_fops);
}

/**
* amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
*
Expand Down Expand Up @@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)

amdgpu_ras_debugfs_create_all(adev);

amdgpu_debugfs_autodump_init(adev);

return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
ARRAY_SIZE(amdgpu_debugfs_list));
}
Expand Down
6 changes: 6 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
unsigned num_files;
};

/*
 * Per-device state of the "amdgpu_autodump" debugfs node, embedded in
 * struct amdgpu_device as the autodump member.
 */
struct amdgpu_autodump {
/*
 * Waited on (with a timeout) by amdgpu_debugfs_wait_dump(); re-armed when
 * the debugfs node is opened and completed again when it is released.
 */
struct completion dumping;
/*
 * Wait queue used by the node's poll() handler; woken by
 * amdgpu_debugfs_wait_dump().
 */
struct wait_queue_head gpu_hang;
};

int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
int amdgpu_debugfs_init(struct amdgpu_device *adev);
void amdgpu_debugfs_fini(struct amdgpu_device *adev);
Expand All @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
int i, r = 0;
bool need_full_reset = *need_full_reset_arg;

amdgpu_debugfs_wait_dump(adev);

/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
Expand Down
10 changes: 10 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1188,3 +1188,13 @@ int amdgpu_dpm_set_df_cstate(struct amdgpu_device *adev,

return ret;
}

/*
 * Forward an XGMI power-down allow/disallow request to the software SMU.
 *
 * @adev: amdgpu device
 * @en:   passed through unchanged to smu_allow_xgmi_power_down()
 *        (presumably true allows power down -- confirm against the SMU code)
 *
 * Returns the result of smu_allow_xgmi_power_down() when the device uses the
 * software SMU, and 0 without doing anything otherwise.
 */
int amdgpu_dpm_allow_xgmi_power_down(struct amdgpu_device *adev, bool en)
{
	struct smu_context *smu = &adev->smu;

	/* nothing to do unless the software SMU path is in use */
	if (!is_support_sw_smu(adev))
		return 0;

	return smu_allow_xgmi_power_down(smu, en);
}
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h
Original file line number Diff line number Diff line change
Expand Up @@ -538,4 +538,6 @@ int amdgpu_dpm_baco_enter(struct amdgpu_device *adev);
int amdgpu_dpm_set_df_cstate(struct amdgpu_device *adev,
uint32_t cstate);

int amdgpu_dpm_allow_xgmi_power_down(struct amdgpu_device *adev, bool en);

#endif
3 changes: 2 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,10 @@
* - 3.35.0 - Add drm_amdgpu_info_device::tcc_disabled_mask
* - 3.36.0 - Allow reading more status registers on si/cik
* - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness
* - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
*/
#define KMS_DRIVER_MAJOR 3
#define KMS_DRIVER_MINOR 37
#define KMS_DRIVER_MINOR 38
#define KMS_DRIVER_PATCHLEVEL 0

int amdgpu_vram_limit = 0;
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
dma_fence_put(tmp);
}

if ((ib->flags & AMDGPU_IB_FLAG_EMIT_MEM_SYNC) && ring->funcs->emit_mem_sync)
ring->funcs->emit_mem_sync(ring);

if (ring->funcs->insert_start)
ring->funcs->insert_start(ring);

Expand Down
Loading

0 comments on commit bfbe174

Please sign in to comment.