Skip to content

Commit

Permalink
Merge tag 'drm-next-5.5-2019-11-08' of git://people.freedesktop.org/~…
Browse files Browse the repository at this point in the history
…agd5f/linux into drm-next

drm-next-5.5-2019-11-08:

amdgpu:
- Enable VCN dynamic powergating on RV/RV2
- Fixes for Navi14
- Misc Navi fixes
- Fix MSI-X tear down
- Misc Arturus fixes
- Fix xgmi powerstate handling
- Documentation fixes

scheduler:
- Fix static code checker warning
- Fix possible thread reactivation while thread is stopped
- Avoid cleanup if thread is parked

radeon:
- SI dpm fix ported from amdgpu

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191108212713.5078-1-alexander.deucher@amd.com
  • Loading branch information
Dave Airlie committed Nov 13, 2019
2 parents 77e0723 + 53dbc27 commit 0990ca2
Show file tree
Hide file tree
Showing 30 changed files with 406 additions and 62 deletions.
35 changes: 35 additions & 0 deletions Documentation/gpu/amdgpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,21 @@ AMDGPU XGMI Support
AMDGPU RAS Support
==================

The AMDGPU RAS interfaces are exposed via sysfs (for informational queries) and
debugfs (for error injection).

RAS debugfs/sysfs Control and Error Injection Interfaces
--------------------------------------------------------

.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS debugfs control interface

RAS Reboot Behavior for Unrecoverable Errors
--------------------------------------------------------

.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS Reboot Behavior for Unrecoverable Errors

RAS Error Count sysfs Interface
-------------------------------

Expand All @@ -109,6 +118,32 @@ RAS VRAM Bad Pages sysfs Interface
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:internal:

Sample Code
-----------
Sample code for testing error injection can be found here:
https://cgit.freedesktop.org/mesa/drm/tree/tests/amdgpu/ras_tests.c

This is part of the libdrm amdgpu unit tests which cover several areas of the GPU.
There are four sets of tests:

RAS Basic Test

The test verifies the RAS feature enabled status and makes sure the necessary sysfs and debugfs files
are present.

RAS Query Test

This test checks the RAS availability and enablement status for each supported IP block as well as
the error counts.

RAS Inject Test

This test injects errors for each IP.

RAS Disable Test

This test verifies disabling of RAS features for each IP block.


GPU Power/Thermal Controls and Monitoring
=========================================
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,9 @@ struct amdgpu_device {

uint64_t unique_id;
uint64_t df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];

/* device pstate */
int pstate;
};

static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
Expand Down
6 changes: 2 additions & 4 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_benchmark.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
{
unsigned long start_jiffies;
unsigned long end_jiffies;
struct dma_fence *fence = NULL;
struct dma_fence *fence;
int i, r;

start_jiffies = jiffies;
Expand All @@ -44,16 +44,14 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
if (r)
goto exit_do_move;
r = dma_fence_wait(fence, false);
dma_fence_put(fence);
if (r)
goto exit_do_move;
dma_fence_put(fence);
}
end_jiffies = jiffies;
r = jiffies_to_msecs(end_jiffies - start_jiffies);

exit_do_move:
if (fence)
dma_fence_put(fence);
return r;
}

Expand Down
10 changes: 10 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
struct amdgpu_device *adev = dev->dev_private;
int r = 0, i;

/* Avoid accidentally unparking the sched thread during GPU reset */
mutex_lock(&adev->lock_reset);

/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
Expand All @@ -884,6 +887,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
kthread_unpark(ring->sched.thread);
}

mutex_unlock(&adev->lock_reset);

return 0;
}

Expand Down Expand Up @@ -1036,6 +1041,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
if (!fences)
return -ENOMEM;

/* Avoid accidentally unparking the sched thread during GPU reset */
mutex_lock(&adev->lock_reset);

/* stop the scheduler */
kthread_park(ring->sched.thread);

Expand Down Expand Up @@ -1075,6 +1083,8 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);

mutex_unlock(&adev->lock_reset);

ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

kfree(fences);
Expand Down
36 changes: 34 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -2057,6 +2057,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
*/
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
struct amdgpu_gpu_instance *gpu_instance;
int i = 0, r;

for (i = 0; i < adev->num_ip_blocks; i++) {
Expand All @@ -2082,8 +2083,39 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
if (r)
DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

/* set to low pstate by default */
amdgpu_xgmi_set_pstate(adev, 0);

if (adev->gmc.xgmi.num_physical_nodes > 1) {
mutex_lock(&mgpu_info.mutex);

/*
* Reset device p-state to low as this was booted with high.
*
* This should be performed only after all devices from the same
* hive get initialized.
*
* However, it's unknown in advance how many devices are in the hive,
* as they are counted one by one during device initialization.
*
* So, we wait until all XGMI interlinked devices are initialized.
* This may bring some delays as those devices may come from
* different hives. But that should be OK.
*/
if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
for (i = 0; i < mgpu_info.num_gpu; i++) {
gpu_instance = &(mgpu_info.gpu_ins[i]);
if (gpu_instance->adev->flags & AMD_IS_APU)
continue;

r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
if (r) {
DRM_ERROR("pstate setting failed (%d).\n", r);
break;
}
}
}

mutex_unlock(&mgpu_info.mutex);
}

return 0;
}
Expand Down
30 changes: 30 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,18 +127,48 @@ struct amdgpu_xgmi {
};

struct amdgpu_gmc {
/* FB's physical address in MMIO space (for CPU to
* map FB). This is different compared to the agp/
* gart/vram_start/end field as the latter is from
* GPU's view and aper_base is from CPU's view.
*/
resource_size_t aper_size;
resource_size_t aper_base;
/* for some chips with <= 32MB we need to lie
* about vram size near mc fb location */
u64 mc_vram_size;
u64 visible_vram_size;
/* AGP aperture start and end in MC address space
* Driver finds a hole in the MC address space
* to place AGP by setting MC_VM_AGP_BOT/TOP registers
* Under VMID0, logical address == MC address. AGP
* aperture maps to physical bus or IOVA addressed.
* AGP aperture is used to simulate FB in ZFB case.
* AGP aperture is also used for page table in system
* memory (mainly for APU).
*
*/
u64 agp_size;
u64 agp_start;
u64 agp_end;
/* GART aperture start and end in MC address space
* Driver finds a hole in the MC address space
* to place GART by setting VM_CONTEXT0_PAGE_TABLE_START/END_ADDR
* registers
* Under VMID0, logical address inside GART aperture will
* be translated through gpuvm gart page table to access
* paged system memory
*/
u64 gart_size;
u64 gart_start;
u64 gart_end;
/* Frame buffer aperture of this GPU device. Different from
* fb_start (see below), this only covers the local GPU device.
* Driver gets fb_start from MC_VM_FB_LOCATION_BASE (set by vbios)
* and calculates vram_start of this local device by adding an
* offset inside the XGMI hive.
* Under VMID0, logical address == MC address
*/
u64 vram_start;
u64 vram_end;
/* FB region , it's same as local vram region in single GPU, in XGMI
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
drm_irq_uninstall(adev->ddev);
adev->irq.installed = false;
if (adev->irq.msi_enabled)
pci_disable_msi(adev->pdev);
pci_free_irq_vectors(adev->pdev);
if (!amdgpu_device_has_dc_support(adev))
flush_work(&adev->hotplug_work);
}
Expand Down
40 changes: 33 additions & 7 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,22 +220,28 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
* As their names indicate, inject operation will write the
* value to the address.
*
* Second member: struct ras_debug_if::op.
* The second member: struct ras_debug_if::op.
* It has three kinds of operations.
*
* - 0: disable RAS on the block. Take ::head as its data.
* - 1: enable RAS on the block. Take ::head as its data.
* - 2: inject errors on the block. Take ::inject as its data.
*
* How to use the interface?
* programs:
* copy the struct ras_debug_if in your codes and initialize it.
* write the struct to the control node.
*
* Programs
*
* Copy the struct ras_debug_if into your code and initialize it.
* Write the struct to the control node.
*
* Shells
*
* .. code-block:: bash
*
* echo op block [error [sub_block address value]] > .../ras/ras_ctrl
*
* Parameters:
*
* op: disable, enable, inject
* disable: only block is needed
* enable: block and error are needed
Expand Down Expand Up @@ -265,8 +271,10 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
* /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
*
* .. note::
* Operation is only allowed on blocks which are supported.
* Operations are only allowed on blocks which are supported.
* Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
* to see which blocks support RAS on a particular asic.
*
*/
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
size_t size, loff_t *pos)
Expand Down Expand Up @@ -322,7 +330,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
* DOC: AMDGPU RAS debugfs EEPROM table reset interface
*
* Some boards contain an EEPROM which is used to persistently store a list of
* bad pages containing ECC errors detected in vram. This interface provides
* bad pages which experience ECC errors in vram. This interface provides
* a way to reset the EEPROM, e.g., after testing error injection.
*
* Usage:
Expand Down Expand Up @@ -362,7 +370,7 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
/**
* DOC: AMDGPU RAS sysfs Error Count Interface
*
* It allows user to read the error count for each IP block on the gpu through
* It allows the user to read the error count for each IP block on the gpu through
* /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
*
* It outputs the multiple lines which report the uncorrected (ue) and corrected
Expand Down Expand Up @@ -1027,6 +1035,24 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
}
/* sysfs end */

/**
* DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
*
* Normally when there is an uncorrectable error, the driver will reset
* the GPU to recover. However, in the event of an unrecoverable error,
* the driver provides an interface to reboot the system automatically.
*
* The following file in debugfs provides that interface:
* /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
*
* Usage:
*
* .. code-block:: bash
*
* echo true > .../ras/auto_reboot
*
*/
/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
}

dma_fence_put(fence);
fence = NULL;

r = amdgpu_bo_kmap(vram_obj, &vram_map);
if (r) {
Expand Down Expand Up @@ -183,6 +184,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
}

dma_fence_put(fence);
fence = NULL;

r = amdgpu_bo_kmap(gtt_obj[i], &gtt_map);
if (r) {
Expand Down
6 changes: 3 additions & 3 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1906,9 +1906,6 @@ void amdgpu_ttm_late_init(struct amdgpu_device *adev)
void *stolen_vga_buf;
/* return the VGA stolen memory (if any) back to VRAM */
amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);

/* return the IP Discovery TMR memory back to VRAM */
amdgpu_bo_free_kernel(&adev->discovery_memory, NULL, NULL);
}

/**
Expand All @@ -1921,7 +1918,10 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)

amdgpu_ttm_debugfs_fini(adev);
amdgpu_ttm_training_reserve_vram_fini(adev);
/* return the IP Discovery TMR memory back to VRAM */
amdgpu_bo_free_kernel(&adev->discovery_memory, NULL, NULL);
amdgpu_ttm_fw_reserve_vram_fini(adev);

if (adev->mman.aper_base_kaddr)
iounmap(adev->mman.aper_base_kaddr);
adev->mman.aper_base_kaddr = NULL;
Expand Down
10 changes: 9 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1418,6 +1418,9 @@ static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
uint64_t incr, entry_end, pe_start;
struct amdgpu_bo *pt;

/* make sure that the page tables covering the address range are
* actually allocated
*/
r = amdgpu_vm_alloc_pts(params->adev, params->vm, &cursor,
params->direct);
if (r)
Expand Down Expand Up @@ -1491,7 +1494,12 @@ static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
} while (frag_start < entry_end);

if (amdgpu_vm_pt_descendant(adev, &cursor)) {
/* Free all child entries */
/* Free all child entries.
* Update the tables with the flags and addresses and free up subsequent
* tables in the case of huge pages or freed up areas.
* This is the maximum you can free, because all other page tables are not
* completely covered by the range and so potentially still in use.
*/
while (cursor.pfn < frag_start) {
amdgpu_vm_free_pts(adev, params->vm, &cursor);
amdgpu_vm_pt_next(adev, &cursor);
Expand Down
Loading

0 comments on commit 0990ca2

Please sign in to comment.