Skip to content

Commit

Permalink
Merge tag 'drm-amdkfd-next-2017-12-24' of git://people.freedesktop.or…
Browse files Browse the repository at this point in the history
…g/~gabbayo/linux into drm-next

- Add CWSR (compute wave save restore) support for GFX8 (Carrizo)
- Fix SDMA user-mode queues support for GFX7 (Kaveri)
- Add SDMA user-mode queues support for GFX8 (Carrizo)
- Allow HWS (hardware scheduling) to schedule multiple processes concurrently
- Add debugfs support
- Simplify process locking and lock dependencies
- Refactoring topology code to prepare for dGPU support + fixes to that code
  - Add option to generate dummy/virtual CRAT table when its missing or deformed
  - Recognize CPUs other then APUs as compute entities
- Various clean ups and bug fixes

I have not yet sent the dGPU topology code because it depends on a patch
for the PCI subsystem that adds PCIe atomics support. Once that patch is
upstreamed we can continue with the rest of the dGPU code.

* tag 'drm-amdkfd-next-2017-12-24' of git://people.freedesktop.org/~gabbayo/linux: (53 commits)
  drm/amdgpu: Add support for reporting VRAM usage
  drm/amdkfd: Ignore ACPI CRAT for non-APU systems
  drm/amdkfd: Module option to disable CRAT table
  drm/amdkfd: Add AQL Queue Memory flag on topology
  drm/amdkfd: Fixup incorrect info in the CZ CRAT table
  drm/amdkfd: Add perf counters to topology
  drm/amdkfd: Add topology support for dGPUs
  drm/amdkfd: Add topology support for CPUs
  drm/amdkfd: Fix sibling_map[] size
  drm/amdkfd: Simplify counting of memory banks
  drm/amdkfd: Turn verbose topology messages into pr_debug
  drm/amdkfd: sync IOLINK defines to thunk spec
  drm/amdkfd: Support enumerating non-GPU devices
  drm/amdkfd: Decouple CRAT parsing from device list update
  drm/amdkfd: Reorganize CRAT fetching from ACPI
  drm/amdkfd: Group up CRAT related functions
  drm/amdkfd: Fix memory leaks in kfd topology
  drm/amdkfd: Topology: Fix location_id
  drm/amdkfd: Update number of compute unit from KGD
  drm/amd: Remove get_vmem_size from KGD-KFD interface
  ...
  • Loading branch information
Dave Airlie committed Jan 4, 2018
2 parents 3508776 + 9f0a0b4 commit a9742b7
Show file tree
Hide file tree
Showing 37 changed files with 4,632 additions and 643 deletions.
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -959,6 +959,7 @@ struct amdgpu_gfx_config {
};

struct amdgpu_cu_info {
uint32_t simd_per_cu;
uint32_t max_waves_per_simd;
uint32_t wave_front_size;
uint32_t max_scratch_slots_per_cu;
Expand Down
67 changes: 60 additions & 7 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,14 +275,34 @@ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
kfree(mem);
}

uint64_t get_vmem_size(struct kgd_dev *kgd)
void get_local_mem_info(struct kgd_dev *kgd,
struct kfd_local_mem_info *mem_info)
{
struct amdgpu_device *adev =
(struct amdgpu_device *)kgd;
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
uint64_t address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask :
~((1ULL << 32) - 1);
resource_size_t aper_limit = adev->mc.aper_base + adev->mc.aper_size;

memset(mem_info, 0, sizeof(*mem_info));
if (!(adev->mc.aper_base & address_mask || aper_limit & address_mask)) {
mem_info->local_mem_size_public = adev->mc.visible_vram_size;
mem_info->local_mem_size_private = adev->mc.real_vram_size -
adev->mc.visible_vram_size;
} else {
mem_info->local_mem_size_public = 0;
mem_info->local_mem_size_private = adev->mc.real_vram_size;
}
mem_info->vram_width = adev->mc.vram_width;

BUG_ON(kgd == NULL);
pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n",
adev->mc.aper_base, aper_limit,
mem_info->local_mem_size_public,
mem_info->local_mem_size_private);

return adev->mc.real_vram_size;
if (amdgpu_sriov_vf(adev))
mem_info->mem_clk_max = adev->clock.default_mclk / 100;
else
mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100;
}

uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
Expand All @@ -298,6 +318,39 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

/* The sclk is in quantas of 10kHz */
return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100;
/* the sclk is in quantas of 10kHz */
if (amdgpu_sriov_vf(adev))
return adev->clock.default_sclk / 100;

return amdgpu_dpm_get_sclk(adev, false) / 100;
}

void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_cu_info acu_info = adev->gfx.cu_info;

memset(cu_info, 0, sizeof(*cu_info));
if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
return;

cu_info->cu_active_number = acu_info.number;
cu_info->cu_ao_mask = acu_info.ao_cu_mask;
memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
sizeof(acu_info.bitmap));
cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
cu_info->simd_per_cu = acu_info.simd_per_cu;
cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
cu_info->wave_front_size = acu_info.wave_front_size;
cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
cu_info->lds_size = acu_info.lds_size;
}

uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

return amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
}
5 changes: 4 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,13 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr);
void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
uint64_t get_vmem_size(struct kgd_dev *kgd);
void get_local_mem_info(struct kgd_dev *kgd,
struct kfd_local_mem_info *mem_info);
uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);

uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);

#define read_user_wptr(mmptr, wptr, dst) \
({ \
Expand Down
111 changes: 98 additions & 13 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
uint32_t queue_id, uint32_t __user *wptr,
uint32_t wptr_shift, uint32_t wptr_mask,
struct mm_struct *mm);
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
static int kgd_hqd_dump(struct kgd_dev *kgd,
uint32_t pipe_id, uint32_t queue_id,
uint32_t (**dump)[2], uint32_t *n_regs);
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
uint32_t __user *wptr, struct mm_struct *mm);
static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
uint32_t engine_id, uint32_t queue_id,
uint32_t (**dump)[2], uint32_t *n_regs);
static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
uint32_t pipe_id, uint32_t queue_id);

Expand Down Expand Up @@ -166,7 +173,7 @@ static int get_tile_config(struct kgd_dev *kgd,
static const struct kfd2kgd_calls kfd2kgd = {
.init_gtt_mem_allocation = alloc_gtt_mem,
.free_gtt_mem = free_gtt_mem,
.get_vmem_size = get_vmem_size,
.get_local_mem_info = get_local_mem_info,
.get_gpu_clock_counter = get_gpu_clock_counter,
.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
.alloc_pasid = amdgpu_vm_alloc_pasid,
Expand All @@ -177,6 +184,8 @@ static const struct kfd2kgd_calls kfd2kgd = {
.init_interrupts = kgd_init_interrupts,
.hqd_load = kgd_hqd_load,
.hqd_sdma_load = kgd_hqd_sdma_load,
.hqd_dump = kgd_hqd_dump,
.hqd_sdma_dump = kgd_hqd_sdma_dump,
.hqd_is_occupied = kgd_hqd_is_occupied,
.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
.hqd_destroy = kgd_hqd_destroy,
Expand All @@ -191,6 +200,8 @@ static const struct kfd2kgd_calls kfd2kgd = {
.get_fw_version = get_fw_version,
.set_scratch_backing_va = set_scratch_backing_va,
.get_tile_config = get_tile_config,
.get_cu_info = get_cu_info,
.get_vram_usage = amdgpu_amdkfd_get_vram_usage
};

struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
Expand Down Expand Up @@ -375,7 +386,44 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
return 0;
}

static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
static int kgd_hqd_dump(struct kgd_dev *kgd,
uint32_t pipe_id, uint32_t queue_id,
uint32_t (**dump)[2], uint32_t *n_regs)
{
struct amdgpu_device *adev = get_amdgpu_device(kgd);
uint32_t i = 0, reg;
#define HQD_N_REGS (35+4)
#define DUMP_REG(addr) do { \
if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
break; \
(*dump)[i][0] = (addr) << 2; \
(*dump)[i++][1] = RREG32(addr); \
} while (0)

*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

acquire_queue(kgd, pipe_id, queue_id);

DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);

for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
DUMP_REG(reg);

release_queue(kgd);

WARN_ON_ONCE(i != HQD_N_REGS);
*n_regs = i;

return 0;
}

static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
uint32_t __user *wptr, struct mm_struct *mm)
{
struct amdgpu_device *adev = get_amdgpu_device(kgd);
struct cik_sdma_rlc_registers *m;
Expand Down Expand Up @@ -410,10 +458,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
}

WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL,
m->sdma_rlc_doorbell);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL,
ENABLE, 1);
WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);

if (read_user_wptr(mm, wptr, data))
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
else
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
m->sdma_rlc_rb_rptr);

WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
m->sdma_rlc_virtual_addr);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
Expand All @@ -423,8 +478,37 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
m->sdma_rlc_rb_rptr_addr_lo);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
m->sdma_rlc_rb_rptr_addr_hi);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
m->sdma_rlc_rb_cntl);

data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL,
RB_ENABLE, 1);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);

return 0;
}

static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
uint32_t engine_id, uint32_t queue_id,
uint32_t (**dump)[2], uint32_t *n_regs)
{
struct amdgpu_device *adev = get_amdgpu_device(kgd);
uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+4)

*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
DUMP_REG(sdma_offset + reg);
for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
reg++)
DUMP_REG(sdma_offset + reg);

WARN_ON_ONCE(i != HQD_N_REGS);
*n_regs = i;

return 0;
}
Expand Down Expand Up @@ -575,7 +659,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
struct cik_sdma_rlc_registers *m;
uint32_t sdma_base_addr;
uint32_t temp;
int timeout = utimeout;
unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

m = get_sdma_mqd(mqd);
sdma_base_addr = get_sdma_base_addr(m);
Expand All @@ -588,17 +672,18 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT)
break;
if (timeout <= 0)
if (time_after(jiffies, end_jiffies))
return -ETIME;
msleep(20);
timeout -= 20;
usleep_range(500, 1000);
}

WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);

return 0;
}

Expand Down
Loading

0 comments on commit a9742b7

Please sign in to comment.