drm/i915: Execlists small cleanups and micro-optimisations
Assorted changes covering code cleanup, removal of an invariant
conditional from the interrupt handler, reduced lock contention
and MMIO access optimisation.

 * Remove needless initialization.
 * Improve cache locality by reorganizing code and/or using
   branch hints to keep unexpected or error conditions out
   of line (sketched below).
 * Favor busy submit path vs. empty queue.
 * Less branching in hot paths.
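
As a minimal, hypothetical sketch of the branch-hint idea (the
request type and helpers below are made up; only the unlikely()
annotation from <linux/compiler.h> is what the patch actually
relies on):

	#include <linux/compiler.h>
	#include <linux/errno.h>

	struct example_request;                     /* hypothetical */
	void do_submit(struct example_request *rq); /* hypothetical */

	/* Illustration only: annotating the rare path lets the
	 * compiler move it out of line, keeping the common
	 * submission path packed tightly in the instruction cache.
	 */
	static int submit_one(struct example_request *rq)
	{
		if (unlikely(!rq))	/* cold error path, out of line */
			return -EINVAL;

		do_submit(rq);		/* common path stays hot */
		return 0;
	}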

v2:

 * Avoid mmio reads when possible. (Chris Wilson)
 * Use natural integer size for csb indices.
 * Remove useless return value from execlists_update_context.
 * Extract the 32-bit ppgtt PDP update so it is out of line
   and shared by its two callers.
 * Grab forcewake across all mmio operations to ease the
   load on the uncore lock and use cheaper mmio ops (see the
   sketch below).
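
A rough sketch of the resulting MMIO pattern, condensed from the
execlists_elsp_write() and intel_lrc_irq_handler() hunks further
down (in the patch the lock and forcewake are actually taken in
the caller, so this is an illustration rather than a drop-in
function body):

	/* Take the uncore lock and forcewake once, then use the
	 * raw _FW accessors which skip the per-access forcewake
	 * and locking dance.
	 */
	spin_lock(&dev_priv->uncore.lock);
	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);

	I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
	I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));

	/* ELSP is a write-only register, post via a nearby one. */
	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));

	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
	spin_unlock(&dev_priv->uncore.lock);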

v3:

 * Removed some more pointless u8 data types (see the index
   sketch below).
 * Removed unused return from execlists_context_queue.
 * Commit message updates.
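
For reference, the CSB pointer handling that motivates the
natural-size indices boils down to the following (GEN8_CSB_ENTRIES
is 6 on these parts; process_csb_entry() is a made-up stand-in for
the loop body of intel_lrc_irq_handler() shown below):

	/* Let the write pointer run ahead across the wrap and
	 * reduce each entry index modulo the buffer size.
	 *
	 * Example: read_pointer = 4, write_pointer = 1
	 *   -> write_pointer += 6, giving 7
	 *   -> entries processed: 5, 0, 1
	 *   -> next_context_status_buffer = 7 % 6 = 1
	 */
	if (read_pointer > write_pointer)
		write_pointer += GEN8_CSB_ENTRIES;

	while (read_pointer < write_pointer)
		process_csb_entry(ring, ++read_pointer % GEN8_CSB_ENTRIES);

	ring->next_context_status_buffer =
		write_pointer % GEN8_CSB_ENTRIES;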

v4:
 * Unclumsify the unqueue if statement. (Chris Wilson)
 * Hide forcewake from the queuing function. (Chris Wilson)

Version 3 now makes the irq handling code path ~20% smaller on
48-bit PPGTT hardware, and a little bit smaller elsewhere. Hot
paths are mostly in-line now, hammering on the uncore spinlock
is greatly reduced, and MMIO traffic is trimmed to an extent.

Benchmarking with "gem_latency -n 100" (keeps submitting
batches of 100 nop instructions) shows approximately 4% higher
throughput, 2% less CPU time and 22% smaller latencies. This was
on a big core; small cores could benefit even more.

The most likely reasons for the improvements are the MMIO
optimisation and the reduced traffic on the uncore lock.

One odd result is with "gem_latency -n 0" (dispatching empty
batches), which shows 5% more throughput, 8% less CPU time,
25% better producer and consumer latencies, but a 15% higher
dispatch latency which is as yet unexplained.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1456505912-22286-1-git-send-email-tvrtko.ursulin@linux.intel.com
Tvrtko Ursulin committed Mar 1, 2016
1 parent 3ba8607 commit c6a2ac7
Showing 2 changed files with 114 additions and 103 deletions.
214 changes: 112 additions & 102 deletions drivers/gpu/drm/i915/intel_lrc.c
@@ -270,6 +270,9 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;

if (IS_GEN8(dev) || IS_GEN9(dev))
ring->idle_lite_restore_wa = ~0;

ring->disable_lite_restore_wa = (IS_SKL_REVID(dev, 0, SKL_REVID_B0) ||
IS_BXT_REVID(dev, 0, BXT_REVID_A1)) &&
(ring->id == VCS || ring->id == VCS2);
@@ -373,8 +376,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
rq0->elsp_submitted++;

/* You must always write both descriptors in the order below. */
spin_lock(&dev_priv->uncore.lock);
intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));

@@ -384,31 +385,32 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,

/* ELSP is a wo register, use another nearby reg for posting */
POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));
intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
spin_unlock(&dev_priv->uncore.lock);
}

static int execlists_update_context(struct drm_i915_gem_request *rq)
static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

static void execlists_update_context(struct drm_i915_gem_request *rq)
{
struct intel_engine_cs *ring = rq->ring;
struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
uint32_t *reg_state = rq->ctx->engine[ring->id].lrc_reg_state;

reg_state[CTX_RING_TAIL+1] = rq->tail;

if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
/* True 32b PPGTT with dynamic page allocation: update PDP
* registers and point the unallocated PDPs to scratch page.
* PML4 is allocated during ppgtt init, so this is not needed
* in 48-bit mode.
*/
ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

return 0;
/* True 32b PPGTT with dynamic page allocation: update PDP
* registers and point the unallocated PDPs to scratch page.
* PML4 is allocated during ppgtt init, so this is not needed
* in 48-bit mode.
*/
if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
execlists_update_context_pdps(ppgtt, reg_state);
}

static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
@@ -422,10 +424,10 @@ static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
execlists_elsp_write(rq0, rq1);
}

static void execlists_context_unqueue(struct intel_engine_cs *ring)
static void execlists_context_unqueue__locked(struct intel_engine_cs *ring)
{
struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;
struct drm_i915_gem_request *cursor, *tmp;

assert_spin_locked(&ring->execlist_lock);

@@ -435,9 +437,6 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
*/
WARN_ON(!intel_irqs_enabled(ring->dev->dev_private));

if (list_empty(&ring->execlist_queue))
return;

/* Try to read in pairs */
list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue,
execlist_link) {
@@ -452,37 +451,48 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
req0 = cursor;
} else {
req1 = cursor;
WARN_ON(req1->elsp_submitted);
break;
}
}

if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) {
if (unlikely(!req0))
return;

if (req0->elsp_submitted & ring->idle_lite_restore_wa) {
/*
* WaIdleLiteRestore: make sure we never cause a lite
* restore with HEAD==TAIL
* WaIdleLiteRestore: make sure we never cause a lite restore
* with HEAD==TAIL.
*
* Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
* resubmit the request. See gen8_emit_request() for where we
* prepare the padding after the end of the request.
*/
if (req0->elsp_submitted) {
/*
* Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
* as we resubmit the request. See gen8_emit_request()
* for where we prepare the padding after the end of the
* request.
*/
struct intel_ringbuffer *ringbuf;
struct intel_ringbuffer *ringbuf;

ringbuf = req0->ctx->engine[ring->id].ringbuf;
req0->tail += 8;
req0->tail &= ringbuf->size - 1;
}
ringbuf = req0->ctx->engine[ring->id].ringbuf;
req0->tail += 8;
req0->tail &= ringbuf->size - 1;
}

WARN_ON(req1 && req1->elsp_submitted);

execlists_submit_requests(req0, req1);
}

static bool execlists_check_remove_request(struct intel_engine_cs *ring,
u32 request_id)
static void execlists_context_unqueue(struct intel_engine_cs *ring)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;

spin_lock(&dev_priv->uncore.lock);
intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);

execlists_context_unqueue__locked(ring);

intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
spin_unlock(&dev_priv->uncore.lock);
}

static unsigned int
execlists_check_remove_request(struct intel_engine_cs *ring, u32 request_id)
{
struct drm_i915_gem_request *head_req;

@@ -492,33 +502,41 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
struct drm_i915_gem_request,
execlist_link);

if (head_req != NULL) {
if (intel_execlists_ctx_id(head_req->ctx, ring) == request_id) {
WARN(head_req->elsp_submitted == 0,
"Never submitted head request\n");
if (!head_req)
return 0;

if (--head_req->elsp_submitted <= 0) {
list_move_tail(&head_req->execlist_link,
&ring->execlist_retired_req_list);
return true;
}
}
}
if (unlikely(intel_execlists_ctx_id(head_req->ctx, ring) != request_id))
return 0;

WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");

if (--head_req->elsp_submitted > 0)
return 0;

list_move_tail(&head_req->execlist_link,
&ring->execlist_retired_req_list);

return false;
return 1;
}

static void get_context_status(struct intel_engine_cs *ring,
u8 read_pointer,
u32 *status, u32 *context_id)
static u32
get_context_status(struct intel_engine_cs *ring, unsigned int read_pointer,
u32 *context_id)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
u32 status;

if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
return;
read_pointer %= GEN8_CSB_ENTRIES;

status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));

if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
return 0;

*status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
*context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
*context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring,
read_pointer));

return status;
}

/**
Expand All @@ -532,68 +550,64 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
u32 status_pointer;
u8 read_pointer;
u8 write_pointer;
unsigned int read_pointer, write_pointer;
u32 status = 0;
u32 status_id;
u32 submit_contexts = 0;
unsigned int submit_contexts = 0;

spin_lock(&ring->execlist_lock);

status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
spin_lock(&dev_priv->uncore.lock);
intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);

status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring));

read_pointer = ring->next_context_status_buffer;
write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
if (read_pointer > write_pointer)
write_pointer += GEN8_CSB_ENTRIES;

spin_lock(&ring->execlist_lock);

while (read_pointer < write_pointer) {
status = get_context_status(ring, ++read_pointer, &status_id);

get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES,
&status, &status_id);

if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
continue;

if (status & GEN8_CTX_STATUS_PREEMPTED) {
if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED)) {
if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
if (execlists_check_remove_request(ring, status_id))
WARN(1, "Lite Restored request removed from queue\n");
} else
WARN(1, "Preemption without Lite Restore\n");
}

if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
(status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
if (execlists_check_remove_request(ring, status_id))
submit_contexts++;
}
if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
GEN8_CTX_STATUS_ELEMENT_SWITCH))
submit_contexts +=
execlists_check_remove_request(ring, status_id);
}

if (ring->disable_lite_restore_wa) {
/* Prevent a ctx to preempt itself */
if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
(submit_contexts != 0))
execlists_context_unqueue(ring);
} else if (submit_contexts != 0) {
execlists_context_unqueue(ring);
if (submit_contexts) {
if (!ring->disable_lite_restore_wa ||
(status & GEN8_CTX_STATUS_ACTIVE_IDLE))
execlists_context_unqueue__locked(ring);
}

spin_unlock(&ring->execlist_lock);

if (unlikely(submit_contexts > 2))
DRM_ERROR("More than two context complete events?\n");

ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;

/* Update the read pointer to the old write pointer. Manual ringbuffer
* management ftw </sarcasm> */
I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
ring->next_context_status_buffer << 8));
I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
ring->next_context_status_buffer << 8));

intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
spin_unlock(&dev_priv->uncore.lock);

spin_unlock(&ring->execlist_lock);

if (unlikely(submit_contexts > 2))
DRM_ERROR("More than two context complete events?\n");
}

static int execlists_context_queue(struct drm_i915_gem_request *request)
static void execlists_context_queue(struct drm_i915_gem_request *request)
{
struct intel_engine_cs *ring = request->ring;
struct drm_i915_gem_request *cursor;
Expand Down Expand Up @@ -630,8 +644,6 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
execlists_context_unqueue(ring);

spin_unlock_irq(&ring->execlist_lock);

return 0;
}

static int logical_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
@@ -1550,7 +1562,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;
struct drm_i915_private *dev_priv = dev->dev_private;
u8 next_context_status_buffer_hw;
unsigned int next_context_status_buffer_hw;

lrc_setup_hardware_status_page(ring,
dev_priv->kernel_context->engine[ring->id].state);
@@ -2013,6 +2025,7 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
ring->status_page.obj = NULL;
}

ring->idle_lite_restore_wa = 0;
ring->disable_lite_restore_wa = false;
ring->ctx_desc_template = 0;

@@ -2439,10 +2452,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
* With dynamic page allocation, PDPs may not be allocated at
* this point. Point the unallocated PDPs to the scratch page
*/
ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
execlists_update_context_pdps(ppgtt, reg_state);
}

if (ring->id == RCS) {
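
A note on the new idle_lite_restore_wa field declared in
intel_ringbuffer.h below: it is set to ~0 for Gen8/Gen9 at engine
init and left at 0 otherwise, which lets the unqueue hot path fold
the platform check and the resubmission check into a single test,
roughly:

	/* At init, evaluated once per engine (invariant): */
	if (IS_GEN8(dev) || IS_GEN9(dev))
		ring->idle_lite_restore_wa = ~0;

	/* In execlists_context_unqueue__locked(), instead of
	 * re-testing the platform on every submission:
	 */
	if (req0->elsp_submitted & ring->idle_lite_restore_wa) {
		/* apply the WaIdleLiteRestore tail padding */
	}
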
3 changes: 2 additions & 1 deletion drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -271,7 +271,8 @@ struct intel_engine_cs {
spinlock_t execlist_lock;
struct list_head execlist_queue;
struct list_head execlist_retired_req_list;
u8 next_context_status_buffer;
unsigned int next_context_status_buffer;
unsigned int idle_lite_restore_wa;
bool disable_lite_restore_wa;
u32 ctx_desc_template;
u32 irq_keep_mask; /* bitmask for interrupts that should not be masked */
