drm/i915: Remove HW semaphores for gen7 inter-engine synchronisation
The writing is on the wall for the existence of a single execution queue
along each engine, and as a consequence we will not be able to track
dependencies along the HW queue itself, i.e. we will not be able to use
HW semaphores on gen7 as they use a global set of registers (and unlike
gen8+ we can not effectively target memory to keep per-context seqno and
dependencies).
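
For reference, the wait side of the removed implementation was an
MI_SEMAPHORE_MBOX comparison against one of the global RING_SYNC
registers. A rough sketch of the gen6_ring_sync_to() emission deleted
from intel_ringbuffer.c by this patch (simplified, for illustration):

static int
gen6_ring_sync_to(struct i915_request *rq, struct i915_request *signal)
{
        u32 dw1 = MI_SEMAPHORE_MBOX |
                  MI_SEMAPHORE_COMPARE |
                  MI_SEMAPHORE_REGISTER;
        u32 wait_mbox = signal->engine->semaphore.mbox.wait[rq->engine->hw_id];
        u32 *cs;

        WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        /* GEM treats "seqno passed" as >=, but the HW comparison is
         * strictly greater than, hence waiting for global_seqno - 1.
         */
        *cs++ = dw1 | wait_mbox;
        *cs++ = signal->global_seqno - 1;
        *cs++ = 0;
        *cs++ = MI_NOOP;
        intel_ring_advance(rq);

        return 0;
}

Since wait_mbox selects from a single global set of RING_SYNC registers
per engine pair, there is nowhere to keep per-context values once
requests may be reordered.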

On the positive side, when we implement request reordering for gen7 we
also can not presume a simple execution queue and would also require
removing the current semaphore generation code. So this brings us another
step closer to request reordering for ringbuffer submission!

The negative side is that using interrupts to drive inter-engine
synchronisation is much slower (4us -> 15us to do a nop on each of the 3
engines on ivb). This is much better than it was at the time of introducing
the HW semaphores, and, equally important, userspace has weaned itself off
intermixing dependent BLT/RENDER operations (the prime culprit was glyph
rendering in UXA). So while we regress the microbenchmarks, it should not
impact the user.

References: https://bugs.freedesktop.org/show_bug.cgi?id=108888
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181228140736.32606-2-chris@chris-wilson.co.uk
Chris Wilson committed Dec 28, 2018
1 parent 167bc75 commit 6faf591
Showing 11 changed files with 12 additions and 557 deletions.

drivers/gpu/drm/i915/i915_debugfs.c (1 addition & 18 deletions)

@@ -1041,21 +1041,7 @@ static const struct file_operations i915_error_state_fops = {
 static int
 i915_next_seqno_set(void *data, u64 val)
 {
-        struct drm_i915_private *dev_priv = data;
-        struct drm_device *dev = &dev_priv->drm;
-        int ret;
-
-        ret = mutex_lock_interruptible(&dev->struct_mutex);
-        if (ret)
-                return ret;
-
-        intel_runtime_pm_get(dev_priv);
-        ret = i915_gem_set_global_seqno(dev, val);
-        intel_runtime_pm_put(dev_priv);
-
-        mutex_unlock(&dev->struct_mutex);
-
-        return ret;
+        return val ? 0 : -EINVAL;
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(i915_next_seqno_fops,
@@ -4101,9 +4087,6 @@ i915_drop_caches_set(void *data, u64 val)
                                              I915_WAIT_LOCKED,
                                              MAX_SCHEDULE_TIMEOUT);
 
-        if (ret == 0 && val & DROP_RESET_SEQNO)
-                ret = i915_gem_set_global_seqno(&i915->drm, 1);
-
         if (val & DROP_RETIRE)
                 i915_retire_requests(i915);

drivers/gpu/drm/i915/i915_drv.c (1 addition & 1 deletion)

@@ -349,7 +349,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
                 value = min_t(int, INTEL_PPGTT(dev_priv), I915_GEM_PPGTT_FULL);
                 break;
         case I915_PARAM_HAS_SEMAPHORES:
-                value = HAS_LEGACY_SEMAPHORES(dev_priv);
+                value = 0;
                 break;
         case I915_PARAM_HAS_SECURE_BATCHES:
                 value = capable(CAP_SYS_ADMIN);
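
The visible uapi change is that I915_PARAM_HAS_SEMAPHORES now reports 0
on every platform. A minimal userspace probe, purely for illustration
(the render-node path is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

int main(void)
{
        int value = -1;
        struct drm_i915_getparam gp = {
                .param = I915_PARAM_HAS_SEMAPHORES,
                .value = &value,
        };
        int fd = open("/dev/dri/renderD128", O_RDWR); /* assumed node */

        if (fd < 0 || ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) < 0)
                return 1;

        printf("HW semaphores: %d\n", value); /* 0 after this change */
        return 0;
}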

drivers/gpu/drm/i915/i915_drv.h (0 additions & 3 deletions)

@@ -1948,7 +1948,6 @@ struct drm_i915_private {
                 struct list_head active_rings;
                 struct list_head closed_vma;
                 u32 active_requests;
-                u32 request_serial;
 
                 /**
                  * Is the GPU currently considered idle, or busy executing
@@ -2396,8 +2395,6 @@ intel_info(const struct drm_i915_private *dev_priv)
 #define HAS_BLT(dev_priv)        HAS_ENGINE(dev_priv, BCS)
 #define HAS_VEBOX(dev_priv)      HAS_ENGINE(dev_priv, VECS)
 
-#define HAS_LEGACY_SEMAPHORES(dev_priv) IS_GEN(dev_priv, 7)
-
 #define HAS_LLC(dev_priv)        ((dev_priv)->info.has_llc)
 #define HAS_SNOOP(dev_priv)      ((dev_priv)->info.has_snoop)
 #define HAS_EDRAM(dev_priv)      (!!((dev_priv)->edram_cap & EDRAM_ENABLED))

drivers/gpu/drm/i915/i915_gem.c (2 additions & 2 deletions)

@@ -3318,7 +3318,7 @@ static void nop_submit_request(struct i915_request *request)
 
         spin_lock_irqsave(&request->engine->timeline.lock, flags);
         __i915_request_submit(request);
-        intel_engine_init_global_seqno(request->engine, request->global_seqno);
+        intel_engine_write_global_seqno(request->engine, request->global_seqno);
         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
 }
 
@@ -3359,7 +3359,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 
         /*
          * Make sure no request can slip through without getting completed by
-         * either this call here to intel_engine_init_global_seqno, or the one
+         * either this call here to intel_engine_write_global_seqno, or the one
          * in nop_submit_request.
          */
         synchronize_rcu();

drivers/gpu/drm/i915/i915_request.c (6 additions & 120 deletions)

@@ -111,99 +111,10 @@ i915_request_remove_from_client(struct i915_request *request)
         spin_unlock(&file_priv->mm.lock);
 }
 
-static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
+static void reserve_gt(struct drm_i915_private *i915)
 {
-        struct intel_engine_cs *engine;
-        struct i915_timeline *timeline;
-        enum intel_engine_id id;
-        int ret;
-
-        /* Carefully retire all requests without writing to the rings */
-        ret = i915_gem_wait_for_idle(i915,
-                                     I915_WAIT_INTERRUPTIBLE |
-                                     I915_WAIT_LOCKED,
-                                     MAX_SCHEDULE_TIMEOUT);
-        if (ret)
-                return ret;
-
-        GEM_BUG_ON(i915->gt.active_requests);
-
-        /* If the seqno wraps around, we need to clear the breadcrumb rbtree */
-        for_each_engine(engine, i915, id) {
-                GEM_TRACE("%s seqno %d (current %d) -> %d\n",
-                          engine->name,
-                          engine->timeline.seqno,
-                          intel_engine_get_seqno(engine),
-                          seqno);
-
-                if (seqno == engine->timeline.seqno)
-                        continue;
-
-                kthread_park(engine->breadcrumbs.signaler);
-
-                if (!i915_seqno_passed(seqno, engine->timeline.seqno)) {
-                        /* Flush any waiters before we reuse the seqno */
-                        intel_engine_disarm_breadcrumbs(engine);
-                        intel_engine_init_hangcheck(engine);
-                        GEM_BUG_ON(!list_empty(&engine->breadcrumbs.signals));
-                }
-
-                /* Check we are idle before we fiddle with hw state! */
-                GEM_BUG_ON(!intel_engine_is_idle(engine));
-                GEM_BUG_ON(i915_gem_active_isset(&engine->timeline.last_request));
-
-                /* Finally reset hw state */
-                intel_engine_init_global_seqno(engine, seqno);
-                engine->timeline.seqno = seqno;
-
-                kthread_unpark(engine->breadcrumbs.signaler);
-        }
-
-        list_for_each_entry(timeline, &i915->gt.timelines, link)
-                memset(timeline->global_sync, 0, sizeof(timeline->global_sync));
-
-        i915->gt.request_serial = seqno;
-
-        return 0;
-}
-
-int i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno)
-{
-        struct drm_i915_private *i915 = to_i915(dev);
-
-        lockdep_assert_held(&i915->drm.struct_mutex);
-
-        if (seqno == 0)
-                return -EINVAL;
-
-        /* HWS page needs to be set less than what we will inject to ring */
-        return reset_all_global_seqno(i915, seqno - 1);
-}
-
-static int reserve_gt(struct drm_i915_private *i915)
-{
-        int ret;
-
-        /*
-         * Reservation is fine until we may need to wrap around
-         *
-         * By incrementing the serial for every request, we know that no
-         * individual engine may exceed that serial (as each is reset to 0
-         * on any wrap). This protects even the most pessimistic of migrations
-         * of every request from all engines onto just one.
-         */
-        while (unlikely(++i915->gt.request_serial == 0)) {
-                ret = reset_all_global_seqno(i915, 0);
-                if (ret) {
-                        i915->gt.request_serial--;
-                        return ret;
-                }
-        }
-
         if (!i915->gt.active_requests++)
                 i915_gem_unpark(i915);
-
-        return 0;
 }
 
 static void unreserve_gt(struct drm_i915_private *i915)
@@ -608,9 +519,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
         if (IS_ERR(ce))
                 return ERR_CAST(ce);
 
-        ret = reserve_gt(i915);
-        if (ret)
-                goto err_unpin;
+        reserve_gt(i915);
 
         ret = intel_ring_wait_for_space(ce->ring, MIN_SPACE_FOR_ADD_REQUEST);
         if (ret)
@@ -743,7 +652,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
         kmem_cache_free(i915->requests, rq);
 err_unreserve:
         unreserve_gt(i915);
-err_unpin:
         intel_context_unpin(ce);
         return ERR_PTR(ret);
 }
@@ -771,34 +679,12 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
                 ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
                                                        &from->submit,
                                                        I915_FENCE_GFP);
-                return ret < 0 ? ret : 0;
-        }
-
-        if (to->engine->semaphore.sync_to) {
-                u32 seqno;
-
-                GEM_BUG_ON(!from->engine->semaphore.signal);
-
-                seqno = i915_request_global_seqno(from);
-                if (!seqno)
-                        goto await_dma_fence;
-
-                if (seqno <= to->timeline->global_sync[from->engine->id])
-                        return 0;
-
-                trace_i915_gem_ring_sync_to(to, from);
-                ret = to->engine->semaphore.sync_to(to, from);
-                if (ret)
-                        return ret;
-
-                to->timeline->global_sync[from->engine->id] = seqno;
-                return 0;
+        } else {
+                ret = i915_sw_fence_await_dma_fence(&to->submit,
+                                                    &from->fence, 0,
+                                                    I915_FENCE_GFP);
         }
 
-await_dma_fence:
-        ret = i915_sw_fence_await_dma_fence(&to->submit,
-                                            &from->fence, 0,
-                                            I915_FENCE_GFP);
         return ret < 0 ? ret : 0;
 }
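
With the semaphore branch gone, every cross-engine dependency above now
funnels through the dma-fence path: the waiter's submit fence gains a
callback on the signaller's fence, so the waiter is only submitted to
its ring once the signaller's completion interrupt has fired.
Conceptually, and purely as an illustration (this helper is not part of
the patch):

static int await_cross_engine(struct i915_request *to,
                              struct i915_request *from)
{
        /*
         * Keep 'to' out of hardware until 'from' has signalled its
         * dma-fence from the completion interrupt; no GPU-side wait
         * is programmed at all.
         */
        return i915_sw_fence_await_dma_fence(&to->submit,
                                             &from->fence, 0,
                                             I915_FENCE_GFP);
}

This is the 4us -> 15us trade noted in the commit message: the wakeup
now requires an interrupt rather than a hardware poll in the ring.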

drivers/gpu/drm/i915/i915_timeline.h (0 additions & 8 deletions)

@@ -63,14 +63,6 @@ struct i915_timeline {
          * redundant and we can discard it without loss of generality.
          */
         struct i915_syncmap *sync;
-        /**
-         * Separately to the inter-context seqno map above, we track the last
-         * barrier (e.g. semaphore wait) to the global engine timelines. Note
-         * that this tracks global_seqno rather than the context.seqno, and
-         * so it is subject to the limitations of hw wraparound and that we
-         * may need to revoke global_seqno (on pre-emption).
-         */
-        u32 global_sync[I915_NUM_ENGINES];
 
         struct list_head link;
         const char *name;

drivers/gpu/drm/i915/i915_trace.h (0 additions & 29 deletions)

@@ -585,35 +585,6 @@ TRACE_EVENT(i915_gem_evict_vm,
             TP_printk("dev=%d, vm=%p", __entry->dev, __entry->vm)
 );
 
-TRACE_EVENT(i915_gem_ring_sync_to,
-            TP_PROTO(struct i915_request *to, struct i915_request *from),
-            TP_ARGS(to, from),
-
-            TP_STRUCT__entry(
-                             __field(u32, dev)
-                             __field(u32, from_class)
-                             __field(u32, from_instance)
-                             __field(u32, to_class)
-                             __field(u32, to_instance)
-                             __field(u32, seqno)
-                             ),
-
-            TP_fast_assign(
-                           __entry->dev = from->i915->drm.primary->index;
-                           __entry->from_class = from->engine->uabi_class;
-                           __entry->from_instance = from->engine->instance;
-                           __entry->to_class = to->engine->uabi_class;
-                           __entry->to_instance = to->engine->instance;
-                           __entry->seqno = from->global_seqno;
-                           ),
-
-            TP_printk("dev=%u, sync-from=%u:%u, sync-to=%u:%u, seqno=%u",
-                      __entry->dev,
-                      __entry->from_class, __entry->from_instance,
-                      __entry->to_class, __entry->to_instance,
-                      __entry->seqno)
-);
-
 TRACE_EVENT(i915_request_queue,
             TP_PROTO(struct i915_request *rq, u32 flags),
             TP_ARGS(rq, flags),

drivers/gpu/drm/i915/intel_engine_cs.c (1 addition & 28 deletions)

@@ -454,25 +454,8 @@ int intel_engines_init(struct drm_i915_private *dev_priv)
         return err;
 }
 
-void intel_engine_init_global_seqno(struct intel_engine_cs *engine, u32 seqno)
+void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno)
 {
-        struct drm_i915_private *dev_priv = engine->i915;
-
-        /* Our semaphore implementation is strictly monotonic (i.e. we proceed
-         * so long as the semaphore value in the register/page is greater
-         * than the sync value), so whenever we reset the seqno,
-         * so long as we reset the tracking semaphore value to 0, it will
-         * always be before the next request's seqno. If we don't reset
-         * the semaphore value, then when the seqno moves backwards all
-         * future waits will complete instantly (causing rendering corruption).
-         */
-        if (IS_GEN_RANGE(dev_priv, 6, 7)) {
-                I915_WRITE(RING_SYNC_0(engine->mmio_base), 0);
-                I915_WRITE(RING_SYNC_1(engine->mmio_base), 0);
-                if (HAS_VEBOX(dev_priv))
-                        I915_WRITE(RING_SYNC_2(engine->mmio_base), 0);
-        }
-
         intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
         clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
 
@@ -1300,16 +1283,6 @@ static void intel_engine_print_registers(const struct intel_engine_cs *engine,
                 drm_printf(m, "\tRING_IMR: %08x\n", I915_READ_IMR(engine));
         }
 
-        if (HAS_LEGACY_SEMAPHORES(dev_priv)) {
-                drm_printf(m, "\tSYNC_0: 0x%08x\n",
-                           I915_READ(RING_SYNC_0(engine->mmio_base)));
-                drm_printf(m, "\tSYNC_1: 0x%08x\n",
-                           I915_READ(RING_SYNC_1(engine->mmio_base)));
-                if (HAS_VEBOX(dev_priv))
-                        drm_printf(m, "\tSYNC_2: 0x%08x\n",
-                                   I915_READ(RING_SYNC_2(engine->mmio_base)));
-        }
-
         addr = intel_engine_get_active_head(engine);
         drm_printf(m, "\tACTHD:  0x%08x_%08x\n",
                    upper_32_bits(addr), lower_32_bits(addr));
(Diffs for the remaining three changed files were not loaded in this view.)
