aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> 2023-12-06 13:14:41 -0800
committerGravatar Rodrigo Vivi <rodrigo.vivi@intel.com> 2023-12-21 16:31:30 -0500
commit0eb16fd26795639d5420b58bb12d11c7705e6dfa (patch)
tree4bc28c38c0ae53473720f011e74c0d106500518d
parentdrm/xe/xelpg: Extend Wa_14019877138 for Graphics 12.70/71 (diff)
downloadlinux-0eb16fd26795639d5420b58bb12d11c7705e6dfa.tar.gz
linux-0eb16fd26795639d5420b58bb12d11c7705e6dfa.tar.bz2
linux-0eb16fd26795639d5420b58bb12d11c7705e6dfa.zip
drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages
We're currently sending non-blocking H2G messages using the EVENT type, which suppresses all CTB protocol replies from the GuC, including the failure cases. This might cause errors to slip through and manifest as unexpected behavior (e.g. a context state might not be what the driver thinks it is because the state change command was silently rejected by the GuC). To avoid this kind of problems, we can use the FAST_REQUEST type instead, which suppresses the reply only on success; this way we still get the advantage of not having to wait for an ack from the GuC (i.e. the H2G is still non-blocking) while still detecting errors. Since we can't escalate to the caller when a non-blocking message fails, we need to escalate to GT reset instead. Note that FAST_REQUEST failures are NOT expected and are usually a sign that the H2G was either malformed or requested an illegal operation. v2: assign fence values to FAST_REQUEST messages, fix abi doc, use xe_gt printers (Michal). v3: fix doc alignment, fix and improve prints (Michal) Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: John Harrison <John.C.Harrison@Intel.com> Cc: Matthew Brost <matthew.brost@intel.com> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> #v2 Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
-rw-r--r--drivers/gpu/drm/xe/abi/guc_messages_abi.h2
-rw-r--r--drivers/gpu/drm/xe/xe_guc_ct.c58
2 files changed, 54 insertions, 6 deletions
diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
index 3d199016cf88..ff888d16bd4f 100644
--- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
@@ -24,6 +24,7 @@
* | | 30:28 | **TYPE** - message type |
* | | | - _`GUC_HXG_TYPE_REQUEST` = 0 |
* | | | - _`GUC_HXG_TYPE_EVENT` = 1 |
+ * | | | - _`GUC_HXG_TYPE_FAST_REQUEST` = 2 |
* | | | - _`GUC_HXG_TYPE_NO_RESPONSE_BUSY` = 3 |
* | | | - _`GUC_HXG_TYPE_NO_RESPONSE_RETRY` = 5 |
* | | | - _`GUC_HXG_TYPE_RESPONSE_FAILURE` = 6 |
@@ -46,6 +47,7 @@
#define GUC_HXG_MSG_0_TYPE (0x7 << 28)
#define GUC_HXG_TYPE_REQUEST 0u
#define GUC_HXG_TYPE_EVENT 1u
+#define GUC_HXG_TYPE_FAST_REQUEST 2u
#define GUC_HXG_TYPE_NO_RESPONSE_BUSY 3u
#define GUC_HXG_TYPE_NO_RESPONSE_RETRY 5u
#define GUC_HXG_TYPE_RESPONSE_FAILURE 6u
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 24a33fa36496..4cde93c18a2d 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -17,6 +17,7 @@
#include "xe_device.h"
#include "xe_gt.h"
#include "xe_gt_pagefault.h"
+#include "xe_gt_printk.h"
#include "xe_gt_tlb_invalidation.h"
#include "xe_guc.h"
#include "xe_guc_submit.h"
@@ -448,7 +449,7 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
} else {
cmd[1] =
- FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
+ FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
}
@@ -475,11 +476,31 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
return 0;
}
+/*
+ * The CT protocol accepts a 16 bits fence. This field is fully owned by the
+ * driver, the GuC will just copy it to the reply message. Since we need to
+ * be able to distinguish between replies to REQUEST and FAST_REQUEST messages,
+ * we use one bit of the seqno as an indicator for that and a rolling counter
+ * for the remaining 15 bits.
+ */
+#define CT_SEQNO_MASK GENMASK(14, 0)
+#define CT_SEQNO_UNTRACKED BIT(15)
+static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
+{
+ u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;
+
+ if (!is_g2h_fence)
+ seqno |= CT_SEQNO_UNTRACKED;
+
+ return seqno;
+}
+
static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
u32 len, u32 g2h_len, u32 num_g2h,
struct g2h_fence *g2h_fence)
{
struct xe_device *xe = ct_to_xe(ct);
+ u16 seqno;
int ret;
xe_assert(xe, !g2h_len || !g2h_fence);
@@ -505,7 +526,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
if (g2h_fence_needs_alloc(g2h_fence)) {
void *ptr;
- g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
+ g2h_fence->seqno = next_ct_seqno(ct, true);
ptr = xa_store(&ct->fence_lookup,
g2h_fence->seqno,
g2h_fence, GFP_ATOMIC);
@@ -514,6 +535,10 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
goto out;
}
}
+
+ seqno = g2h_fence->seqno;
+ } else {
+ seqno = next_ct_seqno(ct, false);
}
if (g2h_len)
@@ -523,8 +548,7 @@ retry:
if (unlikely(ret))
goto out_unlock;
- ret = h2g_write(ct, action, len, g2h_fence ? g2h_fence->seqno : 0,
- !!g2h_fence);
+ ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
if (unlikely(ret)) {
if (ret == -EAGAIN)
goto retry;
@@ -786,7 +810,8 @@ static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len)
static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
{
- struct xe_device *xe = ct_to_xe(ct);
+ struct xe_gt *gt = ct_to_gt(ct);
+ struct xe_device *xe = gt_to_xe(gt);
u32 response_len = len - GUC_CTB_MSG_MIN_LEN;
u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
@@ -794,10 +819,31 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
lockdep_assert_held(&ct->lock);
+ /*
+ * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup.
+ * Those messages should never fail, so if we do get an error back it
+ * means we're likely doing an illegal operation and the GuC is
+ * rejecting it. We have no way to inform the code that submitted the
+ * H2G that the message was rejected, so we need to escalate the
+ * failure to trigger a reset.
+ */
+ if (fence & CT_SEQNO_UNTRACKED) {
+ if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
+ xe_gt_err(gt, "FAST_REQ H2G fence 0x%x failed! e=0x%x, h=%u\n",
+ fence,
+ FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
+ FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
+ else
+ xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n",
+ type, fence);
+
+ return -EPROTO;
+ }
+
g2h_fence = xa_erase(&ct->fence_lookup, fence);
if (unlikely(!g2h_fence)) {
/* Don't tear down channel, as send could've timed out */
- drm_warn(&xe->drm, "G2H fence (%u) not found!\n", fence);
+ xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
return 0;
}