summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHarshdeep Dhatt <quic_hdhatt@quicinc.com>2023-01-11 16:42:37 -0700
committerGerrit - the friendly Code Review server <code-review@localhost>2023-02-25 09:34:42 -0800
commit03f9b6046176cb890e352b9d409dd8ac1d2f769a (patch)
tree436979e6d086b9ea9904acb1fc30781d3be1a7e3
parentee96ae68df4decbc26f20369ca4897c30d76d9b6 (diff)
downloadgraphics-03f9b6046176cb890e352b9d409dd8ac1d2f769a.tar.gz
msm: kgsl: Add support to handle hardware fence timeout
If GMU detects that a certain external hardware fence (part of a sync object) is not signaled via the hardware path, GMU will query the software status of that fence. If kgsl indicates to GMU that the software status of the fence is signaled, GMU will wait for another epoch. GMU will then send a F2H_MSG_CONTEXT_BAD packet if it still detects that the hardware fence is not signaled via the hardware path. This packet contains the context id (that was waiting for this external hardware fence) and the sync object timestamp (to which the external hardware fence belongs). Kgsl will log all the fences (and some metadata) that are part of this sync object and trigger snapshot and recovery. Change-Id: If7ece0d4e45b3ff4e0af05f9233752b99b0bb123 Signed-off-by: Harshdeep Dhatt <quic_hdhatt@quicinc.com>
-rw-r--r--adreno_gen7_hwsched_hfi.c106
-rw-r--r--adreno_hfi.h22
-rw-r--r--adreno_hwsched.c69
-rw-r--r--adreno_trace.h3
4 files changed, 190 insertions, 10 deletions
diff --git a/adreno_gen7_hwsched_hfi.c b/adreno_gen7_hwsched_hfi.c
index 48ebfe0..d2e8e0e 100644
--- a/adreno_gen7_hwsched_hfi.c
+++ b/adreno_gen7_hwsched_hfi.c
@@ -254,6 +254,93 @@ static u32 get_payload_rb_key_legacy(struct adreno_device *adreno_dev,
return 0;
}
+/* Pairs a GMU_SYNCOBJ flag bit with its human-readable name */
+struct syncobj_flags {
+	unsigned long mask;
+	const char *name;
+};
+
+/*
+ * Format one syncobj entry (id, seqno and decoded flag names) into str.
+ * ctxt_id/seq_no come from dma_fence context/seqno, which are 64-bit, so
+ * cast to u64 and print with %llu to avoid a format-specifier mismatch.
+ */
+static void _get_syncobj_string(char *str, u32 max_size, struct hfi_syncobj *syncobj, u32 index)
+{
+	u32 count = scnprintf(str, max_size, "syncobj[%u] ctxt_id:%llu seqno:%llu flags:", index,
+		(u64)syncobj->ctxt_id, (u64)syncobj->seq_no);
+	u32 i;
+	bool first = true;
+	static const struct syncobj_flags _flags[] = {
+		GMU_SYNCOBJ_FLAGS, { -1, NULL }};
+
+	/* Append "|"-separated names for every flag set on this syncobj */
+	for (i = 0; _flags[i].name; i++) {
+		if (!(syncobj->flags & _flags[i].mask))
+			continue;
+
+		if (first) {
+			count += scnprintf(str + count, max_size - count, "%s", _flags[i].name);
+			first = false;
+		} else {
+			count += scnprintf(str + count, max_size - count, "|%s", _flags[i].name);
+		}
+	}
+}
+
+/* Dump every syncobj carried by this submit command to the GMU device log */
+static void log_syncobj(struct gen7_gmu_device *gmu, struct hfi_submit_syncobj *cmd)
+{
+	/* The syncobj descriptors immediately follow the command header */
+	struct hfi_syncobj *syncobj = (struct hfi_syncobj *)&cmd[1];
+	char str[128];
+	u32 i;
+
+	for (i = 0; i < cmd->num_syncobj; i++, syncobj++) {
+		_get_syncobj_string(str, sizeof(str), syncobj, i);
+		dev_err(&gmu->pdev->dev, "%s\n", str);
+	}
+}
+
+/*
+ * Walk the context's GMU queue looking for the H2F_MSG_ISSUE_SYNCOBJ
+ * submission whose timestamp matches the timed-out sync object, and log
+ * its fences. Logs an error if no matching entry is found.
+ */
+static void find_timeout_syncobj(struct adreno_device *adreno_dev, u32 ctxt_id, u32 ts)
+{
+	struct gen7_gmu_device *gmu = to_gen7_gmu(adreno_dev);
+	struct kgsl_context *context = NULL;
+	struct adreno_context *drawctxt;
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	struct gmu_context_queue_header *hdr;
+	struct hfi_submit_syncobj *cmd;
+	u32 *queue, i;
+	bool found = false;
+	int ret;
+
+	/* We want to get the context even if it is detached */
+	read_lock(&device->context_lock);
+	context = idr_find(&device->context_idr, ctxt_id);
+	ret = _kgsl_context_get(context);
+	read_unlock(&device->context_lock);
+
+	if (!ret)
+		return;
+
+	drawctxt = ADRENO_CONTEXT(context);
+
+	hdr = drawctxt->gmu_context_queue.hostptr;
+	queue = (u32 *)(drawctxt->gmu_context_queue.hostptr + sizeof(*hdr));
+
+	for (i = hdr->read_index; i != hdr->write_index;) {
+		u32 size = MSG_HDR_GET_SIZE(queue[i]);
+
+		/* A zero-sized entry means the queue is corrupted; bail out
+		 * instead of spinning forever on the same index.
+		 */
+		if (!size)
+			break;
+
+		if (MSG_HDR_GET_ID(queue[i]) != H2F_MSG_ISSUE_SYNCOBJ) {
+			i = (i + size) % hdr->queue_size;
+			continue;
+		}
+
+		cmd = (struct hfi_submit_syncobj *)&queue[i];
+
+		if (cmd->timestamp == ts) {
+			log_syncobj(gmu, cmd);
+			found = true;
+			break;
+		}
+		i = (i + size) % hdr->queue_size;
+	}
+
+	if (!found)
+		dev_err(&gmu->pdev->dev, "Couldn't find unsignaled syncobj ctx:%d ts:%d\n",
+			ctxt_id, ts);
+
+	kgsl_context_put(context);
+}
+
static void log_gpu_fault_legacy(struct adreno_device *adreno_dev)
{
struct gen7_gmu_device *gmu = to_gen7_gmu(adreno_dev);
@@ -382,6 +469,11 @@ static void log_gpu_fault_legacy(struct adreno_device *adreno_dev)
case GMU_GPU_AQE1_ILLEGAL_INST_ERROR:
dev_crit_ratelimited(dev, "AQE1 Illegal instruction error\n");
break;
+ case GMU_SYNCOBJ_TIMEOUT_ERROR:
+ dev_crit_ratelimited(dev, "syncobj timeout ctx %d ts %u\n",
+ cmd->ctxt_id, cmd->ts);
+ find_timeout_syncobj(adreno_dev, cmd->ctxt_id, cmd->ts);
+ break;
case GMU_CP_UNKNOWN_ERROR:
fallthrough;
default:
@@ -610,6 +702,11 @@ static void log_gpu_fault(struct adreno_device *adreno_dev)
case GMU_GPU_AQE1_ILLEGAL_INST_ERROR:
dev_crit_ratelimited(dev, "AQE1 Illegal instruction error\n");
break;
+ case GMU_SYNCOBJ_TIMEOUT_ERROR:
+ dev_crit_ratelimited(dev, "syncobj timeout ctx %d ts %u\n",
+ cmd->gc.ctxt_id, cmd->gc.ts);
+ find_timeout_syncobj(adreno_dev, cmd->gc.ctxt_id, cmd->gc.ts);
+ break;
case GMU_CP_UNKNOWN_ERROR:
fallthrough;
default:
@@ -2375,12 +2472,12 @@ static void populate_kgsl_fence(struct hfi_syncobj *obj,
struct kgsl_sync_timeline *ktimeline = kfence->parent;
unsigned long flags;
- obj->flags |= GMU_SYNCOBJ_KGSL_FENCE;
+ obj->flags |= BIT(GMU_SYNCOBJ_FLAG_KGSL_FENCE_BIT);
spin_lock_irqsave(&ktimeline->lock, flags);
/* This means that the context is going away. Mark the fence as triggered */
if (!ktimeline->context) {
- obj->flags |= GMU_SYNCOBJ_RETIRED;
+ obj->flags |= BIT(GMU_SYNCOBJ_FLAG_SIGNALED_BIT);
spin_unlock_irqrestore(&ktimeline->lock, flags);
return;
}
@@ -2445,8 +2542,9 @@ static int _submit_hw_fence(struct adreno_device *adreno_dev,
return ret;
}
- if (test_bit(MSM_HW_FENCE_FLAG_SIGNALED_BIT, &fences[j]->flags))
- obj->flags |= GMU_SYNCOBJ_RETIRED;
+ if (test_bit(MSM_HW_FENCE_FLAG_SIGNALED_BIT, &fences[j]->flags) ||
+ test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fences[j]->flags))
+ obj->flags |= BIT(GMU_SYNCOBJ_FLAG_SIGNALED_BIT);
obj->ctxt_id = fences[j]->context;
obj->seq_no = fences[j]->seqno;
diff --git a/adreno_hfi.h b/adreno_hfi.h
index a1bdc0b..46d7311 100644
--- a/adreno_hfi.h
+++ b/adreno_hfi.h
@@ -760,9 +760,22 @@ struct hfi_ts_notify_cmd {
/* This indicates that the SYNCOBJ is kgsl output fence */
-#define GMU_SYNCOBJ_KGSL_FENCE BIT(0)
-/* This indicates that the SYNCOBJ is already retired */
-#define GMU_SYNCOBJ_RETIRED BIT(1)
+#define GMU_SYNCOBJ_FLAG_KGSL_FENCE_BIT 0
+/* This indicates that the SYNCOBJ is signaled */
+#define GMU_SYNCOBJ_FLAG_SIGNALED_BIT 1
+/* This indicates that the SYNCOBJ's software status is queried */
+#define GMU_SYNCOBJ_FLAG_QUERY_SW_STATUS_BIT 2
+/* This indicates that the SYNCOBJ's software status is signaled */
+#define GMU_SYNCOBJ_FLAG_SW_STATUS_SIGNALED_BIT 3
+/* This indicates that the SYNCOBJ's software status is pending */
+#define GMU_SYNCOBJ_FLAG_SW_STATUS_PENDING_BIT 4
+
+#define GMU_SYNCOBJ_FLAGS \
+ { BIT(GMU_SYNCOBJ_FLAG_KGSL_FENCE_BIT), "KGSL"}, \
+ { BIT(GMU_SYNCOBJ_FLAG_SIGNALED_BIT), "SIGNALED"}, \
+ { BIT(GMU_SYNCOBJ_FLAG_QUERY_SW_STATUS_BIT), "QUERIED"}, \
+ { BIT(GMU_SYNCOBJ_FLAG_SW_STATUS_SIGNALED_BIT), "SW_SIGNALED"}, \
+ { BIT(GMU_SYNCOBJ_FLAG_SW_STATUS_PENDING_BIT), "SW_PENDING"}
/* F2H */
struct hfi_ts_retire_cmd {
@@ -1054,7 +1067,8 @@ struct payload_section {
#define GMU_GPU_AQE1_UCODE_ERROR 627
#define GMU_GPU_AQE1_HW_FAULT_ERROR 628
#define GMU_GPU_AQE1_ILLEGAL_INST_ERROR 629
-
+/* GMU encountered a sync object which is signaled via software but not via hardware */
+#define GMU_SYNCOBJ_TIMEOUT_ERROR 630
/* GPU encountered an unknown CP error */
#define GMU_CP_UNKNOWN_ERROR 700
diff --git a/adreno_hwsched.c b/adreno_hwsched.c
index 2e8d8c9..a03f2db 100644
--- a/adreno_hwsched.c
+++ b/adreno_hwsched.c
@@ -5,6 +5,7 @@
*/
#include <dt-bindings/soc/qcom,ipcc.h>
+#include <linux/dma-fence-array.h>
#include <linux/soc/qcom/msm_hw_fence.h>
#include <soc/qcom/msm_performance.h>
@@ -1699,6 +1700,62 @@ static bool context_is_throttled(struct kgsl_device *device,
return false;
}
+/*
+ * Log every dma fence contained in this sync object (expanding fence
+ * arrays), with its signaled state, origin and timeline value.
+ * dma_fence context and seqno are u64, so print them with %llu.
+ */
+static void _print_syncobj(struct adreno_device *adreno_dev, struct kgsl_drawobj *drawobj)
+{
+	u32 i, j, fence_index = 0;
+	struct kgsl_drawobj_sync *syncobj = SYNCOBJ(drawobj);
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+
+	for (i = 0; i < syncobj->numsyncs; i++) {
+		struct kgsl_drawobj_sync_event *event = &syncobj->synclist[i];
+		struct kgsl_sync_fence_cb *kcb = event->handle;
+		struct dma_fence **fences;
+		struct dma_fence_array *array;
+		u32 num_fences;
+
+		/* A fence array fans out into multiple underlying fences */
+		array = to_dma_fence_array(kcb->fence);
+		if (array != NULL) {
+			num_fences = array->num_fences;
+			fences = array->fences;
+		} else {
+			num_fences = 1;
+			fences = &kcb->fence;
+		}
+
+		for (j = 0; j < num_fences; j++, fence_index++) {
+			bool kgsl = is_kgsl_fence(fences[j]);
+			bool signaled = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fences[j]->flags);
+			char value[32] = "unknown";
+
+			if (fences[j]->ops->timeline_value_str)
+				fences[j]->ops->timeline_value_str(fences[j], value, sizeof(value));
+
+			dev_err(device->dev,
+				"dma fence[%u] signaled:%d kgsl:%d ctx:%llu seqno:%llu value:%s\n",
+				fence_index, signaled, kgsl, (u64)fences[j]->context,
+				(u64)fences[j]->seqno, value);
+		}
+	}
+}
+
+/*
+ * Find the pending sync object matching the faulting context id and
+ * timestamp, and print its fences.
+ */
+static void print_fault_syncobj(struct adreno_device *adreno_dev,
+	u32 ctxt_id, u32 ts)
+{
+	struct adreno_hwsched *hwsched = &adreno_dev->hwsched;
+	struct cmd_list_obj *obj;
+
+	list_for_each_entry(obj, &hwsched->cmd_list, node) {
+		struct kgsl_drawobj *drawobj = obj->drawobj;
+
+		if (drawobj->type != SYNCOBJ_TYPE)
+			continue;
+
+		if ((ctxt_id == drawobj->context->id) &&
+			(ts == drawobj->timestamp))
+			_print_syncobj(adreno_dev, drawobj);
+	}
+}
+
static void adreno_hwsched_reset_and_snapshot_legacy(struct adreno_device *adreno_dev, int fault)
{
struct kgsl_drawobj *drawobj = NULL;
@@ -1715,6 +1772,12 @@ static void adreno_hwsched_reset_and_snapshot_legacy(struct adreno_device *adren
if (hwsched->recurring_cmdobj)
srcu_notifier_call_chain(&device->nh, GPU_SSR_BEGIN, NULL);
+ if (cmd->error == GMU_SYNCOBJ_TIMEOUT_ERROR) {
+ print_fault_syncobj(adreno_dev, cmd->ctxt_id, cmd->ts);
+ kgsl_device_snapshot(device, NULL, NULL, true);
+ goto done;
+ }
+
/*
* First, try to see if the faulted command object is marked
* in case there was a context bad hfi. But, with stall-on-fault,
@@ -1787,6 +1850,12 @@ static void adreno_hwsched_reset_and_snapshot(struct adreno_device *adreno_dev,
if (hwsched->recurring_cmdobj)
srcu_notifier_call_chain(&device->nh, GPU_SSR_BEGIN, NULL);
+ if (cmd->error == GMU_SYNCOBJ_TIMEOUT_ERROR) {
+ print_fault_syncobj(adreno_dev, cmd->gc.ctxt_id, cmd->gc.ts);
+ kgsl_device_snapshot(device, NULL, NULL, true);
+ goto done;
+ }
+
/*
* First, try to see if the faulted command object is marked
* in case there was a context bad hfi. But, with stall-on-fault,
diff --git a/adreno_trace.h b/adreno_trace.h
index 251801c..8e7c355 100644
--- a/adreno_trace.h
+++ b/adreno_trace.h
@@ -104,8 +104,7 @@ TRACE_EVENT(adreno_input_hw_fence,
"ctx=%u id=%lld seqno=%lld flags=%s name=%s",
__entry->id, __entry->context, __entry->seqno,
__entry->flags ? __print_flags(__entry->flags, "|",
- { GMU_SYNCOBJ_KGSL_FENCE, "KGSL_FENCE" },
- { GMU_SYNCOBJ_RETIRED, "RETIRED" }) : "none",
+ GMU_SYNCOBJ_FLAGS) : "none",
__get_str(fence_name))
);