summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWhi copybara merger <whitechapel-automerger@google.com>2021-07-09 00:36:34 +0000
committerSharad Bagri <sharadbagri@google.com>2021-07-09 10:16:18 -0700
commitf20eac91b6e63ee33793ee4aae418cf9fc84b1cf (patch)
treed58afe1091bb328b143692eb9317fb613d44cd9c
parent1aecc588849fb53db0cbc0e8d405e5d720c62524 (diff)
downloadabrolhos-f20eac91b6e63ee33793ee4aae418cf9fc84b1cf.tar.gz
[Copybara Auto Merge] Merge branch 'whitechapel' into android-gs-pixel-5.10
edgetpu: add force_reset flag when restarting firmware When CPU reset is controlled externally (like mobile platforms relying on GSA), we want the critical wake-up path to be as short as possible. At the same time, when the device goes to a bad state we want to make sure a CPU reset is issued. Add a flag to edgetpu_firmware_restart_locked, propagate it to the restart handler in edgetpu_firmware_chip_data, and use it on Abrolhos to send a GSA_TPU_SHUTDOWN command in order to make sure the CPU is reset. Bug: 190871324 (cherry picked from commit 88bae702e99cd57b7962043ec76a97939765f6d7) Revert "edgetpu: abrolhos attempt FW restart on power down" This reverts commit d5487306b13e81d89435778dd487541ac341759b. (cherry picked from commit 963540540ce495cd1c0bd6123aea470f26dbbc75) edgetpu: abrolhos attempt FW restart on power down Bug: 190871324 edgetpu: KCI early leave if the device state is bad edgetpu: check device pointer in logging functions edgetpu: abrolhos: bcl: Renaming gs101_bcl to google_bcl Bug: 192683348 Bug: 193049438 edgetpu: abrolhos: return 0 when TZ mailbox is not acquired Bug: 192808637 edgetpu: log pids when acquire fails edgetpu: increase reverse KCI buffer size edgetpu: abrolhos fix thermal suspend condition edgetpu: move default firmware run and firmware_name param to common edgetpu: create edgetpu_firmware_chip_data edgetpu: move default firmware run and firmware_name param to common edgetpu: create edgetpu_firmware_chip_data edgetpu: watchdog bite when OPEN_DEVICE KCI timed out edgetpu: remove edgetpu_chip_firmware_run edgetpu: detach mailbox when activation failed edgetpu: reduce IOMMU fault reports severity edgetpu: allow buffer unmapping on errored groups GitOrigin-RevId: 0f4b817fa17e1df8270cb303eab0aceec4e8bf50 Change-Id: I1e7092b9f8f25ee72b559c6246d705bca24c1cdb
-rw-r--r--drivers/edgetpu/abrolhos-device.c4
-rw-r--r--drivers/edgetpu/abrolhos-firmware.c18
-rw-r--r--drivers/edgetpu/abrolhos-pm.c14
-rw-r--r--drivers/edgetpu/abrolhos-thermal.c6
-rw-r--r--drivers/edgetpu/edgetpu-device-group.c19
-rw-r--r--drivers/edgetpu/edgetpu-dmabuf.c5
-rw-r--r--drivers/edgetpu/edgetpu-firmware.c105
-rw-r--r--drivers/edgetpu/edgetpu-firmware.h46
-rw-r--r--drivers/edgetpu/edgetpu-fs.c8
-rw-r--r--drivers/edgetpu/edgetpu-google-iommu.c28
-rw-r--r--drivers/edgetpu/edgetpu-internal.h20
-rw-r--r--drivers/edgetpu/edgetpu-kci.c14
-rw-r--r--drivers/edgetpu/edgetpu-kci.h2
-rw-r--r--drivers/edgetpu/edgetpu-mailbox.c8
-rw-r--r--drivers/edgetpu/edgetpu-mmu.h5
-rw-r--r--drivers/edgetpu/mobile-firmware.h2
16 files changed, 183 insertions, 121 deletions
diff --git a/drivers/edgetpu/abrolhos-device.c b/drivers/edgetpu/abrolhos-device.c
index 1277199..e38c508 100644
--- a/drivers/edgetpu/abrolhos-device.c
+++ b/drivers/edgetpu/abrolhos-device.c
@@ -225,9 +225,9 @@ int edgetpu_chip_release_ext_mailbox(struct edgetpu_client *client,
mutex_lock(&apdev->tz_mailbox_lock);
if (!apdev->secure_client) {
- etdev_err(client->etdev, "TZ mailbox already released\n");
+ etdev_warn(client->etdev, "TZ mailbox already released\n");
mutex_unlock(&apdev->tz_mailbox_lock);
- return -ENODEV;
+ return 0;
}
if (apdev->secure_client != client) {
etdev_err(client->etdev,
diff --git a/drivers/edgetpu/abrolhos-firmware.c b/drivers/edgetpu/abrolhos-firmware.c
index 9acc0d2..5a8cd2e 100644
--- a/drivers/edgetpu/abrolhos-firmware.c
+++ b/drivers/edgetpu/abrolhos-firmware.c
@@ -64,12 +64,17 @@ static void abrolhos_firmware_teardown_buffer(
{
}
-static int abrolhos_firmware_restart(struct edgetpu_firmware *et_fw)
+static int abrolhos_firmware_restart(struct edgetpu_firmware *et_fw,
+ bool force_reset)
{
struct edgetpu_dev *etdev = et_fw->etdev;
struct abrolhos_platform_dev *edgetpu_pdev = to_abrolhos_dev(etdev);
int tpu_state;
+ /* We are in a bad state, send shutdown command and hope the device recovers */
+ if (force_reset)
+ gsa_send_tpu_cmd(edgetpu_pdev->gsa_dev, GSA_TPU_SHUTDOWN);
+
tpu_state = gsa_send_tpu_cmd(edgetpu_pdev->gsa_dev, GSA_TPU_START);
if (tpu_state < 0) {
@@ -199,7 +204,8 @@ out_unmap:
return ret;
}
-static const struct edgetpu_firmware_handlers abrolhos_firmware_handlers = {
+static const struct edgetpu_firmware_chip_data abrolhos_firmware_chip_data = {
+ .default_firmware_name = EDGETPU_DEFAULT_FIRMWARE_NAME,
.alloc_buffer = abrolhos_firmware_alloc_buffer,
.free_buffer = abrolhos_firmware_free_buffer,
.setup_buffer = abrolhos_firmware_setup_buffer,
@@ -210,7 +216,7 @@ static const struct edgetpu_firmware_handlers abrolhos_firmware_handlers = {
int mobile_edgetpu_firmware_create(struct edgetpu_dev *etdev)
{
- return edgetpu_firmware_create(etdev, &abrolhos_firmware_handlers);
+ return edgetpu_firmware_create(etdev, &abrolhos_firmware_chip_data);
}
void mobile_edgetpu_firmware_destroy(struct edgetpu_dev *etdev)
@@ -218,12 +224,6 @@ void mobile_edgetpu_firmware_destroy(struct edgetpu_dev *etdev)
edgetpu_firmware_destroy(etdev);
}
-int edgetpu_chip_firmware_run(struct edgetpu_dev *etdev, const char *name,
- enum edgetpu_firmware_flags flags)
-{
- return edgetpu_firmware_run(etdev, name, flags);
-}
-
unsigned long edgetpu_chip_firmware_iova(struct edgetpu_dev *etdev)
{
/*
diff --git a/drivers/edgetpu/abrolhos-pm.c b/drivers/edgetpu/abrolhos-pm.c
index 651a831..87a82d8 100644
--- a/drivers/edgetpu/abrolhos-pm.c
+++ b/drivers/edgetpu/abrolhos-pm.c
@@ -16,7 +16,6 @@
#include "abrolhos-platform.h"
#include "abrolhos-pm.h"
-#include "edgetpu-config.h"
#include "edgetpu-firmware.h"
#include "edgetpu-internal.h"
#include "edgetpu-kci.h"
@@ -435,7 +434,9 @@ static void abrolhos_power_down(struct edgetpu_pm *etpm);
static int abrolhos_power_up(struct edgetpu_pm *etpm)
{
struct edgetpu_dev *etdev = etpm->etdev;
+#if IS_ENABLED(CONFIG_GOOGLE_BCL)
struct abrolhos_platform_dev *abpdev = to_abrolhos_dev(etdev);
+#endif
int ret = abrolhos_pwr_state_set(
etpm->etdev, abrolhos_get_initial_pwr_state(etdev->dev));
@@ -479,12 +480,10 @@ static int abrolhos_power_up(struct edgetpu_pm *etpm)
/* attempt firmware run */
switch (edgetpu_firmware_status_locked(etdev)) {
case FW_VALID:
- ret = edgetpu_firmware_restart_locked(etdev);
+ ret = edgetpu_firmware_restart_locked(etdev, false);
break;
case FW_INVALID:
- ret = edgetpu_firmware_run_locked(etdev->firmware,
- EDGETPU_DEFAULT_FIRMWARE_NAME,
- FW_DEFAULT);
+ ret = edgetpu_firmware_run_default_locked(etdev);
break;
default:
break;
@@ -511,6 +510,11 @@ abrolhos_pm_shutdown_firmware(struct abrolhos_platform_dev *etpdev,
return;
etdev_warn(etdev, "Firmware shutdown request failed!\n");
+ etdev_warn(etdev, "Attempting firmware restart\n");
+ if (!edgetpu_firmware_restart_locked(etdev, true) &&
+ !edgetpu_pchannel_power_down(etdev, false))
+ return;
+
etdev_warn(etdev, "Requesting early GSA reset\n");
/*
diff --git a/drivers/edgetpu/abrolhos-thermal.c b/drivers/edgetpu/abrolhos-thermal.c
index d8123b6..67016c6 100644
--- a/drivers/edgetpu/abrolhos-thermal.c
+++ b/drivers/edgetpu/abrolhos-thermal.c
@@ -77,9 +77,9 @@ static int edgetpu_set_cur_state(struct thermal_cooling_device *cdev,
dev_err(dev, "error setting tpu policy: %d\n", ret);
goto out;
}
- if (state_original == 0)
+ if (pwr_state == TPU_OFF)
cooling->thermal_suspended = true;
- else if (cooling->cooling_state == 0)
+ else if (state_pwr_map[cooling->cooling_state].state == TPU_OFF)
cooling->thermal_suspended = false;
cooling->cooling_state = state_original;
ret = edgetpu_kci_notify_throttling(etdev, pwr_state);
@@ -258,7 +258,7 @@ static int tpu_thermal_parse_dvfs_table(struct edgetpu_thermal *thermal)
for (i = 0; i < row_size; ++i) {
int idx = col_size * i;
state_pwr_map[i].state = of_data_int_array[idx];
- state_pwr_map[i].power = of_data_int_array[idx+1];
+ state_pwr_map[i].power = of_data_int_array[idx + 1];
}
return 0;
diff --git a/drivers/edgetpu/edgetpu-device-group.c b/drivers/edgetpu/edgetpu-device-group.c
index 53c8ca2..6172b2c 100644
--- a/drivers/edgetpu/edgetpu-device-group.c
+++ b/drivers/edgetpu/edgetpu-device-group.c
@@ -1493,8 +1493,8 @@ int edgetpu_device_group_unmap(struct edgetpu_device_group *group,
int ret = 0;
mutex_lock(&group->lock);
- if (!edgetpu_device_group_is_finalized(group)) {
- ret = edgetpu_group_errno(group);
+ if (!is_finalized_or_errored(group)) {
+ ret = -EINVAL;
goto unlock_group;
}
@@ -1788,11 +1788,16 @@ int edgetpu_group_attach_and_open_mailbox(struct edgetpu_device_group *group)
* Only attaching mailbox for finalized groups.
* Don't attach mailbox for errored groups.
*/
- if (edgetpu_device_group_is_finalized(group)) {
- ret = edgetpu_group_attach_mailbox_locked(group);
- if (!ret)
- ret = edgetpu_group_activate(group);
- }
+ if (!edgetpu_device_group_is_finalized(group))
+ goto out_unlock;
+ ret = edgetpu_group_attach_mailbox_locked(group);
+ if (ret)
+ goto out_unlock;
+ ret = edgetpu_group_activate(group);
+ if (ret)
+ edgetpu_group_detach_mailbox_locked(group);
+
+out_unlock:
mutex_unlock(&group->lock);
return ret;
}
diff --git a/drivers/edgetpu/edgetpu-dmabuf.c b/drivers/edgetpu/edgetpu-dmabuf.c
index 5259650..1c89178 100644
--- a/drivers/edgetpu/edgetpu-dmabuf.c
+++ b/drivers/edgetpu/edgetpu-dmabuf.c
@@ -744,8 +744,9 @@ int edgetpu_unmap_dmabuf(struct edgetpu_device_group *group, u32 die_index,
int ret = -EINVAL;
mutex_lock(&group->lock);
- if (!edgetpu_device_group_is_finalized(group)) {
- ret = edgetpu_group_errno(group);
+ /* allows unmapping on errored groups */
+ if (!edgetpu_device_group_is_finalized(group) && !edgetpu_device_group_is_errored(group)) {
+ ret = -EINVAL;
goto out_unlock;
}
edgetpu_mapping_lock(mappings);
diff --git a/drivers/edgetpu/edgetpu-firmware.c b/drivers/edgetpu/edgetpu-firmware.c
index 2a1e577..d23d00c 100644
--- a/drivers/edgetpu/edgetpu-firmware.c
+++ b/drivers/edgetpu/edgetpu-firmware.c
@@ -8,6 +8,7 @@
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/firmware.h>
+#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -25,6 +26,9 @@
#include "edgetpu-sw-watchdog.h"
#include "edgetpu-telemetry.h"
+static char *firmware_name;
+module_param(firmware_name, charp, 0660);
+
/*
* Descriptor for loaded firmware, either in shared buffer mode or legacy mode
* (non-shared, custom allocated memory).
@@ -46,7 +50,7 @@ struct edgetpu_firmware_desc {
};
struct edgetpu_firmware_private {
- const struct edgetpu_firmware_handlers *handlers;
+ const struct edgetpu_firmware_chip_data *chip_fw;
void *data; /* for edgetpu_firmware_(set/get)_data */
struct mutex fw_desc_lock;
@@ -172,14 +176,14 @@ static int edgetpu_firmware_load_locked(
struct edgetpu_firmware_desc *fw_desc, const char *name,
enum edgetpu_firmware_flags flags)
{
- const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers;
+ const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw;
struct edgetpu_dev *etdev = et_fw->etdev;
int ret;
fw_desc->buf.flags = flags;
- if (handlers && handlers->alloc_buffer) {
- ret = handlers->alloc_buffer(et_fw, &fw_desc->buf);
+ if (chip_fw->alloc_buffer) {
+ ret = chip_fw->alloc_buffer(et_fw, &fw_desc->buf);
if (ret) {
etdev_err(etdev, "handler alloc_buffer failed: %d\n",
ret);
@@ -193,8 +197,8 @@ static int edgetpu_firmware_load_locked(
goto out_free_buffer;
}
- if (handlers && handlers->setup_buffer) {
- ret = handlers->setup_buffer(et_fw, &fw_desc->buf);
+ if (chip_fw->setup_buffer) {
+ ret = chip_fw->setup_buffer(et_fw, &fw_desc->buf);
if (ret) {
etdev_err(etdev, "handler setup_buffer failed: %d\n",
ret);
@@ -207,8 +211,8 @@ static int edgetpu_firmware_load_locked(
out_do_unload_locked:
edgetpu_firmware_do_unload_locked(et_fw, fw_desc);
out_free_buffer:
- if (handlers && handlers->free_buffer)
- handlers->free_buffer(et_fw, &fw_desc->buf);
+ if (chip_fw->free_buffer)
+ chip_fw->free_buffer(et_fw, &fw_desc->buf);
return ret;
}
@@ -216,19 +220,19 @@ static void edgetpu_firmware_unload_locked(
struct edgetpu_firmware *et_fw,
struct edgetpu_firmware_desc *fw_desc)
{
- const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers;
+ const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw;
/*
* Platform specific implementation for cleaning up allocated buffer.
*/
- if (handlers && handlers->teardown_buffer)
- handlers->teardown_buffer(et_fw, &fw_desc->buf);
+ if (chip_fw->teardown_buffer)
+ chip_fw->teardown_buffer(et_fw, &fw_desc->buf);
edgetpu_firmware_do_unload_locked(et_fw, fw_desc);
/*
* Platform specific implementation for freeing allocated buffer.
*/
- if (handlers && handlers->free_buffer)
- handlers->free_buffer(et_fw, &fw_desc->buf);
+ if (chip_fw->free_buffer)
+ chip_fw->free_buffer(et_fw, &fw_desc->buf);
}
static char *fw_flavor_str(enum edgetpu_fw_flavor fw_flavor)
@@ -445,7 +449,7 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
const char *name,
enum edgetpu_firmware_flags flags)
{
- const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers;
+ const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw;
struct edgetpu_firmware_desc new_fw_desc;
int ret;
bool is_bl1_run = (flags & FW_BL1);
@@ -460,9 +464,9 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
goto out_failed;
etdev_dbg(et_fw->etdev, "run fw %s flags=0x%x", name, flags);
- if (handlers && handlers->prepare_run) {
+ if (chip_fw->prepare_run) {
/* Note this may recursively call us to run BL1 */
- ret = handlers->prepare_run(et_fw, &new_fw_desc.buf);
+ ret = chip_fw->prepare_run(et_fw, &new_fw_desc.buf);
if (ret)
goto out_unload_new_fw;
}
@@ -486,18 +490,18 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
if (!ret && !is_bl1_run && et_fw->p->fw_info.fw_flavor != FW_FLAVOR_BL1)
edgetpu_sw_wdt_start(et_fw->etdev);
- if (!ret && !is_bl1_run && handlers && handlers->launch_complete)
- handlers->launch_complete(et_fw);
- else if (ret && handlers && handlers->launch_failed)
- handlers->launch_failed(et_fw, ret);
+ if (!ret && !is_bl1_run && chip_fw->launch_complete)
+ chip_fw->launch_complete(et_fw);
+ else if (ret && chip_fw->launch_failed)
+ chip_fw->launch_failed(et_fw, ret);
edgetpu_firmware_set_state(et_fw, ret);
return ret;
out_unload_new_fw:
edgetpu_firmware_unload_locked(et_fw, &new_fw_desc);
out_failed:
- if (handlers && handlers->launch_failed)
- handlers->launch_failed(et_fw, ret);
+ if (chip_fw->launch_failed)
+ chip_fw->launch_failed(et_fw, ret);
edgetpu_firmware_set_state(et_fw, ret);
return ret;
}
@@ -528,6 +532,31 @@ int edgetpu_firmware_run(struct edgetpu_dev *etdev, const char *name,
return ret;
}
+int edgetpu_firmware_run_default_locked(struct edgetpu_dev *etdev)
+{
+ struct edgetpu_firmware *et_fw = etdev->firmware;
+ const char *run_firmware_name =
+ et_fw->p->chip_fw->default_firmware_name;
+
+ if (firmware_name && *firmware_name)
+ run_firmware_name = firmware_name;
+
+ return edgetpu_firmware_run_locked(etdev->firmware, run_firmware_name,
+ FW_DEFAULT);
+}
+
+int edgetpu_firmware_run_default(struct edgetpu_dev *etdev)
+{
+ struct edgetpu_firmware *et_fw = etdev->firmware;
+ const char *run_firmware_name =
+ et_fw->p->chip_fw->default_firmware_name;
+
+ if (firmware_name && *firmware_name)
+ run_firmware_name = firmware_name;
+
+ return edgetpu_firmware_run(etdev, run_firmware_name, FW_DEFAULT);
+}
+
bool edgetpu_firmware_is_loading(struct edgetpu_dev *etdev)
{
struct edgetpu_firmware *et_fw = etdev->firmware;
@@ -558,10 +587,10 @@ edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev,
}
/* Caller must hold firmware lock for loading. */
-int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev)
+int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev, bool force_reset)
{
struct edgetpu_firmware *et_fw = etdev->firmware;
- const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers;
+ const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw;
int ret = -1;
edgetpu_firmware_set_loading(et_fw);
@@ -570,10 +599,10 @@ int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev)
* Try restarting the firmware first, fall back to normal firmware start
* if this fails.
*/
- if (handlers && handlers->restart)
- ret = handlers->restart(et_fw);
- if (ret && handlers && handlers->prepare_run) {
- ret = handlers->prepare_run(et_fw, &et_fw->p->fw_desc.buf);
+ if (chip_fw->restart)
+ ret = chip_fw->restart(et_fw, force_reset);
+ if (ret && chip_fw->prepare_run) {
+ ret = chip_fw->prepare_run(et_fw, &et_fw->p->fw_desc.buf);
if (ret)
goto out;
}
@@ -637,7 +666,7 @@ static ssize_t load_firmware_store(
return PTR_ERR(name);
etdev_info(etdev, "loading firmware %s\n", name);
- ret = edgetpu_chip_firmware_run(etdev, name, 0);
+ ret = edgetpu_firmware_run(etdev, name, 0);
kfree(name);
@@ -726,14 +755,14 @@ static void edgetpu_firmware_wdt_timeout_action(void *data)
ret = edgetpu_firmware_pm_get(et_fw);
if (!ret) {
- ret = edgetpu_firmware_restart_locked(etdev);
+ ret = edgetpu_firmware_restart_locked(etdev, true);
edgetpu_pm_put(etdev->pm);
}
edgetpu_firmware_unlock(etdev);
}
int edgetpu_firmware_create(struct edgetpu_dev *etdev,
- const struct edgetpu_firmware_handlers *handlers)
+ const struct edgetpu_firmware_chip_data *chip_fw)
{
struct edgetpu_firmware *et_fw;
int ret;
@@ -751,7 +780,7 @@ int edgetpu_firmware_create(struct edgetpu_dev *etdev,
ret = -ENOMEM;
goto out_kfree_et_fw;
}
- et_fw->p->handlers = handlers;
+ et_fw->p->chip_fw = chip_fw;
mutex_init(&et_fw->p->fw_desc_lock);
@@ -759,8 +788,8 @@ int edgetpu_firmware_create(struct edgetpu_dev *etdev,
if (ret)
goto out_kfree_et_fw_p;
- if (handlers && handlers->after_create) {
- ret = handlers->after_create(et_fw);
+ if (chip_fw->after_create) {
+ ret = chip_fw->after_create(et_fw);
if (ret) {
etdev_dbg(etdev,
"%s: after create handler failed: %d\n",
@@ -791,20 +820,20 @@ out_kfree_et_fw:
void edgetpu_firmware_destroy(struct edgetpu_dev *etdev)
{
struct edgetpu_firmware *et_fw = etdev->firmware;
- const struct edgetpu_firmware_handlers *handlers;
+ const struct edgetpu_firmware_chip_data *chip_fw;
if (!et_fw)
return;
edgetpu_sw_wdt_destroy(etdev);
if (et_fw->p) {
- handlers = et_fw->p->handlers;
+ chip_fw = et_fw->p->chip_fw;
/*
* Platform specific implementation, which includes stop
* running firmware.
*/
- if (handlers && handlers->before_destroy)
- handlers->before_destroy(et_fw);
+ if (chip_fw->before_destroy)
+ chip_fw->before_destroy(et_fw);
}
device_remove_group(etdev->dev, &edgetpu_firmware_attr_group);
diff --git a/drivers/edgetpu/edgetpu-firmware.h b/drivers/edgetpu/edgetpu-firmware.h
index 3b784c5..0d3e1d4 100644
--- a/drivers/edgetpu/edgetpu-firmware.h
+++ b/drivers/edgetpu/edgetpu-firmware.h
@@ -94,11 +94,15 @@ struct edgetpu_firmware_buffer {
const char *name; /* the name of this firmware */
};
-/*
- * Each handler returns 0 to indicate success, non-zero value to
- * indicate error.
- */
-struct edgetpu_firmware_handlers {
+struct edgetpu_firmware_chip_data {
+ /* Name of default firmware image for this chip. */
+ const char *default_firmware_name;
+
+ /*
+ * Chip handlers called by common firmware processing.
+ * Each handler returns 0 to indicate success, non-zero value to
+ * indicate error.
+ */
int (*after_create)(struct edgetpu_firmware *et_fw);
/*
* Release resource used in platform specific implementation,
@@ -154,20 +158,10 @@ struct edgetpu_firmware_handlers {
* Optional platform-specific handler to restart an already loaded
* firmware.
*/
- int (*restart)(struct edgetpu_firmware *et_fw);
+ int (*restart)(struct edgetpu_firmware *et_fw, bool force_reset);
};
/*
- * Top-level chip-specific run firmware routine.
- * Calls edgetpu_firmware_run() one or more times as appropriate for chip-
- * specific one- or two-stage bootloader processing.
- *
- * @name: the name passed into underlying request_firmware API
- * @flags: edgetpu_firmware_flags for the image
- */
-int edgetpu_chip_firmware_run(struct edgetpu_dev *etdev, const char *name,
- enum edgetpu_firmware_flags flags);
-/*
* Returns the chip-specific IOVA where the firmware is mapped.
*
* Debug purpose only.
@@ -175,13 +169,20 @@ int edgetpu_chip_firmware_run(struct edgetpu_dev *etdev, const char *name,
unsigned long edgetpu_chip_firmware_iova(struct edgetpu_dev *etdev);
/*
- * Load and run firmware. Called by edgetpu_chip_firmware_run().
+ * Load and run firmware.
* @name: the name passed into underlying request_firmware API
* @flags: edgetpu_firmware_flags for the image
+ * Used internally by the sysfs load interface and by unit tests.
*/
int edgetpu_firmware_run(struct edgetpu_dev *etdev, const char *name,
enum edgetpu_firmware_flags flags);
+/* Load and run the default firmware name for the chip. */
+int edgetpu_firmware_run_default(struct edgetpu_dev *etdev);
+
+/* Runs default firmware for the chip, caller holds FW/PM locks */
+int edgetpu_firmware_run_default_locked(struct edgetpu_dev *etdev);
+
/*
* Private data set and used by handlers. It is expected to
* allocate and set the data on after_create() and release on
@@ -191,7 +192,7 @@ void edgetpu_firmware_set_data(struct edgetpu_firmware *et_fw, void *data);
void *edgetpu_firmware_get_data(struct edgetpu_firmware *et_fw);
int edgetpu_firmware_create(struct edgetpu_dev *etdev,
- const struct edgetpu_firmware_handlers *handlers);
+ const struct edgetpu_firmware_chip_data *chip_fw);
void edgetpu_firmware_destroy(struct edgetpu_dev *etdev);
void edgetpu_firmware_mappings_show(struct edgetpu_dev *etdev,
struct seq_file *s);
@@ -223,15 +224,16 @@ edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev,
/*
* Restarts the last firmware image loaded
* Intended for power managed devices to re-run the firmware without a full
- * reload from the file system
+ * reload from the file system.
+ * Optionally, force a CPU reset to recover from a bad firmware state.
*/
-int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev);
+int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev,
+ bool force_reset);
/*
* Loads and runs the specified firmware assuming the required locks have been
- * acquired
+ * acquired. Used to run second-stage bootloader.
*/
-
int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
const char *name,
enum edgetpu_firmware_flags flags);
diff --git a/drivers/edgetpu/edgetpu-fs.c b/drivers/edgetpu/edgetpu-fs.c
index 6fbd642..1b6e039 100644
--- a/drivers/edgetpu/edgetpu-fs.c
+++ b/drivers/edgetpu/edgetpu-fs.c
@@ -625,6 +625,7 @@ static int edgetpu_ioctl_acquire_wakelock(struct edgetpu_client *client)
error_release:
edgetpu_wakelock_release(client->wakelock);
edgetpu_wakelock_unlock(client->wakelock);
+ etdev_err(client->etdev, "PID: %d failed to acquire wakelock", client->pid);
return ret;
}
@@ -646,11 +647,15 @@ edgetpu_ioctl_acquire_ext_mailbox(struct edgetpu_client *client,
struct edgetpu_ext_mailbox_ioctl __user *argp)
{
struct edgetpu_ext_mailbox_ioctl ext_mailbox;
+ int ret;
if (copy_from_user(&ext_mailbox, argp, sizeof(ext_mailbox)))
return -EFAULT;
- return edgetpu_chip_acquire_ext_mailbox(client, &ext_mailbox);
+ ret = edgetpu_chip_acquire_ext_mailbox(client, &ext_mailbox);
+ if (ret)
+ etdev_err(client->etdev, "PID: %d failed to acquire ext mailbox", client->pid);
+ return ret;
}
static int
@@ -1055,6 +1060,7 @@ void edgetpu_fs_remove(struct edgetpu_dev *etdev)
{
device_remove_group(etdev->dev, &edgetpu_attr_group);
device_destroy(edgetpu_class, etdev->devno);
+ etdev->etcdev = NULL;
cdev_del(&etdev->cdev);
debugfs_remove_recursive(etdev->d_entry);
}
diff --git a/drivers/edgetpu/edgetpu-google-iommu.c b/drivers/edgetpu/edgetpu-google-iommu.c
index 9d28949..851a326 100644
--- a/drivers/edgetpu/edgetpu-google-iommu.c
+++ b/drivers/edgetpu/edgetpu-google-iommu.c
@@ -101,21 +101,21 @@ static int edgetpu_iommu_dev_fault_handler(struct iommu_fault *fault,
struct edgetpu_dev *etdev = (struct edgetpu_dev *)token;
if (fault->type == IOMMU_FAULT_DMA_UNRECOV) {
- etdev_err(etdev, "Unrecoverable IOMMU fault!\n");
- etdev_err(etdev, "Reason = %08X\n", fault->event.reason);
- etdev_err(etdev, "flags = %08X\n", fault->event.flags);
- etdev_err(etdev, "pasid = %08X\n", fault->event.pasid);
- etdev_err(etdev, "perms = %08X\n", fault->event.perm);
- etdev_err(etdev, "addr = %llX\n", fault->event.addr);
- etdev_err(etdev, "fetch_addr = %llX\n",
+ etdev_warn(etdev, "Unrecoverable IOMMU fault!\n");
+ etdev_warn(etdev, "Reason = %08X\n", fault->event.reason);
+ etdev_warn(etdev, "flags = %08X\n", fault->event.flags);
+ etdev_warn(etdev, "pasid = %08X\n", fault->event.pasid);
+ etdev_warn(etdev, "perms = %08X\n", fault->event.perm);
+ etdev_warn(etdev, "addr = %llX\n", fault->event.addr);
+ etdev_warn(etdev, "fetch_addr = %llX\n",
fault->event.fetch_addr);
} else if (fault->type == IOMMU_FAULT_PAGE_REQ) {
- etdev_err(etdev, "IOMMU page request fault!\n");
- etdev_err(etdev, "flags = %08X\n", fault->prm.flags);
- etdev_err(etdev, "pasid = %08X\n", fault->prm.pasid);
- etdev_err(etdev, "grpid = %08X\n", fault->prm.grpid);
- etdev_err(etdev, "perms = %08X\n", fault->prm.perm);
- etdev_err(etdev, "addr = %llX\n", fault->prm.addr);
+ etdev_dbg(etdev, "IOMMU page request fault!\n");
+ etdev_dbg(etdev, "flags = %08X\n", fault->prm.flags);
+ etdev_dbg(etdev, "pasid = %08X\n", fault->prm.pasid);
+ etdev_dbg(etdev, "grpid = %08X\n", fault->prm.grpid);
+ etdev_dbg(etdev, "perms = %08X\n", fault->prm.perm);
+ etdev_dbg(etdev, "addr = %llX\n", fault->prm.addr);
}
// Tell the IOMMU driver to carry on
return -EAGAIN;
@@ -168,7 +168,7 @@ static int edgetpu_iommu_fault_handler(struct iommu_domain *domain,
struct edgetpu_iommu_domain *etdomain =
(struct edgetpu_iommu_domain *)token;
- dev_err(dev, "IOMMU fault on address %08lX. PASID = %u flags = %08X",
+ dev_dbg(dev, "IOMMU fault on address %08lX. PASID = %u flags = %08X",
iova, etdomain->pasid, flags);
// Tell the IOMMU driver we are OK with this fault
return 0;
diff --git a/drivers/edgetpu/edgetpu-internal.h b/drivers/edgetpu/edgetpu-internal.h
index 7c4966e..23e0c12 100644
--- a/drivers/edgetpu/edgetpu-internal.h
+++ b/drivers/edgetpu/edgetpu-internal.h
@@ -35,22 +35,24 @@
#include "edgetpu-thermal.h"
#include "edgetpu-usage-stats.h"
-#define etdev_err(etdev, fmt, ...) dev_err((etdev)->etcdev, fmt, ##__VA_ARGS__)
+#define get_dev_for_logging(etdev) ((etdev)->etcdev ? (etdev)->etcdev : (etdev)->dev)
+
+#define etdev_err(etdev, fmt, ...) dev_err(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
#define etdev_warn(etdev, fmt, ...) \
- dev_warn((etdev)->etcdev, fmt, ##__VA_ARGS__)
+ dev_warn(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
#define etdev_info(etdev, fmt, ...) \
- dev_info((etdev)->etcdev, fmt, ##__VA_ARGS__)
-#define etdev_dbg(etdev, fmt, ...) dev_dbg((etdev)->etcdev, fmt, ##__VA_ARGS__)
+ dev_info(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
+#define etdev_dbg(etdev, fmt, ...) dev_dbg(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
#define etdev_err_ratelimited(etdev, fmt, ...) \
- dev_err_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__)
+ dev_err_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
#define etdev_warn_ratelimited(etdev, fmt, ...) \
- dev_warn_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__)
+ dev_warn_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
#define etdev_info_ratelimited(etdev, fmt, ...) \
- dev_info_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__)
+ dev_info_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
#define etdev_dbg_ratelimited(etdev, fmt, ...) \
- dev_dbg_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__)
+ dev_dbg_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
#define etdev_warn_once(etdev, fmt, ...) \
- dev_warn_once((etdev)->etcdev, fmt, ##__VA_ARGS__)
+ dev_warn_once(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__)
/* The number of TPU tiles in an edgetpu chip */
#ifdef CONFIG_EDGETPU_FPGA
diff --git a/drivers/edgetpu/edgetpu-kci.c b/drivers/edgetpu/edgetpu-kci.c
index 73a47cc..1b467e2 100644
--- a/drivers/edgetpu/edgetpu-kci.c
+++ b/drivers/edgetpu/edgetpu-kci.c
@@ -16,8 +16,8 @@
#include "edgetpu-firmware.h"
#include "edgetpu-internal.h"
-#include "edgetpu-kci.h"
#include "edgetpu-iremap-pool.h"
+#include "edgetpu-kci.h"
#include "edgetpu-mmu.h"
#include "edgetpu-telemetry.h"
#include "edgetpu-usage-stats.h"
@@ -40,6 +40,14 @@
#define KCI_TIMEOUT (5000)
#endif
+/* A macro for KCIs to leave early when the device state is known to be bad. */
+#define RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci) \
+ do { \
+ int ret = edgetpu_get_state_errno_locked(kci->mailbox->etdev); \
+ if (ret) \
+ return ret; \
+ } while (0)
+
static inline u32 edgetpu_kci_queue_element_size(enum mailbox_queue_type type)
{
if (type == MAILBOX_CMD_QUEUE)
@@ -781,6 +789,7 @@ int edgetpu_kci_join_group(struct edgetpu_kci *kci, u8 n_dies, u8 vid)
if (!kci)
return -ENODEV;
+ RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci);
return edgetpu_kci_send_cmd_with_data(kci, &cmd, &detail, sizeof(detail));
}
@@ -792,6 +801,7 @@ int edgetpu_kci_leave_group(struct edgetpu_kci *kci)
if (!kci)
return -ENODEV;
+ RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci);
return edgetpu_kci_send_cmd(kci, &cmd);
}
@@ -1001,6 +1011,7 @@ int edgetpu_kci_open_device(struct edgetpu_kci *kci, u32 mailbox_id, s16 vcid, b
if (!kci)
return -ENODEV;
+ RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci);
if (vcid < 0)
return edgetpu_kci_send_cmd(kci, &cmd);
return edgetpu_kci_send_cmd_with_data(kci, &cmd, &detail, sizeof(detail));
@@ -1017,6 +1028,7 @@ int edgetpu_kci_close_device(struct edgetpu_kci *kci, u32 mailbox_id)
if (!kci)
return -ENODEV;
+ RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci);
return edgetpu_kci_send_cmd(kci, &cmd);
}
diff --git a/drivers/edgetpu/edgetpu-kci.h b/drivers/edgetpu/edgetpu-kci.h
index deb258d..2893f20 100644
--- a/drivers/edgetpu/edgetpu-kci.h
+++ b/drivers/edgetpu/edgetpu-kci.h
@@ -24,7 +24,7 @@
* Maximum number of outstanding KCI requests from firmware
* This is used to size a circular buffer, so it must be a power of 2
*/
-#define REVERSE_KCI_BUFFER_SIZE (8)
+#define REVERSE_KCI_BUFFER_SIZE (32)
/*
* The status field in a firmware response is set to this by us when the
diff --git a/drivers/edgetpu/edgetpu-mailbox.c b/drivers/edgetpu/edgetpu-mailbox.c
index cf996f7..eedde54 100644
--- a/drivers/edgetpu/edgetpu-mailbox.c
+++ b/drivers/edgetpu/edgetpu-mailbox.c
@@ -18,6 +18,7 @@
#include "edgetpu-kci.h"
#include "edgetpu-mailbox.h"
#include "edgetpu-mmu.h"
+#include "edgetpu-sw-watchdog.h"
#include "edgetpu-wakelock.h"
#include "edgetpu.h"
@@ -1132,6 +1133,13 @@ int edgetpu_mailbox_activate(struct edgetpu_dev *etdev, u32 mailbox_id, s16 vcid
eh->fw_state |= bit;
}
mutex_unlock(&eh->lock);
+ /*
+ * We are observing OPEN_DEVICE KCI fails while other KCIs (usage update / shutdown) still
+ * succeed and no firmware crash is reported. Kick off the firmware restart when we are
+ * facing this and hope this can rescue the device from the bad state.
+ */
+ if (ret == -ETIMEDOUT)
+ edgetpu_watchdog_bite(etdev, false);
return ret;
}
diff --git a/drivers/edgetpu/edgetpu-mmu.h b/drivers/edgetpu/edgetpu-mmu.h
index 094f14d..7cc9ffa 100644
--- a/drivers/edgetpu/edgetpu-mmu.h
+++ b/drivers/edgetpu/edgetpu-mmu.h
@@ -16,11 +16,6 @@
#include "edgetpu-internal.h"
#include "edgetpu.h"
-/* TODO(b/153947157): remove this */
-#if IS_ENABLED(CONFIG_EDGETPU_TEST)
-#include <linux/iommu-ext.h>
-#endif
-
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0)
#ifndef IOMMU_PASID_INVALID
#define IOMMU_PASID_INVALID (-1U)
diff --git a/drivers/edgetpu/mobile-firmware.h b/drivers/edgetpu/mobile-firmware.h
index e0c8dd8..691eaf5 100644
--- a/drivers/edgetpu/mobile-firmware.h
+++ b/drivers/edgetpu/mobile-firmware.h
@@ -49,6 +49,4 @@ struct mobile_image_header {
int mobile_edgetpu_firmware_create(struct edgetpu_dev *etdev);
void mobile_edgetpu_firmware_destroy(struct edgetpu_dev *etdev);
-int mobile_edgetpu_firmware_run_default(struct edgetpu_dev *etdev);
-
#endif /* __MOBILE_FIRMWARE_H__ */