summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nrithya Kanakasabapathy <nrithya@google.com> 2021-05-12 05:10:54 +0000
committer Nrithya Kanakasabapathy <nrithya@google.com> 2021-05-12 05:10:54 +0000
commit 9e653d1d032ecba4f19e6e257e4e664ee8885d29 (patch)
tree a6f7d5b5f29bf8b35d867a44c8f2155b7ff3c570
parent c2a5f1fcc8911d94858bb48492a1b529f9a8e80e (diff)
download abrolhos-9e653d1d032ecba4f19e6e257e4e664ee8885d29.tar.gz
Merge branch 'darwinn-2.0' into android-gs-pixel-5.10
* whitechapel:
  edgetpu: using fw trylock in update usage KCI
  edgetpu: use pm_trylock for usage update KCI
  edgetpu: abrolhos: add kci to notify firmware about throttling.
  edgetpu: add firmware launch_failed callback
  edgetpu: add PM get_if_powered API
  edgetpu: hold group locks when restoring VII mboxes
  Revert "edgetpu: abrolhos temporarily disable pm test"
  edgetpu: add asynchronous usage update KCI
  edgetpu: reset iomux cfg on top hermosa reset
  edgetpu: abrolhos temporarily disable pm test
  edgetpu: modify SG length before sync
  edgetpu: abrolhos: use vmalloc for firmware image
  edgetpu: fix unit tests for usage stats to create valid firmware
  edgetpu: KCI update usage lockout f/w state changes, abort if f/w bad

Signed-off-by: Nrithya Kanakasabapathy <nrithya@google.com>
Change-Id: I902de9634e3ec5082c279d8bb7eebbc8af33060e
-rw-r--r-- drivers/edgetpu/abrolhos-firmware.c | 4
-rw-r--r-- drivers/edgetpu/abrolhos-platform.c | 3
-rw-r--r-- drivers/edgetpu/abrolhos-thermal.c | 17
-rw-r--r-- drivers/edgetpu/edgetpu-device-group.c | 105
-rw-r--r-- drivers/edgetpu/edgetpu-firmware.c | 32
-rw-r--r-- drivers/edgetpu/edgetpu-firmware.h | 13
-rw-r--r-- drivers/edgetpu/edgetpu-kci.c | 75
-rw-r--r-- drivers/edgetpu/edgetpu-kci.h | 20
-rw-r--r-- drivers/edgetpu/edgetpu-mailbox.c | 57
-rw-r--r-- drivers/edgetpu/edgetpu-pm.c | 58
-rw-r--r-- drivers/edgetpu/edgetpu-pm.h | 18
-rw-r--r-- drivers/edgetpu/edgetpu-thermal.h | 6
12 files changed, 350 insertions, 58 deletions
diff --git a/drivers/edgetpu/abrolhos-firmware.c b/drivers/edgetpu/abrolhos-firmware.c
index 5fbec26..9acc0d2 100644
--- a/drivers/edgetpu/abrolhos-firmware.c
+++ b/drivers/edgetpu/abrolhos-firmware.c
@@ -28,7 +28,7 @@ static int abrolhos_firmware_alloc_buffer(
size_t buffer_size =
abpdev->fw_region_size + MOBILE_FW_HEADER_SIZE;
- fw_buf->vaddr = kzalloc(buffer_size, GFP_KERNEL);
+ fw_buf->vaddr = vmalloc(buffer_size);
if (!fw_buf->vaddr) {
etdev_err(etdev, "%s: failed to allocate buffer (%zu bytes)\n",
__func__, buffer_size);
@@ -44,7 +44,7 @@ static void abrolhos_firmware_free_buffer(
struct edgetpu_firmware *et_fw,
struct edgetpu_firmware_buffer *fw_buf)
{
- kfree(fw_buf->vaddr);
+ vfree(fw_buf->vaddr);
fw_buf->vaddr = NULL;
fw_buf->dma_addr = 0;
fw_buf->alloc_size = 0;
diff --git a/drivers/edgetpu/abrolhos-platform.c b/drivers/edgetpu/abrolhos-platform.c
index a613b63..16e36dc 100644
--- a/drivers/edgetpu/abrolhos-platform.c
+++ b/drivers/edgetpu/abrolhos-platform.c
@@ -315,7 +315,8 @@ static int edgetpu_platform_probe(struct platform_device *pdev)
}
dev_dbg(dev, "Creating thermal device\n");
- abpdev->edgetpu_dev.thermal = devm_tpu_thermal_create(dev);
+ abpdev->edgetpu_dev.thermal =
+ devm_tpu_thermal_create(dev, &abpdev->edgetpu_dev);
dev_info(dev, "%s edgetpu initialized. Build: %s\n",
abpdev->edgetpu_dev.dev_name, GIT_REPO_TAG);
diff --git a/drivers/edgetpu/abrolhos-thermal.c b/drivers/edgetpu/abrolhos-thermal.c
index 27429a9..84be142 100644
--- a/drivers/edgetpu/abrolhos-thermal.c
+++ b/drivers/edgetpu/abrolhos-thermal.c
@@ -48,6 +48,7 @@ static int edgetpu_set_cur_state(struct thermal_cooling_device *cdev,
int ret;
struct edgetpu_thermal *cooling = cdev->devdata;
struct device *dev = cooling->dev;
+ struct edgetpu_dev *etdev = cooling->etdev;
unsigned long pwr_state;
if (state_original >= cooling->tpu_num_states) {
@@ -77,6 +78,18 @@ static int edgetpu_set_cur_state(struct thermal_cooling_device *cdev,
goto out;
}
cooling->cooling_state = state_original;
+ ret = edgetpu_kci_notify_throttling(etdev, pwr_state);
+ if (ret) {
+ /*
+ * TODO(b/185596886) : After FW adds a handler for this
+ * KCI, return the correct value of ret and change the
+ * debug message to an error message
+ */
+ etdev_dbg(
+ etdev, "Failed to notify FW about state %lu, error:%d",
+ pwr_state, ret);
+ ret = 0;
+ }
} else {
ret = -EALREADY;
}
@@ -307,7 +320,8 @@ static int tpu_thermal_init(struct edgetpu_thermal *thermal, struct device *dev)
return 0;
}
-struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev)
+struct edgetpu_thermal
+*devm_tpu_thermal_create(struct device *dev, struct edgetpu_dev *etdev)
{
struct edgetpu_thermal *thermal;
int err;
@@ -324,5 +338,6 @@ struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev)
}
devres_add(dev, thermal);
+ thermal->etdev = etdev;
return thermal;
}
diff --git a/drivers/edgetpu/edgetpu-device-group.c b/drivers/edgetpu/edgetpu-device-group.c
index 9c93c0b..9d00a0f 100644
--- a/drivers/edgetpu/edgetpu-device-group.c
+++ b/drivers/edgetpu/edgetpu-device-group.c
@@ -60,6 +60,24 @@ struct edgetpu_host_map {
struct sg_table *sg_tables;
};
+/*
+ * A helper structure for the return value of find_sg_to_sync().
+ */
+struct sglist_to_sync {
+ struct scatterlist *sg;
+ int nelems;
+ /*
+ * The SG that has its length modified by find_sg_to_sync().
+ * Can be NULL, which means no SG's length was modified.
+ */
+ struct scatterlist *last_sg;
+ /*
+ * find_sg_to_sync() will temporarily change the length of @last_sg.
+ * This is used to restore the length.
+ */
+ unsigned int orig_length;
+};
+
#ifdef EDGETPU_HAS_MCP
/* parameter to be used in async KCI jobs */
@@ -86,7 +104,7 @@ static int edgetpu_kci_leave_group_worker(struct kci_worker_param *param)
struct edgetpu_dev *etdev = edgetpu_device_group_nth_etdev(group, i);
etdev_dbg(etdev, "%s: leave group %u", __func__, group->workload_id);
- edgetpu_kci_update_usage(etdev);
+ edgetpu_kci_update_usage_async(etdev);
edgetpu_kci_leave_group(etdev->kci);
return 0;
}
@@ -147,7 +165,7 @@ static void edgetpu_group_kci_close_device(struct edgetpu_device_group *group)
static void edgetpu_device_group_kci_leave(struct edgetpu_device_group *group)
{
#ifdef EDGETPU_HAS_MULTI_GROUPS
- edgetpu_kci_update_usage(group->etdev);
+ edgetpu_kci_update_usage_async(group->etdev);
return edgetpu_group_kci_close_device(group);
#else /* !EDGETPU_HAS_MULTI_GROUPS */
struct kci_worker_param *params =
@@ -1247,32 +1265,60 @@ error:
}
/*
- * Find the scatterlist covering range [start, end).
+ * Finds the scatterlist covering range [start, end).
+ *
+ * The found SG and number of elements will be stored in @sglist.
*
- * Returns NULL if:
- * - @start is larger than the whole SG table
+ * To ensure the returned SG list strictly locates in range [start, end), the
+ * last SG's length is shrunk. Therefore caller must call
+ * restore_sg_after_sync(@sglist) after the DMA sync is performed.
+ *
+ * @sglist->nelems == 0 means the target range exceeds the whole SG table.
*/
-static struct scatterlist *find_sg_within(const struct sg_table *sgt, u64 start,
- u64 end, int *nelems)
+static void find_sg_to_sync(const struct sg_table *sgt, u64 start, u64 end,
+ struct sglist_to_sync *sglist)
{
- struct scatterlist *sg, *sg_to_sync = NULL;
+ struct scatterlist *sg;
size_t cur_offset = 0;
int i;
- *nelems = 0;
+ sglist->sg = NULL;
+ sglist->nelems = 0;
+ sglist->last_sg = NULL;
+ if (unlikely(end == 0))
+ return;
for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) {
- if (end <= cur_offset)
- break;
if (cur_offset <= start && start < cur_offset + sg->length)
- sg_to_sync = sg;
- if (sg_to_sync)
- (*nelems)++;
+ sglist->sg = sg;
+ if (sglist->sg)
+ ++sglist->nelems;
cur_offset += sg->length;
+ if (end <= cur_offset) {
+ sglist->last_sg = sg;
+ sglist->orig_length = sg->length;
+ /*
+ * To let the returned SG list have exact length as
+ * [start, end).
+ */
+ sg->length -= cur_offset - end;
+ break;
+ }
}
+}
- return sg_to_sync;
+static void restore_sg_after_sync(struct sglist_to_sync *sglist)
+{
+ if (!sglist->last_sg)
+ return;
+ sglist->last_sg->length = sglist->orig_length;
}
+/*
+ * Performs DMA sync of the mapping with region [offset, offset + size).
+ *
+ * Caller holds mapping's lock, to prevent @hmap being modified / removed by
+ * other processes.
+ */
static int group_sync_host_map(struct edgetpu_device_group *group,
struct edgetpu_host_map *hmap, u64 offset,
u64 size, enum dma_data_direction dir,
@@ -1283,29 +1329,32 @@ static int group_sync_host_map(struct edgetpu_device_group *group,
for_cpu ? dma_sync_sg_for_cpu : dma_sync_sg_for_device;
struct edgetpu_dev *etdev;
struct sg_table *sgt;
- struct scatterlist *sg;
+ struct sglist_to_sync sglist;
int i;
- int nelems;
sgt = &hmap->map.sgt;
- sg = find_sg_within(sgt, offset, end, &nelems);
- if (!sg)
+ find_sg_to_sync(sgt, offset, end, &sglist);
+ if (!sglist.nelems)
return -EINVAL;
+ if (IS_MIRRORED(hmap->map.flags))
+ etdev = group->etdev;
+ else
+ etdev = edgetpu_device_group_nth_etdev(group,
+ hmap->map.die_index);
+ sync(etdev->dev, sglist.sg, sglist.nelems, dir);
+ restore_sg_after_sync(&sglist);
+
if (IS_MIRRORED(hmap->map.flags)) {
- sync(group->etdev->dev, sg, nelems, dir);
for (i = 1; i < group->n_clients; i++) {
etdev = edgetpu_device_group_nth_etdev(group, i);
- sg = find_sg_within(&hmap->sg_tables[i], offset, end,
- &nelems);
- if (WARN_ON(!sg))
+ find_sg_to_sync(&hmap->sg_tables[i], offset, end,
+ &sglist);
+ if (WARN_ON(!sglist.sg))
return -EINVAL;
- sync(etdev->dev, sg, nelems, dir);
+ sync(etdev->dev, sglist.sg, sglist.nelems, dir);
+ restore_sg_after_sync(&sglist);
}
- } else {
- etdev = edgetpu_device_group_nth_etdev(group,
- hmap->map.die_index);
- sync(etdev->dev, sg, nelems, dir);
}
return 0;
diff --git a/drivers/edgetpu/edgetpu-firmware.c b/drivers/edgetpu/edgetpu-firmware.c
index 3b1c874..8ae808b 100644
--- a/drivers/edgetpu/edgetpu-firmware.c
+++ b/drivers/edgetpu/edgetpu-firmware.c
@@ -313,6 +313,20 @@ edgetpu_firmware_get_build_time(struct edgetpu_firmware *et_fw)
}
/*
+ * Try edgetpu_firmware_lock() if it's not locked yet.
+ *
+ * Returns 1 if the lock is acquired successfully, 0 otherwise.
+ */
+int edgetpu_firmware_trylock(struct edgetpu_dev *etdev)
+{
+ struct edgetpu_firmware *et_fw = etdev->firmware;
+
+ if (!et_fw)
+ return 1;
+ return mutex_trylock(&et_fw->p->fw_desc_lock);
+}
+
+/*
* Grab firmware lock to protect against firmware state changes.
* Locks out firmware loading / unloading while caller performs ops that are
* incompatible with a change in firmware status. Does not care whether or not
@@ -395,7 +409,7 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
memset(&new_fw_desc, 0, sizeof(new_fw_desc));
ret = edgetpu_firmware_load_locked(et_fw, &new_fw_desc, name, flags);
if (ret)
- return ret;
+ goto out_failed;
etdev_dbg(et_fw->etdev, "run fw %s flags=0x%x", name, flags);
if (handlers && handlers->prepare_run) {
@@ -426,10 +440,15 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
if (!ret && !is_bl1_run && handlers && handlers->launch_complete)
handlers->launch_complete(et_fw);
+ else if (ret && handlers && handlers->launch_failed)
+ handlers->launch_failed(et_fw, ret);
return ret;
out_unload_new_fw:
edgetpu_firmware_unload_locked(et_fw, &new_fw_desc);
+out_failed:
+ if (handlers && handlers->launch_failed)
+ handlers->launch_failed(et_fw, ret);
return ret;
}
@@ -500,6 +519,17 @@ edgetpu_firmware_status_locked(struct edgetpu_dev *etdev)
return et_fw->p->status;
}
+/* Caller must hold firmware lock. For unit tests. */
+void
+edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev,
+ enum edgetpu_firmware_status status)
+{
+ struct edgetpu_firmware *et_fw = etdev->firmware;
+
+ if (et_fw)
+ et_fw->p->status = status;
+}
+
/* Caller must hold firmware lock for loading. */
int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev)
{
diff --git a/drivers/edgetpu/edgetpu-firmware.h b/drivers/edgetpu/edgetpu-firmware.h
index d068d29..ad7c484 100644
--- a/drivers/edgetpu/edgetpu-firmware.h
+++ b/drivers/edgetpu/edgetpu-firmware.h
@@ -147,6 +147,8 @@ struct edgetpu_firmware_handlers {
struct edgetpu_firmware_buffer *fw_buf);
/* Firmware running, after successful handshake. */
void (*launch_complete)(struct edgetpu_firmware *et_fw);
+ /* Firmware load failed or unsuccessful handshake. */
+ void (*launch_failed)(struct edgetpu_firmware *et_fw, int ret);
/*
* Optional platform-specific handler to restart an already loaded
@@ -195,12 +197,12 @@ void edgetpu_firmware_mappings_show(struct edgetpu_dev *etdev,
struct seq_file *s);
/*
- * These two functions grab and release the internal firmware lock
- * and must be used before calling the helper functions suffixed with _locked
- * below
+ * These functions grab and release the internal firmware lock and must be used
+ * before calling the helper functions suffixed with _locked below.
*/
int edgetpu_firmware_lock(struct edgetpu_dev *etdev);
+int edgetpu_firmware_trylock(struct edgetpu_dev *etdev);
void edgetpu_firmware_unlock(struct edgetpu_dev *etdev);
@@ -211,6 +213,11 @@ void edgetpu_firmware_unlock(struct edgetpu_dev *etdev);
enum edgetpu_firmware_status
edgetpu_firmware_status_locked(struct edgetpu_dev *etdev);
+/* Caller must hold firmware lock. For unit tests. */
+void
+edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev,
+ enum edgetpu_firmware_status status);
+
/*
* Restarts the last firmware image loaded
* Intended for power managed devices to re-run the firmware without a full
diff --git a/drivers/edgetpu/edgetpu-kci.c b/drivers/edgetpu/edgetpu-kci.c
index f506a04..e467fac 100644
--- a/drivers/edgetpu/edgetpu-kci.c
+++ b/drivers/edgetpu/edgetpu-kci.c
@@ -432,6 +432,14 @@ static void edgetpu_kci_handle_irq(struct edgetpu_mailbox *mailbox)
schedule_work(&kci->work);
}
+static void edgetpu_kci_update_usage_work(struct work_struct *work)
+{
+ struct edgetpu_kci *kci =
+ container_of(work, struct edgetpu_kci, usage_work);
+
+ edgetpu_kci_update_usage(kci->mailbox->etdev);
+}
+
int edgetpu_kci_init(struct edgetpu_mailbox_manager *mgr,
struct edgetpu_kci *kci)
{
@@ -478,6 +486,7 @@ int edgetpu_kci_init(struct edgetpu_mailbox_manager *mgr,
init_waitqueue_head(&kci->wait_list_waitq);
INIT_WORK(&kci->work, edgetpu_kci_consume_responses_work);
edgetpu_reverse_kci_init(&kci->rkci);
+ INIT_WORK(&kci->usage_work, edgetpu_kci_update_usage_work);
EDGETPU_MAILBOX_CONTEXT_WRITE(mailbox, context_enable, 1);
return 0;
}
@@ -507,7 +516,9 @@ int edgetpu_kci_reinit(struct edgetpu_kci *kci)
void edgetpu_kci_cancel_work_queues(struct edgetpu_kci *kci)
{
- /* Cancel KCI and reverse KCI workers */
+ /* Cancel workers that may send KCIs. */
+ cancel_work_sync(&kci->usage_work);
+ /* Cancel KCI and reverse KCI workers. */
cancel_work_sync(&kci->work);
cancel_work_sync(&kci->rkci.work);
}
@@ -516,10 +527,6 @@ void edgetpu_kci_release(struct edgetpu_dev *etdev, struct edgetpu_kci *kci)
{
if (!kci)
return;
- /*
- * Command/Response queues are managed (dmam_alloc_coherent()), we don't
- * need to free them.
- */
edgetpu_kci_cancel_work_queues(kci);
@@ -853,19 +860,42 @@ enum edgetpu_fw_flavor edgetpu_kci_fw_info(struct edgetpu_kci *kci,
return flavor;
}
+void edgetpu_kci_update_usage_async(struct edgetpu_dev *etdev)
+{
+ schedule_work(&etdev->kci->usage_work);
+}
+
int edgetpu_kci_update_usage(struct edgetpu_dev *etdev)
{
- int ret;
+ int ret = -EAGAIN;
- /* Quick return if device already powered down, else get PM ref. */
+ /* Quick return if device is already powered down. */
if (!edgetpu_is_powered(etdev))
return -EAGAIN;
- ret = edgetpu_pm_get(etdev->pm);
- if (ret)
- return ret;
- ret = edgetpu_kci_update_usage_locked(etdev);
+ /*
+ * Lockout change in f/w load/unload status during usage update.
+ * Skip usage update if the firmware is being updated now or is not
+ * valid.
+ */
+ if (!edgetpu_firmware_trylock(etdev))
+ return -EAGAIN;
- edgetpu_pm_put(etdev->pm);
+ if (edgetpu_firmware_status_locked(etdev) != FW_VALID)
+ goto fw_unlock;
+ /*
+ * This function may run in a worker that is being canceled when the
+ * device is powering down, and the power down code holds the PM lock.
+ * Using trylock to prevent cancel_work_sync() waiting forever.
+ */
+ if (!edgetpu_pm_trylock(etdev->pm))
+ goto fw_unlock;
+
+ if (edgetpu_is_powered(etdev))
+ ret = edgetpu_kci_update_usage_locked(etdev);
+ edgetpu_pm_unlock(etdev->pm);
+
+fw_unlock:
+ edgetpu_firmware_unlock(etdev);
return ret;
}
@@ -986,3 +1016,24 @@ int edgetpu_kci_close_device(struct edgetpu_kci *kci, u32 mailbox_ids)
return -ENODEV;
return edgetpu_kci_send_cmd(kci, &cmd);
}
+
+int edgetpu_kci_notify_throttling(struct edgetpu_dev *etdev, u32 level)
+{
+ struct edgetpu_command_element cmd = {
+ .code = KCI_CODE_NOTIFY_THROTTLING,
+ .dma = {
+ .flags = level,
+ },
+ };
+ int ret;
+
+ if (!etdev->kci)
+ return -ENODEV;
+ if (!edgetpu_pm_get_if_powered(etdev->pm))
+ return -EAGAIN;
+
+ ret = edgetpu_kci_send_cmd(etdev->kci, &cmd);
+ edgetpu_pm_put(etdev->pm);
+ return ret;
+}
+
diff --git a/drivers/edgetpu/edgetpu-kci.h b/drivers/edgetpu/edgetpu-kci.h
index 443c690..4d9de5f 100644
--- a/drivers/edgetpu/edgetpu-kci.h
+++ b/drivers/edgetpu/edgetpu-kci.h
@@ -108,6 +108,7 @@ enum edgetpu_kci_code {
KCI_CODE_CLOSE_DEVICE = 10,
KCI_CODE_FIRMWARE_INFO = 11,
KCI_CODE_GET_USAGE = 12,
+ KCI_CODE_NOTIFY_THROTTLING = 13,
};
/*
@@ -192,6 +193,7 @@ struct edgetpu_kci {
struct work_struct work; /* worker of consuming responses */
/* Handler for reverse (firmware -> kernel) requests */
struct edgetpu_reverse_kci rkci;
+ struct work_struct usage_work; /* worker that sends update usage KCI */
};
struct edgetpu_kci_device_group_detail {
@@ -279,7 +281,15 @@ enum edgetpu_fw_flavor edgetpu_kci_fw_info(
struct edgetpu_kci *kci, struct edgetpu_fw_info *fw_info);
/*
- * Retrieve usage tracking data from firmware, update info on host.
+ * Schedules a worker to call edgetpu_kci_update_usage().
+ *
+ * For functions that don't require the usage to be updated immediately, use
+ * this function instead of edgetpu_kci_update_usage().
+ */
+void edgetpu_kci_update_usage_async(struct edgetpu_dev *etdev);
+
+/*
+ * Retrieves usage tracking data from firmware, update info on host.
* Also used as a watchdog ping to firmware.
*
* Returns KCI response code on success or < 0 on error (typically -ETIMEDOUT).
@@ -342,4 +352,12 @@ int edgetpu_kci_close_device(struct edgetpu_kci *kci, u32 mailbox_ids);
/* Cancel work queues or wait until they're done */
void edgetpu_kci_cancel_work_queues(struct edgetpu_kci *kci);
+/*
+ * Notify the firmware about throttling and the corresponding power level.
+ * The request is sent only if the device is already powered on.
+ *
+ * Returns KCI response code on success or < 0 on error (typically -ETIMEDOUT).
+ */
+int edgetpu_kci_notify_throttling(struct edgetpu_dev *etdev, u32 level);
+
#endif /* __EDGETPU_KCI_H__ */
diff --git a/drivers/edgetpu/edgetpu-mailbox.c b/drivers/edgetpu/edgetpu-mailbox.c
index 8ce6ace..de76bb8 100644
--- a/drivers/edgetpu/edgetpu-mailbox.c
+++ b/drivers/edgetpu/edgetpu-mailbox.c
@@ -710,13 +710,66 @@ void edgetpu_mailbox_restore_active_vii_queues(struct edgetpu_dev *etdev)
{
struct edgetpu_list_group *l;
struct edgetpu_device_group *group;
+ struct edgetpu_device_group **groups;
+ size_t i, n = 0;
mutex_lock(&etdev->groups_lock);
+ groups = kmalloc_array(etdev->n_groups, sizeof(*groups), GFP_KERNEL);
+ if (unlikely(!groups)) {
+ /*
+ * Either the runtime is misbehaving (creates tons of groups),
+ * or the system is indeed OOM - we give up this restore
+ * process, which makes the runtime unable to communicate with
+ * the device through VII.
+ */
+ mutex_unlock(&etdev->groups_lock);
+ return;
+ }
+ /*
+ * Fetch the groups into an array to restore the VII without holding
+ * etdev->groups_lock. To prevent the potential deadlock that
+ * edgetpu_device_group_add() holds group->lock then etdev->groups_lock.
+ */
etdev_for_each_group(etdev, l, group) {
- if (!edgetpu_group_mailbox_detached_locked(group))
- edgetpu_mailbox_reinit_vii(group);
+ /*
+ * Quick skip without holding group->lock.
+ * Disbanded groups can never go back to the normal state.
+ */
+ if (edgetpu_device_group_is_disbanded(group))
+ continue;
+ /*
+ * Increase the group reference to prevent the group being
+ * released after we release groups_lock.
+ */
+ groups[n++] = edgetpu_device_group_get(group);
}
mutex_unlock(&etdev->groups_lock);
+
+ /*
+ * We are not holding @etdev->groups_lock, what may race is:
+ * 1. The group is disbanding and being removed from @etdev.
+ * 2. A new group is adding to @etdev
+ *
+ * For (1.) the group will be marked as DISBANDED, so we check whether
+ * the group is finalized before performing VII re-init.
+ *
+ * For (2.), adding group to @etdev (edgetpu_device_group_add()) has
+ * nothing to do with VII, its VII will be set when the group is
+ * finalized.
+ */
+ for (i = 0; i < n; i++) {
+ group = groups[i];
+ mutex_lock(&group->lock);
+ /*
+ * If the group is just finalized or has mailbox attached in
+ * another process, this re-init is redundant but isn't harmful.
+ */
+ if (edgetpu_group_finalized_and_attached(group))
+ edgetpu_mailbox_reinit_vii(group);
+ mutex_unlock(&group->lock);
+ edgetpu_device_group_put(group);
+ }
+ kfree(groups);
}
int edgetpu_mailbox_enable_ext(struct edgetpu_client *client, u32 mailbox_ids)
diff --git a/drivers/edgetpu/edgetpu-pm.c b/drivers/edgetpu/edgetpu-pm.c
index 8ee47fe..1e28141 100644
--- a/drivers/edgetpu/edgetpu-pm.c
+++ b/drivers/edgetpu/edgetpu-pm.c
@@ -29,15 +29,18 @@ struct edgetpu_pm_private {
int power_up_count;
};
-int edgetpu_pm_get(struct edgetpu_pm *etpm)
+/*
+ * Increases the counter and call the power_up callback.
+ *
+ * Returns zero on success.
+ *
+ * Caller holds etpm->p->lock.
+ */
+static int edgetpu_pm_get_locked(struct edgetpu_pm *etpm)
{
+ int power_up_count = etpm->p->power_up_count++;
int ret = 0;
- int power_up_count;
- if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
- return 0;
- mutex_lock(&etpm->p->lock);
- power_up_count = etpm->p->power_up_count++;
if (!power_up_count) {
ret = etpm->p->handlers->power_up(etpm);
if (!ret)
@@ -46,6 +49,49 @@ int edgetpu_pm_get(struct edgetpu_pm *etpm)
if (ret)
etpm->p->power_up_count--;
etdev_dbg(etpm->etdev, "%s: %d\n", __func__, etpm->p->power_up_count);
+ return ret;
+}
+
+int edgetpu_pm_trylock(struct edgetpu_pm *etpm)
+{
+ if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+ return 1;
+ return mutex_trylock(&etpm->p->lock);
+}
+
+void edgetpu_pm_unlock(struct edgetpu_pm *etpm)
+{
+ if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+ return;
+ mutex_unlock(&etpm->p->lock);
+}
+
+bool edgetpu_pm_get_if_powered(struct edgetpu_pm *etpm)
+{
+ bool ret;
+
+ if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+ return true;
+ /* fast fail without holding the lock */
+ if (!etpm->p->power_up_count)
+ return false;
+ mutex_lock(&etpm->p->lock);
+ if (etpm->p->power_up_count)
+ ret = !edgetpu_pm_get_locked(etpm);
+ else
+ ret = false;
+ mutex_unlock(&etpm->p->lock);
+ return ret;
+}
+
+int edgetpu_pm_get(struct edgetpu_pm *etpm)
+{
+ int ret;
+
+ if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+ return 0;
+ mutex_lock(&etpm->p->lock);
+ ret = edgetpu_pm_get_locked(etpm);
mutex_unlock(&etpm->p->lock);
return ret;
}
diff --git a/drivers/edgetpu/edgetpu-pm.h b/drivers/edgetpu/edgetpu-pm.h
index 3ec7f66..aef35f6 100644
--- a/drivers/edgetpu/edgetpu-pm.h
+++ b/drivers/edgetpu/edgetpu-pm.h
@@ -48,6 +48,24 @@ struct edgetpu_pm {
*/
/*
+ * Tries to acquire the internal lock that ensures power_up_counter won't be
+ * modified.
+ *
+ * Returns 1 if the lock has been acquired successfully, 0 otherwise.
+ */
+int edgetpu_pm_trylock(struct edgetpu_pm *etpm);
+void edgetpu_pm_unlock(struct edgetpu_pm *etpm);
+
+/*
+ * Increase power_up_count if it's already powered on.
+ *
+ * Caller calls edgetpu_pm_put() to decrease power_up_count if this function
+ * returned true, otherwise put() shouldn't be called.
+ *
+ * Return false if device is not powered, true otherwise.
+ */
+bool edgetpu_pm_get_if_powered(struct edgetpu_pm *etpm);
+/*
* Increase power_up_count for active state, power up the device if previous
* power_up_count was zero.
* Returns 0 on success or negative error value
diff --git a/drivers/edgetpu/edgetpu-thermal.h b/drivers/edgetpu/edgetpu-thermal.h
index 7201597..63fc91c 100644
--- a/drivers/edgetpu/edgetpu-thermal.h
+++ b/drivers/edgetpu/edgetpu-thermal.h
@@ -12,6 +12,8 @@
#include <linux/mutex.h>
#include <linux/thermal.h>
+#include "edgetpu-internal.h"
+
#define EDGETPU_COOLING_NAME "tpu_cooling"
struct edgetpu_thermal {
@@ -22,6 +24,7 @@ struct edgetpu_thermal {
void *op_data;
unsigned long cooling_state;
unsigned int tpu_num_states;
+ struct edgetpu_dev *etdev;
};
struct edgetpu_state_pwr {
@@ -34,6 +37,7 @@ struct edgetpu_state_pwr {
*
* Returns -errno on error.
*/
-struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev);
+struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev,
+ struct edgetpu_dev *etdev);
#endif /* __EDGETPU_THERMAL_H__ */