Merge branch 'darwinn-2.0' into android-gs-pixel-5.10

* whitechapel:
  edgetpu: using fw trylock in update usage KCI
  edgetpu: use pm_trylock for usage update KCI
  edgetpu: abrolhos: add kci to notify firmware about throttling.
  edgetpu: add firmware launch_failed callback
  edgetpu: add PM get_if_powered API
  edgetpu: hold group locks when restoring VII mboxes
  Revert "edgetpu: abrolhos temporarily disable pm test"
  edgetpu: add asynchronous usage update KCI
  edgetpu: reset iomux cfg on top hermosa reset
  edgetpu: abrolhos temporarily disable pm test
  edgetpu: modify SG length before sync
  edgetpu: abrolhos: use vmalloc for firmware image
  edgetpu: fix unit tests for usage stats to create valid firmware
  edgetpu: KCI update usage lockout f/w state changes, abort if f/w bad

Signed-off-by: Nrithya Kanakasabapathy <nrithya@google.com>
Change-Id: I902de9634e3ec5082c279d8bb7eebbc8af33060e
author: Nrithya Kanakasabapathy <nrithya@google.com> 2021-05-12 05:10:54 +0000
committer: Nrithya Kanakasabapathy <nrithya@google.com> 2021-05-12 05:10:54 +0000
commit: 9e653d1d032ecba4f19e6e257e4e664ee8885d29 (patch)
tree: a6f7d5b5f29bf8b35d867a44c8f2155b7ff3c570
parent: c2a5f1fcc8911d94858bb48492a1b529f9a8e80e (diff)
download: abrolhos-9e653d1d032ecba4f19e6e257e4e664ee8885d29.tar.gz
12 files changed, 350 insertions, 58 deletions
diff --git a/drivers/edgetpu/abrolhos-firmware.c b/drivers/edgetpu/abrolhos-firmware.c
index 5fbec26..9acc0d2 100644
--- a/drivers/edgetpu/abrolhos-firmware.c
+++ b/drivers/edgetpu/abrolhos-firmware.c
@@ -28,7 +28,7 @@ static int abrolhos_firmware_alloc_buffer(
 	size_t buffer_size =
 		abpdev->fw_region_size + MOBILE_FW_HEADER_SIZE;
 
-	fw_buf->vaddr = kzalloc(buffer_size, GFP_KERNEL);
+	fw_buf->vaddr = vmalloc(buffer_size);
 	if (!fw_buf->vaddr) {
 		etdev_err(etdev, "%s: failed to allocate buffer (%zu bytes)\n",
 			  __func__, buffer_size);
@@ -44,7 +44,7 @@ static void abrolhos_firmware_free_buffer(
 		struct edgetpu_firmware *et_fw,
 		struct edgetpu_firmware_buffer *fw_buf)
 {
-	kfree(fw_buf->vaddr);
+	vfree(fw_buf->vaddr);
 	fw_buf->vaddr = NULL;
 	fw_buf->dma_addr = 0;
 	fw_buf->alloc_size = 0;
diff --git a/drivers/edgetpu/abrolhos-platform.c b/drivers/edgetpu/abrolhos-platform.c
index a613b63..16e36dc 100644
--- a/drivers/edgetpu/abrolhos-platform.c
+++ b/drivers/edgetpu/abrolhos-platform.c
@@ -315,7 +315,8 @@ static int edgetpu_platform_probe(struct platform_device *pdev)
 	}
 
 	dev_dbg(dev, "Creating thermal device\n");
-	abpdev->edgetpu_dev.thermal = devm_tpu_thermal_create(dev);
+	abpdev->edgetpu_dev.thermal =
+			devm_tpu_thermal_create(dev, &abpdev->edgetpu_dev);
 
 	dev_info(dev, "%s edgetpu initialized. Build: %s\n",
 		 abpdev->edgetpu_dev.dev_name, GIT_REPO_TAG);
diff --git a/drivers/edgetpu/abrolhos-thermal.c b/drivers/edgetpu/abrolhos-thermal.c
index 27429a9..84be142 100644
--- a/drivers/edgetpu/abrolhos-thermal.c
+++ b/drivers/edgetpu/abrolhos-thermal.c
@@ -48,6 +48,7 @@ static int edgetpu_set_cur_state(struct thermal_cooling_device *cdev,
 	int ret;
 	struct edgetpu_thermal *cooling = cdev->devdata;
 	struct device *dev = cooling->dev;
+	struct edgetpu_dev *etdev = cooling->etdev;
 	unsigned long pwr_state;
 
 	if (state_original >= cooling->tpu_num_states) {
@@ -77,6 +78,18 @@ static int edgetpu_set_cur_state(struct thermal_cooling_device *cdev,
 			goto out;
 		}
 		cooling->cooling_state = state_original;
+		ret = edgetpu_kci_notify_throttling(etdev, pwr_state);
+		if (ret) {
+			/*
+			 * TODO(b/185596886) : After FW adds a handler for this
+			 * KCI, return the correct value of ret and change the
+			 * debug message to an error message
+			 */
+			etdev_dbg(
+			etdev, "Failed to notify FW about state %lu, error:%d",
+			pwr_state, ret);
+			ret = 0;
+		}
 	} else {
 		ret = -EALREADY;
 	}
@@ -307,7 +320,8 @@ static int tpu_thermal_init(struct edgetpu_thermal *thermal, struct device *dev)
 	return 0;
 }
 
-struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev)
+struct edgetpu_thermal
+*devm_tpu_thermal_create(struct device *dev, struct edgetpu_dev *etdev)
 {
 	struct edgetpu_thermal *thermal;
 	int err;
@@ -324,5 +338,6 @@ struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev)
 	}
 
 	devres_add(dev, thermal);
+	thermal->etdev = etdev;
 	return thermal;
 }
diff --git a/drivers/edgetpu/edgetpu-device-group.c b/drivers/edgetpu/edgetpu-device-group.c
index 9c93c0b..9d00a0f 100644
--- a/drivers/edgetpu/edgetpu-device-group.c
+++ b/drivers/edgetpu/edgetpu-device-group.c
@@ -60,6 +60,24 @@ struct edgetpu_host_map {
 	struct sg_table *sg_tables;
 };
 
+/*
+ * A helper structure for the return value of find_sg_to_sync().
+ */
+struct sglist_to_sync {
+	struct scatterlist *sg;
+	int nelems;
+	/*
+	 * The SG that has its length modified by find_sg_to_sync().
+	 * Can be NULL, which means no SG's length was modified.
+	 */
+	struct scatterlist *last_sg;
+	/*
+	 * find_sg_to_sync() will temporarily change the length of @last_sg.
+	 * This is used to restore the length.
+	 */
+	unsigned int orig_length;
+};
+
 #ifdef EDGETPU_HAS_MCP
 
 /* parameter to be used in async KCI jobs */
@@ -86,7 +104,7 @@ static int edgetpu_kci_leave_group_worker(struct kci_worker_param *param)
 	struct edgetpu_dev *etdev = edgetpu_device_group_nth_etdev(group, i);
 
 	etdev_dbg(etdev, "%s: leave group %u", __func__, group->workload_id);
-	edgetpu_kci_update_usage(etdev);
+	edgetpu_kci_update_usage_async(etdev);
 	edgetpu_kci_leave_group(etdev->kci);
 	return 0;
 }
@@ -147,7 +165,7 @@ static void edgetpu_group_kci_close_device(struct edgetpu_device_group *group)
 static void edgetpu_device_group_kci_leave(struct edgetpu_device_group *group)
 {
 #ifdef EDGETPU_HAS_MULTI_GROUPS
-	edgetpu_kci_update_usage(group->etdev);
+	edgetpu_kci_update_usage_async(group->etdev);
 	return edgetpu_group_kci_close_device(group);
 #else /* !EDGETPU_HAS_MULTI_GROUPS */
 	struct kci_worker_param *params =
@@ -1247,32 +1265,60 @@ error:
 }
 
 /*
- * Find the scatterlist covering range [start, end).
+ * Finds the scatterlist covering range [start, end).
+ *
+ * The found SG and number of elements will be stored in @sglist.
  *
- * Returns NULL if:
- * - @start is larger than the whole SG table
+ * To ensure the returned SG list lies strictly within range [start, end), the
+ * last SG's length is shrunk. Therefore the caller must call
+ * restore_sg_after_sync(@sglist) after the DMA sync is performed.
+ *
+ * @sglist->nelems == 0 means the target range exceeds the whole SG table.
  */
-static struct scatterlist *find_sg_within(const struct sg_table *sgt, u64 start,
-					  u64 end, int *nelems)
+static void find_sg_to_sync(const struct sg_table *sgt, u64 start, u64 end,
+			    struct sglist_to_sync *sglist)
 {
-	struct scatterlist *sg, *sg_to_sync = NULL;
+	struct scatterlist *sg;
 	size_t cur_offset = 0;
 	int i;
 
-	*nelems = 0;
+	sglist->sg = NULL;
+	sglist->nelems = 0;
+	sglist->last_sg = NULL;
+	if (unlikely(end == 0))
+		return;
 	for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) {
-		if (end <= cur_offset)
-			break;
 		if (cur_offset <= start && start < cur_offset + sg->length)
-			sg_to_sync = sg;
-		if (sg_to_sync)
-			(*nelems)++;
+			sglist->sg = sg;
+		if (sglist->sg)
+			++sglist->nelems;
 		cur_offset += sg->length;
+		if (end <= cur_offset) {
+			sglist->last_sg = sg;
+			sglist->orig_length = sg->length;
+			/*
+			 * To let the returned SG list have exact length as
+			 * [start, end).
+			 */
+			sg->length -= cur_offset - end;
+			break;
+		}
 	}
+}
 
-	return sg_to_sync;
+static void restore_sg_after_sync(struct sglist_to_sync *sglist)
+{
+	if (!sglist->last_sg)
+		return;
+	sglist->last_sg->length = sglist->orig_length;
 }
 
+/*
+ * Performs DMA sync of the mapping with region [offset, offset + size).
+ *
+ * Caller holds mapping's lock, to prevent @hmap being modified / removed by
+ * other processes.
+ */
 static int group_sync_host_map(struct edgetpu_device_group *group,
 			       struct edgetpu_host_map *hmap, u64 offset,
 			       u64 size, enum dma_data_direction dir,
@@ -1283,29 +1329,32 @@ static int group_sync_host_map(struct edgetpu_device_group *group,
 		for_cpu ? dma_sync_sg_for_cpu : dma_sync_sg_for_device;
 	struct edgetpu_dev *etdev;
 	struct sg_table *sgt;
-	struct scatterlist *sg;
+	struct sglist_to_sync sglist;
 	int i;
-	int nelems;
 
 	sgt = &hmap->map.sgt;
-	sg = find_sg_within(sgt, offset, end, &nelems);
-	if (!sg)
+	find_sg_to_sync(sgt, offset, end, &sglist);
+	if (!sglist.nelems)
 		return -EINVAL;
 
+	if (IS_MIRRORED(hmap->map.flags))
+		etdev = group->etdev;
+	else
+		etdev = edgetpu_device_group_nth_etdev(group,
+						       hmap->map.die_index);
+	sync(etdev->dev, sglist.sg, sglist.nelems, dir);
+	restore_sg_after_sync(&sglist);
+
 	if (IS_MIRRORED(hmap->map.flags)) {
-		sync(group->etdev->dev, sg, nelems, dir);
 		for (i = 1; i < group->n_clients; i++) {
 			etdev = edgetpu_device_group_nth_etdev(group, i);
-			sg = find_sg_within(&hmap->sg_tables[i], offset, end,
-					    &nelems);
-			if (WARN_ON(!sg))
+			find_sg_to_sync(&hmap->sg_tables[i], offset, end,
+					&sglist);
+			if (WARN_ON(!sglist.sg))
 				return -EINVAL;
-			sync(etdev->dev, sg, nelems, dir);
+			sync(etdev->dev, sglist.sg, sglist.nelems, dir);
+			restore_sg_after_sync(&sglist);
 		}
-	} else {
-		etdev = edgetpu_device_group_nth_etdev(group,
-						       hmap->map.die_index);
-		sync(etdev->dev, sg, nelems, dir);
 	}
 
 	return 0;
diff --git a/drivers/edgetpu/edgetpu-firmware.c b/drivers/edgetpu/edgetpu-firmware.c
index 3b1c874..8ae808b 100644
--- a/drivers/edgetpu/edgetpu-firmware.c
+++ b/drivers/edgetpu/edgetpu-firmware.c
@@ -313,6 +313,20 @@ edgetpu_firmware_get_build_time(struct edgetpu_firmware *et_fw)
 }
 
 /*
+ * Try edgetpu_firmware_lock() if it's not locked yet.
+ *
+ * Returns 1 if the lock is acquired successfully, 0 otherwise.
+ */
+int edgetpu_firmware_trylock(struct edgetpu_dev *etdev)
+{
+	struct edgetpu_firmware *et_fw = etdev->firmware;
+
+	if (!et_fw)
+		return 1;
+	return mutex_trylock(&et_fw->p->fw_desc_lock);
+}
+
+/*
  * Grab firmware lock to protect against firmware state changes.
  * Locks out firmware loading / unloading while caller performs ops that are
  * incompatible with a change in firmware status.  Does not care whether or not
@@ -395,7 +409,7 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
 	memset(&new_fw_desc, 0, sizeof(new_fw_desc));
 	ret = edgetpu_firmware_load_locked(et_fw, &new_fw_desc, name, flags);
 	if (ret)
-		return ret;
+		goto out_failed;
 
 	etdev_dbg(et_fw->etdev, "run fw %s flags=0x%x", name, flags);
 	if (handlers && handlers->prepare_run) {
@@ -426,10 +440,15 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw,
 
 	if (!ret && !is_bl1_run && handlers && handlers->launch_complete)
 		handlers->launch_complete(et_fw);
+	else if (ret && handlers && handlers->launch_failed)
+		handlers->launch_failed(et_fw, ret);
 	return ret;
 
 out_unload_new_fw:
 	edgetpu_firmware_unload_locked(et_fw, &new_fw_desc);
+out_failed:
+	if (handlers && handlers->launch_failed)
+		handlers->launch_failed(et_fw, ret);
 	return ret;
 }
 
@@ -500,6 +519,17 @@ edgetpu_firmware_status_locked(struct edgetpu_dev *etdev)
 	return et_fw->p->status;
 }
 
+/* Caller must hold firmware lock. For unit tests. */
+void
+edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev,
+				   enum edgetpu_firmware_status status)
+{
+	struct edgetpu_firmware *et_fw = etdev->firmware;
+
+	if (et_fw)
+		et_fw->p->status = status;
+}
+
 /* Caller must hold firmware lock for loading. */
 int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev)
 {
diff --git a/drivers/edgetpu/edgetpu-firmware.h b/drivers/edgetpu/edgetpu-firmware.h
index d068d29..ad7c484 100644
--- a/drivers/edgetpu/edgetpu-firmware.h
+++ b/drivers/edgetpu/edgetpu-firmware.h
@@ -147,6 +147,8 @@ struct edgetpu_firmware_handlers {
 			   struct edgetpu_firmware_buffer *fw_buf);
 	/* Firmware running, after successful handshake. */
 	void (*launch_complete)(struct edgetpu_firmware *et_fw);
+	/* Firmware load failed or unsuccessful handshake. */
+	void (*launch_failed)(struct edgetpu_firmware *et_fw, int ret);
 
 	/*
 	 * Optional platform-specific handler to restart an already loaded
@@ -195,12 +197,12 @@ void edgetpu_firmware_mappings_show(struct edgetpu_dev *etdev,
 				    struct seq_file *s);
 
 /*
- * These two functions grab and release the internal firmware lock
- * and must be used before calling the helper functions suffixed with _locked
- * below
+ * These functions grab and release the internal firmware lock and must be used
+ * before calling the helper functions suffixed with _locked below.
  */
 
 int edgetpu_firmware_lock(struct edgetpu_dev *etdev);
+int edgetpu_firmware_trylock(struct edgetpu_dev *etdev);
 void edgetpu_firmware_unlock(struct edgetpu_dev *etdev);
 
 
@@ -211,6 +213,11 @@ void edgetpu_firmware_unlock(struct edgetpu_dev *etdev);
 enum edgetpu_firmware_status
 edgetpu_firmware_status_locked(struct edgetpu_dev *etdev);
 
+/* Caller must hold firmware lock. For unit tests. */
+void
+edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev,
+				   enum edgetpu_firmware_status status);
+
 /*
  * Restarts the last firmware image loaded
  * Intended for power managed devices to re-run the firmware without a full
diff --git a/drivers/edgetpu/edgetpu-kci.c b/drivers/edgetpu/edgetpu-kci.c
index f506a04..e467fac 100644
--- a/drivers/edgetpu/edgetpu-kci.c
+++ b/drivers/edgetpu/edgetpu-kci.c
@@ -432,6 +432,14 @@ static void edgetpu_kci_handle_irq(struct edgetpu_mailbox *mailbox)
 	schedule_work(&kci->work);
 }
 
+static void edgetpu_kci_update_usage_work(struct work_struct *work)
+{
+	struct edgetpu_kci *kci =
+		container_of(work, struct edgetpu_kci, usage_work);
+
+	edgetpu_kci_update_usage(kci->mailbox->etdev);
+}
+
 int edgetpu_kci_init(struct edgetpu_mailbox_manager *mgr,
 		     struct edgetpu_kci *kci)
 {
@@ -478,6 +486,7 @@ int edgetpu_kci_init(struct edgetpu_mailbox_manager *mgr,
 	init_waitqueue_head(&kci->wait_list_waitq);
 	INIT_WORK(&kci->work, edgetpu_kci_consume_responses_work);
 	edgetpu_reverse_kci_init(&kci->rkci);
+	INIT_WORK(&kci->usage_work, edgetpu_kci_update_usage_work);
 	EDGETPU_MAILBOX_CONTEXT_WRITE(mailbox, context_enable, 1);
 	return 0;
 }
@@ -507,7 +516,9 @@ int edgetpu_kci_reinit(struct edgetpu_kci *kci)
 
 void edgetpu_kci_cancel_work_queues(struct edgetpu_kci *kci)
 {
-	/* Cancel KCI and reverse KCI workers */
+	/* Cancel workers that may send KCIs. */
+	cancel_work_sync(&kci->usage_work);
+	/* Cancel KCI and reverse KCI workers. */
 	cancel_work_sync(&kci->work);
 	cancel_work_sync(&kci->rkci.work);
 }
@@ -516,10 +527,6 @@ void edgetpu_kci_release(struct edgetpu_dev *etdev, struct edgetpu_kci *kci)
 {
 	if (!kci)
 		return;
-	/*
-	 * Command/Response queues are managed (dmam_alloc_coherent()), we don't
-	 * need to free them.
-	 */
 
 	edgetpu_kci_cancel_work_queues(kci);
 
@@ -853,19 +860,42 @@ enum edgetpu_fw_flavor edgetpu_kci_fw_info(struct edgetpu_kci *kci,
 	return flavor;
 }
 
+void edgetpu_kci_update_usage_async(struct edgetpu_dev *etdev)
+{
+	schedule_work(&etdev->kci->usage_work);
+}
+
 int edgetpu_kci_update_usage(struct edgetpu_dev *etdev)
 {
-	int ret;
+	int ret = -EAGAIN;
 
-	/* Quick return if device already powered down, else get PM ref. */
+	/* Quick return if device is already powered down. */
 	if (!edgetpu_is_powered(etdev))
 		return -EAGAIN;
-	ret = edgetpu_pm_get(etdev->pm);
-	if (ret)
-		return ret;
-	ret = edgetpu_kci_update_usage_locked(etdev);
+	/*
+	 * Lockout change in f/w load/unload status during usage update.
+	 * Skip usage update if the firmware is being updated now or is not
+	 * valid.
+	 */
+	if (!edgetpu_firmware_trylock(etdev))
+		return -EAGAIN;
 
-	edgetpu_pm_put(etdev->pm);
+	if (edgetpu_firmware_status_locked(etdev) != FW_VALID)
+		goto fw_unlock;
+	/*
+	 * This function may run in a worker that is being canceled when the
+	 * device is powering down, and the power down code holds the PM lock.
+	 * Using trylock to prevent cancel_work_sync() waiting forever.
+	 */
+	if (!edgetpu_pm_trylock(etdev->pm))
+		goto fw_unlock;
+
+	if (edgetpu_is_powered(etdev))
+		ret = edgetpu_kci_update_usage_locked(etdev);
+	edgetpu_pm_unlock(etdev->pm);
+
+fw_unlock:
+	edgetpu_firmware_unlock(etdev);
 	return ret;
 }
 
@@ -986,3 +1016,24 @@ int edgetpu_kci_close_device(struct edgetpu_kci *kci, u32 mailbox_ids)
 		return -ENODEV;
 	return edgetpu_kci_send_cmd(kci, &cmd);
 }
+
+int edgetpu_kci_notify_throttling(struct edgetpu_dev *etdev, u32 level)
+{
+	struct edgetpu_command_element cmd = {
+		.code = KCI_CODE_NOTIFY_THROTTLING,
+		.dma = {
+			.flags = level,
+		},
+	};
+	int ret;
+
+	if (!etdev->kci)
+		return -ENODEV;
+	if (!edgetpu_pm_get_if_powered(etdev->pm))
+		return -EAGAIN;
+
+	ret =  edgetpu_kci_send_cmd(etdev->kci, &cmd);
+	edgetpu_pm_put(etdev->pm);
+	return ret;
+}
+
diff --git a/drivers/edgetpu/edgetpu-kci.h b/drivers/edgetpu/edgetpu-kci.h
index 443c690..4d9de5f 100644
--- a/drivers/edgetpu/edgetpu-kci.h
+++ b/drivers/edgetpu/edgetpu-kci.h
@@ -108,6 +108,7 @@ enum edgetpu_kci_code {
 	KCI_CODE_CLOSE_DEVICE = 10,
 	KCI_CODE_FIRMWARE_INFO = 11,
 	KCI_CODE_GET_USAGE = 12,
+	KCI_CODE_NOTIFY_THROTTLING = 13,
 };
 
 /*
@@ -192,6 +193,7 @@ struct edgetpu_kci {
 	struct work_struct work;	/* worker of consuming responses */
 	/* Handler for reverse (firmware -> kernel) requests */
 	struct edgetpu_reverse_kci rkci;
+	struct work_struct usage_work;	/* worker that sends update usage KCI */
 };
 
 struct edgetpu_kci_device_group_detail {
@@ -279,7 +281,15 @@ enum edgetpu_fw_flavor edgetpu_kci_fw_info(
 	struct edgetpu_kci *kci, struct edgetpu_fw_info *fw_info);
 
 /*
- * Retrieve usage tracking data from firmware, update info on host.
+ * Schedules a worker to call edgetpu_kci_update_usage().
+ *
+ * For functions that don't require the usage to be updated immediately, use
+ * this function instead of edgetpu_kci_update_usage().
+ */
+void edgetpu_kci_update_usage_async(struct edgetpu_dev *etdev);
+
+/*
+ * Retrieves usage tracking data from firmware and updates info on host.
  * Also used as a watchdog ping to firmware.
  *
  * Returns KCI response code on success or < 0 on error (typically -ETIMEDOUT).
@@ -342,4 +352,12 @@ int edgetpu_kci_close_device(struct edgetpu_kci *kci, u32 mailbox_ids);
 /* Cancel work queues or wait until they're done */
 void edgetpu_kci_cancel_work_queues(struct edgetpu_kci *kci);
 
+/*
+ * Notify the firmware about throttling and the corresponding power level.
+ * The request is sent only if the device is already powered on.
+ *
+ * Returns KCI response code on success or < 0 on error (typically -ETIMEDOUT).
+ */
+int edgetpu_kci_notify_throttling(struct edgetpu_dev *etdev, u32 level);
+
 #endif /* __EDGETPU_KCI_H__ */
diff --git a/drivers/edgetpu/edgetpu-mailbox.c b/drivers/edgetpu/edgetpu-mailbox.c
index 8ce6ace..de76bb8 100644
--- a/drivers/edgetpu/edgetpu-mailbox.c
+++ b/drivers/edgetpu/edgetpu-mailbox.c
@@ -710,13 +710,66 @@ void edgetpu_mailbox_restore_active_vii_queues(struct edgetpu_dev *etdev)
 {
 	struct edgetpu_list_group *l;
 	struct edgetpu_device_group *group;
+	struct edgetpu_device_group **groups;
+	size_t i, n = 0;
 
 	mutex_lock(&etdev->groups_lock);
+	groups = kmalloc_array(etdev->n_groups, sizeof(*groups), GFP_KERNEL);
+	if (unlikely(!groups)) {
+		/*
+		 * Either the runtime is misbehaving (creates tons of groups),
+		 * or the system is indeed OOM - we give up this restore
+		 * process, which makes the runtime unable to communicate with
+		 * the device through VII.
+		 */
+		mutex_unlock(&etdev->groups_lock);
+		return;
+	}
+	/*
+	 * Fetch the groups into an array so the VII can be restored without
+	 * holding etdev->groups_lock. This avoids a potential deadlock, since
+	 * edgetpu_device_group_add() holds group->lock then etdev->groups_lock.
+	 */
 	etdev_for_each_group(etdev, l, group) {
-		if (!edgetpu_group_mailbox_detached_locked(group))
-			edgetpu_mailbox_reinit_vii(group);
+		/*
+		 * Quick skip without holding group->lock.
+		 * Disbanded groups can never go back to the normal state.
+		 */
+		if (edgetpu_device_group_is_disbanded(group))
+			continue;
+		/*
+		 * Increase the group reference to prevent the group being
+		 * released after we release groups_lock.
+		 */
+		groups[n++] = edgetpu_device_group_get(group);
 	}
 	mutex_unlock(&etdev->groups_lock);
+
+	/*
+	 * We are not holding @etdev->groups_lock, what may race is:
+	 *   1. The group is disbanding and being removed from @etdev.
+	 *   2. A new group is being added to @etdev.
+	 *
+	 * For (1.) the group will be marked as DISBANDED, so we check whether
+	 * the group is finalized before performing VII re-init.
+	 *
+	 * For (2.), adding group to @etdev (edgetpu_device_group_add()) has
+	 * nothing to do with VII, its VII will be set when the group is
+	 * finalized.
+	 */
+	for (i = 0; i < n; i++) {
+		group = groups[i];
+		mutex_lock(&group->lock);
+		/*
+		 * If the group is just finalized or has mailbox attached in
+		 * another process, this re-init is redundant but isn't harmful.
+		 */
+		if (edgetpu_group_finalized_and_attached(group))
+			edgetpu_mailbox_reinit_vii(group);
+		mutex_unlock(&group->lock);
+		edgetpu_device_group_put(group);
+	}
+	kfree(groups);
 }
 
 int edgetpu_mailbox_enable_ext(struct edgetpu_client *client, u32 mailbox_ids)
diff --git a/drivers/edgetpu/edgetpu-pm.c b/drivers/edgetpu/edgetpu-pm.c
index 8ee47fe..1e28141 100644
--- a/drivers/edgetpu/edgetpu-pm.c
+++ b/drivers/edgetpu/edgetpu-pm.c
@@ -29,15 +29,18 @@ struct edgetpu_pm_private {
 	int power_up_count;
 };
 
-int edgetpu_pm_get(struct edgetpu_pm *etpm)
+/*
+ * Increases the counter and calls the power_up callback.
+ *
+ * Returns zero on success.
+ *
+ * Caller holds etpm->p->lock.
+ */
+static int edgetpu_pm_get_locked(struct edgetpu_pm *etpm)
 {
+	int power_up_count = etpm->p->power_up_count++;
 	int ret = 0;
-	int power_up_count;
 
-	if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
-		return 0;
-	mutex_lock(&etpm->p->lock);
-	power_up_count = etpm->p->power_up_count++;
 	if (!power_up_count) {
 		ret = etpm->p->handlers->power_up(etpm);
 		if (!ret)
@@ -46,6 +49,49 @@ int edgetpu_pm_get(struct edgetpu_pm *etpm)
 	if (ret)
 		etpm->p->power_up_count--;
 	etdev_dbg(etpm->etdev, "%s: %d\n", __func__, etpm->p->power_up_count);
+	return ret;
+}
+
+int edgetpu_pm_trylock(struct edgetpu_pm *etpm)
+{
+	if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+		return 1;
+	return mutex_trylock(&etpm->p->lock);
+}
+
+void edgetpu_pm_unlock(struct edgetpu_pm *etpm)
+{
+	if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+		return;
+	mutex_unlock(&etpm->p->lock);
+}
+
+bool edgetpu_pm_get_if_powered(struct edgetpu_pm *etpm)
+{
+	bool ret;
+
+	if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+		return true;
+	/* fast fail without holding the lock */
+	if (!etpm->p->power_up_count)
+		return false;
+	mutex_lock(&etpm->p->lock);
+	if (etpm->p->power_up_count)
+		ret = !edgetpu_pm_get_locked(etpm);
+	else
+		ret = false;
+	mutex_unlock(&etpm->p->lock);
+	return ret;
+}
+
+int edgetpu_pm_get(struct edgetpu_pm *etpm)
+{
+	int ret;
+
+	if (!etpm || !etpm->p->handlers || !etpm->p->handlers->power_up)
+		return 0;
+	mutex_lock(&etpm->p->lock);
+	ret = edgetpu_pm_get_locked(etpm);
 	mutex_unlock(&etpm->p->lock);
 	return ret;
 }
diff --git a/drivers/edgetpu/edgetpu-pm.h b/drivers/edgetpu/edgetpu-pm.h
index 3ec7f66..aef35f6 100644
--- a/drivers/edgetpu/edgetpu-pm.h
+++ b/drivers/edgetpu/edgetpu-pm.h
@@ -48,6 +48,24 @@ struct edgetpu_pm {
  */
 
 /*
+ * Tries to acquire the internal lock that ensures power_up_count won't be
+ * modified.
+ *
+ * Returns 1 if the lock has been acquired successfully, 0 otherwise.
+ */
+int edgetpu_pm_trylock(struct edgetpu_pm *etpm);
+void edgetpu_pm_unlock(struct edgetpu_pm *etpm);
+
+/*
+ * Increase power_up_count if it's already powered on.
+ *
+ * Caller calls edgetpu_pm_put() to decrease power_up_count if this function
+ * returned true, otherwise put() shouldn't be called.
+ *
+ * Return false if device is not powered, true otherwise.
+ */
+bool edgetpu_pm_get_if_powered(struct edgetpu_pm *etpm);
+/*
  * Increase power_up_count for active state, power up the device if previous
  * power_up_count was zero.
  * Returns 0 on success or negative error value
diff --git a/drivers/edgetpu/edgetpu-thermal.h b/drivers/edgetpu/edgetpu-thermal.h
index 7201597..63fc91c 100644
--- a/drivers/edgetpu/edgetpu-thermal.h
+++ b/drivers/edgetpu/edgetpu-thermal.h
@@ -12,6 +12,8 @@
 #include <linux/mutex.h>
 #include <linux/thermal.h>
 
+#include "edgetpu-internal.h"
+
 #define EDGETPU_COOLING_NAME "tpu_cooling"
 
 struct edgetpu_thermal {
@@ -22,6 +24,7 @@ struct edgetpu_thermal {
 	void *op_data;
 	unsigned long cooling_state;
 	unsigned int tpu_num_states;
+	struct edgetpu_dev *etdev;
 };
 
 struct edgetpu_state_pwr {
@@ -34,6 +37,7 @@ struct edgetpu_state_pwr {
  *
  * Returns -errno on error.
  */
-struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev);
+struct edgetpu_thermal *devm_tpu_thermal_create(struct device *dev,
+						struct edgetpu_dev *etdev);
 
 #endif /* __EDGETPU_THERMAL_H__ */
author	Nrithya Kanakasabapathy <nrithya@google.com>	2021-05-12 05:10:54 +0000
committer	Nrithya Kanakasabapathy <nrithya@google.com>	2021-05-12 05:10:54 +0000
commit	9e653d1d032ecba4f19e6e257e4e664ee8885d29 (patch)
tree	a6f7d5b5f29bf8b35d867a44c8f2155b7ff3c570
parent	c2a5f1fcc8911d94858bb48492a1b529f9a8e80e (diff)
download	abrolhos-9e653d1d032ecba4f19e6e257e4e664ee8885d29.tar.gz