diff options
author | Whi copybara merger <whitechapel-automerger@google.com> | 2021-12-07 23:55:24 +0000 |
---|---|---|
committer | Nrithya Kanakasabapathy <nrithya@google.com> | 2021-12-14 00:59:20 +0000 |
commit | b61360624793a3b4898bf74cbc053c2422826366 (patch) | |
tree | c37c9d93e7a60d08d33998eb91bd4bc55a4296ef | |
parent | db8e16af324fae8e3587c62220c2dec2d787312b (diff) | |
download | janeiro-b61360624793a3b4898bf74cbc053c2422826366.tar.gz |
[Copybara Auto Merge] Merge branch 'pro' into android13-gs-pixel-5.10
edgetpu: mcp notify MCP-wide thermal shutdown via kworker
Bug: 207807085
Bug: 174552882
edgetpu: all mobile chips support debug dump
Bug: 207459857
edgetpu: mobile: share debug dump handlers
Bug: 207459857
GitOrigin-RevId: b0177acc91ac4fc014be4d5a4d98253641dc91fa
Change-Id: Iafa408831e81ab80ac4e476a39e81d57838857a5
-rw-r--r-- | drivers/edgetpu/edgetpu-mcp.h | 7 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-mobile-platform.h | 3 | ||||
-rw-r--r-- | drivers/edgetpu/janeiro-debug-dump.c | 13 | ||||
-rw-r--r-- | drivers/edgetpu/janeiro-device.c | 2 | ||||
-rw-r--r-- | drivers/edgetpu/janeiro/config.h | 7 | ||||
-rw-r--r-- | drivers/edgetpu/mobile-debug-dump.c | 244 | ||||
-rw-r--r-- | drivers/edgetpu/mobile-debug-dump.h | 26 |
7 files changed, 255 insertions, 47 deletions
diff --git a/drivers/edgetpu/edgetpu-mcp.h b/drivers/edgetpu/edgetpu-mcp.h index 4530d79..4d762c4 100644 --- a/drivers/edgetpu/edgetpu-mcp.h +++ b/drivers/edgetpu/edgetpu-mcp.h @@ -9,7 +9,9 @@ #include <linux/init.h> #include <linux/mutex.h> +#include <linux/spinlock.h> #include <linux/types.h> +#include <linux/workqueue.h> #include "edgetpu-config.h" #include "edgetpu-internal.h" @@ -38,6 +40,11 @@ struct edgetpu_mcp { * One should check with !IS_ERR_OR_NULL(etdevs[i]) before accessing. */ struct edgetpu_dev **etdevs; + + /* MCP-wide fatal errors pending runtime notification */ + uint errors_pending_mask; + spinlock_t errors_pending_lock; + struct work_struct errors_pending_work; /* for notify via kworker */ }; #ifdef EDGETPU_HAS_MCP diff --git a/drivers/edgetpu/edgetpu-mobile-platform.h b/drivers/edgetpu/edgetpu-mobile-platform.h index 65184ae..9d41571 100644 --- a/drivers/edgetpu/edgetpu-mobile-platform.h +++ b/drivers/edgetpu/edgetpu-mobile-platform.h @@ -22,6 +22,7 @@ #include "edgetpu-config.h" #include "edgetpu-internal.h" +#include "mobile-debug-dump.h" #define to_mobile_dev(etdev) container_of(etdev, struct edgetpu_mobile_platform_dev, edgetpu_dev) @@ -93,6 +94,8 @@ struct edgetpu_mobile_platform_dev { #if IS_ENABLED(CONFIG_GOOGLE_BCL) struct bcl_device *bcl_dev; #endif + /* subsystem coredump info struct */ + struct mobile_sscd_info sscd_info; /* Protects TZ Mailbox client pointer */ struct mutex tz_mailbox_lock; /* TZ mailbox client */ diff --git a/drivers/edgetpu/janeiro-debug-dump.c b/drivers/edgetpu/janeiro-debug-dump.c index 4314abe..92c3e1a 100644 --- a/drivers/edgetpu/janeiro-debug-dump.c +++ b/drivers/edgetpu/janeiro-debug-dump.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Implements chip specific details of debug dump memory initialization and SSCD registration. - * - * Copyright (C) 2021 Google, Inc. - */ + +#if IS_ENABLED(CONFIG_SUBSYSTEM_COREDUMP) || IS_ENABLED(CONFIG_EDGETPU_TEST) + +#include "mobile-debug-dump.c" + +#else /* IS_ENABLED(CONFIG_SUBSYSTEM_COREDUMP) || IS_ENABLED(CONFIG_EDGETPU_TEST) */ #include "edgetpu-debug-dump.c" @@ -15,3 +16,5 @@ int edgetpu_debug_dump_init(struct edgetpu_dev *etdev) void edgetpu_debug_dump_exit(struct edgetpu_dev *etdev) { } + +#endif /* IS_ENABLED(CONFIG_SUBSYSTEM_COREDUMP) || IS_ENABLED(CONFIG_EDGETPU_TEST) */ diff --git a/drivers/edgetpu/janeiro-device.c b/drivers/edgetpu/janeiro-device.c index 24452da..0f28186 100644 --- a/drivers/edgetpu/janeiro-device.c +++ b/drivers/edgetpu/janeiro-device.c @@ -9,6 +9,7 @@ #include <linux/uaccess.h> #include "edgetpu-config.h" +#include "edgetpu-debug-dump.h" #include "edgetpu-internal.h" #include "edgetpu-mailbox.h" #include "edgetpu-mobile-platform.h" @@ -51,6 +52,7 @@ irqreturn_t edgetpu_chip_irq_handler(int irq, void *arg) struct edgetpu_dev *etdev = arg; edgetpu_telemetry_irq_handler(etdev); + edgetpu_debug_dump_resp_handler(etdev); return janeiro_mailbox_handle_irq(etdev, irq); } diff --git a/drivers/edgetpu/janeiro/config.h b/drivers/edgetpu/janeiro/config.h index 7a3304c..51215bd 100644 --- a/drivers/edgetpu/janeiro/config.h +++ b/drivers/edgetpu/janeiro/config.h @@ -58,6 +58,13 @@ /* Address from which the TPU CPU can access data in the remapped region */ #define EDGETPU_REMAPPED_DATA_ADDR \ (EDGETPU_INSTRUCTION_REMAP_BASE + EDGETPU_REMAPPED_DATA_OFFSET) + +/* + * Size of memory for FW accessible debug dump segments + * TODO(b/208758697): verify whether this size is good + */ +#define EDGETPU_DEBUG_DUMP_MEM_SIZE 0x4E0000 + #include "config-mailbox.h" #include "config-pwr-state.h" #include "config-tpu-cpu.h" diff --git a/drivers/edgetpu/mobile-debug-dump.c b/drivers/edgetpu/mobile-debug-dump.c index 3732fbb..e0c9493 100644 --- a/drivers/edgetpu/mobile-debug-dump.c +++ b/drivers/edgetpu/mobile-debug-dump.c @@ -8,16 +8,35 @@ #include <linux/mutex.h> #include <linux/platform_data/sscoredump.h> +#include <linux/platform_device.h> #include <linux/rbtree.h> #include <linux/slab.h> +#include "edgetpu-config.h" #include "edgetpu-device-group.h" #include "edgetpu-mailbox.h" +#include "edgetpu-mobile-platform.h" #include "mobile-debug-dump.h" #include "edgetpu-debug-dump.c" -struct mobile_sscd_mappings_dump * +static void sscd_release(struct device *dev) +{ + pr_debug(DRIVER_NAME " release\n"); +} + +static struct sscd_platform_data sscd_pdata; +static struct platform_device sscd_dev; + +/* + * Collects the mapping information of all the host mapping and dmabuf mapping buffers of all + * @groups as an array of struct mobile_sscd_mappings_dump and populates the @sscd_seg. + * + * Returns the pointer to the first element of the mappings dump array. The allocated array should + * be freed by the caller after the sscd segment is reported. + * Returns NULL in case of failure. + */ +static struct mobile_sscd_mappings_dump * mobile_sscd_collect_mappings_segment(struct edgetpu_device_group **groups, size_t num_groups, struct sscd_segment *sscd_seg) { @@ -30,8 +49,8 @@ mobile_sscd_collect_mappings_segment(struct edgetpu_device_group **groups, size_ mappings_dump = kmalloc(sizeof(struct mobile_sscd_mappings_dump), GFP_KERNEL); for (idx = 0; idx < num_groups; idx++) { mutex_lock(&groups[idx]->lock); - new_size += groups[idx]->host_mappings.count * - sizeof(struct mobile_sscd_mappings_dump); + new_size += + groups[idx]->host_mappings.count * sizeof(struct mobile_sscd_mappings_dump); resized_arr = krealloc(mappings_dump, new_size, GFP_KERNEL); if (!resized_arr) { kfree(mappings_dump); @@ -80,40 +99,49 @@ mobile_sscd_collect_mappings_segment(struct edgetpu_device_group **groups, size_ return mappings_dump; } -size_t mobile_sscd_collect_cmd_resp_queues(struct edgetpu_dev *etdev, - struct edgetpu_device_group **groups, size_t num_groups, - struct sscd_segment *sscd_seg_arr) +/* + * Collects the VII cmd and resp queues of all @groups that @etdev belongs to and the KCI cmd and + * resp queues and populates them as @sscd_seg_arr elements. + * + * Returns the total number of queues collected since some queues may have been released for groups + * with detached mailboxes. The return value is less than or equal to the total number of queues + * expected based on @num_groups i.e. (2 * @num_groups +2). + */ +static size_t mobile_sscd_collect_cmd_resp_queues(struct edgetpu_dev *etdev, + struct edgetpu_device_group **groups, + size_t num_groups, + struct sscd_segment *sscd_seg_arr) { struct edgetpu_kci *kci; size_t idx; u16 num_queues = 0; - // Collect VII cmd and resp queues + /* Collect VII cmd and resp queues */ for (idx = 0; idx < num_groups; idx++) { mutex_lock(&groups[idx]->lock); if (!edgetpu_group_mailbox_detached_locked(groups[idx])) { sscd_seg_arr[num_queues].addr = - (void *)groups[idx]->vii.cmd_queue_mem.vaddr; + (void *)groups[idx]->vii.cmd_queue_mem.vaddr; sscd_seg_arr[num_queues].size = groups[idx]->vii.cmd_queue_mem.size; sscd_seg_arr[num_queues].paddr = - (void *)groups[idx]->vii.cmd_queue_mem.tpu_addr; + (void *)groups[idx]->vii.cmd_queue_mem.tpu_addr; sscd_seg_arr[num_queues].vaddr = - (void *)groups[idx]->vii.cmd_queue_mem.vaddr; + (void *)groups[idx]->vii.cmd_queue_mem.vaddr; num_queues++; sscd_seg_arr[num_queues].addr = - (void *)groups[idx]->vii.resp_queue_mem.vaddr; + (void *)groups[idx]->vii.resp_queue_mem.vaddr; sscd_seg_arr[num_queues].size = groups[idx]->vii.resp_queue_mem.size; sscd_seg_arr[num_queues].paddr = - (void *)groups[idx]->vii.resp_queue_mem.tpu_addr; + (void *)groups[idx]->vii.resp_queue_mem.tpu_addr; sscd_seg_arr[num_queues].vaddr = - (void *)groups[idx]->vii.resp_queue_mem.vaddr; + (void *)groups[idx]->vii.resp_queue_mem.vaddr; num_queues++; } mutex_unlock(&groups[idx]->lock); } - // Collect KCI cmd and resp queues + /* Collect KCI cmd and resp queues */ kci = etdev->kci; sscd_seg_arr[num_queues].addr = (void *)kci->cmd_queue_mem.vaddr; sscd_seg_arr[num_queues].size = MAX_QUEUE_SIZE * sizeof(struct edgetpu_command_element); @@ -122,11 +150,195 @@ size_t mobile_sscd_collect_cmd_resp_queues(struct edgetpu_dev *etdev, num_queues++; sscd_seg_arr[num_queues].addr = (void *)kci->resp_queue_mem.vaddr; - sscd_seg_arr[num_queues].size = MAX_QUEUE_SIZE * - sizeof(struct edgetpu_kci_response_element); + sscd_seg_arr[num_queues].size = + MAX_QUEUE_SIZE * sizeof(struct edgetpu_kci_response_element); sscd_seg_arr[num_queues].paddr = (void *)kci->resp_queue_mem.tpu_addr; sscd_seg_arr[num_queues].vaddr = (void *)kci->resp_queue_mem.vaddr; num_queues++; return num_queues; } + +static int mobile_sscd_generate_coredump(void *p_etdev, void *p_dump_setup) +{ + struct edgetpu_dev *etdev; + struct edgetpu_debug_dump_setup *dump_setup; + struct edgetpu_mobile_platform_dev *pdev; + struct sscd_platform_data *pdata; + struct platform_device *sscd_dev; + struct sscd_segment *segs; + struct edgetpu_debug_dump *debug_dump; + struct edgetpu_crash_reason *crash_reason; + struct edgetpu_dump_segment *dump_seg; + struct edgetpu_device_group *group; + struct edgetpu_device_group **groups; + struct edgetpu_list_group *g; + struct mobile_sscd_mappings_dump *mappings_dump = NULL; + char crash_info[128]; + int sscd_dump_segments_num; + int i, ret; + size_t num_groups = 0, num_queues = 0; + u64 offset; + + if (!p_etdev || !p_dump_setup) + return -EINVAL; + + etdev = (struct edgetpu_dev *)p_etdev; + dump_setup = (struct edgetpu_debug_dump_setup *)p_dump_setup; + pdev = to_mobile_dev(etdev); + pdata = (struct sscd_platform_data *)pdev->sscd_info.pdata; + sscd_dev = (struct platform_device *)pdev->sscd_info.dev; + if (!pdata->sscd_report) { + etdev_err(etdev, "failed to generate coredump"); + return -ENOENT; + } + + debug_dump = (struct edgetpu_debug_dump *)(dump_setup + 1); + + /* Populate crash reason */ + crash_reason = + (struct edgetpu_crash_reason *)((u8 *)dump_setup + debug_dump->crash_reason_offset); + scnprintf(crash_info, sizeof(crash_info), "[edgetpu_coredump] error code: %#llx", + crash_reason->code); + + mutex_lock(&etdev->groups_lock); + groups = kmalloc_array(etdev->n_groups, sizeof(*groups), GFP_KERNEL); + if (!groups) { + mutex_unlock(&etdev->groups_lock); + return -ENOMEM; + } + + etdev_for_each_group(etdev, g, group) { + if (edgetpu_device_group_is_disbanded(group)) + continue; + groups[num_groups++] = edgetpu_device_group_get(group); + } + mutex_unlock(&etdev->groups_lock); + + /* Allocate memory for dump segments */ + sscd_dump_segments_num = debug_dump->dump_segments_num; + sscd_dump_segments_num += 2 * num_groups; /* VII cmd and resp queues */ + sscd_dump_segments_num += num_groups ? 1 : 0; /* Mappings info */ + sscd_dump_segments_num += 2; /* KCI cmd and resp queues */ + + segs = kmalloc_array(sscd_dump_segments_num, sizeof(struct sscd_segment), GFP_KERNEL); + if (!segs) { + ret = -ENOMEM; + goto out_sscd_generate_coredump; + } + + /* Populate sscd segments */ + dump_seg = (struct edgetpu_dump_segment *)((u8 *)dump_setup + + debug_dump->dump_segments_offset); + offset = debug_dump->dump_segments_offset; + for (i = 0; i < debug_dump->dump_segments_num; i++) { + segs[i].addr = dump_seg; + segs[i].size = sizeof(struct edgetpu_dump_segment) + dump_seg->size; + segs[i].paddr = (void *)(etdev->debug_dump_mem.tpu_addr + offset); + segs[i].vaddr = (void *)(etdev->debug_dump_mem.vaddr + offset); + offset += sizeof(struct edgetpu_dump_segment) + dump_seg->size; + dump_seg = (struct edgetpu_dump_segment *)((u8 *)dump_setup + + ALIGN(offset, sizeof(uint64_t))); + } + + if (num_groups) { + mappings_dump = mobile_sscd_collect_mappings_segment(groups, num_groups, &segs[i]); + if (!mappings_dump) { + ret = -ENOMEM; + goto out_sscd_generate_coredump; + } + i++; + } + + num_queues = mobile_sscd_collect_cmd_resp_queues(etdev, groups, num_groups, &segs[i]); + + /* + * Adjust num of segments as some groups may have a detached mailbox. + * Subtract number of VII and KCI queues according to num_groups. + */ + sscd_dump_segments_num -= (2 * num_groups + 2); + sscd_dump_segments_num += num_queues; /* Add actual number of valid VII and KCI queues */ + + /* Pass dump data to SSCD daemon */ + etdev_dbg(etdev, "report: %d segments", sscd_dump_segments_num); + ret = pdata->sscd_report(sscd_dev, segs, sscd_dump_segments_num, SSCD_FLAGS_ELFARM64HDR, + crash_info); +out_sscd_generate_coredump: + for (i = 0; i < num_groups; i++) + edgetpu_device_group_put(groups[i]); + kfree(mappings_dump); + kfree(segs); + kfree(groups); + + return ret; +} + +int edgetpu_debug_dump_init(struct edgetpu_dev *etdev) +{ + size_t size; + int ret; + struct edgetpu_debug_dump_setup *dump_setup; + struct edgetpu_mobile_platform_dev *pdev; + + pdev = to_mobile_dev(etdev); + + size = EDGETPU_DEBUG_DUMP_MEM_SIZE; + + sscd_dev = (struct platform_device) { + .name = DRIVER_NAME, + .driver_override = SSCD_NAME, + .id = PLATFORM_DEVID_NONE, + .dev = { + .platform_data = &sscd_pdata, + .release = sscd_release, + }, + }; + /* Register SSCD platform device */ + ret = platform_device_register(&sscd_dev); + if (ret) { + etdev_err(etdev, "SSCD platform device registration failed: %d", ret); + return ret; + } + /* + * Allocate a buffer for various dump segments + */ + ret = edgetpu_alloc_coherent(etdev, size, &etdev->debug_dump_mem, EDGETPU_CONTEXT_KCI); + if (ret) { + etdev_err(etdev, "Debug dump seg alloc failed"); + etdev->debug_dump_mem.vaddr = NULL; + goto out_unregister_platform; + } + dump_setup = (struct edgetpu_debug_dump_setup *)etdev->debug_dump_mem.vaddr; + memset(dump_setup, 0, size); + dump_setup->dump_mem_size = size; + + /* + * Allocate memory for debug dump handlers + */ + etdev->debug_dump_handlers = + kcalloc(DUMP_REASON_NUM, sizeof(*etdev->debug_dump_handlers), GFP_KERNEL); + if (!etdev->debug_dump_handlers) + return -ENOMEM; + etdev->debug_dump_handlers[DUMP_REASON_REQ_BY_USER] = mobile_sscd_generate_coredump; + + pdev->sscd_info.pdata = &sscd_pdata; + pdev->sscd_info.dev = &sscd_dev; + return ret; +out_unregister_platform: + platform_device_unregister(&sscd_dev); + return ret; +} + +void edgetpu_debug_dump_exit(struct edgetpu_dev *etdev) +{ + if (!etdev->debug_dump_mem.vaddr) { + etdev_dbg(etdev, "Debug dump not allocated"); + return; + } + /* + * Free the memory assigned for debug dump + */ + edgetpu_free_coherent(etdev, &etdev->debug_dump_mem, EDGETPU_CONTEXT_KCI); + kfree(etdev->debug_dump_handlers); + platform_device_unregister(&sscd_dev); +} diff --git a/drivers/edgetpu/mobile-debug-dump.h b/drivers/edgetpu/mobile-debug-dump.h index 0a9aef9..f433a99 100644 --- a/drivers/edgetpu/mobile-debug-dump.h +++ b/drivers/edgetpu/mobile-debug-dump.h @@ -23,30 +23,4 @@ struct mobile_sscd_mappings_dump { u64 size; }; -struct sscd_segment; - -/* - * Collects the mapping information of all the host mapping and dmabuf mapping buffers of all - * @groups as an array of struct mobile_sscd_mappings_dump and populates the @sscd_seg. - * - * Returns the pointer to the first element of the mappings dump array. The allocated array should - * be freed by the caller after the sscd segment is reported. - * Returns NULL in case of failure. - */ -struct mobile_sscd_mappings_dump * -mobile_sscd_collect_mappings_segment(struct edgetpu_device_group **groups, size_t num_groups, - struct sscd_segment *sscd_seg); - -/* - * Collects the VII cmd and resp queues of all @groups that @etdev belongs to and the KCI cmd and - * resp queues and populates them as @sscd_seg_arr elements. - * - * Returns the total number of queues collected since some queues may have been released for groups - * with detached mailboxes. The return value is less than or equal to the total number of queues - * expected based on @num_groups i.e. (2 * @num_groups +2). - */ -size_t mobile_sscd_collect_cmd_resp_queues(struct edgetpu_dev *etdev, - struct edgetpu_device_group **groups, size_t num_groups, - struct sscd_segment *sscd_seg_arr); - #endif /* MOBILE_DEBUG_DUMP_H_ */ |