author     Rivaldo Hong <rivaldo.hong@arm.com>       2023-02-14 14:39:34 +0900
committer  Guus Sliepen <gsliepen@google.com>        2023-08-25 08:59:18 +0000
commit     662a6ec447752444340d113d6c4d5256930fb894 (patch)
tree       c4ca55f16ac4d987dc4edae0aa955db9c956a43f
parent     a0a0021e080c185aaff8b7173604de78e30bf65b (diff)
download   gpu-662a6ec447752444340d113d6c4d5256930fb894.tar.gz
[Official] MIDCET-4820,GPUCORE-36255 Sync whole USER_BUFFER pages upon GPU mapping
As a result of this change, the whole of each page underlying a memory region
imported as a USER_BUFFER region is now synchronized, and the driver disables
automatic CPU cache synchronization upon DMA map/unmap operations.

(cherry picked from commit 0c2790fb2b006edb3e648a8a9a732e3aadcf2436)

Provenance: https://code.ipdelivery.arm.com/c/GPU/mali-ddk/+/5829
Bug: 295897369
Change-Id: Ifbd656b78e49ae614ca87e156321d0ccd6dd8887
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_kcpu.c |   7
-rw-r--r--  mali_kbase/mali_kbase_mem.c          | 155
-rw-r--r--  mali_kbase/mali_kbase_mem_linux.c    |  77
3 files changed, 181 insertions(+), 58 deletions(-)
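
For orientation before reading the hunks: on the map side, the change maps each whole page with automatic CPU cache maintenance skipped and then cleans the CPU cache for the full page explicitly. A minimal sketch of that pattern, using the standard Linux DMA API and a hypothetical helper name (my_map_and_clean_page, not part of the driver), might look like this:

#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <linux/version.h>

/* Hypothetical helper (illustration only): map one pinned user page for GPU
 * access with automatic CPU cache maintenance skipped, then explicitly commit
 * any dirty CPU cache lines for the whole page so the GPU observes the
 * correct initial memory content.
 */
static int my_map_and_clean_page(struct device *dev, struct page *page,
                                 dma_addr_t *out_dma_addr)
{
        dma_addr_t dma_addr;

#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
        /* dma_map_page_attrs() is only available from 4.10 onwards. */
        dma_addr = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
#else
        dma_addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL,
                                      DMA_ATTR_SKIP_CPU_SYNC);
#endif
        if (dma_mapping_error(dev, dma_addr))
                return -ENOMEM;

        /* Manual CPU cache clean of the whole page. */
        dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);

        *out_dma_addr = dma_addr;
        return 0;
}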
diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.c b/mali_kbase/csf/mali_kbase_csf_kcpu.c
index dc66f62..1cb93b4 100644
--- a/mali_kbase/csf/mali_kbase_csf_kcpu.c
+++ b/mali_kbase/csf/mali_kbase_csf_kcpu.c
@@ -80,7 +80,14 @@ static int kbase_kcpu_map_import_prepare(
* on the physical pages tracking object. When the last
* reference to the tracking object is dropped the pages
* would be unpinned if they weren't unpinned before.
+ *
+ * Region should be CPU cached: abort if it isn't.
*/
+ if (WARN_ON(!(reg->flags & KBASE_REG_CPU_CACHED))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = kbase_jd_user_buf_pin_pages(kctx, reg);
if (ret)
goto out;
diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c
index 7c09772..0d00b14 100644
--- a/mali_kbase/mali_kbase_mem.c
+++ b/mali_kbase/mali_kbase_mem.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -2059,6 +2059,7 @@ void kbase_sync_single(struct kbase_context *kctx,
src = ((unsigned char *)kmap(gpu_page)) + offset;
dst = ((unsigned char *)kmap(cpu_page)) + offset;
}
+
memcpy(dst, src, size);
kunmap(gpu_page);
kunmap(cpu_page);
@@ -4940,10 +4941,7 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
struct page **pages;
struct tagged_addr *pa;
long i, dma_mapped_pages;
- unsigned long address;
struct device *dev;
- unsigned long offset_within_page;
- unsigned long remaining_size;
unsigned long gwt_mask = ~0;
/* Calls to this function are inherently asynchronous, with respect to
* MMU operations.
@@ -4959,20 +4957,34 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
alloc = reg->gpu_alloc;
pa = kbase_get_gpu_phy_pages(reg);
- address = alloc->imported.user_buf.address;
pinned_pages = alloc->nents;
pages = alloc->imported.user_buf.pages;
dev = kctx->kbdev->dev;
- offset_within_page = address & ~PAGE_MASK;
- remaining_size = alloc->imported.user_buf.size;
+ /* Manual CPU cache synchronization.
+ *
+ * The driver disables automatic CPU cache synchronization because the
+ * memory pages that enclose the imported region may also contain
+ * sub-regions which are not imported and that are allocated and used
+ * by the user process. This may be the case for memory at the beginning
+ * of the first page and at the end of the last page. Automatic CPU cache
+ * synchronization would force some operations on those memory allocations,
+ * unbeknown to the user process: in particular, a CPU cache invalidate
+ * upon unmapping would destroy the content of dirty CPU caches and cause
+ * the user process to lose CPU writes to the non-imported sub-regions.
+ *
+ * When the GPU claims ownership of the imported memory buffer, it shall
+ * commit CPU writes for the whole of all pages that enclose the imported
+ * region, otherwise the initial content of memory would be wrong.
+ */
for (i = 0; i < pinned_pages; i++) {
- unsigned long map_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
- dma_addr_t dma_addr = dma_map_page(dev, pages[i],
- offset_within_page, map_size,
- DMA_BIDIRECTIONAL);
-
+ dma_addr_t dma_addr;
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+ dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+ dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+#endif
err = dma_mapping_error(dev, dma_addr);
if (err)
goto unwind;
@@ -4980,8 +4992,7 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
alloc->imported.user_buf.dma_addrs[i] = dma_addr;
pa[i] = as_tagged(page_to_phys(pages[i]));
- remaining_size -= map_size;
- offset_within_page = 0;
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
}
#ifdef CONFIG_MALI_CINSTR_GWT
@@ -4999,19 +5010,26 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx,
/* fall down */
unwind:
alloc->nents = 0;
- offset_within_page = address & ~PAGE_MASK;
- remaining_size = alloc->imported.user_buf.size;
dma_mapped_pages = i;
- /* Run the unmap loop in the same order as map loop */
+ /* Run the unmap loop in the same order as the map loop, and perform
+ * CPU cache synchronization again to re-write the content of dirty CPU caches
+ * to memory. This is a precautionary measure in case a GPU job has taken
+ * advantage of a partially GPU-mapped range to write and corrupt the
+ * content of memory, either inside or outside the imported region.
+ *
+ * Notice that this error recovery path doesn't try to be optimal and just
+ * flushes the entire page range.
+ */
for (i = 0; i < dma_mapped_pages; i++) {
- unsigned long unmap_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
+ dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
- dma_unmap_page(kctx->kbdev->dev,
- alloc->imported.user_buf.dma_addrs[i],
- unmap_size, DMA_BIDIRECTIONAL);
- remaining_size -= unmap_size;
- offset_within_page = 0;
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+ dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+ dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+#endif
}
/* The user buffer could already have been previously pinned before
@@ -5052,12 +5070,89 @@ static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem
#endif
for (i = 0; i < alloc->imported.user_buf.nr_pages; i++) {
- unsigned long unmap_size =
- MIN(remaining_size, PAGE_SIZE - offset_within_page);
+ unsigned long imported_size = MIN(remaining_size, PAGE_SIZE - offset_within_page);
+ /* Notice: this is a temporary variable that is used for DMA sync
+ * operations, and that could be incremented by an offset if the
+ * current page contains both imported and non-imported memory
+ * sub-regions.
+ *
+ * It is valid to add an offset to this value, because the offset
+ * is always kept within the physically contiguous dma-mapped range
+ * and there's no need to translate to physical address to offset it.
+ *
+ * This variable is not going to be used for the actual DMA unmap
+ * operation, which shall always use the original DMA address of the
+ * whole memory page.
+ */
dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
- dma_unmap_page(kctx->kbdev->dev, dma_addr, unmap_size,
- DMA_BIDIRECTIONAL);
+ /* Manual CPU cache synchronization.
+ *
+ * When the GPU returns ownership of the buffer to the CPU, the driver
+ * needs to treat imported and non-imported memory differently.
+ *
+ * The first case to consider is non-imported sub-regions at the
+ * beginning of the first page and at the end of last page. For these
+ * sub-regions: CPU cache shall be committed with a clean+invalidate,
+ * in order to keep the last CPU write.
+ *
+ * Imported region prefers the opposite treatment: this memory has been
+ * legitimately mapped and used by the GPU, hence GPU writes shall be
+ * committed to memory, while CPU cache shall be invalidated to make
+ * sure that CPU reads the correct memory content.
+ *
+ * The following diagram shows the expected value of the variables
+ * used in this loop in the corner case of an imported region enclosed
+ * by a single memory page:
+ *
+ * page boundary ->|-----------| <- dma_addr (initial value)
+ * | |
+ * | - - - - - | <- offset_within_page
+ * |XXXXXXXXXXX|\
+ * |XXXXXXXXXXX| \
+ * |XXXXXXXXXXX| }- imported_size
+ * |XXXXXXXXXXX| /
+ * |XXXXXXXXXXX|/
+ * | - - - - - | <- offset_within_page + imported_size
+ * | |\
+ * | | }- PAGE_SIZE - imported_size - offset_within_page
+ * | |/
+ * page boundary ->|-----------|
+ *
+ * If the imported region is enclosed by more than one page, then
+ * offset_within_page = 0 for any page after the first.
+ */
+
+ /* Only for first page: handle non-imported range at the beginning. */
+ if (offset_within_page > 0) {
+ dma_sync_single_for_device(kctx->kbdev->dev, dma_addr, offset_within_page,
+ DMA_BIDIRECTIONAL);
+ dma_addr += offset_within_page;
+ }
+
+ /* For every page: handle imported range. */
+ if (imported_size > 0)
+ dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, imported_size,
+ DMA_BIDIRECTIONAL);
+
+ /* Only for last page (that may coincide with first page):
+ * handle non-imported range at the end.
+ */
+ if ((imported_size + offset_within_page) < PAGE_SIZE) {
+ dma_addr += imported_size;
+ dma_sync_single_for_device(kctx->kbdev->dev, dma_addr,
+ PAGE_SIZE - imported_size - offset_within_page,
+ DMA_BIDIRECTIONAL);
+ }
+
+ /* Notice: use the original DMA address to unmap the whole memory page. */
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+ dma_unmap_page(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i], PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+#else
+ dma_unmap_page_attrs(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i],
+ PAGE_SIZE, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
if (writeable)
set_page_dirty_lock(pages[i]);
#if !MALI_USE_CSF
@@ -5065,7 +5160,7 @@ static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem
pages[i] = NULL;
#endif
- remaining_size -= unmap_size;
+ remaining_size -= imported_size;
offset_within_page = 0;
}
#if !MALI_USE_CSF
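
The unmap-side handling that the comment and diagram above describe can be condensed into a similar sketch. The helper name my_sync_and_unmap_page is hypothetical and the code is illustrative only, not code from the driver: it assumes offset is non-zero solely for the first page of the import and that imported_size counts the bytes of this page covered by the imported region.

#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <linux/version.h>

/* Hypothetical helper (illustration only): on unmap, clean the non-imported
 * head and tail of the page so CPU writes survive, sync the imported middle
 * for the CPU so it observes GPU writes, then unmap the whole page by its
 * original DMA address with automatic cache maintenance skipped.
 */
static void my_sync_and_unmap_page(struct device *dev, dma_addr_t page_dma_addr,
                                   unsigned long offset, unsigned long imported_size)
{
        dma_addr_t dma_addr = page_dma_addr;

        /* Non-imported bytes before the imported range (first page only). */
        if (offset > 0) {
                dma_sync_single_for_device(dev, dma_addr, offset, DMA_BIDIRECTIONAL);
                dma_addr += offset;
        }

        /* Imported bytes: hand ownership back to the CPU. */
        if (imported_size > 0)
                dma_sync_single_for_cpu(dev, dma_addr, imported_size, DMA_BIDIRECTIONAL);

        /* Non-imported bytes after the imported range (last page only). */
        if (offset + imported_size < PAGE_SIZE)
                dma_sync_single_for_device(dev, dma_addr + imported_size,
                                           PAGE_SIZE - offset - imported_size,
                                           DMA_BIDIRECTIONAL);

        /* Always unmap the whole page, using its original DMA address. */
#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
        dma_unmap_page(dev, page_dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
#else
        dma_unmap_page_attrs(dev, page_dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
                             DMA_ATTR_SKIP_CPU_SYNC);
#endif
}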
diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c
index ffa0ebc..5289532 100644
--- a/mali_kbase/mali_kbase_mem_linux.c
+++ b/mali_kbase/mali_kbase_mem_linux.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -36,7 +36,7 @@
#include <linux/cache.h>
#include <linux/memory_group_manager.h>
#include <linux/math64.h>
-
+#include <linux/version.h>
#include <mali_kbase.h>
#include <mali_kbase_mem_linux.h>
#include <tl/mali_kbase_tracepoints.h>
@@ -1529,10 +1529,10 @@ static struct kbase_va_region *kbase_mem_from_user_buffer(
int zone = KBASE_REG_ZONE_CUSTOM_VA;
bool shared_zone = false;
u32 cache_line_alignment = kbase_get_cache_line_alignment(kctx->kbdev);
- unsigned long offset_within_page;
- unsigned long remaining_size;
struct kbase_alloc_import_user_buf *user_buf;
struct page **pages = NULL;
+ struct tagged_addr *pa;
+ struct device *dev;
int write;
/* Flag supported only for dma-buf imported memory */
@@ -1680,29 +1680,44 @@ static struct kbase_va_region *kbase_mem_from_user_buffer(
reg->gpu_alloc->nents = 0;
reg->extension = 0;
- if (pages) {
- struct device *dev = kctx->kbdev->dev;
- struct tagged_addr *pa = kbase_get_gpu_phy_pages(reg);
+ pa = kbase_get_gpu_phy_pages(reg);
+ dev = kctx->kbdev->dev;
+ if (pages) {
/* Top bit signifies that this was pinned on import */
user_buf->current_mapping_usage_count |= PINNED_ON_IMPORT;
- offset_within_page = user_buf->address & ~PAGE_MASK;
- remaining_size = user_buf->size;
+ /* Manual CPU cache synchronization.
+ *
+ * The driver disables automatic CPU cache synchronization because the
+ * memory pages that enclose the imported region may also contain
+ * sub-regions which are not imported and that are allocated and used
+ * by the user process. This may be the case for memory at the beginning
+ * of the first page and at the end of the last page. Automatic CPU cache
+ * synchronization would force some operations on those memory allocations,
+ * unbeknown to the user process: in particular, a CPU cache invalidate
+ * upon unmapping would destroy the content of dirty CPU caches and cause
+ * the user process to lose CPU writes to the non-imported sub-regions.
+ *
+ * When the GPU claims ownership of the imported memory buffer, it shall
+ * commit CPU writes for the whole of all pages that enclose the imported
+ * region, otherwise the initial content of memory would be wrong.
+ */
for (i = 0; i < faulted_pages; i++) {
- unsigned long map_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
- dma_addr_t dma_addr = dma_map_page(dev, pages[i],
- offset_within_page, map_size, DMA_BIDIRECTIONAL);
-
+ dma_addr_t dma_addr;
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+ dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+ dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
if (dma_mapping_error(dev, dma_addr))
goto unwind_dma_map;
user_buf->dma_addrs[i] = dma_addr;
pa[i] = as_tagged(page_to_phys(pages[i]));
- remaining_size -= map_size;
- offset_within_page = 0;
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
}
reg->gpu_alloc->nents = faulted_pages;
@@ -1711,19 +1726,23 @@ static struct kbase_va_region *kbase_mem_from_user_buffer(
return reg;
unwind_dma_map:
- offset_within_page = user_buf->address & ~PAGE_MASK;
- remaining_size = user_buf->size;
dma_mapped_pages = i;
- /* Run the unmap loop in the same order as map loop */
+ /* Run the unmap loop in the same order as the map loop, and perform
+ * CPU cache synchronization again to re-write the content of dirty CPU caches
+ * to memory. This precautionary measure is retained to keep this code
+ * aligned with kbase_jd_user_buf_map() to allow for a potential refactor
+ * in the future.
+ */
for (i = 0; i < dma_mapped_pages; i++) {
- unsigned long unmap_size =
- MIN(PAGE_SIZE - offset_within_page, remaining_size);
+ dma_addr_t dma_addr = user_buf->dma_addrs[i];
- dma_unmap_page(kctx->kbdev->dev,
- user_buf->dma_addrs[i],
- unmap_size, DMA_BIDIRECTIONAL);
- remaining_size -= unmap_size;
- offset_within_page = 0;
+ dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+ dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+ dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+#endif
}
fault_mismatch:
if (pages) {
@@ -1743,7 +1762,6 @@ no_alloc_obj:
no_region:
bad_size:
return NULL;
-
}
@@ -2026,7 +2044,10 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type,
/* Remove COHERENT_SYSTEM flag if coherent mem is unavailable */
*flags &= ~BASE_MEM_COHERENT_SYSTEM;
}
-
+ if (((*flags & BASE_MEM_CACHED_CPU) == 0) && (type == BASE_MEM_IMPORT_TYPE_USER_BUFFER)) {
+ dev_warn(kctx->kbdev->dev, "USER_BUFFER must be CPU cached");
+ goto bad_flags;
+ }
if ((padding != 0) && (type != BASE_MEM_IMPORT_TYPE_UMM)) {
dev_warn(kctx->kbdev->dev,
"padding is only supported for UMM");