path: root/mali_kbase/csf/mali_kbase_csf_tiler_heap.c
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
 *
 * (C) COPYRIGHT 2019-2023 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation, and any use by you of this program is subject to the terms
 * of such GNU license.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 */

#include <tl/mali_kbase_tracepoints.h>

#include "mali_kbase_csf_tiler_heap.h"
#include "mali_kbase_csf_tiler_heap_def.h"
#include "mali_kbase_csf_heap_context_alloc.h"

/* Tiler heap shrink stop limit: reclaim stops once a heap is down to this many chunks */
#define HEAP_SHRINK_STOP_LIMIT (1)

/**
 * struct kbase_csf_gpu_buffer_heap - A gpu buffer object specific to tiler heap
 *
 * @cdsbp_0:       Descriptor_type and buffer_type
 * @size:          The size of the current heap chunk
 * @pointer:       Pointer to the current heap chunk
 * @low_pointer:   Pointer to low end of current heap chunk
 * @high_pointer:  Pointer to high end of current heap chunk
 */
struct kbase_csf_gpu_buffer_heap {
	u32 cdsbp_0;
	u32 size;
	u64 pointer;
	u64 low_pointer;
	u64 high_pointer;
} __packed;
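
/* Note: __packed ensures the CPU-side view of this structure contains no
 * compiler-inserted padding, so it matches byte-for-byte the in-memory
 * buffer descriptor layout shared with userspace and the GPU.
 */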

/**
 * encode_chunk_ptr - Encode the address and size of a chunk as an integer.
 *
 * @chunk_size: Size of a tiler heap chunk, in bytes.
 * @chunk_addr: GPU virtual address of the same tiler heap chunk.
 *
 * The size and address of the next chunk in a list are packed into a single
 * 64-bit value for storage in a chunk's header. This function returns that
 * value.
 *
 * Return: Next chunk pointer suitable for writing into a chunk header.
 */
static u64 encode_chunk_ptr(u32 const chunk_size, u64 const chunk_addr)
{
	u64 encoded_size, encoded_addr;

	WARN_ON(chunk_size & ~CHUNK_SIZE_MASK);
	WARN_ON(chunk_addr & ~CHUNK_ADDR_MASK);

	encoded_size =
		(u64)(chunk_size >> CHUNK_HDR_NEXT_SIZE_ENCODE_SHIFT) <<
		CHUNK_HDR_NEXT_SIZE_POS;

	encoded_addr =
		(chunk_addr >> CHUNK_HDR_NEXT_ADDR_ENCODE_SHIFT) <<
		CHUNK_HDR_NEXT_ADDR_POS;

	return (encoded_size & CHUNK_HDR_NEXT_SIZE_MASK) |
		(encoded_addr & CHUNK_HDR_NEXT_ADDR_MASK);
}
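
/* Decoding a chunk header is effectively the inverse of encode_chunk_ptr():
 * the reclaim scan paths below recover the next chunk's GPU address with
 *
 *   chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK;
 *
 * (see delete_hoarded_chunks() and delete_unused_chunk_pages()).
 */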

/**
 * get_last_chunk - Get the last chunk of a tiler heap
 *
 * @heap:  Pointer to the tiler heap.
 *
 * Return: The address of the most recently-linked chunk, or NULL if none.
 */
static struct kbase_csf_tiler_heap_chunk *get_last_chunk(
	struct kbase_csf_tiler_heap *const heap)
{
	if (list_empty(&heap->chunks_list))
		return NULL;

	return list_last_entry(&heap->chunks_list,
		struct kbase_csf_tiler_heap_chunk, link);
}

/**
 * remove_external_chunk_mappings - Remove external mappings from a chunk that
 *                                  is being transitioned to the tiler heap
 *                                  memory system.
 *
 * @kctx:  kbase context the chunk belongs to.
 * @chunk: The chunk whose external mappings are going to be removed.
 *
 * This function marks the region as DONT NEED. Along with NO_USER_FREE, this indicates
 * that the VA region is owned by the tiler heap and could potentially be shrunk at any time. Other
 * parts of kbase outside of tiler heap management should not take references on its physical
 * pages, and should not modify them.
 */
static void remove_external_chunk_mappings(struct kbase_context *const kctx,
					   struct kbase_csf_tiler_heap_chunk *chunk)
{
	lockdep_assert_held(&kctx->reg_lock);

	if (chunk->region->cpu_alloc != NULL) {
		kbase_mem_shrink_cpu_mapping(kctx, chunk->region, 0,
					     chunk->region->cpu_alloc->nents);
	}
#if !defined(CONFIG_MALI_VECTOR_DUMP)
	chunk->region->flags |= KBASE_REG_DONT_NEED;
#endif

	dev_dbg(kctx->kbdev->dev, "Removed external mappings from chunk 0x%llX", chunk->gpu_va);
}

/**
 * link_chunk - Link a chunk into a tiler heap
 *
 * @heap:  Pointer to the tiler heap.
 * @chunk: Pointer to the heap chunk to be linked.
 *
 * Unless the @chunk is the first in the kernel's list of chunks belonging to
 * a given tiler heap, this function stores the size and address of the @chunk
 * a given tiler heap, this function stores the size and address of the @chunk
 * in the header of the preceding chunk, via that chunk's permanent kernel
 * mapping.
 *
 * Return: 0 if successful or a negative error code on failure.
 */
static int link_chunk(struct kbase_csf_tiler_heap *const heap,
	struct kbase_csf_tiler_heap_chunk *const chunk)
{
	struct kbase_csf_tiler_heap_chunk *const prev = get_last_chunk(heap);

	if (prev) {
		struct kbase_context *const kctx = heap->kctx;
		u64 *prev_hdr = prev->map.addr;

		WARN((prev->region->flags & KBASE_REG_CPU_CACHED),
		     "Cannot support CPU cached chunks without sync operations");

		*prev_hdr = encode_chunk_ptr(heap->chunk_size, chunk->gpu_va);

		dev_dbg(kctx->kbdev->dev,
			"Linked tiler heap chunks, 0x%llX -> 0x%llX\n",
			prev->gpu_va, chunk->gpu_va);
	}

	return 0;
}

/**
 * init_chunk - Initialize and link a tiler heap chunk
 *
 * @heap:  Pointer to the tiler heap.
 * @chunk: Pointer to the heap chunk to be initialized and linked.
 * @link_with_prev: Flag to indicate if the new chunk needs to be linked with
 *                  the previously allocated chunk.
 *
 * Zero-initialize a new chunk's header (including its pointer to the next
 * chunk, which doesn't exist yet) and then update the previous chunk's
 * header to link the new chunk into the chunk list.
 *
 * Return: 0 if successful or a negative error code on failure.
 */
static int init_chunk(struct kbase_csf_tiler_heap *const heap,
	struct kbase_csf_tiler_heap_chunk *const chunk, bool link_with_prev)
{
	int err = 0;
	u64 *chunk_hdr;
	struct kbase_context *const kctx = heap->kctx;

	lockdep_assert_held(&kctx->csf.tiler_heaps.lock);

	if (unlikely(chunk->gpu_va & ~CHUNK_ADDR_MASK)) {
		dev_err(kctx->kbdev->dev,
			"Tiler heap chunk address is unusable\n");
		return -EINVAL;
	}

	WARN((chunk->region->flags & KBASE_REG_CPU_CACHED),
	     "Cannot support CPU cached chunks without sync operations");
	chunk_hdr = chunk->map.addr;
	if (WARN(chunk->map.size < CHUNK_HDR_SIZE,
		 "Tiler chunk kernel mapping was not large enough for zero-init")) {
		return -EINVAL;
	}

	memset(chunk_hdr, 0, CHUNK_HDR_SIZE);
	INIT_LIST_HEAD(&chunk->link);

	if (link_with_prev)
		err = link_chunk(heap, chunk);

	if (unlikely(err)) {
		dev_err(kctx->kbdev->dev, "Failed to link a chunk to a tiler heap\n");
		return -EINVAL;
	}

	list_add_tail(&chunk->link, &heap->chunks_list);
	heap->chunk_count++;

	return err;
}

/**
 * remove_unlinked_chunk - Remove a chunk that is not currently linked into a
 *                         heap.
 *
 * @kctx:  Kbase context that was used to allocate the memory.
 * @chunk: Chunk that has been allocated, but not linked into a heap.
 */
static void remove_unlinked_chunk(struct kbase_context *kctx,
				  struct kbase_csf_tiler_heap_chunk *chunk)
{
	if (WARN_ON(!list_empty(&chunk->link)))
		return;

	kbase_gpu_vm_lock_with_pmode_sync(kctx);
	kbase_vunmap(kctx, &chunk->map);
	/* KBASE_REG_DONT_NEED regions will be confused with ephemeral regions (inc freed JIT
	 * regions), and so we must clear that flag too before freeing.
	 * For "no user free count", we check that the count is 1 as it is a shrinkable region;
	 * no other code part within kbase can take a reference to it.
	 */
	WARN_ON(atomic_read(&chunk->region->no_user_free_count) > 1);
	kbase_va_region_no_user_free_dec(chunk->region);
#if !defined(CONFIG_MALI_VECTOR_DUMP)
	chunk->region->flags &= ~KBASE_REG_DONT_NEED;
#endif
	kbase_mem_free_region(kctx, chunk->region);
	kbase_gpu_vm_unlock_with_pmode_sync(kctx);

	kfree(chunk);
}

/**
 * alloc_new_chunk - Allocate new chunk metadata for the tiler heap, reserve a fully backed VA
 *                   region for the chunk, and provide a kernel mapping.
 * @kctx:       kbase context with which the chunk will be linked
 * @chunk_size: the size of the chunk from the corresponding heap
 *
 * Allocate the chunk tracking metadata and a corresponding fully backed VA region for the
 * chunk. The kernel may need to invoke the reclaim path while trying to fulfill the allocation, so
 * we cannot hold any lock that would be held in the shrinker paths (JIT evict lock or tiler heap
 * lock).
 *
 * Since the chunk may have its physical backing removed, to prevent use-after-free scenarios we
 * ensure that it is protected from being mapped by other parts of kbase.
 *
 * The chunk's GPU memory can be accessed via its 'map' member, but should only be accessed by the
 * shrinker path, as it may otherwise be shrunk at any time.
 *
 * Return: pointer to kbase_csf_tiler_heap_chunk on success or a NULL pointer
 *         on failure
 */
static struct kbase_csf_tiler_heap_chunk *alloc_new_chunk(struct kbase_context *kctx,
							  u64 chunk_size)
{
	u64 nr_pages = PFN_UP(chunk_size);
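	/* The region must be GPU-readable/writable for the tiler itself,
	 * CPU-accessible so the kernel can zero and update chunk headers via
	 * its mapping, and NO_USER_FREE so userspace cannot free it out from
	 * under the heap.
	 */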
	u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_CPU_WR |
		    BASEP_MEM_NO_USER_FREE | BASE_MEM_COHERENT_LOCAL | BASE_MEM_PROT_CPU_RD;
	struct kbase_csf_tiler_heap_chunk *chunk = NULL;
	/* The chunk kernel mapping needs to be large enough to:
	 * - initially zero the CHUNK_HDR_SIZE area
	 * - on shrinking, access the NEXT_CHUNK_ADDR_SIZE area
	 */
	const size_t chunk_kernel_map_size = max(CHUNK_HDR_SIZE, NEXT_CHUNK_ADDR_SIZE);

	/* Calls to this function are inherently synchronous, with respect to
	 * MMU operations.
	 */
	const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC;

	flags |= kbase_mem_group_id_set(kctx->jit_group_id);

	chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
	if (unlikely(!chunk)) {
		dev_err(kctx->kbdev->dev,
			"No kernel memory for a new tiler heap chunk\n");
		return NULL;
	}

	/* Allocate GPU memory for the new chunk. */
	chunk->region =
		kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, &chunk->gpu_va, mmu_sync_info);

	if (unlikely(!chunk->region)) {
		dev_err(kctx->kbdev->dev, "Failed to allocate a tiler heap chunk!\n");
		goto unroll_chunk;
	}

	kbase_gpu_vm_lock(kctx);

	/* Some checks are done here because NO_USER_FREE still allows such changes
	 * to be made whilst the region lock was dropped
	 */
	if (unlikely(atomic_read(&chunk->region->gpu_alloc->kernel_mappings) > 0)) {
		dev_err(kctx->kbdev->dev, "Chunk region has active kernel mappings!\n");
		goto unroll_region;
	}

	/* There is a race condition with regard to KBASE_REG_DONT_NEED, where another
	 * thread can have the "no user free" refcount increased between kbase_mem_alloc
	 * and kbase_gpu_vm_lock (above) and before KBASE_REG_DONT_NEED is set by
	 * remove_external_chunk_mappings (below).
	 *
	 * It should be fine and not a security risk if we let the region leak till
	 * region tracker termination in such a case.
	 */
	if (unlikely(atomic_read(&chunk->region->no_user_free_count) > 1)) {
		dev_err(kctx->kbdev->dev, "Chunk region has no_user_free_count > 1!\n");
		goto unroll_region;
	}

	/* Whilst we can be sure of a number of other restrictions due to BASEP_MEM_NO_USER_FREE
	 * being requested, it's useful to document in code what those restrictions are, and ensure
	 * they remain in place in future.
	 */
	if (WARN(!chunk->region->gpu_alloc,
		 "NO_USER_FREE chunks should not have had their alloc freed")) {
		goto unroll_region;
	}

	if (WARN(chunk->region->gpu_alloc->type != KBASE_MEM_TYPE_NATIVE,
		 "NO_USER_FREE chunks should not have been freed and then reallocated as imported/non-native regions")) {
		goto unroll_region;
	}

	if (WARN((chunk->region->flags & KBASE_REG_ACTIVE_JIT_ALLOC),
		 "NO_USER_FREE chunks should not have been freed and then reallocated as JIT regions")) {
		goto unroll_region;
	}

	if (WARN((chunk->region->flags & KBASE_REG_DONT_NEED),
		 "NO_USER_FREE chunks should not have been made ephemeral")) {
		goto unroll_region;
	}

	if (WARN(atomic_read(&chunk->region->cpu_alloc->gpu_mappings) > 1,
		 "NO_USER_FREE chunks should not have been aliased")) {
		goto unroll_region;
	}

	if (unlikely(!kbase_vmap_reg(kctx, chunk->region, chunk->gpu_va, chunk_kernel_map_size,
				     (KBASE_REG_CPU_RD | KBASE_REG_CPU_WR), &chunk->map,
				     KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING))) {
		dev_err(kctx->kbdev->dev, "Failed to map chunk header for shrinking!\n");
		goto unroll_region;
	}

	remove_external_chunk_mappings(kctx, chunk);
	kbase_gpu_vm_unlock(kctx);

	/* If page migration is enabled, we don't want to migrate tiler heap pages.
	 * This does not change if the constituent pages are already marked as isolated.
	 */
	if (kbase_is_page_migration_enabled())
		kbase_set_phy_alloc_page_status(chunk->region->gpu_alloc, NOT_MOVABLE);

	return chunk;

unroll_region:
	/* KBASE_REG_DONT_NEED regions will be confused with ephemeral regions (inc freed JIT
	 * regions), and so we must clear that flag too before freeing.
	 */
	kbase_va_region_no_user_free_dec(chunk->region);
#if !defined(CONFIG_MALI_VECTOR_DUMP)
	chunk->region->flags &= ~KBASE_REG_DONT_NEED;
#endif
	kbase_mem_free_region(kctx, chunk->region);
	kbase_gpu_vm_unlock(kctx);
unroll_chunk:
	kfree(chunk);
	return NULL;
}
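
/* Typical usage of alloc_new_chunk() (as in create_chunk() below): allocate
 * outside the tiler_heaps lock, then take the lock only to link the chunk:
 *
 *   chunk = alloc_new_chunk(heap->kctx, heap->chunk_size);
 *   mutex_lock(&heap->kctx->csf.tiler_heaps.lock);
 *   err = init_chunk(heap, chunk, true);
 *   mutex_unlock(&heap->kctx->csf.tiler_heaps.lock);
 *
 * This ordering exists because the allocation may enter the reclaim path,
 * which must not be entered whilst holding the tiler_heaps lock.
 */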

/**
 * create_chunk - Create a tiler heap chunk
 *
 * @heap: Pointer to the tiler heap for which to allocate memory.
 *
 * This function allocates a chunk of memory for a tiler heap and adds it to
 * the list of chunks associated with that heap, both on the host side and in
 * GPU memory.
 *
 * Return: 0 if successful or a negative error code on failure.
 */
static int create_chunk(struct kbase_csf_tiler_heap *const heap)
{
	int err = 0;
	struct kbase_csf_tiler_heap_chunk *chunk = NULL;

	chunk = alloc_new_chunk(heap->kctx, heap->chunk_size);
	if (unlikely(!chunk)) {
		err = -ENOMEM;
		goto allocation_failure;
	}

	mutex_lock(&heap->kctx->csf.tiler_heaps.lock);
	err = init_chunk(heap, chunk, true);
	mutex_unlock(&heap->kctx->csf.tiler_heaps.lock);

	if (unlikely(err))
		goto initialization_failure;

	dev_dbg(heap->kctx->kbdev->dev, "Created tiler heap chunk 0x%llX\n", chunk->gpu_va);

	return 0;
initialization_failure:
	remove_unlinked_chunk(heap->kctx, chunk);
allocation_failure:
	return err;
}

/**
 * delete_all_chunks - Delete all chunks belonging to an unlinked tiler heap
 *
 * @heap: Pointer to a tiler heap.
 *
 * This function empties the list of chunks associated with a tiler heap by freeing all chunks
 * previously allocated by @create_chunk.
 *
 * The heap must not be reachable from a &struct kbase_context.csf.tiler_heaps.list, as the
 * tiler_heaps lock cannot be held whilst deleting its chunks due to also needing the &struct
 * kbase_context.region_lock.
 *
 * WARNING: Whilst the deleted chunks are unlinked from host memory, they are not unlinked from the
 *          list of chunks used by the GPU, therefore it is only safe to use this function when
 *          deleting a heap.
 */
static void delete_all_chunks(struct kbase_csf_tiler_heap *heap)
{
	struct kbase_context *const kctx = heap->kctx;
	struct list_head *entry = NULL, *tmp = NULL;

	WARN(!list_empty(&heap->link),
	     "Deleting a heap's chunks when that heap is still linked requires the tiler_heaps lock, which cannot be held by the caller");

	list_for_each_safe(entry, tmp, &heap->chunks_list) {
		struct kbase_csf_tiler_heap_chunk *chunk = list_entry(
			entry, struct kbase_csf_tiler_heap_chunk, link);

		list_del_init(&chunk->link);
		heap->chunk_count--;

		remove_unlinked_chunk(kctx, chunk);
	}
}

/**
 * create_initial_chunks - Create the initial list of chunks for a tiler heap
 *
 * @heap:    Pointer to the tiler heap for which to allocate memory.
 * @nchunks: Number of chunks to create.
 *
 * This function allocates a given number of chunks for a tiler heap and
 * adds them to the list of chunks associated with that heap.
 *
 * Return: 0 if successful or a negative error code on failure.
 */
static int create_initial_chunks(struct kbase_csf_tiler_heap *const heap,
	u32 const nchunks)
{
	int err = 0;
	u32 i;

	for (i = 0; (i < nchunks) && likely(!err); i++)
		err = create_chunk(heap);

	if (unlikely(err))
		delete_all_chunks(heap);

	return err;
}

/**
 * delete_heap - Delete an unlinked tiler heap
 *
 * @heap: Pointer to a tiler heap to be deleted.
 *
 * This function frees any chunks allocated for a tiler heap previously
 * initialized by @kbase_csf_tiler_heap_init. The heap context structure used by
 * the firmware is also freed.
 *
 * The heap must not be reachable from a &struct kbase_context.csf.tiler_heaps.list, as the
 * tiler_heaps lock cannot be held whilst deleting it due to also needing the &struct
 * kbase_context.region_lock.
 */
static void delete_heap(struct kbase_csf_tiler_heap *heap)
{
	struct kbase_context *const kctx = heap->kctx;

	dev_dbg(kctx->kbdev->dev, "Deleting tiler heap 0x%llX\n", heap->gpu_va);

	WARN(!list_empty(&heap->link),
	     "Deleting a heap that is still linked requires the tiler_heaps lock, which cannot be held by the caller");

	/* Make sure that all of the VA regions corresponding to the chunks are
	 * freed at this time and that the work queue is not trying to access freed
	 * memory.
	 *
	 * Note: since the heap is unlinked and no references are made to chunks other
	 * than from their heap, there is no need to separately move the chunks out of the
	 * heap->chunks_list to delete them.
	 */
	delete_all_chunks(heap);

	kbase_vunmap(kctx, &heap->gpu_va_map);
	/* We could optimize context destruction by not freeing leaked heap
	 * contexts but it doesn't seem worth the extra complexity. After this
	 * point, the suballocation is returned to the heap context allocator and
	 * may be overwritten with new data, meaning heap->gpu_va should not
	 * be used past this point.
	 */
	kbase_csf_heap_context_allocator_free(&kctx->csf.tiler_heaps.ctx_alloc,
		heap->gpu_va);

	WARN_ON(heap->chunk_count);
	KBASE_TLSTREAM_AUX_TILER_HEAP_STATS(kctx->kbdev, kctx->id,
		heap->heap_id, 0, 0, heap->max_chunks, heap->chunk_size, 0,
		heap->target_in_flight, 0);

	if (heap->buf_desc_reg) {
		kbase_vunmap(kctx, &heap->buf_desc_map);
		kbase_gpu_vm_lock(kctx);
		kbase_va_region_no_user_free_dec(heap->buf_desc_reg);
		kbase_gpu_vm_unlock(kctx);
	}

	kfree(heap);
}

/**
 * find_tiler_heap - Find a tiler heap from the address of its heap context
 *
 * @kctx:        Pointer to the kbase context to search for a tiler heap.
 * @heap_gpu_va: GPU virtual address of a heap context structure.
 *
 * Each tiler heap managed by the kernel has an associated heap context
 * structure used by the firmware. This function finds a tiler heap object from
 * the GPU virtual address of its associated heap context. The heap context
 * should have been allocated by @kbase_csf_heap_context_allocator_alloc in the
 * same @kctx.
 *
 * Return: pointer to the tiler heap object, or NULL if not found.
 */
static struct kbase_csf_tiler_heap *find_tiler_heap(
	struct kbase_context *const kctx, u64 const heap_gpu_va)
{
	struct kbase_csf_tiler_heap *heap = NULL;

	lockdep_assert_held(&kctx->csf.tiler_heaps.lock);

	list_for_each_entry(heap, &kctx->csf.tiler_heaps.list, link) {
		if (heap_gpu_va == heap->gpu_va)
			return heap;
	}

	dev_dbg(kctx->kbdev->dev, "Tiler heap 0x%llX was not found\n",
		heap_gpu_va);

	return NULL;
}

static struct kbase_csf_tiler_heap_chunk *find_chunk(struct kbase_csf_tiler_heap *heap,
						     u64 const chunk_gpu_va)
{
	struct kbase_csf_tiler_heap_chunk *chunk = NULL;

	lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock);

	list_for_each_entry(chunk, &heap->chunks_list, link) {
		if (chunk->gpu_va == chunk_gpu_va)
			return chunk;
	}

	dev_dbg(heap->kctx->kbdev->dev, "Tiler heap chunk 0x%llX was not found\n", chunk_gpu_va);

	return NULL;
}

int kbase_csf_tiler_heap_context_init(struct kbase_context *const kctx)
{
	int err = kbase_csf_heap_context_allocator_init(
		&kctx->csf.tiler_heaps.ctx_alloc, kctx);

	if (unlikely(err))
		return err;

	INIT_LIST_HEAD(&kctx->csf.tiler_heaps.list);
	mutex_init(&kctx->csf.tiler_heaps.lock);

	dev_dbg(kctx->kbdev->dev, "Initialized a context for tiler heaps\n");

	return 0;
}

void kbase_csf_tiler_heap_context_term(struct kbase_context *const kctx)
{
	LIST_HEAD(local_heaps_list);
	struct list_head *entry = NULL, *tmp = NULL;

	dev_dbg(kctx->kbdev->dev, "Terminating a context for tiler heaps\n");

	mutex_lock(&kctx->csf.tiler_heaps.lock);
	list_splice_init(&kctx->csf.tiler_heaps.list, &local_heaps_list);
	mutex_unlock(&kctx->csf.tiler_heaps.lock);

	list_for_each_safe(entry, tmp, &local_heaps_list) {
		struct kbase_csf_tiler_heap *heap = list_entry(
			entry, struct kbase_csf_tiler_heap, link);

		list_del_init(&heap->link);
		delete_heap(heap);
	}

	mutex_destroy(&kctx->csf.tiler_heaps.lock);

	kbase_csf_heap_context_allocator_term(&kctx->csf.tiler_heaps.ctx_alloc);
}

/**
 * kbasep_is_buffer_descriptor_region_suitable - Check if a VA region chosen to house
 *                                               the tiler heap buffer descriptor
 *                                               is suitable for the purpose.
 * @kctx: kbase context of the tiler heap
 * @reg:  VA region being checked for suitability
 *
 * The tiler heap buffer descriptor memory does not admit page faults by
 * design, so it must be fully backed upon allocation, and it has to remain
 * alive for as long as the tiler heap is alive. This means it cannot be
 * allocated from JIT/ephemeral or user-freeable memory.
 *
 * Return: true on suitability, false otherwise.
 */
static bool kbasep_is_buffer_descriptor_region_suitable(struct kbase_context *const kctx,
							struct kbase_va_region *const reg)
{
	if (kbase_is_region_invalid_or_free(reg)) {
		dev_err(kctx->kbdev->dev, "Region is either invalid or free!\n");
		return false;
	}

	if (!(reg->flags & KBASE_REG_CPU_RD) || kbase_is_region_shrinkable(reg) ||
	    (reg->flags & KBASE_REG_PF_GROW)) {
		dev_err(kctx->kbdev->dev, "Region has invalid flags: 0x%lX!\n", reg->flags);
		return false;
	}

	if (reg->gpu_alloc->type != KBASE_MEM_TYPE_NATIVE) {
		dev_err(kctx->kbdev->dev, "Region has invalid type!\n");
		return false;
	}

	if ((reg->nr_pages != kbase_reg_current_backed_size(reg)) ||
	    (reg->nr_pages < PFN_UP(sizeof(struct kbase_csf_gpu_buffer_heap)))) {
		dev_err(kctx->kbdev->dev, "Region has invalid backing!\n");
		return false;
	}

	return true;
}

#define TILER_BUF_DESC_SIZE (sizeof(struct kbase_csf_gpu_buffer_heap))

int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, u32 const chunk_size,
			      u32 const initial_chunks, u32 const max_chunks,
			      u16 const target_in_flight, u64 const buf_desc_va,
			      u64 *const heap_gpu_va, u64 *const first_chunk_va)
{
	int err = 0;
	struct kbase_csf_tiler_heap *heap = NULL;
	struct kbase_csf_heap_context_allocator *const ctx_alloc =
		&kctx->csf.tiler_heaps.ctx_alloc;
	struct kbase_csf_tiler_heap_chunk *chunk = NULL;
	struct kbase_va_region *gpu_va_reg = NULL;
	void *vmap_ptr = NULL;

	dev_dbg(kctx->kbdev->dev,
		"Creating a tiler heap with %u chunks (limit: %u) of size %u, buf_desc_va: 0x%llx\n",
		initial_chunks, max_chunks, chunk_size, buf_desc_va);

	if (!kbase_mem_allow_alloc(kctx))
		return -EINVAL;

	if (chunk_size == 0)
		return -EINVAL;

	if (chunk_size & ~CHUNK_SIZE_MASK)
		return -EINVAL;

	if (initial_chunks == 0)
		return -EINVAL;

	if (initial_chunks > max_chunks)
		return -EINVAL;

	if (target_in_flight == 0)
		return -EINVAL;

	heap = kzalloc(sizeof(*heap), GFP_KERNEL);
	if (unlikely(!heap)) {
		dev_err(kctx->kbdev->dev, "No kernel memory for a new tiler heap");
		return -ENOMEM;
	}

	heap->kctx = kctx;
	heap->chunk_size = chunk_size;
	heap->max_chunks = max_chunks;
	heap->target_in_flight = target_in_flight;
	heap->buf_desc_checked = false;
	INIT_LIST_HEAD(&heap->chunks_list);
	INIT_LIST_HEAD(&heap->link);

	/* Check the buffer descriptor virtual address */
	if (buf_desc_va) {
		struct kbase_va_region *buf_desc_reg;

		kbase_gpu_vm_lock(kctx);
		buf_desc_reg =
			kbase_region_tracker_find_region_enclosing_address(kctx, buf_desc_va);

		if (!kbasep_is_buffer_descriptor_region_suitable(kctx, buf_desc_reg)) {
			kbase_gpu_vm_unlock(kctx);
			dev_err(kctx->kbdev->dev,
				"Could not find a suitable VA region for the tiler heap buf desc!\n");
			err = -EINVAL;
			goto buf_desc_not_suitable;
		}

		/* If we don't prevent userspace from unmapping this, we may run into
		 * use-after-free, as we don't check for the existence of the region throughout.
		 */

		heap->buf_desc_va = buf_desc_va;
		heap->buf_desc_reg = buf_desc_reg;
		kbase_va_region_no_user_free_inc(buf_desc_reg);

		vmap_ptr = kbase_vmap_reg(kctx, buf_desc_reg, buf_desc_va, TILER_BUF_DESC_SIZE,
					  KBASE_REG_CPU_RD, &heap->buf_desc_map,
					  KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING);

		if (kbase_is_page_migration_enabled())
			kbase_set_phy_alloc_page_status(buf_desc_reg->gpu_alloc, NOT_MOVABLE);

		kbase_gpu_vm_unlock(kctx);

		if (unlikely(!vmap_ptr)) {
			err = -ENOMEM;
			dev_err(kctx->kbdev->dev,
				"Could not vmap buffer descriptor into kernel memory (err %d)\n",
				err);
			goto buf_desc_vmap_failed;
		}
	}

	heap->gpu_va = kbase_csf_heap_context_allocator_alloc(ctx_alloc);
	if (unlikely(!heap->gpu_va)) {
		dev_dbg(kctx->kbdev->dev, "Failed to allocate a tiler heap context\n");
		err = -ENOMEM;
		goto heap_context_alloc_failed;
	}

	gpu_va_reg = ctx_alloc->region;

	kbase_gpu_vm_lock(kctx);
	/* gpu_va_reg was created with BASEP_MEM_NO_USER_FREE; the code to unset this only happens
	 * on kctx termination (after all syscalls on kctx have finished), so it is safe to
	 * assume that gpu_va_reg is still present.
	 */
	vmap_ptr = kbase_vmap_reg(kctx, gpu_va_reg, heap->gpu_va, NEXT_CHUNK_ADDR_SIZE,
				  (KBASE_REG_CPU_RD | KBASE_REG_CPU_WR), &heap->gpu_va_map,
				  KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING);
	kbase_gpu_vm_unlock(kctx);
	if (unlikely(!vmap_ptr)) {
		dev_dbg(kctx->kbdev->dev, "Failed to vmap the correct heap GPU VA address\n");
		err = -ENOMEM;
		goto heap_context_vmap_failed;
	}

	err = create_initial_chunks(heap, initial_chunks);
	if (unlikely(err)) {
		dev_dbg(kctx->kbdev->dev, "Failed to create the initial tiler heap chunks\n");
		goto create_chunks_failed;
	}
	chunk = list_first_entry(&heap->chunks_list, struct kbase_csf_tiler_heap_chunk, link);

	*heap_gpu_va = heap->gpu_va;
	*first_chunk_va = chunk->gpu_va;

	mutex_lock(&kctx->csf.tiler_heaps.lock);
	kctx->csf.tiler_heaps.nr_of_heaps++;
	heap->heap_id = kctx->csf.tiler_heaps.nr_of_heaps;
	list_add(&heap->link, &kctx->csf.tiler_heaps.list);

	KBASE_TLSTREAM_AUX_TILER_HEAP_STATS(kctx->kbdev, kctx->id, heap->heap_id,
					    PFN_UP((u64)heap->chunk_size * heap->max_chunks),
					    PFN_UP((u64)heap->chunk_size * heap->chunk_count),
					    heap->max_chunks, heap->chunk_size, heap->chunk_count,
					    heap->target_in_flight, 0);

#if defined(CONFIG_MALI_VECTOR_DUMP)
	list_for_each_entry(chunk, &heap->chunks_list, link) {
		KBASE_TLSTREAM_JD_TILER_HEAP_CHUNK_ALLOC(kctx->kbdev, kctx->id, heap->heap_id,
							 chunk->gpu_va);
	}
#endif
	kctx->running_total_tiler_heap_nr_chunks += heap->chunk_count;
	kctx->running_total_tiler_heap_memory += (u64)heap->chunk_size * heap->chunk_count;
	if (kctx->running_total_tiler_heap_memory > kctx->peak_total_tiler_heap_memory)
		kctx->peak_total_tiler_heap_memory = kctx->running_total_tiler_heap_memory;

	dev_dbg(kctx->kbdev->dev,
		"Created tiler heap 0x%llX, buffer descriptor 0x%llX, ctx_%d_%d\n", heap->gpu_va,
		buf_desc_va, kctx->tgid, kctx->id);
	mutex_unlock(&kctx->csf.tiler_heaps.lock);

	return 0;

create_chunks_failed:
	kbase_vunmap(kctx, &heap->gpu_va_map);
heap_context_vmap_failed:
	kbase_csf_heap_context_allocator_free(ctx_alloc, heap->gpu_va);
heap_context_alloc_failed:
	if (heap->buf_desc_reg)
		kbase_vunmap(kctx, &heap->buf_desc_map);
buf_desc_vmap_failed:
	if (heap->buf_desc_reg) {
		kbase_gpu_vm_lock(kctx);
		kbase_va_region_no_user_free_dec(heap->buf_desc_reg);
		kbase_gpu_vm_unlock(kctx);
	}
buf_desc_not_suitable:
	kfree(heap);
	return err;
}

int kbase_csf_tiler_heap_term(struct kbase_context *const kctx,
	u64 const heap_gpu_va)
{
	int err = 0;
	struct kbase_csf_tiler_heap *heap = NULL;
	u32 chunk_count = 0;
	u64 heap_size = 0;

	mutex_lock(&kctx->csf.tiler_heaps.lock);
	heap = find_tiler_heap(kctx, heap_gpu_va);
	if (likely(heap)) {
		chunk_count = heap->chunk_count;
		heap_size = (u64)heap->chunk_size * chunk_count;

		list_del_init(&heap->link);
	} else {
		err = -EINVAL;
	}

	/* Update stats whilst still holding the lock so they are in sync with the tiler_heaps.list
	 * at all times
	 */
	if (likely(kctx->running_total_tiler_heap_memory >= heap_size))
		kctx->running_total_tiler_heap_memory -= heap_size;
	else
		dev_warn(kctx->kbdev->dev,
			 "Running total tiler heap memory lower than expected!");
	if (likely(kctx->running_total_tiler_heap_nr_chunks >= chunk_count))
		kctx->running_total_tiler_heap_nr_chunks -= chunk_count;
	else
		dev_warn(kctx->kbdev->dev,
			 "Running total tiler chunk count lower than expected!");
	if (!err)
		dev_dbg(kctx->kbdev->dev,
			"Terminated tiler heap 0x%llX, buffer descriptor 0x%llX, ctx_%d_%d\n",
			heap->gpu_va, heap->buf_desc_va, kctx->tgid, kctx->id);
	mutex_unlock(&kctx->csf.tiler_heaps.lock);

	/* Deletion requires the kctx->reg_lock, so must only operate on it whilst unlinked from
	 * the kctx's csf.tiler_heaps.list, and without holding the csf.tiler_heaps.lock
	 */
	if (likely(heap))
		delete_heap(heap);

	return err;
}

/**
 * validate_allocation_request - Check whether the chunk allocation request
 *                               received on tiler OOM should be handled at
 *                               the current time.
 *
 * @heap:               The tiler heap the OOM is associated with
 * @nr_in_flight:       Number of fragment jobs in flight
 * @pending_frag_count: Number of pending fragment jobs
 *
 * Context: must hold the tiler heap lock to guarantee its lifetime
 *
 * Return:
 * * 0       - allowed to allocate an additional chunk
 * * -EINVAL - invalid
 * * -EBUSY  - there are fragment jobs still in flight, which may free chunks
 *             after completing
 * * -ENOMEM - the heap has already reached its maximum number of chunks and
 *             no new ones will be allocated
 */
static int validate_allocation_request(struct kbase_csf_tiler_heap *heap, u32 nr_in_flight,
				       u32 pending_frag_count)
{
	lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock);

	if (WARN_ON(!nr_in_flight) || WARN_ON(pending_frag_count > nr_in_flight))
		return -EINVAL;

	if (nr_in_flight <= heap->target_in_flight) {
		if (heap->chunk_count < heap->max_chunks) {
			/* Not exceeded the target number of render passes yet so be
			 * generous with memory.
			 */
			return 0;
		} else if (pending_frag_count > 0) {
			return -EBUSY;
		} else {
			return -ENOMEM;
		}
	} else {
		/* Reached target number of render passes in flight.
		 * Wait for some of them to finish
		 */
		return -EBUSY;
	}
	return -ENOMEM;
}

int kbase_csf_tiler_heap_alloc_new_chunk(struct kbase_context *kctx,
	u64 gpu_heap_va, u32 nr_in_flight, u32 pending_frag_count, u64 *new_chunk_ptr)
{
	struct kbase_csf_tiler_heap *heap;
	struct kbase_csf_tiler_heap_chunk *chunk;
	int err = -EINVAL;
	u64 chunk_size = 0;
	u64 heap_id = 0;

	/* To avoid potential locking issues during allocation, this is handled
	 * in three phases:
	 * 1. Take the lock, find the corresponding heap, and find its chunk size
	 * (this is always 2 MB, but may change down the line).
	 * 2. Allocate memory for the chunk and its region.
	 * 3. If the heap still exists, link the new chunk to the end of the heap's
	 * chunk list. If the heap no longer exists, roll back the allocation.
	 */

	mutex_lock(&kctx->csf.tiler_heaps.lock);
	heap = find_tiler_heap(kctx, gpu_heap_va);
	if (likely(heap)) {
		chunk_size = heap->chunk_size;
		heap_id = heap->heap_id;
	} else {
		dev_err(kctx->kbdev->dev, "Heap 0x%llX does not exist", gpu_heap_va);
		mutex_unlock(&kctx->csf.tiler_heaps.lock);
		goto prelink_failure;
	}

	err = validate_allocation_request(heap, nr_in_flight, pending_frag_count);
	if (unlikely(err)) {
		/* The allocation request can be legitimate, yet arrive for a heap
		 * that has already reached its pre-configured maximum capacity. This
		 * is useful debug information, but should not be treated as an error,
		 * since the request will be re-sent at a later point.
		 */
		dev_dbg(kctx->kbdev->dev,
			"Not allocating new chunk for heap 0x%llX due to current heap state (err %d)",
			gpu_heap_va, err);
		mutex_unlock(&kctx->csf.tiler_heaps.lock);
		goto prelink_failure;
	}
	mutex_unlock(&kctx->csf.tiler_heaps.lock);
	/* this heap must not be used whilst we have dropped the lock */
	heap = NULL;

	chunk = alloc_new_chunk(kctx, chunk_size);
	if (unlikely(!chunk)) {
		dev_err(kctx->kbdev->dev, "Could not allocate chunk of size %lld for ctx %d_%d",
			chunk_size, kctx->tgid, kctx->id);
		goto prelink_failure;
	}

	/* After this point, the heap that we were targeting could already have had the needed
	 * chunks allocated, if we were handling multiple OoM events on multiple threads, so
	 * we need to revalidate the need for the allocation.
	 */
	mutex_lock(&kctx->csf.tiler_heaps.lock);
	heap = find_tiler_heap(kctx, gpu_heap_va);

	if (unlikely(!heap)) {
		dev_err(kctx->kbdev->dev, "Tiler heap 0x%llX no longer exists!\n", gpu_heap_va);
		mutex_unlock(&kctx->csf.tiler_heaps.lock);
		goto unroll_chunk;
	}

	if (heap_id != heap->heap_id) {
		dev_err(kctx->kbdev->dev,
			"Tiler heap 0x%llX was removed from ctx %d_%d while allocating chunk of size %lld!",
			gpu_heap_va, kctx->tgid, kctx->id, chunk_size);
		mutex_unlock(&kctx->csf.tiler_heaps.lock);
		goto unroll_chunk;
	}

	if (WARN_ON(chunk_size != heap->chunk_size)) {
		mutex_unlock(&kctx->csf.tiler_heaps.lock);
		goto unroll_chunk;
	}

	err = validate_allocation_request(heap, nr_in_flight, pending_frag_count);
	if (unlikely(err)) {
		dev_warn(
			kctx->kbdev->dev,
			"Aborting linking chunk to heap 0x%llX: heap state changed during allocation (err %d)",
			gpu_heap_va, err);
		mutex_unlock(&kctx->csf.tiler_heaps.lock);
		goto unroll_chunk;
	}

	err = init_chunk(heap, chunk, false);

	/* On error, the chunk would not be linked, so we can still treat it as an unlinked
	 * chunk for error handling.
	 */
	if (unlikely(err)) {
		dev_err(kctx->kbdev->dev,
			"Could not link chunk(0x%llX) with tiler heap 0%llX in ctx %d_%d due to error %d",
			chunk->gpu_va, gpu_heap_va, kctx->tgid, kctx->id, err);
		mutex_unlock(&kctx->csf.tiler_heaps.lock);
		goto unroll_chunk;
	}

	*new_chunk_ptr = encode_chunk_ptr(heap->chunk_size, chunk->gpu_va);

	/* update total and peak tiler heap memory record */
	kctx->running_total_tiler_heap_nr_chunks++;
	kctx->running_total_tiler_heap_memory += heap->chunk_size;

	if (kctx->running_total_tiler_heap_memory > kctx->peak_total_tiler_heap_memory)
		kctx->peak_total_tiler_heap_memory = kctx->running_total_tiler_heap_memory;

	KBASE_TLSTREAM_AUX_TILER_HEAP_STATS(kctx->kbdev, kctx->id, heap->heap_id,
					    PFN_UP((u64)heap->chunk_size * heap->max_chunks),
					    PFN_UP((u64)heap->chunk_size * heap->chunk_count),
					    heap->max_chunks, heap->chunk_size, heap->chunk_count,
					    heap->target_in_flight, nr_in_flight);

	mutex_unlock(&kctx->csf.tiler_heaps.lock);

	return err;
unroll_chunk:
	remove_unlinked_chunk(kctx, chunk);
prelink_failure:
	return err;
}

static bool delete_chunk_physical_pages(struct kbase_csf_tiler_heap *heap, u64 chunk_gpu_va,
					u64 *hdr_val)
{
	int err;
	u64 *chunk_hdr;
	struct kbase_context *kctx = heap->kctx;
	struct kbase_csf_tiler_heap_chunk *chunk = NULL;

	lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock);
	lockdep_assert_held(&kctx->kbdev->csf.scheduler.lock);

	chunk = find_chunk(heap, chunk_gpu_va);
	if (unlikely(!chunk)) {
		dev_warn(kctx->kbdev->dev,
			 "Failed to find tiler heap(0x%llX) chunk(0x%llX) for reclaim-delete\n",
			 heap->gpu_va, chunk_gpu_va);
		return false;
	}

	WARN((chunk->region->flags & KBASE_REG_CPU_CACHED),
	     "Cannot support CPU cached chunks without sync operations");
	chunk_hdr = chunk->map.addr;
	*hdr_val = *chunk_hdr;

	dev_dbg(kctx->kbdev->dev,
		"Reclaim: delete chunk(0x%llx) in heap(0x%llx), header value(0x%llX)\n",
		chunk_gpu_va, heap->gpu_va, *hdr_val);

	err = kbase_mem_shrink_gpu_mapping(kctx, chunk->region, 0, chunk->region->gpu_alloc->nents);
	if (unlikely(err)) {
		dev_warn(
			kctx->kbdev->dev,
			"Reclaim: shrinking GPU mapping failed on chunk(0x%llx) in heap(0x%llx) (err %d)\n",
			chunk_gpu_va, heap->gpu_va, err);

		/* Cannot free the pages whilst references on the GPU remain, so keep the chunk on
		 * the heap's chunk list and try a different heap.
		 */

		return false;
	}
	/* Destroy the mapping before the physical pages which are mapped are destroyed. */
	kbase_vunmap(kctx, &chunk->map);

	err = kbase_free_phy_pages_helper(chunk->region->gpu_alloc,
					  chunk->region->gpu_alloc->nents);
	if (unlikely(err)) {
		dev_warn(
			kctx->kbdev->dev,
			"Reclaim: remove physical backing failed on chunk(0x%llx) in heap(0x%llx) (err %d), continuing with deferred removal\n",
			chunk_gpu_va, heap->gpu_va, err);

		/* kbase_free_phy_pages_helper() should only fail on invalid input, and WARNs
		 * anyway, so continue instead of returning early.
		 *
		 * We also don't want to leave the chunk on the heap's chunk list once its
		 * kernel mapping has been removed, as later accesses through that mapping
		 * would be invalid. It's safest to instead continue with deferred
		 * destruction of the chunk.
		 */
	}

	dev_dbg(kctx->kbdev->dev,
		"Reclaim: delete chunk(0x%llx) in heap(0x%llx), header value(0x%llX)\n",
		chunk_gpu_va, heap->gpu_va, *hdr_val);

	mutex_lock(&heap->kctx->jit_evict_lock);
	list_move(&chunk->region->jit_node, &kctx->jit_destroy_head);
	mutex_unlock(&heap->kctx->jit_evict_lock);

	list_del(&chunk->link);
	heap->chunk_count--;
	kfree(chunk);

	return true;
}

static void sanity_check_gpu_buffer_heap(struct kbase_csf_tiler_heap *heap,
					 struct kbase_csf_gpu_buffer_heap *desc)
{
	u64 first_hoarded_chunk_gpu_va = desc->pointer & CHUNK_ADDR_MASK;

	lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock);

	if (first_hoarded_chunk_gpu_va) {
		struct kbase_csf_tiler_heap_chunk *chunk =
			find_chunk(heap, first_hoarded_chunk_gpu_va);

		if (likely(chunk)) {
			dev_dbg(heap->kctx->kbdev->dev,
				"Buffer descriptor 0x%llX sanity check ok, HW reclaim allowed\n",
				heap->buf_desc_va);

			heap->buf_desc_checked = true;
			return;
		}
	}
	/* If there is no match, defer the check to next time */
	dev_dbg(heap->kctx->kbdev->dev, "Buffer descriptor 0x%llX runtime sanity check deferred\n",
		heap->buf_desc_va);
}

static bool can_read_hw_gpu_buffer_heap(struct kbase_csf_tiler_heap *heap, u64 *chunk_gpu_va_ptr)
{
	struct kbase_context *kctx = heap->kctx;

	lockdep_assert_held(&kctx->csf.tiler_heaps.lock);

	/* Initialize the descriptor pointer value to 0 */
	*chunk_gpu_va_ptr = 0;

	/* The buffer descriptor supplied at heap creation is only a hint, so sanity check it at runtime */
	if (heap->buf_desc_reg && !heap->buf_desc_checked) {
		struct kbase_csf_gpu_buffer_heap *desc = heap->buf_desc_map.addr;

		/* BufferDescriptor is supplied by userspace, so could be CPU-cached */
		if (heap->buf_desc_map.flags & KBASE_VMAP_FLAG_SYNC_NEEDED)
			kbase_sync_mem_regions(kctx, &heap->buf_desc_map, KBASE_SYNC_TO_CPU);

		sanity_check_gpu_buffer_heap(heap, desc);
		if (heap->buf_desc_checked)
			*chunk_gpu_va_ptr = desc->pointer & CHUNK_ADDR_MASK;
	}

	return heap->buf_desc_checked;
}

static u32 delete_hoarded_chunks(struct kbase_csf_tiler_heap *heap)
{
	u32 freed = 0;
	u64 chunk_gpu_va = 0;
	struct kbase_context *kctx = heap->kctx;
	struct kbase_csf_tiler_heap_chunk *chunk = NULL;

	lockdep_assert_held(&kctx->csf.tiler_heaps.lock);

	if (can_read_hw_gpu_buffer_heap(heap, &chunk_gpu_va)) {
		u64 chunk_hdr_val;
		u64 *hw_hdr;

		if (!chunk_gpu_va) {
			struct kbase_csf_gpu_buffer_heap *desc = heap->buf_desc_map.addr;

			/* BufferDescriptor is supplied by userspace, so could be CPU-cached */
			if (heap->buf_desc_map.flags & KBASE_VMAP_FLAG_SYNC_NEEDED)
				kbase_sync_mem_regions(kctx, &heap->buf_desc_map,
						       KBASE_SYNC_TO_CPU);
			chunk_gpu_va = desc->pointer & CHUNK_ADDR_MASK;

			if (!chunk_gpu_va) {
				dev_dbg(kctx->kbdev->dev,
					"Buffer descriptor 0x%llX has no chunks (NULL) for reclaim scan\n",
					heap->buf_desc_va);
				goto out;
			}
		}

		chunk = find_chunk(heap, chunk_gpu_va);
		if (unlikely(!chunk))
			goto out;

		WARN((chunk->region->flags & KBASE_REG_CPU_CACHED),
		     "Cannot support CPU cached chunks without sync operations");
		hw_hdr = chunk->map.addr;

		/* Read the chunk's header to obtain the next chunk's address and size */
		chunk_hdr_val = *hw_hdr;
		chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK;

		while (chunk_gpu_va && heap->chunk_count > HEAP_SHRINK_STOP_LIMIT) {
			bool success =
				delete_chunk_physical_pages(heap, chunk_gpu_va, &chunk_hdr_val);

			if (!success)
				break;

			freed++;
			/* On success, chunk_hdr_val is updated, extract the next chunk address */
			chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK;
		}

		/* Update the existing hardware chunk header, after reclaim deletion of chunks */
		*hw_hdr = chunk_hdr_val;

		dev_dbg(heap->kctx->kbdev->dev,
			"HW reclaim scan freed chunks: %u, set hw_hdr[0]: 0x%llX\n", freed,
			chunk_hdr_val);
	} else {
		dev_dbg(kctx->kbdev->dev,
			"Skip HW reclaim scan, (disabled: buffer descriptor 0x%llX)\n",
			heap->buf_desc_va);
	}
out:
	return freed;
}

static u64 delete_unused_chunk_pages(struct kbase_csf_tiler_heap *heap)
{
	u32 freed_chunks = 0;
	u64 freed_pages = 0;
	u64 chunk_gpu_va;
	u64 chunk_hdr_val;
	struct kbase_context *kctx = heap->kctx;
	u64 *ctx_ptr;

	lockdep_assert_held(&kctx->csf.tiler_heaps.lock);

	WARN(heap->gpu_va_map.flags & KBASE_VMAP_FLAG_SYNC_NEEDED,
	     "Cannot support CPU cached heap context without sync operations");

	ctx_ptr = heap->gpu_va_map.addr;

	/* Extract the first chunk address from the context's free_list_head */
	chunk_hdr_val = *ctx_ptr;
	chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK;

	while (chunk_gpu_va) {
		u64 hdr_val;
		bool success = delete_chunk_physical_pages(heap, chunk_gpu_va, &hdr_val);

		if (!success)
			break;

		freed_chunks++;
		chunk_hdr_val = hdr_val;
		/* extract the next chunk address */
		chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK;
	}

	/* Write the free-list head remaining after scan deletion back to the context header */
	*ctx_ptr = chunk_hdr_val;

	/* Try to scan the HW hoarded list of unused chunks */
	freed_chunks += delete_hoarded_chunks(heap);
	freed_pages = freed_chunks * PFN_UP(heap->chunk_size);
	dev_dbg(heap->kctx->kbdev->dev,
		"Scan reclaim freed chunks/pages %u/%llu, set heap-ctx_u64[0]: 0x%llX\n",
		freed_chunks, freed_pages, chunk_hdr_val);

	/* Update context tiler heaps memory usage */
	kctx->running_total_tiler_heap_memory -= freed_pages << PAGE_SHIFT;
	kctx->running_total_tiler_heap_nr_chunks -= freed_chunks;
	return freed_pages;
}

u32 kbase_csf_tiler_heap_scan_kctx_unused_pages(struct kbase_context *kctx, u32 to_free)
{
	u64 freed = 0;
	struct kbase_csf_tiler_heap *heap;

	mutex_lock(&kctx->csf.tiler_heaps.lock);

	list_for_each_entry(heap, &kctx->csf.tiler_heaps.list, link) {
		freed += delete_unused_chunk_pages(heap);

		/* If freed enough, then stop here */
		if (freed >= to_free)
			break;
	}

	mutex_unlock(&kctx->csf.tiler_heaps.lock);
	/* The scan should never free more than U32_MAX pages, but clamp it for safety */
	if (WARN_ON(unlikely(freed > U32_MAX)))
		return U32_MAX;
	else
		return (u32)freed;
}

static u64 count_unused_heap_pages(struct kbase_csf_tiler_heap *heap)
{
	u32 chunk_cnt = 0;
	u64 page_cnt = 0;

	lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock);

	/* The count here is an informed estimate, avoiding the costly mapping/unmapping
	 * in the chunk list walk. The downside is that the number is a less reliable
	 * guide for later scan (free) calls on this heap as to what is actually freeable.
	 */
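	/* For example, a heap currently holding 4 chunks of 2 MiB with a stop
	 * limit of 1 would report 3 * PFN_UP(2 MiB) = 3 * 512 = 1536 pages,
	 * assuming 4 KiB pages.
	 */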
	if (heap->chunk_count > HEAP_SHRINK_STOP_LIMIT) {
		chunk_cnt = heap->chunk_count - HEAP_SHRINK_STOP_LIMIT;
		page_cnt = chunk_cnt * PFN_UP(heap->chunk_size);
	}

	dev_dbg(heap->kctx->kbdev->dev,
		"Reclaim count chunks/pages %u/%llu (estimated), heap_va: 0x%llX\n", chunk_cnt,
		page_cnt, heap->gpu_va);

	return page_cnt;
}

u32 kbase_csf_tiler_heap_count_kctx_unused_pages(struct kbase_context *kctx)
{
	u64 page_cnt = 0;
	struct kbase_csf_tiler_heap *heap;

	mutex_lock(&kctx->csf.tiler_heaps.lock);

	list_for_each_entry(heap, &kctx->csf.tiler_heaps.list, link)
		page_cnt += count_unused_heap_pages(heap);

	mutex_unlock(&kctx->csf.tiler_heaps.lock);

	/* The count should never exceed U32_MAX pages, but clamp it for safety */
	if (WARN_ON(unlikely(page_cnt > U32_MAX)))
		return U32_MAX;
	else
		return (u32)page_cnt;
}