Mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, synced 2025-10-31 08:44:41 +00:00.
DMEM cgroup pull request

This introduces a new cgroup controller to limit device memory. Notable
users would be DRM, dma-buf heaps, or v4l2. This pull request is based on
the series developed by Maarten Lankhorst, Friedrich Vock, and me:
https://lore.kernel.org/all/20241204134410.1161769-1-dev@lankhorst.se/

Merge tag 'cgroup-dmem-drm-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/mripard/linux into drm-next

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Maxime Ripard <mripard@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250110-cryptic-warm-mandrill-b71f5d@houat
commit 39388d53c5

20 changed files with 1194 additions and 32 deletions
		|  | @ -64,13 +64,14 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou | |||
|      5-6. Device | ||||
|      5-7. RDMA | ||||
|        5-7-1. RDMA Interface Files | ||||
|      5-8. HugeTLB | ||||
|        5.8-1. HugeTLB Interface Files | ||||
|      5-9. Misc | ||||
|        5.9-1 Miscellaneous cgroup Interface Files | ||||
|        5.9-2 Migration and Ownership | ||||
|      5-10. Others | ||||
|        5-10-1. perf_event | ||||
|      5-8. DMEM | ||||
|      5-9. HugeTLB | ||||
|        5.9-1. HugeTLB Interface Files | ||||
|      5-10. Misc | ||||
|        5.10-1 Miscellaneous cgroup Interface Files | ||||
|        5.10-2 Migration and Ownership | ||||
|      5-11. Others | ||||
|        5-11-1. perf_event | ||||
|      5-N. Non-normative information | ||||
|        5-N-1. CPU controller root cgroup process behaviour | ||||
|        5-N-2. IO controller root cgroup process behaviour | ||||
|  | @ -2626,6 +2627,49 @@ RDMA Interface Files | |||
| 	  mlx4_0 hca_handle=1 hca_object=20 | ||||
| 	  ocrdma1 hca_handle=1 hca_object=23 | ||||
| 
 | ||||
| DMEM | ||||
| ---- | ||||
| 
 | ||||
| The "dmem" controller regulates the distribution and accounting of | ||||
| device memory regions. Because each memory region may have its own page size, | ||||
| which does not have to be equal to the system page size, the units are always bytes. | ||||
| 
 | ||||
| DMEM Interface Files | ||||
| ~~~~~~~~~~~~~~~~~~~~ | ||||
| 
 | ||||
|   dmem.max, dmem.min, dmem.low | ||||
| 	A read-write nested-keyed file that exists for all cgroups | ||||
| 	except the root. It describes the currently configured resource | ||||
| 	limit for a region. | ||||
| 
 | ||||
| 	An example for xe follows:: | ||||
| 
 | ||||
| 	  drm/0000:03:00.0/vram0 1073741824 | ||||
| 	  drm/0000:03:00.0/stolen max | ||||
| 
 | ||||
| 	The semantics are the same as for the memory cgroup controller, and are | ||||
| 	calculated in the same way. | ||||
| 
 | ||||
|   dmem.capacity | ||||
| 	A read-only file that describes maximum region capacity. | ||||
| 	It only exists on the root cgroup. Not all memory can be | ||||
| 	allocated by cgroups, as the kernel reserves some for | ||||
| 	internal use. | ||||
| 
 | ||||
| 	An example for xe follows:: | ||||
| 
 | ||||
| 	  drm/0000:03:00.0/vram0 8514437120 | ||||
| 	  drm/0000:03:00.0/stolen 67108864 | ||||
| 
 | ||||
|   dmem.current | ||||
| 	A read-only file that describes current resource usage. | ||||
| 	It exists for all cgroups except the root. | ||||
| 
 | ||||
| 	An example for xe follows:: | ||||
| 
 | ||||
| 	  drm/0000:03:00.0/vram0 12550144 | ||||
| 	  drm/0000:03:00.0/stolen 8650752 | ||||
| 
 | ||||
| HugeTLB | ||||
| ------- | ||||
| 
 | ||||
|  |  | |||
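The dmem.* interface files documented in the hunk above use the standard
nested-keyed syntax: one "<region> <value>" pair per line, where the region
key is the name registered by the driver and the value is a byte count or
"max" (the parser added later in this commit uses memparse, so suffixes such
as "1G" appear to be accepted as well). As a hedged userspace sketch, not
part of this commit, the following program sets a 1 GiB dmem.max limit for
one region; the cgroup name "gpu-jobs" and the region key
"drm/0000:03:00.0/vram0" are assumptions for illustration, and real keys can
be read from the root cgroup's dmem.capacity file:

    /* Hedged sketch: write a "<region> <bytes|max>" pair into dmem.max. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/sys/fs/cgroup/gpu-jobs/dmem.max";
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open dmem.max");
                    return 1;
            }

            /* Limit vram0 of this device to 1 GiB; writing "max" instead of
             * a byte count removes the limit again. */
            dprintf(fd, "drm/0000:03:00.0/vram0 1073741824\n");

            close(fd);
            return 0;
    }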
							
								
								
									
Documentation/core-api/cgroup.rst (new file, 9 lines)
									
								
							|  | @ -0,0 +1,9 @@ | |||
| ================== | ||||
| Cgroup Kernel APIs | ||||
| ================== | ||||
| 
 | ||||
| Device Memory Cgroup API (dmemcg) | ||||
| ================================= | ||||
| .. kernel-doc:: kernel/cgroup/dmem.c | ||||
|    :export: | ||||
| 
 | ||||
|  | @ -109,6 +109,7 @@ more memory-management documentation in Documentation/mm/index.rst. | |||
|    dma-isa-lpc | ||||
|    swiotlb | ||||
|    mm-api | ||||
|    cgroup | ||||
|    genalloc | ||||
|    pin_user_pages | ||||
|    boot-time-mm | ||||
|  |  | |||
							
								
								
									
Documentation/gpu/drm-compute.rst (new file, 54 lines)
									
								
							|  | @ -0,0 +1,54 @@ | |||
| ================================== | ||||
| Long running workloads and compute | ||||
| ================================== | ||||
| 
 | ||||
| Long running workloads (compute) are workloads that will not complete in 10 | ||||
| seconds (roughly the time a user is willing to wait before reaching for the | ||||
| power button). This means that other techniques, which cannot rely on | ||||
| fences, need to be used to manage those workloads. | ||||
| 
 | ||||
| Some hardware may schedule compute jobs, and have no way to pre-empt them, or | ||||
| have their memory swapped out from them. Or they simply want their workload | ||||
| not to be preempted or swapped out at all. | ||||
| 
 | ||||
| This means that it differs from what is described in driver-api/dma-buf.rst. | ||||
| 
 | ||||
| As with normal compute jobs, dma-fence may not be used at all. In this case, | ||||
| not even to force preemption. The driver is then simply forced to unmap a BO | ||||
| from the long compute job's address space on unbind immediately, not even | ||||
| waiting for the workload to complete. Effectively this terminates the workload | ||||
| when there is no hardware support to recover. | ||||
| 
 | ||||
| Since this is undesirable, there need to be mitigations to prevent a workload | ||||
| from being terminated. There are several possible approaches, all with their | ||||
| advantages and drawbacks. | ||||
| 
 | ||||
| The first approach you will likely try is to pin all buffers used by compute. | ||||
| This guarantees that the job will run uninterrupted, but also allows a very | ||||
| easy denial of service attack by pinning as much memory as possible, hogging | ||||
| all GPU memory, and possibly a huge chunk of CPU memory. | ||||
| 
 | ||||
| A second approach that will work slightly better on its own is adding an option | ||||
| not to evict when creating a new job (any kind). If all of userspace opts in | ||||
| to this flag, it would prevent cooperating userspace from forcefully terminating | ||||
| older compute jobs to start a new one. | ||||
| 
 | ||||
| If job preemption and recoverable pagefaults are not available, those are the | ||||
| only approaches possible. So even with those, you want a separate way of | ||||
| controlling resources. The standard kernel way of doing so is cgroups. | ||||
| 
 | ||||
| This creates a third option, using cgroups to prevent eviction. Both GPU and | ||||
| driver-allocated CPU memory would be accounted to the correct cgroup, and | ||||
| eviction would be made cgroup aware. This allows the GPU to be partitioned | ||||
| into cgroups that allow jobs to run next to each other without | ||||
| interference. | ||||
| 
 | ||||
| The interface to the cgroup would be similar to the current CPU memory | ||||
| interface, with similar semantics for min/low/high/max, if eviction can | ||||
| be made cgroup aware. | ||||
| 
 | ||||
| What should be noted is that each memory region (tiled memory for example) | ||||
| should have its own accounting. | ||||
| 
 | ||||
| The key is "drm/$card/$regionname": the region name is set by the driver, | ||||
| for example "tile0", and the value of $card comes from drmGetUnique(). | ||||
|  | @ -26,6 +26,7 @@ | |||
|  * DEALINGS IN THE SOFTWARE. | ||||
|  */ | ||||
| 
 | ||||
| #include <linux/cgroup_dmem.h> | ||||
| #include <linux/debugfs.h> | ||||
| #include <linux/fs.h> | ||||
| #include <linux/module.h> | ||||
|  | @ -820,6 +821,37 @@ void drm_dev_put(struct drm_device *dev) | |||
| } | ||||
| EXPORT_SYMBOL(drm_dev_put); | ||||
| 
 | ||||
| static void drmm_cg_unregister_region(struct drm_device *dev, void *arg) | ||||
| { | ||||
| 	dmem_cgroup_unregister_region(arg); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * drmm_cgroup_register_region - Register a region of a DRM device to cgroups | ||||
|  * @dev: device for region | ||||
|  * @region_name: Region name for registering | ||||
|  * @size: Size of region in bytes | ||||
|  * | ||||
|  * This registers a dmem cgroup region named "drm/<dev->unique>/<region_name>" | ||||
|  * and installs a DRM-managed action that unregisters it again when @dev is | ||||
|  * released. Returns the new region, NULL if dmem cgroups are disabled or | ||||
|  * @size is zero, or an ERR_PTR on failure. | ||||
|  */ | ||||
| struct dmem_cgroup_region *drmm_cgroup_register_region(struct drm_device *dev, const char *region_name, u64 size) | ||||
| { | ||||
| 	struct dmem_cgroup_region *region; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	region = dmem_cgroup_register_region(size, "drm/%s/%s", dev->unique, region_name); | ||||
| 	if (IS_ERR_OR_NULL(region)) | ||||
| 		return region; | ||||
| 
 | ||||
| 	ret = drmm_add_action_or_reset(dev, drmm_cg_unregister_region, region); | ||||
| 	if (ret) | ||||
| 		return ERR_PTR(ret); | ||||
| 
 | ||||
| 	return region; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(drmm_cgroup_register_region); | ||||
| 
 | ||||
| static int create_compat_control_link(struct drm_device *dev) | ||||
| { | ||||
| 	struct drm_minor *minor; | ||||
|  |  | |||
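For context, here is a hedged sketch (not part of this commit) of how a
driver could call the new helper at init time. The function name, the
"vram0" region name and my_vram_size are assumptions for illustration; the
resulting region ends up named "drm/<dev->unique>/vram0" and is unregistered
automatically by the DRM-managed action installed above:

    #include <linux/cgroup_dmem.h>
    #include <linux/err.h>
    #include <drm/drm_drv.h>

    /* Hedged sketch only: register device VRAM with the dmem controller. */
    static int my_driver_init_vram_cgroup(struct drm_device *drm, u64 my_vram_size,
                                          struct dmem_cgroup_region **out_region)
    {
            struct dmem_cgroup_region *region;

            region = drmm_cgroup_register_region(drm, "vram0", my_vram_size);
            if (IS_ERR(region))
                    return PTR_ERR(region);

            /* NULL means dmem cgroups are disabled (or the size was 0); keep
             * it and skip charging later, as TTM does when man->cg is NULL. */
            *out_region = region;
            return 0;
    }

TTM-based drivers typically store the returned pointer in
ttm_resource_manager.cg instead, as the xe change later in this diff does.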
|  | @ -258,13 +258,13 @@ static void ttm_bo_unreserve_basic(struct kunit *test) | |||
| 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL); | ||||
| 	bo->priority = bo_prio; | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo, place, &res1); | ||||
| 	err = ttm_resource_alloc(bo, place, &res1, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 
 | ||||
| 	bo->resource = res1; | ||||
| 
 | ||||
| 	/* Add a dummy resource to populate LRU */ | ||||
| 	ttm_resource_alloc(bo, place, &res2); | ||||
| 	ttm_resource_alloc(bo, place, &res2, NULL); | ||||
| 
 | ||||
| 	dma_resv_lock(bo->base.resv, NULL); | ||||
| 	ttm_bo_unreserve(bo); | ||||
|  | @ -300,12 +300,12 @@ static void ttm_bo_unreserve_pinned(struct kunit *test) | |||
| 	dma_resv_lock(bo->base.resv, NULL); | ||||
| 	ttm_bo_pin(bo); | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo, place, &res1); | ||||
| 	err = ttm_resource_alloc(bo, place, &res1, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 	bo->resource = res1; | ||||
| 
 | ||||
| 	/* Add a dummy resource to the pinned list */ | ||||
| 	err = ttm_resource_alloc(bo, place, &res2); | ||||
| 	err = ttm_resource_alloc(bo, place, &res2, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 	KUNIT_ASSERT_EQ(test, | ||||
| 			list_is_last(&res2->lru.link, &priv->ttm_dev->unevictable), 1); | ||||
|  | @ -355,7 +355,7 @@ static void ttm_bo_unreserve_bulk(struct kunit *test) | |||
| 	ttm_bo_set_bulk_move(bo1, &lru_bulk_move); | ||||
| 	dma_resv_unlock(bo1->base.resv); | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo1, place, &res1); | ||||
| 	err = ttm_resource_alloc(bo1, place, &res1, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 	bo1->resource = res1; | ||||
| 
 | ||||
|  | @ -363,7 +363,7 @@ static void ttm_bo_unreserve_bulk(struct kunit *test) | |||
| 	ttm_bo_set_bulk_move(bo2, &lru_bulk_move); | ||||
| 	dma_resv_unlock(bo2->base.resv); | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo2, place, &res2); | ||||
| 	err = ttm_resource_alloc(bo2, place, &res2, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 	bo2->resource = res2; | ||||
| 
 | ||||
|  | @ -401,7 +401,7 @@ static void ttm_bo_put_basic(struct kunit *test) | |||
| 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL); | ||||
| 	bo->type = ttm_bo_type_device; | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo, place, &res); | ||||
| 	err = ttm_resource_alloc(bo, place, &res, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 	bo->resource = res; | ||||
| 
 | ||||
|  | @ -518,7 +518,7 @@ static void ttm_bo_pin_unpin_resource(struct kunit *test) | |||
| 
 | ||||
| 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL); | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo, place, &res); | ||||
| 	err = ttm_resource_alloc(bo, place, &res, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 	bo->resource = res; | ||||
| 
 | ||||
|  | @ -569,7 +569,7 @@ static void ttm_bo_multiple_pin_one_unpin(struct kunit *test) | |||
| 
 | ||||
| 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL); | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo, place, &res); | ||||
| 	err = ttm_resource_alloc(bo, place, &res, NULL); | ||||
| 	KUNIT_ASSERT_EQ(test, err, 0); | ||||
| 	bo->resource = res; | ||||
| 
 | ||||
|  |  | |||
|  | @ -542,7 +542,7 @@ static void ttm_bo_validate_no_placement_signaled(struct kunit *test) | |||
| 		bo->ttm = old_tt; | ||||
| 	} | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo, place, &bo->resource); | ||||
| 	err = ttm_resource_alloc(bo, place, &bo->resource, NULL); | ||||
| 	KUNIT_EXPECT_EQ(test, err, 0); | ||||
| 	KUNIT_ASSERT_EQ(test, man->usage, size); | ||||
| 
 | ||||
|  | @ -603,7 +603,7 @@ static void ttm_bo_validate_no_placement_not_signaled(struct kunit *test) | |||
| 	bo = ttm_bo_kunit_init(test, test->priv, size, NULL); | ||||
| 	bo->type = params->bo_type; | ||||
| 
 | ||||
| 	err = ttm_resource_alloc(bo, place, &bo->resource); | ||||
| 	err = ttm_resource_alloc(bo, place, &bo->resource, NULL); | ||||
| 	KUNIT_EXPECT_EQ(test, err, 0); | ||||
| 
 | ||||
| 	placement = kunit_kzalloc(test, sizeof(*placement), GFP_KERNEL); | ||||
|  |  | |||
|  | @ -302,7 +302,7 @@ static void ttm_sys_man_free_basic(struct kunit *test) | |||
| 	res = kunit_kzalloc(test, sizeof(*res), GFP_KERNEL); | ||||
| 	KUNIT_ASSERT_NOT_NULL(test, res); | ||||
| 
 | ||||
| 	ttm_resource_alloc(bo, place, &res); | ||||
| 	ttm_resource_alloc(bo, place, &res, NULL); | ||||
| 
 | ||||
| 	man = ttm_manager_type(priv->devs->ttm_dev, mem_type); | ||||
| 	man->func->free(man, res); | ||||
|  |  | |||
|  | @ -42,6 +42,7 @@ | |||
| #include <linux/file.h> | ||||
| #include <linux/module.h> | ||||
| #include <linux/atomic.h> | ||||
| #include <linux/cgroup_dmem.h> | ||||
| #include <linux/dma-resv.h> | ||||
| 
 | ||||
| #include "ttm_module.h" | ||||
|  | @ -499,6 +500,13 @@ struct ttm_bo_evict_walk { | |||
| 	struct ttm_resource **res; | ||||
| 	/** @evicted: Number of successful evictions. */ | ||||
| 	unsigned long evicted; | ||||
| 
 | ||||
| 	/** @limit_pool: Which pool limit we should test against */ | ||||
| 	struct dmem_cgroup_pool_state *limit_pool; | ||||
| 	/** @try_low: Whether we should attempt to evict BOs below their low watermark threshold */ | ||||
| 	bool try_low; | ||||
| 	/** @hit_low: Set if we could not evict a BO because @try_low was false (first pass) */ | ||||
| 	bool hit_low; | ||||
| }; | ||||
| 
 | ||||
| static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo) | ||||
|  | @ -507,6 +515,10 @@ static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object * | |||
| 		container_of(walk, typeof(*evict_walk), walk); | ||||
| 	s64 lret; | ||||
| 
 | ||||
| 	if (!dmem_cgroup_state_evict_valuable(evict_walk->limit_pool, bo->resource->css, | ||||
| 					      evict_walk->try_low, &evict_walk->hit_low)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	if (bo->pin_count || !bo->bdev->funcs->eviction_valuable(bo, evict_walk->place)) | ||||
| 		return 0; | ||||
| 
 | ||||
|  | @ -524,7 +536,7 @@ static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object * | |||
| 	evict_walk->evicted++; | ||||
| 	if (evict_walk->res) | ||||
| 		lret = ttm_resource_alloc(evict_walk->evictor, evict_walk->place, | ||||
| 					  evict_walk->res); | ||||
| 					  evict_walk->res, NULL); | ||||
| 	if (lret == 0) | ||||
| 		return 1; | ||||
| out: | ||||
|  | @ -545,7 +557,8 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev, | |||
| 			      struct ttm_buffer_object *evictor, | ||||
| 			      struct ttm_operation_ctx *ctx, | ||||
| 			      struct ww_acquire_ctx *ticket, | ||||
| 			      struct ttm_resource **res) | ||||
| 			      struct ttm_resource **res, | ||||
| 			      struct dmem_cgroup_pool_state *limit_pool) | ||||
| { | ||||
| 	struct ttm_bo_evict_walk evict_walk = { | ||||
| 		.walk = { | ||||
|  | @ -556,22 +569,39 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev, | |||
| 		.place = place, | ||||
| 		.evictor = evictor, | ||||
| 		.res = res, | ||||
| 		.limit_pool = limit_pool, | ||||
| 	}; | ||||
| 	s64 lret; | ||||
| 
 | ||||
| 	evict_walk.walk.trylock_only = true; | ||||
| 	lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1); | ||||
| 
 | ||||
| 	/* One more attempt if we hit low limit? */ | ||||
| 	if (!lret && evict_walk.hit_low) { | ||||
| 		evict_walk.try_low = true; | ||||
| 		lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1); | ||||
| 	} | ||||
| 	if (lret || !ticket) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	/* Reset low limit */ | ||||
| 	evict_walk.try_low = evict_walk.hit_low = false; | ||||
| 	/* If ticket-locking, repeat while making progress. */ | ||||
| 	evict_walk.walk.trylock_only = false; | ||||
| 
 | ||||
| retry: | ||||
| 	do { | ||||
| 		/* The walk may clear the evict_walk.walk.ticket field */ | ||||
| 		evict_walk.walk.ticket = ticket; | ||||
| 		evict_walk.evicted = 0; | ||||
| 		lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1); | ||||
| 	} while (!lret && evict_walk.evicted); | ||||
| 
 | ||||
| 	/* We hit the low limit? Try once more */ | ||||
| 	if (!lret && evict_walk.hit_low && !evict_walk.try_low) { | ||||
| 		evict_walk.try_low = true; | ||||
| 		goto retry; | ||||
| 	} | ||||
| out: | ||||
| 	if (lret < 0) | ||||
| 		return lret; | ||||
|  | @ -689,6 +719,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, | |||
| 
 | ||||
| 	for (i = 0; i < placement->num_placement; ++i) { | ||||
| 		const struct ttm_place *place = &placement->placement[i]; | ||||
| 		struct dmem_cgroup_pool_state *limit_pool = NULL; | ||||
| 		struct ttm_resource_manager *man; | ||||
| 		bool may_evict; | ||||
| 
 | ||||
|  | @ -701,15 +732,20 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, | |||
| 			continue; | ||||
| 
 | ||||
| 		may_evict = (force_space && place->mem_type != TTM_PL_SYSTEM); | ||||
| 		ret = ttm_resource_alloc(bo, place, res); | ||||
| 		ret = ttm_resource_alloc(bo, place, res, force_space ? &limit_pool : NULL); | ||||
| 		if (ret) { | ||||
| 			if (ret != -ENOSPC) | ||||
| 			if (ret != -ENOSPC && ret != -EAGAIN) { | ||||
| 				dmem_cgroup_pool_state_put(limit_pool); | ||||
| 				return ret; | ||||
| 			if (!may_evict) | ||||
| 			} | ||||
| 			if (!may_evict) { | ||||
| 				dmem_cgroup_pool_state_put(limit_pool); | ||||
| 				continue; | ||||
| 			} | ||||
| 
 | ||||
| 			ret = ttm_bo_evict_alloc(bdev, man, place, bo, ctx, | ||||
| 						 ticket, res); | ||||
| 						 ticket, res, limit_pool); | ||||
| 			dmem_cgroup_pool_state_put(limit_pool); | ||||
| 			if (ret == -EBUSY) | ||||
| 				continue; | ||||
| 			if (ret) | ||||
|  | @ -1056,6 +1092,8 @@ struct ttm_bo_swapout_walk { | |||
| 	struct ttm_lru_walk walk; | ||||
| 	/** @gfp_flags: The gfp flags to use for ttm_tt_swapout() */ | ||||
| 	gfp_t gfp_flags; | ||||
| 
 | ||||
| 	bool hit_low, evict_low; | ||||
| }; | ||||
| 
 | ||||
| static s64 | ||||
|  | @ -1106,7 +1144,7 @@ ttm_bo_swapout_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo) | |||
| 
 | ||||
| 		memset(&hop, 0, sizeof(hop)); | ||||
| 		place.mem_type = TTM_PL_SYSTEM; | ||||
| 		ret = ttm_resource_alloc(bo, &place, &evict_mem); | ||||
| 		ret = ttm_resource_alloc(bo, &place, &evict_mem, NULL); | ||||
| 		if (ret) | ||||
| 			goto out; | ||||
| 
 | ||||
|  |  | |||
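To make the new control flow easier to follow in isolation, here is a
distilled restatement (a sketch, not additional code from this commit) of the
low-watermark retry added to ttm_bo_evict_alloc() above: walk the LRU once
while respecting dmem.low protection, and only if nothing could be evicted
but a BO was skipped purely because of that protection (hit_low), walk again
with try_low set so protected cgroups become eligible as well:

    /* Distilled sketch of the two-pass eviction retry; the helper name is
     * hypothetical and the struct is the file-local ttm_bo_evict_walk. */
    static s64 my_evict_with_low_retry(struct ttm_bo_evict_walk *evict_walk,
                                       struct ttm_device *bdev,
                                       struct ttm_resource_manager *man)
    {
            s64 lret;

            evict_walk->try_low = false;
            evict_walk->hit_low = false;

            /* First pass: cgroups under their dmem.low protection are skipped. */
            lret = ttm_lru_walk_for_evict(&evict_walk->walk, bdev, man, 1);

            if (!lret && evict_walk->hit_low) {
                    /* Second pass: ignore dmem.low and consider protected
                     * cgroups too, since nothing else made progress. */
                    evict_walk->try_low = true;
                    lret = ttm_lru_walk_for_evict(&evict_walk->walk, bdev, man, 1);
            }

            return lret;
    }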
|  | @ -26,6 +26,7 @@ | |||
| #include <linux/io-mapping.h> | ||||
| #include <linux/iosys-map.h> | ||||
| #include <linux/scatterlist.h> | ||||
| #include <linux/cgroup_dmem.h> | ||||
| 
 | ||||
| #include <drm/ttm/ttm_bo.h> | ||||
| #include <drm/ttm/ttm_placement.h> | ||||
|  | @ -350,15 +351,28 @@ EXPORT_SYMBOL(ttm_resource_fini); | |||
| 
 | ||||
| int ttm_resource_alloc(struct ttm_buffer_object *bo, | ||||
| 		       const struct ttm_place *place, | ||||
| 		       struct ttm_resource **res_ptr) | ||||
| 		       struct ttm_resource **res_ptr, | ||||
| 		       struct dmem_cgroup_pool_state **ret_limit_pool) | ||||
| { | ||||
| 	struct ttm_resource_manager *man = | ||||
| 		ttm_manager_type(bo->bdev, place->mem_type); | ||||
| 	struct dmem_cgroup_pool_state *pool = NULL; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (man->cg) { | ||||
| 		ret = dmem_cgroup_try_charge(man->cg, bo->base.size, &pool, ret_limit_pool); | ||||
| 		if (ret) | ||||
| 			return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = man->func->alloc(man, bo, place, res_ptr); | ||||
| 	if (ret) | ||||
| 	if (ret) { | ||||
| 		if (pool) | ||||
| 			dmem_cgroup_uncharge(pool, bo->base.size); | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	(*res_ptr)->css = pool; | ||||
| 
 | ||||
| 	spin_lock(&bo->bdev->lru_lock); | ||||
| 	ttm_resource_add_bulk_move(*res_ptr, bo); | ||||
|  | @ -370,6 +384,7 @@ EXPORT_SYMBOL_FOR_TESTS_ONLY(ttm_resource_alloc); | |||
| void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res) | ||||
| { | ||||
| 	struct ttm_resource_manager *man; | ||||
| 	struct dmem_cgroup_pool_state *pool; | ||||
| 
 | ||||
| 	if (!*res) | ||||
| 		return; | ||||
|  | @ -377,9 +392,13 @@ void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res) | |||
| 	spin_lock(&bo->bdev->lru_lock); | ||||
| 	ttm_resource_del_bulk_move(*res, bo); | ||||
| 	spin_unlock(&bo->bdev->lru_lock); | ||||
| 
 | ||||
| 	pool = (*res)->css; | ||||
| 	man = ttm_manager_type(bo->bdev, (*res)->mem_type); | ||||
| 	man->func->free(man, *res); | ||||
| 	*res = NULL; | ||||
| 	if (man->cg) | ||||
| 		dmem_cgroup_uncharge(pool, bo->base.size); | ||||
| } | ||||
| EXPORT_SYMBOL(ttm_resource_free); | ||||
| 
 | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ | |||
|  */ | ||||
| 
 | ||||
| #include <drm/drm_managed.h> | ||||
| #include <drm/drm_drv.h> | ||||
| 
 | ||||
| #include <drm/ttm/ttm_placement.h> | ||||
| #include <drm/ttm/ttm_range_manager.h> | ||||
|  | @ -311,6 +312,13 @@ int __xe_ttm_vram_mgr_init(struct xe_device *xe, struct xe_ttm_vram_mgr *mgr, | |||
| 	struct ttm_resource_manager *man = &mgr->manager; | ||||
| 	int err; | ||||
| 
 | ||||
| 	if (mem_type != XE_PL_STOLEN) { | ||||
| 		const char *name = mem_type == XE_PL_VRAM0 ? "vram0" : "vram1"; | ||||
| 		man->cg = drmm_cgroup_register_region(&xe->drm, name, size); | ||||
| 		if (IS_ERR(man->cg)) | ||||
| 			return PTR_ERR(man->cg); | ||||
| 	} | ||||
| 
 | ||||
| 	man->func = &xe_ttm_vram_mgr_func; | ||||
| 	mgr->mem_type = mem_type; | ||||
| 	mutex_init(&mgr->lock); | ||||
|  |  | |||
|  | @ -34,6 +34,7 @@ | |||
| 
 | ||||
| #include <drm/drm_device.h> | ||||
| 
 | ||||
| struct dmem_cgroup_region; | ||||
| struct drm_fb_helper; | ||||
| struct drm_fb_helper_surface_size; | ||||
| struct drm_file; | ||||
|  | @ -436,6 +437,10 @@ void *__devm_drm_dev_alloc(struct device *parent, | |||
| 			   const struct drm_driver *driver, | ||||
| 			   size_t size, size_t offset); | ||||
| 
 | ||||
| struct dmem_cgroup_region * | ||||
| drmm_cgroup_register_region(struct drm_device *dev, | ||||
| 			    const char *region_name, u64 size); | ||||
| 
 | ||||
| /**
 | ||||
|  * devm_drm_dev_alloc - Resource managed allocation of a &drm_device instance | ||||
|  * @parent: Parent device object | ||||
|  |  | |||
|  | @ -38,6 +38,7 @@ | |||
| #define TTM_MAX_BO_PRIORITY	4U | ||||
| #define TTM_NUM_MEM_TYPES 8 | ||||
| 
 | ||||
| struct dmem_cgroup_device; | ||||
| struct ttm_device; | ||||
| struct ttm_resource_manager; | ||||
| struct ttm_resource; | ||||
|  | @ -211,6 +212,11 @@ struct ttm_resource_manager { | |||
| 	 * bdev->lru_lock. | ||||
| 	 */ | ||||
| 	uint64_t usage; | ||||
| 
 | ||||
| 	/**
 | ||||
| 	 * @cg: &dmem_cgroup_region used for memory accounting, if not NULL. | ||||
| 	 */ | ||||
| 	struct dmem_cgroup_region *cg; | ||||
| }; | ||||
| 
 | ||||
| /**
 | ||||
|  | @ -239,6 +245,7 @@ struct ttm_bus_placement { | |||
|  * @placement: Placement flags. | ||||
|  * @bus: Placement on io bus accessible to the CPU | ||||
|  * @bo: weak reference to the BO, protected by ttm_device::lru_lock | ||||
|  * @css: cgroup state this resource is charged to | ||||
|  * | ||||
|  * Structure indicating the placement and space resources used by a | ||||
|  * buffer object. | ||||
|  | @ -251,6 +258,8 @@ struct ttm_resource { | |||
| 	struct ttm_bus_placement bus; | ||||
| 	struct ttm_buffer_object *bo; | ||||
| 
 | ||||
| 	struct dmem_cgroup_pool_state *css; | ||||
| 
 | ||||
| 	/**
 | ||||
| 	 * @lru: Least recently used list, see &ttm_resource_manager.lru | ||||
| 	 */ | ||||
|  | @ -432,7 +441,8 @@ void ttm_resource_fini(struct ttm_resource_manager *man, | |||
| 
 | ||||
| int ttm_resource_alloc(struct ttm_buffer_object *bo, | ||||
| 		       const struct ttm_place *place, | ||||
| 		       struct ttm_resource **res); | ||||
| 		       struct ttm_resource **res, | ||||
| 		       struct dmem_cgroup_pool_state **ret_limit_pool); | ||||
| void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res); | ||||
| bool ttm_resource_intersects(struct ttm_device *bdev, | ||||
| 			     struct ttm_resource *res, | ||||
|  |  | |||
							
								
								
									
include/linux/cgroup_dmem.h (new file, 66 lines)
									
								
							|  | @ -0,0 +1,66 @@ | |||
| /* SPDX-License-Identifier: MIT */ | ||||
| /*
 | ||||
|  * Copyright © 2023-2024 Intel Corporation | ||||
|  */ | ||||
| 
 | ||||
| #ifndef _CGROUP_DMEM_H | ||||
| #define _CGROUP_DMEM_H | ||||
| 
 | ||||
| #include <linux/types.h> | ||||
| #include <linux/llist.h> | ||||
| 
 | ||||
| struct dmem_cgroup_pool_state; | ||||
| 
 | ||||
| /* Opaque definition of a cgroup region, used internally */ | ||||
| struct dmem_cgroup_region; | ||||
| 
 | ||||
| #if IS_ENABLED(CONFIG_CGROUP_DMEM) | ||||
| struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) __printf(2,3); | ||||
| void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region); | ||||
| int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, | ||||
| 			   struct dmem_cgroup_pool_state **ret_pool, | ||||
| 			   struct dmem_cgroup_pool_state **ret_limit_pool); | ||||
| void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size); | ||||
| bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, | ||||
| 				      struct dmem_cgroup_pool_state *test_pool, | ||||
| 				      bool ignore_low, bool *ret_hit_low); | ||||
| 
 | ||||
| void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool); | ||||
| #else | ||||
| static inline __printf(2,3) struct dmem_cgroup_region * | ||||
| dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) | ||||
| { | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| static inline void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) | ||||
| { } | ||||
| 
 | ||||
| static inline int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, | ||||
| 					 struct dmem_cgroup_pool_state **ret_pool, | ||||
| 					 struct dmem_cgroup_pool_state **ret_limit_pool) | ||||
| { | ||||
| 	*ret_pool = NULL; | ||||
| 
 | ||||
| 	if (ret_limit_pool) | ||||
| 		*ret_limit_pool = NULL; | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static inline void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) | ||||
| { } | ||||
| 
 | ||||
| static inline | ||||
| bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, | ||||
| 				      struct dmem_cgroup_pool_state *test_pool, | ||||
| 				      bool ignore_low, bool *ret_hit_low) | ||||
| { | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| static inline void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) | ||||
| { } | ||||
| 
 | ||||
| #endif | ||||
| #endif	/* _CGROUP_DMEM_H */ | ||||
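The stubs above make the charging API safe to call unconditionally when
CONFIG_CGROUP_DMEM is disabled. Below is a hedged sketch of how a non-TTM
user (for example a dma-buf heap, one of the users named in the pull request)
might pair the calls around its own allocator; my_backing_alloc(),
my_backing_free(), my_heap_alloc() and my_heap_free() are hypothetical names:

    #include <linux/cgroup_dmem.h>

    /* Hypothetical backing allocator, stubbed out for illustration. */
    static int my_backing_alloc(u64 size) { return 0; }
    static void my_backing_free(u64 size) { }

    /* Charge before allocating backing memory, uncharge on any failure and
     * again when the buffer is freed. The pool returned by
     * dmem_cgroup_try_charge() must be kept for the later uncharge. */
    static int my_heap_alloc(struct dmem_cgroup_region *region, u64 size,
                             struct dmem_cgroup_pool_state **pool_ret)
    {
            struct dmem_cgroup_pool_state *pool;
            int ret;

            ret = dmem_cgroup_try_charge(region, size, &pool, NULL);
            if (ret)
                    return ret;     /* -EAGAIN means a dmem limit was hit */

            ret = my_backing_alloc(size);
            if (ret) {
                    dmem_cgroup_uncharge(pool, size);
                    return ret;
            }

            *pool_ret = pool;
            return 0;
    }

    static void my_heap_free(struct dmem_cgroup_pool_state *pool, u64 size)
    {
            my_backing_free(size);
            dmem_cgroup_uncharge(pool, size);       /* NULL pool is a no-op */
    }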
|  | @ -65,6 +65,10 @@ SUBSYS(rdma) | |||
| SUBSYS(misc) | ||||
| #endif | ||||
| 
 | ||||
| #if IS_ENABLED(CONFIG_CGROUP_DMEM) | ||||
| SUBSYS(dmem) | ||||
| #endif | ||||
| 
 | ||||
| /*
 | ||||
|  * The following subsystems are not supported on the default hierarchy. | ||||
|  */ | ||||
|  |  | |||
|  | @ -96,7 +96,7 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) | |||
| 	counter->watermark = usage; | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_MEMCG | ||||
| #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) | ||||
| void page_counter_calculate_protection(struct page_counter *root, | ||||
| 				       struct page_counter *counter, | ||||
| 				       bool recursive_protection); | ||||
|  |  | |||
							
								
								
									
init/Kconfig (10 lines added)
									
										
									
									
									
								
							|  | @ -1128,6 +1128,7 @@ config CGROUP_PIDS | |||
| 
 | ||||
| config CGROUP_RDMA | ||||
| 	bool "RDMA controller" | ||||
| 	select PAGE_COUNTER | ||||
| 	help | ||||
| 	  Provides enforcement of RDMA resources defined by IB stack. | ||||
| 	  It is fairly easy for consumers to exhaust RDMA resources, which | ||||
|  | @ -1136,6 +1137,15 @@ config CGROUP_RDMA | |||
| 	  Attaching processes with active RDMA resources to the cgroup | ||||
| 	  hierarchy is allowed even if can cross the hierarchy's limit. | ||||
| 
 | ||||
| config CGROUP_DMEM | ||||
| 	bool "Device memory controller (DMEM)" | ||||
| 	help | ||||
| 	  The DMEM controller allows compatible devices to restrict device | ||||
| 	  memory usage based on the cgroup hierarchy. | ||||
| 
 | ||||
| 	  As an example, it allows you to restrict VRAM usage for applications | ||||
| 	  in the DRM subsystem. | ||||
| 
 | ||||
| config CGROUP_FREEZER | ||||
| 	bool "Freezer controller" | ||||
| 	help | ||||
|  |  | |||
|  | @ -7,4 +7,5 @@ obj-$(CONFIG_CGROUP_RDMA) += rdma.o | |||
| obj-$(CONFIG_CPUSETS) += cpuset.o | ||||
| obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o | ||||
| obj-$(CONFIG_CGROUP_MISC) += misc.o | ||||
| obj-$(CONFIG_CGROUP_DMEM) += dmem.o | ||||
| obj-$(CONFIG_CGROUP_DEBUG) += debug.o | ||||
|  |  | |||
							
								
								
									
kernel/cgroup/dmem.c (new file, 861 lines)
									
								
							|  | @ -0,0 +1,861 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| /*
 | ||||
|  * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>) | ||||
|  * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>) | ||||
|  * Partially based on the rdma and misc controllers, which bear the following copyrights: | ||||
|  * | ||||
|  * Copyright 2020 Google LLC | ||||
|  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> | ||||
|  */ | ||||
| 
 | ||||
| #include <linux/cgroup.h> | ||||
| #include <linux/cgroup_dmem.h> | ||||
| #include <linux/list.h> | ||||
| #include <linux/mutex.h> | ||||
| #include <linux/page_counter.h> | ||||
| #include <linux/parser.h> | ||||
| #include <linux/slab.h> | ||||
| 
 | ||||
| struct dmem_cgroup_region { | ||||
| 	/**
 | ||||
| 	 * @ref: References keeping the region alive. | ||||
| 	 * Keeps the region reference alive after a successful RCU lookup. | ||||
| 	 */ | ||||
| 	struct kref ref; | ||||
| 
 | ||||
| 	/** @rcu: RCU head for freeing */ | ||||
| 	struct rcu_head rcu; | ||||
| 
 | ||||
| 	/**
 | ||||
| 	 * @region_node: Linked into &dmem_cgroup_regions list. | ||||
| 	 * Protected by RCU and global spinlock. | ||||
| 	 */ | ||||
| 	struct list_head region_node; | ||||
| 
 | ||||
| 	/**
 | ||||
| 	 * @pools: List of pools linked to this region. | ||||
| 	 * Protected by global spinlock only | ||||
| 	 */ | ||||
| 	struct list_head pools; | ||||
| 
 | ||||
| 	/** @size: Size of region, in bytes */ | ||||
| 	u64 size; | ||||
| 
 | ||||
| 	/** @name: Name describing the node, set by dmem_cgroup_register_region */ | ||||
| 	char *name; | ||||
| 
 | ||||
| 	/**
 | ||||
| 	 * @unregistered: Whether the region is unregistered by its caller. | ||||
| 	 * No new pools should be added to the region afterwards. | ||||
| 	 */ | ||||
| 	bool unregistered; | ||||
| }; | ||||
| 
 | ||||
| struct dmemcg_state { | ||||
| 	struct cgroup_subsys_state css; | ||||
| 
 | ||||
| 	struct list_head pools; | ||||
| }; | ||||
| 
 | ||||
| struct dmem_cgroup_pool_state { | ||||
| 	struct dmem_cgroup_region *region; | ||||
| 	struct dmemcg_state *cs; | ||||
| 
 | ||||
| 	/* css node, RCU protected against region teardown */ | ||||
| 	struct list_head	css_node; | ||||
| 
 | ||||
| 	/* dev node, no RCU protection required */ | ||||
| 	struct list_head	region_node; | ||||
| 
 | ||||
| 	struct rcu_head rcu; | ||||
| 
 | ||||
| 	struct page_counter cnt; | ||||
| 
 | ||||
| 	bool inited; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * 3 operations require locking protection: | ||||
|  * - Registering and unregistering region to/from list, requires global lock. | ||||
|  * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed. | ||||
|  * - Adding a dmem_cgroup_pool_state to a region list. | ||||
|  * | ||||
|  * Since for the most common operations RCU provides enough protection, I | ||||
|  * do not think more granular locking makes sense. Most protection is offered | ||||
|  * by RCU and the lockless operating page_counter. | ||||
|  */ | ||||
| static DEFINE_SPINLOCK(dmemcg_lock); | ||||
| static LIST_HEAD(dmem_cgroup_regions); | ||||
| 
 | ||||
| static inline struct dmemcg_state * | ||||
| css_to_dmemcs(struct cgroup_subsys_state *css) | ||||
| { | ||||
| 	return container_of(css, struct dmemcg_state, css); | ||||
| } | ||||
| 
 | ||||
| static inline struct dmemcg_state *get_current_dmemcs(void) | ||||
| { | ||||
| 	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id)); | ||||
| } | ||||
| 
 | ||||
| static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg) | ||||
| { | ||||
| 	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL; | ||||
| } | ||||
| 
 | ||||
| static void free_cg_pool(struct dmem_cgroup_pool_state *pool) | ||||
| { | ||||
| 	list_del(&pool->region_node); | ||||
| 	kfree(pool); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val) | ||||
| { | ||||
| 	page_counter_set_min(&pool->cnt, val); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val) | ||||
| { | ||||
| 	page_counter_set_low(&pool->cnt, val); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val) | ||||
| { | ||||
| 	page_counter_set_max(&pool->cnt, val); | ||||
| } | ||||
| 
 | ||||
| static u64 get_resource_low(struct dmem_cgroup_pool_state *pool) | ||||
| { | ||||
| 	return pool ? READ_ONCE(pool->cnt.low) : 0; | ||||
| } | ||||
| 
 | ||||
| static u64 get_resource_min(struct dmem_cgroup_pool_state *pool) | ||||
| { | ||||
| 	return pool ? READ_ONCE(pool->cnt.min) : 0; | ||||
| } | ||||
| 
 | ||||
| static u64 get_resource_max(struct dmem_cgroup_pool_state *pool) | ||||
| { | ||||
| 	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX; | ||||
| } | ||||
| 
 | ||||
| static u64 get_resource_current(struct dmem_cgroup_pool_state *pool) | ||||
| { | ||||
| 	return pool ? page_counter_read(&pool->cnt) : 0; | ||||
| } | ||||
| 
 | ||||
| static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool) | ||||
| { | ||||
| 	set_resource_min(rpool, 0); | ||||
| 	set_resource_low(rpool, 0); | ||||
| 	set_resource_max(rpool, PAGE_COUNTER_MAX); | ||||
| } | ||||
| 
 | ||||
| static void dmemcs_offline(struct cgroup_subsys_state *css) | ||||
| { | ||||
| 	struct dmemcg_state *dmemcs = css_to_dmemcs(css); | ||||
| 	struct dmem_cgroup_pool_state *pool; | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node) | ||||
| 		reset_all_resource_limits(pool); | ||||
| 	rcu_read_unlock(); | ||||
| } | ||||
| 
 | ||||
| static void dmemcs_free(struct cgroup_subsys_state *css) | ||||
| { | ||||
| 	struct dmemcg_state *dmemcs = css_to_dmemcs(css); | ||||
| 	struct dmem_cgroup_pool_state *pool, *next; | ||||
| 
 | ||||
| 	spin_lock(&dmemcg_lock); | ||||
| 	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) { | ||||
| 		/*
 | ||||
| 		 * The pool is dead and all references are 0, | ||||
| 		 * no need for RCU protection with list_del_rcu or freeing. | ||||
| 		 */ | ||||
| 		list_del(&pool->css_node); | ||||
| 		free_cg_pool(pool); | ||||
| 	} | ||||
| 	spin_unlock(&dmemcg_lock); | ||||
| 
 | ||||
| 	kfree(dmemcs); | ||||
| } | ||||
| 
 | ||||
| static struct cgroup_subsys_state * | ||||
| dmemcs_alloc(struct cgroup_subsys_state *parent_css) | ||||
| { | ||||
| 	struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL); | ||||
| 	if (!dmemcs) | ||||
| 		return ERR_PTR(-ENOMEM); | ||||
| 
 | ||||
| 	INIT_LIST_HEAD(&dmemcs->pools); | ||||
| 	return &dmemcs->css; | ||||
| } | ||||
| 
 | ||||
| static struct dmem_cgroup_pool_state * | ||||
| find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region) | ||||
| { | ||||
| 	struct dmem_cgroup_pool_state *pool; | ||||
| 
 | ||||
| 	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock)) | ||||
| 		if (pool->region == region) | ||||
| 			return pool; | ||||
| 
 | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool) | ||||
| { | ||||
| 	if (!pool->cnt.parent) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	return container_of(pool->cnt.parent, typeof(*pool), cnt); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, | ||||
| 				 struct dmem_cgroup_pool_state *test_pool) | ||||
| { | ||||
| 	struct page_counter *climit; | ||||
| 	struct cgroup_subsys_state *css, *next_css; | ||||
| 	struct dmemcg_state *dmemcg_iter; | ||||
| 	struct dmem_cgroup_pool_state *pool, *parent_pool; | ||||
| 	bool found_descendant; | ||||
| 
 | ||||
| 	climit = &limit_pool->cnt; | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 	parent_pool = pool = limit_pool; | ||||
| 	css = &limit_pool->cs->css; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * This logic is roughly equivalent to css_foreach_descendant_pre, | ||||
| 	 * except we also track the parent pool to find out which pool we need | ||||
| 	 * to calculate protection values for. | ||||
| 	 * | ||||
| 	 * We can stop the traversal once we find test_pool among the | ||||
| 	 * descendants since we don't really care about any others. | ||||
| 	 */ | ||||
| 	while (pool != test_pool) { | ||||
| 		next_css = css_next_child(NULL, css); | ||||
| 		if (next_css) { | ||||
| 			parent_pool = pool; | ||||
| 		} else { | ||||
| 			while (css != &limit_pool->cs->css) { | ||||
| 				next_css = css_next_child(css, css->parent); | ||||
| 				if (next_css) | ||||
| 					break; | ||||
| 				css = css->parent; | ||||
| 				parent_pool = pool_parent(parent_pool); | ||||
| 			} | ||||
| 			/*
 | ||||
| 			 * We can only hit this when test_pool is not a | ||||
| 			 * descendant of limit_pool. | ||||
| 			 */ | ||||
| 			if (WARN_ON_ONCE(css == &limit_pool->cs->css)) | ||||
| 				break; | ||||
| 		} | ||||
| 		css = next_css; | ||||
| 
 | ||||
| 		found_descendant = false; | ||||
| 		dmemcg_iter = container_of(css, struct dmemcg_state, css); | ||||
| 
 | ||||
| 		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { | ||||
| 			if (pool_parent(pool) == parent_pool) { | ||||
| 				found_descendant = true; | ||||
| 				break; | ||||
| 			} | ||||
| 		} | ||||
| 		if (!found_descendant) | ||||
| 			continue; | ||||
| 
 | ||||
| 		page_counter_calculate_protection( | ||||
| 			climit, &pool->cnt, true); | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool | ||||
|  * @limit_pool: The pool for which we hit limits | ||||
|  * @test_pool: The pool for which to test | ||||
|  * @ignore_low: Whether we have to respect low watermarks. | ||||
|  * @ret_hit_low: Pointer to whether it makes sense to consider low watermark. | ||||
|  * | ||||
|  * This function returns true if we can evict from @test_pool, false if not. | ||||
|  * When returning false and @ignore_low is false, @ret_hit_low may | ||||
|  * be set to true to indicate this function can be retried with @ignore_low | ||||
|  * set to true. | ||||
|  * | ||||
|  * Return: bool | ||||
|  */ | ||||
| bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, | ||||
| 				      struct dmem_cgroup_pool_state *test_pool, | ||||
| 				      bool ignore_low, bool *ret_hit_low) | ||||
| { | ||||
| 	struct dmem_cgroup_pool_state *pool = test_pool; | ||||
| 	struct page_counter *climit, *ctest; | ||||
| 	u64 used, min, low; | ||||
| 
 | ||||
| 	/* Can always evict from current pool, despite limits */ | ||||
| 	if (limit_pool == test_pool) | ||||
| 		return true; | ||||
| 
 | ||||
| 	if (limit_pool) { | ||||
| 		if (!parent_dmemcs(limit_pool->cs)) | ||||
| 			return true; | ||||
| 
 | ||||
| 		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool)) | ||||
| 			{} | ||||
| 
 | ||||
| 		if (!pool) | ||||
| 			return false; | ||||
| 	} else { | ||||
| 		/*
 | ||||
| 		 * If there is no cgroup limiting memory usage, use the root | ||||
| 		 * cgroup instead for limit calculations. | ||||
| 		 */ | ||||
| 		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool)) | ||||
| 			{} | ||||
| 	} | ||||
| 
 | ||||
| 	climit = &limit_pool->cnt; | ||||
| 	ctest = &test_pool->cnt; | ||||
| 
 | ||||
| 	dmem_cgroup_calculate_protection(limit_pool, test_pool); | ||||
| 
 | ||||
| 	used = page_counter_read(ctest); | ||||
| 	min = READ_ONCE(ctest->emin); | ||||
| 
 | ||||
| 	if (used <= min) | ||||
| 		return false; | ||||
| 
 | ||||
| 	if (!ignore_low) { | ||||
| 		low = READ_ONCE(ctest->elow); | ||||
| 		if (used > low) | ||||
| 			return true; | ||||
| 
 | ||||
| 		*ret_hit_low = true; | ||||
| 		return false; | ||||
| 	} | ||||
| 	return true; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable); | ||||
| 
 | ||||
| static struct dmem_cgroup_pool_state * | ||||
| alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, | ||||
| 		  struct dmem_cgroup_pool_state **allocpool) | ||||
| { | ||||
| 	struct dmemcg_state *parent = parent_dmemcs(dmemcs); | ||||
| 	struct dmem_cgroup_pool_state *pool, *ppool = NULL; | ||||
| 
 | ||||
| 	if (!*allocpool) { | ||||
| 		pool = kzalloc(sizeof(*pool), GFP_NOWAIT); | ||||
| 		if (!pool) | ||||
| 			return ERR_PTR(-ENOMEM); | ||||
| 	} else { | ||||
| 		pool = *allocpool; | ||||
| 		*allocpool = NULL; | ||||
| 	} | ||||
| 
 | ||||
| 	pool->region = region; | ||||
| 	pool->cs = dmemcs; | ||||
| 
 | ||||
| 	if (parent) | ||||
| 		ppool = find_cg_pool_locked(parent, region); | ||||
| 
 | ||||
| 	page_counter_init(&pool->cnt, | ||||
| 			  ppool ? &ppool->cnt : NULL, true); | ||||
| 	reset_all_resource_limits(pool); | ||||
| 
 | ||||
| 	list_add_tail_rcu(&pool->css_node, &dmemcs->pools); | ||||
| 	list_add_tail(&pool->region_node, ®ion->pools); | ||||
| 
 | ||||
| 	if (!parent) | ||||
| 		pool->inited = true; | ||||
| 	else | ||||
| 		pool->inited = ppool ? ppool->inited : false; | ||||
| 	return pool; | ||||
| } | ||||
| 
 | ||||
| static struct dmem_cgroup_pool_state * | ||||
| get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, | ||||
| 		   struct dmem_cgroup_pool_state **allocpool) | ||||
| { | ||||
| 	struct dmem_cgroup_pool_state *pool, *ppool, *retpool; | ||||
| 	struct dmemcg_state *p, *pp; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Recursively create pool, we may not initialize yet on | ||||
| 	 * recursion, this is done as a separate step. | ||||
| 	 */ | ||||
| 	for (p = dmemcs; p; p = parent_dmemcs(p)) { | ||||
| 		pool = find_cg_pool_locked(p, region); | ||||
| 		if (!pool) | ||||
| 			pool = alloc_pool_single(p, region, allocpool); | ||||
| 
 | ||||
| 		if (IS_ERR(pool)) | ||||
| 			return pool; | ||||
| 
 | ||||
| 		if (p == dmemcs && pool->inited) | ||||
| 			return pool; | ||||
| 
 | ||||
| 		if (pool->inited) | ||||
| 			break; | ||||
| 	} | ||||
| 
 | ||||
| 	retpool = pool = find_cg_pool_locked(dmemcs, region); | ||||
| 	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) { | ||||
| 		if (pool->inited) | ||||
| 			break; | ||||
| 
 | ||||
| 		/* ppool was created if it didn't exist by above loop. */ | ||||
| 		ppool = find_cg_pool_locked(pp, region); | ||||
| 
 | ||||
| 		/* Fix up parent links, mark as inited. */ | ||||
| 		pool->cnt.parent = &ppool->cnt; | ||||
| 		pool->inited = true; | ||||
| 
 | ||||
| 		pool = ppool; | ||||
| 	} | ||||
| 
 | ||||
| 	return retpool; | ||||
| } | ||||
| 
 | ||||
| static void dmemcg_free_rcu(struct rcu_head *rcu) | ||||
| { | ||||
| 	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu); | ||||
| 	struct dmem_cgroup_pool_state *pool, *next; | ||||
| 
 | ||||
| 	list_for_each_entry_safe(pool, next, ®ion->pools, region_node) | ||||
| 		free_cg_pool(pool); | ||||
| 	kfree(region->name); | ||||
| 	kfree(region); | ||||
| } | ||||
| 
 | ||||
| static void dmemcg_free_region(struct kref *ref) | ||||
| { | ||||
| 	struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref); | ||||
| 
 | ||||
| 	call_rcu(&cgregion->rcu, dmemcg_free_rcu); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * dmem_cgroup_unregister_region() - Unregister a previously registered region. | ||||
|  * @region: The region to unregister. | ||||
|  * | ||||
|  * This function undoes dmem_cgroup_register_region. | ||||
|  */ | ||||
| void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) | ||||
| { | ||||
| 	struct list_head *entry; | ||||
| 
 | ||||
| 	if (!region) | ||||
| 		return; | ||||
| 
 | ||||
| 	spin_lock(&dmemcg_lock); | ||||
| 
 | ||||
| 	/* Remove from global region list */ | ||||
| 	list_del_rcu(®ion->region_node); | ||||
| 
 | ||||
| 	list_for_each_rcu(entry, ®ion->pools) { | ||||
| 		struct dmem_cgroup_pool_state *pool = | ||||
| 			container_of(entry, typeof(*pool), region_node); | ||||
| 
 | ||||
| 		list_del_rcu(&pool->css_node); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Ensure any RCU based lookups fail. Additionally, | ||||
| 	 * no new pools should be added to the dead region | ||||
| 	 * by get_cg_pool_unlocked. | ||||
| 	 */ | ||||
| 	region->unregistered = true; | ||||
| 	spin_unlock(&dmemcg_lock); | ||||
| 
 | ||||
| 	kref_put(®ion->ref, dmemcg_free_region); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region); | ||||
| 
 | ||||
| /**
 | ||||
|  * dmem_cgroup_register_region() - Register a region with the dmem controller. | ||||
|  * @size: Size of region to register, in bytes. | ||||
|  * @fmt: printf-style format string used to build the region name | ||||
|  * | ||||
|  * This function registers a node in the dmem cgroup with the | ||||
|  * name given. After calling this function, the region can be | ||||
|  * used for allocations. | ||||
|  * | ||||
|  * Return: The new region on success, NULL if @size is zero, or an ERR_PTR() on failure. | ||||
|  */ | ||||
| struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...) | ||||
| { | ||||
| 	struct dmem_cgroup_region *ret; | ||||
| 	char *region_name; | ||||
| 	va_list ap; | ||||
| 
 | ||||
| 	if (!size) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	va_start(ap, fmt); | ||||
| 	region_name = kvasprintf(GFP_KERNEL, fmt, ap); | ||||
| 	va_end(ap); | ||||
| 	if (!region_name) | ||||
| 		return ERR_PTR(-ENOMEM); | ||||
| 
 | ||||
| 	ret = kzalloc(sizeof(*ret), GFP_KERNEL); | ||||
| 	if (!ret) { | ||||
| 		kfree(region_name); | ||||
| 		return ERR_PTR(-ENOMEM); | ||||
| 	} | ||||
| 
 | ||||
| 	INIT_LIST_HEAD(&ret->pools); | ||||
| 	ret->name = region_name; | ||||
| 	ret->size = size; | ||||
| 	kref_init(&ret->ref); | ||||
| 
 | ||||
| 	spin_lock(&dmemcg_lock); | ||||
| 	list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions); | ||||
| 	spin_unlock(&dmemcg_lock); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(dmem_cgroup_register_region); | ||||
| 
 | ||||
| static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name) | ||||
| { | ||||
| 	struct dmem_cgroup_region *region; | ||||
| 
 | ||||
| 	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock)) | ||||
| 		if (!strcmp(name, region->name) && | ||||
| 		    kref_get_unless_zero(®ion->ref)) | ||||
| 			return region; | ||||
| 
 | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state | ||||
|  * @pool: &dmem_cgroup_pool_state | ||||
|  * | ||||
|  * Called to drop a reference to the limiting pool returned by | ||||
|  * dmem_cgroup_try_charge(). | ||||
|  */ | ||||
| void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) | ||||
| { | ||||
| 	if (pool) | ||||
| 		css_put(&pool->cs->css); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put); | ||||
| 
 | ||||
| static struct dmem_cgroup_pool_state * | ||||
| get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) | ||||
| { | ||||
| 	struct dmem_cgroup_pool_state *pool, *allocpool = NULL; | ||||
| 
 | ||||
| 	/* fastpath lookup? */ | ||||
| 	rcu_read_lock(); | ||||
| 	pool = find_cg_pool_locked(cg, region); | ||||
| 	if (pool && !READ_ONCE(pool->inited)) | ||||
| 		pool = NULL; | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	while (!pool) { | ||||
| 		spin_lock(&dmemcg_lock); | ||||
| 		if (!region->unregistered) | ||||
| 			pool = get_cg_pool_locked(cg, region, &allocpool); | ||||
| 		else | ||||
| 			pool = ERR_PTR(-ENODEV); | ||||
| 		spin_unlock(&dmemcg_lock); | ||||
| 
 | ||||
| 		if (pool == ERR_PTR(-ENOMEM)) { | ||||
| 			pool = NULL; | ||||
| 			if (WARN_ON(allocpool)) | ||||
| 				continue; | ||||
| 
 | ||||
| 			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL); | ||||
| 			if (allocpool) { | ||||
| 				pool = NULL; | ||||
| 				continue; | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	kfree(allocpool); | ||||
| 	return pool; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * dmem_cgroup_uncharge() - Uncharge a pool. | ||||
|  * @pool: Pool to uncharge. | ||||
|  * @size: Size to uncharge. | ||||
|  * | ||||
|  * Undoes the effects of dmem_cgroup_try_charge(). | ||||
|  * Must be called with the pool returned by dmem_cgroup_try_charge() | ||||
|  * and the same @size. | ||||
|  */ | ||||
| void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) | ||||
| { | ||||
| 	if (!pool) | ||||
| 		return; | ||||
| 
 | ||||
| 	page_counter_uncharge(&pool->cnt, size); | ||||
| 	css_put(&pool->cs->css); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge); | ||||
| 
 | ||||
| /**
 | ||||
|  * dmem_cgroup_try_charge() - Try charging a new allocation to a region. | ||||
|  * @region: Region to charge | ||||
|  * @size: Size (in bytes) to charge. | ||||
|  * @ret_pool: On successful allocation, the pool that is charged. | ||||
|  * @ret_limit_pool: On a failed allocation, the limiting pool. | ||||
|  * | ||||
|  * This function charges the current cgroup's pool for @region with | ||||
|  * @size bytes. | ||||
|  * | ||||
|  * If the function succeeds, @ret_pool is set, which must be passed to | ||||
|  * dmem_cgroup_uncharge() when undoing the allocation. | ||||
|  * | ||||
|  * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it | ||||
|  * will be set to the pool for which the limit is hit. This can be used for | ||||
|  * eviction as argument to dmem_cgroup_state_evict_valuable(). This reference | ||||
|  * must be freed with dmem_cgroup_pool_state_put(). | ||||
|  * | ||||
|  * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure. | ||||
|  */ | ||||
| int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, | ||||
| 			  struct dmem_cgroup_pool_state **ret_pool, | ||||
| 			  struct dmem_cgroup_pool_state **ret_limit_pool) | ||||
| { | ||||
| 	struct dmemcg_state *cg; | ||||
| 	struct dmem_cgroup_pool_state *pool; | ||||
| 	struct page_counter *fail; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	*ret_pool = NULL; | ||||
| 	if (ret_limit_pool) | ||||
| 		*ret_limit_pool = NULL; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * hold on to css, as cgroup can be removed but resource | ||||
| 	 * accounting happens on css. | ||||
| 	 */ | ||||
| 	cg = get_current_dmemcs(); | ||||
| 
 | ||||
| 	pool = get_cg_pool_unlocked(cg, region); | ||||
| 	if (IS_ERR(pool)) { | ||||
| 		ret = PTR_ERR(pool); | ||||
| 		goto err; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!page_counter_try_charge(&pool->cnt, size, &fail)) { | ||||
| 		if (ret_limit_pool) { | ||||
| 			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt); | ||||
| 			css_get(&(*ret_limit_pool)->cs->css); | ||||
| 		} | ||||
| 		ret = -EAGAIN; | ||||
| 		goto err; | ||||
| 	} | ||||
| 
 | ||||
| 	/* On success, reference from get_current_dmemcs is transferred to *ret_pool */ | ||||
| 	*ret_pool = pool; | ||||
| 	return 0; | ||||
| 
 | ||||
| err: | ||||
| 	css_put(&cg->css); | ||||
| 	return ret; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge); | ||||
| 
 | ||||
| static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v) | ||||
| { | ||||
| 	struct dmem_cgroup_region *region; | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { | ||||
| 		seq_puts(sf, region->name); | ||||
| 		seq_printf(sf, " %llu\n", region->size); | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region, | ||||
| 			      u64 *new_limit) | ||||
| { | ||||
| 	char *end; | ||||
| 
 | ||||
| 	if (!strcmp(options, "max")) { | ||||
| 		*new_limit = PAGE_COUNTER_MAX; | ||||
| 		return 0; | ||||
| 	} | ||||
| 
 | ||||
| 	*new_limit = memparse(options, &end); | ||||
| 	if (*end != '\0') | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, | ||||
| 				 char *buf, size_t nbytes, loff_t off, | ||||
| 				 void (*apply)(struct dmem_cgroup_pool_state *, u64)) | ||||
| { | ||||
| 	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of)); | ||||
| 	int err = 0; | ||||
| 
 | ||||
| 	while (buf && !err) { | ||||
| 		struct dmem_cgroup_pool_state *pool = NULL; | ||||
| 		char *options, *region_name; | ||||
| 		struct dmem_cgroup_region *region; | ||||
| 		u64 new_limit; | ||||
| 
 | ||||
| 		options = buf; | ||||
| 		buf = strchr(buf, '\n'); | ||||
| 		if (buf) | ||||
| 			*buf++ = '\0'; | ||||
| 
 | ||||
| 		options = strstrip(options); | ||||
| 
 | ||||
| 		/* eat empty lines */ | ||||
| 		if (!options[0]) | ||||
| 			continue; | ||||
| 
 | ||||
| 		region_name = strsep(&options, " \t"); | ||||
| 		if (!region_name[0]) | ||||
| 			continue; | ||||
| 
 | ||||
| 		rcu_read_lock(); | ||||
| 		region = dmemcg_get_region_by_name(region_name); | ||||
| 		rcu_read_unlock(); | ||||
| 
 | ||||
| 		if (!region) | ||||
| 			return -EINVAL; | ||||
| 
 | ||||
| 		err = dmemcg_parse_limit(options, region, &new_limit); | ||||
| 		if (err < 0) | ||||
| 			goto out_put; | ||||
| 
 | ||||
| 		pool = get_cg_pool_unlocked(dmemcs, region); | ||||
| 		if (IS_ERR(pool)) { | ||||
| 			err = PTR_ERR(pool); | ||||
| 			goto out_put; | ||||
| 		} | ||||
| 
 | ||||
| 		/* And commit */ | ||||
| 		apply(pool, new_limit); | ||||
| 
 | ||||
| out_put: | ||||
| 		kref_put(®ion->ref, dmemcg_free_region); | ||||
| 	} | ||||
| 
 | ||||
| 
 | ||||
| 	return err ?: nbytes; | ||||
| } | ||||
| 
 | ||||
| static int dmemcg_limit_show(struct seq_file *sf, void *v, | ||||
| 			    u64 (*fn)(struct dmem_cgroup_pool_state *)) | ||||
| { | ||||
| 	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf)); | ||||
| 	struct dmem_cgroup_region *region; | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { | ||||
| 		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region); | ||||
| 		u64 val; | ||||
| 
 | ||||
| 		seq_puts(sf, region->name); | ||||
| 
 | ||||
| 		val = fn(pool); | ||||
| 		if (val < PAGE_COUNTER_MAX) | ||||
| 			seq_printf(sf, " %lld\n", val); | ||||
| 		else | ||||
| 			seq_puts(sf, " max\n"); | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v) | ||||
| { | ||||
| 	return dmemcg_limit_show(sf, v, get_resource_current); | ||||
| } | ||||
| 
 | ||||
| static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v) | ||||
| { | ||||
| 	return dmemcg_limit_show(sf, v, get_resource_min); | ||||
| } | ||||
| 
 | ||||
| static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of, | ||||
| 				      char *buf, size_t nbytes, loff_t off) | ||||
| { | ||||
| 	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min); | ||||
| } | ||||
| 
 | ||||
| static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v) | ||||
| { | ||||
| 	return dmemcg_limit_show(sf, v, get_resource_low); | ||||
| } | ||||
| 
 | ||||
| static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of, | ||||
| 				      char *buf, size_t nbytes, loff_t off) | ||||
| { | ||||
| 	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low); | ||||
| } | ||||
| 
 | ||||
| static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v) | ||||
| { | ||||
| 	return dmemcg_limit_show(sf, v, get_resource_max); | ||||
| } | ||||
| 
 | ||||
| static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of, | ||||
| 				      char *buf, size_t nbytes, loff_t off) | ||||
| { | ||||
| 	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max); | ||||
| } | ||||
| 
 | ||||
| static struct cftype files[] = { | ||||
| 	{ | ||||
| 		.name = "capacity", | ||||
| 		.seq_show = dmem_cgroup_region_capacity_show, | ||||
| 		.flags = CFTYPE_ONLY_ON_ROOT, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "current", | ||||
| 		.seq_show = dmem_cgroup_region_current_show, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "min", | ||||
| 		.write = dmem_cgroup_region_min_write, | ||||
| 		.seq_show = dmem_cgroup_region_min_show, | ||||
| 		.flags = CFTYPE_NOT_ON_ROOT, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "low", | ||||
| 		.write = dmem_cgroup_region_low_write, | ||||
| 		.seq_show = dmem_cgroup_region_low_show, | ||||
| 		.flags = CFTYPE_NOT_ON_ROOT, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "max", | ||||
| 		.write = dmem_cgroup_region_max_write, | ||||
| 		.seq_show = dmem_cgroup_region_max_show, | ||||
| 		.flags = CFTYPE_NOT_ON_ROOT, | ||||
| 	}, | ||||
| 	{ } /* Zero entry terminates. */ | ||||
| }; | ||||
| 
 | ||||
| struct cgroup_subsys dmem_cgrp_subsys = { | ||||
| 	.css_alloc	= dmemcs_alloc, | ||||
| 	.css_free	= dmemcs_free, | ||||
| 	.css_offline	= dmemcs_offline, | ||||
| 	.legacy_cftypes	= files, | ||||
| 	.dfl_cftypes	= files, | ||||
| }; | ||||
|  | @ -288,7 +288,7 @@ int page_counter_memparse(const char *buf, const char *max, | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| #ifdef CONFIG_MEMCG | ||||
| #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) | ||||
| /*
 | ||||
|  * This function calculates an individual page counter's effective | ||||
|  * protection which is derived from its own memory.min/low, its | ||||
|  | @ -460,4 +460,4 @@ void page_counter_calculate_protection(struct page_counter *root, | |||
| 			atomic_long_read(&parent->children_low_usage), | ||||
| 			recursive_protection)); | ||||
| } | ||||
| #endif /* CONFIG_MEMCG */ | ||||
| #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */ | ||||
|  |  | |||