  * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
 - * Copyright (c) 2019, 2024, Klara Inc.
 + * Copyright (c) 2019, 2024, 2025, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
@@ -337,6 +337,9 @@ static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 static clock_t arc_last_uncached_flush;

+static taskq_t *arc_evict_taskq;
+static struct evict_arg *arc_evict_arg;
+
 /*
  * Count of bytes evicted since boot.
  */
@@ -470,6 +473,18 @@ static int zfs_arc_prune_task_threads = 1;
 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
 static taskq_t *arc_flush_taskq;

+/*
+ * Controls the number of ARC eviction threads to dispatch sublists to.
+ *
+ * Possible values:
+ *   0 (auto)      compute the number of threads using a logarithmic formula.
+ *   1 (disabled)  one thread - parallel eviction is disabled.
+ *   2+ (manual)   set the number manually.
+ *
+ * See arc_evict_thread_init() for how "auto" is computed.
+ */
+static uint_t zfs_arc_evict_threads = 0;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -4049,6 +4064,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 	kmem_free(markers, sizeof (*markers) * count);
 }

+typedef struct evict_arg {
+	taskq_ent_t	eva_tqent;
+	multilist_t	*eva_ml;
+	arc_buf_hdr_t	*eva_marker;
+	int		eva_idx;
+	uint64_t	eva_spa;
+	uint64_t	eva_bytes;
+	uint64_t	eva_evicted;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
+	    eva->eva_marker, eva->eva_spa, eva->eva_bytes);
+}
+
+static void
+arc_evict_thread_init(void)
+{
+	if (zfs_arc_evict_threads == 0) {
+		/*
+		 * Compute the number of threads we want to use for eviction.
+		 *
+		 * Normally, it's log2(ncpus) + ncpus/32, which gets us to the
+		 * default max of 16 threads at ~256 CPUs.
+		 *
+		 * However, that formula goes to two threads at 4 CPUs, which
+		 * is still rather too low to be really useful, so we just go
+		 * with 1 thread at fewer than 6 cores.
+		 */
+		if (max_ncpus < 6)
+			zfs_arc_evict_threads = 1;
+		else
+			zfs_arc_evict_threads =
+			    (highbit64(max_ncpus) - 1) + max_ncpus / 32;
+	} else if (zfs_arc_evict_threads > max_ncpus)
+		zfs_arc_evict_threads = max_ncpus;
+
+	if (zfs_arc_evict_threads > 1) {
+		arc_evict_taskq = taskq_create("arc_evict",
+		    zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
+		    TASKQ_PREPOPULATE);
+		arc_evict_arg = kmem_zalloc(
+		    sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
+	}
+}
+
+/*
+ * The minimum number of bytes we can evict at once is a block size,
+ * so SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define	MIN_EVICT_SIZE	(SPA_MAXBLOCKSIZE)
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
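For a rough sense of what the "auto" setting produces, here is a small standalone sketch (not part of the patch) that feeds a few CPU counts through the sizing formula used in arc_evict_thread_init(). highbit64() is approximated with a GCC builtin purely for illustration.

#include <stdio.h>
#include <stdint.h>

static unsigned int
highbit64(uint64_t x)
{
	/* Position of the highest set bit, 1-based; 0 for x == 0. */
	return (x == 0 ? 0 : 64 - __builtin_clzll(x));
}

int
main(void)
{
	const unsigned int cpus[] = { 2, 4, 6, 16, 64, 256 };

	for (size_t i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++) {
		unsigned int n = cpus[i];
		unsigned int threads = (n < 6) ?
		    1 : (highbit64(n) - 1) + n / 32;
		printf("%4u CPUs -> %u eviction thread(s)\n", n, threads);
	}
	return (0);
}

With these inputs the sketch prints 1 thread up to 4 CPUs, 2 threads at 6 CPUs, 4 at 16, 8 at 64, and 16 at 256, matching the "default max of 16 threads at ~256 CPUs" noted in the comment above.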
@@ -4070,9 +4141,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 	multilist_t *ml = &state->arcs_list[type];
 	int num_sublists;
 	arc_buf_hdr_t **markers;
+	evict_arg_t *eva = NULL;

 	num_sublists = multilist_get_num_sublists(ml);

+	boolean_t use_evcttq = zfs_arc_evict_threads > 1;
+
 	/*
 	 * If we've tried to evict from each sublist, made some
 	 * progress, but still have not hit the target number of bytes
@@ -4094,25 +4168,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 		multilist_sublist_unlock(mls);
 	}

+	if (use_evcttq) {
+		if (zthr_iscurthread(arc_evict_zthr))
+			eva = arc_evict_arg;
+		else
+			eva = kmem_alloc(sizeof (evict_arg_t) *
+			    zfs_arc_evict_threads, KM_NOSLEEP);
+		if (eva) {
+			for (int i = 0; i < zfs_arc_evict_threads; i++) {
+				taskq_init_ent(&eva[i].eva_tqent);
+				eva[i].eva_ml = ml;
+				eva[i].eva_spa = spa;
+			}
+		} else {
+			/*
+			 * Fall back to the regular single evict if it is not
+			 * possible to allocate memory for the taskq entries.
+			 */
+			use_evcttq = B_FALSE;
+		}
+	}
+
+	/*
+	 * Start eviction using a randomly selected sublist; this is to try
+	 * to evenly balance eviction across all sublists. Always starting
+	 * at the same sublist (e.g. index 0) would cause evictions to favor
+	 * certain sublists over others.
+	 */
+	uint64_t scan_evicted = 0;
+	int sublists_left = num_sublists;
+	int sublist_idx = multilist_get_random_index(ml);
+
 	/*
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
 	 */
 	while (total_evicted < bytes) {
-		int sublist_idx = multilist_get_random_index(ml);
-		uint64_t scan_evicted = 0;
+		uint64_t evict = MIN_EVICT_SIZE;
+		uint_t ntasks = zfs_arc_evict_threads;

-		/*
-		 * Start eviction using a randomly selected sublist,
-		 * this is to try and evenly balance eviction across all
-		 * sublists. Always starting at the same sublist
-		 * (e.g. index 0) would cause evictions to favor certain
-		 * sublists over others.
-		 */
-		for (int i = 0; i < num_sublists; i++) {
+		if (use_evcttq) {
+			if (sublists_left < ntasks)
+				ntasks = sublists_left;
+
+			if (ntasks < 2)
+				use_evcttq = B_FALSE;
+		}
+
+		if (use_evcttq) {
+			uint64_t left = bytes - total_evicted;
+
+			if (bytes == ARC_EVICT_ALL) {
+				evict = bytes;
+			} else if (left > ntasks * MIN_EVICT_SIZE) {
+				evict = DIV_ROUND_UP(left, ntasks);
+			} else {
+				ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
+				if (ntasks == 1)
+					use_evcttq = B_FALSE;
+			}
+		}
+
+		for (int i = 0; sublists_left > 0; i++, sublist_idx++,
+		    sublists_left--) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;

+			/* we've reached the end, wrap to the beginning */
+			if (sublist_idx >= num_sublists)
+				sublist_idx = 0;
+
+			if (use_evcttq) {
+				if (i == ntasks)
+					break;
+
+				eva[i].eva_marker = markers[sublist_idx];
+				eva[i].eva_idx = sublist_idx;
+				eva[i].eva_bytes = evict;
+
+				taskq_dispatch_ent(arc_evict_taskq,
+				    arc_evict_task, &eva[i], 0,
+				    &eva[i].eva_tqent);
+
+				continue;
+			}
+
 			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
@@ -4123,18 +4263,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,

 			scan_evicted += bytes_evicted;
 			total_evicted += bytes_evicted;
+		}

-			/* we've reached the end, wrap to the beginning */
-			if (++sublist_idx >= num_sublists)
-				sublist_idx = 0;
+		if (use_evcttq) {
+			taskq_wait(arc_evict_taskq);
+
+			for (int i = 0; i < ntasks; i++) {
+				scan_evicted += eva[i].eva_evicted;
+				total_evicted += eva[i].eva_evicted;
+			}
 		}

 		/*
-		 * If we didn't evict anything during this scan, we have
-		 * no reason to believe we'll evict more during another
+		 * If we scanned all sublists and didn't evict anything, we
+		 * have no reason to believe we'll evict more during another
 		 * scan, so break the loop.
 		 */
-		if (scan_evicted == 0) {
+		if (scan_evicted == 0 && sublists_left == 0) {
 			/* This isn't possible, let's make that obvious */
 			ASSERT3S(bytes, !=, 0);

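To make the work division above concrete, here is a hypothetical standalone helper (not part of the patch) that mirrors the splitting logic, assuming MIN_EVICT_SIZE is SPA_MAXBLOCKSIZE = 16 MiB and ignoring the ARC_EVICT_ALL case.

#include <stdio.h>
#include <stdint.h>

#define	MIN_EVICT_SIZE	(16ULL << 20)	/* assumed value of SPA_MAXBLOCKSIZE */
#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static void
show_split(uint64_t left, unsigned int threads)
{
	unsigned int ntasks = threads;
	uint64_t evict = MIN_EVICT_SIZE;

	/* Either grow the per-task quota or shrink the number of tasks. */
	if (left > (uint64_t)ntasks * MIN_EVICT_SIZE)
		evict = DIV_ROUND_UP(left, ntasks);
	else
		ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);

	printf("%5llu MiB left, %u threads -> %u task(s) of %llu MiB\n",
	    (unsigned long long)(left >> 20), threads, ntasks,
	    (unsigned long long)(evict >> 20));
}

int
main(void)
{
	show_split(40ULL << 20, 8);	/* 3 tasks of 16 MiB each */
	show_split(1ULL << 30, 8);	/* 8 tasks of 128 MiB each */
	return (0);
}

When the remaining bytes would give each thread less than one MIN_EVICT_SIZE, the patch shrinks the number of tasks rather than the per-task quota, and drops back to the single-threaded path when only one task would remain; each dispatched entry then receives its share via eva_bytes.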
@@ -4151,13 +4296,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,

 			break;
 		}
+
+		/*
+		 * If we scanned all sublists but still have more to do,
+		 * reset the counts so we can go around again.
+		 */
+		if (sublists_left == 0) {
+			sublists_left = num_sublists;
+			sublist_idx = multilist_get_random_index(ml);
+			scan_evicted = 0;
+
+			/*
+			 * Since we're about to reconsider all sublists,
+			 * re-enable use of the evict threads if available.
+			 */
+			use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
+		}
 	}

+	if (eva != NULL && eva != arc_evict_arg)
+		kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);
+
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
+
 	if (markers != arc_state_evict_markers)
 		arc_state_free_markers(markers, num_sublists);

@@ -7805,6 +7970,7 @@ arc_set_limits(uint64_t allmem)
 	/* How to set default max varies by platform. */
 	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
+
 void
 arc_init(void)
 {
@@ -7882,6 +8048,8 @@ arc_init(void)
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

+	arc_evict_thread_init();
+
 	list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
 	    offsetof(arc_async_flush_t, af_node));
 	mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -7982,11 +8150,20 @@ arc_fini(void)
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);

+	if (arc_evict_taskq != NULL)
+		taskq_wait(arc_evict_taskq);
+
 	(void) zthr_cancel(arc_evict_zthr);
 	(void) zthr_cancel(arc_reap_zthr);
 	arc_state_free_markers(arc_state_evict_markers,
 	    arc_state_evict_marker_count);

+	if (arc_evict_taskq != NULL) {
+		taskq_destroy(arc_evict_taskq);
+		kmem_free(arc_evict_arg,
+		    sizeof (evict_arg_t) * zfs_arc_evict_threads);
+	}
+
 	mutex_destroy(&arc_evict_lock);
 	list_destroy(&arc_evict_waiters);

@@ -11110,3 +11287,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,

 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
 	"Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
+	"Number of threads to use for ARC eviction.");