diff --git a/cmd/arc_summary b/cmd/arc_summary index 7149629468e3..4b74f81826ea 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -270,16 +270,14 @@ def draw_graph(kstats_dict): arc_perc = f_perc(arc_stats['size'], arc_stats['c_max']) mfu_size = f_bytes(arc_stats['mfu_size']) mru_size = f_bytes(arc_stats['mru_size']) - meta_limit = f_bytes(arc_stats['arc_meta_limit']) meta_size = f_bytes(arc_stats['arc_meta_used']) dnode_limit = f_bytes(arc_stats['arc_dnode_limit']) dnode_size = f_bytes(arc_stats['dnode_size']) - info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) ' - 'DNODE {6} ({7})') + info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ' + 'DNODE {5} ({6})') info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size, - meta_size, meta_limit, dnode_size, - dnode_limit) + meta_size, dnode_size, dnode_limit) info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) info_line = GRAPH_INDENT+info_spc+info_line @@ -564,7 +562,6 @@ def section_arc(kstats_dict): mfug_size = arc_stats['mfu_ghost_size'] mrug_size = arc_stats['mru_ghost_size'] unc_size = arc_stats['uncached_size'] - meta_limit = arc_stats['arc_meta_limit'] meta_size = arc_stats['arc_meta_used'] dnode_limit = arc_stats['arc_dnode_limit'] dnode_size = arc_stats['dnode_size'] @@ -589,12 +586,10 @@ def section_arc(kstats_dict): prt_i1('Most Recently Used (MRU) ghost size:', f_bytes(mrug_size)) prt_i2('Uncached data size:', f_perc(unc_size, caches_size), f_bytes(unc_size)) - prt_i2('Metadata cache size (hard limit):', - f_perc(meta_limit, arc_max), f_bytes(meta_limit)) prt_i2('Metadata cache size (current):', - f_perc(meta_size, meta_limit), f_bytes(meta_size)) + f_perc(meta_size, arc_max), f_bytes(meta_size)) prt_i2('Dnode cache size (hard limit):', - f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit)) + f_perc(dnode_limit, arc_max), f_bytes(dnode_limit)) prt_i2('Dnode cache size (current):', f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) print() diff --git a/cmd/zdb/zdb.c 
b/cmd/zdb/zdb.c index b04b220c768e..4dcfbd612f7c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -116,7 +116,6 @@ zdb_ot_name(dmu_object_type_t type) extern int reference_tracking_enable; extern int zfs_recover; -extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; extern uint_t zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; @@ -8634,8 +8633,8 @@ main(int argc, char **argv) * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ - zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT; - zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; + zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT; + zfs_arc_max = 256 * 1024 * 1024; #endif /* diff --git a/include/sys/arc.h b/include/sys/arc.h index 09a66c31d5f0..ef7eda88263b 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -201,7 +201,6 @@ struct arc_buf { }; typedef enum arc_buf_contents { - ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 082372729b80..1bffa67100ca 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -81,15 +81,18 @@ typedef struct arc_state { * supports the "dbufs" kstat */ arc_state_type_t arcs_state; + /* + * total amount of data in this state. + */ + zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of evictable data in this state */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned; + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 
+ * amount of hit bytes for this state (ghost only) */ - zfs_refcount_t arcs_size; + wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; @@ -581,7 +584,9 @@ typedef struct arc_stats { kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; + kstat_named_t arcstat_meta; + kstat_named_t arcstat_pd; + kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; @@ -654,6 +659,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_anon_size; + kstat_named_t arcstat_anon_data; + kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -675,6 +682,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_mru_size; + kstat_named_t arcstat_mru_data; + kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -699,6 +708,8 @@ typedef struct arc_stats { * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; + kstat_named_t arcstat_mru_ghost_data; + kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -718,6 +729,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_mfu_size; + kstat_named_t arcstat_mfu_data; + kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu @@ -736,6 +749,8 @@ typedef struct arc_stats { * arcstat_mru_ghost_size for more details. 
*/ kstat_named_t arcstat_mfu_ghost_size; + kstat_named_t arcstat_mfu_ghost_data; + kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -753,6 +768,8 @@ typedef struct arc_stats { * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_size; + kstat_named_t arcstat_uncached_data; + kstat_named_t arcstat_uncached_metadata; /* * Number of data bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. @@ -875,10 +892,7 @@ typedef struct arc_stats { kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; /* Number of predictive prefetch requests. */ kstat_named_t arcstat_predictive_prefetch; @@ -986,7 +1000,7 @@ typedef struct arc_sums { wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; - aggsum_t arcstat_meta_used; + wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; @@ -1014,7 +1028,9 @@ typedef struct arc_evict_waiter { #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ +#define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ +#define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index f788e645c411..c199d3e0f2e5 
100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -548,14 +548,6 @@ This value acts as a ceiling to the amount of dnode metadata, and defaults to which indicates that a percent which is based on .Sy zfs_arc_dnode_limit_percent of the ARC meta buffers that may be used for dnodes. -.Pp -Also see -.Sy zfs_arc_meta_prune -which serves a similar purpose but is used -when the amount of metadata in the ARC exceeds -.Sy zfs_arc_meta_limit -rather than in response to overall demand for non-metadata. -. .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage that can be consumed by dnodes of ARC meta buffers. .Pp @@ -638,63 +630,6 @@ It cannot be set back to while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. . -.It Sy zfs_arc_meta_adjust_restarts Ns = Ns Sy 4096 Pq uint -The number of restart passes to make while scanning the ARC attempting -the free buffers in order to stay below the -.Sy fs_arc_meta_limit . -This value should not need to be tuned but is available to facilitate -performance analysis. -. -.It Sy zfs_arc_meta_limit Ns = Ns Sy 0 Ns B Pq u64 -The maximum allowed size in bytes that metadata buffers are allowed to -consume in the ARC. -When this limit is reached, metadata buffers will be reclaimed, -even if the overall -.Sy arc_c_max -has not been reached. -It defaults to -.Sy 0 , -which indicates that a percentage based on -.Sy zfs_arc_meta_limit_percent -of the ARC may be used for metadata. -.Pp -This value my be changed dynamically, except that must be set to an explicit -value -.Pq cannot be set back to Sy 0 . -. -.It Sy zfs_arc_meta_limit_percent Ns = Ns Sy 75 Ns % Pq u64 -Percentage of ARC buffers that can be used for metadata. -.Pp -See also -.Sy zfs_arc_meta_limit , -which serves a similar purpose but has a higher priority if nonzero. -. 
-.It Sy zfs_arc_meta_min Ns = Ns Sy 0 Ns B Pq u64 -The minimum allowed size in bytes that metadata buffers may consume in -the ARC. -. -.It Sy zfs_arc_meta_prune Ns = Ns Sy 10000 Pq int -The number of dentries and inodes to be scanned looking for entries -which can be dropped. -This may be required when the ARC reaches the -.Sy zfs_arc_meta_limit -because dentries and inodes can pin buffers in the ARC. -Increasing this value will cause to dentry and inode caches -to be pruned more aggressively. -Setting this value to -.Sy 0 -will disable pruning the inode and dentry caches. -. -.It Sy zfs_arc_meta_strategy Ns = Ns Sy 1 Ns | Ns 0 Pq uint -Define the strategy for ARC metadata buffer eviction (meta reclaim strategy): -.Bl -tag -compact -offset 4n -width "0 (META_ONLY)" -.It Sy 0 Pq META_ONLY -evict only the ARC metadata buffers -.It Sy 1 Pq BALANCED -additional data buffers may be evicted if required -to evict the required number of metadata buffers. -.El -. .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 Min size of ARC in bytes. .No If set to Sy 0 , arc_c_min @@ -776,20 +711,6 @@ causes the ARC to start reclamation if it exceeds the target size by of the target size, and block allocations by .Em 0.6% . . -.It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq uint -If nonzero, this will update -.Sy arc_p_min_shift Pq default Sy 4 -with the new value. -.Sy arc_p_min_shift No is used as a shift of Sy arc_c -when calculating the minumum -.Sy arc_p No size . -. -.It Sy zfs_arc_p_dampener_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int -Disable -.Sy arc_p -adapt dampener, which reduces the maximum single adjustment to -.Sy arc_p . -. 
.It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint If nonzero, this will update .Sy arc_shrink_shift Pq default Sy 7 diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index dfe5c3d311c2..a2ff0f386a9d 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -159,7 +159,7 @@ arc_prune_task(void *arg) /* * Notify registered consumers they must drop holds on a portion of the ARC * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * honor the metadata limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asynchronously so it may be safely called diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 05d58ad74e53..3a80d9ac1300 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -359,89 +359,114 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, "No reads during writes (LEGACY)"); /* END CSTYLED */ +static int +param_get_arc_state_size(SYSCTL_HANDLER_ARGS) +{ + arc_state_t *state = (arc_state_t *)arg1; + int64_t val; + + val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + return (sysctl_handle_64(oidp, &val, 0, req)); +} + extern arc_state_t ARC_anon; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_anon, 0, param_get_arc_state_size, "Q", + "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in anonymous state"); + "size of evictable metadata in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, 
anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in anonymous state"); + "size of evictable data in anonymous state"); /* END CSTYLED */ extern arc_state_t ARC_mru; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru, 0, param_get_arc_state_size, "Q", + "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru state"); + "size of evictable metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru state"); + "size of evictable data in mru state"); /* END CSTYLED */ extern arc_state_t ARC_mru_ghost; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", + "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru ghost state"); + "size of evictable metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru ghost state"); + "size of evictable data in mru ghost state"); /* END CSTYLED */ extern arc_state_t ARC_mfu; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu, 0, param_get_arc_state_size, 
"Q", + "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu state"); + "size of evictable metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu state"); + "size of evictable data in mfu state"); /* END CSTYLED */ extern arc_state_t ARC_mfu_ghost; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", + "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu ghost state"); + "size of evictable metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu ghost state"); + "size of evictable data in mfu ghost state"); /* END CSTYLED */ extern arc_state_t ARC_uncached; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_size, CTLFLAG_RD, - &ARC_uncached.arcs_size.rc_count, 0, "size of uncached state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_uncached, 0, param_get_arc_state_size, "Q", + "size of uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in uncached state"); + "size of evictable metadata in uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in uncached state"); + "size of evictable data in 
uncached state"); /* END CSTYLED */ extern arc_state_t ARC_l2c_only; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, - &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_l2c_only, 0, param_get_arc_state_size, "Q", + "size of l2c_only state"); /* END CSTYLED */ /* dbuf.c */ diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 6f730e9ddd83..b7d6053529b4 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -504,7 +504,7 @@ arc_prune_task(void *ptr) /* * Notify registered consumers they must drop holds on a portion of the ARC * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * honor the metadata limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asynchronously so it may be safely called diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 06af9cc7f940..e5097933cec5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -108,12 +108,11 @@ * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In + * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. + * they can be reclaimed. For example, when using the ZPL each dentry + * holds a reference on a znode. 
These dentries must be pruned before + * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. @@ -377,9 +376,6 @@ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; -/* shift of arc_c for calculating both min and max arc_p */ -static uint_t arc_p_min_shift = 4; - /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; @@ -422,13 +418,10 @@ boolean_t arc_warm; */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; -uint64_t zfs_arc_meta_limit = 0; -uint64_t zfs_arc_meta_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; -static uint_t zfs_arc_p_min_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* @@ -447,10 +440,10 @@ static const unsigned long zfs_arc_pool_dirty_percent = 20; int zfs_compressed_arc_enabled = B_TRUE; /* - * ARC will evict meta buffers that exceed arc_meta_limit. This - * tunable make arc_meta_limit adjustable for different workloads. + * Balance between metadata and data on ghost hits. Values above 100 + * increase metadata caching, below -- reduce. */ -static uint64_t zfs_arc_meta_limit_percent = 75; +static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. 
@@ -463,10 +456,6 @@ static uint_t zfs_arc_dnode_limit_percent = 10; static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; -static int zfs_arc_p_dampener_disable = 1; -static uint_t zfs_arc_meta_prune = 10000; -static uint_t zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; -static uint_t zfs_arc_meta_adjust_restarts = 4096; static uint_t zfs_arc_lotsfree_percent = 10; /* @@ -520,7 +509,9 @@ arc_stats_t arc_stats = { { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, + { "meta", KSTAT_DATA_UINT64 }, + { "pd", KSTAT_DATA_UINT64 }, + { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, @@ -538,21 +529,33 @@ arc_stats_t arc_stats = { { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_data", KSTAT_DATA_UINT64 }, + { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_data", KSTAT_DATA_UINT64 }, + { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_data", KSTAT_DATA_UINT64 }, + { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { 
"mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, + { "uncached_data", KSTAT_DATA_UINT64 }, + { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, @@ -607,10 +610,7 @@ arc_stats_t arc_stats = { { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, @@ -683,10 +683,7 @@ static kstat_t *arc_ksp; */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -/* max size for dnodes */ -#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; @@ -859,7 +856,6 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, - ARC_HDR_DO_ADAPT = 0x2, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, }; @@ -1876,7 +1872,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, @@ -1903,8 +1899,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, 
spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -2421,7 +2416,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() @@ -2466,7 +2461,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - multilist_remove(&old_state->arcs_list[buftype], + multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } @@ -2479,7 +2474,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(&new_state->arcs_list[buftype], hdr); + multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } @@ -2502,7 +2497,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * the reference. As a result, we use the arc * header pointer for the reference. 
*/ - (void) zfs_refcount_add_many(&new_state->arcs_size, + (void) zfs_refcount_add_many( + &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2530,20 +2526,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) continue; (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } @@ -2564,7 +2560,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * header on the ghost state. */ - (void) zfs_refcount_remove_many(&old_state->arcs_size, + (void) zfs_refcount_remove_many( + &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; @@ -2590,8 +2587,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) continue; (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); + &old_state->arcs_size[type], + arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -2599,14 +2596,14 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), - hdr); + &old_state->arcs_size[type], + arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, HDR_GET_PSIZE(hdr), - hdr); + &old_state->arcs_size[type], + HDR_GET_PSIZE(hdr), hdr); } } } @@ -2663,7 +2660,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) - aggsum_add(&arc_sums.arcstat_meta_used, space); + 
ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } @@ -2702,13 +2699,8 @@ arc_space_return(uint64_t space, arc_space_type_t type) break; } - if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { - ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, - space) >= 0); - ARCSTAT_MAX(arcstat_meta_max, - aggsum_upper_bound(&arc_sums.arcstat_meta_used)); - aggsum_add(&arc_sums.arcstat_meta_used, -space); - } + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) + ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); @@ -2975,7 +2967,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { @@ -3008,7 +3000,8 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, @@ -3037,7 +3030,8 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. 
*/ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); @@ -3541,7 +3535,7 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) /* unset all members of the original hdr */ memset(&hdr->b_dva, 0, sizeof (dva_t)); hdr->b_birth = 0; - hdr->b_type = ARC_BUFC_INVALID; + hdr->b_type = 0; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; @@ -4217,8 +4211,7 @@ arc_state_alloc_markers(int count) /* * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_type() and - * arc_evict_state_impl(). + * a marker. This fact is used in arc_evict_state_impl(). */ markers[i]->b_spa = 0; @@ -4287,19 +4280,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; - /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. - */ - if (type == ARC_BUFC_DATA && aggsum_compare( - &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { - arc_prune_async((aggsum_upper_bound( - &arc_sums.arcstat_dnode_size) - - arc_dnode_size_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all @@ -4416,230 +4396,38 @@ arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, return (0); } -/* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. 
In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. - */ -static uint64_t -arc_evict_meta_balanced(uint64_t meta_used) -{ - int64_t delta, adjustmnt; - uint64_t total_evicted = 0, prune = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - uint_t restarts = zfs_arc_meta_adjust_restarts; - -restart: - /* - * This slightly differs than the way we evict from the mru in - * arc_evict because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. - */ - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mru, 0, delta, type); - adjustmnt -= delta; - } - - /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). 
As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. - */ - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); - } - - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); - } - - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. - */ - if (meta_used > arc_meta_limit || arc_available_memory() < 0) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arcstat_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_evict_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. 
We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - static uint64_t -arc_evict_meta(uint64_t meta_used) +arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, + uint_t balance) { - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_evict_meta_only(meta_used)); - else - return (arc_evict_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. 
- */ -static arc_buf_contents_t -arc_evict_type(arc_state_t *state) -{ - multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; + if (total < 8 || up + down == 0) + return (frac); /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). + * We should not have more ghost hits than ghost size, but they + * may get close. Restrict maximum adjustment in that case. */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). - */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; + if (up + down >= total / 4) { + uint64_t scale = (up + down) / (total / 8); + up /= scale; + down /= scale; } - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); + /* Get maximal dynamic range by choosing optimal shifts. 
*/ + int s = highbit64(total); + s = MIN(64 - s, 32); - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } + uint64_t ofrac = (1ULL << 32) - frac; - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); + if (frac >= 4 * ofrac) + up /= frac / (2 * ofrac + 1); + up = (up << s) / (total >> (32 - s)); + if (ofrac >= 4 * frac) + down /= ofrac / (2 * frac + 1); + down = (down << s) / (total >> (32 - s)); + down = down * 100 / balance; - return (type); + return (frac + up - down); } /* @@ -4648,150 +4436,128 @@ arc_evict_type(arc_state_t *state) static uint64_t arc_evict(void) { - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); - uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); - - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_evict_meta(ameta); + uint64_t asize, bytes, total_evicted = 0; + int64_t e, mrud, mrum, mfud, mfum, w; + static uint64_t ogrd, ogrm, ogfd, ogfm; + static uint64_t gsrd, gsrm, gsfd, gsfm; + uint64_t ngrd, ngrm, ngfd, ngfm; + + /* Get current size of ARC states we can evict from. */ + mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); + mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + uint64_t d = mrud + mfud; + uint64_t m = mrum + mfum; + uint64_t t = d + m; + + /* Get ARC ghost hits since last eviction. 
*/ + ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t grd = ngrd - ogrd; + ogrd = ngrd; + ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t grm = ngrm - ogrm; + ogrm = ngrm; + ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t gfd = ngfd - ogfd; + ogfd = ngfd; + ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t gfm = ngfm - ogfm; + ogfm = ngfm; + + /* Adjust ARC states balance based on ghost hits. */ + arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, + grm + gfm, grd + gfd, zfs_arc_meta_balance); + arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); + arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); - - /* - * If we're below arc_meta_min, always prefer to evict data. - * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. - */ - if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. 
- */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from metadata. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - } - - /* - * Re-sum ARC stats after the first round of evictions. - */ asize = aggsum_value(&arc_sums.arcstat_size); - ameta = aggsum_value(&arc_sums.arcstat_meta_used); - - - /* - * Adjust MFU size - * - * Now that we've tried to evict enough from the MRU to get its - * size back to arc_p, if we're still above the target cache - * size, we evict the rest from the MFU. - */ - target = asize - arc_c; - - if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - } - - /* - * Adjust ghost lists - * - * In addition to the above, the ARC also defines target values - * for the ghost lists. The sum of the mru list and mru ghost - * list should never exceed the target size of the cache, and - * the sum of the mru list, mfu list, mru ghost list, and mfu - * ghost list should never exceed twice the target size of the - * cache. The following logic enforces these limits on the ghost - * caches, and evicts from them as needed. 
- */ - target = zfs_refcount_count(&arc_mru->arcs_size) + - zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - - bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + int64_t wt = t - (asize - arc_c); + + /* + * Try to reduce pinned dnodes if more than 3/4 of wanted metadata + * target is not evictable or if they go over arc_dnode_limit. + */ + int64_t prune = 0; + int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); + w = wt * (arc_meta >> 16) >> 16; + if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > + w * 3 / 4) { + prune = dn / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } else if (dn > arc_dnode_limit) { + prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } + if (prune > 0) + arc_prune_async(prune); + + /* Evict MRU metadata. */ + w = wt * (arc_meta * arc_pm >> 48) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); + bytes = arc_evict_impl(arc_mru, 0, e, ARC_BUFC_METADATA); total_evicted += bytes; + mrum -= bytes; + asize -= bytes; - target -= bytes; + /* Evict MFU metadata. */ + w = wt * (arc_meta >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); + bytes = arc_evict_impl(arc_mfu, 0, e, ARC_BUFC_METADATA); + total_evicted += bytes; + mfum -= bytes; + asize -= bytes; + + /* Evict MRU data. */ + wt -= m - total_evicted; + w = wt * (arc_pd >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); + bytes = arc_evict_impl(arc_mru, 0, e, ARC_BUFC_DATA); + total_evicted += bytes; + mrud -= bytes; + asize -= bytes; - total_evicted += - arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + /* Evict MFU data. 
*/ + e = asize - arc_c; + bytes = arc_evict_impl(arc_mfu, 0, e, ARC_BUFC_DATA); + mfud -= bytes; + total_evicted += bytes; /* - * We assume the sum of the mru list and mfu list is less than - * or equal to arc_c (we enforced this above), which means we - * can use the simpler of the two equations below: + * Evict ghost lists * - * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c - * mru ghost + mfu ghost <= arc_c - */ - target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + - zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; - - bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + * Size of each state's ghost list represents how much that state + * may grow by shrinking the other states. Would it need to shrink + * other states to zero (that is unlikely), its ghost size would be + * equal to sum of other three state sizes. But excessive ghost + * size may result in false ghost hits (too far back), that may + * never result in real cache hits if several states are competing. + * So choose some arbitrary point of 1/2 of other state sizes.
+ */ + gsrd = (mrum + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - + gsrd; + (void) arc_evict_impl(arc_mru_ghost, 0, e, ARC_BUFC_DATA); + + gsrm = (mrud + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsrm; + (void) arc_evict_impl(arc_mru_ghost, 0, e, ARC_BUFC_METADATA); + + gsfd = (mrud + mrum + mfum) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - + gsfd; + (void) arc_evict_impl(arc_mfu_ghost, 0, e, ARC_BUFC_DATA); + + gsfm = (mrud + mrum + mfud) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsfm; + (void) arc_evict_impl(arc_mfu_ghost, 0, e, ARC_BUFC_METADATA); return (total_evicted); } @@ -4830,7 +4596,10 @@ arc_flush(spa_t *spa, boolean_t retry) void arc_reduce_target_size(int64_t to_free) { - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + uint64_t c = arc_c; + + if (c <= arc_c_min) + return; /* * All callers want the ARC to actually evict (at least) this much @@ -4840,26 +4609,16 @@ arc_reduce_target_size(int64_t to_free) * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. 
*/ - uint64_t c = MIN(arc_c, asize); - - if (c > to_free && c - to_free > arc_c_min) { - arc_c = c - to_free; - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } else { - arc_c = arc_c_min; - } + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + if (asize < c) + to_free += c - asize; + arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); - if (asize > arc_c) { - /* See comment in arc_evict_cb_check() on why lock+flag */ - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - mutex_exit(&arc_evict_lock); - zthr_wakeup(arc_evict_zthr); - } + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); } /* @@ -4881,14 +4640,6 @@ arc_kmem_reap_soon(void) kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL - if ((aggsum_compare(&arc_sums.arcstat_meta_used, - arc_meta_limit) >= 0) && zfs_arc_meta_prune) { - /* - * We are exceeding our meta-data cache limit. - * Prune some entries to release holds on meta-data. - */ - arc_prune_async(zfs_arc_meta_prune); - } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. @@ -5165,40 +4916,8 @@ arc_reap_cb(void *arg, zthr_t *zthr) * when we are adding new content to the cache. */ static void -arc_adapt(int bytes, arc_state_t *state) +arc_adapt(uint64_t bytes) { - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. 
- */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + (uint64_t)bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - /* * Wake reap thread if we do not have any available memory */ @@ -5217,18 +4936,12 @@ arc_adapt(int bytes, arc_state_t *state) * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ - ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); - if (aggsum_upper_bound(&arc_sums.arcstat_size) >= - arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) + if (aggsum_upper_bound(&arc_sums.arcstat_size) + + 2 * SPA_MAXBLOCKSIZE >= arc_c) { + uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); + if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; - else if (state == arc_anon && arc_p < arc_c >> 1) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; } - ASSERT((int64_t)arc_p >= 0); } /* @@ -5277,7 +4990,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); + arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5375,11 +5088,7 @@ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (alloc_flags & ARC_HDR_DO_ADAPT) - arc_adapt(size, state); + arc_adapt(size); /* * If 
arc_size is currently overflowing, we must be adding data @@ -5397,7 +5106,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); - VERIFY3U(hdr->b_type, ==, type); + arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { @@ -5408,9 +5117,11 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ + arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); + (void) zfs_refcount_add_many(&state->arcs_size[type], size, + tag); /* * If this is reached via arc_read, the link is @@ -5426,17 +5137,6 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p && - arc_p < arc_c >> 1)) - arc_p = MIN(arc_c, arc_p + size); } } @@ -5479,7 +5179,7 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -5592,6 +5292,8 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; + 
wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); @@ -5619,6 +5321,8 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { @@ -6186,6 +5890,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { rc = SET_ERROR(ENOENT); @@ -6200,7 +5905,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * embedded data. */ arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); @@ -6259,11 +5963,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, alloc_flags |= ARC_HDR_ALLOC_LINEAR; } - /* - * Call arc_adapt() explicitly before arc_access() to allow - * its logic to balance MRU/MFU based on the original state. - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); /* * Take additional reference for IO_IN_PROGRESS. 
It stops * arc_access() from putting this header without any buffers @@ -6729,7 +6428,7 @@ arc_release(arc_buf_t *buf, const void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -6751,7 +6450,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many(&state->arcs_size, + (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { @@ -6794,7 +6493,7 @@ arc_release(arc_buf_t *buf, const void *tag) buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); - (void) zfs_refcount_add_many(&arc_anon->arcs_size, + (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { mutex_exit(&buf->b_evict_lock); @@ -6961,7 +6660,7 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || @@ -6974,19 +6673,17 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, 
arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -7241,7 +6938,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - + anon_size = MAX((int64_t) + (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* @@ -7297,9 +6996,14 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { - size->value.ui64 = zfs_refcount_count(&state->arcs_size); + data->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); + metadata->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = @@ -7405,26 +7109,38 @@ arc_kstat_update(kstat_t *ksp, int rw) arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, + &as->arcstat_anon_data, + &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, + &as->arcstat_mru_data, + &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_data, + &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, 
&as->arcstat_mfu_size, + &as->arcstat_mfu_data, + &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_data, + &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, + &as->arcstat_uncached_data, + &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); @@ -7527,7 +7243,7 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = - aggsum_value(&arc_sums.arcstat_meta_used); + wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = @@ -7613,7 +7329,6 @@ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); - unsigned long limit; /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && @@ -7630,44 +7345,15 @@ arc_tuning_update(boolean_t verbose) (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); - arc_p = (arc_c >> 1); - if (arc_meta_limit > arc_c_max) - arc_meta_limit = arc_c_max; - if (arc_dnode_size_limit > arc_meta_limit) - arc_dnode_size_limit = arc_meta_limit; + if (arc_dnode_limit > arc_c_max) + arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); - /* Valid range: 16M - */ - if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && - (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && - (zfs_arc_meta_min <= arc_c_max)) { - arc_meta_min = zfs_arc_meta_min; - if (arc_meta_limit < arc_meta_min) - arc_meta_limit = arc_meta_min; - if (arc_dnode_size_limit < arc_meta_min) - arc_dnode_size_limit = arc_meta_min; - } - 
WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); - - /* Valid range: - */ - limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : - MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; - if ((limit != arc_meta_limit) && - (limit >= arc_meta_min) && - (limit <= arc_c_max)) - arc_meta_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); - - /* Valid range: - */ - limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : - MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; - if ((limit != arc_dnode_size_limit) && - (limit >= arc_meta_min) && - (limit <= arc_meta_limit)) - arc_dnode_size_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, - verbose); + /* Valid range: 0 - */ + arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : + MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; + WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) @@ -7679,10 +7365,6 @@ arc_tuning_update(boolean_t verbose) arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } - /* Valid range: 1 - N */ - if (zfs_arc_p_min_shift) - arc_p_min_shift = zfs_arc_p_min_shift; - /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; @@ -7771,13 +7453,25 @@ arc_state_init(void) zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); - zfs_refcount_create(&arc_uncached->arcs_size); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + 
zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); + + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); @@ -7865,7 +7559,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); - aggsum_init(&arc_sums.arcstat_meta_used, 0); + wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); @@ -7904,13 +7598,20 @@ arc_state_fini(void) zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - 
zfs_refcount_destroy(&arc_l2c_only->arcs_size); - zfs_refcount_destroy(&arc_uncached->arcs_size); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); @@ -7925,6 +7626,11 @@ arc_state_fini(void) multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); @@ -8011,7 +7717,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); - aggsum_fini(&arc_sums.arcstat_meta_used); + wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); 
wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); @@ -8083,18 +7789,12 @@ arc_init(void) #endif arc_c = arc_c_min; - arc_p = (arc_c >> 1); + arc_meta = (1ULL << 32) / 4; /* Metadata 25% of arc_c. */ + arc_pd = (1ULL << 32) / 2; /* MRU 50% of data. */ + arc_pm = (1ULL << 32) / 2; /* MRU 50% of metadata. */ - /* Set min to 1/2 of arc_c_min */ - arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; - /* - * Set arc_meta_limit to a percent of arc_c_max with a floor of - * arc_meta_min, and a ceiling of arc_c_max. - */ - percent = MIN(zfs_arc_meta_limit_percent, 100); - arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); percent = MIN(zfs_arc_dnode_limit_percent, 100); - arc_dnode_size_limit = (percent * arc_meta_limit) / 100; + arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); @@ -8871,7 +8571,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8908,7 +8608,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -9802,7 +9502,7 @@ l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); - return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + return (arc_reclaim_needed() || (s > (arc_warm ? 
arc_c : arc_c_max) * l2arc_meta_percent / 100)); } @@ -10692,7 +10392,7 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ - arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* @@ -11153,40 +10853,18 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_u64, - spl_param_get_u64, ZMOD_RW, "Metadata limit for ARC size in bytes"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, - param_set_arc_int, param_get_uint, ZMOD_RW, - "Percent of ARC size for ARC meta limit"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_u64, - spl_param_get_u64, ZMOD_RW, "Minimum ARC metadata size in bytes"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, - "Meta objects to scan for prune"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, UINT, ZMOD_RW, - "Limit number of restarts in arc_evict_meta"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, UINT, ZMOD_RW, - "Meta reclaim strategy"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, + "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, - "Disable arc_p adapt dampener"); - ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); -ZFS_MODULE_PARAM_CALL(zfs_arc, 
zfs_arc_, p_min_shift, param_set_arc_int, - param_get_uint, ZMOD_RW, "arc_c shift to calc min/max arc_p"); - ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); diff --git a/tests/zfs-tests/tests/perf/perf.shlib b/tests/zfs-tests/tests/perf/perf.shlib index 27c40bd52946..5555e910d722 100644 --- a/tests/zfs-tests/tests/perf/perf.shlib +++ b/tests/zfs-tests/tests/perf/perf.shlib @@ -485,7 +485,6 @@ function get_system_config printf " \"tunables\": {\n" >>$config for tunable in \ zfs_arc_max \ - zfs_arc_meta_limit \ zfs_arc_sys_free \ zfs_dirty_data_max \ zfs_flags \