From f376c4fd0148ac25f03fec99b2820118fb18a731 Mon Sep 17 00:00:00 2001 From: Slobodan Predolac Date: Tue, 30 Sep 2025 10:20:23 -0700 Subject: [PATCH 1/2] Implement pool of empty pages in central with some basic operations --- Makefile.in | 3 +- include/jemalloc/internal/arena_externs.h | 11 +- include/jemalloc/internal/ctl.h | 2 + include/jemalloc/internal/hpa.h | 14 + include/jemalloc/internal/hpa_central.h | 47 +++ include/jemalloc/internal/hpa_opts.h | 10 +- .../internal/jemalloc_internal_externs.h | 1 + include/jemalloc/internal/mutex_prof.h | 3 +- include/jemalloc/internal/witness.h | 1 + src/arena.c | 36 ++ src/ctl.c | 57 ++- src/hpa.c | 93 ++++- src/hpa_central.c | 183 ++++++++++ src/jemalloc.c | 7 + src/stats.c | 36 +- test/unit/hpa.c | 12 +- test/unit/hpa_central_pool.c | 329 ++++++++++++++++++ test/unit/hpa_vectorized_madvise.c | 4 +- .../unit/hpa_vectorized_madvise_large_batch.c | 4 +- test/unit/mallctl.c | 4 + 20 files changed, 837 insertions(+), 20 deletions(-) create mode 100644 test/unit/hpa_central_pool.c diff --git a/Makefile.in b/Makefile.in index 4b5b6507e4..ca1bfaa629 100644 --- a/Makefile.in +++ b/Makefile.in @@ -232,9 +232,10 @@ TESTS_UNIT := \ $(srcroot)test/unit/hpa.c \ $(srcroot)test/unit/hpa_sec_integration.c \ $(srcroot)test/unit/hpa_thp_always.c \ + $(srcroot)test/unit/hpa_background_thread.c \ + $(srcroot)test/unit/hpa_central_pool.c \ $(srcroot)test/unit/hpa_vectorized_madvise.c \ $(srcroot)test/unit/hpa_vectorized_madvise_large_batch.c \ - $(srcroot)test/unit/hpa_background_thread.c \ $(srcroot)test/unit/hpdata.c \ $(srcroot)test/unit/huge.c \ $(srcroot)test/unit/inspect.c \ diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 1d00463527..da55e646c5 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -46,8 +46,12 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, 
ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats, - hpa_shard_stats_t *hpastats); -void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena); + hpa_shard_stats_t *hpastats, sec_stats_t *secstats); +void arena_stats_global_central_read(tsdn_t *tsdn, hpa_central_stats_t *stats); +void arena_stats_global_central_mutex_read( + tsdn_t *tsdn, mutex_prof_data_t *mutex_prof_data); + +void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena); edata_t *arena_extent_alloc_large( tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero); void arena_extent_dalloc_large_prep( @@ -125,7 +129,10 @@ void arena_prefork5(tsdn_t *tsdn, arena_t *arena); void arena_prefork6(tsdn_t *tsdn, arena_t *arena); void arena_prefork7(tsdn_t *tsdn, arena_t *arena); void arena_prefork8(tsdn_t *tsdn, arena_t *arena); +void arena_global_prefork(tsdn_t *tsdn, bool use_hpa); void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena); +void arena_global_postfork_parent(tsdn_t *tsdn, bool use_hpa); void arena_postfork_child(tsdn_t *tsdn, arena_t *arena); +void arena_global_postfork_child(tsdn_t *tsdn, bool use_hpa); #endif /* JEMALLOC_INTERNAL_ARENA_EXTERNS_H */ diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index 82035fe366..e39785b0a2 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -5,6 +5,7 @@ #include "jemalloc/internal/arena_stats.h" #include "jemalloc/internal/background_thread_structs.h" #include "jemalloc/internal/bin_stats.h" +#include "jemalloc/internal/hpa_central.h" #include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/malloc_io.h" #include "jemalloc/internal/mutex_prof.h" @@ -65,6 +66,7 @@ typedef struct ctl_stats_s { size_t retained; background_thread_stats_t background_thread; + hpa_central_stats_t hpa_central; mutex_prof_data_t 
mutex_prof_data[mutex_prof_num_global_mutexes]; } ctl_stats_t; diff --git a/include/jemalloc/internal/hpa.h b/include/jemalloc/internal/hpa.h index dc7725b77a..a7864ab22e 100644 --- a/include/jemalloc/internal/hpa.h +++ b/include/jemalloc/internal/hpa.h @@ -51,6 +51,20 @@ struct hpa_shard_nonderived_stats_s { * Guarded by mtx. */ uint64_t ndehugifies; + + /* + * The number of times we donated pageslab to central pool + * + * Guarded by mtx. + */ + uint64_t ndonated_ps; + + /* + * The number of times we borrowed pageslab from a central pool + * + * Guarded by mtx. + */ + uint64_t nborrowed_ps; }; /* Completely derived; only used by CTL. */ diff --git a/include/jemalloc/internal/hpa_central.h b/include/jemalloc/internal/hpa_central.h index 3e0ff7daeb..947c5463ed 100644 --- a/include/jemalloc/internal/hpa_central.h +++ b/include/jemalloc/internal/hpa_central.h @@ -8,8 +8,35 @@ #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/tsd_types.h" +typedef struct hpa_pool_s hpa_pool_t; +struct hpa_pool_s { + /* + * Pool of empty huge pages to be shared between shards that are + * participating. + * + * Page is owned by the pool if it lives in one of these two lists. + * This means that it should not be part of any hpa_shard's psset at the + * same time. + */ + hpdata_empty_list_t nonpurged; + hpdata_empty_list_t purged; +}; + +typedef struct hpa_central_stats_s hpa_central_stats_t; +struct hpa_central_stats_s { + /* Number of pages purged while they were in the central pool */ + uint64_t npurged_pool; + + /* Total number of dirty base pages in the pool */ + size_t ndirty_pool; +}; + typedef struct hpa_central_s hpa_central_t; struct hpa_central_s { + /* Guards the access to central pool of empty hugepages */ + malloc_mutex_t pool_mtx; + hpa_pool_t pool; + /* * Guards expansion of eden. We separate this from the regular mutex so * that cheaper operations can still continue while we're doing the OS @@ -30,6 +57,9 @@ struct hpa_central_s { /* The HPA hooks. 
*/ hpa_hooks_t hooks; + + /* Stats */ + hpa_central_stats_t stats; }; bool hpa_central_init( @@ -38,4 +68,21 @@ bool hpa_central_init( hpdata_t *hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, uint64_t age, bool hugify_eager, bool *oom); +/* Donate empty pages to central */ +void hpa_central_ps_insert(tsdn_t *tsdn, hpa_central_t *central, + hpdata_empty_list_t *pages, const nstime_t *now); +/* Get empty page from central without growing it */ +hpdata_t *hpa_central_ps_pop(tsdn_t *tsdn, hpa_central_t *central); + +/* Purge up to max_ps empty pages in the central */ +size_t hpa_central_purge( + tsdn_t *tsdn, hpa_central_t *central, const nstime_t *now, size_t max_ps); + +void hpa_central_prefork(tsdn_t *tsdn, hpa_central_t *central); +void hpa_central_postfork_parent(tsdn_t *tsdn, hpa_central_t *central); +void hpa_central_postfork_child(tsdn_t *tsdn, hpa_central_t *central); + +void hpa_central_stats_read( + tsdn_t *tsdn, hpa_central_t *central, hpa_central_stats_t *stats); + #endif /* JEMALLOC_INTERNAL_HPA_CENTRAL_H */ diff --git a/include/jemalloc/internal/hpa_opts.h b/include/jemalloc/internal/hpa_opts.h index 6747c2db8e..e5c32f2afc 100644 --- a/include/jemalloc/internal/hpa_opts.h +++ b/include/jemalloc/internal/hpa_opts.h @@ -152,6 +152,12 @@ struct hpa_shard_opts_s { * hpa_hugify_style_t for options). */ hpa_hugify_style_t hugify_style; + + /* + * If use_pool is true this shard will donate empty pages to the pool + * and borrow from the pool before using central allocator. 
+ */ + bool use_pool; }; /* clang-format off */ @@ -183,7 +189,9 @@ struct hpa_shard_opts_s { /* min_purge_delay_ms */ \ 0, \ /* hugify_style */ \ - hpa_hugify_style_lazy \ + hpa_hugify_style_lazy, \ + /* use_pool */ \ + false \ } /* clang-format on */ diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h index ea739ea88c..e0d23352e0 100644 --- a/include/jemalloc/internal/jemalloc_internal_externs.h +++ b/include/jemalloc/internal/jemalloc_internal_externs.h @@ -17,6 +17,7 @@ extern bool opt_abort_conf; extern bool opt_trust_madvise; extern bool opt_experimental_hpa_start_huge_if_thp_always; extern bool opt_experimental_hpa_enforce_hugify; +extern uint64_t opt_hpa_pool_purge_delay_ms; extern bool opt_confirm_conf; extern bool opt_hpa; extern hpa_shard_opts_t opt_hpa_opts; diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h index 572200f353..b61d9d4aa9 100644 --- a/include/jemalloc/internal/mutex_prof.h +++ b/include/jemalloc/internal/mutex_prof.h @@ -36,7 +36,8 @@ typedef enum { OP(tcache_list) \ OP(hpa_shard) \ OP(hpa_shard_grow) \ - OP(hpa_sec) + OP(hpa_sec) \ + OP(hpa_central_pool) typedef enum { #define OP(mtx) arena_prof_mutex_##mtx, diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h index 0a426ff567..3ca2e4ed42 100644 --- a/include/jemalloc/internal/witness.h +++ b/include/jemalloc/internal/witness.h @@ -56,6 +56,7 @@ enum witness_rank_e { WITNESS_RANK_HPA_SHARD = WITNESS_RANK_EXTENTS, WITNESS_RANK_HPA_CENTRAL_GROW, + WITNESS_RANK_HPA_CENTRAL_POOL, WITNESS_RANK_HPA_CENTRAL, WITNESS_RANK_EDATA_CACHE, diff --git a/src/arena.c b/src/arena.c index 5b144c63a0..261dbbd17c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -213,6 +213,21 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, } } +void +arena_stats_global_central_read(tsdn_t *tsdn, hpa_central_stats_t *stats) { + hpa_central_stats_read(tsdn, 
&arena_pa_central_global.hpa, stats); +} + +void +arena_stats_global_central_mutex_read( + tsdn_t *tsdn, mutex_prof_data_t *mutex_prof_data) { + malloc_mutex_lock(tsdn, &arena_pa_central_global.hpa.pool_mtx); + malloc_mutex_prof_read( + tsdn, mutex_prof_data, &arena_pa_central_global.hpa.pool_mtx); + malloc_mutex_unlock(tsdn, &arena_pa_central_global.hpa.pool_mtx); +} + + static void arena_background_thread_inactivity_check( tsdn_t *tsdn, arena_t *arena, bool is_background_thread) { @@ -2321,6 +2336,13 @@ arena_prefork8(tsdn_t *tsdn, arena_t *arena) { } } +void +arena_global_prefork(tsdn_t *tsdn, bool use_hpa) { + if (use_hpa) { + hpa_central_prefork(tsdn, &arena_pa_central_global.hpa); + } +} + void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) { for (unsigned i = 0; i < nbins_total; i++) { @@ -2336,6 +2358,13 @@ arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) { } } +void +arena_global_postfork_parent(tsdn_t *tsdn, bool use_hpa) { + if (use_hpa) { + hpa_central_postfork_parent(tsdn, &arena_pa_central_global.hpa); + } +} + void arena_postfork_child(tsdn_t *tsdn, arena_t *arena) { atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED); @@ -2374,3 +2403,10 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) { malloc_mutex_postfork_child(tsdn, &arena->tcache_ql_mtx); } } + +void +arena_global_postfork_child(tsdn_t *tsdn, bool use_hpa) { + if (use_hpa) { + hpa_central_postfork_child(tsdn, &arena_pa_central_global.hpa); + } +} diff --git a/src/ctl.c b/src/ctl.c index 1260e197da..0d191749ac 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -111,6 +111,8 @@ CTL_PROTO(opt_experimental_hpa_max_purge_nhp) CTL_PROTO(opt_hpa_purge_threshold) CTL_PROTO(opt_hpa_min_purge_delay_ms) CTL_PROTO(opt_hpa_hugify_style) +CTL_PROTO(opt_hpa_use_pool) +CTL_PROTO(opt_hpa_pool_purge_delay_ms) CTL_PROTO(opt_hpa_dirty_mult) CTL_PROTO(opt_hpa_sec_nshards) CTL_PROTO(opt_hpa_sec_max_alloc) @@ -273,6 +275,8 @@ CTL_PROTO(stats_arenas_i_hpa_shard_npurges) 
CTL_PROTO(stats_arenas_i_hpa_shard_nhugifies) CTL_PROTO(stats_arenas_i_hpa_shard_nhugify_failures) CTL_PROTO(stats_arenas_i_hpa_shard_ndehugifies) +CTL_PROTO(stats_arenas_i_hpa_shard_ndonated_ps) +CTL_PROTO(stats_arenas_i_hpa_shard_nborrowed_ps) /* Set of stats for non-hugified and hugified slabs. */ CTL_PROTO(stats_arenas_i_hpa_shard_slabs_npageslabs_nonhuge) @@ -349,6 +353,8 @@ CTL_PROTO(stats_active) CTL_PROTO(stats_background_thread_num_threads) CTL_PROTO(stats_background_thread_num_runs) CTL_PROTO(stats_background_thread_run_interval) +CTL_PROTO(stats_central_pool_ndirty) +CTL_PROTO(stats_central_pool_npurged) CTL_PROTO(stats_metadata) CTL_PROTO(stats_metadata_edata) CTL_PROTO(stats_metadata_rtree) @@ -486,6 +492,8 @@ static const ctl_named_node_t opt_node[] = {{NAME("abort"), CTL(opt_abort)}, {NAME("hpa_purge_threshold"), CTL(opt_hpa_purge_threshold)}, {NAME("hpa_min_purge_delay_ms"), CTL(opt_hpa_min_purge_delay_ms)}, {NAME("hpa_hugify_style"), CTL(opt_hpa_hugify_style)}, + {NAME("hpa_use_pool"), CTL(opt_hpa_use_pool)}, + {NAME("hpa_pool_purge_delay_ms"), CTL(opt_hpa_pool_purge_delay_ms)}, {NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)}, {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)}, {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)}, @@ -795,6 +803,8 @@ static const ctl_named_node_t stats_arenas_i_hpa_shard_node[] = { {NAME("nhugifies"), CTL(stats_arenas_i_hpa_shard_nhugifies)}, {NAME("nhugify_failures"), CTL(stats_arenas_i_hpa_shard_nhugify_failures)}, {NAME("ndehugifies"), CTL(stats_arenas_i_hpa_shard_ndehugifies)}, + {NAME("ndonated_ps"), CTL(stats_arenas_i_hpa_shard_ndonated_ps)}, + {NAME("nborrowed_ps"), CTL(stats_arenas_i_hpa_shard_nborrowed_ps)}, {NAME("full_slabs"), CHILD(named, stats_arenas_i_hpa_shard_full_slabs)}, {NAME("empty_slabs"), CHILD(named, stats_arenas_i_hpa_shard_empty_slabs)}, @@ -852,6 +862,10 @@ static const ctl_named_node_t stats_background_thread_node[] = { {NAME("num_runs"), CTL(stats_background_thread_num_runs)}, 
{NAME("run_interval"), CTL(stats_background_thread_run_interval)}}; +static const ctl_named_node_t stats_central_pool_node[] = { + {NAME("ndirty"), CTL(stats_central_pool_ndirty)}, + {NAME("npurged"), CTL(stats_central_pool_npurged)}}; + #define OP(mtx) MUTEX_PROF_DATA_NODE(mutexes_##mtx) MUTEX_PROF_GLOBAL_MUTEXES #undef OP @@ -881,6 +895,7 @@ static const ctl_named_node_t stats_node[] = { {NAME("mutexes"), CHILD(named, stats_mutexes)}, {NAME("arenas"), CHILD(indexed, stats_arenas)}, {NAME("zero_reallocs"), CTL(stats_zero_reallocs)}, + {NAME("central_pool"), CHILD(named, stats_central_pool)}, }; static const ctl_named_node_t experimental_hooks_node[] = { @@ -1064,6 +1079,8 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) { } } +static bool ctl_ever_used_central_pool(const hpa_shard_stats_t *hpastats); + static void ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) { unsigned i; @@ -1077,6 +1094,14 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) { ctl_arena->astats->lstats, ctl_arena->astats->estats, &ctl_arena->astats->hpastats); + /* Read central pool mutex stats for arena 0 only */ + if (ctl_arena->arena_ind == 0 && + ctl_ever_used_central_pool(&ctl_arena->astats->hpastats)) { + arena_stats_global_central_mutex_read(tsdn, + &ctl_arena->astats->astats + .mutex_prof_data[arena_prof_mutex_hpa_central_pool]); + } + for (i = 0; i < SC_NBINS; i++) { bin_stats_t *bstats = &ctl_arena->astats->bstats[i].stats_data; @@ -1311,6 +1336,17 @@ ctl_arena_init(tsd_t *tsd, const arena_config_t *config) { return arena_ind; } +static bool +ctl_ever_used_central_pool(const hpa_shard_stats_t *hpastats) { + return hpastats->nonderived_stats.ndonated_ps > 0 + || hpastats->nonderived_stats.nborrowed_ps > 0; +} + +static void +ctl_hpa_central_stats_read(tsdn_t *tsdn, hpa_central_stats_t *central_stats) { + arena_stats_global_central_read(tsdn, central_stats); +} + static void ctl_background_thread_stats_read(tsdn_t *tsdn) { 
background_thread_stats_t *stats = &ctl_stats->background_thread; @@ -1361,6 +1397,13 @@ ctl_refresh(tsdn_t *tsdn) { } if (config_stats) { + if (ctl_ever_used_central_pool(&ctl_sarena->astats->hpastats)) { + ctl_hpa_central_stats_read( + tsdn, &ctl_stats->hpa_central); + } else { + ctl_stats->hpa_central.npurged_pool = 0; + ctl_stats->hpa_central.ndirty_pool = 0; + } ctl_stats->allocated = ctl_sarena->astats->allocated_small + ctl_sarena->astats->astats.allocated_large; ctl_stats->active = (ctl_sarena->pactive << LG_PAGE); @@ -1371,7 +1414,8 @@ ctl_refresh(tsdn_t *tsdn) { ctl_sarena->astats->astats.metadata_edata; ctl_stats->metadata_rtree = ctl_sarena->astats->astats.metadata_rtree; - ctl_stats->resident = ctl_sarena->astats->astats.resident; + ctl_stats->resident = ctl_sarena->astats->astats.resident + + ctl_stats->hpa_central.ndirty_pool; ctl_stats->metadata_thp = ctl_sarena->astats->astats.metadata_thp; ctl_stats->mapped = ctl_sarena->astats->astats.mapped; @@ -2172,6 +2216,9 @@ CTL_RO_NL_GEN( opt_hpa_min_purge_delay_ms, opt_hpa_opts.min_purge_delay_ms, uint64_t) CTL_RO_NL_GEN(opt_hpa_hugify_style, hpa_hugify_style_names[opt_hpa_opts.hugify_style], const char *) +CTL_RO_NL_GEN(opt_hpa_use_pool, opt_hpa_opts.use_pool, bool) +CTL_RO_NL_GEN( + opt_hpa_pool_purge_delay_ms, opt_hpa_pool_purge_delay_ms, uint64_t) /* * This will have to change before we publicly document this option; fxp_t and * its representation are internal implementation details. 
@@ -3805,6 +3852,10 @@ approximate_stats_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, label_return: return ret; } +CTL_RO_CGEN(config_stats, stats_central_pool_ndirty, + ctl_stats->hpa_central.ndirty_pool, size_t) +CTL_RO_CGEN(config_stats, stats_central_pool_npurged, + ctl_stats->hpa_central.npurged_pool, uint64_t) CTL_RO_GEN(stats_arenas_i_dss, arenas_i(mib[2])->dss, const char *) CTL_RO_GEN( @@ -4120,6 +4171,10 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugify_failures, uint64_t); CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_ndehugifies, arenas_i(mib[2])->astats->hpastats.nonderived_stats.ndehugifies, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_ndonated_ps, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.ndonated_ps, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nborrowed_ps, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.nborrowed_ps, uint64_t); /* Full, nonhuge */ CTL_RO_CGEN(config_stats, diff --git a/src/hpa.c b/src/hpa.c index 7e5b5f7224..53c24cd9ce 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -109,6 +109,8 @@ hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central, shard->stats.nhugifies = 0; shard->stats.nhugify_failures = 0; shard->stats.ndehugifies = 0; + shard->stats.ndonated_ps = 0; + shard->stats.nborrowed_ps = 0; /* * Fill these in last, so that if an hpa_shard gets used despite @@ -145,6 +147,8 @@ hpa_shard_nonderived_stats_accum( dst->nhugifies += src->nhugifies; dst->nhugify_failures += src->nhugify_failures; dst->ndehugifies += src->ndehugifies; + dst->ndonated_ps += src->ndonated_ps; + dst->nborrowed_ps += src->nborrowed_ps; } void @@ -285,6 +289,18 @@ hpa_assume_huge(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { } } +static void +hpa_update_purgable_time(hpa_shard_t *shard, hpdata_t *ps) { + if (shard->opts.min_purge_delay_ms == 0) { + return; + } + nstime_t now; + uint64_t delayns = shard->opts.min_purge_delay_ms * 1000 * 1000; + 
shard->central->hooks.curtime(&now, /* first_reading */ true); + nstime_iadd(&now, delayns); + hpdata_time_purge_allowed_set(ps, &now); +} + static void hpa_update_purge_hugify_eligibility( tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { @@ -328,13 +344,8 @@ hpa_update_purge_hugify_eligibility( hpdata_allow_hugify(ps, now); } bool purgable = hpa_good_purge_candidate(shard, ps); - if (purgable && !hpdata_purge_allowed_get(ps) - && (shard->opts.min_purge_delay_ms > 0)) { - nstime_t now; - uint64_t delayns = shard->opts.min_purge_delay_ms * 1000 * 1000; - shard->central->hooks.curtime(&now, /* first_reading */ true); - nstime_iadd(&now, delayns); - hpdata_time_purge_allowed_set(ps, &now); + if (purgable && !hpdata_purge_allowed_get(ps)) { + hpa_update_purgable_time(shard, ps); } hpdata_purge_allowed_set(ps, purgable); @@ -449,6 +460,42 @@ hpa_purge_finish_hp( psset_update_end(&shard->psset, hp_item->hp); } +static void +hpa_donate_empty_ps(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (!shard->opts.use_pool) { + return; + } + + hpdata_empty_list_t to_donate; + hpdata_empty_list_init(&to_donate); + do { + hpdata_t *to_purge = (shard->opts.min_purge_delay_ms > 0) + ? psset_pick_purge( + &shard->psset, &shard->last_time_work_attempted) + : psset_pick_purge(&shard->psset, NULL); + + if (to_purge == NULL || !hpdata_empty(to_purge)) { + break; + } + assert(hpdata_ndirty_get(to_purge) > 0); + + /* Donate the page to the pool */ + psset_remove(&shard->psset, to_purge); + hpdata_empty_list_append(&to_donate, to_purge); + shard->stats.ndonated_ps++; + } while (true); + + if (!hpdata_empty_list_empty(&to_donate)) { + nstime_t now; + nstime_copy(&now, &shard->last_time_work_attempted); + malloc_mutex_unlock(tsdn, &shard->mtx); + hpa_central_ps_insert(tsdn, shard->central, &to_donate, + &shard->last_time_work_attempted); + malloc_mutex_lock(tsdn, &shard->mtx); + } +} + /* Returns number of huge pages purged. 
*/ static inline size_t hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) { @@ -468,6 +515,8 @@ hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) { }; assert(batch.range_watermark > 0); + hpa_donate_empty_ps(tsdn, shard); + while (1) { hpa_batch_pass_start(&batch); assert(hpa_batch_empty(&batch)); @@ -635,6 +684,17 @@ hpa_shard_maybe_do_deferred_work( max_purges = max_purge_nhp; } + if (shard->opts.use_pool) { + size_t max_pool_ops = (forced ? (size_t)-1 : 8); + hpa_central_t *central = shard->central; + nstime_t now; + nstime_copy(&now, &shard->last_time_work_attempted); + /* we do not need to hold shard lock when purging the central */ + malloc_mutex_unlock(tsdn, &shard->mtx); + hpa_central_purge(tsdn, central, &now, max_pool_ops); + malloc_mutex_lock(tsdn, &shard->mtx); + } + malloc_mutex_assert_owner(tsdn, &shard->mtx); nops += hpa_purge(tsdn, shard, max_purges); malloc_mutex_assert_owner(tsdn, &shard->mtx); @@ -650,6 +710,19 @@ hpa_shard_maybe_do_deferred_work( } } +static void +hpa_add_pool_page_to_psset(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { + assert(hpdata_alloc_allowed_get(ps) && hpdata_empty(ps) + && hpdata_consistent(ps)); + if (hpdata_purge_allowed_get(ps)) { + hpa_update_purgable_time(shard, ps); + if (hpdata_huge_get(ps)) { + shard->stats.nborrowed_ps++; + } + } + psset_insert(&shard->psset, ps); +} + static edata_t * hpa_try_alloc_one_no_grow( tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom) { @@ -663,6 +736,12 @@ hpa_try_alloc_one_no_grow( } hpdata_t *ps = psset_pick_alloc(&shard->psset, size); + if (ps == NULL && shard->opts.use_pool) { + ps = hpa_central_ps_pop(tsdn, shard->central); + if (ps != NULL) { + hpa_add_pool_page_to_psset(tsdn, shard, ps); + } + } if (ps == NULL) { edata_cache_fast_put(tsdn, &shard->ecf, edata); return NULL; diff --git a/src/hpa_central.c b/src/hpa_central.c index b4f770c2cb..281e265ea2 100644 --- a/src/hpa_central.c +++ b/src/hpa_central.c @@ -2,10 +2,138 @@ #include 
"jemalloc/internal/jemalloc_internal_includes.h" #include "jemalloc/internal/hpa_central.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/hpa_utils.h" #include "jemalloc/internal/tsd.h" #include "jemalloc/internal/witness.h" #define HPA_EDEN_SIZE (128 * HUGEPAGE) +#define MILLION UINT64_C(1000000) + +uint64_t opt_hpa_pool_purge_delay_ms = 10000; /* 10s */ + +void +hpa_central_pool_init(hpa_pool_t *pool) { + hpdata_empty_list_init(&pool->nonpurged); + hpdata_empty_list_init(&pool->purged); +} + +void +hpa_central_stats_read( + tsdn_t *tsdn, hpa_central_t *central, hpa_central_stats_t *stats) { + malloc_mutex_lock(tsdn, ¢ral->pool_mtx); + stats->ndirty_pool = central->stats.ndirty_pool; + stats->npurged_pool = central->stats.npurged_pool; + malloc_mutex_unlock(tsdn, ¢ral->pool_mtx); +} + +static inline void +hpa_central_pool_concat_nonpurged(tsdn_t *tsdn, hpa_central_t *central, + hpdata_empty_list_t *pages, size_t new_dirty) { + malloc_mutex_lock(tsdn, ¢ral->pool_mtx); + hpdata_empty_list_concat(¢ral->pool.nonpurged, pages); + central->stats.ndirty_pool += new_dirty; + malloc_mutex_unlock(tsdn, ¢ral->pool_mtx); +} + +static void +hpa_central_get_nonpurged(tsdn_t *tsdn, hpa_central_t *central, + const nstime_t *now, hpa_purge_batch_t *batch) { + malloc_mutex_lock(tsdn, ¢ral->pool_mtx); + while (!hpa_batch_full(batch) + && !hpdata_empty_list_empty(¢ral->pool.nonpurged)) { + hpdata_t *ps = hpdata_empty_list_first( + ¢ral->pool.nonpurged); + assert(hpdata_empty(ps) && hpdata_purge_allowed_get(ps)); + + const nstime_t *allowed = hpdata_time_purge_allowed_get(ps); + if (nstime_compare(now, allowed) < 0) { + break; + } + hpdata_empty_list_remove(¢ral->pool.nonpurged, ps); + assert(batch->item_cnt < batch->items_capacity); + hpa_purge_item_t *hp_item = &batch->items[batch->item_cnt]; + batch->item_cnt++; + hp_item->hp = ps; + hp_item->dehugify = hpdata_huge_get(hp_item->hp); + size_t nranges; + hpdata_alloc_allowed_set(hp_item->hp, false); + size_t 
ndirty = hpdata_purge_begin( + hp_item->hp, &hp_item->state, &nranges); + assert(ndirty > 0 && nranges > 0); + batch->ndirty_in_batch += ndirty; + batch->nranges += nranges; + batch->npurged_hp_total++; + } + malloc_mutex_unlock(tsdn, &central->pool_mtx); +} + +static void +hpa_central_put_purged( + tsdn_t *tsdn, hpa_central_t *central, const hpa_purge_batch_t *batch) { + assert(batch->item_cnt > 0); + hpdata_empty_list_t newly_purged; + hpdata_empty_list_init(&newly_purged); + + for (size_t i = 0; i < batch->item_cnt; ++i) { + hpa_purge_item_t *hp_item = &batch->items[i]; + if (hp_item->dehugify) { + hpdata_dehugify(hp_item->hp); + } + hpdata_purge_end(hp_item->hp, &hp_item->state); + hpdata_alloc_allowed_set(hp_item->hp, true); + hpdata_purge_allowed_set(hp_item->hp, false); + hpdata_empty_list_append(&newly_purged, hp_item->hp); + } + + malloc_mutex_lock(tsdn, &central->pool_mtx); + hpdata_empty_list_concat(&central->pool.purged, &newly_purged); + central->stats.npurged_pool += batch->npurged_hp_total; + assert(central->stats.ndirty_pool >= batch->ndirty_in_batch); + central->stats.ndirty_pool -= batch->ndirty_in_batch; + malloc_mutex_unlock(tsdn, &central->pool_mtx); +} + +void +hpa_central_ps_insert(tsdn_t *tsdn, hpa_central_t *central, + hpdata_empty_list_t *pages, const nstime_t *now) { + assert(!hpdata_empty_list_empty(pages)); + + assert(now != NULL); + nstime_t purge_time; + nstime_copy(&purge_time, now); + uint64_t purge_delay_ns = opt_hpa_pool_purge_delay_ms * MILLION; + nstime_iadd(&purge_time, purge_delay_ns); + + hpdata_t *ps; + size_t new_dirty = 0; + ql_foreach (ps, &pages->head, ql_link_empty) { + assert(hpdata_empty(ps)); + assert(hpdata_ndirty_get(ps) > 0); + hpdata_time_purge_allowed_set(ps, &purge_time); + new_dirty += hpdata_ndirty_get(ps); + } + hpa_central_pool_concat_nonpurged(tsdn, central, pages, new_dirty); +} + +hpdata_t * +hpa_central_ps_pop(tsdn_t *tsdn, hpa_central_t *central) { + hpdata_t *ps = NULL; + + malloc_mutex_lock(tsdn, &central->pool_mtx); + 
if (!hpdata_empty_list_empty(&central->pool.nonpurged)) { + ps = hpdata_empty_list_first(&central->pool.nonpurged); + hpdata_empty_list_remove(&central->pool.nonpurged, ps); + } + if (ps == NULL && !hpdata_empty_list_empty(&central->pool.purged)) { + ps = hpdata_empty_list_first(&central->pool.purged); + hpdata_empty_list_remove(&central->pool.purged, ps); + } + malloc_mutex_unlock(tsdn, &central->pool_mtx); + + return ps; +} bool hpa_central_init( @@ -19,10 +147,19 @@ hpa_central_init( return true; } + err = malloc_mutex_init(&central->pool_mtx, "hpa_central_pool", + WITNESS_RANK_HPA_CENTRAL_POOL, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + hpa_central_pool_init(&central->pool); + central->base = base; central->eden = NULL; central->eden_len = 0; central->hooks = *hooks; + central->stats.npurged_pool = 0; + central->stats.ndirty_pool = 0; return false; } @@ -119,3 +256,49 @@ hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, return ps; } + +size_t +hpa_central_purge( + tsdn_t *tsdn, hpa_central_t *central, const nstime_t *now, size_t max_ps) { + VARIABLE_ARRAY(hpa_purge_item_t, items, HPA_PURGE_BATCH_MAX); + hpa_purge_batch_t batch = { + .max_hp = max_ps, + .npurged_hp_total = 0, + .items = &items[0], + .items_capacity = HPA_PURGE_BATCH_MAX, + .range_watermark = hpa_process_madvise_max_iovec_len(), + }; + assert(batch.range_watermark > 0); + + do { + hpa_batch_pass_start(&batch); + assert(hpa_batch_empty(&batch)); + hpa_central_get_nonpurged(tsdn, central, now, &batch); + if (hpa_batch_empty(&batch)) { + break; + } + /* We don't need any lock while purging pages from the pool. */ + hpa_purge_batch(&central->hooks, batch.items, batch.item_cnt); + hpa_central_put_purged(tsdn, central, &batch); + } while (hpa_batch_full(&batch)); + return batch.npurged_hp_total; +} + +/* + * No need to do any of the below for central->grow_mtx as shard->grow_mtx must be + * held to lock that one. 
+ */ +void +hpa_central_prefork(tsdn_t *tsdn, hpa_central_t *central) { + malloc_mutex_prefork(tsdn, &central->pool_mtx); +} + +void +hpa_central_postfork_parent(tsdn_t *tsdn, hpa_central_t *central) { + malloc_mutex_postfork_parent(tsdn, &central->pool_mtx); +} + +void +hpa_central_postfork_child(tsdn_t *tsdn, hpa_central_t *central) { + malloc_mutex_postfork_child(tsdn, &central->pool_mtx); +} diff --git a/src/jemalloc.c b/src/jemalloc.c index 5d23962d67..6bc25e24ed 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1677,6 +1677,10 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], } CONF_CONTINUE; } + CONF_HANDLE_BOOL(opt_hpa_opts.use_pool, "hpa_use_pool"); + CONF_HANDLE_UINT64_T(opt_hpa_pool_purge_delay_ms, + "hpa_pool_purge_delay_ms", 0, UINT64_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false); if (CONF_MATCH("hpa_dirty_mult")) { if (CONF_MATCH_VALUE("-1")) { @@ -4516,6 +4520,7 @@ _malloc_prefork(void) } } } + arena_global_prefork(tsd_tsdn(tsd), opt_hpa); prof_prefork1(tsd_tsdn(tsd)); stats_prefork(tsd_tsdn(tsd)); tsd_prefork(tsd); @@ -4553,6 +4558,7 @@ _malloc_postfork(void) arena_postfork_parent(tsd_tsdn(tsd), arena); } } + arena_global_postfork_parent(tsd_tsdn(tsd), opt_hpa); prof_postfork_parent(tsd_tsdn(tsd)); if (have_background_thread) { background_thread_postfork_parent(tsd_tsdn(tsd)); @@ -4583,6 +4589,7 @@ jemalloc_postfork_child(void) { arena_postfork_child(tsd_tsdn(tsd), arena); } } + arena_global_postfork_child(tsd_tsdn(tsd), opt_hpa); prof_postfork_child(tsd_tsdn(tsd)); if (have_background_thread) { background_thread_postfork_child(tsd_tsdn(tsd)); diff --git a/src/stats.c b/src/stats.c index be70a6fcb0..848aae01ef 100644 --- a/src/stats.c +++ b/src/stats.c @@ -843,6 +843,9 @@ stats_arena_hpa_shard_counters_print( uint64_t nhugifies; uint64_t nhugify_failures; uint64_t ndehugifies; + uint64_t ndonated_ps; + uint64_t nborrowed_ps; + ; CTL_M2_GET( "stats.arenas.0.hpa_shard.npageslabs", i, &npageslabs, size_t); @@ -874,6 
+877,10 @@ stats_arena_hpa_shard_counters_print( &nhugify_failures, uint64_t); CTL_M2_GET( "stats.arenas.0.hpa_shard.ndehugifies", i, &ndehugifies, uint64_t); + CTL_M2_GET( + "stats.arenas.0.hpa_shard.ndonated_ps", i, &ndonated_ps, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.nborrowed_ps", i, &nborrowed_ps, + uint64_t); emitter_table_printf(emitter, "HPA shard stats:\n" @@ -891,6 +898,10 @@ stats_arena_hpa_shard_counters_print( " / sec)\n" " Dehugifies: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Donated ps to pool: %" FMTu64 " (%" FMTu64 + " / sec)\n" + " Borrowed ps from the pool: %" FMTu64 " (%" FMTu64 + " / sec)\n" "\n", npageslabs, npageslabs_huge, npageslabs_nonhuge, nactive, nactive_huge, nactive_nonhuge, ndirty, ndirty_huge, ndirty_nonhuge, @@ -899,7 +910,9 @@ stats_arena_hpa_shard_counters_print( rate_per_second(npurges, uptime), nhugifies, rate_per_second(nhugifies, uptime), nhugify_failures, rate_per_second(nhugify_failures, uptime), ndehugifies, - rate_per_second(ndehugifies, uptime)); + rate_per_second(ndehugifies, uptime), ndonated_ps, + rate_per_second(ndonated_ps, uptime), nborrowed_ps, + rate_per_second(nborrowed_ps, uptime)); emitter_json_kv(emitter, "npageslabs", emitter_type_size, &npageslabs); emitter_json_kv(emitter, "nactive", emitter_type_size, &nactive); @@ -913,6 +926,10 @@ stats_arena_hpa_shard_counters_print( &nhugify_failures); emitter_json_kv( emitter, "ndehugifies", emitter_type_uint64, &ndehugifies); + emitter_json_kv( + emitter, "ndonated_ps", emitter_type_uint64, &ndonated_ps); + emitter_json_kv( + emitter, "nborrowed_ps", emitter_type_uint64, &nborrowed_ps); emitter_json_object_kv_begin(emitter, "slabs"); emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size, @@ -1141,7 +1158,11 @@ stats_arena_mutexes_print( CTL_LEAF_PREPARE(stats_arenas_mib, 3, "mutexes"); for (mutex_prof_arena_ind_t i = 0; i < mutex_prof_num_arena_mutexes; - i++) { + i++) { + /* hpa_central_pool is global, only print for arena 0 */ + if (i == 
arena_prof_mutex_hpa_central_pool && arena_ind != 0) { + continue; + } const char *name = arena_mutex_names[i]; emitter_json_object_kv_begin(emitter, name); mutex_stats_read_arena( @@ -1665,6 +1686,8 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_SIZE_T("hpa_purge_threshold") OPT_WRITE_UINT64("hpa_min_purge_delay_ms") OPT_WRITE_CHAR_P("hpa_hugify_style") + OPT_WRITE_BOOL("hpa_use_pool") + OPT_WRITE_UINT64("hpa_pool_purge_delay_ms") OPT_WRITE_SIZE_T("hpa_sec_nshards") OPT_WRITE_SIZE_T("hpa_sec_max_alloc") OPT_WRITE_SIZE_T("hpa_sec_max_bytes") @@ -1870,7 +1893,9 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, metadata_thp, resident, mapped, retained; size_t num_background_threads; size_t zero_reallocs; + size_t ndirty_pool; uint64_t background_thread_num_runs, background_thread_run_interval; + uint64_t npurged_pool; CTL_GET("stats.allocated", &allocated, size_t); CTL_GET("stats.active", &active, size_t); @@ -1883,6 +1908,8 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, CTL_GET("stats.retained", &retained, size_t); CTL_GET("stats.zero_reallocs", &zero_reallocs, size_t); + CTL_GET("stats.central_pool.ndirty", &ndirty_pool, size_t); + CTL_GET("stats.central_pool.npurged", &npurged_pool, uint64_t); if (have_background_thread) { CTL_GET("stats.background_thread.num_threads", @@ -1925,6 +1952,11 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, emitter_table_printf(emitter, "Count of realloc(non-null-ptr, 0) calls: %zu\n", zero_reallocs); + /* Central pool */ + emitter_table_printf(emitter, + "Central pool dirty: %zu, purged: %" FMTu64 "\n", ndirty_pool, + npurged_pool); + /* Background thread stats. 
*/ emitter_json_object_kv_begin(emitter, "background_thread"); emitter_json_kv( diff --git a/test/unit/hpa.c b/test/unit/hpa.c index 9c4253cd28..ef558ddb4a 100644 --- a/test/unit/hpa.c +++ b/test/unit/hpa.c @@ -43,7 +43,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_delay_ms */ 0, /* hugify_style */ - hpa_hugify_style_lazy}; + hpa_hugify_style_lazy, + /* use_pool */ + false}; static hpa_shard_opts_t test_hpa_shard_opts_purge = { /* slab_max_alloc */ @@ -67,7 +69,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = { /* min_purge_delay_ms */ 0, /* hugify_style */ - hpa_hugify_style_lazy}; + hpa_hugify_style_lazy, + /* use_pool */ + false}; static hpa_shard_opts_t test_hpa_shard_opts_aggressive = { /* slab_max_alloc */ @@ -91,7 +95,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_aggressive = { /* min_purge_delay_ms */ 10, /* hugify_style */ - hpa_hugify_style_eager}; + hpa_hugify_style_eager, + /* use_pool */ + false}; static hpa_shard_t * create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) { diff --git a/test/unit/hpa_central_pool.c b/test/unit/hpa_central_pool.c new file mode 100644 index 0000000000..79fd22c226 --- /dev/null +++ b/test/unit/hpa_central_pool.c @@ -0,0 +1,329 @@ +#include "test/jemalloc_test.h" + +#include "jemalloc/internal/hpa.h" +#include "jemalloc/internal/nstime.h" + +#define SHARD_IND 111 +#define SHARD_IND2 112 + +#define ALLOC_MAX (HUGEPAGE) + +typedef struct test_data_s test_data_t; +struct test_data_s { + /* + * Must be the first member -- we convert back and forth between the + * test_data_t and the hpa_shard_t; + */ + hpa_shard_t shard; + hpa_central_t central; + base_t *base; + edata_cache_t shard_edata_cache; + + emap_t emap; +}; + +static hpa_shard_opts_t test_hpa_shard_opts_default = { + /* slab_max_alloc */ + ALLOC_MAX, + /* hugification_threshold */ + HUGEPAGE, + /* dirty_mult */ + FXP_INIT_PERCENT(25), + /* deferral_allowed */ + false, + /* hugify_delay_ms */ + 10 * 1000, + /* 
hugify_sync */ + false, + /* min_purge_interval_ms */ + 0, + /* experimental_max_purge_nhp */ + -1, + /* purge_threshold */ + HUGEPAGE, + /* min_purge_delay_ms */ + 0, + /* hugify_style */ + hpa_hugify_style_eager, + /* use_pool */ + true}; + +static hpa_shard_t * +create_test_data( + hpa_central_t *central, hpa_shard_opts_t *opts, unsigned int shard_ind) { + bool err; + base_t *base = base_new(TSDN_NULL, /* ind */ shard_ind, + &ehooks_default_extent_hooks, /* metadata_use_hooks */ true); + assert_ptr_not_null(base, ""); + + test_data_t *test_data = malloc(sizeof(test_data_t)); + assert_ptr_not_null(test_data, ""); + + test_data->base = base; + + err = edata_cache_init(&test_data->shard_edata_cache, base); + assert_false(err, ""); + + err = emap_init(&test_data->emap, test_data->base, /* zeroed */ false); + assert_false(err, ""); + + err = hpa_shard_init(&test_data->shard, central, &test_data->emap, + test_data->base, &test_data->shard_edata_cache, shard_ind, opts); + assert_false(err, ""); + + return (hpa_shard_t *)test_data; +} + +static void +destroy_test_data(hpa_shard_t *shard) { + test_data_t *test_data = (test_data_t *)shard; + base_delete(TSDN_NULL, test_data->base); + free(test_data); +} + +static uintptr_t defer_bump_ptr = HUGEPAGE * 123; +static void * +defer_test_map(size_t size) { + void *result = (void *)defer_bump_ptr; + defer_bump_ptr += size; + return result; +} + +static void +defer_test_unmap(void *ptr, size_t size) { + (void)ptr; + (void)size; +} + +static size_t ndefer_purge_calls = 0; +static size_t npurge_size = 0; +static void +defer_test_purge(void *ptr, size_t size) { + (void)ptr; + npurge_size = size; + ++ndefer_purge_calls; +} + +static bool defer_vectorized_purge_called = false; +static bool +defer_vectorized_purge(void *vec, size_t vlen, size_t nbytes) { + (void)vec; + (void)nbytes; + ++ndefer_purge_calls; + defer_vectorized_purge_called = true; + return false; +} + +static size_t ndefer_hugify_calls = 0; +static bool 
+defer_test_hugify(void *ptr, size_t size, bool sync) { + ++ndefer_hugify_calls; + return false; +} + +static size_t ndefer_dehugify_calls = 0; +static void +defer_test_dehugify(void *ptr, size_t size) { + ++ndefer_dehugify_calls; +} + +static nstime_t defer_curtime; +static void +defer_test_curtime(nstime_t *r_time, bool first_reading) { + *r_time = defer_curtime; +} + +static uint64_t +defer_test_ms_since(nstime_t *past_time) { + return (nstime_ns(&defer_curtime) - nstime_ns(past_time)) / 1000 / 1000; +} + +TEST_BEGIN(test_central_pool) { + test_skip_if(!hpa_supported() || !config_stats); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + opts.purge_threshold = HUGEPAGE; + opts.min_purge_delay_ms = 0; + opts.min_purge_interval_ms = 0; + + hpa_central_t central; + base_t *central_base = base_new(TSDN_NULL, /* ind */ 1234, + &ehooks_default_extent_hooks, /* metadata_use_hooks */ true); + assert_ptr_not_null(central_base, ""); + hpa_central_init(¢ral, central_base, &hooks); + ndefer_purge_calls = 0; + hpa_shard_t *shard1 = create_test_data(¢ral, &opts, SHARD_IND); + hpa_shard_t *shard2 = create_test_data(¢ral, &opts, SHARD_IND2); + + bool deferred_work_generated = false; + nstime_init(&defer_curtime, 10 * 1000 * 1000); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS / 2; i++) { + edatas[i] = pai_alloc(tsdn, &shard1->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Remember the page */ + hpdata_t *ps = 
psset_pick_alloc(&shard1->psset, PAGE); + expect_true(hpdata_huge_get(ps), "Should be huge as we start as huge"); + + /* Deallocate all */ + for (int i = 0; i < NALLOCS / 2; i++) { + pai_dalloc( + tsdn, &shard1->pai, edatas[i], &deferred_work_generated); + } + hpa_shard_do_deferred_work(tsdn, shard1); + expect_true(deferred_work_generated, ""); + expect_zu_eq( + 0, ndefer_purge_calls, "Should donate, not purge delay=0ms"); + + /* Stats should not include the page */ + expect_zu_eq(shard1->psset.stats.merged.nactive, 0, ""); + expect_zu_eq(shard1->psset.stats.merged.npageslabs, 0, "Non huge"); + npurge_size = 0; + + /* Make allocation on second shard */ + edata_t *edata2 = pai_alloc(tsdn, &shard2->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edata2, "Unexpected null edata"); + expect_zu_eq(shard2->psset.stats.merged.nactive, 1, ""); + hpdata_t *ps2 = psset_pick_alloc(&shard2->psset, PAGE); + expect_ptr_eq( + ps, ps2, "Expected to get the same page via central pool"); + expect_true(hpdata_huge_get(ps2), "Should still be huge"); + + expect_zu_eq(shard2->psset.stats.merged.npageslabs, 1, ""); + pai_dalloc(tsdn, &shard2->pai, edata2, &deferred_work_generated); + expect_true(deferred_work_generated, ""); + ndefer_purge_calls = 0; + npurge_size = 0; + hpa_shard_do_deferred_work(tsdn, shard1); + expect_zu_eq(0, ndefer_purge_calls, "No purge, no donate, delay==0ms"); + hpa_shard_do_deferred_work(tsdn, shard2); + expect_zu_eq(0, ndefer_purge_calls, "No purge, yes donate, delay==0ms"); + + /* Move the time above hard coded limit of 10s */ + nstime_iadd(&defer_curtime, UINT64_C(30) * 1000 * 1000 * 1000); + hpa_shard_do_deferred_work(tsdn, shard2); + expect_zu_eq(1, ndefer_purge_calls, "Purged, delay==0ms"); + expect_zu_eq(HUGEPAGE, npurge_size, "Should purge full folio"); + expect_zu_eq(shard1->psset.stats.merged.npageslabs, 0, ""); + expect_zu_eq(shard2->psset.stats.merged.npageslabs, 0, ""); + /* now alloc again and still get the 
same page */ + edata2 = pai_alloc(tsdn, &shard2->pai, PAGE, PAGE, false, false, false, + &deferred_work_generated); + expect_ptr_not_null(edata2, "Unexpected null edata"); + expect_zu_eq(shard2->psset.stats.merged.nactive, 1, ""); + ps2 = psset_pick_alloc(&shard2->psset, PAGE); + expect_ptr_eq( + ps, ps2, "Expected to get the same page via central pool"); + expect_zu_eq(shard2->psset.stats.merged.npageslabs, 1, ""); + pai_dalloc(tsdn, &shard2->pai, edata2, &deferred_work_generated); + + npurge_size = 0; + ndefer_purge_calls = 0; + destroy_test_data(shard1); + destroy_test_data(shard2); + base_delete(TSDN_NULL, central_base); +} +TEST_END + +TEST_BEGIN(test_central_pool_with_delay) { + test_skip_if(!hpa_supported() || !config_stats); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + opts.purge_threshold = HUGEPAGE; + opts.min_purge_delay_ms = 1000; + opts.min_purge_interval_ms = 0; + + hpa_central_t central; + base_t *central_base = base_new(TSDN_NULL, /* ind */ 1234, + &ehooks_default_extent_hooks, /* metadata_use_hooks */ true); + assert_ptr_not_null(central_base, ""); + hpa_central_init(¢ral, central_base, &hooks); + ndefer_purge_calls = 0; + hpa_shard_t *shard1 = create_test_data(¢ral, &opts, SHARD_IND); + hpa_shard_t *shard2 = create_test_data(¢ral, &opts, SHARD_IND2); + + bool deferred_work_generated = false; + nstime_init(&defer_curtime, 10 * 1000 * 1000); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS / 2; i++) { + edatas[i] = pai_alloc(tsdn, &shard1->pai, PAGE, PAGE, false, + false, false, 
&deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Remember the page */ + hpdata_t *ps = psset_pick_alloc(&shard1->psset, PAGE); + expect_true(hpdata_huge_get(ps), "Should be huge as we start as huge"); + + /* Deallocate all */ + for (int i = 0; i < NALLOCS / 2; i++) { + pai_dalloc( + tsdn, &shard1->pai, edatas[i], &deferred_work_generated); + } + hpa_shard_do_deferred_work(tsdn, shard1); + expect_true(deferred_work_generated, ""); + expect_zu_eq(0, ndefer_purge_calls, "No purge, no donation delay=0ms"); + + /* Stats should include the page */ + expect_zu_eq(shard1->psset.stats.merged.nactive, 0, ""); + expect_zu_eq(shard1->psset.stats.merged.npageslabs, 1, ""); + + /* One more second passed */ + nstime_iadd(&defer_curtime, UINT64_C(1000) * 1000 * 1000); + hpa_shard_do_deferred_work(tsdn, shard1); + expect_zu_eq(0, ndefer_purge_calls, "No purge, donation"); + /* Stats should not include the page */ + expect_zu_eq(shard1->psset.stats.merged.nactive, 0, ""); + expect_zu_eq(shard1->psset.stats.merged.npageslabs, 0, ""); + /* Make allocation on second shard */ + edata_t *edata2 = pai_alloc(tsdn, &shard2->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edata2, "Unexpected null edata"); + expect_zu_eq(shard2->psset.stats.merged.nactive, 1, ""); + hpdata_t *ps2 = psset_pick_alloc(&shard2->psset, PAGE); + expect_ptr_eq( + ps, ps2, "Expected to get the same page via central pool"); + expect_true(hpdata_huge_get(ps2), "Should still be huge"); + expect_zu_eq(shard2->psset.stats.merged.npageslabs, 1, ""); + + npurge_size = 0; + ndefer_purge_calls = 0; + destroy_test_data(shard1); + destroy_test_data(shard2); + base_delete(TSDN_NULL, central_base); +} +TEST_END + +int +main(void) { + return test_no_reentrancy( + test_central_pool, test_central_pool_with_delay); +} diff --git a/test/unit/hpa_vectorized_madvise.c b/test/unit/hpa_vectorized_madvise.c index 2121de49af..6566089a82 100644 --- 
a/test/unit/hpa_vectorized_madvise.c +++ b/test/unit/hpa_vectorized_madvise.c @@ -43,7 +43,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* purge_delay_ms */ 0, /* hugify_style */ - hpa_hugify_style_lazy}; + hpa_hugify_style_lazy, + /* use_pool */ + false}; static hpa_shard_t * create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) { diff --git a/test/unit/hpa_vectorized_madvise_large_batch.c b/test/unit/hpa_vectorized_madvise_large_batch.c index e92988dec4..296f06a8e1 100644 --- a/test/unit/hpa_vectorized_madvise_large_batch.c +++ b/test/unit/hpa_vectorized_madvise_large_batch.c @@ -44,7 +44,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_delay_ms */ 0, /* hugify_style */ - hpa_hugify_style_lazy}; + hpa_hugify_style_lazy, + /* use_pool */ + false}; static hpa_shard_t * create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) { diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 4c11e4857d..b6d2107190 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -318,6 +318,8 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(size_t, hpa_purge_threshold, always); TEST_MALLCTL_OPT(uint64_t, hpa_min_purge_delay_ms, always); TEST_MALLCTL_OPT(const char *, hpa_hugify_style, always); + TEST_MALLCTL_OPT(bool, hpa_use_pool, always); + TEST_MALLCTL_OPT(uint64_t, hpa_pool_purge_delay_ms, always); TEST_MALLCTL_OPT(unsigned, narenas, always); TEST_MALLCTL_OPT(const char *, percpu_arena, always); TEST_MALLCTL_OPT(size_t, oversize_threshold, always); @@ -1076,6 +1078,8 @@ TEST_BEGIN(test_stats_arenas_hpa_shard_counters) { TEST_STATS_ARENAS_HPA_SHARD_COUNTERS(uint64_t, npurges); TEST_STATS_ARENAS_HPA_SHARD_COUNTERS(uint64_t, nhugifies); TEST_STATS_ARENAS_HPA_SHARD_COUNTERS(uint64_t, ndehugifies); + TEST_STATS_ARENAS_HPA_SHARD_COUNTERS(uint64_t, ndonated_ps); + TEST_STATS_ARENAS_HPA_SHARD_COUNTERS(uint64_t, nborrowed_ps); #undef TEST_STATS_ARENAS_HPA_SHARD_COUNTERS } From 
ba01d0409cc47dfa31b72740c3aa6549aaaa0c9a Mon Sep 17 00:00:00 2001 From: Slobodan Predolac Date: Tue, 9 Dec 2025 18:08:29 -0800 Subject: [PATCH 2/2] Immediate sharing of a page --- include/jemalloc/internal/arena_externs.h | 4 +- include/jemalloc/internal/hpa_central.h | 14 ++- src/arena.c | 22 +++- src/background_thread.c | 20 +++- src/hpa.c | 95 +++++++++-------- src/hpa_central.c | 97 ++++++++++++----- test/unit/hpa_central_pool.c | 120 ++++------------------ test/unit/hpa_central_pool.sh | 3 + 8 files changed, 190 insertions(+), 185 deletions(-) create mode 100644 test/unit/hpa_central_pool.sh diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index da55e646c5..cef41c9fba 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -46,7 +46,7 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats, - hpa_shard_stats_t *hpastats, sec_stats_t *secstats); + hpa_shard_stats_t *hpastats); void arena_stats_global_central_read(tsdn_t *tsdn, hpa_central_stats_t *stats); void arena_stats_global_central_mutex_read( tsdn_t *tsdn, mutex_prof_data_t *mutex_prof_data); @@ -67,6 +67,8 @@ void arena_decay( tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all); uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena); void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena); +void arena_central_do_deferred_work(tsdn_t *tsdn); +uint64_t arena_central_time_until_deferred_work(tsdn_t *tsdn); void arena_reset(tsd_t *tsd, arena_t *arena); void arena_destroy(tsd_t *tsd, arena_t *arena); cache_bin_sz_t arena_ptr_array_fill_small(tsdn_t *tsdn, arena_t *arena, diff --git a/include/jemalloc/internal/hpa_central.h 
b/include/jemalloc/internal/hpa_central.h index 947c5463ed..16f0756ea4 100644 --- a/include/jemalloc/internal/hpa_central.h +++ b/include/jemalloc/internal/hpa_central.h @@ -14,7 +14,7 @@ struct hpa_pool_s { * Pool of empty huge pages to be shared between shards that are * participating. * - * Page is owned by the pool if it lives in one of these two lists. + * Page is owned by the pool if it lives in one of these two lists. * This means that it should not be part of any hpa_shard's psset at the * same time. */ @@ -68,16 +68,20 @@ bool hpa_central_init( hpdata_t *hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, uint64_t age, bool hugify_eager, bool *oom); -/* Donate empty pages to central */ -void hpa_central_ps_insert(tsdn_t *tsdn, hpa_central_t *central, - hpdata_empty_list_t *pages, const nstime_t *now); +/* Donate empty page to central */ +void hpa_central_donate( + tsdn_t *tsdn, hpa_central_t *central, hpdata_t *ps, const nstime_t *now); /* Get empty page from central without growing it */ -hpdata_t *hpa_central_ps_pop(tsdn_t *tsdn, hpa_central_t *central); +hpdata_t *hpa_central_borrow(tsdn_t *tsdn, hpa_central_t *central); /* Purge up to max_ps empty pages in the central */ size_t hpa_central_purge( tsdn_t *tsdn, hpa_central_t *central, const nstime_t *now, size_t max_ps); +/* Get time in nanoseconds until central pool needs deferred work */ +uint64_t hpa_central_time_until_deferred_work( + tsdn_t *tsdn, hpa_central_t *central); + void hpa_central_prefork(tsdn_t *tsdn, hpa_central_t *central); void hpa_central_postfork_parent(tsdn_t *tsdn, hpa_central_t *central); void hpa_central_postfork_child(tsdn_t *tsdn, hpa_central_t *central); diff --git a/src/arena.c b/src/arena.c index 261dbbd17c..5ee5f407ef 100644 --- a/src/arena.c +++ b/src/arena.c @@ -227,7 +227,6 @@ arena_stats_global_central_mutex_read( malloc_mutex_unlock(tsdn, &arena_pa_central_global.hpa.pool_mtx); } - static void arena_background_thread_inactivity_check( tsdn_t *tsdn, 
arena_t *arena, bool is_background_thread) { @@ -628,6 +627,27 @@ arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena) { pa_shard_do_deferred_work(tsdn, &arena->pa_shard); } +/* Called from background threads to purge central pool. */ +void +arena_central_do_deferred_work(tsdn_t *tsdn) { + if (arena_pa_central_global.hpa.base == NULL) { + return; + } + nstime_t now; + arena_pa_central_global.hpa.hooks.curtime( + &now, /* first_reading */ true); + hpa_central_purge(tsdn, &arena_pa_central_global.hpa, &now, SIZE_MAX); +} + +uint64_t +arena_central_time_until_deferred_work(tsdn_t *tsdn) { + if (arena_pa_central_global.hpa.base == NULL) { + return UINT64_MAX; + } + return hpa_central_time_until_deferred_work( + tsdn, &arena_pa_central_global.hpa); +} + void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab) { bool deferred_work_generated = false; diff --git a/src/background_thread.c b/src/background_thread.c index 4901856a82..75826ed2bc 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -87,7 +87,9 @@ pthread_create_fptr_init(void) { #ifndef JEMALLOC_BACKGROUND_THREAD # define NOT_REACHED \ - { not_reached(); } + { \ + not_reached(); \ + } bool background_thread_create(tsd_t *tsd, unsigned arena_ind) NOT_REACHED bool background_threads_enable(tsd_t *tsd) NOT_REACHED @@ -280,6 +282,22 @@ background_work_sleep_once( } } + /* + * Handle central pool (shared across all arenas). + * Multiple background threads may call this concurrently; + * hpa_central functions handle synchronization internally. 
+ */ + if (!slept_indefinitely) { + arena_central_do_deferred_work(tsdn); + } + if (ns_until_deferred > BACKGROUND_THREAD_MIN_INTERVAL_NS) { + uint64_t ns_central_deferred = + arena_central_time_until_deferred_work(tsdn); + if (ns_central_deferred < ns_until_deferred) { + ns_until_deferred = ns_central_deferred; + } + } + uint64_t sleep_ns; if (ns_until_deferred == BACKGROUND_THREAD_DEFERRED_MAX) { sleep_ns = BACKGROUND_THREAD_INDEFINITE_SLEEP; diff --git a/src/hpa.c b/src/hpa.c index 53c24cd9ce..d3d0f2235f 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -460,42 +460,6 @@ hpa_purge_finish_hp( psset_update_end(&shard->psset, hp_item->hp); } -static void -hpa_donate_empty_ps(tsdn_t *tsdn, hpa_shard_t *shard) { - malloc_mutex_assert_owner(tsdn, &shard->mtx); - if (!shard->opts.use_pool) { - return; - } - - hpdata_empty_list_t to_donate; - hpdata_empty_list_init(&to_donate); - do { - hpdata_t *to_purge = (shard->opts.min_purge_delay_ms > 0) - ? psset_pick_purge( - &shard->psset, &shard->last_time_work_attempted) - : psset_pick_purge(&shard->psset, NULL); - - if (to_purge == NULL || !hpdata_empty(to_purge)) { - break; - } - assert(hpdata_ndirty_get(to_purge) > 0); - - /* Donate the page to the pool */ - psset_remove(&shard->psset, to_purge); - hpdata_empty_list_append(&to_donate, to_purge); - shard->stats.ndonated_ps++; - } while (true); - - if (!hpdata_empty_list_empty(&to_donate)) { - nstime_t now; - nstime_copy(&now, &shard->last_time_work_attempted); - malloc_mutex_unlock(tsdn, &shard->mtx); - hpa_central_ps_insert(tsdn, shard->central, &to_donate, - &shard->last_time_work_attempted); - malloc_mutex_lock(tsdn, &shard->mtx); - } -} - /* Returns number of huge pages purged. 
*/ static inline size_t hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) { @@ -515,8 +479,6 @@ hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) { }; assert(batch.range_watermark > 0); - hpa_donate_empty_ps(tsdn, shard); - while (1) { hpa_batch_pass_start(&batch); assert(hpa_batch_empty(&batch)); @@ -684,7 +646,7 @@ hpa_shard_maybe_do_deferred_work( max_purges = max_purge_nhp; } - if (shard->opts.use_pool) { + if (shard->opts.use_pool && !shard->opts.deferral_allowed) { size_t max_pool_ops = (forced ? (size_t)-1 : 8); hpa_central_t *central = shard->central; nstime_t now; @@ -711,16 +673,49 @@ hpa_shard_maybe_do_deferred_work( } static void -hpa_add_pool_page_to_psset(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { - assert(hpdata_alloc_allowed_get(ps) && hpdata_empty(ps) - && hpdata_consistent(ps)); - if (hpdata_purge_allowed_get(ps)) { +hpa_borrow_page_from_pool(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + hpdata_assert_consistent(ps); + assert(hpdata_empty(ps)); + assert(!hpdata_updating_get(ps)); + assert(!hpdata_changing_state_get(ps)); + assert(hpdata_alloc_allowed_get(ps)); + + /* Page will be purgeable per this shard's rules. 
*/ + bool purgable = hpa_good_purge_candidate(shard, ps); + hpdata_purge_allowed_set(ps, purgable); + if (purgable) { hpa_update_purgable_time(shard, ps); - if (hpdata_huge_get(ps)) { - shard->stats.nborrowed_ps++; - } } + /* Page is empty ensure we do not add to hugify list */ + assert(!hpdata_hugify_allowed_get(ps)); + shard->stats.nborrowed_ps++; psset_insert(&shard->psset, ps); + hpdata_assert_consistent(ps); +} + +static void +hpa_donate_ps_to_pool(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + nstime_t now; + shard->central->hooks.curtime(&now, /* first_reading */ true); + + hpdata_assert_consistent(ps); + assert(hpdata_empty(ps)); + assert(!hpdata_updating_get(ps)); + assert(!hpdata_changing_state_get(ps)); + assert(hpdata_alloc_allowed_get(ps)); + assert(hpdata_ndirty_get(ps) > 0); + assert(!hpdata_hugify_allowed_get(ps)); + + psset_remove(&shard->psset, ps); + shard->stats.ndonated_ps++; + hpdata_assert_consistent(ps); + + malloc_mutex_unlock(tsdn, &shard->mtx); + hpa_central_donate(tsdn, shard->central, ps, &now); + malloc_mutex_lock(tsdn, &shard->mtx); } static edata_t * @@ -737,9 +732,9 @@ hpa_try_alloc_one_no_grow( hpdata_t *ps = psset_pick_alloc(&shard->psset, size); if (ps == NULL && shard->opts.use_pool) { - ps = hpa_central_ps_pop(tsdn, shard->central); + ps = hpa_central_borrow(tsdn, shard->central); if (ps != NULL) { - hpa_add_pool_page_to_psset(tsdn, shard, ps); + hpa_borrow_page_from_pool(tsdn, shard, ps); } } if (ps == NULL) { @@ -1058,6 +1053,10 @@ hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) { hpdata_nactive_get(ps), hpdata_age_get(ps)); hpa_update_purge_hugify_eligibility(tsdn, shard, ps); psset_update_end(&shard->psset, ps); + if (hpdata_empty(ps) && shard->opts.use_pool + && !hpdata_changing_state_get(ps)) { + hpa_donate_ps_to_pool(tsdn, shard, ps); + } } static void diff --git a/src/hpa_central.c b/src/hpa_central.c index 281e265ea2..75b47a3d4b 100644 --- 
a/src/hpa_central.c +++ b/src/hpa_central.c @@ -28,23 +28,13 @@ hpa_central_stats_read( malloc_mutex_unlock(tsdn, ¢ral->pool_mtx); } -static inline void -hpa_central_pool_concat_nonpurged(tsdn_t *tsdn, hpa_central_t *central, - hpdata_empty_list_t *pages, size_t new_dirty) { - malloc_mutex_lock(tsdn, ¢ral->pool_mtx); - hpdata_empty_list_concat(¢ral->pool.nonpurged, pages); - central->stats.ndirty_pool += new_dirty; - malloc_mutex_unlock(tsdn, ¢ral->pool_mtx); -} - static void hpa_central_get_nonpurged(tsdn_t *tsdn, hpa_central_t *central, const nstime_t *now, hpa_purge_batch_t *batch) { malloc_mutex_lock(tsdn, ¢ral->pool_mtx); while (!hpa_batch_full(batch) && !hpdata_empty_list_empty(¢ral->pool.nonpurged)) { - hpdata_t *ps = hpdata_empty_list_first( - ¢ral->pool.nonpurged); + hpdata_t *ps = hpdata_empty_list_last(¢ral->pool.nonpurged); assert(hpdata_empty(ps) && hpdata_purge_allowed_get(ps)); const nstime_t *allowed = hpdata_time_purge_allowed_get(ps); @@ -56,7 +46,8 @@ hpa_central_get_nonpurged(tsdn_t *tsdn, hpa_central_t *central, hpa_purge_item_t *hp_item = &batch->items[batch->item_cnt]; batch->item_cnt++; hp_item->hp = ps; - hp_item->dehugify = hpdata_huge_get(hp_item->hp); + /* We only deal with empty pages in the pool */ + hp_item->dehugify = false; size_t nranges; hpdata_alloc_allowed_set(hp_item->hp, false); size_t ndirty = hpdata_purge_begin( @@ -78,8 +69,11 @@ hpa_central_put_purged( for (size_t i = 0; i < batch->item_cnt; ++i) { hpa_purge_item_t *hp_item = &batch->items[i]; - if (hp_item->dehugify) { + /* Page was empty, so we just change the flag after purging */ + if (hpdata_huge_get(hp_item->hp)) { hpdata_dehugify(hp_item->hp); + hpdata_purged_when_empty_and_huge_set( + hp_item->hp, true); } hpdata_purge_end(hp_item->hp, &hp_item->state); hpdata_alloc_allowed_set(hp_item->hp, true); @@ -96,33 +90,50 @@ hpa_central_put_purged( } void -hpa_central_ps_insert(tsdn_t *tsdn, hpa_central_t *central, - hpdata_empty_list_t *pages, const nstime_t *now) { - 
assert(!hpdata_empty_list_empty(pages)); - + assert(now != NULL); nstime_t purge_time; nstime_copy(&purge_time, now); uint64_t purge_delay_ns = opt_hpa_pool_purge_delay_ms * MILLION; nstime_iadd(&purge_time, purge_delay_ns); - - hpdata_t *ps; - size_t new_dirty = 0; - ql_foreach (ps, &pages->head, ql_link_empty) { - assert(hpdata_empty(ps)); - assert(hpdata_ndirty_get(ps) > 0); - hpdata_time_purge_allowed_set(ps, &purge_time); - new_dirty += hpdata_ndirty_get(ps); - } - hpa_central_pool_concat_nonpurged(tsdn, central, pages, new_dirty); + assert(hpdata_empty(ps)); + assert(hpdata_ndirty_get(ps) > 0); + /* + * Central pool purge policy: We expect to receive pages with ndirty > 0 + * from shards. Regardless of the source shard's purge settings + * (including dirty_mult=-1), donated pages are marked as purgeable and + * will be purged after hpa_pool_purge_delay_ms milliseconds. This + * allows the central pool to reclaim memory independently of individual + * shard policies. + */ + hpdata_purge_allowed_set(ps, true); + hpdata_time_purge_allowed_set(ps, &purge_time); + size_t new_dirty = hpdata_ndirty_get(ps); + malloc_mutex_lock(tsdn, &central->pool_mtx); + central->stats.ndirty_pool += new_dirty; + /* + * Insert at the head; borrowing also takes from the head (LIFO), so + * the most recently donated pages are reused first, which gives better + * cache locality. Older pages accumulate at the tail, and the purge + * path drains the list from the tail (FIFO), purging the oldest first. + */ + hpdata_empty_list_prepend(&central->pool.nonpurged, ps); + malloc_mutex_unlock(tsdn, &central->pool_mtx); } hpdata_t * -hpa_central_ps_pop(tsdn_t *tsdn, hpa_central_t *central) { +hpa_central_borrow(tsdn_t *tsdn, hpa_central_t *central) { hpdata_t *ps = NULL; malloc_mutex_lock(tsdn, &central->pool_mtx); + /* + * Prefer non-purged pages over purged ones.
 Non-purged pages are cheaper + * to use (no need to fault pages back in) and allow purged pages to + * remain as a reserve for when the pool is under pressure. + */ if (!hpdata_empty_list_empty(&central->pool.nonpurged)) { + /* Take from the head (LIFO) - gets the most recently donated page. */ ps = hpdata_empty_list_first(&central->pool.nonpurged); hpdata_empty_list_remove(&central->pool.nonpurged, ps); } @@ -284,6 +295,36 @@ hpa_central_purge( return batch.npurged_hp_total; } +uint64_t +hpa_central_time_until_deferred_work(tsdn_t *tsdn, hpa_central_t *central) { + nstime_t purge_allowed; + nstime_init_zero(&purge_allowed); + + malloc_mutex_lock(tsdn, &central->pool_mtx); + if (!hpdata_empty_list_empty(&central->pool.nonpurged)) { + /* Get the last element (oldest in terms of insertion order) */ + hpdata_t *ps = hpdata_empty_list_last(&central->pool.nonpurged); + nstime_copy(&purge_allowed, hpdata_time_purge_allowed_get(ps)); + } + malloc_mutex_unlock(tsdn, &central->pool_mtx); + + if (nstime_equals_zero(&purge_allowed)) { + /* No pages to purge */ + return BACKGROUND_THREAD_DEFERRED_MAX; + } + + nstime_t now; + central->hooks.curtime(&now, /* first_reading */ true); + + if (nstime_compare(&purge_allowed, &now) <= 0) { + /* Already ready for purging */ + return BACKGROUND_THREAD_DEFERRED_MIN; + } + + /* Return nanoseconds until purge is allowed */ + return nstime_ns_between(&now, &purge_allowed); +} + /* *No need to do any of below for central->grow_mtx as shard->grow_mtx must be * held to lock that one.
diff --git a/test/unit/hpa_central_pool.c b/test/unit/hpa_central_pool.c index 79fd22c226..938153be24 100644 --- a/test/unit/hpa_central_pool.c +++ b/test/unit/hpa_central_pool.c @@ -67,8 +67,11 @@ create_test_data( err = emap_init(&test_data->emap, test_data->base, /* zeroed */ false); assert_false(err, ""); - err = hpa_shard_init(&test_data->shard, central, &test_data->emap, - test_data->base, &test_data->shard_edata_cache, shard_ind, opts); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + sec_opts_t sec_opts; + sec_opts.nshards = 0; + err = hpa_shard_init(tsdn, &test_data->shard, central, &test_data->emap, + test_data->base, &test_data->shard_edata_cache, shard_ind, opts, &sec_opts); assert_false(err, ""); return (hpa_shard_t *)test_data; @@ -177,7 +180,7 @@ TEST_BEGIN(test_central_pool) { expect_ptr_not_null(edatas[i], "Unexpected null edata"); } /* Remember the page */ - hpdata_t *ps = psset_pick_alloc(&shard1->psset, PAGE); + hpdata_t *ps = edata_ps_get(edatas[0]); expect_true(hpdata_huge_get(ps), "Should be huge as we start as huge"); /* Deallocate all */ @@ -186,9 +189,8 @@ TEST_BEGIN(test_central_pool) { tsdn, &shard1->pai, edatas[i], &deferred_work_generated); } hpa_shard_do_deferred_work(tsdn, shard1); - expect_true(deferred_work_generated, ""); - expect_zu_eq( - 0, ndefer_purge_calls, "Should donate, not purge delay=0ms"); + expect_false(deferred_work_generated, "Page was donated on dalloc"); + expect_zu_eq(0, ndefer_purge_calls, "Should donate, not purge"); /* Stats should not include the page */ expect_zu_eq(shard1->psset.stats.merged.nactive, 0, ""); @@ -200,25 +202,24 @@ TEST_BEGIN(test_central_pool) { false, false, &deferred_work_generated); expect_ptr_not_null(edata2, "Unexpected null edata"); expect_zu_eq(shard2->psset.stats.merged.nactive, 1, ""); - hpdata_t *ps2 = psset_pick_alloc(&shard2->psset, PAGE); - expect_ptr_eq( - ps, ps2, "Expected to get the same page via central pool"); + hpdata_t *ps2 = edata_ps_get(edata2); + expect_ptr_eq(ps, ps2, 
"Expected to get the same page via pool"); expect_true(hpdata_huge_get(ps2), "Should still be huge"); expect_zu_eq(shard2->psset.stats.merged.npageslabs, 1, ""); pai_dalloc(tsdn, &shard2->pai, edata2, &deferred_work_generated); - expect_true(deferred_work_generated, ""); + expect_false(deferred_work_generated, "Page donated to the pool"); ndefer_purge_calls = 0; npurge_size = 0; hpa_shard_do_deferred_work(tsdn, shard1); - expect_zu_eq(0, ndefer_purge_calls, "No purge, no donate, delay==0ms"); + expect_zu_eq(0, ndefer_purge_calls, "Empty shard"); hpa_shard_do_deferred_work(tsdn, shard2); - expect_zu_eq(0, ndefer_purge_calls, "No purge, yes donate, delay==0ms"); + expect_zu_eq(0, ndefer_purge_calls, "Empty shard"); - /* Move the time above hard coded limit of 10s */ + /* Move the time above limit of 10s we passed in MALLOC_CONF */ nstime_iadd(&defer_curtime, UINT64_C(30) * 1000 * 1000 * 1000); - hpa_shard_do_deferred_work(tsdn, shard2); - expect_zu_eq(1, ndefer_purge_calls, "Purged, delay==0ms"); + hpa_central_purge(tsdn, &central, &defer_curtime, SIZE_MAX); + expect_zu_eq(1, ndefer_purge_calls, "Purged, delay==10ms"); expect_zu_eq(HUGEPAGE, npurge_size, "Should purge full folio"); expect_zu_eq(shard1->psset.stats.merged.npageslabs, 0, ""); expect_zu_eq(shard2->psset.stats.merged.npageslabs, 0, ""); @@ -227,9 +228,8 @@ &deferred_work_generated); expect_ptr_not_null(edata2, "Unexpected null edata"); expect_zu_eq(shard2->psset.stats.merged.nactive, 1, ""); - ps2 = psset_pick_alloc(&shard2->psset, PAGE); - expect_ptr_eq( - ps, ps2, "Expected to get the same page via central pool"); + ps2 = edata_ps_get(edata2); + expect_ptr_eq(ps, ps2, "Expected to get the same page via pool"); expect_zu_eq(shard2->psset.stats.merged.npageslabs, 1, ""); pai_dalloc(tsdn, &shard2->pai, edata2, &deferred_work_generated); @@ -241,89 +241,7 @@ } TEST_END -TEST_BEGIN(test_central_pool_with_delay) { - test_skip_if(!hpa_supported() ||
!config_stats); - - hpa_hooks_t hooks; - hooks.map = &defer_test_map; - hooks.unmap = &defer_test_unmap; - hooks.purge = &defer_test_purge; - hooks.hugify = &defer_test_hugify; - hooks.dehugify = &defer_test_dehugify; - hooks.curtime = &defer_test_curtime; - hooks.ms_since = &defer_test_ms_since; - hooks.vectorized_purge = &defer_vectorized_purge; - - hpa_shard_opts_t opts = test_hpa_shard_opts_default; - opts.deferral_allowed = true; - opts.purge_threshold = HUGEPAGE; - opts.min_purge_delay_ms = 1000; - opts.min_purge_interval_ms = 0; - - hpa_central_t central; - base_t *central_base = base_new(TSDN_NULL, /* ind */ 1234, - &ehooks_default_extent_hooks, /* metadata_use_hooks */ true); - assert_ptr_not_null(central_base, ""); - hpa_central_init(&central, central_base, &hooks); - ndefer_purge_calls = 0; - hpa_shard_t *shard1 = create_test_data(&central, &opts, SHARD_IND); - hpa_shard_t *shard2 = create_test_data(&central, &opts, SHARD_IND2); - - bool deferred_work_generated = false; - nstime_init(&defer_curtime, 10 * 1000 * 1000); - tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); - enum { NALLOCS = HUGEPAGE_PAGES }; - edata_t *edatas[NALLOCS]; - for (int i = 0; i < NALLOCS / 2; i++) { - edatas[i] = pai_alloc(tsdn, &shard1->pai, PAGE, PAGE, false, - false, false, &deferred_work_generated); - expect_ptr_not_null(edatas[i], "Unexpected null edata"); - } - /* Remember the page */ - hpdata_t *ps = psset_pick_alloc(&shard1->psset, PAGE); - expect_true(hpdata_huge_get(ps), "Should be huge as we start as huge"); - - /* Deallocate all */ - for (int i = 0; i < NALLOCS / 2; i++) { - pai_dalloc( - tsdn, &shard1->pai, edatas[i], &deferred_work_generated); - } - hpa_shard_do_deferred_work(tsdn, shard1); - expect_true(deferred_work_generated, ""); - expect_zu_eq(0, ndefer_purge_calls, "No purge, no donation delay=0ms"); - - /* Stats should include the page */ - expect_zu_eq(shard1->psset.stats.merged.nactive, 0, ""); - expect_zu_eq(shard1->psset.stats.merged.npageslabs, 1, ""); - - /* One more second
passed */ - nstime_iadd(&defer_curtime, UINT64_C(1000) * 1000 * 1000); - hpa_shard_do_deferred_work(tsdn, shard1); - expect_zu_eq(0, ndefer_purge_calls, "No purge, donation"); - /* Stats should not include the page */ - expect_zu_eq(shard1->psset.stats.merged.nactive, 0, ""); - expect_zu_eq(shard1->psset.stats.merged.npageslabs, 0, ""); - /* Make allocation on second shard */ - edata_t *edata2 = pai_alloc(tsdn, &shard2->pai, PAGE, PAGE, false, - false, false, &deferred_work_generated); - expect_ptr_not_null(edata2, "Unexpected null edata"); - expect_zu_eq(shard2->psset.stats.merged.nactive, 1, ""); - hpdata_t *ps2 = psset_pick_alloc(&shard2->psset, PAGE); - expect_ptr_eq( - ps, ps2, "Expected to get the same page via central pool"); - expect_true(hpdata_huge_get(ps2), "Should still be huge"); - expect_zu_eq(shard2->psset.stats.merged.npageslabs, 1, ""); - - npurge_size = 0; - ndefer_purge_calls = 0; - destroy_test_data(shard1); - destroy_test_data(shard2); - base_delete(TSDN_NULL, central_base); -} -TEST_END - int main(void) { - return test_no_reentrancy( - test_central_pool, test_central_pool_with_delay); + return test_no_reentrancy(test_central_pool); } diff --git a/test/unit/hpa_central_pool.sh b/test/unit/hpa_central_pool.sh new file mode 100644 index 0000000000..61e97a6adb --- /dev/null +++ b/test/unit/hpa_central_pool.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +export MALLOC_CONF="hpa:true,hpa_use_pool:true,hpa_pool_purge_delay_ms:10000"