Skip to content

Commit

Permalink
Support to optimize page placement via NUMA balancing among bound nodes
Browse files Browse the repository at this point in the history
In Linux kernel v5.12, a new mode flag MPOL_F_NUMA_BALANCING is added
to set_mempolicy() to optimize the page placement among the NUMA nodes
with the NUMA balancing mechanism even if the memory of the
applications is bound with MPOL_BIND.  For details about the kernel
change, please refer to commit bda420b98505 ("numa balancing: migrate
on fault among multiple bound nodes").  This patch adds the
corresponding support to libnuma and numactl.

A new API, numa_set_membind_balancing(), is added to libnuma.  It is the
same as numa_set_membind() except that the Linux kernel NUMA balancing
will be enabled for the task if the feature is supported by the
kernel.

At the same time, a new option, --balancing (-b), is added to numactl.
It can be used before the --membind/-m memory policy in the command
line.  With it, the Linux kernel NUMA balancing will be enabled for
the process if --membind/-m is used and the feature is supported by
the kernel.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
  • Loading branch information
yhuang-intel authored and andikleen committed Mar 1, 2021
1 parent b46666e commit 156e4b1
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 3 deletions.
14 changes: 14 additions & 0 deletions libnuma.c
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,20 @@ numa_set_membind_v2(struct bitmask *bmp)

make_internal_alias(numa_set_membind_v2);

/*
 * Bind memory allocation to the nodes set in bmp and request that the
 * kernel optimize page placement among them with NUMA balancing.
 *
 * The policy is first installed with MPOL_F_NUMA_BALANCING or'ed into
 * MPOL_BIND.  If that attempt fails with EINVAL the flag is taken to be
 * unsupported: errno is cleared and the request is retried without the
 * flag through numa_set_membind_v2().  Any other failure is reported
 * through numa_error().
 */
void
numa_set_membind_balancing(struct bitmask *bmp)
{
	/* First try with the balancing flag; nothing more to do on success. */
	if (set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
			  bmp->maskp, bmp->size + 1) >= 0)
		return;

	/* Anything other than EINVAL is a genuine failure. */
	if (errno != EINVAL) {
		numa_error("set_mempolicy");
		return;
	}

	/* MPOL_F_NUMA_BALANCING: ignore if unsupported */
	errno = 0;
	numa_set_membind_v2(bmp);
}

/*
* copy a bitmask map body to a numa.h nodemask_t structure
*/
Expand Down
15 changes: 15 additions & 0 deletions numa.3
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ numa \- NUMA policy library
.br
.BI "void numa_set_membind(struct bitmask *" nodemask );
.br
.BI "void numa_set_membind_balancing(struct bitmask *" nodemask );
.br
.B struct bitmask *numa_get_membind(void);
.sp
.BI "void *numa_alloc_onnode(size_t " size ", int " node );
Expand Down Expand Up @@ -538,6 +540,19 @@ that contains nodes other than those in the mask returned by
.IR numa_get_mems_allowed ()
will result in an error.

.BR numa_set_membind_balancing ()
sets the memory allocation mask and enables the Linux kernel NUMA
balancing for the task if the feature is supported by the kernel.
The task will only allocate memory from the nodes set in
.IR nodemask .
Passing an empty
.I nodemask
or a
.I nodemask
that contains nodes other than those in the mask returned by
.IR numa_get_mems_allowed ()
will result in an error.

.BR numa_get_membind ()
returns the mask of nodes from which memory can currently be allocated.
If the returned mask is equal to
Expand Down
4 changes: 4 additions & 0 deletions numa.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,10 @@ void numa_set_localalloc(void);
/* Only allocate memory from the nodes set in mask. 0 to turn off */
void numa_set_membind(struct bitmask *nodemask);

/* Only allocate memory from the nodes set in bmp, as numa_set_membind()
   does, and additionally optimize page placement with Linux kernel NUMA
   balancing if possible. 0 to turn off */
void numa_set_membind_balancing(struct bitmask *bmp);

/* Return current membind */
struct bitmask *numa_get_membind(void);

Expand Down
12 changes: 12 additions & 0 deletions numactl.8
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ numactl \- Control NUMA policy for processes or shared memory
[
.B \-\-all
] [
.B \-\-balancing
] [
.B \-\-interleave nodes
] [
.B \-\-preferred node
Expand Down Expand Up @@ -168,6 +170,12 @@ but if memory cannot be allocated there fall back to other nodes.
This option takes only a single node number.
Relative notation may be used.
.TP
.B \-\-balancing, \-b
Enable Linux kernel NUMA balancing for the process if it is supported by kernel.
This should be used with
.I \-\-membind, \-m
only; otherwise it is ignored.
.TP
.B \-\-show, \-s
Show NUMA policy settings of the current process.
.TP
Expand Down Expand Up @@ -278,6 +286,10 @@ numactl \-\-cpunodebind=0 \-\-membind=0,1 -- process -l
Run process as above, but with an option (-l) that would be confused with
a numactl option.

numactl \-\-cpunodebind=0 \-\-balancing \-\-membind=0,1 process
Run process on node 0 with memory allocated on node 0 and 1. Optimize the
page placement with Linux kernel NUMA balancing mechanism if possible.

numactl \-\-cpunodebind=netdev:eth0 \-\-membind=netdev:eth0 network-server
Run network-server on the node of network device eth0 with its memory
also in the same node.
Expand Down
17 changes: 14 additions & 3 deletions numactl.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct option opts[] = {
{"membind", 1, 0, 'm'},
{"show", 0, 0, 's' },
{"localalloc", 0,0, 'l'},
{"balancing", 0, 0, 'b'},
{"hardware", 0,0,'H' },

{"shm", 1, 0, 'S'},
Expand All @@ -65,9 +66,10 @@ struct option opts[] = {
void usage(void)
{
fprintf(stderr,
"usage: numactl [--all | -a] [--interleave= | -i <nodes>] [--preferred= | -p <node>]\n"
" [--physcpubind= | -C <cpus>] [--cpunodebind= | -N <nodes>]\n"
" [--membind= | -m <nodes>] [--localalloc | -l] command args ...\n"
"usage: numactl [--all | -a] [--balancing | -b] [--interleave= | -i <nodes>]\n"
" [--preferred= | -p <node>] [--physcpubind= | -C <cpus>]\n"
" [--cpunodebind= | -N <nodes>] [--membind= | -m <nodes>]\n"
" [--localalloc | -l] command args ...\n"
" numactl [--show | -s]\n"
" numactl [--hardware | -H]\n"
" numactl [--length | -l <length>] [--offset | -o <offset>] [--shmmode | -M <shmmode>]\n"
Expand All @@ -90,6 +92,8 @@ void usage(void)
"all numbers and ranges can be made cpuset-relative with +\n"
"the old --cpubind argument is deprecated.\n"
"use --cpunodebind or --physcpubind instead\n"
"use --balancing | -b to enable Linux kernel NUMA balancing\n"
"for the process if it is supported by kernel\n"
"<length> can have g (GB), m (MB) or k (KB) suffixes\n");
exit(1);
}
Expand Down Expand Up @@ -338,6 +342,7 @@ int do_dump = 0;
int shmattached = 0;
int did_node_cpu_parse = 0;
int parse_all = 0;
int numa_balancing = 0;
char *shmoption;

void check_cpubind(int flag)
Expand Down Expand Up @@ -431,6 +436,10 @@ int main(int ac, char **av)
nopolicy();
hardware();
exit(0);
case 'b': /* --balancing */
nopolicy();
numa_balancing = 1;
break;
case 'i': /* --interleave */
checknuma();
if (parse_all)
Expand Down Expand Up @@ -507,6 +516,8 @@ int main(int ac, char **av)
numa_set_bind_policy(1);
if (shmfd >= 0) {
numa_tonodemask_memory(shmptr, shmlen, mask);
} else if (numa_balancing) {
numa_set_membind_balancing(mask);
} else {
numa_set_membind(mask);
}
Expand Down
3 changes: 3 additions & 0 deletions numaif.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ extern long move_pages(int pid, unsigned long count,
#define MPOL_LOCAL 4
#define MPOL_MAX 5

/* Flags for set_mempolicy, specified in mode */
#define MPOL_F_NUMA_BALANCING (1 << 13) /* Optimize with NUMA balancing if possible */

/* Flags for get_mem_policy */
#define MPOL_F_NODE (1<<0) /* return next il node or node of address */
/* Warning: MPOL_F_NODE is unsupported and
Expand Down
8 changes: 8 additions & 0 deletions versions.ldscript
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,11 @@ libnuma_1.4 {
local:
*;
} libnuma_1.3;

# New interface for membind with NUMA balancing optimization
libnuma_1.5 {
global:
numa_set_membind_balancing;
local:
*;
} libnuma_1.4;

0 comments on commit 156e4b1

Please sign in to comment.