Rebalancing pages across NUMA nodes can improve locality. After the CPU-side owner of a memory range changes, keeping its pages on the old node can turn future accesses into remote accesses. For anonymous memory whose contents can be discarded, Linux can place the range on the new node without page migration. The usual sequence is to discard the old pages, set the NUMA policy for future faults, and then touch the range so that Linux allocates new pages on the preferred node. move_pages can then be used to verify where the pages landed.

  1. madvise(MADV_DONTNEED): discard resident pages in the range.
  2. set_mempolicy(MPOL_PREFERRED): choose the preferred node for future faults.
  3. Later memory access: refault the page under the current policy.
  4. move_pages(..., nodes = NULL, ...): query page locations and verify that the new pages landed on the desired nodes.

MPOL_PREFERRED can fall back to another node if the requested node cannot satisfy the allocation. Use MPOL_BIND instead when fallback should be treated as failure.

This is discard and refault, not migration. The old contents are gone.

The program below verifies the behavior for one normal anonymous page and one 2MB anonymous MAP_HUGETLB page. It faults each mapping on node 0, discards the page, prefers node 1, then refaults. For the HugeTLB test, reserve some 2MB huge pages on each node first:

echo 10 | sudo tee /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
echo 10 | sudo tee /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages

#include <errno.h>
#include <limits.h>
#include <linux/mempolicy.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Make `node` the preferred NUMA node for future page faults and log the
 * choice.
 *
 * node:  preferred node ID; must fit in a single unsigned long nodemask.
 * label: tag printed left-aligned in front of the log line.
 *
 * Exits with status 1 if `node` does not fit in the mask or if
 * set_mempolicy fails.
 */
static void prefer(int node, const char* label) {
    const unsigned long mask_bits = sizeof(unsigned long) * CHAR_BIT;

    /* Shifting by a negative count or by >= the word width is undefined. */
    if (node < 0 || (unsigned long)node >= mask_bits) {
        fprintf(stderr, "prefer: node %d does not fit in a %lu-bit nodemask\n",
                node, mask_bits);
        exit(1);
    }

    const unsigned long mask = 1UL << node;

    /*
     * NOTE: Linux decrements maxnode before reading the mask, so passing the
     * exact bit count would make it ignore the highest bit. Pass one more
     * than the bit count (libnuma does the same); the kernel still reads
     * only one word.
     */
    const unsigned long maxnode = mask_bits + 1;

    if (syscall(SYS_set_mempolicy, MPOL_PREFERRED, &mask, maxnode) != 0) {
        perror("set_mempolicy(MPOL_PREFERRED)");
        exit(1);
    }

    printf("%-18s MPOL_PREFERRED node %d\n", label, node);
}

/*
 * Print where the page at `addr` currently lives.
 *
 * move_pages is called with a NULL node list, so it only queries the
 * location and writes the result into a per-page status slot: a node
 * number when non-negative, a negated errno value otherwise.
 *
 * label: tag printed left-aligned in front of the result.
 * addr:  address inside the page to look up.
 */
static void print_node(const char* label, char* addr) {
    void* pages[1] = { addr };
    int where = 0;

    const long rc = syscall(SYS_move_pages, 0, 1, pages, NULL, &where, 0);
    if (rc != 0) {
        printf("%-18s move_pages: %s\n", label, strerror(errno));
        return;
    }

    /* -ENOENT is the status reported for a page that is not present. */
    if (where == -ENOENT) {
        printf("%-18s not resident\n", label);
        return;
    }

    if (where < 0) {
        printf("%-18s error %d\n", label, -where);
        return;
    }

    printf("%-18s node %d\n", label, where);
}

/*
 * Run one discard-and-refault experiment on a fresh anonymous mapping.
 *
 * Steps: prefer node 0, map, touch, discard with MADV_DONTNEED, prefer
 * node 1, touch again. The page location is printed after every step.
 *
 * name:  heading printed for this experiment.
 * len:   size of the mapping in bytes.
 * flags: extra mmap flags OR-ed into MAP_PRIVATE | MAP_ANONYMOUS.
 *
 * mmap or madvise failures are reported on stdout and end the experiment
 * early; they do not terminate the program.
 */
static void test(const char* name, size_t len, int flags) {
    printf("%s (%zu bytes)\n", name, len);

    /* The policy must be in place before the first touch allocates the page. */
    prefer(0, "first fault");

    void* mapping = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | flags, -1, 0);
    if (mapping == MAP_FAILED) {
        printf("mmap: %s\n\n", strerror(errno));
        return;
    }

    char* bytes = mapping;
    print_node("after mmap", bytes);

    /* First write faults the page in under the node 0 preference. */
    *bytes = 1;
    print_node("after fault", bytes);

    if (madvise(bytes, len, MADV_DONTNEED) == 0) {
        print_node("after dontneed", bytes);

        /* Switch the preference, then write again to refault the page. */
        prefer(1, "refault");
        *bytes = 2;
        print_node("after refault", bytes);

        munmap(bytes, len);
        putchar('\n');
        return;
    }

    printf("madvise(MADV_DONTNEED): %s\n\n", strerror(errno));
    munmap(bytes, len);
}

/*
 * Run the experiment once on a normal page and once on a 2 MiB
 * MAP_HUGETLB page.
 *
 * Returns 1 if the system page size cannot be determined, 0 otherwise.
 */
int main(void) {
    const long base_page = sysconf(_SC_PAGESIZE);
    if (base_page <= 0) {
        perror("sysconf(_SC_PAGESIZE)");
        return 1;
    }

    /* Unbuffered stdout keeps output in order with perror's stderr messages. */
    setbuf(stdout, NULL);
    fputs("policy: MPOL_PREFERRED\n\n", stdout);

    const size_t huge_len = (size_t)2 << 20; /* 2 MiB */

    test("normal page", (size_t)base_page, 0);
    test("hugetlb page", huge_len, MAP_HUGETLB);

    return 0;
}

Sample output:

policy: MPOL_PREFERRED

normal page (4096 bytes)
first fault        MPOL_PREFERRED node 0
after mmap         not resident
after fault        node 0
after dontneed     not resident
refault            MPOL_PREFERRED node 1
after refault      node 1

hugetlb page (2097152 bytes)
first fault        MPOL_PREFERRED node 0
after mmap         not resident
after fault        node 0
after dontneed     not resident
refault            MPOL_PREFERRED node 1
after refault      node 1