From 0f203430dd88cc6270310956ace58aca639edb59 Mon Sep 17 00:00:00 2001 From: He Chen Date: Thu, 27 Apr 2017 10:35:58 +0800 Subject: numa: Allow setting NUMA distance for different NUMA nodes This patch is going to add SLIT table support in QEMU, and provides additional option `dist` for command `-numa` to allow user set vNUMA distance by QEMU command. With this patch, when a user wants to create a guest that contains several vNUMA nodes and also wants to set distance among those nodes, the QEMU command would like: ``` -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa node,nodeid=2,cpus=2 \ -numa node,nodeid=3,cpus=3 \ -numa dist,src=0,dst=1,val=21 \ -numa dist,src=0,dst=2,val=31 \ -numa dist,src=0,dst=3,val=41 \ -numa dist,src=1,dst=2,val=21 \ -numa dist,src=1,dst=3,val=31 \ -numa dist,src=2,dst=3,val=21 \ ``` Signed-off-by: He Chen Message-Id: <1493260558-20728-1-git-send-email-he.chen@linux.intel.com> Reviewed-by: Igor Mammedov Reviewed-by: Andrew Jones Signed-off-by: Eduardo Habkost --- numa.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index 6fc2393ddd..2b3fc69915 100644 --- a/numa.c +++ b/numa.c @@ -51,6 +51,7 @@ static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one. * For all nodes, nodeid < max_numa_nodeid */ int nb_numa_nodes; +bool have_numa_distance; NodeInfo numa_info[MAX_NODES]; void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node) @@ -140,7 +141,7 @@ uint32_t numa_get_node(ram_addr_t addr, Error **errp) return -1; } -static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) +static void parse_numa_node(NumaNodeOptions *node, QemuOpts *opts, Error **errp) { uint16_t nodenr; uint16List *cpus = NULL; @@ -212,6 +213,43 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); } +static void parse_numa_distance(NumaDistOptions *dist, Error **errp) +{ + uint16_t src = dist->src; + uint16_t dst = dist->dst; + uint8_t val = dist->val; + + if (src >= MAX_NODES || dst >= MAX_NODES) { + error_setg(errp, + "Invalid node %" PRIu16 + ", max possible could be %" PRIu16, + MAX(src, dst), MAX_NODES); + return; + } + + if (!numa_info[src].present || !numa_info[dst].present) { + error_setg(errp, "Source/Destination NUMA node is missing. " + "Please use '-numa node' option to declare it first."); + return; + } + + if (val < NUMA_DISTANCE_MIN) { + error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, " + "it shouldn't be less than %d.", + val, NUMA_DISTANCE_MIN); + return; + } + + if (src == dst && val != NUMA_DISTANCE_MIN) { + error_setg(errp, "Local distance of node %d should be %d.", + src, NUMA_DISTANCE_MIN); + return; + } + + numa_info[src].distance[dst] = val; + have_numa_distance = true; +} + static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) { NumaOptions *object = NULL; @@ -229,12 +267,18 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) switch (object->type) { case NUMA_OPTIONS_TYPE_NODE: - numa_node_parse(&object->u.node, opts, &err); + parse_numa_node(&object->u.node, opts, &err); if (err) { goto end; } nb_numa_nodes++; break; + case NUMA_OPTIONS_TYPE_DIST: + parse_numa_distance(&object->u.dist, &err); + if (err) { + goto end; + } + break; default: abort(); } @@ -294,6 +338,75 @@ static void validate_numa_cpus(void) g_free(seen_cpus); } +/* If all node pair distances are symmetric, then only distances + * in one direction are enough. If there is even one asymmetric + * pair, though, then all distances must be provided. The + * distance from a node to itself is always NUMA_DISTANCE_MIN, + * so providing it is never necessary. + */ +static void validate_numa_distance(void) +{ + int src, dst; + bool is_asymmetrical = false; + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = src; dst < nb_numa_nodes; dst++) { + if (numa_info[src].distance[dst] == 0 && + numa_info[dst].distance[src] == 0) { + if (src != dst) { + error_report("The distance between node %d and %d is " + "missing, at least one distance value " + "between each nodes should be provided.", + src, dst); + exit(EXIT_FAILURE); + } + } + + if (numa_info[src].distance[dst] != 0 && + numa_info[dst].distance[src] != 0 && + numa_info[src].distance[dst] != + numa_info[dst].distance[src]) { + is_asymmetrical = true; + } + } + } + + if (is_asymmetrical) { + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = 0; dst < nb_numa_nodes; dst++) { + if (src != dst && numa_info[src].distance[dst] == 0) { + error_report("At least one asymmetrical pair of " + "distances is given, please provide distances " + "for both directions of all node pairs."); + exit(EXIT_FAILURE); + } + } + } + } +} + +static void complete_init_numa_distance(void) +{ + int src, dst; + + /* Fixup NUMA distance by symmetric policy because if it is an + * asymmetric distance table, it should be a complete table and + * there would not be any missing distance except local node, which + * is verified by validate_numa_distance above. + */ + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = 0; dst < nb_numa_nodes; dst++) { + if (numa_info[src].distance[dst] == 0) { + if (src == dst) { + numa_info[src].distance[dst] = NUMA_DISTANCE_MIN; + } else { + numa_info[src].distance[dst] = numa_info[dst].distance[src]; + } + } + } + } +} + void parse_numa_opts(MachineClass *mc) { int i; @@ -390,6 +503,26 @@ void parse_numa_opts(MachineClass *mc) } validate_numa_cpus(); + + /* QEMU needs at least all unique node pair distances to build + * the whole NUMA distance table. QEMU treats the distance table + * as symmetric by default, i.e. distance A->B == distance B->A. + * Thus, QEMU is able to complete the distance table + * initialization even though only distance A->B is provided and + * distance B->A is not. QEMU knows the distance of a node to + * itself is always 10, so A->A distances may be omitted. When + * the distances of two nodes of a pair differ, i.e. distance + * A->B != distance B->A, then that means the distance table is + * asymmetric. In this case, the distances for both directions + * of all node pairs are required. + */ + if (have_numa_distance) { + /* Validate enough NUMA distance information was provided. */ + validate_numa_distance(); + + /* Validation succeeded, now fill in any missing distances. */ + complete_init_numa_distance(); + } } else { numa_set_mem_node_id(0, ram_size, 0); } -- cgit v1.2.1 From 3bfe57165b4bf86a431099078df422f54598f5c6 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Tue, 2 May 2017 18:29:55 +0200 Subject: numa: equally distribute memory on nodes When there are more nodes than available memory to put the minimum allowed memory by node, all the memory is put on the last node. This is because we put (ram_size / nb_numa_nodes) & ~((1 << mc->numa_mem_align_shift) - 1); on each node, and in this case the value is 0. This is particularly true with pseries, as the memory must be aligned to 256MB. To avoid this problem, this patch uses an error diffusion algorithm [1] to distribute equally the memory on nodes. We introduce numa_auto_assign_ram() function in MachineClass to keep compatibility between machine type versions. The legacy function is used with pseries-2.9, pc-q35-2.9 and pc-i440fx-2.9 (and previous), the new one with all others. Example: qemu-system-ppc64 -S -nographic -nodefaults -monitor stdio -m 1G -smp 8 \ -numa node -numa node -numa node \ -numa node -numa node -numa node Before: (qemu) info numa 6 nodes node 0 cpus: 0 6 node 0 size: 0 MB node 1 cpus: 1 7 node 1 size: 0 MB node 2 cpus: 2 node 2 size: 0 MB node 3 cpus: 3 node 3 size: 0 MB node 4 cpus: 4 node 4 size: 0 MB node 5 cpus: 5 node 5 size: 1024 MB After: (qemu) info numa 6 nodes node 0 cpus: 0 6 node 0 size: 0 MB node 1 cpus: 1 7 node 1 size: 256 MB node 2 cpus: 2 node 2 size: 0 MB node 3 cpus: 3 node 3 size: 256 MB node 4 cpus: 4 node 4 size: 256 MB node 5 cpus: 5 node 5 size: 256 MB [1] https://en.wikipedia.org/wiki/Error_diffusion Signed-off-by: Laurent Vivier Message-Id: <20170502162955.1610-2-lvivier@redhat.com> Reviewed-by: Eduardo Habkost [ehabkost: s/ram_size/size/ at numa_default_auto_assign_ram()] Signed-off-by: Eduardo Habkost --- numa.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index 2b3fc69915..d753687dec 100644 --- a/numa.c +++ b/numa.c @@ -407,6 +407,42 @@ static void complete_init_numa_distance(void) } } +void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size) +{ + int i; + uint64_t usedmem = 0; + + /* Align each node according to the alignment + * requirements of the machine class + */ + + for (i = 0; i < nb_nodes - 1; i++) { + nodes[i].node_mem = (size / nb_nodes) & + ~((1 << mc->numa_mem_align_shift) - 1); + usedmem += nodes[i].node_mem; + } + nodes[i].node_mem = size - usedmem; +} + +void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size) +{ + int i; + uint64_t usedmem = 0, node_mem; + uint64_t granularity = size / nb_nodes; + uint64_t propagate = 0; + + for (i = 0; i < nb_nodes - 1; i++) { + node_mem = (granularity + propagate) & + ~((1 << mc->numa_mem_align_shift) - 1); + propagate = granularity + propagate - node_mem; + nodes[i].node_mem = node_mem; + usedmem += node_mem; + } + nodes[i].node_mem = size - usedmem; +} + void parse_numa_opts(MachineClass *mc) { int i; @@ -449,17 +485,8 @@ void parse_numa_opts(MachineClass *mc) } } if (i == nb_numa_nodes) { - uint64_t usedmem = 0; - - /* Align each node according to the alignment - * requirements of the machine class - */ - for (i = 0; i < nb_numa_nodes - 1; i++) { - numa_info[i].node_mem = (ram_size / nb_numa_nodes) & - ~((1 << mc->numa_mem_align_shift) - 1); - usedmem += numa_info[i].node_mem; - } - numa_info[i].node_mem = ram_size - usedmem; + assert(mc->numa_auto_assign_ram); + mc->numa_auto_assign_ram(mc, numa_info, nb_numa_nodes, ram_size); } numa_total = 0; -- cgit v1.2.1 From ea089eebbd80e61d3c3cd03741dd5d9535c551fc Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:45 +0200 Subject: numa: move source of default CPUs to NUMA node mapping into boards Originally CPU threads were by default assigned in round-robin fashion. However it was causing issues in guest since CPU threads from the same socket/core could be placed on different NUMA nodes. Commit fb43b73b (pc: fix default VCPU to NUMA node mapping) fixed it by grouping threads within a socket on the same node introducing cpu_index_to_socket_id() callback and commit 20bb648d (spapr: Fix default NUMA node allocation for threads) reused callback to fix similar issues for SPAPR machine even though socket doesn't make much sense there. As result QEMU ended up having 3 default distribution rules used by 3 targets /virt-arm, spapr, pc/. In effort of moving NUMA mapping for CPUs into possible_cpus, generalize default mapping in numa.c by making boards decide on default mapping and let them explicitly tell generic numa code to which node a CPU thread belongs to by replacing cpu_index_to_socket_id() with @cpu_index_to_instance_props() which provides default node_id assigned by board to specified cpu_index. Signed-off-by: Igor Mammedov Reviewed-by: Eduardo Habkost Message-Id: <1494415802-227633-2-git-send-email-imammedo@redhat.com> Reviewed-by: David Gibson Signed-off-by: Eduardo Habkost --- numa.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index d753687dec..bcdfca2309 100644 --- a/numa.c +++ b/numa.c @@ -443,9 +443,10 @@ void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, nodes[i].node_mem = size - usedmem; } -void parse_numa_opts(MachineClass *mc) +void parse_numa_opts(MachineState *ms) { int i; + MachineClass *mc = MACHINE_GET_CLASS(ms); for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_cpu = bitmap_new(max_cpus); @@ -511,21 +512,18 @@ void parse_numa_opts(MachineClass *mc) break; } } - /* Historically VCPUs were assigned in round-robin order to NUMA - * nodes. However it causes issues with guest not handling it nice - * in case where cores/threads from a multicore CPU appear on - * different nodes. So allow boards to override default distribution - * rule grouping VCPUs by socket so that VCPUs from the same socket - * would be on the same node. - */ + + /* assign CPUs to nodes using board provided default mapping */ + if (!mc->cpu_index_to_instance_props) { + error_report("default CPUs to NUMA node mapping isn't supported"); + exit(1); + } if (i == nb_numa_nodes) { for (i = 0; i < max_cpus; i++) { - unsigned node_id = i % nb_numa_nodes; - if (mc->cpu_index_to_socket_id) { - node_id = mc->cpu_index_to_socket_id(i) % nb_numa_nodes; - } + CpuInstanceProperties props; + props = mc->cpu_index_to_instance_props(ms, i); - set_bit(i, numa_info[node_id].node_cpu); + set_bit(i, numa_info[props.node_id].node_cpu); } } -- cgit v1.2.1 From 64c2a8f6d3facc2f758907c3b95686fe9e999590 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:49 +0200 Subject: numa: add check that board supports cpu_index to node mapping Default node mapping initialization already checks that board supports cpu_index to node mapping and refuses to start if it's not supported. Do the same for explicitly provided mapping "-numa node,cpus=..." Signed-off-by: Igor Mammedov Reviewed-by: Andrew Jones Reviewed-by: David Gibson Message-Id: <1494415802-227633-6-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- numa.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index bcdfca2309..718248161c 100644 --- a/numa.c +++ b/numa.c @@ -141,10 +141,12 @@ uint32_t numa_get_node(ram_addr_t addr, Error **errp) return -1; } -static void parse_numa_node(NumaNodeOptions *node, QemuOpts *opts, Error **errp) +static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, + QemuOpts *opts, Error **errp) { uint16_t nodenr; uint16List *cpus = NULL; + MachineClass *mc = MACHINE_GET_CLASS(ms); if (node->has_nodeid) { nodenr = node->nodeid; @@ -163,6 +165,10 @@ static void parse_numa_node(NumaNodeOptions *node, QemuOpts *opts, Error **errp) return; } + if (!mc->cpu_index_to_instance_props) { + error_report("NUMA is not supported by this machine-type"); + exit(1); + } for (cpus = node->cpus; cpus; cpus = cpus->next) { if (cpus->value >= max_cpus) { error_setg(errp, @@ -253,6 +259,7 @@ static void parse_numa_distance(NumaDistOptions *dist, Error **errp) static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) { NumaOptions *object = NULL; + MachineState *ms = opaque; Error *err = NULL; { @@ -267,7 +274,7 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) switch (object->type) { case NUMA_OPTIONS_TYPE_NODE: - parse_numa_node(&object->u.node, opts, &err); + parse_numa_node(ms, &object->u.node, opts, &err); if (err) { goto end; } @@ -452,7 +459,7 @@ void parse_numa_opts(MachineState *ms) numa_info[i].node_cpu = bitmap_new(max_cpus); } - if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, NULL, NULL)) { + if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, NULL)) { exit(1); } -- cgit v1.2.1 From 7c88e65d9e9ff7df7fa9cff1869d64a0eaac63a1 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:50 +0200 Subject: numa: mirror cpu to node mapping in MachineState::possible_cpus Introduce machine_set_cpu_numa_node() helper that stores node mapping for CPU in MachineState::possible_cpus. CPU and node it belongs to is specified by 'props' argument. Patch doesn't remove old way of storing mapping in numa_info[X].node_cpu as removing it at the same time makes patch rather big. Instead it just mirrors mapping in possible_cpus and follow up per target patches will switch to possible_cpus and numa_info[X].node_cpu will be removed once there isn't any users left. Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Reviewed-by: Andrew Jones Message-Id: <1494415802-227633-7-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- numa.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'numa.c') diff --git a/numa.c b/numa.c index 718248161c..7db5dde873 100644 --- a/numa.c +++ b/numa.c @@ -170,6 +170,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, exit(1); } for (cpus = node->cpus; cpus; cpus = cpus->next) { + CpuInstanceProperties props; if (cpus->value >= max_cpus) { error_setg(errp, "CPU index (%" PRIu16 ")" @@ -178,6 +179,10 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, return; } bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); + props = mc->cpu_index_to_instance_props(ms, cpus->value); + props.node_id = nodenr; + props.has_node_id = true; + machine_set_cpu_numa_node(ms, &props, &error_fatal); } if (node->has_mem && node->has_memdev) { @@ -528,9 +533,12 @@ void parse_numa_opts(MachineState *ms) if (i == nb_numa_nodes) { for (i = 0; i < max_cpus; i++) { CpuInstanceProperties props; + /* fetch default mapping from board and enable it */ props = mc->cpu_index_to_instance_props(ms, i); + props.has_node_id = true; set_bit(i, numa_info[props.node_id].node_cpu); + machine_set_cpu_numa_node(ms, &props, &error_fatal); } } -- cgit v1.2.1 From af9b20e8d21cb692e9411963a532b2486f2a1e65 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:51 +0200 Subject: numa: do default mapping based on possible_cpus instead of node_cpu bitmaps Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Reviewed-by: Andrew Jones Message-Id: <1494415802-227633-8-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- numa.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index 7db5dde873..c89fc2d4a5 100644 --- a/numa.c +++ b/numa.c @@ -458,6 +458,7 @@ void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, void parse_numa_opts(MachineState *ms) { int i; + const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(ms); for (i = 0; i < MAX_NODES; i++) { @@ -519,18 +520,21 @@ void parse_numa_opts(MachineState *ms) numa_set_mem_ranges(); - for (i = 0; i < nb_numa_nodes; i++) { - if (!bitmap_empty(numa_info[i].node_cpu, max_cpus)) { - break; - } - } - /* assign CPUs to nodes using board provided default mapping */ - if (!mc->cpu_index_to_instance_props) { + if (!mc->cpu_index_to_instance_props || !mc->possible_cpu_arch_ids) { error_report("default CPUs to NUMA node mapping isn't supported"); exit(1); } - if (i == nb_numa_nodes) { + + possible_cpus = mc->possible_cpu_arch_ids(ms); + for (i = 0; i < possible_cpus->len; i++) { + if (possible_cpus->cpus[i].props.has_node_id) { + break; + } + } + + /* no CPUs are assigned to NUMA nodes */ + if (i == possible_cpus->len) { for (i = 0; i < max_cpus; i++) { CpuInstanceProperties props; /* fetch default mapping from board and enable it */ -- cgit v1.2.1 From 6accfb782321de5a15e9c9ff5482a2d830055a8f Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:56 +0200 Subject: tests: numa: add case for QMP command query-cpus Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Message-Id: <1494415802-227633-13-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- numa.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index c89fc2d4a5..f16a6a8ade 100644 --- a/numa.c +++ b/numa.c @@ -737,20 +737,6 @@ MemdevList *qmp_query_memdev(Error **errp) return list; } -int numa_get_node_for_cpu(int idx) -{ - int i; - - assert(idx < max_cpus); - - for (i = 0; i < nb_numa_nodes; i++) { - if (test_bit(idx, numa_info[i].node_cpu)) { - break; - } - } - return i; -} - void ram_block_notifier_add(RAMBlockNotifier *n) { QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next); -- cgit v1.2.1 From 3b8a8557f781c4216744d9ab69bee43b526b0c64 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:57 +0200 Subject: numa: remove no longer need numa_post_machine_init() CPUState::numa_node is still in use but now it's set by board when it creates CPU objects. So there isn't any need to set it again after all CPU's are created, since it's been already set. Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Reviewed-by: Andrew Jones Message-Id: <1494415802-227633-14-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- numa.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index f16a6a8ade..dc739eadfa 100644 --- a/numa.c +++ b/numa.c @@ -572,21 +572,6 @@ void parse_numa_opts(MachineState *ms) } } -void numa_post_machine_init(void) -{ - CPUState *cpu; - int i; - - CPU_FOREACH(cpu) { - for (i = 0; i < nb_numa_nodes; i++) { - assert(cpu->cpu_index < max_cpus); - if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { - cpu->numa_node = i; - } - } - } -} - static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, const char *name, uint64_t ram_size) -- cgit v1.2.1 From ec78f8114bc4c133fc56fefa7f2af99725e42857 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:59 +0200 Subject: numa: use possible_cpus for not mapped CPUs check and remove corresponding part in numa.c that uses node_cpu bitmaps. Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Reviewed-by: Andrew Jones Message-Id: <1494415802-227633-16-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- numa.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index dc739eadfa..63bff5a54d 100644 --- a/numa.c +++ b/numa.c @@ -337,16 +337,6 @@ static void validate_numa_cpus(void) bitmap_or(seen_cpus, seen_cpus, numa_info[i].node_cpu, max_cpus); } - - if (!bitmap_full(seen_cpus, max_cpus)) { - char *msg; - bitmap_complement(seen_cpus, seen_cpus, max_cpus); - msg = enumerate_cpus(seen_cpus, max_cpus); - error_report("warning: CPU(s) not present in any NUMA nodes: %s", msg); - error_report("warning: All CPU(s) up to maxcpus should be described " - "in NUMA config"); - g_free(msg); - } g_free(seen_cpus); } -- cgit v1.2.1 From 1171ae9a5b132dc631728ff17688d05ed4534181 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:30:00 +0200 Subject: numa: remove node_cpu bitmaps as they are no longer used Postfactum "CPU(s) present in multiple NUMA nodes" check was the last user of node_cpu bitmaps, but it's not need as machine_set_cpu_numa_node() does the similar check at the time mapping is set for cpus (i.e. when -numa cpus= is parsed) and ensures that cpu can be mapped only to one node. Remove duplicate check based on node_cpu bitmaps and since the last user is gone remove node_cpu as well, which completes internal transition from legacy bitmap based mapping storage to possible_cpus storage. Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Reviewed-by: Andrew Jones Message-Id: <1494415802-227633-17-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- numa.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'numa.c') diff --git a/numa.c b/numa.c index 63bff5a54d..ca122ccb13 100644 --- a/numa.c +++ b/numa.c @@ -178,7 +178,6 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, cpus->value, max_cpus); return; } - bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); props = mc->cpu_index_to_instance_props(ms, cpus->value); props.node_id = nodenr; props.has_node_id = true; @@ -305,41 +304,6 @@ end: return 0; } -static char *enumerate_cpus(unsigned long *cpus, int max_cpus) -{ - int cpu; - bool first = true; - GString *s = g_string_new(NULL); - - for (cpu = find_first_bit(cpus, max_cpus); - cpu < max_cpus; - cpu = find_next_bit(cpus, max_cpus, cpu + 1)) { - g_string_append_printf(s, "%s%d", first ? "" : " ", cpu); - first = false; - } - return g_string_free(s, FALSE); -} - -static void validate_numa_cpus(void) -{ - int i; - unsigned long *seen_cpus = bitmap_new(max_cpus); - - for (i = 0; i < nb_numa_nodes; i++) { - if (bitmap_intersects(seen_cpus, numa_info[i].node_cpu, max_cpus)) { - bitmap_and(seen_cpus, seen_cpus, - numa_info[i].node_cpu, max_cpus); - error_report("CPU(s) present in multiple NUMA nodes: %s", - enumerate_cpus(seen_cpus, max_cpus)); - g_free(seen_cpus); - exit(EXIT_FAILURE); - } - bitmap_or(seen_cpus, seen_cpus, - numa_info[i].node_cpu, max_cpus); - } - g_free(seen_cpus); -} - /* If all node pair distances are symmetric, then only distances * in one direction are enough. If there is even one asymmetric * pair, though, then all distances must be provided. The @@ -451,10 +415,6 @@ void parse_numa_opts(MachineState *ms) const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(ms); - for (i = 0; i < MAX_NODES; i++) { - numa_info[i].node_cpu = bitmap_new(max_cpus); - } - if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, NULL)) { exit(1); } @@ -531,13 +491,10 @@ void parse_numa_opts(MachineState *ms) props = mc->cpu_index_to_instance_props(ms, i); props.has_node_id = true; - set_bit(i, numa_info[props.node_id].node_cpu); machine_set_cpu_numa_node(ms, &props, &error_fatal); } } - validate_numa_cpus(); - /* QEMU needs at least all unique node pair distances to build * the whole NUMA distance table. QEMU treats the distance table * as symmetric by default, i.e. distance A->B == distance B->A. -- cgit v1.2.1 From 419fcdec3c1ff545cd33d90ade99236c9bcc37cc Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:30:01 +0200 Subject: numa: add '-numa cpu,...' option for property based node mapping legacy cpu to node mapping is using cpu index values to map VCPU to node with help of '-numa node,nodeid=node,cpus=x[-y]' option. However cpu index is internal concept and QEMU users have to guess /reimplement qemu's logic/ to map it to a concrete cpu socket/core/thread to make sane CPUs placement across numa nodes. This patch allows to map cpu objects to numa nodes using the same properties as used for cpus with -device/device_add (socket-id/core-id/thread-id/node-id). At present valid properties/values to address CPUs could be fetched using hotpluggable-cpus monitor/qmp command, it will require user to start qemu twice when creating domain to fetch possible CPUs for a machine type/-smp layout first and then the second time with numa explicit mapping for actual usage. The first step results could be saved and reused to set/change mapping later as far as machine type/-smp stays the same. Proposed impl. supports exact and wildcard matching to simplify CLI and allow to set mapping for a specific cpu or group of cpu objects specified by matched properties. For example: # exact mapping x86 -numa cpu,node-id=x,socket-id=y,core-id=z,thread-id=n # exact mapping SPAPR -numa cpu,node-id=x,core-id=y # wildcard mapping, all cpu objects that match socket-id=y # are mapped to node-id=x -numa cpu,node-id=x,socket-id=y Signed-off-by: Igor Mammedov Message-Id: <1494415802-227633-18-git-send-email-imammedo@redhat.com> Reviewed-by: David Gibson Signed-off-by: Eduardo Habkost --- numa.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'numa.c') diff --git a/numa.c b/numa.c index ca122ccb13..84ce2af9b4 100644 --- a/numa.c +++ b/numa.c @@ -290,6 +290,21 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) goto end; } break; + case NUMA_OPTIONS_TYPE_CPU: + if (!object->u.cpu.has_node_id) { + error_setg(&err, "Missing mandatory node-id property"); + goto end; + } + if (!numa_info[object->u.cpu.node_id].present) { + error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be " + "defined with -numa node,nodeid=ID before it's used with " + "-numa cpu,node-id=ID", object->u.cpu.node_id); + goto end; + } + + machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu), + &err); + break; default: abort(); } -- cgit v1.2.1