修复一个 kubelet numa 节点设备分配的 bug

May 22, 2021

k8s 在 1.18 以后发布了 topology manager 功能，允许用户根据硬件设备的 numa 拓扑来分配资源。 topology manager 支持多种分配策略，其中一种叫 single numa node policy，这种策略承诺分配给 pod 的硬件都属于同一个 numa 节点。然而我们发现某些场景下，虽然选择的是 single numa node policy，但是 kublet 分配的设备却可能跨越多个 numa 节点。通过排查我们发现这个问题的根源在于 devicemanager 中设备分配的实现算法上，具体问题代码在pkg/kubelet/cm/devicemanager/manager.go:filterByAffinity()

func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, available sets.String) (sets.String, sets.String, sets.String) {
	// If alignment information is not available, just pass the available list back.
	hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
	if !m.deviceHasTopologyAlignment(resource) || hint.NUMANodeAffinity == nil {
		return sets.NewString(), sets.NewString(), available
	}

	// Build a map of NUMA Nodes to the devices associated with them. A
	// device may be associated to multiple NUMA nodes at the same time. If an
	// available device does not have any NUMA Nodes associated with it, add it
	// to a list of NUMA Nodes for the fake NUMANode -1.
	perNodeDevices := make(map[int]sets.String)
	nodeWithoutTopology := -1
	for d := range available {
		if m.allDevices[resource][d].Topology == nil || len(m.allDevices[resource][d].Topology.Nodes) == 0 {
			if _, ok := perNodeDevices[nodeWithoutTopology]; !ok {
				perNodeDevices[nodeWithoutTopology] = sets.NewString()
			}
			perNodeDevices[nodeWithoutTopology].Insert(d)
			continue
		}

		for _, node := range m.allDevices[resource][d].Topology.Nodes {
			if _, ok := perNodeDevices[int(node.ID)]; !ok {
				perNodeDevices[int(node.ID)] = sets.NewString()
			}
			perNodeDevices[int(node.ID)].Insert(d)
		}
	}

	// Get a flat list of all of the nodes associated with available devices.
	var nodes []int
	for node := range perNodeDevices {
		nodes = append(nodes, node)
	}

	// Sort the list of nodes by how many devices they contain.
	sort.Slice(nodes, func(i, j int) bool {
		return perNodeDevices[i].Len() < perNodeDevices[j].Len()
	})

	// Generate three sorted lists of devices. Devices in the first list come
	// from valid NUMA Nodes contained in the affinity mask. Devices in the
	// second list come from valid NUMA Nodes not in the affinity mask. Devices
	// in the third list come from devices with no NUMA Node association (i.e.
	// those mapped to the fake NUMA Node -1). Because we loop through the
	// sorted list of NUMA nodes in order, within each list, devices are sorted
	// by their connection to NUMA Nodes with more devices on them.
	var fromAffinity []string
	var notFromAffinity []string
	var withoutTopology []string
	for d := range available {
		// Since the same device may be associated with multiple NUMA Nodes. We
		// need to be careful not to add each device to multiple lists. The
		// logic below ensures this by breaking after the first NUMA node that
		// has the device is encountered.
		for _, n := range nodes {
			if perNodeDevices[n].Has(d) {
				if n == nodeWithoutTopology {
					withoutTopology = append(withoutTopology, d)
				} else if hint.NUMANodeAffinity.IsSet(n) {
					fromAffinity = append(fromAffinity, d)
				} else {
					notFromAffinity = append(notFromAffinity, d)
				}
				break
			}
		}
	}

	// Return all three lists containing the full set of devices across them.
	return sets.NewString(fromAffinity...), sets.NewString(notFromAffinity...), sets.NewString(withoutTopology...)
}

这个函把当前的可用设备按 numa 亲和性、非 numa 亲和性、无 numa 拓扑结构来划分。它存在的问题是，当一个设备属于多个 numa 节点时（没错，一个物理设备是可以同时属于多个 numa 节点的），它会把 numa 亲和性的设备划分到非 numa 亲和性的集合里。例如，设备 dev1 同时属于 nuam1 和 numa2 节点，设备 dev2 属于 numa1 节点，假设我们现在要求选择 numa2 节点上的设备，这时候如果for _, n := range nodes这条迭代语句中nodes也就是 numa 节点列表的顺序是[numa1, numa2]，那么根据后面的迭代语句中的逻辑，dev1 就会被归类到非亲和性的设备集合里。那为什么nodes中 numa 节点的顺序是怎么确定的呢？我们看nodes列表是怎么构造的：

	// Get a flat list of all of the nodes associated with available devices.
	var nodes []int
	for node := range perNodeDevices {
		nodes = append(nodes, node)
	}

	// Sort the list of nodes by how many devices they contain.
	sort.Slice(nodes, func(i, j int) bool {
		return perNodeDevices[i].Len() < perNodeDevices[j].Len()
	})

先从字典perNodeDevices中构造nodes列表，注意因为perNodeDevices是字典，迭代的顺序无法保证，所以即便是同样的内容也可能产生不一样的nodes列表。然后对nodes进行排序，但是排序的 comparator 显然写错了， return perNodeDevices[i].Len() < perNodeDevices[j].Len()明显应该是return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len()。字典迭代结果的不确定性和排序 comparator 的错误显然让nodes的排序结果彻底变得不确定，这直接影响到后面设备的分类操作。那我们按上面说的修复 comparator 的 bug 就可以了吗？我们先试着理解这个 comparator 的逻辑，numa 节点按上面的连接的设备数量升序排列。在我们所举的例子中，numa2 上关联一个设备，numa1 上关联两个设备，nodes排列顺序是[numa1, numa2], 似乎没有问题。但如果我们再考虑这样一个场景，numa2 上还有 dev3, dev4，这时候nodes的排列顺序又变成[numa1, numa2]， dev2 又会被划到非亲和性设备集里。所以只是简单的根据 numa 节点上关联的设备来排序不足以修复这个问题。要修复这个问题的本质在于 numa 节点的的排序上：如果一个 numa 节点在hint.NUMANodeAffinity集合里，那么他应该比不在这个集合里的 numa 节点排得靠前，具体的 fix 如下：


	// Sort the list of nodes by:
	// 1) Nodes contained in the 'hint's affinity set
	// 2) Nodes not contained in the 'hint's affinity set
	// 3) The fake NUMANode of -1 (assuming it is included in the list)
	// Within each of the groups above, sort the nodes by how many devices they contain
	sort.Slice(nodes, func(i, j int) bool {
		// If one or the other of nodes[i] or nodes[j] is in the 'hint's affinity set
		if hint.NUMANodeAffinity.IsSet(nodes[i]) && hint.NUMANodeAffinity.IsSet(nodes[j]) {
			return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len()
		}
		if hint.NUMANodeAffinity.IsSet(nodes[i]) {
			return true
		}
		if hint.NUMANodeAffinity.IsSet(nodes[j]) {
			return false
		}

		// If one or the other of nodes[i] or nodes[j] is the fake NUMA node -1 (they can't both be)
		if nodes[i] == nodeWithoutTopology {
			return false
		}
		if nodes[j] == nodeWithoutTopology {
			return true
		}

		// Otherwise both nodes[i] and nodes[j] are real NUMA nodes that are not in the 'hint's' affinity list.
		return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len()
	})

目前这个 bug 的相关 fix已经提交给社区并 merge 到主干了。