Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 66 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@ build-dracpu: ## build dracpu
# clean: remove all build outputs. Guarded so that an unset/empty OUT_DIR can
# never expand the recipe into `rm -rf "/"`.
clean: ## clean
	@test -n "$(strip $(OUT_DIR))" || { echo "OUT_DIR is empty; refusing to clean" >&2; exit 1; }
	rm -rf "$(OUT_DIR)/"

.PHONY: with-kind
# with-kind: run $$CMD against a kind cluster, creating it on demand.
# State machine: created=0 -> cluster absent; created=1 -> we created it
# (deleted by the EXIT trap); created=2 -> pre-existing (left untouched).
# NOTE: the original used `done < <(kind get clusters ...)`; process
# substitution is bash-only and make recipes run under the default /bin/sh,
# so the lookup is done with a portable `grep -qx` pipeline instead.
with-kind: ## run a command with a temporary kind cluster (create if missing, delete on exit if created)
	@if [ -z "$$CMD" ]; then \
		echo "CMD is required. Example: CMD='echo hello' $(MAKE) with-kind"; \
		exit 1; \
	fi; \
	created=0; \
	if kind get clusters 2>/dev/null | grep -qx "$(CLUSTER_NAME)"; then \
		created=2; \
	fi; \
	if [ "$$created" -eq 0 ]; then \
		kind create cluster --name "$(CLUSTER_NAME)" --config hack/kind.yaml || exit 1; \
		created=1; \
	fi; \
	trap 'if [ "$$created" -eq 1 ]; then kind delete cluster --name "$(CLUSTER_NAME)"; fi' EXIT; \
	bash -c "$$CMD"

# Run the unit test suite under the race detector (requires cgo); coverage is
# written to coverage.out, and -count=1 disables the test result cache.
test-unit: ## run tests
	env CGO_ENABLED=1 go test -race -v -count=1 -coverprofile=coverage.out ./pkg/...

Expand Down Expand Up @@ -96,7 +112,16 @@ export DOCKER_CLI_EXPERIMENTAL=enabled
# Build the container image from the repo root and load it straight into the
# local Docker daemon (buildx --load path).
image: ## docker build load
	docker build --load -t ${STAGING_IMAGE_NAME} .

build-image: ## build image
# Build the image only when it is not already present locally; FORCE_BUILD=1
# always delegates to build-image-force regardless of the local image cache.
build-image: ## build image (skip if already exists unless FORCE_BUILD=1)
	@if [ "$(FORCE_BUILD)" != "1" ] && docker image inspect "${IMAGE}" >/dev/null 2>&1; then \
		echo "Image ${IMAGE} already exists; skipping build."; \
	else \
		$(MAKE) build-image-force; \
	fi

build-image-force: ## force build image
docker buildx build . \
--platform="${PLATFORMS}" \
--tag="${IMAGE}" \
Expand All @@ -116,6 +141,9 @@ kind-cluster: ## create kind cluster
kind-load-image: build-image ## load the current container image into kind
kind load docker-image ${IMAGE} ${IMAGE_LATEST} --name ${CLUSTER_NAME}

# Push the locally built e2e test image (${IMAGE_TEST}) into the kind cluster's
# node image stores so test pods can pull it without a registry.
kind-load-test-image: build-test-image ## load the test image into kind
	kind load docker-image ${IMAGE_TEST} --name ${CLUSTER_NAME}

# Best-effort teardown: `|| true` keeps this target from failing when the
# manifests are already gone (or were never installed).
kind-uninstall-cpu-dra: ## remove cpu dra from kind cluster
	kubectl delete -f install.yaml || true

Expand Down Expand Up @@ -162,7 +190,16 @@ ifneq ($(DRACPU_E2E_VERBOSE),)
endif
$(call kind_setup,$(CI_MANIFEST_FILE))

build-test-image: ## build tests image
# Build the e2e test image only when it is not already present locally;
# FORCE_BUILD=1 always delegates to build-test-image-force.
build-test-image: ## build tests image (skip if already exists unless FORCE_BUILD=1)
	@if [ "$(FORCE_BUILD)" != "1" ] && docker image inspect "${IMAGE_TEST}" >/dev/null 2>&1; then \
		echo "Image ${IMAGE_TEST} already exists; skipping build."; \
	else \
		$(MAKE) build-test-image-force; \
	fi

build-test-image-force: ## force build tests image
docker buildx build . \
--file test/image/Dockerfile \
--platform="${PLATFORMS}" \
Expand All @@ -175,11 +212,36 @@ build-test-dracputester: ## build helper to serve as entry point and report cpu
# Compile the dracpuinfo test helper into $(OUT_DIR); it reports host hardware
# information in the internal dracpu JSON format for the e2e discovery pods.
build-test-dracpuinfo: ## build helper to expose hardware info in the internal dracpu format
	go build -v -o "$(OUT_DIR)/dracpuinfo" ./test/image/dracpuinfo

test-e2e: ## run e2e test against an existing configured cluster
env DRACPU_E2E_TEST_IMAGE=$(IMAGE_TEST) DRACPU_E2E_RESERVED_CPUS=$(DRACPU_E2E_RESERVED_CPUS) go test -v ./test/e2e/ --ginkgo.v
# E2E: shared setup and run targets (assume cluster exists; use with-kind for "create if missing, delete if we created").
# $(call e2e_daemonset_patch,<mode>) expands to a JSON patch that replaces the
# dracpu container args, selecting --cpu-device-mode=<mode> and wiring in the
# test run's reserved CPUs ($(DRACPU_E2E_RESERVED_CPUS)).
define e2e_daemonset_patch
[{"op":"replace","path":"/spec/template/spec/containers/0/args","value":["/dracpu","--v=4","--cpu-device-mode=$(1)","--reserved-cpus=$(DRACPU_E2E_RESERVED_CPUS)"]}]
endef

# Shared prerequisite for the test-e2e-*-run targets. Assumes the cluster
# already exists (pair with `with-kind` for create-if-missing semantics):
# labels every non-control-plane node as a worker, loads the test image,
# installs the DRA driver, and waits for the daemonset to roll out.
kind-e2e-setup: ## label workers, load test image, install DRA, wait for rollout (cluster must exist)
	kubectl label nodes -l '!node-role.kubernetes.io/control-plane' node-role.kubernetes.io/worker='' --overwrite
	$(MAKE) kind-load-test-image kind-install-cpu-dra
	kubectl -n kube-system rollout status daemonset/dracpu --timeout=120s

.PHONY: test-e2e-grouped-run
# Patch dracpu into grouped mode, wait for the daemonset rollout to settle,
# then run the e2e suite. The rollout wait matches test-e2e-individual-run;
# without it the tests can race against pods still running the previous mode.
test-e2e-grouped-run: ## patch daemonset to grouped and run e2e (requires kind-e2e-setup)
	kubectl -n kube-system patch daemonset dracpu --type=json -p='$(call e2e_daemonset_patch,grouped)'
	kubectl -n kube-system rollout status daemonset/dracpu --timeout=120s
	env DRACPU_E2E_TEST_IMAGE=$(IMAGE_TEST) DRACPU_E2E_RESERVED_CPUS=$(DRACPU_E2E_RESERVED_CPUS) DRACPU_E2E_CPU_DEVICE_MODE=grouped go test -v ./test/e2e/ --ginkgo.v

# Patch dracpu into individual mode, wait for the daemonset rollout to settle
# (so tests never run against pods still using the previous mode), then run
# the e2e suite with the mode exported to the test binary.
test-e2e-individual-run: ## patch daemonset to individual and run e2e (requires kind-e2e-setup)
	kubectl -n kube-system patch daemonset dracpu --type=json -p='$(call e2e_daemonset_patch,individual)'
	kubectl -n kube-system rollout status daemonset/dracpu --timeout=120s
	env DRACPU_E2E_TEST_IMAGE=$(IMAGE_TEST) DRACPU_E2E_RESERVED_CPUS=$(DRACPU_E2E_RESERVED_CPUS) DRACPU_E2E_CPU_DEVICE_MODE=individual go test -v ./test/e2e/ --ginkgo.v

test-e2e: ## run e2e in both grouped and individual mode (one cluster: create if missing, run both, delete if we created)
Comment on lines +227 to +234
Copy link
Copy Markdown
Contributor

@pravk03 pravk03 Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes overwrite the custom CI manifests deployed in CI workflows with `make ci-kind-setup`. Not sure if we would want that @ffromani

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest we move the MakeFile improvements to a separate PR to give more time to iterate on after 0.1 release is cut. We can limit this PR to bug fixes and improvements in tests.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. Will do tomorrow.

CMD='$(MAKE) kind-e2e-setup; $(MAKE) test-e2e-grouped-run; r1=$$?; $(MAKE) test-e2e-individual-run; r2=$$?; exit $$((r1|r2))' $(MAKE) with-kind

# Convenience wrapper: build a purpose-built kind cluster via ci-kind-setup,
# then run the full e2e flow (both prerequisites are defined elsewhere in this file).
test-e2e-kind: ci-kind-setup test-e2e ## run e2e test against a purpose-built kind cluster

# Standalone grouped-mode run: with-kind creates the cluster if missing and
# deletes it afterwards only if this invocation created it.
test-e2e-grouped-mode: ## run e2e tests in grouped mode (with-kind)
	CMD='$(MAKE) kind-e2e-setup test-e2e-grouped-run' $(MAKE) with-kind

# Standalone individual-mode run: with-kind creates the cluster if missing and
# deletes it afterwards only if this invocation created it.
test-e2e-individual-mode: ## run e2e tests in individual mode (with-kind)
	CMD='$(MAKE) kind-e2e-setup test-e2e-individual-run' $(MAKE) with-kind

# Static analysis over the whole module.
# NOTE(review): GOLANGCI_LINT is defined elsewhere in the Makefile — presumably
# a pinned golangci-lint binary; confirm against the tools section.
lint: ## run the linter against the codebase
	$(GOLANGCI_LINT) run ./...

Expand Down
146 changes: 102 additions & 44 deletions test/e2e/cpu_assignment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,14 @@ Please note "Serial" is however unavoidable because we manage the shared node st
*/
var _ = ginkgo.Describe("CPU Allocation", ginkgo.Serial, ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
var (
rootFxt *fixture.Fixture
targetNode *v1.Node
targetNodeCPUInfo discovery.DRACPUInfo
availableCPUs cpuset.CPUSet
dracpuTesterImage string
reservedCPUs cpuset.CPUSet
cpuDeviceMode string
rootFxt *fixture.Fixture
targetNode *v1.Node
targetNodeCPUInfo discovery.DRACPUInfo
availableCPUs cpuset.CPUSet
availableCPUsByNode map[string]cpuset.CPUSet
dracpuTesterImage string
reservedCPUs cpuset.CPUSet
cpuDeviceMode string
)

ginkgo.BeforeAll(func(ctx context.Context) {
Expand Down Expand Up @@ -106,38 +107,50 @@ var _ = ginkgo.Describe("CPU Allocation", ginkgo.Serial, ginkgo.Ordered, ginkgo.
gomega.Expect(dsReservedCPUs.Equals(reservedCPUs)).To(gomega.BeTrue(), "daemonset reserved cpus %v do not match test reserved cpus %v", dsReservedCPUs.String(), reservedCPUs.String())
rootFxt.Log.Info("daemonset --cpu-device-mode configuration", "mode", cpuDeviceMode)

var workerNodes []*v1.Node
if targetNodeName := os.Getenv("DRACPU_E2E_TARGET_NODE"); len(targetNodeName) > 0 {
targetNode, err = rootFxt.K8SClientset.CoreV1().Nodes().Get(ctx, targetNodeName, metav1.GetOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot get worker node %q: %v", targetNodeName, err)
workerNodes = []*v1.Node{targetNode}
} else {
gomega.Eventually(func() error {
workerNodes, err := node.FindWorkers(ctx, infraFxt.K8SClientset)
if err != nil {
return err
var findErr error
workerNodes, findErr = node.FindWorkers(ctx, infraFxt.K8SClientset)
if findErr != nil {
return findErr
}
if len(workerNodes) == 0 {
return fmt.Errorf("no worker nodes detected")
}
targetNode = workerNodes[0] // pick random one, this is the simplest random pick
targetNode = workerNodes[0]
return nil
}).WithTimeout(1*time.Minute).WithPolling(5*time.Second).Should(gomega.Succeed(), "failed to find any worker node")
}
rootFxt.Log.Info("using worker node", "nodeName", targetNode.Name)

infoPod := discovery.MakePod(infraFxt.Namespace.Name, dracpuTesterImage)
infoPod = e2epod.PinToNode(infoPod, targetNode.Name)
infoPod, err = e2epod.RunToCompletion(ctx, infraFxt.K8SClientset, infoPod)
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot create discovery pod: %v", err)
data, err := e2epod.GetLogs(infraFxt.K8SClientset, ctx, infoPod.Namespace, infoPod.Name, infoPod.Spec.Containers[0].Name)
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot get logs from discovery pod: %v", err)
gomega.Expect(json.Unmarshal([]byte(data), &targetNodeCPUInfo)).To(gomega.Succeed())

allocatableCPUs := makeCPUSetFromDiscoveredCPUInfo(targetNodeCPUInfo)
availableCPUs = allocatableCPUs.Difference(reservedCPUs)
if reservedCPUs.Size() > 0 {
gomega.Expect(availableCPUs.Intersection(reservedCPUs).Size()).To(gomega.BeZero(), "available cpus %v overlap with reserved cpus %v", availableCPUs.String(), reservedCPUs.String())
availableCPUsByNode = make(map[string]cpuset.CPUSet)
for _, n := range workerNodes {
infoPod := discovery.MakePod(infraFxt.Namespace.Name, dracpuTesterImage)
infoPod.Name = "discovery-pod-" + n.Name
infoPod = e2epod.PinToNode(infoPod, n.Name)
infoPod, err = e2epod.RunToCompletion(ctx, infraFxt.K8SClientset, infoPod)
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot create discovery pod on node %q: %v", n.Name, err)
data, err := e2epod.GetLogs(infraFxt.K8SClientset, ctx, infoPod.Namespace, infoPod.Name, infoPod.Spec.Containers[0].Name)
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot get logs from discovery pod: %v", err)
var nodeCPUInfo discovery.DRACPUInfo
gomega.Expect(json.Unmarshal([]byte(data), &nodeCPUInfo)).To(gomega.Succeed())
if n.Name == targetNode.Name {
targetNodeCPUInfo = nodeCPUInfo
}
allocatable := makeCPUSetFromDiscoveredCPUInfo(nodeCPUInfo)
available := allocatable.Difference(reservedCPUs)
if reservedCPUs.Size() > 0 {
gomega.Expect(available.Intersection(reservedCPUs).Size()).To(gomega.BeZero(), "node %q: available cpus %v overlap reserved %v", n.Name, available.String(), reservedCPUs.String())
}
availableCPUsByNode[n.Name] = available
rootFxt.Log.Info("checking worker node", "nodeName", n.Name, "coreCount", len(nodeCPUInfo.CPUs), "allocatableCPUs", allocatable.String(), "availableCPUs", available.String())
}
rootFxt.Log.Info("checking worker node", "nodeName", infoPod.Spec.NodeName, "coreCount", len(targetNodeCPUInfo.CPUs), "allocatableCPUs", allocatableCPUs.String(), "reservedCPUs", reservedCPUs.String(), "availableCPUs", availableCPUs.String())
availableCPUs = availableCPUsByNode[targetNode.Name]
})

ginkgo.When("setting resource claims", func() {
Expand Down Expand Up @@ -184,7 +197,8 @@ var _ = ginkgo.Describe("CPU Allocation", ginkgo.Serial, ginkgo.Ordered, ginkgo.

fxt.Log.Info("Creating pods requesting exclusive CPUs", "numPods", numPods, "cpusPerClaim", cpusPerClaim)
var exclPods []*v1.Pod
allAllocatedCPUs := cpuset.New()
// CPU IDs are per-node; track allocations per node so we don't treat same IDs on different nodes as overlapping.
allAllocatedCPUsByNode := make(map[string]cpuset.CPUSet)

claimTemplate := resourcev1.ResourceClaimTemplate{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -205,33 +219,77 @@ var _ = ginkgo.Describe("CPU Allocation", ginkgo.Serial, ginkgo.Ordered, ginkgo.

fixture.By("Verifying CPU allocations for each exclusive pod")
for i, pod := range exclPods {
alloc := getTesterPodCPUAllocation(fxt.K8SClientset, ctx, pod)
var alloc CPUAllocation
nodeName := pod.Spec.NodeName
gomega.Eventually(func() error {
alloc = getTesterPodCPUAllocation(fxt.K8SClientset, ctx, pod)
if alloc.CPUAssigned.Size() != cpusPerClaim {
return fmt.Errorf("pod %d: got %d CPUs, want %d", i, alloc.CPUAssigned.Size(), cpusPerClaim)
}
availableOnNode := availableCPUsByNode[nodeName]
if availableOnNode.Size() == 0 {
availableOnNode = availableCPUs
}
if !alloc.CPUAssigned.IsSubsetOf(availableOnNode) {
return fmt.Errorf("pod %d on node %s: CPUs %s outside node available set %s", i, nodeName, alloc.CPUAssigned.String(), availableOnNode.String())
}
allocatedOnNode := allAllocatedCPUsByNode[nodeName]
if allocatedOnNode.Intersection(alloc.CPUAssigned).Size() != 0 {
return fmt.Errorf("pod %d: overlapping CPUs with previous pods on node %s", i, nodeName)
}
return nil
}).WithTimeout(2*time.Minute).WithPolling(5*time.Second).Should(gomega.Succeed(), "exclusive pod %d allocation did not stabilize", i)
fxt.Log.Info("Checking exclusive CPU allocation", "pod", e2epod.Identify(pod), "cpuAllocated", alloc.CPUAssigned.String())
gomega.Expect(alloc.CPUAssigned.Size()).To(gomega.Equal(cpusPerClaim), "Pod %d did not get %d CPUs", i, cpusPerClaim)
gomega.Expect(alloc.CPUAssigned.IsSubsetOf(availableCPUs)).To(gomega.BeTrue(), "Pod %d got CPUs outside available set", i)
gomega.Expect(allAllocatedCPUs.Intersection(alloc.CPUAssigned).Size()).To(gomega.Equal(0), "Pod %d has overlapping CPUs", i)
allAllocatedCPUs = allAllocatedCPUs.Union(alloc.CPUAssigned)
allAllocatedCPUsByNode[nodeName] = allAllocatedCPUsByNode[nodeName].Union(alloc.CPUAssigned)
}
var totalAllocated int
for _, cpus := range allAllocatedCPUsByNode {
totalAllocated += cpus.Size()
}
gomega.Expect(totalAllocated).To(gomega.Equal(numPods * cpusPerClaim))
rootFxt.Log.Info("All exclusive allocation", "byNode", allAllocatedCPUsByNode, "expected Shared CPUs on target", availableCPUs.Difference(allAllocatedCPUsByNode[targetNode.Name]).String())

fixture.By("checking the shared pool does not include anymore the exclusively allocated CPUs on each node with exclusive pods")
var shrPod2 *v1.Pod
var otherNodeSharedPods []*v1.Pod
for nodeName := range allAllocatedCPUsByNode {
availableOnNode := availableCPUsByNode[nodeName]
if availableOnNode.Size() == 0 {
availableOnNode = availableCPUs
}
expectedSharedOnNode := availableOnNode.Difference(allAllocatedCPUsByNode[nodeName])
if nodeName == targetNode.Name {
fixture.By("creating a second best-effort reference pod on target node")
shrPod2 = mustCreateBestEffortPod(ctx, fxt, targetNode.Name, dracpuTesterImage)
verifySharedPoolMatches(ctx, fxt, shrPod2, expectedSharedOnNode)
ginkgo.By("checking the CPU pool of the best-effort pod created before the pods with CPU resource claims")
verifySharedPoolMatches(ctx, fxt, shrPod1, expectedSharedOnNode)
continue
}
fixture.By("creating best-effort pod on node %s to verify shared pool", nodeName)
shrPodOther := mustCreateBestEffortPod(ctx, fxt, nodeName, dracpuTesterImage)
verifySharedPoolMatches(ctx, fxt, shrPodOther, expectedSharedOnNode)
otherNodeSharedPods = append(otherNodeSharedPods, shrPodOther)
}
gomega.Expect(allAllocatedCPUs.Size()).To(gomega.Equal(numPods * cpusPerClaim))
rootFxt.Log.Info("All exclusive allocation", "pod", "exclusive CPUs", allAllocatedCPUs.String(), "expected Shared CPUs", availableCPUs.Difference(allAllocatedCPUs).String())

fixture.By("checking the shared pool does not include anymore the exclusively allocated CPUs")
expectedSharedCPUs := availableCPUs.Difference(allAllocatedCPUs)

fixture.By("creating a second best-effort reference pod")
shrPod2 := mustCreateBestEffortPod(ctx, fxt, targetNode.Name, dracpuTesterImage)
verifySharedPoolMatches(ctx, fxt, shrPod2, expectedSharedCPUs)

ginkgo.By("checking the CPU pool of the best-effort pod created before the pods with CPU resource claims")
verifySharedPoolMatches(ctx, fxt, shrPod1, expectedSharedCPUs)

fixture.By("deleting the pods with exclusive CPUs")
for _, pod := range exclPods {
gomega.Expect(e2epod.DeleteSync(ctx, fxt.K8SClientset, pod)).To(gomega.Succeed(), "cannot delete pod %s", e2epod.Identify(pod))
}

verifySharedPoolMatches(ctx, fxt, shrPod1, availableCPUs)
verifySharedPoolMatches(ctx, fxt, shrPod2, availableCPUs)
if shrPod2 != nil {
verifySharedPoolMatches(ctx, fxt, shrPod2, availableCPUs)
}
for _, shrPodOther := range otherNodeSharedPods {
nodeName := shrPodOther.Spec.NodeName
availableOnNode := availableCPUsByNode[nodeName]
if availableOnNode.Size() == 0 {
availableOnNode = availableCPUs
}
verifySharedPoolMatches(ctx, fxt, shrPodOther, availableOnNode)
gomega.Expect(e2epod.DeleteSync(ctx, fxt.K8SClientset, shrPodOther)).To(gomega.Succeed(), "cannot delete helper pod %s", e2epod.Identify(shrPodOther))
}
})
})
})
Expand All @@ -248,5 +306,5 @@ func verifySharedPoolMatches(ctx context.Context, fxt *fixture.Fixture, sharedPo
return fmt.Errorf("shared CPUs mismatch: expected %v got %v", expectedSharedCPUs.String(), sharedAllocUpdated.CPUAssigned.String())
}
return nil
}).WithTimeout(1*time.Minute).WithPolling(5*time.Second).Should(gomega.Succeed(), "the best-effort tester pod %s does not have access to the exclusively allocated CPUs", e2epod.Identify(sharedPod))
}).WithTimeout(2*time.Minute).WithPolling(5*time.Second).Should(gomega.Succeed(), "the best-effort tester pod %s CPU pool did not match expected %s", e2epod.Identify(sharedPod), expectedSharedCPUs.String())
}
2 changes: 1 addition & 1 deletion test/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func BeFailedToCreate(fxt *fixture.Fixture) types.GomegaMatcher {
return false, nil
}
if cntSt.State.Waiting.Reason != reasonCreateContainerError {
lh.Info("container terminated for different reason", "containerName", cntSt.Name, "reason", cntSt.State.Terminated.Reason)
lh.Info("container waiting for different reason", "containerName", cntSt.Name, "reason", cntSt.State.Waiting.Reason)
return false, nil
}
lh.Info("container creation error", "containerName", cntSt.Name)
Expand Down
8 changes: 4 additions & 4 deletions test/e2e/sharing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ var _ = ginkgo.Describe("Claim sharing", ginkgo.Serial, ginkgo.Ordered, ginkgo.C
gomega.Expect(fxt.Teardown(ctx)).To(gomega.Succeed())
})

ginkgo.It("should fail to run pods which share a claim", func(ctx context.Context) {
ginkgo.It("should fail to run pods which share a claim", ginkgo.SpecTimeout(5*time.Minute), func(ctx context.Context) {
testPod := v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Namespace: fxt.Namespace.Name,
Expand Down Expand Up @@ -196,10 +196,10 @@ var _ = ginkgo.Describe("Claim sharing", ginkgo.Serial, ginkgo.Ordered, ginkgo.C
return nil
}
return pod
}).WithTimeout(time.Minute).WithPolling(2 * time.Second).Should(BeFailedToCreate(fxt))
}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(BeFailedToCreate(fxt))
})

ginkgo.It("should fail to run a pod with multiple containers which share a claim", ginkgo.Label("negative"), func(ctx context.Context) {
ginkgo.It("should fail to run a pod with multiple containers which share a claim", ginkgo.Label("negative"), ginkgo.SpecTimeout(5*time.Minute), func(ctx context.Context) {
testPod := v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Namespace: fxt.Namespace.Name,
Expand Down Expand Up @@ -259,7 +259,7 @@ var _ = ginkgo.Describe("Claim sharing", ginkgo.Serial, ginkgo.Ordered, ginkgo.C
return nil
}
return pod
}).WithTimeout(time.Minute).WithPolling(2 * time.Second).Should(BeFailedToCreate(fxt))
}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(BeFailedToCreate(fxt))
})
})
})
Loading
Loading