From c43e01de28aa107b54dee69e5b3335bd6433bbfb Mon Sep 17 00:00:00 2001 From: Pedro Brochado Date: Tue, 3 Feb 2026 17:03:55 -0300 Subject: [PATCH] Add better error handling for repover duplicate content Added a proper error class for duplicate content handling and some more logging to inform exactly what are the conflicting content. Closes: #7184 --- CHANGES/7184.feature | 2 + pulpcore/exceptions/__init__.py | 1 + pulpcore/exceptions/base.py | 1 + pulpcore/exceptions/validation.py | 18 +++++++ pulpcore/plugin/repo_version_utils.py | 77 ++++++++++++++++++--------- 5 files changed, 75 insertions(+), 24 deletions(-) create mode 100644 CHANGES/7184.feature diff --git a/CHANGES/7184.feature b/CHANGES/7184.feature new file mode 100644 index 00000000000..4bed3ae97c5 --- /dev/null +++ b/CHANGES/7184.feature @@ -0,0 +1,2 @@ +Added better error handling for errors when creating RepositoryVersions. +Now the duplicate content pks are shown in the logs. diff --git a/pulpcore/exceptions/__init__.py b/pulpcore/exceptions/__init__.py index a5343971ab8..4c458e43933 100644 --- a/pulpcore/exceptions/__init__.py +++ b/pulpcore/exceptions/__init__.py @@ -24,5 +24,6 @@ ValidationError, MissingDigestValidationError, UnsupportedDigestValidationError, + DuplicateContentInRepositoryError, ) from .plugin import MissingPlugin diff --git a/pulpcore/exceptions/base.py b/pulpcore/exceptions/base.py index 59a3ee598fc..14592d4f9a7 100644 --- a/pulpcore/exceptions/base.py +++ b/pulpcore/exceptions/base.py @@ -1,3 +1,4 @@ +from __future__ import annotations import http.client from gettext import gettext as _ diff --git a/pulpcore/exceptions/validation.py b/pulpcore/exceptions/validation.py index 953f54037a5..2131e5fce51 100644 --- a/pulpcore/exceptions/validation.py +++ b/pulpcore/exceptions/validation.py @@ -119,3 +119,21 @@ def __init__(self, message=None, verified=None): def __str__(self): return f"[{self.error_code}] {self.message}" + + +class DuplicateContentInRepositoryError(ValidationError): + """ + Raised when duplicate content is detected within a Repository (Version). + """ + + error_code = "PLP0022" + + def __init__(self, duplicate_count: int, correlation_id: str): + self.dup_count = duplicate_count + self.cid = correlation_id + + def __str__(self): + return f"[{self.error_code}] " + _( + "Found {n} duplicate contents in repository version" + "(see the logs (cid={cid}) for details).".format(n=self.dup_count, cid=self.cid) + ) diff --git a/pulpcore/plugin/repo_version_utils.py b/pulpcore/plugin/repo_version_utils.py index f892a9d4bd0..d3bee16f795 100644 --- a/pulpcore/plugin/repo_version_utils.py +++ b/pulpcore/plugin/repo_version_utils.py @@ -6,6 +6,11 @@ from pulpcore.app.files import validate_file_paths from pulpcore.app.models import Content, ContentArtifact from pulpcore.app.util import batch_qs +from pulpcore.exceptions import DuplicateContentInRepositoryError +from collections import defaultdict +from django_guid import get_guid +from typing import NamedTuple +from uuid import UUID _logger = logging.getLogger(__name__) @@ -78,35 +83,59 @@ def validate_duplicate_content(version): Uses repo_key_fields to determine if content is duplicated. Raises: - ValueError: If repo version has duplicate content. + RepositoryVersionCreateError: If repo version has duplicate content. """ - error_messages = [] - + dup_count = 0 + correlation_id = get_guid() for type_obj in version.repository.CONTENT_TYPES: if type_obj.repo_key_fields == (): continue - pulp_type = type_obj.get_pulp_type() - repo_key_fields = type_obj.repo_key_fields - new_content_total = type_obj.objects.filter( - pk__in=version.content.filter(pulp_type=pulp_type) - ).count() - unique_new_content_total = ( - type_obj.objects.filter(pk__in=version.content.filter(pulp_type=pulp_type)) - .distinct(*repo_key_fields) - .count() - ) - - if unique_new_content_total < new_content_total: - error_messages.append( - _( - "More than one {pulp_type} content with the duplicate values for {fields}." - ).format(pulp_type=pulp_type, fields=", ".join(repo_key_fields)) - ) - if error_messages: - raise ValueError( - _("Cannot create repository version. {msg}").format(msg=", ".join(error_messages)) - ) + unique_keys = type_obj.repo_key_fields + content_qs = type_obj.objects.filter(pk__in=version.content.filter(pulp_type=pulp_type)) + dup_count = count_duplicates(content_qs, unique_keys) + if dup_count > 0: + # At this point the task already failed, so we'll pay extra queries + # to collect duplicates and provide more useful logs + for duplicate in collect_duplicates(content_qs, unique_keys): + keyset_value = duplicate.keyset_value + duplicate_pks = duplicate.duplicate_pks + _logger.info(f"Duplicates found: {pulp_type=}; {keyset_value=}; {duplicate_pks=}") + if dup_count > 0: + raise DuplicateContentInRepositoryError(dup_count, correlation_id) + + +class DuplicateEntry(NamedTuple): + keyset_value: tuple[str, ...] + duplicate_pks: list[UUID] + + +def count_duplicates(content_qs, unique_keys: tuple[str]) -> int: + new_content_total = content_qs.count() + unique_new_content_total = content_qs.distinct(*unique_keys).count() + return new_content_total - unique_new_content_total + + +def collect_duplicates(content_qs, unique_keys: tuple[str]) -> list[DuplicateEntry]: + last_keyset = None + last_pk = None + keyset_to_contents = defaultdict(list) + content_qs = content_qs.values_list(*unique_keys, "pk") + for values in content_qs.order_by(*unique_keys).iterator(): + keyset_value = values[:-1] + pk = str(values[-1]) + if keyset_value == last_keyset: + dup_pk_list = keyset_to_contents[keyset_value] + # the previous duplicate didn't know it was a duplicate + if len(dup_pk_list) == 0: + dup_pk_list.append(last_pk) + dup_pk_list.append(pk) + last_keyset = keyset_value + last_pk = pk + duplicate_entries = [] + for keyset_value, pk_list in keyset_to_contents.items(): + duplicate_entries.append(DuplicateEntry(duplicate_pks=pk_list, keyset_value=keyset_value)) + return duplicate_entries def validate_version_paths(version):