From 100b01b5762865036d2093e5d2f0851a9f13e9d1 Mon Sep 17 00:00:00 2001 From: Gerrod Ubben Date: Thu, 5 Feb 2026 17:14:43 -0500 Subject: [PATCH 1/2] Add ability to sync from a git repository fixes: #4708 Generated by: claude-opus-4.6-thinking --- CHANGES/pulp_file/4708.feature | 1 + .../app/migrations/0019_add_filegitremote.py | 28 ++ pulp_file/app/models.py | 21 +- pulp_file/app/serializers.py | 19 ++ pulp_file/app/tasks/synchronizing.py | 275 +++++++++++++++++- pulp_file/app/viewsets.py | 90 ++++++ pulp_file/docs/user/guides/_SUMMARY.md | 1 + pulp_file/docs/user/guides/git_sync.md | 132 +++++++++ pulp_file/pytest_plugin.py | 13 + .../tests/functional/api/test_git_sync.py | 116 ++++++++ pulp_file/tests/unit/test_git_sync.py | 144 +++++++++ pyproject.toml | 1 + 12 files changed, 836 insertions(+), 5 deletions(-) create mode 100644 CHANGES/pulp_file/4708.feature create mode 100644 pulp_file/app/migrations/0019_add_filegitremote.py create mode 100644 pulp_file/docs/user/guides/git_sync.md create mode 100644 pulp_file/tests/functional/api/test_git_sync.py create mode 100644 pulp_file/tests/unit/test_git_sync.py diff --git a/CHANGES/pulp_file/4708.feature b/CHANGES/pulp_file/4708.feature new file mode 100644 index 00000000000..11b886e727f --- /dev/null +++ b/CHANGES/pulp_file/4708.feature @@ -0,0 +1 @@ +Added ability to sync a git repository with the new FileGitRemote. 
diff --git a/pulp_file/app/migrations/0019_add_filegitremote.py b/pulp_file/app/migrations/0019_add_filegitremote.py new file mode 100644 index 00000000000..206be69e8a0 --- /dev/null +++ b/pulp_file/app/migrations/0019_add_filegitremote.py @@ -0,0 +1,28 @@ +# Generated by Django 5.2.10 on 2026-02-05 19:22 + +import django.db.models.deletion +import pulpcore.app.models.access_policy +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0145_domainize_import_export'), + ('file', '0018_alter_filecontent_options'), + ] + + operations = [ + migrations.CreateModel( + name='FileGitRemote', + fields=[ + ('remote_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='core.remote')), + ('git_ref', models.TextField(default='HEAD')), + ], + options={ + 'permissions': [('manage_roles_filegitremote', 'Can manage roles on file git remotes')], + 'default_related_name': '%(app_label)s_%(model_name)s', + }, + bases=('core.remote', pulpcore.app.models.access_policy.AutoAddObjPermsMixin), + ), + ] diff --git a/pulp_file/app/models.py b/pulp_file/app/models.py index 3c6cd658a79..646af49bb8b 100644 --- a/pulp_file/app/models.py +++ b/pulp_file/app/models.py @@ -62,6 +62,25 @@ class Meta: ] +class FileGitRemote(Remote, AutoAddObjPermsMixin): + """ + Remote for syncing files from a Git repository (without PULP_MANIFEST). + + The URL should point to a Git repository. The ``git_ref`` field can be used to specify a + branch, tag, or commit to sync from (defaults to ``HEAD``). + """ + + TYPE = "git" + + git_ref = models.TextField(default="HEAD") + + class Meta: + default_related_name = "%(app_label)s_%(model_name)s" + permissions = [ + ("manage_roles_filegitremote", "Can manage roles on file git remotes"), + ] + + class FileRepository(Repository, AutoAddObjPermsMixin): """ The "file" repository type. 
@@ -69,7 +88,7 @@ class FileRepository(Repository, AutoAddObjPermsMixin): TYPE = "file" CONTENT_TYPES = [FileContent] - REMOTE_TYPES = [FileRemote] + REMOTE_TYPES = [FileRemote, FileGitRemote] manifest = models.TextField(default="PULP_MANIFEST", null=True) autopublish = models.BooleanField(default=False) diff --git a/pulp_file/app/serializers.py b/pulp_file/app/serializers.py index 63323f175c5..bb7fb49dbf9 100644 --- a/pulp_file/app/serializers.py +++ b/pulp_file/app/serializers.py @@ -24,6 +24,7 @@ FileAlternateContentSource, FileContent, FileDistribution, + FileGitRemote, FileRemote, FileRepository, FilePublication, @@ -160,6 +161,24 @@ class Meta: model = FileRemote +class FileGitRemoteSerializer(RemoteSerializer): + """ + Serializer for File Git Remotes. + """ + + git_ref = serializers.CharField( + help_text=_("The git ref (branch, tag, or commit hash) to sync from. Defaults to HEAD."), + default="HEAD", + required=False, + ) + + policy = serializers.HiddenField(default=models.Remote.IMMEDIATE) + + class Meta: + fields = RemoteSerializer.Meta.fields + ("git_ref",) + model = FileGitRemote + + class FilePublicationSerializer(PublicationSerializer): """ Serializer for File Publications. 
diff --git a/pulp_file/app/tasks/synchronizing.py b/pulp_file/app/tasks/synchronizing.py index 6d739dd40b6..88ad72f9559 100644 --- a/pulp_file/app/tasks/synchronizing.py +++ b/pulp_file/app/tasks/synchronizing.py @@ -1,23 +1,38 @@ +import hashlib import logging import os +import tempfile from gettext import gettext as _ from urllib.parse import quote, urlparse, urlunparse import aiohttp.client_exceptions +import git as gitpython from django.core.files import File +from pulpcore.app import pulp_hashlib from pulpcore.plugin.exceptions import SyncError from pulpcore.plugin.models import Artifact, ProgressReport, Remote, PublishedMetadata from pulpcore.plugin.serializers import RepositoryVersionSerializer from pulpcore.plugin.stages import ( + ArtifactSaver, DeclarativeArtifact, DeclarativeContent, DeclarativeVersion, + QueryExistingArtifacts, + QueryExistingContents, + ContentSaver, + RemoteArtifactSaver, + ResolveContentFutures, Stage, ) -from pulp_file.app.models import FileContent, FileRemote, FileRepository, FilePublication +from pulp_file.app.models import ( + FileContent, + FileGitRemote, + FileRepository, + FilePublication, +) from pulp_file.manifest import Manifest @@ -43,14 +58,18 @@ def synchronize(remote_pk, repository_pk, mirror, url=None): SyncError: If the remote does not specify a URL to sync. 
""" - remote = FileRemote.objects.get(pk=remote_pk) + remote = Remote.objects.get(pk=remote_pk).cast() repository = FileRepository.objects.get(pk=repository_pk) if not remote.url: raise SyncError(_("A remote must have a url specified to synchronize.")) - first_stage = FileFirstStage(remote, url) - dv = DeclarativeVersion(first_stage, repository, mirror=mirror, acs=True) + if isinstance(remote, FileGitRemote): + first_stage = GitFirstStage(remote) + dv = GitDeclarativeVersion(first_stage, repository, mirror=mirror) + else: + first_stage = FileFirstStage(remote, url) + dv = DeclarativeVersion(first_stage, repository, mirror=mirror, acs=True) rv = dv.create() if rv and mirror: # TODO: this is awful, we really should rewrite the DeclarativeVersion API to @@ -146,3 +165,251 @@ def _get_safe_path(root_dir, entry, scheme): relative_path = entry.relative_path.lstrip("/") path = os.path.join(root_dir, relative_path) return path if scheme == "file" else quote(path, safe=":/") + + +class GitBlobExtractStage(Stage): + """ + A stage that replaces ``ArtifactDownloader`` for Git-based syncing. + + For each ``DeclarativeContent`` whose artifact is unsaved, this stage reads the blob + from the bare git clone, writes it to a temp file, and computes all required digests. + Artifacts that already exist in Pulp (matched by ``QueryExistingArtifacts``) are + skipped. + """ + + async def run(self): + async for batch in self.batches(): + for d_content in batch: + for d_artifact in d_content.d_artifacts: + if not d_artifact.artifact._state.adding: + # Artifact already exists in Pulp, nothing to extract. + continue + + extra = d_artifact.extra_data + clone_dir = extra["git_clone_dir"] + blob_path = extra["git_blob_path"] + git_ref = extra["git_ref"] + + repo = gitpython.Repo(clone_dir) + commit = repo.commit(git_ref) + blob = commit.tree[blob_path] + + # Write blob to temp file and compute all digests in a single pass. 
+ digests = {n: pulp_hashlib.new(n) for n in Artifact.DIGEST_FIELDS} + size = 0 + tmp_file = tempfile.NamedTemporaryFile(dir=".", delete=False) + stream = blob.data_stream + while True: + chunk = stream.read(1048576) # 1 MB + if not chunk: + break + tmp_file.write(chunk) + for hasher in digests.values(): + hasher.update(chunk) + size += len(chunk) + tmp_file.flush() + os.fsync(tmp_file.fileno()) + tmp_file.close() + + artifact_attrs = {"size": size} + for alg, hasher in digests.items(): + artifact_attrs[alg] = hasher.hexdigest() + + d_artifact.artifact = Artifact(file=tmp_file.name, **artifact_attrs) + + await self.put(d_content) + + +class GitDeclarativeVersion(DeclarativeVersion): + """ + A DeclarativeVersion with a custom pipeline for Git-based syncing. + + Replaces ``ArtifactDownloader`` with ``GitBlobExtractStage`` which reads blobs + directly from the bare git clone instead of performing HTTP downloads. + """ + + def pipeline_stages(self, new_version): + return [ + self.first_stage, + QueryExistingArtifacts(), + GitBlobExtractStage(), + ArtifactSaver(), + QueryExistingContents(), + ContentSaver(), + RemoteArtifactSaver(), + ResolveContentFutures(), + ] + + +def _build_clone_env(remote): + """ + Build environment variables for git clone that apply the remote's auth and proxy settings. + + Args: + remote (FileGitRemote): The remote with auth/proxy/TLS configuration. + + Returns: + dict: Environment variables to pass to git commands. 
+ """ + env = os.environ.copy() + + # Proxy configuration + if remote.proxy_url: + proxy_url = remote.proxy_url + if remote.proxy_username and remote.proxy_password: + parsed = urlparse(proxy_url) + proxy_url = urlunparse( + parsed._replace( + netloc=f"{remote.proxy_username}:{remote.proxy_password}@{parsed.hostname}" + + (f":{parsed.port}" if parsed.port else "") + ) + ) + env["http_proxy"] = proxy_url + env["https_proxy"] = proxy_url + + # TLS validation + if not remote.tls_validation: + env["GIT_SSL_NO_VERIFY"] = "true" + + # CA certificate + if remote.ca_cert: + ca_cert_file = tempfile.NamedTemporaryFile(dir=".", suffix=".pem", delete=False, mode="w") + ca_cert_file.write(remote.ca_cert) + ca_cert_file.close() + env["GIT_SSL_CAINFO"] = ca_cert_file.name + + # Client certificate and key + if remote.client_cert: + client_cert_file = tempfile.NamedTemporaryFile( + dir=".", suffix=".pem", delete=False, mode="w" + ) + client_cert_file.write(remote.client_cert) + client_cert_file.close() + env["GIT_SSL_CERT"] = client_cert_file.name + + if remote.client_key: + client_key_file = tempfile.NamedTemporaryFile( + dir=".", suffix=".key", delete=False, mode="w" + ) + client_key_file.write(remote.client_key) + client_key_file.close() + env["GIT_SSL_KEY"] = client_key_file.name + + return env + + +def _build_clone_url(remote): + """ + Build the clone URL, embedding basic auth credentials if present on the remote. + + Args: + remote (FileGitRemote): The remote with URL and optional credentials. + + Returns: + str: The URL to use for git clone. + """ + url = remote.url + if remote.username and remote.password: + parsed = urlparse(url) + if parsed.scheme in ("http", "https"): + url = urlunparse( + parsed._replace( + netloc=f"{remote.username}:{remote.password}@{parsed.hostname}" + + (f":{parsed.port}" if parsed.port else "") + ) + ) + return url + + +class GitFirstStage(Stage): + """ + The first stage of a pulp_file sync pipeline for Git repositories. 
+ + Performs a bare clone of the Git repository, resolves the specified git_ref, and + walks the tree to emit ``DeclarativeContent`` for each blob. Computes sha256 for + each blob so that ``QueryExistingArtifacts`` can match already-known artifacts and + ``FileContent.digest`` is available for content matching. + """ + + def __init__(self, remote): + """ + Args: + remote (FileGitRemote): The git remote data to be used when syncing. + """ + super().__init__() + self.remote = remote + + async def run(self): + """ + Build and emit `DeclarativeContent` from the Git repository tree. + """ + remote = self.remote + git_ref = remote.git_ref or "HEAD" + clone_url = _build_clone_url(remote) + clone_env = _build_clone_env(remote) + + clone_dir = tempfile.mkdtemp(dir=".", prefix="pulp_file_git_") + + async with ProgressReport(message="Cloning Git Repository", code="sync.git.cloning") as pb: + try: + try: + repo = gitpython.Repo.clone_from( + clone_url, + clone_dir, + bare=True, + depth=1, + branch=git_ref, + env=clone_env, + ) + except gitpython.exc.GitCommandError: + # depth/branch fails for commit hashes; retry with full bare clone + repo = gitpython.Repo.clone_from(clone_url, clone_dir, bare=True, env=clone_env) + except gitpython.exc.GitCommandError as e: + raise SyncError( + _("Failed to clone git repository '{url}': {error}").format( + url=remote.url, error=str(e) + ) + ) + await pb.aincrement() + + async with ProgressReport(message="Resolving Git ref", code="sync.git.resolving_ref") as pb: + try: + commit = repo.commit(git_ref) + except Exception as e: + raise SyncError( + _("Could not resolve git ref '{ref}': {error}").format( + ref=git_ref, error=str(e) + ) + ) + await pb.aincrement() + + async with ProgressReport( + message="Parsing Git tree", + code="sync.git.parsing_tree", + ) as pb: + blobs = [item for item in commit.tree.traverse() if item.type == "blob"] + pb.total = len(blobs) + await pb.asave() + + for blob in blobs: + relative_path = blob.path + size = 
blob.size + sha256 = hashlib.sha256(blob.data_stream.read()).hexdigest() + + file_content = FileContent(relative_path=relative_path, digest=sha256) + artifact = Artifact(size=size, sha256=sha256) + da = DeclarativeArtifact( + artifact=artifact, + url=remote.url, + relative_path=relative_path, + remote=remote, + deferred_download=False, + extra_data={ + "git_clone_dir": clone_dir, + "git_blob_path": blob.path, + "git_ref": git_ref, + }, + ) + dc = DeclarativeContent(content=file_content, d_artifacts=[da]) + await pb.aincrement() + await self.put(dc) diff --git a/pulp_file/app/viewsets.py b/pulp_file/app/viewsets.py index cc095c19ba1..cafd91da63a 100644 --- a/pulp_file/app/viewsets.py +++ b/pulp_file/app/viewsets.py @@ -39,6 +39,7 @@ FileAlternateContentSource, FileContent, FileDistribution, + FileGitRemote, FileRemote, FileRepository, FilePublication, @@ -48,6 +49,7 @@ FileContentSerializer, FileContentUploadSerializer, FileDistributionSerializer, + FileGitRemoteSerializer, FileRemoteSerializer, FileRepositorySerializer, FilePublicationSerializer, @@ -204,6 +206,16 @@ class FileRepositoryViewSet(RepositoryViewSet, ModifyRepositoryActionMixin, Role "has_model_or_domain_or_obj_perms:file.view_filerepository", ], }, + { + "action": ["sync"], + "principal": "authenticated", + "effect": "allow", + "condition": [ + "has_model_or_domain_or_obj_perms:file.sync_filerepository", + "has_remote_param_model_or_domain_or_obj_perms:file.view_filegitremote", + "has_model_or_domain_or_obj_perms:file.view_filerepository", + ], + }, { "action": ["modify"], "principal": "authenticated", @@ -393,6 +405,84 @@ class FileRemoteViewSet(RemoteViewSet, RolesMixin): } +class FileGitRemoteViewSet(RemoteViewSet, RolesMixin): + """ + + FileGitRemote represents a Git repository as an external source of + File Content. + The target url of a FileGitRemote must point to a Git repository. Syncing will perform a + bare clone and extract file metadata from the specified git ref. 
+ """ + + endpoint_name = "git" + queryset = FileGitRemote.objects.all() + serializer_class = FileGitRemoteSerializer + queryset_filtering_required_permission = "file.view_filegitremote" + + DEFAULT_ACCESS_POLICY = { + "statements": [ + { + "action": ["list", "my_permissions"], + "principal": "authenticated", + "effect": "allow", + }, + { + "action": ["create"], + "principal": "authenticated", + "effect": "allow", + "condition": "has_model_or_domain_perms:file.add_filegitremote", + }, + { + "action": ["retrieve"], + "principal": "authenticated", + "effect": "allow", + "condition": "has_model_or_domain_or_obj_perms:file.view_filegitremote", + }, + { + "action": ["update", "partial_update", "set_label", "unset_label"], + "principal": "authenticated", + "effect": "allow", + "condition": [ + "has_model_or_domain_or_obj_perms:file.change_filegitremote", + "has_model_or_domain_or_obj_perms:file.view_filegitremote", + ], + }, + { + "action": ["destroy"], + "principal": "authenticated", + "effect": "allow", + "condition": [ + "has_model_or_domain_or_obj_perms:file.delete_filegitremote", + "has_model_or_domain_or_obj_perms:file.view_filegitremote", + ], + }, + { + "action": ["list_roles", "add_role", "remove_role"], + "principal": "authenticated", + "effect": "allow", + "condition": ["has_model_or_domain_or_obj_perms:file.manage_roles_filegitremote"], + }, + ], + "creation_hooks": [ + { + "function": "add_roles_for_object_creator", + "parameters": {"roles": "file.filegitremote_owner"}, + }, + ], + "queryset_scoping": {"function": "scope_queryset"}, + } + LOCKED_ROLES = { + "file.filegitremote_creator": ["file.add_filegitremote"], + "file.filegitremote_owner": [ + "file.view_filegitremote", + "file.change_filegitremote", + "file.delete_filegitremote", + "file.manage_roles_filegitremote", + ], + "file.filegitremote_viewer": ["file.view_filegitremote"], + } + + class FilePublicationViewSet(PublicationViewSet, RolesMixin): """ diff --git a/pulp_file/docs/user/guides/_SUMMARY.md 
b/pulp_file/docs/user/guides/_SUMMARY.md index e09fb73cff0..6121621b138 100644 --- a/pulp_file/docs/user/guides/_SUMMARY.md +++ b/pulp_file/docs/user/guides/_SUMMARY.md @@ -1,3 +1,4 @@ * [Synchronize a Repository](sync.md) +* [Synchronize from a Git Repository](git_sync.md) * [Publish and Host](publish-host.md) * [Upload Content](upload.md) diff --git a/pulp_file/docs/user/guides/git_sync.md b/pulp_file/docs/user/guides/git_sync.md new file mode 100644 index 00000000000..aabde3b8316 --- /dev/null +++ b/pulp_file/docs/user/guides/git_sync.md @@ -0,0 +1,132 @@ +# Synchronize a git repository + +`pulp_file` can sync the files from a `git` repository with a `FileGitRemote`. + +## Create a repository `foo` + +=== "Create Repository foo" + + ```bash + #!/usr/bin/env bash + export REPO_NAME=$(head /dev/urandom | tr -dc a-z | head -c5) + + echo "Creating a new repository named $REPO_NAME." + pulp file repository create --name $REPO_NAME + + echo "Inspecting repository." + pulp file repository show --name $REPO_NAME + ``` + +=== "Output" + + ```json + { + "pulp_created": "2019-05-16T19:23:55.224096Z", + "pulp_href": "/pulp/api/v3/repositories/file/file/680f18e7-0513-461f-b067-436b03285e4c/", + "latest_version_href": null, + "versions_href": "/pulp/api/v3/repositories/file/file/680f18e7-0513-461f-b067-436b03285e4c/versions/", + "description": "", + "name": "foo" + } + ``` + +## Create a new git remote `bar` + +=== "Create Remote bar" + + ```bash + #!/usr/bin/env bash + export REMOTE_NAME=$(head /dev/urandom | tr -dc a-z | head -c5) + echo "Creating a remote that points to an external git repository" + pulp file remote --type git create --name $REMOTE_NAME \ + --url 'https://github.com/pulp/pulpcore.git' + + echo "Inspecting new Remote." 
+ pulp file remote -t git show --name $REMOTE_NAME + ``` + +=== "Output" + + ```json + { + "name": "bar", + "prn": "prn:file.filegitremote:019c2fd7-c89e-7ae7-81ee-92ef2f0dae85", + "proxy_url": null, + "pulp_created": "2026-02-05T22:06:36.702964Z", + "pulp_href": "/pulp/api/v3/remotes/file/git/019c2fd7-c89e-7ae7-81ee-92ef2f0dae85/", + "pulp_labels": {}, + "pulp_last_updated": "2026-02-05T22:06:36.702974Z", + "tls_validation": true, + "url": "https://github.com/pulp/pulpcore.git", + "git_ref": "HEAD" + } + ``` + +## Sync repository `foo` using git remote `bar` + +=== "Sync foo using bar" + + ```bash + #!/usr/bin/env bash + echo "Syncing the repository using the remote." + pulp file repository sync --name $REPO_NAME --remote file:git:$REMOTE_NAME + + echo "Inspecting RepositoryVersion." + pulp file repository version show --repository $REPO_NAME --version 1 + ``` + +=== "Output" + + ```json + { + "pulp_href": "/pulp/api/v3/repositories/file/file/019c2fda-4690-774e-a374-dedab9f2e64a/versions/1/", + "prn": "prn:core.repositoryversion:019c2fda-81e9-7161-914b-8fdeba1b31ac", + "pulp_created": "2026-02-05T22:09:35.211703Z", + "pulp_last_updated": "2026-02-05T22:09:48.661887Z", + "number": 1, + "repository": "/pulp/api/v3/repositories/file/file/019c2fda-4690-774e-a374-dedab9f2e64a/", + "base_version": null, + "content_summary": { + "added": { + "file.file": { + "count": 695, + "href": "/pulp/api/v3/content/file/files/?repository_version_added=/pulp/api/v3/repositories/file/file/019c2fda-4690-774e-a374-dedab9f2e64a/versions/1/" + } + }, + "removed": {}, + "present": { + "file.file": { + "count": 695, + "href": "/pulp/api/v3/content/file/files/?repository_version=/pulp/api/v3/repositories/file/file/019c2fda-4690-774e-a374-dedab9f2e64a/versions/1/" + } + } + }, + "vuln_report": "/pulp/api/v3/vuln_report/?repo_versions=prn:core.repositoryversion:019c2fda-81e9-7161-914b-8fdeba1b31ac" + } + ``` + +## Specify a git ref to sync from + +=== "Update Remote bar" + + ```bash + 
#!/usr/bin/env bash + echo "Update git remote to new git_ref" + pulp file remote -t git update --name $REMOTE_NAME --git-ref "3.102.0" + ``` + +=== "Output" + + ```json + { + "name": "bar", + "prn": "prn:file.filegitremote:019c2fd7-c89e-7ae7-81ee-92ef2f0dae85", + "proxy_url": null, + "pulp_created": "2026-02-05T22:06:36.702964Z", + "pulp_href": "/pulp/api/v3/remotes/file/git/019c2fd7-c89e-7ae7-81ee-92ef2f0dae85/", + "pulp_labels": {}, + "pulp_last_updated": "2026-02-05T22:06:36.702974Z", + "tls_validation": true, + "url": "https://github.com/pulp/pulpcore.git", + "git_ref": "3.102.0" + } diff --git a/pulp_file/pytest_plugin.py b/pulp_file/pytest_plugin.py index bf451a64c66..9fa32d3af47 100644 --- a/pulp_file/pytest_plugin.py +++ b/pulp_file/pytest_plugin.py @@ -306,3 +306,16 @@ def _generate_server_and_remote(*, manifest_path, policy): return server, remote yield _generate_server_and_remote + + +# Git remote fixtures + + +@pytest.fixture +def file_git_remote_factory(file_bindings, gen_object_with_cleanup): + def _file_git_remote_factory(url, git_ref="HEAD", **kwargs): + body = {"name": str(uuid.uuid4()), "url": url, "git_ref": git_ref} + body.update(kwargs) + return gen_object_with_cleanup(file_bindings.RemotesGitApi, body) + + return _file_git_remote_factory diff --git a/pulp_file/tests/functional/api/test_git_sync.py b/pulp_file/tests/functional/api/test_git_sync.py new file mode 100644 index 00000000000..f7e76e7fcad --- /dev/null +++ b/pulp_file/tests/functional/api/test_git_sync.py @@ -0,0 +1,116 @@ +import pytest +import uuid + +from pulpcore.tests.functional.utils import PulpTaskError + +from pulpcore.client.pulp_file import RepositorySyncURL + + +GIT_REMOTE_URL = "https://github.com/pulp/pulp-smash.git" +FILE_COUNT = { + "HEAD": 79, # latest commit + "main": 79, # default branch + "click": 78, # branch + "2016.02.18": 95, # tag + "63651d3": 306, # commit for tag 2018.02.15 +} + + +# CRUD tests + + +@pytest.mark.parallel +def 
test_git_remote_crud(file_bindings, gen_object_with_cleanup, monitor_task): + """Test create, read, update, and delete of a FileGitRemote.""" + # Create + body = {"name": str(uuid.uuid4()), "url": GIT_REMOTE_URL} + remote = gen_object_with_cleanup(file_bindings.RemotesGitApi, body) + assert remote.url == GIT_REMOTE_URL + assert remote.git_ref == "HEAD" + assert remote.name == body["name"] + + # Read + remote = file_bindings.RemotesGitApi.read(remote.pulp_href) + assert remote.url == GIT_REMOTE_URL + + # Update (partial) + update_response = file_bindings.RemotesGitApi.partial_update( + remote.pulp_href, {"git_ref": "main"} + ) + monitor_task(update_response.task) + remote = file_bindings.RemotesGitApi.read(remote.pulp_href) + assert remote.git_ref == "main" + + # Update (full) + new_body = {"name": str(uuid.uuid4()), "url": GIT_REMOTE_URL, "git_ref": "HEAD"} + update_response = file_bindings.RemotesGitApi.update(remote.pulp_href, new_body) + monitor_task(update_response.task) + remote = file_bindings.RemotesGitApi.read(remote.pulp_href) + assert remote.name == new_body["name"] + assert remote.git_ref == "HEAD" + + +# Sync tests + + +@pytest.mark.parametrize("git_ref", list(FILE_COUNT.keys())) +@pytest.mark.parallel +def test_git_sync(file_bindings, file_repo, file_git_remote_factory, monitor_task, git_ref): + """Test syncing from a public Git repository.""" + remote = file_git_remote_factory(url=GIT_REMOTE_URL, git_ref=git_ref) + + body = RepositorySyncURL(remote=remote.pulp_href) + monitor_task(file_bindings.RepositoriesFileApi.sync(file_repo.pulp_href, body).task) + + file_repo = file_bindings.RepositoriesFileApi.read(file_repo.pulp_href) + assert file_repo.latest_version_href.endswith("/versions/1/") + + version = file_bindings.RepositoriesFileVersionsApi.read(file_repo.latest_version_href) + assert version.content_summary.present["file.file"]["count"] == FILE_COUNT[git_ref] + assert version.content_summary.added["file.file"]["count"] == FILE_COUNT[git_ref] + + 
+@pytest.mark.parallel +def test_git_sync_idempotent(file_bindings, file_repo, file_git_remote_factory, monitor_task): + """Syncing the same Git ref twice should not create a new repository version.""" + remote = file_git_remote_factory(url=GIT_REMOTE_URL, git_ref="main") + + body = RepositorySyncURL(remote=remote.pulp_href) + monitor_task(file_bindings.RepositoriesFileApi.sync(file_repo.pulp_href, body).task) + + file_repo = file_bindings.RepositoriesFileApi.read(file_repo.pulp_href) + assert file_repo.latest_version_href.endswith("/versions/1/") + + first_version = file_bindings.RepositoriesFileVersionsApi.read(file_repo.latest_version_href) + first_count = first_version.content_summary.present["file.file"]["count"] + + # Sync again -- no new version should be created + monitor_task(file_bindings.RepositoriesFileApi.sync(file_repo.pulp_href, body).task) + file_repo = file_bindings.RepositoriesFileApi.read(file_repo.pulp_href) + assert file_repo.latest_version_href.endswith("/versions/1/") + + second_version = file_bindings.RepositoriesFileVersionsApi.read(file_repo.latest_version_href) + assert second_version.content_summary.present["file.file"]["count"] == first_count + + +@pytest.mark.parallel +def test_git_sync_invalid_url(file_bindings, file_repo, file_git_remote_factory, monitor_task): + """Syncing with an invalid Git URL should raise a task error.""" + remote = file_git_remote_factory(url="https://invalid.example.com/no-such-repo.git") + + body = RepositorySyncURL(remote=remote.pulp_href) + with pytest.raises(PulpTaskError) as exc: + monitor_task(file_bindings.RepositoriesFileApi.sync(file_repo.pulp_href, body).task) + assert "Failed to clone git repository" in exc.value.task.error["description"] + + +@pytest.mark.parallel +def test_git_sync_invalid_ref(file_bindings, file_repo, file_git_remote_factory, monitor_task): + """Syncing with a non-existent git ref should raise a task error.""" + remote = file_git_remote_factory(url=GIT_REMOTE_URL, 
git_ref="this-ref-does-not-exist-abc123") + + body = RepositorySyncURL(remote=remote.pulp_href) + with pytest.raises(PulpTaskError) as exc: + monitor_task(file_bindings.RepositoriesFileApi.sync(file_repo.pulp_href, body).task) + error_desc = exc.value.task.error["description"] + assert "Failed to clone" in error_desc or "Could not resolve git ref" in error_desc diff --git a/pulp_file/tests/unit/test_git_sync.py b/pulp_file/tests/unit/test_git_sync.py new file mode 100644 index 00000000000..753aa37221f --- /dev/null +++ b/pulp_file/tests/unit/test_git_sync.py @@ -0,0 +1,144 @@ +"""Unit tests for Git sync helper functions.""" + +import os + +from unittest import mock + +from pulp_file.app.tasks.synchronizing import _build_clone_url, _build_clone_env + + +class TestBuildCloneUrl: + """Tests for _build_clone_url.""" + + @staticmethod + def _mock_remote(url, username=None, password=None): + remote = mock.Mock() + remote.url = url + remote.username = username + remote.password = password + return remote + + def test_url_without_credentials(self): + remote = self._mock_remote("https://github.com/pulp/pulpcore.git") + assert _build_clone_url(remote) == "https://github.com/pulp/pulpcore.git" + + def test_https_with_credentials(self): + remote = self._mock_remote("https://github.com/pulp/pulpcore.git", "user", "p@ssw0rd") + result = _build_clone_url(remote) + assert result == "https://user:p@ssw0rd@github.com/pulp/pulpcore.git" + + def test_http_with_credentials(self): + remote = self._mock_remote("http://git.example.com/repo.git", "user", "pass") + result = _build_clone_url(remote) + assert result == "http://user:pass@git.example.com/repo.git" + + def test_ssh_url_credentials_not_embedded(self): + """SSH URLs should not have credentials embedded.""" + remote = self._mock_remote("git@github.com:pulp/pulpcore.git", "user", "pass") + result = _build_clone_url(remote) + assert result == "git@github.com:pulp/pulpcore.git" + + def test_url_with_port_and_credentials(self): + 
remote = self._mock_remote("https://git.example.com:8443/repo.git", "user", "pass") + result = _build_clone_url(remote) + assert result == "https://user:pass@git.example.com:8443/repo.git" + + def test_username_only_no_embedding(self): + """Only embed credentials when both username AND password are present.""" + remote = self._mock_remote("https://github.com/pulp/pulpcore.git", "user", None) + assert _build_clone_url(remote) == "https://github.com/pulp/pulpcore.git" + + def test_password_only_no_embedding(self): + remote = self._mock_remote("https://github.com/pulp/pulpcore.git", None, "pass") + assert _build_clone_url(remote) == "https://github.com/pulp/pulpcore.git" + + def test_file_url_no_credentials(self): + remote = self._mock_remote("file:///tmp/my-repo.git") + assert _build_clone_url(remote) == "file:///tmp/my-repo.git" + + def test_file_url_credentials_not_embedded(self): + """file:// URLs should not have credentials embedded.""" + remote = self._mock_remote("file:///tmp/my-repo.git", "user", "pass") + assert _build_clone_url(remote) == "file:///tmp/my-repo.git" + + +class TestBuildCloneEnv: + """Tests for _build_clone_env.""" + + @staticmethod + def _mock_remote(**kwargs): + remote = mock.Mock() + remote.proxy_url = kwargs.get("proxy_url", None) + remote.proxy_username = kwargs.get("proxy_username", None) + remote.proxy_password = kwargs.get("proxy_password", None) + remote.tls_validation = kwargs.get("tls_validation", True) + remote.ca_cert = kwargs.get("ca_cert", None) + remote.client_cert = kwargs.get("client_cert", None) + remote.client_key = kwargs.get("client_key", None) + return remote + + def test_defaults_no_extra_env(self): + """A remote with default values should not inject extra env vars.""" + remote = self._mock_remote() + env = _build_clone_env(remote) + assert "GIT_SSL_NO_VERIFY" not in env + assert "GIT_SSL_CAINFO" not in env + assert "GIT_SSL_CERT" not in env + assert "GIT_SSL_KEY" not in env + + def test_proxy_url(self): + remote = 
self._mock_remote(proxy_url="http://proxy.example.com:8080") + env = _build_clone_env(remote) + assert env["http_proxy"] == "http://proxy.example.com:8080" + assert env["https_proxy"] == "http://proxy.example.com:8080" + + def test_proxy_with_auth(self): + remote = self._mock_remote( + proxy_url="http://proxy.example.com:8080", + proxy_username="proxyuser", + proxy_password="proxypass", + ) + env = _build_clone_env(remote) + assert "proxyuser:proxypass@proxy.example.com:8080" in env["http_proxy"] + assert "proxyuser:proxypass@proxy.example.com:8080" in env["https_proxy"] + + def test_tls_validation_disabled(self): + remote = self._mock_remote(tls_validation=False) + env = _build_clone_env(remote) + assert env["GIT_SSL_NO_VERIFY"] == "true" + + def test_tls_validation_enabled(self): + remote = self._mock_remote(tls_validation=True) + env = _build_clone_env(remote) + assert "GIT_SSL_NO_VERIFY" not in env + + def test_ca_cert_written_to_file(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + ca_pem = "-----BEGIN CERTIFICATE-----\nfakecert\n-----END CERTIFICATE-----" + remote = self._mock_remote(ca_cert=ca_pem) + env = _build_clone_env(remote) + assert "GIT_SSL_CAINFO" in env + with open(env["GIT_SSL_CAINFO"]) as f: + assert f.read() == ca_pem + os.unlink(env["GIT_SSL_CAINFO"]) + + def test_client_cert_and_key_written_to_files(self, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + cert_pem = "-----BEGIN CERTIFICATE-----\nfakecert\n-----END CERTIFICATE-----" + key_pem = "-----BEGIN RSA PRIVATE KEY-----\nfakekey\n-----END RSA PRIVATE KEY-----" + remote = self._mock_remote(client_cert=cert_pem, client_key=key_pem) + env = _build_clone_env(remote) + assert "GIT_SSL_CERT" in env + assert "GIT_SSL_KEY" in env + with open(env["GIT_SSL_CERT"]) as f: + assert f.read() == cert_pem + with open(env["GIT_SSL_KEY"]) as f: + assert f.read() == key_pem + os.unlink(env["GIT_SSL_CERT"]) + os.unlink(env["GIT_SSL_KEY"]) + + def test_inherits_existing_env(self): + """The 
returned env should contain the existing process environment.""" + remote = self._mock_remote() + env = _build_clone_env(remote) + assert "PATH" in env diff --git a/pyproject.toml b/pyproject.toml index 06f89bc1d72..d104d136931 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "drf-nested-routers>=0.93.4,<=0.95.0", "drf-spectacular==0.27.2", # We monkeypatch this so we need a very narrow requirement string. "dynaconf>=3.2.5,<3.3.0", + "GitPython>=3.1.40,<3.3", "gunicorn>=22.0,<25.1.0", "jinja2>=3.1,<=3.1.6", "json_stream>=2.3.2,<2.5", From 401d6b27ed28972cbaa270fb5fc2b7a114cddfb9 Mon Sep 17 00:00:00 2001 From: Gerrod Ubben Date: Thu, 5 Feb 2026 23:30:12 -0500 Subject: [PATCH 2/2] Simplify git sync stages --- pulp_file/app/tasks/synchronizing.py | 104 +++-------------------- pulp_file/app/viewsets.py | 13 +-- pulpcore/app/global_access_conditions.py | 8 +- pyproject.toml | 2 +- 4 files changed, 23 insertions(+), 104 deletions(-) diff --git a/pulp_file/app/tasks/synchronizing.py b/pulp_file/app/tasks/synchronizing.py index 88ad72f9559..650bf25a64d 100644 --- a/pulp_file/app/tasks/synchronizing.py +++ b/pulp_file/app/tasks/synchronizing.py @@ -1,6 +1,6 @@ -import hashlib import logging import os +import shutil import tempfile from gettext import gettext as _ @@ -10,20 +10,15 @@ import git as gitpython from django.core.files import File -from pulpcore.app import pulp_hashlib from pulpcore.plugin.exceptions import SyncError from pulpcore.plugin.models import Artifact, ProgressReport, Remote, PublishedMetadata from pulpcore.plugin.serializers import RepositoryVersionSerializer from pulpcore.plugin.stages import ( - ArtifactSaver, + ArtifactDownloader, DeclarativeArtifact, DeclarativeContent, DeclarativeVersion, - QueryExistingArtifacts, - QueryExistingContents, - ContentSaver, RemoteArtifactSaver, - ResolveContentFutures, Stage, ) @@ -66,7 +61,13 @@ def synchronize(remote_pk, repository_pk, mirror, url=None): if isinstance(remote, 
FileGitRemote): first_stage = GitFirstStage(remote) - dv = GitDeclarativeVersion(first_stage, repository, mirror=mirror) + dv = DeclarativeVersion(first_stage, repository, mirror=mirror) + old_pipeline_stages = dv.pipeline_stages + dv.pipeline_stages = lambda new_version: [ + stage + for stage in old_pipeline_stages(new_version) + if not isinstance(stage, (ArtifactDownloader, RemoteArtifactSaver)) + ] else: first_stage = FileFirstStage(remote, url) dv = DeclarativeVersion(first_stage, repository, mirror=mirror, acs=True) @@ -167,80 +168,6 @@ def _get_safe_path(root_dir, entry, scheme): return path if scheme == "file" else quote(path, safe=":/") -class GitBlobExtractStage(Stage): - """ - A stage that replaces ``ArtifactDownloader`` for Git-based syncing. - - For each ``DeclarativeContent`` whose artifact is unsaved, this stage reads the blob - from the bare git clone, writes it to a temp file, and computes all required digests. - Artifacts that already exist in Pulp (matched by ``QueryExistingArtifacts``) are - skipped. - """ - - async def run(self): - async for batch in self.batches(): - for d_content in batch: - for d_artifact in d_content.d_artifacts: - if not d_artifact.artifact._state.adding: - # Artifact already exists in Pulp, nothing to extract. - continue - - extra = d_artifact.extra_data - clone_dir = extra["git_clone_dir"] - blob_path = extra["git_blob_path"] - git_ref = extra["git_ref"] - - repo = gitpython.Repo(clone_dir) - commit = repo.commit(git_ref) - blob = commit.tree[blob_path] - - # Write blob to temp file and compute all digests in a single pass. 
- digests = {n: pulp_hashlib.new(n) for n in Artifact.DIGEST_FIELDS} - size = 0 - tmp_file = tempfile.NamedTemporaryFile(dir=".", delete=False) - stream = blob.data_stream - while True: - chunk = stream.read(1048576) # 1 MB - if not chunk: - break - tmp_file.write(chunk) - for hasher in digests.values(): - hasher.update(chunk) - size += len(chunk) - tmp_file.flush() - os.fsync(tmp_file.fileno()) - tmp_file.close() - - artifact_attrs = {"size": size} - for alg, hasher in digests.items(): - artifact_attrs[alg] = hasher.hexdigest() - - d_artifact.artifact = Artifact(file=tmp_file.name, **artifact_attrs) - - await self.put(d_content) - - -class GitDeclarativeVersion(DeclarativeVersion): - """ - A DeclarativeVersion with a custom pipeline for Git-based syncing. - - Replaces ``ArtifactDownloader`` with ``GitBlobExtractStage`` which reads blobs - directly from the bare git clone instead of performing HTTP downloads. - """ - - def pipeline_stages(self, new_version): - return [ - self.first_stage, - QueryExistingArtifacts(), - GitBlobExtractStage(), - ArtifactSaver(), - QueryExistingContents(), - ContentSaver(), - RemoteArtifactSaver(), - ResolveContentFutures(), - ] - - def _build_clone_env(remote): """ Build environment variables for git clone that apply the remote's auth and proxy settings. @@ -343,6 +270,7 @@ async def run(self): """ Build and emit `DeclarativeContent` from the Git repository tree. 
""" + remote = self.remote git_ref = remote.git_ref or "HEAD" clone_url = _build_clone_url(remote) @@ -394,21 +322,17 @@ async def run(self): for blob in blobs: relative_path = blob.path size = blob.size - sha256 = hashlib.sha256(blob.data_stream.read()).hexdigest() + with tempfile.NamedTemporaryFile(dir=".", delete=False, mode="wb") as file: + shutil.copyfileobj(blob.data_stream, file) - file_content = FileContent(relative_path=relative_path, digest=sha256) - artifact = Artifact(size=size, sha256=sha256) + artifact = Artifact.init_and_validate(file.name, expected_size=size) + file_content = FileContent(relative_path=relative_path, digest=artifact.sha256) da = DeclarativeArtifact( artifact=artifact, url=remote.url, relative_path=relative_path, remote=remote, deferred_download=False, - extra_data={ - "git_clone_dir": clone_dir, - "git_blob_path": blob.path, - "git_ref": git_ref, - }, ) dc = DeclarativeContent(content=file_content, d_artifacts=[da]) await pb.aincrement() diff --git a/pulp_file/app/viewsets.py b/pulp_file/app/viewsets.py index cafd91da63a..5dd36d1632a 100644 --- a/pulp_file/app/viewsets.py +++ b/pulp_file/app/viewsets.py @@ -202,17 +202,8 @@ class FileRepositoryViewSet(RepositoryViewSet, ModifyRepositoryActionMixin, Role "effect": "allow", "condition": [ "has_model_or_domain_or_obj_perms:file.sync_filerepository", - "has_remote_param_model_or_domain_or_obj_perms:file.view_fileremote", - "has_model_or_domain_or_obj_perms:file.view_filerepository", - ], - }, - { - "action": ["sync"], - "principal": "authenticated", - "effect": "allow", - "condition": [ - "has_model_or_domain_or_obj_perms:file.sync_filerepository", - "has_remote_param_model_or_domain_or_obj_perms:file.view_filegitremote", + "has_remote_param_model_or_domain_or_obj_perms:file.view_fileremote," + "file.view_filegitremote", "has_model_or_domain_or_obj_perms:file.view_filerepository", ], }, diff --git a/pulpcore/app/global_access_conditions.py b/pulpcore/app/global_access_conditions.py index 
8123b5ad4f0..a674ce28dec 100644 --- a/pulpcore/app/global_access_conditions.py +++ b/pulpcore/app/global_access_conditions.py @@ -172,7 +172,8 @@ def has_remote_param_obj_perms(request, view, action, permission): authorization. action (str): The action being performed, e.g. "destroy". permission (str): The name of the Permission to be checked. In the form - `app_label.codename`, e.g. "core.delete_task". + `app_label.codename`, e.g. "core.delete_task". Can contain multiple permissions + separated by commas (without spaces); holding any one of them is sufficient. Returns: True if the user has the Permission named by the ``permission`` argument on the ``remote`` @@ -188,7 +189,10 @@ def has_remote_param_obj_perms(request, view, action, permission): serializer = view.serializer_class(instance=obj, data=request.data, context=context, **kwargs) serializer.is_valid(raise_exception=True) if remote := serializer.validated_data.get("remote"): - return request.user.has_perm(permission, remote) + for perm in permission.split(","): + if request.user.has_perm(perm, remote): + return True + return False return True diff --git a/pyproject.toml b/pyproject.toml index d104d136931..726f6ddbc47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ dependencies = [ "drf-nested-routers>=0.93.4,<=0.95.0", "drf-spectacular==0.27.2", # We monkeypatch this so we need a very narrow requirement string. "dynaconf>=3.2.5,<3.3.0", - "GitPython>=3.1.40,<3.3", + "GitPython>=3.1.24,<3.2", "gunicorn>=22.0,<25.1.0", "jinja2>=3.1,<=3.1.6", "json_stream>=2.3.2,<2.5",