From dbbd5b3cd5f531017b32c51b6f37579b10857681 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Fri, 13 Jun 2025 02:08:26 -0700 Subject: [PATCH 01/51] Add support for rosdistro cache to hold the README and CHANGELOG contents --- src/rosdistro/distribution.py | 40 ++++++++++++++++++- src/rosdistro/distribution_cache.py | 10 ++++- src/rosdistro/distribution_cache_generator.py | 20 +++++++++- src/rosdistro/manifest_provider/bitbucket.py | 4 +- src/rosdistro/manifest_provider/cache.py | 34 ++++++++++++---- src/rosdistro/manifest_provider/git.py | 20 +++++----- src/rosdistro/manifest_provider/github.py | 18 +++++---- src/rosdistro/manifest_provider/gitlab.py | 4 +- src/rosdistro/manifest_provider/tar.py | 4 +- 9 files changed, 120 insertions(+), 34 deletions(-) diff --git a/src/rosdistro/distribution.py b/src/rosdistro/distribution.py index 742cc638..e06e9525 100644 --- a/src/rosdistro/distribution.py +++ b/src/rosdistro/distribution.py @@ -58,6 +58,8 @@ def __init__(self, distribution_file, manifest_providers=None, source_manifest_p self._source_manifest_providers = source_manifest_providers self._release_package_xmls = {} + self._release_readmes = {} + self._release_changelogs = {} self._source_repo_package_xmls = {} def __getattr__(self, name): @@ -75,12 +77,48 @@ def get_release_package_xml(self, pkg_name): return None package_xml = None for mp in self._manifest_providers: - package_xml = mp(self._distribution_file.name, repo, pkg_name) + package_xml = mp(self._distribution_file.name, repo, pkg_name, 'package.xml') if package_xml is not None: break self._release_package_xmls[pkg_name] = package_xml return self._release_package_xmls[pkg_name] + def get_release_readme(self, pkg_name): + if pkg_name not in self._release_readmes: + pkg = self._distribution_file.release_packages[pkg_name] + repo_name = pkg.repository_name + repo = self._distribution_file.repositories[repo_name] + if repo.release_repository is None: + return None + repo = repo.release_repository + if 
repo.version is None: + return None + readme = None + for mp in self._manifest_providers: + readme = mp(self._distribution_file.name, repo, pkg_name, filepath='README.md') + if readme is not None: + break + self._release_readmes[pkg_name] = readme + return self._release_readmes[pkg_name] + + def get_release_changelog(self, pkg_name): + if pkg_name not in self._release_changelogs: + pkg = self._distribution_file.release_packages[pkg_name] + repo_name = pkg.repository_name + repo = self._distribution_file.repositories[repo_name] + if repo.release_repository is None: + return None + repo = repo.release_repository + if repo.version is None: + return None + changelog = None + for mp in self._manifest_providers: + changelog = mp(self._distribution_file.name, repo, pkg_name, filepath='CHANGELOG.rst') + if changelog is not None: + break + self._release_changelogs[pkg_name] = changelog + return self._release_changelogs[pkg_name] + def get_source_package_xml(self, pkg_name): repo_name = self._distribution_file.source_packages[pkg_name].repository_name repo_cache = self.get_source_repo_package_xmls(repo_name) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index 42f416a1..c27ee476 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -62,7 +62,9 @@ def __init__(self, name, data=None, distribution_file_data=None): self._distribution_file_data = data['distribution_file'] if data else distribution_file_data self.distribution_file = create_distribution_file(name, self._distribution_file_data) - self.release_package_xmls = data['release_package_xmls'] if data else {} + self.release_package_xmls = data['release_package_xmls'] if data and 'release_packages' in data else {} + self.release_readmes = data['release_readmes'] if data and 'release_readmes' in data else {} + self.release_changelogs = data['release_changelogs'] if data and 'release_changelogs' in data else {} self.source_repo_package_xmls = {} if 
data and 'source_repo_package_xmls' in data: for repo_name, repo_data in data['source_repo_package_xmls'].items(): @@ -76,6 +78,8 @@ def get_data(self): data['name'] = self.distribution_file.name data['distribution_file'] = self._distribution_file_data data['release_package_xmls'] = self.release_package_xmls + data['release_readmes'] = self.release_readmes + data['release_changelogs'] = self.release_changelogs data['source_repo_package_xmls'] = dict([(repo_name, repo_cache.get_data()) for repo_name, repo_cache in self.source_repo_package_xmls.items()]) return data @@ -112,6 +116,8 @@ def update_distribution(self, distribution_file_data): if pkg_name in self.release_package_xmls and self._get_repo_info(dist_file, pkg_name) != self._get_repo_info(self.distribution_file, pkg_name): logger.debug("Dropping release package XML cache for %s" % pkg_name) del self.release_package_xmls[pkg_name] + del self.release_readmes[pkg_name] + del self.release_changelogs[pkg_name] # Remove all source package xmls where the devel branch is pointing to a different commit than # the one we have associated with our cache. This requires calling git ls-remote on all affected repos. 
@@ -172,3 +178,5 @@ def _remove_obsolete_entries(self): if pkg_name not in self.distribution_file.release_packages: print('- REMOVE', pkg_name) del self.release_package_xmls[pkg_name] + del self.release_readmes[pkg_name] + del self.release_changelogs[pkg_name] diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index 8646fe55..f3095596 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -111,6 +111,24 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F if package_xml != old_package_xml: print(" - updated manifest of package '%s' to version '%s'" % (pkg_name, pkg.version)) + old_readme = None + if cache and pkg_name in cache.release_readmes: + old_readme = cache.release_readmes[pkg_name] + readme = dist.get_release_readme(pkg_name) + + if readme != old_readme: + print(" - updated README.md of package '%s'" % (pkg_name)) + + old_changelog = None + if cache and pkg_name in cache.release_changelogs: + old_changelog = cache.release_changelogs[pkg_name] + changelog = dist.get_release_changelog(pkg_name) + + if changelog != old_changelog: + print(" - updated CHANGELOG.rst of package '%s'" % (pkg_name)) + + + if not debug: print('') @@ -187,7 +205,7 @@ def _get_cached_distribution(index, dist_name, preclean=False, ignore_local=Fals # get distribution cache cache = get_distribution_cache(index, dist_name) except Exception as e: - print('- failed to fetch old cache: %s' % e) + print('- failed to fetch old cache: %s' % e.what()) if cache: print('- update cache') diff --git a/src/rosdistro/manifest_provider/bitbucket.py b/src/rosdistro/manifest_provider/bitbucket.py index b514965d..a1364554 100644 --- a/src/rosdistro/manifest_provider/bitbucket.py +++ b/src/rosdistro/manifest_provider/bitbucket.py @@ -49,7 +49,7 @@ BITBUCKET_PASSWORD = os.getenv('BITBUCKET_PASSWORD', None) -def bitbucket_manifest_provider(_dist_name, repo, 
pkg_name): +def bitbucket_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version server, path = repo.get_url_parts() @@ -62,7 +62,7 @@ def bitbucket_manifest_provider(_dist_name, repo, pkg_name): if not repo.has_remote_tag(release_tag): raise RuntimeError('specified tag "%s" is not a git tag' % release_tag) - url = 'https://bitbucket.org/%s/raw/%s/package.xml' % (path, release_tag) + url = 'https://bitbucket.org/%s/raw/%s/%s' % (path, release_tag, filepath) try: logger.debug('Load package.xml file from url "%s"' % url) req = Request(url) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 85de71a8..1bf47496 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -68,18 +68,31 @@ def __init__(self, distribution_cache, manifest_providers=None): self._distribution_cache = distribution_cache self._manifest_providers = manifest_providers - def __call__(self, dist_name, repo, pkg_name): + def __call__(self, dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version - package_xml = self._distribution_cache.release_package_xmls.get(pkg_name, None) - if package_xml: - package_xml = sanitize_xml(package_xml) - self._distribution_cache.release_package_xmls[pkg_name] = package_xml - logger.debug('Loading package.xml for package "%s" from cache' % pkg_name) + if filepath == 'README.md': + package_xml = self._distribution_cache.release_readmes.get(pkg_name, None) + if package_xml: + self._distribution_cache.release_readmes[pkg_name] = package_xml + logger.debug('Loading README.md for package "%s" from cache' % pkg_name) + elif filepath == 'CHANGELOG.rst': + package_xml = self._distribution_cache.release_changelogs.get(pkg_name, None) + if package_xml: + self._distribution_cache.release_changelogs[pkg_name] = package_xml + logger.debug('Loading CHANGELOG.rst for package "%s" from cache' % pkg_name) else: + package_xml = 
self._distribution_cache.release_package_xmls.get(pkg_name, None) + if package_xml: + package_xml = sanitize_xml(package_xml) + self._distribution_cache.release_package_xmls[pkg_name] = package_xml + logger.debug('Loading package.xml for package "%s" from cache' % pkg_name) + if not package_xml: # use manifest providers to lazy load for mp in self._manifest_providers or []: try: - package_xml = sanitize_xml(mp(dist_name, repo, pkg_name)) + package_xml = mp(dist_name, repo, pkg_name, filepath) + if filepath == 'package.xml': + package_xml = sanitize_xml(package_xml) break except Exception as e: # pass and try next manifest provider @@ -87,7 +100,12 @@ def __call__(self, dist_name, repo, pkg_name): if package_xml is None: return None # populate the cache - self._distribution_cache.release_package_xmls[pkg_name] = package_xml + if filepath == 'README.md': + self._distribution_cache.release_readmes[pkg_name] = package_xml + if filepath == 'CHANGELOG.rst': + self._distribution_cache.release_changelogs[pkg_name] = package_xml + else: + self._distribution_cache.release_package_xmls[pkg_name] = package_xml return package_xml diff --git a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index 6e9e5e60..d2931ab1 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -44,41 +44,41 @@ from rosdistro.vcs import Git, ref_is_hash -def git_manifest_provider(_dist_name, repo, pkg_name): +def git_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version try: release_tag = repo.get_release_tag(pkg_name) with _temp_git_clone(repo.url, release_tag) as git_repo_path: - filename = os.path.join(git_repo_path, 'package.xml') + filename = os.path.join(git_repo_path, filepath) if not os.path.exists(filename): - raise RuntimeError('Could not find package.xml in repository "%s"' % repo.url) + raise RuntimeError('Could not find %s in repository "%s"' % (filepath, repo.url)) with 
open(filename, 'r') as f: return f.read() except Exception as e: - raise RuntimeError('Unable to fetch package.xml: %s' % e) + raise RuntimeError('Unable to fetch %s: %s' % (filepath, e)) -def git_source_manifest_provider(repo): +def git_source_manifest_provider(repo, filepath='package.xml'): try: - with _temp_git_clone(repo.url, repo.version) as git_repo_path: + with _temp_git_clone(repo.url, repo.version, '/tmp/rosdistro') as git_repo_path: # Include the git hash in our cache dictionary. git_hash = Git(git_repo_path).command('rev-parse', 'HEAD')['output'] cache = SourceRepositoryCache.from_ref(git_hash) - # Find package.xml files inside the repo. + # Find filepath files inside the repo. for package_path in find_package_paths(git_repo_path): if package_path == '.': package_path = '' - with open(os.path.join(git_repo_path, package_path, 'package.xml'), 'r') as f: + with open(os.path.join(git_repo_path, package_path, filepath), 'r') as f: package_xml = f.read() try: name = parse_package_string(package_xml).name except InvalidPackage: - raise RuntimeError('Unable to parse package.xml file found in %s' % repo.url) + raise RuntimeError('Unable to parse %s file found in %s' % (filepath, repo.url)) cache.add(name, package_path, package_xml) except Exception as e: - raise RuntimeError('Unable to fetch source package.xml files: %s' % e) + raise RuntimeError('Unable to fetch source %s files: %s' % (filepath, e)) return cache diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index 4074e61c..2224cbf7 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -44,11 +44,12 @@ GITHUB_USER = os.getenv('GITHUB_USER', None) GITHUB_PASSWORD = os.getenv('GITHUB_PASSWORD', None) +GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', None) def _get_url_contents(url): return urlopen(url).read().decode('utf-8') -def github_manifest_provider(_dist_name, repo, pkg_name): +def 
github_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version server, path = repo.get_url_parts() if not server.endswith('github.com'): @@ -60,16 +61,16 @@ def github_manifest_provider(_dist_name, repo, pkg_name): if not repo.has_remote_tag(release_tag): raise RuntimeError('specified tag "%s" is not a git tag' % release_tag) - url = 'https://raw.githubusercontent.com/%s/%s/package.xml' % (path, release_tag) + url = 'https://raw.githubusercontent.com/%s/%s/%s' % (path, release_tag, filepath) try: - logger.debug('Load package.xml file from url "%s"' % url) + logger.debug('Load %s file from url "%s"' % (filepath, url)) return _get_url_contents(url) except URLError as e: logger.debug('- failed (%s), trying "%s"' % (e, url)) raise RuntimeError() -def github_source_manifest_provider(repo): +def github_source_manifest_provider(repo, filepath='package.xml'): server, path = repo.get_url_parts() if not server.endswith('github.com'): logger.debug('Skip non-github url "%s"' % repo.url) @@ -77,6 +78,8 @@ def github_source_manifest_provider(repo): tree_url = 'https://api.github.com/repos/%s/git/trees/%s?recursive=1' % (path, repo.version) req = Request(tree_url) + if GITHUB_TOKEN: + req.add_header({"Authorization": f"Bearer {GITHUB_TOKEN}"}) if GITHUB_USER and GITHUB_PASSWORD: logger.debug('- using http basic auth from supplied environment variables.') credential_pair = '%s:%s' % (GITHUB_USER, GITHUB_PASSWORD) @@ -93,9 +96,10 @@ def github_source_manifest_provider(repo): package_xml_paths = set() for obj in tree_json['tree']: - if obj['path'].split('/')[-1] == 'package.xml': + if obj['path'].split('/')[-1] == filepath: package_xml_paths.add(os.path.dirname(obj['path'])) + # TODO(tfoote) This is not correct for non-package.xml # Filter out ones that are inside other packages (eg, part of tests) def package_xml_in_parent(path): if path == '': @@ -112,8 +116,8 @@ def package_xml_in_parent(path): cache = 
SourceRepositoryCache.from_ref(tree_json['sha']) for package_xml_path in package_xml_paths: url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ - (path, cache.ref(), package_xml_path + '/package.xml' if package_xml_path else 'package.xml') - logger.debug('- load package.xml from %s' % url) + (path, cache.ref(), package_xml_path + '/' + filepath if package_xml_path else filepath) + logger.debug('- load %s from %s' % (filepath, url)) package_xml = _get_url_contents(url) name = parse_package_string(package_xml).name cache.add(name, package_xml_path, package_xml) diff --git a/src/rosdistro/manifest_provider/gitlab.py b/src/rosdistro/manifest_provider/gitlab.py index 88450453..b1c874d1 100644 --- a/src/rosdistro/manifest_provider/gitlab.py +++ b/src/rosdistro/manifest_provider/gitlab.py @@ -89,14 +89,14 @@ def _gitlab_paged_api_query(server, path, resource, attrs): url = match.group(1) -def gitlab_manifest_provider(_dist_name, repo, pkg_name): +def gitlab_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version server, path = repo.get_url_parts() if not server.endswith('gitlab.com') and server != ROSDISTRO_GITLAB_SERVER: logger.debug('Skip non-gitlab url "%s"' % repo.url) raise RuntimeError('can not handle non gitlab urls') - resource = 'repository/files/package.xml/raw' + resource = 'repository/files/%s/raw' % filepath attrs = { 'ref': repo.get_release_tag(pkg_name), } diff --git a/src/rosdistro/manifest_provider/tar.py b/src/rosdistro/manifest_provider/tar.py index 8cd038c7..e4382ad1 100644 --- a/src/rosdistro/manifest_provider/tar.py +++ b/src/rosdistro/manifest_provider/tar.py @@ -48,7 +48,7 @@ _TAR_USER = os.getenv('TAR_USER', None) _TAR_PASSWORD = os.getenv('TAR_PASSWORD', None) -def tar_manifest_provider(_dist_name, repo, pkg_name): +def tar_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.type == 'tar' subdir = repo.get_release_tag(pkg_name) @@ -65,7 +65,7 @@ def 
tar_manifest_provider(_dist_name, repo, pkg_name): response = urlopen(request) with tarfile.open(fileobj=io.BytesIO(response.read())) as tar: - package_xml = tar.extractfile(subdir + '/package.xml').read() + package_xml = tar.extractfile(subdir + '/' + filepath).read() return package_xml.decode('utf-8') From 7673ee5ad01772ab35e93e4157f0659be257ec03 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Fri, 13 Jun 2025 04:07:33 -0700 Subject: [PATCH 02/51] get everything working Truncating content to 100 lines to avoid giant file sizes --- src/rosdistro/distribution_cache.py | 2 +- src/rosdistro/distribution_cache_generator.py | 4 ++-- src/rosdistro/manifest_provider/cache.py | 2 +- src/rosdistro/manifest_provider/github.py | 12 ++++++++++-- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index c27ee476..7720549e 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -62,7 +62,7 @@ def __init__(self, name, data=None, distribution_file_data=None): self._distribution_file_data = data['distribution_file'] if data else distribution_file_data self.distribution_file = create_distribution_file(name, self._distribution_file_data) - self.release_package_xmls = data['release_package_xmls'] if data and 'release_packages' in data else {} + self.release_package_xmls = data['release_package_xmls'] if data and 'release_package_xmls' in data else {} self.release_readmes = data['release_readmes'] if data and 'release_readmes' in data else {} self.release_changelogs = data['release_changelogs'] if data and 'release_changelogs' in data else {} self.source_repo_package_xmls = {} diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index f3095596..811c423b 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -125,7 +125,7 @@ def 
generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F changelog = dist.get_release_changelog(pkg_name) if changelog != old_changelog: - print(" - updated CHANGELOG.rst of package '%s'" % (pkg_name)) + print(" - updated CHANGELOG.rst of package '%s'" % (pkg_name)) @@ -205,7 +205,7 @@ def _get_cached_distribution(index, dist_name, preclean=False, ignore_local=Fals # get distribution cache cache = get_distribution_cache(index, dist_name) except Exception as e: - print('- failed to fetch old cache: %s' % e.what()) + print('- failed to fetch old cache: %s' % e) if cache: print('- update cache') diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 1bf47496..2fa8d6e1 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -102,7 +102,7 @@ def __call__(self, dist_name, repo, pkg_name, filepath='package.xml'): # populate the cache if filepath == 'README.md': self._distribution_cache.release_readmes[pkg_name] = package_xml - if filepath == 'CHANGELOG.rst': + elif filepath == 'CHANGELOG.rst': self._distribution_cache.release_changelogs[pkg_name] = package_xml else: self._distribution_cache.release_package_xmls[pkg_name] = package_xml diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index 2224cbf7..e8eb55fe 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -35,6 +35,7 @@ import json import os from urllib.request import urlopen, Request +from urllib.error import HTTPError from urllib.error import URLError from catkin_pkg.package import parse_package_string @@ -64,7 +65,14 @@ def github_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml') url = 'https://raw.githubusercontent.com/%s/%s/%s' % (path, release_tag, filepath) try: logger.debug('Load %s file from url "%s"' % (filepath, url)) - return _get_url_contents(url) + # TODO(tfoote) magic 
number for testing + return '\n'.join( _get_url_contents(url).splitlines()[:100]) + except HTTPError as e: + if e.code == 404: + logger.debug('- File not found (%s), trying "%s"' % (e, url)) + return 'Missing' + logger.debug('- HTTP ERROR (%s), trying "%s"' % (e, url)) + raise e except URLError as e: logger.debug('- failed (%s), trying "%s"' % (e, url)) raise RuntimeError() @@ -79,7 +87,7 @@ def github_source_manifest_provider(repo, filepath='package.xml'): tree_url = 'https://api.github.com/repos/%s/git/trees/%s?recursive=1' % (path, repo.version) req = Request(tree_url) if GITHUB_TOKEN: - req.add_header({"Authorization": f"Bearer {GITHUB_TOKEN}"}) + req.add_header("Authorization", f"Bearer {GITHUB_TOKEN}") if GITHUB_USER and GITHUB_PASSWORD: logger.debug('- using http basic auth from supplied environment variables.') credential_pair = '%s:%s' % (GITHUB_USER, GITHUB_PASSWORD) From c8f02470c68b3e135090b5e4a68e1bbdc7656ce1 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Wed, 18 Jun 2025 01:15:11 -0700 Subject: [PATCH 03/51] add support for caching other content than package.xml --- src/rosdistro/manifest_provider/github.py | 4 ++-- src/rosdistro/source_repository_cache.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index e8eb55fe..74c88c5c 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -104,7 +104,7 @@ def github_source_manifest_provider(repo, filepath='package.xml'): package_xml_paths = set() for obj in tree_json['tree']: - if obj['path'].split('/')[-1] == filepath: + if obj['path'].split('/')[-1] == 'package.xml': # Actually package.xml to find packages instead of filepath package_xml_paths.add(os.path.dirname(obj['path'])) # TODO(tfoote) This is not correct for non-package.xml @@ -128,6 +128,6 @@ def package_xml_in_parent(path): logger.debug('- load %s from %s' % (filepath, url)) 
package_xml = _get_url_contents(url) name = parse_package_string(package_xml).name - cache.add(name, package_xml_path, package_xml) + cache.add(name, package_xml_path, package_xml, filepath) return cache diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index e94b9cc5..e10a0b0f 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -58,11 +58,18 @@ def from_ref(cls, ref): """ return cls({'_ref': ref}) - def add(self, package_name, package_path, package_xml_string): + def add(self, package_name, package_path, payload_string, payload_type='package.xml'): # TODO(tfoote) Breaks rosdistro formatting changing from list to dict """ Add a package to the cache. """ - self._data[package_name] = (package_path, package_xml_string) + if package_name not in self._data: + self._data[package_name] = {} + + # Migration option for old caches + if type(self._data[package_name]) != dict: + self._data[package_name] = {} + self._data[package_name]['package_path'] = package_path + self._data[package_name][payload_type] = payload_string self._package_names.add(package_name) def __iter__(self): @@ -86,8 +93,7 @@ def items(self): to repo root, and package xml string. 
""" for package_name in self._package_names: - package_path, package_xml_string = self._data[package_name] - yield package_name, package_path, package_xml_string + yield package_name, self._data[package_name]['package_path'], self._data[package_name]['package.xml'] def __len__(self): """ From 5051dcb4670e1b952967ceded424de04a2fc6c26 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 00:34:39 -0700 Subject: [PATCH 04/51] Add timestamp of last update TODO maybe remove this as we're already checking the hash anyway --- src/rosdistro/source_repository_cache.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index e10a0b0f..c41fd671 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -31,6 +31,7 @@ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +from datetime import datetime class SourceRepositoryCache(object): """ @@ -71,6 +72,8 @@ def add(self, package_name, package_path, payload_string, payload_type='package. 
self._data[package_name]['package_path'] = package_path self._data[package_name][payload_type] = payload_string self._package_names.add(package_name) + self._data['_last_update_time'] = datetime.now() + def __iter__(self): """ From 69ab352d2393412dda42debf20ebbfa25179ed87 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 00:36:04 -0700 Subject: [PATCH 05/51] Add rate limiting backoff on url fetches --- src/rosdistro/manifest_provider/github.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index 74c88c5c..e9ad4d52 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -34,6 +34,8 @@ import base64 import json import os +import time + from urllib.request import urlopen, Request from urllib.error import HTTPError from urllib.error import URLError @@ -48,6 +50,16 @@ GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', None) def _get_url_contents(url): + backoff = 1 + while backoff < 120: + try: + return urlopen(url).read().decode('utf-8') + except HTTPError as e: + if e.code != 403: + raise e + logger.debug(f'Fetch of {url} failed with 403, assuming rate limit, retrying after period {backoff} seconds.') + time.sleep(backoff) + backoff *= 1.5 return urlopen(url).read().decode('utf-8') def github_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): From ce2be60146007fade49c7dcf38e6264caa1e22c7 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 00:37:36 -0700 Subject: [PATCH 06/51] use filepath in git plugin --- src/rosdistro/manifest_provider/git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index d2931ab1..1acaad81 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -75,7 +75,7 @@ def git_source_manifest_provider(repo, 
filepath='package.xml'): name = parse_package_string(package_xml).name except InvalidPackage: raise RuntimeError('Unable to parse %s file found in %s' % (filepath, repo.url)) - cache.add(name, package_path, package_xml) + cache.add(name, package_path, package_xml, filepath) except Exception as e: raise RuntimeError('Unable to fetch source %s files: %s' % (filepath, e)) From 32efafc2c99f0afa0280cd95d69ea5cfd7747bb4 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 00:38:01 -0700 Subject: [PATCH 07/51] fix clone logic for source packages and add a little debug --- src/rosdistro/manifest_provider/git.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index 1acaad81..65b0fd6f 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -42,6 +42,7 @@ from rosdistro.common import rmtree from rosdistro.source_repository_cache import SourceRepositoryCache from rosdistro.vcs import Git, ref_is_hash +from rosdistro import logger def git_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): @@ -60,7 +61,8 @@ def git_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): def git_source_manifest_provider(repo, filepath='package.xml'): try: - with _temp_git_clone(repo.url, repo.version, '/tmp/rosdistro') as git_repo_path: + with _temp_git_clone(repo.url, repo.version) as git_repo_path: + logger.debug(f'Cloing repository {repo.url} to get source info') # Include the git hash in our cache dictionary. 
git_hash = Git(git_repo_path).command('rev-parse', 'HEAD')['output'] cache = SourceRepositoryCache.from_ref(git_hash) From 667443ea19c30d4741fd7376c94e91d9d0d5fca4 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 00:40:50 -0700 Subject: [PATCH 08/51] add source cache linking logic for README.md and CHANGELOG.rst --- src/rosdistro/manifest_provider/cache.py | 35 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 2fa8d6e1..d857cb53 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -122,7 +122,7 @@ def __call__(self, repo): # Use manifest providers to lazy load for mp in self._source_manifest_providers or []: try: - repo_cache = mp(repo) + repo_cache = mp(repo) # TODO (tfoote) list other files here except Exception as e: # pass and try next manifest provider logger.debug('Skipped "%s()": %s' % (mp.__name__, e)) @@ -136,11 +136,32 @@ def __call__(self, repo): # De-duplicate with the release package XMLs. This will cause the YAML writer # to use references for the common strings, saving a lot of space in the cache file. 
if repo_cache: - for package_name, package_path, package_xml in repo_cache.items(): - package_xml = sanitize_xml(package_xml) - release_package_xml = self._distribution_cache.release_package_xmls.get(package_name, None) - if package_xml == release_package_xml: - package_xml = release_package_xml - repo_cache.add(package_name, package_path, package_xml) + for package_name, pkg_entries in repo_cache._data.items(): + if package_name.startswith('_'): + continue + print(f"pkg_entries {pkg_entries}") + if 'package.xml' in pkg_entries: + package_xml = sanitize_xml(pkg_entries['package.xml']) + release_package_xml = self._distribution_cache.release_package_xmls.get(package_name, None) + if package_xml == release_package_xml: + logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction.') + package_xml = release_package_xml + repo_cache.add(package_name, pkg_entries['package_path'], package_xml) + + if 'CHANGELOG.rst' in pkg_entries: + changelog = pkg_entries['CHANGELOG.rst'] + release_changelog = self._distribution_cache.release_changelogs.get(package_name, None) + if changelog == release_changelog: + logger.debug(f'{package_name} Linking CHANGELOG.rst of source cache entry for compaction.') + changelog = release_changelog + repo_cache.add(package_name, pkg_entries['package_path'], changelog) + + if 'README.md' in pkg_entries: + readme = pkg_entries['README.md'] + release_readme = self._distribution_cache.release_readmes.get(package_name, None) + if readme == release_readme: + logger.debug(f'{package_name} Linking README.md of source cache entry for compaction.') + readme = release_readme + repo_cache.add(package_name, pkg_entries['package_path'], readme) return repo_cache From 4fbaf6d04915eb05e9a76776150d98805f9159a5 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 00:41:50 -0700 Subject: [PATCH 09/51] Add support for changelog and readme to github source provider --- src/rosdistro/manifest_provider/github.py | 17 
++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index e9ad4d52..921b9d0a 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -90,7 +90,7 @@ def github_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml') raise RuntimeError() -def github_source_manifest_provider(repo, filepath='package.xml'): +def github_source_manifest_provider(repo, filepaths=['CHANGELOG.rst', 'README.md']): server, path = repo.get_url_parts() if not server.endswith('github.com'): logger.debug('Skip non-github url "%s"' % repo.url) @@ -135,11 +135,26 @@ def package_xml_in_parent(path): cache = SourceRepositoryCache.from_ref(tree_json['sha']) for package_xml_path in package_xml_paths: + filepath = 'package.xml' url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ (path, cache.ref(), package_xml_path + '/' + filepath if package_xml_path else filepath) logger.debug('- load %s from %s' % (filepath, url)) package_xml = _get_url_contents(url) name = parse_package_string(package_xml).name cache.add(name, package_xml_path, package_xml, filepath) + for filepath in filepaths: + url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ + (path, cache.ref(), package_xml_path + '/' + filepath if package_xml_path else filepath) + logger.debug('- load %s from %s' % (filepath, url)) + try: + contents = _get_url_contents(url) + except HTTPError as e: + if e.code == 404: + logger.debug('- Recording Missing (%s), hit error "%s"' % (url, e)) + contents = 'Missing' + else: + logger.debug('- HTTP ERROR (%s), trying "%s"' % (e, url)) + raise e + cache.add(name, package_xml_path, contents, filepath) return cache From 9998ad21caf072614759645c36cf9bfe5e0edcb8 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 00:57:40 -0700 Subject: [PATCH 10/51] Fixup remove debug --- src/rosdistro/manifest_provider/cache.py | 1 - 1 file 
changed, 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index d857cb53..40fb0ad0 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -139,7 +139,6 @@ def __call__(self, repo): for package_name, pkg_entries in repo_cache._data.items(): if package_name.startswith('_'): continue - print(f"pkg_entries {pkg_entries}") if 'package.xml' in pkg_entries: package_xml = sanitize_xml(pkg_entries['package.xml']) release_package_xml = self._distribution_cache.release_package_xmls.get(package_name, None) From 0a5235ee15bc7905b6b3665e13743cb4877f8f3c Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 23:38:09 -0700 Subject: [PATCH 11/51] add debug instrumentation --- src/rosdistro/distribution_cache.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index 7720549e..b075ded3 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -31,7 +31,9 @@ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +import datetime import sys +import time from . import logger from .distribution_file import create_distribution_file @@ -109,7 +111,7 @@ def update_distribution(self, distribution_file_data): dist_file = create_distribution_file(self.distribution_file.name, self._distribution_file_data) # remove all release package xmls where the package version has changed. - print("- removing invalid release package cache entries.") + print(f"- removing invalid release package cache entries. 
[{len(dist_file.release_packages.keys())}]") for pkg_name in sorted(dist_file.release_packages.keys()): if pkg_name not in self.distribution_file.release_packages: continue @@ -122,7 +124,10 @@ def update_distribution(self, distribution_file_data): # Remove all source package xmls where the devel branch is pointing to a different commit than # the one we have associated with our cache. This requires calling git ls-remote on all affected repos. if self.source_repo_package_xmls: - print("- checking invalid source repo cache entries.") + start_time = time.perf_counter() + dropped_count = 0 + skipped_count = 0 + print(f"- checking invalid source repo cache entries. [{len(self.source_repo_package_xmls.keys())}]") for repo in sorted(self.source_repo_package_xmls.keys()): sys.stdout.write('.') sys.stdout.flush() @@ -135,6 +140,15 @@ def update_distribution(self, distribution_file_data): del self.source_repo_package_xmls[repo] continue + max_update_delta = 24 * 60 * 60 + if '_last_update_time' in self.source_repo_package_xmls[repo]: + now = datetime.datetime.now() + entry_age = (now - self.source_repo_package_xmls[repo]['_last_update_time']).total_seconds() + if entry_age < max_update_delta: + logger.debug(f'Skipping check of {repo} because it was last updated only {entry_age} seconds ago less than {max_update_delta}') + skipped_count += 1 + continue + if ref_is_hash(source_repository.version): source_hash = source_repository.version else: @@ -143,6 +157,7 @@ def update_distribution(self, distribution_file_data): # Error checking remote, or unable to find remote reference. Drop the cache entry. logger.debug("Unable to check hash for branch %s of %s, dropping cache entry." % (source_repository.version, source_repository.url)) del self.source_repo_package_xmls[repo] + dropped_count += 1 continue # Split by line first and take the last line, to squelch any preamble output, for example # a known host key validation notice. 
@@ -152,7 +167,12 @@ def update_distribution(self, distribution_file_data): if source_hash != cached_hash: logger.debug('Repo "%s" has moved from %s to %s, dropping cache.' % (repo, cached_hash, source_hash)) del self.source_repo_package_xmls[repo] + dropped_count += 1 sys.stdout.write('\n') + sys.stdout.write(f'Dropped {dropped_count} repositories\n') + sys.stdout.write(f'Skippted {skipped_count} repositories\n') + end_time = time.perf_counter() + logger.debug(f'Check of invalid source repo cache entries took {(end_time - start_time):.1f} seconds') self.distribution_file = dist_file self.distribution_file.source_packages = self.get_source_packages() From 2f085b807e60a33c32a59386e954229e85ecf425 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 19 Jun 2025 23:39:48 -0700 Subject: [PATCH 12/51] filepaths for all source manifest providers --- src/rosdistro/manifest_provider/cache.py | 2 +- src/rosdistro/manifest_provider/git.py | 3 ++- src/rosdistro/manifest_provider/gitlab.py | 6 +++--- src/rosdistro/manifest_provider/tar.py | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 40fb0ad0..8b5148dc 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -122,7 +122,7 @@ def __call__(self, repo): # Use manifest providers to lazy load for mp in self._source_manifest_providers or []: try: - repo_cache = mp(repo) # TODO (tfoote) list other files here + repo_cache = mp(repo, filepaths=['CHANGELOG.rst', 'README.md']) # TODO (tfoote) list other files here except Exception as e: # pass and try next manifest provider logger.debug('Skipped "%s()": %s' % (mp.__name__, e)) diff --git a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index 65b0fd6f..6f0e5227 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -59,7 +59,8 @@ def 
git_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): raise RuntimeError('Unable to fetch %s: %s' % (filepath, e)) -def git_source_manifest_provider(repo, filepath='package.xml'): +def git_source_manifest_provider(repo, filepaths=['package.xml']): + filepath = 'package.xml' # TODO(tfoote) use filepaths try: with _temp_git_clone(repo.url, repo.version) as git_repo_path: logger.debug(f'Cloing repository {repo.url} to get source info') diff --git a/src/rosdistro/manifest_provider/gitlab.py b/src/rosdistro/manifest_provider/gitlab.py index b1c874d1..574a2f2d 100644 --- a/src/rosdistro/manifest_provider/gitlab.py +++ b/src/rosdistro/manifest_provider/gitlab.py @@ -108,7 +108,7 @@ def gitlab_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml') raise -def gitlab_source_manifest_provider(repo): +def gitlab_source_manifest_provider(repo, filepaths=['package.xml']): assert repo.version server, path = repo.get_url_parts() if not server.endswith('gitlab.com') and server != ROSDISTRO_GITLAB_SERVER: @@ -148,11 +148,11 @@ def package_xml_in_parent(path): cache = SourceRepositoryCache.from_ref(sha) for package_xml_path in package_xml_paths: resource_path = urlquote( - package_xml_path + '/package.xml' if package_xml_path else 'package.xml', safe='') + package_xml_path + '/' + filepath if package_xml_path else filepath, safe='') resource = 'repository/files/' + resource_path + '/raw' with _gitlab_api_query(server, path, resource, {'ref': sha}) as res: package_xml = res.read().decode('utf-8') name = parse_package_string(package_xml).name - cache.add(name, package_xml_path, package_xml) + cache.add(name, package_xml_path, package_xml, filepath) return cache diff --git a/src/rosdistro/manifest_provider/tar.py b/src/rosdistro/manifest_provider/tar.py index e4382ad1..0ea58322 100644 --- a/src/rosdistro/manifest_provider/tar.py +++ b/src/rosdistro/manifest_provider/tar.py @@ -69,7 +69,7 @@ def tar_manifest_provider(_dist_name, repo, pkg_name, 
filepath='package.xml'): return package_xml.decode('utf-8') -def tar_source_manifest_provider(repo): +def tar_source_manifest_provider(repo, filepaths=['package.xml']): assert repo.type == 'tar' try: @@ -100,7 +100,7 @@ def tar_source_manifest_provider(repo): name = parse_package_string(package_xml).name except InvalidPackage: raise RuntimeError('Unable to parse package.xml file found in %s' % repo.url) - cache.add(name, package_path, package_xml) + cache.add(name, package_path, package_xml, filepath) return cache finally: From cd9669fd9cde0f06c1469c31f967940269f768de Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Fri, 20 Jun 2025 00:07:24 -0700 Subject: [PATCH 13/51] fix bug in compression logic causing cross talk --- src/rosdistro/manifest_provider/cache.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 8b5148dc..bbd46d0f 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -145,7 +145,7 @@ def __call__(self, repo): if package_xml == release_package_xml: logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction.') package_xml = release_package_xml - repo_cache.add(package_name, pkg_entries['package_path'], package_xml) + repo_cache.add(package_name, pkg_entries['package_path'], package_xml, 'package.xml') if 'CHANGELOG.rst' in pkg_entries: changelog = pkg_entries['CHANGELOG.rst'] @@ -153,7 +153,7 @@ def __call__(self, repo): if changelog == release_changelog: logger.debug(f'{package_name} Linking CHANGELOG.rst of source cache entry for compaction.') changelog = release_changelog - repo_cache.add(package_name, pkg_entries['package_path'], changelog) + repo_cache.add(package_name, pkg_entries['package_path'], changelog, 'CHANGELOG.rst') if 'README.md' in pkg_entries: readme = pkg_entries['README.md'] @@ -161,6 +161,6 @@ def __call__(self, repo): if readme == 
release_readme: logger.debug(f'{package_name} Linking README.md of source cache entry for compaction.') readme = release_readme - repo_cache.add(package_name, pkg_entries['package_path'], readme) + repo_cache.add(package_name, pkg_entries['package_path'], readme, 'README.md') return repo_cache From 886c3b1c45f20c445c8cd7f2e3b69b8080d8dc91 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Fri, 20 Jun 2025 00:09:11 -0700 Subject: [PATCH 14/51] debug for clearing content visibility --- src/rosdistro/source_repository_cache.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index c41fd671..803ceeac 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -68,6 +68,7 @@ def add(self, package_name, package_path, payload_string, payload_type='package. # Migration option for old caches if type(self._data[package_name]) != dict: + print(f"Clearing content from package {package_name} @@@@@@@@@@@@@@@@!!!!!!!!!!!!!!!!") self._data[package_name] = {} self._data[package_name]['package_path'] = package_path self._data[package_name][payload_type] = payload_string @@ -90,13 +91,13 @@ def __getitem__(self, package_name): raise KeyError("Package '%s' not present in SourceRepositoryCache." % package_name) return self._data[package_name] - def items(self): - """ - Generator of (str, str, str) containing the package name, path relative - to repo root, and package xml string. - """ - for package_name in self._package_names: - yield package_name, self._data[package_name]['package_path'], self._data[package_name]['package.xml'] + # def items(self): # TODO(tfoote) unused now + # """ + # Generator of (str, str, str) containing the package name, path relative + # to repo root, and package xml string. 
+ # """ + # for package_name in self._package_names: + # yield package_name, self._data[package_name]['package_path'], self._data[package_name]['package.xml'] def __len__(self): """ From be0a62e51dbffb0e86062549669250e2312e9220 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Fri, 20 Jun 2025 00:09:29 -0700 Subject: [PATCH 15/51] better variable naming --- src/rosdistro/manifest_provider/github.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index 921b9d0a..5cd0a68f 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -135,13 +135,14 @@ def package_xml_in_parent(path): cache = SourceRepositoryCache.from_ref(tree_json['sha']) for package_xml_path in package_xml_paths: - filepath = 'package.xml' + package_xml_filename = 'package.xml' url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ - (path, cache.ref(), package_xml_path + '/' + filepath if package_xml_path else filepath) - logger.debug('- load %s from %s' % (filepath, url)) + (path, cache.ref(), package_xml_path + '/' + package_xml_filename if package_xml_path else package_xml_filename) + logger.debug('- load %s from %s' % (package_xml_filename, url)) package_xml = _get_url_contents(url) name = parse_package_string(package_xml).name - cache.add(name, package_xml_path, package_xml, filepath) + logger.debug(f'==== Package xml added for {name}') + cache.add(name, package_xml_path, package_xml, package_xml_filename) for filepath in filepaths: url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ (path, cache.ref(), package_xml_path + '/' + filepath if package_xml_path else filepath) From 25f8cce1f57e33def56a16a7317ea3a1b9abacfa Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Fri, 20 Jun 2025 14:25:26 -0700 Subject: [PATCH 16/51] remove debug statement --- src/rosdistro/manifest_provider/github.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index 5cd0a68f..6e45392d 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -141,7 +141,6 @@ def package_xml_in_parent(path): logger.debug('- load %s from %s' % (package_xml_filename, url)) package_xml = _get_url_contents(url) name = parse_package_string(package_xml).name - logger.debug(f'==== Package xml added for {name}') cache.add(name, package_xml_path, package_xml, package_xml_filename) for filepath in filepaths: url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ From 372e2fbd33395c85075c479cca25399385c220c4 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Fri, 20 Jun 2025 15:20:52 -0700 Subject: [PATCH 17/51] Differentiate which fetch is running --- src/rosdistro/distribution_cache_generator.py | 4 ++-- src/rosdistro/release_cache_generator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index 811c423b..e5c11741 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -86,7 +86,7 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F print(' - skip "%s" since it has no version' % pkg_name) continue if debug: - print(' - fetch "%s"' % pkg_name) + print(' - dist cache fetch "%s"' % pkg_name) else: sys.stdout.write('.') sys.stdout.flush() @@ -138,7 +138,7 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F if dist.repositories[repo_name].source_repository: dist.get_source_repo_package_xmls(repo_name) if debug: - print(' - fetch "%s"' % repo_name) + print(' - dist cache source fetch "%s"' % repo_name) else: sys.stdout.write('.') sys.stdout.flush() diff --git a/src/rosdistro/release_cache_generator.py b/src/rosdistro/release_cache_generator.py index a274bea8..790da8f0 100644 --- 
a/src/rosdistro/release_cache_generator.py +++ b/src/rosdistro/release_cache_generator.py @@ -78,7 +78,7 @@ def generate_release_cache(index, dist_name, preclean=False, debug=False): print(' - skip "%s" since it has no version' % pkg_name) continue if debug: - print(' - fetch "%s"' % pkg_name) + print(' - release cache fetch "%s"' % pkg_name) else: sys.stdout.write('.') sys.stdout.flush() From d673ca64f9454968e9fc07a06d5024a7c7325074 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Mon, 23 Jun 2025 09:53:54 -0700 Subject: [PATCH 18/51] proof of concept truncating extra content --- src/rosdistro/manifest_provider/cache.py | 61 ++++++++++++++---------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index bbd46d0f..1a8527d4 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -35,6 +35,13 @@ from rosdistro import logger +def sanitize_and_truncate_docs(doc_string, max_length=100): + # Remove trailing whitespace then truncate + lines = doc_string.rstrip().splitlines() + ending = '' + if len(lines) > 100: + ending = f'\nTruncated file at {max_length} lines' + return '\n'.join(lines[:max_length]) + ending def sanitize_xml(xml_string): """ Returns a version of the supplied XML string with comments and all whitespace stripped, @@ -71,42 +78,46 @@ def __init__(self, distribution_cache, manifest_providers=None): def __call__(self, dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version if filepath == 'README.md': - package_xml = self._distribution_cache.release_readmes.get(pkg_name, None) - if package_xml: - self._distribution_cache.release_readmes[pkg_name] = package_xml + manifest_content = self._distribution_cache.release_readmes.get(pkg_name, None) + if manifest_content: + manifest_content = sanitize_and_truncate_docs(manifest_content) + self._distribution_cache.release_readmes[pkg_name] = manifest_content 
logger.debug('Loading README.md for package "%s" from cache' % pkg_name) elif filepath == 'CHANGELOG.rst': - package_xml = self._distribution_cache.release_changelogs.get(pkg_name, None) - if package_xml: - self._distribution_cache.release_changelogs[pkg_name] = package_xml + manifest_content = self._distribution_cache.release_changelogs.get(pkg_name, None) + if manifest_content: + manifest_content = sanitize_and_truncate_docs(manifest_content) + self._distribution_cache.release_changelogs[pkg_name] = manifest_content logger.debug('Loading CHANGELOG.rst for package "%s" from cache' % pkg_name) else: - package_xml = self._distribution_cache.release_package_xmls.get(pkg_name, None) - if package_xml: - package_xml = sanitize_xml(package_xml) - self._distribution_cache.release_package_xmls[pkg_name] = package_xml + manifest_content = self._distribution_cache.release_package_xmls.get(pkg_name, None) + if manifest_content: + manifest_content = sanitize_xml(manifest_content) + self._distribution_cache.release_package_xmls[pkg_name] = manifest_content logger.debug('Loading package.xml for package "%s" from cache' % pkg_name) - if not package_xml: + if not manifest_content: # use manifest providers to lazy load for mp in self._manifest_providers or []: try: - package_xml = mp(dist_name, repo, pkg_name, filepath) + manifest_content = mp(dist_name, repo, pkg_name, filepath) if filepath == 'package.xml': - package_xml = sanitize_xml(package_xml) + manifest_content = sanitize_xml(manifest_content) + else: + manifest_content = sanitize_and_truncate_docs(manifest_content) break except Exception as e: # pass and try next manifest provider logger.debug('Skipped "%s()": %s' % (mp.__name__, e)) - if package_xml is None: + if manifest_content is None: return None # populate the cache if filepath == 'README.md': - self._distribution_cache.release_readmes[pkg_name] = package_xml + self._distribution_cache.release_readmes[pkg_name] = manifest_content elif filepath == 'CHANGELOG.rst': - 
self._distribution_cache.release_changelogs[pkg_name] = package_xml + self._distribution_cache.release_changelogs[pkg_name] = manifest_content else: - self._distribution_cache.release_package_xmls[pkg_name] = package_xml - return package_xml + self._distribution_cache.release_package_xmls[pkg_name] = manifest_content + return manifest_content class CachedSourceManifestProvider(object): @@ -140,26 +151,28 @@ def __call__(self, repo): if package_name.startswith('_'): continue if 'package.xml' in pkg_entries: - package_xml = sanitize_xml(pkg_entries['package.xml']) + package_xml = sanitize_xml(pkg_entries['package.xml']) # TODO(tfoote) validate as unnecessary should be sanitized already on insert? release_package_xml = self._distribution_cache.release_package_xmls.get(package_name, None) if package_xml == release_package_xml: - logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction.') + logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction. Lines saved: {len(package_xml.splitlines())}') package_xml = release_package_xml repo_cache.add(package_name, pkg_entries['package_path'], package_xml, 'package.xml') if 'CHANGELOG.rst' in pkg_entries: - changelog = pkg_entries['CHANGELOG.rst'] + changelog = sanitize_and_truncate_docs(pkg_entries['CHANGELOG.rst']) release_changelog = self._distribution_cache.release_changelogs.get(package_name, None) if changelog == release_changelog: - logger.debug(f'{package_name} Linking CHANGELOG.rst of source cache entry for compaction.') + logger.debug(f'{package_name} Linking CHANGELOG.rst of source cache entry for compaction. 
Lines saved: {len(changelog.splitlines())}') changelog = release_changelog repo_cache.add(package_name, pkg_entries['package_path'], changelog, 'CHANGELOG.rst') + else: + logger.debug(f'Changelog didn\'t match!!!!!!!!!!!!!!!!!\n<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n{changelog}\n================================\n{release_changelog}\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>') if 'README.md' in pkg_entries: - readme = pkg_entries['README.md'] + readme = sanitize_and_truncate_docs(pkg_entries['README.md']) release_readme = self._distribution_cache.release_readmes.get(package_name, None) if readme == release_readme: - logger.debug(f'{package_name} Linking README.md of source cache entry for compaction.') + logger.debug(f'{package_name} Linking README.md of source cache entry for compaction. Lines saved: {len(readme.splitlines())}') readme = release_readme repo_cache.add(package_name, pkg_entries['package_path'], readme, 'README.md') From 86fbcde82a5acab4345ae856fe9ed835c94047a9 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Mon, 23 Jun 2025 20:59:37 -0700 Subject: [PATCH 19/51] Only update timestamp if something is crawled --- src/rosdistro/distribution_cache.py | 2 +- src/rosdistro/manifest_provider/cache.py | 6 +++--- src/rosdistro/source_repository_cache.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index b075ded3..e0cebee9 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -140,7 +140,7 @@ def update_distribution(self, distribution_file_data): del self.source_repo_package_xmls[repo] continue - max_update_delta = 24 * 60 * 60 + max_update_delta = 1 * 60 * 60 if '_last_update_time' in self.source_repo_package_xmls[repo]: now = datetime.datetime.now() entry_age = (now - self.source_repo_package_xmls[repo]['_last_update_time']).total_seconds() diff --git a/src/rosdistro/manifest_provider/cache.py 
b/src/rosdistro/manifest_provider/cache.py index 1a8527d4..dffd6527 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -156,7 +156,7 @@ def __call__(self, repo): if package_xml == release_package_xml: logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction. Lines saved: {len(package_xml.splitlines())}') package_xml = release_package_xml - repo_cache.add(package_name, pkg_entries['package_path'], package_xml, 'package.xml') + repo_cache.add(package_name, pkg_entries['package_path'], package_xml, 'package.xml', increment_update_time=False) if 'CHANGELOG.rst' in pkg_entries: changelog = sanitize_and_truncate_docs(pkg_entries['CHANGELOG.rst']) @@ -164,7 +164,7 @@ def __call__(self, repo): if changelog == release_changelog: logger.debug(f'{package_name} Linking CHANGELOG.rst of source cache entry for compaction. Lines saved: {len(changelog.splitlines())}') changelog = release_changelog - repo_cache.add(package_name, pkg_entries['package_path'], changelog, 'CHANGELOG.rst') + repo_cache.add(package_name, pkg_entries['package_path'], changelog, 'CHANGELOG.rst', increment_update_time=False) else: logger.debug(f'Changelog didn\'t match!!!!!!!!!!!!!!!!!\n<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n{changelog}\n================================\n{release_changelog}\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>') @@ -174,6 +174,6 @@ def __call__(self, repo): if readme == release_readme: logger.debug(f'{package_name} Linking README.md of source cache entry for compaction. 
Lines saved: {len(readme.splitlines())}') readme = release_readme - repo_cache.add(package_name, pkg_entries['package_path'], readme, 'README.md') + repo_cache.add(package_name, pkg_entries['package_path'], readme, 'README.md', increment_update_time=False) return repo_cache diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index 803ceeac..a59e9321 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -59,7 +59,7 @@ def from_ref(cls, ref): """ return cls({'_ref': ref}) - def add(self, package_name, package_path, payload_string, payload_type='package.xml'): # TODO(tfoote) Breaks rosdistro formatting changing from list to dict + def add(self, package_name, package_path, payload_string, payload_type='package.xml', increment_update_time=True): # TODO(tfoote) Breaks rosdistro formatting changing from list to dict """ Add a package to the cache. """ @@ -73,7 +73,8 @@ def add(self, package_name, package_path, payload_string, payload_type='package. 
self._data[package_name]['package_path'] = package_path self._data[package_name][payload_type] = payload_string self._package_names.add(package_name) - self._data['_last_update_time'] = datetime.now() + if increment_update_time: + self._data['_last_update_time'] = datetime.now() def __iter__(self): From d85c93dba4241470ae3e1ee26eac6a3e4f404357 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Mon, 23 Jun 2025 23:01:56 -0700 Subject: [PATCH 20/51] improve readability of status messages, todo magic number --- src/rosdistro/distribution_cache.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index e0cebee9..72e6a8e4 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -111,7 +111,7 @@ def update_distribution(self, distribution_file_data): dist_file = create_distribution_file(self.distribution_file.name, self._distribution_file_data) # remove all release package xmls where the package version has changed. - print(f"- removing invalid release package cache entries. [{len(dist_file.release_packages.keys())}]") + print(f"- checking [{len(dist_file.release_packages.keys())}] release package cache entries for different versions") for pkg_name in sorted(dist_file.release_packages.keys()): if pkg_name not in self.distribution_file.release_packages: continue @@ -127,7 +127,7 @@ def update_distribution(self, distribution_file_data): start_time = time.perf_counter() dropped_count = 0 skipped_count = 0 - print(f"- checking invalid source repo cache entries. 
[{len(self.source_repo_package_xmls.keys())}]") + print(f"- checking [{len(self.source_repo_package_xmls.keys())}] source repo cache entries without source entries, requires ls-remote") for repo in sorted(self.source_repo_package_xmls.keys()): sys.stdout.write('.') sys.stdout.flush() @@ -140,12 +140,12 @@ def update_distribution(self, distribution_file_data): del self.source_repo_package_xmls[repo] continue - max_update_delta = 1 * 60 * 60 + min_update_delta = 1 * 60 * 60 # TOOD(tfoote) magic number make into a parameter if '_last_update_time' in self.source_repo_package_xmls[repo]: now = datetime.datetime.now() entry_age = (now - self.source_repo_package_xmls[repo]['_last_update_time']).total_seconds() - if entry_age < max_update_delta: - logger.debug(f'Skipping check of {repo} because it was last updated only {entry_age} seconds ago less than {max_update_delta}') + if entry_age < min_update_delta: + logger.debug(f'Skipping check of {repo} because it was last updated only {entry_age} seconds ago less than {min_update_delta}') skipped_count += 1 continue From 70c99347eee88e4f115a4f1b59993181e0c2c87d Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Mon, 23 Jun 2025 23:02:20 -0700 Subject: [PATCH 21/51] remove magic number --- src/rosdistro/manifest_provider/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index dffd6527..97bea88f 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -39,7 +39,7 @@ def sanitize_and_truncate_docs(doc_string, max_length=100): # Remove trailing whitespace then truncate lines = doc_string.rstrip().splitlines() ending = '' - if len(lines) > 100: + if len(lines) >= max_length: ending = f'\nTruncated file at {max_length} lines' return '\n'.join(lines[:max_length]) + ending From 8b8e32b2f0c88a8388d716c302aa48307828244f Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 
09:24:29 -0700 Subject: [PATCH 22/51] extend truncation message --- src/rosdistro/manifest_provider/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 97bea88f..d8c831f4 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -40,7 +40,7 @@ def sanitize_and_truncate_docs(doc_string, max_length=100): lines = doc_string.rstrip().splitlines() ending = '' if len(lines) >= max_length: - ending = f'\nTruncated file at {max_length} lines' + ending = f'\nTruncated content at {max_length} of {len(lines)} lines' return '\n'.join(lines[:max_length]) + ending def sanitize_xml(xml_string): From 23e9b864fecb418b99b31ade2f6ec1b0242b152c Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 09:54:34 -0700 Subject: [PATCH 23/51] Make docs truncation idempotent --- src/rosdistro/manifest_provider/cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index d8c831f4..c4e087ac 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -39,9 +39,9 @@ def sanitize_and_truncate_docs(doc_string, max_length=100): # Remove trailing whitespace then truncate lines = doc_string.rstrip().splitlines() ending = '' - if len(lines) >= max_length: + if len(lines) > max_length: ending = f'\nTruncated content at {max_length} of {len(lines)} lines' - return '\n'.join(lines[:max_length]) + ending + return '\n'.join(lines[:max_length - 1 ]) + ending def sanitize_xml(xml_string): """ Returns a version of the supplied XML string with comments and all whitespace stripped, From 6228ae214b272d906b11888f55d64ec27db2c838 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 10:59:06 -0700 Subject: [PATCH 24/51] simplify deduplication logic --- 
src/rosdistro/manifest_provider/cache.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index c4e087ac..4c3a559b 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -155,25 +155,19 @@ def __call__(self, repo): release_package_xml = self._distribution_cache.release_package_xmls.get(package_name, None) if package_xml == release_package_xml: logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction. Lines saved: {len(package_xml.splitlines())}') - package_xml = release_package_xml - repo_cache.add(package_name, pkg_entries['package_path'], package_xml, 'package.xml', increment_update_time=False) + repo_cache.add(package_name, pkg_entries['package_path'], release_package_xml, 'package.xml', increment_update_time=False) if 'CHANGELOG.rst' in pkg_entries: changelog = sanitize_and_truncate_docs(pkg_entries['CHANGELOG.rst']) release_changelog = self._distribution_cache.release_changelogs.get(package_name, None) if changelog == release_changelog: logger.debug(f'{package_name} Linking CHANGELOG.rst of source cache entry for compaction. 
Lines saved: {len(changelog.splitlines())}') - changelog = release_changelog - repo_cache.add(package_name, pkg_entries['package_path'], changelog, 'CHANGELOG.rst', increment_update_time=False) - else: - logger.debug(f'Changelog didn\'t match!!!!!!!!!!!!!!!!!\n<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n{changelog}\n================================\n{release_changelog}\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>') - + repo_cache.add(package_name, pkg_entries['package_path'], release_changelog, 'CHANGELOG.rst', increment_update_time=False) if 'README.md' in pkg_entries: readme = sanitize_and_truncate_docs(pkg_entries['README.md']) release_readme = self._distribution_cache.release_readmes.get(package_name, None) if readme == release_readme: logger.debug(f'{package_name} Linking README.md of source cache entry for compaction. Lines saved: {len(readme.splitlines())}') - readme = release_readme - repo_cache.add(package_name, pkg_entries['package_path'], readme, 'README.md', increment_update_time=False) + repo_cache.add(package_name, pkg_entries['package_path'], release_readme, 'README.md', increment_update_time=False) return repo_cache From 9bd8a78855fe61ed25e9bdfe8eedf2a45745c20b Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 10:59:30 -0700 Subject: [PATCH 25/51] Allow the collapse of any long string >300 right now --- src/rosdistro/distribution_cache_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index e5c11741..7921f0a4 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -170,8 +170,8 @@ def __init__(self, *args, **kwargs): super(CacheYamlDumper, self).__init__(*args, **kwargs) def ignore_aliases(self, content): - """ Allow strings that look like package XML to alias to each other in the YAML output. 
""" - return not (isinstance(content, str) and ' 300) # TODO(tfoote) magic number move to config def represent_mapping(self, tag, mapping, flow_style=False): """ Gives compact representation for the distribution_file section, while allowing the package From b2cb88ffc7b630ff41a411943fd22e12d9ae02a5 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 11:14:32 -0700 Subject: [PATCH 26/51] truncate on initial insertion of docs --- src/rosdistro/manifest_provider/github.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index 6e45392d..a1052dc5 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -45,6 +45,8 @@ from rosdistro.source_repository_cache import SourceRepositoryCache from rosdistro import logger +from rosdistro.manifest_provider.cache import sanitize_and_truncate_docs + GITHUB_USER = os.getenv('GITHUB_USER', None) GITHUB_PASSWORD = os.getenv('GITHUB_PASSWORD', None) GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', None) @@ -155,6 +157,7 @@ def package_xml_in_parent(path): else: logger.debug('- HTTP ERROR (%s), trying "%s"' % (e, url)) raise e + contents = sanitize_and_truncate_docs(contents) # TODO(tfoote) Do this later so it doesn't need to be in all manifest providers cache.add(name, package_xml_path, contents, filepath) return cache From 56407dda800070be8569f5850ea003745a0c8304 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 11:20:02 -0700 Subject: [PATCH 27/51] clearer cache loading message --- src/rosdistro/manifest_provider/cache.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 4c3a559b..84a1fc06 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -130,6 +130,7 @@ def __call__(self, repo): assert repo.url repo_cache = 
self._distribution_cache.source_repo_package_xmls.get(repo.name, None) if not repo_cache: + logger.debug(f"Internal Cache Miss for {repo.name} Louding from Source Manifset Providers") # Use manifest providers to lazy load for mp in self._source_manifest_providers or []: try: From 9657a2ea7dd9602372e784a16fa5c7926924a5d2 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 20:31:31 -0700 Subject: [PATCH 28/51] add debugging config option for faster development cycles --- src/rosdistro/distribution_cache_generator.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index 7921f0a4..09497526 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -78,8 +78,12 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F include_source=include_source) print('- fetch missing release manifests') + max_source_packages = 100000 # TODO(tfoote) magic number move to config + max_source_repos = 10000 # TODO(tfoote) magic number move to config errors = [] - for pkg_name in sorted(dist.release_packages.keys()): + if debug and (len(dist.release_packages.keys()) > max_source_packages): + print(f' - limiting packages scanned to {max_source_packages} of {len(dist.release_packages.keys())} as per config') + for pkg_name in sorted(dist.release_packages.keys())[:max_source_packages]: repo = dist.repositories[dist.release_packages[pkg_name].repository_name].release_repository if repo.version is None: if debug: @@ -134,7 +138,9 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F if include_source: print('- fetch source repository manifests') - for repo_name in sorted(dist.repositories.keys()): + if debug and len(dist.repositories.keys()) > max_source_repos: + print(f' - limiting repositories scanned to {max_source_repos} of 
{len(dist.repositories.keys())} as per config') + for repo_name in sorted(dist.repositories.keys())[:max_source_repos]: if dist.repositories[repo_name].source_repository: dist.get_source_repo_package_xmls(repo_name) if debug: From 2982685eda03f4d5c96d406742631dc8f4ba3934 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 20:39:19 -0700 Subject: [PATCH 29/51] fix loud load typo --- src/rosdistro/manifest_provider/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 84a1fc06..a77d8ede 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -130,7 +130,7 @@ def __call__(self, repo): assert repo.url repo_cache = self._distribution_cache.source_repo_package_xmls.get(repo.name, None) if not repo_cache: - logger.debug(f"Internal Cache Miss for {repo.name} Louding from Source Manifset Providers") + logger.debug(f"Internal Cache Miss for {repo.name} Loading from Source Manifset Providers") # Use manifest providers to lazy load for mp in self._source_manifest_providers or []: try: From 63ba4754d695eef4a624e044795c3526056aa241 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Tue, 24 Jun 2025 20:51:27 -0700 Subject: [PATCH 30/51] restore items with new API --- src/rosdistro/source_repository_cache.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index a59e9321..cf5b3a78 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -83,22 +83,22 @@ def __iter__(self): """ return iter(self._package_names) - def __getitem__(self, package_name): + def __getitem__(self, package_name): #TODO(tfoote) API change """ - Access the cached information about a specific package. 
Returns a (str, str) of - path to package relative to repo root, and string of package xml. + Access the cached information about a specific package. Returns a dict of + path to package relative paths to repo root, and string of the file contents (potentially truncated). """ if package_name not in self._package_names: raise KeyError("Package '%s' not present in SourceRepositoryCache." % package_name) return self._data[package_name] - # def items(self): # TODO(tfoote) unused now - # """ - # Generator of (str, str, str) containing the package name, path relative - # to repo root, and package xml string. - # """ - # for package_name in self._package_names: - # yield package_name, self._data[package_name]['package_path'], self._data[package_name]['package.xml'] + def items(self): #TODO(tfoote) API change + """ + Generator of (str, dict) containing the package name, and a dict of + paths to file contents (potentially truncated). + """ + for package_name in self._package_names: + yield package_name, self._data[package_name] def __len__(self): """ From ff86598d723992ca5d3c3c037dadb8cb7bb46b26 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 3 Jul 2025 23:09:32 -0700 Subject: [PATCH 31/51] refactor to generic _resources from package_xmls --- src/rosdistro/distribution.py | 14 ++++---- src/rosdistro/distribution_cache.py | 32 +++++++++---------- src/rosdistro/distribution_cache_generator.py | 4 +-- src/rosdistro/manifest_provider/cache.py | 4 +-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/rosdistro/distribution.py b/src/rosdistro/distribution.py index e06e9525..47a77b0c 100644 --- a/src/rosdistro/distribution.py +++ b/src/rosdistro/distribution.py @@ -60,7 +60,7 @@ def __init__(self, distribution_file, manifest_providers=None, source_manifest_p self._release_package_xmls = {} self._release_readmes = {} self._release_changelogs = {} - self._source_repo_package_xmls = {} + self._source_repo_resources = {} def __getattr__(self, name): return 
getattr(self._distribution_file, name) @@ -121,23 +121,23 @@ def get_release_changelog(self, pkg_name): def get_source_package_xml(self, pkg_name): repo_name = self._distribution_file.source_packages[pkg_name].repository_name - repo_cache = self.get_source_repo_package_xmls(repo_name) + repo_cache = self.get_source_repo_resources(repo_name) if repo_cache: return repo_cache[pkg_name][1] else: return None - def get_source_repo_package_xmls(self, repo_name): - if repo_name in self._source_repo_package_xmls: - return self._source_repo_package_xmls[repo_name] + def get_source_repo_resources(self, repo_name): + if repo_name in self._source_repo_resources: + return self._source_repo_resources[repo_name] else: for mp in self._source_manifest_providers: repo_cache = mp(self.repositories[repo_name].source_repository) if repo_cache is not None: # Update map of package XMLs, and also list of known package names. - self._source_repo_package_xmls[repo_name] = repo_cache + self._source_repo_resources[repo_name] = repo_cache for pkg_name in repo_cache: if pkg_name[0] != '_': self._distribution_file.source_packages[pkg_name] = Package(pkg_name, repo_name) - return self._source_repo_package_xmls[repo_name] + return self._source_repo_resources[repo_name] return None diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index 72e6a8e4..173e8ef3 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -67,10 +67,10 @@ def __init__(self, name, data=None, distribution_file_data=None): self.release_package_xmls = data['release_package_xmls'] if data and 'release_package_xmls' in data else {} self.release_readmes = data['release_readmes'] if data and 'release_readmes' in data else {} self.release_changelogs = data['release_changelogs'] if data and 'release_changelogs' in data else {} - self.source_repo_package_xmls = {} - if data and 'source_repo_package_xmls' in data: - for repo_name, repo_data in 
data['source_repo_package_xmls'].items(): - self.source_repo_package_xmls[repo_name] = SourceRepositoryCache(repo_data) + self.source_repo_resources = {} + if data and 'source_repo_resources' in data: + for repo_name, repo_data in data['source_repo_resources'].items(): + self.source_repo_resources[repo_name] = SourceRepositoryCache(repo_data) self.distribution_file.source_packages = self.get_source_packages() def get_data(self): @@ -82,8 +82,8 @@ def get_data(self): data['release_package_xmls'] = self.release_package_xmls data['release_readmes'] = self.release_readmes data['release_changelogs'] = self.release_changelogs - data['source_repo_package_xmls'] = dict([(repo_name, repo_cache.get_data()) - for repo_name, repo_cache in self.source_repo_package_xmls.items()]) + data['source_repo_resources'] = dict([(repo_name, repo_cache.get_data()) + for repo_name, repo_cache in self.source_repo_resources.items()]) return data def update_distribution(self, distribution_file_data): @@ -123,12 +123,12 @@ def update_distribution(self, distribution_file_data): # Remove all source package xmls where the devel branch is pointing to a different commit than # the one we have associated with our cache. This requires calling git ls-remote on all affected repos. 
- if self.source_repo_package_xmls: + if self.source_repo_resources: start_time = time.perf_counter() dropped_count = 0 skipped_count = 0 - print(f"- checking [{len(self.source_repo_package_xmls.keys())}] source repo cache entries without source entries, requires ls-remote") - for repo in sorted(self.source_repo_package_xmls.keys()): + print(f"- checking [{len(self.source_repo_resources.keys())}] source repo cache entries without source entries, requires ls-remote") + for repo in sorted(self.source_repo_resources.keys()): sys.stdout.write('.') sys.stdout.flush() try: @@ -137,13 +137,13 @@ def update_distribution(self, distribution_file_data): # The repo entry has been dropped, or the source stanza from it has been dropped, # either way, remove the cache entries associated with this repository. logger.debug('Unable to find source repository info for repo "%s".' % repo) - del self.source_repo_package_xmls[repo] + del self.source_repo_resources[repo] continue min_update_delta = 1 * 60 * 60 # TOOD(tfoote) magic number make into a parameter - if '_last_update_time' in self.source_repo_package_xmls[repo]: + if '_last_update_time' in self.source_repo_resources[repo]: now = datetime.datetime.now() - entry_age = (now - self.source_repo_package_xmls[repo]['_last_update_time']).total_seconds() + entry_age = (now - self.source_repo_resources[repo]['_last_update_time']).total_seconds() if entry_age < min_update_delta: logger.debug(f'Skipping check of {repo} because it was last updated only {entry_age} seconds ago less than {min_update_delta}') skipped_count += 1 @@ -156,17 +156,17 @@ def update_distribution(self, distribution_file_data): if result['returncode'] != 0 or not result['output']: # Error checking remote, or unable to find remote reference. Drop the cache entry. logger.debug("Unable to check hash for branch %s of %s, dropping cache entry." 
% (source_repository.version, source_repository.url)) - del self.source_repo_package_xmls[repo] + del self.source_repo_resources[repo] dropped_count += 1 continue # Split by line first and take the last line, to squelch any preamble output, for example # a known host key validation notice. source_hash = result['output'].split('\n')[-1].split('\t')[0] - cached_hash = self.source_repo_package_xmls[repo].ref() + cached_hash = self.source_repo_resources[repo].ref() if source_hash != cached_hash: logger.debug('Repo "%s" has moved from %s to %s, dropping cache.' % (repo, cached_hash, source_hash)) - del self.source_repo_package_xmls[repo] + del self.source_repo_resources[repo] dropped_count += 1 sys.stdout.write('\n') sys.stdout.write(f'Dropped {dropped_count} repositories\n') @@ -183,7 +183,7 @@ def update_distribution(self, distribution_file_data): def get_source_packages(self): """ Returns dictionary mapping source package names to Package() objects. """ package_dict = {} - for source_repo_name, source_repo in self.source_repo_package_xmls.items(): + for source_repo_name, source_repo in self.source_repo_resources.items(): for pkg_name in source_repo: package_dict[pkg_name] = Package(pkg_name, source_repo_name) return package_dict diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index 09497526..f58b3375 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -142,7 +142,7 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F print(f' - limiting repositories scanned to {max_source_repos} of {len(dist.repositories.keys())} as per config') for repo_name in sorted(dist.repositories.keys())[:max_source_repos]: if dist.repositories[repo_name].source_repository: - dist.get_source_repo_package_xmls(repo_name) + dist.get_source_repo_resources(repo_name) if debug: print(' - dist cache source fetch "%s"' % repo_name) else: @@ -223,7 
+223,7 @@ def _get_cached_distribution(index, dist_name, preclean=False, ignore_local=Fals # if we're not including the source portion of the cache, strip it out of the existing cache # in order to skip the potentially lengthy cache invalidation process. if not include_source: - cache.source_repo_package_xmls = {} + cache.source_repo_resources = {} # update cache with current distribution file, which filters existing cache by validity. cache.update_distribution(rel_file_data) else: diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index a77d8ede..8695a632 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -128,7 +128,7 @@ def __init__(self, distribution_cache, source_manifest_providers=None): def __call__(self, repo): assert repo.url - repo_cache = self._distribution_cache.source_repo_package_xmls.get(repo.name, None) + repo_cache = self._distribution_cache.source_repo_resources.get(repo.name, None) if not repo_cache: logger.debug(f"Internal Cache Miss for {repo.name} Loading from Source Manifset Providers") # Use manifest providers to lazy load @@ -140,7 +140,7 @@ def __call__(self, repo): logger.debug('Skipped "%s()": %s' % (mp.__name__, e)) continue - self._distribution_cache.source_repo_package_xmls[repo.name] = repo_cache + self._distribution_cache.source_repo_resources[repo.name] = repo_cache break else: logger.debug('Load package XMLs for repo "%s" from cache' % repo.name) From f60979e5b87fa9c53660b83d4bf7761b201a262c Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Thu, 3 Jul 2025 23:35:37 -0700 Subject: [PATCH 32/51] remove debugging truncate, it's in the sanitizer now --- src/rosdistro/manifest_provider/github.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index a1052dc5..f7ded7a1 100644 --- a/src/rosdistro/manifest_provider/github.py +++ 
b/src/rosdistro/manifest_provider/github.py @@ -79,8 +79,7 @@ def github_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml') url = 'https://raw.githubusercontent.com/%s/%s/%s' % (path, release_tag, filepath) try: logger.debug('Load %s file from url "%s"' % (filepath, url)) - # TODO(tfoote) magic number for testing - return '\n'.join( _get_url_contents(url).splitlines()[:100]) + return '\n'.join( _get_url_contents(url).splitlines()) except HTTPError as e: if e.code == 404: logger.debug('- File not found (%s), trying "%s"' % (e, url)) From 94c5fd112e93c760c661831994faeb0259f2e254 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Mon, 7 Jul 2025 11:18:09 -0700 Subject: [PATCH 33/51] multi resource support for git --- src/rosdistro/manifest_provider/git.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index 6f0e5227..997d64f1 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -59,8 +59,8 @@ def git_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): raise RuntimeError('Unable to fetch %s: %s' % (filepath, e)) -def git_source_manifest_provider(repo, filepaths=['package.xml']): - filepath = 'package.xml' # TODO(tfoote) use filepaths +def git_source_manifest_provider(repo, filepaths=['CHANGELOG.rst', 'README.md']): + xmlpath = 'package.xml' # TODO(tfoote) use filepaths try: with _temp_git_clone(repo.url, repo.version) as git_repo_path: logger.debug(f'Cloing repository {repo.url} to get source info') @@ -72,13 +72,23 @@ def git_source_manifest_provider(repo, filepaths=['package.xml']): for package_path in find_package_paths(git_repo_path): if package_path == '.': package_path = '' - with open(os.path.join(git_repo_path, package_path, filepath), 'r') as f: + with open(os.path.join(git_repo_path, package_path, xmlpath), 'r') as f: package_xml = f.read() try: name = 
parse_package_string(package_xml).name except InvalidPackage: - raise RuntimeError('Unable to parse %s file found in %s' % (filepath, repo.url)) - cache.add(name, package_path, package_xml, filepath) + raise RuntimeError('Unable to parse %s file found in %s' % (xmlpath, repo.url)) + cache.add(name, package_path, package_xml, xmlpath) + for filepath in filepaths: + repo_filename = os.path.join(git_repo_path, package_path, filepath) + if not os.path.exists(repo_filename): + logger.debug(f'- git load of {filepath} from {repo.url} at {repo.version} skipped because it did not exist.') + continue + with open(repo_filename, 'r') as f: + logger.debug('- git load %s from %s' % (filepath, repo_filename)) + contents = f.read() + contents = sanitize_and_truncate_docs(contents) # TODO(tfoote) Do this later so it doesn't need to be in all manifest providers + cache.add(name, package_path, contents, filepath) except Exception as e: raise RuntimeError('Unable to fetch source %s files: %s' % (filepath, e)) From 47049eaae6a3115a410e606d80b990b7162c158e Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 01:08:23 +0000 Subject: [PATCH 34/51] Print out url string not Result address Also add a comment about the logic for future reference. 
--- src/rosdistro/manifest_provider/github.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index f7ded7a1..f7732f8d 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -59,9 +59,10 @@ def _get_url_contents(url): except HTTPError as e: if e.code != 403: raise e - logger.debug(f'Fetch of {url} failed with 403, assuming rate limit, retrying after period {backoff} seconds.') + logger.debug(f'Fetch of {url.full_url} failed with 403, assuming rate limit, retrying after period {backoff} seconds.') time.sleep(backoff) backoff *= 1.5 + # Last return after timeout to collect the error object again. return urlopen(url).read().decode('utf-8') def github_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): From 6e184cabe223c6930cd71d491b29d8c468b7bd96 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 03:00:39 +0000 Subject: [PATCH 35/51] First draft at a schema --- dev_test/rosdistro_cache_3.schema.json | 61 ++++++++++++++++++++++++++ setup.py | 2 +- stdeb.cfg | 2 +- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 dev_test/rosdistro_cache_3.schema.json diff --git a/dev_test/rosdistro_cache_3.schema.json b/dev_test/rosdistro_cache_3.schema.json new file mode 100644 index 00000000..7d75fb84 --- /dev/null +++ b/dev_test/rosdistro_cache_3.schema.json @@ -0,0 +1,61 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/ros-infrastructure/rosdistro/TODO/rosdistro_cache_3.schema.json", + "title": "ROSDistro Cache Format 3 Schema", + "description": "Cache Format 3 for rosdistro", + "type": "object", + "properties": { + "version": { + "description": "The version of the rosdistro cache", + "type": "integer", + "minimum": "3", + "maximum": "3" + }, + "type": { + "description": "Clarifying the file type", + "type": "string" + }, + 
"source_repo_resources": { + "type": "object", + "patternProperties": { + "^.*$": {"$ref": "#/$defs/repository_resources"} + } + } + }, + "$defs": { + "repository_resources": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/package_resources"}, + "properties": { + "_last_update_time": { + "type": "string" + }, + "_ref": { + "type": "string" + } + }, + "required": ["_ref"] + }, + "package_resources": { + "type": "object", + "properties": { + "CHANGELOG.rst": { + "type": "string", + "description": "Contents of the CHANGELOG.rst if it's available" + }, + "package.xml": { + "type": "string", + "description": "Contents of the package.xml if it's available" + }, + "package_path": { + "type": "string", + "description": "The package_path if it's available" + }, + "README.md": { + "type": "string", + "description": "Contents of the README.md if it's available" + } + } + } + } +} diff --git a/setup.py b/setup.py index 81514361..e013258e 100755 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ # - src/rosdistro/__init__.py # - stdeb.cfg 'version': '1.0.1', - 'install_requires': ['PyYAML', 'setuptools'], + 'install_requires': ['PyYAML', 'setuptools', 'jsonschema'], 'python_requires': '>=3.6', 'packages': find_packages('src'), 'package_dir': {'': 'src'}, diff --git a/stdeb.cfg b/stdeb.cfg index 1b430c35..6948e33b 100644 --- a/stdeb.cfg +++ b/stdeb.cfg @@ -12,7 +12,7 @@ X-Python3-Version: >= 3.6 Setup-Env-Vars: SKIP_PYTHON_MODULES=1 [rosdistro_modules] -Depends3: ca-certificates, python3-catkin-pkg-modules, python3-rospkg-modules, python3-setuptools, python3-yaml +Depends3: ca-certificates, python3-catkin-pkg-modules, python3-rospkg-modules, python3-setuptools, python3-yaml, python3-jsonschema Conflicts3: python3-rosdistro (<< 0.6.0) Replaces3: python3-rosdistro (<< 0.6.0) Copyright-File: LICENSE.txt From 9b19db3f991a906857ca11af3fdeed82f65984e9 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 04:00:13 +0000 Subject: [PATCH 36/51] schema running 
for intermediate state --- dev_test/rosdistro_cache_3.schema.json | 31 ++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/dev_test/rosdistro_cache_3.schema.json b/dev_test/rosdistro_cache_3.schema.json index 7d75fb84..b05edd4b 100644 --- a/dev_test/rosdistro_cache_3.schema.json +++ b/dev_test/rosdistro_cache_3.schema.json @@ -8,8 +8,8 @@ "version": { "description": "The version of the rosdistro cache", "type": "integer", - "minimum": "3", - "maximum": "3" + "minimum": 3, + "maximum": 3 }, "type": { "description": "Clarifying the file type", @@ -20,8 +20,31 @@ "patternProperties": { "^.*$": {"$ref": "#/$defs/repository_resources"} } - } - }, + }, + "release_resources": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/package_resources"}, + "properties": { + "_last_update_time": { + "type": "string" + }, + "version": { + "type": "string" + } + }, + "required": ["version"] + }, + "additionalProperties": false + }, + "distribution_file": { + "type": "object", + "description": "rosdistro distribution_file" + }, + "name": { + "type": "string", + "description": "The name of the distribution being cached" + }, + "required": ["source_repo_resources", "distribution_file", "name"], "$defs": { "repository_resources": { "type": "object", From f432591f09359e27a2303eed236d7d10030e77ee Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 04:50:54 +0000 Subject: [PATCH 37/51] catch last_update_time and update SourceRepositoryCache docs --- src/rosdistro/source_repository_cache.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index cf5b3a78..560727a5 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -35,15 +35,17 @@ class SourceRepositoryCache(object): """ - This class represents a cache of the package XML strings for all packages in a single + This 
class represents a cache of the resource strings for all packages in a single repo at a particular moment in time. A dictionary of many of these (one for each repo) - keyed to the repo name represents the totality of the source package xml cache. + keyed to the repo name represents the totality of the source package resources cache. """ def __init__(self, data): assert data self._ref = data['_ref'] - self._package_names = set([name for name in data.keys() if name != '_ref']) + self._last_update_time = data['_last_update_time'] if data['_last_update_time'] else None + non_package_keys = ['_ref', '_last_update_time'] + self._package_names = set([name for name in data.keys() if name not in non_package_keys]) self._data = data def get_data(self): From b7e0c2bca2fa67544e21d096c660e97edffa8834 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 06:41:09 +0000 Subject: [PATCH 38/51] add environment variable to not wait as long for timeouts --- src/rosdistro/manifest_provider/github.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index f7732f8d..ae20b91c 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -51,9 +51,11 @@ GITHUB_PASSWORD = os.getenv('GITHUB_PASSWORD', None) GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', None) +ROSDISTRO_RETRY_MAX = int(os.getenv('ROSDISTRO_RETRY_MAX', '120')) + def _get_url_contents(url): backoff = 1 - while backoff < 120: + while backoff < ROSDISTRO_RETRY_MAX: try: return urlopen(url).read().decode('utf-8') except HTTPError as e: From b18a7337f34ef2f2a874ad07588cd40f9ef41076 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 06:41:59 +0000 Subject: [PATCH 39/51] fix exception variable scopes and some comments --- src/rosdistro/manifest_provider/git.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index 997d64f1..a97cf73e 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -60,10 +60,10 @@ def git_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): def git_source_manifest_provider(repo, filepaths=['CHANGELOG.rst', 'README.md']): - xmlpath = 'package.xml' # TODO(tfoote) use filepaths + xmlpath = 'package.xml' # TODO(tfoote) use filepaths, currently using the package special to get the names try: with _temp_git_clone(repo.url, repo.version) as git_repo_path: - logger.debug(f'Cloing repository {repo.url} to get source info') + logger.debug(f'Cloning repository {repo.url} to get source info') # Include the git hash in our cache dictionary. git_hash = Git(git_repo_path).command('rev-parse', 'HEAD')['output'] cache = SourceRepositoryCache.from_ref(git_hash) @@ -91,7 +91,7 @@ def git_source_manifest_provider(repo, filepaths=['CHANGELOG.rst', 'README.md']) cache.add(name, package_path, contents, filepath) except Exception as e: - raise RuntimeError('Unable to fetch source %s files: %s' % (filepath, e)) + raise RuntimeError('Unable to fetch source files: %s' % (e)) return cache From e730cc4447674c49297a663d15c75e02f2d846e3 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 07:23:30 +0000 Subject: [PATCH 40/51] Switch release cache to release_resources as a dict avoid special casing code for each resource type --- src/rosdistro/distribution.py | 61 ++++++------------- src/rosdistro/distribution_cache_generator.py | 16 ++--- src/rosdistro/manifest_provider/cache.py | 35 ++++------- 3 files changed, 39 insertions(+), 73 deletions(-) diff --git a/src/rosdistro/distribution.py b/src/rosdistro/distribution.py index 47a77b0c..914acc8d 100644 --- a/src/rosdistro/distribution.py +++ b/src/rosdistro/distribution.py @@ -57,16 +57,17 @@ def __init__(self, distribution_file, manifest_providers=None, source_manifest_p 
if source_manifest_providers is not None: self._source_manifest_providers = source_manifest_providers - self._release_package_xmls = {} - self._release_readmes = {} - self._release_changelogs = {} + self._release_resources = {} self._source_repo_resources = {} def __getattr__(self, name): return getattr(self._distribution_file, name) - def get_release_package_xml(self, pkg_name): - if pkg_name not in self._release_package_xmls: + + def get_release_resource(self, pkg_name, filepath): + if pkg_name not in self._release_resources: + self._release_resources[pkg_name] = {} + if not self._release_resources[pkg_name].get(filepath, None): pkg = self._distribution_file.release_packages[pkg_name] repo_name = pkg.repository_name repo = self._distribution_file.repositories[repo_name] @@ -75,49 +76,25 @@ def get_release_package_xml(self, pkg_name): repo = repo.release_repository if repo.version is None: return None - package_xml = None for mp in self._manifest_providers: - package_xml = mp(self._distribution_file.name, repo, pkg_name, 'package.xml') - if package_xml is not None: + content = mp(self._distribution_file.name, repo, pkg_name, filepath) + if content is not None: break - self._release_package_xmls[pkg_name] = package_xml - return self._release_package_xmls[pkg_name] + self._release_resources[pkg_name][filepath] = content + return self._release_resources.get(pkg_name, {}).get(filepath, None) + + + def get_release_package_xml(self, pkg_name): + # TODO(tfoote) deprecated + return self.get_release_resource(pkg_name, 'package.xml') def get_release_readme(self, pkg_name): - if pkg_name not in self._release_readmes: - pkg = self._distribution_file.release_packages[pkg_name] - repo_name = pkg.repository_name - repo = self._distribution_file.repositories[repo_name] - if repo.release_repository is None: - return None - repo = repo.release_repository - if repo.version is None: - return None - readme = None - for mp in self._manifest_providers: - readme = 
mp(self._distribution_file.name, repo, pkg_name, filepath='README.md') - if readme is not None: - break - self._release_readmes[pkg_name] = readme - return self._release_readmes[pkg_name] + # TODO(tfoote) deprecated + return self.get_release_resource(pkg_name, 'README.md') def get_release_changelog(self, pkg_name): - if pkg_name not in self._release_changelogs: - pkg = self._distribution_file.release_packages[pkg_name] - repo_name = pkg.repository_name - repo = self._distribution_file.repositories[repo_name] - if repo.release_repository is None: - return None - repo = repo.release_repository - if repo.version is None: - return None - changelog = None - for mp in self._manifest_providers: - changelog = mp(self._distribution_file.name, repo, pkg_name, filepath='CHANGELOG.rst') - if changelog is not None: - break - self._release_changelogs[pkg_name] = changelog - return self._release_changelogs[pkg_name] + # TODO(tfoote) deprecated + return self.get_release_resource(pkg_name, 'CHANGELOG.rst') def get_source_package_xml(self, pkg_name): repo_name = self._distribution_file.source_packages[pkg_name].repository_name diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index f58b3375..c637db3b 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -96,8 +96,8 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F sys.stdout.flush() # check that package.xml is fetchable old_package_xml = None - if cache and pkg_name in cache.release_package_xmls: - old_package_xml = cache.release_package_xmls[pkg_name] + if cache and pkg_name in cache.release_resources: + old_package_xml = cache.release_resources[pkg_name].get('package.xml', None) package_xml = dist.get_release_package_xml(pkg_name) if not package_xml: errors.append('%s: missing package.xml file for package "%s"' % (dist_name, pkg_name)) @@ -116,17 +116,17 @@ def 
generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F print(" - updated manifest of package '%s' to version '%s'" % (pkg_name, pkg.version)) old_readme = None - if cache and pkg_name in cache.release_readmes: - old_readme = cache.release_readmes[pkg_name] - readme = dist.get_release_readme(pkg_name) + if cache and pkg_name in cache.release_resources: + old_readme = cache.release_resources[pkg_name].get('README.md', None) + readme = dist.get_release_resource(pkg_name, 'README.md') if readme != old_readme: print(" - updated README.md of package '%s'" % (pkg_name)) old_changelog = None - if cache and pkg_name in cache.release_changelogs: - old_changelog = cache.release_changelogs[pkg_name] - changelog = dist.get_release_changelog(pkg_name) + if cache and pkg_name in cache.release_resources: + old_changelog = cache.release_resources[pkg_name].get('CHANGELOG.rst', None) + changelog = dist.get_release_resource(pkg_name, 'CHANGELOG.rst') if changelog != old_changelog: print(" - updated CHANGELOG.rst of package '%s'" % (pkg_name)) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 8695a632..6ea25f91 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -77,24 +77,15 @@ def __init__(self, distribution_cache, manifest_providers=None): def __call__(self, dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version - if filepath == 'README.md': - manifest_content = self._distribution_cache.release_readmes.get(pkg_name, None) - if manifest_content: - manifest_content = sanitize_and_truncate_docs(manifest_content) - self._distribution_cache.release_readmes[pkg_name] = manifest_content - logger.debug('Loading README.md for package "%s" from cache' % pkg_name) - elif filepath == 'CHANGELOG.rst': - manifest_content = self._distribution_cache.release_changelogs.get(pkg_name, None) - if manifest_content: + + # Load from cache + manifest_content = 
self._distribution_cache.release_resources.get(pkg_name, {}).get(filepath, None) + if manifest_content: + if filepath != 'package.xml': manifest_content = sanitize_and_truncate_docs(manifest_content) - self._distribution_cache.release_changelogs[pkg_name] = manifest_content - logger.debug('Loading CHANGELOG.rst for package "%s" from cache' % pkg_name) - else: - manifest_content = self._distribution_cache.release_package_xmls.get(pkg_name, None) - if manifest_content: - manifest_content = sanitize_xml(manifest_content) - self._distribution_cache.release_package_xmls[pkg_name] = manifest_content - logger.debug('Loading package.xml for package "%s" from cache' % pkg_name) + self._distribution_cache.release_resources[pkg_name][filepath] = manifest_content + logger.debug('Loading %s for package "%s" from cache' % (filepath, pkg_name) ) + if not manifest_content: # use manifest providers to lazy load for mp in self._manifest_providers or []: @@ -110,13 +101,11 @@ def __call__(self, dist_name, repo, pkg_name, filepath='package.xml'): logger.debug('Skipped "%s()": %s' % (mp.__name__, e)) if manifest_content is None: return None + # populate the cache - if filepath == 'README.md': - self._distribution_cache.release_readmes[pkg_name] = manifest_content - elif filepath == 'CHANGELOG.rst': - self._distribution_cache.release_changelogs[pkg_name] = manifest_content - else: - self._distribution_cache.release_package_xmls[pkg_name] = manifest_content + if pkg_name not in self._distribution_cache.release_resources: + self._distribution_cache.release_resources[pkg_name] = {} + self._distribution_cache.release_resources[pkg_name][filepath] = manifest_content return manifest_content From 93ab5cf54e3ac916a1653713498d5dc60ac3671a Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 08:18:55 +0000 Subject: [PATCH 41/51] improve visibility of progress on release updates --- src/rosdistro/distribution_cache.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 
deletions(-) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index 173e8ef3..960b7d41 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -112,14 +112,22 @@ def update_distribution(self, distribution_file_data): # remove all release package xmls where the package version has changed. print(f"- checking [{len(dist_file.release_packages.keys())}] release package cache entries for different versions") + dropped_count = 0 + skipped_count = 0 for pkg_name in sorted(dist_file.release_packages.keys()): if pkg_name not in self.distribution_file.release_packages: + logger.debug("Skipping %s because not in the distro." % pkg_name) + skipped_count += 1 continue - if pkg_name in self.release_package_xmls and self._get_repo_info(dist_file, pkg_name) != self._get_repo_info(self.distribution_file, pkg_name): - logger.debug("Dropping release package XML cache for %s" % pkg_name) - del self.release_package_xmls[pkg_name] - del self.release_readmes[pkg_name] - del self.release_changelogs[pkg_name] + if pkg_name in self.release_resources and self._get_repo_info(dist_file, pkg_name) != self._get_repo_info(self.distribution_file, pkg_name): + logger.debug("Dropping release resources package cache for %s" % pkg_name) + dropped_count += 1 + del self.release_resources[pkg_name] + + + sys.stdout.write('\n') + sys.stdout.write(f'Dropped {dropped_count} repositories\n') + sys.stdout.write(f'Skippted {skipped_count} repositories\n') # Remove all source package xmls where the devel branch is pointing to a different commit than # the one we have associated with our cache. This requires calling git ls-remote on all affected repos. 
From a69bdf538f7d8da44ef07a7263aa71cb119d2657 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 08:19:41 +0000 Subject: [PATCH 42/51] Update the loading process and self declaration --- src/rosdistro/distribution_cache.py | 45 ++++++++++++++++++----------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index 960b7d41..8453fa0d 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -48,25 +48,40 @@ class DistributionCache(object): def __init__(self, name, data=None, distribution_file_data=None): assert data or distribution_file_data + + # default value + inbound_version = 0 if data: assert 'type' in data, "Expected file type is '%s'" % DistributionCache._type assert data['type'] == DistributionCache._type, "Expected file type is '%s', not '%s'" % (DistributionCache._type, data['type']) assert 'version' in data, "Distribution cache file for '%s' lacks required version information" % name - self.version = int(data['version']) - assert self.version > 1, "Unable to handle '%s' format version '%d' anymore, please update your '%s' file to version '2'" % (DistributionCache._type, self.version, DistributionCache._type) - assert self.version == 2, "Unable to handle '%s' format version '%d', please update rosdistro (e.g. on Ubuntu/Debian use: sudo apt-get update && sudo apt-get install --only-upgrade python-rosdistro)" % (DistributionCache._type, self.version) + inbound_version = int(data['version']) + assert inbound_version > 1, "Unable to handle '%s' format version '%d' anymore, please update your '%s' file to version '2'" % (DistributionCache._type, inbound_version, DistributionCache._type) + assert inbound_version <= 3, "Unable to handle '%s' format version '%d', please update rosdistro (e.g. 
on Ubuntu/Debian use: sudo apt-get update && sudo apt-get install --only-upgrade python-rosdistro)" % (DistributionCache._type, inbound_version) assert 'name' in data, "Distribution cache file for '%s' lacks required name information" % name assert data['name'] == name, "Distribution cache file for '%s' does not match the name '%s'" % (name, data['name']) - else: - self.version = 2 + + # All data will be migrated forward on import, any rexport will be in version 3 + self.version = 3 self._distribution_file_data = data['distribution_file'] if data else distribution_file_data self.distribution_file = create_distribution_file(name, self._distribution_file_data) - self.release_package_xmls = data['release_package_xmls'] if data and 'release_package_xmls' in data else {} - self.release_readmes = data['release_readmes'] if data and 'release_readmes' in data else {} - self.release_changelogs = data['release_changelogs'] if data and 'release_changelogs' in data else {} + + # self.release_package_xmls = data['release_package_xmls'] if data and 'release_package_xmls' in data else {} + # self.release_readmes = data['release_readmes'] if data and 'release_readmes' in data else {} + # self.release_changelogs = data['release_changelogs'] if data and 'release_changelogs' in data else {} + self.release_resources = data['release_resources'] if data and 'release_resources' in data else {} + + # Format 2 backards compatability + # Convert release_package_xml from flat dict at the root to be an instance of a resource loaded + if inbound_version == 2 and 'release_package_xmls' in data: + for pkg_name, pkg_xml in data['release_package_xmls']: + if not pkg_name in data['release_resources']: + data['release_resources'][pkg_name] = {} + data['release_resources'][pkg_name]['package.xml'] = pkg_xml + self.source_repo_resources = {} if data and 'source_repo_resources' in data: for repo_name, repo_data in data['source_repo_resources'].items(): @@ -76,12 +91,10 @@ def __init__(self, name, 
data=None, distribution_file_data=None): def get_data(self): data = {} data['type'] = 'cache' - data['version'] = 2 + data['version'] = 3 data['name'] = self.distribution_file.name data['distribution_file'] = self._distribution_file_data - data['release_package_xmls'] = self.release_package_xmls - data['release_readmes'] = self.release_readmes - data['release_changelogs'] = self.release_changelogs + data['release_resources'] = self.release_resources data['source_repo_resources'] = dict([(repo_name, repo_cache.get_data()) for repo_name, repo_cache in self.source_repo_resources.items()]) return data @@ -202,9 +215,7 @@ def _get_repo_info(self, dist_file, pkg_name): return (repo.version, repo.url) def _remove_obsolete_entries(self): - for pkg_name in list(self.release_package_xmls.keys()): + for pkg_name in list(self.release_resources.keys()): if pkg_name not in self.distribution_file.release_packages: - print('- REMOVE', pkg_name) - del self.release_package_xmls[pkg_name] - del self.release_readmes[pkg_name] - del self.release_changelogs[pkg_name] + print('- REMOVE Release Resources for: ', pkg_name) + del self.release_resources[pkg_name] From 66cc8aad509570ef358953be31b7ff3ba85a1ba2 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 08:20:21 +0000 Subject: [PATCH 43/51] backwards compatibility for self.package_xmls --- src/rosdistro/release_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rosdistro/release_cache.py b/src/rosdistro/release_cache.py index 169dc707..a31f94b9 100644 --- a/src/rosdistro/release_cache.py +++ b/src/rosdistro/release_cache.py @@ -56,7 +56,8 @@ def __init__(self, name, data=None, distribution_file_data=None): self._distribution_file_data = data['distribution_file'] if data else distribution_file_data self.release_file = ReleaseFile(name, self._distribution_file_data) - self.package_xmls = data['release_package_xmls'] if data else {} + + self.package_xmls = {pkg_name: info['package.xml'] for 
pkg_name, info in data['release_resources'] if 'package.xml' in info} # for backward compatibility only def __getattr__(self, name): From c3c2cc6934b6020d69cf01a12e192780e4bd7f8e Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 08:21:05 +0000 Subject: [PATCH 44/51] deduplication logic is updated for the new structure --- src/rosdistro/manifest_provider/cache.py | 37 +++++++++++------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 6ea25f91..1bf733e8 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -140,24 +140,21 @@ def __call__(self, repo): for package_name, pkg_entries in repo_cache._data.items(): if package_name.startswith('_'): continue - if 'package.xml' in pkg_entries: - package_xml = sanitize_xml(pkg_entries['package.xml']) # TODO(tfoote) validate as unnecessary should be sanitized already on insert? - release_package_xml = self._distribution_cache.release_package_xmls.get(package_name, None) - if package_xml == release_package_xml: - logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction. Lines saved: {len(package_xml.splitlines())}') - repo_cache.add(package_name, pkg_entries['package_path'], release_package_xml, 'package.xml', increment_update_time=False) - - if 'CHANGELOG.rst' in pkg_entries: - changelog = sanitize_and_truncate_docs(pkg_entries['CHANGELOG.rst']) - release_changelog = self._distribution_cache.release_changelogs.get(package_name, None) - if changelog == release_changelog: - logger.debug(f'{package_name} Linking CHANGELOG.rst of source cache entry for compaction. 
Lines saved: {len(changelog.splitlines())}') - repo_cache.add(package_name, pkg_entries['package_path'], release_changelog, 'CHANGELOG.rst', increment_update_time=False) - if 'README.md' in pkg_entries: - readme = sanitize_and_truncate_docs(pkg_entries['README.md']) - release_readme = self._distribution_cache.release_readmes.get(package_name, None) - if readme == release_readme: - logger.debug(f'{package_name} Linking README.md of source cache entry for compaction. Lines saved: {len(readme.splitlines())}') - repo_cache.add(package_name, pkg_entries['package_path'], release_readme, 'README.md', increment_update_time=False) - + for resource_type in pkg_entries: + valid_types = ['CHANGELOG.rst', 'README.md', 'package.xml'] + if resource_type not in valid_types: + #TODO(tfoote) clean up this logic with magic values + continue + if 'package.xml' == resource_type: + package_xml = sanitize_xml(pkg_entries['package.xml']) # TODO(tfoote) validate as unnecessary should be sanitized already on insert? + release_package_xml = self._distribution_cache.release_resources.get(package_name, {}).get('package.xml', None) + if package_xml == release_package_xml: + logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction. Lines saved: {len(package_xml.splitlines())}') + repo_cache.add(package_name, pkg_entries['package_path'], release_package_xml, 'package.xml', increment_update_time=False) + else: + content = sanitize_and_truncate_docs(pkg_entries[resource_type]) + release_content = self._distribution_cache.release_resources.get(package_name, {}).get(resource_type, None) + if content == release_content: + logger.debug(f'{package_name} Linking {resource_type} of source cache entry for compaction. 
Lines saved: {len(content.splitlines())}') + repo_cache.add(package_name, pkg_entries['package_path'], release_content, resource_type, increment_update_time=False) return repo_cache From 03c22939f04435487e523de99a7a8e239490eaa9 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sat, 27 Dec 2025 08:22:13 +0000 Subject: [PATCH 45/51] clean up last update time logic --- src/rosdistro/source_repository_cache.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index 560727a5..e6fd8281 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -43,7 +43,10 @@ class SourceRepositoryCache(object): def __init__(self, data): assert data self._ref = data['_ref'] - self._last_update_time = data['_last_update_time'] if data['_last_update_time'] else None + if '_last_update_time' in data: + self._last_update_time = data['_last_update_time'] + else: + self._last_update_time = None non_package_keys = ['_ref', '_last_update_time'] self._package_names = set([name for name in data.keys() if name not in non_package_keys]) self._data = data @@ -76,7 +79,7 @@ def add(self, package_name, package_path, payload_string, payload_type='package. 
self._data[package_name][payload_type] = payload_string self._package_names.add(package_name) if increment_update_time: - self._data['_last_update_time'] = datetime.now() + self._data[package_name]['_last_update_time'] = datetime.now() def __iter__(self): From 454afa1e05912b7605bd802a054e5914efa9a7e0 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sun, 4 Jan 2026 04:35:26 +0000 Subject: [PATCH 46/51] Store the release_resources in the ReleaseCache --- src/rosdistro/release_cache.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/rosdistro/release_cache.py b/src/rosdistro/release_cache.py index a31f94b9..8dddf2f8 100644 --- a/src/rosdistro/release_cache.py +++ b/src/rosdistro/release_cache.py @@ -58,6 +58,7 @@ def __init__(self, name, data=None, distribution_file_data=None): self.release_file = ReleaseFile(name, self._distribution_file_data) self.package_xmls = {pkg_name: info['package.xml'] for pkg_name, info in data['release_resources'] if 'package.xml' in info} + self.release_resources = data['release_resources'] # for backward compatibility only def __getattr__(self, name): @@ -71,7 +72,8 @@ def get_data(self): data['version'] = 2 data['name'] = self.release_file.name data['distribution_file'] = self._distribution_file_data - data['package_xmls'] = self.package_xmls + data['package_xmls'] = self.package_xmls # backwards compatibility + data['release_resources'] = self.release_resources return data def update_distribution(self, distribution_file_data): From 33e6af51ecf4328e2ce03c386d3194c46e82bab7 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sun, 4 Jan 2026 06:30:24 +0000 Subject: [PATCH 47/51] fix backwards compat insertion --- src/rosdistro/distribution_cache.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index 8453fa0d..3081820d 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -77,7 +77,9 
@@ def __init__(self, name, data=None, distribution_file_data=None): # Format 2 backards compatability # Convert release_package_xml from flat dict at the root to be an instance of a resource loaded if inbound_version == 2 and 'release_package_xmls' in data: - for pkg_name, pkg_xml in data['release_package_xmls']: + if not 'release_resources' in data: + data['release_resources'] = {} + for pkg_name, pkg_xml in data['release_package_xmls'].items(): if not pkg_name in data['release_resources']: data['release_resources'][pkg_name] = {} data['release_resources'][pkg_name]['package.xml'] = pkg_xml From 813e7b6d245186f80b77890580d9d3a47223acba Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sun, 4 Jan 2026 06:32:19 +0000 Subject: [PATCH 48/51] add extra resource initialization to test mock --- test/test_manifest_providers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_manifest_providers.py b/test/test_manifest_providers.py index d0d96482..fe94d9f9 100644 --- a/test/test_manifest_providers.py +++ b/test/test_manifest_providers.py @@ -26,6 +26,7 @@ def test_cached(): class FakeDistributionCache(object): def __init__(self): self.release_package_xmls = {} + self.release_resources = {} dc = FakeDistributionCache() cache = CachedManifestProvider(dc, [rosdistro.manifest_provider.github.github_manifest_provider]) assert '' in cache('melodic', _genmsg_release_repo(), 'genmsg') From d6b5548eddffa80b4646caef876966b1a0a5ee27 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sun, 4 Jan 2026 06:53:37 +0000 Subject: [PATCH 49/51] add missing import --- src/rosdistro/manifest_provider/git.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index a97cf73e..8fcc10f7 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -40,6 +40,7 @@ from catkin_pkg.packages import find_package_paths from rosdistro.common import rmtree +from
rosdistro.manifest_provider.cache import sanitize_and_truncate_docs from rosdistro.source_repository_cache import SourceRepositoryCache from rosdistro.vcs import Git, ref_is_hash from rosdistro import logger From a86fe1052ecb43cb9a340f853f7d229e4a9ed812 Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sun, 4 Jan 2026 06:54:22 +0000 Subject: [PATCH 50/51] Iterate filepaths correctly --- src/rosdistro/manifest_provider/gitlab.py | 15 ++++++++------- src/rosdistro/manifest_provider/tar.py | 3 ++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/rosdistro/manifest_provider/gitlab.py b/src/rosdistro/manifest_provider/gitlab.py index 574a2f2d..69aa6717 100644 --- a/src/rosdistro/manifest_provider/gitlab.py +++ b/src/rosdistro/manifest_provider/gitlab.py @@ -147,12 +147,13 @@ def package_xml_in_parent(path): cache = SourceRepositoryCache.from_ref(sha) for package_xml_path in package_xml_paths: - resource_path = urlquote( - package_xml_path + '/' + filepath if package_xml_path else filepath, safe='') - resource = 'repository/files/' + resource_path + '/raw' - with _gitlab_api_query(server, path, resource, {'ref': sha}) as res: - package_xml = res.read().decode('utf-8') - name = parse_package_string(package_xml).name - cache.add(name, package_xml_path, package_xml, filepath) + for filepath in filepaths: + resource_path = urlquote( + package_xml_path + '/' + filepath if package_xml_path else filepath, safe='') + resource = 'repository/files/' + resource_path + '/raw' + with _gitlab_api_query(server, path, resource, {'ref': sha}) as res: + contents = res.read().decode('utf-8') + name = parse_package_string(contents).name + cache.add(name, package_xml_path, contents, filepath) return cache diff --git a/src/rosdistro/manifest_provider/tar.py b/src/rosdistro/manifest_provider/tar.py index 0ea58322..2f285637 100644 --- a/src/rosdistro/manifest_provider/tar.py +++ b/src/rosdistro/manifest_provider/tar.py @@ -100,7 +100,8 @@ def tar_source_manifest_provider(repo, 
filepaths=['package.xml']): name = parse_package_string(package_xml).name except InvalidPackage: raise RuntimeError('Unable to parse package.xml file found in %s' % repo.url) - cache.add(name, package_path, package_xml, filepath) + for filepath in filepaths: + cache.add(name, package_path, package_xml, filepath) return cache finally: From ef1ee32b09f6077ef54c447cce1bcfe3d5dc846e Mon Sep 17 00:00:00 2001 From: Tully Foote Date: Sun, 4 Jan 2026 06:54:59 +0000 Subject: [PATCH 51/51] update tests for new storage format --- test/test_manifest_providers.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/test/test_manifest_providers.py b/test/test_manifest_providers.py index fe94d9f9..1206ff1e 100644 --- a/test/test_manifest_providers.py +++ b/test/test_manifest_providers.py @@ -56,9 +56,8 @@ def test_git_source(): # This hash corresponds to the 0.5.11 tag. assert repo_cache.ref() == 'a189fc78558e7276df59d2961cfe4f8b4de08a8b' - package_path, package_xml = repo_cache['genmsg'] - assert '' == package_path - assert '0.5.11' in package_xml + assert '' == repo_cache['genmsg']['package_path'] + assert '0.5.11' in repo_cache['genmsg']['package.xml'] # mock_get_url_contents is used to mock out the '_get_url_contents' method in @@ -98,9 +97,8 @@ def test_github_source(): # This hash corresponds to the 0.5.7 tag. assert repo_cache.ref() == '81b66fe5eb00043c43894ddeee07e738d9b9712f' - package_path, package_xml = repo_cache['genmsg'] - assert '' == package_path - assert '0.5.11' in package_xml + assert '' == repo_cache['genmsg']['package_path'] + assert '0.5.11' in repo_cache['genmsg']['package.xml'] def test_gitlab_source(): @@ -109,16 +107,14 @@ def test_gitlab_source(): # This hash corresponds to the 1.0.3 tag. 
assert repo_cache.ref() == 'cd30853005ef3a591cb8594b4aa49f9ef400d30f' - package_path, package_xml = repo_cache['ros2trace_analysis'] - assert 'ros2trace_analysis' == package_path - assert '1.0.3' in package_xml + assert 'ros2trace_analysis' == repo_cache['ros2trace_analysis']['package_path'] + assert '1.0.3' in repo_cache['ros2trace_analysis']['package.xml'] def test_git_source_multi(): repo_cache = git_source_manifest_provider(_ros_source_repo()) assert repo_cache.ref() - package_path, package_xml = repo_cache['roslib'] - assert package_path == os.path.join('core', 'roslib') + assert repo_cache['roslib']['package_path'] == os.path.join('core', 'roslib') def test_tar_source(): @@ -126,9 +122,10 @@ def test_tar_source(): assert repo_cache.ref() is None - package_path, package_xml = repo_cache['genmsg'] - assert 'genmsg-0.5.16' == package_path - assert '0.5.16' in package_xml + print(repo_cache['genmsg']) + + assert 'genmsg-0.5.16' == repo_cache['genmsg']['package_path'] + assert '0.5.16' in repo_cache['genmsg']['package.xml'] def test_sanitize():