diff --git a/dev_test/rosdistro_cache_3.schema.json b/dev_test/rosdistro_cache_3.schema.json new file mode 100644 index 00000000..b05edd4b --- /dev/null +++ b/dev_test/rosdistro_cache_3.schema.json @@ -0,0 +1,84 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/ros-infrastructure/rosdistro/TODO/rosdistro_cache_3.schema.json", + "title": "ROSDistro Cache Format 3 Schema", + "description": "Cache Format 3 for rosdistro", + "type": "object", + "properties": { + "version": { + "description": "The version of the rosdistro cache", + "type": "integer", + "minimum": 3, + "maximum": 3 + }, + "type": { + "description": "Clarifying the file type", + "type": "string" + }, + "source_repo_resources": { + "type": "object", + "patternProperties": { + "^.*$": {"$ref": "#/$defs/repository_resources"} + } + }, + "release_resources": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/package_resources"}, + "properties": { + "_last_update_time": { + "type": "string" + }, + "version": { + "type": "string" + } + }, + "required": ["version"] + }, + "additionalProperties": false + }, + "distribution_file": { + "type": "object", + "description": "rosdistro distribution_file" + }, + "name": { + "type": "string", + "description": "The name of the distribution being cached" + }, + "required": ["source_repo_resources", "distribution_file", "name"], + "$defs": { + "repository_resources": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/package_resources"}, + "properties": { + "_last_update_time": { + "type": "string" + }, + "_ref": { + "type": "string" + } + }, + "required": ["_ref"] + }, + "package_resources": { + "type": "object", + "properties": { + "CHANGELOG.rst": { + "type": "string", + "description": "Contents of the CHANGELOG.rst if it's available" + }, + "package.xml": { + "type": "string", + "description": "Contents of the package.xml if it's available" + }, + "package_path": { + "type": "string", + 
"description": "The package_path if it's available" + }, + "README.md": { + "type": "string", + "description": "Contents of the README.md if it's available" + } + } + } + } +} diff --git a/setup.py b/setup.py index 81514361..e013258e 100755 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ # - src/rosdistro/__init__.py # - stdeb.cfg 'version': '1.0.1', - 'install_requires': ['PyYAML', 'setuptools'], + 'install_requires': ['PyYAML', 'setuptools', 'jsonschema'], 'python_requires': '>=3.6', 'packages': find_packages('src'), 'package_dir': {'': 'src'}, diff --git a/src/rosdistro/distribution.py b/src/rosdistro/distribution.py index 742cc638..914acc8d 100644 --- a/src/rosdistro/distribution.py +++ b/src/rosdistro/distribution.py @@ -57,14 +57,17 @@ def __init__(self, distribution_file, manifest_providers=None, source_manifest_p if source_manifest_providers is not None: self._source_manifest_providers = source_manifest_providers - self._release_package_xmls = {} - self._source_repo_package_xmls = {} + self._release_resources = {} + self._source_repo_resources = {} def __getattr__(self, name): return getattr(self._distribution_file, name) - def get_release_package_xml(self, pkg_name): - if pkg_name not in self._release_package_xmls: + + def get_release_resource(self, pkg_name, filepath): + if pkg_name not in self._release_resources: + self._release_resources[pkg_name] = {} + if not self._release_resources[pkg_name].get(filepath, None): pkg = self._distribution_file.release_packages[pkg_name] repo_name = pkg.repository_name repo = self._distribution_file.repositories[repo_name] @@ -73,33 +76,45 @@ def get_release_package_xml(self, pkg_name): repo = repo.release_repository if repo.version is None: return None - package_xml = None for mp in self._manifest_providers: - package_xml = mp(self._distribution_file.name, repo, pkg_name) - if package_xml is not None: + content = mp(self._distribution_file.name, repo, pkg_name, filepath) + if content is not None: break - 
self._release_package_xmls[pkg_name] = package_xml - return self._release_package_xmls[pkg_name] + self._release_resources[pkg_name][filepath] = content + return self._release_resources.get(pkg_name, {}).get(filepath, None) + + + def get_release_package_xml(self, pkg_name): + # TODO(tfoote) deprecated + return self.get_release_resource(pkg_name, 'package.xml') + + def get_release_readme(self, pkg_name): + # TODO(tfoote) deprecated + return self.get_release_resource(pkg_name, 'README.md') + + def get_release_changelog(self, pkg_name): + # TODO(tfoote) deprecated + return self.get_release_resource(pkg_name, 'CHANGELOG.rst') def get_source_package_xml(self, pkg_name): repo_name = self._distribution_file.source_packages[pkg_name].repository_name - repo_cache = self.get_source_repo_package_xmls(repo_name) + repo_cache = self.get_source_repo_resources(repo_name) if repo_cache: return repo_cache[pkg_name][1] else: return None - def get_source_repo_package_xmls(self, repo_name): - if repo_name in self._source_repo_package_xmls: - return self._source_repo_package_xmls[repo_name] + def get_source_repo_resources(self, repo_name): + if repo_name in self._source_repo_resources: + return self._source_repo_resources[repo_name] else: for mp in self._source_manifest_providers: repo_cache = mp(self.repositories[repo_name].source_repository) if repo_cache is not None: # Update map of package XMLs, and also list of known package names. 
- self._source_repo_package_xmls[repo_name] = repo_cache + self._source_repo_resources[repo_name] = repo_cache for pkg_name in repo_cache: if pkg_name[0] != '_': self._distribution_file.source_packages[pkg_name] = Package(pkg_name, repo_name) - return self._source_repo_package_xmls[repo_name] + return self._source_repo_resources[repo_name] return None diff --git a/src/rosdistro/distribution_cache.py b/src/rosdistro/distribution_cache.py index 42f416a1..3081820d 100644 --- a/src/rosdistro/distribution_cache.py +++ b/src/rosdistro/distribution_cache.py @@ -31,7 +31,9 @@ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +import datetime import sys +import time from . import logger from .distribution_file import create_distribution_file @@ -46,38 +48,57 @@ class DistributionCache(object): def __init__(self, name, data=None, distribution_file_data=None): assert data or distribution_file_data + + # default value + inbound_version = 0 if data: assert 'type' in data, "Expected file type is '%s'" % DistributionCache._type assert data['type'] == DistributionCache._type, "Expected file type is '%s', not '%s'" % (DistributionCache._type, data['type']) assert 'version' in data, "Distribution cache file for '%s' lacks required version information" % name - self.version = int(data['version']) - assert self.version > 1, "Unable to handle '%s' format version '%d' anymore, please update your '%s' file to version '2'" % (DistributionCache._type, self.version, DistributionCache._type) - assert self.version == 2, "Unable to handle '%s' format version '%d', please update rosdistro (e.g. 
on Ubuntu/Debian use: sudo apt-get update && sudo apt-get install --only-upgrade python-rosdistro)" % (DistributionCache._type, self.version) + inbound_version = int(data['version']) + assert inbound_version > 1, "Unable to handle '%s' format version '%d' anymore, please update your '%s' file to version '2'" % (DistributionCache._type, inbound_version, DistributionCache._type) + assert inbound_version <= 3, "Unable to handle '%s' format version '%d', please update rosdistro (e.g. on Ubuntu/Debian use: sudo apt-get update && sudo apt-get install --only-upgrade python-rosdistro)" % (DistributionCache._type, inbound_version) assert 'name' in data, "Distribution cache file for '%s' lacks required name information" % name assert data['name'] == name, "Distribution cache file for '%s' does not match the name '%s'" % (name, data['name']) - else: - self.version = 2 + + # All data will be migrated forward on import, any re-export will be in version 3 + self.version = 3 self._distribution_file_data = data['distribution_file'] if data else distribution_file_data self.distribution_file = create_distribution_file(name, self._distribution_file_data) - self.release_package_xmls = data['release_package_xmls'] if data else {} - self.source_repo_package_xmls = {} - if data and 'source_repo_package_xmls' in data: - for repo_name, repo_data in data['source_repo_package_xmls'].items(): - self.source_repo_package_xmls[repo_name] = SourceRepositoryCache(repo_data) + + # self.release_package_xmls = data['release_package_xmls'] if data and 'release_package_xmls' in data else {} + # self.release_readmes = data['release_readmes'] if data and 'release_readmes' in data else {} + # self.release_changelogs = data['release_changelogs'] if data and 'release_changelogs' in data else {} + self.release_resources = data['release_resources'] if data and 'release_resources' in data else {} + + # Format 2 backwards compatibility + # Convert release_package_xml from flat dict at the root to be an instance of
a resource loaded + if inbound_version == 2 and 'release_package_xmls' in data: + if not 'release_resources' in data: + data['release_resources'] = {} + for pkg_name, pkg_xml in data['release_package_xmls'].items(): + if not pkg_name in data['release_resources']: + data['release_resources'][pkg_name] = {} + data['release_resources'][pkg_name]['package.xml'] = pkg_xml + + self.source_repo_resources = {} + if data and 'source_repo_resources' in data: + for repo_name, repo_data in data['source_repo_resources'].items(): + self.source_repo_resources[repo_name] = SourceRepositoryCache(repo_data) self.distribution_file.source_packages = self.get_source_packages() def get_data(self): data = {} data['type'] = 'cache' - data['version'] = 2 + data['version'] = 3 data['name'] = self.distribution_file.name data['distribution_file'] = self._distribution_file_data - data['release_package_xmls'] = self.release_package_xmls - data['source_repo_package_xmls'] = dict([(repo_name, repo_cache.get_data()) - for repo_name, repo_cache in self.source_repo_package_xmls.items()]) + data['release_resources'] = self.release_resources + data['source_repo_resources'] = dict([(repo_name, repo_cache.get_data()) + for repo_name, repo_cache in self.source_repo_resources.items()]) return data def update_distribution(self, distribution_file_data): @@ -105,19 +126,32 @@ def update_distribution(self, distribution_file_data): dist_file = create_distribution_file(self.distribution_file.name, self._distribution_file_data) # remove all release package xmls where the package version has changed. - print("- removing invalid release package cache entries.") + print(f"- checking [{len(dist_file.release_packages.keys())}] release package cache entries for different versions") + dropped_count = 0 + skipped_count = 0 for pkg_name in sorted(dist_file.release_packages.keys()): if pkg_name not in self.distribution_file.release_packages: + logger.debug("Skipping %s because not in the distro." 
% pkg_name) + skipped_count += 1 continue - if pkg_name in self.release_package_xmls and self._get_repo_info(dist_file, pkg_name) != self._get_repo_info(self.distribution_file, pkg_name): - logger.debug("Dropping release package XML cache for %s" % pkg_name) - del self.release_package_xmls[pkg_name] + if pkg_name in self.release_resources and self._get_repo_info(dist_file, pkg_name) != self._get_repo_info(self.distribution_file, pkg_name): + logger.debug("Dropping release resources package cache for %s" % pkg_name) + dropped_count += 1 + del self.release_resources[pkg_name] + + + sys.stdout.write('\n') + sys.stdout.write(f'Dropped {dropped_count} repositories\n') + sys.stdout.write(f'Skipped {skipped_count} repositories\n') # Remove all source package xmls where the devel branch is pointing to a different commit than # the one we have associated with our cache. This requires calling git ls-remote on all affected repos. - if self.source_repo_package_xmls: - print("- checking invalid source repo cache entries.") - for repo in sorted(self.source_repo_package_xmls.keys()): + if self.source_repo_resources: + start_time = time.perf_counter() + dropped_count = 0 + skipped_count = 0 + print(f"- checking [{len(self.source_repo_resources.keys())}] source repo cache entries without source entries, requires ls-remote") + for repo in sorted(self.source_repo_resources.keys()): sys.stdout.write('.') sys.stdout.flush() try: @@ -126,9 +160,18 @@ def update_distribution(self, distribution_file_data): # The repo entry has been dropped, or the source stanza from it has been dropped, # either way, remove the cache entries associated with this repository. logger.debug('Unable to find source repository info for repo "%s".'
% repo) - del self.source_repo_package_xmls[repo] + del self.source_repo_resources[repo] continue + min_update_delta = 1 * 60 * 60 # TODO(tfoote) magic number make into a parameter + if '_last_update_time' in self.source_repo_resources[repo]: + now = datetime.datetime.now() + entry_age = (now - self.source_repo_resources[repo]['_last_update_time']).total_seconds() + if entry_age < min_update_delta: + logger.debug(f'Skipping check of {repo} because it was last updated only {entry_age} seconds ago less than {min_update_delta}') + skipped_count += 1 + continue + if ref_is_hash(source_repository.version): source_hash = source_repository.version else: @@ -136,17 +179,23 @@ def update_distribution(self, distribution_file_data): if result['returncode'] != 0 or not result['output']: # Error checking remote, or unable to find remote reference. Drop the cache entry. logger.debug("Unable to check hash for branch %s of %s, dropping cache entry." % (source_repository.version, source_repository.url)) - del self.source_repo_package_xmls[repo] + del self.source_repo_resources[repo] + dropped_count += 1 continue # Split by line first and take the last line, to squelch any preamble output, for example # a known host key validation notice. source_hash = result['output'].split('\n')[-1].split('\t')[0] - cached_hash = self.source_repo_package_xmls[repo].ref() + cached_hash = self.source_repo_resources[repo].ref() if source_hash != cached_hash: logger.debug('Repo "%s" has moved from %s to %s, dropping cache.'
% (repo, cached_hash, source_hash)) - del self.source_repo_package_xmls[repo] + del self.source_repo_resources[repo] + dropped_count += 1 sys.stdout.write('\n') + sys.stdout.write(f'Dropped {dropped_count} repositories\n') + sys.stdout.write(f'Skipped {skipped_count} repositories\n') + end_time = time.perf_counter() + logger.debug(f'Check of invalid source repo cache entries took {(end_time - start_time):.1f} seconds') self.distribution_file = dist_file self.distribution_file.source_packages = self.get_source_packages() @@ -157,7 +206,7 @@ def update_distribution(self, distribution_file_data): def get_source_packages(self): """ Returns dictionary mapping source package names to Package() objects. """ package_dict = {} - for source_repo_name, source_repo in self.source_repo_package_xmls.items(): + for source_repo_name, source_repo in self.source_repo_resources.items(): for pkg_name in source_repo: package_dict[pkg_name] = Package(pkg_name, source_repo_name) return package_dict @@ -168,7 +217,7 @@ def _get_repo_info(self, dist_file, pkg_name): return (repo.version, repo.url) def _remove_obsolete_entries(self): - for pkg_name in list(self.release_package_xmls.keys()): + for pkg_name in list(self.release_resources.keys()): if pkg_name not in self.distribution_file.release_packages: - print('- REMOVE', pkg_name) - del self.release_package_xmls[pkg_name] + print('- REMOVE Release Resources for: ', pkg_name) + del self.release_resources[pkg_name] diff --git a/src/rosdistro/distribution_cache_generator.py b/src/rosdistro/distribution_cache_generator.py index 8646fe55..c637db3b 100644 --- a/src/rosdistro/distribution_cache_generator.py +++ b/src/rosdistro/distribution_cache_generator.py @@ -78,22 +78,26 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F include_source=include_source) print('- fetch missing release manifests') + max_source_packages = 100000 # TODO(tfoote) magic number move to config + max_source_repos = 10000 # TODO(tfoote)
magic number move to config errors = [] - for pkg_name in sorted(dist.release_packages.keys()): + if debug and (len(dist.release_packages.keys()) > max_source_packages): + print(f' - limiting packages scanned to {max_source_packages} of {len(dist.release_packages.keys())} as per config') + for pkg_name in sorted(dist.release_packages.keys())[:max_source_packages]: repo = dist.repositories[dist.release_packages[pkg_name].repository_name].release_repository if repo.version is None: if debug: print(' - skip "%s" since it has no version' % pkg_name) continue if debug: - print(' - fetch "%s"' % pkg_name) + print(' - dist cache fetch "%s"' % pkg_name) else: sys.stdout.write('.') sys.stdout.flush() # check that package.xml is fetchable old_package_xml = None - if cache and pkg_name in cache.release_package_xmls: - old_package_xml = cache.release_package_xmls[pkg_name] + if cache and pkg_name in cache.release_resources: + old_package_xml = cache.release_resources[pkg_name].get('package.xml', None) package_xml = dist.get_release_package_xml(pkg_name) if not package_xml: errors.append('%s: missing package.xml file for package "%s"' % (dist_name, pkg_name)) @@ -111,16 +115,36 @@ def generate_distribution_cache(index, dist_name, preclean=False, ignore_local=F if package_xml != old_package_xml: print(" - updated manifest of package '%s' to version '%s'" % (pkg_name, pkg.version)) + old_readme = None + if cache and pkg_name in cache.release_resources: + old_readme = cache.release_resources[pkg_name].get('README.md', None) + readme = dist.get_release_resource(pkg_name, 'README.md') + + if readme != old_readme: + print(" - updated README.md of package '%s'" % (pkg_name)) + + old_changelog = None + if cache and pkg_name in cache.release_resources: + old_changelog = cache.release_resources[pkg_name].get('CHANGELOG.rst', None) + changelog = dist.get_release_resource(pkg_name, 'CHANGELOG.rst') + + if changelog != old_changelog: + print(" - updated CHANGELOG.rst of package '%s'" % 
(pkg_name)) + + + if not debug: print('') if include_source: print('- fetch source repository manifests') - for repo_name in sorted(dist.repositories.keys()): + if debug and len(dist.repositories.keys()) > max_source_repos: + print(f' - limiting repositories scanned to {max_source_repos} of {len(dist.repositories.keys())} as per config') + for repo_name in sorted(dist.repositories.keys())[:max_source_repos]: if dist.repositories[repo_name].source_repository: - dist.get_source_repo_package_xmls(repo_name) + dist.get_source_repo_resources(repo_name) if debug: - print(' - fetch "%s"' % repo_name) + print(' - dist cache source fetch "%s"' % repo_name) else: sys.stdout.write('.') sys.stdout.flush() @@ -152,8 +176,8 @@ def __init__(self, *args, **kwargs): super(CacheYamlDumper, self).__init__(*args, **kwargs) def ignore_aliases(self, content): - """ Allow strings that look like package XML to alias to each other in the YAML output. """ - return not (isinstance(content, str) and ' 300) # TODO(tfoote) magic number move to config def represent_mapping(self, tag, mapping, flow_style=False): """ Gives compact representation for the distribution_file section, while allowing the package @@ -199,7 +223,7 @@ def _get_cached_distribution(index, dist_name, preclean=False, ignore_local=Fals # if we're not including the source portion of the cache, strip it out of the existing cache # in order to skip the potentially lengthy cache invalidation process. if not include_source: - cache.source_repo_package_xmls = {} + cache.source_repo_resources = {} # update cache with current distribution file, which filters existing cache by validity. 
cache.update_distribution(rel_file_data) else: diff --git a/src/rosdistro/manifest_provider/bitbucket.py b/src/rosdistro/manifest_provider/bitbucket.py index b514965d..a1364554 100644 --- a/src/rosdistro/manifest_provider/bitbucket.py +++ b/src/rosdistro/manifest_provider/bitbucket.py @@ -49,7 +49,7 @@ BITBUCKET_PASSWORD = os.getenv('BITBUCKET_PASSWORD', None) -def bitbucket_manifest_provider(_dist_name, repo, pkg_name): +def bitbucket_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version server, path = repo.get_url_parts() @@ -62,7 +62,7 @@ def bitbucket_manifest_provider(_dist_name, repo, pkg_name): if not repo.has_remote_tag(release_tag): raise RuntimeError('specified tag "%s" is not a git tag' % release_tag) - url = 'https://bitbucket.org/%s/raw/%s/package.xml' % (path, release_tag) + url = 'https://bitbucket.org/%s/raw/%s/%s' % (path, release_tag, filepath) try: logger.debug('Load package.xml file from url "%s"' % url) req = Request(url) diff --git a/src/rosdistro/manifest_provider/cache.py b/src/rosdistro/manifest_provider/cache.py index 85de71a8..1bf733e8 100644 --- a/src/rosdistro/manifest_provider/cache.py +++ b/src/rosdistro/manifest_provider/cache.py @@ -35,6 +35,13 @@ from rosdistro import logger +def sanitize_and_truncate_docs(doc_string, max_length=100): + # Remove trailing whitespace then truncate + lines = doc_string.rstrip().splitlines() + ending = '' + if len(lines) > max_length: + ending = f'\nTruncated content at {max_length} of {len(lines)} lines' + return '\n'.join(lines[:max_length - 1 ]) + ending def sanitize_xml(xml_string): """ Returns a version of the supplied XML string with comments and all whitespace stripped, @@ -68,27 +75,38 @@ def __init__(self, distribution_cache, manifest_providers=None): self._distribution_cache = distribution_cache self._manifest_providers = manifest_providers - def __call__(self, dist_name, repo, pkg_name): + def __call__(self, dist_name, repo, pkg_name, 
filepath='package.xml'): assert repo.version - package_xml = self._distribution_cache.release_package_xmls.get(pkg_name, None) - if package_xml: - package_xml = sanitize_xml(package_xml) - self._distribution_cache.release_package_xmls[pkg_name] = package_xml - logger.debug('Loading package.xml for package "%s" from cache' % pkg_name) - else: + + # Load from cache + manifest_content = self._distribution_cache.release_resources.get(pkg_name, {}).get(filepath, None) + if manifest_content: + if filepath != 'package.xml': + manifest_content = sanitize_and_truncate_docs(manifest_content) + self._distribution_cache.release_resources[pkg_name][filepath] = manifest_content + logger.debug('Loading %s for package "%s" from cache' % (filepath, pkg_name) ) + + if not manifest_content: # use manifest providers to lazy load for mp in self._manifest_providers or []: try: - package_xml = sanitize_xml(mp(dist_name, repo, pkg_name)) + manifest_content = mp(dist_name, repo, pkg_name, filepath) + if filepath == 'package.xml': + manifest_content = sanitize_xml(manifest_content) + else: + manifest_content = sanitize_and_truncate_docs(manifest_content) break except Exception as e: # pass and try next manifest provider logger.debug('Skipped "%s()": %s' % (mp.__name__, e)) - if package_xml is None: + if manifest_content is None: return None + # populate the cache - self._distribution_cache.release_package_xmls[pkg_name] = package_xml - return package_xml + if pkg_name not in self._distribution_cache.release_resources: + self._distribution_cache.release_resources[pkg_name] = {} + self._distribution_cache.release_resources[pkg_name][filepath] = manifest_content + return manifest_content class CachedSourceManifestProvider(object): @@ -99,18 +117,19 @@ def __init__(self, distribution_cache, source_manifest_providers=None): def __call__(self, repo): assert repo.url - repo_cache = self._distribution_cache.source_repo_package_xmls.get(repo.name, None) + repo_cache = 
self._distribution_cache.source_repo_resources.get(repo.name, None) if not repo_cache: + logger.debug(f"Internal Cache Miss for {repo.name} Loading from Source Manifest Providers") # Use manifest providers to lazy load for mp in self._source_manifest_providers or []: try: - repo_cache = mp(repo) + repo_cache = mp(repo, filepaths=['CHANGELOG.rst', 'README.md']) # TODO (tfoote) list other files here except Exception as e: # pass and try next manifest provider logger.debug('Skipped "%s()": %s' % (mp.__name__, e)) continue - self._distribution_cache.source_repo_package_xmls[repo.name] = repo_cache + self._distribution_cache.source_repo_resources[repo.name] = repo_cache break else: logger.debug('Load package XMLs for repo "%s" from cache' % repo.name) @@ -118,11 +137,24 @@ def __call__(self, repo): # De-duplicate with the release package XMLs. This will cause the YAML writer # to use references for the common strings, saving a lot of space in the cache file. if repo_cache: - for package_name, package_path, package_xml in repo_cache.items(): - package_xml = sanitize_xml(package_xml) - release_package_xml = self._distribution_cache.release_package_xmls.get(package_name, None) - if package_xml == release_package_xml: - package_xml = release_package_xml - repo_cache.add(package_name, package_path, package_xml) - + for package_name, pkg_entries in repo_cache._data.items(): + if package_name.startswith('_'): + continue + for resource_type in pkg_entries: + valid_types = ['CHANGELOG.rst', 'README.md', 'package.xml'] + if resource_type not in valid_types: + #TODO(tfoote) clean up this logic with magic values + continue + if 'package.xml' == resource_type: + package_xml = sanitize_xml(pkg_entries['package.xml']) # TODO(tfoote) validate as unnecessary should be sanitized already on insert?
+ release_package_xml = self._distribution_cache.release_resources.get(package_name, {}).get('package.xml', None) + if package_xml == release_package_xml: + logger.debug(f'{package_name} Linking package.xml of source cache entry for compaction. Lines saved: {len(package_xml.splitlines())}') + repo_cache.add(package_name, pkg_entries['package_path'], release_package_xml, 'package.xml', increment_update_time=False) + else: + content = sanitize_and_truncate_docs(pkg_entries[resource_type]) + release_content = self._distribution_cache.release_resources.get(package_name, {}).get(resource_type, None) + if content == release_content: + logger.debug(f'{package_name} Linking {resource_type} of source cache entry for compaction. Lines saved: {len(content.splitlines())}') + repo_cache.add(package_name, pkg_entries['package_path'], release_content, resource_type, increment_update_time=False) return repo_cache diff --git a/src/rosdistro/manifest_provider/git.py b/src/rosdistro/manifest_provider/git.py index 6e9e5e60..8fcc10f7 100644 --- a/src/rosdistro/manifest_provider/git.py +++ b/src/rosdistro/manifest_provider/git.py @@ -40,45 +40,59 @@ from catkin_pkg.packages import find_package_paths from rosdistro.common import rmtree +from rosdistro.manifest_provider.cache import sanitize_and_truncate_docs from rosdistro.source_repository_cache import SourceRepositoryCache from rosdistro.vcs import Git, ref_is_hash +from rosdistro import logger -def git_manifest_provider(_dist_name, repo, pkg_name): +def git_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version try: release_tag = repo.get_release_tag(pkg_name) with _temp_git_clone(repo.url, release_tag) as git_repo_path: - filename = os.path.join(git_repo_path, 'package.xml') + filename = os.path.join(git_repo_path, filepath) if not os.path.exists(filename): - raise RuntimeError('Could not find package.xml in repository "%s"' % repo.url) + raise RuntimeError('Could not find %s in repository "%s"' % 
(filepath, repo.url)) with open(filename, 'r') as f: return f.read() except Exception as e: - raise RuntimeError('Unable to fetch package.xml: %s' % e) + raise RuntimeError('Unable to fetch %s: %s' % (filepath, e)) -def git_source_manifest_provider(repo): +def git_source_manifest_provider(repo, filepaths=['CHANGELOG.rst', 'README.md']): + xmlpath = 'package.xml' # TODO(tfoote) use filepaths, currently using the package special to get the names try: with _temp_git_clone(repo.url, repo.version) as git_repo_path: + logger.debug(f'Cloning repository {repo.url} to get source info') # Include the git hash in our cache dictionary. git_hash = Git(git_repo_path).command('rev-parse', 'HEAD')['output'] cache = SourceRepositoryCache.from_ref(git_hash) - # Find package.xml files inside the repo. + # Find filepath files inside the repo. for package_path in find_package_paths(git_repo_path): if package_path == '.': package_path = '' - with open(os.path.join(git_repo_path, package_path, 'package.xml'), 'r') as f: + with open(os.path.join(git_repo_path, package_path, xmlpath), 'r') as f: package_xml = f.read() try: name = parse_package_string(package_xml).name except InvalidPackage: - raise RuntimeError('Unable to parse package.xml file found in %s' % repo.url) - cache.add(name, package_path, package_xml) + raise RuntimeError('Unable to parse %s file found in %s' % (xmlpath, repo.url)) + cache.add(name, package_path, package_xml, xmlpath) + for filepath in filepaths: + repo_filename = os.path.join(git_repo_path, package_path, filepath) + if not os.path.exists(repo_filename): + logger.debug(f'- git load of {filepath} from {repo.url} at {repo.version} skipped because it did not exist.') + continue + with open(repo_filename, 'r') as f: + logger.debug('- git load %s from %s' % (filepath, repo_filename)) + contents = f.read() + contents = sanitize_and_truncate_docs(contents) # TODO(tfoote) Do this later so it doesn't need to be in all manifest providers + cache.add(name, package_path, 
contents, filepath) except Exception as e: - raise RuntimeError('Unable to fetch source package.xml files: %s' % e) + raise RuntimeError('Unable to fetch source files: %s' % (e)) return cache diff --git a/src/rosdistro/manifest_provider/github.py b/src/rosdistro/manifest_provider/github.py index 4074e61c..ae20b91c 100644 --- a/src/rosdistro/manifest_provider/github.py +++ b/src/rosdistro/manifest_provider/github.py @@ -34,7 +34,10 @@ import base64 import json import os +import time + from urllib.request import urlopen, Request +from urllib.error import HTTPError from urllib.error import URLError from catkin_pkg.package import parse_package_string @@ -42,13 +45,29 @@ from rosdistro.source_repository_cache import SourceRepositoryCache from rosdistro import logger +from rosdistro.manifest_provider.cache import sanitize_and_truncate_docs + GITHUB_USER = os.getenv('GITHUB_USER', None) GITHUB_PASSWORD = os.getenv('GITHUB_PASSWORD', None) +GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', None) + +ROSDISTRO_RETRY_MAX = int(os.getenv('ROSDISTRO_RETRY_MAX', '120')) def _get_url_contents(url): + backoff = 1 + while backoff < ROSDISTRO_RETRY_MAX: + try: + return urlopen(url).read().decode('utf-8') + except HTTPError as e: + if e.code != 403: + raise e + logger.debug(f'Fetch of {url.full_url} failed with 403, assuming rate limit, retrying after period {backoff} seconds.') + time.sleep(backoff) + backoff *= 1.5 + # Last return after timeout to collect the error object again. 
return urlopen(url).read().decode('utf-8') -def github_manifest_provider(_dist_name, repo, pkg_name): +def github_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version server, path = repo.get_url_parts() if not server.endswith('github.com'): @@ -60,16 +79,22 @@ def github_manifest_provider(_dist_name, repo, pkg_name): if not repo.has_remote_tag(release_tag): raise RuntimeError('specified tag "%s" is not a git tag' % release_tag) - url = 'https://raw.githubusercontent.com/%s/%s/package.xml' % (path, release_tag) + url = 'https://raw.githubusercontent.com/%s/%s/%s' % (path, release_tag, filepath) try: - logger.debug('Load package.xml file from url "%s"' % url) - return _get_url_contents(url) + logger.debug('Load %s file from url "%s"' % (filepath, url)) + return '\n'.join( _get_url_contents(url).splitlines()) + except HTTPError as e: + if e.code == 404: + logger.debug('- File not found (%s), trying "%s"' % (e, url)) + return 'Missing' + logger.debug('- HTTP ERROR (%s), trying "%s"' % (e, url)) + raise e except URLError as e: logger.debug('- failed (%s), trying "%s"' % (e, url)) raise RuntimeError() -def github_source_manifest_provider(repo): +def github_source_manifest_provider(repo, filepaths=['CHANGELOG.rst', 'README.md']): server, path = repo.get_url_parts() if not server.endswith('github.com'): logger.debug('Skip non-github url "%s"' % repo.url) @@ -77,6 +102,8 @@ def github_source_manifest_provider(repo): tree_url = 'https://api.github.com/repos/%s/git/trees/%s?recursive=1' % (path, repo.version) req = Request(tree_url) + if GITHUB_TOKEN: + req.add_header("Authorization", f"Bearer {GITHUB_TOKEN}") if GITHUB_USER and GITHUB_PASSWORD: logger.debug('- using http basic auth from supplied environment variables.') credential_pair = '%s:%s' % (GITHUB_USER, GITHUB_PASSWORD) @@ -93,9 +120,10 @@ def github_source_manifest_provider(repo): package_xml_paths = set() for obj in tree_json['tree']: - if obj['path'].split('/')[-1] == 
'package.xml': + if obj['path'].split('/')[-1] == 'package.xml': # Actually package.xml to find packages instead of filepath package_xml_paths.add(os.path.dirname(obj['path'])) + # TODO(tfoote) This is not correct for non-package.xml # Filter out ones that are inside other packages (eg, part of tests) def package_xml_in_parent(path): if path == '': @@ -111,11 +139,27 @@ def package_xml_in_parent(path): cache = SourceRepositoryCache.from_ref(tree_json['sha']) for package_xml_path in package_xml_paths: + package_xml_filename = 'package.xml' url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ - (path, cache.ref(), package_xml_path + '/package.xml' if package_xml_path else 'package.xml') - logger.debug('- load package.xml from %s' % url) + (path, cache.ref(), package_xml_path + '/' + package_xml_filename if package_xml_path else package_xml_filename) + logger.debug('- load %s from %s' % (package_xml_filename, url)) package_xml = _get_url_contents(url) name = parse_package_string(package_xml).name - cache.add(name, package_xml_path, package_xml) + cache.add(name, package_xml_path, package_xml, package_xml_filename) + for filepath in filepaths: + url = 'https://raw.githubusercontent.com/%s/%s/%s' % \ + (path, cache.ref(), package_xml_path + '/' + filepath if package_xml_path else filepath) + logger.debug('- load %s from %s' % (filepath, url)) + try: + contents = _get_url_contents(url) + except HTTPError as e: + if e.code == 404: + logger.debug('- Recording Missing (%s), hit error "%s"' % (url, e)) + contents = 'Missing' + else: + logger.debug('- HTTP ERROR (%s), trying "%s"' % (e, url)) + raise e + contents = sanitize_and_truncate_docs(contents) # TODO(tfoote) Do this later so it doesn't need to be in all manifest providers + cache.add(name, package_xml_path, contents, filepath) return cache diff --git a/src/rosdistro/manifest_provider/gitlab.py b/src/rosdistro/manifest_provider/gitlab.py index 88450453..69aa6717 100644 --- a/src/rosdistro/manifest_provider/gitlab.py 
+++ b/src/rosdistro/manifest_provider/gitlab.py @@ -89,14 +89,14 @@ def _gitlab_paged_api_query(server, path, resource, attrs): url = match.group(1) -def gitlab_manifest_provider(_dist_name, repo, pkg_name): +def gitlab_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.version server, path = repo.get_url_parts() if not server.endswith('gitlab.com') and server != ROSDISTRO_GITLAB_SERVER: logger.debug('Skip non-gitlab url "%s"' % repo.url) raise RuntimeError('can not handle non gitlab urls') - resource = 'repository/files/package.xml/raw' + resource = 'repository/files/%s/raw' % filepath attrs = { 'ref': repo.get_release_tag(pkg_name), } @@ -108,7 +108,7 @@ def gitlab_manifest_provider(_dist_name, repo, pkg_name): raise -def gitlab_source_manifest_provider(repo): +def gitlab_source_manifest_provider(repo, filepaths=['package.xml']): assert repo.version server, path = repo.get_url_parts() if not server.endswith('gitlab.com') and server != ROSDISTRO_GITLAB_SERVER: @@ -147,12 +147,13 @@ def package_xml_in_parent(path): cache = SourceRepositoryCache.from_ref(sha) for package_xml_path in package_xml_paths: - resource_path = urlquote( - package_xml_path + '/package.xml' if package_xml_path else 'package.xml', safe='') - resource = 'repository/files/' + resource_path + '/raw' - with _gitlab_api_query(server, path, resource, {'ref': sha}) as res: - package_xml = res.read().decode('utf-8') - name = parse_package_string(package_xml).name - cache.add(name, package_xml_path, package_xml) + for filepath in filepaths: + resource_path = urlquote( + package_xml_path + '/' + filepath if package_xml_path else filepath, safe='') + resource = 'repository/files/' + resource_path + '/raw' + with _gitlab_api_query(server, path, resource, {'ref': sha}) as res: + contents = res.read().decode('utf-8') + name = parse_package_string(contents).name + cache.add(name, package_xml_path, contents, filepath) return cache diff --git 
a/src/rosdistro/manifest_provider/tar.py b/src/rosdistro/manifest_provider/tar.py index 8cd038c7..2f285637 100644 --- a/src/rosdistro/manifest_provider/tar.py +++ b/src/rosdistro/manifest_provider/tar.py @@ -48,7 +48,7 @@ _TAR_USER = os.getenv('TAR_USER', None) _TAR_PASSWORD = os.getenv('TAR_PASSWORD', None) -def tar_manifest_provider(_dist_name, repo, pkg_name): +def tar_manifest_provider(_dist_name, repo, pkg_name, filepath='package.xml'): assert repo.type == 'tar' subdir = repo.get_release_tag(pkg_name) @@ -65,11 +65,11 @@ def tar_manifest_provider(_dist_name, repo, pkg_name): response = urlopen(request) with tarfile.open(fileobj=io.BytesIO(response.read())) as tar: - package_xml = tar.extractfile(subdir + '/package.xml').read() + package_xml = tar.extractfile(subdir + '/' + filepath).read() return package_xml.decode('utf-8') -def tar_source_manifest_provider(repo): +def tar_source_manifest_provider(repo, filepaths=['package.xml']): assert repo.type == 'tar' try: @@ -100,7 +100,8 @@ def tar_source_manifest_provider(repo): name = parse_package_string(package_xml).name except InvalidPackage: raise RuntimeError('Unable to parse package.xml file found in %s' % repo.url) - cache.add(name, package_path, package_xml) + for filepath in filepaths: + cache.add(name, package_path, package_xml, filepath) return cache finally: diff --git a/src/rosdistro/release_cache.py b/src/rosdistro/release_cache.py index 169dc707..8dddf2f8 100644 --- a/src/rosdistro/release_cache.py +++ b/src/rosdistro/release_cache.py @@ -56,7 +56,9 @@ def __init__(self, name, data=None, distribution_file_data=None): self._distribution_file_data = data['distribution_file'] if data else distribution_file_data self.release_file = ReleaseFile(name, self._distribution_file_data) - self.package_xmls = data['release_package_xmls'] if data else {} + + self.package_xmls = {pkg_name: info['package.xml'] for pkg_name, info in (data['release_resources'] if data else {}).items() if 'package.xml' in info} + self.release_resources = 
data['release_resources'] if data else {} # for backward compatibility only def __getattr__(self, name): @@ -70,7 +72,8 @@ def get_data(self): data['version'] = 2 data['name'] = self.release_file.name data['distribution_file'] = self._distribution_file_data - data['package_xmls'] = self.package_xmls + data['package_xmls'] = self.package_xmls # backwards compatibility + data['release_resources'] = self.release_resources return data def update_distribution(self, distribution_file_data): diff --git a/src/rosdistro/release_cache_generator.py b/src/rosdistro/release_cache_generator.py index a274bea8..790da8f0 100644 --- a/src/rosdistro/release_cache_generator.py +++ b/src/rosdistro/release_cache_generator.py @@ -78,7 +78,7 @@ def generate_release_cache(index, dist_name, preclean=False, debug=False): print(' - skip "%s" since it has no version' % pkg_name) continue if debug: - print(' - fetch "%s"' % pkg_name) + print(' - release cache fetch "%s"' % pkg_name) else: sys.stdout.write('.') sys.stdout.flush() diff --git a/src/rosdistro/source_repository_cache.py b/src/rosdistro/source_repository_cache.py index e94b9cc5..e6fd8281 100644 --- a/src/rosdistro/source_repository_cache.py +++ b/src/rosdistro/source_repository_cache.py @@ -31,18 +31,24 @@ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +from datetime import datetime class SourceRepositoryCache(object): """ - This class represents a cache of the package XML strings for all packages in a single + This class represents a cache of the resource strings for all packages in a single repo at a particular moment in time. A dictionary of many of these (one for each repo) - keyed to the repo name represents the totality of the source package xml cache. + keyed to the repo name represents the totality of the source package resources cache. 
""" def __init__(self, data): assert data self._ref = data['_ref'] - self._package_names = set([name for name in data.keys() if name != '_ref']) + if '_last_update_time' in data: + self._last_update_time = data['_last_update_time'] + else: + self._last_update_time = None + non_package_keys = ['_ref', '_last_update_time'] + self._package_names = set([name for name in data.keys() if name not in non_package_keys]) self._data = data def get_data(self): @@ -58,12 +64,23 @@ def from_ref(cls, ref): """ return cls({'_ref': ref}) - def add(self, package_name, package_path, package_xml_string): + def add(self, package_name, package_path, payload_string, payload_type='package.xml', increment_update_time=True): # TODO(tfoote) Breaks rosdistro formatting changing from list to dict """ Add a package to the cache. """ - self._data[package_name] = (package_path, package_xml_string) + if package_name not in self._data: + self._data[package_name] = {} + + # Migration option for old caches + if type(self._data[package_name]) != dict: + print(f"Clearing content from package {package_name} @@@@@@@@@@@@@@@@!!!!!!!!!!!!!!!!") + self._data[package_name] = {} + self._data[package_name]['package_path'] = package_path + self._data[package_name][payload_type] = payload_string self._package_names.add(package_name) + if increment_update_time: + self._data[package_name]['_last_update_time'] = datetime.now() + def __iter__(self): """ @@ -71,23 +88,22 @@ def __iter__(self): """ return iter(self._package_names) - def __getitem__(self, package_name): + def __getitem__(self, package_name): #TODO(tfoote) API change """ - Access the cached information about a specific package. Returns a (str, str) of - path to package relative to repo root, and string of package xml. + Access the cached information about a specific package. Returns a dict of + path to package relative paths to repo root, and string of the file contents (potentially truncated). 
""" if package_name not in self._package_names: raise KeyError("Package '%s' not present in SourceRepositoryCache." % package_name) return self._data[package_name] - def items(self): + def items(self): #TODO(tfoote) API change """ - Generator of (str, str, str) containing the package name, path relative - to repo root, and package xml string. + Generator of (str, dict) containing the package name, and a dict of + paths to file contents (potentially truncated). """ for package_name in self._package_names: - package_path, package_xml_string = self._data[package_name] - yield package_name, package_path, package_xml_string + yield package_name, self._data[package_name] def __len__(self): """ diff --git a/stdeb.cfg b/stdeb.cfg index 1b430c35..6948e33b 100644 --- a/stdeb.cfg +++ b/stdeb.cfg @@ -12,7 +12,7 @@ X-Python3-Version: >= 3.6 Setup-Env-Vars: SKIP_PYTHON_MODULES=1 [rosdistro_modules] -Depends3: ca-certificates, python3-catkin-pkg-modules, python3-rospkg-modules, python3-setuptools, python3-yaml +Depends3: ca-certificates, python3-catkin-pkg-modules, python3-rospkg-modules, python3-setuptools, python3-yaml, python3-jsonschema Conflicts3: python3-rosdistro (<< 0.6.0) Replaces3: python3-rosdistro (<< 0.6.0) Copyright-File: LICENSE.txt diff --git a/test/test_manifest_providers.py b/test/test_manifest_providers.py index d0d96482..1206ff1e 100644 --- a/test/test_manifest_providers.py +++ b/test/test_manifest_providers.py @@ -26,6 +26,7 @@ def test_cached(): class FakeDistributionCache(object): def __init__(self): self.release_package_xmls = {} + self.release_resources = {} dc = FakeDistributionCache() cache = CachedManifestProvider(dc, [rosdistro.manifest_provider.github.github_manifest_provider]) assert '' in cache('melodic', _genmsg_release_repo(), 'genmsg') @@ -55,9 +56,8 @@ def test_git_source(): # This hash corresponds to the 0.5.11 tag. 
assert repo_cache.ref() == 'a189fc78558e7276df59d2961cfe4f8b4de08a8b' - package_path, package_xml = repo_cache['genmsg'] - assert '' == package_path - assert '0.5.11' in package_xml + assert '' == repo_cache['genmsg']['package_path'] + assert '0.5.11' in repo_cache['genmsg']['package.xml'] # mock_get_url_contents is used to mock out the '_get_url_contents' method in @@ -97,9 +97,8 @@ def test_github_source(): # This hash corresponds to the 0.5.7 tag. assert repo_cache.ref() == '81b66fe5eb00043c43894ddeee07e738d9b9712f' - package_path, package_xml = repo_cache['genmsg'] - assert '' == package_path - assert '0.5.11' in package_xml + assert '' == repo_cache['genmsg']['package_path'] + assert '0.5.11' in repo_cache['genmsg']['package.xml'] def test_gitlab_source(): @@ -108,16 +107,14 @@ def test_gitlab_source(): # This hash corresponds to the 1.0.3 tag. assert repo_cache.ref() == 'cd30853005ef3a591cb8594b4aa49f9ef400d30f' - package_path, package_xml = repo_cache['ros2trace_analysis'] - assert 'ros2trace_analysis' == package_path - assert '1.0.3' in package_xml + assert 'ros2trace_analysis' == repo_cache['ros2trace_analysis']['package_path'] + assert '1.0.3' in repo_cache['ros2trace_analysis']['package.xml'] def test_git_source_multi(): repo_cache = git_source_manifest_provider(_ros_source_repo()) assert repo_cache.ref() - package_path, package_xml = repo_cache['roslib'] - assert package_path == os.path.join('core', 'roslib') + assert repo_cache['roslib']['package_path'] == os.path.join('core', 'roslib') def test_tar_source(): @@ -125,9 +122,10 @@ def test_tar_source(): assert repo_cache.ref() is None - package_path, package_xml = repo_cache['genmsg'] - assert 'genmsg-0.5.16' == package_path - assert '0.5.16' in package_xml + print(repo_cache['genmsg']) + + assert 'genmsg-0.5.16' == repo_cache['genmsg']['package_path'] + assert '0.5.16' in repo_cache['genmsg']['package.xml'] def test_sanitize():