"""
implementation of dump_metadata cli command
"""

import sys
import gzip
import json
from datetime import datetime
import os

from ckanapi.cli.utils import compact_json, \
    quiet_int_pipe, pretty_json

DEFAULT_PAGINATION = 50


def dump_metadata(ckan, arguments, pagination=DEFAULT_PAGINATION,
        stdout=None, stderr=None):
    '''
    Dump all the JSON metadata records, one compact JSON document per line.

    This is often a better choice than using dump_things with
    thing=datasets, as sites like catalog.data.gov do not support the
    package_list api.

    The package_search API is used with pagination.

    ckan -- client object; only ``call_action(name, data_dict)`` is used
    arguments -- parsed command line options; ``--output`` (a path, or a
        falsy value to write to stdout) and ``--gzip`` are read
    pagination -- number of records requested per package_search call
    stdout -- binary stream written to when ``--output`` is not given;
        defaults to sys.stdout (its ``buffer`` attribute when present)
    stderr -- accepted but not used by this command

    Returns 1 when quiet_int_pipe reports 'pipe', 2 when it reports
    'interrupt', and None otherwise.

    Raises ValueError when pagination is less than 1.
    '''
    if pagination < 1:
        raise ValueError("Pagination size must be greater or equal to 1")
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)

    # Track what this function opens so that it closes only its own
    # streams and never a caller-supplied stdout.
    output_file = None
    gzip_stream = None
    jsonl_output = stdout
    if arguments['--output']:
        output_file = open(arguments['--output'], 'wb')
        jsonl_output = output_file
    if arguments['--gzip']:
        # The mode is explicit because GzipFile otherwise takes it from
        # fileobj.mode and falls back to reading when the stream has no
        # such attribute (BytesIO, for example).
        gzip_stream = gzip.GzipFile(fileobj=jsonl_output, mode='wb')
        jsonl_output = gzip_stream

    with quiet_int_pipe() as errors:
        try:
            count = 0
            while True:
                response = ckan.call_action("package_search", dict(
                    rows=pagination, start=count, sort="id asc"))
                records = response["results"]
                for record in records:
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                    count += 1
                # An empty page ends the dump even when the reported total
                # has not been reached; otherwise the same offset would be
                # requested forever.
                if not records or count >= response["count"]:
                    break
        finally:
            # Closing the gzip stream writes its trailer.  This happens
            # inside the quiet_int_pipe block so an error raised while
            # finishing the stream takes the same path as one raised by a
            # record write.
            try:
                if gzip_stream is not None:
                    gzip_stream.close()
            finally:
                if output_file is not None:
                    output_file.close()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
class MockCKAN(object):
    """
    Minimal stand-in for a CKAN client.

    It serves a fixed list of dataset dicts through ``call_action``,
    returning pages shaped like a package_search response
    (``count`` plus ``results``).
    """

    def __init__(self):
        # Five records, so a page size of 2 produces two full pages and
        # one partial page.
        self.data = [
            {'id': '12',
             'name': 'twelve',
             'title': "Twelve"},
            {'id': '34',
             'name': 'thirtyfour',
             'title': "Thirty-four"},
            {'id': '56',
             'name': 'fiftysix',
             'title': "Fifty-Six"},
            {'id': '67',
             'name': 'sixtyseven',
             'title': "Sixty-Seven"},
            {'id': 'dp',
             'name': 'dp',
             'title': 'Test for datapackage',
             'resources': [{'name': 'resource1',
                            'format': 'html',
                            'url': 'http://example.com/test-file'}]},
        ]

    def call_action(self, name, kwargs=None):
        """
        Return one page of ``self.data``.

        name -- action name; ignored, every action is answered the same way
        kwargs -- dict that may hold ``start`` (offset, default 0) and
            ``rows`` (page size, default: everything from ``start`` on)

        The reported ``count`` is always the full length of ``self.data``,
        so tests may change the fixture without touching this method.
        """
        kwargs = kwargs or {}
        start = kwargs.get("start", 0)
        rows = kwargs.get("rows")
        stop = None if rows is None else start + rows
        return {"count": len(self.data),
                "results": self.data[start:stop]}
from ckanapi.cli.dump_metadata import dump_metadata
from ckanapi.errors import NotFound
import json
import tempfile
import shutil
from os.path import exists

try:
    import unittest2 as unittest
except ImportError:
    import unittest
from io import BytesIO


class TestCLIDumpMetadata(unittest.TestCase):
    """
    Tests for the dump_metadata cli command, run against MockCKAN with
    output captured in memory.
    """

    def setUp(self):
        self.ckan = MockCKAN()
        self.stderr = BytesIO()
        # Options as the command line parser would supply them: no output
        # file and no compression, so records are written to ``stdout``.
        self.arguments = {
            '--ckan-user': None,
            '--config': None,
            '--remote': None,
            '--apikey': None,
            '--output': None,
            '--gzip': False,
        }

    def _dump(self, pagination):
        """
        Run dump_metadata with the given page size and return everything
        it wrote to stdout, decoded as UTF-8.
        """
        stdout = BytesIO()
        dump_metadata(self.ckan, dict(self.arguments), pagination,
            stdout=stdout, stderr=self.stderr)
        return stdout.getvalue().decode("UTF-8")

    def test_dump_metadata(self):
        output = self._dump(2)
        self.assertEqual(output.count("\n"), 5)
        self.assertIn(
            '{"id":"12","name":"twelve","title":"Twelve"}', output)
        self.assertIn(
            '{"id":"34","name":"thirtyfour","title":"Thirty-four"}', output)
        self.assertIn('id":"dp"', output)

    def test_records_are_complete_and_in_order(self):
        # Every line must parse on its own and the lines must reproduce
        # the fixture in the order the mock returns it.
        lines = self._dump(2).splitlines()
        self.assertEqual(
            [json.loads(line) for line in lines], self.ckan.data)

    def test_output_does_not_depend_on_page_size(self):
        # 1: one record per call; 5: exactly one page; 50: page larger
        # than the data set.
        expected = self._dump(2)
        for size in (1, 5, 50):
            self.assertEqual(self._dump(size), expected)

    def test_pagination_below_one_is_rejected(self):
        for size in (0, -1):
            with self.assertRaises(ValueError):
                self._dump(size)