From 62a7da90194b3e46d93ff7f3054caa15c476f8bb Mon Sep 17 00:00:00 2001 From: Eric Zhu Date: Mon, 14 Nov 2016 21:20:03 -0500 Subject: [PATCH 1/4] adding command line tool for dumping all metadata --- ckanapi/cli/dump_metadata.py | 50 ++++++++++++++++++++ ckanapi/cli/main.py | 8 ++++ ckanapi/tests/test_cli_dump_metadata.py | 63 +++++++++++++++++++++++++ 3 files changed, 121 insertions(+) create mode 100644 ckanapi/cli/dump_metadata.py create mode 100644 ckanapi/tests/test_cli_dump_metadata.py diff --git a/ckanapi/cli/dump_metadata.py b/ckanapi/cli/dump_metadata.py new file mode 100644 index 0000000..b22e298 --- /dev/null +++ b/ckanapi/cli/dump_metadata.py @@ -0,0 +1,50 @@ +""" +implementation of dump_metadata cli command +""" + +import sys +import gzip +import json +from datetime import datetime +import os + +from ckanapi.cli.utils import compact_json, \ + quiet_int_pipe, pretty_json + +PAGINATION = 50 + +def dump_metadata(ckan, arguments, stdout=None, stderr=None): + ''' + Dump all the JSON metadata records. + + The package_search API is used with pagination. 
+ ''' + if stdout is None: + stdout = getattr(sys.stdout, 'buffer', sys.stdout) + if stderr is None: + stderr = getattr(sys.stderr, 'buffer', sys.stderr) + + jsonl_output = stdout + if arguments['--output']: + jsonl_output = open(arguments['--output'], 'wb') + if arguments['--gzip']: + jsonl_output = gzip.GzipFile(fileobj=jsonl_output) + + with quiet_int_pipe() as errors: + count = 0 + total_count = 0 + total_known = False + while not total_known or total_count > count: + response = ckan.call_action("package_search", rows=PAGINATION, + start=count) + total_count = response["count"] + total_known = True + for record in response["results"]: + jsonl_output.write(compact_json(record, + sort_keys=True) + b'\n') + count += 1 + if 'pipe' in errors: + return 1 + if 'interrupt' in errors: + return 2 + diff --git a/ckanapi/cli/main.py b/ckanapi/cli/main.py index 0904266..ad026d9 100644 --- a/ckanapi/cli/main.py +++ b/ckanapi/cli/main.py @@ -19,6 +19,10 @@ (ID_OR_NAME ... | --all) ([-O JSONL_OUTPUT] | [-D DIRECTORY]) [-p PROCESSES] [-qwz] [[-c CONFIG] [-u USER] | -r SITE_URL [-a APIKEY] [-g]] + ckanapi dump_metadata + [-O JSONL_OUTPUT] + [-z] + [[-c CONFIG] [-u USER] | -r SITE_URL [-a APIKEY] [-g]] ckanapi delete (datasets | groups | organizations | users | related) (ID_OR_NAME ... 
| [-I JSONL_INPUT] [-s START] [-m MAX]) [-p PROCESSES] [-l LOG_FILE] [-qwz] @@ -77,6 +81,7 @@ from ckanapi.cli.dump import dump_things from ckanapi.cli.delete import delete_things from ckanapi.cli.action import action +from ckanapi.cli.dump_metadata import dump_metadata def parse_arguments(): @@ -130,6 +135,9 @@ def main(running_with_paster=False): if arguments['delete']: return delete_things(ckan, thing[0], arguments) + if arguments['dump_metadata']: + return dump_metadata(ckan, arguments) + assert 0, arguments # we shouldn't be here diff --git a/ckanapi/tests/test_cli_dump_metadata.py b/ckanapi/tests/test_cli_dump_metadata.py new file mode 100644 index 0000000..ae7840a --- /dev/null +++ b/ckanapi/tests/test_cli_dump_metadata.py @@ -0,0 +1,63 @@ +from ckanapi.cli.dump_metadata import dump_metadata +from ckanapi.errors import NotFound +import json +import tempfile +import shutil +from os.path import exists + +try: + import unittest2 as unittest +except ImportError: + import unittest +from io import BytesIO + + +class MockCKAN(object): + def call_action(self, name, **kwargs): + try: + return { + 'package_search': { + 'count' : 3, + 'results' : [ + {'id': '12', + 'name': 'twelve', + 'title': "Twelve"}, + {'id': '34', + 'name': 'thirtyfour', + 'title': "Thirty-four"}, + {'id': 'dp', + 'name': 'dp', + 'title': 'Test for datapackage', + 'resources':[ {'name': 'resource1', + 'format': 'html', + 'url':'http://example.com/test-file'}]}, + ], + }, + }[name] + except KeyError: + raise NotFound() + + +class TestCLIDumpMetadata(unittest.TestCase): + def setUp(self): + self.ckan = MockCKAN() + self.stdout = BytesIO() + self.stderr = BytesIO() + + def test_dump_metadata(self): + dump_metadata(self.ckan, { + '--ckan-user': None, + '--config': None, + '--remote': None, + '--apikey': None, + '--output': None, + '--gzip': False, + }, + stdout=self.stdout, + stderr=self.stderr) + output = self.stdout.getvalue().decode("UTF-8") + self.assertEqual(output.count("\n"), 3) + 
self.assertTrue(r'{"id":"12","name":"twelve","title":"Twelve"}' in output) + self.assertTrue(r'{"id":"34","name":"thirtyfour","title":"Thirty-four"}' in output) + self.assertTrue(r'id":"dp"' in output) + From 12f73796619a57d183585e3a49018034e29321e7 Mon Sep 17 00:00:00 2001 From: Eric Zhu Date: Tue, 15 Nov 2016 13:56:34 -0500 Subject: [PATCH 2/4] add sort by metadata id, test for pagination --- ckanapi/cli/dump_metadata.py | 11 ++++-- ckanapi/tests/test_cli_dump_metadata.py | 52 +++++++++++++------------ 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/ckanapi/cli/dump_metadata.py b/ckanapi/cli/dump_metadata.py index b22e298..be0222e 100644 --- a/ckanapi/cli/dump_metadata.py +++ b/ckanapi/cli/dump_metadata.py @@ -11,14 +11,17 @@ from ckanapi.cli.utils import compact_json, \ quiet_int_pipe, pretty_json -PAGINATION = 50 +DEFAULT_PAGINATION = 50 -def dump_metadata(ckan, arguments, stdout=None, stderr=None): +def dump_metadata(ckan, arguments, pagination=DEFAULT_PAGINATION, + stdout=None, stderr=None): ''' Dump all the JSON metadata records. The package_search API is used with pagination. 
''' + if pagination < 1: + raise ValueError("Pagination size must be greater or equal to 1") if stdout is None: stdout = getattr(sys.stdout, 'buffer', sys.stdout) if stderr is None: @@ -35,8 +38,8 @@ def dump_metadata(ckan, arguments, stdout=None, stderr=None): total_count = 0 total_known = False while not total_known or total_count > count: - response = ckan.call_action("package_search", rows=PAGINATION, - start=count) + response = ckan.call_action("package_search", rows=pagination, + start=count, sort="id asc") total_count = response["count"] total_known = True for record in response["results"]: diff --git a/ckanapi/tests/test_cli_dump_metadata.py b/ckanapi/tests/test_cli_dump_metadata.py index ae7840a..bc2dc52 100644 --- a/ckanapi/tests/test_cli_dump_metadata.py +++ b/ckanapi/tests/test_cli_dump_metadata.py @@ -13,29 +13,33 @@ class MockCKAN(object): + + def __init__(self): + + self.data = [{'id': '12', + 'name': 'twelve', + 'title': "Twelve"}, + {'id': '34', + 'name': 'thirtyfour', + 'title': "Thirty-four"}, + {'id': '56', + 'name': 'fiftysix', + 'title': "Fifty-Six"}, + {'id': '67', + 'name': 'sixtyseven', + 'title': "Sixty-Seven"}, + {'id': 'dp', + 'name': 'dp', + 'title': 'Test for datapackage', + 'resources':[ {'name': 'resource1', + 'format': 'html', + 'url':'http://example.com/test-file'}]}] + def call_action(self, name, **kwargs): - try: - return { - 'package_search': { - 'count' : 3, - 'results' : [ - {'id': '12', - 'name': 'twelve', - 'title': "Twelve"}, - {'id': '34', - 'name': 'thirtyfour', - 'title': "Thirty-four"}, - {'id': 'dp', - 'name': 'dp', - 'title': 'Test for datapackage', - 'resources':[ {'name': 'resource1', - 'format': 'html', - 'url':'http://example.com/test-file'}]}, - ], - }, - }[name] - except KeyError: - raise NotFound() + start = kwargs["start"] + rows = kwargs["rows"] + return {"count" : 5, + "results" : self.data[start:start+rows]} class TestCLIDumpMetadata(unittest.TestCase): @@ -52,11 +56,11 @@ def test_dump_metadata(self): 
'--apikey': None, '--output': None, '--gzip': False, - }, + }, 2, stdout=self.stdout, stderr=self.stderr) output = self.stdout.getvalue().decode("UTF-8") - self.assertEqual(output.count("\n"), 3) + self.assertEqual(output.count("\n"), 5) self.assertTrue(r'{"id":"12","name":"twelve","title":"Twelve"}' in output) self.assertTrue(r'{"id":"34","name":"thirtyfour","title":"Thirty-four"}' in output) self.assertTrue(r'id":"dp"' in output) From 6b80577915709082fb56991e6a72b9f2f84dda61 Mon Sep 17 00:00:00 2001 From: Eric Zhu Date: Tue, 15 Nov 2016 14:45:48 -0500 Subject: [PATCH 3/4] fix syntax for using call_action --- ckanapi/cli/dump_metadata.py | 4 ++-- ckanapi/tests/test_cli_dump_metadata.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanapi/cli/dump_metadata.py b/ckanapi/cli/dump_metadata.py index be0222e..43bae7d 100644 --- a/ckanapi/cli/dump_metadata.py +++ b/ckanapi/cli/dump_metadata.py @@ -38,8 +38,8 @@ def dump_metadata(ckan, arguments, pagination=DEFAULT_PAGINATION, total_count = 0 total_known = False while not total_known or total_count > count: - response = ckan.call_action("package_search", rows=pagination, - start=count, sort="id asc") + response = ckan.call_action("package_search", dict(rows=pagination, + start=count, sort="id asc")) total_count = response["count"] total_known = True for record in response["results"]: diff --git a/ckanapi/tests/test_cli_dump_metadata.py b/ckanapi/tests/test_cli_dump_metadata.py index bc2dc52..44a16c8 100644 --- a/ckanapi/tests/test_cli_dump_metadata.py +++ b/ckanapi/tests/test_cli_dump_metadata.py @@ -35,7 +35,7 @@ def __init__(self): 'format': 'html', 'url':'http://example.com/test-file'}]}] - def call_action(self, name, **kwargs): + def call_action(self, name, kwargs): start = kwargs["start"] rows = kwargs["rows"] return {"count" : 5, From 02d764a2ec98dc06dcd827f599ecbe998292f33f Mon Sep 17 00:00:00 2001 From: Eric Zhu Date: Wed, 14 Dec 2016 19:10:07 -0500 Subject: [PATCH 4/4] change name for dump 
metadata command --- ckanapi/cli/dump_metadata.py | 2 ++ ckanapi/cli/main.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ckanapi/cli/dump_metadata.py b/ckanapi/cli/dump_metadata.py index 43bae7d..2269678 100644 --- a/ckanapi/cli/dump_metadata.py +++ b/ckanapi/cli/dump_metadata.py @@ -17,6 +17,8 @@ def dump_metadata(ckan, arguments, pagination=DEFAULT_PAGINATION, stdout=None, stderr=None): ''' Dump all the JSON metadata records. + This is often better than using dump_things with thing=datasets, + as sites like catalog.data.gov do not support the package_list API. The package_search API is used with pagination. ''' diff --git a/ckanapi/cli/main.py b/ckanapi/cli/main.py index ad026d9..6cbb5a1 100644 --- a/ckanapi/cli/main.py +++ b/ckanapi/cli/main.py @@ -19,7 +19,7 @@ (ID_OR_NAME ... | --all) ([-O JSONL_OUTPUT] | [-D DIRECTORY]) [-p PROCESSES] [-qwz] [[-c CONFIG] [-u USER] | -r SITE_URL [-a APIKEY] [-g]] - ckanapi dump_metadata + ckanapi dump_datasets2 [-O JSONL_OUTPUT] [-z] [[-c CONFIG] [-u USER] | -r SITE_URL [-a APIKEY] [-g]] @@ -135,7 +135,7 @@ def main(running_with_paster=False): if arguments['delete']: return delete_things(ckan, thing[0], arguments) - if arguments['dump_metadata']: + if arguments['dump_datasets2']: return dump_metadata(ckan, arguments) assert 0, arguments # we shouldn't be here