From bf09ae3b9dea6d3e00d7c0a42efc4e5d1c027388 Mon Sep 17 00:00:00 2001 From: Nicholas Kumia Date: Mon, 2 Oct 2023 12:59:35 -0400 Subject: [PATCH 1/6] new: don't raise error for ParentNotHarvestedException Not sure of the original intention, but we are removing this to make our harvester more reliable --- ckanext/datajson/datajson.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ckanext/datajson/datajson.py b/ckanext/datajson/datajson.py index 2e44715b..bfe0b937 100644 --- a/ckanext/datajson/datajson.py +++ b/ckanext/datajson/datajson.py @@ -465,7 +465,11 @@ def is_part_of_to_package_id(self, ipo, harvest_object): harvest_object.save() except Exception: pass - raise ParentNotHarvestedException('Unable to find parent dataset. Raising error to allow re-run later') + + # This 'raise' was constantly crashing our harvesting process. + # To better accomodate our current infrastructure, the output + # of this function should be validated instead. + # raise ParentNotHarvestedException('Unable to find parent dataset. Raising error to allow re-run later') def import_stage(self, harvest_object): # The import stage actually creates the dataset. @@ -502,7 +506,10 @@ def import_stage(self, harvest_object): # check if parent is already harvested parent_identifier = parent_pkg_id.replace('IPO:', '') parent = self.is_part_of_to_package_id(parent_identifier, harvest_object) - parent_pkg_id = parent['id'] + if parent is not None: + parent_pkg_id = parent['id'] + else: + parent_pkg_id = None if extra.key.startswith('catalog_'): catalog_extras[extra.key] = extra.value From 9f3b15e9317c1da99b14657454481e53d6e5d43a Mon Sep 17 00:00:00 2001 From: Nicholas Kumia Date: Mon, 2 Oct 2023 13:04:34 -0400 Subject: [PATCH 2/6] lint: comment out ParentNotHarvestedException --- ckanext/datajson/datajson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/datajson/datajson.py b/ckanext/datajson/datajson.py index bfe0b937..b4f6dd9a 100644 --- a/ckanext/datajson/datajson.py +++ b/ckanext/datajson/datajson.py @@ -11,7 +11,7 @@ from ckanext.harvest.model import HarvestObject, HarvestObjectError, HarvestObjectExtra from ckanext.harvest.harvesters.base import HarvesterBase -from ckanext.datajson.exceptions import ParentNotHarvestedException +# from ckanext.datajson.exceptions import ParentNotHarvestedException import uuid import hashlib import json From 8f0277de3224b58180a1a187f0021499f2f2fd83 Mon Sep 17 00:00:00 2001 From: Nicholas Kumia Date: Mon, 2 Oct 2023 16:24:39 -0400 Subject: [PATCH 3/6] new: if the parent cannot be found, don't continue with the child --- ckanext/datajson/datajson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/datajson/datajson.py b/ckanext/datajson/datajson.py index b4f6dd9a..26338a01 100644 --- a/ckanext/datajson/datajson.py +++ b/ckanext/datajson/datajson.py @@ -509,7 +509,7 @@ def import_stage(self, harvest_object): if parent is not None: parent_pkg_id = parent['id'] else: - parent_pkg_id = None + return None if extra.key.startswith('catalog_'): catalog_extras[extra.key] = extra.value From c3192b8e6d4fbc63f07156a13e86fe983f1d4520 Mon Sep 17 00:00:00 2001 From: Nicholas Kumia Date: Mon, 2 Oct 2023 14:14:41 -0400 Subject: [PATCH 4/6] fix: tests that rely on ParentNotHarvestedException --- .../tests/test_datajson_ckan_all_harvester.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py index 5861b8f0..12b71bac 100644 --- a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py +++ b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py @@ -11,7 +11,6 @@ from ckan import model from ckan.lib.munge import munge_title_to_name from ckanext.datajson.harvester_datajson import DataJsonHarvester -from ckanext.datajson.exceptions import ParentNotHarvestedException from .factories import HarvestJobObj, HarvestSourceObj from mock import Mock, patch @@ -382,8 +381,7 @@ def __init__(self, message): # first a child and assert to get an error r2 = json.dumps({"harvest_object_id": self.harvest_objects[1].id}) r0 = FakeMethod(r2) - with pytest.raises(ParentNotHarvestedException): - queue.fetch_callback(consumer_fetch, r0, None, r2) + queue.fetch_callback(consumer_fetch, r0, None, r2) assert self.harvest_objects[1].retry_times == 1 assert self.harvest_objects[1].state == "ERROR" @@ -476,8 +474,7 @@ def get_action(action_name): harvest_object.source = harvest_source harvester = DataJsonHarvester() - with pytest.raises(ParentNotHarvestedException): - harvester.is_part_of_to_package_id('custom-identifier', harvest_object) + assert harvester.is_part_of_to_package_id('custom-identifier', harvest_object) is None assert mock_get_action.called @@ -557,8 +554,7 @@ def get_action(action_name): mock_get_action.side_effect = get_action harvester = DataJsonHarvester() - with pytest.raises(ParentNotHarvestedException): - harvester.is_part_of_to_package_id('identifier', None) + assert harvester.is_part_of_to_package_id('identifier', None) is None def test_datajson_is_part_of_package_id(self): url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port @@ -575,11 +571,9 @@ def test_datajson_is_part_of_package_id(self): assert dataset['title'] == 'Employee Relations Roundtables' if content['identifier'] in ['OPM-ERround-0001-AWOL', 'OPM-ERround-0001-Retire']: - with pytest.raises(ParentNotHarvestedException): - self.harvester.is_part_of_to_package_id(content['identifier'], harvest_object) + assert self.harvester.is_part_of_to_package_id(content['identifier'], harvest_object) is None - with pytest.raises(ParentNotHarvestedException): - self.harvester.is_part_of_to_package_id('bad identifier', harvest_object) + assert self.harvester.is_part_of_to_package_id('bad identifier', harvest_object) is None def test_datajson_non_federal(self): """ validate we get the coinfig we sent """ From 8f30b013c29411a582153ee9cda27951efdeba00 Mon Sep 17 00:00:00 2001 From: Nicholas Kumia Date: Mon, 2 Oct 2023 14:15:55 -0400 Subject: [PATCH 5/6] lint: blah --- ckanext/datajson/tests/test_datajson_ckan_all_harvester.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py index 12b71bac..0df6902c 100644 --- a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py +++ b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py @@ -2,8 +2,6 @@ import json import logging -import pytest - import ckan.plugins as p import ckanext.harvest.model as harvest_model import ckanext.harvest.queue as queue From 7077b4bdce45e8e73913e41cb3b7308549565884 Mon Sep 17 00:00:00 2001 From: Nicholas Kumia Date: Mon, 2 Oct 2023 16:30:47 -0400 Subject: [PATCH 6/6] fix: parent was never encountered before, so no retry --- ckanext/datajson/tests/test_datajson_ckan_all_harvester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py index 0df6902c..d1923acb 100644 --- a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py +++ b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py @@ -387,7 +387,7 @@ def __init__(self, message): r2 = json.dumps({"harvest_object_id": self.harvest_objects[0].id}) r0 = FakeMethod(r2) queue.fetch_callback(consumer_fetch, r0, None, r2) - assert self.harvest_objects[0].retry_times == 1 + assert self.harvest_objects[0].retry_times == 0 assert self.harvest_objects[0].state == "COMPLETE" # Check status on harvest objects