diff --git a/sdks/python/apache_beam/io/gcp/gcsfilesystem.py b/sdks/python/apache_beam/io/gcp/gcsfilesystem.py index 3763e21abc9f..bec7cb858c18 100644 --- a/sdks/python/apache_beam/io/gcp/gcsfilesystem.py +++ b/sdks/python/apache_beam/io/gcp/gcsfilesystem.py @@ -34,7 +34,6 @@ from apache_beam.io.filesystem import CompressionTypes from apache_beam.io.filesystem import FileMetadata from apache_beam.io.filesystem import FileSystem -from apache_beam.io.gcp import gcsio __all__ = ['GCSFileSystem'] @@ -42,13 +41,27 @@ class GCSFileSystem(FileSystem): """A GCS ``FileSystem`` implementation for accessing files on GCS. """ + def _get_gcsio(self): + try: + from apache_beam.io.gcp import gcsio + return gcsio + except ImportError: + raise ImportError( + 'GCSFileSystem requires apache-beam[gcp]. ' + 'Install it with: pip install apache-beam[gcp]') - CHUNK_SIZE = gcsio.MAX_BATCH_OPERATION_SIZE # Chuck size in batch operations GCS_PREFIX = 'gs://' def __init__(self, pipeline_options): super().__init__(pipeline_options) self._pipeline_options = pipeline_options + self._chunk_size = None + + @property + def CHUNK_SIZE(self): + if self._chunk_size is None: + self._chunk_size = self._get_gcsio().MAX_BATCH_OPERATION_SIZE + return self._chunk_size @classmethod def scheme(cls): @@ -139,6 +152,7 @@ def _list(self, dir_or_prefix): raise BeamIOError("List operation failed", {dir_or_prefix: e}) def _gcsIO(self): + gcsio = self._get_gcsio() return gcsio.GcsIO(pipeline_options=self._pipeline_options) def _path_open( @@ -370,8 +384,9 @@ def delete(self, paths): def report_lineage(self, path, lineage): try: + gcsio = self._get_gcsio() components = gcsio.parse_gcs_path(path, object_optional=True) - except ValueError: + except (ImportError, ValueError): # report lineage is fail-safe traceback.print_exc() return diff --git a/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py b/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py index 08fdd6302887..fa3d6b3f6212 100644 --- a/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py +++ b/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py @@ -27,6 +27,7 @@ from apache_beam.io.filesystem import BeamIOError from apache_beam.io.filesystem import FileMetadata +from apache_beam.io.filesystems import FileSystems from apache_beam.options.pipeline_options import PipelineOptions # Protect against environments where apitools library is not available. @@ -35,16 +36,31 @@ from apache_beam.io.gcp import gcsfilesystem except ImportError: gcsfilesystem = None # type: ignore + +try: + from apache_beam.io.gcp import gcsio +except ImportError: + gcsio = None # type: ignore # pylint: enable=wrong-import-order, wrong-import-position -@unittest.skipIf(gcsfilesystem is None, 'GCP dependencies are not installed') +class GCSFileSystemLazyLoadTest(unittest.TestCase): + def test_get_filesystem_does_not_require_gcp_extra(self): + fs = FileSystems.get_filesystem('gs://test-bucket/path') + self.assertEqual(fs.scheme(), 'gs') + + +@unittest.skipIf( + gcsfilesystem is None or gcsio is None, + 'GCP dependencies are not installed') class GCSFileSystemTest(unittest.TestCase): def setUp(self): + assert gcsfilesystem is not None pipeline_options = PipelineOptions() self.fs = gcsfilesystem.GCSFileSystem(pipeline_options=pipeline_options) def test_scheme(self): + assert gcsfilesystem is not None self.assertEqual(self.fs.scheme(), 'gs') self.assertEqual(gcsfilesystem.GCSFileSystem.scheme(), 'gs') @@ -77,11 +93,13 @@ def test_split(self): with self.assertRaises(ValueError): self.fs.split('/no/gcs/prefix') - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_match_multiples(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_match_multiples(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module gcsio_mock.list_files.return_value = iter([ ('gs://bucket/file1', (1, 99999.0)), ('gs://bucket/file2', (2, 88888.0)) @@ -95,12 +113,14 @@ def test_match_multiples(self, mock_gcsio): gcsio_mock.list_files.assert_called_once_with( 'gs://bucket/', with_metadata=True) - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_match_multiples_limit(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_match_multiples_limit(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() limit = 1 - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module gcsio_mock.list_files.return_value = iter([ ('gs://bucket/file1', (1, 99999.0)) ]) @@ -111,11 +131,13 @@ def test_match_multiples_limit(self, mock_gcsio): gcsio_mock.list_files.assert_called_once_with( 'gs://bucket/', with_metadata=True) - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_match_multiples_error(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_match_multiples_error(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module exception = IOError('Failed') gcsio_mock.list_files.side_effect = exception @@ -127,11 +149,13 @@ def test_match_multiples_error(self, mock_gcsio): gcsio_mock.list_files.assert_called_once_with( 'gs://bucket/', with_metadata=True) - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_match_multiple_patterns(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_match_multiple_patterns(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module gcsio_mock.list_files.side_effect = [ iter([('gs://bucket/file1', (1, 99999.0))]), iter([('gs://bucket/file2', (2, 88888.0))]), @@ -141,33 +165,39 @@ def test_match_multiple_patterns(self, mock_gcsio): result = self.fs.match(['gs://bucket/file1*', 'gs://bucket/file2*']) self.assertEqual([mr.metadata_list for mr in result], expected_results) - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_create(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_create(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module # Issue file copy _ = self.fs.create('gs://bucket/from1', 'application/octet-stream') gcsio_mock.open.assert_called_once_with( 'gs://bucket/from1', 'wb', mime_type='application/octet-stream') - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_open(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_open(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module # Issue file copy _ = self.fs.open('gs://bucket/from1', 'application/octet-stream') gcsio_mock.open.assert_called_once_with( 'gs://bucket/from1', 'rb', mime_type='application/octet-stream') - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_copy_file(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_copy_file(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module sources = ['gs://bucket/from1'] destinations = ['gs://bucket/to1'] @@ -177,11 +207,13 @@ def test_copy_file(self, mock_gcsio): gcsio_mock.copy.assert_called_once_with( 'gs://bucket/from1', 'gs://bucket/to1') - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_copy_file_error(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_copy_file_error(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module sources = ['gs://bucket/from1'] destinations = ['gs://bucket/to1'] @@ -201,11 +233,13 @@ def test_copy_file_error(self, mock_gcsio): gcsio_mock.copy.assert_called_once_with( 'gs://bucket/from1', 'gs://bucket/to1') - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_copy_tree(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_copy_tree(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module sources = ['gs://bucket1/'] destinations = ['gs://bucket2/'] @@ -215,11 +249,13 @@ def test_copy_tree(self, mock_gcsio): gcsio_mock.copytree.assert_called_once_with( 'gs://bucket1/', 'gs://bucket2/') - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_rename(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_rename(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module sources = [ 'gs://bucket/from1', 'gs://bucket/from2', @@ -255,11 +291,13 @@ def test_rename(self, mock_gcsio): 'gs://bucket/from3', ]) - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_rename_delete_error(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_rename_delete_error(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module sources = [ 'gs://bucket/from1', 'gs://bucket/from2', @@ -296,11 +334,13 @@ def test_rename_delete_error(self, mock_gcsio): 'gs://bucket/from3', ]) - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_rename_copy_error(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_rename_copy_error(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module sources = [ 'gs://bucket/from1', 'gs://bucket/from2', @@ -335,11 +375,13 @@ def test_rename_copy_error(self, mock_gcsio): 'gs://bucket/from3', ]) - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_delete(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_delete(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module gcsio_mock._status.return_value = {'size': 0, 'updated': 99999.0} files = [ 'gs://bucket/from1', @@ -351,11 +393,13 @@ def test_delete(self, mock_gcsio): self.fs.delete(files) gcsio_mock.delete_batch.assert_called() - @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio') - def test_delete_error(self, mock_gcsio): + @mock.patch('apache_beam.io.gcp.gcsfilesystem.GCSFileSystem._get_gcsio') + def test_delete_error(self, mock_get_gcsio): # Prepare mocks. gcsio_mock = mock.MagicMock() - gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock + mock_gcsio_module = mock.MagicMock() + mock_gcsio_module.GcsIO.return_value = gcsio_mock + mock_get_gcsio.return_value = mock_gcsio_module gcsio_mock._status.return_value = {'size': 0, 'updated': 99999.0} files = [