From 970846b96bc9e7f6583f1c84f99f8f0330a6bb39 Mon Sep 17 00:00:00 2001 From: Jack McCluskey Date: Thu, 8 May 2025 11:53:12 -0400 Subject: [PATCH 1/7] Support newer versions of PyArrow --- .../beam_PostCommit_Python_Dependency.json | 2 +- sdks/python/setup.py | 2 +- sdks/python/test-suites/tox/py39/build.gradle | 16 ++++++++++++++++ sdks/python/tox.ini | 6 +++++- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/.github/trigger_files/beam_PostCommit_Python_Dependency.json b/.github/trigger_files/beam_PostCommit_Python_Dependency.json index a7fc54b3e4bb..907b485d4d30 100644 --- a/.github/trigger_files/beam_PostCommit_Python_Dependency.json +++ b/.github/trigger_files/beam_PostCommit_Python_Dependency.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 1 + "modification": 0 } \ No newline at end of file diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 2b21d0463c98..c381c0e31f6f 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -145,7 +145,7 @@ def cythonize(*args, **kwargs): pyarrow_dependency = [''] else: pyarrow_dependency = [ - 'pyarrow>=3.0.0,<17.0.0', + 'pyarrow>=3.0.0,<21.0.0', # NOTE(https://github.com/apache/beam/issues/29392): We can remove this # once Beam increases the pyarrow lower bound to a version that fixes CVE. 'pyarrow-hotfix<1' diff --git a/sdks/python/test-suites/tox/py39/build.gradle b/sdks/python/test-suites/tox/py39/build.gradle index 52283fdd7123..6490694c021d 100644 --- a/sdks/python/test-suites/tox/py39/build.gradle +++ b/sdks/python/test-suites/tox/py39/build.gradle @@ -85,6 +85,22 @@ toxTask "testPy39pyarrow-16", "py39-pyarrow-16", "${posargs}" test.dependsOn "testPy39pyarrow-16" postCommitPyDep.dependsOn "testPy39pyarrow-16" +toxTask "testPy39pyarrow-17", "py39-pyarrow-17", "${posargs}" +test.dependsOn "testPy39pyarrow-17" +postCommitPyDep.dependsOn "testPy39pyarrow-17" + +toxTask "testPy39pyarrow-18", "py39-pyarrow-18", "${posargs}" +test.dependsOn "testPy39pyarrow-18" +postCommitPyDep.dependsOn "testPy39pyarrow-18" + +toxTask "testPy39pyarrow-19", "py39-pyarrow-19", "${posargs}" +test.dependsOn "testPy39pyarrow-19" +postCommitPyDep.dependsOn "testPy39pyarrow-19" + +toxTask "testPy39pyarrow-20", "py39-pyarrow-20", "${posargs}" +test.dependsOn "testPy39pyarrow-20" +postCommitPyDep.dependsOn "testPy39pyarrow-20" + // Create a test task for each supported minor version of pandas toxTask "testPy39pandas-14", "py39-pandas-14", "${posargs}" test.dependsOn "testPy39pandas-14" diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index b87b5ecc6f67..72d4871b1767 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -320,7 +320,7 @@ extras = test commands = bash {toxinidir}/scripts/pytest_validates_runner.sh {envname} {toxinidir}/apache_beam/runners/portability/prism_runner_test.py {posargs} -[testenv:py{39,310}-pyarrow-{3,9,10,11,12,13,14,15,16}] +[testenv:py{39,310}-pyarrow-{3,9,10,11,12,13,14,15,16,17,18,19,20}] deps = # As a courtesy to users, test against the oldest allowed version of Pyarrow. # We'd have to increase the pyarrow lower bound when Python 3.9 is deprecated. @@ -338,6 +338,10 @@ deps = 14: pyarrow>=14,<15 15: pyarrow>=15,<16 16: pyarrow>=16,<17 + 17: pyarrow>=17,<18 + 18: pyarrow>=18,<19 + 19: pyarrow>=19,<20 + 20: pyarrow>=20,<21 numpy==1.26.4 commands = # Log pyarrow and numpy version for debugging From 280f98d833e4326aeb608362bd1e05b12c5ab877 Mon Sep 17 00:00:00 2001 From: Jack McCluskey Date: Thu, 8 May 2025 11:58:35 -0400 Subject: [PATCH 2/7] add context for lower bound that would let us remove the pyarrow-hotfix --- sdks/python/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c381c0e31f6f..ca75aa6c66fc 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -148,6 +148,7 @@ def cythonize(*args, **kwargs): 'pyarrow>=3.0.0,<21.0.0', # NOTE(https://github.com/apache/beam/issues/29392): We can remove this # once Beam increases the pyarrow lower bound to a version that fixes CVE. + # (lower bound >= 14.0.1) 'pyarrow-hotfix<1' ] From bca5d6e89b87dc3e470c3de7b5c9bf250c7143bf Mon Sep 17 00:00:00 2001 From: Jack McCluskey Date: Wed, 14 May 2025 13:44:16 -0400 Subject: [PATCH 3/7] smaller jump for pyarrow --- sdks/python/setup.py | 2 +- sdks/python/test-suites/tox/py39/build.gradle | 8 -------- sdks/python/tox.ini | 4 +--- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index ca75aa6c66fc..338b2b3296ad 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -145,7 +145,7 @@ def cythonize(*args, **kwargs): pyarrow_dependency = [''] else: pyarrow_dependency = [ - 'pyarrow>=3.0.0,<21.0.0', + 'pyarrow>=3.0.0,<19.0.0', # NOTE(https://github.com/apache/beam/issues/29392): We can remove this # once Beam increases the pyarrow lower bound to a version that fixes CVE. # (lower bound >= 14.0.1) diff --git a/sdks/python/test-suites/tox/py39/build.gradle b/sdks/python/test-suites/tox/py39/build.gradle index 6490694c021d..d6a5e08fcf32 100644 --- a/sdks/python/test-suites/tox/py39/build.gradle +++ b/sdks/python/test-suites/tox/py39/build.gradle @@ -93,14 +93,6 @@ toxTask "testPy39pyarrow-18", "py39-pyarrow-18", "${posargs}" test.dependsOn "testPy39pyarrow-18" postCommitPyDep.dependsOn "testPy39pyarrow-18" -toxTask "testPy39pyarrow-19", "py39-pyarrow-19", "${posargs}" -test.dependsOn "testPy39pyarrow-19" -postCommitPyDep.dependsOn "testPy39pyarrow-19" - -toxTask "testPy39pyarrow-20", "py39-pyarrow-20", "${posargs}" -test.dependsOn "testPy39pyarrow-20" -postCommitPyDep.dependsOn "testPy39pyarrow-20" - // Create a test task for each supported minor version of pandas toxTask "testPy39pandas-14", "py39-pandas-14", "${posargs}" test.dependsOn "testPy39pandas-14" diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 72d4871b1767..191e9e821973 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -320,7 +320,7 @@ extras = test commands = bash {toxinidir}/scripts/pytest_validates_runner.sh {envname} {toxinidir}/apache_beam/runners/portability/prism_runner_test.py {posargs} -[testenv:py{39,310}-pyarrow-{3,9,10,11,12,13,14,15,16,17,18,19,20}] +[testenv:py{39,310}-pyarrow-{3,9,10,11,12,13,14,15,16,17,18}] deps = # As a courtesy to users, test against the oldest allowed version of Pyarrow. # We'd have to increase the pyarrow lower bound when Python 3.9 is deprecated. @@ -340,8 +340,6 @@ deps = 16: pyarrow>=16,<17 17: pyarrow>=17,<18 18: pyarrow>=18,<19 - 19: pyarrow>=19,<20 - 20: pyarrow>=20,<21 numpy==1.26.4 commands = # Log pyarrow and numpy version for debugging From 657d106c7f966dcf867b4653bba40ffc6a1faf43 Mon Sep 17 00:00:00 2001 From: Jack McCluskey Date: Wed, 14 May 2025 13:49:00 -0400 Subject: [PATCH 4/7] run postcommit --- .github/trigger_files/beam_PostCommit_Python.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/trigger_files/beam_PostCommit_Python.json b/.github/trigger_files/beam_PostCommit_Python.json index 0b759d797ee8..13a309763b58 100644 --- a/.github/trigger_files/beam_PostCommit_Python.json +++ b/.github/trigger_files/beam_PostCommit_Python.json @@ -1,6 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run.", - "https://github.com/apache/beam/pull/32440": "test new datastream runner for batch" - "modification": 10 + "modification": 11 } From a0469290b6cd1becf4a74a1a19c25f7b3e428c81 Mon Sep 17 00:00:00 2001 From: Jack McCluskey Date: Wed, 14 May 2025 16:00:20 -0400 Subject: [PATCH 5/7] tweak dynamic work rebalancing test --- sdks/python/apache_beam/io/parquetio_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/io/parquetio_test.py b/sdks/python/apache_beam/io/parquetio_test.py index e33ee4ec1129..3d0e180baecf 100644 --- a/sdks/python/apache_beam/io/parquetio_test.py +++ b/sdks/python/apache_beam/io/parquetio_test.py @@ -487,7 +487,7 @@ def test_read_with_splitting_multiple_row_group(self): self._run_parquet_test(file_name, None, 10000, True, expected_result) def test_dynamic_work_rebalancing(self): - file_name = self._write_data(count=120, row_group_size=20) + file_name = self._write_data(count=280, row_group_size=20) source = _create_parquet_source(file_name) splits = [split for split in source.split(desired_bundle_size=float('inf'))] From cab37cb1ab0218b0c6665eeb9bda2a73751ec178 Mon Sep 17 00:00:00 2001 From: Jack McCluskey Date: Wed, 14 May 2025 16:03:07 -0400 Subject: [PATCH 6/7] add comment explaining unit test behavior --- sdks/python/apache_beam/io/parquetio_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sdks/python/apache_beam/io/parquetio_test.py b/sdks/python/apache_beam/io/parquetio_test.py index 3d0e180baecf..964c2046d1fb 100644 --- a/sdks/python/apache_beam/io/parquetio_test.py +++ b/sdks/python/apache_beam/io/parquetio_test.py @@ -487,6 +487,10 @@ def test_read_with_splitting_multiple_row_group(self): self._run_parquet_test(file_name, None, 10000, True, expected_result) def test_dynamic_work_rebalancing(self): + # This test depends on count being sufficiently large + the ratio of + # count to row_group_size also being sufficiently large (but the required + # ratio to pass varies for values of row_group_size and, somehow, the + # version of pyarrow being tested against.) file_name = self._write_data(count=280, row_group_size=20) source = _create_parquet_source(file_name) From 4121a668a54f5ca4331bf8edbdc97ddbac9e058d Mon Sep 17 00:00:00 2001 From: Jack McCluskey Date: Wed, 14 May 2025 16:18:09 -0400 Subject: [PATCH 7/7] yapf --- sdks/python/apache_beam/io/parquetio_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/io/parquetio_test.py b/sdks/python/apache_beam/io/parquetio_test.py index 964c2046d1fb..fd19ec9520a9 100644 --- a/sdks/python/apache_beam/io/parquetio_test.py +++ b/sdks/python/apache_beam/io/parquetio_test.py @@ -490,7 +490,7 @@ def test_dynamic_work_rebalancing(self): # This test depends on count being sufficiently large + the ratio of # count to row_group_size also being sufficiently large (but the required # ratio to pass varies for values of row_group_size and, somehow, the - # version of pyarrow being tested against.) + # version of pyarrow being tested against.) file_name = self._write_data(count=280, row_group_size=20) source = _create_parquet_source(file_name)