From 7cc41cca107ea67f520b4b716466a86ff2b42492 Mon Sep 17 00:00:00 2001 From: laughingman7743 Date: Tue, 6 Jan 2026 13:43:48 +0900 Subject: [PATCH] Add NULL and empty string handling documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive documentation explaining NULL and empty string behavior across different cursor types: - Default Cursor and DictCursor properly distinguish NULL from empty string - PandasCursor with CSV treats both as NaN, but works correctly with unload - ArrowCursor with CSV treats both as empty string, but works correctly with unload - PolarsCursor properly distinguishes in both CSV and Parquet modes - S3FSCursor properly distinguishes using custom AthenaCSVReader This documentation helps users understand the CSV format limitation and provides workarounds (unload option, PolarsCursor, S3FSCursor) for applications that need to distinguish NULL from empty string values. Unify test method names to `test_null_vs_empty_string` across all cursor types: - TestCursor.test_null_vs_empty_string - TestDictCursor.test_null_vs_empty_string - TestPandasCursor.test_null_vs_empty_string (renamed from test_empty_and_null_string) - TestArrowCursor.test_null_vs_empty_string - TestPolarsCursor.test_null_vs_empty_string - TestS3FSCursor.test_null_vs_empty_string (consolidated with parametrize) Closes #118 Closes #148 Closes #168 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/index.rst | 1 + docs/null_handling.rst | 475 +++++++++++++++++++++++++++ tests/pyathena/arrow/test_cursor.py | 48 +++ tests/pyathena/pandas/test_cursor.py | 2 +- tests/pyathena/polars/test_cursor.py | 45 +++ tests/pyathena/s3fs/test_cursor.py | 41 ++- tests/pyathena/test_cursor.py | 70 ++++ 7 files changed, 660 insertions(+), 22 deletions(-) create mode 100644 docs/null_handling.rst diff --git a/docs/index.rst b/docs/index.rst index cee8e3ca..21ec91a2 100644 --- 
a/docs/index.rst +++ b/docs/index.rst @@ -17,6 +17,7 @@ Documentation introduction usage cursor + null_handling sqlalchemy pandas arrow diff --git a/docs/null_handling.rst b/docs/null_handling.rst new file mode 100644 index 00000000..6d75137f --- /dev/null +++ b/docs/null_handling.rst @@ -0,0 +1,475 @@ +.. _null_handling: + +NULL and Empty String Handling +============================== + +This section documents the behavior of NULL and empty string values across different cursor types in PyAthena. +Understanding this behavior is important for applications that need to distinguish between NULL (missing) values +and empty strings. + +.. _null-csv-limitation: + +CSV Format Limitation +--------------------- + +When Athena executes a query, the results are stored as CSV files in S3. This CSV format has an inherent limitation +in how NULL values and empty strings are represented: + +- **NULL values** are represented as unquoted empty fields: ``,,`` +- **Empty strings** are represented as quoted empty fields: ``,"",`` + +.. code:: text + + # Example CSV output from Athena + id,name,description + 1,Alice,Hello + 2,Bob,"" + 3,Charlie, + + # Row 2: description is an empty string (quoted: "") + # Row 3: description is NULL (unquoted) + +Most CSV parsers handle these differently: + +- **pandas** (used by ``PandasCursor``): By default, treats both unquoted empty and quoted empty as missing values +- **PyArrow** (used by ``ArrowCursor``): With ``quoted_strings_can_be_null=False``, treats both as empty strings +- **Polars** (used by ``PolarsCursor``): Correctly distinguishes quoted empty strings from unquoted NULL values +- **AthenaCSVReader** (used by ``S3FSCursor``): Correctly distinguishes NULL (returns ``None``) from empty strings + +This means the ability to distinguish NULL from empty strings varies by cursor when reading CSV files. 
+ +Cursor Behavior Comparison +-------------------------- + +The following table summarizes how different cursors handle NULL and empty string values, +based on actual testing with Athena: + +.. list-table:: NULL vs Empty String Behavior + :header-rows: 1 + :widths: 25 20 20 20 15 + + * - Cursor Type + - Data Source + - Empty String ``''`` + - NULL Value + - Distinguishes? + * - ``Cursor`` (default) + - Athena API + - ``''`` + - ``None`` + - ✅ Yes + * - ``DictCursor`` + - Athena API + - ``''`` + - ``None`` + - ✅ Yes + * - ``PandasCursor`` + - CSV file + - ``NaN`` + - ``NaN`` + - ❌ No + * - ``PandasCursor`` + unload + - Parquet file + - ``''`` + - ``None`` + - ✅ Yes + * - ``ArrowCursor`` + - CSV file + - ``''`` + - ``''`` + - ❌ No + * - ``ArrowCursor`` + unload + - Parquet file + - ``''`` + - ``null`` + - ✅ Yes + * - ``PolarsCursor`` + - CSV file + - ``''`` + - ``null`` + - ✅ Yes + * - ``PolarsCursor`` + unload + - Parquet file + - ``''`` + - ``null`` + - ✅ Yes + * - ``S3FSCursor`` + - CSV file + - ``''`` + - ``None`` + - ✅ Yes + +.. note:: + + ``PolarsCursor`` and ``S3FSCursor`` are unique among the file-based cursors in that they can properly + distinguish NULL from empty strings even when reading CSV files. ``PolarsCursor`` uses Polars' CSV parser + which correctly interprets unquoted empty values as NULL, while ``S3FSCursor`` uses a custom + ``AthenaCSVReader`` that respects CSV quoting rules. + +Default Cursor (API-based) +-------------------------- + +The default ``Cursor`` and ``DictCursor`` fetch results directly from the Athena API, +which properly distinguishes between NULL and empty string values. + +.. 
code:: python + + from pyathena import connect + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2").cursor() + + cursor.execute(""" + SELECT * FROM ( + VALUES + (1, ''), + (2, CAST(NULL AS VARCHAR)), + (3, 'hello') + ) AS t(id, value) + """) + + for row in cursor: + print(f"id={row[0]}, value={repr(row[1])}, is_none={row[1] is None}") + + # Output: + # id=1, value='', is_none=False <- Empty string + # id=2, value=None, is_none=True <- NULL + # id=3, value='hello', is_none=False + +This is the most reliable cursor when distinguishing NULL from empty string is critical. + +PandasCursor Behavior +--------------------- + +Without Unload (CSV) +~~~~~~~~~~~~~~~~~~~~ + +When using ``PandasCursor`` without the ``unload`` option, the cursor reads the CSV file +using pandas' ``read_csv()``. **Both empty strings and NULL values are treated as NaN**. + +.. code:: python + + from pyathena import connect + from pyathena.pandas.cursor import PandasCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PandasCursor).cursor() + + df = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_pandas() + + print(df) + # id value description + # 0 1 NaN empty_string <- Empty string becomes NaN + # 1 2 NaN null_value <- NULL becomes NaN + # 2 3 hello normal_string + + print(df['value'].isna().tolist()) + # [True, True, False] <- Both empty string and NULL are NaN + +.. note:: + + By default, PyAthena sets ``keep_default_na=False`` and ``na_values=("",)`` which means only + empty values are treated as NaN, while strings like "N/A", "NULL", "NA" are preserved as-is. + However, this still cannot distinguish between quoted empty strings and unquoted NULL values + in the CSV output. 
+ +With Unload (Parquet) - Recommended +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using the ``unload`` option outputs results in Parquet format, which properly preserves +NULL semantics: + +.. code:: python + + from pyathena import connect + from pyathena.pandas.cursor import PandasCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PandasCursor).cursor(unload=True) + + df = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_pandas() + + print(df) + # id value description + # 0 1 empty_string <- Empty string preserved + # 1 2 None null_value <- NULL is None + # 2 3 hello normal_string + + print(df['value'].isna().tolist()) + # [False, True, False] <- Only NULL is NaN, empty string is not + +ArrowCursor Behavior +-------------------- + +Without Unload (CSV) +~~~~~~~~~~~~~~~~~~~~ + +``ArrowCursor`` uses PyArrow's CSV reader with ``quoted_strings_can_be_null=False``, +which means **both NULL and empty strings become empty strings**: + +.. code:: python + + from pyathena import connect + from pyathena.arrow.cursor import ArrowCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=ArrowCursor).cursor() + + table = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_arrow() + + value_col = table.column('value') + print(value_col.to_pylist()) + # ['', '', 'hello'] <- Both empty string and NULL become '' + + print(value_col.null_count) + # 0 <- No null values detected + +With Unload (Parquet) - Recommended +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: python + + from pyathena import connect + from pyathena.arrow.cursor import ArrowCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=ArrowCursor).cursor(unload=True) + + table = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_arrow() + + value_col = table.column('value') + print(value_col.to_pylist()) + # ['', None, 'hello'] <- NULL is properly None + + print(value_col.null_count) + # 1 <- One null value correctly detected + +PolarsCursor Behavior +--------------------- + +``PolarsCursor`` is unique in that **it can distinguish NULL from empty strings even when reading CSV files**. +This is because Polars' CSV parser correctly interprets unquoted empty values as NULL. + +Without Unload (CSV) +~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + from pyathena import connect + from pyathena.polars.cursor import PolarsCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PolarsCursor).cursor() + + df = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_polars() + + print(df) + # shape: (3, 3) + # ┌─────┬───────┬───────────────┐ + # │ id ┆ value ┆ description │ + # │ --- ┆ --- ┆ --- │ + # │ i32 ┆ str ┆ str │ + # ╞═════╪═══════╪═══════════════╡ + # │ 1 ┆ ┆ empty_string │ <- Empty string + # │ 2 ┆ null ┆ null_value │ <- NULL + # │ 3 ┆ hello ┆ normal_string │ + # └─────┴───────┴───────────────┘ + + print(df['value'].is_null().to_list()) + # [False, True, False] <- Correctly distinguishes NULL + + print(df['value'].to_list()) + # ['', None, 'hello'] <- Empty string and NULL are different + +With Unload (Parquet) +~~~~~~~~~~~~~~~~~~~~~ + +The behavior is the same with 
unload, as Parquet also properly preserves NULL semantics: + +.. code:: python + + from pyathena import connect + from pyathena.polars.cursor import PolarsCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PolarsCursor).cursor(unload=True) + + df = cursor.execute("SELECT * FROM your_table").as_polars() + # Same behavior as CSV - NULL and empty strings are properly distinguished + +S3FSCursor Behavior +------------------- + +``S3FSCursor`` supports two CSV readers with different NULL handling behaviors: + +- **AthenaCSVReader** (default): Properly distinguishes NULL from empty strings by respecting CSV quoting rules +- **DefaultCSVReader**: For backward compatibility; both NULL and empty strings become ``None`` + +AthenaCSVReader (Default) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, ``S3FSCursor`` uses ``AthenaCSVReader`` which correctly interprets unquoted empty +values as NULL and quoted empty values as empty strings. + +.. code:: python + + from pyathena import connect + from pyathena.s3fs.cursor import S3FSCursor + + # AthenaCSVReader is used by default + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=S3FSCursor).cursor() + + cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """) + + for row in cursor: + print(f"id={row[0]}, value={repr(row[1])}, is_none={row[1] is None}") + + # Output: + # id=1, value='', is_none=False <- Empty string preserved + # id=2, value=None, is_none=True <- NULL is None + # id=3, value='hello', is_none=False + +DefaultCSVReader (Backward Compatibility) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you need backward compatibility where both NULL and empty strings are treated as ``None``, +you can explicitly specify ``DefaultCSVReader``: + +.. 
code:: python + + from pyathena import connect + from pyathena.s3fs.cursor import S3FSCursor + from pyathena.s3fs.reader import DefaultCSVReader + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=S3FSCursor, + cursor_kwargs={"csv_reader": DefaultCSVReader}).cursor() + + cursor.execute("SELECT '' AS empty_col, NULL AS null_col") + row = cursor.fetchone() + print(row) # (None, None) - both become None + +.. note:: + + ``S3FSCursor`` does not support the ``unload`` option. However, since ``AthenaCSVReader`` (the default) + can already distinguish NULL from empty strings when reading CSV files, this is not a limitation for NULL handling. + +Recommendations +--------------- + +Based on the cursor behaviors documented above: + +1. **If you need to distinguish NULL from empty strings:** + + - Use the default ``Cursor`` or ``DictCursor`` (API-based, most reliable) + - Or use ``PolarsCursor`` or ``S3FSCursor`` (works correctly even with CSV) + - Or use any cursor with ``unload=True`` (Parquet format) + +2. **If you need pandas DataFrame and NULL/empty string distinction:** + + - Use ``PandasCursor`` with ``unload=True`` + +3. **If you need Arrow Table and NULL/empty string distinction:** + + - Use ``ArrowCursor`` with ``unload=True`` + +4. **For data pipelines and ETL with DataFrame cursors:** + + - Prefer ``PolarsCursor`` (works correctly without unload) + - Or use ``unload=True`` with any DataFrame cursor + +5. **If performance is critical and NULL/empty distinction is not important:** + + - Any cursor works; CSV-based reading is generally faster for smaller datasets + +Summary Table +------------- + +.. 
list-table:: Recommended Cursor by Use Case + :header-rows: 1 + :widths: 40 60 + + * - Use Case + - Recommended Cursor + * - Need NULL/empty distinction with tuples + - ``Cursor``, ``DictCursor``, or ``S3FSCursor`` + * - Need pandas DataFrame with NULL/empty distinction + - ``PandasCursor`` with ``unload=True`` + * - Need Arrow Table with NULL/empty distinction + - ``ArrowCursor`` with ``unload=True`` + * - Need Polars DataFrame (any case) + - ``PolarsCursor`` (works with or without unload) + * - Don't care about NULL/empty distinction + - Any cursor (choose based on performance needs) + +Unload Limitations +------------------ + +While the ``unload`` option solves the NULL/empty string issue for pandas and Arrow cursors, +it has some limitations: + +- The UNLOAD statement has certain type restrictions (e.g., TIME type is not supported) +- Results are written to multiple files in parallel without guaranteed global sort order +- Column aliases are required for all SELECT expressions + +See the :ref:`arrow-cursor` documentation for more details on unload limitations. 
+ +Related Issues +-------------- + +This behavior is documented in response to the following GitHub issues: + +- `#118 <https://github.com/laughingman7743/PyAthena/issues/118>`_: PandasCursor converting strings to NaN +- `#148 <https://github.com/laughingman7743/PyAthena/issues/148>`_: String NULL handling in pandas conversion +- `#168 <https://github.com/laughingman7743/PyAthena/issues/168>`_: Inconsistent NULL handling across data types diff --git a/tests/pyathena/arrow/test_cursor.py b/tests/pyathena/arrow/test_cursor.py index 655a69ea..a6236af2 100644 --- a/tests/pyathena/arrow/test_cursor.py +++ b/tests/pyathena/arrow/test_cursor.py @@ -879,3 +879,51 @@ def test_timeout_parameters_float(self, arrow_cursor): # Verify float timeout parameters are passed to result set assert arrow_cursor.result_set._connect_timeout == 5.5 assert arrow_cursor.result_set._request_timeout == 15.5 + + @pytest.mark.parametrize( + "arrow_cursor", + [ + {"cursor_kwargs": {"unload": False}}, + {"cursor_kwargs": {"unload": True}}, + ], + indirect=["arrow_cursor"], + ) + def test_null_vs_empty_string(self, arrow_cursor): + """ + Test NULL vs empty string handling in ArrowCursor. + + Without unload (CSV): Cannot distinguish NULL from empty string (both become ''). + With unload (Parquet): Properly distinguishes NULL from empty string. + + See docs/null_handling.rst for details. 
+ """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + table = arrow_cursor.execute(query).as_arrow() + value_col = table.column("value") + values = value_col.to_pylist() + + if arrow_cursor._unload: + # With unload (Parquet): NULL and empty string are properly distinguished + assert values[0] == "" # Empty string + assert values[1] is None # NULL is None + assert value_col.null_count == 1 + else: + # Without unload (CSV): Both NULL and empty string become empty string + assert values[0] == "" # Empty string + assert values[1] == "" # NULL also becomes empty string + assert value_col.null_count == 0 + + # Normal strings are always preserved correctly + assert values[2] == "hello" + assert values[3] == "N/A" + assert values[4] == "NULL" diff --git a/tests/pyathena/pandas/test_cursor.py b/tests/pyathena/pandas/test_cursor.py index fc12166f..01a64739 100644 --- a/tests/pyathena/pandas/test_cursor.py +++ b/tests/pyathena/pandas/test_cursor.py @@ -1028,7 +1028,7 @@ def test_not_skip_blank_lines(self, pandas_cursor, parquet_engine): ], indirect=["pandas_cursor"], ) - def test_empty_and_null_string(self, pandas_cursor, parquet_engine): + def test_null_vs_empty_string(self, pandas_cursor, parquet_engine): # TODO https://github.com/laughingman7743/PyAthena/issues/118 query = """ SELECT * FROM (VALUES ('', 'a'), ('N/A', 'a'), ('NULL', 'a'), (NULL, 'a')) diff --git a/tests/pyathena/polars/test_cursor.py b/tests/pyathena/polars/test_cursor.py index 447b676b..8269a733 100644 --- a/tests/pyathena/polars/test_cursor.py +++ b/tests/pyathena/polars/test_cursor.py @@ -624,3 +624,48 @@ def test_iterator_with_chunksize_unload(self, polars_cursor): polars_cursor.execute("SELECT * FROM many_rows LIMIT 15") rows = list(polars_cursor) assert len(rows) == 15 + + 
@pytest.mark.parametrize( + "polars_cursor", + [ + {"cursor_kwargs": {"unload": False}}, + {"cursor_kwargs": {"unload": True}}, + ], + indirect=["polars_cursor"], + ) + def test_null_vs_empty_string(self, polars_cursor): + """ + Test NULL vs empty string handling in PolarsCursor. + + PolarsCursor can properly distinguish NULL from empty string in both CSV and Parquet modes. + This is unique among file-based cursors because Polars' CSV parser correctly interprets + unquoted empty values as NULL. + + See docs/null_handling.rst for details. + """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + df = polars_cursor.execute(query).as_polars() + is_null = df["value"].is_null().to_list() + values = df["value"].to_list() + + # Both CSV and Parquet modes properly distinguish NULL from empty string + assert not is_null[0] # Empty string is NOT null + assert values[0] == "" # Empty string is preserved + + assert is_null[1] # NULL IS null + assert values[1] is None # NULL is None + + # Normal strings are preserved correctly + assert values[2] == "hello" + assert values[3] == "N/A" + assert values[4] == "NULL" diff --git a/tests/pyathena/s3fs/test_cursor.py b/tests/pyathena/s3fs/test_cursor.py index b90523cb..404a4c9e 100644 --- a/tests/pyathena/s3fs/test_cursor.py +++ b/tests/pyathena/s3fs/test_cursor.py @@ -426,39 +426,38 @@ def test_empty_string_with_athena_reader(self): # AthenaCSVReader preserves empty string as '' assert result == ("",) - def test_null_vs_empty_string_with_default_reader(self): - """DefaultCSVReader: Both NULL and empty string become None.""" - with ( - contextlib.closing( - connect( - schema_name=ENV.schema, - cursor_class=S3FSCursor, - cursor_kwargs={"csv_reader": DefaultCSVReader}, - ) - ) as conn, - conn.cursor() as cursor, - ): - 
cursor.execute("SELECT NULL AS null_col, '' AS empty_col") - result = cursor.fetchone() - # Both become None - assert result == (None, None) + @pytest.mark.parametrize( + "csv_reader, expected_empty", + [ + (DefaultCSVReader, None), # DefaultCSVReader: empty string becomes None + (AthenaCSVReader, ""), # AthenaCSVReader: empty string is preserved + ], + ) + def test_null_vs_empty_string(self, csv_reader, expected_empty): + """ + Test NULL vs empty string handling with different CSV readers. + + DefaultCSVReader: Both NULL and empty string become None. + AthenaCSVReader: NULL is None, empty string is preserved as ''. - def test_null_vs_empty_string_with_athena_reader(self): - """AthenaCSVReader: NULL and empty string are distinct.""" + See docs/null_handling.rst for details. + """ with ( contextlib.closing( connect( schema_name=ENV.schema, cursor_class=S3FSCursor, - cursor_kwargs={"csv_reader": AthenaCSVReader}, + cursor_kwargs={"csv_reader": csv_reader}, ) ) as conn, conn.cursor() as cursor, ): cursor.execute("SELECT NULL AS null_col, '' AS empty_col") result = cursor.fetchone() - # NULL is None, empty string is '' - assert result == (None, "") + # NULL is always None + assert result[0] is None + # Empty string behavior depends on reader + assert result[1] == expected_empty def test_mixed_values_with_athena_reader(self): """AthenaCSVReader: Mixed NULL, empty string, and regular values.""" diff --git a/tests/pyathena/test_cursor.py b/tests/pyathena/test_cursor.py index 37883a1e..0ffe9174 100644 --- a/tests/pyathena/test_cursor.py +++ b/tests/pyathena/test_cursor.py @@ -863,6 +863,41 @@ def verify_query_id(): row = cursor.fetchone() assert row == (5,) + def test_null_vs_empty_string(self, cursor): + """ + Default Cursor should properly distinguish NULL from empty string. + See docs/null_handling.rst for details. 
+ """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + cursor.execute(query) + rows = cursor.fetchall() + + # Row 1: Empty string is preserved as empty string, not None + assert rows[0][1] == "" + assert rows[0][1] is not None + + # Row 2: NULL is properly returned as None + assert rows[1][1] is None + + # Row 3: Normal string + assert rows[2][1] == "hello" + + # Row 4: "N/A" string should be preserved as-is + assert rows[3][1] == "N/A" + + # Row 5: "NULL" string literal should be preserved as-is + assert rows[4][1] == "NULL" + class TestDictCursor: def test_fetchone(self, dict_cursor): @@ -884,6 +919,41 @@ def test_fetchall(self, dict_cursor): dict_cursor.execute("SELECT a FROM many_rows ORDER BY a") assert dict_cursor.fetchall() == [{"a": i} for i in range(10000)] + def test_null_vs_empty_string(self, dict_cursor): + """ + DictCursor should properly distinguish NULL from empty string. + See docs/null_handling.rst for details. 
+ """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + dict_cursor.execute(query) + rows = dict_cursor.fetchall() + + # Row 1: Empty string is preserved as empty string, not None + assert rows[0]["value"] == "" + assert rows[0]["value"] is not None + + # Row 2: NULL is properly returned as None + assert rows[1]["value"] is None + + # Row 3: Normal string + assert rows[2]["value"] == "hello" + + # Row 4: "N/A" string should be preserved as-is + assert rows[3]["value"] == "N/A" + + # Row 5: "NULL" string literal should be preserved as-is + assert rows[4]["value"] == "NULL" + class TestComplexDataTypes: """Test complex data types (STRUCT, ARRAY, MAP) with actual Athena queries."""