diff --git a/docs/index.rst b/docs/index.rst index cee8e3ca..21ec91a2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,6 +17,7 @@ Documentation introduction usage cursor + null_handling sqlalchemy pandas arrow diff --git a/docs/null_handling.rst b/docs/null_handling.rst new file mode 100644 index 00000000..6d75137f --- /dev/null +++ b/docs/null_handling.rst @@ -0,0 +1,475 @@ +.. _null_handling: + +NULL and Empty String Handling +============================== + +This section documents the behavior of NULL and empty string values across different cursor types in PyAthena. +Understanding this behavior is important for applications that need to distinguish between NULL (missing) values +and empty strings. + +.. _null-csv-limitation: + +CSV Format Limitation +--------------------- + +When Athena executes a query, the results are stored as CSV files in S3. This CSV format has an inherent limitation +in how NULL values and empty strings are represented: + +- **NULL values** are represented as unquoted empty fields: ``,,`` +- **Empty strings** are represented as quoted empty fields: ``,"",`` + +.. code:: text + + # Example CSV output from Athena + id,name,description + 1,Alice,Hello + 2,Bob,"" + 3,Charlie, + + # Row 2: description is an empty string (quoted: "") + # Row 3: description is NULL (unquoted) + +Most CSV parsers handle these differently: + +- **pandas** (used by ``PandasCursor``): By default, treats both unquoted empty and quoted empty as missing values +- **PyArrow** (used by ``ArrowCursor``): With ``quoted_strings_can_be_null=False``, treats both as empty strings +- **Polars** (used by ``PolarsCursor``): Correctly distinguishes quoted empty strings from unquoted NULL values +- **AthenaCSVReader** (used by ``S3FSCursor``): Correctly distinguishes NULL (returns ``None``) from empty strings + +This means the ability to distinguish NULL from empty strings varies by cursor when reading CSV files. 
+ +Cursor Behavior Comparison +-------------------------- + +The following table summarizes how different cursors handle NULL and empty string values, +based on actual testing with Athena: + +.. list-table:: NULL vs Empty String Behavior + :header-rows: 1 + :widths: 25 20 20 20 15 + + * - Cursor Type + - Data Source + - Empty String ``''`` + - NULL Value + - Distinguishes? + * - ``Cursor`` (default) + - Athena API + - ``''`` + - ``None`` + - ✅ Yes + * - ``DictCursor`` + - Athena API + - ``''`` + - ``None`` + - ✅ Yes + * - ``PandasCursor`` + - CSV file + - ``NaN`` + - ``NaN`` + - ❌ No + * - ``PandasCursor`` + unload + - Parquet file + - ``''`` + - ``None`` + - ✅ Yes + * - ``ArrowCursor`` + - CSV file + - ``''`` + - ``''`` + - ❌ No + * - ``ArrowCursor`` + unload + - Parquet file + - ``''`` + - ``null`` + - ✅ Yes + * - ``PolarsCursor`` + - CSV file + - ``''`` + - ``null`` + - ✅ Yes + * - ``PolarsCursor`` + unload + - Parquet file + - ``''`` + - ``null`` + - ✅ Yes + * - ``S3FSCursor`` + - CSV file + - ``''`` + - ``None`` + - ✅ Yes + +.. note:: + + ``PolarsCursor`` and ``S3FSCursor`` are unique among the file-based cursors in that they can properly + distinguish NULL from empty strings even when reading CSV files. ``PolarsCursor`` uses Polars' CSV parser + which correctly interprets unquoted empty values as NULL, while ``S3FSCursor`` uses a custom + ``AthenaCSVReader`` that respects CSV quoting rules. + +Default Cursor (API-based) +-------------------------- + +The default ``Cursor`` and ``DictCursor`` fetch results directly from the Athena API, +which properly distinguishes between NULL and empty string values. + +.. 
code:: python + + from pyathena import connect + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2").cursor() + + cursor.execute(""" + SELECT * FROM ( + VALUES + (1, ''), + (2, CAST(NULL AS VARCHAR)), + (3, 'hello') + ) AS t(id, value) + """) + + for row in cursor: + print(f"id={row[0]}, value={repr(row[1])}, is_none={row[1] is None}") + + # Output: + # id=1, value='', is_none=False <- Empty string + # id=2, value=None, is_none=True <- NULL + # id=3, value='hello', is_none=False + +This is the most reliable cursor when distinguishing NULL from empty string is critical. + +PandasCursor Behavior +--------------------- + +Without Unload (CSV) +~~~~~~~~~~~~~~~~~~~~ + +When using ``PandasCursor`` without the ``unload`` option, the cursor reads the CSV file +using pandas' ``read_csv()``. **Both empty strings and NULL values are treated as NaN**. + +.. code:: python + + from pyathena import connect + from pyathena.pandas.cursor import PandasCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PandasCursor).cursor() + + df = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_pandas() + + print(df) + # id value description + # 0 1 NaN empty_string <- Empty string becomes NaN + # 1 2 NaN null_value <- NULL becomes NaN + # 2 3 hello normal_string + + print(df['value'].isna().tolist()) + # [True, True, False] <- Both empty string and NULL are NaN + +.. note:: + + By default, PyAthena sets ``keep_default_na=False`` and ``na_values=("",)`` which means only + empty values are treated as NaN, while strings like "N/A", "NULL", "NA" are preserved as-is. + However, this still cannot distinguish between quoted empty strings and unquoted NULL values + in the CSV output. 
+ +With Unload (Parquet) - Recommended +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using the ``unload`` option outputs results in Parquet format, which properly preserves +NULL semantics: + +.. code:: python + + from pyathena import connect + from pyathena.pandas.cursor import PandasCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PandasCursor).cursor(unload=True) + + df = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_pandas() + + print(df) + # id value description + # 0 1 empty_string <- Empty string preserved + # 1 2 None null_value <- NULL is None + # 2 3 hello normal_string + + print(df['value'].isna().tolist()) + # [False, True, False] <- Only NULL is NaN, empty string is not + +ArrowCursor Behavior +-------------------- + +Without Unload (CSV) +~~~~~~~~~~~~~~~~~~~~ + +``ArrowCursor`` uses PyArrow's CSV reader with ``quoted_strings_can_be_null=False``, +which means **both NULL and empty strings become empty strings**: + +.. code:: python + + from pyathena import connect + from pyathena.arrow.cursor import ArrowCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=ArrowCursor).cursor() + + table = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_arrow() + + value_col = table.column('value') + print(value_col.to_pylist()) + # ['', '', 'hello'] <- Both empty string and NULL become '' + + print(value_col.null_count) + # 0 <- No null values detected + +With Unload (Parquet) - Recommended +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: python + + from pyathena import connect + from pyathena.arrow.cursor import ArrowCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=ArrowCursor).cursor(unload=True) + + table = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_arrow() + + value_col = table.column('value') + print(value_col.to_pylist()) + # ['', None, 'hello'] <- NULL is properly None + + print(value_col.null_count) + # 1 <- One null value correctly detected + +PolarsCursor Behavior +--------------------- + +``PolarsCursor`` is unique in that **it can distinguish NULL from empty strings even when reading CSV files**. +This is because Polars' CSV parser correctly interprets unquoted empty values as NULL. + +Without Unload (CSV) +~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + from pyathena import connect + from pyathena.polars.cursor import PolarsCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PolarsCursor).cursor() + + df = cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """).as_polars() + + print(df) + # shape: (3, 3) + # ┌─────┬───────┬───────────────┐ + # │ id ┆ value ┆ description │ + # │ --- ┆ --- ┆ --- │ + # │ i32 ┆ str ┆ str │ + # ╞═════╪═══════╪═══════════════╡ + # │ 1 ┆ ┆ empty_string │ <- Empty string + # │ 2 ┆ null ┆ null_value │ <- NULL + # │ 3 ┆ hello ┆ normal_string │ + # └─────┴───────┴───────────────┘ + + print(df['value'].is_null().to_list()) + # [False, True, False] <- Correctly distinguishes NULL + + print(df['value'].to_list()) + # ['', None, 'hello'] <- Empty string and NULL are different + +With Unload (Parquet) +~~~~~~~~~~~~~~~~~~~~~ + +The behavior is the same with 
unload, as Parquet also properly preserves NULL semantics: + +.. code:: python + + from pyathena import connect + from pyathena.polars.cursor import PolarsCursor + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=PolarsCursor).cursor(unload=True) + + df = cursor.execute("SELECT * FROM your_table").as_polars() + # Same behavior as CSV - NULL and empty strings are properly distinguished + +S3FSCursor Behavior +------------------- + +``S3FSCursor`` supports two CSV readers with different NULL handling behaviors: + +- **AthenaCSVReader** (default): Properly distinguishes NULL from empty strings by respecting CSV quoting rules +- **DefaultCSVReader**: For backward compatibility; both NULL and empty strings become ``None`` + +AthenaCSVReader (Default) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, ``S3FSCursor`` uses ``AthenaCSVReader`` which correctly interprets unquoted empty +values as NULL and quoted empty values as empty strings. + +.. code:: python + + from pyathena import connect + from pyathena.s3fs.cursor import S3FSCursor + + # AthenaCSVReader is used by default + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=S3FSCursor).cursor() + + cursor.execute(""" + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string') + ) AS t(id, value, description) + """) + + for row in cursor: + print(f"id={row[0]}, value={repr(row[1])}, is_none={row[1] is None}") + + # Output: + # id=1, value='', is_none=False <- Empty string preserved + # id=2, value=None, is_none=True <- NULL is None + # id=3, value='hello', is_none=False + +DefaultCSVReader (Backward Compatibility) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you need backward compatibility where both NULL and empty strings are treated as ``None``, +you can explicitly specify ``DefaultCSVReader``: + +.. 
code:: python + + from pyathena import connect + from pyathena.s3fs.cursor import S3FSCursor + from pyathena.s3fs.reader import DefaultCSVReader + + cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/", + region_name="us-west-2", + cursor_class=S3FSCursor, + cursor_kwargs={"csv_reader": DefaultCSVReader}).cursor() + + cursor.execute("SELECT '' AS empty_col, NULL AS null_col") + row = cursor.fetchone() + print(row) # (None, None) - both become None + +.. note:: + + ``S3FSCursor`` does not support the ``unload`` option. However, since ``AthenaCSVReader`` (the default) + can already distinguish NULL from empty strings when reading CSV files, this is not a limitation for NULL handling. + +Recommendations +--------------- + +Based on the cursor behaviors documented above: + +1. **If you need to distinguish NULL from empty strings:** + + - Use the default ``Cursor`` or ``DictCursor`` (API-based, most reliable) + - Or use ``PolarsCursor`` or ``S3FSCursor`` (works correctly even with CSV) + - Or use any cursor with ``unload=True`` (Parquet format) + +2. **If you need pandas DataFrame and NULL/empty string distinction:** + + - Use ``PandasCursor`` with ``unload=True`` + +3. **If you need Arrow Table and NULL/empty string distinction:** + + - Use ``ArrowCursor`` with ``unload=True`` + +4. **For data pipelines and ETL with DataFrame cursors:** + + - Prefer ``PolarsCursor`` (works correctly without unload) + - Or use ``unload=True`` with any DataFrame cursor + +5. **If performance is critical and NULL/empty distinction is not important:** + + - Any cursor works; CSV-based reading is generally faster for smaller datasets + +Summary Table +------------- + +.. 
list-table:: Recommended Cursor by Use Case + :header-rows: 1 + :widths: 40 60 + + * - Use Case + - Recommended Cursor + * - Need NULL/empty distinction with tuples + - ``Cursor``, ``DictCursor``, or ``S3FSCursor`` + * - Need pandas DataFrame with NULL/empty distinction + - ``PandasCursor`` with ``unload=True`` + * - Need Arrow Table with NULL/empty distinction + - ``ArrowCursor`` with ``unload=True`` + * - Need Polars DataFrame (any case) + - ``PolarsCursor`` (works with or without unload) + * - Don't care about NULL/empty distinction + - Any cursor (choose based on performance needs) + +Unload Limitations +------------------ + +While the ``unload`` option solves the NULL/empty string issue for pandas and Arrow cursors, +it has some limitations: + +- The UNLOAD statement has certain type restrictions (e.g., TIME type is not supported) +- Results are written to multiple files in parallel without guaranteed global sort order +- Column aliases are required for all SELECT expressions + +See the :ref:`arrow-cursor` documentation for more details on unload limitations. 
+ +Related Issues +-------------- + +This behavior is documented in response to the following GitHub issues: + +- `#118 <https://github.com/laughingman7743/PyAthena/issues/118>`_: PandasCursor converting strings to NaN +- `#148 <https://github.com/laughingman7743/PyAthena/issues/148>`_: String NULL handling in pandas conversion +- `#168 <https://github.com/laughingman7743/PyAthena/issues/168>`_: Inconsistent NULL handling across data types diff --git a/tests/pyathena/arrow/test_cursor.py index 655a69ea..a6236af2 100644 --- a/tests/pyathena/arrow/test_cursor.py +++ b/tests/pyathena/arrow/test_cursor.py @@ -879,3 +879,51 @@ def test_timeout_parameters_float(self, arrow_cursor): # Verify float timeout parameters are passed to result set assert arrow_cursor.result_set._connect_timeout == 5.5 assert arrow_cursor.result_set._request_timeout == 15.5 + + @pytest.mark.parametrize( + "arrow_cursor", + [ + {"cursor_kwargs": {"unload": False}}, + {"cursor_kwargs": {"unload": True}}, + ], + indirect=["arrow_cursor"], + ) + def test_null_vs_empty_string(self, arrow_cursor): + """ + Test NULL vs empty string handling in ArrowCursor. + + Without unload (CSV): Cannot distinguish NULL from empty string (both become ''). + With unload (Parquet): Properly distinguishes NULL from empty string. + + See docs/null_handling.rst for details. 
+ """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + table = arrow_cursor.execute(query).as_arrow() + value_col = table.column("value") + values = value_col.to_pylist() + + if arrow_cursor._unload: + # With unload (Parquet): NULL and empty string are properly distinguished + assert values[0] == "" # Empty string + assert values[1] is None # NULL is None + assert value_col.null_count == 1 + else: + # Without unload (CSV): Both NULL and empty string become empty string + assert values[0] == "" # Empty string + assert values[1] == "" # NULL also becomes empty string + assert value_col.null_count == 0 + + # Normal strings are always preserved correctly + assert values[2] == "hello" + assert values[3] == "N/A" + assert values[4] == "NULL" diff --git a/tests/pyathena/pandas/test_cursor.py b/tests/pyathena/pandas/test_cursor.py index fc12166f..01a64739 100644 --- a/tests/pyathena/pandas/test_cursor.py +++ b/tests/pyathena/pandas/test_cursor.py @@ -1028,7 +1028,7 @@ def test_not_skip_blank_lines(self, pandas_cursor, parquet_engine): ], indirect=["pandas_cursor"], ) - def test_empty_and_null_string(self, pandas_cursor, parquet_engine): + def test_null_vs_empty_string(self, pandas_cursor, parquet_engine): # TODO https://github.com/laughingman7743/PyAthena/issues/118 query = """ SELECT * FROM (VALUES ('', 'a'), ('N/A', 'a'), ('NULL', 'a'), (NULL, 'a')) diff --git a/tests/pyathena/polars/test_cursor.py b/tests/pyathena/polars/test_cursor.py index 447b676b..8269a733 100644 --- a/tests/pyathena/polars/test_cursor.py +++ b/tests/pyathena/polars/test_cursor.py @@ -624,3 +624,48 @@ def test_iterator_with_chunksize_unload(self, polars_cursor): polars_cursor.execute("SELECT * FROM many_rows LIMIT 15") rows = list(polars_cursor) assert len(rows) == 15 + + 
@pytest.mark.parametrize( + "polars_cursor", + [ + {"cursor_kwargs": {"unload": False}}, + {"cursor_kwargs": {"unload": True}}, + ], + indirect=["polars_cursor"], + ) + def test_null_vs_empty_string(self, polars_cursor): + """ + Test NULL vs empty string handling in PolarsCursor. + + PolarsCursor can properly distinguish NULL from empty string in both CSV and Parquet modes. + This is unique among file-based cursors because Polars' CSV parser correctly interprets + unquoted empty values as NULL. + + See docs/null_handling.rst for details. + """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + df = polars_cursor.execute(query).as_polars() + is_null = df["value"].is_null().to_list() + values = df["value"].to_list() + + # Both CSV and Parquet modes properly distinguish NULL from empty string + assert not is_null[0] # Empty string is NOT null + assert values[0] == "" # Empty string is preserved + + assert is_null[1] # NULL IS null + assert values[1] is None # NULL is None + + # Normal strings are preserved correctly + assert values[2] == "hello" + assert values[3] == "N/A" + assert values[4] == "NULL" diff --git a/tests/pyathena/s3fs/test_cursor.py b/tests/pyathena/s3fs/test_cursor.py index b90523cb..404a4c9e 100644 --- a/tests/pyathena/s3fs/test_cursor.py +++ b/tests/pyathena/s3fs/test_cursor.py @@ -426,39 +426,38 @@ def test_empty_string_with_athena_reader(self): # AthenaCSVReader preserves empty string as '' assert result == ("",) - def test_null_vs_empty_string_with_default_reader(self): - """DefaultCSVReader: Both NULL and empty string become None.""" - with ( - contextlib.closing( - connect( - schema_name=ENV.schema, - cursor_class=S3FSCursor, - cursor_kwargs={"csv_reader": DefaultCSVReader}, - ) - ) as conn, - conn.cursor() as cursor, - ): - 
cursor.execute("SELECT NULL AS null_col, '' AS empty_col") - result = cursor.fetchone() - # Both become None - assert result == (None, None) + @pytest.mark.parametrize( + "csv_reader, expected_empty", + [ + (DefaultCSVReader, None), # DefaultCSVReader: empty string becomes None + (AthenaCSVReader, ""), # AthenaCSVReader: empty string is preserved + ], + ) + def test_null_vs_empty_string(self, csv_reader, expected_empty): + """ + Test NULL vs empty string handling with different CSV readers. + + DefaultCSVReader: Both NULL and empty string become None. + AthenaCSVReader: NULL is None, empty string is preserved as ''. - def test_null_vs_empty_string_with_athena_reader(self): - """AthenaCSVReader: NULL and empty string are distinct.""" + See docs/null_handling.rst for details. + """ with ( contextlib.closing( connect( schema_name=ENV.schema, cursor_class=S3FSCursor, - cursor_kwargs={"csv_reader": AthenaCSVReader}, + cursor_kwargs={"csv_reader": csv_reader}, ) ) as conn, conn.cursor() as cursor, ): cursor.execute("SELECT NULL AS null_col, '' AS empty_col") result = cursor.fetchone() - # NULL is None, empty string is '' - assert result == (None, "") + # NULL is always None + assert result[0] is None + # Empty string behavior depends on reader + assert result[1] == expected_empty def test_mixed_values_with_athena_reader(self): """AthenaCSVReader: Mixed NULL, empty string, and regular values.""" diff --git a/tests/pyathena/test_cursor.py b/tests/pyathena/test_cursor.py index 37883a1e..0ffe9174 100644 --- a/tests/pyathena/test_cursor.py +++ b/tests/pyathena/test_cursor.py @@ -863,6 +863,41 @@ def verify_query_id(): row = cursor.fetchone() assert row == (5,) + def test_null_vs_empty_string(self, cursor): + """ + Default Cursor should properly distinguish NULL from empty string. + See docs/null_handling.rst for details. 
+ """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + cursor.execute(query) + rows = cursor.fetchall() + + # Row 1: Empty string is preserved as empty string, not None + assert rows[0][1] == "" + assert rows[0][1] is not None + + # Row 2: NULL is properly returned as None + assert rows[1][1] is None + + # Row 3: Normal string + assert rows[2][1] == "hello" + + # Row 4: "N/A" string should be preserved as-is + assert rows[3][1] == "N/A" + + # Row 5: "NULL" string literal should be preserved as-is + assert rows[4][1] == "NULL" + class TestDictCursor: def test_fetchone(self, dict_cursor): @@ -884,6 +919,41 @@ def test_fetchall(self, dict_cursor): dict_cursor.execute("SELECT a FROM many_rows ORDER BY a") assert dict_cursor.fetchall() == [{"a": i} for i in range(10000)] + def test_null_vs_empty_string(self, dict_cursor): + """ + DictCursor should properly distinguish NULL from empty string. + See docs/null_handling.rst for details. 
+ """ + query = """ + SELECT * FROM ( + VALUES + (1, '', 'empty_string'), + (2, CAST(NULL AS VARCHAR), 'null_value'), + (3, 'hello', 'normal_string'), + (4, 'N/A', 'na_string'), + (5, 'NULL', 'null_string_literal') + ) AS t(id, value, description) + ORDER BY id + """ + dict_cursor.execute(query) + rows = dict_cursor.fetchall() + + # Row 1: Empty string is preserved as empty string, not None + assert rows[0]["value"] == "" + assert rows[0]["value"] is not None + + # Row 2: NULL is properly returned as None + assert rows[1]["value"] is None + + # Row 3: Normal string + assert rows[2]["value"] == "hello" + + # Row 4: "N/A" string should be preserved as-is + assert rows[3]["value"] == "N/A" + + # Row 5: "NULL" string literal should be preserved as-is + assert rows[4]["value"] == "NULL" + class TestComplexDataTypes: """Test complex data types (STRUCT, ARRAY, MAP) with actual Athena queries."""