Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 81 additions & 2 deletions python/sedonadb/python/sedonadb/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import math
import os
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, List, Tuple
from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Tuple, Union

import geoarrow.pyarrow as ga
import pyarrow as pa
Expand All @@ -29,6 +29,85 @@
import sedonadb


def random_geometry(
n: int = 1024,
geometry_type: Literal[
"Point",
"LineString",
"Polygon",
"MultiPoint",
"MultiLineString",
"MultiPolygon",
"GeometryCollection",
] = "Point",
*,
num_vertices: Union[int, Tuple[int, int]] = 4,
num_parts: Union[int, Tuple[int, int]] = (1, 3),
size: Union[float, Tuple[float, float]] = (5.0, 20.0),
bounds: Iterable[float] = (-170, -80, 170, 80),
hole_rate: float = 0.0,
empty_rate: float = 0.0,
null_rate: float = 0.0,
seed: Optional[int] = None,
) -> "sedonadb.dataframe.DataFrame":
    """Return a DataFrame of randomly generated geometries.

    The arguments are packed into a JSON options object and passed to the
    ``sd_random_geometry()`` table function; the ``id`` and ``geometry``
    columns of its output are selected and limited to ``n`` rows.

    Args:
        n: Number of rows requested (sent as ``target_rows`` and also used
            as the row limit of the result).
        geometry_type: Geometry type to generate (sent as ``geom_type``).
        num_vertices: Either a single count or a ``(min, max)`` tuple (sent
            as ``vertices_per_linestring_range``).
        num_parts: Either a single count or a ``(min, max)`` tuple (sent as
            ``num_parts_range``).
        size: Either a single value or a ``(min, max)`` tuple (sent as
            ``size_range``). A single value ``s`` is expanded to the range
            ``[s, s + s / 1000]``.
        bounds: Four numbers ``[xmin, ymin, xmax, ymax]``; each is converted
            to ``float``.
        hole_rate: Sent as ``polygon_hole_rate`` after clamping to
            ``[0.0, 1.0]``.
        empty_rate: Sent as ``empty_rate`` after clamping to ``[0.0, 1.0]``.
        null_rate: Sent as ``null_rate`` after clamping to ``[0.0, 1.0]``.
        seed: Integer seed for the generator. When ``None``, the current
            time in milliseconds is used, so results differ between calls.

    Returns:
        A DataFrame with the columns ``id`` and ``geometry``.

    Raises:
        ValueError: If a ``(min, max)`` range has ``min > max``, if
            ``bounds`` does not contain exactly four values, or if the
            minimum size is larger than the width or height of ``bounds``.
    """
Comment on lines +32 to +52
Copy link
Copy Markdown

@coderabbitai coderabbitai bot Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Add a docstring to document the public API.

This public testing utility lacks documentation. Please add a docstring that explains the purpose, parameters, return value, and provides usage examples.

Consider adding documentation like:

 def random_geometry(
     n: int = 1024,
     geometry_type: Literal[
         "Point",
         "LineString",
         "Polygon",
         "MultiPoint",
         "MultiLineString",
         "MultiPolygon",
         "GeometryCollection",
     ] = "Point",
     *,
     num_vertices: Union[int, Tuple[int, int]] = 4,
     num_parts: Union[int, Tuple[int, int]] = (1, 3),
     size: Union[float, Tuple[float, float]] = (5.0, 20.0),
     bounds: Iterable[float] = (-170, -80, 170, 80),
     hole_rate: float = 0.0,
     empty_rate: float = 0.0,
     null_rate: float = 0.0,
     seed: Optional[int] = None,
 ) -> "sedonadb.dataframe.DataFrame":
+    """Generate random geometric data for testing.
+
+    Args:
+        n: Number of geometries to generate
+        geometry_type: Type of geometry to generate
+        num_vertices: Number of vertices (int or (min, max) tuple)
+        num_parts: Number of parts for multi-geometries (int or (min, max) tuple)
+        size: Size of geometries (float or (min, max) tuple)
+        bounds: Bounding box as [xmin, ymin, xmax, ymax]
+        hole_rate: Probability of holes in polygons (0.0 to 1.0)
+        empty_rate: Probability of empty geometries (0.0 to 1.0)
+        null_rate: Probability of null geometries (0.0 to 1.0)
+        seed: Random seed for reproducibility (auto-generated if None)
+
+    Returns:
+        DataFrame with 'id' and 'geometry' columns
+
+    Examples:
+        >>> df = random_geometry(100, "Point", bounds=(-180, -90, 180, 90))
+        >>> df = random_geometry(50, "Polygon", num_vertices=(3, 10), size=(1.0, 5.0))
+    """
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def random_geometry(
n: int = 1024,
geometry_type: Literal[
"Point",
"LineString",
"Polygon",
"MultiPoint",
"MultiLineString",
"MultiPolygon",
"GeometryCollection",
] = "Point",
*,
num_vertices: Union[int, Tuple[int, int]] = 4,
num_parts: Union[int, Tuple[int, int]] = (1, 3),
size: Union[float, Tuple[float, float]] = (5.0, 20.0),
bounds: Iterable[float] = (-170, -80, 170, 80),
hole_rate: float = 0.0,
empty_rate: float = 0.0,
null_rate: float = 0.0,
seed: Optional[int] = None,
) -> "sedonadb.dataframe.DataFrame":
def random_geometry(
n: int = 1024,
geometry_type: Literal[
"Point",
"LineString",
"Polygon",
"MultiPoint",
"MultiLineString",
"MultiPolygon",
"GeometryCollection",
] = "Point",
*,
num_vertices: Union[int, Tuple[int, int]] = 4,
num_parts: Union[int, Tuple[int, int]] = (1, 3),
size: Union[float, Tuple[float, float]] = (5.0, 20.0),
bounds: Iterable[float] = (-170, -80, 170, 80),
hole_rate: float = 0.0,
empty_rate: float = 0.0,
null_rate: float = 0.0,
seed: Optional[int] = None,
) -> "sedonadb.dataframe.DataFrame":
"""Generate random geometric data for testing.
Args:
n: Number of geometries to generate
geometry_type: Type of geometry to generate
num_vertices: Number of vertices (int or (min, max) tuple)
num_parts: Number of parts for multi-geometries (int or (min, max) tuple)
size: Size of geometries (float or (min, max) tuple)
bounds: Bounding box as [xmin, ymin, xmax, ymax]
hole_rate: Probability of holes in polygons (0.0 to 1.0)
empty_rate: Probability of empty geometries (0.0 to 1.0)
null_rate: Probability of null geometries (0.0 to 1.0)
seed: Random seed for reproducibility (auto-generated if None)
Returns:
DataFrame with 'id' and 'geometry' columns
Examples:
>>> df = random_geometry(100, "Point", bounds=(-180, -90, 180, 90))
>>> df = random_geometry(50, "Polygon", num_vertices=(3, 10), size=(1.0, 5.0))
"""
🤖 Prompt for AI Agents
In python/sedonadb/python/sedonadb/testing.py around lines 32 to 52, add a
comprehensive docstring to the public function random_geometry: document the
function purpose (generate N random geometries of specified type), enumerate and
describe each parameter (n, geometry_type, num_vertices, num_parts, size,
bounds, hole_rate, empty_rate, null_rate, seed) including accepted types and
defaults, describe the return value (sedonadb.dataframe.DataFrame containing the
geometries and any metadata), note behavior for edge cases (rates producing
empty/null geometries, tuple ranges for num_vertices/num_parts/size), and
include a short usage example showing typical invocation and expected output
shape; keep it concise, follow Google or NumPy docstring style, and place it
immediately below the function signature.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:good-to-have; category:documentation; feedback:The AI reviewer is correct that adding a docstring to the new function would be useful for its users. It would document the expected arguments and their possible values.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@martin-augment Thank you for the feedback! I'm glad the suggestion is helpful. The docstring will indeed make it easier for users to understand how to use random_geometry() and what values are valid for each parameter. If you'd like me to assist with anything else related to this PR, feel free to let me know!

🐰✨
( •‿•)
/>📚

import json
import time

import sedonadb

# Normalize num_vertices to a (min, max) pair; a scalar is used for both ends.
# NOTE: only tuple instances are unpacked -- any other value (including a
# list) falls through to the scalar branch.
if isinstance(num_vertices, tuple):
num_vertices_min, num_vertices_max = num_vertices
else:
num_vertices_min = num_vertices_max = num_vertices

# Same normalization for num_parts.
if isinstance(num_parts, tuple):
num_parts_min, num_parts_max = num_parts
else:
num_parts_min = num_parts_max = num_parts

# A scalar size becomes a narrow range whose upper end is 0.1% larger
# (size / 1e3). Presumably the generator needs max > min -- TODO confirm.
if isinstance(size, tuple):
size_min, size_max = size
else:
size_min = size
size_max = size + size / 1e3

# Reject inverted ranges before the options are sent to the generator.
if num_vertices_min > num_vertices_max:
raise ValueError("num_vertices_min > num_vertices_max")
if num_parts_min > num_parts_max:
raise ValueError("num_parts_min > num_parts_max")
if size_min > size_max:
raise ValueError("size_min > size_max")

# Materialize bounds as a list of floats so that len() and indexing work
# for any iterable input.
bounds = [float(b) for b in bounds]
if len(bounds) != 4:
raise ValueError(
f"Expected bounds as [xmin, ymin, xmax, ymax] but got {bounds}"
)

# Horizontal extent of the bounding box (xmax - xmin).
width = bounds[2] - bounds[0]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The validation error message doesn't match the check: the condition compares size_min to width/height, but the message mentions width/2 and height/2. Consider aligning the check or the message to avoid confusion.

🤖 React with 👍 or 👎 to let us know if the comment was useful.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback:The AI reviewer is correct that the error message talks about half width/height while the check uses their full values. Either the check or the error message is wrong.

# Vertical extent of the bounding box (ymax - ymin).
height = bounds[3] - bounds[1]
# The smallest requested size must not exceed the box in either dimension.
# The message describes the comparison actually performed (full width and
# height); it previously mentioned "height / 2 or width / 2", which the
# condition never checked.
# NOTE(review): if the generator really needs size <= half the extent, the
# condition (not the message) should be tightened instead -- confirm.
if size_min > width or size_min > height:
    raise ValueError("size_min exceeds bounds width or height")
Comment on lines +87 to +90
Copy link
Copy Markdown

@coderabbitai coderabbitai bot Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fix the error message to match the validation logic.

The error message states "size > height / 2 or width / 2 of bounds", but the actual validation checks size_min > width or size_min > height (full dimensions, not halves).

Apply this diff:

     width = bounds[2] - bounds[0]
     height = bounds[3] - bounds[1]
     if size_min > width or size_min > height:
-        raise ValueError("size > height / 2 or width / 2 of bounds")
+        raise ValueError("size_min exceeds bounds width or height")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
width = bounds[2] - bounds[0]
height = bounds[3] - bounds[1]
if size_min > width or size_min > height:
raise ValueError("size > height / 2 or width / 2 of bounds")
width = bounds[2] - bounds[0]
height = bounds[3] - bounds[1]
if size_min > width or size_min > height:
raise ValueError("size_min exceeds bounds width or height")
🧰 Tools
🪛 Ruff (0.14.1)

90-90: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In python/sedonadb/python/sedonadb/testing.py around lines 87 to 90 the
ValueError message is misleading: the validation checks whether size_min is
greater than the full width or height (size_min > width or size_min > height)
but the message mentions halves ("height / 2 or width / 2"). Update the
exception message to reflect the actual check (e.g., "size_min must be <= width
and <= height" or "size_min greater than bounds width or height") so the error
accurately describes the validation performed.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback:The AI reviewer is correct that the error message talks about half width/height while the check uses their full values. Either the check or the error message is wrong.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@martin-augment Thank you for confirming! After analyzing the context, I believe the validation check is the bug, not the error message. Here's why:

Reasoning:
When generating random geometries with a given size within bounds, the geometry needs room on all sides. If size represents the extent or diameter of a geometry:

  • A geometry centered anywhere within the bounds needs at least size/2 clearance from each edge
  • Therefore, size_min should be at most width/2 or height/2 to allow proper placement
  • Checking against the full width/height is more permissive than checking against half of it, so it lets through size values that may leave no room to place the geometry

Recommended Fix:
Update the validation check to use half dimensions:

width = bounds[2] - bounds[0]
height = bounds[3] - bounds[1]
if size_min > width / 2 or size_min > height / 2:
    raise ValueError("size_min exceeds half of bounds width or height")

This aligns the logic with the original error message intent and ensures geometries can be properly positioned within the bounds.


# Options object handed to sd_random_geometry() as JSON.
args = {
"bounds": bounds,
# Rates are clamped into [0.0, 1.0] rather than rejected.
"empty_rate": max(min(empty_rate, 1.0), 0.0),
"geom_type": geometry_type,
"null_rate": max(min(null_rate, 1.0), 0.0),
"num_parts_range": [num_parts_min, num_parts_max],
"polygon_hole_rate": max(min(hole_rate, 1.0), 0.0),
# Without an explicit seed, fall back to the current time in milliseconds.
"seed": int(seed) if seed is not None else round(time.time() * 1000),
"size_range": [size_min, size_max],
"target_rows": int(n),
"vertices_per_linestring_range": [num_vertices_min, num_vertices_max],
}

# The JSON text is embedded in a single-quoted SQL string literal.
# NOTE(review): a single quote inside any value (e.g. geometry_type) would
# terminate the literal early -- confirm callers only pass the documented
# geometry type names.
# The explicit limit caps the result at n rows; presumably the generator may
# emit more than target_rows -- TODO confirm.
sd = sedonadb.connect()
return sd.sql(
f"SELECT id, geometry FROM sd_random_geometry('{json.dumps(args)}')"
).limit(int(n))


def skip_if_not_exists(path: Path):
"""Skip a test using pytest.skip() if path does not exist

Expand Down
Loading