From 2b4ef0bdb8100c1f544d4127d2105718cb372dd5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 20 Sep 2025 20:20:21 +0200 Subject: [PATCH 1/4] Add pyarrow-stubs minus their docstrings --- python/pyarrow-stubs/__init__.pyi | 676 ++++++++ python/pyarrow-stubs/_azurefs.pyi | 33 + python/pyarrow-stubs/_compute.pyi | 586 +++++++ python/pyarrow-stubs/_csv.pyi | 135 ++ python/pyarrow-stubs/_cuda.pyi | 164 ++ python/pyarrow-stubs/_dataset.pyi | 669 ++++++++ python/pyarrow-stubs/_dataset_orc.pyi | 24 + python/pyarrow-stubs/_dataset_parquet.pyi | 163 ++ .../_dataset_parquet_encryption.pyi | 59 + python/pyarrow-stubs/_feather.pyi | 50 + python/pyarrow-stubs/_flight.pyi | 656 +++++++ python/pyarrow-stubs/_fs.pyi | 216 +++ python/pyarrow-stubs/_gcsfs.pyi | 44 + python/pyarrow-stubs/_hdfs.pyi | 38 + python/pyarrow-stubs/_ipc.pyi | 301 ++++ python/pyarrow-stubs/_json.pyi | 70 + python/pyarrow-stubs/_orc.pyi | 73 + python/pyarrow-stubs/_parquet.pyi | 492 ++++++ python/pyarrow-stubs/_parquet_encryption.pyi | 93 + python/pyarrow-stubs/_s3fs.pyi | 103 ++ python/pyarrow-stubs/_stubs_typing.pyi | 117 ++ python/pyarrow-stubs/_substrait.pyi | 63 + python/pyarrow-stubs/_types.pyi | 992 +++++++++++ python/pyarrow-stubs/acero.pyi | 113 ++ python/pyarrow-stubs/array.pyi | 860 ++++++++++ python/pyarrow-stubs/builder.pyi | 53 + python/pyarrow-stubs/cffi.pyi | 21 + python/pyarrow-stubs/compat.pyi | 22 + python/pyarrow-stubs/compute.pyi | 1518 +++++++++++++++++ python/pyarrow-stubs/config.pyi | 64 + python/pyarrow-stubs/csv.pyi | 44 + python/pyarrow-stubs/cuda.pyi | 42 + python/pyarrow-stubs/dataset.pyi | 272 +++ python/pyarrow-stubs/device.pyi | 68 + python/pyarrow-stubs/error.pyi | 70 + python/pyarrow-stubs/feather.pyi | 78 + python/pyarrow-stubs/flight.pyi | 112 ++ python/pyarrow-stubs/fs.pyi | 97 ++ python/pyarrow-stubs/gandiva.pyi | 82 + python/pyarrow-stubs/interchange/__init__.pyi | 16 + python/pyarrow-stubs/interchange/buffer.pyi | 41 + python/pyarrow-stubs/interchange/column.pyi | 90 + .../pyarrow-stubs/interchange/dataframe.pyi | 49 + .../interchange/from_dataframe.pyi | 89 + python/pyarrow-stubs/io.pyi | 428 +++++ python/pyarrow-stubs/ipc.pyi | 157 ++ python/pyarrow-stubs/json.pyi | 20 + python/pyarrow-stubs/lib.pyi | 93 + python/pyarrow-stubs/memory.pyi | 91 + python/pyarrow-stubs/orc.pyi | 146 ++ python/pyarrow-stubs/pandas_compat.pyi | 82 + python/pyarrow-stubs/pandas_shim.pyi | 68 + python/pyarrow-stubs/parquet/__init__.pyi | 18 + python/pyarrow-stubs/parquet/core.pyi | 355 ++++ python/pyarrow-stubs/parquet/encryption.pyi | 32 + python/pyarrow-stubs/py.typed | 16 + python/pyarrow-stubs/scalar.pyi | 391 +++++ python/pyarrow-stubs/substrait.pyi | 38 + python/pyarrow-stubs/table.pyi | 653 +++++++ python/pyarrow-stubs/tensor.pyi | 253 +++ python/pyarrow-stubs/types.pyi | 217 +++ python/pyarrow-stubs/util.pyi | 48 + 62 files changed, 12724 insertions(+) create mode 100644 python/pyarrow-stubs/__init__.pyi create mode 100644 python/pyarrow-stubs/_azurefs.pyi create mode 100644 python/pyarrow-stubs/_compute.pyi create mode 100644 python/pyarrow-stubs/_csv.pyi create mode 100644 python/pyarrow-stubs/_cuda.pyi create mode 100644 python/pyarrow-stubs/_dataset.pyi create mode 100644 python/pyarrow-stubs/_dataset_orc.pyi create mode 100644 python/pyarrow-stubs/_dataset_parquet.pyi create mode 100644 python/pyarrow-stubs/_dataset_parquet_encryption.pyi create mode 100644 python/pyarrow-stubs/_feather.pyi create mode 100644 python/pyarrow-stubs/_flight.pyi create mode 100644 python/pyarrow-stubs/_fs.pyi create mode
100644 python/pyarrow-stubs/_gcsfs.pyi create mode 100644 python/pyarrow-stubs/_hdfs.pyi create mode 100644 python/pyarrow-stubs/_ipc.pyi create mode 100644 python/pyarrow-stubs/_json.pyi create mode 100644 python/pyarrow-stubs/_orc.pyi create mode 100644 python/pyarrow-stubs/_parquet.pyi create mode 100644 python/pyarrow-stubs/_parquet_encryption.pyi create mode 100644 python/pyarrow-stubs/_s3fs.pyi create mode 100644 python/pyarrow-stubs/_stubs_typing.pyi create mode 100644 python/pyarrow-stubs/_substrait.pyi create mode 100644 python/pyarrow-stubs/_types.pyi create mode 100644 python/pyarrow-stubs/acero.pyi create mode 100644 python/pyarrow-stubs/array.pyi create mode 100644 python/pyarrow-stubs/builder.pyi create mode 100644 python/pyarrow-stubs/cffi.pyi create mode 100644 python/pyarrow-stubs/compat.pyi create mode 100644 python/pyarrow-stubs/compute.pyi create mode 100644 python/pyarrow-stubs/config.pyi create mode 100644 python/pyarrow-stubs/csv.pyi create mode 100644 python/pyarrow-stubs/cuda.pyi create mode 100644 python/pyarrow-stubs/dataset.pyi create mode 100644 python/pyarrow-stubs/device.pyi create mode 100644 python/pyarrow-stubs/error.pyi create mode 100644 python/pyarrow-stubs/feather.pyi create mode 100644 python/pyarrow-stubs/flight.pyi create mode 100644 python/pyarrow-stubs/fs.pyi create mode 100644 python/pyarrow-stubs/gandiva.pyi create mode 100644 python/pyarrow-stubs/interchange/__init__.pyi create mode 100644 python/pyarrow-stubs/interchange/buffer.pyi create mode 100644 python/pyarrow-stubs/interchange/column.pyi create mode 100644 python/pyarrow-stubs/interchange/dataframe.pyi create mode 100644 python/pyarrow-stubs/interchange/from_dataframe.pyi create mode 100644 python/pyarrow-stubs/io.pyi create mode 100644 python/pyarrow-stubs/ipc.pyi create mode 100644 python/pyarrow-stubs/json.pyi create mode 100644 python/pyarrow-stubs/lib.pyi create mode 100644 python/pyarrow-stubs/memory.pyi create mode 100644 python/pyarrow-stubs/orc.pyi create mode 100644 python/pyarrow-stubs/pandas_compat.pyi create mode 100644 python/pyarrow-stubs/pandas_shim.pyi create mode 100644 python/pyarrow-stubs/parquet/__init__.pyi create mode 100644 python/pyarrow-stubs/parquet/core.pyi create mode 100644 python/pyarrow-stubs/parquet/encryption.pyi create mode 100644 python/pyarrow-stubs/py.typed create mode 100644 python/pyarrow-stubs/scalar.pyi create mode 100644 python/pyarrow-stubs/substrait.pyi create mode 100644 python/pyarrow-stubs/table.pyi create mode 100644 python/pyarrow-stubs/tensor.pyi create mode 100644 python/pyarrow-stubs/types.pyi create mode 100644 python/pyarrow-stubs/util.pyi diff --git a/python/pyarrow-stubs/__init__.pyi b/python/pyarrow-stubs/__init__.pyi new file mode 100644 index 00000000000..1a188eccd45 --- /dev/null +++ b/python/pyarrow-stubs/__init__.pyi @@ -0,0 +1,676 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# ruff: noqa: F401, I001, E402 +__version__: str + +import pyarrow.lib as _lib + +_gc_enabled: bool + +from pyarrow.lib import ( + BuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) + + +def show_versions() -> None: ... +def show_info() -> None: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... + + +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal32, + decimal64, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + json_, + uuid, + fixed_shape_tensor, + bool8, + opaque, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + DurationType, + FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, + Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, + 
LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, +) + +# Buffers, allocation +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc + +import pyarrow.types as types + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + +# ---------------------------------------------------------------------- +# Returning absolute path to the pyarrow include directory (if bundled, e.g. in +# wheels) + + +def get_include() -> str: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... 
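+# Illustrative only (a sketch, not part of the stubs; assumes mypy as the
+# type checker): with this stub package installed alongside pyarrow, calls
+# through the top-level API resolve to the annotations above, e.g.
+#
+#   import pyarrow as pa
+#   t = pa.table({"a": [1, 2, 3]})
+#   reveal_type(t)  # mypy reports: pyarrow.lib.Table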
+ + +__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "chunked_array", + "record_batch", + "nulls", + "repeat", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + 
"TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + "RunEndEncodedScalar", + "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "get_library_dirs", +] diff --git a/python/pyarrow-stubs/_azurefs.pyi b/python/pyarrow-stubs/_azurefs.pyi new file mode 100644 index 00000000000..2d866f34dbd --- /dev/null +++ b/python/pyarrow-stubs/_azurefs.pyi @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import Literal + +from ._fs import FileSystem + + +class AzureFileSystem(FileSystem): + def __init__( + self, + account_name: str, + account_key: str | None = None, + blob_storage_authority: str | None = None, + dfs_storage_authority: str | None = None, + blob_storage_scheme: Literal["http", "https"] = "https", + dfs_storage_scheme: Literal["http", "https"] = "https", + sas_token: str | None = None, + ) -> None: ... diff --git a/python/pyarrow-stubs/_compute.pyi b/python/pyarrow-stubs/_compute.pyi new file mode 100644 index 00000000000..7742dbda539 --- /dev/null +++ b/python/pyarrow-stubs/_compute.pyi @@ -0,0 +1,586 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import ( + Any, + Callable, + Iterable, + Literal, + Sequence, + TypeAlias, + TypedDict, + overload, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + + +class Kernel(lib._Weakrefable): + ... + + +class Function(lib._Weakrefable): + @property + def arity(self) -> int: ... + + @property + def kind( + self, + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: ... + @property + def name(self) -> str: ... + @property + def num_kernels(self) -> int: ... + + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: ... + + +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: ... + def list_functions(self) -> list[str]: ... + + +class HashAggregateFunction(Function): + ... + + +class HashAggregateKernel(Kernel): + ... + + +class ScalarAggregateFunction(Function): + ... + + +class ScalarAggregateKernel(Kernel): + ... + + +class ScalarFunction(Function): + ... + + +class ScalarKernel(Kernel): + ... + + +class VectorFunction(Function): + ... + + +class VectorKernel(Kernel): + ... + +# ==================== _compute.pyx Option classes ==================== + + +class ArraySortOptions(FunctionOptions): + def __init__( + self, + order: _Order = "ascending", + null_placement: _Placement = "at_end", + ) -> None: ... + + +class AssumeTimezoneOptions(FunctionOptions): + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + ) -> None: ...
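+# Illustrative usage of the option classes above (a sketch, not part of the
+# stubs; assumes a standard pyarrow installation):
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#   arr = pa.array([3, 1, 2])
+#   idx = pc.array_sort_indices(arr, options=pc.ArraySortOptions(order="descending"))
+#   arr.take(idx)  # -> [3, 2, 1]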
+ + +class CastOptions(FunctionOptions): + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool + + def __init__( + self, + target_type: lib.DataType | None = None, + *, + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, + ) -> None: ... + @staticmethod + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... + @staticmethod + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... + + +class CountOptions(FunctionOptions): + def __init__(self, mode: Literal["only_valid", + "only_null", "all"] = "only_valid") -> None: ... + + +class CumulativeOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... + + +class CumulativeSumOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... + + +class DayOfWeekOptions(FunctionOptions): + def __init__(self, *, count_from_zero: bool = True, + week_start: int = 1) -> None: ... + + +class DictionaryEncodeOptions(FunctionOptions): + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... + + +class RunEndEncodeOptions(FunctionOptions): + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType = ...) -> None: ... + + +class ElementWiseAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True) -> None: ... + + +class ExtractRegexOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... + + +class ExtractRegexSpanOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... + + +class FilterOptions(FunctionOptions): + def __init__( + self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... + + +class IndexOptions(FunctionOptions): + def __init__(self, value: lib.Scalar) -> None: ... + + +class JoinOptions(FunctionOptions): + @overload + def __init__( + self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + + @overload + def __init__(self, null_handling: Literal["replace"], + null_replacement: str = "") -> None: ... + + +class ListSliceOptions(FunctionOptions): + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... + + +class ListFlattenOptions(FunctionOptions): + def __init__(self, recursive: bool = False) -> None: ... + + +class MakeStructOptions(FunctionOptions): + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... + + +class MapLookupOptions(FunctionOptions): + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... + + +class MatchSubstringOptions(FunctionOptions): + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + + +class ModeOptions(FunctionOptions): + def __init__(self, n: int = 1, *, skip_nulls: bool = True, + min_count: int = 0) -> None: ... 
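+# Illustrative CastOptions usage (a sketch, not part of the stubs): the
+# safe()/unsafe() constructors above bundle the allow_* flags for pc.cast:
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#   arr = pa.array([1.5, 2.7])
+#   pc.cast(arr, options=pc.CastOptions.unsafe(pa.int32()))  # truncates to [1, 2]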
+ + +class NullOptions(FunctionOptions): + def __init__(self, *, nan_is_null: bool = False) -> None: ... + + +class PadOptions(FunctionOptions): + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... + + +class PairwiseOptions(FunctionOptions): + def __init__(self, period: int = 1) -> None: ... + + +class PartitionNthOptions(FunctionOptions): + def __init__(self, pivot: int, *, + null_placement: _Placement = "at_end") -> None: ... + + +class WinsorizeOptions(FunctionOptions): + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + + +class QuantileOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float], + *, + interpolation: Literal["linear", "lower", + "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + + +class RandomOptions(FunctionOptions): + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + + +class RankOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... + + +class RankQuantileOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + + +class PivotWiderOptions(FunctionOptions): + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + + +class ReplaceSliceOptions(FunctionOptions): + def __init__(self, start: int, stop: int, replacement: str) -> None: ... + + +class ReplaceSubstringOptions(FunctionOptions): + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... + + +_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + + +class RoundBinaryOptions(FunctionOptions): + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + + +class RoundOptions(FunctionOptions): + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + + +_DateTimeUnit: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +class RoundTemporalOptions(FunctionOptions): + def __init__( + self, + multiple: int = 1, + unit: _DateTimeUnit = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + ) -> None: ... + + +class RoundToMultipleOptions(FunctionOptions): + def __init__(self, multiple: float = 1.0, + round_mode: _RoundMode = "half_to_even") -> None: ... + + +class ScalarAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... + + +class SelectKOptions(FunctionOptions): + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... + + +class SetLookupOptions(FunctionOptions): + def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... + + +class SliceOptions(FunctionOptions): + def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ...
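+# Illustrative rounding (a sketch, not part of the stubs): RoundOptions pairs
+# with pc.round; "half_to_even" is banker's rounding:
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#   pc.round(pa.array([0.5, 1.5, 2.5]),
+#            options=pc.RoundOptions(ndigits=0, round_mode="half_to_even"))
+#   # -> [0, 2, 2]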
+ + +class SortOptions(FunctionOptions): + def __init__( + self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" + ) -> None: ... + + +class SplitOptions(FunctionOptions): + def __init__(self, *, max_splits: int | None = None, + reverse: bool = False) -> None: ... + + +class SplitPatternOptions(FunctionOptions): + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... + + +class StrftimeOptions(FunctionOptions): + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C") -> None: ... + + +class StrptimeOptions(FunctionOptions): + def __init__( + self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False + ) -> None: ... + + +class StructFieldOptions(FunctionOptions): + def __init__( + self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int + ) -> None: ... + + +class TakeOptions(FunctionOptions): + def __init__(self, boundscheck: bool = True) -> None: ... + + +class TDigestOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + + +class TrimOptions(FunctionOptions): + def __init__(self, characters: str) -> None: ... + + +class Utf8NormalizeOptions(FunctionOptions): + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + + +class VarianceOptions(FunctionOptions): + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, + min_count: int = 0) -> None: ... + + +class SkewOptions(FunctionOptions): + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... + + +class WeekOptions(FunctionOptions): + def __init__( + self, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + +# ==================== _compute.pyx Functions ==================== + + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: ... +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: ... +def list_functions() -> list[str]: ... + +# ==================== _compute.pyx Udf ==================== + + +def call_tabular_function( + function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None +) -> lib.RecordBatchReader: ... + + +class _FunctionDoc(TypedDict): + summary: str + description: str + + +def register_scalar_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_tabular_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_aggregate_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... 
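+# Illustrative scalar UDF registration via register_scalar_function above (a
+# sketch, not part of the stubs; the "add_one" function and its doc dict are
+# hypothetical). The wrapped callable receives a UdfContext followed by one
+# argument per entry in in_types:
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#
+#   def add_one(ctx, x):
+#       return pc.add(x, 1)
+#
+#   pc.register_scalar_function(
+#       add_one, "add_one",
+#       {"summary": "add one", "description": "Adds 1 to each element."},
+#       {"x": pa.int64()}, pa.int64())
+#   pc.call_function("add_one", [pa.array([1, 2, 3])])  # -> [2, 3, 4]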
+ + +def register_vector_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +class UdfContext: + @property + def batch_length(self) -> int: ... + @property + def memory_pool(self) -> lib.MemoryPool: ... + +# ==================== _compute.pyx Expression ==================== + + +class Expression(lib._Weakrefable): + @staticmethod + def from_substrait(buffer: bytes | lib.Buffer) -> Expression: ... + def to_substrait(self, schema: lib.Schema, + allow_arrow_extensions: bool = False) -> lib.Buffer: ... + + def __invert__(self) -> Expression: ... + def __and__(self, other) -> Expression: ... + def __or__(self, other) -> Expression: ... + def __add__(self, other) -> Expression: ... + def __mul__(self, other) -> Expression: ... + def __sub__(self, other) -> Expression: ... + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... # type: ignore[override] + def __lt__(self, value: object) -> Expression: ... # type: ignore[override] + def __ge__(self, value: object) -> Expression: ... # type: ignore[override] + def __le__(self, value: object) -> Expression: ... # type: ignore[override] + def __truediv__(self, other) -> Expression: ... + def is_valid(self) -> bool: ... + def is_null(self, nan_is_null: bool = False) -> Expression: ... + def is_nan(self) -> Expression: ... + + def cast( + self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None + ) -> Expression: ... + def isin(self, values: lib.Array | Iterable) -> Expression: ... + +# ==================== _compute.py ==================== diff --git a/python/pyarrow-stubs/_csv.pyi b/python/pyarrow-stubs/_csv.pyi new file mode 100644 index 00000000000..c62ae725ec1 --- /dev/null +++ b/python/pyarrow-stubs/_csv.pyi @@ -0,0 +1,135 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from dataclasses import dataclass, field +from typing import IO, Any, Callable, Literal + +from _typeshed import StrPath + +from . import lib + + +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + use_threads: bool = field(default=True, kw_only=False) + block_size: int | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: list[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + def validate(self) -> None: ... 
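+# Illustrative ReadOptions usage (a sketch, not part of the stubs; "data.csv"
+# is a hypothetical path):
+#
+#   from pyarrow import csv
+#   opts = csv.ReadOptions(block_size=1 << 20, encoding="utf8")
+#   table = csv.read_csv("data.csv", read_options=opts)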
+ + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + + delimiter: str = field(default=",", kw_only=False) + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + + check_utf8: bool = field(default=True, kw_only=False) + column_types: lib.Schema | dict | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: list[str] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + + include_header: bool = field(default=True, kw_only=False) + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" + + def validate(self) -> None: ... + + +@dataclass +class InvalidRow(lib._Weakrefable): + + expected_columns: int + actual_columns: int + number: int | None + text: str + + +class CSVWriter(lib._CRecordBatchWriter): + + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class CSVStreamingReader(lib.RecordBatchReader): + ... + + +ISO8601: lib._Weakrefable + + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: ... + + +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: ... + + +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: StrPath | lib.NativeFile | IO[Any], + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: ... diff --git a/python/pyarrow-stubs/_cuda.pyi b/python/pyarrow-stubs/_cuda.pyi new file mode 100644 index 00000000000..929f448f396 --- /dev/null +++ b/python/pyarrow-stubs/_cuda.pyi @@ -0,0 +1,164 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] + +from . import lib +from ._stubs_typing import ArrayLike + + +class Context(lib._Weakrefable): + + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: ... + + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: ... + + def to_numba(self) -> _numba_driver.Context: ... + + @staticmethod + def get_num_devices() -> int: ... + + @property + def device_number(self) -> int: ... + + @property + def handle(self) -> int: ... + + def synchronize(self) -> None: ... + + @property + def bytes_allocated(self) -> int: ... + + def get_device_address(self, address: int) -> int: ... + + def new_buffer(self, nbytes: int) -> CudaBuffer: ... + + @property + def memory_manager(self) -> lib.MemoryManager: ... + + @property + def device(self) -> lib.Device: ... + + def foreign_buffer(self, address: int, size: int, base: Any | + None = None) -> CudaBuffer: ... + + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ... + + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: ... + + def buffer_from_object(self, obj: Any) -> CudaBuffer: ... + + +class IpcMemHandle(lib._Weakrefable): + + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: ... + + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: ... + + +class CudaBuffer(lib.Buffer): + + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: ... + + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: ... + + def to_numba(self) -> _numba_driver.MemoryPointer: ... + + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: ... + + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: ... + + def copy_from_device(self, buf: CudaBuffer, position: int = 0, + nbytes: int = -1) -> int: ... + + def export_for_ipc(self) -> IpcMemHandle: ... + + @property + def context(self) -> Context: ... + + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: ... + + def to_pybytes(self) -> bytes: ... + + +class HostBuffer(lib.Buffer): + + @property + def size(self) -> int: ... + + +class BufferReader(lib.NativeFile): + + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: ... + + +class BufferWriter(lib.NativeFile): + + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: ... + + @property + def buffer_size(self) -> int: ... + + @buffer_size.setter + def buffer_size(self, buffer_size: int): ... + + @property + def num_bytes_buffered(self) -> int: ... + + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: ... + + +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: ... + + +def read_message( + source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None +) -> lib.Message: ... 
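+# Illustrative device/host round-trip (a sketch, not part of the stubs;
+# requires a CUDA-enabled pyarrow build and a visible GPU):
+#
+#   from pyarrow import cuda
+#   ctx = cuda.Context(0)
+#   cbuf = ctx.new_buffer(64)   # device allocation
+#   host = cbuf.copy_to_host()  # back to a CPU pyarrow.Buffer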
+ + +def read_record_batch( + buffer: lib.Buffer, + object: lib.Schema, + *, + dictionary_memo: lib.DictionaryMemo | None = None, + pool: lib.MemoryPool | None = None, +) -> lib.RecordBatch: ... diff --git a/python/pyarrow-stubs/_dataset.pyi b/python/pyarrow-stubs/_dataset.pyi new file mode 100644 index 00000000000..c3b3c4d9bec --- /dev/null +++ b/python/pyarrow-stubs/_dataset.pyi @@ -0,0 +1,669 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + IO, + Any, + Callable, + Generic, + Iterator, + Literal, + NamedTuple, + TypeVar, + overload, +) + +from _typeshed import StrPath + +from . import csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem, SupportedFileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + + +class Dataset(lib._Weakrefable): + + @property + def partition_expression(self) -> Expression: ... + + def replace_schema(self, schema: lib.Schema) -> None: ... + + def get_fragments(self, filter: Expression | None = None): ... + + def scanner( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + + def to_table( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... 
+ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def count_rows( + self, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + + @property + def schema(self) -> lib.Schema: ... + + def filter(self, expression: Expression) -> Self: ... + + def sort_by(self, sorting: str | + list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... + + def join( + self, + right_dataset: Dataset, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> InMemoryDataset: ... + + def join_asof( + self, + right_dataset: Dataset, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> InMemoryDataset: ... + + +class InMemoryDataset(Dataset): + ... + + +class UnionDataset(Dataset): + + @property + def children(self) -> list[Dataset]: ... + + +class FileSystemDataset(Dataset): + + def __init__( + self, + fragments: list[Fragment], + schema: lib.Schema, + format: FileFormat, + filesystem: SupportedFileSystem | None = None, + root_partition: Expression | None = None, + ) -> None: ... + + @classmethod + def from_paths( + cls, + paths: list[str], + schema: lib.Schema | None = None, + format: FileFormat | None = None, + filesystem: SupportedFileSystem | None = None, + partitions: list[Expression] | None = None, + root_partition: Expression | None = None, + ) -> FileSystemDataset: ... + + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning | None: ... + + @property + def files(self) -> list[str]: ... + + @property + def format(self) -> FileFormat: ... + + +class FileWriteOptions(lib._Weakrefable): + @property + def format(self) -> FileFormat: ... + + +class FileFormat(lib._Weakrefable): + def inspect( + self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None + ) -> lib.Schema: ... + + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + *, + file_size: int | None = None, + ) -> Fragment: ... + + def make_write_options(self) -> FileWriteOptions: ... + @property + def default_extname(self) -> str: ... + @property + def default_fragment_scan_options(self) -> FragmentScanOptions: ... + @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + + +class Fragment(lib._Weakrefable): + + @property + def physical_schema(self) -> lib.Schema: ... + + @property + def partition_expression(self) -> Expression: ... 
+ + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + def to_batches( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + + def to_table( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + + +class FileFragment(Fragment): + + def open(self) -> lib.NativeFile: ... + + @property + def path(self) -> str: ... + + @property + def filesystem(self) -> FileSystem: ... + + @property + def buffer(self) -> lib.Buffer: ... + + @property + def format(self) -> FileFormat: ... + + +class FragmentScanOptions(lib._Weakrefable): + + @property + def type_name(self) -> str: ... + + +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... + + +class FeatherFileFormat(IpcFileFormat): + ... + + +class CsvFileFormat(FileFormat): + + def __init__( + self, + parse_options: csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: csv.ConvertOptions | None = None, + read_options: csv.ReadOptions | None = None, + ) -> None: ... 
+ def make_write_options(self) -> csv.WriteOptions: ... # type: ignore[override] + @property + def parse_options(self) -> csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + + +class CsvFragmentScanOptions(FragmentScanOptions): + + convert_options: csv.ConvertOptions + read_options: csv.ReadOptions + + def __init__( + self, convert_options: csv.ConvertOptions, read_options: csv.ReadOptions + ) -> None: ... + def equals(self, other: CsvFragmentScanOptions) -> bool: ... + + +class CsvFileWriteOptions(FileWriteOptions): + write_options: csv.WriteOptions + + +class JsonFileFormat(FileFormat): + + def __init__( + self, + default_fragment_scan_options: JsonFragmentScanOptions | None = None, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFileFormat) -> bool: ... + + +class JsonFragmentScanOptions(FragmentScanOptions): + + parse_options: _json.ParseOptions + read_options: _json.ReadOptions + + def __init__( + self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions + ) -> None: ... + def equals(self, other: JsonFragmentScanOptions) -> bool: ... + + +class Partitioning(lib._Weakrefable): + def parse(self, path: str) -> Expression: ... + + def format(self, expr: Expression) -> tuple[str, str]: ... + + @property + def schema(self) -> lib.Schema: ... + + +class PartitioningFactory(lib._Weakrefable): + @property + def type_name(self) -> str: ... + + +class KeyValuePartitioning(Partitioning): + @property + def dictionaries(self) -> list[lib.Array | None]: ... + + +class DirectoryPartitioning(KeyValuePartitioning): + + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + +class HivePartitioning(KeyValuePartitioning): + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + null_fallback: str = "__HIVE_DEFAULT_PARTITION__", + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + @staticmethod + def discover( + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + +class FilenamePartitioning(KeyValuePartitioning): + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + +class DatasetFactory(lib._Weakrefable): + + root_partition: Expression + def finish(self, schema: lib.Schema | None = None) -> Dataset: ... + + def inspect(self) -> lib.Schema: ... + + def inspect_schemas(self) -> list[lib.Schema]: ... 
+
+
+class FileSystemFactoryOptions(lib._Weakrefable):
+
+    partitioning: Partitioning
+    partitioning_factory: PartitioningFactory
+    partition_base_dir: str
+    exclude_invalid_files: bool
+    selector_ignore_prefixes: list[str]
+
+    def __init__(
+        self,
+        partition_base_dir: str | None = None,
+        partitioning: Partitioning | PartitioningFactory | None = None,
+        exclude_invalid_files: bool = True,
+        selector_ignore_prefixes: list[str] | None = None,
+    ) -> None: ...
+
+
+class FileSystemDatasetFactory(DatasetFactory):
+
+    def __init__(
+        self,
+        filesystem: SupportedFileSystem,
+        paths_or_selector: FileSelector,
+        format: FileFormat,
+        options: FileSystemFactoryOptions | None = None,
+    ) -> None: ...
+
+
+class UnionDatasetFactory(DatasetFactory):
+
+    def __init__(self, factories: list[DatasetFactory]) -> None: ...
+
+
+_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch)
+
+
+class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]):
+
+    def __iter__(self) -> Self: ...
+    def __next__(self) -> _RecordBatchT: ...
+
+
+class TaggedRecordBatch(NamedTuple):
+
+    record_batch: lib.RecordBatch
+    fragment: Fragment
+
+
+class TaggedRecordBatchIterator(lib._Weakrefable):
+
+    def __iter__(self) -> Self: ...
+    def __next__(self) -> TaggedRecordBatch: ...
+
+
+class Scanner(lib._Weakrefable):
+
+    @staticmethod
+    def from_dataset(
+        dataset: Dataset,
+        *,
+        columns: list[str] | dict[str, Expression] | None = None,
+        filter: Expression | None = None,
+        batch_size: int = ...,
+        batch_readahead: int = 16,
+        fragment_readahead: int = 4,
+        fragment_scan_options: FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        cache_metadata: bool = True,
+        memory_pool: lib.MemoryPool | None = None,
+    ) -> Scanner: ...
+
+    @staticmethod
+    def from_fragment(
+        fragment: Fragment,
+        *,
+        schema: lib.Schema | None = None,
+        columns: list[str] | dict[str, Expression] | None = None,
+        filter: Expression | None = None,
+        batch_size: int = ...,
+        batch_readahead: int = 16,
+        fragment_readahead: int = 4,
+        fragment_scan_options: FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        cache_metadata: bool = True,
+        memory_pool: lib.MemoryPool | None = None,
+    ) -> Scanner: ...
+
+    @staticmethod
+    def from_batches(
+        source: Iterator[lib.RecordBatch] | RecordBatchReader,
+        *,
+        schema: lib.Schema | None = None,
+        columns: list[str] | dict[str, Expression] | None = None,
+        filter: Expression | None = None,
+        batch_size: int = ...,
+        batch_readahead: int = 16,
+        fragment_readahead: int = 4,
+        fragment_scan_options: FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        cache_metadata: bool = True,
+        memory_pool: lib.MemoryPool | None = None,
+    ) -> Scanner: ...
+
+    @property
+    def dataset_schema(self) -> lib.Schema: ...
+
+    @property
+    def projected_schema(self) -> lib.Schema: ...
+
+    def to_batches(self) -> Iterator[lib.RecordBatch]: ...
+
+    def scan_batches(self) -> TaggedRecordBatchIterator: ...
+
+    def to_table(self) -> lib.Table: ...
+
+    def take(self, indices: Indices) -> lib.Table: ...
+
+    def head(self, num_rows: int) -> lib.Table: ...
+
+    def count_rows(self) -> int: ...
+
+    def to_reader(self) -> RecordBatchReader: ...
+
+
+def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ...
+
+
+class WrittenFile(lib._Weakrefable):
+
+    def __init__(self, path: str, metadata: _parquet.FileMetaData | None,
+                 size: int) -> None: ...
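
For context, a typical round trip through the Scanner declared above looks roughly like this; the column names and the predicate are hypothetical:

    import pyarrow.dataset as ds
    from pyarrow.dataset import Scanner

    dataset = ds.dataset("data/", format="parquet")
    scanner = Scanner.from_dataset(
        dataset,
        columns=["id", "value"],          # projection
        filter=ds.field("value") > 0,     # predicate pushed into the scan
    )
    table = scanner.to_table()            # materialize the filtered projection

    # Counting can skip materialization entirely by going through the dataset.
    n = dataset.count_rows(filter=ds.field("value") > 0)
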
+
+
+def _filesystemdataset_write(
+    data: Scanner,
+    base_dir: StrPath,
+    basename_template: str,
+    filesystem: SupportedFileSystem,
+    partitioning: Partitioning,
+    file_options: FileWriteOptions,
+    max_partitions: int,
+    file_visitor: Callable[[str], None],
+    existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"],
+    max_open_files: int,
+    max_rows_per_file: int,
+    min_rows_per_group: int,
+    max_rows_per_group: int,
+    create_dir: bool,
+) -> None: ...
+
+
+class _ScanNodeOptions(ExecNodeOptions):
+    def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ...
+
+
+class ScanNodeOptions(_ScanNodeOptions):
+
+    def __init__(
+        self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs
+    ) -> None: ...
diff --git a/python/pyarrow-stubs/_dataset_orc.pyi b/python/pyarrow-stubs/_dataset_orc.pyi
new file mode 100644
index 00000000000..62f49bf5d30
--- /dev/null
+++ b/python/pyarrow-stubs/_dataset_orc.pyi
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from ._dataset import FileFormat
+
+
+class OrcFileFormat(FileFormat):
+    def equals(self, other: OrcFileFormat) -> bool: ...
+    @property
+    def default_extname(self) -> str: ...
diff --git a/python/pyarrow-stubs/_dataset_parquet.pyi b/python/pyarrow-stubs/_dataset_parquet.pyi
new file mode 100644
index 00000000000..df9536ef725
--- /dev/null
+++ b/python/pyarrow-stubs/_dataset_parquet.pyi
@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from dataclasses import dataclass
+from typing import IO, Any, Iterable, TypedDict
+
+from _typeshed import StrPath
+
+from ._compute import Expression
+from ._dataset import (
+    DatasetFactory,
+    FileFormat,
+    FileFragment,
+    FileWriteOptions,
+    Fragment,
+    FragmentScanOptions,
+    Partitioning,
+    PartitioningFactory,
+)
+from ._dataset_parquet_encryption import ParquetDecryptionConfig
+from ._fs import SupportedFileSystem
+from ._parquet import FileDecryptionProperties, FileMetaData
+from .lib import CacheOptions, Schema, _Weakrefable
+
+parquet_encryption_enabled: bool
+
+
+class ParquetFileFormat(FileFormat):
+
+    def __init__(
+        self,
+        read_options: ParquetReadOptions | None = None,
+        default_fragment_scan_options: ParquetFragmentScanOptions | None = None,
+        **kwargs,
+    ) -> None: ...
+    @property
+    def read_options(self) -> ParquetReadOptions: ...
+    def make_write_options(
+        self) -> ParquetFileWriteOptions: ...  # type: ignore[override]
+
+    def equals(self, other: ParquetFileFormat) -> bool: ...
+    @property
+    def default_extname(self) -> str: ...
+
+    def make_fragment(
+        self,
+        file: StrPath | IO,
+        filesystem: SupportedFileSystem | None = None,
+        partition_expression: Expression | None = None,
+        row_groups: Iterable[int] | None = None,
+        *,
+        file_size: int | None = None,
+    ) -> Fragment: ...
+
+
+class _NameStats(TypedDict):
+    min: Any
+    max: Any
+
+
+class RowGroupInfo:
+
+    id: int
+    metadata: FileMetaData
+    schema: Schema
+
+    def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ...
+    @property
+    def num_rows(self) -> int: ...
+    @property
+    def total_byte_size(self) -> int: ...
+    @property
+    def statistics(self) -> dict[str, _NameStats]: ...
+
+
+class ParquetFileFragment(FileFragment):
+
+    def ensure_complete_metadata(self) -> None: ...
+    @property
+    def row_groups(self) -> list[RowGroupInfo]: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def num_row_groups(self) -> int: ...
+
+    def split_by_row_group(
+        self, filter: Expression | None = None, schema: Schema | None = None
+    ) -> list[Fragment]: ...
+
+    def subset(
+        self,
+        filter: Expression | None = None,
+        schema: Schema | None = None,
+        row_group_ids: list[int] | None = None,
+    ) -> ParquetFileFragment: ...
+
+
+class ParquetReadOptions(_Weakrefable):
+
+    def __init__(
+        self, dictionary_columns: list[str] | None = None, coerce_int96_timestamp_unit: str | None = None
+    ) -> None: ...
+    @property
+    def coerce_int96_timestamp_unit(self) -> str: ...
+    @coerce_int96_timestamp_unit.setter
+    def coerce_int96_timestamp_unit(self, unit: str) -> None: ...
+    def equals(self, other: ParquetReadOptions) -> bool: ...
+
+
+class ParquetFileWriteOptions(FileWriteOptions):
+    def update(self, **kwargs) -> None: ...
+    def _set_properties(self) -> None: ...
+    def _set_arrow_properties(self) -> None: ...
+    def _set_encryption_config(self) -> None: ...
+
+
+@dataclass(kw_only=True)
+class ParquetFragmentScanOptions(FragmentScanOptions):
+
+    use_buffered_stream: bool = False
+    buffer_size: int = 8192
+    pre_buffer: bool = True
+    cache_options: CacheOptions | None = None
+    thrift_string_size_limit: int | None = None
+    thrift_container_size_limit: int | None = None
+    decryption_config: ParquetDecryptionConfig | None = None
+    decryption_properties: FileDecryptionProperties | None = None
+    page_checksum_verification: bool = False
+
+    def equals(self, other: ParquetFragmentScanOptions) -> bool: ...
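
As a usage sketch, the scan options above are usually attached to a ParquetFileFormat and handed to the dataset factory; the buffer size and path below are illustrative only:

    import pyarrow.dataset as ds

    fmt = ds.ParquetFileFormat(
        default_fragment_scan_options=ds.ParquetFragmentScanOptions(
            use_buffered_stream=True,
            buffer_size=64 * 1024,
            pre_buffer=True,  # coalesce reads; helps on high-latency filesystems
        )
    )
    dataset = ds.dataset("data/", format=fmt)
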
+
+
+@dataclass
+class ParquetFactoryOptions(_Weakrefable):
+
+    partition_base_dir: str | None = None
+    partitioning: Partitioning | PartitioningFactory | None = None
+    validate_column_chunk_paths: bool = False
+
+
+class ParquetDatasetFactory(DatasetFactory):
+
+    def __init__(
+        self,
+        metadata_path: str,
+        filesystem: SupportedFileSystem,
+        format: FileFormat,
+        options: ParquetFactoryOptions | None = None,
+    ) -> None: ...
diff --git a/python/pyarrow-stubs/_dataset_parquet_encryption.pyi b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi
new file mode 100644
index 00000000000..d8338776481
--- /dev/null
+++ b/python/pyarrow-stubs/_dataset_parquet_encryption.pyi
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions
+from ._parquet import FileDecryptionProperties
+from ._parquet_encryption import CryptoFactory, DecryptionConfiguration, EncryptionConfiguration, KmsConnectionConfig
+from .lib import _Weakrefable
+
+
+class ParquetEncryptionConfig(_Weakrefable):
+
+    def __init__(
+        self,
+        crypto_factory: CryptoFactory,
+        kms_connection_config: KmsConnectionConfig,
+        encryption_config: EncryptionConfiguration,
+    ) -> None: ...
+
+
+class ParquetDecryptionConfig(_Weakrefable):
+
+    def __init__(
+        self,
+        crypto_factory: CryptoFactory,
+        kms_connection_config: KmsConnectionConfig,
+        decryption_config: DecryptionConfiguration,
+    ) -> None: ...
+
+
+def set_encryption_config(
+    opts: ParquetFileWriteOptions,
+    config: ParquetEncryptionConfig,
+) -> None: ...
+
+
+def set_decryption_properties(
+    opts: ParquetFragmentScanOptions,
+    config: FileDecryptionProperties,
+) -> None: ...
+
+
+def set_decryption_config(
+    opts: ParquetFragmentScanOptions,
+    config: ParquetDecryptionConfig,
+) -> None: ...
diff --git a/python/pyarrow-stubs/_feather.pyi b/python/pyarrow-stubs/_feather.pyi
new file mode 100644
index 00000000000..edd3a089f82
--- /dev/null
+++ b/python/pyarrow-stubs/_feather.pyi
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +from typing import IO + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + + +class FeatherError(Exception): + ... + + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: int = 2, +): ... + + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: list[int]) -> Table: ... + def read_names(self, names: list[str]) -> Table: ... diff --git a/python/pyarrow-stubs/_flight.pyi b/python/pyarrow-stubs/_flight.pyi new file mode 100644 index 00000000000..e4d226a9f60 --- /dev/null +++ b/python/pyarrow-stubs/_flight.pyi @@ -0,0 +1,656 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import asyncio +import enum +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar + +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Schema, + Table, + TimestampScalar, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + +class FlightCallOptions(_Weakrefable): + + + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str, str]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: ... + + +class CertKeyPair(NamedTuple): + + + cert: str + key: str + +class FlightError(Exception): + + + extra_info: bytes + +class FlightInternalError(FlightError, ArrowException): ... + + +class FlightTimedOutError(FlightError, ArrowException): ... + + +class FlightCancelledError(FlightError, ArrowCancelled): ... + + +class FlightServerError(FlightError, ArrowException): ... + + +class FlightUnauthenticatedError(FlightError, ArrowException): ... + + +class FlightUnauthorizedError(FlightError, ArrowException): ... + + +class FlightUnavailableError(FlightError, ArrowException): ... + + +class FlightWriteSizeExceededError(ArrowInvalid): + + + limit: int + actual: int + +class Action(_Weakrefable): + + + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: ... + + @property + def type(self) -> str: ... + + @property + def body(self) -> Buffer: ... + + def serialize(self) -> bytes: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... 
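
For orientation, `Action` and `Result` form the request/response pair of the DoAction RPC; a minimal client-side sketch (the server location and the action name are hypothetical):

    import pyarrow.flight as flight

    client = flight.connect("grpc://localhost:8815")
    action = flight.Action("refresh-cache", b"")  # action type plus opaque body
    for result in client.do_action(action):
        print(result.body.to_pybytes())           # each Result carries a Buffer
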
+ + +class ActionType(NamedTuple): + + + type: str + description: str + + def make_action(self, buf: Buffer | bytes) -> Action: ... + + +class Result(_Weakrefable): + + def __init__(self, buf: Buffer | bytes) -> None: ... + + @property + def body(self) -> Buffer: ... + + def serialize(self) -> bytes: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class BasicAuth(_Weakrefable): + + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: ... + + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... + +class DescriptorType(enum.Enum): + + + UNKNOWN = 0 + PATH = 1 + CMD = 2 + +class FlightMethod(enum.Enum): + + + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + +class FlightDescriptor(_Weakrefable): + + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: ... + + + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: ... + + @property + def descriptor_type(self) -> DescriptorType: ... + + @property + def path(self) -> list[bytes] | None: ... + + @property + def command(self) -> bytes | None: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Ticket(_Weakrefable): + + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Location(_Weakrefable): + + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: ... + + @staticmethod + def for_grpc_tls(host: str | bytes, port: int) -> Location: ... + + @staticmethod + def for_grpc_unix(path: str | bytes) -> Location: ... + + +class FlightEndpoint(_Weakrefable): + + def __init__( + self, + ticket: Ticket | str | bytes, + locations: list[str | Location], + expiration_time: TimestampScalar | None = ..., + app_metadata: bytes | str = ..., + ): ... + + @property + def ticket(self) -> Ticket: ... + + @property + def locations(self) -> list[Location]: ... + + def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> TimestampScalar | None: ... + + @property + def app_metadata(self) -> bytes | str: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class SchemaResult(_Weakrefable): + + def __init__(self, schema: Schema) -> None: ... + + @property + def schema(self) -> Schema: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightInfo(_Weakrefable): + + def __init__( + self, + schema: Schema, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int = ..., + total_bytes: int = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., + ) -> None: ... + + @property + def schema(self) -> Schema: ... + + @property + def descriptor(self) -> FlightDescriptor: ... + + @property + def endpoints(self) -> list[FlightEndpoint]: ... + + @property + def total_records(self) -> int: ... + + @property + def total_bytes(self) -> int: ... 
+ + @property + def ordered(self) -> bool: ... + + @property + def app_metadata(self) -> bytes | str: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightStreamChunk(_Weakrefable): + + @property + def data(self) -> RecordBatch | None: ... + @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + + + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: ... + + def read_all(self) -> Table: ... + + def read_chunk(self) -> FlightStreamChunk: ... + + def to_reader(self) -> RecordBatchReader: ... + + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): ... + + +class FlightStreamReader(MetadataRecordBatchReader): + + def cancel(self) -> None: ... + + def read_all(self) -> Table: ... + + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + + + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: ... + + def write_metadata(self, buf: Buffer) -> None: ... + + def write_batch(self, batch: RecordBatch) -> None: ... # type: ignore[override] + + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: ... + + def close(self) -> None: ... + + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: ... + + +class FlightStreamWriter(MetadataRecordBatchWriter): + + def done_writing(self) -> None: ... + + +class FlightMetadataReader(_Weakrefable): + + def read(self) -> Buffer | None: ... + + +class FlightMetadataWriter(_Weakrefable): + + def write(self, message: Buffer) -> None: ... + + +class AsyncioCall(Generic[_T]): + + + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + +class AsyncioFlightClient: + + + def __init__(self, client: FlightClient) -> None: ... + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... + +class FlightClient(_Weakrefable): + + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... + def wait_for_available(self, timeout: int = 5) -> None: ... + + @deprecated( + "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." + ) + @classmethod + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: ... + + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: ... + + def authenticate_basic_token( + self, username: str, password: str, options: FlightCallOptions | None = None + ) -> tuple[str, str]: ... 
+ + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: ... + + def do_action( + self, action: Action, options: FlightCallOptions | None = None + ) -> Iterator[Result]: ... + + def list_flights( + self, criteria: str | None = None, options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: ... + + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: ... + + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> Schema: ... + + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: ... + + def do_put( + self, + descriptor: FlightDescriptor, + schema: Schema, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback) -> None: ... + +class FlightDataStream(_Weakrefable): ... + + +class RecordBatchStream(FlightDataStream): + + def __init__( + self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None + ) -> None: ... + + +class GeneratorStream(FlightDataStream): + + def __init__( + self, + schema: Schema, + generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], + options: IpcWriteOptions | None = None, + ) -> None: ... + + +class ServerCallContext(_Weakrefable): + + def peer_identity(self) -> bytes: ... + + def peer(self) -> str: ... + + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: ... + + def add_header(self, key: str, value: str) -> None: ... + + def add_trailer(self, key: str, value: str) -> None: ... + + def get_middleware(self, key: str) -> ServerMiddleware | None: ... + + +class ServerAuthReader(_Weakrefable): + + def read(self) -> str: ... + +class ServerAuthSender(_Weakrefable): + + def write(self, message: str) -> None: ... + +class ClientAuthReader(_Weakrefable): + + def read(self) -> str: ... + +class ClientAuthSender(_Weakrefable): + + def write(self, message: str) -> None: ... + +class ServerAuthHandler(_Weakrefable): + + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): ... + + def is_valid(self, token: str) -> bool: ... + + +class ClientAuthHandler(_Weakrefable): + + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): ... + + def get_token(self) -> str: ... + + +class CallInfo(NamedTuple): + + + method: FlightMethod + +class ClientMiddlewareFactory(_Weakrefable): + + def start_call(self, info: CallInfo) -> ClientMiddleware | None: ... + + +class ClientMiddleware(_Weakrefable): + + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... + + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): ... + + + def call_completed(self, exception: ArrowException): ... + + +class ServerMiddlewareFactory(_Weakrefable): + + + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware | None: ... + + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): ... + + +class ServerMiddleware(_Weakrefable): + + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... 
+
+    def call_completed(self, exception: ArrowException): ...
+
+
+class TracingServerMiddleware(ServerMiddleware):
+    trace_context: dict
+    def __init__(self, trace_context: dict) -> None: ...
+
+class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory):
+
+
+    def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ...
+    def start_call(  # type: ignore[override]
+        self, info: CallInfo, headers: dict[str, list[str] | list[bytes]]
+    ) -> _ServerMiddlewareWrapper | None: ...
+
+class _ServerMiddlewareWrapper(ServerMiddleware):
+    def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ...
+    def sending_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ...
+    def call_completed(self, exception: ArrowException) -> None: ...
+
+class _FlightServerFinalizer(_Weakrefable):
+
+
+    def finalize(self) -> None: ...
+
+class FlightServerBase(_Weakrefable):
+
+    def __init__(
+        self,
+        location: str | tuple[str, int] | Location | None = None,
+        auth_handler: ServerAuthHandler | None = None,
+        tls_certificates: list[tuple[str, str]] | None = None,
+        verify_client: bool = False,
+        root_certificates: str | None = None,
+        middleware: dict[str, ServerMiddlewareFactory] | None = None,
+    ): ...
+    @property
+    def port(self) -> int: ...
+
+    def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: ...
+
+    def get_flight_info(
+        self, context: ServerCallContext, descriptor: FlightDescriptor
+    ) -> FlightInfo: ...
+
+    def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: ...
+
+    def do_put(
+        self,
+        context: ServerCallContext,
+        descriptor: FlightDescriptor,
+        reader: MetadataRecordBatchReader,
+        writer: FlightMetadataWriter,
+    ) -> None: ...
+
+    def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: ...
+
+    def do_exchange(
+        self,
+        context: ServerCallContext,
+        descriptor: FlightDescriptor,
+        reader: MetadataRecordBatchReader,
+        writer: MetadataRecordBatchWriter,
+    ) -> None: ...
+
+    def list_actions(self, context: ServerCallContext) -> Iterable[Action]: ...
+
+    def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: ...
+
+    def serve(self) -> None: ...
+
+    def run(self) -> None: ...
+
+    def shutdown(self) -> None: ...
+
+    def wait(self) -> None: ...
+
+    def __enter__(self) -> Self: ...
+    def __exit__(self, exc_type, exc_value, traceback): ...
+
+def connect(
+    location: str | tuple[str, int] | Location,
+    *,
+    tls_root_certs: str | None = None,
+    cert_chain: str | None = None,
+    private_key: str | None = None,
+    override_hostname: str | None = None,
+    middleware: list[ClientMiddlewareFactory] | None = None,
+    write_size_limit_bytes: int | None = None,
+    disable_server_verification: bool = False,
+    generic_options: list[tuple[str, int | str]] | None = None,
+) -> FlightClient: ...
diff --git a/python/pyarrow-stubs/_fs.pyi b/python/pyarrow-stubs/_fs.pyi
new file mode 100644
index 00000000000..9ec5c543c58
--- /dev/null
+++ b/python/pyarrow-stubs/_fs.pyi
@@ -0,0 +1,216 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt
+import enum
+import sys
+
+from abc import ABC, abstractmethod
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+if sys.version_info >= (3, 10):
+    from typing import TypeAlias
+else:
+    from typing_extensions import TypeAlias
+
+from typing import Union
+
+from fsspec import AbstractFileSystem  # type: ignore
+
+from .lib import NativeFile, _Weakrefable
+
+
+class FileType(enum.IntFlag):
+    NotFound = enum.auto()
+    Unknown = enum.auto()
+    File = enum.auto()
+    Directory = enum.auto()
+
+
+class FileInfo(_Weakrefable):
+
+    def __init__(
+        self,
+        path: str,
+        type: FileType = FileType.Unknown,
+        *,
+        mtime: dt.datetime | float | None = None,
+        mtime_ns: int | None = None,
+        size: int | None = None,
+    ): ...
+    @property
+    def type(self) -> FileType: ...
+
+    @property
+    def is_file(self) -> bool: ...
+    @property
+    def path(self) -> str: ...
+
+    @property
+    def base_name(self) -> str: ...
+
+    @property
+    def size(self) -> int: ...
+
+    @property
+    def extension(self) -> str: ...
+
+    @property
+    def mtime(self) -> dt.datetime | None: ...
+
+    @property
+    def mtime_ns(self) -> int | None: ...
+
+
+class FileSelector(_Weakrefable):
+
+    base_dir: str
+    allow_not_found: bool
+    recursive: bool
+    def __init__(self, base_dir: str, allow_not_found: bool = False,
+                 recursive: bool = False): ...
+
+
+class FileSystem(_Weakrefable):
+
+    @classmethod
+    def from_uri(cls, uri: str) -> tuple[Self, str]: ...
+
+    def equals(self, other: FileSystem) -> bool: ...
+
+    @property
+    def type_name(self) -> str: ...
+
+    def get_file_info(self, paths_or_selector: str | FileSelector |
+                      list[str]) -> FileInfo | list[FileInfo]: ...
+
+    def create_dir(self, path: str, *, recursive: bool = True) -> None: ...
+
+    def delete_dir(self, path: str) -> None: ...
+
+    def delete_dir_contents(
+        self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False
+    ) -> None: ...
+
+    def move(self, src: str, dest: str) -> None: ...
+
+    def copy_file(self, src: str, dest: str) -> None: ...
+
+    def delete_file(self, path: str) -> None: ...
+
+    def open_input_file(self, path: str) -> NativeFile: ...
+
+    def open_input_stream(
+        self, path: str, compression: str | None = "detect", buffer_size: int | None = None
+    ) -> NativeFile: ...
+
+    def open_output_stream(
+        self,
+        path: str,
+        compression: str | None = "detect",
+        buffer_size: int | None = None,
+        metadata: dict[str, str] | None = None,
+    ) -> NativeFile: ...
+
+    def open_append_stream(
+        self,
+        path: str,
+        compression: str | None = "detect",
+        buffer_size: int | None = None,
+        metadata: dict[str, str] | None = None,
+    ): ...
+
+    def normalize_path(self, path: str) -> str: ...
+
+
+class LocalFileSystem(FileSystem):
+
+    def __init__(self, *, use_mmap: bool = False) -> None: ...
+
+
+class SubTreeFileSystem(FileSystem):
+
+    def __init__(self, base_path: str, base_fs: FileSystem): ...
+    @property
+    def base_path(self) -> str: ...
+    @property
+    def base_fs(self) -> FileSystem: ...
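
A short sketch of how the FileSystem surface above is typically driven; the paths are hypothetical:

    from pyarrow import fs

    local = fs.LocalFileSystem()
    for info in local.get_file_info(fs.FileSelector("data/", recursive=True)):
        if info.type == fs.FileType.File:
            print(info.path, info.size)

    # from_uri() splits a URI into a concrete filesystem plus the path within it.
    s3, path = fs.FileSystem.from_uri("s3://bucket/key.parquet")
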
+ + +class _MockFileSystem(FileSystem): + def __init__(self, current_time: dt.datetime | None = None) -> None: ... + + +class PyFileSystem(FileSystem): + + def __init__(self, handler: FileSystemHandler) -> None: ... + @property + def handler(self) -> FileSystemHandler: ... + + +class FileSystemHandler(ABC): + + @abstractmethod + def get_type_name(self) -> str: ... + + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: ... + + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... + + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: ... + + @abstractmethod + def delete_dir(self, path: str) -> None: ... + + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + + @abstractmethod + def delete_root_dir_contents(self) -> None: ... + + @abstractmethod + def delete_file(self, path: str) -> None: ... + + @abstractmethod + def move(self, src: str, dest: str) -> None: ... + + @abstractmethod + def copy_file(self, src: str, dest: str) -> None: ... + + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: ... + + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: ... + + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + + @abstractmethod + def normalize_path(self, path: str) -> str: ... + + +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] diff --git a/python/pyarrow-stubs/_gcsfs.pyi b/python/pyarrow-stubs/_gcsfs.pyi new file mode 100644 index 00000000000..631c7ae4932 --- /dev/null +++ b/python/pyarrow-stubs/_gcsfs.pyi @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt + +from ._fs import FileSystem +from .lib import KeyValueMetadata + + +class GcsFileSystem(FileSystem): + + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: ... + + @property + def project_id(self) -> str: ... 
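
As an illustration of the constructor above, anonymous read-only access to a public bucket might look like this; the bucket and object names are made up:

    from pyarrow import fs
    import pyarrow.parquet as pq

    gcs = fs.GcsFileSystem(anonymous=True, default_bucket_location="US")
    table = pq.read_table("some-public-bucket/path/data.parquet", filesystem=gcs)
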
diff --git a/python/pyarrow-stubs/_hdfs.pyi b/python/pyarrow-stubs/_hdfs.pyi
new file mode 100644
index 00000000000..ee1253d64b6
--- /dev/null
+++ b/python/pyarrow-stubs/_hdfs.pyi
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from _typeshed import StrPath
+
+from ._fs import FileSystem
+
+
+class HadoopFileSystem(FileSystem):
+
+    def __init__(
+        self,
+        host: str,
+        port: int = 8020,
+        *,
+        user: str | None = None,
+        replication: int = 3,
+        buffer_size: int = 0,
+        default_block_size: int | None = None,
+        kerb_ticket: StrPath | None = None,
+        extra_conf: dict | None = None,
+    ): ...
+    @staticmethod
+    def from_uri(uri: str) -> HadoopFileSystem: ...  # type: ignore[override]
diff --git a/python/pyarrow-stubs/_ipc.pyi b/python/pyarrow-stubs/_ipc.pyi
new file mode 100644
index 00000000000..6e83541bf5c
--- /dev/null
+++ b/python/pyarrow-stubs/_ipc.pyi
@@ -0,0 +1,287 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import enum
+import sys
+
+from io import IOBase
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+from typing import Iterable, Iterator, Literal, Mapping, NamedTuple
+
+import pandas as pd
+
+from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer
+from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable
+
+from .io import Buffer, Codec, NativeFile
+from ._types import DictionaryMemo, KeyValueMetadata
+
+
+class MetadataVersion(enum.IntEnum):
+    V1 = enum.auto()
+    V2 = enum.auto()
+    V3 = enum.auto()
+    V4 = enum.auto()
+    V5 = enum.auto()
+
+
+class WriteStats(NamedTuple):
+    num_messages: int
+    num_record_batches: int
+    num_dictionary_batches: int
+    num_dictionary_deltas: int
+    num_replaced_dictionaries: int
+
+
+class ReadStats(NamedTuple):
+    num_messages: int
+    num_record_batches: int
+    num_dictionary_batches: int
+    num_dictionary_deltas: int
+    num_replaced_dictionaries: int
+
+
+class IpcReadOptions(_Weakrefable):
+
+    ensure_native_endian: bool
+    use_threads: bool
+    included_fields: list[int]
+
+    def __init__(
+        self,
+        *,
+        ensure_native_endian: bool = True,
+        use_threads: bool = True,
+        included_fields: list[int] | None = None,
+    ) -> None: ...
+
+
+class IpcWriteOptions(_Weakrefable):
+    metadata_version: MetadataVersion
+    allow_64bit: bool
+    use_legacy_format: bool
+    compression: Codec | Literal["lz4", "zstd"] | None
+    use_threads: bool
+    emit_dictionary_deltas: bool
+    unify_dictionaries: bool
+
+    def __init__(
+        self,
+        *,
+        metadata_version: MetadataVersion = MetadataVersion.V5,
+        allow_64bit: bool = False,
+        use_legacy_format: bool = False,
+        compression: Codec | Literal["lz4", "zstd"] | None = None,
+        use_threads: bool = True,
+        emit_dictionary_deltas: bool = False,
+        unify_dictionaries: bool = False,
+    ) -> None: ...
+
+
+class Message(_Weakrefable):
+    @property
+    def type(self) -> str: ...
+    @property
+    def metadata(self) -> Buffer: ...
+    @property
+    def metadata_version(self) -> MetadataVersion: ...
+    @property
+    def body(self) -> Buffer | None: ...
+    def equals(self, other: Message) -> bool: ...
+
+    def serialize_to(
+        self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None
+    ): ...
+
+    def serialize(self, alignment: int = 8, memory_pool: MemoryPool |
+                  None = None) -> Buffer: ...
+
+
+class MessageReader(_Weakrefable):
+    @classmethod
+    def open_stream(cls, source: bytes | NativeFile |
+                    IOBase | SupportPyBuffer) -> Self: ...
+
+    def __iter__(self) -> Self: ...
+    def read_next_message(self) -> Message: ...
+
+    __next__ = read_next_message
+
+# ----------------------------------------------------------------------
+# File and stream readers and writers
+
+
+class _CRecordBatchWriter(_Weakrefable):
+    def write(self, table_or_batch: Table | RecordBatch): ...
+
+    def write_batch(
+        self,
+        batch: RecordBatch,
+        custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None,
+    ): ...
+
+    def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ...
+
+    def close(self) -> None: ...
+
+    def __enter__(self) -> Self: ...
+    def __exit__(self, exc_type, exc_val, exc_tb): ...
+    @property
+    def stats(self) -> WriteStats: ...
+
+
+class _RecordBatchStreamWriter(_CRecordBatchWriter):
+    @property
+    def _use_legacy_format(self) -> bool: ...
+    @property
+    def _metadata_version(self) -> MetadataVersion: ...
+    def _open(self, sink, schema: Schema,
+              options: IpcWriteOptions = IpcWriteOptions()): ...
+
+
+class _ReadPandasMixin:
+    def read_pandas(self, **options) -> pd.DataFrame: ...
+
+
+class RecordBatchReader(_Weakrefable):
+    def __iter__(self) -> Self: ...
+    def read_next_batch(self) -> RecordBatch: ...
+
+    __next__ = read_next_batch
+    @property
+    def schema(self) -> Schema: ...
+
+    def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ...
+
+    def iter_batches_with_custom_metadata(
+        self,
+    ) -> Iterator[RecordBatchWithMetadata]: ...
+
+    def read_all(self) -> Table: ...
+
+    # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType]
+    read_pandas = _ReadPandasMixin.read_pandas
+    def close(self) -> None: ...
+
+    def __enter__(self) -> Self: ...
+    def __exit__(self, exc_type, exc_val, exc_tb): ...
+    def cast(self, target_schema: Schema) -> Self: ...
+
+    def _export_to_c(self, out_ptr: int) -> None: ...
+
+    @classmethod
+    def _import_from_c(cls, in_ptr: int) -> Self: ...
+
+    def __arrow_c_stream__(self, requested_schema=None): ...
+
+    @classmethod
+    def _import_from_c_capsule(cls, stream) -> Self: ...
+
+    @classmethod
+    def from_stream(cls, data: SupportArrowStream,
+                    schema: Schema | None = None) -> Self: ...
+
+    @classmethod
+    def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: ...
+
+
+class _RecordBatchStreamReader(RecordBatchReader):
+    @property
+    def stats(self) -> ReadStats: ...
+
+
+class _RecordBatchFileWriter(_RecordBatchStreamWriter):
+    ...
+
+
+class RecordBatchWithMetadata(NamedTuple):
+    batch: RecordBatch
+    custom_metadata: KeyValueMetadata
+
+
+class _RecordBatchFileReader(_Weakrefable):
+    @property
+    def num_record_batches(self) -> int: ...
+
+    def get_batch(self, i: int) -> RecordBatch: ...
+
+    get_record_batch = get_batch
+    def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ...
+
+    def read_all(self) -> Table: ...
+
+    # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType]
+    read_pandas = _ReadPandasMixin.read_pandas
+    def __enter__(self) -> Self: ...
+    def __exit__(self, exc_type, exc_val, exc_tb): ...
+    @property
+    def schema(self) -> Schema: ...
+    @property
+    def stats(self) -> ReadStats: ...
+
+
+def get_tensor_size(tensor: Tensor) -> int: ...
+
+
+def get_record_batch_size(batch: RecordBatch) -> int: ...
+
+
+def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ...
+
+
+def read_tensor(source: NativeFile) -> Tensor: ...
+
+
+def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ...
+
+
+def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo |
+                None = None) -> Schema: ...
+
+
+def read_record_batch(
+    obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None
+) -> RecordBatch: ...
+
+
+__all__ = [
+    "MetadataVersion",
+    "WriteStats",
+    "ReadStats",
+    "IpcReadOptions",
+    "IpcWriteOptions",
+    "Message",
+    "MessageReader",
+    "_CRecordBatchWriter",
+    "_RecordBatchStreamWriter",
+    "_ReadPandasMixin",
+    "RecordBatchReader",
+    "_RecordBatchStreamReader",
+    "_RecordBatchFileWriter",
+    "RecordBatchWithMetadata",
+    "_RecordBatchFileReader",
+    "get_tensor_size",
+    "get_record_batch_size",
+    "write_tensor",
+    "read_tensor",
+    "read_message",
+    "read_schema",
+    "read_record_batch",
+]
diff --git a/python/pyarrow-stubs/_json.pyi b/python/pyarrow-stubs/_json.pyi
new file mode 100644
index 00000000000..b52be2bf028
--- /dev/null
+++ b/python/pyarrow-stubs/_json.pyi
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + +class ReadOptions(_Weakrefable): + + + use_threads: bool + + block_size: int + + def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... + def equals(self, other: ReadOptions) -> bool: ... + + +class ParseOptions(_Weakrefable): + + + explicit_schema: Schema + + newlines_in_values: bool + + unexpected_field_behavior: Literal["ignore", "error", "infer"] + + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: ... + + +class JSONStreamingReader(RecordBatchReader): ... + + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: ... + + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: ... + diff --git a/python/pyarrow-stubs/_orc.pyi b/python/pyarrow-stubs/_orc.pyi new file mode 100644 index 00000000000..7587cc121c3 --- /dev/null +++ b/python/pyarrow-stubs/_orc.pyi @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... 
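
A small sketch of the JSON entry points stubbed above; the file name and schema are hypothetical:

    import pyarrow as pa
    from pyarrow import json

    opts = json.ParseOptions(
        explicit_schema=pa.schema([("id", pa.int64()), ("name", pa.string())]),
        unexpected_field_behavior="ignore",  # skip fields not in the schema
    )
    table = json.read_json("records.jsonl", parse_options=opts)
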
+ +class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None, + compression_block_size: int | None = None, + compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None, + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/python/pyarrow-stubs/_parquet.pyi b/python/pyarrow-stubs/_parquet.pyi new file mode 100644 index 00000000000..35ee2b41fde --- /dev/null +++ b/python/pyarrow-stubs/_parquet.pyi @@ -0,0 +1,492 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict + +from _typeshed import StrPath + +from ._stubs_typing import Order +from .lib import ( + Buffer, + ChunkedArray, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +_PhysicalType: TypeAlias = Literal[ + "BOOLEAN", + "INT32", + "INT64", + "INT96", + "FLOAT", + "DOUBLE", + "BYTE_ARRAY", + "FIXED_LEN_BYTE_ARRAY", + "UNKNOWN", +] +_LogicTypeName: TypeAlias = Literal[ + "UNDEFINED", + "STRING", + "MAP", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME", + "TIMESTAMP", + "INT", + "FLOAT16", + "JSON", + "BSON", + "UUID", + "NONE", + "UNKNOWN", +] +_ConvertedType: TypeAlias = Literal[ + "NONE", + "UTF8", + "MAP", + "MAP_KEY_VALUE", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME_MILLIS", + "TIME_MICROS", + "TIMESTAMP_MILLIS", + "TIMESTAMP_MICROS", + "UINT_8", + "UINT_16", + "UINT_32", + "UINT_64", + "INT_8", + "INT_16", + "INT_32", + "INT_64", + "JSON", + "BSON", + "INTERVAL", + "UNKNOWN", +] +_Encoding: TypeAlias = Literal[ + "PLAIN", + "PLAIN_DICTIONARY", + "RLE", + "BIT_PACKED", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + "UNKNOWN", +] +_Compression: TypeAlias = Literal[ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "LZ4", + "ZSTD", + "UNKNOWN", +] + + +class _Statistics(TypedDict): + has_min_max: bool + min: Any | None + max: Any | None + null_count: int | None + distinct_count: int | None + num_values: int + physical_type: _PhysicalType + + +class Statistics(_Weakrefable): + def to_dict(self) -> _Statistics: ... + def equals(self, other: Statistics) -> bool: ... + @property + def has_min_max(self) -> bool: ... + @property + def has_null_count(self) -> bool: ... 
+ @property + def has_distinct_count(self) -> bool: ... + @property + def min_raw(self) -> Any | None: ... + @property + def max_raw(self) -> Any | None: ... + @property + def min(self) -> Any | None: ... + @property + def max(self) -> Any | None: ... + @property + def null_count(self) -> int | None: ... + @property + def distinct_count(self) -> int | None: ... + @property + def num_values(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + + +class ParquetLogicalType(_Weakrefable): + def to_json(self) -> str: ... + @property + def type(self) -> _LogicTypeName: ... + + +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... + def equals(self, other: ColumnChunkMetaData) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def num_values(self) -> int: ... + @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics | None: ... + @property + def compression(self) -> _Compression: ... + @property + def encodings(self) -> tuple[_Encoding, ...]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] + + +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... 
+    def equals(self, other: RowGroupMetaData) -> bool: ...
+    def column(self, i: int) -> ColumnChunkMetaData: ...
+    def to_dict(self) -> _RowGroupMetaData: ...
+    @property
+    def num_columns(self) -> int: ...
+    @property
+    def num_rows(self) -> int: ...
+    @property
+    def total_byte_size(self) -> int: ...
+    @property
+    def sorting_columns(self) -> list[SortingColumn]: ...
+
+
+class _FileMetaData(TypedDict):
+    created_by: str
+    num_columns: int
+    num_rows: int
+    num_row_groups: int
+    format_version: str
+    serialized_size: int
+
+
+class FileMetaData(_Weakrefable):
+    def __hash__(self) -> int: ...
+    def to_dict(self) -> _FileMetaData: ...
+    def equals(self, other: FileMetaData) -> bool: ...
+    @property
+    def schema(self) -> ParquetSchema: ...
+    @property
+    def serialized_size(self) -> int: ...
+    @property
+    def num_columns(self) -> int: ...
+    @property
+    def num_rows(self) -> int: ...
+    @property
+    def num_row_groups(self) -> int: ...
+    @property
+    def format_version(self) -> str: ...
+    @property
+    def created_by(self) -> str: ...
+    @property
+    def metadata(self) -> dict[bytes, bytes] | None: ...
+    def row_group(self, i: int) -> RowGroupMetaData: ...
+    def set_file_path(self, path: str) -> None: ...
+    def append_row_groups(self, other: FileMetaData) -> None: ...
+    def write_metadata_file(self, where: StrPath | Buffer |
+                            NativeFile | IO) -> None: ...
+
+
+class ParquetSchema(_Weakrefable):
+    def __init__(self, container: FileMetaData) -> None: ...
+    def __getitem__(self, i: int) -> ColumnSchema: ...
+    def __hash__(self) -> int: ...
+    def __len__(self) -> int: ...
+    @property
+    def names(self) -> list[str]: ...
+    def to_arrow_schema(self) -> Schema: ...
+    def equals(self, other: ParquetSchema) -> bool: ...
+    def column(self, i: int) -> ColumnSchema: ...
+
+
+class ColumnSchema(_Weakrefable):
+    def __init__(self, schema: ParquetSchema, index: int) -> None: ...
+    def equals(self, other: ColumnSchema) -> bool: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def path(self) -> str: ...
+    @property
+    def max_definition_level(self) -> int: ...
+    @property
+    def max_repetition_level(self) -> int: ...
+    @property
+    def physical_type(self) -> _PhysicalType: ...
+    @property
+    def logical_type(self) -> ParquetLogicalType: ...
+    @property
+    def converted_type(self) -> _ConvertedType | None: ...
+    @property
+    def length(self) -> int | None: ...
+    @property
+    def precision(self) -> int | None: ...
+    @property
+    def scale(self) -> int | None: ...
+
+
+class ParquetReader(_Weakrefable):
+    def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+
+    def open(
+        self,
+        source: StrPath | NativeFile | IO,
+        *,
+        use_memory_map: bool = False,
+        read_dictionary: Iterable[int] | Iterable[str] | None = None,
+        metadata: FileMetaData | None = None,
+        buffer_size: int = 0,
+        pre_buffer: bool = False,
+        coerce_int96_timestamp_unit: str | None = None,
+        decryption_properties: FileDecryptionProperties | None = None,
+        thrift_string_size_limit: int | None = None,
+        thrift_container_size_limit: int | None = None,
+        page_checksum_verification: bool = False,
+    ) -> None: ...
+    @property
+    def column_paths(self) -> list[str]: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def schema_arrow(self) -> Schema: ...
+    @property
+    def num_row_groups(self) -> int: ...
+    def set_use_threads(self, use_threads: bool) -> None: ...
+    def set_batch_size(self, batch_size: int) -> None: ...
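+
+    # Illustrative usage sketch, not part of the stub API surface; the file
+    # name "data.parquet" is a hypothetical placeholder:
+    #
+    #   reader = ParquetReader()
+    #   reader.open("data.parquet")
+    #   table = reader.read_all()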
+
+    def iter_batches(
+        self,
+        batch_size: int,
+        row_groups: list[int],
+        column_indices: list[int] | None = None,
+        use_threads: bool = True,
+    ) -> Iterator[RecordBatch]: ...
+
+    def read_row_group(
+        self, i: int, column_indices: list[int] | None = None, use_threads: bool = True
+    ) -> Table: ...
+
+    def read_row_groups(
+        self,
+        row_groups: list[int],
+        column_indices: list[int] | None = None,
+        use_threads: bool = True,
+    ) -> Table: ...
+
+    def read_all(
+        self, column_indices: list[int] | None = None, use_threads: bool = True
+    ) -> Table: ...
+    def scan_contents(
+        self, column_indices: list[int] | None = None, batch_size: int = 65536) -> int: ...
+
+    def column_name_idx(self, column_name: str) -> int: ...
+    def read_column(self, column_index: int) -> ChunkedArray: ...
+    def close(self) -> None: ...
+    @property
+    def closed(self) -> bool: ...
+
+
+class ParquetWriter(_Weakrefable):
+    def __init__(
+        self,
+        where: StrPath | NativeFile | IO,
+        schema: Schema,
+        use_dictionary: bool | list[str] | None = None,
+        compression: _Compression | dict[str, _Compression] | None = None,
+        version: str | None = None,
+        write_statistics: bool | list[str] | None = None,
+        memory_pool: MemoryPool | None = None,
+        use_deprecated_int96_timestamps: bool = False,
+        coerce_timestamps: Literal["ms", "us"] | None = None,
+        data_page_size: int | None = None,
+        allow_truncated_timestamps: bool = False,
+        compression_level: int | dict[str, int] | None = None,
+        use_byte_stream_split: bool | list[str] = False,
+        column_encoding: _Encoding | dict[str, _Encoding] | None = None,
+        writer_engine_version: str | None = None,
+        data_page_version: str | None = None,
+        use_compliant_nested_type: bool = True,
+        encryption_properties: FileEncryptionProperties | None = None,
+        write_batch_size: int | None = None,
+        dictionary_pagesize_limit: int | None = None,
+        store_schema: bool = True,
+        write_page_index: bool = False,
+        write_page_checksum: bool = False,
+        sorting_columns: tuple[SortingColumn, ...] | None = None,
+        store_decimal_as_integer: bool = False,
+    ) -> None: ...
+    def close(self) -> None: ...
+    def write_table(self, table: Table, row_group_size: int | None = None) -> None: ...
+    def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def use_dictionary(self) -> bool | list[str] | None: ...
+    @property
+    def use_deprecated_int96_timestamps(self) -> bool: ...
+    @property
+    def use_byte_stream_split(self) -> bool | list[str]: ...
+    @property
+    def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ...
+    @property
+    def coerce_timestamps(self) -> Literal["ms", "us"] | None: ...
+    @property
+    def allow_truncated_timestamps(self) -> bool: ...
+    @property
+    def compression(self) -> _Compression | dict[str, _Compression] | None: ...
+    @property
+    def compression_level(self) -> int | dict[str, int] | None: ...
+    @property
+    def data_page_version(self) -> str | None: ...
+    @property
+    def use_compliant_nested_type(self) -> bool: ...
+    @property
+    def version(self) -> str | None: ...
+    @property
+    def write_statistics(self) -> bool | list[str] | None: ...
+    @property
+    def writer_engine_version(self) -> str: ...
+    @property
+    def row_group_size(self) -> int: ...
+    @property
+    def data_page_size(self) -> int: ...
+    @property
+    def encryption_properties(self) -> FileEncryptionProperties: ...
+    @property
+    def write_batch_size(self) -> int: ...
+    @property
+    def dictionary_pagesize_limit(self) -> int: ...
+ @property + def store_schema(self) -> bool: ... + @property + def store_decimal_as_integer(self) -> bool: ... + + +class FileEncryptionProperties: + ... + + +class FileDecryptionProperties: + ... diff --git a/python/pyarrow-stubs/_parquet_encryption.pyi b/python/pyarrow-stubs/_parquet_encryption.pyi new file mode 100644 index 00000000000..cf09b6ee39c --- /dev/null +++ b/python/pyarrow-stubs/_parquet_encryption.pyi @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt + +from typing import Callable + +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable + + +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int + + def __init__( + self, + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + ) -> None: ... + + +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + + +class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... + + +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... + + +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[ + KmsConnectionConfig], KmsClient]): ... + + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... 
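+
+
+# Illustrative usage sketch, not part of the stubs: how these classes are
+# meant to compose for Parquet column encryption. The KMS client factory,
+# endpoint, and key identifiers below are hypothetical placeholders.
+#
+#   factory = CryptoFactory(my_kms_client_factory)
+#   encryption_config = EncryptionConfiguration(
+#       footer_key="footer-key-id",
+#       column_keys={"column-key-id": ["secret_column"]},
+#   )
+#   file_encryption_properties = factory.file_encryption_properties(
+#       KmsConnectionConfig(kms_instance_url="https://kms.example.com"),
+#       encryption_config,
+#   )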
diff --git a/python/pyarrow-stubs/_s3fs.pyi b/python/pyarrow-stubs/_s3fs.pyi
new file mode 100644
index 00000000000..f065d78f993
--- /dev/null
+++ b/python/pyarrow-stubs/_s3fs.pyi
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import enum
+
+from typing import Literal, TypedDict
+from typing_extensions import Required, NotRequired
+
+from ._fs import FileSystem
+from .lib import KeyValueMetadata
+
+
+class _ProxyOptions(TypedDict):
+    scheme: Required[Literal["http", "https"]]
+    host: Required[str]
+    port: Required[int]
+    username: NotRequired[str]
+    password: NotRequired[str]
+
+
+class S3LogLevel(enum.IntEnum):
+    Off = enum.auto()
+    Fatal = enum.auto()
+    Error = enum.auto()
+    Warn = enum.auto()
+    Info = enum.auto()
+    Debug = enum.auto()
+    Trace = enum.auto()
+
+
+Off = S3LogLevel.Off
+Fatal = S3LogLevel.Fatal
+Error = S3LogLevel.Error
+Warn = S3LogLevel.Warn
+Info = S3LogLevel.Info
+Debug = S3LogLevel.Debug
+Trace = S3LogLevel.Trace
+
+
+def initialize_s3(
+    log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1
+) -> None: ...
+def ensure_s3_initialized() -> None: ...
+def finalize_s3() -> None: ...
+def ensure_s3_finalized() -> None: ...
+def resolve_s3_region(bucket: str) -> str: ...
+
+
+class S3RetryStrategy:
+    max_attempts: int
+    def __init__(self, max_attempts: int = 3) -> None: ...
+
+
+class AwsStandardS3RetryStrategy(S3RetryStrategy):
+    ...
+
+
+class AwsDefaultS3RetryStrategy(S3RetryStrategy):
+    ...
+
+
+class S3FileSystem(FileSystem):
+    def __init__(
+        self,
+        *,
+        access_key: str | None = None,
+        secret_key: str | None = None,
+        session_token: str | None = None,
+        anonymous: bool = False,
+        region: str | None = None,
+        request_timeout: float | None = None,
+        connect_timeout: float | None = None,
+        scheme: Literal["http", "https"] = "https",
+        endpoint_override: str | None = None,
+        background_writes: bool = True,
+        default_metadata: dict | KeyValueMetadata | None = None,
+        role_arn: str | None = None,
+        session_name: str | None = None,
+        external_id: str | None = None,
+        load_frequency: int = 900,
+        proxy_options: _ProxyOptions | str | None = None,
+        allow_bucket_creation: bool = False,
+        allow_bucket_deletion: bool = False,
+        check_directory_existence_before_creation: bool = False,
+        retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3),
+        force_virtual_addressing: bool = False,
+    ): ...
+    @property
+    def region(self) -> str: ...
diff --git a/python/pyarrow-stubs/_stubs_typing.pyi b/python/pyarrow-stubs/_stubs_typing.pyi
new file mode 100644
index 00000000000..56aa7fd1123
--- /dev/null
+++ b/python/pyarrow-stubs/_stubs_typing.pyi
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt
+
+from collections.abc import Sequence
+from decimal import Decimal
+from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar, Iterator
+
+import numpy as np
+
+from numpy.typing import NDArray
+
+from pyarrow.lib import BooleanArray, IntegerArray
+
+ArrayLike: TypeAlias = Any
+ScalarLike: TypeAlias = Any
+Order: TypeAlias = Literal["ascending", "descending"]
+JoinType: TypeAlias = Literal[
+    "left semi",
+    "right semi",
+    "left anti",
+    "right anti",
+    "inner",
+    "left outer",
+    "right outer",
+    "full outer",
+]
+Compression: TypeAlias = Literal[
+    "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy"
+]
+NullEncoding: TypeAlias = Literal["mask", "encode"]
+NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"]
+Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray
+Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray
+PyScalar: TypeAlias = (
+    bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta
+)
+
+_T = TypeVar("_T")
+_V = TypeVar("_V", covariant=True)
+
+SingleOrList: TypeAlias = list[_T] | _T
+
+
+class SupportEq(Protocol):
+    def __eq__(self, other) -> bool: ...
+
+
+class SupportLt(Protocol):
+    def __lt__(self, other) -> bool: ...
+
+
+class SupportGt(Protocol):
+    def __gt__(self, other) -> bool: ...
+
+
+class SupportLe(Protocol):
+    def __le__(self, other) -> bool: ...
+
+
+class SupportGe(Protocol):
+    def __ge__(self, other) -> bool: ...
+
+
+FilterTuple: TypeAlias = (
+    tuple[str, Literal["=", "==", "!="], SupportEq]
+    | tuple[str, Literal["<"], SupportLt]
+    | tuple[str, Literal[">"], SupportGt]
+    | tuple[str, Literal["<="], SupportLe]
+    | tuple[str, Literal[">="], SupportGe]
+    | tuple[str, Literal["in", "not in"], Collection]
+)
+
+
+class Buffer(Protocol):
+    ...
+
+
+class SupportPyBuffer(Protocol):
+    ...
+
+
+class SupportArrowStream(Protocol):
+    def __arrow_c_stream__(self, requested_schema=None) -> Any: ...
+
+
+class SupportArrowArray(Protocol):
+    def __arrow_c_array__(self, requested_schema=None) -> Any: ...
+
+
+class SupportArrowDeviceArray(Protocol):
+    def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ...
+
+
+class SupportArrowSchema(Protocol):
+    def __arrow_c_schema__(self) -> Any: ...
+
+
+class NullableCollection(Protocol[_V]):  # pyright: ignore[reportInvalidTypeVarUse]
+    def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ...
+    def __len__(self) -> int: ...
+    def __contains__(self, item: Any, /) -> bool: ...
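+
+
+# Illustrative only: FilterTuple models the DNF-style filter triples used by
+# the Parquet and dataset readers, e.g.
+#
+#   filters: list[FilterTuple] = [("year", ">=", 2020), ("city", "in", {"NYC"})]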
diff --git a/python/pyarrow-stubs/_substrait.pyi b/python/pyarrow-stubs/_substrait.pyi
new file mode 100644
index 00000000000..12dd437412f
--- /dev/null
+++ b/python/pyarrow-stubs/_substrait.pyi
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any, Callable
+
+from ._compute import Expression
+from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable
+
+
+def run_query(
+    plan: Buffer | bytes,
+    *,
+    table_provider: Callable[[list[str], Schema], Table] | None = None,
+    use_threads: bool = True,
+) -> RecordBatchReader: ...
+def _parse_json_plan(plan: bytes) -> Buffer: ...
+
+
+class SubstraitSchema:
+    schema: Schema
+    expression: Expression
+    def __init__(self, schema: Schema, expression: Expression) -> None: ...
+    def to_pysubstrait(self) -> Any: ...
+
+
+def serialize_schema(schema: Schema) -> SubstraitSchema: ...
+def deserialize_schema(buf: Buffer | bytes) -> Schema: ...
+
+
+def serialize_expressions(
+    exprs: list[Expression],
+    names: list[str],
+    schema: Schema,
+    *,
+    allow_arrow_extensions: bool = False,
+) -> Buffer: ...
+
+
+class BoundExpressions(_Weakrefable):
+    @property
+    def schema(self) -> Schema: ...
+    @property
+    def expressions(self) -> dict[str, Expression]: ...
+    @classmethod
+    def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ...
+
+
+def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ...
+def get_supported_functions() -> list[str]: ...
diff --git a/python/pyarrow-stubs/_types.pyi b/python/pyarrow-stubs/_types.pyi
new file mode 100644
index 00000000000..0cb4bba6a6f
--- /dev/null
+++ b/python/pyarrow-stubs/_types.pyi
@@ -0,0 +1,992 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +import datetime as dt +import sys + +from collections.abc import Mapping, Sequence +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import Any, Generic, Iterable, Iterator, Literal + +import numpy as np +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowSchema +from pyarrow.lib import ( + Array, + ChunkedArray, + ExtensionArray, + MemoryPool, + MonthDayNano, + Table, +) +from typing_extensions import TypeVar, deprecated + +from .io import Buffer +from .scalar import ExtensionScalar + + +class _Weakrefable: + ... + + +class _Metadata(_Weakrefable): + ... + + +class DataType(_Weakrefable): + + def field(self, i: int) -> Field: ... + + @property + def id(self) -> int: ... + @property + def bit_width(self) -> int: ... + + @property + def byte_width(self) -> int: ... + + @property + def num_fields(self) -> int: ... + + @property + def num_buffers(self) -> int: ... + + def __hash__(self) -> int: ... + + def equals(self, other: DataType | str, *, + check_metadata: bool = False) -> bool: ... + + def to_pandas_dtype(self) -> np.generic: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_schema__(self) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: ... + + +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + + +class _BasicDataType(DataType, Generic[_AsPyType]): + ... + + +class NullType(_BasicDataType[None]): + ... + + +class BoolType(_BasicDataType[bool]): + ... + + +class UInt8Type(_BasicDataType[int]): + ... + + +class Int8Type(_BasicDataType[int]): + ... + + +class UInt16Type(_BasicDataType[int]): + ... + + +class Int16Type(_BasicDataType[int]): + ... + + +class Uint32Type(_BasicDataType[int]): + ... + + +class Int32Type(_BasicDataType[int]): + ... + + +class UInt64Type(_BasicDataType[int]): + ... + + +class Int64Type(_BasicDataType[int]): + ... + + +class Float16Type(_BasicDataType[float]): + ... + + +class Float32Type(_BasicDataType[float]): + ... + + +class Float64Type(_BasicDataType[float]): + ... + + +class Date32Type(_BasicDataType[dt.date]): + ... + + +class Date64Type(_BasicDataType[dt.date]): + ... + + +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): + ... + + +class StringType(_BasicDataType[str]): + ... + + +class LargeStringType(_BasicDataType[str]): + ... + + +class StringViewType(_BasicDataType[str]): + ... + + +class BinaryType(_BasicDataType[bytes]): + ... + + +class LargeBinaryType(_BasicDataType[bytes]): + ... + + +class BinaryViewType(_BasicDataType[bytes]): + ... + + +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) +_Tz = TypeVar("_Tz", str, None, default=None) + + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + + @property + def unit(self) -> _Unit: ... + + @property + def tz(self) -> _Tz: ... + + +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) + + +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + + @property + def unit(self) -> _Time32Unit: ... + + +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) + + +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + + @property + def unit(self) -> _Time64Unit: ... + + +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + + @property + def unit(self) -> _Unit: ... 
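+
+
+# Illustrative only: the _Unit/_Tz parameters above let type checkers carry
+# unit and timezone information through the factory functions defined later
+# in this module, e.g.
+#
+#   t = timestamp("ms", tz="UTC")  # checkers see TimestampType[Literal["ms"], str]
+#   d = duration("s")              # checkers see DurationType[Literal["s"]]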
+ + +class FixedSizeBinaryType(_BasicDataType[Decimal]): + ... + + +_Precision = TypeVar("_Precision", default=Any) +_Scale = TypeVar("_Scale", default=Any) + + +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class ListType(DataType, Generic[_DataTypeT]): + + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class LargeListType(DataType, Generic[_DataTypeT]): + + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... + + +class ListViewType(DataType, Generic[_DataTypeT]): + + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class LargeListViewType(DataType, Generic[_DataTypeT]): + + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + @property + def list_size(self) -> _Size: ... + + +class DictionaryMemo(_Weakrefable): + ... + + +_IndexT = TypeVar( + "_IndexT", + UInt8Type, + Int8Type, + UInt16Type, + Int16Type, + Uint32Type, + Int32Type, + UInt64Type, + Int64Type, +) +_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=DataType) +_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) + + +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): + + @property + def ordered(self) -> _Ordered: ... + + @property + def index_type(self) -> _IndexT: ... + + @property + def value_type(self) -> _BasicValueT: ... + + +_K = TypeVar("_K", bound=DataType) + + +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + + @property + def key_field(self) -> Field[_K]: ... + + @property + def key_type(self) -> _K: ... + + @property + def item_field(self) -> Field[_ValueT]: ... + + @property + def item_type(self) -> _ValueT: ... + + @property + def keys_sorted(self) -> _Ordered: ... + + +_Size = TypeVar("_Size", default=int) + + +class StructType(DataType): + + def get_field_index(self, name: str) -> int: ... + + def field(self, i: int | str) -> Field: ... + + def get_all_field_indices(self, name: str) -> list[int]: ... + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Field]: ... + + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + @property + def names(self) -> list[str]: ... + + @property + def fields(self) -> list[Field]: ... + + +class UnionType(DataType): + + @property + def mode(self) -> Literal["sparse", "dense"]: ... + + @property + def type_codes(self) -> list[int]: ... + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Field]: ... 
+ + def field(self, i: int) -> Field: ... + + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + + +class SparseUnionType(UnionType): + + @property + def mode(self) -> Literal["sparse"]: ... + + +class DenseUnionType(UnionType): + + @property + def mode(self) -> Literal["dense"]: ... + + +_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + + +class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): + + @property + def run_end_type(self) -> _RunEndType: ... + @property + def value_type(self) -> _BasicValueT: ... + + +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + + +class BaseExtensionType(DataType): + + def __arrow_ext_class__(self) -> type[ExtensionArray]: ... + + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... + + @property + def extension_name(self) -> str: ... + + @property + def storage_type(self) -> DataType: ... + + def wrap_array(self, storage: _StorageT) -> _StorageT: ... + + +class ExtensionType(BaseExtensionType): + + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + + def __arrow_ext_serialize__(self) -> bytes: ... + + @classmethod + def __arrow_ext_deserialize__( + cls, storage_type: DataType, serialized: bytes) -> Self: ... + + +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + + @property + def value_type(self) -> _ValueT: ... + + @property + def shape(self) -> list[int]: ... + + @property + def dim_names(self) -> list[str] | None: ... + + @property + def permutation(self) -> list[int] | None: ... + + +class Bool8Type(BaseExtensionType): + ... + + +class UuidType(BaseExtensionType): + ... + + +class JsonType(BaseExtensionType): + ... + + +class OpaqueType(BaseExtensionType): + + @property + def type_name(self) -> str: ... + + @property + def vendor_name(self) -> str: ... + + +# TODO +# @deprecated( +# "This class is deprecated and its deserialization is disabled by default. " +# ":class:`ExtensionType` is recommended instead." +# ) +# class PyExtensionType(ExtensionType): +# """ +# Concrete base class for Python-defined extension types based on pickle +# for (de)serialization. +# +# .. warning:: +# This class is deprecated and its deserialization is disabled by default. +# :class:`ExtensionType` is recommended instead. +# +# Parameters +# ---------- +# storage_type : DataType +# The storage type for which the extension is built. +# """ +# def __init__(self, storage_type: DataType) -> None: ... +# @classmethod +# def set_auto_load(cls, value: bool) -> None: +# """ +# Enable or disable auto-loading of serialized PyExtensionType instances. +# +# Parameters +# ---------- +# value : bool +# Whether to enable auto-loading. +# """ + +class UnknownExtensionType(ExtensionType): # type: ignore + + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + + +def register_extension_type(ext_type: ExtensionType) -> None: ... # type: ignore + + +def unregister_extension_type(type_name: str) -> None: ... + + +class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + + def __init__(self, __arg0__: Mapping[bytes, bytes] + | None = None, **kwargs) -> None: ... + + def equals(self, other: KeyValueMetadata) -> bool: ... + + def __len__(self) -> int: ... + + def __contains__(self, __key: object) -> bool: ... + + def __getitem__(self, __key: Any) -> Any: ... + + def __iter__(self) -> Iterator[bytes]: ... + + def get_all(self, key: str) -> list[bytes]: ... + + def to_dict(self) -> dict[bytes, bytes]: ... 
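+
+
+# Illustrative only: a minimal ExtensionType subclass of the shape the
+# registration hooks above expect; "myorg.rational" is a hypothetical name.
+#
+#   class RationalType(ExtensionType):
+#       def __init__(self) -> None:
+#           storage = struct([("num", int64()), ("den", int64())])
+#           super().__init__(storage, "myorg.rational")
+#       def __arrow_ext_serialize__(self) -> bytes:
+#           return b""
+#       @classmethod
+#       def __arrow_ext_deserialize__(cls, storage_type, serialized):
+#           return cls()
+#
+#   register_extension_type(RationalType())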
+
+
+class Field(_Weakrefable, Generic[_DataTypeT]):
+
+    def equals(self, other: Field, check_metadata: bool = False) -> bool: ...
+
+    def __hash__(self) -> int: ...
+
+    @property
+    def nullable(self) -> bool: ...
+
+    @property
+    def name(self) -> str: ...
+
+    @property
+    def metadata(self) -> dict[bytes, bytes] | None: ...
+
+    @property
+    def type(self) -> _DataTypeT: ...
+    def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: ...
+
+    def remove_metadata(self) -> Self: ...
+
+    def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: ...
+
+    def with_name(self, name: str) -> Self: ...
+
+    def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: ...
+
+    def flatten(self) -> list[Field]: ...
+
+    def _export_to_c(self, out_ptr: int) -> None: ...
+
+    @classmethod
+    def _import_from_c(cls, in_ptr: int) -> Self: ...
+
+    def __arrow_c_schema__(self) -> Any: ...
+
+    @classmethod
+    def _import_from_c_capsule(cls, schema) -> Self: ...
+
+
+class Schema(_Weakrefable):
+
+    def __len__(self) -> int: ...
+
+    def __getitem__(self, key: int | str) -> Field: ...
+
+    _field = __getitem__  # pyright: ignore[reportUnknownVariableType]
+    def __iter__(self) -> Iterator[Field]: ...
+
+    def __hash__(self) -> int: ...
+
+    def __sizeof__(self) -> int: ...
+    @property
+    def pandas_metadata(self) -> dict: ...
+
+    @property
+    def names(self) -> list[str]: ...
+
+    @property
+    def types(self) -> list[DataType]: ...
+
+    @property
+    def metadata(self) -> dict[bytes, bytes]: ...
+
+    def empty_table(self) -> Table: ...
+
+    def equals(self, other: Schema, check_metadata: bool = False) -> bool: ...
+
+    @classmethod
+    def from_pandas(cls, df: pd.DataFrame, preserve_index: bool |
+                    None = None) -> Schema: ...
+
+    def field(self, i: int | str | bytes) -> Field: ...
+
+    @deprecated("Use 'field' instead")
+    def field_by_name(self, name: str) -> Field: ...
+
+    def get_field_index(self, name: str) -> int: ...
+
+    def get_all_field_indices(self, name: str) -> list[int]: ...
+
+    def append(self, field: Field) -> Schema: ...
+
+    def insert(self, i: int, field: Field) -> Schema: ...
+
+    def remove(self, i: int) -> Schema: ...
+
+    def set(self, i: int, field: Field) -> Schema: ...
+
+    @deprecated("Use 'with_metadata' instead")
+    def add_metadata(self, metadata: dict) -> Schema: ...
+
+    def with_metadata(self, metadata: dict) -> Schema: ...
+
+    def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ...
+
+    def remove_metadata(self) -> Schema: ...
+
+    def to_string(
+        self,
+        truncate_metadata: bool = True,
+        show_field_metadata: bool = True,
+        show_schema_metadata: bool = True,
+    ) -> str: ...
+
+    def _export_to_c(self, out_ptr: int) -> None: ...
+
+    @classmethod
+    def _import_from_c(cls, in_ptr: int) -> Schema: ...
+
+    def __arrow_c_schema__(self) -> Any: ...
+
+    @staticmethod
+    def _import_from_c_capsule(schema: Any) -> Schema: ...
+
+
+def unify_schemas(
+    schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default"
+) -> Schema: ...
+
+
+def field(
+    name: SupportArrowSchema | str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None
+) -> Field[_DataTypeT] | Field[Any]: ...
+
+
+def null() -> NullType: ...
+
+
+def bool_() -> BoolType: ...
+
+
+def uint8() -> UInt8Type: ...
+
+
+def int8() -> Int8Type: ...
+
+
+def uint16() -> UInt16Type: ...
+
+
+def int16() -> Int16Type: ...
+
+
+def uint32() -> Uint32Type: ...
+
+
+def int32() -> Int32Type: ...
+
+
+def int64() -> Int64Type: ...
+
+
+def uint64() -> UInt64Type: ...
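+
+
+# Illustrative only: combined with the factories above, field() infers a
+# parameterized Field, e.g. field("x", int32(), nullable=False) is typed as
+# Field[Int32Type].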
+
+
+def timestamp(unit: _Unit, tz: _Tz | None = None) -> TimestampType[_Unit, _Tz]: ...
+
+
+def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: ...
+
+
+def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: ...
+
+
+def duration(unit: _Unit) -> DurationType[_Unit]: ...
+
+
+def month_day_nano_interval() -> MonthDayNanoIntervalType: ...
+
+
+def date32() -> Date32Type: ...
+
+
+def date64() -> Date64Type: ...
+
+
+def float16() -> Float16Type: ...
+
+
+def float32() -> Float32Type: ...
+
+
+def float64() -> Float64Type: ...
+
+
+def decimal32(precision: _Precision, scale: _Scale |
+              None = None) -> Decimal32Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def decimal64(precision: _Precision, scale: _Scale |
+              None = None) -> Decimal64Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def decimal128(precision: _Precision, scale: _Scale |
+               None = None) -> Decimal128Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def decimal256(precision: _Precision, scale: _Scale |
+               None = None) -> Decimal256Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def string() -> StringType: ...
+
+
+utf8 = string
+
+
+def binary(length: Literal[-1] | int = ...) -> BinaryType | FixedSizeBinaryType: ...
+
+
+def large_binary() -> LargeBinaryType: ...
+
+
+def large_string() -> LargeStringType: ...
+
+
+large_utf8 = large_string
+
+
+def binary_view() -> BinaryViewType: ...
+
+
+def string_view() -> StringViewType: ...
+
+
+def list_(
+    value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] | _Size | None = None
+) -> ListType[_DataTypeT] | FixedSizeListType[_DataTypeT, _Size]: ...
+
+
+def large_list(value_type: _DataTypeT |
+               Field[_DataTypeT]) -> LargeListType[_DataTypeT]: ...
+
+
+def list_view(value_type: _DataTypeT |
+              Field[_DataTypeT]) -> ListViewType[_DataTypeT]: ...
+
+
+def large_list_view(
+    value_type: _DataTypeT | Field[_DataTypeT],
+) -> LargeListViewType[_DataTypeT]: ...
+
+
+def map_(
+    key_type: _K, item_type: _ValueT, keys_sorted: _Ordered | None = None
+) -> MapType[_K, _ValueT, _Ordered]: ...
+
+
+def dictionary(
+    index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered | None = None
+) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ...
+
+
+def struct(
+    fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]]
+    | Mapping[str, Field[Any]],
+) -> StructType: ...
+
+
+def sparse_union(
+    child_fields: list[Field[Any]], type_codes: list[int] | None = None
+) -> SparseUnionType: ...
+
+
+def dense_union(
+    child_fields: list[Field[Any]], type_codes: list[int] | None = None
+) -> DenseUnionType: ...
+
+
+def union(
+    child_fields: list[Field[Any]], mode: Literal["sparse"] | Literal["dense"], type_codes: list[int] | None = None
+) -> SparseUnionType | DenseUnionType: ...
+
+
+def run_end_encoded(
+    run_end_type: _RunEndType, value_type: _BasicValueT
+) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ...
+
+
+def json_(storage_type: DataType = ...) -> JsonType: ...
+
+
+def uuid() -> UuidType: ...
+
+
+def fixed_shape_tensor(
+    value_type: _ValueT,
+    shape: Sequence[int],
+    dim_names: Sequence[str] | None = None,
+    permutation: Sequence[int] | None = None,
+) -> FixedShapeTensorType[_ValueT]: ...
+
+
+def bool8() -> Bool8Type: ...
+
+
+def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: ...
+
+
+def type_for_alias(name: Any) -> DataType: ...
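+
+
+# Illustrative only: type_for_alias resolves string aliases to types, e.g.
+# type_for_alias("i4") returns int32() and type_for_alias("timestamp[ms]")
+# returns timestamp("ms").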
+ + +def schema( + fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], + metadata: dict[bytes | str, bytes | str] | None = None, +) -> Schema: ... + + +def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: ... + + +__all__ = [ + "_Weakrefable", + "_Metadata", + "DataType", + "_BasicDataType", + "NullType", + "BoolType", + "UInt8Type", + "Int8Type", + "UInt16Type", + "Int16Type", + "Uint32Type", + "Int32Type", + "UInt64Type", + "Int64Type", + "Float16Type", + "Float32Type", + "Float64Type", + "Date32Type", + "Date64Type", + "MonthDayNanoIntervalType", + "StringType", + "LargeStringType", + "StringViewType", + "BinaryType", + "LargeBinaryType", + "BinaryViewType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "ListType", + "LargeListType", + "ListViewType", + "LargeListViewType", + "FixedSizeListType", + "DictionaryMemo", + "DictionaryType", + "MapType", + "StructType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "RunEndEncodedType", + "BaseExtensionType", + "ExtensionType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "KeyValueMetadata", + "Field", + "Schema", + "unify_schemas", + "field", + "null", + "bool_", + "uint8", + "int8", + "uint16", + "int16", + "uint32", + "int32", + "int64", + "uint64", + "timestamp", + "time32", + "time64", + "duration", + "month_day_nano_interval", + "date32", + "date64", + "float16", + "float32", + "float64", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "string", + "utf8", + "binary", + "large_binary", + "large_string", + "large_utf8", + "binary_view", + "string_view", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "dictionary", + "struct", + "sparse_union", + "dense_union", + "union", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "type_for_alias", + "schema", + "from_numpy_dtype", + "_Unit", + "_Tz", + "_Time32Unit", + "_Time64Unit", + "_DataTypeT", +] diff --git a/python/pyarrow-stubs/acero.pyi b/python/pyarrow-stubs/acero.pyi new file mode 100644 index 00000000000..b3bc83382fb --- /dev/null +++ b/python/pyarrow-stubs/acero.pyi @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Literal + +from . 
import lib +from .compute import Expression, FunctionOptions + +_StrOrExpr: TypeAlias = str | Expression + + +class Declaration(lib._Weakrefable): + def __init__( + self, + factory_name: str, + options: ExecNodeOptions, + inputs: list[Declaration] | None = None, + ) -> None: ... + @classmethod + def from_sequence(cls, decls: list[Declaration]) -> Self: ... + def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... + def to_table(self, use_threads: bool = True) -> lib.Table: ... + + +class ExecNodeOptions(lib._Weakrefable): + ... + + +class TableSourceNodeOptions(ExecNodeOptions): + def __init__(self, table: lib.Table) -> None: ... + + +class FilterNodeOptions(ExecNodeOptions): + def __init__(self, filter_expression: Expression) -> None: ... + + +class ProjectNodeOptions(ExecNodeOptions): + def __init__(self, expressions: list[Expression], + names: list[str] | None = None) -> None: ... + + +class AggregateNodeOptions(ExecNodeOptions): + def __init__( + self, + aggregates: list[tuple[list[str], str, FunctionOptions, str]], + keys: list[_StrOrExpr] | None = None, + ) -> None: ... + + +class OrderByNodeOptions(ExecNodeOptions): + def __init__( + self, + sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (), + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> None: ... + + +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | list[_StrOrExpr], + right_keys: _StrOrExpr | list[_StrOrExpr], + left_output: list[_StrOrExpr] | None = None, + right_output: list[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + ) -> None: ... + + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | list[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | list[_StrOrExpr], + tolerance: int, + ) -> None: ... diff --git a/python/pyarrow-stubs/array.pyi b/python/pyarrow-stubs/array.pyi new file mode 100644 index 00000000000..7aa67fc8955 --- /dev/null +++ b/python/pyarrow-stubs/array.pyi @@ -0,0 +1,860 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+import sys
+
+from collections.abc import Callable
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+from typing import (
+    Any,
+    Generic,
+    Iterable,
+    Iterator,
+    Literal,
+    TypeVar,
+)
+
+import numpy as np
+import pandas as pd
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pyarrow._compute import CastOptions  # type: ignore[import-not-found]
+from pyarrow._stubs_typing import (
+    ArrayLike,
+    Indices,
+    Mask,
+    Order,
+    SupportArrowArray,
+    SupportArrowDeviceArray,
+)
+from pyarrow.lib import (  # type: ignore[attr-defined]
+    Buffer,
+    Device,  # type: ignore[reportAttributeAccessIssue]
+    MemoryManager,  # type: ignore[reportAttributeAccessIssue]
+    MemoryPool,
+    Tensor,
+    _Weakrefable,
+)
+from typing_extensions import deprecated
+import builtins
+
+from .scalar import *
+from .device import DeviceAllocationType  # type: ignore[import-not-found]
+from ._types import (
+    BaseExtensionType,
+    BinaryType,
+    DataType,
+    Field,
+    Float64Type,
+    Int16Type,
+    Int32Type,
+    Int64Type,
+    MapType,
+    StringType,
+    StructType,
+    _AsPyType,
+    _BasicDataType,
+    _BasicValueT,
+    _DataTypeT,
+    _IndexT,
+    _RunEndType,
+    _Size,
+    _Time32Unit,
+    _Time64Unit,
+    _Tz,
+    _Unit,
+)
+from ._stubs_typing import NullableCollection
+
+
+def array(
+    values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray,
+    type: Any | None = None,
+    mask: Mask | None = None,
+    size: int | None = None,
+    from_pandas: bool | None = None,
+    safe: bool = True,
+    memory_pool: MemoryPool | None = None,
+) -> ArrayLike: ...
+
+
+def asarray(
+    values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray,
+    type: _DataTypeT | Any | None = None,
+) -> Array[Scalar[_DataTypeT]] | ArrayLike: ...
+
+
+def nulls(
+    size: int,
+    type: Any | None = None,
+    memory_pool: MemoryPool | None = None,
+) -> ArrayLike: ...
+
+
+def repeat(
+    value: Any,
+    size: int,
+    memory_pool: MemoryPool | None = None,
+) -> ArrayLike: ...
+
+
+def infer_type(values: Iterable[Any], mask: Mask | None = None,
+               from_pandas: bool = False) -> DataType: ...
+
+
+class ArrayStatistics(_Weakrefable):
+
+    @property
+    def null_count(self) -> int: ...
+
+    @property
+    def distinct_count(self) -> int: ...
+
+    @property
+    def min(self) -> Any: ...
+
+    @property
+    def is_min_exact(self) -> bool: ...
+
+    @property
+    def max(self) -> Any: ...
+
+    @property
+    def is_max_exact(self) -> bool: ...
+
+
+_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series)
+
+
+class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]):
+    def to_pandas(
+        self,
+        memory_pool: MemoryPool | None = None,
+        categories: list | None = None,
+        strings_to_categorical: bool = False,
+        zero_copy_only: bool = False,
+        integer_object_nulls: bool = False,
+        date_as_object: bool = True,
+        timestamp_as_object: bool = False,
+        use_threads: bool = True,
+        deduplicate_objects: bool = True,
+        ignore_metadata: bool = False,
+        safe: bool = True,
+        split_blocks: bool = False,
+        self_destruct: bool = False,
+        maps_as_pydicts: Literal["lossy", "strict"] | None = None,
+        types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None,
+        coerce_temporal_nanoseconds: bool = False,
+    ) -> _ConvertAs: ...
+
+
+_CastAs = TypeVar("_CastAs", bound=DataType)
+_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True)
+_ScalarT = TypeVar("_ScalarT", bound=Scalar)
+
+
+class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]):
+
+    def diff(self, other: Self) -> str: ...
+ + def cast( + self, + target_type: _CastAs, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: ... + + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... + + def sum(self, **kwargs) -> _Scalar_co: ... + + @property + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... + def unique(self) -> Self: ... + + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... + + def value_counts(self) -> StructArray: ... + + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + type: _DataTypeT | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: ... + + @staticmethod + def from_buffers( + type: _DataTypeT, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... + + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + + format = to_string + def equals(self, other: Self) -> bool: ... + + def __len__(self) -> int: ... + + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ... + + def is_nan(self) -> BooleanArray: ... + + def is_valid(self) -> BooleanArray: ... + + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... + + def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: ... + + def index( + self: Array[_ScalarT] | Array[Scalar[_BasicDataType[_AsPyType]]], + value: _ScalarT | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def to_numpy(self, zero_copy_only: bool = True, + writable: bool = False) -> np.ndarray: ... + + def to_pylist( + self: Array[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: ... + + tolist = to_pylist + def validate(self, *, full: bool = False) -> None: ... + + @property + def offset(self) -> int: ... + + def buffers(self) -> list[Buffer | None]: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... 
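+
+    # The __arrow_c_array__/__arrow_c_device_array__ methods here implement
+    # the Arrow PyCapsule interface, which is what the SupportArrowArray and
+    # SupportArrowDeviceArray protocols in _stubs_typing.pyi match against.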
+    def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ...
+
+    @classmethod
+    def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ...
+
+    def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ...
+
+    @classmethod
+    def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ...
+    def __dlpack__(self, stream: int | None = None) -> Any: ...
+
+    def __dlpack_device__(self) -> tuple[int, int]: ...
+
+    @property
+    def device_type(self) -> DeviceAllocationType: ...
+
+    @property
+    def is_cpu(self) -> bool: ...
+
+    @property
+    def statistics(self) -> ArrayStatistics | None: ...
+
+
+class NullArray(Array[NullScalar]):
+    ...
+
+
+class BooleanArray(Array[BooleanScalar]):
+
+    @property
+    def false_count(self) -> int: ...
+    @property
+    def true_count(self) -> int: ...
+
+
+class NumericArray(Array[_ScalarT]):
+    ...
+
+
+class IntegerArray(NumericArray[_ScalarT]):
+    ...
+
+
+class FloatingPointArray(NumericArray[_ScalarT]):
+    ...
+
+
+class Int8Array(IntegerArray[Int8Scalar]):
+    ...
+
+
+class UInt8Array(IntegerArray[UInt8Scalar]):
+    ...
+
+
+class Int16Array(IntegerArray[Int16Scalar]):
+    ...
+
+
+class UInt16Array(IntegerArray[UInt16Scalar]):
+    ...
+
+
+class Int32Array(IntegerArray[Int32Scalar]):
+    ...
+
+
+class UInt32Array(IntegerArray[UInt32Scalar]):
+    ...
+
+
+class Int64Array(IntegerArray[Int64Scalar]):
+    ...
+
+
+class UInt64Array(IntegerArray[UInt64Scalar]):
+    ...
+
+
+class Date32Array(NumericArray[Date32Scalar]):
+    ...
+
+
+class Date64Array(NumericArray[Date64Scalar]):
+    ...
+
+
+class TimestampArray(NumericArray[TimestampScalar[_Unit, _Tz]]):
+    ...
+
+
+class Time32Array(NumericArray[Time32Scalar[_Time32Unit]]):
+    ...
+
+
+class Time64Array(NumericArray[Time64Scalar[_Time64Unit]]):
+    ...
+
+
+class DurationArray(NumericArray[DurationScalar[_Unit]]):
+    ...
+
+
+class MonthDayNanoIntervalArray(Array[MonthDayNanoIntervalScalar]):
+    ...
+
+
+class HalfFloatArray(FloatingPointArray[HalfFloatScalar]):
+    ...
+
+
+class FloatArray(FloatingPointArray[FloatScalar]):
+    ...
+
+
+class DoubleArray(FloatingPointArray[DoubleScalar]):
+    ...
+
+
+class FixedSizeBinaryArray(Array[FixedSizeBinaryScalar]):
+    ...
+
+
+class Decimal32Array(FixedSizeBinaryArray):
+    ...
+
+
+class Decimal64Array(FixedSizeBinaryArray):
+    ...
+
+
+class Decimal128Array(FixedSizeBinaryArray):
+    ...
+
+
+class Decimal256Array(FixedSizeBinaryArray):
+    ...
+
+
+class BaseListArray(Array[_ScalarT]):
+    def flatten(self, recursive: bool = False) -> Array: ...
+
+    def value_parent_indices(self) -> Int64Array: ...
+
+    def value_lengths(self) -> Int32Array: ...
+
+
+class ListArray(BaseListArray[_ScalarT]):
+
+    @classmethod
+    def from_arrays(
+        cls,
+        offsets: Int32Array | list[int],
+        values: Array[Scalar[_DataTypeT]] | list[int] | list[float] | list[str] | list[bytes] | list,
+        *,
+        type: _DataTypeT | None = None,
+        pool: MemoryPool | None = None,
+        mask: Mask | None = None,
+    ) -> ListArray[ListScalar[_DataTypeT | Int64Type | Float64Type | StringType | BinaryType]] | ListArray: ...
+
+    @property
+    def values(self) -> Array: ...
+
+    @property
+    def offsets(self) -> Int32Array: ...
+
+
+class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]):
+
+    @classmethod
+    def from_arrays(
+        cls,
+        offsets: Int64Array,
+        values: Array[Scalar[_DataTypeT]] | Array,
+        *,
+        type: _DataTypeT | None = None,
+        pool: MemoryPool | None = None,
+        mask: Mask | None = None,
+    ) -> LargeListArray[_DataTypeT]: ...
+
+    @property
+    def values(self) -> Array: ...
+
+    @property
+    def offsets(self) -> Int64Array: ...
+
+
+class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]):
+
+    @classmethod
+    def from_arrays(
+        cls,
+        offsets: Int32Array,
+        values: Array[Scalar[_DataTypeT]] | Array,
+        *,
+        type: _DataTypeT | None = None,
+        pool: MemoryPool | None = None,
+        mask: Mask | None = None,
+    ) -> ListViewArray[_DataTypeT]: ...
+
+    @property
+    def values(self) -> Array: ...
+
+    @property
+    def offsets(self) -> Int32Array: ...
+
+    @property
+    def sizes(self) -> Int32Array: ...
+
+
+class LargeListViewArray(BaseListArray[LargeListViewScalar[_DataTypeT]]):
+
+    @classmethod
+    def from_arrays(
+        cls,
+        offsets: Int64Array,
+        values: Array[Scalar[_DataTypeT]] | Array,
+        *,
+        type: _DataTypeT | None = None,
+        pool: MemoryPool | None = None,
+        mask: Mask | None = None,
+    ) -> LargeListViewArray[_DataTypeT]: ...
+
+    @property
+    def values(self) -> Array: ...
+
+    @property
+    def offsets(self) -> Int64Array: ...
+
+    @property
+    def sizes(self) -> Int64Array: ...
+
+
+class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]):
+
+    @classmethod
+    def from_arrays(
+        cls,
+        values: Array[Scalar[_DataTypeT]],
+        list_size: _Size | None = None,
+        *,
+        type: None = None,
+        mask: Mask | None = None,
+    ) -> FixedSizeListArray[_DataTypeT, _Size | None]: ...
+
+    @property
+    def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: ...
+
+
+_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType)
+_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType)
+
+
+class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]):
+
+    @classmethod
+    def from_arrays(
+        cls,
+        offsets: Int64Array | list[int] | None,
+        keys: Array[Scalar[_MapKeyT]] | None = None,
+        items: Array[Scalar[_MapItemT]] | None = None,
+        values: Array | None = None,
+        *,
+        type: MapType[_MapKeyT, _MapItemT] | None = None,
+        pool: MemoryPool | None = None,
+        mask: Mask | None = None,
+    ) -> MapArray[_MapKeyT, _MapItemT]: ...
+
+    @property
+    def keys(self) -> Array: ...
+
+    @property
+    def items(self) -> Array: ...
+
+
+class UnionArray(Array[UnionScalar]):
+
+    @deprecated("Use field() instead")
+    def child(self, pos: int) -> Array: ...
+
+    def field(self, pos: int) -> Array: ...
+
+    @property
+    def type_codes(self) -> Int8Array: ...
+
+    @property
+    def offsets(self) -> Int32Array: ...
+
+    @staticmethod
+    def from_dense(
+        types: Int8Array,
+        value_offsets: Int32Array,
+        children: NullableCollection[Array],
+        field_names: list[str] | None = None,
+        type_codes: Int8Array | None = None,
+    ) -> UnionArray: ...
+
+    @staticmethod
+    def from_sparse(
+        types: Int8Array,
+        children: NullableCollection[Array],
+        field_names: list[str] | None = None,
+        type_codes: Int8Array | None = None,
+    ) -> UnionArray: ...
+
+
+class StringArray(Array[StringScalar]):
+
+    @staticmethod
+    def from_buffers(  # type: ignore[override]
+        length: int,
+        value_offsets: Buffer,
+        data: Buffer,
+        null_bitmap: Buffer | None = None,
+        null_count: int | None = -1,
+        offset: int | None = 0,
+    ) -> StringArray: ...
+
+
+class LargeStringArray(Array[LargeStringScalar]):
+
+    @staticmethod
+    def from_buffers(  # type: ignore[override]
+        length: int,
+        value_offsets: Buffer,
+        data: Buffer,
+        null_bitmap: Buffer | None = None,
+        null_count: int | None = -1,
+        offset: int | None = 0,
+    ) -> LargeStringArray: ...
+
+
+class StringViewArray(Array[StringViewScalar]):
+    ...
+
+
+class BinaryArray(Array[BinaryScalar]):
+
+    @property
+    def total_values_length(self) -> int: ...
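+
+
+# Illustrative only, not part of the stubs: offsets-based construction as
+# typed by the list-array classes above, e.g.
+#
+#   arr = ListArray.from_arrays(offsets=[0, 2, 3], values=[1, 2, 3])
+#   arr.offsets  # -> Int32Array
+#   arr.values   # -> Array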
+ + +class LargeBinaryArray(Array[LargeBinaryScalar]): + + @property + def total_values_length(self) -> int: ... + + +class BinaryViewArray(Array[BinaryViewScalar]): + ... + + +class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): + + def dictionary_encode(self) -> Self: ... # type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ... + + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + + @staticmethod + def from_buffers( # type: ignore[override] + type: _BasicValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _BasicValueT]: ... + + @staticmethod + def from_arrays( + indices: Indices, + dictionary: Array | np.ndarray | pd.Series, + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: ... + + +class StructArray(Array[StructScalar]): + + def field(self, index: int | str) -> Array: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ... + + @staticmethod + def from_arrays( + arrays: Iterable[Array], + names: list[str] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + type: StructType | None = None, + ) -> StructArray: ... + + def sort(self, order: Order = "ascending", by: str | + None = None, **kwargs) -> StructArray: ... + + +class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]): + + @staticmethod + def from_arrays( + run_ends: Int16Array | Int32Array | Int64Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[Int16Type | Int32Type | Int64Type, _BasicValueT]: ... # type: ignore[type-var] + + @staticmethod + def from_buffers( # type: ignore[override] + type: DataType, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... + + @property + def run_ends(self) -> Array[Scalar[_RunEndType]]: ... + + @property + def values(self) -> Array[Scalar[_BasicValueT]]: ... + + def find_physical_offset(self) -> int: ... + + def find_physical_length(self) -> int: ... + + +_ArrayT = TypeVar("_ArrayT", bound=Array) + + +class ExtensionArray(Array[ExtensionScalar], Generic[_ArrayT]): + + @property + def storage(self) -> Any: ... + + @staticmethod + def from_storage(typ: BaseExtensionType, + storage: _ArrayT) -> ExtensionArray[_ArrayT]: ... + + +class JsonArray(ExtensionArray[_ArrayT]): + ... + + +class UuidArray(ExtensionArray[_ArrayT]): + ... + + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + + def to_numpy_ndarray(self) -> np.ndarray: ... + + def to_tensor(self) -> Tensor: ... + + @classmethod + def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: ... + + +class OpaqueArray(ExtensionArray[_ArrayT]): + ... + + +class Bool8Array(ExtensionArray): + + def to_numpy(self, zero_copy_only: bool = ..., + writable: bool = ...) -> np.ndarray: ... + + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override] + + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: ... + + +def concat_arrays(arrays: Iterable[_ArrayT], + memory_pool: MemoryPool | None = None) -> _ArrayT: ... 
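+
+# Illustrative usage of concat_arrays (a sketch, not part of the stub API; the
+# example values are assumed):
+#
+#   import pyarrow as pa
+#   combined = pa.concat_arrays([pa.array([1, 2]), pa.array([3, 4])])
+#   # combined is an Int64Array of [1, 2, 3, 4]; arrays of differing
+#   # types raise ArrowInvalid, which is why the return type is _ArrayT.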
+ + +def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: ... + + +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", + "_CastAs", +] diff --git a/python/pyarrow-stubs/builder.pyi b/python/pyarrow-stubs/builder.pyi new file mode 100644 index 00000000000..c379bd83afb --- /dev/null +++ b/python/pyarrow-stubs/builder.pyi @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + + +class StringBuilder(_Weakrefable): + + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): ... + + def append_values(self, values: Iterable[str | bytes | None]): ... + + def finish(self) -> StringArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +class StringViewBuilder(_Weakrefable): + + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): ... + + def append_values(self, values: Iterable[str | bytes | None]): ... + + def finish(self) -> StringViewArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow-stubs/cffi.pyi b/python/pyarrow-stubs/cffi.pyi new file mode 100644 index 00000000000..e4f077d7155 --- /dev/null +++ b/python/pyarrow-stubs/cffi.pyi @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/python/pyarrow-stubs/compat.pyi b/python/pyarrow-stubs/compat.pyi new file mode 100644 index 00000000000..2ea013555c0 --- /dev/null +++ b/python/pyarrow-stubs/compat.pyi @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... + +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/pyarrow-stubs/compute.pyi b/python/pyarrow-stubs/compute.pyi new file mode 100644 index 00000000000..235e8ffc34d --- /dev/null +++ b/python/pyarrow-stubs/compute.pyi @@ -0,0 +1,1518 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# ruff: noqa: I001 +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +from collections.abc import Callable + +# Option classes +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions +from pyarrow._compute import FilterOptions as FilterOptions +from pyarrow._compute import Function as Function +from pyarrow._compute import FunctionOptions as FunctionOptions +from pyarrow._compute import FunctionRegistry as FunctionRegistry +from pyarrow._compute import HashAggregateFunction as HashAggregateFunction +from pyarrow._compute import HashAggregateKernel as HashAggregateKernel +from pyarrow._compute import IndexOptions as IndexOptions +from pyarrow._compute import JoinOptions as JoinOptions +from pyarrow._compute import Kernel as Kernel +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute import PairwiseOptions as PairwiseOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import PivotWiderOptions as PivotWiderOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions +from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction +from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import ScalarFunction as ScalarFunction +from pyarrow._compute import ScalarKernel as ScalarKernel +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import 
SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from pyarrow._compute import StructFieldOptions as StructFieldOptions +from pyarrow._compute import TakeOptions as TakeOptions +from pyarrow._compute import TDigestOptions as TDigestOptions +from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import UdfContext as UdfContext +from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import VarianceOptions as VarianceOptions +from pyarrow._compute import VectorFunction as VectorFunction +from pyarrow._compute import VectorKernel as VectorKernel +from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import WinsorizeOptions as WinsorizeOptions + +# Functions +from pyarrow._compute import call_function as call_function + +# Udf +from pyarrow._compute import call_tabular_function as call_tabular_function +from pyarrow._compute import function_registry as function_registry +from pyarrow._compute import get_function as get_function +from pyarrow._compute import list_functions as list_functions +from pyarrow._compute import register_aggregate_function as register_aggregate_function +from pyarrow._compute import register_scalar_function as register_scalar_function +from pyarrow._compute import register_tabular_function as register_tabular_function +from pyarrow._compute import register_vector_function as register_vector_function + +from pyarrow._compute import _Order, _Placement +from pyarrow._stubs_typing import ArrayLike, ScalarLike +from . import lib + +_P = ParamSpec("_P") +_R = TypeVar("_R") + +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: ... + + +def scalar(value: bool | float | str) -> Expression: ... + + +def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... 
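+
+# Note on the pattern used throughout this file: _clone_signature is a
+# typing-only helper. Aliases such as `any = _clone_signature(all)` re-export
+# a kernel under a second name while reusing the donor's annotated signature,
+# so type checkers see the same parameters for both. Illustrative (values
+# assumed):
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#   pc.any(pa.array([False, True]))  # checked like pc.all -> BooleanScalar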
+
+# ============= compute functions =============
+_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType)
+_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True)
+_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar)
+_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray)
+_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray)
+ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT]
+ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT
+
+SignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.Int8Type]
+    | lib.Scalar[lib.Int16Type]
+    | lib.Scalar[lib.Int32Type]
+    | lib.Scalar[lib.Int64Type]
+)
+UnsignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.UInt8Type]
+    | lib.Scalar[lib.UInt16Type]
+    | lib.Scalar[lib.UInt32Type]
+    | lib.Scalar[lib.UInt64Type]
+)
+IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar
+FloatScalar: TypeAlias = (
+    lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type]
+)
+DecimalScalar: TypeAlias = (
+    lib.Scalar[lib.Decimal32Type]
+    | lib.Scalar[lib.Decimal64Type]
+    | lib.Scalar[lib.Decimal128Type]
+    | lib.Scalar[lib.Decimal256Type]
+)
+NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar
+NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar
+BinaryScalar: TypeAlias = (
+    lib.Scalar[lib.BinaryType]
+    | lib.Scalar[lib.LargeBinaryType]
+    | lib.Scalar[lib.FixedSizeBinaryType]
+)
+StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType]
+StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar
+_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any]
+_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT]
+ListScalar: TypeAlias = (
+    lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT]
+)
+TemporalScalar: TypeAlias = (
+    lib.Date32Scalar
+    | lib.Date64Scalar
+    | lib.Time32Scalar[Any]
+    | lib.Time64Scalar[Any]
+    | lib.TimestampScalar[Any]
+    | lib.DurationScalar[Any]
+    | lib.MonthDayNanoIntervalScalar
+)
+NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar
+NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar
+
+_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar)
+_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar)
+NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT]
+_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray)
+_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar)
+NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar]
+_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray)
+NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT]
+_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray)
+BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar]
+_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray)
+IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar]
+_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar)
+FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar]
+_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray)
+_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar)
+StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar]
+_StringArrayT = TypeVar("_StringArrayT", bound=StringArray)
+_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar)
+BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar]
+_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray)
+_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar)
+StringOrBinaryArray: TypeAlias = StringArray | BinaryArray
+_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray)
+_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar)
+TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar]
+_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray)
+_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]]
+_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]]
+ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]]
+# =============================== 1. Aggregation ===============================
+
+# ========================= 1.1 functions =========================
+
+def all(
+    array: lib.BooleanScalar | BooleanArray,
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.BooleanScalar: ...
+
+
+any = _clone_signature(all)
+
+def approximate_median(
+    array: NumericScalar | NumericArray,
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleScalar: ...
+
+
+def count(
+    array: lib.Array | lib.ChunkedArray,
+    /,
+    mode: Literal["only_valid", "only_null", "all"] = "only_valid",
+    *,
+    options: CountOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+
+
+def count_distinct(
+    array: lib.Array | lib.ChunkedArray,
+    /,
+    mode: Literal["only_valid", "only_null", "all"] = "only_valid",
+    *,
+    options: CountOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+
+
+def first(
+    array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarT: ...
+
+
+def first_last(
+    array: lib.Array[Any] | lib.ChunkedArray[Any],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar: ...
+
+
+def index(
+    data: lib.Array[Any] | lib.ChunkedArray[Any],
+    value,
+    start: int | None = None,
+    end: int | None = None,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+
+
+last = _clone_signature(first)
+max = _clone_signature(first)
+min = _clone_signature(first)
+min_max = _clone_signature(first_last)
+
+def mean(
+    array: FloatScalar | FloatArray
+    | lib.NumericArray[lib.Scalar[Any]]
+    | lib.ChunkedArray[lib.Scalar[Any]]
+    | lib.Scalar[Any],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Scalar[Any]: ...
+
+
+def mode(
+    array: NumericScalar | NumericArray,
+    /,
+    n: int = 1,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 0,
+    options: ModeOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructArray: ...
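+
+# Illustrative (values assumed): mode returns a StructArray of
+# {"mode", "count"} pairs for the n most common values:
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#   pc.mode(pa.array([1, 1, 2, 3]))  # -> [{"mode": 1, "count": 2}]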
+ + +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... + + +def quantile( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def stddev( + array: NumericScalar | NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def sum( + array: _NumericScalarT | NumericArray[_NumericScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... + + +def tdigest( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def variance( + array: NumericScalar | NumericArray, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... + + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... + + +# ========================= 2. Element-wise (“scalar”) functions ========================= + +# ========================= 2.1 Arithmetic ========================= +def abs( + x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: ... + + +abs_checked = _clone_signature(abs) + +def add( + x: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + y: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... + + +add_checked = _clone_signature(add) + +def divide( + x: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + y: _NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... + + +divide_checked = _clone_signature(divide) + +def exp( + exponent: _FloatArrayT | ArrayOrChunkedArray[NonFloatNumericScalar] | _FloatScalarT | NonFloatNumericScalar | lib.DoubleScalar, + /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatArrayT | lib.DoubleArray | _FloatScalarT | lib.DoubleScalar | Expression: ... 
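+
+# Illustrative (values assumed): the arithmetic kernels in this section
+# broadcast scalars against arrays and preserve the input type:
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#   pc.add(pa.array([1, 2, 3]), 2)           # -> Int64Array [3, 4, 5]
+#   pc.add(pa.scalar(1.5), pa.scalar(2.5))   # -> DoubleScalar 4.0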
+
+
+multiply = _clone_signature(add)
+multiply_checked = _clone_signature(add)
+
+def negate(
+    x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> _NumericOrDurationT | _NumericOrDurationArrayT | Expression: ...
+
+
+negate_checked = _clone_signature(negate)
+
+def power(
+    base: _NumericScalarT | _NumericArrayT | Expression | NumericScalar,
+    exponent: _NumericScalarT | _NumericArrayT | Expression | NumericScalar,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+power_checked = _clone_signature(power)
+
+def sign(
+    x: NumericOrDurationArray | NumericOrDurationScalar | Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> (
+    lib.NumericArray[lib.Int8Scalar]
+    | lib.NumericArray[lib.FloatScalar]
+    | lib.NumericArray[lib.DoubleScalar]
+    | lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar | Expression
+): ...
+
+
+def sqrt(x: NumericArray | NumericScalar | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray | FloatScalar | Expression: ...
+
+
+sqrt_checked = _clone_signature(sqrt)
+
+subtract = _clone_signature(add)
+subtract_checked = _clone_signature(add)
+
+# ========================= 2.1 Bit-wise functions =========================
+def bit_wise_and(
+    x: _NumericScalarT | _NumericArrayT | NumericScalar | Expression | ArrayOrChunkedArray[NumericScalar],
+    y: _NumericScalarT | _NumericArrayT | NumericScalar | Expression | ArrayOrChunkedArray[NumericScalar],
+    /, *, memory_pool: lib.MemoryPool | None = None
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+def bit_wise_not(
+    x: _NumericScalarT | _NumericArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+bit_wise_or = _clone_signature(bit_wise_and)
+bit_wise_xor = _clone_signature(bit_wise_and)
+shift_left = _clone_signature(bit_wise_and)
+shift_left_checked = _clone_signature(bit_wise_and)
+shift_right = _clone_signature(bit_wise_and)
+shift_right_checked = _clone_signature(bit_wise_and)
+
+# ========================= 2.2 Rounding functions =========================
+def ceil(x: _FloatScalarT | _FloatArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT | _FloatArrayT | Expression: ...
+
+
+floor = _clone_signature(ceil)
+
+def round(
+    x: _NumericScalarT | _NumericArrayT | Expression,
+    /,
+    ndigits: int = 0,
+    round_mode: Literal[
+        "down",
+        "up",
+        "towards_zero",
+        "towards_infinity",
+        "half_down",
+        "half_up",
+        "half_towards_zero",
+        "half_towards_infinity",
+        "half_to_even",
+        "half_to_odd",
+    ] = "half_to_even",
+    *,
+    options: RoundOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+def round_to_multiple(
+    x: _NumericScalarT | _NumericArrayT | Expression,
+    /,
+    multiple: float = 1.0,
+    round_mode: Literal[
+        "down",
+        "up",
+        "towards_zero",
+        "towards_infinity",
+        "half_down",
+        "half_up",
+        "half_towards_zero",
+        "half_towards_infinity",
+        "half_to_even",
+        "half_to_odd",
+    ] = "half_to_even",
+    *,
+    options: RoundToMultipleOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+ + +def round_binary( + x: _NumericScalarT | _NumericArrayT | Expression, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | lib.NumericArray[_NumericScalarT] | _NumericArrayT | Expression: ... + + +trunc = _clone_signature(ceil) + +# ========================= 2.3 Logarithmic functions ========================= +def ln( + x: FloatScalar | FloatArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: ... + + +ln_checked = _clone_signature(ln) +log10 = _clone_signature(ln) +log10_checked = _clone_signature(ln) +log1p = _clone_signature(ln) +log1p_checked = _clone_signature(ln) +log2 = _clone_signature(ln) +log2_checked = _clone_signature(ln) + +def logb( + x: FloatScalar | FloatArray | Expression | Any, b: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression | Any: ... + + +logb_checked = _clone_signature(logb) + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +acos_checked = _clone_signature(ln) +asin = _clone_signature(ln) +asin_checked = _clone_signature(ln) +atan = _clone_signature(ln) +cos = _clone_signature(ln) +cos_checked = _clone_signature(ln) +sin = _clone_signature(ln) +sin_checked = _clone_signature(ln) +tan = _clone_signature(ln) +tan_checked = _clone_signature(ln) + +def atan2( + y: FloatScalar | FloatArray | Expression | Any, x: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] | Expression: ... + + +# ========================= 2.5 Comparisons functions ========================= +def equal( + x: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + y: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +greater = _clone_signature(equal) +greater_equal = _clone_signature(equal) +less = _clone_signature(equal) +less_equal = _clone_signature(equal) +not_equal = _clone_signature(equal) + +def max_element_wise( + *args: ScalarOrArray[_Scalar_CoT] | Expression, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _Scalar_CoT | Expression: ... + + +min_element_wise = _clone_signature(max_element_wise) + +# ========================= 2.6 Logical functions ========================= +def and_( + x: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + y: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar]: ... 
+ + +and_kleene = _clone_signature(and_) +and_not = _clone_signature(and_) +and_not_kleene = _clone_signature(and_) +or_ = _clone_signature(and_) +or_kleene = _clone_signature(and_) +xor = _clone_signature(and_) + +def invert( + x: lib.BooleanScalar | _BooleanArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | _BooleanArrayT | Expression: ... + + +# ========================= 2.10 String predicates ========================= +def ascii_is_alnum( + strings: StringScalar | StringArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +ascii_is_alpha = _clone_signature(ascii_is_alnum) +ascii_is_decimal = _clone_signature(ascii_is_alnum) +ascii_is_lower = _clone_signature(ascii_is_alnum) +ascii_is_printable = _clone_signature(ascii_is_alnum) +ascii_is_space = _clone_signature(ascii_is_alnum) +ascii_is_upper = _clone_signature(ascii_is_alnum) +utf8_is_alnum = _clone_signature(ascii_is_alnum) +utf8_is_alpha = _clone_signature(ascii_is_alnum) +utf8_is_decimal = _clone_signature(ascii_is_alnum) +utf8_is_digit = _clone_signature(ascii_is_alnum) +utf8_is_lower = _clone_signature(ascii_is_alnum) +utf8_is_numeric = _clone_signature(ascii_is_alnum) +utf8_is_printable = _clone_signature(ascii_is_alnum) +utf8_is_space = _clone_signature(ascii_is_alnum) +utf8_is_upper = _clone_signature(ascii_is_alnum) +ascii_is_title = _clone_signature(ascii_is_alnum) +utf8_is_title = _clone_signature(ascii_is_alnum) +string_is_ascii = _clone_signature(ascii_is_alnum) + +# ========================= 2.11 String transforms ========================= +def ascii_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_lower = _clone_signature(ascii_capitalize) +ascii_reverse = _clone_signature(ascii_capitalize) +ascii_swapcase = _clone_signature(ascii_capitalize) +ascii_title = _clone_signature(ascii_capitalize) +ascii_upper = _clone_signature(ascii_capitalize) + +def binary_length( + strings: lib.BinaryScalar | lib.StringScalar | lib.LargeBinaryScalar | lib.LargeStringScalar + | lib.BinaryArray | lib.StringArray + | lib.ChunkedArray[lib.BinaryScalar] | lib.ChunkedArray[lib.StringScalar] + | lib.LargeBinaryArray | lib.LargeStringArray + | lib.ChunkedArray[lib.LargeBinaryScalar] | lib.ChunkedArray[lib.LargeStringScalar] + | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: ... + + +def binary_repeat( + strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | lib.Array[_StringOrBinaryScalarT] | _StringOrBinaryArrayT | Expression: ... + + +def binary_replace_slice( + strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... + + +def binary_reverse( + strings: _BinaryScalarT | _BinaryArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... 
+ + +def replace_substring( + strings: _StringScalarT | _StringArrayT | Expression, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +replace_substring_regex = _clone_signature(replace_substring) + +def utf8_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: ... + + +def utf8_length( + strings: lib.StringScalar | lib.LargeStringScalar | lib.StringArray | lib.ChunkedArray[lib.StringScalar] + | lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: ... + + +utf8_lower = _clone_signature(utf8_capitalize) + +def utf8_replace_slice( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +utf8_reverse = _clone_signature(utf8_capitalize) +utf8_swapcase = _clone_signature(utf8_capitalize) +utf8_title = _clone_signature(utf8_capitalize) +utf8_upper = _clone_signature(utf8_capitalize) + +# ========================= 2.12 String padding ========================= +def ascii_center( + strings: _StringScalarT | _StringArrayT | Expression, + /, + width: int, + padding: str = " ", + lean_left_on_odd_padding: bool = True, + *, + options: PadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_lpad = _clone_signature(ascii_center) +ascii_rpad = _clone_signature(ascii_center) +utf8_center = _clone_signature(ascii_center) +utf8_lpad = _clone_signature(ascii_center) +utf8_rpad = _clone_signature(ascii_center) + +# ========================= 2.13 String trimming ========================= +def ascii_ltrim( + strings: _StringScalarT | _StringArrayT | Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_rtrim = _clone_signature(ascii_ltrim) +ascii_trim = _clone_signature(ascii_ltrim) +utf8_ltrim = _clone_signature(ascii_ltrim) +utf8_rtrim = _clone_signature(ascii_ltrim) +utf8_trim = _clone_signature(ascii_ltrim) + +def ascii_ltrim_whitespace( + strings: _StringScalarT | _StringArrayT | Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... 
+ + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) + +# ========================= 2.14 String splitting ========================= +def ascii_split_whitespace( + strings: _StringScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: ... + + +def split_pattern( + strings: _StringOrBinaryScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringOrBinaryScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] | Expression: ... + + +split_pattern_regex = _clone_signature(split_pattern) +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) + +# ========================= 2.15 String component extraction ========================= +def extract_regex( + strings: StringOrBinaryScalar | StringOrBinaryArray | Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: ... + + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: ... + + +def binary_join_element_wise( + *strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... + + +# ========================= 2.17 String Slicing ========================= +def binary_slice( + strings: _BinaryScalarT | _BinaryArrayT | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... + + +def utf8_slice_codeunits( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +# ========================= 2.18 Containment tests ========================= +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar | lib.LargeStringScalar | lib.LargeBinaryScalar + | lib.StringArray | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] | lib.ChunkedArray[lib.BinaryScalar] + | lib.LargeStringArray | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] | lib.ChunkedArray[lib.LargeBinaryScalar] + | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array | Expression: ... 
+ + +count_substring_regex = _clone_signature(count_substring) + +def ends_with( + strings: StringScalar | BinaryScalar | StringArray | BinaryArray | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +find_substring = _clone_signature(count_substring) +find_substring_regex = _clone_signature(count_substring) + +def index_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar | lib.Int32Array | Expression: ... + + +def is_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray: ... + + +match_like = _clone_signature(ends_with) +match_substring = _clone_signature(ends_with) +match_substring_regex = _clone_signature(ends_with) +starts_with = _clone_signature(ends_with) + +# ========================= 2.19 Categorizations ========================= +def is_finite( + values: NumericScalar | lib.NullScalar | NumericArray | lib.NullArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +is_inf = _clone_signature(is_finite) +is_nan = _clone_signature(is_finite) + +def is_null( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +def is_valid( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +true_unless_null = _clone_signature(is_valid) + +# ========================= 2.20 Selecting / multiplexing ========================= +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): ... + + +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): ... + + +def coalesce( + *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT: ... + + +fill_null = coalesce + +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: ... + + +# ========================= 2.21 Structural transforms ========================= + +def list_value_length( + lists: _ListArray[Any] | _LargeListArray[Any] | ListArray[Any] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array | Expression: ... + + +def make_struct( + *args: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: ... 
+ + +# ========================= 2.22 Conversions ========================= +def ceil_temporal( + timestamps: _TemporalScalarT | _TemporalArrayT | Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT | _TemporalArrayT | Expression: ... + + +floor_temporal = _clone_signature(ceil_temporal) +round_temporal = _clone_signature(ceil_temporal) + +def cast( + arr: lib.Scalar | lib.Array | lib.ChunkedArray, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[_DataTypeT] | lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... + + +def strftime( + timestamps: TemporalScalar | TemporalArray | Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar | lib.StringArray | Expression: ... + + +def strptime( + strings: StringScalar | StringArray | Expression, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... + + +# ========================= 2.23 Temporal component extraction ========================= +def day( + values: TemporalScalar | TemporalArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +def day_of_week( + values: TemporalScalar | TemporalArray | Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +day_of_year = _clone_signature(day) + +def hour( + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any] + | lib.TimestampArray[Any] | lib.Time32Array[Any] | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +def is_dst( + values: lib.TimestampScalar | lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +def iso_week( + values: lib.TimestampScalar | lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... 
+ + +iso_year = _clone_signature(iso_week) + +def is_leap_year( + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar | lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +microsecond = _clone_signature(iso_week) +millisecond = _clone_signature(iso_week) +minute = _clone_signature(iso_week) +month = _clone_signature(day_of_week) +nanosecond = _clone_signature(hour) +quarter = _clone_signature(day_of_week) +second = _clone_signature(hour) +subsecond = _clone_signature(hour) +us_week = _clone_signature(iso_week) +us_year = _clone_signature(iso_week) +year = _clone_signature(iso_week) + +def week( + values: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +def year_month_day( + values: TemporalScalar | TemporalArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructScalar | lib.StructArray | Expression: ... + + +# ========================= 2.24 Temporal difference ========================= +def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ... + + +def days_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array: ... + + +hours_between = _clone_signature(days_between) +microseconds_between = _clone_signature(days_between) +milliseconds_between = _clone_signature(days_between) +minutes_between = _clone_signature(days_between) + +def month_day_nano_interval_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: ... + + +def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ... + + +nanoseconds_between = _clone_signature(days_between) +quarters_between = _clone_signature(days_between) +seconds_between = _clone_signature(days_between) + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: ... + + +years_between = _clone_signature(days_between) + +# ========================= 2.25 Timezone handling ========================= +def assume_timezone( + timestamps: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression: ... + + +def local_timestamp( + timestamps: lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... 
+ + +# ========================= 2.26 Random number generation ========================= +def random( + n: int, + *, + initializer: Literal["system"] | int = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= +def cumulative_sum( + values: _NumericArrayT | Expression, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT | Expression: ... + + +cumulative_sum_checked = _clone_signature(cumulative_sum) +cumulative_prod = _clone_signature(cumulative_sum) +cumulative_prod_checked = _clone_signature(cumulative_sum) +cumulative_max = _clone_signature(cumulative_sum) +cumulative_min = _clone_signature(cumulative_sum) +cumulative_mean = _clone_signature(cumulative_sum) +# ========================= 3.2 Associative transforms ========================= + +def dictionary_encode( + array: _ScalarOrArrayT | Expression, + /, + null_encoding: Literal["mask", "encode"] = "mask", + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... +def unique(array: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT | Expression: ... +def value_counts( + array: lib.Array | lib.ChunkedArray | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray | Expression: ... + +# ========================= 3.3 Selections ========================= +@overload +def array_filter( + array: _ArrayT, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +@overload +def array_filter( + array: Expression, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def array_take( + array: _ArrayT, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +@overload +def array_take( + array: Expression, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... +@overload +def drop_null( + input: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... 
+ +filter = array_filter +take = array_take + +# ========================= 3.4 Containment tests ========================= +def indices_nonzero( + values: lib.BooleanArray + | lib.NullArray + | NumericArray + | lib.Decimal128Array + | lib.Decimal256Array | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +# ========================= 3.5 Sorts and partitions ========================= +def array_sort_indices( + array: lib.Array | lib.ChunkedArray | Expression, + /, + order: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def partition_nth_indices( + array: lib.Array | lib.ChunkedArray | Expression, + /, + pivot: int, + *, + null_placement: _Placement = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def rank( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + options: RankOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... + + +def select_k_unstable( + input: lib.Array | lib.ChunkedArray | Expression, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def sort_indices( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +# ========================= 3.6 Structural transforms ========================= +def list_element( + lists: lib.Array[ListScalar[_DataTypeT]] | lib.ChunkedArray[ListScalar[_DataTypeT]] | ListScalar[_DataTypeT] | Expression, + index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] | _DataTypeT | Expression: ... + + +def list_flatten( + lists: ArrayOrChunkedArray[ListScalar[Any]] | Expression, + /, + recursive: bool = False, + *, + options: ListFlattenOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any] | Expression: ... + + +def list_parent_indices( + lists: ArrayOrChunkedArray[Any] | Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array | Expression: ... + + +def list_slice( + lists: ArrayOrChunkedArray[Any] | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any] | Expression: ... + + +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... + + +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... + + +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... 
+ + +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... + + +def replace_with_mask( + values, + mask: list[bool] | list[bool | None] | BooleanArray, + replacements, + /, + *, + memory_pool: lib.MemoryPool | None = None, +): ... + + +# ========================= 3.7 Pairwise functions ========================= +def pairwise_diff( + input: _NumericOrTemporalArrayT | Expression, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT | Expression: ... + + +pairwise_diff_checked = _clone_signature(pairwise_diff) diff --git a/python/pyarrow-stubs/config.pyi b/python/pyarrow-stubs/config.pyi new file mode 100644 index 00000000000..62555a506f3 --- /dev/null +++ b/python/pyarrow-stubs/config.pyi @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import NamedTuple + + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + + +class BuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + + +cpp_build_info: BuildInfo +cpp_version: str +cpp_version_info: VersionInfo + + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + + +__all__ = [ + "VersionInfo", + "BuildInfo", + "RuntimeInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/pyarrow-stubs/csv.pyi b/python/pyarrow-stubs/csv.pyi new file mode 100644 index 00000000000..a7abd413aab --- /dev/null +++ b/python/pyarrow-stubs/csv.pyi @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/pyarrow-stubs/cuda.pyi b/python/pyarrow-stubs/cuda.pyi new file mode 100644 index 00000000000..0394965bb73 --- /dev/null +++ b/python/pyarrow-stubs/cuda.pyi @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] diff --git a/python/pyarrow-stubs/dataset.pyi b/python/pyarrow-stubs/dataset.pyi new file mode 100644 index 00000000000..160ed19ee4b --- /dev/null +++ b/python/pyarrow-stubs/dataset.pyi @@ -0,0 +1,272 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload
+
+from _typeshed import StrPath
+from pyarrow._dataset import (
+    CsvFileFormat,
+    CsvFragmentScanOptions,
+    Dataset,
+    DatasetFactory,
+    DirectoryPartitioning,
+    FeatherFileFormat,
+    FileFormat,
+    FileFragment,
+    FilenamePartitioning,
+    FileSystemDataset,
+    FileSystemDatasetFactory,
+    FileSystemFactoryOptions,
+    FileWriteOptions,
+    Fragment,
+    FragmentScanOptions,
+    HivePartitioning,
+    InMemoryDataset,
+    IpcFileFormat,
+    IpcFileWriteOptions,
+    JsonFileFormat,
+    JsonFragmentScanOptions,
+    Partitioning,
+    PartitioningFactory,
+    Scanner,
+    TaggedRecordBatch,
+    UnionDataset,
+    UnionDatasetFactory,
+    WrittenFile,
+    get_partition_keys,
+)
+from pyarrow._dataset_orc import OrcFileFormat
+from pyarrow._dataset_parquet import (
+    ParquetDatasetFactory,
+    ParquetFactoryOptions,
+    ParquetFileFormat,
+    ParquetFileFragment,
+    ParquetFileWriteOptions,
+    ParquetFragmentScanOptions,
+    ParquetReadOptions,
+    RowGroupInfo,
+)
+from pyarrow._dataset_parquet_encryption import (
+    ParquetDecryptionConfig,
+    ParquetEncryptionConfig,
+)
+from pyarrow.compute import Expression, field, scalar
+from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table
+
+from ._fs import SupportedFileSystem
+
+_orc_available: bool
+_parquet_available: bool
+
+__all__ = [
+    "CsvFileFormat",
+    "CsvFragmentScanOptions",
+    "Dataset",
+    "DatasetFactory",
+    "DirectoryPartitioning",
+    "FeatherFileFormat",
+    "FileFormat",
+    "FileFragment",
+    "FilenamePartitioning",
+    "FileSystemDataset",
+    "FileSystemDatasetFactory",
+    "FileSystemFactoryOptions",
+    "FileWriteOptions",
+    "Fragment",
+    "FragmentScanOptions",
+    "HivePartitioning",
+    "InMemoryDataset",
+    "IpcFileFormat",
+    "IpcFileWriteOptions",
+    "JsonFileFormat",
+    "JsonFragmentScanOptions",
+    "Partitioning",
+    "PartitioningFactory",
+    "Scanner",
+    "TaggedRecordBatch",
+    "UnionDataset",
+    "UnionDatasetFactory",
+    "WrittenFile",
+    "get_partition_keys",
+    # Orc
+    "OrcFileFormat",
+    # Parquet
+    "ParquetDatasetFactory",
+    "ParquetFactoryOptions",
+    "ParquetFileFormat",
+    "ParquetFileFragment",
+    "ParquetFileWriteOptions",
+    "ParquetFragmentScanOptions",
+    "ParquetReadOptions",
+    "RowGroupInfo",
+    # Parquet Encryption
+    "ParquetDecryptionConfig",
+    "ParquetEncryptionConfig",
+    # Compute
+    "Expression",
+    "field",
+    "scalar",
+    # Dataset
+    "dataset",
+    "partitioning",
+    "parquet_dataset",
+    "write_dataset",
+]
+
+_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"]
+
+
+@overload
+def partitioning(
+    schema: Schema,
+) -> Partitioning: ...
+
+
+@overload
+def partitioning(
+    schema: Schema,
+    *,
+    flavor: Literal["filename"],
+    dictionaries: dict[str, Array] | None = None,
+) -> Partitioning: ...
+
+
+@overload
+def partitioning(
+    schema: Schema,
+    *,
+    flavor: Literal["filename"],
+    dictionaries: Literal["infer"],
+) -> PartitioningFactory: ...
+
+
+@overload
+def partitioning(
+    field_names: list[str],
+    *,
+    flavor: Literal["filename"],
+) -> PartitioningFactory: ...
+
+
+@overload
+def partitioning(
+    schema: Schema,
+    *,
+    flavor: Literal["hive"],
+    dictionaries: Literal["infer"],
+) -> PartitioningFactory: ...
+
+
+@overload
+def partitioning(
+    *,
+    flavor: Literal["hive"],
+) -> PartitioningFactory: ...
+
+
+@overload
+def partitioning(
+    schema: Schema,
+    *,
+    flavor: Literal["hive"],
+    dictionaries: dict[str, Array] | None = None,
+) -> Partitioning: ...
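The overloads above encode the rule that a concrete Partitioning is returned
when the schema and dictionaries are fully specified, while a
PartitioningFactory is returned when something must be inferred. A sketch of
the common calls:

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Directory layout like /2024/11/... -> concrete Partitioning
    part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())]))

    # Hive layout like /year=2024/... with a fixed schema
    hive = ds.partitioning(pa.schema([("year", pa.int16())]), flavor="hive")

    # Hive layout with keys and types inferred -> PartitioningFactory
    factory = ds.partitioning(flavor="hive")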
+ + +def parquet_dataset( + metadata_path: StrPath, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... + + +@overload +def dataset( + source: StrPath | Sequence[StrPath], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> FileSystemDataset: ... + + +@overload +def dataset( + source: list[Dataset], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> UnionDataset: ... + + +@overload +def dataset( + source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... + + +@overload +def dataset( + source: RecordBatch | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... + + +def write_dataset( + data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], + base_dir: StrPath, + *, + basename_template: str | None = None, + format: FileFormat | _DatasetFormat | None = None, + partitioning: Partitioning | list[str] | None = None, + partitioning_flavor: str | None = None, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + file_options: FileWriteOptions | None = None, + use_threads: bool = True, + max_partitions: int = 1024, + max_open_files: int = 1024, + max_rows_per_file: int = 0, + min_rows_per_group: int = 0, + max_rows_per_group: int = 1024 * 1024, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["error", + "overwrite_or_ignore", "delete_matching"] = "error", + create_dir: bool = True, +): ... diff --git a/python/pyarrow-stubs/device.pyi b/python/pyarrow-stubs/device.pyi new file mode 100644 index 00000000000..d77fe2504af --- /dev/null +++ b/python/pyarrow-stubs/device.pyi @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from pyarrow.lib import _Weakrefable + + +class DeviceAllocationType(enum.Flag): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + + +class Device(_Weakrefable): + + @property + def type_name(self) -> str: ... + + @property + def device_id(self) -> int: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + +class MemoryManager(_Weakrefable): + + @property + def device(self) -> Device: ... + + @property + def is_cpu(self) -> bool: ... + + +def default_cpu_memory_manager() -> MemoryManager: ... + + +__all__ = ["DeviceAllocationType", "Device", + "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow-stubs/error.pyi b/python/pyarrow-stubs/error.pyi new file mode 100644 index 00000000000..c1e1a04ee40 --- /dev/null +++ b/python/pyarrow-stubs/error.pyi @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +class ArrowException(Exception): ... +class ArrowInvalid(ValueError, ArrowException): ... +class ArrowMemoryError(MemoryError, ArrowException): ... +class ArrowKeyError(KeyError, ArrowException): ... +class ArrowTypeError(TypeError, ArrowException): ... +class ArrowNotImplementedError(NotImplementedError, ArrowException): ... +class ArrowCapacityError(ArrowException): ... +class ArrowIndexError(IndexError, ArrowException): ... +class ArrowSerializationError(ArrowException): ... + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + +ArrowIOError = IOError + +class StopToken: ... + +def enable_signal_handlers(enable: bool) -> None: ... + +have_signal_refcycle: bool + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... 
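Because each Arrow exception above also subclasses the matching built-in,
callers may catch either form; a small sketch:

    import pyarrow as pa

    try:
        pa.array(["not a number"]).cast(pa.int32())
    except pa.ArrowInvalid as exc:  # a ValueError subclass, per the hierarchy above
        print(f"cast failed: {exc}")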
+ +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/python/pyarrow-stubs/feather.pyi b/python/pyarrow-stubs/feather.pyi new file mode 100644 index 00000000000..10281e91152 --- /dev/null +++ b/python/pyarrow-stubs/feather.pyi @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal + +import pandas as pd + +from _typeshed import StrPath +from pyarrow._feather import FeatherError +from pyarrow.lib import Table + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] + + +class FeatherDataset: + path_or_paths: str | list[str] + validate_schema: bool + + def __init__(self, path_or_paths: str | + list[str], validate_schema: bool = True) -> None: ... + + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... + + def read_pandas( + self, columns: list[str] | None = None, use_threads: bool = True + ) -> pd.DataFrame: ... + + +def check_chunked_overflow(name: str, col) -> None: ... + + +def write_feather( + df: pd.DataFrame | Table, + dest: StrPath | IO, + compression: Literal["zstd", "lz4", "uncompressed"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +) -> None: ... + + +def read_feather( + source: StrPath | IO, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, +) -> pd.DataFrame: ... + + +def read_table( + source: StrPath | IO, + columns: list[str] | None = None, + memory_map: bool = False, + use_threads: bool = True, +) -> Table: ... diff --git a/python/pyarrow-stubs/flight.pyi b/python/pyarrow-stubs/flight.pyi new file mode 100644 index 00000000000..dcc6ee2244b --- /dev/null +++ b/python/pyarrow-stubs/flight.pyi @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/python/pyarrow-stubs/fs.pyi b/python/pyarrow-stubs/fs.pyi new file mode 100644 index 00000000000..61a557ea428 --- /dev/null +++ b/python/pyarrow-stubs/fs.pyi @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from pyarrow._fs import ( # noqa + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + SupportedFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( # noqa + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) + +FileStats = FileInfo + + +def copy_files( + source: str, + destination: str, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, + *, + chunk_size: int = 1024 * 1024, + use_threads: bool = True, +) -> None: ... + + +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... + + +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/python/pyarrow-stubs/gandiva.pyi b/python/pyarrow-stubs/gandiva.pyi new file mode 100644 index 00000000000..bc07e15c4a6 --- /dev/null +++ b/python/pyarrow-stubs/gandiva.pyi @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Iterable, Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... 
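A projection sketch tying these classes together (TreeExprBuilder and
make_projector are declared just below). This assumes a Gandiva-enabled
pyarrow build, which not every binary distribution ships:

    import pyarrow as pa
    import pyarrow.gandiva as gandiva

    schema = pa.schema([("x", pa.float64())])
    builder = gandiva.TreeExprBuilder()
    node = builder.make_function(
        "multiply",
        [builder.make_field(schema.field("x")), builder.make_literal(2.0, pa.float64())],
        pa.float64(),
    )
    expr = builder.make_expression(node, pa.field("y", pa.float64()))
    projector = gandiva.make_projector(schema, [expr], pa.default_memory_pool())

    batch = pa.record_batch([pa.array([1.0, 3.5])], schema=schema)
    (result,) = projector.evaluate(batch)  # DoubleArray: [2.0, 7.0]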
+ +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... + def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... + def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... + def make_field(self, field: Field) -> Node: ... + def make_if( + self, condition: Node, this_node: Node, else_node: Node, return_type: DataType + ) -> Node: ... + def make_and(self, children: list[Node]) -> Node: ... + def make_or(self, children: list[Node]) -> Node: ... + def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... + def make_condition(self, condition: Node) -> Condition: ... + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + +def make_projector( + schema: Schema, + children: list[Expression], + pool: MemoryPool, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... +def make_filter( + schema: Schema, condition: Condition, configuration: Configuration | None = None +) -> Filter: ... + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + +def get_registered_function_signatures() -> list[FunctionSignature]: ... diff --git a/python/pyarrow-stubs/interchange/__init__.pyi b/python/pyarrow-stubs/interchange/__init__.pyi new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/pyarrow-stubs/interchange/__init__.pyi @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow-stubs/interchange/buffer.pyi b/python/pyarrow-stubs/interchange/buffer.pyi new file mode 100644 index 00000000000..e1d8ae949c9 --- /dev/null +++ b/python/pyarrow-stubs/interchange/buffer.pyi @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import enum + +from pyarrow.lib import Buffer + + +class DlpackDeviceType(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class _PyArrowBuffer: + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: ... + @property + def ptr(self) -> int: ... + def __dlpack__(self): ... + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: ... diff --git a/python/pyarrow-stubs/interchange/column.pyi b/python/pyarrow-stubs/interchange/column.pyi new file mode 100644 index 00000000000..04861a72b0b --- /dev/null +++ b/python/pyarrow-stubs/interchange/column.pyi @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from typing import Any, Iterable, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + + +class ColumnNullType(enum.IntEnum): + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + ... + + +class _PyArrowColumn: + def __init__(self, column: Array | ChunkedArray, + allow_copy: bool = True) -> None: ... + + def size(self) -> int: ... + @property + def offset(self) -> int: ... + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: ... + @property + def describe_categorical(self) -> CategoricalDescription: ... + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: ... + @property + def null_count(self) -> int: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_chunks(self) -> int: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: ... + def get_buffers(self) -> ColumnBuffers: ... diff --git a/python/pyarrow-stubs/interchange/dataframe.pyi b/python/pyarrow-stubs/interchange/dataframe.pyi new file mode 100644 index 00000000000..cafbe0fc200 --- /dev/null +++ b/python/pyarrow-stubs/interchange/dataframe.pyi @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Any, Iterable, Sequence + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + + +class _PyArrowDataFrame: + def __init__( + self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: ... + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def num_chunks(self) -> int: ... + def column_names(self) -> Iterable[str]: ... + def get_column(self, i: int) -> _PyArrowColumn: ... + def get_column_by_name(self, name: str) -> _PyArrowColumn: ... + def get_columns(self) -> Iterable[_PyArrowColumn]: ... + def select_columns(self, indices: Sequence[int]) -> Self: ... + def select_columns_by_name(self, names: Sequence[str]) -> Self: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: ... diff --git a/python/pyarrow-stubs/interchange/from_dataframe.pyi b/python/pyarrow-stubs/interchange/from_dataframe.pyi new file mode 100644 index 00000000000..e7f1c6e91ff --- /dev/null +++ b/python/pyarrow-stubs/interchange/from_dataframe.pyi @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, + allow_copy: bool = True) -> Any: ... + + +ColumnObject: TypeAlias = Any + + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... + + +def protocol_df_chunk_to_pyarrow( + df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... + + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... + + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... 
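The public entry point is from_dataframe, which accepts any object exposing
__dataframe__; a minimal sketch, assuming pandas is installed:

    import pandas as pd
    import pyarrow.interchange as pi

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    table = pi.from_dataframe(df)  # a pyarrow.Table built via the protocol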
+ + +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: ... + + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ... + + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ... + + +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: ... + + +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... + + +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... diff --git a/python/pyarrow-stubs/io.pyi b/python/pyarrow-stubs/io.pyi new file mode 100644 index 00000000000..ea259f02142 --- /dev/null +++ b/python/pyarrow-stubs/io.pyi @@ -0,0 +1,428 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +from collections.abc import Callable +from io import IOBase + +from _typeshed import StrPath + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Any, Literal, SupportsIndex +import builtins + +from pyarrow._stubs_typing import Compression, SupportPyBuffer +from pyarrow.lib import MemoryPool, _Weakrefable + +from .device import Device, DeviceAllocationType, MemoryManager +from ._types import KeyValueMetadata + +def have_libhdfs() -> bool: ... + + +def io_thread_count() -> int: ... + + +def set_io_thread_count(count: int) -> None: ... + + +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + +class NativeFile(_Weakrefable): + + + _default_chunk_size: int + + def __enter__(self) -> Self: ... + def __exit__(self, *args) -> None: ... + @property + def mode(self) -> Mode: ... + + def readable(self) -> bool: ... + def seekable(self) -> bool: ... + def isatty(self) -> bool: ... + def fileno(self) -> int: ... + + @property + def closed(self) -> bool: ... + def close(self) -> None: ... + def size(self) -> int: ... + + def metadata(self) -> KeyValueMetadata: ... + + def tell(self) -> int: ... + + def seek(self, position: int, whence: int = 0) -> int: ... + + def flush(self) -> None: ... + + def write(self, data: bytes | SupportPyBuffer) -> int: ... + + def read(self, nbytes: int | None = None) -> bytes: ... + + def get_stream(self, file_offset: int, nbytes: int) -> Self: ... + + def read_at(self, nbytes: int, offset: int) -> bytes: ... 
+
+    def read1(self, nbytes: int | None = None) -> bytes: ...
+
+    def readall(self) -> bytes: ...
+    def readinto(self, b: SupportPyBuffer) -> int: ...
+
+
+    def readline(self, size: int | None = None) -> bytes: ...
+
+    def readlines(self, hint: int | None = None) -> list[bytes]: ...
+
+    def __iter__(self) -> Self: ...
+
+    def __next__(self) -> bytes: ...
+    def read_buffer(self, nbytes: int | None = None) -> Buffer: ...
+
+    def truncate(self) -> None: ...
+
+    def writelines(self, lines: list[bytes]) -> None: ...
+
+    def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: ...
+
+    def upload(self, stream: IOBase, buffer_size: int | None = None) -> None: ...
+
+
+    def writable(self) -> bool: ...
+
+# ----------------------------------------------------------------------
+# Python file-like objects
+
+class PythonFile(NativeFile):
+
+    def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ...
+    def truncate(self, pos: int | None = None) -> None: ...
+
+
+class MemoryMappedFile(NativeFile):
+
+    @classmethod
+    def create(cls, path: str, size: int) -> Self: ...
+
+    def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ...
+    def resize(self, new_size: int) -> None: ...
+
+
+def memory_map(
+    path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"
+) -> MemoryMappedFile: ...
+
+
+create_memory_map = MemoryMappedFile.create
+
+class OSFile(NativeFile):
+
+    def __init__(
+        self,
+        path: str,
+        mode: Literal["r", "rb", "w", "wb", "a", "ab"],
+        memory_pool: MemoryPool | None = None,
+    ) -> None: ...
+
+class FixedSizeBufferWriter(NativeFile):
+
+    def __init__(self, buffer: Buffer) -> None: ...
+    def set_memcopy_threads(self, num_threads: int) -> None: ...
+
+    def set_memcopy_blocksize(self, blocksize: int) -> None: ...
+
+    def set_memcopy_threshold(self, threshold: int) -> None: ...
+
+
+# ----------------------------------------------------------------------
+# Arrow buffers
+
+class Buffer(_Weakrefable):
+
+    def __len__(self) -> int: ...
+
+    def _assert_cpu(self) -> None: ...
+    @property
+    def size(self) -> int: ...
+
+    @property
+    def address(self) -> int: ...
+
+    def hex(self) -> bytes: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+
+    @property
+    def is_cpu(self) -> bool: ...
+
+    @property
+    def device(self) -> Device: ...
+
+    @property
+    def memory_manager(self) -> MemoryManager: ...
+
+    @property
+    def device_type(self) -> DeviceAllocationType: ...
+
+    @property
+    def parent(self) -> Buffer | None: ...
+    def __getitem__(self, key: builtins.slice | int) -> Self | int: ...
+
+    def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ...
+    def to_pybytes(self) -> bytes: ...
+
+
+class ResizableBuffer(Buffer):
+
+
+    def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ...
+
+
+def allocate_buffer(
+    size: int, memory_pool: MemoryPool | None = None, resizable: Literal[False] | Literal[True] | None = None
+) -> Buffer | ResizableBuffer: ...
+
+
+# ----------------------------------------------------------------------
+# Arrow Stream
+class BufferOutputStream(NativeFile):
+
+    def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+    def getvalue(self) -> Buffer: ...
+
+
+class MockOutputStream(NativeFile): ...
+
+class BufferReader(NativeFile):
+
+    def __init__(self, obj) -> None: ...
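A small sketch of the buffer/reader pairing (py_buffer is declared further
below): py_buffer wraps any buffer-protocol object zero-copy, and BufferReader
exposes the result as a seekable NativeFile:

    import pyarrow as pa

    buf = pa.py_buffer(b"hello world")  # zero-copy view of the bytes object
    reader = pa.BufferReader(buf)
    assert reader.read(5) == b"hello"
    reader.seek(0)
    assert reader.size() == 11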
+ + +class CompressedInputStream(NativeFile): + + + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + + +class CompressedOutputStream(NativeFile): + + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + + +class BufferedInputStream(NativeFile): + + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + + def detach(self) -> NativeFile: ... + + +class BufferedOutputStream(NativeFile): + + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + + def detach(self) -> NativeFile: ... + + +class TransformInputStream(NativeFile): + + def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... + + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf: Buffer): ... + +def transcoding_input_stream( + stream: NativeFile, src_encoding: str, dest_encoding: str +) -> TransformInputStream: ... + + +def py_buffer(obj: SupportPyBuffer) -> Buffer: ... + + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: ... + + +def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... + +# --------------------------------------------------------------------- + +class CacheOptions(_Weakrefable): + + + hole_size_limit: int + range_size_limit: int + lazy: bool + prefetch_limit: int + def __init__( + self, + *, + hole_size_limit: int | None = None, + range_size_limit: int | None = None, + lazy: bool = True, + prefetch_limit: int = 0, + ) -> None: ... + + @classmethod + def from_network_metrics( + cls, + time_to_first_byte_millis: int, + transfer_bandwidth_mib_per_sec: int, + ideal_bandwidth_utilization_frac: float = 0.9, + max_ideal_request_size_mib: int = 64, + ) -> Self: ... + + +class Codec(_Weakrefable): + + def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... + + @classmethod + def detect(cls, path: StrPath) -> Self: ... + + @staticmethod + def is_available(compression: Compression) -> bool: ... + + @staticmethod + def supports_compression_level(compression: Compression) -> int: ... + + @staticmethod + def default_compression_level(compression: Compression) -> int: ... + + @staticmethod + def minimum_compression_level(compression: Compression) -> int: ... + + @staticmethod + def maximum_compression_level(compression: Compression) -> int: ... + + @property + def name(self) -> Compression: ... + + @property + def compression_level(self) -> int: ... + + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[False] | Literal[True] | None = None, + memory_pool: MemoryPool | None = None, + ) -> Buffer | bytes: ... + + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[False] | Literal[True] | None = None, + memory_pool: MemoryPool | None = None, + ) -> Buffer | bytes: ... + + +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[False] | Literal[True] | None = None, + memory_pool: MemoryPool | None = None, +) -> Buffer | bytes: ... 
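A one-shot compression sketch (decompress is declared just below). For codecs
whose frames do not record the uncompressed length, decompress needs it passed
explicitly:

    import pyarrow as pa

    data = b"abc" * 1000
    compressed = pa.compress(data, codec="gzip", asbytes=True)
    restored = pa.decompress(
        compressed, decompressed_size=len(data), codec="gzip", asbytes=True
    )
    assert restored == data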
+
+
+def decompress(
+    buf: Buffer | bytes | SupportPyBuffer,
+    decompressed_size: int | None = None,
+    codec: Compression = "lz4",
+    *,
+    asbytes: Literal[False] | Literal[True] | None = None,
+    memory_pool: MemoryPool | None = None,
+) -> Buffer | bytes: ...
+
+
+def input_stream(
+    source: StrPath | Buffer | IOBase,
+    compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect",
+    buffer_size: int | None = None,
+) -> BufferReader: ...
+
+
+def output_stream(
+    source: StrPath | Buffer | IOBase,
+    compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect",
+    buffer_size: int | None = None,
+) -> NativeFile: ...
+
+
+__all__ = [
+    "have_libhdfs",
+    "io_thread_count",
+    "set_io_thread_count",
+    "NativeFile",
+    "PythonFile",
+    "MemoryMappedFile",
+    "memory_map",
+    "create_memory_map",
+    "OSFile",
+    "FixedSizeBufferWriter",
+    "Buffer",
+    "ResizableBuffer",
+    "allocate_buffer",
+    "BufferOutputStream",
+    "MockOutputStream",
+    "BufferReader",
+    "CompressedInputStream",
+    "CompressedOutputStream",
+    "BufferedInputStream",
+    "BufferedOutputStream",
+    "TransformInputStream",
+    "Transcoder",
+    "transcoding_input_stream",
+    "py_buffer",
+    "foreign_buffer",
+    "as_buffer",
+    "CacheOptions",
+    "Codec",
+    "compress",
+    "decompress",
+    "input_stream",
+    "output_stream",
+]
diff --git a/python/pyarrow-stubs/ipc.pyi b/python/pyarrow-stubs/ipc.pyi
new file mode 100644
index 00000000000..a6e7c71dd12
--- /dev/null
+++ b/python/pyarrow-stubs/ipc.pyi
@@ -0,0 +1,157 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from io import IOBase
+
+import pandas as pd
+import pyarrow.lib as lib
+
+from pyarrow.lib import (
+    IpcReadOptions,
+    IpcWriteOptions,
+    Message,
+    MessageReader,
+    MetadataVersion,
+    ReadStats,
+    RecordBatchReader,
+    WriteStats,
+    _ReadPandasMixin,
+    get_record_batch_size,
+    get_tensor_size,
+    read_message,
+    read_record_batch,
+    read_schema,
+    read_tensor,
+    write_tensor,
+)
+
+
+class RecordBatchStreamReader(lib._RecordBatchStreamReader):
+    def __init__(
+        self,
+        source: bytes | lib.Buffer | lib.NativeFile | IOBase,
+        *,
+        options: IpcReadOptions | None = None,
+        memory_pool: lib.MemoryPool | None = None,
+    ) -> None: ...
+
+
+class RecordBatchStreamWriter(lib._RecordBatchStreamWriter):
+    def __init__(
+        self,
+        sink: str | lib.NativeFile | IOBase,
+        schema: lib.Schema,
+        *,
+        use_legacy_format: bool | None = None,
+        options: IpcWriteOptions | None = None,
+    ) -> None: ...
+
+
+class RecordBatchFileReader(lib._RecordBatchFileReader):
+    def __init__(
+        self,
+        source: bytes | lib.Buffer | lib.NativeFile | IOBase,
+        footer_offset: int | None = None,
+        *,
+        options: IpcReadOptions | None = None,
+        memory_pool: lib.MemoryPool | None = None,
+    ) -> None: ...
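A stream round trip using the convenience constructors declared below
(new_stream/open_stream); the reader and writer classes above are what those
calls return:

    import pyarrow as pa
    import pyarrow.ipc as ipc

    table = pa.table({"x": [1, 2, 3]})
    sink = pa.BufferOutputStream()
    with ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)

    with ipc.open_stream(sink.getvalue()) as reader:
        assert reader.read_all().equals(table)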
+ + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... + + +def open_stream( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... + + +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchFileWriter: ... + + +def open_file( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... + + +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... + + +def deserialize_pandas( + buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... + + +__all__ = [ + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/pyarrow-stubs/json.pyi b/python/pyarrow-stubs/json.pyi new file mode 100644 index 00000000000..67768db42e4 --- /dev/null +++ b/python/pyarrow-stubs/json.pyi @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow-stubs/lib.pyi b/python/pyarrow-stubs/lib.pyi new file mode 100644 index 00000000000..43c40b61cf8 --- /dev/null +++ b/python/pyarrow-stubs/lib.pyi @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# ruff: noqa: F403 +from typing import NamedTuple + +from .array import * +from .builder import * +from .compat import * +from .config import * +from .device import * +from .error import * +from .io import * +from ._ipc import * +from .memory import * +from .pandas_shim import * +from .scalar import * +from .table import * +from .tensor import * +from ._types import * + + +class MonthDayNano(NamedTuple): + days: int + months: int + nanoseconds: int + + +def cpu_count() -> int: ... + + +def set_cpu_count(count: int) -> None: ... + + +def is_threading_enabled() -> bool: ... + + +Type_NA: int +Type_BOOL: int +Type_UINT8: int +Type_INT8: int +Type_UINT16: int +Type_INT16: int +Type_UINT32: int +Type_INT32: int +Type_UINT64: int +Type_INT64: int +Type_HALF_FLOAT: int +Type_FLOAT: int +Type_DOUBLE: int +Type_DECIMAL128: int +Type_DECIMAL256: int +Type_DATE32: int +Type_DATE64: int +Type_TIMESTAMP: int +Type_TIME32: int +Type_TIME64: int +Type_DURATION: int +Type_INTERVAL_MONTH_DAY_NANO: int +Type_BINARY: int +Type_STRING: int +Type_LARGE_BINARY: int +Type_LARGE_STRING: int +Type_FIXED_SIZE_BINARY: int +Type_BINARY_VIEW: int +Type_STRING_VIEW: int +Type_LIST: int +Type_LARGE_LIST: int +Type_LIST_VIEW: int +Type_LARGE_LIST_VIEW: int +Type_MAP: int +Type_FIXED_SIZE_LIST: int +Type_STRUCT: int +Type_SPARSE_UNION: int +Type_DENSE_UNION: int +Type_DICTIONARY: int +Type_RUN_END_ENCODED: int +UnionMode_SPARSE: int +UnionMode_DENSE: int diff --git a/python/pyarrow-stubs/memory.pyi b/python/pyarrow-stubs/memory.pyi new file mode 100644 index 00000000000..ab5db5b1f06 --- /dev/null +++ b/python/pyarrow-stubs/memory.pyi @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.lib import _Weakrefable + +class MemoryPool(_Weakrefable): + + + def release_unused(self) -> None: ... + + def bytes_allocated(self) -> int: ... + + def total_bytes_allocated(self) -> int: ... + + def max_memory(self) -> int | None: ... + + def num_allocations(self) -> int: ... + + def print_stats(self) -> None: ... + + @property + def backend_name(self) -> str: ... + + +class LoggingMemoryPool(MemoryPool): ... +class ProxyMemoryPool(MemoryPool): ... + + +def default_memory_pool() -> MemoryPool: ... 
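A quick sketch of pool introspection with the accessors above:

    import pyarrow as pa

    pool = pa.default_memory_pool()
    before = pool.bytes_allocated()
    arr = pa.array(range(1_000_000))
    print(pool.backend_name, pool.bytes_allocated() - before)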
+ + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ... + + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ... + + +def system_memory_pool() -> MemoryPool: ... + + +def jemalloc_memory_pool() -> MemoryPool: ... + + +def mimalloc_memory_pool() -> MemoryPool: ... + + +def set_memory_pool(pool: MemoryPool) -> None: ... + + +def log_memory_allocations(enable: bool = True) -> None: ... + + +def total_allocated_bytes() -> int: ... + + +def jemalloc_set_decay_ms(decay_ms: int) -> None: ... + + +def supported_memory_backends() -> list[str]: ... + + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/python/pyarrow-stubs/orc.pyi b/python/pyarrow-stubs/orc.pyi new file mode 100644 index 00000000000..5e0289e61f7 --- /dev/null +++ b/python/pyarrow-stubs/orc.pyi @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + + +class ORCFile: + + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: ... + + @property + def schema(self) -> Schema: ... + + @property + def nrows(self) -> int: ... + + @property + def nstripes(self) -> int: ... + + @property + def file_version(self) -> str: ... + + @property + def software_version(self) -> str: ... + + @property + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + + @property + def compression_size(self) -> int: ... + + @property + def writer(self) -> str: ... + + @property + def writer_version(self) -> str: ... + + @property + def row_index_stride(self) -> int: ... + + @property + def nstripe_statistics(self) -> int: ... + + @property + def content_length(self) -> int: ... + + @property + def stripe_statistics_length(self) -> int: ... + + @property + def file_footer_length(self) -> int: ... + + @property + def file_postscript_length(self) -> int: ... + + @property + def file_length(self) -> int: ... + + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + + def read(self, columns: list[str] | None = None) -> Table: ... 
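+# Example (editorial sketch, not part of the stub API): reading through the
+# ORCFile class above; "data.orc" is a hypothetical path.
+#
+#     import pyarrow.orc as orc
+#
+#     f = orc.ORCFile("data.orc")
+#     print(f.nstripes, f.compression)  # e.g. 4 "ZLIB"
+#     batch = f.read_stripe(0)          # -> RecordBatch
+#     table = f.read(columns=["a"])     # -> Table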
+ + +class ORCWriter: + + writer: _orc.ORCWriter + is_open: bool + + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", + "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + def write(self, table: Table) -> None: ... + + def close(self) -> None: ... + + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Table: ... + + +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", + "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, +) -> None: ... diff --git a/python/pyarrow-stubs/pandas_compat.pyi b/python/pyarrow-stubs/pandas_compat.pyi new file mode 100644 index 00000000000..f25d1ad24a6 --- /dev/null +++ b/python/pyarrow-stubs/pandas_compat.pyi @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, TypedDict, TypeVar + +import numpy as np +import pandas as pd + +from pandas import DatetimeTZDtype + +from .lib import Array, DataType, Schema, Table + +_T = TypeVar("_T") + + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None + + +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> _ColumnMetadata: ... 
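+# Example (editorial sketch, not part of the stub API): these helpers back
+# the pandas conversion path, and the metadata they build surfaces on the
+# converted schema.
+#
+#     import pandas as pd
+#     import pyarrow as pa
+#
+#     df = pd.DataFrame({"x": [1, 2, 3]})
+#     table = pa.Table.from_pandas(df)
+#     print(table.schema.pandas_metadata)  # decoded b"pandas" key/value metadata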
+ + +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], + column_field_names: list[str] = ..., +) -> dict[bytes, bytes]: ... + + +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... + + +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... + + +def table_to_dataframe( + options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None +) -> pd.DataFrame: ... +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... diff --git a/python/pyarrow-stubs/pandas_shim.pyi b/python/pyarrow-stubs/pandas_shim.pyi new file mode 100644 index 00000000000..e62767b1591 --- /dev/null +++ b/python/pyarrow-stubs/pandas_shim.pyi @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from types import ModuleType +from typing import Any, Iterable, TypeGuard + +from pandas import Categorical, DatetimeTZDtype, Index, Series, DataFrame + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> Series: ... + def data_frame(self, *args, **kwargs) -> DataFrame: ... + @property + def have_pandas(self) -> bool: ... + @property + def compat(self) -> ModuleType: ... + @property + def pd(self) -> ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + @property + def categorical_type(self) -> type[Categorical]: ... + @property + def datetimetz_type(self) -> type[DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... + def is_array_like( + self, obj: Any + ) -> TypeGuard[Series | Index | Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... 
+ def is_data_frame(self, obj: Any) -> TypeGuard[DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[Series]: ... + def is_index(self, obj: Any) -> TypeGuard[Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/pyarrow-stubs/parquet/__init__.pyi b/python/pyarrow-stubs/parquet/__init__.pyi new file mode 100644 index 00000000000..8d0b5374ea0 --- /dev/null +++ b/python/pyarrow-stubs/parquet/__init__.pyi @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from .core import * # noqa diff --git a/python/pyarrow-stubs/parquet/core.pyi b/python/pyarrow-stubs/parquet/core.pyi new file mode 100644 index 00000000000..8cb4f152ff7 --- /dev/null +++ b/python/pyarrow-stubs/parquet/core.pyi @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys + +from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Callable, Iterator, Literal, Sequence + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem, SupportedFileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow.dataset import ParquetFileFragment, Partitioning +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from typing_extensions import deprecated + +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) + + +def filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + + +@deprecated("use filters_to_expression") +def _filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + + +_Compression: TypeAlias = Literal["gzip", "bz2", + "brotli", "lz4", "zstd", "snappy", "none"] + + +class ParquetFile: + reader: ParquetReader + common_metadata: FileMetaData + + def __init__( + self, + source: str | Path | NativeFile | IO, + *, + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: SupportedFileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def close(self, force: bool = False) -> None: ... + @property + def closed(self) -> bool: ... + + def read_row_group( + self, + i: int, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def read_row_groups( + self, + row_groups: list, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def iter_batches( + self, + batch_size: int = 65536, + row_groups: list | None = None, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: ... + + def read( + self, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... 
+ def scan_contents(self, columns: list | None = None, + batch_size: int = 65536) -> int: ... + + +class ParquetWriter: + flavor: str + schema_changed: bool + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter + is_open: bool + + def __init__( + self, + where: str | Path | IO | NativeFile, + schema: Schema, + filesystem: SupportedFileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **options, + ) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... + + def write( + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None + ) -> None: ... + def write_batch(self, batch: RecordBatch, + row_group_size: int | None = None) -> None: ... + + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def close(self) -> None: ... + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ... + + +class ParquetDataset: + def __init__( + self, + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + *, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... + @property + def schema(self) -> Schema: ... + + def read( + self, + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + def read_pandas(self, **kwargs) -> Table: ... + @property + def fragments(self) -> list[ParquetFileFragment]: ... + @property + def files(self) -> list[str]: ... + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning: ... 
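+# Example (editorial sketch, not part of the stub API): reading a partitioned
+# dataset through the class above; "data/" is a hypothetical hive-partitioned
+# directory.
+#
+#     import pyarrow.parquet as pq
+#
+#     ds = pq.ParquetDataset("data/", partitioning="hive")
+#     table = ds.read(columns=["x"])  # -> Table
+#     print(ds.files)                 # fragment paths discovered on disk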
+ + +def read_table( + source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], + *, + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + filesystem: SupportedFileSystem | None = None, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, +) -> Table: ... + + +def read_pandas( + source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs +) -> Table: ... + + +def write_table( + table: Table, + where: str | Path | NativeFile | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: SupportedFileSystem | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **kwargs, +) -> None: ... + + +def write_to_dataset( + table: Table, + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = None, + **kwargs, +) -> None: ... + + +def write_metadata( + schema: Schema, + where: str | NativeFile, + metadata_collector: list[FileMetaData] | None = None, + filesystem: SupportedFileSystem | None = None, + **kwargs, +) -> None: ... + + +def read_metadata( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> FileMetaData: ... + + +def read_schema( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Schema: ... 
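+# Example (editorial sketch, not part of the stub API): the round trip the
+# module-level signatures above describe; "t.parquet" is a hypothetical path.
+#
+#     import pyarrow as pa
+#     import pyarrow.parquet as pq
+#
+#     t = pa.table({"x": [1, 2, 3]})
+#     pq.write_table(t, "t.parquet", compression="zstd")
+#     back = pq.read_table("t.parquet", filters=[("x", ">", 1)])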
diff --git a/python/pyarrow-stubs/parquet/encryption.pyi b/python/pyarrow-stubs/parquet/encryption.pyi new file mode 100644 index 00000000000..fe9a454e593 --- /dev/null +++ b/python/pyarrow-stubs/parquet/encryption.pyi @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/python/pyarrow-stubs/py.typed b/python/pyarrow-stubs/py.typed new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/pyarrow-stubs/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow-stubs/scalar.pyi b/python/pyarrow-stubs/scalar.pyi new file mode 100644 index 00000000000..4563b97fef7 --- /dev/null +++ b/python/pyarrow-stubs/scalar.pyi @@ -0,0 +1,391 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import collections.abc +import datetime as dt +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Any, Generic, Iterator, Literal + +import numpy as np + +from pyarrow._compute import CastOptions # type: ignore[import-not-found] +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from typing_extensions import TypeVar + +from ._types import ( + _DataTypeT, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, + DataType, + ListType, + LargeListType, + ListViewType, + LargeListViewType, + FixedSizeListType, +) +from ._types import ( + Decimal256Type, _Precision, _Scale, NullType, BoolType, UInt8Type, Int8Type, + UInt16Type, Int16Type, Uint32Type, Int32Type, UInt64Type, Int64Type, + Float16Type, Float32Type, Float64Type, Decimal32Type, Decimal64Type, + Decimal128Type, Date32Type, Date64Type, Time32Type, Time64Type, TimestampType, + _Size, DurationType, MonthDayNanoIntervalType, BinaryType, LargeBinaryType, + FixedSizeBinaryType, StringType, LargeStringType, BinaryViewType, StringViewType, + StructType, _K, _ValueT, _IndexT, _BasicValueT, RunEndEncodedType, _RunEndType, + UnionType, ExtensionType, BaseExtensionType, Bool8Type, UuidType, JsonType, + OpaqueType, DictionaryType, MapType, _BasicDataType, +) + +_AsPyTypeK = TypeVar("_AsPyTypeK") +_AsPyTypeV = TypeVar("_AsPyTypeV") +_DataType_co = TypeVar("_DataType_co", bound=DataType, covariant=True) + +class Scalar(_Weakrefable, Generic[_DataType_co]): + + @property + def type(self) -> _DataType_co: ... + + @property + def is_valid(self) -> bool: ... + + def cast( + self, + target_type: None | _DataTypeT, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self | Scalar[_DataTypeT]: ... + + def validate(self, *, full: bool = False) -> None: ... + + def equals(self, other: Scalar) -> bool: ... + + def __hash__(self) -> int: ... + + def as_py(self: Scalar[Any], *, maps_as_pydicts: Literal["lossy", "strict"] | None = None) -> Any: ... + + +_NULL: TypeAlias = None +NA = _NULL + +class NullScalar(Scalar[NullType]): ... + +class BooleanScalar(Scalar[BoolType]): ... + +class UInt8Scalar(Scalar[UInt8Type]): ... + +class Int8Scalar(Scalar[Int8Type]): ... + +class UInt16Scalar(Scalar[UInt16Type]): ... + +class Int16Scalar(Scalar[Int16Type]): ... + +class UInt32Scalar(Scalar[Uint32Type]): ... + +class Int32Scalar(Scalar[Int32Type]): ... + +class UInt64Scalar(Scalar[UInt64Type]): ... + +class Int64Scalar(Scalar[Int64Type]): ... + +class HalfFloatScalar(Scalar[Float16Type]): ... + +class FloatScalar(Scalar[Float32Type]): ... + +class DoubleScalar(Scalar[Float64Type]): ... + +class Decimal32Scalar(Scalar[Decimal32Type[_Precision, _Scale]]): ... + +class Decimal64Scalar(Scalar[Decimal64Type[_Precision, _Scale]]): ... + +class Decimal128Scalar(Scalar[Decimal128Type[_Precision, _Scale]]): ... + +class Decimal256Scalar(Scalar[Decimal256Type[_Precision, _Scale]]): ... + +class Date32Scalar(Scalar[Date32Type]): ... + + +class Date64Scalar(Scalar[Date64Type]): + + @property + def value(self) -> dt.date | None: ... + +class Time32Scalar(Scalar[Time32Type[_Time32Unit]]): + + @property + def value(self) -> dt.time | None: ... + +class Time64Scalar(Scalar[Time64Type[_Time64Unit]]): + + @property + def value(self) -> dt.time | None: ... 
+ +class TimestampScalar(Scalar[TimestampType[_Unit, _Tz]]): + + @property + def value(self) -> int | None: ... + +class DurationScalar(Scalar[DurationType[_Unit]]): + + @property + def value(self) -> dt.timedelta | None: ... + +class MonthDayNanoIntervalScalar(Scalar[MonthDayNanoIntervalType]): + + @property + def value(self) -> MonthDayNano | None: ... + + +class BinaryScalar(Scalar[BinaryType]): + + def as_buffer(self) -> Buffer: ... + + +class LargeBinaryScalar(Scalar[LargeBinaryType]): + + def as_buffer(self) -> Buffer: ... + + +class FixedSizeBinaryScalar(Scalar[FixedSizeBinaryType]): + + def as_buffer(self) -> Buffer: ... + + +class StringScalar(Scalar[StringType]): + + def as_buffer(self) -> Buffer: ... + + +class LargeStringScalar(Scalar[LargeStringType]): + + def as_buffer(self) -> Buffer: ... + + +class BinaryViewScalar(Scalar[BinaryViewType]): + + def as_buffer(self) -> Buffer: ... + + +class StringViewScalar(Scalar[StringViewType]): + + def as_buffer(self) -> Buffer: ... + + +class ListScalar(Scalar[ListType[_DataTypeT]]): + + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, _Size]]): + + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class LargeListScalar(Scalar[LargeListType[_DataTypeT]]): + + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class ListViewScalar(Scalar[ListViewType[_DataTypeT]]): + + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class LargeListViewScalar(Scalar[LargeListViewType[_DataTypeT]]): + + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class StructScalar(Scalar[StructType], collections.abc.Mapping[str, Scalar]): + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[str]: ... + + def __getitem__(self, key: int | str) -> Scalar[Any]: ... + + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + +class MapScalar(Scalar[MapType[_K, _ValueT]]): + + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int) -> tuple[Scalar[_K], _ValueT, Any]: ... + + def __iter__( + self: Scalar[ + MapType[_BasicDataType[_AsPyTypeK], _BasicDataType[_AsPyTypeV]],] + | Scalar[MapType[Any, _BasicDataType[_AsPyTypeV]]] + | Scalar[MapType[_BasicDataType[_AsPyTypeK], Any]] + ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]] | Iterator[tuple[Any, _AsPyTypeV]] | Iterator[tuple[_AsPyTypeK, Any]]: ... + + +class DictionaryScalar(Scalar[DictionaryType[_IndexT, _BasicValueT]]): + + @property + def index(self) -> Scalar[_IndexT]: ... + + @property + def value(self) -> Scalar[_BasicValueT]: ... + + @property + def dictionary(self) -> Array: ... + +class RunEndEncodedScalar(Scalar[RunEndEncodedType[_RunEndType, _BasicValueT]]): + + @property + def value(self) -> tuple[int, _BasicValueT] | None: ... 
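+# Example (editorial sketch, not part of the stub API): how the Scalar
+# generics above narrow types for a type checker.
+#
+#     import datetime as dt
+#     import pyarrow as pa
+#
+#     i = pa.scalar(1, type=pa.int64())  # Int64Scalar; i.as_py() -> 1
+#     ts = pa.scalar(dt.datetime(2024, 1, 1), type=pa.timestamp("s"))
+#     ts.value  # epoch ticks as int (int | None per the stub above)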
+ + +class UnionScalar(Scalar[UnionType]): + + @property + def value(self) -> Any | None: ... + + @property + def type_code(self) -> str: ... + + +class ExtensionScalar(Scalar[ExtensionType]): + + @property + def value(self) -> Any | None: ... + + @staticmethod + def from_storage(typ: BaseExtensionType, value) -> ExtensionScalar: ... + + +class Bool8Scalar(Scalar[Bool8Type]): ... + +class UuidScalar(Scalar[UuidType]): ... + +class JsonScalar(Scalar[JsonType]): ... + +class OpaqueScalar(Scalar[OpaqueType]): ... + + +class FixedShapeTensorScalar(ExtensionScalar): + + def to_numpy(self) -> np.ndarray: ... + + def to_tensor(self) -> Tensor: ... + + +def scalar( + value: Any, + type: _DataTypeT, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Scalar[_DataTypeT]: ... + + +__all__ = [ + "Scalar", + "_NULL", + "NA", + "NullScalar", + "BooleanScalar", + "UInt8Scalar", + "Int8Scalar", + "UInt16Scalar", + "Int16Scalar", + "UInt32Scalar", + "Int32Scalar", + "UInt64Scalar", + "Int64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "FixedSizeBinaryScalar", + "StringScalar", + "LargeStringScalar", + "BinaryViewScalar", + "StringViewScalar", + "ListScalar", + "FixedSizeListScalar", + "LargeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "StructScalar", + "MapScalar", + "DictionaryScalar", + "RunEndEncodedScalar", + "UnionScalar", + "ExtensionScalar", + "FixedShapeTensorScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "scalar", +] diff --git a/python/pyarrow-stubs/substrait.pyi b/python/pyarrow-stubs/substrait.pyi new file mode 100644 index 00000000000..b78bbd8aebd --- /dev/null +++ b/python/pyarrow-stubs/substrait.pyi @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._substrait import ( + BoundExpressions, + SubstraitSchema, + deserialize_expressions, + deserialize_schema, + get_supported_functions, + run_query, + serialize_expressions, + serialize_schema, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", +] diff --git a/python/pyarrow-stubs/table.pyi b/python/pyarrow-stubs/table.pyi new file mode 100644 index 00000000000..29784d274df --- /dev/null +++ b/python/pyarrow-stubs/table.pyi @@ -0,0 +1,653 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import ( + Any, + Collection, + Generator, + Generic, + Iterable, + Iterator, + Literal, + Mapping, + Sequence, + TypeVar, +) +import builtins + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import ArrayOrChunkedArray, Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, MemoryManager, MemoryPool, Schema +from pyarrow.lib import Field as _Field + +from .array import Array, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from ._ipc import RecordBatchReader +from .scalar import BooleanScalar, Int64Scalar, Scalar, StructScalar +from .tensor import Tensor +from ._stubs_typing import NullableCollection +from ._types import DataType, _AsPyType, _BasicDataType, _DataTypeT + +Field: TypeAlias = _Field[DataType] +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed +AggregateOptions: TypeAlias = ( + ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions +) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + + + @property + def data(self) -> Self: ... 
+    @property
+    def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ...
+
+    def length(self) -> int: ...
+
+    __len__ = length
+    def to_string(
+        self,
+        *,
+        indent: int = 0,
+        window: int = 5,
+        container_window: int = 2,
+        skip_new_lines: bool = False,
+    ) -> str: ...
+
+    format = to_string
+    def validate(self, *, full: bool = False) -> None: ...
+
+    @property
+    def null_count(self) -> int: ...
+
+    @property
+    def nbytes(self) -> int: ...
+
+    def get_total_buffer_size(self) -> int: ...
+
+    def __sizeof__(self) -> int: ...
+    def __getitem__(self, key: int | builtins.slice) -> Self | _Scalar_co: ...
+
+    def getitem(self, i: int) -> Scalar: ...
+    def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: ...
+
+    def is_nan(self) -> ChunkedArray[BooleanScalar]: ...
+
+    def is_valid(self) -> ChunkedArray[BooleanScalar]: ...
+
+    def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ...
+
+    def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ...
+    def cast(
+        self,
+        target_type: None | _CastAs = None,
+        safe: bool | None = None,
+        options: CastOptions | None = None,
+    ) -> Self | ChunkedArray[Scalar[_CastAs]]: ...
+
+    def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ...
+
+    def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: ...
+
+    def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: ...
+
+    def unique(self) -> Array[_Scalar_co]: ...
+
+    def value_counts(self) -> StructArray: ...
+
+    def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+    def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ...
+
+    def index(
+        self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]],
+        value: Scalar[_DataTypeT] | _AsPyType,
+        start: int | None = None,
+        end: int | None = None,
+        *,
+        memory_pool: MemoryPool | None = None,
+    ) -> Int64Scalar: ...
+
+    def take(self, indices: Indices) -> Self: ...
+
+    def drop_null(self) -> Self: ...
+
+    def sort(self, order: Order = "ascending", **kwargs) -> Self: ...
+
+    def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+    @property
+    def num_chunks(self) -> int: ...
+
+    def chunk(self, i: int) -> Array[_Scalar_co]: ...
+
+    @property
+    def chunks(self) -> list[Array[_Scalar_co]]: ...
+
+    def iterchunks(
+        self: ArrayOrChunkedArray[_ScalarT],
+    ) -> Generator[Array, None, None]: ...
+
+    def __iter__(self) -> Iterator[_Scalar_co]: ...
+    def to_pylist(
+        self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]],
+        *,
+        maps_as_pydicts: Literal["lossy", "strict"] | None = None,
+    ) -> list[_AsPyType | None]: ...
+
+    def __arrow_c_stream__(self, requested_schema=None) -> Any: ...
+
+    @classmethod
+    def _import_from_c_capsule(cls, stream) -> Self: ...
+
+    @property
+    def is_cpu(self) -> bool: ...
+
+
+def chunked_array(
+    arrays: Iterable[NullableCollection[Any]] | Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray] | Iterable[Array[_ScalarT]],
+    type: DataType | str | None = None,
+) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: ...
+
+
+_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any])
+
+class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]):
+    def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ...
+ def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + + def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: ... + + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: ... + + @property + def column_names(self) -> list[str]: ... + + @property + def columns(self) -> list[_ColumnT]: ... + + def drop_null(self) -> Self: ... + + def field(self, i: int | str) -> Field: ... + + @classmethod + def from_pydict( + cls, + mapping: Mapping[str, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + def itercolumns(self) -> Generator[_ColumnT, None, None]: ... + + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: ... + + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def filter( + self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" + ) -> Self: ... + + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list[Any]]: ... + + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: ... + + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: ... + + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: ... + + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + def append_column( + self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + + +class RecordBatch(_Tabular[Array]): + + + def validate(self, *, full: bool = False) -> None: ... + + def replace_schema_metadata( + self, metadata: dict[str | bytes, str | bytes] | None = None + ) -> Self: ... + + @property + def num_columns(self) -> int: ... + + + @property + def num_rows(self) -> int: ... + + @property + def schema(self) -> Schema: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self, check_metadata: bool = False) -> bool: ... + + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ... + + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: ... 
+
+    @classmethod
+    def from_arrays(
+        cls,
+        arrays: Collection[Array],
+        names: list[str] | None = None,
+        schema: Schema | None = None,
+        metadata: Mapping[str | bytes, str | bytes] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_pandas(
+        cls,
+        df: pd.DataFrame,
+        schema: Schema | None = None,
+        preserve_index: bool | None = None,
+        nthreads: int | None = None,
+        columns: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_struct_array(
+        cls, struct_array: StructArray | ChunkedArray[StructScalar]
+    ) -> Self: ...
+
+    def to_struct_array(self) -> StructArray: ...
+
+    def to_tensor(
+        self,
+        null_to_nan: bool = False,
+        row_major: bool = True,
+        memory_pool: MemoryPool | None = None,
+    ) -> Tensor: ...
+
+    def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ...
+
+    @classmethod
+    def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ...
+
+    def __arrow_c_array__(self, requested_schema=None): ...
+
+    def __arrow_c_stream__(self, requested_schema=None): ...
+
+    @classmethod
+    def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ...
+
+    def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ...
+
+    @classmethod
+    def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ...
+
+    def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ...
+
+    @classmethod
+    def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ...
+
+    @property
+    def device_type(self) -> DeviceAllocationType: ...
+
+    @property
+    def is_cpu(self) -> bool: ...
+
+    def copy_to(self, destination: MemoryManager | Device) -> Self: ...
+
+
+def table_to_blocks(options, table: Table, categories, extension_columns): ...
+
+JoinType: TypeAlias = Literal[
+    "left semi",
+    "right semi",
+    "left anti",
+    "right anti",
+    "inner",
+    "left outer",
+    "right outer",
+    "full outer",
+]
+
+class Table(_Tabular[ChunkedArray[Any]]):
+
+
+    def validate(self, *, full: bool = False) -> None: ...
+
+    def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+    def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ...
+
+    def replace_schema_metadata(
+        self, metadata: dict[str | bytes, str | bytes] | None = None
+    ) -> Self: ...
+
+    def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+    def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+    def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+    def equals(self, other: Self, check_metadata: bool = False) -> bool: ...
+
+    def cast(
+        self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None
+    ) -> Self: ...
+
+    @classmethod
+    def from_pandas(
+        cls,
+        df: pd.DataFrame,
+        schema: Schema | None = None,
+        preserve_index: bool | None = None,
+        nthreads: int | None = None,
+        columns: list[str] | None = None,
+        safe: bool = True,
+    ) -> Self: ...
+
+    @classmethod
+    def from_arrays(
+        cls,
+        arrays: Collection[ArrayOrChunkedArray[Any]],
+        names: list[str] | None = None,
+        schema: Schema | None = None,
+        metadata: Mapping[str | bytes, str | bytes] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_struct_array(
+        cls, struct_array: StructArray | ChunkedArray[StructScalar]
+    ) -> Self: ...
+
+    def to_struct_array(
+        self, max_chunksize: int | None = None
+    ) -> ChunkedArray[StructScalar]: ...
+
+    @classmethod
+    def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: ...
+ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ... + + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ... + + @property + def schema(self) -> Schema: ... + + @property + def num_columns(self) -> int: ... + + @property + def num_rows(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def drop(self, columns: str | list[str]) -> Self: ... + + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: ... + + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: ... + + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @property + def is_cpu(self) -> bool: ... + + +def record_batch( + data: dict[str, list[Any] | Array[Any]] + | Collection[Array[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, +) -> RecordBatch: ... + + +def table( + data: dict[str, list[Any] | Array[Any]] + | Collection[ArrayOrChunkedArray[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + nthreads: int | None = None, +) -> Table: ... + + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: ... + + +class TableGroupBy: + + + keys: str | list[str] + def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: ... + + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: ... + + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", + "concat_batches", +] diff --git a/python/pyarrow-stubs/tensor.pyi b/python/pyarrow-stubs/tensor.pyi new file mode 100644 index 00000000000..471f0ec1e98 --- /dev/null +++ b/python/pyarrow-stubs/tensor.pyi @@ -0,0 +1,253 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+
+import numpy as np
+
+from pyarrow.lib import _Weakrefable
+from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
+from sparse import COO  # type: ignore
+
+class Tensor(_Weakrefable):
+
+
+    @classmethod
+    def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ...
+
+    def to_numpy(self) -> np.ndarray: ...
+
+    def equals(self, other: Tensor) -> bool: ...
+
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+
+    @property
+    def is_contiguous(self) -> bool: ...
+
+    @property
+    def ndim(self) -> int: ...
+
+    @property
+    def size(self) -> int: ...
+
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+
+    @property
+    def strides(self) -> tuple[int, ...]: ...
+
+
+class SparseCOOTensor(_Weakrefable):
+
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ...
+
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        coords: np.ndarray,
+        shape: tuple[int, ...],
+        dim_names: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: coo_matrix, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> coo_matrix: ...
+
+    def to_pydata_sparse(self) -> COO: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+    @property
+    def has_canonical_format(self) -> bool: ...
+
+class SparseCSRMatrix(_Weakrefable):
+
+
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: tuple[int, ...],
+        dim_names: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> csr_matrix: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+class SparseCSRMatrix(_Weakrefable):
+
+
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: tuple[int, ...],
+        dim_names: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> csr_matrix: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+
+class SparseCSCMatrix(_Weakrefable):
+
+
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: tuple[int, ...],
+        dim_names: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: csc_matrix, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> csc_matrix: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+
+class SparseCSFTensor(_Weakrefable):
+
+
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: tuple[int, ...],
+        axis_order: list[int] | None = None,
+        dim_names: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+
+__all__ = [
+    "Tensor",
+    "SparseCOOTensor",
+    "SparseCSRMatrix",
+    "SparseCSCMatrix",
+    "SparseCSFTensor",
+]
diff --git a/python/pyarrow-stubs/types.pyi b/python/pyarrow-stubs/types.pyi
new file mode 100644
index 00000000000..def5e3771ab
--- /dev/null
+++ b/python/pyarrow-stubs/types.pyi
@@ -0,0 +1,217 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+from typing import Any
+
+if sys.version_info >= (3, 13):
+    from typing import TypeIs
+else:
+    from typing_extensions import TypeIs
+if sys.version_info >= (3, 10):
+    from typing import TypeAlias
+else:
+    from typing_extensions import TypeAlias
+
+from pyarrow.lib import (
+    BinaryType,
+    BinaryViewType,
+    BoolType,
+    DataType,
+    Date32Type,
+    Date64Type,
+    Decimal32Type,
+    Decimal64Type,
+    Decimal128Type,
+    Decimal256Type,
+    DenseUnionType,
+    DictionaryType,
+    DurationType,
+    FixedSizeBinaryType,
+    FixedSizeListType,
+    Float16Type,
+    Float32Type,
+    Float64Type,
+    Int8Type,
+    Int16Type,
+    Int32Type,
+    Int64Type,
+    LargeBinaryType,
+    LargeListType,
+    LargeListViewType,
+    LargeStringType,
+    ListType,
+    ListViewType,
+    MapType,
+    MonthDayNanoIntervalType,
+    NullType,
+    RunEndEncodedType,
+    SparseUnionType,
+    StringType,
+    StringViewType,
+    StructType,
+    Time32Type,
+    Time64Type,
+    TimestampType,
+    UInt8Type,
+    UInt16Type,
+    Uint32Type,
+    UInt64Type,
+)
+
+_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type
+_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type
+_Integer: TypeAlias = _SignedInteger | _UnsignedInteger
+_Floating: TypeAlias = Float16Type | Float32Type | Float64Type
+_Decimal: TypeAlias = (
+    Decimal32Type[Any, Any]
+    | Decimal64Type[Any, Any]
+    | Decimal128Type[Any, Any]
+    | Decimal256Type[Any, Any]
+)
+_Date: TypeAlias = Date32Type | Date64Type
+_Time: TypeAlias = Time32Type[Any] | Time64Type[Any]
+_Interval: TypeAlias = MonthDayNanoIntervalType
+_Temporal: TypeAlias = (
+    TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval
+)
+_Union: TypeAlias = SparseUnionType | DenseUnionType
+_Nested: TypeAlias = (
+    ListType[Any]
+    | FixedSizeListType[Any, Any]
+    | LargeListType[Any]
+    | ListViewType[Any]
+    | LargeListViewType[Any]
+    | StructType
+    | MapType[Any, Any, Any]
+    | _Union
+)
+
+
+def is_null(t: DataType) -> TypeIs[NullType]: ...
+def is_boolean(t: DataType) -> TypeIs[BoolType]: ...
+def is_integer(t: DataType) -> TypeIs[_Integer]: ...
+def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ...
+def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ...
+def is_int8(t: DataType) -> TypeIs[Int8Type]: ...
+def is_int16(t: DataType) -> TypeIs[Int16Type]: ...
+def is_int32(t: DataType) -> TypeIs[Int32Type]: ...
+def is_int64(t: DataType) -> TypeIs[Int64Type]: ...
+def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ...
+def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ...
+def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ...
+def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ...
+def is_floating(t: DataType) -> TypeIs[_Floating]: ...
+def is_float16(t: DataType) -> TypeIs[Float16Type]: ...
+def is_float32(t: DataType) -> TypeIs[Float32Type]: ...
+def is_float64(t: DataType) -> TypeIs[Float64Type]: ...
+def is_list(t: DataType) -> TypeIs[ListType[Any]]: ...
+def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ...
+def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ...
+def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ...
+def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ...
+def is_struct(t: DataType) -> TypeIs[StructType]: ...
+def is_union(t: DataType) -> TypeIs[_Union]: ...
+def is_nested(t: DataType) -> TypeIs[_Nested]: ...
+def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ...
+def is_temporal(t: DataType) -> TypeIs[_Temporal]: ...
+def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ...
+def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ...
+def is_time(t: DataType) -> TypeIs[_Time]: ...
+def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ...
+def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ...
+def is_binary(t: DataType) -> TypeIs[BinaryType]: ...
+def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ...
+def is_unicode(t: DataType) -> TypeIs[StringType]: ...
+def is_string(t: DataType) -> TypeIs[StringType]: ...
+def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ...
+def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ...
+def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ...
+def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ...
+def is_string_view(t: DataType) -> TypeIs[StringViewType]: ...
+def is_date(t: DataType) -> TypeIs[_Date]: ...
+def is_date32(t: DataType) -> TypeIs[Date32Type]: ...
+def is_date64(t: DataType) -> TypeIs[Date64Type]: ...
+def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ...
+def is_decimal(t: DataType) -> TypeIs[_Decimal]: ...
+def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ...
+def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ...
+def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ...
+def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ...
+def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ...
+def is_interval(t: DataType) -> TypeIs[_Interval]: ...
+def is_primitive(t: DataType) -> bool: ...
+def is_boolean_value(obj: Any) -> bool: ...
+def is_integer_value(obj: Any) -> bool: ...
+def is_float_value(obj: Any) -> bool: ...
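+
+# Illustrative sketch only (not part of the stubs): because the predicates
+# above return TypeIs[...], a TypeIs-aware checker narrows the DataType
+# inside the guarded branch. Variable names here are ours.
+#
+#   import pyarrow as pa
+#   import pyarrow.types as pat
+#
+#   t: pa.DataType = pa.list_(pa.int64())
+#   if pat.is_list(t):
+#       # t is narrowed to ListType here, so .value_type is well-typed
+#       element_type = t.value_type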
+
+
+__all__ = [
+    "is_binary",
+    "is_binary_view",
+    "is_boolean",
+    "is_date",
+    "is_date32",
+    "is_date64",
+    "is_decimal",
+    "is_decimal128",
+    "is_decimal256",
+    "is_decimal32",
+    "is_decimal64",
+    "is_dictionary",
+    "is_duration",
+    "is_fixed_size_binary",
+    "is_fixed_size_list",
+    "is_float16",
+    "is_float32",
+    "is_float64",
+    "is_floating",
+    "is_int16",
+    "is_int32",
+    "is_int64",
+    "is_int8",
+    "is_integer",
+    "is_interval",
+    "is_large_binary",
+    "is_large_list",
+    "is_large_list_view",
+    "is_large_string",
+    "is_large_unicode",
+    "is_list",
+    "is_list_view",
+    "is_map",
+    "is_nested",
+    "is_null",
+    "is_primitive",
+    "is_run_end_encoded",
+    "is_signed_integer",
+    "is_string",
+    "is_string_view",
+    "is_struct",
+    "is_temporal",
+    "is_time",
+    "is_time32",
+    "is_time64",
+    "is_timestamp",
+    "is_uint16",
+    "is_uint32",
+    "is_uint64",
+    "is_uint8",
+    "is_unicode",
+    "is_union",
+    "is_unsigned_integer",
+]
diff --git a/python/pyarrow-stubs/util.pyi b/python/pyarrow-stubs/util.pyi
new file mode 100644
index 00000000000..db74524d77d
--- /dev/null
+++ b/python/pyarrow-stubs/util.pyi
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable +from os import PathLike +from typing import Any, Protocol, Sequence, TypeVar + +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") + + +class _DocStringComponents(Protocol): + _docstring_components: list[str] + + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... + + +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... From 144aef59154c211c2906595aa684637c74ff303d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 20 Sep 2025 20:21:14 +0200 Subject: [PATCH 2/4] Minor changes to pyarrow so some typechecks pass --- python/pyarrow/array.pxi | 2 +- python/pyarrow/scalar.pxi | 2 +- python/pyarrow/tests/strategies.py | 38 ++++++++++++++++-------------- python/pyarrow/tests/test_array.py | 8 +++---- python/pyarrow/tests/test_io.py | 20 +++++++++------- 5 files changed, 37 insertions(+), 33 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index bf5beab589d..109d8ebe597 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3634,7 +3634,7 @@ cdef class FixedSizeListArray(BaseListArray): Or create from a values array, list size and matching type: >>> typ = pa.list_(pa.field("values", pa.int64()), 2) - >>> arr = pa.FixedSizeListArray.from_arrays(values,type=typ) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) >>> arr [ diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 5934a7aa8cf..d26933e3f39 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1036,7 +1036,7 @@ cdef class StructScalar(Scalar, Mapping): Parameters ---------- - index : Union[int, str] + key : Union[int, str] Index / position or name of the field. 
Returns diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 450cce74f1d..6d7ec6f724f 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -18,21 +18,21 @@ import datetime import sys -import pytest -import hypothesis as h -import hypothesis.strategies as st +import pytest # type: ignore[import-not-found] +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] try: - import hypothesis.extra.numpy as npst + import hypothesis.extra.numpy as npst # type: ignore[import-not-found] except ImportError: - npst = None + npst = None # type: ignore[assignment] try: - import hypothesis.extra.pytz as tzst + import hypothesis.extra.pytz as tzst # type: ignore[import-not-found] except ImportError: - tzst = None + tzst = None # type: ignore[assignment] try: import zoneinfo except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] if sys.platform == 'win32': try: import tzdata # noqa:F401 @@ -41,7 +41,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa @@ -234,13 +234,13 @@ def schemas(type_strategy=primitive_types, max_fields=None): all_types = st.deferred( lambda: ( - primitive_types | - list_types() | - struct_types() | - dictionary_types() | - map_types() | - list_types(all_types) | - struct_types(all_types) + primitive_types + | list_types() + | struct_types() + | dictionary_types() + | map_types() + | list_types(all_types) # type: ignore[has-type] + | struct_types(all_types) # type: ignore[has-type] ) ) all_fields = fields(all_types) @@ -467,7 +467,9 @@ def pandas_compatible_list_types( dictionary_types( value_strategy=pandas_compatible_dictionary_value_types ), - pandas_compatible_list_types(pandas_compatible_types), - struct_types(pandas_compatible_types) + pandas_compatible_list_types( + pandas_compatible_types # type: ignore[has-type] + ), + struct_types(pandas_compatible_types) # type: ignore[has-type] ) ) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 009ab1e849b..5686420c688 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -18,10 +18,10 @@ from collections.abc import Iterable import datetime import decimal -import hypothesis as h -import hypothesis.strategies as st +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] import itertools -import pytest +import pytest # type: ignore[import-not-found] import struct import subprocess import sys @@ -30,7 +30,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index a6d3546e57c..7c86f37587c 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -24,7 +24,7 @@ import math import os import pathlib -import pytest +import pytest # type: ignore[import-not-found] import random import sys import tempfile @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] from pyarrow.util import guid from pyarrow import Codec @@ -811,8 +811,9 @@ def test_cache_options_pickling(pickle_module): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", 
marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -852,8 +853,9 @@ def test_compress_decompress(compression): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -1748,9 +1750,9 @@ def test_unknown_compression_raises(): "gzip", "lz4", "zstd", - pytest.param( - "snappy", - marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("snappy", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ) ]) def test_compressed_roundtrip(compression): From 8e6c6c83978602a3336aee1bee646b0503eeb8d3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 20 Sep 2025 20:21:38 +0200 Subject: [PATCH 3/4] Add utility for adding docstrings into annotations --- dev/update_stub_docstrings.py | 214 ++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 dev/update_stub_docstrings.py diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py new file mode 100644 index 00000000000..7eb1ee2925d --- /dev/null +++ b/dev/update_stub_docstrings.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Utility to extract docstrings from pyarrow and update +# docstrings in stubfiles. +# +# Usage +# ===== +# +# python ./dev/update_stub_docstrings.py -f ./python/pyarrow-stubs + + +from pathlib import Path +from textwrap import indent + +import click +# TODO: perhaps replace griffe with importlib +import griffe +from griffe import AliasResolutionError +import libcst +from libcst import matchers as m + + +def _get_docstring(name, package, indentation): + # print("extract_docstrings", name) + try: + obj = package.get_member(name) + except (KeyError, ValueError, AliasResolutionError): + # Some cython __init__ symbols can't be found + # e.g. 
pyarrow.lib.OSFile.__init__ + stack = name.split(".") + parent_name = ".".join(stack[:-1]) + + try: + obj = package.get_member(parent_name).all_members[stack[-1]] + except (KeyError, ValueError, AliasResolutionError): + print(f"{name} not found in {package.name}, it's probably ok.") + return None + + if obj.has_docstring: + docstring = obj.docstring.value + # Remove signature if present in docstring + if docstring.startswith(obj.name) or ( + (hasattr(obj.parent, "name") and + docstring.startswith(f"{obj.parent.name}.{obj.name}"))): + docstring = "\n".join(docstring.splitlines()[2:]) + # Skip empty docstrings + if docstring.strip() == "": + return None + # Indent docstring + indentation_prefix = indentation * " " + docstring = indent(docstring + '\n"""', indentation_prefix) + docstring = '"""\n' + docstring + return docstring + return None + + +class ReplaceEllipsis(libcst.CSTTransformer): + def __init__(self, package, namespace): + self.package = package + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 + + # Insert module level docstring if _clone_signature is used + def leave_Module(self, original_node, updated_node): + new_body = [] + clone_matcher = m.SimpleStatementLine( + body=[m.Assign( + value=m.Call(func=m.Name(value="_clone_signature")) + ), m.ZeroOrMore()] + ) + for statement in updated_node.body: + new_body.append(statement) + if m.matches(statement, clone_matcher): + name = statement.body[0].targets[0].target.value + if self.base_namespace: + name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.package, 0) + if docstring is not None: + new_expr = libcst.Expr(value=libcst.SimpleString(docstring)) + new_line = libcst.SimpleStatementLine(body=[new_expr]) + new_body.append(new_line) + + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." + name + + class_matcher_1 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.SimpleStatementLine( + body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()] + ), m.ZeroOrMore()] + ) + ) + class_matcher_2 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()] + ) + ) + + if m.matches(updated_node, class_matcher_1): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_node = libcst.SimpleString(value=docstring) + updated_node = updated_node.deep_replace( + updated_node.body.body[0].body[0].value, new_node) + + if m.matches(updated_node, class_matcher_2): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + list(updated_node.body.body) + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." 
+ name
+
+        function_matcher = m.FunctionDef(
+            name=m.Name(),
+            body=m.SimpleStatementSuite(
+                body=[m.Expr(
+                    m.Ellipsis()
+                )]))
+        if m.matches(original_node, function_matcher):
+            docstring = _get_docstring(name, self.package, self.indentation)
+            if docstring is not None:
+                new_docstring = libcst.SimpleString(value=docstring)
+                new_body = [
+                    libcst.SimpleWhitespace(self.indentation * " "),
+                    libcst.Expr(value=new_docstring),
+                    libcst.Newline()
+                ]
+                new_body = libcst.IndentedBlock(body=new_body)
+                updated_node = updated_node.with_changes(body=new_body)
+
+        self.stack.pop()
+        self.indentation -= 1
+        return updated_node
+
+
+@click.command()
+@click.option('--pyarrow_folder', '-f', type=click.Path(resolve_path=True))
+def add_docs_to_stub_files(pyarrow_folder):
+    print("Updating docstrings of stub files in:", pyarrow_folder)
+    package = griffe.load("pyarrow", try_relative_path=True,
+                          force_inspection=True, resolve_aliases=True)
+    lib_modules = ["array", "builder", "compat", "config", "device", "error", "io",
+                   "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor",
+                   "_types"]
+
+    for stub_file in Path(pyarrow_folder).rglob('*.pyi'):
+        if stub_file.name == "_stubs_typing.pyi":
+            continue
+        module = stub_file.with_suffix('').name
+        print(f"[{stub_file} {module}]")
+
+        with open(stub_file, 'r') as f:
+            tree = libcst.parse_module(f.read())
+
+        if module in lib_modules:
+            module = "lib"
+        elif stub_file.parent.name in ["parquet", "interchange"]:
+            module = f"{stub_file.parent.name}.{module}"
+        elif module == "__init__":
+            module = ""
+
+        modified_tree = tree.visit(ReplaceEllipsis(package, module))
+        with open(stub_file, "w") as f:
+            f.write(modified_tree.code)
+        print("\n")
+
+
+if __name__ == "__main__":
+    # click parses CLI arguments from sys.argv; obj={} seeds the context object
+    add_docs_to_stub_files(obj={})

From a0ce53c19216c44bc5011e74af1dd131df2b1594 Mon Sep 17 00:00:00 2001
From: Rok Mihevc
Date: Sat, 20 Sep 2025 20:21:59 +0200
Subject: [PATCH 4/4] Add CI check

---
 .github/workflows/python.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 0d12accda4e..1c0faed062f 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -138,6 +138,15 @@ jobs:
         continue-on-error: true
         run: archery docker push ${{ matrix.image }}
 
+      - name: Type check with mypy and pyright
+        run: |-
+          python -m pip install mypy pyright griffe libcst scipy-stubs pandas-stubs types-python-dateutil types-psutil types-requests types-cffi
+          pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pyarrow
+          cd python
+          mypy ./pyarrow-stubs ./pyarrow/tests/test_array.py ./pyarrow/tests/test_io.py
+          pyright ./pyarrow-stubs
+          python ../dev/update_stub_docstrings.py -f ./pyarrow-stubs
+
   macos:
     name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3
     runs-on: macos-${{ matrix.macos-version }}