diff --git a/.gitignore b/.gitignore index e8fece7..3e02d64 100644 --- a/.gitignore +++ b/.gitignore @@ -1,207 +1,15 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[codz] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py.cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# UV -# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. 
-#uv.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock -#poetry.toml - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. -# https://pdm-project.org/en/latest/usage/project/#working-with-version-control -#pdm.lock -#pdm.toml -.pdm-python -.pdm-build/ - -# pixi -# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. -#pixi.lock -# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one -# in the .venv directory. It is recommended not to include this directory in version control. -.pixi - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.envrc -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-.idea/ - -# Abstra -# Abstra is an AI-powered process automation framework. -# Ignore directories containing user credentials, local state, and settings. -# Learn more at https://abstra.io/docs -.abstra/ - -# Visual Studio Code -# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore -# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore -# and can be added to the global gitignore or merged into this file. However, if you prefer, -# you could uncomment the following to ignore the entire vscode folder -# .vscode/ - -# Ruff stuff: -.ruff_cache/ - -# PyPI configuration file -.pypirc - -# Cursor -# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to -# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data -# refer to https://docs.cursor.com/context/ignore-files -.cursorignore -.cursorindexingignore - -# Marimo -marimo/_static/ -marimo/_lsp/ -__marimo__/ +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ + +.env +.venv/ +.DS_Store diff --git a/README.md b/README.md index 1954f2b..740d92b 100644 --- a/README.md +++ b/README.md @@ -17,46 +17,16 @@ Lance-Ray combines the distributed computing capabilities of Ray with the effici ## Installation -### Basic Installation ```bash -# Clone the repository -git clone https://github.com/lancedb/lance-ray.git +# Install from source +git clone https://github.com/lancedb/lance-ray.git cd lance-ray - -# Install UV (if not already installed) -pip install uv - -# Install in editable mode uv pip install -e . 
-``` -### Development Installation (with all dependencies) -```bash - -# Clone the repository -git clone https://github.com/lancedb/lance-ray.git -cd lance-ray - -# Install UV (if not already installed) -pip install uv - -# Install with development dependencies +# Or install with development dependencies uv pip install -e ".[dev]" ``` -### Windows Specific Instructions -```bash -# If 'uv' command is still not recognized (especially on Windows), -# try restarting your terminal or use: -# Basic installation -python -m uv pip install -e . - -# Development installation -python -m uv pip install -e ".[dev]" - -``` - - ## Requirements - Python >= 3.10 diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..8cee482 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,35 @@ +# Contributing to lance-ray + +## Development setup + +Install the latest development version with all dependencies: + +```bash +git clone https://github.com/lancedb/lance-ray.git +cd lance-ray +uv pip install -e ".[dev]" +``` +## Requirements + +- Python >= 3.10 + +- Ray >= 2.40.0 + +- PyLance >= 0.30.0 + +- lance-namespace >= 0.0.5 + +- PyArrow >= 17.0.0 + +- Pandas >= 2.2.0 + +- NumPy >= 2.0.0 + + +## Running Tests + +To run all tests using [pytest](https://docs.pytest.org/): + +```bash +uv run pytest +``` diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..4a157ed --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,57 @@ +# Examples + +## Basic Usage + +```python + +import ray + +import pandas as pd + +from lance_ray import read_lance, write_lance + +ray.init() + +# Write a DataFrame to Lance +df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + +write_lance("example.lance", df) + +# Read the dataset back + +ds = read_lance("example.lance") + +print(ds.take(3)) + +# Read only specific columns + +ds = read_lance("example.lance", columns=["a"]) + +print(ds.take(3)) + +# Read with a filter expression + +filtered_ds = 
read_lance("example.lance", filters="a > 1") + +print(filtered_ds.take(3)) + +print(f"Filtered count: {filtered_ds.count()}") + +## Advanced Usage + +# Process data in parallel using Ray tasks +@ray.remote +def process_partition(partition): + return [x * 2 for x in partition["a"]] + +# Split the dataset into 2 partitions + +ds = read_lance("example.lance") + +partitions = ds.split(2) + +# Process each partition in parallel +results = ray.get([process_partition.remote(p) for p in partitions]) + +print(results) +``` \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..fd3f2ff --- /dev/null +++ b/docs/index.md @@ -0,0 +1,37 @@ +# Lance-Ray Integration + +A Python library that provides seamless integration between [Ray](https://www.ray.io/) and [Lance](https://lancedb.github.io/lance/) for distributed columnar data processing. + +## Overview + +Lance-Ray combines the distributed computing capabilities of Ray with the efficient columnar storage format of Lance, enabling scalable data processing workflows with optimal performance. + +## Features + +- **Distributed Lance Operations:** Leverage Ray’s distributed computing for Lance dataset operations. +- **Seamless Data Movement:** Efficiently move data between Ray and Lance datasets. +- **Optimized I/O:** Fast read and write operations on Lance datasets with Ray integration. +- **Parallel Processing:** Support for concurrent batch operations on distributed Lance data. 
+ +## Quick Start + +```python + +import ray + +from lance_ray import read_lance, write_lance + +ray.init() + +# Write a pandas DataFrame to Lance format +import pandas as pd + +df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + +write_lance("example.lance", df) + +# Read the dataset back as a Ray Dataset +ds = read_lance("example.lance") + +print(ds.take(3)) +``` diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..eb5c2ea --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,51 @@ +site_name: Lance-Ray +strict: true +docs_dir: docs + +repo_name: lancedb/lance-ray +repo_url: https://github.com/lancedb/lance-ray + +theme: + name: material + logo: images/lance-logo.png + favicon: images/lance-favicon.png + palette: + - scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.sections + - navigation.expand + - navigation.top + - search.highlight + - search.share + - content.code.copy + - content.code.annotate + + icon: + repo: fontawesome/brands/github + +nav: + - Home: index.md + - Examples: examples.md + - Contributing: contributing.md + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/lancedb/lance-ray + - icon: fontawesome/brands/discord + link: https://discord.gg/zMM32dvNtd + - icon: fontawesome/brands/x-twitter + link: https://twitter.com/lancedb + footer: + social: true \ No newline at end of file