diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ae4a351 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,12 @@ +# Jupyter notebook handling with jupytext +# +# Strategy: Pair .ipynb with .py companions +# - .py files are git-diffable and Claude Code friendly +# - .ipynb files contain outputs for local use +# - Only .py files are meaningfully diffed in PRs + +# Use jupytext for notebook diffs (if available) +*.ipynb diff=jupytext + +# Filter for showing clean diffs +# (requires: git config diff.jupytext.textconv 'jupytext --to md --set-formats - -o -') diff --git a/.gitignore b/.gitignore index 68bc17f..b6398bd 100644 --- a/.gitignore +++ b/.gitignore @@ -99,7 +99,7 @@ ipython_config.py # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock +poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. @@ -158,3 +158,28 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +# mac +.DS_Store + +# Node.js dependencies +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Claude Code temporary files +.claude/ + +# Office temporary files +~$*.xlsx +~$*.docx +~$*.pptx + + +# DuckDB temporary storage +.tmp/ +duckdb_temp_storage*.tmp + +# Large data files (use remote parquet instead) +*.parquet diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..b086ebc --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,160 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
+ +## Development Commands + +### Python Environment Management +- **Poetry** is the primary dependency manager (`pyproject.toml` manages dependencies) +- Install dependencies: `poetry install` +- Install with examples dependencies: `poetry install --with examples` +- Activate virtual environment: `poetry shell` +- Run Python scripts: `poetry run python <script.py>` + +### Testing +- Run Python tests: `poetry run pytest tests/` +- Run single test: `poetry run pytest tests/test_isbclient.py::test_field_names` +- Test files are in `tests/` directory + +### Playwright Testing (Web Scraping) +- Playwright tests located in `playwright/tests/` +- Run Playwright tests: `cd playwright && npx playwright test` +- View test reports: `cd playwright && npx playwright show-report` +- Configuration: `playwright/playwright.config.js` + +### Docker Development +- Build and run Jupyter environment: `./run_docker.sh [port]` +- Default port is 8890, custom port can be specified as first argument +- Dockerfile creates a Jupyter environment with all dependencies installed + +## Current Status & Issues ⚠️ + +**IMPORTANT**: As of September 2025, the iSamples central API at `https://central.isample.xyz/isamples_central/` is offline. This affects all three client classes below. The repository is transitioning to **offline-first geoparquet workflows** - see examples in `examples/basic/geoparquet.ipynb` and `examples/basic/isample-archive.ipynb` for working patterns. + +## Architecture Overview + +### Core Python Client (`src/isamples_client/`) +The main Python package provides three client classes for interacting with the iSamples API: + +1. **`IsbClient`** (`isbclient.py:232-339`): Basic HTTP client using httpx + - Direct API interaction with `/thing/select` endpoint + - Methods: `field_names()`, `record_count()`, `facets()`, `pivot()` + +2. 
**`IsbClient2`** (`isbclient.py:341-586`): Enhanced Solr client using pysolr + - Extends IsbClient with more sophisticated search capabilities + - Supports complex filter queries (`_fq_from_kwargs()`) + - Default search parameters in `default_search_params()` + - Faceting and pivot table functionality + +3. **`ISamplesBulkHandler`** (`isbclient.py:588-683`): Bulk data operations + - Handles large dataset exports via authentication + - Methods: `create_download()`, `get_status()`, `download_file()` + - Loads bulk data into pandas DataFrames + +### Key Configuration Constants +- `ISB_SERVER`: Default iSamples API endpoint +- `FL_DEFAULT`: Default field list for search results +- `FACET_FIELDS_DEFAULT`: Default faceting fields +- `MAJOR_FIELDS`: UI field mappings +- `ISAMPLES_SOURCES`: Available data sources (SESAR, OPENCONTEXT, GEOME, SMITHSONIAN) + +### Examples Structure +- **`examples/basic/`**: Basic API usage examples and Jupyter notebooks +- **`examples/spatial/`**: Geospatial data analysis with geoparquet, DuckDB +- **`examples/opencontext/`**: OpenContext-specific examples +- **`javascript/`**: JavaScript/Node.js integration examples + +### Jupyter Notebook Integration +Heavy emphasis on Jupyter notebook examples for data exploration: +- Interactive data analysis with pandas, xarray +- Geospatial analysis using geopandas, folium, cartopy +- **Lonboard WebGL visualization**: High-performance point cloud rendering +- **DuckDB integration**: Efficient remote parquet processing via HTTP range requests +- **API-independent workflows**: Examples that work without central API access + +#### Notebook Editing & Version Control Tools +**For Claude Code and Git Workflows**: + +1. 
**jupytext pairing** (recommended for active development): + - Pair `.ipynb` with `.py` companions: `~/bin/nb_pair.sh notebook.ipynb` + - Edit `.py` files to avoid token limits (no outputs in source) + - Auto-sync changes between `.ipynb` ↔ `.py` + - Commit `.py` files for clean git diffs + - See: `JUPYTEXT_WORKFLOW.md` for full guide + +2. **nb_source_diff.py** (for quick diffs): + - Diff notebooks without output noise: `nb-diff notebook.ipynb HEAD` + - Use for one-off comparisons or unpaired notebooks + - Tool location: `~/bin/nb_source_diff.py` + +**Quick Reference**: See `QUICKREF_NOTEBOOKS.md` for command cheatsheet + +**Recommended Workflow**: +- When Claude Code hits token limits on `.ipynb` files → Edit the `.py` companion instead +- Pair new notebooks immediately: `~/bin/nb_pair.sh notebook.ipynb` +- **Commit BOTH files** to git (`.ipynb` for outputs, `.py` for clean diffs) +- Review `.py` diffs for code changes, `.ipynb` for output changes +- Sync after Claude edits: `~/bin/nb_pair.sh --sync notebook.ipynb` + +### Dependencies Architecture +- **Core dependencies**: httpx, requests, pandas, xarray, pysolr +- **Spatial analysis**: geopandas, duckdb, polars, ibis-framework, shapely +- **Visualization**: matplotlib, folium, cartopy, ipyleaflet, lonboard +- **Jupyter ecosystem**: ipywidgets, ipydatagrid, sidecar + +## Development Patterns + +### Search Parameter Building +The codebase uses a sophisticated parameter building system: +- `_fq_from_kwargs()` builds Solr filter queries from keyword arguments +- Uses `multidict.MultiDict` for handling multiple values for same parameter +- Supports date range queries, source filtering, and complex boolean logic + +### Error Handling and Logging +- Uses Python `logging` module (configured at INFO level) +- Request URLs are logged for debugging +- HTTP status codes checked with appropriate error raising + +### Monkey Patching for Large Queries +- `monkey_patch_select()` modifies pysolr to handle large queries via POST 
+- `SWITCH_TO_POST` threshold (10000 bytes) determines GET vs POST usage +- Critical for handling complex search queries that exceed URL limits + +## Known Issues & Troubleshooting + +### API Connectivity Issues +- **Central API offline**: If you see connection errors to `https://central.isample.xyz/isamples_central/`, the API is currently offline +- **Workaround**: Use the geoparquet examples in `examples/basic/geoparquet.ipynb` and `examples/basic/isample-archive.ipynb` which work without API access +- **Alternative data sources**: The examples demonstrate accessing iSamples data via Zenodo archives and remote parquet files + +### Lonboard Visualization Issues + +**⚠️ CRITICAL: Lonboard 0.12+ API Breaking Change** + +Lonboard 0.12+ changed how map initialization works. The old `zoom` and `center` parameters cause `TypeError`. + +**OLD (BROKEN)**: +```python +viz(result, map_kwargs={"zoom": 1, "center": {"lat": 0, "lon": 0}}) +``` + +**NEW (CORRECT for 0.12+)**: +```python +viz(result, map_kwargs={"view_state": {"zoom": 1, "latitude": 0, "longitude": 0}}) +``` + +**Key changes**: +- `zoom` and `center` must be nested inside `view_state` +- `center: {lat, lon}` becomes flat `latitude` and `longitude` keys +- Dynamic updates: `m.set_view_state(longitude=..., latitude=..., zoom=...)` +- Animation: `m.fly_to(...)` + +**Other considerations**: +- **Memory usage**: Always use `LIMIT` clauses when visualizing parquet data (e.g., `LIMIT 100000`) +- **Performance**: For 6M+ row datasets, querying without LIMIT can cause 5+ minute hangs +- **CRS warnings**: "No CRS exists on data" warnings are expected and can be ignored if lon/lat are WGS84 +- **Deprecated**: The `con` parameter to `viz()` is deprecated in newer versions + +### Environment Setup +- **Node.js conflicts**: Multiple `package.json` files exist; use `poetry install --with examples` for Python dependencies +- **Jupyter extensions**: Some notebooks require ipywidgets and sidecar extensions for full functionality 
\ No newline at end of file diff --git a/CROSS_REPO_ALIGNMENT.md b/CROSS_REPO_ALIGNMENT.md new file mode 100644 index 0000000..3acb143 --- /dev/null +++ b/CROSS_REPO_ALIGNMENT.md @@ -0,0 +1,215 @@ +# Cross-Repository Alignment Strategy + +**Date**: 2025-09-05 +**Repositories**: `isamples-python` ↔ `isamplesorg.github.io` +**Status**: Both repos have successfully pivoted to offline-first geoparquet workflows + +## Repository Relationship Overview + +Both repositories are complementary components of the iSamples ecosystem that have independently evolved toward the same strategic direction: **geoparquet + DuckDB workflows** replacing central API dependencies. + +### Parallel Evolution + +Both repos experienced the **same critical event**: iSamples Central API going offline +- **isamples-python**: Contains sophisticated lonboard visualization patterns but needs API-independent examples +- **isamplesorg.github.io**: Successfully migrated to browser-based DuckDB-WASM + geoparquet tutorials + +## Repository Roles & Complementary Functions + +### `isamples-python` - Development & Analysis Environment +**Role**: Local development, sophisticated analysis, reusable Python patterns + +**Strengths**: +- ⭐ **Excellent lonboard visualization** - `geoparquet.ipynb` contains zoom-layered rendering, interactive controls +- **Rich Python ecosystem** - GeoPandas, DuckDB, Polars, Ibis integration +- **Jupyter development** - Interactive analysis with full Python scientific stack +- **Local data processing** - High-memory, CPU-intensive analysis capabilities + +**Current Focus**: +- Advanced WebGL point cloud visualization with lonboard +- Multi-backend data processing (DuckDB, pandas, polars) +- Complex geospatial analysis workflows +- Python client library (currently broken due to API offline) + +### `isamplesorg.github.io` - Public Documentation & Tutorials +**Role**: Public-facing education, browser-based demos, universal access + +**Strengths**: +- ✅ **Proven geoparquet migration** 
- Complete transition from API to browser-based analysis +- **Universal browser access** - Zero installation, works on any device +- **Interactive tutorials** - Observable JavaScript + DuckDB-WASM +- **Performance optimized** - HTTP range requests, memory efficient +- **Educational focus** - Clear learning paths, comprehensive documentation + +**Current Focus**: +- Browser-based data analysis tutorials (Quarto + Observable JS) +- Public-facing documentation and vocabulary system +- Performance demonstrations (300MB datasets in <100MB memory) +- SKOS vocabulary management and generation + +## Data Flow & Technical Architecture Alignment + +### Shared Technology Stack +Both repositories use identical core technologies: +- **DuckDB**: SQL analytical database (Python vs WASM versions) +- **Geoparquet**: Efficient geospatial data format +- **HTTP range requests**: Selective data access from large files +- **Zenodo archives**: Same data sources and URLs + +### Data Access Patterns +```mermaid +graph TB + Z[Zenodo iSamples Archive
300MB geoparquet] + + subgraph "isamples-python" + P[Python DuckDB] + L[Lonboard WebGL] + J[Jupyter Analysis] + end + + subgraph "isamplesorg.github.io" + W[DuckDB-WASM] + O[Observable Plot] + Q[Quarto Tutorials] + end + + Z -->|HTTP range requests| P + Z -->|HTTP range requests| W + P --> L + P --> J + W --> O + W --> Q +``` + +### Performance Characteristics +**Common Benefits**: +- 5-10x faster than traditional pandas workflows +- 99% reduction in data transfer via selective queries +- Memory efficient analysis of large datasets +- Offline-capable once data is cached + +## Strategic Alignment Opportunities + +### 1. Pattern Standardization ⭐ **HIGH PRIORITY** + +**Opportunity**: The lonboard visualization patterns in `isamples-python/examples/basic/geoparquet.ipynb` could be adapted for the website tutorials. + +**Actions**: +- Extract reusable lonboard patterns from Python notebook +- Create Observable JS equivalents using lonboard's WebGL approach +- Standardize color mapping, zoom layering, and interaction patterns +- Document shared visualization vocabulary + +**Benefits**: +- Consistent user experience across local/web environments +- Leverage excellent lonboard work for public tutorials +- Reduce duplication of visualization research + +### 2. Data Source Coordination + +**Current State**: Both repos access same Zenodo archives independently +**Opportunity**: Coordinate data source management and updates + +**Actions**: +- Centralize data source URLs and metadata +- Create shared validation scripts for data integrity +- Coordinate data updates and versioning +- Document data access patterns for both environments + +**File Location**: Both repos could reference shared `DATA_SOURCES.md` + +### 3. 
Cross-Referencing & Learning Paths + +**Opportunity**: Create clear pathways between browser demos and local development + +**Website → Python Transitions**: +- "Try this analysis locally" links from tutorials → Jupyter notebooks +- Environment setup guidance for deeper analysis +- Advanced analysis patterns requiring Python ecosystem + +**Python → Website References**: +- Link to browser demos from Python examples +- Reference interactive tutorials for concept explanations +- Point to website documentation for context + +### 4. Testing & Validation Alignment + +**Shared Challenge**: Both repos need robust testing for geoparquet workflows +**Opportunity**: Coordinate test data and validation approaches + +**Actions**: +- Share test datasets and expected results +- Coordinate data quality validation scripts +- Create cross-platform compatibility tests +- Document known data issues and workarounds + +## Implementation Strategy + +### Phase 1: Documentation Alignment (Immediate) +1. **Create shared DATA_SOURCES.md** - Centralized data access documentation +2. **Cross-link repositories** - Clear navigation between repos in READMEs +3. **Align terminology** - Consistent language for geoparquet workflows +4. **Document handoff patterns** - When to use browser vs local analysis + +### Phase 2: Pattern Extraction (Near-term) +1. **Extract lonboard visualization library** from Python notebooks +2. **Create Observable JS visualization equivalents** +3. **Standardize interaction patterns** - zoom, filtering, color mapping +4. **Document reusable components** for both environments + +### Phase 3: Infrastructure Sharing (Medium-term) +1. **Shared data validation scripts** +2. **Coordinated data source updates** +3. **Cross-platform testing framework** +4. 
**Performance benchmarking tools** + +## Recommended File Locations + +### In `isamples-python/`: +- `CROSS_REPO_ALIGNMENT.md` (this document) +- `DATA_SOURCES.md` - Shared data source documentation +- `docs/website-integration.md` - Links and transitions to website +- Enhanced `examples/README.md` with website cross-references + +### In `isamplesorg.github.io/`: +- `about.qmd` updates - Link to Python development environment +- `tutorials/index.qmd` - "Advanced Analysis" section pointing to Python repo +- `design/architecture.md` - Multi-repository ecosystem documentation + +## Success Metrics + +### Short-term (1-2 weeks) +- ✅ Clear documentation linking both repositories +- ✅ Shared data source documentation +- ✅ Coordinated terminology and concepts + +### Medium-term (1-2 months) +- ✅ Extracted reusable visualization patterns +- ✅ Observable JS adaptations of lonboard techniques +- ✅ Consistent user experience across environments + +### Long-term (3-6 months) +- ✅ Shared infrastructure components +- ✅ Coordinated data updates and validation +- ✅ Seamless workflow transitions between repos + +## Cross-Project Learning Insights + +### Successful Independent Evolution +- Both teams arrived at same technical solution independently +- Validates geoparquet + DuckDB as optimal approach +- Shows strength of distributed development model + +### Complementary Strengths +- Python repo: Deep analysis capabilities, rich ecosystem +- Website repo: Universal access, educational focus, performance optimization +- Together: Complete workflow from exploration to publication + +### Technical Innovation Leadership +- Combined repositories demonstrate cutting-edge browser + local analysis +- Show how modern web platform capabilities enable "big data in the browser" +- Position iSamples as leader in accessible scientific computing + +--- + +*This alignment strategy leverages the successful independent evolution of both repositories while maximizing their complementary strengths.* \ 
No newline at end of file diff --git a/DATA_SOURCES.md b/DATA_SOURCES.md new file mode 100644 index 0000000..cc642d8 --- /dev/null +++ b/DATA_SOURCES.md @@ -0,0 +1,187 @@ +# Shared Data Sources - iSamples Ecosystem + +**Maintained by**: Both `isamples-python` and `isamplesorg.github.io` repositories +**Last Updated**: 2025-09-05 + +## Primary Data Sources + +### Zenodo iSamples Archive ⭐ **PRIMARY** +- **URL**: `https://z.rslv.xyz/10.5281/zenodo.15278210/isamples_export_2025_04_21_16_23_46_geo.parquet` +- **Size**: ~300 MB, 6+ million records +- **Format**: Geoparquet with spatial indexing +- **Sources**: SESAR, OpenContext, GEOME, Smithsonian (all federated sources) +- **Update Frequency**: Periodic (check Zenodo for latest versions) +- **Access Method**: HTTP range requests for efficient querying +- **CORS Status**: ⚠️ Check current accessibility for browser use + +**Data Quality Notes**: +- Comprehensive material sample metadata across domains (e.g., geological and archaeological) +- Spatial coordinates available for most records +- Some records may have missing or incomplete fields +- Quality varies by source system + +### OpenContext Collections +- **Base URL Pattern**: Various URLs for specific archaeological collections +- **Format**: Parquet files with domain-specific schemas +- **Access**: HTTP range requests supported +- **Usage**: Domain-specific analysis, educational examples + +### Local Sample Data (Both Repos) + +#### In `isamples-python/examples/spatial/`: +- `cities.geoparquet` - Sample cities data for testing +- `bay_area_cities.parquet` - Regional subset for performance testing +- Purpose: Development and testing without external dependencies + +#### In `isamplesorg.github.io` tutorials: +- Embedded fallback datasets for CORS-restricted environments +- Demo datasets demonstrating same analytical techniques +- Smaller scale data for educational purposes + +## Data Access Patterns + +### Python Environment (`isamples-python`) +```python +import duckdb + +# Connect to DuckDB and query remote parquet 
+conn = duckdb.connect() +result = conn.sql(""" + SELECT source, COUNT(*) as sample_count + FROM 'https://z.rslv.xyz/10.5281/zenodo.15278210/isamples_export_2025_04_21_16_23_46_geo.parquet' + GROUP BY source +""") +df = result.to_df() +``` + +### Browser Environment (`isamplesorg.github.io`) +```javascript +// DuckDB-WASM with automatic CORS fallback +const conn = await duckdb.connect(); + +// Primary data source with fallback +const dataUrl = "https://z.rslv.xyz/10.5281/zenodo.15278210/isamples_export_2025_04_21_16_23_46_geo.parquet"; + +try { + const result = await conn.query(` + SELECT source, COUNT(*) as sample_count + FROM '${dataUrl}' + GROUP BY source + `); +} catch (e) { + // Fallback to demo dataset + console.log("CORS blocked, using demo data"); + // ... fallback logic +} +``` + +## Performance Characteristics + +### HTTP Range Request Benefits +- **Metadata queries**: <1KB transfer for table statistics +- **Sampling**: ~1-10KB for representative samples +- **Filtered queries**: Only transfers matching data rows +- **Aggregations**: Minimal data transfer for GROUP BY operations + +### Memory Usage +- **Browser**: Analyze 300MB datasets in <100MB memory +- **Python**: Full dataset can be loaded for complex operations +- **Streaming**: Both environments support streaming for larger-than-memory analysis + +## Data Update Coordination + +### Version Management +1. **Check Zenodo** regularly for updated iSamples exports +2. **Test compatibility** in both Python and browser environments +3. **Update URLs** in both repositories simultaneously +4. **Verify data quality** with standard validation queries + +### Validation Queries +```sql +-- Basic quality checks (run in both environments) +SELECT + source, + COUNT(*) as total_records, + COUNT(latitude) as records_with_coords, + MIN(collection_date) as earliest_date, + MAX(collection_date) as latest_date +FROM parquet_file +GROUP BY source; +``` + +### Update Process +1. 
**Identify new data source** on Zenodo or other archives +2. **Test in Python environment** first (full DuckDB capabilities) +3. **Test in browser environment** (check CORS, performance) +4. **Update both repositories** with new URLs and documentation +5. **Verify examples still work** in both environments + +## Known Issues & Workarounds + +### CORS Restrictions +- **Problem**: Some data sources block browser access +- **Detection**: Try HEAD request first in browser tutorials +- **Workaround**: Automatic fallback to demo datasets +- **Solution**: Host CORS-enabled mirrors when possible + +### Data Quality Issues +- **Missing coordinates**: ~5-10% of records may lack spatial data +- **Encoding issues**: Some text fields may have inconsistent encoding +- **Date formats**: Multiple date formats across source systems +- **Null values**: Handle missing data gracefully in all queries + +### Performance Considerations +- **Large queries**: Use LIMIT in initial development/testing +- **Memory limits**: Browser environment more constrained than Python +- **Network timeouts**: Implement retry logic for large HTTP range requests + +## Cross-Repository Testing + +### Shared Test Queries +Both repositories should validate these standard queries work: + +```sql +-- Test 1: Basic connectivity and record count +SELECT COUNT(*) FROM parquet_file; + +-- Test 2: Source distribution +SELECT source, COUNT(*) FROM parquet_file GROUP BY source; + +-- Test 3: Spatial data availability +SELECT + COUNT(*) as total, + COUNT(latitude) as with_coords, + ROUND(100.0 * COUNT(latitude) / COUNT(*), 2) as coord_percentage +FROM parquet_file; + +-- Test 4: Date range analysis +SELECT + source, + MIN(collection_date) as earliest, + MAX(collection_date) as latest +FROM parquet_file +WHERE collection_date IS NOT NULL +GROUP BY source; +``` + +### Expected Results (as of 2025-04-21 export) +- Total records: ~6+ million +- Sources: SESAR, OpenContext, GEOME, Smithsonian +- Spatial coverage: Global with 
concentrations in North America, Europe +- Date range: Historical to present (varies by source) + +## Contact & Coordination + +### Data Issues +- Report data quality issues in both repository issue trackers +- Tag issues with `data-quality` label for visibility +- Include specific queries and expected vs actual results + +### New Data Sources +- Propose new data sources in `isamples-python` issues +- Test compatibility in both environments before adoption +- Document access patterns and any special considerations + +--- + +*This document is maintained collaboratively between both repositories to ensure consistency and coordination.* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..09855ca --- /dev/null +++ b/Dockerfile @@ -0,0 +1,46 @@ +FROM quay.io/jupyter/minimal-notebook:2024-10-03 + +# Set environment variables to avoid prompts during package installation +ENV DEBIAN_FRONTEND=noninteractive + +# Make sure the contents of our repo are in ${HOME} +COPY . 
${HOME} +USER root +RUN chown -R ${NB_UID} ${HOME} + +# Update package list and install required dependencies +RUN apt-get update && \ + apt-get install -y software-properties-common libdb-dev libzmq3-dev curl libssl-dev zlib1g-dev jq jupyter-console pkg-config default-libmysqlclient-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install pipx and add its binary directory to PATH +RUN pip install pipx && \ + pipx ensurepath +ENV PATH="/home/jovyan/.local/bin:$PATH" + +# Use pipx to install Poetry +RUN pipx install poetry + +# Copy pyproject.toml and poetry.lock if it exists +COPY pyproject.toml poetry.lock* ./ + +# Install project dependencies using Poetry +RUN poetry config virtualenvs.create false && \ + poetry install --no-interaction --no-ansi --with examples + +# Install dependencies from requirements.in if it exists +# COPY requirements.in ./ +# RUN if [ -f requirements.in ]; then pip install --upgrade -r requirements.in; fi + +# Create necessary directories and set permissions +RUN mkdir -p /home/jovyan/.local/share/jupyter && \ + chown -R jovyan:users /home/jovyan/.local + +VOLUME ["/home/jovyan/work", "/data"] + +# Switch back to jovyan to avoid accidental container runs as root +USER ${NB_UID} + +# Verify permissions +RUN ls -la /home/jovyan/.local/share diff --git a/ISAMPLES_MODEL_ACTION_PLAN.md b/ISAMPLES_MODEL_ACTION_PLAN.md new file mode 100644 index 0000000..8390191 --- /dev/null +++ b/ISAMPLES_MODEL_ACTION_PLAN.md @@ -0,0 +1,133 @@ +# iSamples Model Correction Action Plan + +## Overview +This action plan addresses the correction needed after discovering that the iSamples metadata model is **domain-agnostic**, not archaeology-specific as previously documented in our notebooks and code. 
+ +## Key Understanding Update + +### Previous (Incorrect) Understanding +- Treated `MaterialSampleRecord`, `SamplingEvent`, `GeospatialCoordLocation`, `SamplingSite` as archaeology-specific entity types +- Documented predicates like `produced_by`, `sample_location` as OpenContext-specific +- Suggested the model was customized for archaeological data + +### Corrected Understanding +1. **PQG Framework**: Generic property graph representation (s/p/o/n) +2. **iSamples Model**: Domain-agnostic metadata standard for ALL scientific material samples +3. **Domain Data**: OpenContext, SESAR, GEOME populate the model with domain-specific VALUES + +## Phase 1: Documentation Updates (Immediate) + +### Notebooks to Update +- [ ] `examples/basic/oc_parquet_analysis_enhanced.ipynb` + - Fix section "Key Distinction: Generic PQG vs OpenContext-Specific" + - Update all comments suggesting entity types are archaeology-specific + - Clarify that OpenContext uses standard iSamples model with archaeological data + +- [ ] `examples/basic/oc_parquet_analysis.ipynb` + - Update introductory documentation + - Fix inline comments about entity types + +- [ ] `examples/basic/geoparquet.ipynb` + - Verify no misleading archaeology-specific claims + - Add note about cross-domain capabilities + +### Documentation Files +- [x] `README.md` - Updated to reflect domain-agnostic nature +- [ ] `examples/README.md` - Update notebook descriptions +- [ ] `STATUS.md` - Add note about model understanding correction +- [ ] `CROSS_REPO_ALIGNMENT.md` - Ensure consistency with new understanding + +## Phase 2: Code Comment Updates + +### Priority Files +- [ ] All notebooks with SQL/Ibis queries + - Change comments from "OpenContext-specific entity" to "iSamples entity" + - Update "OpenContext predicate" to "iSamples predicate" + - Fix variable names like `archaeological_sites` → `sampling_sites` + +### Example Changes Needed +```python +# OLD (incorrect): +samples = oc_pqg.filter(_.otype == 'MaterialSampleRecord') 
# OpenContext entity + +# NEW (correct): +samples = oc_pqg.filter(_.otype == 'MaterialSampleRecord') # iSamples entity (archaeological data) +``` + +## Phase 3: Enhanced Cross-Domain Examples + +### New Notebooks/Sections to Create +- [ ] Add section showing how same queries work across domains +- [ ] Create comparison: archaeological vs geological samples using same model +- [ ] Document which fields are domain-universal vs domain-specific values + +### Query Pattern Documentation +- [ ] Document universal graph traversal patterns +- [ ] Show how predicates work across domains +- [ ] Create reference table: Entity Type → Example Values by Domain + +## Phase 4: Testing & Validation + +### Verification Tasks +- [ ] Test existing queries still work correctly +- [ ] Verify no functional breaks from documentation changes +- [ ] If available, test with non-archaeological iSamples data +- [ ] Validate cross-domain query capabilities + +### Performance Testing +- [ ] Ensure query performance unchanged +- [ ] Document any optimization opportunities from new understanding + +## Phase 5: Communication & Education + +### Internal Documentation +- [x] Dev journal entry (2025-09-26) documenting discovery +- [x] Project journal update with correction +- [ ] Add "Model Clarification" section to main docs + +### External Communication +- [ ] Consider blog post or documentation update for iSamples community +- [ ] Update any presentations or tutorials +- [ ] Notify collaborators of corrected understanding + +## Implementation Priority + +1. **Immediate** (Today): + - [x] Document discovery in dev journal + - [x] Update main README + - [x] Create this action plan + +2. **High Priority** (This Week): + - [ ] Fix oc_parquet_analysis_enhanced.ipynb documentation + - [ ] Update all notebook comments + - [ ] Test for any functional impacts + +3. 
**Medium Priority** (Next 2 Weeks): + - [ ] Create cross-domain examples + - [ ] Enhance documentation with domain comparisons + - [ ] Update all secondary documentation + +4. **Low Priority** (As Time Permits): + - [ ] Refactor variable names for clarity + - [ ] Create educational materials + - [ ] Consider broader community communication + +## Success Criteria + +- All documentation accurately reflects domain-agnostic nature of iSamples +- No misleading references to "archaeology-specific" entity types +- Clear explanation of the three-layer architecture (PQG → iSamples → Domain Data) +- Examples demonstrate cross-domain capabilities +- Community understanding aligned with correct model + +## Notes + +- This correction actually makes the iSamples model MORE powerful - it's a universal framework +- Emphasize the positive: enables cross-domain discovery and integration +- Use this as an opportunity to showcase the model's flexibility + +## References + +- iSamples LinkML Schema: https://isamplesorg.github.io/metadata/ +- PQG Documentation: `/Users/raymondyee/C/src/iSamples/pqg/isamples/README.md` +- Eric Kansa discussion: 2025-09-26 (clarified domain-agnostic nature) \ No newline at end of file diff --git a/JUPYTEXT_WORKFLOW.md b/JUPYTEXT_WORKFLOW.md new file mode 100644 index 0000000..af52fa3 --- /dev/null +++ b/JUPYTEXT_WORKFLOW.md @@ -0,0 +1,315 @@ +# Jupytext Workflow for iSamples Notebooks + +This guide explains how to use **jupytext** to pair `.ipynb` notebooks with `.py` companion files for better git diffing and Claude Code editing. + +## The Problem + +1. **Git diffs of `.ipynb` files are messy**: Execution outputs, cell metadata, and JSON noise obscure actual code changes +2. **Claude Code file size limits**: Large notebooks with outputs can exceed token limits +3. 
**Merge conflicts**: Notebooks with outputs create unnecessary conflicts + +## The Solution: Jupytext Pairing + +**Jupytext** creates a two-file system: +- **`.ipynb`** - Full notebook with outputs (local development, `.gitignore`'d or committed) +- **`.py`** - Clean Python representation (version controlled, diffed, edited) + +### Benefits + +✅ **For Git**: Diff/review `.py` files with clean code (no output noise) +✅ **For Claude Code**: Edit `.py` files directly (no token limit issues) +✅ **For Humans**: Keep `.ipynb` with outputs for local Jupyter development +✅ **Auto-sync**: Changes to either file automatically update the other + +--- + +## Quick Start + +### 1. Pair a Notebook + +```bash +# Pair single notebook +~/bin/nb_pair.sh examples/basic/my_notebook.ipynb + +# Pair all notebooks in directory +~/bin/nb_pair.sh examples/**/*.ipynb +``` + +This creates: +- `my_notebook.ipynb` (original, with outputs) +- `my_notebook.py` (new, clean code with `# %%` cell markers) + +### 2. Update .gitignore (Usually NOT Needed!) + +**Recommended: Commit BOTH files** +```bash +# No .gitignore changes needed +# Commit both .ipynb (with outputs) and .py (clean code) +git add notebook.ipynb notebook.py +``` + +**Why commit both?** +- ✅ Outputs are valuable (show results without re-running) +- ✅ Reviewers can see both code and results +- ✅ `.py` file provides clean diffs for code review +- ✅ `.ipynb` provides rendered outputs on GitHub +- ✅ Best of both worlds! + +**Only ignore .ipynb if:** +- Outputs are huge (>10MB) or contain sensitive data +- Notebooks change frequently with trivial output differences +- CI/CD regenerates outputs automatically + +```gitignore +# Only if you have a specific reason: +# *.ipynb +``` + +### 3. 
Normal Workflow + +**Option A: Edit in Jupyter (recommended)** +```bash +# Edit notebook in Jupyter as usual +jupyter lab + +# Jupytext auto-syncs .ipynb ↔ .py on save +# Commit BOTH files +git add examples/basic/my_notebook.ipynb examples/basic/my_notebook.py +git commit -m "Update analysis query" +``` + +**Option B: Edit .py directly (for Claude Code)** +```bash +# Claude edits the .py file +# Then sync back to .ipynb: +~/bin/nb_pair.sh --sync examples/basic/my_notebook.py + +# Or use jupytext directly: +jupytext --sync examples/basic/my_notebook.py +``` + +--- + +## Detailed Usage + +### Pairing Commands + +```bash +# Pair notebook (creates .py companion) +~/bin/nb_pair.sh notebook.ipynb + +# Sync after editing either file +~/bin/nb_pair.sh --sync notebook.ipynb + +# Remove pairing (keeps .py file) +~/bin/nb_pair.sh --unpair notebook.ipynb +``` + +### Manual Jupytext Commands + +```bash +# Pair with percent format (# %% cell markers) +jupytext --set-formats ipynb,py:percent notebook.ipynb + +# Sync changes between files +jupytext --sync notebook.ipynb + +# Convert without pairing +jupytext --to py:percent notebook.ipynb +``` + +### Understanding the .py Format + +The `.py` companion uses **percent format**: + +```python +# %% [markdown] +# # My Notebook Title +# This is a markdown cell + +# %% +import pandas as pd +print("This is a code cell") + +# %% +# Another code cell +result = pd.DataFrame({'a': [1, 2, 3]}) +``` + +**Benefits of percent format**: +- Valid Python file (can run with `python notebook.py`) +- Clear cell boundaries (`# %%`) +- Claude Code can edit directly +- Git diffs show actual code changes + +--- + +## Git Configuration (Optional) + +Enable better notebook diffs in git: + +```bash +# Configure jupytext as diff driver +git config diff.jupytext.command 'jupytext --to md --set-formats - -o -' + +# Add to .gitattributes (already done) +echo '*.ipynb diff=jupytext' >> .gitattributes +``` + +--- + +## Recommended Workflows + +### Workflow 1: 
Development (Jupyter + Pairing) + +1. **Pair notebook**: `~/bin/nb_pair.sh notebook.ipynb` +2. **Edit in Jupyter**: Work normally, outputs saved to `.ipynb` +3. **Commit both files**: `.py` is auto-synced on save; commit it together with the `.ipynb` +4. **Review**: Git diffs of the `.py` file show clean code changes + +**Best for**: Active development with outputs + +### Workflow 2: Claude Code Editing + +1. **Pair notebook**: `~/bin/nb_pair.sh notebook.ipynb` +2. **Claude edits .py**: No token limits, clean diffs +3. **Sync back**: `~/bin/nb_pair.sh --sync notebook.ipynb` +4. **Run in Jupyter**: Execute to generate outputs + +**Best for**: Large refactoring, architecture changes + +### Workflow 3: Existing Notebooks + +```bash +# Pair all existing notebooks +find examples -name "*.ipynb" -exec ~/bin/nb_pair.sh {} \; + +# Add both files to git (recommended) +git add examples/**/*.ipynb examples/**/*.py +git commit -m "Add jupytext pairing for all notebooks" + +# Only if outputs are huge or sensitive, ignore .ipynb files instead: +# echo "*.ipynb" >> .gitignore +``` + +--- + +## Troubleshooting + +### Q: Changes not syncing? + +**A**: Manually sync: +```bash +~/bin/nb_pair.sh --sync notebook.ipynb +# or +jupytext --sync notebook.ipynb +``` + +### Q: Want to stop pairing? + +**A**: Unpair and delete .py: +```bash +~/bin/nb_pair.sh --unpair notebook.ipynb +rm notebook.py +``` + +### Q: Merge conflict in .ipynb? + +**A**: Resolve in .py file, then sync: +```bash +# Fix conflict in notebook.py +git add notebook.py +~/bin/nb_pair.sh --sync notebook.ipynb +``` + +### Q: Claude Code still hits token limits? 
+ +**A**: Edit the `.py` file directly: +- Claude reads `notebook.py` (clean, no outputs) +- Makes changes +- Sync: `~/bin/nb_pair.sh --sync notebook.py` +- Run in Jupyter to regenerate outputs + +--- + +## Integration with Existing Tools + +### With nb_source_diff.py + +Both tools complement each other: +- **nb_source_diff.py**: Diff `.ipynb` files without outputs (one-off) +- **jupytext pairing**: Permanent `.py` companions (ongoing) + +Use **jupytext** for: +- Claude Code editing (avoids token limits) +- Normal git workflow (commit .py, diff .py) +- Long-term maintenance + +Use **nb_source_diff.py** for: +- Quick diffs of unpaired notebooks +- Legacy notebooks you don't want to pair +- Ad-hoc comparisons + +### With Git Hooks (Advanced) + +Auto-sync on commit: + +```bash +#!/bin/bash +# Save as .git/hooks/pre-commit (the shebang must be the first line) +for ipynb in $(git diff --cached --name-only | grep '\.ipynb$'); do + jupytext --sync "$ipynb" + git add "${ipynb%.ipynb}.py" +done +``` + +--- + +## Examples in This Repository + +### Paired Notebooks (After Setup) + +``` +examples/basic/ +├── oc_parquet_analysis_enhanced.ipynb (full notebook with outputs) +└── oc_parquet_analysis_enhanced.py (clean code, version controlled) +``` + +**Git workflow**: +```bash +# Edit in Jupyter +jupyter lab examples/basic/oc_parquet_analysis_enhanced.ipynb + +# Auto-synced to .py on save +# Commit clean code +git add examples/basic/oc_parquet_analysis_enhanced.py +git commit -m "Add geographic classification analysis" + +# PR reviewers see clean Python diff, not JSON noise +``` + +--- + +## Best Practices + +1. **✅ DO**: Pair notebooks you actively develop +2. **✅ DO**: Commit BOTH `.ipynb` and `.py` files (usually!) +3. **✅ DO**: Review `.py` diffs in PRs for code changes +4. **✅ DO**: Check `.ipynb` diffs for output changes +5. **✅ DO**: Let Claude Code edit `.py` files directly +6. **⚠️ CONSIDER**: Ignoring `.ipynb` files if outputs are huge/sensitive +7. 
**❌ DON'T**: Manually edit both files separately (sync instead) +8. **❌ DON'T**: Commit `.ipynb` AND `.py` with conflicting changes + +--- + +## References + +- **Jupytext docs**: https://jupytext.readthedocs.io/ +- **Helper script**: `~/bin/nb_pair.sh` +- **Diff tool**: `~/bin/nb_source_diff.py` +- **This guide**: `JUPYTEXT_WORKFLOW.md` (repository root) + +--- + +**Last Updated**: 2025-10-15 by Claude Code diff --git a/PQG_INTEGRATION_PLAN.md b/PQG_INTEGRATION_PLAN.md new file mode 100644 index 0000000..0dcc285 --- /dev/null +++ b/PQG_INTEGRATION_PLAN.md @@ -0,0 +1,448 @@ +# PQG Integration Plan for oc_parquet_analysis_enhanced.ipynb + +**Date**: 2025-11-11 +**Goal**: Integrate the `pqg` library to simplify graph operations while preserving domain-specific logic + +--- + +## Executive Summary + +The notebook currently has **100 cells** with **29 recursive CTEs** and complex SQL joins for property graph traversal. The `pqg` library can simplify many of these operations while maintaining performance for OpenContext-specific analysis. + +**Strategy**: Hybrid approach +- ✅ Use PQG for: Node retrieval, edge traversal, entity queries +- ✅ Keep custom SQL for: Visualization aggregations, bulk operations, performance-critical queries +- ✅ Add comparison sections showing both approaches + +--- + +## Current State Analysis + +### 🔍 What the Notebook Does + +**Core Functions (11 custom query functions)**: +1. `get_sample_locations_for_viz()` - Extract samples with coordinates for mapping +2. `get_sample_geo_context_via_sample_pid()` - Get all geographic context for a sample +3. `get_samples_for_geo_pid()` - Reverse: get samples at a geographic location +4. `export_site_subgraph()` - Export all data for a site pattern +5. 
**Eric's 4 authoritative queries** (already implemented in SQL): + - `get_sample_data_via_sample_pid()` + - `get_sample_data_agents_sample_pid()` + - `get_sample_types_and_keywords_via_sample_pid()` + - `get_samples_at_geo_cord_location_via_sample_event()` +6. `get_sampling_sites_by_name()` - Site search by name pattern +7. `ark_to_url()` - URL conversion utility + +**Heavy SQL Patterns**: +- **29 cells** use CTEs/recursive queries for graph traversal +- **35 cells** use multi-table joins +- **20 cells** use aggregation (GROUP BY) +- **15 cells** filter by entity type (WHERE otype=...) + +**Domain-Specific Logic**: +- Path 1 vs Path 2 geographic traversal patterns +- Three-category geo classification (both paths, Path 1 only, Path 2 only) +- Site-level vs event-level coordinate analysis +- Material type categorization +- Data quality checks (orphaned nodes, location quality) + +--- + +## Integration Strategy + +### Phase 1: Setup & Basic Queries (Easy Wins) + +**Replace simple entity queries with PQG methods** + +#### 1.1 Entity Type Counts +**Current** (Cell 10): +```python +# SQL +SELECT otype, COUNT(*) as count FROM pqg +WHERE otype != '_edge_' +GROUP BY otype +``` + +**With PQG**: +```python +from pqg import PQG + +# Load parquet as PQG instance +pqg_instance = PQG(conn) +pqg_instance._table = 'pqg' +pqg_instance._isparquet = True + +# Get entity type distribution +entity_counts = {} +for pid, otype in pqg_instance.getIds(maxrows=100000): + entity_counts[otype] = entity_counts.get(otype, 0) + 1 + +# Or use SQL for performance on large datasets (keep current approach) +``` + +**Recommendation**: Keep SQL for aggregations (faster), use PQG for learning/examples + +#### 1.2 Edge Predicate Exploration +**Current** (Cell 12): +```python +SELECT p, COUNT(*) FROM pqg WHERE otype='_edge_' GROUP BY p +``` + +**With PQG**: +```python +# Get all relationships +predicates = {} +for subject, predicate, obj in pqg_instance.getRelations(): + predicates[predicate] = 
predicates.get(predicate, 0) + 1 +``` + +**Recommendation**: Keep SQL (much faster for 9M+ edges) + +#### 1.3 Node Retrieval by PID +**Current**: Direct SQL SELECT +**With PQG**: +```python +# Get a single node with all properties +sample_node = pqg_instance.getNode("ark:/28722/k2wq0b20z", max_depth=0) +print(sample_node) + +# With depth=1, automatically expands related nodes +sample_with_relations = pqg_instance.getNode("ark:/28722/k2wq0b20z", max_depth=1) +``` + +**Recommendation**: ✅ **USE PQG** - Simpler API, handles row_id conversion automatically + +--- + +### Phase 2: Graph Traversal Functions (Medium Complexity) + +**Replace custom recursive CTEs with PQG traversal methods** + +#### 2.1 `get_sample_geo_context_via_sample_pid()` + +**Current approach**: Multi-hop SQL join +```sql +-- Find event +SELECT e.o[1] as event_pid +FROM pqg e +WHERE e.s = (SELECT row_id FROM pqg WHERE pid = sample_pid) + AND e.p = 'produced_by' + +-- Find geo via event +SELECT g.pid FROM pqg g +JOIN pqg e ON g.row_id = e.o[1] +WHERE e.s = event_row_id AND e.p = 'sample_location' +``` + +**With PQG**: +```python +def get_sample_geo_context_via_sample_pid_pqg(pqg_instance, sample_pid): + """Get geographic context using PQG graph traversal""" + + # Get sample node with edges expanded (depth=1 gets immediate neighbors) + sample = pqg_instance.getNode(sample_pid, max_depth=1) + + # Navigate to event via produced_by edge + event_pid = sample.get('produced_by') # Auto-expanded by max_depth=1 + if not event_pid: + return None + + # Get event with its edges + event = pqg_instance.getNode(event_pid, max_depth=1) + + # Extract geographic context + geo_context = { + 'sample_location': event.get('sample_location'), # Path 1 + 'sampling_site': event.get('sampling_site') # Path 2 (site) + } + + return geo_context +``` + +**Comparison**: +- PQG: More readable, handles row_id conversion, 3 API calls +- SQL: Faster for bulk, single query, but complex +- **Recommendation**: ✅ **Show both** - PQG for 
clarity, SQL for performance + +#### 2.2 `get_samples_for_geo_pid()` - Reverse Traversal + +**Current**: Complex SQL with UNION for Path 1 + Path 2 +**With PQG**: +```python +def get_samples_for_geo_pid_pqg(pqg_instance, geo_pid, mode='either_or'): + """Find samples connected to a geographic location (reverse traversal)""" + + # Path 1: geo <- sample_location <- event <- produced_by <- sample + path1_samples = [] + for subj, pred, obj in pqg_instance.getRelations(obj=geo_pid, predicate='sample_location'): + event_pid = subj # Event that has this geo as sample_location + event = pqg_instance.getNode(event_pid, max_depth=1) + + # Find samples produced by this event + for s2, p2, o2 in pqg_instance.getRelations(obj=event_pid, predicate='produced_by'): + sample_pid = s2 + path1_samples.append(sample_pid) + + # Path 2: geo <- site_location <- site <- sampling_site <- event <- produced_by <- sample + # (Similar pattern, more hops) + + return path1_samples +``` + +**Comparison**: +- PQG: Clear step-by-step traversal +- SQL: Single query with joins, much faster +- **Recommendation**: ✅ **Show both** - PQG for learning, SQL for production + +--- + +### Phase 3: Domain-Specific Optimizations (Keep Custom) + +**These should remain as custom SQL - they're OpenContext-specific and performance-critical** + +#### 3.1 Visualization Queries +**Keep as SQL**: +- `get_sample_locations_for_viz()` - Optimized for 10K+ samples, specific column selection +- Geographic classification queries (3-category analysis) +- Coordinate extraction for mapping + +**Reason**: Need bulk aggregation, specific projections, performance-critical + +#### 3.2 Eric's Authoritative Queries +**Keep as SQL**: +- All 4 of Eric's queries are already optimized and tested +- They use specific column selections not available in PQG API +- Performance-critical for web UI + +**Reason**: Production-tested, web application integration + +#### 3.3 Data Quality Analysis +**Keep as SQL**: +- Orphaned node detection +- 
Location quality checks +- Summary statistics + +**Reason**: Require full table scans and aggregations + +--- + +## Implementation Plan + +### Step 1: Add PQG Setup Section (New Cell) + +Insert after Cell 6 (data loading): + +```python +# === PQG Integration Setup === + +from pqg import PQG + +# Create PQG instance from loaded parquet +def create_pqg_instance(conn, table_name='pqg'): + """Initialize PQG wrapper around parquet data""" + pqg_instance = PQG(dbinstance=conn) + pqg_instance._table = table_name + pqg_instance._isparquet = True # Read-only mode + pqg_instance._node_pk = 'pid' # Primary lookup field + return pqg_instance + +pqg_instance = create_pqg_instance(conn) + +print("✅ PQG instance created") +print(f"Table: {pqg_instance._table}") +print(f"Read-only mode: {pqg_instance._isparquet}") +``` + +### Step 2: Add Comparison Sections (Incremental) + +For each major query pattern, add a comparison cell: + +```markdown +### Example: Node Retrieval - SQL vs PQG + +**SQL Approach** (current): +```python +# [existing SQL query] +``` + +**PQG Approach** (alternative): +```python +# [PQG equivalent] +``` + +**Performance Comparison**: +- SQL: X seconds +- PQG: Y seconds +- **Use SQL for**: Bulk operations, aggregations +- **Use PQG for**: Single node traversal, learning, prototyping +``` + +### Step 3: Rewrite 3 Key Functions with PQG (Examples) + +Choose 3 representative functions to show PQG alternative: + +1. ✅ `get_sample_geo_context_via_sample_pid()` - Forward traversal +2. ✅ `get_samples_for_geo_pid()` - Reverse traversal +3. ✅ `export_site_subgraph()` - Subgraph extraction + +**Implementation**: +- Create `_pqg` suffixed versions alongside originals +- Add timing comparisons +- Document when to use each + +### Step 4: Add "PQG Learning Section" (New) + +New section at end of notebook: + +```markdown +## Using PQG for Interactive Exploration + +This section demonstrates using the PQG library for interactive graph exploration. 
+Use these patterns for prototyping and learning. For production queries, use the +optimized SQL versions shown earlier. + +### Basic Operations +- Node retrieval: `pqg_instance.getNode(pid)` +- Edge queries: `pqg_instance.getRelations(subject=..., predicate=...)` +- Entity search: `pqg_instance.getIds(otype="MaterialSampleRecord")` + +### Graph Traversal +[Examples of multi-hop traversal] + +### When to Use PQG vs SQL +[Decision matrix] +``` + +--- + +## Decision Matrix: PQG vs Custom SQL + +| Use Case | PQG | Custom SQL | Rationale | +|----------|-----|------------|-----------| +| Single node lookup | ✅ | ⚠️ | PQG handles row_id conversion, cleaner API | +| Multi-hop traversal (1-3 hops) | ✅ | ⚠️ | PQG more readable, acceptable performance | +| Reverse graph traversal | ⚠️ | ✅ | SQL more efficient for finding "what points to X" | +| Bulk aggregations (10K+ rows) | ❌ | ✅ | SQL dramatically faster | +| Visualization queries | ❌ | ✅ | Need specific projections, performance-critical | +| Data quality analysis | ❌ | ✅ | Requires full table scans | +| Learning/prototyping | ✅ | ⚠️ | PQG clearer for understanding graph structure | +| Production web queries | ❌ | ✅ | Eric's queries already optimized and tested | + +**Legend**: +- ✅ Recommended +- ⚠️ Works but not optimal +- ❌ Not recommended + +--- + +## Expected Benefits + +### Code Clarity +**Before**: +```sql +-- 30 lines of recursive CTE SQL +WITH RECURSIVE traverse AS (...) +SELECT ... FROM traverse JOIN ... 
+``` + +**After**: +```python +# 5 lines of PQG +sample = pqg_instance.getNode(sample_pid, max_depth=1) +event = pqg_instance.getNode(sample['produced_by'], max_depth=1) +geo = event['sample_location'] +``` + +### Learning Value +- **New users** can understand graph structure via PQG API +- **SQL experts** can see equivalent SQL for optimization +- **Comparison sections** show tradeoffs + +### Maintainability +- Less SQL to maintain for simple queries +- PQG handles schema changes (row_id conversion) +- Clear separation: PQG for exploration, SQL for production + +--- + +## Risks & Mitigations + +### Risk 1: Performance Regression +**Concern**: PQG might be slower for large queries +**Mitigation**: +- ✅ Keep all existing SQL queries as primary +- ✅ Add PQG as **alternative** in comparison sections +- ✅ Benchmark and document performance differences + +### Risk 2: API Limitations +**Concern**: PQG might not support all OpenContext-specific patterns +**Mitigation**: +- ✅ Use hybrid approach - PQG for basics, SQL for advanced +- ✅ Document gaps in "When to Use PQG vs SQL" section +- ✅ Contribute improvements back to pqg library if needed + +### Risk 3: Notebook Complexity +**Concern**: Adding PQG might make notebook harder to follow +**Mitigation**: +- ✅ Use collapsible sections for alternatives +- ✅ Clear headers: "SQL Approach" vs "PQG Approach" +- ✅ Summary tables showing when to use each + +--- + +## Success Criteria + +After integration, the notebook should: + +1. ✅ **Preserve all existing functionality** - Every query still works +2. ✅ **Show PQG alternatives** for 5-10 common patterns +3. ✅ **Include performance comparisons** - Clear benchmarks +4. ✅ **Have clear guidance** - Decision matrix for when to use each +5. ✅ **Be more accessible** - New users can learn via PQG, then optimize with SQL +6. ✅ **Maintain performance** - Production queries unchanged + +--- + +## Next Steps + +### Immediate (30 minutes) +1. ✅ Add PQG setup cell (Step 1) +2. 
✅ Test basic operations (`getNode()`, `getRelations()`) +3. ✅ Verify row_id conversion works correctly + +### Short-term (2-3 hours) +4. ✅ Rewrite 1 function with PQG: `get_sample_geo_context_via_sample_pid()` +5. ✅ Add comparison section with timing +6. ✅ Document findings + +### Medium-term (1-2 sessions) +7. ✅ Add 2 more PQG function examples +8. ✅ Create "PQG Learning Section" at end +9. ✅ Add decision matrix to README + +### Long-term (optional) +10. ⏭️ Extract common patterns into helper module +11. ⏭️ Contribute enhancements back to pqg library +12. ⏭️ Create tutorial notebook: "Graph Queries with PQG" + +--- + +## Questions to Resolve + +1. **Performance baseline**: What's acceptable slowdown for PQG clarity benefits? + - Suggestion: 2-3x slower OK for single-node queries, not for bulk + +2. **API gaps**: Does PQG support reverse traversal efficiently? + - Need to test `getRelations(obj=geo_pid)` performance + +3. **Integration pattern**: Separate notebook or integrated sections? + - **Recommendation**: Integrated comparison sections (more useful) + +4. **Documentation location**: Where to put "When to use PQG" guide? + - **Recommendation**: Both in notebook AND in isamples-python README + +--- + +**Prepared by**: Claude Code (Sonnet 4.5) +**Date**: 2025-11-11 +**Next Action**: Discuss this plan, then implement Step 1 (PQG setup) diff --git a/QUICKREF_NOTEBOOKS.md b/QUICKREF_NOTEBOOKS.md new file mode 100644 index 0000000..484b221 --- /dev/null +++ b/QUICKREF_NOTEBOOKS.md @@ -0,0 +1,167 @@ +# Quick Reference: Notebook Workflows + +## Two Tools for Different Needs + +### 1. **nb_source_diff.py** - Quick Diffs Without Outputs +```bash +# One-off diff of any notebook vs git history +nb-diff notebook.ipynb HEAD +nb-diff notebook.ipynb HEAD~5 +``` +**Use when**: Quick comparison, unpaired notebooks, legacy files + +--- + +### 2. 
**jupytext pairing** - Permanent .py Companions +```bash +# Pair notebook (creates .py file) +~/bin/nb_pair.sh notebook.ipynb + +# Sync after editing +~/bin/nb_pair.sh --sync notebook.ipynb +``` +**Use when**: Active development, Claude Code editing, clean git workflow + +--- + +## Decision Tree + +``` +Need to diff a notebook? +├─ One-time comparison → nb-diff +└─ Ongoing development → jupytext pair + +Claude Code hitting token limits? +└─ Pair with jupytext, edit .py file + +Want clean git diffs? +├─ Quick → nb-diff +└─ Permanent → jupytext pair + commit .py + +Collaborating on notebooks? +└─ Pair all notebooks, commit .py files +``` + +--- + +## Setup New Notebook (Recommended) + +```bash +# 1. Create notebook in Jupyter +jupyter lab + +# 2. Pair immediately +~/bin/nb_pair.sh examples/basic/my_analysis.ipynb + +# 3. Add to git +git add examples/basic/my_analysis.py + +# 4. Develop normally - changes auto-sync +``` + +--- + +## Quick Commands Cheat Sheet + +```bash +# DIFF TOOLS +nb-diff notebook.ipynb # vs HEAD +nb-diff notebook.ipynb HEAD~3 # vs 3 commits ago + +# PAIRING +~/bin/nb_pair.sh notebook.ipynb # Pair (create .py) +~/bin/nb_pair.sh --sync notebook.ipynb # Sync changes +~/bin/nb_pair.sh examples/**/*.ipynb # Pair all + +# JUPYTEXT DIRECT +jupytext --set-formats ipynb,py:percent notebook.ipynb # Pair +jupytext --sync notebook.ipynb # Sync +``` + +--- + +## Claude Code Editing Workflow + +### Problem: Large notebook with outputs exceeds token limits + +### Solution: Edit .py companion + +```bash +# 1. Pair notebook (if not already paired) +~/bin/nb_pair.sh notebook.ipynb + +# 2. Tell Claude to edit: notebook.py (NOT .ipynb) +# Claude edits clean .py file without output noise + +# 3. Sync back to notebook +~/bin/nb_pair.sh --sync notebook.ipynb + +# 4. 
Run in Jupyter to regenerate outputs +jupyter lab +``` + +--- + +## Git Workflow with Pairing + +```bash +# Development +jupyter lab my_notebook.ipynb # Edit in Jupyter +# Changes auto-sync to my_notebook.py on save + +# Commit BOTH files (recommended!) +git status # Shows both files changed +git diff my_notebook.py # Review code changes (clean!) +git diff my_notebook.ipynb # Review output changes (optional) +git add my_notebook.ipynb my_notebook.py +git commit -m "Add new analysis" + +# PR Review +# Reviewers can: +# - Check .py for code changes (clean diffs) +# - Check .ipynb for output changes (rendered on GitHub) +# - Best of both worlds! +``` + +--- + +## Migration: Existing Notebooks + +```bash +# Pair all notebooks in project +find examples -name "*.ipynb" -exec ~/bin/nb_pair.sh {} \; + +# Commit BOTH .ipynb and .py files (recommended) +git add examples/**/*.ipynb examples/**/*.py +git commit -m "Add jupytext pairing to all notebooks" + +# Or if outputs are problematic (less common): +# git add examples/**/*.py +# echo "*.ipynb" >> .gitignore +# git commit -m "Add .py companions, ignore .ipynb" +``` + +--- + +## Troubleshooting + +| Problem | Solution | +|---------|----------| +| Changes not syncing | `~/bin/nb_pair.sh --sync notebook.ipynb` | +| Claude hits token limit | Edit `notebook.py` instead of `notebook.ipynb` | +| Git diff too noisy | Use `nb-diff` or pair with jupytext | +| Want to stop pairing | `~/bin/nb_pair.sh --unpair notebook.ipynb` | +| Merge conflict | Resolve in `.py`, then `--sync` | + +--- + +## Files & Docs + +- **Helper script**: `~/bin/nb_pair.sh` +- **Diff tool**: `~/bin/nb_source_diff.py` +- **Full guide**: `JUPYTEXT_WORKFLOW.md` +- **This quickref**: `QUICKREF_NOTEBOOKS.md` + +--- + +**Last Updated**: 2025-10-15 diff --git a/README.md b/README.md index b620f86..e85d68d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,114 @@ -# examples -Examples of interacting with iSamples API +# isamples-python + +Python client library and 
examples for working with iSamples material sample data across scientific domains (geology, biology, archaeology, etc.), with a focus on high-performance geoparquet analysis and visualization. + +## Quick Start + +```bash +# Install dependencies +poetry install --with examples + +# Activate environment +poetry shell + +# Launch Jupyter for examples +jupyter lab examples/ +``` + +## Overview + +This repository provides Python tools for analyzing material sample data from the iSamples project. The iSamples metadata model is **domain-agnostic**, supporting samples from geology, biology, archaeology, environmental science, and other fields. Originally designed to work with the iSamples API, it has evolved to focus on **offline-first, geoparquet-centric workflows** using modern spatial data tools. + +### Key Capabilities + +- **High-performance visualization** with [Lonboard](https://github.com/developmentseed/lonboard) WebGL mapping +- **Efficient spatial queries** using DuckDB on remote parquet files +- **Interactive Jupyter notebooks** for cross-domain sample data exploration +- **API-independent workflows** accessing data via HTTP range requests + +## Architecture + +### Python Client Library (`src/isamples_client/`) + +Three client classes for different use cases: + +1. **`IsbClient`**: Basic HTTP client using httpx +2. **`IsbClient2`**: Enhanced Solr client with complex query support +3. **`ISamplesBulkHandler`**: Bulk data operations with authentication + +**Note**: API clients currently target `https://central.isample.xyz/isamples_central/` which may be offline. See [STATUS.md](STATUS.md) for current issues and workarounds. 
+ +### Key Examples + +- **`examples/basic/geoparquet.ipynb`** ⭐ - Advanced lonboard visualization with zoom-layered rendering +- **`examples/basic/oc_parquet_analysis_enhanced.ipynb`** ⭐ - **NEW**: iSamples property graph analysis using OpenContext archaeological data with Ibis and DuckDB +- **`examples/basic/isample-archive.ipynb`** - Remote parquet analysis via DuckDB +- **`examples/basic/record_counts.ipynb`** - Quick visualization patterns +- **`examples/basic/oc_parquet_analysis.ipynb`** - Basic OpenContext parquet exploration + +The enhanced OpenContext notebook demonstrates: +- **Property graph traversal** through complex multi-hop joins +- **Ibis vs raw SQL** comparison for readable query construction +- **Corrected relationship paths** for sample-to-location queries +- **Performance optimization** techniques for 11M+ row datasets + +See [examples/README.md](examples/README.md) for detailed notebook descriptions. + +## Technology Stack + +- **Spatial Analysis**: GeoPandas, DuckDB, Shapely, **Ibis** (new) +- **Visualization**: Lonboard, Matplotlib, Folium, Cartopy +- **Data Processing**: Pandas, Polars, PyArrow +- **Jupyter Ecosystem**: IPyWidgets, IPyDatagrid, Sidecar + +## Development + +### Commands + +```bash +# Install with examples dependencies +poetry install --with examples + +# Run tests +poetry run pytest tests/ + +# Run Playwright tests (web scraping) +cd playwright && npx playwright test + +# Docker Jupyter environment +./run_docker.sh [port] # default port 8890 +``` + +### Current Focus: Geoparquet Workflows + +This repository is transitioning from API-dependent to **offline-first geoparquet analysis**: + +- ✅ Remote parquet processing via DuckDB HTTP range requests +- ✅ High-performance WebGL visualization with Lonboard +- ✅ Interactive cross-domain sample data exploration notebooks +- 🚧 API fallback mechanisms and error handling +- 🚧 Consolidated development environment + +See [STATUS.md](STATUS.md) for detailed WIP status and loose ends. 
+ +## Ecosystem Integration + +### Companion Repository: [isamplesorg.github.io](https://github.com/isamplesorg/isamplesorg.github.io) +**Public website with browser-based tutorials and documentation** + +**Complementary roles**: +- 🔗 **This repo (`isamples-python`)**: Local development, advanced analysis, Python ecosystem +- 🌐 **Website repo**: Public tutorials, universal browser access, educational content + +**Shared technology**: Both use DuckDB + geoparquet for efficient data analysis +- Same data sources (Zenodo archives, HTTP range requests) +- Compatible visualization approaches (lonboard ↔ Observable Plot) +- Coordinated development patterns + +See [CROSS_REPO_ALIGNMENT.md](CROSS_REPO_ALIGNMENT.md) for detailed integration strategy. + +## Related Projects + +- [iSamples](https://www.isamples.org/) - Internet of Samples project (domain-agnostic material sample metadata) +- [Lonboard](https://github.com/developmentseed/lonboard) - Fast geospatial visualization +- [DuckDB](https://duckdb.org/) - High-performance analytical database diff --git a/SESSION_SUMMARY.md b/SESSION_SUMMARY.md new file mode 100644 index 0000000..07593ec --- /dev/null +++ b/SESSION_SUMMARY.md @@ -0,0 +1,364 @@ +# iSamples Session Summary + +**Date**: 2025-11-13 +**Status**: ✅ **READY** - PQG Demo Notebook Complete, Next Phase Planning + +--- + +## ✅ What We Accomplished + +### 1. Created PQG Demo Notebook +- **File**: `examples/basic/pqg_demo.ipynb` +- **Action**: Built comprehensive working notebook demonstrating PQG library with OpenContext data +- **Testing**: All examples run successfully with 11.6M record parquet file +- **Content**: 5 examples comparing PQG vs SQL approaches +- **Result**: Working foundation for exploring PQG capabilities + +### 2. 
Debugged Initial Implementation +- **Problem**: Initial notebook had critical errors (missing `source` parameter, misunderstood `max_depth` behavior) +- **User Fixed**: Corrected in VSCode with proper PQG initialization +- **Key Learning**: `max_depth=1` returns fully expanded dictionaries, not PIDs +- **Result**: Claude now understands PQG's relationship expansion feature correctly + +### 3. Committed and Pushed Work +- **Commit**: `d5dc75d` on `exploratory` branch +- **Files**: `pqg_demo.ipynb`, `PQG_INTEGRATION_PLAN.md`, `SESSION_SUMMARY.md` +- **Status**: Pushed to `origin/exploratory` + +--- + +## 🔍 Key Findings + +### 1. PQG Demo Is Too SQL-Heavy +**Discovery**: Notebook became "SQL vs PQG comparison" instead of "PQG in action" showcase. + +**User Feedback**: "what is a bit sad about this demo is that the pqg module isn't used that much, right? Lots of custom SQL?" + +**Why it matters**: +- Integration plan's defensive tone ("use SQL for real work") influenced design +- Should celebrate PQG's strengths first, comparisons second +- Current approach feels apologetic rather than exploratory + +**Decision**: Keep current notebook as working baseline, use it to push PQG harder + +### 2. `max_depth` Is More Powerful Than Expected +**Discovery**: When `max_depth=1`, PQG returns **fully expanded dictionaries** for related nodes. + +**What Claude got wrong initially**: +```python +# Claude's broken code: +event_pid = sample.get('produced_by') # Thought this was a PID string +event = pqg_instance.getNode(event_pid) # Unnecessary fetch! + +# Correct usage: +produced_by = sample.get('produced_by') # This is ALREADY a full dict! +event = produced_by # Just use it directly +``` + +**Impact**: PQG's relationship expansion is actually quite powerful - one API call gives you entire neighborhood + +### 3. 
PQG Initialization for Parquet Files +**Discovery**: Parquet files require explicit `source` parameter: +```python +parquet_source = f"read_parquet('{parquet_path}')" +pqg_instance = pqg.PQG(dbinstance=conn, source=parquet_source) +``` + +**Also needed**: Manual `_types` initialization when parquet lacks PQG metadata + +### 4. Identified PQG Enhancement Opportunities +**Areas where PQG could improve**: +1. Bulk relationship queries (current: manual iteration) +2. Reverse traversal optimization (Example 4 returned 0 results) +3. Subgraph extraction (no built-in method) +4. Pattern matching (declarative queries vs manual loops) + +**Next Phase Goal**: Push PQG to its limits to identify concrete enhancement targets for contributing back to pqg library + +--- + +## 📁 Files Generated This Session + +### Keep (Committed to Git) + +#### isamples-python repository +- ✅ `examples/basic/pqg_demo.ipynb` - **NEW** Working PQG demonstration notebook + - 5 examples: single node, relationships, traversal, reverse lookup, aggregations + - Performance comparisons with SQL + - Decision matrix for when to use each approach + - ~30 cells, fully tested + +- ✅ `PQG_INTEGRATION_PLAN.md` - Strategic integration plan (from Nov 11 session) + - 400+ lines analyzing hybrid PQG/SQL approach + - Decision matrix and 4-phase implementation strategy + - Referenced in current work but not modified today + +- ✅ `SESSION_SUMMARY.md` - This file (updated from Nov 11 version) + +**Commit**: `d5dc75d` - "Add PQG demonstration notebook and integration planning" +**Branch**: `exploratory` +**Status**: ✅ Pushed to origin + +### Not Modified (Still Uncommitted from Previous Session) + +- ⏸️ `examples/basic/oc_parquet_analysis_enhanced.ipynb` - Modified but not committed +- ⏸️ `pyproject.toml` - Modified (added seaborn) but not committed +- ⏸️ `examples/basic/isamples_explore.py` - Untracked file + +--- + +## 🎯 Next Steps (Prioritized) + +### 🟢 HIGH Priority (Ready to Execute) + +#### 1. 
Push PQG to Its Limits (30-60 min) 🟡 MEDIUM RISK +**Action**: Modify `pqg_demo.ipynb` to explore PQG's boundaries + +**What to try**: +- Large-scale relationship queries (10K+ nodes) +- Complex multi-hop traversals (4-5 hops) +- Subgraph extraction patterns +- Pattern matching (find all samples with coords + keywords) +- Reverse traversal deep dive (why did Example 4 return 0 results?) + +**Risk**: May discover significant API gaps or performance issues +**Mitigation**: Document limitations as potential enhancement targets + +**Goal**: Generate concrete list of "PQG should be able to do X but can't/struggles" + +#### 2. Decide on Enhancement Strategy (15 min) 🔴 LOW RISK +**Action**: After pushing PQG hard, decide together: + +**Option A**: Contribute enhancements back to pqg library +- Requires: Fork, implement, test, PR +- Timeline: Multi-session effort +- Impact: Benefits entire PQG community + +**Option B**: Create iSamples-specific helper layer +- Requires: New module in isamples-python +- Timeline: Single session +- Impact: Immediate value for iSamples users + +**Option C**: Document patterns and workarounds +- Requires: Update integration plan with "PQG recipes" +- Timeline: Current session +- Impact: Knowledge sharing without code + +### 🟡 MEDIUM Priority (This Week) + +#### 3. Create "PQG Exploration" Notebook (1-2 hours) 🟡 MEDIUM RISK +**Action**: New notebook focused on **discovery workflow**, not comparisons + +**Structure**: +1. "Exploring an Unknown Graph" - Use PQG to understand structure +2. "Cool Graph Queries" - Show off PQG's strengths +3. "Complex Relationships" - Where PQG shines vs SQL pain +4. "When to Switch to SQL" - Honest but at the end + +**Deliverable**: `examples/basic/pqg_exploration.ipynb` + +#### 4. 
Commit Remaining Uncommitted Changes (10 min) 🔴 LOW RISK +**Action**: Review and commit `oc_parquet_analysis_enhanced.ipynb`, `pyproject.toml` + +**Why delayed**: Focus on PQG demo first, clean up after + +### 🔵 LOW Priority (Future) + +#### 5. Extract Visualization Patterns (2-3 hours) +**Action**: Create reusable viz module from notebook patterns +**Reference**: Lonboard patterns from `oc_parquet_analysis_enhanced.ipynb` + +#### 6. Cross-Domain Examples (TBD) +**Action**: Create examples with SESAR/GEOME data (not just OpenContext) +**Goal**: Demonstrate domain-agnostic nature of iSamples model + +--- + +## 🚫 Current Blockers + +**None** ✅ + +All technical work completed successfully: +- ✅ Notebook runs without errors +- ✅ PQG initialization working +- ✅ Git commit and push successful + +**Waiting on**: +- Your decision on how hard to push PQG (in current notebook vs new one) +- Your preference on enhancement strategy (contribute vs helper layer vs document) + +--- + +## 🔧 Technical Setup Notes + +### Repository States + +#### isamples-python +**Location**: `/Users/raymondyee/C/src/iSamples/isamples-python` +**Branch**: `exploratory` +**Remote**: `origin = git@github.com:rdhyee/isamples-python.git` + +**Status**: +``` +✅ Latest commit: d5dc75d (PQG demo + integration plan) +✅ Pushed to origin/exploratory +⏸️ Uncommitted: oc_parquet_analysis_enhanced.ipynb, pyproject.toml, isamples_explore.py +``` + +**Virtual env**: Managed by Poetry +**Activate**: `cd isamples-python && poetry shell` + +#### pqg +**Location**: `/Users/raymondyee/C/src/iSamples/pqg` +**Branch**: `claude/improve-documentation-011CV19CYZTUTA2CZL5msSTr` (from Nov 11 session) +**Status**: ✅ PR #5 ready for review, all work from Nov 11 complete +**Virtual env**: `.venv/` (uv-managed, Python 3.12.9) + +### Data Files + +#### OpenContext Parquet +**Location**: `~/Data/iSample/pqg_refining/oc_isamples_pqg.parquet` +**Size**: 691MB +**Records**: 11,637,144 total (9.2M edges, 2.4M nodes) +**Schema**: 
INTEGER row_id (validated against PR #4) +**Status**: ✅ Working with PQG demo notebook + +### Key Commands + +**Launch notebook**: +```bash +cd /Users/raymondyee/C/src/iSamples/isamples-python +poetry shell +jupyter lab examples/basic/pqg_demo.ipynb +``` + +**Test PQG with parquet** (standalone): +```bash +cd /Users/raymondyee/C/src/iSamples/pqg +source .venv/bin/activate +python3 << 'EOF' +import duckdb +from pqg import pqg_singletable as pqg + +conn = duckdb.connect() +parquet_path = "~/Data/iSample/pqg_refining/oc_isamples_pqg.parquet" +pqg_instance = pqg.PQG(conn, source=f"read_parquet('{parquet_path}')") +print("✅ PQG loaded") +EOF +``` + +--- + +## 📊 Session Statistics + +**Duration**: ~1.5 hours (13:00-14:30 estimated) +**Repositories touched**: 1 (isamples-python) +**Files created**: 1 (pqg_demo.ipynb) +**Files updated**: 1 (SESSION_SUMMARY.md) +**Commits**: 1 (d5dc75d) +**Lines added**: ~850 (notebook content) +**Key insight**: PQG demo too defensive, need to push boundaries + +--- + +## 🎓 Lessons Learned + +### 1. "Comparison" != "Demonstration" +**Lesson**: The integration plan's hybrid approach led to a notebook that apologizes for PQG instead of showcasing it. + +**Why it matters**: When building demos, lead with strengths. Comparisons belong at the end or in separate documentation. + +**Apply next time**: Start with "what can this do?" before "when should you use something else?" + +### 2. Read the Actual Object Behavior +**Lesson**: Claude assumed `max_depth=1` returned PIDs to fetch. User's debugging revealed it returns full dicts. + +**Why it matters**: API assumptions without testing lead to unnecessarily complex code. + +**Apply next time**: When demonstrating a library, run simple experiments first to understand actual behavior. + +### 3. Parquet-Based PQG Requires Different Initialization +**Lesson**: PQG needs `source=read_parquet(...)` for parquet files, not just a view name. 
+ +**Why it matters**: Different data sources have different initialization patterns. + +**Apply next time**: Check library docs for format-specific initialization requirements. + +### 4. User Feedback Reveals True Intent +**Lesson**: User's "a bit sad" comment revealed the notebook missed the mark on being a PQG showcase. + +**Why it matters**: Honest feedback helps course-correct before investing more time in wrong direction. + +**Action taken**: Shifted strategy to "push PQG hard to find real boundaries" + +--- + +## 🔗 Related Resources + +**Notebooks**: +- Current: `/Users/raymondyee/C/src/iSamples/isamples-python/examples/basic/pqg_demo.ipynb` +- Reference: `examples/basic/oc_parquet_analysis_enhanced.ipynb` (production SQL patterns) + +**Documentation**: +- Integration Plan: `PQG_INTEGRATION_PLAN.md` (Nov 11) +- PQG Repo: https://github.com/isamplesorg/pqg +- PQG Docs: https://github.com/isamplesorg/pqg/tree/main/docs + +**Previous Work**: +- Nov 11 Session: PR #4 merged, PR #5 updated, integration plan created +- Commit history: `git log --oneline exploratory` + +**Dev Journal**: +- Today: `~/dev-journal/daily/2025-11-13.md` +- Project: `~/dev-journal/projects/isamples.md` + +--- + +## Quick Resume Checklist + +**Next session, start here:** + +1. [ ] Read this SESSION_SUMMARY.md +2. [ ] **DECISION**: How to push PQG boundaries? + - Modify existing `pqg_demo.ipynb` OR + - Create new `pqg_exploration.ipynb` OR + - Run experiments in REPL first +3. [ ] Try these PQG challenges: + - Large-scale relationship queries (10K+ nodes) + - Multi-hop traversals (4-5 hops deep) + - Subgraph extraction patterns + - Pattern matching ("find samples with X AND Y") + - Debug why Example 4 returned 0 results +4. [ ] Document limitations discovered +5. [ ] **DECISION**: Enhancement strategy? 
+ - Contribute to PQG library + - Create iSamples helper layer + - Document patterns/workarounds + +--- + +## 📍 Context for Others + +**What was delivered**: +- ✅ PQG demo notebook: Working examples with OpenContext data (11.6M records) +- ✅ All code tested and committed to exploratory branch +- ✅ Claude learned PQG initialization and `max_depth` behavior +- ✅ Identified next phase: Push PQG boundaries to find enhancement opportunities + +**Current state**: +- No new uncommitted work from this session (the remaining uncommitted files pre-date it, from Nov 11) +- Notebook runs successfully +- PQG library integration understood +- Ready for exploration phase + +**Next session can immediately**: +- Run notebook and start experimenting with PQG limits +- Try complex graph queries to stress-test API +- Document enhancement opportunities +- Decide on contribution strategy + +--- + +**Last Updated**: 2025-11-13 by Claude Code (Sonnet 4.5) +**Session Duration**: ~1.5 hours +**Session Status**: ✅ **READY - Foundation Complete, Exploration Phase Next** diff --git a/STATUS.md b/STATUS.md new file mode 100644 index 0000000..4789259 --- /dev/null +++ b/STATUS.md @@ -0,0 +1,116 @@ +# Project Status - iSamples Python + +**Last Updated**: 2025-09-05 +**Branch**: `exploratory` +**Status**: Heavy WIP transitioning from API-dependent to geoparquet-focused workflows + +## Current State Overview + +This repository is in active development, pivoting from iSamples API integration to **offline-first geoparquet analysis** due to the central API being offline. The codebase contains excellent foundations for geological data visualization but has several loose ends that need resolution. 
+ +## 🚨 Critical Issues + +### API Dependency Problems +- **Offline API**: `ISB_SERVER = "https://central.isample.xyz/isamples_central/"` is currently unreachable +- **No fallback mechanisms**: All three client classes (`IsbClient`, `IsbClient2`, `ISamplesBulkHandler`) will fail +- **Authentication workflows broken**: Bulk handler requires tokens from offline service + +### Code Issues +- **lonboard parameter error** in `examples/basic/record_counts.ipynb:69`: Incorrect `zoom`/`center` parameters for Map constructor +- **Hardcoded paths**: Several notebooks contain user-specific paths that need generalization + +## 🚧 Work In Progress Areas + +### 1. Development Environment Inconsistencies +- **Mixed package managers**: Poetry (main) + npm scattered in multiple locations + - Root: `package.json`, `package-lock.json` + - `examples/basic/`: Node.js setup + - `playwright/`: Separate npm environment +- **Node modules duplication**: `node_modules/` in multiple directories + +### 2. JavaScript Integration Experiments +- **Incomplete experiments**: + - `examples/basic/hello_encode.js` - partial implementation + - `examples/spatial/cesium_points.ipynb` - 3D visualization experiments + - `javascript/stream.ipynb` - streaming data experiments +- **Playwright infrastructure**: Web scraping setup but unclear integration purpose + +### 3. Incomplete Example Files +``` +examples/basic/ +├── bone.xlsx + ~$bone.xlsx (Excel temp file - should be cleaned) +├── subset.py (basic operations, undocumented) +├── zenodo_metadata.json (archival metadata, good for offline workflows) +└── hello_encode.js (incomplete JavaScript experiment) +``` + +### 4. 
Testing Infrastructure Gaps +- **Minimal test coverage**: Only basic structure in `tests/` +- **No integration tests**: For the core client classes +- **Playwright tests**: Present but targeting demo todo app, not iSamples functionality + +## ✅ Working Well (Build On These) + +### Excellent Visualization Patterns +- **`examples/basic/geoparquet.ipynb`** contains sophisticated lonboard code: + - Zoom-layered rendering with `create_zoom_layers()` + - Interactive color mapping by geological source + - Efficient WebGL point cloud visualization + - Well-documented functions for reuse + +### Successful API-Free Workflows +- **`examples/basic/isample-archive.ipynb`** demonstrates: + - Remote parquet access via HTTP range requests + - DuckDB efficient spatial queries + - Zenodo archive integration + - No API dependencies + +### Robust Technology Stack +- **Core dependencies** properly managed in `pyproject.toml` +- **Spatial analysis tools**: GeoPandas, DuckDB, Shapely all working +- **Jupyter integration**: ipywidgets, sidecar, etc. functional + +## 🎯 Recommended Next Steps + +### Priority 1: API Issues +1. **Add offline detection** to client classes with graceful fallbacks +2. **Document workarounds** for API-dependent examples +3. **Create mock data** for testing without API access + +### Priority 2: Environment Consolidation +1. **Standardize Node.js usage**: Single package.json or eliminate if unnecessary +2. **Clean up temp files**: Remove `~$bone.xlsx` and similar artifacts +3. **Generalize paths**: Remove user-specific hardcoded paths from notebooks + +### Priority 3: Documentation & Testing +1. **Complete examples/README.md**: Document each notebook's purpose and data requirements +2. **Add integration tests**: For successful offline workflows +3. **Create troubleshooting guide**: Common issues and solutions + +### Priority 4: Code Quality +1. **Fix lonboard parameter errors** in record_counts notebook +2. 
**Extract reusable functions** from geoparquet.ipynb visualization code +3. **Add error handling** throughout the codebase + +## 🔄 Strategic Direction + +**From**: API-dependent geological data analysis +**To**: Offline-first geoparquet workflows with modern spatial tools + +**Key Success Metrics**: +- All examples run without API dependencies +- Clear documentation for new users +- Reusable visualization patterns +- Robust error handling + +## Files Needing Immediate Attention + +1. `examples/basic/record_counts.ipynb` - Fix lonboard Map parameters +2. `src/isamples_client/isbclient.py` - Add offline detection +3. `examples/basic/` - Clean up temporary/experimental files +4. Root directory - Consolidate Node.js dependencies +5. `tests/` - Add meaningful test coverage + +--- + +*This status document should be updated as issues are resolved and new ones discovered.* \ No newline at end of file diff --git a/basic/record_counts.ipynb b/basic/record_counts.ipynb deleted file mode 100644 index 9f22cd1..0000000 --- a/basic/record_counts.ipynb +++ /dev/null @@ -1,630 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "ExecuteTime": { - "end_time": "2023-10-28T13:01:34.491834Z", - "start_time": "2023-10-28T13:01:34.342886Z" - }, - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A \"HTTP/1.1 200 OK\"\n", - "INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A\n" - ] - }, - { - "data": { - "text/plain": [ - "6347967" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import logging\n", - "import typing\n", - "import urllib.parse\n", - "import httpx\n", - "import xarray\n", - "\n", - "ISB_SERVER = \"https://central.isample.xyz/isamples_central/\"\n", - 
"TIMEOUT = 10 #seconds\n", - "USER_AGENT = \"Python/3.11 isamples.examples\"\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "L = logging.getLogger()\n", - "\n", - "class IsbClient:\n", - " \"\"\"A client for iSamples.\n", - " \"\"\"\n", - "\n", - " def __init__(self, isb_server:str=None):\n", - " self.isb_server = ISB_SERVER if isb_server is None else isb_server\n", - " self.isb_server = self.isb_server.strip(\" /\") + \"/\"\n", - " self.session = httpx.Client()\n", - "\n", - " def _request(self, path:str, params=None)->typing.Any:\n", - " headers = {\n", - " \"Accept\": \"application/json\",\n", - " \"User-Agent\": USER_AGENT\n", - " }\n", - " url = urllib.parse.urljoin(self.isb_server, path)\n", - " response = self.session.get(url, params=params, headers=headers, timeout=TIMEOUT)\n", - " L.info(\"url = %s\", response.url)\n", - " return response.json()\n", - "\n", - " def field_names(self)->typing.List[str]:\n", - " \"\"\"Return a list of field names available in the Solr endpoint.\n", - " \"\"\"\n", - " response = self._request(\"thing/select/info\")\n", - " fields = [k for k in response.get(\"schema\",{}).get(\"fields\", {}).keys()]\n", - " return fields\n", - "\n", - " def record_count(self, q:str)->int:\n", - " \"\"\"Number of records matching query q\n", - " \"\"\"\n", - " params = httpx.QueryParams(rows=0, q=q)\n", - " response = self._request(\"thing/select\", params)\n", - " return response.get(\"response\", {}).get(\"numFound\", -1)\n", - "\n", - " def facets(self, q:str, fields:typing.List[str]) -> typing.Dict[str, typing.Dict[str, int]]:\n", - " \"\"\"Get facet values and counts for the records matching query q and specified fields.\n", - "\n", - " Response is a dict of dicts:\n", - " {\n", - " field_name: {\n", - " facet_value: count,\n", - " ...\n", - " },\n", - " ...\n", - " }\n", - " \"\"\"\n", - " params = httpx.QueryParams(rows=0, q=q, facet=\"true\")\n", - " params = params.add(\"facet.mincount\", 0)\n", - " for field in 
fields:\n", - " params = params.add(\"facet.field\", field)\n", - " response = self._request(\"thing/select\", params)\n", - " res = {}\n", - " for field in fields:\n", - " counts = {}\n", - " vals = response.get(\"facet_counts\",{}).get(\"facet_fields\",{}).get(field, [])\n", - " for i in range(0, len(vals), 2):\n", - " k = vals[i]\n", - " v = vals[i+1]\n", - " counts[k] = v\n", - " res[field] = counts\n", - " return res\n", - "\n", - "\n", - " def pivot(self, q:str, dimensions:typing.List[str])-> xarray.DataArray:\n", - " \"\"\"Return an n-dimensional xarray of counts for specified fields\n", - " \"\"\"\n", - "\n", - " def _normalize_facet(v:str):\n", - " return v.strip().lower()\n", - "\n", - " def _get_coordinates(data, dimensions, coordinates):\n", - " \"\"\"Get the coordinate index values from the facet response. \n", - " \"\"\"\n", - " for entry in data:\n", - " v = _normalize_facet(entry.get(\"value\"))\n", - " f = entry.get(\"field\")\n", - " if f is not None and v not in coordinates[f]:\n", - " coordinates[f].append(v)\n", - " _get_coordinates(entry.get(\"pivot\", []), dimensions, coordinates)\n", - "\n", - " def _value_structure(dimensions, coordinates, cdim=0):\n", - " \"\"\"Populate an empty value structure for holding the facet counts\n", - " \"\"\"\n", - " nvalues = len(coordinates[dimensions[cdim]])\n", - " if cdim >= len(dimensions)-1:\n", - " return [0,]*nvalues\n", - " return [_value_structure(dimensions, coordinates, cdim=cdim+1)]*nvalues\n", - "\n", - " def _set_values(values, data, coord):\n", - " \"\"\"Populate the xarray with the facet count values.\n", - " \"\"\"\n", - " for entry in data:\n", - " coord[entry.get(\"field\")] = _normalize_facet(entry.get(\"value\"))\n", - " p = entry.get(\"pivot\", None)\n", - " if p is None:\n", - " values.loc[coord] = values.loc[coord] + entry.get(\"count\")\n", - " else:\n", - " _set_values(values, p, coord)\n", - " coord.popitem()\n", - "\n", - " if len(dimensions) < 2:\n", - " raise ValueError(\"At 
least two dimensions required for pivot.\")\n", - " params = httpx.QueryParams(rows=0, q=q)\n", - " params = params.add(\"facet\", \"true\")\n", - " params = params.add(\"facet.mincount\", 0)\n", - " params = params.add(\"facet.pivot\", \",\".join(dimensions))\n", - " response = self._request(\"thing/select\", params)\n", - " fkey = \",\".join(dimensions)\n", - " data = response.get(\"facet_counts\", {}).get(\"facet_pivot\", {}).get(fkey, [])\n", - " coordinates = {k:[] for k in dimensions}\n", - " _get_coordinates(data, dimensions, coordinates)\n", - " values = _value_structure(dimensions, coordinates)\n", - " xd = xarray.DataArray(values, coords=coordinates, dims=dimensions)\n", - " _set_values(xd, data, {})\n", - " return xd\n", - "\n", - "\n", - "cli = IsbClient()\n", - "cli.record_count(\"*:*\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.field=source&facet.field=hasMaterialCategory&facet.field=hasContextCategory \"HTTP/1.1 200 OK\"\n", - "INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.field=source&facet.field=hasMaterialCategory&facet.field=hasContextCategory\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"source\": {\n", - " \"SESAR\": 4688386,\n", - " \"OPENCONTEXT\": 853229,\n", - " \"GEOME\": 554320,\n", - " \"SMITHSONIAN\": 213411\n", - " },\n", - " \"hasMaterialCategory\": {\n", - " \"Natural Solid Material\": 2233939,\n", - " \"Organic material\": 1108614,\n", - " \"Rock\": 913127,\n", - " \" rock\": 838805,\n", - " \" sediment\": 838805,\n", - " \"Mixed soil\": 838805,\n", - " \"Biogenic non organic material\": 484858,\n", - " \"Material\": 462472,\n", - " \"Mineral\": 
391088,\n", - " \"Biogenic non-organic material\": 346242,\n", - " \"Anthropogenic metal\": 184888,\n", - " \"Natural solid material\": 182909,\n", - " \"Not Provided\": 181260,\n", - " \"Anthropogenic material\": 177576,\n", - " \"Sediment\": 94084,\n", - " \"Soil\": 37153,\n", - " \"Liquid water\": 25777,\n", - " \"Gaseous material\": 1225,\n", - " \"Particulate\": 124,\n", - " \"Non-aqueous liquid material\": 46,\n", - " \"Ice\": 8\n", - " },\n", - " \"hasContextCategory\": {\n", - " \"Not Provided\": 3984022,\n", - " \"Site of past human activities\": 853229,\n", - " \"Earth interior\": 665766,\n", - " \"Animalia\": 391453,\n", - " \"Subaerial surface environment\": 108123,\n", - " \"Marine water body\": 56520,\n", - " \"Marine water body bottom\": 53641,\n", - " \"Lake river or stream bottom\": 14582,\n", - " \"Terrestrial water body\": 10792,\n", - " \"Plantae\": 9417,\n", - " \"Active human occupation site\": 4040,\n", - " \"Fungi\": 3793,\n", - " \"Lake, river or stream bottom\": 1697,\n", - " \"Subsurface fluid reservoir\": 1680,\n", - " \"Marine biome\": 1661,\n", - " \"Chromista\": 1184,\n", - " \"Subaerial terrestrial biome\": 133,\n", - " \"Bacteria\": 4,\n", - " \"Protozoa\": 4\n", - " }\n", - "}\n" - ] - } - ], - "source": [ - "fields = [\"source\", \"hasMaterialCategory\", \"hasContextCategory\"]\n", - "facets = cli.facets(\"*:*\", fields)\n", - "print(json.dumps(facets, indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: GET https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.pivot=source%2ChasMaterialCategory%2ChasContextCategory \"HTTP/1.1 200 OK\"\n", - "INFO:root:url = https://central.isample.xyz/isamples_central/thing/select?rows=0&q=%2A%3A%2A&facet=true&facet.mincount=0&facet.pivot=source%2ChasMaterialCategory%2ChasContextCategory\n" - ] - }, 
- { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "array(4)\n", - "Coordinates:\n", - " source \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourcesesaropencontextgeomesmithsonian
hasMaterialCategory
natural solid material223393918290900
rock175252027200
sediment932889000
mixed soil838805000
material462472000
mineral39079729100
biogenic non-organic material346242000
organic material28183459049405855213411
not provided4717313408700
soil37153000
liquid water25777000
gaseous material1225000
anthropogenic material30117727500
particulate124000
non-aqueous liquid material46000
ice8000
biogenic non organic material048485800
anthropogenic metal018488800
\n", - "" - ], - "text/plain": [ - "source sesar opencontext geome smithsonian\n", - "hasMaterialCategory \n", - "natural solid material 2233939 182909 0 0\n", - "rock 1752520 272 0 0\n", - "sediment 932889 0 0 0\n", - "mixed soil 838805 0 0 0\n", - "material 462472 0 0 0\n", - "mineral 390797 291 0 0\n", - "biogenic non-organic material 346242 0 0 0\n", - "organic material 281834 59049 405855 213411\n", - "not provided 47173 134087 0 0\n", - "soil 37153 0 0 0\n", - "liquid water 25777 0 0 0\n", - "gaseous material 1225 0 0 0\n", - "anthropogenic material 301 177275 0 0\n", - "particulate 124 0 0 0\n", - "non-aqueous liquid material 46 0 0 0\n", - "ice 8 0 0 0\n", - "biogenic non organic material 0 484858 0 0\n", - "anthropogenic metal 0 184888 0 0" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Sum by axis 2 (hasContextCategory) and print\n", - "df = xd.sum(axis=2).to_pandas()\n", - "# display transposed\n", - "display(df.T)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "array(1752520)\n", - "Coordinates:\n", - " source =0.10.0 +ipydatagrid +geopandas +duckdb +pandas>=2.0.0 +numpy +ipywidgets +pyarrow>=12.0.0 diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..e69de29 diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..e69de29 diff --git a/docs/usage.rst b/docs/usage.rst new file mode 100644 index 0000000..e69de29 diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..cb98473 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,208 @@ +# iSamples Examples + +This directory contains Jupyter notebooks and scripts demonstrating different approaches to working with geological sample data from the iSamples project. 
+ +## 🌟 Key Notebooks (Start Here) + +### `basic/geoparquet.ipynb` ⭐ **PRIMARY VISUALIZATION EXAMPLE** +**Status**: ✅ Working (API-independent) +**Focus**: Advanced lonboard WebGL visualization + +**What it does**: +- Loads geological sample data from geoparquet files +- Creates sophisticated WebGL point cloud visualizations using Lonboard +- Implements zoom-layered rendering for performance with large datasets +- Provides interactive controls for filtering by geological source collections +- Demonstrates advanced color mapping and styling techniques + +**Key patterns to reuse**: +```python +# Zoom-layered visualization +layers = create_zoom_layers(gdf, zoom_levels, color_map) + +# Interactive filtering +layer = update_layer_colors(gdf_data, selected_collections) + +# Efficient color mapping +colors = create_color_map(data, color_map, selected_collections) +``` + +### `basic/isample-archive.ipynb` +**Status**: ✅ Working (API-independent) +**Focus**: Remote parquet analysis with DuckDB + +**What it does**: +- Accesses iSamples data via Zenodo archives using HTTP range requests +- Demonstrates efficient spatial queries using DuckDB on remote parquet files +- Shows how to work with geological data without API dependencies +- Examples of spatial filtering, aggregation, and analysis + +**Key pattern**: +```python +# Remote parquet access +conn = duckdb.connect() +result = conn.sql("SELECT * FROM 'https://zenodo.org/.../data.parquet'") +``` + +### `basic/record_counts.ipynb` +**Status**: ⚠️ Has issues (lonboard parameter errors) +**Focus**: Quick visualization patterns + +**What it does**: +- Demonstrates rapid prototyping with `lonboard.viz()` +- Shows basic DuckDB operations on local data +- Quick visualization of record counts and distributions + +**Known issues**: +- Line 69: Incorrect `zoom`/`center` parameters for `Map()` constructor +- Needs parameter fixes for lonboard compatibility + +## 📁 Directory Structure + +``` +examples/ +├── basic/ # Core examples and 
tutorials +│ ├── geoparquet.ipynb ⭐ Main visualization notebook +│ ├── isample-archive.ipynb ✅ Remote parquet analysis +│ ├── record_counts.ipynb ⚠️ Quick patterns (has issues) +│ ├── pgp.ipynb 🧪 Additional lonboard experiments +│ ├── subset.py 📝 Basic Python subset operations +│ ├── bone.xlsx 📊 Sample Excel data +│ └── zenodo_metadata.json 📋 Archive metadata +│ +├── spatial/ # Advanced spatial analysis +│ ├── cesium_points.ipynb 🌐 3D visualization experiments +│ ├── cities.geoparquet 🗺️ Sample spatial data +│ └── bay_area_cities.parquet +│ +├── opencontext/ # OpenContext-specific examples +└── javascript/ # Node.js integration experiments + └── stream.ipynb 🔄 Streaming data patterns +``` + +## 🚀 Getting Started + +### Prerequisites +```bash +# Install dependencies +poetry install --with examples + +# Activate environment +poetry shell + +# Launch Jupyter +jupyter lab +``` + +### Recommended Learning Path + +1. **Start with `basic/geoparquet.ipynb`** - Learn advanced lonboard visualization +2. **Try `basic/isample-archive.ipynb`** - Understand remote data access +3. **Explore `spatial/cesium_points.ipynb`** - See 3D visualization options +4. 
**Check `basic/pgp.ipynb`** - Additional lonboard patterns + +## 📊 Data Sources + +### Working Data Sources (API-independent) +- **Zenodo archives**: Remote parquet files accessible via HTTP +- **Local geoparquet files**: Sample data in `spatial/` directory +- **Excel samples**: `bone.xlsx` for testing data import + +### API-dependent Sources ⚠️ +- **iSamples Central API**: Currently offline (`https://central.isample.xyz/isamples_central/`) +- **Bulk export endpoints**: Require authentication from offline API + +## 🛠️ Common Patterns + +### Lonboard Visualization +```python +from lonboard import Map, ScatterplotLayer +from lonboard.colormap import apply_continuous_cmap + +# Basic pattern +layer = ScatterplotLayer.from_geopandas( + gdf, + get_fill_color=colors, + get_radius=300, + radius_units='meters', + pickable=True +) +map_widget = Map(layers=[layer]) +``` + +### DuckDB Remote Access +```python +import duckdb + +# Connect and query remote parquet +conn = duckdb.connect() +result = conn.sql("SELECT * FROM 'remote_file.parquet' WHERE condition") +gdf = result.to_df() +``` + +### Error Handling for API Issues +```python +try: + # API-dependent code + client = IsbClient() + data = client.search() +except httpx.TransportError: + # Fallback to local/remote parquet + print("API unavailable, using local data") + data = pd.read_parquet('backup_data.parquet') +``` + +## 🐛 Troubleshooting + +### Common Issues + +1. **"No CRS exists on data" warning** + - Usually harmless for visualization + - Add explicit CRS if needed: `gdf.set_crs('EPSG:4326')` + +2. **Lonboard Map parameter errors** + - Don't pass `zoom` and `center` directly to the `Map()` constructor + - Set the initial viewport with `view_state` instead, e.g. `Map(layers=[layer], view_state={"longitude": -122.4, "latitude": 37.8, "zoom": 9})` + +3. **Memory issues with large datasets** + - Use the zoom-layered approach from `geoparquet.ipynb` + - Sample data before visualization: `gdf.sample(n=10000)` + +4. 
**API connection errors** + - Expected behavior - API is currently offline + - Use geoparquet examples for working patterns + +### Performance Tips + +- **Large datasets**: Use DuckDB for filtering before loading into memory +- **Interactive maps**: Implement zoom-based level-of-detail rendering +- **Memory usage**: Sample data for initial exploration, full dataset for final analysis + +## 🌐 Try These Patterns in Your Browser + +Many of the analysis patterns demonstrated in these notebooks have **browser-based equivalents** in our companion website: + +**[iSamples Interactive Tutorials](https://smrgeoinfo.github.io/isamplesorg.github.io/tutorials/)** +- Same datasets, zero installation required +- Observable JS + DuckDB-WASM for in-browser analysis +- Perfect for sharing analyses or trying concepts quickly +- **Performance**: Analyze 300MB datasets in <100MB browser memory + +**Learning Path**: +1. **Start here** for deep analysis and Python ecosystem power +2. **Share results** via website tutorials for broader accessibility +3. 
**Rapid prototyping** in browser, then **advanced analysis** locally + +## 🔗 Related Documentation + +- [Main README](../README.md) - Repository overview +- [STATUS.md](../STATUS.md) - Current issues and WIP areas +- [CLAUDE.md](../CLAUDE.md) - Development guidance +- [CROSS_REPO_ALIGNMENT.md](../CROSS_REPO_ALIGNMENT.md) - Website integration strategy +- [DATA_SOURCES.md](../DATA_SOURCES.md) - Shared data documentation +- [Lonboard Documentation](https://github.com/developmentseed/lonboard) +- [DuckDB Spatial Extension](https://duckdb.org/docs/extensions/spatial) + +--- + +*This README is updated as new examples are added and issues are resolved.* \ No newline at end of file diff --git a/examples/basic/PQG_WIDE_EXPLORATION.md b/examples/basic/PQG_WIDE_EXPLORATION.md new file mode 100644 index 0000000..75bdd7f --- /dev/null +++ b/examples/basic/PQG_WIDE_EXPLORATION.md @@ -0,0 +1,86 @@ +# PQG Wide Schema Exploration Summary + +**Date**: 2025-12-01 +**Context**: Eric Kansa created an experimental "wide" serialization of PQG data + +## Key Insight + +**The 14 ISamplesEdgeType values define the GRAMMAR (semantics) of iSamples relationships.** + +Narrow vs Wide = different SERIALIZATIONS of the same grammar: + +| Aspect | Narrow Schema | Wide Schema | +|--------|---------------|-------------| +| Storage | Edge rows (`otype='_edge_'`, `s`, `p`, `o`) | Predicate columns (`p__produced_by`, etc.) | +| Same 14 edge types? 
| ✅ Yes | ✅ Yes | +| Query pattern | 7 joins (3 edge + 4 entity tables) | 3 joins (entity tables only) | +| Optimized for | Write flexibility | Read performance | + +## Performance Comparison (Local Files) + +| Metric | Narrow | Wide | Improvement | +|--------|--------|------|-------------| +| File size | 690.9 MB | 275.3 MB | **60% smaller** | +| Row count | 11.6M | 2.5M | **79% fewer** (no edge rows) | +| Geolocation query | 142ms | 54ms | **2.6x faster** | + +## Files + +- **Narrow**: `oc_isamples_pqg.parquet` (690.9 MB) +- **Wide**: `oc_isamples_pqg_wide.parquet` (275.3 MB) +- **Wide source**: `https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg_wide.parquet` + +## Edge Type → Column Mapping + +10 of 14 edge types exist in OpenContext data: + +| Edge Type | p__* Column | Has Data? | +|-----------|-------------|-----------| +| MSR_PRODUCED_BY | `p__produced_by` | ✅ 1.1M | +| MSR_REGISTRANT | `p__registrant` | ✅ 422K | +| MSR_KEYWORDS | `p__keywords` | ✅ 1.1M | +| MSR_HAS_CONTEXT_CATEGORY | `p__has_context_category` | ✅ 1.1M | +| MSR_HAS_MATERIAL_CATEGORY | `p__has_material_category` | ✅ 1.1M | +| MSR_HAS_SAMPLE_OBJECT_TYPE | `p__has_sample_object_type` | ✅ 1.1M | +| EVENT_SAMPLING_SITE | `p__sampling_site` | ✅ 1.1M | +| EVENT_SAMPLE_LOCATION | `p__sample_location` | ✅ 1.1M | +| EVENT_RESPONSIBILITY | `p__responsibility` | ✅ 1.1M | +| SITE_LOCATION | `p__site_location` | ✅ 18K | +| MSR_CURATION | `p__curation` | ❌ Not in OC | +| MSR_RELATED_RESOURCE | `p__related_resource` | ❌ Not in OC | +| EVENT_HAS_CONTEXT_CATEGORY | (shares `p__has_context_category`) | ❌ Not in OC | +| CURATION_RESPONSIBILITY | (shares `p__responsibility`) | ❌ Not in OC | + +## Example Queries + +### Wide Schema (simpler) +```sql +SELECT samp.pid, geo.latitude, geo.longitude +FROM pqg_wide AS samp +JOIN pqg_wide AS se ON se.row_id = ANY(samp.p__produced_by) +JOIN pqg_wide AS geo ON geo.row_id = ANY(se.p__sample_location) +WHERE samp.otype = 'MaterialSampleRecord' +``` + +### 
Narrow Schema (more joins) +```sql +SELECT samp.pid, geo.latitude, geo.longitude +FROM pqg AS samp +JOIN pqg AS e1 ON e1.s = samp.row_id AND e1.p = 'produced_by' +JOIN pqg AS se ON se.row_id = ANY(e1.o) +JOIN pqg AS e2 ON e2.s = se.row_id AND e2.p = 'sample_location' +JOIN pqg AS geo ON geo.row_id = ANY(e2.o) +WHERE samp.otype = 'MaterialSampleRecord' AND e1.otype = '_edge_' AND e2.otype = '_edge_' +``` + +## Next Steps + +1. **TypedEdgeQueries dual-mode**: Add schema detection + wide query paths +2. **Conversion function**: Port Eric's `create_pqg_wide_table()` to pqg repo +3. **CLI commands**: `pqg convert --to-wide` / `--to-narrow` + +## References + +- Eric's code: `https://github.com/ekansa/open-context-py/.../isamples_pqg.py` +- PQG PR #6 (typed edges): `https://github.com/isamplesorg/pqg/pull/6` (still open) +- Plan file: `~/.claude/plans/parsed-yawning-knuth.md` diff --git a/examples/basic/archive/geoparquet0.ipynb b/examples/basic/archive/geoparquet0.ipynb new file mode 100644 index 0000000..f999cfd --- /dev/null +++ b/examples/basic/archive/geoparquet0.ipynb @@ -0,0 +1,1176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "70614b0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time for database connection: 0.00 seconds\n", + "Time for creating view: 0.00 seconds\n", + "Time for getting column names: 0.01 seconds\n", + "\n", + "Available columns:\n", + "- sample_identifier\n", + "- label\n", + "- description\n", + "- source_collection\n", + "- has_sample_object_type\n", + "- has_material_category\n", + "- has_context_category\n", + "- informal_classification\n", + "- keywords\n", + "- produced_by\n", + "- curation\n", + "- registrant\n", + "- related_resource\n", + "- sampling_purpose\n", + "- sample_location_longitude\n", + "- sample_location_latitude\n", + "- geometry\n", + "\n", + "Total number of rows: 6,494,541\n", + "Time for counting rows: 0.00 seconds\n", + "Time 
for querying sample rows: 0.29 seconds\n", + "\n", + "First 5 rows:\n", + " sample_identifier label description \\\n", + "0 ark:/21547/DSz2757 757 basisOfRecord: PreservedSpecimen \n", + "1 ark:/21547/DSz2779 779 basisOfRecord: PreservedSpecimen \n", + "2 ark:/21547/DSz2806 806 basisOfRecord: PreservedSpecimen \n", + "3 ark:/21547/DSz2807 807 basisOfRecord: PreservedSpecimen \n", + "4 ark:/21547/DSz2759 759 basisOfRecord: PreservedSpecimen \n", + "\n", + " source_collection has_sample_object_type \\\n", + "0 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "1 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "2 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "3 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "4 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "\n", + " has_material_category \\\n", + "0 [{'identifier': 'https://w3id.org/isample/voca... \n", + "1 [{'identifier': 'https://w3id.org/isample/voca... \n", + "2 [{'identifier': 'https://w3id.org/isample/voca... \n", + "3 [{'identifier': 'https://w3id.org/isample/voca... \n", + "4 [{'identifier': 'https://w3id.org/isample/voca... \n", + "\n", + " has_context_category informal_classification \\\n", + "0 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "1 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "2 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "3 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "4 [{'identifier': 'https://w3id.org/isample/biol... 
[Taricha, granulosa] \n", + "\n", + " keywords \\\n", + "0 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "1 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "2 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "3 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "4 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "\n", + " produced_by curation registrant \\\n", + "0 {'description': 'expeditionCode: newts | proje... \n", + "1 {'description': 'expeditionCode: newts | proje... \n", + "2 {'description': 'expeditionCode: newts | proje... \n", + "3 {'description': 'expeditionCode: newts | proje... \n", + "4 {'description': 'expeditionCode: newts | proje... \n", + "\n", + " related_resource sampling_purpose sample_location_longitude \\\n", + "0 -122.578610 \n", + "1 -122.373055 \n", + "2 -122.117050 \n", + "3 -122.117050 \n", + "4 -122.578610 \n", + "\n", + " sample_location_latitude geometry \n", + "0 38.578888 [1, 1, 0, 0, 0, 222, 200, 60, 242, 7, 165, 94,... \n", + "1 37.385277 [1, 1, 0, 0, 0, 254, 38, 20, 34, 224, 151, 94,... \n", + "2 37.365490 [1, 1, 0, 0, 0, 204, 127, 72, 191, 125, 135, 9... \n", + "3 37.365490 [1, 1, 0, 0, 0, 204, 127, 72, 191, 125, 135, 9... \n", + "4 38.578888 [1, 1, 0, 0, 0, 222, 200, 60, 242, 7, 165, 94,... \n", + "Time for querying sample rows: 0.29 seconds\n", + "\n", + "First 5 rows:\n", + " sample_identifier label description \\\n", + "0 ark:/21547/DSz2757 757 basisOfRecord: PreservedSpecimen \n", + "1 ark:/21547/DSz2779 779 basisOfRecord: PreservedSpecimen \n", + "2 ark:/21547/DSz2806 806 basisOfRecord: PreservedSpecimen \n", + "3 ark:/21547/DSz2807 807 basisOfRecord: PreservedSpecimen \n", + "4 ark:/21547/DSz2759 759 basisOfRecord: PreservedSpecimen \n", + "\n", + " source_collection has_sample_object_type \\\n", + "0 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "1 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "2 GEOME [{'identifier': 'https://w3id.org/isample/voca... 
\n", + "3 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "4 GEOME [{'identifier': 'https://w3id.org/isample/voca... \n", + "\n", + " has_material_category \\\n", + "0 [{'identifier': 'https://w3id.org/isample/voca... \n", + "1 [{'identifier': 'https://w3id.org/isample/voca... \n", + "2 [{'identifier': 'https://w3id.org/isample/voca... \n", + "3 [{'identifier': 'https://w3id.org/isample/voca... \n", + "4 [{'identifier': 'https://w3id.org/isample/voca... \n", + "\n", + " has_context_category informal_classification \\\n", + "0 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "1 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "2 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "3 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "4 [{'identifier': 'https://w3id.org/isample/biol... [Taricha, granulosa] \n", + "\n", + " keywords \\\n", + "0 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "1 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "2 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "3 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "4 [{'keyword': 'California'}, {'keyword': 'USA'}] \n", + "\n", + " produced_by curation registrant \\\n", + "0 {'description': 'expeditionCode: newts | proje... \n", + "1 {'description': 'expeditionCode: newts | proje... \n", + "2 {'description': 'expeditionCode: newts | proje... \n", + "3 {'description': 'expeditionCode: newts | proje... \n", + "4 {'description': 'expeditionCode: newts | proje... \n", + "\n", + " related_resource sampling_purpose sample_location_longitude \\\n", + "0 -122.578610 \n", + "1 -122.373055 \n", + "2 -122.117050 \n", + "3 -122.117050 \n", + "4 -122.578610 \n", + "\n", + " sample_location_latitude geometry \n", + "0 38.578888 [1, 1, 0, 0, 0, 222, 200, 60, 242, 7, 165, 94,... \n", + "1 37.385277 [1, 1, 0, 0, 0, 254, 38, 20, 34, 224, 151, 94,... 
\n", + "2 37.365490 [1, 1, 0, 0, 0, 204, 127, 72, 191, 125, 135, 9... \n", + "3 37.365490 [1, 1, 0, 0, 0, 204, 127, 72, 191, 125, 135, 9... \n", + "4 38.578888 [1, 1, 0, 0, 0, 222, 200, 60, 242, 7, 165, 94,... \n", + "Data loading and sampling time: 1.31 seconds\n", + "Data loading and sampling time: 1.31 seconds\n", + "Total execution time: 2.93 seconds\n", + "Total execution time: 2.93 seconds\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total rows: 6494541\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d45b2709ed6346469199bcb576c2a674", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "TypeError", + "evalue": "count(): incompatible function arguments. The following argument types are supported:\n 1. (self: duckdb.duckdb.DuckDBPyRelation, column: str, groups: str = '', window_spec: str = '', projected_columns: str = '') -> duckdb.duckdb.DuckDBPyRelation\n\nInvoked with: ┌─────────────────────────┬──────────┬──────────────────────────────────────────────────┬───────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────┬───────────────────────────────────────┬────────────────────┬───────────────────────────┬──────────────────────────┬───────────────────────────────┐\n│ sample_identifier │ label │ description │ source_collection │ has_sample_object_type │ has_material_category │ has_context_category │ informal_classification │ keywords │ produced_by │ curation │ registrant │ related_resource │ sampling_purpose │ sample_location_longitude │ sample_location_latitude │ geometry │\n│ varchar │ varchar │ varchar │ varchar │ struct(identifier varchar)[] │ struct(identifier varchar)[] │ struct(identifier varchar)[] │ varchar[] │ struct(keyword varchar)[] │ struct(description varchar, has_feature_of_interest varchar, identifier varchar, \"label\" varchar, responsibility struct(\"name\" varchar, \"role\" varchar)[], result_time varchar, sampling_site struct(description varchar, \"label\" varchar, place_name varchar[], sample_location struct(elevation double, latitude double, longitude double))) │ struct(access_constraints varchar[], curation_location varchar, description varchar, \"label\" varchar, responsibility struct(\"name\" varchar, \"role\" varchar)[]) │ struct(\"name\" varchar) │ struct(target varchar)[] │ varchar[] │ double │ double │ geometry 
│\n├─────────────────────────┼──────────┼──────────────────────────────────────────────────┼───────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────────┼────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────┼───────────────────────────────────────┼────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┤\n│ ark:/21547/DSz2757 │ 757 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': 
https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2757, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2779 │ 779 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2779, 'label': 096c166b0c23c8823678eb43e4c00802 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1893-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.385277, 'longitude': -122.373055}}} │ NULL │ NULL │ NULL │ NULL │ -122.373055 │ 37.385277 │ POINT (-122.373055 37.385277) │\n│ ark:/21547/DSz2806 │ 806 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2806, 'label': 4824d73db4a747e634637f1a3c2978cb newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1893-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.36549, 'longitude': -122.11705}}} │ NULL │ NULL │ NULL │ NULL │ -122.11705 │ 37.36549 │ POINT (-122.11705 37.36549) │\n│ ark:/21547/DSz2807 │ 807 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2807, 'label': 4824d73db4a747e634637f1a3c2978cb newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1893-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.36549, 'longitude': -122.11705}}} │ NULL │ NULL │ NULL │ NULL │ -122.11705 │ 37.36549 │ POINT (-122.11705 37.36549) │\n│ ark:/21547/DSz2759 │ 759 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2759, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2761 │ 761 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2761, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2967 │ 967 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, torosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2967, 'label': 1b092798b61f72c79ff6df1f361b8705 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.669395, 'longitude': -122.63218}}} │ NULL │ NULL │ NULL │ NULL │ -122.63218 │ 38.669395 │ POINT (-122.63218 38.669395) │\n│ ark:/21547/DSz2763 │ 763 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2763, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2979 │ 979 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, torosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2979, 'label': ac6f9b6dd20fd04e411c2db0348524e3 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 34.147778, 'longitude': -118.14361}}} │ NULL │ NULL │ NULL │ NULL │ -118.14361 │ 34.147778 │ POINT (-118.14361 34.147778) │\n│ ark:/21547/DSz21792 │ 1792 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, torosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz21792, 'label': ed5754fe295e377d6da2add6748fb7c0 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1896-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.87103, 'longitude': -122.27711}}} │ NULL │ NULL │ NULL │ NULL │ -122.27711 │ 37.87103 │ POINT (-122.27711 37.87103) │\n│ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │\n│ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │\n│ · │ · │ · │ · │ · │ · │ · │ · │ 
· │ · │ · │ · │ · │ · │ · │ · │ · │\n│ ark:/21547/BNt2CmMQ0005 │ CmMQ0005 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2CmMQ0005}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2ANMQ0165 │ ANMQ0165 │ previousIdentifications: Acanthurus nigricans │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2ANMQ0165, 'label': a8b1f9fa321073f72f9a7881747ab592 IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. 
Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': Marquesas, 'place_name': [French Polynesia, Marquesas], 'sample_location': {'elevation': NULL, 'latitude': -9.0, 'longitude': -139.3}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2ANMQ0165}] │ NULL │ -139.3 │ -9.0 │ POINT (-139.3 -9) │\n│ ark:/21547/BNt2ANMQ0165 │ ANMQ0165 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2ANMQ0165}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2CmMQ0004 │ CmMQ0004 │ previousIdentifications: Ctenochaetus marginatus 
│ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2CmMQ0004, 'label': c8e4608b0409567824c8badfbaef685f IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': ua-huka, 'place_name': [French Polynesia, Marquesas, Ua Huka, ua-huka], 'sample_location': {'elevation': NULL, 'latitude': -8.90864, 'longitude': -139.55984}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2CmMQ0004}] │ NULL │ -139.55984 │ -8.90864 │ POINT (-139.55984 -8.90864) │\n│ ark:/21547/BNt2CmMQ0004 │ CmMQ0004 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': 
Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2CmMQ0004}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2CmMQ0007 │ CmMQ0007 │ previousIdentifications: Ctenochaetus marginatus │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2CmMQ0007, 'label': c8e4608b0409567824c8badfbaef685f IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. 
Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': ua-huka, 'place_name': [French Polynesia, Marquesas, Ua Huka, ua-huka], 'sample_location': {'elevation': NULL, 'latitude': -8.90864, 'longitude': -139.55984}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2CmMQ0007}] │ NULL │ -139.55984 │ -8.90864 │ POINT (-139.55984 -8.90864) │\n│ ark:/21547/BNt2CmMQ0007 │ CmMQ0007 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2CmMQ0007}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ 
ark:/21547/ot2ANMQ0136 │ ANMQ0136 │ previousIdentifications: Acanthurus nigricans │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2ANMQ0136, 'label': a8b1f9fa321073f72f9a7881747ab592 IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': Marquesas, 'place_name': [French Polynesia, Marquesas], 'sample_location': {'elevation': NULL, 'latitude': -9.0, 'longitude': -139.3}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2ANMQ0136}] │ NULL │ -139.3 │ -9.0 │ POINT (-139.3 -9) │\n│ ark:/21547/BNt2ANMQ0136 │ ANMQ0136 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': 
Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2ANMQ0136}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2ANMQ0137 │ ANMQ0137 │ previousIdentifications: Acanthurus nigricans │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2ANMQ0137, 'label': a8b1f9fa321073f72f9a7881747ab592 IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. 
Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': Marquesas, 'place_name': [French Polynesia, Marquesas], 'sample_location': {'elevation': NULL, 'latitude': -9.0, 'longitude': -139.3}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2ANMQ0137}] │ NULL │ -139.3 │ -9.0 │ POINT (-139.3 -9) │\n├─────────────────────────┴──────────┴──────────────────────────────────────────────────┴───────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────┴────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────────────────────┴───────────────────────────────────────┴────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┤\n│ ? rows (>9999 rows, 20 shown) 17 columns │\n└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 138\u001b[0m\n\u001b[1;32m 135\u001b[0m query \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39mtable(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124misamples_export_2025_02_20_10_30_49_geo.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 137\u001b[0m \u001b[38;5;66;03m# Queries stay lazy until you need results\u001b[39;00m\n\u001b[0;32m--> 138\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTotal rows: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mquery\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcount\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 140\u001b[0m \u001b[38;5;66;03m# You can chain operations naturally\u001b[39;00m\n\u001b[1;32m 141\u001b[0m filtered \u001b[38;5;241m=\u001b[39m query\u001b[38;5;241m.\u001b[39mfilter(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msome_condition\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mlimit(\u001b[38;5;241m5\u001b[39m)\n", + "\u001b[0;31mTypeError\u001b[0m: count(): incompatible function arguments. The following argument types are supported:\n 1. 
(self: duckdb.duckdb.DuckDBPyRelation, column: str, groups: str = '', window_spec: str = '', projected_columns: str = '') -> duckdb.duckdb.DuckDBPyRelation\n\nInvoked with: ┌─────────────────────────┬──────────┬──────────────────────────────────────────────────┬───────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────┬───────────────────────────────────────┬────────────────────┬───────────────────────────┬──────────────────────────┬───────────────────────────────┐\n│ sample_identifier │ label │ 
description │ source_collection │ has_sample_object_type │ has_material_category │ has_context_category │ informal_classification │ keywords │ produced_by │ curation │ registrant │ related_resource │ sampling_purpose │ sample_location_longitude │ sample_location_latitude │ geometry │\n│ varchar │ varchar │ varchar │ varchar │ struct(identifier varchar)[] │ struct(identifier varchar)[] │ struct(identifier varchar)[] │ varchar[] │ struct(keyword varchar)[] │ struct(description varchar, has_feature_of_interest varchar, identifier varchar, \"label\" varchar, responsibility struct(\"name\" varchar, \"role\" varchar)[], result_time varchar, sampling_site struct(description varchar, \"label\" varchar, place_name varchar[], sample_location struct(elevation double, latitude double, longitude double))) │ struct(access_constraints varchar[], curation_location varchar, description varchar, \"label\" varchar, responsibility struct(\"name\" varchar, \"role\" varchar)[]) │ struct(\"name\" varchar) │ struct(target varchar)[] │ varchar[] │ double │ double │ geometry 
│\n├─────────────────────────┼──────────┼──────────────────────────────────────────────────┼───────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────────┼────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────┼───────────────────────────────────────┼────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┤\n│ ark:/21547/DSz2757 │ 757 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': 
https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2757, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2779 │ 779 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2779, 'label': 096c166b0c23c8823678eb43e4c00802 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1893-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.385277, 'longitude': -122.373055}}} │ NULL │ NULL │ NULL │ NULL │ -122.373055 │ 37.385277 │ POINT (-122.373055 37.385277) │\n│ ark:/21547/DSz2806 │ 806 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2806, 'label': 4824d73db4a747e634637f1a3c2978cb newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1893-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.36549, 'longitude': -122.11705}}} │ NULL │ NULL │ NULL │ NULL │ -122.11705 │ 37.36549 │ POINT (-122.11705 37.36549) │\n│ ark:/21547/DSz2807 │ 807 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2807, 'label': 4824d73db4a747e634637f1a3c2978cb newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1893-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.36549, 'longitude': -122.11705}}} │ NULL │ NULL │ NULL │ NULL │ -122.11705 │ 37.36549 │ POINT (-122.11705 37.36549) │\n│ ark:/21547/DSz2759 │ 759 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2759, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2761 │ 761 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2761, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2967 │ 967 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, torosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2967, 'label': 1b092798b61f72c79ff6df1f361b8705 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.669395, 'longitude': -122.63218}}} │ NULL │ NULL │ NULL │ NULL │ -122.63218 │ 38.669395 │ POINT (-122.63218 38.669395) │\n│ ark:/21547/DSz2763 │ 763 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, granulosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2763, 'label': a22d568d303a95c622a9409871e562d7 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 38.578888, 'longitude': -122.57861}}} │ NULL │ NULL │ NULL │ NULL │ -122.57861 │ 38.578888 │ POINT (-122.57861 38.578888) │\n│ ark:/21547/DSz2979 │ 979 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ 
[{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, torosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz2979, 'label': ac6f9b6dd20fd04e411c2db0348524e3 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1894-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 34.147778, 'longitude': -118.14361}}} │ NULL │ NULL │ NULL │ NULL │ -118.14361 │ 34.147778 │ POINT (-118.14361 34.147778) │\n│ ark:/21547/DSz21792 │ 1792 │ basisOfRecord: PreservedSpecimen │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Taricha, torosa] │ [{'keyword': California}, {'keyword': USA}] │ {'description': expeditionCode: newts | projectId: 244, 'has_feature_of_interest': , 'identifier': ark:/21547/DSz21792, 'label': ed5754fe295e377d6da2add6748fb7c0 newts, 'responsibility': [{'name': Vance Vredenburg, 'role': collector }, {'name': Vance Vredenburg, 'role': principalInvestigator}], 'result_time': 1896-01-01, 'sampling_site': {'description': NULL, 'label': California, 'place_name': [California, USA], 'sample_location': {'elevation': NULL, 'latitude': 37.87103, 'longitude': -122.27711}}} │ NULL │ NULL │ NULL │ NULL │ -122.27711 │ 37.87103 │ POINT (-122.27711 37.87103) │\n│ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │\n│ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │\n│ · │ · │ · │ · │ · │ · │ · │ · │ 
· │ · │ · │ · │ · │ · │ · │ · │ · │\n│ ark:/21547/BNt2CmMQ0005 │ CmMQ0005 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2CmMQ0005}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2ANMQ0165 │ ANMQ0165 │ previousIdentifications: Acanthurus nigricans │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2ANMQ0165, 'label': a8b1f9fa321073f72f9a7881747ab592 IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. 
Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': Marquesas, 'place_name': [French Polynesia, Marquesas], 'sample_location': {'elevation': NULL, 'latitude': -9.0, 'longitude': -139.3}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2ANMQ0165}] │ NULL │ -139.3 │ -9.0 │ POINT (-139.3 -9) │\n│ ark:/21547/BNt2ANMQ0165 │ ANMQ0165 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2ANMQ0165}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2CmMQ0004 │ CmMQ0004 │ previousIdentifications: Ctenochaetus marginatus 
│ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2CmMQ0004, 'label': c8e4608b0409567824c8badfbaef685f IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': ua-huka, 'place_name': [French Polynesia, Marquesas, Ua Huka, ua-huka], 'sample_location': {'elevation': NULL, 'latitude': -8.90864, 'longitude': -139.55984}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2CmMQ0004}] │ NULL │ -139.55984 │ -8.90864 │ POINT (-139.55984 -8.90864) │\n│ ark:/21547/BNt2CmMQ0004 │ CmMQ0004 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': 
Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2CmMQ0004}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2CmMQ0007 │ CmMQ0007 │ previousIdentifications: Ctenochaetus marginatus │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2CmMQ0007, 'label': c8e4608b0409567824c8badfbaef685f IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. 
Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': ua-huka, 'place_name': [French Polynesia, Marquesas, Ua Huka, ua-huka], 'sample_location': {'elevation': NULL, 'latitude': -8.90864, 'longitude': -139.55984}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2CmMQ0007}] │ NULL │ -139.55984 │ -8.90864 │ POINT (-139.55984 -8.90864) │\n│ ark:/21547/BNt2CmMQ0007 │ CmMQ0007 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Ctenochaetus, marginatus] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}, {'keyword': Ua Huka}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2CmMQ0007}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ 
ark:/21547/ot2ANMQ0136 │ ANMQ0136 │ previousIdentifications: Acanthurus nigricans │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2ANMQ0136, 'label': a8b1f9fa321073f72f9a7881747ab592 IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': Marquesas, 'place_name': [French Polynesia, Marquesas], 'sample_location': {'elevation': NULL, 'latitude': -9.0, 'longitude': -139.3}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2ANMQ0136}] │ NULL │ -139.3 │ -9.0 │ POINT (-139.3 -9) │\n│ ark:/21547/BNt2ANMQ0136 │ ANMQ0136 │ NULL │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/organismpart}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': 
Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ NULL │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/ot2ANMQ0136}] │ [genomic analysis] │ NULL │ NULL │ POINT EMPTY │\n│ ark:/21547/ot2ANMQ0137 │ ANMQ0137 │ previousIdentifications: Acanthurus nigricans │ GEOME │ [{'identifier': https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism}] │ [{'identifier': https://w3id.org/isample/vocabulary/material/1.0/organicmaterial}] │ [{'identifier': https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia}] │ [Acanthurus, nigricans] │ [{'keyword': Acanthuridae}, {'keyword': Actinopterygii}, {'keyword': Chordata}, {'keyword': French Polynesia}, {'keyword': Marquesas}, {'keyword': Perciformes}] │ {'description': expeditionCode: IPfish_A68_HL | projectId: 1, 'has_feature_of_interest': , 'identifier': ark:/21547/ot2ANMQ0137, 'label': a8b1f9fa321073f72f9a7881747ab592 IPfish_A68_HL, 'responsibility': [{'name': D. R. Robertson, 'role': collector }, {'name': H. A. 
Lessios, 'role': principalInvestigator}], 'result_time': 1995-05-01, 'sampling_site': {'description': NULL, 'label': Marquesas, 'place_name': [French Polynesia, Marquesas], 'sample_location': {'elevation': NULL, 'latitude': -9.0, 'longitude': -139.3}}} │ {'access_constraints': [], 'curation_location': , 'description': preservative: DMSO Buffer, 'label': , 'responsibility': [{'name': c, 'role': curator}, {'name': u, 'role': curator}, {'name': r, 'role': curator}, {'name': a, 'role': curator}, {'name': t, 'role': curator}, {'name': o, 'role': curator}, {'name': r, 'role': curator}, {'name': , 'role': }, {'name': S, 'role': curator}, {'name': T, 'role': curator}, {'name': R, 'role': curator}, {'name': I, 'role': curator}]} │ NULL │ [{'target': ark:/21547/BNt2ANMQ0137}] │ NULL │ -139.3 │ -9.0 │ POINT (-139.3 -9) │\n├─────────────────────────┴──────────┴──────────────────────────────────────────────────┴───────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────┴────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────────────────────┴───────────────────────────────────────┴────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┤\n│ ? rows (>9999 rows, 20 shown) 17 columns │\n└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "# Geoparquet\n", + "import duckdb\n", + "import time\n", + "import ibis\n", + "\n", + "\n", + "\n", + "def print_timing(start_time, operation):\n", + " elapsed = time.time() - start_time\n", + " print(f\"Time for {operation}: {elapsed:.2f} seconds\")\n", + "\n", + "# Connect to an in-memory DuckDB instance\n", + "start_time = time.time()\n", + "con = duckdb.connect()\n", + "print_timing(start_time, \"database connection\")\n", + "\n", + "# Load the GeoParquet file and create view\n", + "start_time = time.time()\n", + "geo_parquet_file = '/Users/raymondyee/Data/iSample/2025_02_20_10_30_49/isamples_export_2025_02_20_10_30_49_geo.parquet'\n", + "con.sql(f\"CREATE VIEW geosamples AS SELECT * FROM read_parquet('{geo_parquet_file}')\")\n", + "print_timing(start_time, \"creating view\")\n", + "\n", + "# Get column names dynamically\n", + "start_time = time.time()\n", + "columns = con.sql(\"SELECT column_name FROM information_schema.columns WHERE table_name = 'geosamples'\").fetchall()\n", + "column_names = [col[0] for col in columns]\n", + "print_timing(start_time, \"getting column names\")\n", + "\n", + "# Print available columns\n", + "print(\"\\nAvailable columns:\")\n", + "for col in column_names:\n", + " print(f\"- {col}\")\n", + "\n", + "# Get total row count\n", + "start_time = time.time()\n", + "row_count = con.sql(\"SELECT COUNT(*) FROM geosamples\").fetchone()[0]\n", + "print(f\"\\nTotal number of rows: {row_count:,}\")\n", + "print_timing(start_time, \"counting rows\")\n", + "\n", + "# 
Query the first 5 rows with all columns\n", + "start_time = time.time()\n", + "result = con.sql(\"\"\"\n", + " SELECT *\n", + " FROM geosamples \n", + " LIMIT 5\n", + "\"\"\").df()\n", + "print_timing(start_time, \"querying sample rows\")\n", + "\n", + "print(\"\\nFirst 5 rows:\")\n", + "print(result)\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "import cartopy.crs as ccrs\n", + "import cartopy.feature as cfeature\n", + "import time\n", + "\n", + "# Start timing\n", + "start_time = time.time()\n", + "\n", + "# Read only a sample of the data (e.g., 1% for better performance)\n", + "# geo_parquet_file = '/Users/raymondyee/Data/iSample/2025_02_20_10_30_49/isamples_export_2025_02_20_10_30_49_geo.parquet'\n", + "geo_parquet_file = '/Users/raymondyee/Data/iSample/2025_04_21_16_23_46/isamples_export_2025_04_21_16_23_46_geo.parquet'\n", + "gdf = gpd.read_parquet(geo_parquet_file, columns=['geometry']) # Only read geometry column\n", + "\n", + "# Create map visualization of geographical sample points\n", + "# Sampling only 1% of data for better performance with large datasets\n", + "sample_size = len(gdf) // 100 # 1% of data\n", + "gdf_sampled = gdf.sample(n=sample_size, random_state=42)\n", + "\n", + "print(f\"Data loading and sampling time: {time.time() - start_time:.2f} seconds\")\n", + "\n", + "# Create figure and axis with larger size and projection\n", + "fig, ax = plt.subplots(figsize=(15, 10), \n", + " subplot_kw={'projection': ccrs.Robinson()})\n", + "\n", + "# Add map features\n", + "ax.add_feature(cfeature.LAND, facecolor='lightgray')\n", + "ax.add_feature(cfeature.OCEAN, facecolor='lightblue')\n", + "ax.add_feature(cfeature.COASTLINE)\n", + "ax.gridlines()\n", + "\n", + "# Plot with improved styling\n", + "gdf_sampled.plot(\n", + " ax=ax,\n", + " transform=ccrs.PlateCarree(),\n", + " alpha=0.5,\n", + " markersize=1,\n", + " color='red',\n", + " legend=True\n", + ")\n", + "\n", + "# Add title\n", + "plt.title(f'Sample of 
{sample_size:,} points from {len(gdf):,} total records')\n", + "\n", + "print(f\"Total execution time: {time.time() - start_time:.2f} seconds\")\n", + "\n", + "plt.show()\n", + "import duckdb\n", + "import os\n", + "\n", + "# Change working directory to the location of the GeoParquet file\n", + "os.chdir('/Users/raymondyee/Data/iSample/2025_02_20_10_30_49')\n", + "\n", + "# Initialize DuckDB connection\n", + "conn = duckdb.connect(':memory:') # or specify a database file\n", + "\n", + "# Install and load spatial extension\n", + "conn.execute(\"INSTALL spatial;\")\n", + "conn.execute(\"LOAD spatial;\")\n", + "\n", + "# Create temp view from parquet file\n", + "conn.execute(\"\"\"\n", + " CREATE TEMP VIEW my_data AS \n", + " SELECT * FROM read_parquet('isamples_export_2025_02_20_10_30_49_geo.parquet')\n", + "\"\"\")\n", + "\n", + "# Get count of rows\n", + "result = conn.execute(\"SELECT COUNT(*) FROM my_data\").fetchall()\n", + "\n", + "# Print result\n", + "print(f\"Total rows: {result[0][0]}\")\n", + "\n", + "# Close connection\n", + "conn.close()\n", + "import duckdb\n", + "\n", + "# Change working directory to the location of the GeoParquet file\n", + "os.chdir('/Users/raymondyee/Data/iSample/2025_02_20_10_30_49')\n", + "\n", + "# Create connection\n", + "conn = duckdb.connect(':memory:')\n", + "conn.execute(\"INSTALL spatial; LOAD spatial;\")\n", + "\n", + "# DuckDB can query Parquet directly - no need to create views\n", + "query = conn.table('isamples_export_2025_02_20_10_30_49_geo.parquet')\n", + "\n", + "# Queries stay lazy until you need results\n", + "print(f\"Total rows: {query.count()}\")\n", + "\n", + "# You can chain operations naturally\n", + "filtered = query.filter(\"some_condition\").limit(5)\n", + "\n", + "# Only converts to DataFrame when you actually call .df()\n", + "# filtered_df = filtered.df() # This would materialize the data\n", + "\n", + "\n", + "import os\n", + "import duckdb\n", + "from lonboard import viz\n", + "\n", + "# Change 
working directory to the location of the GeoParquet file\n", + "os.chdir('/Users/raymondyee/Data/iSample/2025_02_20_10_30_49')\n", + "\n", + "# Initialize DuckDB connection\n", + "conn = duckdb.connect(':memory:')\n", + "\n", + "# Install and load spatial extension\n", + "conn.execute(\"INSTALL spatial;\")\n", + "conn.execute(\"LOAD spatial;\")\n", + "\n", + "# First, let's check the geometry type and a sample\n", + "result = conn.execute(\"\"\"\n", + " SELECT \n", + " ST_GeometryType(geometry) as geom_type,\n", + " ST_AsText(geometry) as wkt,\n", + " sample_location_longitude,\n", + " sample_location_latitude\n", + " FROM read_parquet('isamples_export_2025_02_20_10_30_49_geo.parquet') \n", + " WHERE geometry IS NOT NULL \n", + " LIMIT 1\n", + "\"\"\").fetchall()\n", + "print(result)\n", + "\n", + "# Create temp view using longitude/latitude to create geometry\n", + "conn.execute(\"\"\"\n", + " CREATE TEMP VIEW my_data AS \n", + " SELECT \n", + " ST_AsWKB(ST_Point(sample_location_longitude, sample_location_latitude)) as geometry,\n", + " sample_identifier,\n", + " label,\n", + " description,\n", + " source_collection,\n", + " has_sample_object_type,\n", + " has_material_category,\n", + " has_context_category,\n", + " informal_classification,\n", + " keywords,\n", + " produced_by,\n", + " curation,\n", + " registrant,\n", + " related_resource,\n", + " sampling_purpose,\n", + " sample_location_longitude,\n", + " sample_location_latitude\n", + " FROM read_parquet('isamples_export_2025_02_20_10_30_49_geo.parquet')\n", + " WHERE sample_location_longitude IS NOT NULL \n", + " AND sample_location_latitude IS NOT NULL\n", + "\"\"\")\n", + "\n", + "# Check coordinate bounds\n", + "bounds = conn.execute(\"\"\"\n", + " SELECT \n", + " MIN(sample_location_longitude) as min_lon,\n", + " MAX(sample_location_longitude) as max_lon,\n", + " MIN(sample_location_latitude) as min_lat,\n", + " MAX(sample_location_latitude) as max_lat,\n", + " COUNT(*) as point_count\n", + " FROM 
my_data\n", + "\"\"\").fetchall()\n", + "print(\"\\nCoordinate bounds and point count:\")\n", + "print(bounds)\n", + "\n", + "# Query and visualize with map configuration\n", + "result = conn.sql(\"SELECT * FROM my_data LIMIT 10000\")\n", + "viz(\n", + " result,\n", + " map_kwargs={\n", + " # Lonboard 0.12+: initialize view using `view_state`, not `zoom`/`center`\n", + " \"view_state\": {\"zoom\": 1, \"latitude\": 0, \"longitude\": 0}\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2efc61ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('POINT', 'POINT (-122.57861 38.578888)', -122.57861, 38.578888)]\n", + "\n", + "Coordinate bounds and point count:\n", + "[(-180.0, 180.0, -89.983, 89.981, 5795511)]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/raymondyee/.pyenv/versions/3.12.9/envs/isamples-python-3.12.9/lib/python3.12/site-packages/lonboard/_geoarrow/ops/reproject.py:33: UserWarning: No CRS exists on data. If no data is shown on the map, double check that your CRS is WGS84.\n", + " warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fceaf0968b0546a4b296ffc2a3eeb2e3", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "Map(basemap_style=25k may get sluggish. 
Use clustering (enabled) or downsample.\n", + "- For very large datasets, consider: tiling (3D Tiles), server‑side aggregation, or heatmaps.\n", + "\n", + "Next steps you could try later:\n", + "- Replace simple points with color by category (material/context)\n", + "- Add popups with richer HTML\n", + "- Stream chunks instead of embedding all JSON inline\n", + "- Switch to `GeoJsonDataSource.clustering` tuning (pixelRange / minimumClusterSize)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "719d5ed1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prepared 10000 points for CesiumJS (SAMPLE_FRACTION=None, MAX_POINTS=10000)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import duckdb, json, os, math, textwrap\n", + "from IPython.display import HTML, display\n", + "\n", + "# --- Config ---\n", + "PARQUET_PATH = '/Users/raymondyee/Data/iSample/2025_02_20_10_30_49/isamples_export_2025_02_20_10_30_49_geo.parquet'\n", + "MAX_POINTS = 10000 # Upper bound of points to embed (tune for performance)\n", + "SAMPLE_FRACTION = None # e.g., 0.01 for 1% random sample; overrides MAX_POINTS if set\n", + "RANDOM_SEED = 42\n", + "\n", + "# --- Extract (lon, lat, id, label) subset via DuckDB ---\n", + "con = duckdb.connect()\n", + "query_base = f\"\"\"\n", + " SELECT \n", + " sample_location_longitude AS lon,\n", + " sample_location_latitude AS lat,\n", + " sample_identifier AS sample_id,\n", + " COALESCE(label, sample_identifier) AS label\n", + " FROM read_parquet('{PARQUET_PATH}')\n", + " WHERE sample_location_longitude IS NOT NULL\n", + " AND sample_location_latitude IS NOT NULL\n", + "\"\"\"\n", + "\n", + "if SAMPLE_FRACTION is not None:\n", + " points_df = con.sql(query_base + f\" USING SAMPLE {SAMPLE_FRACTION * 100}% (bernoulli, {RANDOM_SEED})\").df()\n", + "else:\n", + " points_df = con.sql(query_base + f\" LIMIT {MAX_POINTS}\").df()\n", + "\n", + "point_count = len(points_df)\n", + "print(f\"Prepared {point_count} points for CesiumJS (SAMPLE_FRACTION={SAMPLE_FRACTION}, MAX_POINTS={MAX_POINTS})\")\n", + "\n", + "# --- Convert to GeoJSON FeatureCollection ---\n", + "features = []\n", + "for row in points_df.itertuples(index=False):\n", + " lon, lat, sample_id, label = row\n", + " if math.isnan(lon) or math.isnan(lat):\n", + " continue\n", + " features.append({\n", + " \"type\": \"Feature\",\n", + " \"geometry\": {\"type\": \"Point\", \"coordinates\": [float(lon), float(lat)]},\n", + " \"properties\": {\"sample_id\": sample_id, \"label\": label}\n", + " })\n", + "geojson_obj = {\"type\": 
\"FeatureCollection\", \"features\": features}\n", + "geojson_str = json.dumps(geojson_obj) # embed directly (small subset)\n", + "\n", + "# --- HTML/JS Template for Cesium ---\n", + "html = f\"\"\"\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\"\"\"\n", + "\n", + "# Display\n", + "display(HTML(html))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e658719", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "ecf57b6a", + "metadata": {}, + "source": [ + "### CesiumJS Troubleshooting & Alternate Embed\n", + "If the earlier Cesium cell (inline `