diff --git a/common/src/main/java/org/opensearch/sql/common/setting/Settings.java b/common/src/main/java/org/opensearch/sql/common/setting/Settings.java index 96fe2e04eea..15686e0e38e 100644 --- a/common/src/main/java/org/opensearch/sql/common/setting/Settings.java +++ b/common/src/main/java/org/opensearch/sql/common/setting/Settings.java @@ -36,6 +36,7 @@ public enum Key { PPL_SYNTAX_LEGACY_PREFERRED("plugins.ppl.syntax.legacy.preferred"), PPL_SUBSEARCH_MAXOUT("plugins.ppl.subsearch.maxout"), PPL_JOIN_SUBSEARCH_MAXOUT("plugins.ppl.join.subsearch_maxout"), + PPL_DISTRIBUTED_ENABLED("plugins.ppl.distributed.enabled"), /** Enable Calcite as execution engine */ CALCITE_ENGINE_ENABLED("plugins.calcite.enabled"), diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeManager.java b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeManager.java new file mode 100644 index 00000000000..f42c57ed80a --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeManager.java @@ -0,0 +1,36 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.exchange; + +import org.opensearch.sql.planner.distributed.operator.OperatorContext; +import org.opensearch.sql.planner.distributed.stage.PartitioningScheme; + +/** + * Manages the lifecycle of exchanges between compute stages. Creates exchange sink and source + * operators for inter-stage data transfer. + */ +public interface ExchangeManager { + + /** + * Creates an exchange sink operator for sending data from one stage to another. + * + * @param context the operator context + * @param targetStageId the downstream stage receiving the data + * @param partitioning how the output should be partitioned + * @return the exchange sink operator + */ + ExchangeSinkOperator createSink( + OperatorContext context, String targetStageId, PartitioningScheme partitioning); + + /** + * Creates an exchange source operator for receiving data from an upstream stage. + * + * @param context the operator context + * @param sourceStageId the upstream stage sending the data + * @return the exchange source operator + */ + ExchangeSourceOperator createSource(OperatorContext context, String sourceStageId); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeSinkOperator.java b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeSinkOperator.java new file mode 100644 index 00000000000..29cde441fca --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeSinkOperator.java @@ -0,0 +1,19 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.exchange; + +import org.opensearch.sql.planner.distributed.operator.SinkOperator; + +/** + * A sink operator that sends pages to a downstream compute stage. Implementations handle the + * serialization and transport of data between stages (e.g., via OpenSearch transport, Arrow Flight, + * or in-memory buffers for local exchanges). + */ +public interface ExchangeSinkOperator extends SinkOperator { + + /** Returns the ID of the downstream stage this sink sends data to. 
*/ + String getTargetStageId(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeSourceOperator.java b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeSourceOperator.java new file mode 100644 index 00000000000..7c228c68a89 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/ExchangeSourceOperator.java @@ -0,0 +1,18 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.exchange; + +import org.opensearch.sql.planner.distributed.operator.SourceOperator; + +/** + * A source operator that receives pages from an upstream compute stage. Implementations handle + * deserialization and buffering of data received from upstream stages. + */ +public interface ExchangeSourceOperator extends SourceOperator { + + /** Returns the ID of the upstream stage this source receives data from. */ + String getSourceStageId(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/OutputBuffer.java b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/OutputBuffer.java new file mode 100644 index 00000000000..6253ced27b2 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/exchange/OutputBuffer.java @@ -0,0 +1,45 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.exchange; + +import org.opensearch.sql.planner.distributed.page.Page; +import org.opensearch.sql.planner.distributed.stage.PartitioningScheme; + +/** + * Buffers output pages from a stage before sending them to downstream consumers via the exchange + * layer. Provides back-pressure to prevent producers from overwhelming consumers. + * + *
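+ * <p>A producer loop honoring back-pressure, as a minimal sketch ({@code hasMorePages()} and
+ * {@code nextPage()} are hypothetical helpers, not part of this change):
+ *
+ * <pre>{@code
+ * while (hasMorePages()) {
+ *   while (buffer.isFull()) {
+ *     Thread.yield(); // back off until consumers drain the buffer
+ *   }
+ *   buffer.enqueue(nextPage());
+ * }
+ * buffer.setNoMorePages();
+ * }</pre>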

Serialization format is an implementation detail. The default implementation uses OpenSearch + * transport ({@code StreamOutput}). A future implementation can use Arrow IPC ({@code + * ArrowRecordBatch}) for zero-copy columnar exchange. + */ +public interface OutputBuffer extends AutoCloseable { + + /** + * Enqueues a page for delivery to downstream consumers. + * + * @param page the page to send + */ + void enqueue(Page page); + + /** Signals that no more pages will be enqueued. */ + void setNoMorePages(); + + /** Returns true if the buffer is full and the producer should wait (back-pressure). */ + boolean isFull(); + + /** Returns the total size of buffered data in bytes. */ + long getBufferedBytes(); + + /** Aborts the buffer, discarding any buffered pages. */ + void abort(); + + /** Returns true if all pages have been consumed and no more will be produced. */ + boolean isFinished(); + + /** Returns the partitioning scheme for this buffer's output. */ + PartitioningScheme getPartitioningScheme(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/execution/QueryExecution.java b/core/src/main/java/org/opensearch/sql/planner/distributed/execution/QueryExecution.java new file mode 100644 index 00000000000..f24808ecbfd --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/execution/QueryExecution.java @@ -0,0 +1,57 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.execution; + +import java.util.List; +import org.opensearch.sql.planner.distributed.stage.StagedPlan; + +/** + * Represents the execution of a complete distributed query. Manages the lifecycle of all stage + * executions and provides query-level statistics. + */ +public interface QueryExecution { + + /** Query execution states. */ + enum State { + PLANNING, + STARTING, + RUNNING, + FINISHING, + FINISHED, + FAILED + } + + /** Returns the unique query identifier. */ + String getQueryId(); + + /** Returns the staged execution plan. */ + StagedPlan getPlan(); + + /** Returns the current execution state. */ + State getState(); + + /** Returns all stage executions for this query. */ + List getStageExecutions(); + + /** Returns execution statistics for this query. */ + QueryStats getStats(); + + /** Cancels the query and all its stage executions. */ + void cancel(); + + /** Statistics for a query execution. */ + interface QueryStats { + + /** Returns the total number of output rows. */ + long getTotalRows(); + + /** Returns the total elapsed execution time in milliseconds. */ + long getElapsedTimeMillis(); + + /** Returns the time spent planning in milliseconds. */ + long getPlanningTimeMillis(); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/execution/StageExecution.java b/core/src/main/java/org/opensearch/sql/planner/distributed/execution/StageExecution.java new file mode 100644 index 00000000000..f6f9f19dc73 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/execution/StageExecution.java @@ -0,0 +1,92 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.execution; + +import java.util.List; +import java.util.Map; +import org.opensearch.sql.planner.distributed.split.DataUnit; +import org.opensearch.sql.planner.distributed.stage.ComputeStage; + +/** + * Manages the execution of a single compute stage across multiple nodes. 
Tracks task executions, + * handles data unit assignment, and monitors stage completion. + */ +public interface StageExecution { + + /** Stage execution states. */ + enum State { + PLANNED, + SCHEDULING, + RUNNING, + FINISHED, + FAILED, + CANCELLED + } + + /** Returns the compute stage being executed. */ + ComputeStage getStage(); + + /** Returns the current execution state. */ + State getState(); + + /** + * Adds data units to be processed by this stage. + * + * @param dataUnits the data units to add + */ + void addDataUnits(List dataUnits); + + /** Signals that no more data units will be added to this stage. */ + void noMoreDataUnits(); + + /** + * Returns task executions grouped by node ID. + * + * @return map from node ID to list of task executions on that node + */ + Map> getTaskExecutions(); + + /** Returns execution statistics for this stage. */ + StageStats getStats(); + + /** Cancels all tasks in this stage. */ + void cancel(); + + /** + * Adds a listener to be notified when the stage state changes. + * + * @param listener the state change listener + */ + void addStateChangeListener(StateChangeListener listener); + + /** Listener for stage state changes. */ + @FunctionalInterface + interface StateChangeListener { + + /** + * Called when the stage transitions to a new state. + * + * @param newState the new state + */ + void onStateChange(State newState); + } + + /** Statistics for a stage execution. */ + interface StageStats { + + /** Returns the total number of rows processed across all tasks. */ + long getTotalRows(); + + /** Returns the total number of bytes processed across all tasks. */ + long getTotalBytes(); + + /** Returns the number of completed tasks. */ + int getCompletedTasks(); + + /** Returns the total number of tasks. */ + int getTotalTasks(); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/execution/TaskExecution.java b/core/src/main/java/org/opensearch/sql/planner/distributed/execution/TaskExecution.java new file mode 100644 index 00000000000..4048fab1ec9 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/execution/TaskExecution.java @@ -0,0 +1,60 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.execution; + +import java.util.List; +import org.opensearch.sql.planner.distributed.split.DataUnit; + +/** + * Represents the execution of a single task within a stage. Each task processes a subset of data + * units on a specific node. + */ +public interface TaskExecution { + + /** Task execution states. */ + enum State { + PLANNED, + RUNNING, + FLUSHING, + FINISHED, + FAILED, + CANCELLED + } + + /** Returns the unique identifier for this task. */ + String getTaskId(); + + /** Returns the node ID where this task is executing. */ + String getNodeId(); + + /** Returns the current execution state. */ + State getState(); + + /** Returns the data units assigned to this task. */ + List getAssignedDataUnits(); + + /** Returns execution statistics for this task. */ + TaskStats getStats(); + + /** Cancels this task. */ + void cancel(); + + /** Statistics for a task execution. */ + interface TaskStats { + + /** Returns the number of rows processed by this task. */ + long getProcessedRows(); + + /** Returns the number of bytes processed by this task. */ + long getProcessedBytes(); + + /** Returns the number of output rows produced by this task. */ + long getOutputRows(); + + /** Returns the elapsed execution time in milliseconds. 
*/ + long getElapsedTimeMillis(); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/operator/Operator.java b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/Operator.java new file mode 100644 index 00000000000..165d1eb1655 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/Operator.java @@ -0,0 +1,54 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.operator; + +import org.opensearch.sql.planner.distributed.page.Page; + +/** + * Core operator interface using a push/pull model. Operators form a pipeline where data flows as + * {@link Page} batches. Each operator declares whether it needs input ({@link #needsInput()}), + * accepts input ({@link #addInput(Page)}), and produces output ({@link #getOutput()}). + * + *

<p>Lifecycle:
+ *
+ * <ol>
+ *   <li>Pipeline driver calls {@link #needsInput()} to check readiness
+ *   <li>If ready, driver calls {@link #addInput(Page)} with upstream output
+ *   <li>Driver calls {@link #getOutput()} to pull processed results
+ *   <li>When upstream is done, driver calls {@link #finish()} to signal no more input
+ *   <li>Operator produces remaining buffered output via {@link #getOutput()}
+ *   <li>When {@link #isFinished()} returns true, operator is done
+ *   <li>Driver calls {@link #close()} to release resources
+ * </ol>
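+ *
+ * <p>A minimal single-operator driver loop implementing this lifecycle, as a sketch ({@code
+ * upstream} and {@code consume} are assumed helpers, not part of this change):
+ *
+ * <pre>{@code
+ * boolean inputDone = false;
+ * while (!op.isFinished()) {
+ *   if (upstream.hasNext() && op.needsInput()) {
+ *     op.addInput(upstream.next());
+ *   } else if (!upstream.hasNext() && !inputDone) {
+ *     op.finish(); // no more input; operator flushes buffered results
+ *     inputDone = true;
+ *   }
+ *   Page out = op.getOutput();
+ *   if (out != null) {
+ *     consume(out);
+ *   }
+ * }
+ * op.close();
+ * }</pre>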
+ */ +public interface Operator extends AutoCloseable { + + /** Returns true if this operator is ready to accept input via {@link #addInput(Page)}. */ + boolean needsInput(); + + /** + * Provides a page of input data to this operator. + * + * @param page the input page (must not be null) + * @throws IllegalStateException if {@link #needsInput()} returns false + */ + void addInput(Page page); + + /** + * Returns the next page of output, or null if no output is available yet. A null return does not + * mean the operator is finished — call {@link #isFinished()} to check. + */ + Page getOutput(); + + /** Returns true if this operator has completed all processing and will produce no more output. */ + boolean isFinished(); + + /** Signals that no more input will be provided. The operator should flush buffered results. */ + void finish(); + + /** Returns the runtime context for this operator. */ + OperatorContext getContext(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/operator/OperatorContext.java b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/OperatorContext.java new file mode 100644 index 00000000000..ebeba3548a4 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/OperatorContext.java @@ -0,0 +1,57 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.operator; + +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Runtime context available to operators during execution. Provides access to memory limits, + * cancellation, and operator identity. + */ +public class OperatorContext { + + private final String operatorId; + private final String stageId; + private final long memoryLimitBytes; + private final AtomicBoolean cancelled; + + public OperatorContext(String operatorId, String stageId, long memoryLimitBytes) { + this.operatorId = operatorId; + this.stageId = stageId; + this.memoryLimitBytes = memoryLimitBytes; + this.cancelled = new AtomicBoolean(false); + } + + /** Returns the unique identifier for this operator instance. */ + public String getOperatorId() { + return operatorId; + } + + /** Returns the stage ID this operator belongs to. */ + public String getStageId() { + return stageId; + } + + /** Returns the memory limit in bytes for this operator. */ + public long getMemoryLimitBytes() { + return memoryLimitBytes; + } + + /** Returns true if the query has been cancelled. */ + public boolean isCancelled() { + return cancelled.get(); + } + + /** Requests cancellation of the query. */ + public void cancel() { + cancelled.set(true); + } + + /** Creates a default context for testing. */ + public static OperatorContext createDefault(String operatorId) { + return new OperatorContext(operatorId, "default-stage", Long.MAX_VALUE); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/operator/OperatorFactory.java b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/OperatorFactory.java new file mode 100644 index 00000000000..b7f5cf954e8 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/OperatorFactory.java @@ -0,0 +1,25 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.operator; + +/** + * Factory for creating {@link Operator} instances. Each factory creates operators for a specific + * pipeline position (e.g., filter, project, aggregation). 
The pipeline uses factories so that + * multiple operator instances can be created for parallel execution. + */ +public interface OperatorFactory { + + /** + * Creates a new operator instance. + * + * @param context the runtime context for the operator + * @return a new operator instance + */ + Operator createOperator(OperatorContext context); + + /** Signals that no more operators will be created from this factory. */ + void noMoreOperators(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SinkOperator.java b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SinkOperator.java new file mode 100644 index 00000000000..10c88dd911c --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SinkOperator.java @@ -0,0 +1,24 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.operator; + +import org.opensearch.sql.planner.distributed.page.Page; + +/** + * A terminal operator that consumes pages without producing output. Sink operators collect results + * (e.g., into a response buffer) or send data to downstream stages (e.g., exchange sinks). + * + *
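+ * <p>A minimal collecting sink, shown as an illustrative sketch (the class and its list are
+ * hypothetical, not part of this change):
+ *
+ * <pre>{@code
+ * class CollectingSink implements SinkOperator {
+ *   private final List<Page> pages = new ArrayList<>();
+ *   private boolean finished;
+ *
+ *   public boolean needsInput() { return !finished; }
+ *   public void addInput(Page page) { pages.add(page); }
+ *   public void finish() { finished = true; }
+ *   public boolean isFinished() { return finished; }
+ *   public OperatorContext getContext() { return null; } // omitted in this sketch
+ *   public void close() {}
+ * }
+ * }</pre>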

Sink operators always need input (until finished) and never produce output via {@link + * #getOutput()}. + */ +public interface SinkOperator extends Operator { + + /** Sink operators do not produce output pages. Always returns null. */ + @Override + default Page getOutput() { + return null; + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SourceOperator.java b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SourceOperator.java new file mode 100644 index 00000000000..1edd4f11b3a --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SourceOperator.java @@ -0,0 +1,42 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.operator; + +import org.opensearch.sql.planner.distributed.page.Page; +import org.opensearch.sql.planner.distributed.split.DataUnit; + +/** + * A source operator that reads data from external storage (e.g., Lucene shards). Source operators + * do not accept input from upstream operators — they produce data from assigned {@link DataUnit}s. + * + *

The pipeline driver assigns data units via {@link #addDataUnit(DataUnit)} and signals + * completion via {@link #noMoreDataUnits()}. The operator reads data from data units and produces + * {@link Page} batches via {@link #getOutput()}. + */ +public interface SourceOperator extends Operator { + + /** + * Assigns a unit of work (e.g., a shard) to this source operator. + * + * @param dataUnit the data unit to read from + */ + void addDataUnit(DataUnit dataUnit); + + /** Signals that no more data units will be assigned. */ + void noMoreDataUnits(); + + /** Source operators never accept input from upstream. */ + @Override + default boolean needsInput() { + return false; + } + + /** Source operators never accept input from upstream. */ + @Override + default void addInput(Page page) { + throw new UnsupportedOperationException("Source operators do not accept input"); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SourceOperatorFactory.java b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SourceOperatorFactory.java new file mode 100644 index 00000000000..a06617d97d1 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/operator/SourceOperatorFactory.java @@ -0,0 +1,24 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.operator; + +/** + * Factory for creating {@link SourceOperator} instances. Source operator factories are used at the + * beginning of a pipeline to create operators that read from external storage. + */ +public interface SourceOperatorFactory { + + /** + * Creates a new source operator instance. + * + * @param context the runtime context for the operator + * @return a new source operator instance + */ + SourceOperator createOperator(OperatorContext context); + + /** Signals that no more operators will be created from this factory. */ + void noMoreOperators(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/page/Block.java b/core/src/main/java/org/opensearch/sql/planner/distributed/page/Block.java new file mode 100644 index 00000000000..f9b7674d064 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/page/Block.java @@ -0,0 +1,64 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.page; + +/** + * A column of data within a {@link Page}. Each Block holds values for a single column across all + * rows in the page. Designed to align with Apache Arrow's columnar model: a future {@code + * ArrowBlock} implementation can wrap Arrow {@code FieldVector} for zero-copy exchange via Arrow + * IPC. + */ +public interface Block { + + /** Returns the number of values (rows) in this block. */ + int getPositionCount(); + + /** + * Returns the value at the given position. + * + * @param position the row index (0-based) + * @return the value, or null if the position is null + */ + Object getValue(int position); + + /** + * Returns true if the value at the given position is null. + * + * @param position the row index (0-based) + * @return true if null + */ + boolean isNull(int position); + + /** Returns the estimated memory retained by this block in bytes. */ + long getRetainedSizeBytes(); + + /** + * Returns a sub-region of this block. 
+ * + * @param positionOffset the starting row index + * @param length the number of rows in the region + * @return a new Block representing the sub-region + */ + Block getRegion(int positionOffset, int length); + + /** Returns the data type of this block's values. */ + BlockType getType(); + + /** Supported block data types, aligned with Arrow's type system. */ + enum BlockType { + BOOLEAN, + INT, + LONG, + FLOAT, + DOUBLE, + STRING, + BYTES, + TIMESTAMP, + DATE, + NULL, + UNKNOWN + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/page/Page.java b/core/src/main/java/org/opensearch/sql/planner/distributed/page/Page.java new file mode 100644 index 00000000000..926a8524bc8 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/page/Page.java @@ -0,0 +1,64 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.page; + +/** + * A batch of rows or columns flowing through the operator pipeline. Designed to be columnar-ready: + * Phase 5A uses a row-based implementation ({@link RowPage}), but future phases can swap in an + * Arrow-backed implementation for zero-copy columnar processing. + */ +public interface Page { + + /** Returns the number of rows in this page. */ + int getPositionCount(); + + /** Returns the number of columns in this page. */ + int getChannelCount(); + + /** + * Returns the value at the given row and column position. + * + * @param position the row index (0-based) + * @param channel the column index (0-based) + * @return the value, or null if the cell is null + */ + Object getValue(int position, int channel); + + /** + * Returns a sub-region of this page. + * + * @param positionOffset the starting row index + * @param length the number of rows in the region + * @return a new Page representing the sub-region + */ + Page getRegion(int positionOffset, int length); + + /** + * Returns the columnar block for the given channel. Default implementation throws + * UnsupportedOperationException; columnar Page implementations (e.g., Arrow-backed) override + * this. + * + * @param channel the column index (0-based) + * @return the block for the channel + */ + default Block getBlock(int channel) { + throw new UnsupportedOperationException( + "Columnar access not supported by " + getClass().getSimpleName()); + } + + /** + * Returns the estimated memory retained by this page in bytes. Default implementation estimates + * based on position count, channel count, and 8 bytes per value. + */ + default long getRetainedSizeBytes() { + return (long) getPositionCount() * getChannelCount() * 8L; + } + + /** Returns an empty page with zero rows and the given number of columns. */ + static Page empty(int channelCount) { + return new RowPage(new Object[0][channelCount], channelCount); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/page/PageBuilder.java b/core/src/main/java/org/opensearch/sql/planner/distributed/page/PageBuilder.java new file mode 100644 index 00000000000..1b3f0e76daa --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/page/PageBuilder.java @@ -0,0 +1,79 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.page; + +import java.util.ArrayList; +import java.util.List; + +/** + * Builds a {@link Page} row by row. 
Call {@link #beginRow()}, set values via {@link #setValue(int, + * Object)}, then {@link #endRow()} to commit. Call {@link #build()} to produce the final Page. + */ +public class PageBuilder { + + private final int channelCount; + private final List rows; + private Object[] currentRow; + + public PageBuilder(int channelCount) { + if (channelCount < 0) { + throw new IllegalArgumentException("channelCount must be non-negative: " + channelCount); + } + this.channelCount = channelCount; + this.rows = new ArrayList<>(); + } + + /** Starts a new row. Values default to null. */ + public void beginRow() { + currentRow = new Object[channelCount]; + } + + /** + * Sets a value in the current row. + * + * @param channel the column index (0-based) + * @param value the value to set + */ + public void setValue(int channel, Object value) { + if (currentRow == null) { + throw new IllegalStateException("beginRow() must be called before setValue()"); + } + if (channel < 0 || channel >= channelCount) { + throw new IndexOutOfBoundsException( + "Channel " + channel + " out of range [0, " + channelCount + ")"); + } + currentRow[channel] = value; + } + + /** Commits the current row to the page. */ + public void endRow() { + if (currentRow == null) { + throw new IllegalStateException("beginRow() must be called before endRow()"); + } + rows.add(currentRow); + currentRow = null; + } + + /** Returns the number of rows added so far. */ + public int getRowCount() { + return rows.size(); + } + + /** Returns true if no rows have been added. */ + public boolean isEmpty() { + return rows.isEmpty(); + } + + /** Builds the final Page from all committed rows and resets the builder. */ + public Page build() { + if (currentRow != null) { + throw new IllegalStateException("endRow() must be called before build()"); + } + Object[][] data = rows.toArray(new Object[0][]); + rows.clear(); + return new RowPage(data, channelCount); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/page/RowPage.java b/core/src/main/java/org/opensearch/sql/planner/distributed/page/RowPage.java new file mode 100644 index 00000000000..568fd93728b --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/page/RowPage.java @@ -0,0 +1,69 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.page; + +import java.util.Arrays; + +/** + * Simple row-based {@link Page} implementation. Each row is an Object array where the index + * corresponds to the column (channel) position. This is the Phase 5A implementation; future phases + * will add an Arrow-backed columnar implementation. + */ +public class RowPage implements Page { + + private final Object[][] rows; + private final int channelCount; + + /** + * Creates a RowPage from pre-built row data. 
+ * + * @param rows 2D array where rows[i][j] is the value at row i, column j + * @param channelCount the number of columns + */ + public RowPage(Object[][] rows, int channelCount) { + this.rows = rows; + this.channelCount = channelCount; + } + + @Override + public int getPositionCount() { + return rows.length; + } + + @Override + public int getChannelCount() { + return channelCount; + } + + @Override + public Object getValue(int position, int channel) { + if (position < 0 || position >= rows.length) { + throw new IndexOutOfBoundsException( + "Position " + position + " out of range [0, " + rows.length + ")"); + } + if (channel < 0 || channel >= channelCount) { + throw new IndexOutOfBoundsException( + "Channel " + channel + " out of range [0, " + channelCount + ")"); + } + return rows[position][channel]; + } + + @Override + public Page getRegion(int positionOffset, int length) { + if (positionOffset < 0 || positionOffset + length > rows.length) { + throw new IndexOutOfBoundsException( + "Region [" + + positionOffset + + ", " + + (positionOffset + length) + + ") out of range [0, " + + rows.length + + ")"); + } + Object[][] region = Arrays.copyOfRange(rows, positionOffset, positionOffset + length); + return new RowPage(region, channelCount); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/Pipeline.java b/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/Pipeline.java new file mode 100644 index 00000000000..2e63d8715d5 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/Pipeline.java @@ -0,0 +1,56 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.pipeline; + +import java.util.Collections; +import java.util.List; +import org.opensearch.sql.planner.distributed.operator.OperatorFactory; +import org.opensearch.sql.planner.distributed.operator.SourceOperatorFactory; + +/** + * An ordered chain of operator factories that defines the processing logic for a compute stage. The + * first element is a {@link SourceOperatorFactory} (reads from storage or exchange), followed by + * zero or more intermediate {@link OperatorFactory} instances (filter, project, aggregate, etc.). + */ +public class Pipeline { + + private final String pipelineId; + private final SourceOperatorFactory sourceFactory; + private final List operatorFactories; + + /** + * Creates a pipeline. + * + * @param pipelineId unique identifier + * @param sourceFactory the source operator factory (first in chain) + * @param operatorFactories ordered list of intermediate operator factories + */ + public Pipeline( + String pipelineId, + SourceOperatorFactory sourceFactory, + List operatorFactories) { + this.pipelineId = pipelineId; + this.sourceFactory = sourceFactory; + this.operatorFactories = Collections.unmodifiableList(operatorFactories); + } + + public String getPipelineId() { + return pipelineId; + } + + public SourceOperatorFactory getSourceFactory() { + return sourceFactory; + } + + public List getOperatorFactories() { + return operatorFactories; + } + + /** Returns the total number of operators (source + intermediates). 
*/ + public int getOperatorCount() { + return 1 + operatorFactories.size(); + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/PipelineContext.java b/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/PipelineContext.java new file mode 100644 index 00000000000..05a4ae0f7bc --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/PipelineContext.java @@ -0,0 +1,60 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.pipeline; + +import java.util.concurrent.atomic.AtomicBoolean; + +/** Runtime state for a pipeline execution. Tracks status and provides cancellation. */ +public class PipelineContext { + + /** Pipeline execution status. */ + public enum Status { + CREATED, + RUNNING, + FINISHED, + FAILED, + CANCELLED + } + + private volatile Status status; + private final AtomicBoolean cancelled; + private volatile String failureMessage; + + public PipelineContext() { + this.status = Status.CREATED; + this.cancelled = new AtomicBoolean(false); + } + + public Status getStatus() { + return status; + } + + public void setRunning() { + this.status = Status.RUNNING; + } + + public void setFinished() { + this.status = Status.FINISHED; + } + + public void setFailed(String message) { + this.status = Status.FAILED; + this.failureMessage = message; + } + + public void setCancelled() { + this.status = Status.CANCELLED; + this.cancelled.set(true); + } + + public boolean isCancelled() { + return cancelled.get(); + } + + public String getFailureMessage() { + return failureMessage; + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/PipelineDriver.java b/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/PipelineDriver.java new file mode 100644 index 00000000000..c4a26267bd5 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/pipeline/PipelineDriver.java @@ -0,0 +1,210 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.pipeline; + +import java.util.ArrayList; +import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.sql.planner.distributed.operator.Operator; +import org.opensearch.sql.planner.distributed.operator.OperatorContext; +import org.opensearch.sql.planner.distributed.operator.OperatorFactory; +import org.opensearch.sql.planner.distributed.operator.SourceOperator; +import org.opensearch.sql.planner.distributed.page.Page; +import org.opensearch.sql.planner.distributed.split.DataUnit; + +/** + * Executes a pipeline by driving data through a chain of operators. The driver implements a + * pull/push loop: it pulls output from upstream operators and pushes it as input to downstream + * operators. + * + *

<p>Execution model:
+ *
+ * <ol>
+ *   <li>Source operator produces pages from data units
+ *   <li>Each intermediate operator transforms pages
+ *   <li>The last operator (or sink) consumes the final output
+ *   <li>When all operators are finished, the pipeline is complete
+ * </ol>
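+ *
+ * <p>Typical usage, as a sketch (assumes {@code pipeline} and {@code dataUnits} have already
+ * been built):
+ *
+ * <pre>{@code
+ * OperatorContext ctx = OperatorContext.createDefault("driver-0");
+ * PipelineDriver driver = new PipelineDriver(pipeline, ctx, dataUnits);
+ * Page last = driver.run(); // may be null for sink-terminated pipelines
+ * }</pre>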
+ */ +public class PipelineDriver { + + private static final Logger log = LogManager.getLogger(PipelineDriver.class); + + private final SourceOperator sourceOperator; + private final List operators; + private final PipelineContext context; + + /** + * Creates a PipelineDriver from a Pipeline definition. + * + * @param pipeline the pipeline to execute + * @param operatorContext the context for creating operators + * @param dataUnits the data units to assign to the source operator + */ + public PipelineDriver( + Pipeline pipeline, OperatorContext operatorContext, List dataUnits) { + this.context = new PipelineContext(); + + // Create source operator + this.sourceOperator = pipeline.getSourceFactory().createOperator(operatorContext); + for (DataUnit dataUnit : dataUnits) { + this.sourceOperator.addDataUnit(dataUnit); + } + this.sourceOperator.noMoreDataUnits(); + + // Create intermediate operators + this.operators = new ArrayList<>(); + for (OperatorFactory factory : pipeline.getOperatorFactories()) { + this.operators.add(factory.createOperator(operatorContext)); + } + } + + /** + * Creates a PipelineDriver from pre-built operators (for testing). + * + * @param sourceOperator the source operator + * @param operators the intermediate operators + */ + public PipelineDriver(SourceOperator sourceOperator, List operators) { + this.context = new PipelineContext(); + this.sourceOperator = sourceOperator; + this.operators = new ArrayList<>(operators); + } + + /** + * Runs the pipeline to completion. Drives data from source through all operators until all are + * finished or cancellation is requested. + * + * @return the final output page from the last operator (may be null for sink pipelines) + */ + public Page run() { + context.setRunning(); + Page lastOutput = null; + + try { + while (!isFinished() && !context.isCancelled()) { + boolean madeProgress = processOnce(); + if (!madeProgress && !isFinished()) { + // No progress and not finished — avoid busy-wait + Thread.yield(); + } + } + + // Collect any remaining output from the last operator + if (!operators.isEmpty()) { + Page output = operators.get(operators.size() - 1).getOutput(); + if (output != null) { + lastOutput = output; + } + } else { + Page output = sourceOperator.getOutput(); + if (output != null) { + lastOutput = output; + } + } + + if (context.isCancelled()) { + context.setCancelled(); + } else { + context.setFinished(); + } + } catch (Exception e) { + context.setFailed(e.getMessage()); + throw new RuntimeException("Pipeline execution failed", e); + } finally { + closeAll(); + } + + return lastOutput; + } + + /** + * Processes one iteration of the pipeline loop. Returns true if any progress was made (data + * moved). 
+ */ + boolean processOnce() { + boolean madeProgress = false; + + // Drive source → first operator (or collect output if no intermediates) + if (!sourceOperator.isFinished()) { + Page sourcePage = sourceOperator.getOutput(); + if (sourcePage != null && sourcePage.getPositionCount() > 0) { + if (!operators.isEmpty() && operators.get(0).needsInput()) { + operators.get(0).addInput(sourcePage); + madeProgress = true; + } + } + } else if (!operators.isEmpty()) { + // Source finished — signal finish to first operator + Operator first = operators.get(0); + if (!first.isFinished()) { + first.finish(); + madeProgress = true; + } + } + + // Drive through intermediate operators: operator[i] → operator[i+1] + for (int i = 0; i < operators.size() - 1; i++) { + Operator current = operators.get(i); + Operator next = operators.get(i + 1); + + Page output = current.getOutput(); + if (output != null && output.getPositionCount() > 0 && next.needsInput()) { + next.addInput(output); + madeProgress = true; + } + + if (current.isFinished() && !next.isFinished()) { + next.finish(); + madeProgress = true; + } + } + + // Drain the last operator's output so it can transition to finished. + // Without this, operators that buffer pages (e.g., PassThroughOperator) + // would never have getOutput() called, preventing isFinished() from + // returning true. + if (!operators.isEmpty()) { + Operator last = operators.get(operators.size() - 1); + Page output = last.getOutput(); + if (output != null) { + madeProgress = true; + } + } + + return madeProgress; + } + + /** Returns true if all operators have finished processing. */ + public boolean isFinished() { + if (!operators.isEmpty()) { + return operators.get(operators.size() - 1).isFinished(); + } + return sourceOperator.isFinished(); + } + + /** Returns the pipeline execution context. */ + public PipelineContext getContext() { + return context; + } + + /** Closes all operators, releasing resources. */ + private void closeAll() { + try { + sourceOperator.close(); + } catch (Exception e) { + log.warn("Error closing source operator", e); + } + for (Operator op : operators) { + try { + op.close(); + } catch (Exception e) { + log.warn("Error closing operator", e); + } + } + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/planner/CostEstimator.java b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/CostEstimator.java new file mode 100644 index 00000000000..efc7a39f8bb --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/CostEstimator.java @@ -0,0 +1,42 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.planner; + +import org.apache.calcite.rel.RelNode; + +/** + * Estimates the cost of executing a RelNode subtree. Used by the physical planner to make decisions + * about stage boundaries, exchange types (broadcast vs. hash repartition), and operator placement. + * + *

Phase 5A defines the interface. Phase 5G implements it using Lucene statistics (doc count, + * field cardinality, selectivity estimates). + */ +public interface CostEstimator { + + /** + * Estimates the number of output rows for a RelNode. + * + * @param relNode the plan node to estimate + * @return estimated row count + */ + long estimateRowCount(RelNode relNode); + + /** + * Estimates the output size in bytes for a RelNode. + * + * @param relNode the plan node to estimate + * @return estimated size in bytes + */ + long estimateSizeBytes(RelNode relNode); + + /** + * Estimates the selectivity of a filter condition (0.0 to 1.0). + * + * @param relNode the filter node + * @return selectivity ratio + */ + double estimateSelectivity(RelNode relNode); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/planner/FragmentationContext.java b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/FragmentationContext.java new file mode 100644 index 00000000000..7c8377a1d79 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/FragmentationContext.java @@ -0,0 +1,38 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.planner; + +import java.util.List; +import org.opensearch.sql.planner.distributed.split.DataUnitSource; + +/** + * Provides context to the {@link PlanFragmenter} during plan fragmentation. Supplies information + * about cluster topology, cost estimates, and data unit discovery needed to make fragmentation + * decisions (e.g., broadcast vs. hash repartition, stage parallelism). + */ +public interface FragmentationContext { + + /** Returns the list of available data node IDs in the cluster. */ + List getAvailableNodes(); + + /** Returns the cost estimator for sizing stages and choosing exchange types. */ + CostEstimator getCostEstimator(); + + /** + * Returns a data unit source for the given table name. Used to discover shards and their + * locations during fragmentation. + * + * @param tableName the table (index) name + * @return the data unit source for shard discovery + */ + DataUnitSource getDataUnitSource(String tableName); + + /** Returns the maximum number of tasks per stage (limits parallelism). */ + int getMaxTasksPerStage(); + + /** Returns the node ID of the coordinator node. */ + String getCoordinatorNodeId(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/planner/PhysicalPlanner.java b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/PhysicalPlanner.java new file mode 100644 index 00000000000..13650af5084 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/PhysicalPlanner.java @@ -0,0 +1,25 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.planner; + +import org.apache.calcite.rel.RelNode; +import org.opensearch.sql.planner.distributed.stage.StagedPlan; + +/** + * Converts a Calcite logical plan (RelNode) into a distributed execution plan (StagedPlan). + * Implementations walk the RelNode tree, decide stage boundaries (where exchanges go), and build + * operator pipelines for each stage. + */ +public interface PhysicalPlanner { + + /** + * Plans a Calcite RelNode tree into a distributed StagedPlan. 
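+ * For example, with an assumed {@code planner} implementation: {@code StagedPlan staged =
+ * planner.plan(optimizedRel);}. The resulting stages are then scheduled by the execution layer.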
+ * + * @param relNode the optimized Calcite logical plan + * @return the distributed execution plan + */ + StagedPlan plan(RelNode relNode); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/planner/PlanFragmenter.java b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/PlanFragmenter.java new file mode 100644 index 00000000000..51799eec462 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/PlanFragmenter.java @@ -0,0 +1,36 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.planner; + +import org.apache.calcite.rel.RelNode; +import org.opensearch.sql.planner.distributed.stage.StagedPlan; + +/** + * Fragments an optimized Calcite RelNode tree into a multi-stage distributed execution plan. Walks + * the RelNode tree, identifies stage boundaries (where exchanges are needed), and creates {@link + * SubPlan} fragments for each stage. Replaces the manual stage creation in the old {@code + * DistributedQueryPlanner}. + * + *

<p>Stage boundaries are inserted at:
+ *
+ * <ul>
+ *   <li>Table scans (leaf stages)
+ *   <li>Aggregations requiring repartition (hash exchange)
+ *   <li>Joins requiring repartition or broadcast
+ *   <li>Sorts requiring a gather exchange
+ * </ul>
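+ *
+ * <p>For instance, a grouped aggregation such as {@code source=logs | stats count() by region}
+ * would fragment into roughly the following stages (a sketch, not actual planner output):
+ *
+ * <pre>
+ * Stage 0 (leaf): scan of logs shards, partial aggregation pushed down
+ * Stage 1:        final aggregation, input repartitioned by hash of region
+ * Stage 2 (root): gather to the coordinator for the final merge
+ * </pre>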
+ */ +public interface PlanFragmenter { + + /** + * Fragments an optimized RelNode tree into a staged execution plan. + * + * @param optimizedPlan the Calcite-optimized RelNode tree + * @param context fragmentation context providing cluster topology and cost estimates + * @return the staged distributed execution plan + */ + StagedPlan fragment(RelNode optimizedPlan, FragmentationContext context); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/planner/SubPlan.java b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/SubPlan.java new file mode 100644 index 00000000000..7a9b2070812 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/planner/SubPlan.java @@ -0,0 +1,70 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.planner; + +import java.util.Collections; +import java.util.List; +import org.apache.calcite.rel.RelNode; +import org.opensearch.sql.planner.distributed.stage.PartitioningScheme; + +/** + * A fragment of the query plan that executes within a single stage. Contains a sub-plan (Calcite + * RelNode tree) that can be sent to data nodes for local execution, enabling query pushdown. + * + *

The {@code root} RelNode represents the computation to execute locally on each data node. For + * example, a scan stage's SubPlan might contain: Filter → TableScan, allowing the data node to + * apply the filter during scanning rather than sending all data to the coordinator. + */ +public class SubPlan { + + private final String fragmentId; + private final RelNode root; + private final PartitioningScheme outputPartitioning; + private final List children; + + public SubPlan( + String fragmentId, + RelNode root, + PartitioningScheme outputPartitioning, + List children) { + this.fragmentId = fragmentId; + this.root = root; + this.outputPartitioning = outputPartitioning; + this.children = Collections.unmodifiableList(children); + } + + /** Returns the unique identifier for this plan fragment. */ + public String getFragmentId() { + return fragmentId; + } + + /** Returns the root of the sub-plan RelNode tree for data node execution. */ + public RelNode getRoot() { + return root; + } + + /** Returns the output partitioning scheme for this fragment. */ + public PartitioningScheme getOutputPartitioning() { + return outputPartitioning; + } + + /** Returns child sub-plans that feed data into this fragment. */ + public List getChildren() { + return children; + } + + @Override + public String toString() { + return "SubPlan{" + + "id='" + + fragmentId + + "', partitioning=" + + outputPartitioning.getExchangeType() + + ", children=" + + children.size() + + '}'; + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnit.java b/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnit.java new file mode 100644 index 00000000000..52a6738fa16 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnit.java @@ -0,0 +1,44 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.split; + +import java.util.List; +import java.util.Map; + +/** + * A unit of data assigned to a SourceOperator. Each DataUnit represents a portion of data to read — + * typically one OpenSearch shard. Includes preferred nodes for data locality and estimated size for + * load balancing. + * + *

Subclasses provide storage-specific details (e.g., {@code OpenSearchDataUnit} adds index name + * and shard ID). + */ +public abstract class DataUnit { + + /** Returns a unique identifier for this data unit. */ + public abstract String getDataUnitId(); + + /** Returns the nodes where this data unit can be read locally (primary + replicas). */ + public abstract List getPreferredNodes(); + + /** Returns the estimated number of rows in this data unit. */ + public abstract long getEstimatedRows(); + + /** Returns the estimated size in bytes of this data unit. */ + public abstract long getEstimatedSizeBytes(); + + /** Returns storage-specific properties for this data unit. */ + public abstract Map getProperties(); + + /** + * Returns whether this data unit can be read from any node (true) or requires execution on a + * preferred node (false). Default is true; OpenSearch shard data units override to false because + * Lucene requires local access. + */ + public boolean isRemotelyAccessible() { + return true; + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnitAssignment.java b/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnitAssignment.java new file mode 100644 index 00000000000..8f0f983bd3d --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnitAssignment.java @@ -0,0 +1,26 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.split; + +import java.util.List; +import java.util.Map; + +/** + * Assigns data units to nodes, respecting data locality and load balance. Implementations decide + * which node should process each data unit based on preferred nodes, current load, and cluster + * topology. + */ +public interface DataUnitAssignment { + + /** + * Assigns data units to nodes. + * + * @param dataUnits the data units to assign + * @param availableNodes the nodes available for execution + * @return a mapping from node ID to the list of data units assigned to that node + */ + Map> assign(List dataUnits, List availableNodes); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnitSource.java b/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnitSource.java new file mode 100644 index 00000000000..5bfa5ddd623 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/split/DataUnitSource.java @@ -0,0 +1,39 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.split; + +import java.util.List; + +/** + * Generates {@link DataUnit}s for a source operator. Implementations discover available data units + * (e.g., shards) from cluster state and create them with preferred node information. + */ +public interface DataUnitSource extends AutoCloseable { + + /** + * Returns the next batch of data units, up to the specified maximum batch size. Returns an empty + * list if no more data units are available. + * + * @param maxBatchSize maximum number of data units to return + * @return list of data units + */ + List getNextBatch(int maxBatchSize); + + /** + * Returns the next batch of data units with a default batch size. + * + * @return list of data units + */ + default List getNextBatch() { + return getNextBatch(1000); + } + + /** Returns true if all data units have been generated. 
*/ + boolean isFinished(); + + @Override + void close(); +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/stage/ComputeStage.java b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/ComputeStage.java new file mode 100644 index 00000000000..a71eeeca093 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/ComputeStage.java @@ -0,0 +1,149 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.stage; + +import java.util.Collections; +import java.util.List; +import org.apache.calcite.rel.RelNode; +import org.opensearch.sql.planner.distributed.operator.OperatorFactory; +import org.opensearch.sql.planner.distributed.operator.SourceOperatorFactory; +import org.opensearch.sql.planner.distributed.split.DataUnit; + +/** + * A portion of the distributed plan that runs as a pipeline on one or more nodes. Each ComputeStage + * contains a pipeline of operators (source + transforms), an output partitioning scheme (how + * results flow to the next stage), and metadata about dependencies and parallelism. + * + *
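+ * <p>For example, a leaf scan stage might be assembled as follows (factory and variable names
+ * are assumed for illustration):
+ *
+ * <pre>{@code
+ * ComputeStage scanStage =
+ *     new ComputeStage(
+ *         "stage-0",
+ *         shardScanFactory,        // SourceOperatorFactory reading Lucene shards
+ *         List.of(filterFactory),  // intermediate OperatorFactory instances
+ *         PartitioningScheme.gather(),
+ *         List.of(),               // no upstream stages: this is a leaf
+ *         shardDataUnits,
+ *         estimatedRows,
+ *         estimatedBytes);
+ * }</pre>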

Naming follows the convention: "ComputeStage" (not "Fragment") — a unit of distributed + * computation. + */ +public class ComputeStage { + + private final String stageId; + private final SourceOperatorFactory sourceFactory; + private final List operatorFactories; + private final PartitioningScheme outputPartitioning; + private final List sourceStageIds; + private final List dataUnits; + private final long estimatedRows; + private final long estimatedBytes; + private final RelNode planFragment; + + public ComputeStage( + String stageId, + SourceOperatorFactory sourceFactory, + List operatorFactories, + PartitioningScheme outputPartitioning, + List sourceStageIds, + List dataUnits, + long estimatedRows, + long estimatedBytes) { + this( + stageId, + sourceFactory, + operatorFactories, + outputPartitioning, + sourceStageIds, + dataUnits, + estimatedRows, + estimatedBytes, + null); + } + + public ComputeStage( + String stageId, + SourceOperatorFactory sourceFactory, + List operatorFactories, + PartitioningScheme outputPartitioning, + List sourceStageIds, + List dataUnits, + long estimatedRows, + long estimatedBytes, + RelNode planFragment) { + this.stageId = stageId; + this.sourceFactory = sourceFactory; + this.operatorFactories = Collections.unmodifiableList(operatorFactories); + this.outputPartitioning = outputPartitioning; + this.sourceStageIds = Collections.unmodifiableList(sourceStageIds); + this.dataUnits = Collections.unmodifiableList(dataUnits); + this.estimatedRows = estimatedRows; + this.estimatedBytes = estimatedBytes; + this.planFragment = planFragment; + } + + public String getStageId() { + return stageId; + } + + public SourceOperatorFactory getSourceFactory() { + return sourceFactory; + } + + /** Returns the ordered list of intermediate operator factories (after source). */ + public List getOperatorFactories() { + return operatorFactories; + } + + /** Returns how this stage's output is partitioned for the downstream stage. */ + public PartitioningScheme getOutputPartitioning() { + return outputPartitioning; + } + + /** Returns the IDs of upstream stages that feed data into this stage. */ + public List getSourceStageIds() { + return sourceStageIds; + } + + /** Returns the data units assigned to this stage (for source stages with shard assignments). */ + public List getDataUnits() { + return dataUnits; + } + + /** Returns the estimated row count for this stage's output. */ + public long getEstimatedRows() { + return estimatedRows; + } + + /** Returns the estimated byte size for this stage's output. */ + public long getEstimatedBytes() { + return estimatedBytes; + } + + /** + * Returns the sub-plan (Calcite RelNode) for data node execution, or null if this stage does not + * push down a plan fragment. Enables query pushdown: the data node can execute this sub-plan + * locally instead of just scanning raw data. + */ + public RelNode getPlanFragment() { + return planFragment; + } + + /** Returns true if this is a leaf stage (no upstream dependencies). */ + public boolean isLeaf() { + return sourceStageIds.isEmpty(); + } + + /** Returns the total operator count (source + intermediates). 
*/ + public int getOperatorCount() { + return 1 + operatorFactories.size(); + } + + @Override + public String toString() { + return "ComputeStage{" + + "id='" + + stageId + + "', operators=" + + getOperatorCount() + + ", exchange=" + + outputPartitioning.getExchangeType() + + ", dataUnits=" + + dataUnits.size() + + ", deps=" + + sourceStageIds + + '}'; + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/stage/ExchangeType.java b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/ExchangeType.java new file mode 100644 index 00000000000..3b4a84d47b2 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/ExchangeType.java @@ -0,0 +1,21 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.stage; + +/** How data is exchanged between compute stages. */ +public enum ExchangeType { + /** All data flows to a single node (coordinator). Used for final merge. */ + GATHER, + + /** Data is repartitioned by hash key across nodes. Used for distributed joins and aggs. */ + HASH_REPARTITION, + + /** Data is sent to all downstream nodes. Used for broadcast joins (small table). */ + BROADCAST, + + /** No exchange — stage runs locally after the previous stage on the same node. */ + NONE +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/stage/PartitioningScheme.java b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/PartitioningScheme.java new file mode 100644 index 00000000000..15e97ef7aed --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/PartitioningScheme.java @@ -0,0 +1,50 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.stage; + +import java.util.Collections; +import java.util.List; + +/** Describes how a stage's output data is partitioned across nodes. */ +public class PartitioningScheme { + + private final ExchangeType exchangeType; + private final List<Integer> hashChannels; + + private PartitioningScheme(ExchangeType exchangeType, List<Integer> hashChannels) { + this.exchangeType = exchangeType; + this.hashChannels = Collections.unmodifiableList(hashChannels); + } + + /** Creates a GATHER partitioning (all data to coordinator). */ + public static PartitioningScheme gather() { + return new PartitioningScheme(ExchangeType.GATHER, List.of()); + } + + /** Creates a HASH_REPARTITION partitioning on the given column indices. */ + public static PartitioningScheme hashRepartition(List<Integer> hashChannels) { + return new PartitioningScheme(ExchangeType.HASH_REPARTITION, hashChannels); + } + + /** Creates a BROADCAST partitioning (all data to all nodes). */ + public static PartitioningScheme broadcast() { + return new PartitioningScheme(ExchangeType.BROADCAST, List.of()); + } + + /** Creates a NONE partitioning (no exchange). */ + public static PartitioningScheme none() { + return new PartitioningScheme(ExchangeType.NONE, List.of()); + } + + public ExchangeType getExchangeType() { + return exchangeType; + } + + /** Returns the column indices used for hash partitioning. Empty for non-hash schemes.
*/ + public List<Integer> getHashChannels() { + return hashChannels; + } +} diff --git a/core/src/main/java/org/opensearch/sql/planner/distributed/stage/StagedPlan.java b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/StagedPlan.java new file mode 100644 index 00000000000..d8aed791a9a --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/planner/distributed/stage/StagedPlan.java @@ -0,0 +1,96 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.stage; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * The complete distributed execution plan as a tree of {@link ComputeStage}s. Created by the + * physical planner from a Calcite RelNode tree. Stages are ordered by dependency — leaf stages + * (scans) first, root stage (final merge) last. + */ +public class StagedPlan { + + private final String planId; + private final List<ComputeStage> stages; + + public StagedPlan(String planId, List<ComputeStage> stages) { + this.planId = planId; + this.stages = Collections.unmodifiableList(stages); + } + + public String getPlanId() { + return planId; + } + + /** Returns all stages in dependency order (leaves first, root last). */ + public List<ComputeStage> getStages() { + return stages; + } + + /** Returns the root stage (last in the list — typically the coordinator merge stage). */ + public ComputeStage getRootStage() { + if (stages.isEmpty()) { + throw new IllegalStateException("StagedPlan has no stages"); + } + return stages.get(stages.size() - 1); + } + + /** Returns leaf stages (stages with no upstream dependencies). */ + public List<ComputeStage> getLeafStages() { + return stages.stream().filter(ComputeStage::isLeaf).collect(Collectors.toList()); + } + + /** Returns a stage by its ID. */ + public ComputeStage getStage(String stageId) { + return stages.stream() + .filter(s -> s.getStageId().equals(stageId)) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("Stage not found: " + stageId)); + } + + /** Returns the total number of stages. */ + public int getStageCount() { + return stages.size(); + } + + /** + * Validates the plan. Returns a list of validation errors, or empty list if valid.
+ * + * @return list of error messages + */ + public List<String> validate() { + List<String> errors = new ArrayList<>(); + if (planId == null || planId.isEmpty()) { + errors.add("Plan ID is required"); + } + if (stages.isEmpty()) { + errors.add("Plan must have at least one stage"); + } + + // Check that all referenced source stages exist + Map<String, ComputeStage> stageMap = + stages.stream().collect(Collectors.toMap(ComputeStage::getStageId, s -> s)); + for (ComputeStage stage : stages) { + for (String depId : stage.getSourceStageIds()) { + if (!stageMap.containsKey(depId)) { + errors.add("Stage '" + stage.getStageId() + "' references unknown stage: " + depId); + } + } + } + + return errors; + } + + @Override + public String toString() { + return "StagedPlan{id='" + planId + "', stages=" + stages.size() + '}'; + } +} diff --git a/core/src/test/java/org/opensearch/sql/planner/distributed/page/PageBuilderTest.java b/core/src/test/java/org/opensearch/sql/planner/distributed/page/PageBuilderTest.java new file mode 100644 index 00000000000..dc5ff6c1840 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/planner/distributed/page/PageBuilderTest.java @@ -0,0 +1,98 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.page; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.DisplayNameGeneration; +import org.junit.jupiter.api.DisplayNameGenerator; +import org.junit.jupiter.api.Test; + +@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) +class PageBuilderTest { + + @Test + void should_build_page_row_by_row() { + PageBuilder builder = new PageBuilder(3); + + builder.beginRow(); + builder.setValue(0, "Alice"); + builder.setValue(1, 30); + builder.setValue(2, 1000.0); + builder.endRow(); + + builder.beginRow(); + builder.setValue(0, "Bob"); + builder.setValue(1, 25); + builder.setValue(2, 2000.0); + builder.endRow(); + + Page page = builder.build(); + assertEquals(2, page.getPositionCount()); + assertEquals(3, page.getChannelCount()); + assertEquals("Alice", page.getValue(0, 0)); + assertEquals(2000.0, page.getValue(1, 2)); + } + + @Test + void should_track_row_count() { + PageBuilder builder = new PageBuilder(2); + assertTrue(builder.isEmpty()); + assertEquals(0, builder.getRowCount()); + + builder.beginRow(); + builder.setValue(0, "A"); + builder.setValue(1, 1); + builder.endRow(); + + assertEquals(1, builder.getRowCount()); + } + + @Test + void should_reset_after_build() { + PageBuilder builder = new PageBuilder(1); + builder.beginRow(); + builder.setValue(0, "test"); + builder.endRow(); + + Page page = builder.build(); + assertEquals(1, page.getPositionCount()); + + // Builder should be empty after build + assertTrue(builder.isEmpty()); + assertEquals(0, builder.getRowCount()); + } + + @Test + void should_throw_on_set_before_begin() { + PageBuilder builder = new PageBuilder(2); + assertThrows(IllegalStateException.class, () -> builder.setValue(0, "value")); + } + + @Test + void should_throw_on_end_before_begin() { + PageBuilder builder = new PageBuilder(2); + assertThrows(IllegalStateException.class, () -> builder.endRow()); + } + + @Test + void should_throw_on_build_with_uncommitted_row() { + PageBuilder builder = new PageBuilder(2); + builder.beginRow(); + builder.setValue(0, "value"); + assertThrows(IllegalStateException.class, () -> builder.build()); + }
+ + @Test + void should_throw_on_invalid_channel() { + PageBuilder builder = new PageBuilder(2); + builder.beginRow(); + assertThrows(IndexOutOfBoundsException.class, () -> builder.setValue(2, "value")); + assertThrows(IndexOutOfBoundsException.class, () -> builder.setValue(-1, "value")); + } +} diff --git a/core/src/test/java/org/opensearch/sql/planner/distributed/page/RowPageTest.java b/core/src/test/java/org/opensearch/sql/planner/distributed/page/RowPageTest.java new file mode 100644 index 00000000000..f4443d566d1 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/planner/distributed/page/RowPageTest.java @@ -0,0 +1,105 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.page; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.DisplayNameGeneration; +import org.junit.jupiter.api.DisplayNameGenerator; +import org.junit.jupiter.api.Test; + +@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) +class RowPageTest { + + @Test + void should_create_page_with_rows_and_columns() { + Object[][] data = { + {"Alice", 30, 1000.0}, + {"Bob", 25, 2000.0} + }; + RowPage page = new RowPage(data, 3); + + assertEquals(2, page.getPositionCount()); + assertEquals(3, page.getChannelCount()); + } + + @Test + void should_access_values_by_position_and_channel() { + Object[][] data = { + {"Alice", 30, 1000.0}, + {"Bob", 25, 2000.0} + }; + RowPage page = new RowPage(data, 3); + + assertEquals("Alice", page.getValue(0, 0)); + assertEquals(30, page.getValue(0, 1)); + assertEquals(1000.0, page.getValue(0, 2)); + assertEquals("Bob", page.getValue(1, 0)); + assertEquals(25, page.getValue(1, 1)); + } + + @Test + void should_handle_null_values() { + Object[][] data = {{null, 30, null}}; + RowPage page = new RowPage(data, 3); + + assertNull(page.getValue(0, 0)); + assertEquals(30, page.getValue(0, 1)); + assertNull(page.getValue(0, 2)); + } + + @Test + void should_create_sub_region() { + Object[][] data = { + {"Alice", 30}, + {"Bob", 25}, + {"Charlie", 35}, + {"Diana", 28} + }; + RowPage page = new RowPage(data, 2); + + Page region = page.getRegion(1, 2); + assertEquals(2, region.getPositionCount()); + assertEquals("Bob", region.getValue(0, 0)); + assertEquals("Charlie", region.getValue(1, 0)); + } + + @Test + void should_create_empty_page() { + Page empty = Page.empty(3); + assertEquals(0, empty.getPositionCount()); + assertEquals(3, empty.getChannelCount()); + } + + @Test + void should_throw_on_invalid_position() { + Object[][] data = {{"Alice", 30}}; + RowPage page = new RowPage(data, 2); + + assertThrows(IndexOutOfBoundsException.class, () -> page.getValue(-1, 0)); + assertThrows(IndexOutOfBoundsException.class, () -> page.getValue(1, 0)); + } + + @Test + void should_throw_on_invalid_channel() { + Object[][] data = {{"Alice", 30}}; + RowPage page = new RowPage(data, 2); + + assertThrows(IndexOutOfBoundsException.class, () -> page.getValue(0, -1)); + assertThrows(IndexOutOfBoundsException.class, () -> page.getValue(0, 2)); + } + + @Test + void should_throw_on_invalid_region() { + Object[][] data = {{"Alice"}, {"Bob"}}; + RowPage page = new RowPage(data, 1); + + assertThrows(IndexOutOfBoundsException.class, () -> page.getRegion(1, 3)); + assertThrows(IndexOutOfBoundsException.class, () -> page.getRegion(-1, 1)); + } +} diff --git 
a/core/src/test/java/org/opensearch/sql/planner/distributed/pipeline/PipelineDriverTest.java b/core/src/test/java/org/opensearch/sql/planner/distributed/pipeline/PipelineDriverTest.java new file mode 100644 index 00000000000..5a8735f4b6d --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/planner/distributed/pipeline/PipelineDriverTest.java @@ -0,0 +1,232 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.pipeline; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.List; +import org.junit.jupiter.api.DisplayNameGeneration; +import org.junit.jupiter.api.DisplayNameGenerator; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.planner.distributed.operator.Operator; +import org.opensearch.sql.planner.distributed.operator.OperatorContext; +import org.opensearch.sql.planner.distributed.operator.SourceOperator; +import org.opensearch.sql.planner.distributed.page.Page; +import org.opensearch.sql.planner.distributed.page.PageBuilder; +import org.opensearch.sql.planner.distributed.split.DataUnit; + +@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) +class PipelineDriverTest { + + @Test + void should_run_source_only_pipeline() { + // Given: A source that produces one page + MockSourceOperator source = new MockSourceOperator(List.of(createTestPage(3, 2))); + + // When + PipelineDriver driver = new PipelineDriver(source, List.of()); + Page result = driver.run(); + + // Then + assertTrue(driver.isFinished()); + assertEquals(PipelineContext.Status.FINISHED, driver.getContext().getStatus()); + } + + @Test + void should_run_source_to_transform_pipeline() { + // Given: Source produces a page, transform doubles column 1 values + Page inputPage = createTestPage(3, 2); + MockSourceOperator source = new MockSourceOperator(List.of(inputPage)); + PassThroughOperator passThrough = new PassThroughOperator(); + + // When + PipelineDriver driver = new PipelineDriver(source, List.of(passThrough)); + driver.run(); + + // Then + assertTrue(driver.isFinished()); + assertTrue(passThrough.receivedPages > 0); + } + + @Test + void should_run_source_to_sink_pipeline() { + // Given: Source produces pages, sink collects them + Page page1 = createTestPage(2, 2); + Page page2 = createTestPage(3, 2); + MockSourceOperator source = new MockSourceOperator(List.of(page1, page2)); + CollectingSinkOperator sink = new CollectingSinkOperator(); + + // When + PipelineDriver driver = new PipelineDriver(source, List.of(sink)); + driver.run(); + + // Then + assertTrue(driver.isFinished()); + assertEquals(2, sink.collectedPages.size()); + assertEquals(2, sink.collectedPages.get(0).getPositionCount()); + assertEquals(3, sink.collectedPages.get(1).getPositionCount()); + } + + @Test + void should_chain_multiple_operators() { + // Given: source → passthrough1 → passthrough2 → sink + Page inputPage = createTestPage(5, 3); + MockSourceOperator source = new MockSourceOperator(List.of(inputPage)); + PassThroughOperator pass1 = new PassThroughOperator(); + PassThroughOperator pass2 = new PassThroughOperator(); + CollectingSinkOperator sink = new CollectingSinkOperator(); + + // When + PipelineDriver driver = new PipelineDriver(source, List.of(pass1, pass2, sink)); + driver.run(); + + // Then + assertTrue(driver.isFinished()); + assertTrue(pass1.receivedPages > 0); + assertTrue(pass2.receivedPages > 0); + 
assertEquals(1, sink.collectedPages.size()); + } + + private Page createTestPage(int rows, int cols) { + PageBuilder builder = new PageBuilder(cols); + for (int r = 0; r < rows; r++) { + builder.beginRow(); + for (int c = 0; c < cols; c++) { + builder.setValue(c, "r" + r + "c" + c); + } + builder.endRow(); + } + return builder.build(); + } + + /** Mock source operator that produces pre-built pages. */ + static class MockSourceOperator implements SourceOperator { + private final List pages; + private int index = 0; + private boolean finished = false; + + MockSourceOperator(List pages) { + this.pages = new ArrayList<>(pages); + } + + @Override + public void addDataUnit(DataUnit dataUnit) {} + + @Override + public void noMoreDataUnits() {} + + @Override + public Page getOutput() { + if (index < pages.size()) { + return pages.get(index++); + } + finished = true; + return null; + } + + @Override + public boolean isFinished() { + return finished && index >= pages.size(); + } + + @Override + public void finish() { + finished = true; + } + + @Override + public OperatorContext getContext() { + return OperatorContext.createDefault("mock-source"); + } + + @Override + public void close() {} + } + + /** Pass-through operator that forwards pages unchanged. */ + static class PassThroughOperator implements Operator { + private Page buffered; + private boolean finished = false; + int receivedPages = 0; + + @Override + public boolean needsInput() { + return buffered == null && !finished; + } + + @Override + public void addInput(Page page) { + buffered = page; + receivedPages++; + } + + @Override + public Page getOutput() { + Page out = buffered; + buffered = null; + return out; + } + + @Override + public boolean isFinished() { + return finished && buffered == null; + } + + @Override + public void finish() { + finished = true; + } + + @Override + public OperatorContext getContext() { + return OperatorContext.createDefault("passthrough"); + } + + @Override + public void close() {} + } + + /** Sink operator that collects all received pages. 
*/ + static class CollectingSinkOperator implements Operator { + final List collectedPages = new ArrayList<>(); + private boolean finished = false; + + @Override + public boolean needsInput() { + return !finished; + } + + @Override + public void addInput(Page page) { + collectedPages.add(page); + } + + @Override + public Page getOutput() { + return null; + } + + @Override + public boolean isFinished() { + return finished; + } + + @Override + public void finish() { + finished = true; + } + + @Override + public OperatorContext getContext() { + return OperatorContext.createDefault("sink"); + } + + @Override + public void close() {} + } +} diff --git a/core/src/test/java/org/opensearch/sql/planner/distributed/stage/ComputeStageTest.java b/core/src/test/java/org/opensearch/sql/planner/distributed/stage/ComputeStageTest.java new file mode 100644 index 00000000000..9b2b5675050 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/planner/distributed/stage/ComputeStageTest.java @@ -0,0 +1,301 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.planner.distributed.stage; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.DisplayNameGeneration; +import org.junit.jupiter.api.DisplayNameGenerator; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.planner.distributed.operator.Operator; +import org.opensearch.sql.planner.distributed.operator.OperatorContext; +import org.opensearch.sql.planner.distributed.operator.OperatorFactory; +import org.opensearch.sql.planner.distributed.operator.SourceOperator; +import org.opensearch.sql.planner.distributed.operator.SourceOperatorFactory; +import org.opensearch.sql.planner.distributed.page.Page; +import org.opensearch.sql.planner.distributed.split.DataUnit; + +@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) +class ComputeStageTest { + + @Test + void should_create_leaf_stage_with_data_units() { + DataUnit du1 = new TestDataUnit("accounts/0", List.of("node-1", "node-2"), 50000L); + DataUnit du2 = new TestDataUnit("accounts/1", List.of("node-2", "node-3"), 45000L); + + ComputeStage stage = + new ComputeStage( + "stage-0", + new NoOpSourceFactory(), + List.of(), + PartitioningScheme.gather(), + List.of(), + List.of(du1, du2), + 95000L, + 0L); + + assertEquals("stage-0", stage.getStageId()); + assertTrue(stage.isLeaf()); + assertEquals(2, stage.getDataUnits().size()); + assertEquals(1, stage.getOperatorCount()); + assertEquals(ExchangeType.GATHER, stage.getOutputPartitioning().getExchangeType()); + assertEquals(95000L, stage.getEstimatedRows()); + } + + @Test + void should_create_non_leaf_stage_with_dependencies() { + ComputeStage stage = + new ComputeStage( + "stage-1", + new NoOpSourceFactory(), + List.of(new NoOpOperatorFactory()), + PartitioningScheme.none(), + List.of("stage-0"), + List.of(), + 0L, + 0L); + + assertFalse(stage.isLeaf()); + assertEquals(List.of("stage-0"), stage.getSourceStageIds()); + assertEquals(2, stage.getOperatorCount()); + } + + @Test + void should_create_staged_plan() { + ComputeStage scan = + new ComputeStage( + "scan", + new NoOpSourceFactory(), + List.of(), + PartitioningScheme.gather(), + List.of(), + List.of(new 
TestDataUnit("idx/0", List.of("n1"), 1000L)), + 1000L, + 0L); + + ComputeStage merge = + new ComputeStage( + "merge", + new NoOpSourceFactory(), + List.of(), + PartitioningScheme.none(), + List.of("scan"), + List.of(), + 1000L, + 0L); + + StagedPlan plan = new StagedPlan("plan-1", List.of(scan, merge)); + + assertEquals("plan-1", plan.getPlanId()); + assertEquals(2, plan.getStageCount()); + assertEquals("merge", plan.getRootStage().getStageId()); + assertEquals(1, plan.getLeafStages().size()); + assertEquals("scan", plan.getLeafStages().get(0).getStageId()); + } + + @Test + void should_validate_staged_plan() { + StagedPlan validPlan = + new StagedPlan( + "p1", + List.of( + new ComputeStage( + "s1", + new NoOpSourceFactory(), + List.of(), + PartitioningScheme.gather(), + List.of(), + List.of(), + 0L, + 0L))); + + assertTrue(validPlan.validate().isEmpty()); + } + + @Test + void should_detect_invalid_plan() { + // Null plan ID + StagedPlan nullId = new StagedPlan(null, List.of()); + assertFalse(nullId.validate().isEmpty()); + + // Empty stages + StagedPlan noStages = new StagedPlan("p1", List.of()); + assertFalse(noStages.validate().isEmpty()); + + // Reference to non-existent stage + StagedPlan badRef = + new StagedPlan( + "p1", + List.of( + new ComputeStage( + "s1", + new NoOpSourceFactory(), + List.of(), + PartitioningScheme.none(), + List.of("nonexistent"), + List.of(), + 0L, + 0L))); + assertFalse(badRef.validate().isEmpty()); + } + + @Test + void should_lookup_stage_by_id() { + ComputeStage s1 = + new ComputeStage( + "s1", + new NoOpSourceFactory(), + List.of(), + PartitioningScheme.gather(), + List.of(), + List.of(), + 0L, + 0L); + StagedPlan plan = new StagedPlan("p1", List.of(s1)); + + assertEquals("s1", plan.getStage("s1").getStageId()); + assertThrows(IllegalArgumentException.class, () -> plan.getStage("nonexistent")); + } + + @Test + void should_create_partitioning_schemes() { + PartitioningScheme gather = PartitioningScheme.gather(); + assertEquals(ExchangeType.GATHER, gather.getExchangeType()); + assertTrue(gather.getHashChannels().isEmpty()); + + PartitioningScheme hash = PartitioningScheme.hashRepartition(List.of(0, 1)); + assertEquals(ExchangeType.HASH_REPARTITION, hash.getExchangeType()); + assertEquals(List.of(0, 1), hash.getHashChannels()); + + PartitioningScheme broadcast = PartitioningScheme.broadcast(); + assertEquals(ExchangeType.BROADCAST, broadcast.getExchangeType()); + + PartitioningScheme none = PartitioningScheme.none(); + assertEquals(ExchangeType.NONE, none.getExchangeType()); + } + + /** Minimal test stub for DataUnit. */ + static class TestDataUnit extends DataUnit { + private final String id; + private final List preferredNodes; + private final long estimatedRows; + + TestDataUnit(String id, List preferredNodes, long estimatedRows) { + this.id = id; + this.preferredNodes = preferredNodes; + this.estimatedRows = estimatedRows; + } + + @Override + public String getDataUnitId() { + return id; + } + + @Override + public List getPreferredNodes() { + return preferredNodes; + } + + @Override + public long getEstimatedRows() { + return estimatedRows; + } + + @Override + public long getEstimatedSizeBytes() { + return 0; + } + + @Override + public Map getProperties() { + return Collections.emptyMap(); + } + } + + /** No-op source factory for testing. 
*/ + static class NoOpSourceFactory implements SourceOperatorFactory { + @Override + public SourceOperator createOperator(OperatorContext context) { + return new SourceOperator() { + @Override + public void addDataUnit(DataUnit dataUnit) {} + + @Override + public void noMoreDataUnits() {} + + @Override + public Page getOutput() { + return null; + } + + @Override + public boolean isFinished() { + return true; + } + + @Override + public void finish() {} + + @Override + public OperatorContext getContext() { + return context; + } + + @Override + public void close() {} + }; + } + + @Override + public void noMoreOperators() {} + } + + /** No-op operator factory for testing. */ + static class NoOpOperatorFactory implements OperatorFactory { + @Override + public Operator createOperator(OperatorContext context) { + return new Operator() { + @Override + public boolean needsInput() { + return false; + } + + @Override + public void addInput(Page page) {} + + @Override + public Page getOutput() { + return null; + } + + @Override + public boolean isFinished() { + return true; + } + + @Override + public void finish() {} + + @Override + public OperatorContext getContext() { + return context; + } + + @Override + public void close() {} + }; + } + + @Override + public void noMoreOperators() {} + } +} diff --git a/docs/distributed-engine-architecture.md b/docs/distributed-engine-architecture.md new file mode 100644 index 00000000000..8ddb4584c5e --- /dev/null +++ b/docs/distributed-engine-architecture.md @@ -0,0 +1,294 @@ +# Distributed PPL Query Engine — Architecture + +## High-Level Execution Flow + +``` + PPL Query: "search source=accounts | stats avg(age) by gender" + | + v + +--------------------+ + | PPL Parser / | + | Calcite Planner | + +--------------------+ + | + RelNode tree + | + v + +---------------------------------+ + | DistributedExecutionEngine | + | (routing shell) | + +---------------------------------+ + | | + distributed=true distributed=false (default) + | | + v v + UnsupportedOperationException +------------------------+ + (execution not yet implemented) | OpenSearchExecution | + | Engine (legacy) | + +------------------------+ +``` + +When `plugins.ppl.distributed.enabled=true`, the engine throws `UnsupportedOperationException`. +Distributed execution will be implemented in the next phase against the clean H2 interfaces. 
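+As a quick orientation before the module layout, the sketch below shows the Page/PageBuilder batching model that operators pass between pipeline stages. It uses only the types and method names introduced in this change; the class wrapper and the printed values are illustrative, not part of the change itself.
+
+```java
+import org.opensearch.sql.planner.distributed.page.Page;
+import org.opensearch.sql.planner.distributed.page.PageBuilder;
+
+// Build a 2-row x 3-column page row by row, then read it back positionally.
+public class PageExample {
+  public static void main(String[] args) {
+    PageBuilder builder = new PageBuilder(3); // 3 channels (columns)
+    builder.beginRow();
+    builder.setValue(0, "Alice");
+    builder.setValue(1, 30);
+    builder.setValue(2, 1000.0);
+    builder.endRow();
+
+    builder.beginRow();
+    builder.setValue(0, "Bob");
+    builder.setValue(1, 25);
+    builder.setValue(2, 2000.0);
+    builder.endRow();
+
+    Page page = builder.build(); // builder resets and is empty after build()
+    System.out.println(page.getPositionCount()); // 2 (rows)
+    System.out.println(page.getChannelCount()); // 3 (columns)
+    System.out.println(page.getValue(1, 2)); // 2000.0
+  }
+}
+```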
+ +--- + +## Module Layout + +``` +sql/ + ├── core/src/main/java/org/opensearch/sql/planner/distributed/ + │ │ + │ ├── operator/ ── Core Operator Framework ── + │ │ ├── Operator.java Push/pull interface (Page batches) + │ │ ├── SourceOperator.java Reads from storage (extends Operator) + │ │ ├── SinkOperator.java Terminal consumer (extends Operator) + │ │ ├── OperatorFactory.java Creates Operator instances + │ │ ├── SourceOperatorFactory.java Creates SourceOperator instances + │ │ └── OperatorContext.java Runtime context (memory, cancellation) + │ │ + │ ├── page/ ── Data Batching (Columnar-Ready) ── + │ │ ├── Page.java Columnar-ready batch interface + │ │ ├── Block.java Single-column data (Arrow-aligned) + │ │ ├── RowPage.java Row-based Page implementation + │ │ └── PageBuilder.java Row-by-row Page builder + │ │ + │ ├── pipeline/ ── Pipeline Execution ── + │ │ ├── Pipeline.java Ordered chain of OperatorFactories + │ │ ├── PipelineDriver.java Drives data through operator chain + │ │ └── PipelineContext.java Runtime state (status, cancellation) + │ │ + │ ├── stage/ ── Staged Planning ── + │ │ ├── StagedPlan.java Tree of ComputeStages (dependency order) + │ │ ├── ComputeStage.java Stage with pipeline + partitioning + planFragment + │ │ ├── PartitioningScheme.java Output partitioning (gather, hash, broadcast) + │ │ └── ExchangeType.java Enum: GATHER / HASH_REPARTITION / BROADCAST / NONE + │ │ + │ ├── exchange/ ── Inter-Stage Data Transfer ── + │ │ ├── ExchangeManager.java Creates sink/source operators + │ │ ├── ExchangeSinkOperator.java Sends pages downstream + │ │ ├── ExchangeSourceOperator.java Receives pages from upstream + │ │ └── OutputBuffer.java Back-pressure buffering for pages + │ │ + │ ├── split/ ── Data Assignment ── + │ │ ├── DataUnit.java Abstract: unit of data (shard, file, etc.) 
+ │ │ ├── DataUnitSource.java Generates DataUnits (shard discovery) + │ │ └── DataUnitAssignment.java Assigns DataUnits to nodes + │ │ + │ ├── planner/ ── Physical Planning Interfaces ── + │ │ ├── PhysicalPlanner.java RelNode → StagedPlan + │ │ ├── PlanFragmenter.java Auto stage creation from RelNode tree + │ │ ├── FragmentationContext.java Context for fragmentation (nodes, costs) + │ │ ├── SubPlan.java RelNode fragment for data node pushdown + │ │ └── CostEstimator.java Row count / size / selectivity estimation + │ │ + │ └── execution/ ── Execution Lifecycle ── + │ ├── QueryExecution.java Full query lifecycle management + │ ├── StageExecution.java Per-stage execution tracking + │ └── TaskExecution.java Per-task execution tracking + │ + └── opensearch/src/main/java/org/opensearch/sql/opensearch/executor/ + ├── DistributedExecutionEngine.java Routing shell: legacy vs distributed + │ + └── distributed/ + ├── TransportExecuteDistributedTaskAction.java Transport handler (data node) + ├── ExecuteDistributedTaskAction.java ActionType for routing + ├── ExecuteDistributedTaskRequest.java Request wire format + ├── ExecuteDistributedTaskResponse.java Response wire format + │ + ├── split/ + │ └── OpenSearchDataUnit.java DataUnit impl (index + shard + locality) + │ + ├── operator/ ── OpenSearch Operators ── + │ ├── LuceneScanOperator.java Direct Lucene _source reads (Weight/Scorer) + │ ├── LimitOperator.java Row limit enforcement + │ ├── ResultCollector.java Collects pages into row lists + │ └── FilterToLuceneConverter.java Filter conditions → Lucene Query + │ + └── pipeline/ + └── OperatorPipelineExecutor.java Orchestrates pipeline on data node +``` + +--- + +## Class Hierarchy + +### DataUnit Model + +``` + DataUnit (abstract class) + ├── getDataUnitId() Unique identifier + ├── getPreferredNodes() Nodes where data is local + ├── getEstimatedRows() Row count estimate + ├── getEstimatedSizeBytes() Size estimate + ├── getProperties() Storage-specific metadata + └── isRemotelyAccessible() Default: true + │ + └── OpenSearchDataUnit (concrete) + ├── indexName, shardId + └── isRemotelyAccessible() → false (Lucene requires locality) + + DataUnitSource (interface, AutoCloseable) + └── getNextBatch(maxBatchSize) → List + + DataUnitAssignment (interface) + └── assign(dataUnits, availableNodes) → Map> +``` + +### Block / Page Columnar Model + +``` + Page (interface) + ├── getPositionCount() Row count + ├── getChannelCount() Column count + ├── getValue(pos, channel) Cell access + ├── getBlock(channel) Columnar access (default: throws) + ├── getRetainedSizeBytes() Memory estimate + └── getRegion(offset, len) Sub-page slice + │ + └── RowPage (row-based impl) + + Block (interface) + ├── getPositionCount() Row count in this column + ├── getValue(position) Value at row + ├── isNull(position) Null check + ├── getRetainedSizeBytes() Memory estimate + ├── getRegion(offset, len) Sub-block slice + └── getType() → BlockType BOOLEAN, INT, LONG, FLOAT, DOUBLE, STRING, ... 
+ + Future: ArrowBlock wraps Arrow FieldVector for zero-copy exchange +``` + +### PlanFragmenter → StagedPlan → ComputeStage + +``` + PlanFragmenter (interface) + └── fragment(RelNode, FragmentationContext) → StagedPlan + │ + │ FragmentationContext (interface) + │ ├── getAvailableNodes() + │ ├── getCostEstimator() + │ ├── getDataUnitSource(tableName) + │ ├── getMaxTasksPerStage() + │ └── getCoordinatorNodeId() + │ + │ SubPlan (class) + │ ├── fragmentId + │ ├── root: RelNode ← sub-plan for data node execution (pushdown) + │ ├── outputPartitioning + │ └── children: List + │ + └── StagedPlan + └── List (dependency order: leaves → root) + ├── stageId + ├── SourceOperatorFactory + ├── List + ├── PartitioningScheme + │ ├── ExchangeType: GATHER | HASH_REPARTITION | BROADCAST | NONE + │ └── hashChannels: List + ├── sourceStageIds (upstream dependencies) + ├── List (data assignments) + ├── planFragment: RelNode (nullable — sub-plan for pushdown) + └── estimatedRows / estimatedBytes +``` + +### Exchange Interfaces + +``` + ExchangeManager (interface) + ├── createSink(context, targetStageId, partitioning) → ExchangeSinkOperator + └── createSource(context, sourceStageId) → ExchangeSourceOperator + + OutputBuffer (interface, AutoCloseable) + ├── enqueue(Page) Add page to buffer + ├── setNoMorePages() Signal completion + ├── isFull() Back-pressure check + ├── getBufferedBytes() Buffer size + ├── abort() Discard buffered pages + ├── isFinished() All pages consumed + └── getPartitioningScheme() Output partitioning + + Exchange protocol: + Current: OpenSearch transport (Netty TCP, StreamOutput/StreamInput) + Future: Arrow IPC (ArrowRecordBatch for zero-copy columnar exchange) +``` + +### Execution Lifecycle + +``` + QueryExecution (interface) + ├── State: PLANNING → STARTING → RUNNING → FINISHING → FINISHED | FAILED + ├── getQueryId() + ├── getPlan() → StagedPlan + ├── getStageExecutions() → List + ├── getStats() → QueryStats (totalRows, elapsedTime, planningTime) + └── cancel() + + StageExecution (interface) + ├── State: PLANNED → SCHEDULING → RUNNING → FINISHED | FAILED | CANCELLED + ├── getStage() → ComputeStage + ├── addDataUnits(List) + ├── noMoreDataUnits() + ├── getTaskExecutions() → Map> + ├── getStats() → StageStats (totalRows, totalBytes, completedTasks, totalTasks) + ├── addStateChangeListener(listener) + └── cancel() + + TaskExecution (interface) + ├── State: PLANNED → RUNNING → FLUSHING → FINISHED | FAILED | CANCELLED + ├── getTaskId(), getNodeId() + ├── getAssignedDataUnits() → List + ├── getStats() → TaskStats (processedRows, processedBytes, outputRows, elapsedTime) + └── cancel() +``` + +### Operator Framework + +``` + Operator (interface) + / \ + SourceOperator SinkOperator + (adds DataUnits) (terminal) + | | + ExchangeSourceOperator ExchangeSinkOperator + | + LuceneScanOperator (OpenSearch impl) + + Other Operators: + ├── LimitOperator (implements Operator) + └── (future: FilterOperator, ProjectOperator, AggOperator, etc.) + + Factories: + ├── OperatorFactory → creates Operator + └── SourceOperatorFactory → creates SourceOperator + + Data Flow: + DataUnit → SourceOperator → Page → Operator → Page → ... 
→ SinkOperator + ↑ + OperatorContext (memory, cancellation) +``` + +--- + +## Configuration + +| Setting | Default | Description | +|---------|---------|-------------| +| `plugins.ppl.distributed.enabled` | `false` | Single toggle: legacy engine (off/default) or distributed (on, not yet implemented) | + +**No sub-settings.** When distributed is enabled in the future, the operator pipeline will be the only execution path. + +--- + +## Two Execution Paths (No Fallback) + +``` + plugins.ppl.distributed.enabled = false (default) plugins.ppl.distributed.enabled = true + ────────────────────────────────────────────── ────────────────────────────────────── + PPL → Calcite → DistributedExecutionEngine PPL → Calcite → DistributedExecutionEngine + │ │ + v v + OpenSearchExecutionEngine (legacy) UnsupportedOperationException + client.search() (SSB pushdown) (execution not yet implemented) + Single-node coordinator +``` diff --git a/docs/ppl-test-queries.md b/docs/ppl-test-queries.md new file mode 100644 index 00000000000..a1ff402c796 --- /dev/null +++ b/docs/ppl-test-queries.md @@ -0,0 +1,324 @@ +# PPL Test Queries & Index Setup + +Quick-reference for manual testing against a live OpenSearch cluster. +Data files live in `sql/doctest/test_data/*.json`. + +--- + +## Index Setup (Bulk Ingest) + +Run these to create and populate all required test indices. + +### accounts (4 docs, used by most commands) +```bash +curl -s -XPOST 'localhost:9200/accounts/_bulk?refresh=true' -H 'Content-Type: application/json' --data-binary ' +{"index":{"_id":"1"}} +{"account_number":1,"balance":39225,"firstname":"Amber","lastname":"Duke","age":32,"gender":"M","address":"880 Holmes Lane","employer":"Pyrami","email":"amberduke@pyrami.com","city":"Brogan","state":"IL"} +{"index":{"_id":"6"}} +{"account_number":6,"balance":5686,"firstname":"Hattie","lastname":"Bond","age":36,"gender":"M","address":"671 Bristol Street","employer":"Netagy","email":"hattiebond@netagy.com","city":"Dante","state":"TN"} +{"index":{"_id":"13"}} +{"account_number":13,"balance":32838,"firstname":"Nanette","lastname":"Bates","age":28,"gender":"F","address":"789 Madison Street","employer":"Quility","city":"Nogal","state":"VA"} +{"index":{"_id":"18"}} +{"account_number":18,"balance":4180,"firstname":"Dale","lastname":"Adams","age":33,"gender":"M","address":"467 Hutchinson Court","employer":null,"email":"daleadams@boink.com","city":"Orick","state":"MD"} +' +``` + +### state_country (8 docs, used by join/explain/streamstats) +```bash +curl -s -XPOST 'localhost:9200/state_country/_bulk?refresh=true' -H 'Content-Type: application/json' --data-binary ' +{"index":{"_id":"1"}} +{"name":"Jake","age":70,"state":"California","country":"USA","year":2023,"month":4} +{"index":{"_id":"2"}} +{"name":"Hello","age":30,"state":"New York","country":"USA","year":2023,"month":4} +{"index":{"_id":"3"}} +{"name":"John","age":25,"state":"Ontario","country":"Canada","year":2023,"month":4} +{"index":{"_id":"4"}} +{"name":"Jane","age":20,"state":"Quebec","country":"Canada","year":2023,"month":4} +{"index":{"_id":"5"}} +{"name":"Jim","age":27,"state":"B.C","country":"Canada","year":2023,"month":4} +{"index":{"_id":"6"}} +{"name":"Peter","age":57,"state":"B.C","country":"Canada","year":2023,"month":4} +{"index":{"_id":"7"}} +{"name":"Rick","age":70,"state":"B.C","country":"Canada","year":2023,"month":4} +{"index":{"_id":"8"}} +{"name":"David","age":40,"state":"Washington","country":"USA","year":2023,"month":4} +' +``` + +### occupation (6 docs, used by join examples) +```bash +curl -s 
-XPOST 'localhost:9200/occupation/_bulk?refresh=true' -H 'Content-Type: application/json' --data-binary ' +{"index":{"_id":"1"}} +{"name":"Jake","occupation":"Engineer","country":"England","salary":100000,"year":2023,"month":4} +{"index":{"_id":"2"}} +{"name":"Hello","occupation":"Artist","country":"USA","salary":70000,"year":2023,"month":4} +{"index":{"_id":"3"}} +{"name":"John","occupation":"Doctor","country":"Canada","salary":120000,"year":2023,"month":4} +{"index":{"_id":"4"}} +{"name":"David","occupation":"Doctor","country":"USA","salary":120000,"year":2023,"month":4} +{"index":{"_id":"5"}} +{"name":"David","occupation":"Unemployed","country":"Canada","salary":0,"year":2023,"month":4} +{"index":{"_id":"6"}} +{"name":"Jane","occupation":"Scientist","country":"Canada","salary":90000,"year":2023,"month":4} +' +``` + +### employees (used by basic queries) +```bash +curl -s -XPOST 'localhost:9200/employees/_bulk?refresh=true' -H 'Content-Type: application/json' --data-binary ' +{"index":{"_id":"1"}} +{"name":"Alice","age":30,"department":"Engineering","salary":90000} +{"index":{"_id":"2"}} +{"name":"Bob","age":35,"department":"Marketing","salary":75000} +{"index":{"_id":"3"}} +{"name":"Carol","age":28,"department":"Engineering","salary":85000} +{"index":{"_id":"4"}} +{"name":"Dave","age":42,"department":"Sales","salary":70000} +{"index":{"_id":"5"}} +{"name":"Eve","age":31,"department":"Engineering","salary":95000} +{"index":{"_id":"6"}} +{"name":"Frank","age":45,"department":"Marketing","salary":80000} +{"index":{"_id":"7"}} +{"name":"Grace","age":27,"department":"Sales","salary":65000} +{"index":{"_id":"8"}} +{"name":"Hank","age":38,"department":"Engineering","salary":105000} +' +``` + +### people (used by functions: math, string, datetime, crypto, collection, conversion) +```bash +curl -s -XPOST 'localhost:9200/people/_bulk?refresh=true' -H 'Content-Type: application/json' --data-binary ' +{"index":{"_id":"1"}} +{"name":"Alice","age":30,"city":"Seattle"} +{"index":{"_id":"2"}} +{"name":"Bob","age":25,"city":"Portland"} +{"index":{"_id":"3"}} +{"name":"Carol","age":35,"city":"Vancouver"} +' +``` + +### products (used by basic queries) +```bash +curl -s -XPOST 'localhost:9200/products/_bulk?refresh=true' -H 'Content-Type: application/json' --data-binary ' +{"index":{"_id":"1"}} +{"name":"Widget","price":9.99,"category":"Tools","stock":100} +{"index":{"_id":"2"}} +{"name":"Gadget","price":24.99,"category":"Electronics","stock":50} +{"index":{"_id":"3"}} +{"name":"Doohickey","price":4.99,"category":"Tools","stock":200} +{"index":{"_id":"4"}} +{"name":"Thingamajig","price":49.99,"category":"Electronics","stock":25} +{"index":{"_id":"5"}} +{"name":"Whatchamacallit","price":14.99,"category":"Misc","stock":75} +{"index":{"_id":"6"}} +{"name":"Gizmo","price":34.99,"category":"Electronics","stock":30} +' +``` + +### Ingest ALL at once +```bash +# One-liner to ingest all indices (copy-paste friendly) +for idx in accounts state_country occupation employees people products; do + echo "--- $idx ---" +done +# Or run each curl block above individually +``` + +### Enable distributed execution +```bash +curl -s -XPUT 'localhost:9200/_cluster/settings' -H 'Content-Type: application/json' -d '{ + "persistent": {"plugins.ppl.distributed.enabled": true} +}' +``` + +### Disable distributed execution (revert to legacy) +```bash +curl -s -XPUT 'localhost:9200/_cluster/settings' -H 'Content-Type: application/json' -d '{ + "persistent": {"plugins.ppl.distributed.enabled": false} +}' +``` + +--- + +## PPL Queries 
by Category + +Helper function for running queries: +```bash +ppl() { curl -s 'localhost:9200/_plugins/_ppl' -H 'Content-Type: application/json' -d "{\"query\":\"$1\"}" | python3 -m json.tool; } +``` + +--- + +### Join Queries (state_country + occupation) + +```bash +# Inner join +ppl "source = state_country | inner join left=a right=b ON a.name = b.name occupation | fields a.name, a.age, b.occupation, b.salary" + +# Left join +ppl "source = state_country as a | left join left=a right=b ON a.name = b.name occupation as b | fields a.name, a.age, b.occupation, b.salary" + +# Right join (requires plugins.calcite.all_join_types.allowed=true) +ppl "source = state_country as a | right join left=a right=b ON a.name = b.name occupation as b | fields a.name, a.age, b.occupation, b.salary" + +# Semi join +ppl "source = state_country as a | left semi join left=a right=b ON a.name = b.name occupation as b | fields a.name, a.age, a.country" + +# Anti join +ppl "source = state_country as a | left anti join left=a right=b ON a.name = b.name occupation as b | fields a.name, a.age, a.country" + +# Join with filter +ppl "source = state_country | inner join left=a right=b ON a.name = b.name occupation | where b.salary > 80000 | fields a.name, b.salary" + +# Join with sort + limit +ppl "source = state_country | inner join left=a right=b ON a.name = b.name occupation | sort - b.salary | head 3" + +# Join with subsearch +ppl "source = state_country as a | left join ON a.name = b.name [ source = occupation | where salary > 0 | fields name, country, salary | sort salary | head 3 ] as b | fields a.name, a.age, b.salary" + +# Join with stats +ppl "source = state_country | inner join left=a right=b ON a.name = b.name occupation | stats avg(salary) by span(age, 10) as age_span, b.country" +``` + +### Explain (shows distributed plan) +```bash +# Explain a join query +curl -s 'localhost:9200/_plugins/_ppl/_explain' -H 'Content-Type: application/json' \ + -d '{"query":"source = state_country | inner join left=a right=b ON a.name = b.name occupation | fields a.name, b.salary"}' | python3 -m json.tool + +# Explain a simple query +curl -s 'localhost:9200/_plugins/_ppl/_explain' -H 'Content-Type: application/json' \ + -d '{"query":"source = accounts | where age > 30 | head 5"}' | python3 -m json.tool +``` + +--- + +### Basic Scan / Filter / Limit (accounts) + +```bash +ppl "source=accounts" +ppl "source=accounts | head 2" +ppl "source=accounts | fields firstname, age" +ppl "source=accounts | where age > 30" +ppl "source=accounts | where age > 30 | fields firstname, age" +ppl "source=accounts | where age > 30 | head 2" +ppl "source=accounts | fields firstname, age | head 3 from 1" +``` + +### Sort (accounts) +```bash +ppl "source=accounts | sort age | fields firstname, age" +ppl "source=accounts | sort - balance | fields firstname, balance | head 3" +ppl "source=accounts | sort + age | fields firstname, age" +``` + +### Rename (accounts) +```bash +ppl "source=accounts | rename firstname as first_name | fields first_name, age" +ppl "source=accounts | rename firstname as first_name, lastname as last_name | fields first_name, last_name" +``` + +### Where / Filter (accounts) +```bash +ppl "source=accounts | where gender = 'M' | fields firstname, gender" +ppl "source=accounts | where age > 30 AND gender = 'M' | fields firstname, age, gender" +ppl "source=accounts | where balance > 10000 | fields firstname, balance" +ppl "source=accounts | where employer IS NOT NULL | fields firstname, employer" +``` + +### Dedup (accounts) 
+```bash +ppl "source=accounts | dedup gender | fields account_number, gender | sort account_number" +ppl "source=accounts | dedup 2 gender | fields account_number, gender | sort account_number" +``` + +### Eval (accounts) +```bash +ppl "source=accounts | eval doubleAge = age * 2 | fields age, doubleAge" +ppl "source=accounts | eval greeting = 'Hello ' + firstname | fields firstname, greeting" +``` + +### Stats / Aggregation (accounts) +```bash +ppl "source=accounts | stats count()" +ppl "source=accounts | stats avg(age)" +ppl "source=accounts | stats avg(age) by gender" +ppl "source=accounts | stats max(age), min(age) by gender" +ppl "source=accounts | stats count() as cnt by state" +``` + +### Parse (accounts) +```bash +ppl "source=accounts | parse email '.+@(?.+)' | fields email, host" +ppl "source=accounts | parse address '\\d+ (?.+)' | fields address, street" +``` + +### Regex (accounts) +```bash +ppl "source=accounts | regex email=\"@pyrami\\.com$\" | fields account_number, email" +``` + +### Fillnull (accounts) +```bash +ppl "source=accounts | fields email, employer | fillnull with '' in employer" +``` + +### Replace (accounts) +```bash +ppl "source=accounts | replace \"IL\" WITH \"Illinois\" IN state | fields state" +``` + +--- + +### Streamstats (state_country) +```bash +ppl "source=state_country | streamstats avg(age) as running_avg, count() as running_count by country" +ppl "source=state_country | streamstats current=false window=2 max(age) as prev_max_age" +``` + +### Explain (state_country) +```bash +ppl "explain source=state_country | where country = 'USA' OR country = 'England' | stats count() by country" +``` + +--- + +### Functions (people) +```bash +ppl "source=people | eval len = LENGTH(name) | fields name, len" +ppl "source=people | eval upper = UPPER(name) | fields name, upper" +ppl "source=people | eval abs_val = ABS(-42) | fields name, abs_val | head 1" +``` + +--- + +## Index Summary + +| Index | Docs | Used By | Key Fields | +|-------|------|---------|------------| +| `accounts` | 4 | head, stats, where, sort, dedup, eval, parse, regex, fillnull, rename, replace, fields, addtotals, transpose, appendpipe, condition, expressions, statistical, aggregations, relevance | account_number, balance, firstname, lastname, age, gender, address, employer, email, city, state | +| `state_country` | 8 | join, explain, streamstats | name, age, state, country, year, month | +| `occupation` | 6 | join | name, occupation, country, salary, year, month | +| `employees` | 8 | basic queries | name, age, department, salary | +| `people` | 3 | math, string, datetime, crypto, collection, conversion functions | name, age, city | +| `products` | 6 | basic queries | name, price, category, stock | + +### Additional indices in doctest/test_data/ (ingest from file if needed) +| Index | Data File | +|-------|-----------| +| `books` | `doctest/test_data/books.json` | +| `nyc_taxi` | `doctest/test_data/nyc_taxi.json` | +| `weblogs` | `doctest/test_data/weblogs.json` | +| `json_test` | `doctest/test_data/json_test.json` | +| `otellogs` | `doctest/test_data/otellogs.json` | +| `mvcombine_data` | `doctest/test_data/mvcombine.json` | +| `work_information` | `doctest/test_data/work_information.json` | +| `worker` | `doctest/test_data/worker.json` | +| `events` | `doctest/test_data/events.json` | + +To ingest from file: +```bash +curl -s -XPOST 'localhost:9200//_bulk?refresh=true' \ + -H 'Content-Type: application/json' \ + --data-binary @sql/doctest/test_data/.json +``` diff --git 
a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLCastFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLCastFunctionIT.java index 9560aa0939a..eb1dfb6190c 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLCastFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLCastFunctionIT.java @@ -196,4 +196,10 @@ public void testCastIpToString() throws IOException { rows("1.2.3.5"), rows("::ffff:1234")); } + + @Override + @Test + public void testCastToIP() throws IOException { + super.testCastToIP(); + } } diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/DistributedExecutionEngine.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/DistributedExecutionEngine.java new file mode 100644 index 00000000000..b59c0ec219e --- /dev/null +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/DistributedExecutionEngine.java @@ -0,0 +1,84 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.executor; + +import org.apache.calcite.rel.RelNode; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.sql.ast.statement.ExplainMode; +import org.opensearch.sql.calcite.CalcitePlanContext; +import org.opensearch.sql.common.response.ResponseListener; +import org.opensearch.sql.executor.ExecutionContext; +import org.opensearch.sql.executor.ExecutionEngine; +import org.opensearch.sql.opensearch.setting.OpenSearchSettings; +import org.opensearch.sql.planner.physical.PhysicalPlan; + +/** + * Distributed execution engine that routes queries between legacy single-node execution and + * distributed multi-node execution based on configuration. + * + *
When distributed execution is disabled (default), all queries delegate to the legacy {@link + * OpenSearchExecutionEngine}. When enabled, queries throw {@link UnsupportedOperationException} — + * distributed execution will be implemented in the next phase against the clean H2 interfaces + * (ComputeStage, DataUnit, PlanFragmenter, etc.). + */ +public class DistributedExecutionEngine implements ExecutionEngine { + private static final Logger logger = LogManager.getLogger(DistributedExecutionEngine.class); + + private final OpenSearchExecutionEngine legacyEngine; + private final OpenSearchSettings settings; + + public DistributedExecutionEngine( + OpenSearchExecutionEngine legacyEngine, OpenSearchSettings settings) { + this.legacyEngine = legacyEngine; + this.settings = settings; + logger.info("Initialized DistributedExecutionEngine"); + } + + @Override + public void execute(PhysicalPlan plan, ResponseListener listener) { + execute(plan, ExecutionContext.emptyExecutionContext(), listener); + } + + @Override + public void execute( + PhysicalPlan plan, ExecutionContext context, ResponseListener listener) { + if (isDistributedEnabled()) { + throw new UnsupportedOperationException("Distributed execution not yet implemented"); + } + legacyEngine.execute(plan, context, listener); + } + + @Override + public void explain(PhysicalPlan plan, ResponseListener listener) { + legacyEngine.explain(plan, listener); + } + + @Override + public void execute( + RelNode plan, CalcitePlanContext context, ResponseListener listener) { + if (isDistributedEnabled()) { + throw new UnsupportedOperationException("Distributed execution not yet implemented"); + } + legacyEngine.execute(plan, context, listener); + } + + @Override + public void explain( + RelNode plan, + ExplainMode mode, + CalcitePlanContext context, + ResponseListener listener) { + if (isDistributedEnabled()) { + throw new UnsupportedOperationException("Distributed execution not yet implemented"); + } + legacyEngine.explain(plan, mode, context, listener); + } + + private boolean isDistributedEnabled() { + return settings.getDistributedExecutionEnabled(); + } +} diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskAction.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskAction.java new file mode 100644 index 00000000000..be04f909c9a --- /dev/null +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskAction.java @@ -0,0 +1,28 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.executor.distributed; + +import org.opensearch.action.ActionType; + +/** + * Transport action for executing distributed query tasks on remote cluster nodes. + * + *
This action enables the DistributedTaskScheduler to send operator pipeline requests to + * specific nodes for execution. Each node executes the pipeline locally using direct Lucene access + * and returns rows back to the coordinator. + */ +public class ExecuteDistributedTaskAction extends ActionType<ExecuteDistributedTaskResponse> { + + /** Action name used for transport routing */ + public static final String NAME = "cluster:admin/opensearch/sql/distributed/execute"; + + /** Singleton instance */ + public static final ExecuteDistributedTaskAction INSTANCE = new ExecuteDistributedTaskAction(); + + private ExecuteDistributedTaskAction() { + super(NAME, ExecuteDistributedTaskResponse::new); + } +} diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskRequest.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskRequest.java new file mode 100644 index 00000000000..fbae67dd5b7 --- /dev/null +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskRequest.java @@ -0,0 +1,184 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.executor.distributed; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import org.opensearch.action.ActionRequest; +import org.opensearch.action.ActionRequestValidationException; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; + +/** + * Request message for executing distributed query tasks on a remote node. + * + *
Contains the operator pipeline parameters needed for execution: index name, shard IDs, field + * names, query limit, and optional filter conditions. + */ +@Data +@EqualsAndHashCode(callSuper = false) +@AllArgsConstructor +@NoArgsConstructor +public class ExecuteDistributedTaskRequest extends ActionRequest { + + /** ID of the execution stage these work units belong to */ + private String stageId; + + /** Index name for per-shard execution. */ + private String indexName; + + /** Shard IDs to execute on the target node. */ + private List shardIds; + + /** Execution mode: always "OPERATOR_PIPELINE". */ + private String executionMode; + + /** Fields to return when using operator pipeline mode. */ + private List fieldNames; + + /** Row limit when using operator pipeline mode. */ + private int queryLimit; + + /** + * Filter conditions for operator pipeline. Each entry is a Map with keys: "field" (String), "op" + * (String: EQ, NEQ, GT, GTE, LT, LTE), "value" (Object). Multiple conditions are ANDed. Compound + * boolean uses "bool" key with "AND"/"OR" and "children" list. Null means match all. + */ + @SuppressWarnings("unchecked") + private List> filterConditions; + + /** Constructor for deserialization from stream. */ + public ExecuteDistributedTaskRequest(StreamInput in) throws IOException { + super(in); + this.stageId = in.readString(); + this.indexName = in.readOptionalString(); + + // Skip SearchSourceBuilder field (backward compat: always false for new requests) + if (in.readBoolean()) { + // Consume the SearchSourceBuilder bytes for backward compatibility + new org.opensearch.search.builder.SearchSourceBuilder(in); + } + + // Deserialize shard IDs + if (in.readBoolean()) { + int shardCount = in.readVInt(); + this.shardIds = new java.util.ArrayList<>(shardCount); + for (int i = 0; i < shardCount; i++) { + this.shardIds.add(in.readVInt()); + } + } + + // Deserialize operator pipeline fields + this.executionMode = in.readOptionalString(); + if (in.readBoolean()) { + this.fieldNames = in.readStringList(); + } + this.queryLimit = in.readVInt(); + + // Deserialize filter conditions + if (in.readBoolean()) { + int filterCount = in.readVInt(); + this.filterConditions = new java.util.ArrayList<>(filterCount); + for (int i = 0; i < filterCount; i++) { + @SuppressWarnings("unchecked") + Map condition = (Map) in.readGenericValue(); + this.filterConditions.add(condition); + } + } + } + + /** Serializes this request to a stream for network transport. */ + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeString(stageId != null ? 
+    out.writeOptionalString(indexName);
+
+    // SearchSourceBuilder field — always false for new requests
+    out.writeBoolean(false);
+
+    // Serialize shard IDs
+    if (shardIds != null) {
+      out.writeBoolean(true);
+      out.writeVInt(shardIds.size());
+      for (int shardId : shardIds) {
+        out.writeVInt(shardId);
+      }
+    } else {
+      out.writeBoolean(false);
+    }
+
+    // Serialize operator pipeline fields
+    out.writeOptionalString(executionMode);
+    if (fieldNames != null) {
+      out.writeBoolean(true);
+      out.writeStringCollection(fieldNames);
+    } else {
+      out.writeBoolean(false);
+    }
+    out.writeVInt(queryLimit);
+
+    // Serialize filter conditions
+    if (filterConditions != null && !filterConditions.isEmpty()) {
+      out.writeBoolean(true);
+      out.writeVInt(filterConditions.size());
+      for (Map<String, Object> condition : filterConditions) {
+        out.writeGenericValue(condition);
+      }
+    } else {
+      out.writeBoolean(false);
+    }
+  }
+
+  /**
+   * Validates the request before execution.
+   *
+   * @return true if request is valid for execution
+   */
+  public boolean isValid() {
+    return indexName != null
+        && !indexName.isEmpty()
+        && shardIds != null
+        && !shardIds.isEmpty()
+        && fieldNames != null
+        && !fieldNames.isEmpty()
+        && queryLimit > 0;
+  }
+
+  @Override
+  public ActionRequestValidationException validate() {
+    ActionRequestValidationException validationException = null;
+    if (indexName == null || indexName.trim().isEmpty()) {
+      validationException = new ActionRequestValidationException();
+      validationException.addValidationError("Index name cannot be null or empty");
+    }
+    if (shardIds == null || shardIds.isEmpty()) {
+      if (validationException == null) {
+        validationException = new ActionRequestValidationException();
+      }
+      validationException.addValidationError("Shard IDs cannot be null or empty");
+    }
+    if (fieldNames == null || fieldNames.isEmpty()) {
+      if (validationException == null) {
+        validationException = new ActionRequestValidationException();
+      }
+      validationException.addValidationError("Field names cannot be null or empty");
+    }
+    return validationException;
+  }
+
+  @Override
+  public String toString() {
+    return String.format(
+        "ExecuteDistributedTaskRequest{stageId='%s', index='%s', shards=%s, mode='%s'}",
+        stageId, indexName, shardIds, executionMode);
+  }
+}
diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskResponse.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskResponse.java
new file mode 100644
index 00000000000..fdc77dcc3ba
--- /dev/null
+++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/ExecuteDistributedTaskResponse.java
@@ -0,0 +1,182 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.opensearch.executor.distributed;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+import org.opensearch.action.search.SearchResponse;
+import org.opensearch.core.action.ActionResponse;
+import org.opensearch.core.common.io.stream.StreamInput;
+import org.opensearch.core.common.io.stream.StreamOutput;
+
+/**
+ * Response message containing results from distributed query task execution.
+ *
+ * <p>Contains the execution results, performance metrics, and any error information from executing
+ * WorkUnits on a remote cluster node.
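+ *
+ * <p>As a rough illustration (node ID and values are hypothetical), a row-based response for two
+ * columns and one row can be built via:
+ *
+ * <pre>{@code
+ * ExecuteDistributedTaskResponse resp =
+ *     ExecuteDistributedTaskResponse.successWithRows(
+ *         "node-1", List.of("gender", "age"), List.of(List.of("F", 32)));
+ * }</pre>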

+ * <p>Phase 1B Serialization: Serializes the SearchResponse (which implements Writeable) for
+ * returning per-shard search results from remote nodes. This prepares for Phase 1C
+ * transport-based execution.
+ */
+@Data
+@EqualsAndHashCode(callSuper = false)
+@AllArgsConstructor
+@NoArgsConstructor
+public class ExecuteDistributedTaskResponse extends ActionResponse {
+
+  /** Results from executing the work units */
+  private List<Object> results;
+
+  /** Execution statistics and performance metrics */
+  private Map<String, Object> executionStats;
+
+  /** Node ID where the tasks were executed */
+  private String nodeId;
+
+  /** Whether execution completed successfully */
+  private boolean success;
+
+  /** Error message if execution failed */
+  private String errorMessage;
+
+  /** SearchResponse from per-shard execution (Phase 1B). */
+  private SearchResponse searchResponse;
+
+  /** Column names from operator pipeline execution (Phase 5B). */
+  private List<String> pipelineFieldNames;
+
+  /** Row data from operator pipeline execution (Phase 5B). */
+  private List<List<Object>> pipelineRows;
+
+  /** Constructor with original fields for backward compatibility. */
+  public ExecuteDistributedTaskResponse(
+      List<Object> results,
+      Map<String, Object> executionStats,
+      String nodeId,
+      boolean success,
+      String errorMessage) {
+    this.results = results;
+    this.executionStats = executionStats;
+    this.nodeId = nodeId;
+    this.success = success;
+    this.errorMessage = errorMessage;
+  }
+
+  /** Constructor for deserialization from stream. */
+  public ExecuteDistributedTaskResponse(StreamInput in) throws IOException {
+    super(in);
+    this.nodeId = in.readString();
+    this.success = in.readBoolean();
+    this.errorMessage = in.readOptionalString();
+
+    // Deserialize SearchResponse (implements Writeable)
+    if (in.readBoolean()) {
+      this.searchResponse = new SearchResponse(in);
+    }
+
+    // Deserialize operator pipeline results (Phase 5B)
+    if (in.readBoolean()) {
+      this.pipelineFieldNames = in.readStringList();
+      int rowCount = in.readVInt();
+      this.pipelineRows = new java.util.ArrayList<>(rowCount);
+      int colCount = this.pipelineFieldNames.size();
+      for (int i = 0; i < rowCount; i++) {
+        List<Object> row = new java.util.ArrayList<>(colCount);
+        for (int j = 0; j < colCount; j++) {
+          row.add(in.readGenericValue());
+        }
+        this.pipelineRows.add(row);
+      }
+    }
+
+    // Generic results are not serialized over transport
+    this.results = List.of();
+    this.executionStats = Map.of();
+  }
+
+  /** Serializes this response to a stream for network transport. */
+  @Override
+  public void writeTo(StreamOutput out) throws IOException {
+    out.writeString(nodeId != null ? nodeId : "");
+    out.writeBoolean(success);
+    out.writeOptionalString(errorMessage);
+
+    // Serialize SearchResponse (implements Writeable)
+    if (searchResponse != null) {
+      out.writeBoolean(true);
+      searchResponse.writeTo(out);
+    } else {
+      out.writeBoolean(false);
+    }
+
+    // Serialize operator pipeline results (Phase 5B)
+    if (pipelineFieldNames != null && pipelineRows != null) {
+      out.writeBoolean(true);
+      out.writeStringCollection(pipelineFieldNames);
+      out.writeVInt(pipelineRows.size());
+      for (List<Object> row : pipelineRows) {
+        for (Object value : row) {
+          out.writeGenericValue(value);
+        }
+      }
+    } else {
+      out.writeBoolean(false);
+    }
+  }
+
+  /** Creates a successful response with results. */
+  public static ExecuteDistributedTaskResponse success(
+      String nodeId, List<Object> results, Map<String, Object> stats) {
+    return new ExecuteDistributedTaskResponse(results, stats, nodeId, true, null);
+  }
+
+  /** Creates a failure response with error information. */
+  public static ExecuteDistributedTaskResponse failure(String nodeId, String errorMessage) {
+    return new ExecuteDistributedTaskResponse(List.of(), Map.of(), nodeId, false, errorMessage);
+  }
+
+  /** Creates a successful response containing row data from operator pipeline (Phase 5B). */
+  public static ExecuteDistributedTaskResponse successWithRows(
+      String nodeId, List<String> fieldNames, List<List<Object>> rows) {
+    ExecuteDistributedTaskResponse resp =
+        new ExecuteDistributedTaskResponse(List.of(), Map.of(), nodeId, true, null);
+    resp.setPipelineFieldNames(fieldNames);
+    resp.setPipelineRows(rows);
+    return resp;
+  }
+
+  /** Creates a successful response containing a SearchResponse (Phase 1C). */
+  public static ExecuteDistributedTaskResponse successWithSearch(
+      String nodeId, SearchResponse searchResponse) {
+    ExecuteDistributedTaskResponse resp =
+        new ExecuteDistributedTaskResponse(List.of(), Map.of(), nodeId, true, null);
+    resp.setSearchResponse(searchResponse);
+    return resp;
+  }
+
+  /** Gets the number of results returned. */
+  public int getResultCount() {
+    return results != null ? results.size() : 0;
+  }
+
+  /** Checks if the execution was successful. */
+  public boolean isSuccessful() {
+    return success && errorMessage == null;
+  }
+
+  @Override
+  public String toString() {
+    return String.format(
+        "ExecuteDistributedTaskResponse{nodeId='%s', success=%s, results=%d, error='%s'}",
+        nodeId, success, getResultCount(), errorMessage);
+  }
+}
diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/TransportExecuteDistributedTaskAction.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/TransportExecuteDistributedTaskAction.java
new file mode 100644
index 00000000000..e7b6b4df211
--- /dev/null
+++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/TransportExecuteDistributedTaskAction.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.opensearch.executor.distributed;
+
+import lombok.extern.log4j.Log4j2;
+import org.opensearch.action.support.ActionFilters;
+import org.opensearch.action.support.HandledTransportAction;
+import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.common.inject.Inject;
+import org.opensearch.core.action.ActionListener;
+import org.opensearch.indices.IndicesService;
+import org.opensearch.sql.opensearch.executor.distributed.pipeline.OperatorPipelineExecutor;
+import org.opensearch.tasks.Task;
+import org.opensearch.transport.TransportService;
+import org.opensearch.transport.client.Client;
+
+/**
+ * Transport action handler for executing distributed query tasks on data nodes.
+ *
+ * <p>This handler runs on each cluster node and processes ExecuteDistributedTaskRequest messages
+ * from the coordinator. It executes the operator pipeline locally using direct Lucene access and
+ * returns results via ExecuteDistributedTaskResponse.
+ *
+ * <p>Execution Process:
+ *
+ * <ol>
+ *   <li>Receive OPERATOR_PIPELINE request from coordinator node
+ *   <li>Execute LuceneScanOperator + LimitOperator pipeline on assigned shards
+ *   <li>Return rows to coordinator
+ * </ol>
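+ *
+ * <p>As a usage sketch (the stage ID, index, and field names are illustrative), a coordinator can
+ * dispatch a task through the standard client API:
+ *
+ * <pre>{@code
+ * ExecuteDistributedTaskRequest request =
+ *     new ExecuteDistributedTaskRequest(
+ *         "stage-0", "bank", List.of(0, 1), "OPERATOR_PIPELINE",
+ *         List.of("gender", "age"), 100, null);
+ * client.execute(ExecuteDistributedTaskAction.INSTANCE, request, listener);
+ * }</pre>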
+ */
+@Log4j2
+public class TransportExecuteDistributedTaskAction
+    extends HandledTransportAction<ExecuteDistributedTaskRequest, ExecuteDistributedTaskResponse> {
+
+  public static final String NAME = "cluster:admin/opensearch/sql/distributed/execute";
+
+  private final ClusterService clusterService;
+  private final Client client;
+  private final IndicesService indicesService;
+
+  @Inject
+  public TransportExecuteDistributedTaskAction(
+      TransportService transportService,
+      ActionFilters actionFilters,
+      ClusterService clusterService,
+      Client client,
+      IndicesService indicesService) {
+    super(
+        ExecuteDistributedTaskAction.NAME,
+        transportService,
+        actionFilters,
+        ExecuteDistributedTaskRequest::new);
+    this.clusterService = clusterService;
+    this.client = client;
+    this.indicesService = indicesService;
+  }
+
+  @Override
+  protected void doExecute(
+      Task task,
+      ExecuteDistributedTaskRequest request,
+      ActionListener<ExecuteDistributedTaskResponse> listener) {
+
+    String nodeId = clusterService.localNode().getId();
+
+    try {
+      log.info(
+          "[Operator Pipeline] Executing on node: {} for index: {}, shards: {}",
+          nodeId,
+          request.getIndexName(),
+          request.getShardIds());
+
+      OperatorPipelineExecutor.OperatorPipelineResult result =
+          OperatorPipelineExecutor.execute(indicesService, request);
+
+      log.info(
+          "[Operator Pipeline] Completed on node: {} - {} rows", nodeId, result.getRows().size());
+
+      listener.onResponse(
+          ExecuteDistributedTaskResponse.successWithRows(
+              nodeId, result.getFieldNames(), result.getRows()));
+    } catch (Exception e) {
+      log.error("[Operator Pipeline] Failed on node: {}", nodeId, e);
+      listener.onResponse(
+          ExecuteDistributedTaskResponse.failure(
+              nodeId, "Operator pipeline failed: " + e.getMessage()));
+    }
+  }
+}
diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/FilterToLuceneConverter.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/FilterToLuceneConverter.java
new file mode 100644
index 00000000000..b9345a7c139
--- /dev/null
+++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/FilterToLuceneConverter.java
@@ -0,0 +1,320 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.opensearch.executor.distributed.operator;
+
+import java.util.List;
+import java.util.Map;
+import lombok.extern.log4j.Log4j2;
+import org.apache.lucene.document.DoublePoint;
+import org.apache.lucene.document.FloatPoint;
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.index.IndexService;
+import org.opensearch.index.mapper.KeywordFieldMapper;
+import org.opensearch.index.mapper.MappedFieldType;
+import org.opensearch.index.mapper.MapperService;
+import org.opensearch.index.mapper.NumberFieldMapper;
+import org.opensearch.index.mapper.TextFieldMapper;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.index.query.QueryShardContext;
+
+/**
+ * Converts serialized filter conditions to Lucene queries using the local shard's field mappings.
+ *
+ * <p>Each filter condition is a Map with keys:
+ *
+ * <ul>
+ *   <li>"field" (String) - field name
+ *   <li>"op" (String) - operator: EQ, NEQ, GT, GTE, LT, LTE
+ *   <li>"value" (Object) - comparison value
+ * </ul>
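+ *
+ * <p>For example, a PPL predicate such as {@code age > 30} (field name chosen for illustration)
+ * would arrive as:
+ *
+ * <pre>{@code
+ * Map<String, Object> condition = Map.of("field", "age", "op", "GT", "value", 30);
+ * }</pre>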
+ * + *

+ * <p>Multiple conditions are combined with AND. The converter uses MapperService to resolve field
+ * types and creates appropriate Lucene queries:
+ *
+ * <ul>
+ *   <li>Keyword fields: TermQuery / TermRangeQuery
+ *   <li>Numeric fields: LongPoint / IntPoint / DoublePoint range queries
+ *   <li>Text fields: TermQuery on the field directly
+ * </ul>
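+ *
+ * <p>A minimal usage sketch (assumes a MapperService obtained from the local IndexService):
+ *
+ * <pre>{@code
+ * List<Map<String, Object>> conditions =
+ *     List.of(Map.of("field", "age", "op", "GTE", "value", 30));
+ * Query query = FilterToLuceneConverter.convert(conditions, mapperService);
+ * }</pre>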
+ */
+@Log4j2
+public class FilterToLuceneConverter {
+
+  private FilterToLuceneConverter() {}
+
+  /**
+   * Converts a list of filter conditions to a single Lucene query.
+   *
+   * @param conditions filter conditions (null or empty means match all)
+   * @param mapperService the shard's mapper service for field type resolution
+   * @return a Lucene Query
+   */
+  public static Query convert(
+      List<Map<String, Object>> conditions, MapperService mapperService) {
+    return convert(conditions, mapperService, null);
+  }
+
+  /**
+   * Converts a list of filter conditions to a single Lucene query with IndexService for
+   * query_string support.
+   *
+   * @param conditions filter conditions (null or empty means match all)
+   * @param mapperService the shard's mapper service for field type resolution
+   * @param indexService the index service for creating a QueryShardContext (for query_string)
+   * @return a Lucene Query
+   */
+  public static Query convert(
+      List<Map<String, Object>> conditions,
+      MapperService mapperService,
+      IndexService indexService) {
+    if (conditions == null || conditions.isEmpty()) {
+      return new MatchAllDocsQuery();
+    }
+
+    if (conditions.size() == 1) {
+      return convertSingle(conditions.get(0), mapperService, indexService);
+    }
+
+    // Multiple conditions: AND them together
+    BooleanQuery.Builder bool = new BooleanQuery.Builder();
+    for (Map<String, Object> condition : conditions) {
+      Query q = convertSingle(condition, mapperService, indexService);
+      bool.add(q, BooleanClause.Occur.FILTER);
+    }
+    return bool.build();
+  }
+
+  private static Query convertSingle(
+      Map<String, Object> condition, MapperService mapperService, IndexService indexService) {
+    // Handle query_string type (from PPL inline filters)
+    String type = (String) condition.get("type");
+    if ("query_string".equals(type)) {
+      return convertQueryString(condition, indexService);
+    }
+
+    String field = (String) condition.get("field");
+    String op = (String) condition.get("op");
+    Object value = condition.get("value");
+
+    if (field == null || op == null) {
+      log.warn("[Filter] Invalid filter condition: {}", condition);
+      return new MatchAllDocsQuery();
+    }
+
+    // Resolve field type from shard mapping
+    MappedFieldType fieldType = mapperService.fieldType(field);
+    if (fieldType == null) {
+      // Try with .keyword suffix for text fields
+      fieldType = mapperService.fieldType(field + ".keyword");
+      if (fieldType != null) {
+        field = field + ".keyword";
+      } else {
+        log.warn("[Filter] Field '{}' not found in mapping, skipping filter", field);
+        return new MatchAllDocsQuery();
+      }
+    }
+
+    log.debug(
+        "[Filter] Converting: field={}, op={}, value={}, fieldType={}",
+        field,
+        op,
+        value,
+        fieldType.getClass().getSimpleName());
+
+    return switch (op) {
+      case "EQ" -> buildEqualityQuery(field, value, fieldType);
+      case "NEQ" -> buildNegationQuery(buildEqualityQuery(field, value, fieldType));
+      case "GT" -> buildRangeQuery(field, value, fieldType, false, false);
+      case "GTE" -> buildRangeQuery(field, value, fieldType, true, false);
+      case "LT" -> buildRangeQuery(field, value, fieldType, false, true);
+      case "LTE" -> buildRangeQuery(field, value, fieldType, true, true);
+      default -> {
+        log.warn("[Filter] Unknown operator: {}", op);
+        yield new MatchAllDocsQuery();
+      }
+    };
+  }
+
+  private static Query buildEqualityQuery(String field, Object value, MappedFieldType fieldType) {
+    if (fieldType instanceof NumberFieldMapper.NumberFieldType numType) {
+      return buildNumericExactQuery(field, value, numType);
+    } else if (fieldType instanceof KeywordFieldMapper.KeywordFieldType) {
+      return new TermQuery(new
Term(field, value.toString())); + } else if (fieldType instanceof TextFieldMapper.TextFieldType) { + // For text fields, use the analyzed field + return new TermQuery(new Term(field, value.toString().toLowerCase())); + } else { + // Generic fallback: term query + return new TermQuery(new Term(field, value.toString())); + } + } + + private static Query buildNegationQuery(Query inner) { + BooleanQuery.Builder bool = new BooleanQuery.Builder(); + bool.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST); + bool.add(inner, BooleanClause.Occur.MUST_NOT); + return bool.build(); + } + + /** + * Builds a range query for the given field and value. + * + * @param inclusive whether the bound is inclusive (>= or <=) + * @param isUpper if true, value is the upper bound; if false, value is the lower bound + */ + private static Query buildRangeQuery( + String field, Object value, MappedFieldType fieldType, boolean inclusive, boolean isUpper) { + + if (fieldType instanceof NumberFieldMapper.NumberFieldType numType) { + return buildNumericRangeQuery(field, value, numType, inclusive, isUpper); + } else if (fieldType instanceof KeywordFieldMapper.KeywordFieldType) { + return buildKeywordRangeQuery(field, value, inclusive, isUpper); + } else { + // Generic fallback: keyword range + return buildKeywordRangeQuery(field, value, inclusive, isUpper); + } + } + + private static Query buildNumericExactQuery( + String field, Object value, NumberFieldMapper.NumberFieldType numType) { + String typeName = numType.typeName(); + return switch (typeName) { + case "long" -> LongPoint.newExactQuery(field, toLong(value)); + case "integer" -> IntPoint.newExactQuery(field, toInt(value)); + case "double" -> DoublePoint.newExactQuery(field, toDouble(value)); + case "float" -> FloatPoint.newExactQuery(field, toFloat(value)); + default -> LongPoint.newExactQuery(field, toLong(value)); + }; + } + + private static Query buildNumericRangeQuery( + String field, + Object value, + NumberFieldMapper.NumberFieldType numType, + boolean inclusive, + boolean isUpper) { + String typeName = numType.typeName(); + return switch (typeName) { + case "long" -> buildLongRange(field, toLong(value), inclusive, isUpper); + case "integer" -> buildIntRange(field, toInt(value), inclusive, isUpper); + case "double" -> buildDoubleRange(field, toDouble(value), inclusive, isUpper); + case "float" -> buildFloatRange(field, toFloat(value), inclusive, isUpper); + default -> buildLongRange(field, toLong(value), inclusive, isUpper); + }; + } + + private static Query buildLongRange( + String field, long value, boolean inclusive, boolean isUpper) { + if (isUpper) { + long upper = inclusive ? value : value - 1; + return LongPoint.newRangeQuery(field, Long.MIN_VALUE, upper); + } else { + long lower = inclusive ? value : value + 1; + return LongPoint.newRangeQuery(field, lower, Long.MAX_VALUE); + } + } + + private static Query buildIntRange(String field, int value, boolean inclusive, boolean isUpper) { + if (isUpper) { + int upper = inclusive ? value : value - 1; + return IntPoint.newRangeQuery(field, Integer.MIN_VALUE, upper); + } else { + int lower = inclusive ? value : value + 1; + return IntPoint.newRangeQuery(field, lower, Integer.MAX_VALUE); + } + } + + private static Query buildDoubleRange( + String field, double value, boolean inclusive, boolean isUpper) { + if (isUpper) { + double upper = inclusive ? value : Math.nextDown(value); + return DoublePoint.newRangeQuery(field, Double.NEGATIVE_INFINITY, upper); + } else { + double lower = inclusive ? 
value : Math.nextUp(value); + return DoublePoint.newRangeQuery(field, lower, Double.POSITIVE_INFINITY); + } + } + + private static Query buildFloatRange( + String field, float value, boolean inclusive, boolean isUpper) { + if (isUpper) { + float upper = inclusive ? value : Math.nextDown(value); + return FloatPoint.newRangeQuery(field, Float.NEGATIVE_INFINITY, upper); + } else { + float lower = inclusive ? value : Math.nextUp(value); + return FloatPoint.newRangeQuery(field, lower, Float.POSITIVE_INFINITY); + } + } + + private static Query buildKeywordRangeQuery( + String field, Object value, boolean inclusive, boolean isUpper) { + BytesRef bytesVal = new BytesRef(value.toString()); + if (isUpper) { + return new TermRangeQuery(field, null, bytesVal, true, inclusive); + } else { + return new TermRangeQuery(field, bytesVal, null, inclusive, true); + } + } + + private static long toLong(Object value) { + if (value instanceof Number n) return n.longValue(); + return Long.parseLong(value.toString()); + } + + private static int toInt(Object value) { + if (value instanceof Number n) return n.intValue(); + return Integer.parseInt(value.toString()); + } + + private static double toDouble(Object value) { + if (value instanceof Number n) return n.doubleValue(); + return Double.parseDouble(value.toString()); + } + + private static float toFloat(Object value) { + if (value instanceof Number n) return n.floatValue(); + return Float.parseFloat(value.toString()); + } + + /** + * Converts a query_string filter to a Lucene query using OpenSearch's QueryStringQueryBuilder. + * PPL inline filters (e.g., source=bank gender='F') get converted to query_string syntax like + * "gender:F" by the PPL parser. Uses OpenSearch's query builder for proper field type handling + * (numeric, keyword, text, etc.). 
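+   *
+   * <p>For instance, an inline filter arrives as a condition map like (values illustrative):
+   *
+   * <pre>{@code
+   * Map<String, Object> condition = Map.of("type", "query_string", "query", "gender:F");
+   * }</pre>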
+ */ + private static Query convertQueryString( + Map condition, IndexService indexService) { + String queryText = (String) condition.get("query"); + if (queryText == null || queryText.isEmpty()) { + log.warn("[Filter] Empty query_string condition"); + return new MatchAllDocsQuery(); + } + + if (indexService == null) { + log.warn("[Filter] IndexService not available, can't convert query_string: {}", queryText); + return new MatchAllDocsQuery(); + } + + try { + QueryShardContext queryShardContext = + indexService.newQueryShardContext(0, null, () -> 0L, null); + Query query = QueryBuilders.queryStringQuery(queryText).toQuery(queryShardContext); + log.info("[Filter] Converted query_string '{}' to Lucene query: {}", queryText, query); + return query; + } catch (Exception e) { + log.warn("[Filter] Failed to convert query_string '{}': {}", queryText, e.getMessage()); + return new MatchAllDocsQuery(); + } + } +} diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/LimitOperator.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/LimitOperator.java new file mode 100644 index 00000000000..3d57d241177 --- /dev/null +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/LimitOperator.java @@ -0,0 +1,82 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.executor.distributed.operator; + +import org.opensearch.sql.planner.distributed.operator.Operator; +import org.opensearch.sql.planner.distributed.operator.OperatorContext; +import org.opensearch.sql.planner.distributed.page.Page; + +/** + * Operator that limits the number of rows passing through the pipeline. Truncates pages when the + * accumulated row count reaches the configured limit. 
+ */ +public class LimitOperator implements Operator { + + private final int limit; + private final OperatorContext context; + + private int accumulatedRows; + private Page pendingOutput; + private boolean inputFinished; + + public LimitOperator(int limit, OperatorContext context) { + this.limit = limit; + this.context = context; + this.accumulatedRows = 0; + } + + @Override + public boolean needsInput() { + return pendingOutput == null && accumulatedRows < limit && !inputFinished; + } + + @Override + public void addInput(Page page) { + if (page == null || accumulatedRows >= limit) { + return; + } + + int remaining = limit - accumulatedRows; + int pageRows = page.getPositionCount(); + + if (pageRows <= remaining) { + // Entire page fits within limit + accumulatedRows += pageRows; + pendingOutput = page; + } else { + // Truncate page to remaining rows + pendingOutput = page.getRegion(0, remaining); + accumulatedRows += remaining; + } + } + + @Override + public Page getOutput() { + Page output = pendingOutput; + pendingOutput = null; + return output; + } + + @Override + public boolean isFinished() { + return accumulatedRows >= limit || (inputFinished && pendingOutput == null); + } + + @Override + public void finish() { + inputFinished = true; + } + + @Override + public OperatorContext getContext() { + return context; + } + + @Override + public void close() { + // No resources to release + } +} diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/LuceneScanOperator.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/LuceneScanOperator.java new file mode 100644 index 00000000000..b6d14608143 --- /dev/null +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/LuceneScanOperator.java @@ -0,0 +1,272 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.executor.distributed.operator; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import lombok.extern.log4j.Log4j2; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.StoredFields; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.opensearch.common.xcontent.XContentHelper; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.core.common.bytes.BytesArray; +import org.opensearch.index.engine.Engine; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.sql.planner.distributed.operator.OperatorContext; +import org.opensearch.sql.planner.distributed.operator.SourceOperator; +import org.opensearch.sql.planner.distributed.page.Page; +import org.opensearch.sql.planner.distributed.page.PageBuilder; +import org.opensearch.sql.planner.distributed.split.DataUnit; + +/** + * Source operator that reads documents directly from Lucene via {@link + * IndexShard#acquireSearcher(String)}. + * + *

Uses Lucene's Weight/Scorer pattern to iterate only documents matching the filter query. When + * no filter is provided, uses {@link MatchAllDocsQuery} to match all documents. + * + *
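+ * <p>Construction sketch (the shard, context, and query variables are placeholders):
+ *
+ * <pre>{@code
+ * LuceneScanOperator scan =
+ *     new LuceneScanOperator(indexShard, List.of("gender", "age"), 1024, ctx, luceneQuery);
+ * Page page = scan.getOutput(); // pulls the next batch of up to 1024 matching rows
+ * }</pre>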

+ * <p>Reads {@code _source} JSON from stored fields and extracts requested field values.
+ */
+@Log4j2
+public class LuceneScanOperator implements SourceOperator {
+
+  private final IndexShard indexShard;
+  private final List<String> fieldNames;
+  private final int batchSize;
+  private final OperatorContext context;
+  private final Query luceneQuery;
+
+  private DataUnit dataUnit;
+  private boolean noMoreDataUnits;
+  private boolean finished;
+  private Engine.Searcher engineSearcher;
+
+  // Weight/Scorer state for filtered iteration
+  private List<LeafReaderContext> leaves;
+  private int currentLeafIndex;
+  private StoredFields currentStoredFields;
+  private Scorer currentScorer;
+  private DocIdSetIterator currentDocIdIterator;
+  private Bits currentLiveDocs;
+
+  /**
+   * Creates a LuceneScanOperator with a filter query merged into the scan.
+   *
+   * @param indexShard the shard to read from
+   * @param fieldNames fields to extract from _source
+   * @param batchSize rows per page batch
+   * @param context operator context
+   * @param luceneQuery the Lucene query for filtering (null means match all)
+   */
+  public LuceneScanOperator(
+      IndexShard indexShard,
+      List<String> fieldNames,
+      int batchSize,
+      OperatorContext context,
+      Query luceneQuery) {
+    this.indexShard = indexShard;
+    this.fieldNames = fieldNames;
+    this.batchSize = batchSize;
+    this.context = context;
+    this.luceneQuery = luceneQuery != null ? luceneQuery : new MatchAllDocsQuery();
+    this.finished = false;
+    this.currentLeafIndex = 0;
+  }
+
+  /** Backward-compatible constructor that matches all documents. */
+  public LuceneScanOperator(
+      IndexShard indexShard, List<String> fieldNames, int batchSize, OperatorContext context) {
+    this(indexShard, fieldNames, batchSize, context, null);
+  }
+
+  @Override
+  public void addDataUnit(DataUnit dataUnit) {
+    this.dataUnit = dataUnit;
+  }
+
+  @Override
+  public void noMoreDataUnits() {
+    this.noMoreDataUnits = true;
+  }
+
+  @Override
+  public Page getOutput() {
+    if (finished) {
+      return null;
+    }
+
+    try {
+      // Lazy initialization: acquire searcher and prepare Weight on first call
+      if (engineSearcher == null) {
+        engineSearcher = indexShard.acquireSearcher("distributed-pipeline");
+        leaves = engineSearcher.getIndexReader().leaves();
+        if (leaves.isEmpty()) {
+          finished = true;
+          return null;
+        }
+        advanceToLeaf(0);
+      }
+
+      PageBuilder builder = new PageBuilder(fieldNames.size());
+      int rowsInBatch = 0;
+
+      while (rowsInBatch < batchSize) {
+        // Advance to next matching doc
+        int docId = nextMatchingDoc();
+        if (docId == DocIdSetIterator.NO_MORE_DOCS) {
+          finished = true;
+          return builder.isEmpty() ? null : builder.build();
+        }
+
+        // Read the document's _source
+        org.apache.lucene.document.Document doc = currentStoredFields.document(docId);
+        BytesRef sourceBytes = doc.getBinaryValue("_source");
+
+        if (sourceBytes == null) {
+          continue;
+        }
+
+        Map<String, Object> source =
+            XContentHelper.convertToMap(new BytesArray(sourceBytes), false, XContentType.JSON).v2();
+
+        builder.beginRow();
+        for (int i = 0; i < fieldNames.size(); i++) {
+          builder.setValue(i, getNestedValue(source, fieldNames.get(i)));
+        }
+        builder.endRow();
+        rowsInBatch++;
+      }
+
+      return builder.isEmpty() ? null : builder.build();
+
+    } catch (IOException e) {
+      log.error("Error reading from Lucene shard: {}", indexShard.shardId(), e);
+      finished = true;
+      throw new RuntimeException("Failed to read from Lucene shard", e);
+    }
+  }
+
+  /**
+   * Returns the next matching live document ID using the Weight/Scorer pattern. Advances across
+   * leaf readers (segments) as needed.
Skips deleted/soft-deleted documents by checking the + * segment's liveDocs bitset — Lucene's Scorer.iterator() does NOT filter deleted docs. + */ + private int nextMatchingDoc() throws IOException { + while (currentLeafIndex < leaves.size()) { + if (currentDocIdIterator != null) { + while (true) { + int docId = currentDocIdIterator.nextDoc(); + if (docId == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + // Skip deleted/soft-deleted docs: liveDocs == null means all docs are live + if (currentLiveDocs == null || currentLiveDocs.get(docId)) { + return docId; + } + } + } + // Move to next leaf + currentLeafIndex++; + if (currentLeafIndex < leaves.size()) { + advanceToLeaf(currentLeafIndex); + } + } + return DocIdSetIterator.NO_MORE_DOCS; + } + + /** + * Advances to the specified leaf (segment) and creates a Scorer for it. The Scorer uses the + * Lucene query to efficiently iterate only matching documents in that segment. Also captures the + * segment's liveDocs bitset for filtering deleted/soft-deleted documents. + */ + private void advanceToLeaf(int leafIndex) throws IOException { + LeafReaderContext leafCtx = leaves.get(leafIndex); + currentStoredFields = leafCtx.reader().storedFields(); + currentLiveDocs = leafCtx.reader().getLiveDocs(); + + // Create Weight/Scorer for filtered iteration using the engine's IndexSearcher + // (Engine.Searcher extends IndexSearcher with proper soft-delete handling) + Query rewritten = engineSearcher.rewrite(luceneQuery); + Weight weight = engineSearcher.createWeight(rewritten, ScoreMode.COMPLETE_NO_SCORES, 1.0f); + + currentScorer = weight.scorer(leafCtx); + if (currentScorer != null) { + currentDocIdIterator = currentScorer.iterator(); + } else { + // No matching docs in this segment + currentDocIdIterator = null; + } + } + + /** + * Navigates a nested map using a dotted field path. For "machine.os1", navigates into + * source["machine"]["os1"]. Handles arrays by extracting values from the first element. Falls + * back to direct key lookup for non-dotted fields. 
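+   *
+   * <p>For example (source map shown inline for illustration):
+   *
+   * <pre>{@code
+   * // source = {"machine": {"os1": "windows"}}
+   * getNestedValue(source, "machine.os1"); // returns "windows"
+   * }</pre>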
+ */ + @SuppressWarnings("unchecked") + private Object getNestedValue(Map source, String fieldName) { + // Try direct key first (covers non-dotted names and flattened fields) + Object direct = source.get(fieldName); + if (direct != null) { + return direct; + } + + // Navigate dotted path: "machine.os1" → source["machine"]["os1"] + if (fieldName.contains(".")) { + String[] parts = fieldName.split("\\."); + Object current = source; + for (String part : parts) { + if (current instanceof Map) { + current = ((Map) current).get(part); + } else if (current instanceof List list) { + // For array fields, extract from the first element + if (!list.isEmpty() && list.get(0) instanceof Map) { + current = ((Map) list.get(0)).get(part); + } else { + return null; + } + } else { + return null; + } + } + return current; + } + + return null; + } + + @Override + public boolean isFinished() { + return finished; + } + + @Override + public void finish() { + finished = true; + } + + @Override + public OperatorContext getContext() { + return context; + } + + @Override + public void close() { + if (engineSearcher != null) { + engineSearcher.close(); + engineSearcher = null; + } + } +} diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/ResultCollector.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/ResultCollector.java new file mode 100644 index 00000000000..c33d82cefd1 --- /dev/null +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/operator/ResultCollector.java @@ -0,0 +1,50 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.executor.distributed.operator; + +import java.util.ArrayList; +import java.util.List; +import org.opensearch.sql.planner.distributed.page.Page; + +/** + * Collects pages from the operator pipeline into a list of rows. Used on data nodes to gather + * pipeline output before serializing into transport response. + */ +public class ResultCollector { + + private final List fieldNames; + private final List> rows; + + public ResultCollector(List fieldNames) { + this.fieldNames = fieldNames; + this.rows = new ArrayList<>(); + } + + /** Extracts rows from a page and adds them to the collected results. */ + public void addPage(Page page) { + if (page == null) { + return; + } + int channelCount = page.getChannelCount(); + for (int pos = 0; pos < page.getPositionCount(); pos++) { + List row = new ArrayList<>(channelCount); + for (int ch = 0; ch < channelCount; ch++) { + row.add(page.getValue(pos, ch)); + } + rows.add(row); + } + } + + /** Returns the field names for the collected data. */ + public List getFieldNames() { + return fieldNames; + } + + /** Returns all collected rows. 
+  public List<List<Object>> getRows() {
+    return rows;
+  }
+}
diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/pipeline/OperatorPipelineExecutor.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/pipeline/OperatorPipelineExecutor.java
new file mode 100644
index 00000000000..07bd095b082
--- /dev/null
+++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/pipeline/OperatorPipelineExecutor.java
@@ -0,0 +1,177 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.opensearch.executor.distributed.pipeline;
+
+import java.util.List;
+import java.util.Map;
+import lombok.extern.log4j.Log4j2;
+import org.apache.lucene.search.Query;
+import org.opensearch.index.IndexService;
+import org.opensearch.index.mapper.MapperService;
+import org.opensearch.index.shard.IndexShard;
+import org.opensearch.indices.IndicesService;
+import org.opensearch.sql.opensearch.executor.distributed.ExecuteDistributedTaskRequest;
+import org.opensearch.sql.opensearch.executor.distributed.operator.FilterToLuceneConverter;
+import org.opensearch.sql.opensearch.executor.distributed.operator.LimitOperator;
+import org.opensearch.sql.opensearch.executor.distributed.operator.LuceneScanOperator;
+import org.opensearch.sql.opensearch.executor.distributed.operator.ResultCollector;
+import org.opensearch.sql.planner.distributed.operator.OperatorContext;
+import org.opensearch.sql.planner.distributed.page.Page;
+
+/**
+ * Orchestrates operator pipeline execution on a data node. Creates a LuceneScanOperator for each
+ * assigned shard, pipes output through a LimitOperator, and collects results.
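+ *
+ * <p>Conceptually, each shard runs a small pull loop; the sketch below mirrors the implementation
+ * in this class:
+ *
+ * <pre>{@code
+ * while (!source.isFinished() && !limit.isFinished()) {
+ *   Page page = source.getOutput();  // scan a batch from Lucene
+ *   if (page != null) {
+ *     limit.addInput(page);          // truncate to the remaining limit
+ *     Page limited = limit.getOutput();
+ *     if (limited != null) {
+ *       collector.addPage(limited);
+ *     }
+ *   }
+ * }
+ * }</pre>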

+ * <p>Filter conditions from the transport request are converted to Lucene queries using the local
+ * shard's field mappings via {@link FilterToLuceneConverter}.
+ */
+@Log4j2
+public class OperatorPipelineExecutor {
+
+  private OperatorPipelineExecutor() {}
+
+  /**
+   * Executes the operator pipeline for the given request.
+   *
+   * @param indicesService used to resolve IndexShard instances and field mappings
+   * @param request contains index name, shard IDs, field names, limit, and filter conditions
+   * @return the collected field names and rows
+   */
+  public static OperatorPipelineResult execute(
+      IndicesService indicesService, ExecuteDistributedTaskRequest request) {

+    String indexName = request.getIndexName();
+    List<Integer> shardIds = request.getShardIds();
+    List<String> fieldNames = request.getFieldNames();
+    int queryLimit = request.getQueryLimit();
+    List<Map<String, Object>> filterConditions = request.getFilterConditions();
+
+    log.info(
+        "[Operator Pipeline] Executing on shards {} for index: {}, fields: {}, limit: {},"
+            + " filters: {}",
+        shardIds,
+        indexName,
+        fieldNames,
+        queryLimit,
+        filterConditions != null ? filterConditions.size() : 0);
+
+    // Resolve MapperService for field type lookup
+    IndexService indexService = resolveIndexService(indicesService, indexName);
+    MapperService mapperService = indexService != null ? indexService.mapperService() : null;
+
+    // Convert filter conditions to Lucene query using local field mappings
+    Query luceneQuery = null;
+    if (mapperService != null && filterConditions != null && !filterConditions.isEmpty()) {
+      luceneQuery = FilterToLuceneConverter.convert(filterConditions, mapperService, indexService);
+      log.info("[Operator Pipeline] Lucene filter query: {}", luceneQuery);
+    }
+
+    ResultCollector collector = new ResultCollector(fieldNames);
+    int remainingLimit = queryLimit;
+
+    for (int shardId : shardIds) {
+      if (remainingLimit <= 0) {
+        break;
+      }
+
+      IndexShard indexShard = resolveIndexShard(indicesService, indexName, shardId);
+      if (indexShard == null) {
+        log.warn("[Operator Pipeline] Could not resolve shard {}/{}", indexName, shardId);
+        continue;
+      }
+
+      OperatorContext ctx = OperatorContext.createDefault("lucene-scan-" + shardId);
+
+      try (LuceneScanOperator source =
+          new LuceneScanOperator(indexShard, fieldNames, 1024, ctx, luceneQuery)) {
+
+        LimitOperator limit = new LimitOperator(remainingLimit, ctx);
+
+        // Pull loop: source -> limit -> collector
+        while (!source.isFinished() && !limit.isFinished()) {
+          Page page = source.getOutput();
+          if (page != null) {
+            limit.addInput(page);
+            Page limited = limit.getOutput();
+            if (limited != null) {
+              collector.addPage(limited);
+            }
+          }
+        }
+
+        // Flush any remaining output from limit
+        limit.finish();
+        Page remaining = limit.getOutput();
+        if (remaining != null) {
+          collector.addPage(remaining);
+        }
+
+        limit.close();
+      } catch (Exception e) {
+        log.error("[Operator Pipeline] Error processing shard {}/{}", indexName, shardId, e);
+        throw new RuntimeException(
+            "Operator pipeline failed on shard " + indexName + "/" + shardId, e);
+      }
+
+      remainingLimit = queryLimit - collector.getRows().size();
+    }
+
+    log.info(
+        "[Operator Pipeline] Completed - collected {} rows from {} shards",
+        collector.getRows().size(),
+        shardIds.size());
+
+    return new OperatorPipelineResult(collector.getFieldNames(), collector.getRows());
+  }
+
+  private static IndexService resolveIndexService(IndicesService indicesService, String indexName) {
+    for (IndexService indexService : indicesService) {
+      if (indexService.index().getName().equals(indexName)) {
+        return indexService;
+      }
+    }
+    log.warn("[Operator Pipeline] Index {} not found on this node", indexName);
+    return null;
+  }
+
+  private static IndexShard resolveIndexShard(
+      IndicesService indicesService, String indexName, int shardId) {
+    for (IndexService indexService : indicesService) {
+      if (indexService.index().getName().equals(indexName)) {
+        try {
+          return indexService.getShard(shardId);
+        } catch (Exception e) {
+          log.warn(
+              "[Operator Pipeline] Shard {} not found on this node for index: {}",
+              shardId,
+              indexName);
+          return null;
+        }
+      }
+    }
+    log.warn("[Operator Pipeline] Index {} not found on this node", indexName);
+    return null;
+  }
+
+  /** Result of operator pipeline execution containing field names and row data. */
+  public static class OperatorPipelineResult {
+    private final List<String> fieldNames;
+    private final List<List<Object>> rows;
+
+    public OperatorPipelineResult(List<String> fieldNames, List<List<Object>> rows) {
+      this.fieldNames = fieldNames;
+      this.rows = rows;
+    }
+
+    public List<String> getFieldNames() {
+      return fieldNames;
+    }
+
+    public List<List<Object>> getRows() {
+      return rows;
+    }
+  }
+}
diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/split/OpenSearchDataUnit.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/split/OpenSearchDataUnit.java
new file mode 100644
index 00000000000..c63c0129051
--- /dev/null
+++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/executor/distributed/split/OpenSearchDataUnit.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.opensearch.executor.distributed.split;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import org.opensearch.sql.planner.distributed.split.DataUnit;
+
+/**
+ * An OpenSearch-specific data unit representing a single shard of an index. Requires local Lucene
+ * access (not remotely accessible) because the LuceneScanOperator reads directly from the shard's
+ * IndexShard via {@code acquireSearcher}.
+ */
+public class OpenSearchDataUnit extends DataUnit {
+
+  private final String indexName;
+  private final int shardId;
+  private final List<String> preferredNodes;
+  private final long estimatedRows;
+  private final long estimatedSizeBytes;
+
+  public OpenSearchDataUnit(
+      String indexName,
+      int shardId,
+      List<String> preferredNodes,
+      long estimatedRows,
+      long estimatedSizeBytes) {
+    this.indexName = indexName;
+    this.shardId = shardId;
+    this.preferredNodes = Collections.unmodifiableList(preferredNodes);
+    this.estimatedRows = estimatedRows;
+    this.estimatedSizeBytes = estimatedSizeBytes;
+  }
+
+  @Override
+  public String getDataUnitId() {
+    return indexName + "/" + shardId;
+  }
+
+  @Override
+  public List<String> getPreferredNodes() {
+    return preferredNodes;
+  }
+
+  @Override
+  public long getEstimatedRows() {
+    return estimatedRows;
+  }
+
+  @Override
+  public long getEstimatedSizeBytes() {
+    return estimatedSizeBytes;
+  }
+
+  @Override
+  public Map<String, String> getProperties() {
+    return Map.of("indexName", indexName, "shardId", String.valueOf(shardId));
+  }
+
+  /**
+   * OpenSearch shard data units require local Lucene access — they cannot be read remotely.
+   *
+   * @return false
+   */
+  @Override
+  public boolean isRemotelyAccessible() {
+    return false;
+  }
+
+  /** Returns the index name this data unit reads from. */
+  public String getIndexName() {
+    return indexName;
+  }
+
+  /** Returns the shard ID within the index. */
+  public int getShardId() {
+    return shardId;
+  }
+
+  @Override
+  public String toString() {
+    return "OpenSearchDataUnit{"
+        + "index='"
+        + indexName
+        + "', shard="
+        + shardId
+        + ", nodes="
+        + preferredNodes
+        + ", ~rows="
+        + estimatedRows
+        + '}';
+  }
+}
diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java
index bd8001f589d..01da8ce050c 100644
--- a/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java
+++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java
@@ -151,6 +151,13 @@ public class OpenSearchSettings extends Settings {
           Setting.Property.NodeScope,
           Setting.Property.Dynamic);
 
+  public static final Setting<Boolean> PPL_DISTRIBUTED_ENABLED_SETTING =
+      Setting.boolSetting(
+          Key.PPL_DISTRIBUTED_ENABLED.getKeyValue(),
+          false,
+          Setting.Property.NodeScope,
+          Setting.Property.Dynamic);
+
   public static final Setting<Boolean> CALCITE_ENGINE_ENABLED_SETTING =
       Setting.boolSetting(
           Key.CALCITE_ENGINE_ENABLED.getKeyValue(),
@@ -437,6 +444,12 @@ public OpenSearchSettings(ClusterSettings clusterSettings) {
         Key.PPL_JOIN_SUBSEARCH_MAXOUT,
         PPL_JOIN_SUBSEARCH_MAXOUT_SETTING,
         new Updater(Key.PPL_JOIN_SUBSEARCH_MAXOUT));
+    register(
+        settingBuilder,
+        clusterSettings,
+        Key.PPL_DISTRIBUTED_ENABLED,
+        PPL_DISTRIBUTED_ENABLED_SETTING,
+        new Updater(Key.PPL_DISTRIBUTED_ENABLED));
     register(
         settingBuilder,
         clusterSettings,
@@ -667,6 +680,7 @@ public static List<Setting<?>> pluginSettings() {
         .add(PPL_VALUES_MAX_LIMIT_SETTING)
         .add(PPL_SUBSEARCH_MAXOUT_SETTING)
         .add(PPL_JOIN_SUBSEARCH_MAXOUT_SETTING)
+        .add(PPL_DISTRIBUTED_ENABLED_SETTING)
         .add(QUERY_MEMORY_LIMIT_SETTING)
         .add(QUERY_SIZE_LIMIT_SETTING)
         .add(QUERY_BUCKET_SIZE_SETTING)
@@ -702,4 +716,14 @@ public static List<Setting<?>> pluginNonDynamicSettings() {
   public List<Setting<?>> getSettings() {
     return pluginSettings();
   }
+
+  /**
+   * Returns whether distributed PPL execution is enabled. Defaults to false for safety -
+   * distributed execution must be explicitly enabled.
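+   *
+   * <p>A sketch of the intended call pattern (the surrounding engine code here is illustrative,
+   * not part of this change):
+   *
+   * <pre>{@code
+   * if (settings.getDistributedExecutionEnabled()) {
+   *   // take the distributed execution path
+   * } else {
+   *   // fall back to the legacy OpenSearchExecutionEngine
+   * }
+   * }</pre>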
+   *
+   * @return true if distributed execution is enabled, false otherwise
+   */
+  public boolean getDistributedExecutionEnabled() {
+    return getSettingValue(Key.PPL_DISTRIBUTED_ENABLED);
+  }
 }
diff --git a/opensearch/src/test/java/org/opensearch/sql/opensearch/executor/DistributedExecutionEngineTest.java b/opensearch/src/test/java/org/opensearch/sql/opensearch/executor/DistributedExecutionEngineTest.java
new file mode 100644
index 00000000000..57353b3afd2
--- /dev/null
+++ b/opensearch/src/test/java/org/opensearch/sql/opensearch/executor/DistributedExecutionEngineTest.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.opensearch.executor;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import org.apache.calcite.rel.RelNode;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.DisplayNameGeneration;
+import org.junit.jupiter.api.DisplayNameGenerator;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.mockito.junit.jupiter.MockitoSettings;
+import org.mockito.quality.Strictness;
+import org.opensearch.sql.ast.statement.ExplainMode;
+import org.opensearch.sql.calcite.CalcitePlanContext;
+import org.opensearch.sql.common.response.ResponseListener;
+import org.opensearch.sql.executor.ExecutionContext;
+import org.opensearch.sql.executor.ExecutionEngine;
+import org.opensearch.sql.executor.ExecutionEngine.QueryResponse;
+import org.opensearch.sql.opensearch.setting.OpenSearchSettings;
+import org.opensearch.sql.planner.physical.PhysicalPlan;
+
+@ExtendWith(MockitoExtension.class)
+@MockitoSettings(strictness = Strictness.LENIENT)
+@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class)
+class DistributedExecutionEngineTest {
+
+  @Mock private OpenSearchExecutionEngine legacyEngine;
+  @Mock private OpenSearchSettings settings;
+  @Mock private PhysicalPlan physicalPlan;
+  @Mock private RelNode relNode;
+  @Mock private CalcitePlanContext calciteContext;
+  @Mock private ResponseListener<QueryResponse> responseListener;
+  @Mock private ExecutionContext executionContext;
+
+  private DistributedExecutionEngine distributedEngine;
+
+  @BeforeEach
+  void setUp() {
+    distributedEngine = new DistributedExecutionEngine(legacyEngine, settings);
+  }
+
+  @Test
+  void should_use_legacy_engine_when_distributed_execution_disabled() {
+    when(settings.getDistributedExecutionEnabled()).thenReturn(false);
+
+    distributedEngine.execute(physicalPlan, executionContext, responseListener);
+
+    verify(legacyEngine, times(1)).execute(physicalPlan, executionContext, responseListener);
+  }
+
+  @Test
+  void should_throw_when_distributed_enabled_for_physical_plan() {
+    when(settings.getDistributedExecutionEnabled()).thenReturn(true);
+
+    assertThrows(
+        UnsupportedOperationException.class,
+        () -> distributedEngine.execute(physicalPlan, executionContext, responseListener));
+  }
+
+  @Test
+  void should_throw_when_distributed_enabled_for_calcite_relnode() {
+    when(settings.getDistributedExecutionEnabled()).thenReturn(true);
+
+    assertThrows(
+        UnsupportedOperationException.class,
+        () -> distributedEngine.execute(relNode, calciteContext, responseListener));
+  }
+ } + + @Test + void should_use_legacy_engine_for_calcite_relnode_when_disabled() { + when(settings.getDistributedExecutionEnabled()).thenReturn(false); + + distributedEngine.execute(relNode, calciteContext, responseListener); + + verify(legacyEngine, times(1)).execute(relNode, calciteContext, responseListener); + } + + @Test + void should_delegate_explain_to_legacy_engine() { + @SuppressWarnings("unchecked") + ResponseListener explainListener = + mock(ResponseListener.class); + + distributedEngine.explain(physicalPlan, explainListener); + + verify(legacyEngine, times(1)).explain(physicalPlan, explainListener); + } + + @Test + void should_delegate_calcite_explain_to_legacy_when_disabled() { + @SuppressWarnings("unchecked") + ResponseListener explainListener = + mock(ResponseListener.class); + ExplainMode mode = ExplainMode.STANDARD; + when(settings.getDistributedExecutionEnabled()).thenReturn(false); + + distributedEngine.explain(relNode, mode, calciteContext, explainListener); + + verify(legacyEngine, times(1)).explain(relNode, mode, calciteContext, explainListener); + } + + @Test + void should_throw_for_calcite_explain_when_distributed_enabled() { + @SuppressWarnings("unchecked") + ResponseListener explainListener = + mock(ResponseListener.class); + ExplainMode mode = ExplainMode.STANDARD; + when(settings.getDistributedExecutionEnabled()).thenReturn(true); + + assertThrows( + UnsupportedOperationException.class, + () -> distributedEngine.explain(relNode, mode, calciteContext, explainListener)); + } + + @Test + void constructor_should_initialize() { + DistributedExecutionEngine engine = new DistributedExecutionEngine(legacyEngine, settings); + assertNotNull(engine); + } +} diff --git a/opensearch/src/test/java/org/opensearch/sql/opensearch/executor/distributed/TransportExecuteDistributedTaskActionTest.java b/opensearch/src/test/java/org/opensearch/sql/opensearch/executor/distributed/TransportExecuteDistributedTaskActionTest.java new file mode 100644 index 00000000000..b66d6e67d5f --- /dev/null +++ b/opensearch/src/test/java/org/opensearch/sql/opensearch/executor/distributed/TransportExecuteDistributedTaskActionTest.java @@ -0,0 +1,119 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.executor.distributed; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.List; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayNameGeneration; +import org.junit.jupiter.api.DisplayNameGenerator; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.mockito.junit.jupiter.MockitoSettings; +import org.mockito.quality.Strictness; +import org.opensearch.action.support.ActionFilters; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.indices.IndicesService; +import org.opensearch.tasks.Task; +import org.opensearch.transport.TransportService; +import org.opensearch.transport.client.Client; + +@ExtendWith(MockitoExtension.class) +@MockitoSettings(strictness = Strictness.LENIENT) +@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) +class 
TransportExecuteDistributedTaskActionTest { + + @Mock private TransportService transportService; + @Mock private ClusterService clusterService; + @Mock private ActionFilters actionFilters; + @Mock private Client client; + @Mock private IndicesService indicesService; + @Mock private Task task; + + private TransportExecuteDistributedTaskAction action; + + @BeforeEach + void setUp() { + action = + new TransportExecuteDistributedTaskAction( + transportService, actionFilters, clusterService, client, indicesService); + + // Setup cluster service mock + DiscoveryNode localNode = mock(DiscoveryNode.class); + when(localNode.getId()).thenReturn("test-node-1"); + when(clusterService.localNode()).thenReturn(localNode); + } + + @Test + void action_name_should_be_defined() { + assertEquals( + "cluster:admin/opensearch/sql/distributed/execute", + TransportExecuteDistributedTaskAction.NAME); + } + + @Test + void should_validate_operator_pipeline_request() { + // Given: Valid operator pipeline request + ExecuteDistributedTaskRequest request = new ExecuteDistributedTaskRequest(); + request.setExecutionMode("OPERATOR_PIPELINE"); + request.setIndexName("test-index"); + request.setShardIds(List.of(0, 1)); + request.setFieldNames(List.of("field1", "field2")); + request.setQueryLimit(100); + request.setStageId("operator-pipeline"); + + // Then + assertTrue(request.isValid()); + assertNotNull(request.toString()); + } + + @Test + void should_reject_invalid_request_missing_index() { + // Given: Request without index name + ExecuteDistributedTaskRequest request = new ExecuteDistributedTaskRequest(); + request.setExecutionMode("OPERATOR_PIPELINE"); + request.setShardIds(List.of(0, 1)); + request.setFieldNames(List.of("field1")); + request.setQueryLimit(100); + + // Then + assertNotNull(request.validate()); + } + + @Test + void should_reject_invalid_request_missing_shards() { + // Given: Request without shard IDs + ExecuteDistributedTaskRequest request = new ExecuteDistributedTaskRequest(); + request.setExecutionMode("OPERATOR_PIPELINE"); + request.setIndexName("test-index"); + request.setFieldNames(List.of("field1")); + request.setQueryLimit(100); + + // Then + assertNotNull(request.validate()); + } + + @Test + void should_reject_invalid_request_missing_fields() { + // Given: Request without field names + ExecuteDistributedTaskRequest request = new ExecuteDistributedTaskRequest(); + request.setExecutionMode("OPERATOR_PIPELINE"); + request.setIndexName("test-index"); + request.setShardIds(List.of(0, 1)); + request.setQueryLimit(100); + + // Then + assertNotNull(request.validate()); + } +} diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java b/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java index d817e13c69f..54ff9cb146a 100644 --- a/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java +++ b/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java @@ -90,10 +90,11 @@ import org.opensearch.sql.legacy.plugin.RestSqlAction; import org.opensearch.sql.legacy.plugin.RestSqlStatsAction; import org.opensearch.sql.opensearch.client.OpenSearchNodeClient; +import org.opensearch.sql.opensearch.executor.distributed.ExecuteDistributedTaskResponse; +import org.opensearch.sql.opensearch.executor.distributed.TransportExecuteDistributedTaskAction; import org.opensearch.sql.opensearch.setting.OpenSearchSettings; import org.opensearch.sql.opensearch.storage.OpenSearchDataSourceFactory; import org.opensearch.sql.opensearch.storage.script.CompoundedScriptEngine; -import 
diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java b/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java
index d817e13c69f..54ff9cb146a 100644
--- a/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java
+++ b/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java
@@ -90,10 +90,11 @@ import org.opensearch.sql.legacy.plugin.RestSqlAction;
 import org.opensearch.sql.legacy.plugin.RestSqlStatsAction;
 import org.opensearch.sql.opensearch.client.OpenSearchNodeClient;
+import org.opensearch.sql.opensearch.executor.distributed.ExecuteDistributedTaskResponse;
+import org.opensearch.sql.opensearch.executor.distributed.TransportExecuteDistributedTaskAction;
 import org.opensearch.sql.opensearch.setting.OpenSearchSettings;
 import org.opensearch.sql.opensearch.storage.OpenSearchDataSourceFactory;
 import org.opensearch.sql.opensearch.storage.script.CompoundedScriptEngine;
-import org.opensearch.sql.plugin.config.OpenSearchPluginModule;
 import org.opensearch.sql.plugin.rest.RestPPLQueryAction;
 import org.opensearch.sql.plugin.rest.RestPPLStatsAction;
 import org.opensearch.sql.plugin.rest.RestQuerySettingsAction;
@@ -225,7 +226,11 @@ public List<RestHandler> getRestHandlers(
         new ActionType<>(
             TransportWriteDirectQueryResourcesRequestAction.NAME,
             WriteDirectQueryResourcesActionResponse::new),
-        TransportWriteDirectQueryResourcesRequestAction.class));
+        TransportWriteDirectQueryResourcesRequestAction.class),
+        new ActionHandler<>(
+            new ActionType<>(
+                TransportExecuteDistributedTaskAction.NAME, ExecuteDistributedTaskResponse::new),
+            TransportExecuteDistributedTaskAction.class));
   }
 
   @Override
@@ -250,7 +255,7 @@ public Collection<Object> createComponents(
     LocalClusterState.state().setPluginSettings((OpenSearchSettings) pluginSettings);
     LocalClusterState.state().setClient(client);
     ModulesBuilder modules = new ModulesBuilder();
-    modules.add(new OpenSearchPluginModule());
+    // Removed OpenSearchPluginModule - SQLPlugin only needs async and direct query services
     modules.add(
         b -> {
           b.bind(NodeClient.class).toInstance((NodeClient) client);
diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/config/OpenSearchPluginModule.java b/plugin/src/main/java/org/opensearch/sql/plugin/config/OpenSearchPluginModule.java
index 8027301073f..9ae5ef567dc 100644
--- a/plugin/src/main/java/org/opensearch/sql/plugin/config/OpenSearchPluginModule.java
+++ b/plugin/src/main/java/org/opensearch/sql/plugin/config/OpenSearchPluginModule.java
@@ -6,6 +6,7 @@ package org.opensearch.sql.plugin.config;
 
 import lombok.RequiredArgsConstructor;
+import org.opensearch.cluster.service.ClusterService;
 import org.opensearch.common.inject.AbstractModule;
 import org.opensearch.common.inject.Provides;
 import org.opensearch.common.inject.Singleton;
@@ -22,12 +23,14 @@ import org.opensearch.sql.monitor.ResourceMonitor;
 import org.opensearch.sql.opensearch.client.OpenSearchClient;
 import org.opensearch.sql.opensearch.client.OpenSearchNodeClient;
+import org.opensearch.sql.opensearch.executor.DistributedExecutionEngine;
 import org.opensearch.sql.opensearch.executor.OpenSearchExecutionEngine;
 import org.opensearch.sql.opensearch.executor.OpenSearchQueryManager;
 import org.opensearch.sql.opensearch.executor.protector.ExecutionProtector;
 import org.opensearch.sql.opensearch.executor.protector.OpenSearchExecutionProtector;
 import org.opensearch.sql.opensearch.monitor.OpenSearchMemoryHealthy;
 import org.opensearch.sql.opensearch.monitor.OpenSearchResourceMonitor;
+import org.opensearch.sql.opensearch.setting.OpenSearchSettings;
 import org.opensearch.sql.opensearch.storage.OpenSearchStorageEngine;
 import org.opensearch.sql.planner.Planner;
 import org.opensearch.sql.planner.optimizer.LogicalPlanOptimizer;
@@ -59,8 +62,17 @@ public StorageEngine storageEngine(OpenSearchClient client, Settings settings) {
 
   @Provides
   public ExecutionEngine executionEngine(
-      OpenSearchClient client, ExecutionProtector protector, PlanSerializer planSerializer) {
-    return new OpenSearchExecutionEngine(client, protector, planSerializer);
+      OpenSearchClient client,
+      ExecutionProtector protector,
+      PlanSerializer planSerializer,
+      ClusterService clusterService) {
+    OpenSearchExecutionEngine legacyEngine =
+        new OpenSearchExecutionEngine(client, protector, planSerializer);
+
+    OpenSearchSettings openSearchSettings =
+        new OpenSearchSettings(clusterService.getClusterSettings());
+
+    return new DistributedExecutionEngine(legacyEngine, openSearchSettings);
   }
 
   @Provides
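The provider above reads the distributed flag through an OpenSearchSettings built from the live ClusterSettings, so the engine can honor a dynamically updated value. The setting registration itself is not part of these hunks; the sketch below shows how a dynamic boolean setting and its accessor typically look in OpenSearch, with the accessor name taken from the tests and the surrounding wiring assumed.

  // Sketch under assumptions - not the PR's actual registration code.
  public static final Setting<Boolean> PPL_DISTRIBUTED_ENABLED_SETTING =
      Setting.boolSetting(
          Key.PPL_DISTRIBUTED_ENABLED.getKeyValue(),
          false, // assumed default: distributed execution is opt-in
          Setting.Property.NodeScope,
          Setting.Property.Dynamic);

  // Accessor name matches the stubbed calls in DistributedExecutionEngineTest.
  public boolean getDistributedExecutionEnabled() {
    // getSettingValue(Key) is the accessor pattern this codebase's Settings exposes;
    // its use for the new key is assumed here.
    return getSettingValue(Key.PPL_DISTRIBUTED_ENABLED);
  }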
diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/transport/TransportPPLQueryAction.java b/plugin/src/main/java/org/opensearch/sql/plugin/transport/TransportPPLQueryAction.java
index 27bfe2084f7..5dbb840b07d 100644
--- a/plugin/src/main/java/org/opensearch/sql/plugin/transport/TransportPPLQueryAction.java
+++ b/plugin/src/main/java/org/opensearch/sql/plugin/transport/TransportPPLQueryAction.java
@@ -75,6 +75,8 @@ public TransportPPLQueryAction(
           b.bind(org.opensearch.sql.common.setting.Settings.class)
               .toInstance(new OpenSearchSettings(clusterService.getClusterSettings()));
           b.bind(DataSourceService.class).toInstance(dataSourceService);
+          b.bind(ClusterService.class).toInstance(clusterService);
+          b.bind(TransportService.class).toInstance(transportService);
         });
     this.injector = Guice.createInjector(modules);
     this.pplEnabled =