Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@
* New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)).
* New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)).

* Flink 2.0 support for Java Classic and Portable Flink Runners ([#36947](https://github.com/apache/beam/issues/36947)),
experimental support for other SDK languages including Python.

## I/Os

* Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
Expand All @@ -72,6 +75,9 @@

## Breaking Changes

* Portable Java SDK now encodes SchemaCoders in a portable way ([#34672](https://github.com/apache/beam/issues/34672)).
- Original custom Java coder encoding can still be obtained using [StreamingOptions.setUpdateCompatibilityVersion("2.71")](https://github.com/apache/beam/blob/2cf0930e7ae1aa389c26ce6639b584877a3e31d9/sdks/java/core/src/main/java/org/apache/beam/sdk/options/StreamingOptions.java#L47).
  - Fixes [#36496](https://github.com/apache/beam/issues/36496), [#30276](https://github.com/apache/beam/issues/30276), [#29245](https://github.com/apache/beam/issues/29245).
* The Python SDK container's `boot.go` now passes pipeline options through a file instead of the `PIPELINE_OPTIONS` environment variable. If a user pairs a new Python SDK container with an older SDK version (which does not support the file-based approach), the pipeline options will not be recognized and the pipeline will fail. Users must ensure their SDK and container versions are synchronized ([#37370](https://github.com/apache/beam/issues/37370)).

## Deprecations
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ public class DataflowPipelineTranslator {
private static byte[] serializeWindowingStrategy(
WindowingStrategy<?, ?> windowingStrategy, PipelineOptions options) {
try {
SdkComponents sdkComponents = SdkComponents.create();
SdkComponents sdkComponents = SdkComponents.create(options);

String workerHarnessContainerImageURL =
DataflowRunner.getContainerImageForJob(options.as(DataflowPipelineOptions.class));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1319,13 +1319,13 @@ public DataflowPipelineJob run(Pipeline pipeline) {
// with the SDK harness image (which implements Fn API).
//
// The same Environment is used in different and contradictory ways, depending on whether
// it is a v1 or v2 job submission.
// it is a portable or non-portable job submission.
RunnerApi.Environment defaultEnvironmentForDataflow =
Environments.createDockerEnvironment(workerHarnessContainerImageURL);

// The SdkComponents for portable an non-portable job submission must be kept distinct. Both
// The SdkComponents for portable and non-portable job submission must be kept distinct. Both
// need the default environment.
SdkComponents portableComponents = SdkComponents.create();
SdkComponents portableComponents = SdkComponents.create(options);
portableComponents.registerEnvironment(
defaultEnvironmentForDataflow
.toBuilder()
Expand Down Expand Up @@ -1357,28 +1357,29 @@ public DataflowPipelineJob run(Pipeline pipeline) {
dataflowOptions.setPipelineUrl(stagedPipeline.getLocation());

if (useUnifiedWorker(options)) {
LOG.info("Skipping v1 transform replacements since job will run on v2.");
LOG.info(
"Skipping non-portable transform replacements since job will run on portable worker.");
} else {
// Now rewrite things to be as needed for v1 (mutates the pipeline)
// This way the job submitted is valid for v1 and v2, simultaneously
// Now rewrite things to be as needed for non-portable (mutates the pipeline).
// This way the job submitted is valid for portable and non-portable, simultaneously.
replaceV1Transforms(pipeline);
}
// Capture the SdkComponents for look up during step translations
SdkComponents dataflowV1Components = SdkComponents.create();
dataflowV1Components.registerEnvironment(
// Capture the SdkComponents for look up during step translations.
SdkComponents dataflowNonPortableComponents = SdkComponents.create(options);
dataflowNonPortableComponents.registerEnvironment(
defaultEnvironmentForDataflow
.toBuilder()
.addAllDependencies(getDefaultArtifacts())
.addAllCapabilities(Environments.getJavaCapabilities())
.build());
// No need to perform transform upgrading for the Runner v1 proto.
RunnerApi.Pipeline dataflowV1PipelineProto =
PipelineTranslation.toProto(pipeline, dataflowV1Components, true, false);
// No need to perform transform upgrading for the non-portable runner proto.
RunnerApi.Pipeline dataflowNonPortablePipelineProto =
PipelineTranslation.toProto(pipeline, dataflowNonPortableComponents, true, false);

if (LOG.isDebugEnabled()) {
LOG.debug(
"Dataflow v1 pipeline proto:\n{}",
TextFormat.printer().printToString(dataflowV1PipelineProto));
"Dataflow non-portable worker pipeline proto:\n{}",
TextFormat.printer().printToString(dataflowNonPortablePipelineProto));
}

// Set a unique client_request_id in the CreateJob request.
Expand All @@ -1398,7 +1399,11 @@ public DataflowPipelineJob run(Pipeline pipeline) {

JobSpecification jobSpecification =
translator.translate(
pipeline, dataflowV1PipelineProto, dataflowV1Components, this, packages);
pipeline,
dataflowNonPortablePipelineProto,
dataflowNonPortableComponents,
this,
packages);

if (!isNullOrEmpty(dataflowOptions.getDataflowWorkerJar()) && !useUnifiedWorker(options)) {
List<String> experiments =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import com.google.api.services.dataflow.model.Job;
import com.google.api.services.dataflow.model.Step;
import com.google.api.services.dataflow.model.WorkerPool;
import com.google.auto.value.AutoValue;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
Expand Down Expand Up @@ -89,7 +90,10 @@
import org.apache.beam.sdk.io.range.OffsetRange;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.schemas.AutoValueSchema;
import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
import org.apache.beam.sdk.state.StateSpec;
import org.apache.beam.sdk.state.StateSpecs;
import org.apache.beam.sdk.state.ValueState;
Expand Down Expand Up @@ -153,7 +157,7 @@ public class DataflowPipelineTranslatorTest implements Serializable {
@Rule public transient ExpectedException thrown = ExpectedException.none();

private SdkComponents createSdkComponents(PipelineOptions options) {
SdkComponents sdkComponents = SdkComponents.create();
SdkComponents sdkComponents = SdkComponents.create(options);

String containerImageURL =
DataflowRunner.getContainerImageForJob(options.as(DataflowPipelineOptions.class));
Expand Down Expand Up @@ -1125,7 +1129,7 @@ public String apply(byte[] input) {
file1.deleteOnExit();
File file2 = File.createTempFile("file2-", ".txt");
file2.deleteOnExit();
SdkComponents sdkComponents = SdkComponents.create();
SdkComponents sdkComponents = SdkComponents.create(options);
sdkComponents.registerEnvironment(
Environments.createDockerEnvironment(DataflowRunner.getContainerImageForJob(options))
.toBuilder()
Expand Down Expand Up @@ -1699,4 +1703,53 @@ public OffsetRange getInitialRange(@Element String element) {
return null;
}
}

// A minimal AutoValue type annotated with @DefaultSchema(AutoValueSchema.class) so that the
// Beam SDK infers a Schema (and hence a SchemaCoder) for it. Used as the element type in the
// SchemaCoder translation test below.
@AutoValue
@DefaultSchema(AutoValueSchema.class)
public abstract static class SimpleAutoValue {
// String-typed schema field.
public abstract String getString();

// 32-bit integer schema field.
public abstract Integer getInt32();

// 64-bit integer schema field.
public abstract Long getInt64();

// Static factory delegating to the AutoValue-generated implementation class.
public static DataflowPipelineTranslatorTest.SimpleAutoValue of(
String string, Integer int32, Long int64) {
return new AutoValue_DataflowPipelineTranslatorTest_SimpleAutoValue(string, int32, int64);
}
}

// Verifies how a SchemaCoder is translated into the pipeline proto: by default it is encoded
// with the portable schema-coder URN, while with an update-compatibility version of 2.72 or
// earlier it falls back to the legacy Java custom-coder encoding.
@Test
public void testSchemaCoderTranslation() throws Exception {
DataflowPipelineOptions options = buildPipelineOptions();
Pipeline pipeline = Pipeline.create(options);
// Build a pipeline whose mapped output element type (SimpleAutoValue) has an inferred
// schema, so its coder is a SchemaCoder.
pipeline
.apply(Impulse.create())
.apply(
MapElements.via(
new SimpleFunction<byte[], SimpleAutoValue>() {
@Override
public SimpleAutoValue apply(byte[] input) {
return SimpleAutoValue.of("foo", 5, 10L);
}
}))
.apply(Window.into(FixedWindows.of(Duration.standardMinutes(1))));
// Default behavior: SchemaCoder is translated to the portable schema coder URN.
{
SdkComponents sdkComponents = createSdkComponents(options);
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
Map<String, RunnerApi.Coder> coders = pipelineProto.getComponents().getCodersMap();
assertTrue(coders.containsKey("SchemaCoder"));
assertEquals("beam:coder:schema:v1", coders.get("SchemaCoder").getSpec().getUrn());
}

// Prior to version 2.73, SchemaCoders are translated as custom java coders.
{
options.as(StreamingOptions.class).setUpdateCompatibilityVersion("2.72");
SdkComponents sdkComponents = createSdkComponents(options);
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
Map<String, RunnerApi.Coder> coders = pipelineProto.getComponents().getCodersMap();
assertTrue(coders.containsKey("SchemaCoder"));
// Legacy encoding uses the Java-serialized SDK coder URN instead of the portable one.
assertEquals("beam:coders:javasdk:0.1", coders.get("SchemaCoder").getSpec().getUrn());
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.util.SerializableUtils;
import org.apache.beam.sdk.util.construction.CoderTranslators.TranslationContextWithOptions;
import org.apache.beam.vendor.grpc.v1p69p0.com.google.protobuf.ByteString;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.BiMap;
Expand Down Expand Up @@ -131,12 +132,13 @@ private static RunnerApi.Coder toKnownCoder(Coder<?> coder, SdkComponents compon
throws IOException {
CoderTranslator translator = getKnownTranslators().get(coder.getClass());
List<String> componentIds = registerComponents(coder, translator, components);
TranslationContextWithOptions context = () -> components::getPipelineOptions;
return RunnerApi.Coder.newBuilder()
.addAllComponentCoderIds(componentIds)
.setSpec(
FunctionSpec.newBuilder()
.setUrn(getKnownCoderUrns().get(coder.getClass()))
.setPayload(ByteString.copyFrom(translator.getPayload(coder))))
.setUrn(translator.getUrn(coder, context))
.setPayload(ByteString.copyFrom(translator.getPayload(coder, context))))
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
* additional payload, which is not currently supported. This exists as a temporary measure.
*/
public interface CoderTranslator<T extends Coder<?>> {

/**
 * Returns the String to use for the coder URN.
 *
 * <p>Defaults to the statically registered known-coder URN for the coder's class, falling back
 * to the Java-serialized custom coder URN when no known translation is registered. Implementors
 * may override this to choose a URN based on {@code context} (e.g. pipeline options).
 */
default String getUrn(T coder, TranslationContext context) {
return CoderTranslation.getKnownCoderUrns()
.getOrDefault(coder.getClass(), CoderTranslation.JAVA_SERIALIZED_CODER_URN);
}

/** Extract all component {@link Coder coders} within a coder. */
List<? extends Coder<?>> getComponents(T from);

Expand All @@ -37,7 +44,7 @@ public interface CoderTranslator<T extends Coder<?>> {
*
* <p>The default implementation returns a byte array of length zero.
*/
default byte[] getPayload(T from) {
default byte[] getPayload(T from, TranslationContext context) {
return new byte[0];
}

Expand Down
Loading
Loading