diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7690d7dc..fd95c24e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,7 +22,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        java: [8, 11]
+        java: [17, 21]
     steps:
       - uses: actions/checkout@v3
@@ -37,5 +37,5 @@ jobs:
       - name: Build without Scala
         run: ./gradlew build --scan --stacktrace -Porg.gradle.project.enable_scala=false
-      - name: Build with Scala
-        run: ./gradlew build --scan --stacktrace -Porg.gradle.project.enable_scala=true
+#      - name: Build with Scala
+#        run: ./gradlew build --scan --stacktrace -Porg.gradle.project.enable_scala=true
diff --git a/README.md b/README.md
index 9ed2a0cb..8fee9dff 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,7 @@ You will need to set up your environment in order to develop, debug, and execute
 Flink supports Linux, OS X, and Windows as development environments for Flink programs and local execution. The following software is required for a Flink development setup and should be installed on your system:

 - Git
-- a JDK for Java 8 or Java 11 (a JRE is not sufficient; other versions of Java are currently not supported)
+- a JDK for Java 11, 17, or 21 (a JRE is not sufficient; other versions of Java are currently not supported)
 - an IDE for Java (and/or Scala) development with Gradle support
   - We recommend [IntelliJ](https://www.jetbrains.com/idea/), but [Eclipse](https://www.eclipse.org/downloads/) or [Visual Studio Code](https://code.visualstudio.com/) (with the [Java extension pack](https://code.visualstudio.com/docs/java/java-tutorial)) can also be used so long as you stick to Java
   - For Scala, you will need to use IntelliJ (and its [Scala plugin](https://plugins.jetbrains.com/plugin/1347-scala/))
@@ -130,7 +130,7 @@ You can also selectively apply this plugin in a single subproject if desired.

 The project needs to be imported as a gradle project into your IDE.

-Then you should be able to open [`RideCleansingTest`](ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingTest.java) and run this test.
+Then you should be able to open [`RideCleansingUnitTest`](ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingUnitTest.java) and run this test.

 > **:information_source: Note for Scala users:** You will need to use IntelliJ with the JetBrains Scala plugin, and you will need to add a Scala 2.12 SDK to the Global Libraries section of the Project Structure as well as to the module you are working on.
 > IntelliJ will ask you for the latter when you open a Scala file.
@@ -241,6 +241,8 @@ For Java/Scala exercises and solutions, we provide special tasks that can be lis
   - [Exercise](long-ride-alerts/README.md)
   - [Discussion](long-ride-alerts/DISCUSSION.md)

+
+
 ## Contribute

 If you would like to contribute to this repository or add new exercises, please read the [contributing](CONTRIBUTING.md) guide.
diff --git a/README_zh.md b/README_zh.md
index 1914e08e..d73b9c3c 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -59,7 +59,7 @@ under the License.
 Linux、OS X 和 Windows 均可作为 Flink 程序和本地执行的开发环境。 Flink 开发设置需要以下软件,它们应该安装在系统上:

 - Git
-- Java 8 或者 Java 11 版本的 JDK (JRE不满足要求;目前不支持其他版本的Java)
+- Java 11 或者 Java 17 或者 Java 21 版本的 JDK (JRE不满足要求;目前不支持其他版本的Java)
 - 支持 Gradle 的 Java (及/或 Scala) 开发IDE
   - 推荐使用 [IntelliJ](https://www.jetbrains.com/idea/), 但 [Eclipse](https://www.eclipse.org/downloads/) 或 [Visual Studio Code](https://code.visualstudio.com/) (安装 [Java extension pack](https://code.visualstudio.com/docs/java/java-tutorial) 插件) 也可以用于Java环境
   - 为了使用 Scala, 需要使用 IntelliJ (及其 [Scala plugin](https://plugins.jetbrains.com/plugin/1347-scala/) 插件)
@@ -134,7 +134,7 @@ org.gradle.project.enable_scala = true

 本项目应作为 gradle 项目导入到IDE中。

-然后应该可以打开 [`RideCleansingTest`](ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingTest.java) 并运行此测试。
+然后应该可以打开 [`RideCleansingUnitTest`](ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingUnitTest.java) 并运行此测试。

 > **:information_source: Scala 用户须知:** 需要将 IntelliJ 与 JetBrains Scala 插件一起使用,并且需要将 Scala 2.12 SDK 添加到项目结构的全局库部分以及工作模块中。
 > 当打开 Scala 文件时,IntelliJ 会要求提供后者(JetBrains Scala 插件)。
diff --git a/build.gradle b/build.gradle
index 3344a8e2..add9a22c 100644
--- a/build.gradle
+++ b/build.gradle
@@ -16,15 +16,15 @@
  */

 plugins {
-    id 'com.github.johnrengelman.shadow' version '7.0.0' apply false
-    id "com.diffplug.spotless" version "6.4.2" apply false
+    id 'com.gradleup.shadow' version '9.0.0-rc2' apply false
+    id "com.diffplug.spotless" version "7.2.1" apply false
 }

 description = "Flink Training Exercises"

 allprojects {
     group = 'org.apache.flink'
-    version = '1.17-SNAPSHOT'
+    version = '2.0.0-SNAPSHOT'

     apply plugin: 'com.diffplug.spotless'

@@ -33,7 +33,7 @@ allprojects {
             target '*.gradle', '*.md', '.gitignore'

             trimTrailingWhitespace()
-            indentWithSpaces(4)
+            leadingTabsToSpaces(4)
             endWithNewline()
         }

@@ -89,13 +89,13 @@ subprojects {
     if (project.properties['org.gradle.project.enable_scala'].trim() == 'true') {
         apply plugin: 'scala'
     }
-    apply plugin: 'com.github.johnrengelman.shadow'
+    apply plugin: 'com.gradleup.shadow'
     apply plugin: 'checkstyle'
     apply plugin: 'eclipse'

     ext {
-        javaVersion = '1.8'
-        flinkVersion = '1.17.0'
+        javaVersion = '17'
+        flinkVersion = '2.0.0'
         scalaBinaryVersion = '2.12'
         log4jVersion = '2.12.1'
         junitVersion = '4.13'
@@ -127,9 +127,12 @@ subprojects {
         shadow "org.apache.logging.log4j:log4j-core:${log4jVersion}"

         shadow "org.apache.flink:flink-clients:${flinkVersion}"
-        shadow "org.apache.flink:flink-java:${flinkVersion}"
+// removed
+//      shadow "org.apache.flink:flink-java:${flinkVersion}"
+        shadow "org.apache.flink:flink-runtime:${flinkVersion}"
         shadow "org.apache.flink:flink-streaming-java:${flinkVersion}"
-        shadow "org.apache.flink:flink-streaming-scala_${scalaBinaryVersion}:${flinkVersion}"
+// removed
+//      shadow "org.apache.flink:flink-streaming-scala_${scalaBinaryVersion}:${flinkVersion}"

         // allows using Flink's web UI when running in the IDE:
         shadow "org.apache.flink:flink-runtime-web:${flinkVersion}"
@@ -174,7 +177,7 @@ subprojects {

     spotless {
         java {
-            googleJavaFormat('1.7').aosp()
+            googleJavaFormat('1.28.0').aosp()

             // \# refers to static imports
             importOrder('org.apache.flink', 'org.apache.flink.shaded', '', 'javax', 'java', 'scala', '\\#')
diff --git a/common/src/main/java/org/apache/flink/training/examples/ridecount/RideCountExample.java b/common/src/main/java/org/apache/flink/training/examples/ridecount/RideCountExample.java
index 36b7b899..438620fe 100644
--- a/common/src/main/java/org/apache/flink/training/examples/ridecount/RideCountExample.java
+++ b/common/src/main/java/org/apache/flink/training/examples/ridecount/RideCountExample.java
@@ -23,9 +23,12 @@
 import org.apache.flink.streaming.api.datastream.DataStream;
 import org.apache.flink.streaming.api.datastream.KeyedStream;
 import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
 import org.apache.flink.training.exercises.common.datatypes.TaxiRide;
 import org.apache.flink.training.exercises.common.sources.TaxiRideGenerator;

+import java.time.Duration;
+
 /**
  * Example that counts the rides for each driver.
  *
@@ -47,7 +50,18 @@ public static void main(String[] args) throws Exception {
         StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

         // start the data generator
-        DataStream<TaxiRide> rides = env.addSource(new TaxiRideGenerator());
+        DataStream<TaxiRide> rides =
+                env.fromSource(
+                        new TaxiRideGenerator(),
+                        WatermarkStrategy.<TaxiRide>forBoundedOutOfOrderness(
+                                        Duration.ofSeconds(10))
+                                .withTimestampAssigner(
+                                        (taxiRide, streamRecordTimestamp) ->
+                                                taxiRide.getEventTimeMillis()),
+                        "taxi ride");

         // map each ride to a tuple of (driverId, 1)
         DataStream<Tuple2<Long, Long>> tuples =
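A note on the hunk above: `env.fromSource()` takes a `WatermarkStrategy`, not the old `BoundedOutOfOrdernessTimestampExtractor`; that class was removed in Flink 2.0 together with the rest of the `AssignerWithPeriodicWatermarks` API. The same replacement pattern recurs in every migrated pipeline below. A minimal sketch, with `MyEvent` and its `getEventTimeMillis()` accessor as illustrative placeholders rather than types from this repository (imports as in the hunks above):

    // Sketch: event-time timestamps plus bounded-out-of-orderness watermarks in Flink 2.0.
    WatermarkStrategy<MyEvent> strategy =
            WatermarkStrategy.<MyEvent>forBoundedOutOfOrderness(Duration.ofSeconds(10))
                    .withTimestampAssigner((event, recordTimestamp) -> event.getEventTimeMillis());
    DataStream<MyEvent> events = env.fromSource(source, strategy, "my source");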
diff --git a/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiFareGenerator.java b/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiFareGenerator.java
index 58fbe687..de0b4cb4 100644
--- a/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiFareGenerator.java
+++ b/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiFareGenerator.java
@@ -18,53 +18,69 @@
 package org.apache.flink.training.exercises.common.sources;

-import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
+import org.apache.flink.connector.datagen.source.DataGeneratorSource;
+import org.apache.flink.connector.datagen.source.GeneratorFunction;
 import org.apache.flink.training.exercises.common.datatypes.TaxiFare;
 import org.apache.flink.training.exercises.common.utils.DataGenerator;

 import java.time.Duration;
 import java.time.Instant;
+import java.util.concurrent.ConcurrentLinkedDeque;

 /**
  * This SourceFunction generates a data stream of TaxiFare records.
  *
  * <p>The stream is generated in order.
  */
-public class TaxiFareGenerator implements SourceFunction<TaxiFare> {
+public class TaxiFareGenerator extends DataGeneratorSource<TaxiFare> {

-    private volatile boolean running = true;
-    private Instant limitingTimestamp = Instant.MAX;
+    // The fares are materialized eagerly in buildTaxiFareDeque(), so the generator must be
+    // bounded; 24 hours of event time is an assumed default, narrowed via runFor(Duration).
+    private static Instant limitingTimestamp = DataGenerator.BEGINNING.plus(Duration.ofHours(24));

     /** Create a bounded TaxiFareGenerator that runs only for the specified duration. */
     public static TaxiFareGenerator runFor(Duration duration) {
-        TaxiFareGenerator generator = new TaxiFareGenerator();
-        generator.limitingTimestamp = DataGenerator.BEGINNING.plus(duration);
-        return generator;
+        limitingTimestamp = DataGenerator.BEGINNING.plus(duration);
+        return new TaxiFareGenerator();
     }

-    @Override
-    public void run(SourceContext<TaxiFare> ctx) throws Exception {
-
-        long id = 1;
-
-        while (running) {
-            TaxiFare fare = new TaxiFare(id);
+    /**
+     * Pre-generates the fares this source will emit, in start-time order.
+     *
+     * @return taxiFareDeque
+     */
+    public static ConcurrentLinkedDeque<TaxiFare> buildTaxiFareDeque() {
+        ConcurrentLinkedDeque<TaxiFare> taxiFareDeque = new ConcurrentLinkedDeque<>();
+        for (int i = 1; ; i++) {
+            TaxiFare fare = new TaxiFare(i);
             // don't emit events that exceed the specified limit
             if (fare.startTime.compareTo(limitingTimestamp) >= 0) {
                 break;
             }
-
-            ++id;
-            ctx.collect(fare);
-
-            // match our event production rate to that of the TaxiRideGenerator
-            Thread.sleep(TaxiRideGenerator.SLEEP_MILLIS_PER_EVENT);
+            // append (rather than push) so that poll() below hands the fares out in order
+            taxiFareDeque.add(fare);
         }
+        return taxiFareDeque;
+    }
+
+    /** Creates a TaxiFareGenerator bounded by the default limiting timestamp. */
+    public TaxiFareGenerator() {
+        this(buildTaxiFareDeque());
     }

-    @Override
-    public void cancel() {
-        running = false;
+    /**
+     * Creates a TaxiFareGenerator that emits the given pre-generated fares.
+     *
+     * <p>Each parallel subtask would deserialize its own copy of the deque, so this source is
+     * intended to run at parallelism 1.
+     *
+     * @param taxiFareDeque taxiFareDeque
+     */
+    public TaxiFareGenerator(ConcurrentLinkedDeque<TaxiFare> taxiFareDeque) {
+        super(
+                new GeneratorFunction<Long, TaxiFare>() {
+                    @Override
+                    public TaxiFare map(Long index) throws Exception {
+                        // hand out the next pre-generated fare
+                        return taxiFareDeque.poll();
+                    }
+                },
+                taxiFareDeque.size(),
+                RateLimiterStrategy.perSecond(200),
+                TypeInformation.of(TaxiFare.class));
     }
 }
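Since the fares are now materialized up front, the source must be bounded before it is constructed; the retained `runFor` factory keeps the old test idiom working. A usage sketch (the one-hour span is arbitrary):

    // Sketch: a fare source limited to the first hour of event time.
    TaxiFareGenerator fares = TaxiFareGenerator.runFor(Duration.ofMinutes(60));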
diff --git a/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiRideGenerator.java b/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiRideGenerator.java
index 975ff2bf..d3895315 100644
--- a/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiRideGenerator.java
+++ b/common/src/main/java/org/apache/flink/training/exercises/common/sources/TaxiRideGenerator.java
@@ -18,69 +18,82 @@
 package org.apache.flink.training.exercises.common.sources;

-import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
+import org.apache.flink.connector.datagen.source.DataGeneratorSource;
+import org.apache.flink.connector.datagen.source.GeneratorFunction;
 import org.apache.flink.training.exercises.common.datatypes.TaxiRide;

 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.PriorityQueue;
 import java.util.Random;
+import java.util.concurrent.ConcurrentLinkedDeque;

 /**
  * This SourceFunction generates a data stream of TaxiRide records.
  *
  * <p>The stream is produced out-of-order.
  */
-public class TaxiRideGenerator implements SourceFunction<TaxiRide> {
+public class TaxiRideGenerator extends DataGeneratorSource<TaxiRide> {

     public static final int SLEEP_MILLIS_PER_EVENT = 10;
     private static final int BATCH_SIZE = 5;
-    private volatile boolean running = true;

-    @Override
-    public void run(SourceContext<TaxiRide> ctx) throws Exception {
+    /** Creates the ride generator; its batching state is per subtask, so run at parallelism 1. */
+    public TaxiRideGenerator() {
+        super(
+                new GeneratorFunction<Long, TaxiRide>() {

-        PriorityQueue<TaxiRide> endEventQ = new PriorityQueue<>(100);
-        long id = 0;
-        long maxStartTime = 0;
+                    private long id = 0;
+                    private long maxStartTime = 0;
+                    private final PriorityQueue<TaxiRide> endEventQ = new PriorityQueue<>(100);
+                    private final ConcurrentLinkedDeque<TaxiRide> deque =
+                            new ConcurrentLinkedDeque<>();

-        while (running) {
+                    @Override
+                    public TaxiRide map(Long index) throws Exception {
+                        if (deque.isEmpty()) {
+                            // generate a batch of START events
+                            List<TaxiRide> startEvents = new ArrayList<>(BATCH_SIZE);
+                            for (int i = 1; i <= BATCH_SIZE; i++) {
+                                TaxiRide ride = new TaxiRide(id + i, true);
+                                startEvents.add(ride);
+                                // the start times may be in order, but let's not assume that
+                                maxStartTime = Math.max(maxStartTime, ride.getEventTimeMillis());
+                            }

-            // generate a batch of START events
-            List<TaxiRide> startEvents = new ArrayList<>(BATCH_SIZE);
-            for (int i = 1; i <= BATCH_SIZE; i++) {
-                TaxiRide ride = new TaxiRide(id + i, true);
-                startEvents.add(ride);
-                // the start times may be in order, but let's not assume that
-                maxStartTime = Math.max(maxStartTime, ride.getEventTimeMillis());
-            }
+                            // enqueue the corresponding END events
+                            for (int i = 1; i <= BATCH_SIZE; i++) {
+                                endEventQ.add(new TaxiRide(id + i, false));
+                            }

-            // enqueue the corresponding END events
-            for (int i = 1; i <= BATCH_SIZE; i++) {
-                endEventQ.add(new TaxiRide(id + i, false));
-            }
+                            // release the END events coming before the end of this new batch
+                            // (this allows a few END events to precede their matching START event)
+                            while (endEventQ.peek().getEventTimeMillis() <= maxStartTime) {
+                                // append (rather than push) so poll() preserves the release order
+                                deque.add(endEventQ.poll());
+                            }

-            // release the END events coming before the end of this new batch
-            // (this allows a few END events to precede their matching START event)
-            while (endEventQ.peek().getEventTimeMillis() <= maxStartTime) {
-                TaxiRide ride = endEventQ.poll();
-                ctx.collect(ride);
-            }
+                            // then emit the new START events (out-of-order)
+                            Collections.shuffle(startEvents, new Random(id));
+                            startEvents.forEach(deque::add);

-            // then emit the new START events (out-of-order)
-            java.util.Collections.shuffle(startEvents, new Random(id));
-            startEvents.iterator().forEachRemaining(r -> ctx.collect(r));
-
-            // prepare for the next batch
-            id += BATCH_SIZE;
-
-            // don't go too fast
-            Thread.sleep(BATCH_SIZE * SLEEP_MILLIS_PER_EVENT);
-        }
-    }
-
-    @Override
-    public void cancel() {
-        running = false;
+                            // prepare for the next batch
+                            id += BATCH_SIZE;
+                        }
+                        return deque.poll();
+                    }
+                },
+                Long.MAX_VALUE,
+                RateLimiterStrategy.perSecond(200),
+                TypeInformation.of(TaxiRide.class));
     }
 }
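Both generators now extend `DataGeneratorSource`, so jobs attach them with `env.fromSource()` instead of `env.addSource()`; `RateLimiterStrategy.perSecond(200)` roughly stands in for the pacing the old `Thread.sleep()` loops provided. A wiring sketch (the watermark settings shown are one reasonable choice, not something this patch mandates):

    // Sketch: attaching the rewritten generator through the unified Source API.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStream<TaxiRide> rides =
            env.fromSource(
                    new TaxiRideGenerator(),
                    WatermarkStrategy.<TaxiRide>forBoundedOutOfOrderness(Duration.ofSeconds(10))
                            .withTimestampAssigner((ride, ts) -> ride.getEventTimeMillis()),
                    "taxi rides");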
diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedKeyedProcessFunction.java b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedKeyedProcessFunction.java
index c3ecf2cc..e71b145c 100644
--- a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedKeyedProcessFunction.java
+++ b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedKeyedProcessFunction.java
@@ -18,7 +18,7 @@
 package org.apache.flink.training.exercises.testing;

-import org.apache.flink.configuration.Configuration;
+import org.apache.flink.api.common.functions.OpenContext;
 import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
 import org.apache.flink.training.exercises.common.utils.MissingSolutionException;
 import org.apache.flink.util.Collector;
@@ -48,7 +48,7 @@ public ComposedKeyedProcessFunction(
     }

     @Override
-    public void open(Configuration parameters) throws Exception {
+    public void open(OpenContext parameters) throws Exception {
         try {
             exercise.setRuntimeContext(this.getRuntimeContext());
diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedPipeline.java b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedPipeline.java
index ba277d00..80e1083e 100644
--- a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedPipeline.java
+++ b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedPipeline.java
@@ -19,9 +19,11 @@
 package org.apache.flink.training.exercises.testing;

 import org.apache.flink.api.common.JobExecutionResult;
-import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.api.connector.source.Source;
 import org.apache.flink.training.exercises.common.utils.MissingSolutionException;

+import java.util.function.Supplier;
+
 /**
  * This allows the tests to be run against both the exercises and the solutions.
  *
@@ -39,16 +41,16 @@ public ComposedPipeline(
     }

     @Override
-    public JobExecutionResult execute(SourceFunction<T> source, TestSink<T> sink)
+    public JobExecutionResult execute(Supplier<Source<T, ?, ?>> sourceSupplier, TestSink<T> sink)
             throws Exception {

         JobExecutionResult result;

         try {
-            result = exercise.execute(source, sink);
+            result = exercise.execute(sourceSupplier, sink);
         } catch (Exception e) {
             if (MissingSolutionException.ultimateCauseIsMissingSolution(e)) {
-                result = solution.execute(source, sink);
+                result = solution.execute(sourceSupplier, sink);
             } else {
                 throw e;
             }
diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedRichCoFlatMapFunction.java b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedRichCoFlatMapFunction.java
index 97d5ce21..64e4c130 100644
--- a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedRichCoFlatMapFunction.java
+++ b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedRichCoFlatMapFunction.java
@@ -18,7 +18,7 @@
 package org.apache.flink.training.exercises.testing;

-import org.apache.flink.configuration.Configuration;
+import org.apache.flink.api.common.functions.OpenContext;
 import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction;
 import org.apache.flink.training.exercises.common.utils.MissingSolutionException;
 import org.apache.flink.util.Collector;
@@ -50,7 +50,7 @@ public ComposedRichCoFlatMapFunction(
     }

     @Override
-    public void open(Configuration parameters) throws Exception {
+    public void open(OpenContext parameters) throws Exception {
         try {
             exercise.setRuntimeContext(this.getRuntimeContext());
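The `open(Configuration)` to `open(OpenContext)` change in these test helpers is the mechanical Flink 2.0 migration for every `RichFunction`; the old overload no longer exists, so only the signature changes. The new shape, sketched:

    // Sketch: Flink 2.0 rich-function lifecycle hook.
    @Override
    public void open(OpenContext openContext) throws Exception {
        // same initialization work as before (state handles, metrics, ...);
        // only the parameter type changed
    }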
diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedTwoInputPipeline.java b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedTwoInputPipeline.java
index 95e24983..10625b23 100644
--- a/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedTwoInputPipeline.java
+++ b/common/src/test/java/org/apache/flink/training/exercises/testing/ComposedTwoInputPipeline.java
@@ -19,7 +19,7 @@
 package org.apache.flink.training.exercises.testing;

 import org.apache.flink.api.common.JobExecutionResult;
-import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.api.connector.source.Source;
 import org.apache.flink.training.exercises.common.utils.MissingSolutionException;

 /**
@@ -43,7 +43,7 @@ public ComposedTwoInputPipeline(

     @Override
     public JobExecutionResult execute(
-            SourceFunction<IN1> source1, SourceFunction<IN2> source2, TestSink<OUT> sink)
+            Source<IN1, ?, ?> source1, Source<IN2, ?, ?> source2, TestSink<OUT> sink)
             throws Exception {

         JobExecutionResult result;
diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutablePipeline.java b/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutablePipeline.java
index 685edc47..22ede200 100644
--- a/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutablePipeline.java
+++ b/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutablePipeline.java
@@ -19,8 +19,11 @@
 package org.apache.flink.training.exercises.testing;

 import org.apache.flink.api.common.JobExecutionResult;
-import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.api.connector.source.Source;
+
+import java.util.function.Supplier;

 public interface ExecutablePipeline<T> {
-    JobExecutionResult execute(SourceFunction<T> source, TestSink<T> sink) throws Exception;
+    JobExecutionResult execute(Supplier<Source<T, ?, ?>> sourceSupplier, TestSink<T> sink)
+            throws Exception;
 }
diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutableTwoInputPipeline.java b/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutableTwoInputPipeline.java
index 3f9aba90..0c2944eb 100644
--- a/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutableTwoInputPipeline.java
+++ b/common/src/test/java/org/apache/flink/training/exercises/testing/ExecutableTwoInputPipeline.java
@@ -19,10 +19,10 @@
 package org.apache.flink.training.exercises.testing;

 import org.apache.flink.api.common.JobExecutionResult;
-import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.api.connector.source.Source;

 public interface ExecutableTwoInputPipeline<IN1, IN2, OUT> {
     JobExecutionResult execute(
-            SourceFunction<IN1> source1, SourceFunction<IN2> source2, TestSink<OUT> sink)
+            Source<IN1, ?, ?> source1, Source<IN2, ?, ?> source2, TestSink<OUT> sink)
             throws Exception;
 }
diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/ParallelTestSource.java b/common/src/test/java/org/apache/flink/training/exercises/testing/ParallelTestSource.java
index 6dcfc022..19919d0e 100644
--- a/common/src/test/java/org/apache/flink/training/exercises/testing/ParallelTestSource.java
+++ b/common/src/test/java/org/apache/flink/training/exercises/testing/ParallelTestSource.java
@@ -18,48 +18,463 @@
 package org.apache.flink.training.exercises.testing;

+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.serialization.SerializerConfigImpl; import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.ReaderOutput; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.api.java.typeutils.TypeExtractor; -import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; +import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; +import org.apache.flink.core.io.InputStatus; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputViewStreamWrapper; +import org.apache.flink.core.memory.DataOutputViewStreamWrapper; +import org.apache.flink.streaming.api.lineage.DefaultLineageDataset; +import org.apache.flink.streaming.api.lineage.LineageDataset; +import org.apache.flink.streaming.api.lineage.LineageVertex; +import org.apache.flink.streaming.api.lineage.LineageVertexProvider; +import org.apache.flink.streaming.api.lineage.SourceLineageVertex; +import org.apache.flink.streaming.api.operators.OutputTypeConfigurable; +import org.apache.flink.util.Preconditions; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Objects; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicBoolean; + +public class ParallelTestSource + implements Source, List>, + ResultTypeQueryable, + OutputTypeConfigurable, + LineageVertexProvider { + + /** The (de)serializer to be used for the data elements. */ + private TypeSerializer serializer; + + private final List elements; -public class ParallelTestSource extends RichParallelSourceFunction - implements ResultTypeQueryable { - private final T[] testStream; private final TypeInformation typeInfo; - @SuppressWarnings("unchecked") @SafeVarargs - public ParallelTestSource(T... events) { - this.typeInfo = (TypeInformation) TypeExtractor.createTypeInfo(events[0].getClass()); - this.testStream = events; + public ParallelTestSource(T... 
elements) { + this(List.of(elements)); } - @Override - public void run(SourceContext ctx) { - int indexOfThisSubtask = getRuntimeContext().getIndexOfThisSubtask(); - int numberOfParallelSubtasks = getRuntimeContext().getNumberOfParallelSubtasks(); - int subtask = 0; - - // the elements of the testStream are assigned to the parallel instances in a round-robin - // fashion - for (T element : testStream) { - if (subtask == indexOfThisSubtask) { - ctx.collect(element); + public ParallelTestSource(Collection elements) { + this(null, null, elements); + } + + @SafeVarargs + public ParallelTestSource( + @Nullable TypeSerializer serializer, + @Nullable TypeInformation typeInfo, + T... elements) { + this(serializer, typeInfo, List.of(elements)); + } + + public ParallelTestSource( + @Nullable TypeSerializer serializer, + @Nullable TypeInformation typeInfo, + @Nonnull Collection elements) { + this.elements = new ArrayList<>(elements); + + if (typeInfo != null) { + this.typeInfo = typeInfo; + } else { + if (this.elements.isEmpty()) { + throw new IllegalArgumentException( + "The type information must be specified when the collection is empty"); + } + T firstElement = this.elements.get(0); + try { + this.typeInfo = TypeExtractor.getForObject(firstElement); + } catch (Exception e) { + throw new RuntimeException( + "Could not create TypeInformation for type " + + firstElement.getClass().getName() + + "; please specify the TypeInformation manually", + e); } - subtask = (subtask + 1) % numberOfParallelSubtasks; } - // test sources are finite, so they emit a Long.MAX_VALUE watermark when they finish + checkIterable(elements, this.typeInfo.getTypeClass()); + + this.serializer = + serializer != null + ? serializer + : new KryoSerializer( + this.typeInfo.getTypeClass(), new SerializerConfigImpl()); + } + + @Override + public Boundedness getBoundedness() { + return Boundedness.BOUNDED; } @Override - public void cancel() { - // ignore cancel, finite anyway + public SourceReader> createReader(SourceReaderContext ctx) { + return new InMemoryReader<>(); + } + + @Override + public SplitEnumerator, List> createEnumerator( + SplitEnumeratorContext> enumContext) { + return new InMemoryEnumerator<>(enumContext, elements); + } + + @Override + public SplitEnumerator, List> restoreEnumerator( + SplitEnumeratorContext> enumContext, List checkpoint) { + return new InMemoryEnumerator<>(enumContext, checkpoint); + } + + @Override + public SimpleVersionedSerializer> getSplitSerializer() { + return new InMemorySplitSerializer<>(this.serializer); + } + + @Override + public SimpleVersionedSerializer> getEnumeratorCheckpointSerializer() { + return new CheckpointSerializer<>(this.serializer); } @Override public TypeInformation getProducedType() { - return typeInfo; + return this.typeInfo; + } + + /** Split definition for in-memory data. */ + public static class InMemorySplit implements SourceSplit, Serializable { + private final int splitId; + private final List slice; + + InMemorySplit(int splitId, List slice) { + this.splitId = splitId; + this.slice = new ArrayList<>(slice); + } + + @Override + public String splitId() { + return "split-" + splitId; + } + + public List getSlice() { + return slice; + } + } + + /** SplitEnumerator:split data. 
*/ + public static class InMemoryEnumerator + implements SplitEnumerator, List> { + + private final SplitEnumeratorContext> context; + private final List elements; + private boolean assigned = false; + + InMemoryEnumerator(SplitEnumeratorContext> context, List elements) { + this.context = context; + this.elements = elements; + } + + @Override + public void start() {} + + @Override + public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { + throw new UnsupportedOperationException(); + } + + @Override + public void addSplitsBack(List> splits, int subtaskId) { + throw new UnsupportedOperationException(); + } + + @Override + public void addReader(int subtaskId) { + if (!assigned && context.registeredReaders().size() == context.currentParallelism()) { + assignSplits(); + assigned = true; + } + } + + private void assignSplits() { + int parallelism = context.currentParallelism(); + int step = Math.max(1, (elements.size() + parallelism - 1) / parallelism); + for (int i = 0; i < parallelism; i++) { + int from = i * step; + int to = Math.min(from + step, elements.size()); + if (from >= to) { + while (i < parallelism) { + InMemorySplit split = new InMemorySplit<>(i, List.of()); + context.assignSplit(split, i); + i++; + } + break; + } + InMemorySplit split = new InMemorySplit<>(i, elements.subList(from, to)); + context.assignSplit(split, i); + } + } + + @Override + public List snapshotState(long checkpointId) { + return elements; + } + + @Override + public void close() {} + } + + /** SourceReader: read data. */ + public static class InMemoryReader implements SourceReader> { + + private final Queue remaining = new ArrayDeque<>(); + private final AtomicBoolean initialized = new AtomicBoolean(false); + + public InMemoryReader() {} + + @Override + public void start() {} + + @Override + public InputStatus pollNext(ReaderOutput output) { + if (!initialized.get()) { + return InputStatus.MORE_AVAILABLE; + } + T next = remaining.poll(); + if (next != null) { + output.collect(next); + return InputStatus.MORE_AVAILABLE; + } else { + return InputStatus.END_OF_INPUT; + } + } + + @Override + public List> snapshotState(long checkpointId) { + return Collections.emptyList(); + } + + @Override + public CompletableFuture isAvailable() { + return CompletableFuture.completedFuture((Void) null); + } + + @Override + public void addSplits(List> splits) { + for (InMemorySplit split : splits) { + remaining.addAll(split.getSlice()); + } + initialized.set(true); + } + + @Override + public void notifyNoMoreSplits() {} + + @Override + public void close() {} + } + + public static class InMemorySplitSerializer + implements SimpleVersionedSerializer> { + + /** The (de)serializer to be used for the data elements. 
*/ + private final TypeSerializer serializer; + + public InMemorySplitSerializer(TypeSerializer serializer) { + this.serializer = serializer; + } + + @Override + public int getVersion() { + return 1; + } + + @Override + public byte[] serialize(InMemorySplit split) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputViewStreamWrapper wrapper = new DataOutputViewStreamWrapper(baos); + try { + wrapper.writeInt(split.splitId); + wrapper.writeInt(split.slice.size()); + for (T element : split.slice) { + serializer.serialize(element, wrapper); + } + } catch (Exception e) { + throw new IOException( + "Serializing the source elements failed: " + e.getMessage(), e); + } + return baos.toByteArray(); + } + + @Override + public InMemorySplit deserialize(int version, byte[] serialized) throws IOException { + ByteArrayInputStream bais = new ByteArrayInputStream(serialized); + DataInputViewStreamWrapper wrapper = new DataInputViewStreamWrapper(bais); + try { + int splitId = wrapper.readInt(); + int size = wrapper.readInt(); + List result = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + result.add(serializer.deserialize(wrapper)); + } + return new InMemorySplit<>(splitId, result); + } catch (IOException e) { + throw new IOException( + "Deserializing the source elements failed: " + e.getMessage(), e); + } + } + } + + public static class CheckpointSerializer implements SimpleVersionedSerializer> { + + /** The (de)serializer to be used for the data elements. */ + private final TypeSerializer serializer; + + public CheckpointSerializer(TypeSerializer serializer) { + this.serializer = serializer; + } + + @Override + public int getVersion() { + return 1; + } + + @Override + public byte[] serialize(List obj) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputViewStreamWrapper wrapper = new DataOutputViewStreamWrapper(baos); + try { + wrapper.writeInt(obj.size()); + for (T element : obj) { + serializer.serialize(element, wrapper); + } + } catch (Exception e) { + throw new IOException( + "Serializing the source elements failed: " + e.getMessage(), e); + } + return baos.toByteArray(); + } + + @Override + public List deserialize(int version, byte[] serialized) throws IOException { + ByteArrayInputStream bais = new ByteArrayInputStream(serialized); + DataInputViewStreamWrapper wrapper = new DataInputViewStreamWrapper(bais); + try { + int size = wrapper.readInt(); + List result = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + result.add(serializer.deserialize(wrapper)); + } + return result; + } catch (IOException e) { + throw new IOException( + "Deserializing the source elements failed: " + e.getMessage(), e); + } + } + } + + /** + * Set element type and re-serialize element if required. Should only be called before + * serialization/deserialization of this function. 
+ */ + @Override + public void setOutputType(TypeInformation outTypeInfo, ExecutionConfig executionConfig) { + Preconditions.checkState( + elements != null, + "The output type should've been specified before shipping the graph to the cluster"); + checkIterable(elements, outTypeInfo.getTypeClass()); + TypeSerializer newSerializer = + outTypeInfo.createSerializer(executionConfig.getSerializerConfig()); + if (Objects.equals(serializer, newSerializer)) { + return; + } + serializer = newSerializer; + } + + // ------------------------------------------------------------------------ + // Utilities + // ------------------------------------------------------------------------ + + /** + * Verifies that all elements in the collection are non-null, and are of the given class, or a + * subclass thereof. + * + * @param elements The collection to check. + * @param viewedAs The class to which the elements must be assignable to. + * @param The generic type of the collection to be checked. + */ + public static void checkCollection(Collection elements, Class viewedAs) { + checkIterable(elements, viewedAs); + } + + private static void checkIterable(Iterable elements, Class viewedAs) { + for (OUT elem : elements) { + if (elem == null) { + throw new IllegalArgumentException("The collection contains a null element"); + } + + if (!viewedAs.isAssignableFrom(elem.getClass())) { + throw new IllegalArgumentException( + "The elements in the collection are not all subclasses of " + + viewedAs.getCanonicalName()); + } + } + } + + @Override + public LineageVertex getLineageVertex() { + return new SourceLineageVertex() { + @Override + public Boundedness boundedness() { + return Boundedness.BOUNDED; + } + + @Override + public List datasets() { + return List.of( + new DefaultLineageDataset( + "", "values://FromElementsSource", new HashMap<>())); + } + }; } } diff --git a/common/src/test/java/org/apache/flink/training/exercises/testing/TestSink.java b/common/src/test/java/org/apache/flink/training/exercises/testing/TestSink.java index 4fef8b9d..ca98d269 100644 --- a/common/src/test/java/org/apache/flink/training/exercises/testing/TestSink.java +++ b/common/src/test/java/org/apache/flink/training/exercises/testing/TestSink.java @@ -18,19 +18,37 @@ package org.apache.flink.training.exercises.testing; -import org.apache.flink.api.common.JobExecutionResult; -import org.apache.flink.api.common.accumulators.ListAccumulator; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.api.connector.sink2.WriterInitContext; +import java.io.IOException; +import java.util.Collection; import java.util.List; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; -public class TestSink extends RichSinkFunction { +public class TestSink implements Sink { private final String name; + private final String valueUuId; + + public static final ConcurrentHashMap> RESULTS = + new ConcurrentHashMap<>(); + + public Collection getResults() { + ConcurrentLinkedQueue result = (ConcurrentLinkedQueue) RESULTS.get(valueUuId); + if (result == null) { + return List.of(); + } + return result; + } + public TestSink(String name) { this.name = name; + valueUuId = UUID.randomUUID().toString(); } public TestSink() { @@ -38,16 +56,31 @@ public TestSink() { } @Override - public void open(Configuration 
parameters) { - getRuntimeContext().addAccumulator(name, new ListAccumulator()); + public SinkWriter createWriter(WriterInitContext context) throws IOException { + return new TestSinkWriter<>(valueUuId); } - @Override - public void invoke(OUT value, Context context) { - getRuntimeContext().getAccumulator(name).add(value); - } + public static class TestSinkWriter implements SinkWriter { + + private final String valueUuId; + + public TestSinkWriter(String valueUuId) { + this.valueUuId = valueUuId; + } + + @Override + public void write(OUT element, Context context) throws IOException, InterruptedException { + RESULTS.computeIfAbsent(valueUuId, k -> new ConcurrentLinkedQueue<>()).add(element); + } + + @Override + public void flush(boolean endOfInput) throws IOException, InterruptedException { + System.out.println("TestSinkWriter.flush"); + } - public List getResults(JobExecutionResult jobResult) { - return jobResult.getAccumulatorResult(name); + @Override + public void close() throws Exception { + System.out.println("TestSinkWriter.close"); + } } } diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml index 56d219df..05dc6ef3 100644 --- a/config/checkstyle/checkstyle.xml +++ b/config/checkstyle/checkstyle.xml @@ -247,7 +247,7 @@ This file is based on the checkstyle file of Apache Beam. - + diff --git a/gradle.properties b/gradle.properties index 6be414c4..b7ee9cf0 100644 --- a/gradle.properties +++ b/gradle.properties @@ -3,3 +3,11 @@ org.gradle.parallel = true # Scala exercises can be enabled by setting this to true org.gradle.project.enable_scala = false + +# This is required for Java 16+ to compile the code +org.gradle.jvmargs=-XX:MaxMetaspaceSize=512m \ + --add-exports=jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \ + --add-exports=jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED \ + --add-exports=jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ + --add-exports=jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ + --add-exports=jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index f3d88b1c..9bbc975c 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 69a97150..d4081da4 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.1-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.14.3-bin.zip +networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew index 2fe81a7d..faf93008 100755 --- a/gradlew +++ b/gradlew @@ -1,7 +1,7 @@ -#!/usr/bin/env sh +#!/bin/sh # -# Copyright 2015 the original author or authors. +# Copyright © 2015-2021 the original authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,80 +15,115 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# SPDX-License-Identifier: Apache-2.0 +# ############################################################################## -## -## Gradle start up script for UN*X -## +# +# Gradle start up script for POSIX generated by Gradle. 
+# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# ############################################################################## # Attempt to set APP_HOME + # Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null -APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +# This is normally unused +# shellcheck disable=SC2034 +APP_BASE_NAME=${0##*/} +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s\n' "$PWD" ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" +MAX_FD=maximum warn () { echo "$*" -} +} >&2 die () { echo echo "$*" echo exit 1 -} +} >&2 # OS specific support (must be 'true' or 'false'). 
cygwin=false msys=false darwin=false nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; esac CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + # Determine the Java command to use to start the JVM. if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACMD=$JAVA_HOME/jre/sh/java else - JAVACMD="$JAVA_HOME/bin/java" + JAVACMD=$JAVA_HOME/bin/java fi if [ ! -x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME @@ -97,87 +132,120 @@ Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else - JAVACMD="java" - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + JAVACMD=java + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac fi -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
# For Cygwin or MSYS, switch paths to Windows format before running java -if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) fi - i=`expr $i + 1` + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg done - case $i in - 0) set -- ;; - 1) set -- "$args0" ;; - 2) set -- "$args0" "$args1" ;; - 3) set -- "$args0" "$args1" "$args2" ;; - 4) set -- "$args0" "$args1" "$args2" "$args3" ;; - 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac fi -# Escape application args -save () { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=`save "$@"` -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. 
+ +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat index 9109989e..9b42019c 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -13,8 +13,10 @@ @rem See the License for the specific language governing permissions and @rem limitations under the License. @rem +@rem SPDX-License-Identifier: Apache-2.0 +@rem -@if "%DEBUG%" == "" @echo off +@if "%DEBUG%"=="" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @@ -25,7 +27,8 @@ if "%OS%"=="Windows_NT" setlocal set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. +if "%DIRNAME%"=="" set DIRNAME=. +@rem This is normally unused set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @@ -40,13 +43,13 @@ if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init +if %ERRORLEVEL% equ 0 goto execute -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. +echo. 1>&2 +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 goto fail @@ -54,48 +57,36 @@ goto fail set JAVA_HOME=%JAVA_HOME:"=% set JAVA_EXE=%JAVA_HOME%/bin/java.exe -if exist "%JAVA_EXE%" goto init +if exist "%JAVA_EXE%" goto execute -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. +echo. 1>&2 +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 goto fail -:init -@rem Get command-line arguments, handling Windows variants - -if not "%OS%" == "Windows_NT" goto win9xME_args - -:win9xME_args -@rem Slurp the command line arguments. 
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-
 :execute
 @rem Setup the command line

 set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

+
 @rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*

 :end
 @rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
+if %ERRORLEVEL% equ 0 goto mainEnd

 :fail
 rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
 rem the _cmd.exe /c_ return code!
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
+set EXIT_CODE=%ERRORLEVEL%
+if %EXIT_CODE% equ 0 set EXIT_CODE=1
+if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
+exit /b %EXIT_CODE%

 :mainEnd
 if "%OS%"=="Windows_NT" endlocal
diff --git a/hourly-tips/src/main/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsExercise.java b/hourly-tips/src/main/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsExercise.java
index 79c89aee..21ef481e 100644
--- a/hourly-tips/src/main/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsExercise.java
+++ b/hourly-tips/src/main/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsExercise.java
@@ -19,30 +19,33 @@
 package org.apache.flink.training.exercises.hourlytips;

 import org.apache.flink.api.common.JobExecutionResult;
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.api.connector.sink2.Sink;
+import org.apache.flink.api.connector.source.Source;
 import org.apache.flink.api.java.tuple.Tuple3;
 import org.apache.flink.streaming.api.datastream.DataStream;
 import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
-import org.apache.flink.streaming.api.functions.sink.SinkFunction;
-import org.apache.flink.streaming.api.functions.source.SourceFunction;
+import org.apache.flink.streaming.api.functions.sink.PrintSink;
 import org.apache.flink.training.exercises.common.datatypes.TaxiFare;
 import org.apache.flink.training.exercises.common.sources.TaxiFareGenerator;
 import org.apache.flink.training.exercises.common.utils.MissingSolutionException;

+import java.io.Serializable;
+import java.time.Duration;
+
 /**
  * The Hourly Tips exercise from the Flink training.
  *
<p>
The task of the exercise is to first calculate the total tips collected by each driver, hour * by hour, and then from that stream, find the highest tip total in each hour. */ -public class HourlyTipsExercise { +public class HourlyTipsExercise implements Serializable { - private final SourceFunction source; - private final SinkFunction> sink; + private final Source source; + private final Sink> sink; /** Creates a job using the source and sink provided. */ - public HourlyTipsExercise( - SourceFunction source, SinkFunction> sink) { + public HourlyTipsExercise(Source source, Sink> sink) { this.source = source; this.sink = sink; @@ -55,8 +58,7 @@ public HourlyTipsExercise( */ public static void main(String[] args) throws Exception { - HourlyTipsExercise job = - new HourlyTipsExercise(new TaxiFareGenerator(), new PrintSinkFunction<>()); + HourlyTipsExercise job = new HourlyTipsExercise(new TaxiFareGenerator(), new PrintSink<>()); job.execute(); } @@ -68,12 +70,22 @@ public static void main(String[] args) throws Exception { * @throws Exception which occurs during job execution. */ public JobExecutionResult execute() throws Exception { - // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // start the data generator - DataStream fares = env.addSource(source); + DataStream fares = + env.fromSource( + source, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiFare taxiFare) { + return taxiFare.getEventTimeMillis(); + } + }, + "taxi fare"); // replace this with your solution if (true) { diff --git a/hourly-tips/src/solution/java/org/apache/flink/training/solutions/hourlytips/HourlyTipsSolution.java b/hourly-tips/src/solution/java/org/apache/flink/training/solutions/hourlytips/HourlyTipsSolution.java index 6ae443ff..7a6f6b94 100644 --- a/hourly-tips/src/solution/java/org/apache/flink/training/solutions/hourlytips/HourlyTipsSolution.java +++ b/hourly-tips/src/solution/java/org/apache/flink/training/solutions/hourlytips/HourlyTipsSolution.java @@ -20,34 +20,36 @@ import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.api.java.tuple.Tuple3; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; -import org.apache.flink.streaming.api.functions.sink.SinkFunction; -import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.functions.sink.PrintSink; +import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction; import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; -import org.apache.flink.streaming.api.windowing.time.Time; import org.apache.flink.streaming.api.windowing.windows.TimeWindow; import org.apache.flink.training.exercises.common.datatypes.TaxiFare; import org.apache.flink.training.exercises.common.sources.TaxiFareGenerator; import org.apache.flink.util.Collector; +import java.io.Serializable; +import java.time.Duration; + /** * Java reference implementation for the Hourly Tips exercise from the Flink training. 
* *
<p>
The task of the exercise is to first calculate the total tips collected by each driver, hour * by hour, and then from that stream, find the highest tip total in each hour. */ -public class HourlyTipsSolution { +public class HourlyTipsSolution implements Serializable { - private final SourceFunction source; - private final SinkFunction> sink; + private final Source source; + private final Sink> sink; /** Creates a job using the source and sink provided. */ - public HourlyTipsSolution( - SourceFunction source, SinkFunction> sink) { + public HourlyTipsSolution(Source source, Sink> sink) { this.source = source; this.sink = sink; @@ -60,8 +62,7 @@ public HourlyTipsSolution( */ public static void main(String[] args) throws Exception { - HourlyTipsSolution job = - new HourlyTipsSolution(new TaxiFareGenerator(), new PrintSinkFunction<>()); + HourlyTipsSolution job = new HourlyTipsSolution(new TaxiFareGenerator(), new PrintSink<>()); job.execute(); } @@ -79,7 +80,17 @@ public JobExecutionResult execute() throws Exception { // start the data generator and arrange for watermarking DataStream fares = - env.addSource(source) + env.fromSource( + source, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiFare taxiFare) { + return taxiFare.getEventTimeMillis(); + } + }, + "taxi fare") .assignTimestampsAndWatermarks( // taxi fares are in order WatermarkStrategy.forMonotonousTimestamps() @@ -89,12 +100,12 @@ public JobExecutionResult execute() throws Exception { // compute tips per hour for each driver DataStream> hourlyTips = fares.keyBy((TaxiFare fare) -> fare.driverId) - .window(TumblingEventTimeWindows.of(Time.hours(1))) + .window(TumblingEventTimeWindows.of(Duration.ofHours(1))) .process(new AddTips()); // find the driver with the highest sum of tips for each hour DataStream> hourlyMax = - hourlyTips.windowAll(TumblingEventTimeWindows.of(Time.hours(1))).maxBy(2); + hourlyTips.windowAll(TumblingEventTimeWindows.of(Duration.ofHours(1))).maxBy(2); /* You should explore how this alternative (commented out below) behaves. * In what ways is the same as, and different from, the solution above (using a windowAll)? 
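[Editor's note] For readers following the hourly-tips migration outside the patch, here is a minimal end-to-end sketch of the pipeline on the Flink 2.0 DataStream API. It assumes the training repo's `TaxiFare`, `TaxiFareGenerator` (which now implements the unified `Source` interface), the solution's `AddTips` window function, and `PrintSink`. Instead of the repo's `BoundedOutOfOrdernessTimestampExtractor` subclass it uses the stock `WatermarkStrategy.forBoundedOutOfOrderness` with a timestamp assigner, which expresses the same 10-second out-of-orderness bound in the way the Flink docs recommend; treat it as an illustrative sketch, not the repo's exact code.

```java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.PrintSink;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.training.exercises.common.datatypes.TaxiFare;
import org.apache.flink.training.exercises.common.sources.TaxiFareGenerator;
import org.apache.flink.training.solutions.hourlytips.HourlyTipsSolution;

import java.time.Duration;

public class HourlyTipsSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // addSource(SourceFunction) is gone in Flink 2.0; sources are attached with
        // fromSource(Source, WatermarkStrategy, name) instead.
        DataStream<TaxiFare> fares =
                env.fromSource(
                        new TaxiFareGenerator(),
                        WatermarkStrategy.<TaxiFare>forBoundedOutOfOrderness(
                                        Duration.ofSeconds(10))
                                .withTimestampAssigner(
                                        (fare, streamRecordTimestamp) ->
                                                fare.getEventTimeMillis()),
                        "taxi fare");

        // Total tips per driver per hour; note java.time.Duration in place of the
        // removed org.apache.flink.streaming.api.windowing.time.Time class.
        DataStream<Tuple3<Long, Long, Float>> hourlyTips =
                fares.keyBy((TaxiFare fare) -> fare.driverId)
                        .window(TumblingEventTimeWindows.of(Duration.ofHours(1)))
                        .process(new HourlyTipsSolution.AddTips());

        // Highest hourly total across all drivers, then the sink2-style sinkTo().
        hourlyTips
                .windowAll(TumblingEventTimeWindows.of(Duration.ofHours(1)))
                .maxBy(2)
                .sinkTo(new PrintSink<>());

        env.execute("Hourly Tips (sketch)");
    }
}
```

The shape of the job is unchanged from 1.x; the migration is confined to how the source is attached, how window sizes are expressed, and how the sink is wired in.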
@@ -102,7 +113,7 @@ public JobExecutionResult execute() throws Exception { // DataStream> hourlyMax = hourlyTips.keyBy(t -> t.f0).maxBy(2); - hourlyMax.addSink(sink); + hourlyMax.sinkTo(sink); // execute the transformation pipeline return env.execute("Hourly Tips"); diff --git a/hourly-tips/src/test/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsTest.java b/hourly-tips/src/test/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsTest.java index 1e379c35..75384888 100644 --- a/hourly-tips/src/test/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsTest.java +++ b/hourly-tips/src/test/java/org/apache/flink/training/exercises/hourlytips/HourlyTipsTest.java @@ -19,9 +19,9 @@ package org.apache.flink.training.exercises.hourlytips; import org.apache.flink.api.common.JobExecutionResult; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.api.java.tuple.Tuple3; import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.functions.source.SourceFunction; import org.apache.flink.test.util.MiniClusterWithClientResource; import org.apache.flink.training.exercises.common.datatypes.TaxiFare; import org.apache.flink.training.exercises.common.utils.DataGenerator; @@ -36,7 +36,8 @@ import java.time.Duration; import java.time.Instant; -import java.util.List; +import java.util.Collection; +import java.util.function.Supplier; import static org.assertj.core.api.AssertionsForInterfaceTypes.assertThat; @@ -58,11 +59,11 @@ public void testOneDriverOneTip() throws Exception { TaxiFare one = testFare(1, t(0), 1.0F); - ParallelTestSource source = new ParallelTestSource<>(one); + Supplier> sourceSupplier = () -> new ParallelTestSource<>(one); Tuple3 expected = Tuple3.of(t(60).toEpochMilli(), 1L, 1.0F); - assertThat(results(source)).containsExactly(expected); + assertThat(results(sourceSupplier)).containsExactly(expected); } @Test @@ -71,12 +72,13 @@ public void testTipsAreSummedByHour() throws Exception { TaxiFare fiveIn1 = testFare(1, t(15), 5.0F); TaxiFare tenIn2 = testFare(1, t(90), 10.0F); - ParallelTestSource source = new ParallelTestSource<>(oneIn1, fiveIn1, tenIn2); + Supplier> sourceSupplier = + () -> new ParallelTestSource<>(oneIn1, fiveIn1, tenIn2); Tuple3 hour1 = Tuple3.of(t(60).toEpochMilli(), 1L, 6.0F); Tuple3 hour2 = Tuple3.of(t(120).toEpochMilli(), 1L, 10.0F); - assertThat(results(source)).containsExactlyInAnyOrder(hour1, hour2); + assertThat(results(sourceSupplier)).containsExactlyInAnyOrder(hour1, hour2); } @Test @@ -90,21 +92,22 @@ public void testMaxAcrossDrivers() throws Exception { TaxiFare oneFor4In2 = testFare(4, t(80), 1.0F); TaxiFare tenFor5In2 = testFare(5, t(100), 10.0F); - ParallelTestSource source = - new ParallelTestSource<>( - oneFor1In1, - fiveFor1In1, - tenFor1In2, - twentyFor2In2, - zeroFor3In2, - zeroFor4In2, - oneFor4In2, - tenFor5In2); + Supplier> sourceSupplier = + () -> + new ParallelTestSource<>( + oneFor1In1, + fiveFor1In1, + tenFor1In2, + twentyFor2In2, + zeroFor3In2, + zeroFor4In2, + oneFor4In2, + tenFor5In2); Tuple3 hour1 = Tuple3.of(t(60).toEpochMilli(), 1L, 6.0F); Tuple3 hour2 = Tuple3.of(t(120).toEpochMilli(), 2L, 20.0F); - assertThat(results(source)).containsExactlyInAnyOrder(hour1, hour2); + assertThat(results(sourceSupplier)).containsExactlyInAnyOrder(hour1, hour2); } public Instant t(int minutes) { @@ -118,19 +121,21 @@ private TaxiFare testFare(long driverId, Instant startTime, float tip) { private ComposedPipeline> hourlyTipsPipeline() { 
ExecutablePipeline> exercise = - (source, sink) -> new HourlyTipsExercise(source, sink).execute(); + (sourceSupplier, sink) -> + new HourlyTipsExercise(sourceSupplier.get(), sink).execute(); ExecutablePipeline> solution = - (source, sink) -> new HourlyTipsSolution(source, sink).execute(); + (sourceSupplier, sink) -> + new HourlyTipsSolution(sourceSupplier.get(), sink).execute(); return new ComposedPipeline<>(exercise, solution); } - protected List> results(SourceFunction source) - throws Exception { + protected Collection> results( + Supplier> sourceSupplier) throws Exception { TestSink> sink = new TestSink<>(); - JobExecutionResult jobResult = hourlyTipsPipeline().execute(source, sink); - return sink.getResults(jobResult); + JobExecutionResult jobResult = hourlyTipsPipeline().execute(sourceSupplier, sink); + return sink.getResults(); } } diff --git a/long-ride-alerts/src/main/java/org/apache/flink/training/exercises/longrides/LongRidesExercise.java b/long-ride-alerts/src/main/java/org/apache/flink/training/exercises/longrides/LongRidesExercise.java index f555e5da..d82944dc 100644 --- a/long-ride-alerts/src/main/java/org/apache/flink/training/exercises/longrides/LongRidesExercise.java +++ b/long-ride-alerts/src/main/java/org/apache/flink/training/exercises/longrides/LongRidesExercise.java @@ -21,18 +21,20 @@ import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.configuration.Configuration; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.KeyedProcessFunction; -import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; -import org.apache.flink.streaming.api.functions.sink.SinkFunction; -import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.functions.sink.PrintSink; +import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; import org.apache.flink.training.exercises.common.sources.TaxiRideGenerator; import org.apache.flink.training.exercises.common.utils.MissingSolutionException; import org.apache.flink.util.Collector; +import java.io.Serializable; import java.time.Duration; /** @@ -43,12 +45,14 @@ * *
<p>
You should eventually clear any state you create. */ -public class LongRidesExercise { - private final SourceFunction source; - private final SinkFunction sink; +public class LongRidesExercise implements Serializable { + + private final Source source; + + private final Sink sink; /** Creates a job using the source and sink provided. */ - public LongRidesExercise(SourceFunction source, SinkFunction sink) { + public LongRidesExercise(Source source, Sink sink) { this.source = source; this.sink = sink; } @@ -65,7 +69,18 @@ public JobExecutionResult execute() throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // start the data generator - DataStream rides = env.addSource(source); + DataStream rides = + env.fromSource( + source, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiRide taxiRide) { + return taxiRide.getEventTimeMillis(); + } + }, + "taxi ride"); // the WatermarkStrategy specifies how to extract timestamps and generate watermarks WatermarkStrategy watermarkStrategy = @@ -77,7 +92,7 @@ public JobExecutionResult execute() throws Exception { rides.assignTimestampsAndWatermarks(watermarkStrategy) .keyBy(ride -> ride.rideId) .process(new AlertFunction()) - .addSink(sink); + .sinkTo(sink); // execute the pipeline and return the result return env.execute("Long Taxi Rides"); @@ -89,8 +104,7 @@ public JobExecutionResult execute() throws Exception { * @throws Exception which occurs during job execution. */ public static void main(String[] args) throws Exception { - LongRidesExercise job = - new LongRidesExercise(new TaxiRideGenerator(), new PrintSinkFunction<>()); + LongRidesExercise job = new LongRidesExercise(new TaxiRideGenerator(), new PrintSink<>()); job.execute(); } @@ -99,7 +113,7 @@ public static void main(String[] args) throws Exception { public static class AlertFunction extends KeyedProcessFunction { @Override - public void open(Configuration config) throws Exception { + public void open(OpenContext config) throws Exception { throw new MissingSolutionException(); } diff --git a/long-ride-alerts/src/solution/java/org/apache/flink/training/solutions/longrides/LongRidesSolution.java b/long-ride-alerts/src/solution/java/org/apache/flink/training/solutions/longrides/LongRidesSolution.java index e542817c..0759ed14 100644 --- a/long-ride-alerts/src/solution/java/org/apache/flink/training/solutions/longrides/LongRidesSolution.java +++ b/long-ride-alerts/src/solution/java/org/apache/flink/training/solutions/longrides/LongRidesSolution.java @@ -21,19 +21,21 @@ import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.common.state.ValueState; import org.apache.flink.api.common.state.ValueStateDescriptor; -import org.apache.flink.configuration.Configuration; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.KeyedProcessFunction; -import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; -import org.apache.flink.streaming.api.functions.sink.SinkFunction; -import 
org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.functions.sink.PrintSink; +import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; import org.apache.flink.training.exercises.common.sources.TaxiRideGenerator; import org.apache.flink.util.Collector; +import java.io.Serializable; import java.time.Duration; /** @@ -44,13 +46,13 @@ * *
<p>
You should eventually clear any state you create. */ -public class LongRidesSolution { +public class LongRidesSolution implements Serializable { - private final SourceFunction source; - private final SinkFunction sink; + private final Source source; + private final Sink sink; /** Creates a job using the source and sink provided. */ - public LongRidesSolution(SourceFunction source, SinkFunction sink) { + public LongRidesSolution(Source source, Sink sink) { this.source = source; this.sink = sink; @@ -68,7 +70,18 @@ public JobExecutionResult execute() throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // start the data generator - DataStream rides = env.addSource(source); + DataStream rides = + env.fromSource( + source, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiRide taxiRide) { + return taxiRide.getEventTimeMillis(); + } + }, + "taxi ride"); // the WatermarkStrategy specifies how to extract timestamps and generate watermarks WatermarkStrategy watermarkStrategy = @@ -80,7 +93,7 @@ public JobExecutionResult execute() throws Exception { rides.assignTimestampsAndWatermarks(watermarkStrategy) .keyBy(ride -> ride.rideId) .process(new AlertFunction()) - .addSink(sink); + .sinkTo(sink); // execute the pipeline and return the result return env.execute("Long Taxi Rides"); @@ -92,8 +105,7 @@ public JobExecutionResult execute() throws Exception { * @throws Exception which occurs during job execution. */ public static void main(String[] args) throws Exception { - LongRidesSolution job = - new LongRidesSolution(new TaxiRideGenerator(), new PrintSinkFunction<>()); + LongRidesSolution job = new LongRidesSolution(new TaxiRideGenerator(), new PrintSink<>()); job.execute(); } @@ -104,7 +116,7 @@ public static class AlertFunction extends KeyedProcessFunction rideState; @Override - public void open(Configuration config) { + public void open(OpenContext config) { ValueStateDescriptor rideStateDescriptor = new ValueStateDescriptor<>("ride event", TaxiRide.class); rideState = getRuntimeContext().getState(rideStateDescriptor); diff --git a/long-ride-alerts/src/test/java/org/apache/flink/training/exercises/longrides/LongRidesIntegrationTest.java b/long-ride-alerts/src/test/java/org/apache/flink/training/exercises/longrides/LongRidesIntegrationTest.java index 792a8dd9..ccb9cc73 100644 --- a/long-ride-alerts/src/test/java/org/apache/flink/training/exercises/longrides/LongRidesIntegrationTest.java +++ b/long-ride-alerts/src/test/java/org/apache/flink/training/exercises/longrides/LongRidesIntegrationTest.java @@ -19,8 +19,8 @@ package org.apache.flink.training.exercises.longrides; import org.apache.flink.api.common.JobExecutionResult; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.functions.source.SourceFunction; import org.apache.flink.test.util.MiniClusterWithClientResource; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; import org.apache.flink.training.exercises.testing.ComposedPipeline; @@ -33,6 +33,7 @@ import org.junit.Test; import java.util.List; +import java.util.function.Supplier; import static org.assertj.core.api.AssertionsForInterfaceTypes.assertThat; @@ -55,10 +56,10 @@ public void shortRide() throws Exception { TaxiRide rideStarted = startRide(1, BEGINNING); TaxiRide endedOneMinLater = endRide(rideStarted, ONE_MINUTE_LATER); - 
ParallelTestSource source = - new ParallelTestSource<>(rideStarted, endedOneMinLater); + Supplier> sourceSupplier = + () -> new ParallelTestSource(rideStarted, endedOneMinLater); - assertThat(results(source)).isEmpty(); + assertThat(results(sourceSupplier)).isEmpty(); } @Test @@ -66,10 +67,10 @@ public void shortRideOutOfOrder() throws Exception { TaxiRide rideStarted = startRide(1, BEGINNING); TaxiRide endedOneMinLater = endRide(rideStarted, ONE_MINUTE_LATER); - ParallelTestSource source = - new ParallelTestSource<>(endedOneMinLater, rideStarted); + Supplier> sourceSupplier = + () -> new ParallelTestSource(endedOneMinLater, rideStarted); - assertThat(results(source)).isEmpty(); + assertThat(results(sourceSupplier)).isEmpty(); } @Test @@ -82,32 +83,32 @@ public void multipleRides() throws Exception { TaxiRide twoHourRideEnded = endRide(twoHourRide, BEGINNING); TaxiRide otherLongRideEnded = endRide(otherLongRide, THREE_HOURS_LATER); - ParallelTestSource source = - new ParallelTestSource<>( - longRideWithoutEnd, - twoHourRide, - otherLongRide, - shortRide, - shortRideEnded, - twoHourRideEnded, - otherLongRideEnded); - - assertThat(results(source)) + Supplier> sourceSupplier = + () -> + new ParallelTestSource( + longRideWithoutEnd, + twoHourRide, + otherLongRide, + shortRide, + shortRideEnded, + twoHourRideEnded, + otherLongRideEnded); + + assertThat(results(sourceSupplier)) .containsExactlyInAnyOrder(longRideWithoutEnd.rideId, otherLongRide.rideId); } private static final ExecutablePipeline exercise = - (source, sink) -> new LongRidesExercise(source, sink).execute(); + (sourceSupplier, sink) -> new LongRidesExercise(sourceSupplier.get(), sink).execute(); private static final ExecutablePipeline solution = - (source, sink) -> new LongRidesSolution(source, sink).execute(); - - protected List results(SourceFunction source) throws Exception { + (sourceSupplier, sink) -> new LongRidesSolution(sourceSupplier.get(), sink).execute(); + protected List results(Supplier> sourceSupplier) throws Exception { TestSink sink = new TestSink<>(); ComposedPipeline longRidesPipeline = new ComposedPipeline<>(exercise, solution); - JobExecutionResult jobResult = longRidesPipeline.execute(source, sink); - return sink.getResults(jobResult); + JobExecutionResult jobResult = longRidesPipeline.execute(sourceSupplier, sink); + return sink.getResults().stream().toList(); } } diff --git a/ride-cleansing/src/main/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingExercise.java b/ride-cleansing/src/main/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingExercise.java index 1f07312f..fbd5a36a 100644 --- a/ride-cleansing/src/main/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingExercise.java +++ b/ride-cleansing/src/main/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingExercise.java @@ -20,28 +20,31 @@ import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.common.functions.FilterFunction; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; -import org.apache.flink.streaming.api.functions.sink.SinkFunction; -import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.functions.sink.PrintSink; +import 
org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; import org.apache.flink.training.exercises.common.sources.TaxiRideGenerator; import org.apache.flink.training.exercises.common.utils.MissingSolutionException; +import java.io.Serializable; +import java.time.Duration; + /** * The Ride Cleansing exercise from the Flink training. * *
<p>
The task of this exercise is to filter a data stream of taxi ride records to keep only rides * that both start and end within New York City. The resulting stream should be printed. */ -public class RideCleansingExercise { +public class RideCleansingExercise implements Serializable { - private final SourceFunction source; - private final SinkFunction sink; + private final Source source; + private final Sink sink; /** Creates a job using the source and sink provided. */ - public RideCleansingExercise(SourceFunction source, SinkFunction sink) { - + public RideCleansingExercise(Source source, Sink sink) { this.source = source; this.sink = sink; } @@ -53,7 +56,7 @@ public RideCleansingExercise(SourceFunction source, SinkFunction()); + new RideCleansingExercise(new TaxiRideGenerator(), new PrintSink<>()); job.execute(); } @@ -70,7 +73,19 @@ public JobExecutionResult execute() throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // set up the pipeline - env.addSource(source).filter(new NYCFilter()).addSink(sink); + env.fromSource( + source, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiRide taxiRide) { + return taxiRide.getEventTimeMillis(); + } + }, + "taxi ride") + .filter(new NYCFilter()) + .sinkTo(sink); // run the pipeline and return the result return env.execute("Taxi Ride Cleansing"); diff --git a/ride-cleansing/src/solution/java/org/apache/flink/training/solutions/ridecleansing/RideCleansingSolution.java b/ride-cleansing/src/solution/java/org/apache/flink/training/solutions/ridecleansing/RideCleansingSolution.java index 46786268..9e740132 100644 --- a/ride-cleansing/src/solution/java/org/apache/flink/training/solutions/ridecleansing/RideCleansingSolution.java +++ b/ride-cleansing/src/solution/java/org/apache/flink/training/solutions/ridecleansing/RideCleansingSolution.java @@ -20,13 +20,18 @@ import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.common.functions.FilterFunction; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; -import org.apache.flink.streaming.api.functions.sink.SinkFunction; -import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.functions.sink.PrintSink; +import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; import org.apache.flink.training.exercises.common.sources.TaxiRideGenerator; import org.apache.flink.training.exercises.common.utils.GeoUtils; +import org.apache.flink.training.exercises.ridecleansing.RideCleansingExercise; + +import java.io.Serializable; +import java.time.Duration; /** * Solution to the Ride Cleansing exercise from the Flink training. @@ -34,14 +39,13 @@ *
<p>
The task of this exercise is to filter a data stream of taxi ride records to keep only rides * that both start and end within New York City. The resulting stream should be printed. */ -public class RideCleansingSolution { +public class RideCleansingSolution implements Serializable { - private final SourceFunction source; - private final SinkFunction sink; + private final Source source; + private final Sink sink; /** Creates a job using the source and sink provided. */ - public RideCleansingSolution(SourceFunction source, SinkFunction sink) { - + public RideCleansingSolution(Source source, Sink sink) { this.source = source; this.sink = sink; } @@ -53,7 +57,7 @@ public RideCleansingSolution(SourceFunction source, SinkFunction()); + new RideCleansingSolution(new TaxiRideGenerator(), new PrintSink<>()); job.execute(); } @@ -70,7 +74,19 @@ public JobExecutionResult execute() throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // set up the pipeline - env.addSource(source).filter(new NYCFilter()).addSink(sink); + env.fromSource( + source, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiRide taxiRide) { + return taxiRide.getEventTimeMillis(); + } + }, + "taxi ride") + .filter(new RideCleansingExercise.NYCFilter()) + .sinkTo(sink); // run the pipeline and return the result return env.execute("Taxi Ride Cleansing"); diff --git a/ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingIntegrationTest.java b/ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingIntegrationTest.java index 9b86a9a2..57e2243a 100644 --- a/ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingIntegrationTest.java +++ b/ride-cleansing/src/test/java/org/apache/flink/training/exercises/ridecleansing/RideCleansingIntegrationTest.java @@ -19,9 +19,11 @@ package org.apache.flink.training.exercises.ridecleansing; import org.apache.flink.api.common.JobExecutionResult; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; import org.apache.flink.test.util.MiniClusterWithClientResource; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; +import org.apache.flink.training.exercises.common.utils.MissingSolutionException; import org.apache.flink.training.exercises.testing.ComposedPipeline; import org.apache.flink.training.exercises.testing.ExecutablePipeline; import org.apache.flink.training.exercises.testing.ParallelTestSource; @@ -31,6 +33,8 @@ import org.junit.ClassRule; import org.junit.Test; +import java.util.function.Supplier; + import static org.assertj.core.api.AssertionsForInterfaceTypes.assertThat; public class RideCleansingIntegrationTest extends RideCleansingTestBase { @@ -48,26 +52,36 @@ public class RideCleansingIntegrationTest extends RideCleansingTestBase { @Test public void testAMixtureOfLocations() throws Exception { + try { + TaxiRide toThePole = testRide(-73.9947F, 40.750626F, 0, 90); + TaxiRide fromThePole = testRide(0, 90, -73.9947F, 40.750626F); + TaxiRide atPennStation = testRide(-73.9947F, 40.750626F, -73.9947F, 40.750626F); + TaxiRide atNorthPole = testRide(0, 90, 0, 90); - TaxiRide toThePole = testRide(-73.9947F, 40.750626F, 0, 90); - TaxiRide fromThePole = testRide(0, 90, -73.9947F, 40.750626F); - TaxiRide atPennStation = testRide(-73.9947F, 40.750626F, -73.9947F, 40.750626F); 
- TaxiRide atNorthPole = testRide(0, 90, 0, 90); - - ParallelTestSource source = - new ParallelTestSource<>(toThePole, fromThePole, atPennStation, atNorthPole); - TestSink sink = new TestSink<>(); + Supplier> sourceSupplier = + () -> + new ParallelTestSource( + toThePole, fromThePole, atPennStation, atNorthPole); + TestSink sink = new TestSink<>(); - JobExecutionResult jobResult = rideCleansingPipeline().execute(source, sink); - assertThat(sink.getResults(jobResult)).containsExactly(atPennStation); + JobExecutionResult jobResult = rideCleansingPipeline().execute(sourceSupplier, sink); + jobResult.getJobExecutionResult().getJobExecutionResult(); + assertThat(sink.getResults()).containsExactly(atPennStation); + } catch (Exception e) { + if (!MissingSolutionException.ultimateCauseIsMissingSolution(e)) { + throw e; + } + } } protected ComposedPipeline rideCleansingPipeline() { ExecutablePipeline exercise = - (source, sink) -> (new RideCleansingExercise(source, sink)).execute(); + (sourceSupplier, sink) -> + (new RideCleansingExercise(sourceSupplier.get(), sink)).execute(); ExecutablePipeline solution = - (source, sink) -> (new RideCleansingSolution(source, sink)).execute(); + (sourceSupplier, sink) -> + (new RideCleansingSolution(sourceSupplier.get(), sink)).execute(); return new ComposedPipeline<>(exercise, solution); } diff --git a/rides-and-fares/src/main/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresExercise.java b/rides-and-fares/src/main/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresExercise.java index 0662dfc0..650aa4cc 100644 --- a/rides-and-fares/src/main/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresExercise.java +++ b/rides-and-fares/src/main/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresExercise.java @@ -19,13 +19,15 @@ package org.apache.flink.training.exercises.ridesandfares; import org.apache.flink.api.common.JobExecutionResult; -import org.apache.flink.configuration.Configuration; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.KeyedStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction; -import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; -import org.apache.flink.streaming.api.functions.sink.SinkFunction; -import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.functions.sink.PrintSink; +import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.training.exercises.common.datatypes.RideAndFare; import org.apache.flink.training.exercises.common.datatypes.TaxiFare; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; @@ -34,22 +36,25 @@ import org.apache.flink.training.exercises.common.utils.MissingSolutionException; import org.apache.flink.util.Collector; +import java.io.Serializable; +import java.time.Duration; + /** * The Stateful Enrichment exercise from the Flink training. * *
<p>
The goal for this exercise is to enrich TaxiRides with fare information. */ -public class RidesAndFaresExercise { +public class RidesAndFaresExercise implements Serializable { - private final SourceFunction rideSource; - private final SourceFunction fareSource; - private final SinkFunction sink; + private final Source rideSource; + private final Source fareSource; + private final Sink sink; /** Creates a job using the sources and sink provided. */ public RidesAndFaresExercise( - SourceFunction rideSource, - SourceFunction fareSource, - SinkFunction sink) { + Source rideSource, + Source fareSource, + Sink sink) { this.rideSource = rideSource; this.fareSource = fareSource; @@ -67,14 +72,39 @@ public JobExecutionResult execute() throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // A stream of taxi ride START events, keyed by rideId. + DataStream rides = - env.addSource(rideSource).filter(ride -> ride.isStart).keyBy(ride -> ride.rideId); + env.fromSource( + rideSource, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiRide taxiRide) { + return taxiRide.getEventTimeMillis(); + } + }, + "taxi ride") + .filter(ride -> ride.isStart) + .keyBy(ride -> ride.rideId); // A stream of taxi fare events, also keyed by rideId. - DataStream fares = env.addSource(fareSource).keyBy(fare -> fare.rideId); + KeyedStream fares = + env.fromSource( + fareSource, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiFare taxiFare) { + return taxiFare.getEventTimeMillis(); + } + }, + "taxi fare") + .keyBy(fare -> fare.rideId); // Create the pipeline. - rides.connect(fares).flatMap(new EnrichmentFunction()).addSink(sink); + rides.connect(fares).flatMap(new EnrichmentFunction()).sinkTo(sink); // Execute the pipeline and return the result. 
return env.execute("Join Rides with Fares"); @@ -89,9 +119,7 @@ public static void main(String[] args) throws Exception { RidesAndFaresExercise job = new RidesAndFaresExercise( - new TaxiRideGenerator(), - new TaxiFareGenerator(), - new PrintSinkFunction<>()); + new TaxiRideGenerator(), new TaxiFareGenerator(), new PrintSink<>()); job.execute(); } @@ -100,7 +128,7 @@ public static class EnrichmentFunction extends RichCoFlatMapFunction { @Override - public void open(Configuration config) throws Exception { + public void open(OpenContext config) throws Exception { throw new MissingSolutionException(); } diff --git a/rides-and-fares/src/solution/java/org/apache/flink/training/solutions/ridesandfares/RidesAndFaresSolution.java b/rides-and-fares/src/solution/java/org/apache/flink/training/solutions/ridesandfares/RidesAndFaresSolution.java index 3cdf17a6..e28349a2 100644 --- a/rides-and-fares/src/solution/java/org/apache/flink/training/solutions/ridesandfares/RidesAndFaresSolution.java +++ b/rides-and-fares/src/solution/java/org/apache/flink/training/solutions/ridesandfares/RidesAndFaresSolution.java @@ -19,15 +19,18 @@ package org.apache.flink.training.solutions.ridesandfares; import org.apache.flink.api.common.JobExecutionResult; +import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.common.state.ValueState; import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.KeyedStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction; -import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; -import org.apache.flink.streaming.api.functions.sink.SinkFunction; -import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.functions.sink.PrintSink; +import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.training.exercises.common.datatypes.RideAndFare; import org.apache.flink.training.exercises.common.datatypes.TaxiFare; import org.apache.flink.training.exercises.common.datatypes.TaxiRide; @@ -35,22 +38,25 @@ import org.apache.flink.training.exercises.common.sources.TaxiRideGenerator; import org.apache.flink.util.Collector; +import java.io.Serializable; +import java.time.Duration; + /** * Java reference implementation for the Stateful Enrichment exercise from the Flink training. * *
<p>
The goal for this exercise is to enrich TaxiRides with fare information. */ -public class RidesAndFaresSolution { +public class RidesAndFaresSolution implements Serializable { - private final SourceFunction rideSource; - private final SourceFunction fareSource; - private final SinkFunction sink; + private final Source rideSource; + private final Source fareSource; + private final Sink sink; /** Creates a job using the sources and sink provided. */ public RidesAndFaresSolution( - SourceFunction rideSource, - SourceFunction fareSource, - SinkFunction sink) { + Source rideSource, + Source fareSource, + Sink sink) { this.rideSource = rideSource; this.fareSource = fareSource; @@ -67,18 +73,43 @@ public RidesAndFaresSolution( public JobExecutionResult execute(StreamExecutionEnvironment env) throws Exception { // A stream of taxi ride START events, keyed by rideId. + DataStream rides = - env.addSource(rideSource).filter(ride -> ride.isStart).keyBy(ride -> ride.rideId); + env.fromSource( + rideSource, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiRide taxiRide) { + return taxiRide.getEventTimeMillis(); + } + }, + "taxi ride") + .filter(ride -> ride.isStart) + .keyBy(ride -> ride.rideId); // A stream of taxi fare events, also keyed by rideId. - DataStream fares = env.addSource(fareSource).keyBy(fare -> fare.rideId); + KeyedStream fares = + env.fromSource( + fareSource, + new BoundedOutOfOrdernessTimestampExtractor( + Duration.ofSeconds(10)) { + + @Override + public long extractTimestamp(TaxiFare taxiFare) { + return taxiFare.getEventTimeMillis(); + } + }, + "taxi fare") + .keyBy(fare -> fare.rideId); // Create the pipeline. rides.connect(fares) .flatMap(new EnrichmentFunction()) .uid("enrichment") // uid for this operator's state .name("enrichment") // name for this operator in the web UI - .addSink(sink); + .sinkTo(sink); // Execute the pipeline and return the result. return env.execute("Join Rides with Fares"); @@ -100,9 +131,7 @@ public static void main(String[] args) throws Exception { RidesAndFaresSolution job = new RidesAndFaresSolution( - new TaxiRideGenerator(), - new TaxiFareGenerator(), - new PrintSinkFunction<>()); + new TaxiRideGenerator(), new TaxiFareGenerator(), new PrintSink<>()); // Setting up checkpointing so that the state can be explored with the State Processor API. 
// Generally it's better to separate configuration settings from the code, @@ -127,7 +156,7 @@ public static class EnrichmentFunction private ValueState fareState; @Override - public void open(Configuration config) { + public void open(OpenContext config) { rideState = getRuntimeContext() diff --git a/rides-and-fares/src/test/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresIntegrationTest.java b/rides-and-fares/src/test/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresIntegrationTest.java index e8d523ab..1f303239 100644 --- a/rides-and-fares/src/test/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresIntegrationTest.java +++ b/rides-and-fares/src/test/java/org/apache/flink/training/exercises/ridesandfares/RidesAndFaresIntegrationTest.java @@ -68,7 +68,7 @@ public void testSeveralRidesAndFaresMixedTogether() throws Exception { TestSink sink = new TestSink<>(); JobExecutionResult jobResult = ridesAndFaresPipeline().execute(rides, fares, sink); - assertThat(sink.getResults(jobResult)) + assertThat(sink.getResults()) .containsExactlyInAnyOrder( new RideAndFare(ride1, fare1), new RideAndFare(ride2, fare2),
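[Editor's note] Beyond sources and sinks, the patch migrates every rich function from `open(Configuration)` to the new `open(OpenContext)` signature. As a standalone illustration of that pattern (not code from this repo), the sketch below mirrors what the LongRides `AlertFunction` does: keyed state initialized in `open`, an event-time timer registered per key, and cleanup in `onTimer`. The `Event` type, its field names, and the two-hour constant are hypothetical stand-ins.

```java
import org.apache.flink.api.common.functions.OpenContext;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.time.Duration;

public class TimeoutAlertSketch {

    /** Hypothetical event type; only the fields the sketch needs. */
    public static class Event {
        public long key;
        public long eventTimeMillis;
    }

    /** Emits the key of any event not matched by a second event within two hours. */
    public static class TimeoutAlert extends KeyedProcessFunction<Long, Event, Long> {

        private static final long TWO_HOURS = Duration.ofHours(2).toMillis();

        private transient ValueState<Event> pendingEvent;

        @Override
        public void open(OpenContext ctx) {
            // Flink 2.0 passes an OpenContext here instead of a Configuration;
            // state registration via the runtime context is unchanged.
            pendingEvent =
                    getRuntimeContext()
                            .getState(new ValueStateDescriptor<>("pending event", Event.class));
        }

        @Override
        public void processElement(Event event, Context ctx, Collector<Long> out)
                throws Exception {
            Event pending = pendingEvent.value();
            if (pending == null) {
                // First event for this key: remember it and set a two-hour timeout.
                pendingEvent.update(event);
                ctx.timerService().registerEventTimeTimer(event.eventTimeMillis + TWO_HOURS);
            } else {
                // Matched in time: cancel the timeout and clear the state.
                ctx.timerService().deleteEventTimeTimer(pending.eventTimeMillis + TWO_HOURS);
                pendingEvent.clear();
            }
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<Long> out) {
            // The timeout fired before a match arrived: raise an alert and clean up.
            out.collect(ctx.getCurrentKey());
            pendingEvent.clear();
        }
    }
}
```

Note that only the `open` signature changes in this migration; `processElement`, `onTimer`, and the state API are the same as in Flink 1.x, which is why the patch can leave the body of each rich function untouched.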