diff --git a/.gitignore b/.gitignore index 0f4acf7..981889f 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,6 @@ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* -.idea build target .classpath* @@ -48,3 +47,21 @@ resources/*.xml *.so *.o .vscode +.idea + +data/ +tmp/ + +!requirements.txt +freshness*.png +rate*.png +resulti7i/ +result1k2_2/ +result_lance +cluster*.png +pixels-sink.out +/*.png +/*.csv +*logs/*.out +*logs/*.png +*logs/**/*.png diff --git a/README.md b/README.md index 0871799..95fae5f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,50 @@ # pixels-sink -The data sink for Pixels. It ingests data into Pixels in a streaming manner. -> This project is under development. +Pixels Sink is the data sink service for Pixels. It ingests debezium-format change events from multiple sources (Debezium engine, Kafka, or storage files), converts them into Pixels events, and writes them to a configured sink (Retina, CSV, Proto, Flink, or none). + +This project is under active development. 
+ +## Docs + +- Architecture and pipeline overview: [docs/overview.md](docs/overview.md) +- Transaction handling: [docs/transaction.md](docs/transaction.md) +- Usage guide: [docs/usage.md](docs/usage.md) +- Configuration reference: [docs/configuration.md](docs/configuration.md) +- Local dev environment (Docker): [develop/README.md](develop/README.md) + +## Quick Start + +### Requirements +- [Pixels](https://github.com/pixelsdb/pixels) +- Java 17 +- Maven 3.9+ +- Source and sink dependencies based on your configuration + - Kafka broker if `sink.datasource=kafka` + - Debezium + database access if `sink.datasource=engine` + - Retina service if `sink.mode=retina` + - Trino if freshness checking is enabled and uses Trino + +### Build +```bash +mvn -q -DskipTests package +``` + +### Run (Script) +```bash +./pixels-sink [config.properties] +``` + +The script reads `conf/jvm.conf` and uses a properties file configured inside `./pixels-sink` by default. If you pass a path, it overrides the default. + +### Run (IDE) +- Main class: `io.pixelsdb.pixels.sink.PixelsSinkApp` +- Program arguments: `-c conf/pixels-sink.aws.properties` + +## Configuration +- Sample configs are in `conf/`. +- Start with `conf/pixels-sink.aws.properties` and adjust. +- See [docs/configuration.md](docs/configuration.md) for a full key reference and guidance. + +## Monitoring +- Enable Prometheus metrics with `sink.monitor.enable=true`. +- Metrics endpoint listens on `sink.monitor.port` (default `9464`). 
diff --git a/conf/jvm.conf b/conf/jvm.conf new file mode 100644 index 0000000..fcc3742 --- /dev/null +++ b/conf/jvm.conf @@ -0,0 +1,26 @@ +-server +-XX:+AlwaysPreTouch +-Dfile.encoding=UTF-8 +-Duser.timezone=UTC + +-Xms8g +-Xmx60g + +-XX:+UseG1GC +-XX:MaxGCPauseMillis=200 +-XX:InitiatingHeapOccupancyPercent=35 +-XX:+ParallelRefProcEnabled +-XX:+UnlockExperimentalVMOptions +-XX:+TrustFinalNonStaticFields +-XX:+DisableExplicitGC + +-Xss512k + + +-XX:+HeapDumpOnOutOfMemoryError +-XX:HeapDumpPath=/var/log/java/java_heapdump.hprof +-XX:+ExitOnOutOfMemoryError + +-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:10086 +--add-opens=java.base/sun.nio.ch=ALL-UNNAMED +--add-opens=java.base/java.nio=ALL-UNNAMED diff --git a/conf/pixels-sink.aws.properties b/conf/pixels-sink.aws.properties new file mode 100644 index 0000000..cd9f98d --- /dev/null +++ b/conf/pixels-sink.aws.properties @@ -0,0 +1,118 @@ +# engine | kafka | storage +sink.datasource=storage +# -1 means no limit, Only implement in retina sink mode yet: +sink.datasource.rate.limit=200000 +sink.monitor.report.file=/home/ubuntu/pixels-sink/result1k2_feb/rate_8192tile_3.csv +sink.monitor.freshness.file=/home/ubuntu/pixels-sink/result1k2_feb/fresh_8192tile_3.csv +# rate limiter implement: guava or semaphore +sink.datasource.rate.limit.type=guava +# Sink Config: retina | csv | proto | flink | none +sink.mode=retina +sink.retina.client=1 +# in flight rpc +sink.retina.rpc.limit=100000 +sink.retina.trans.limit=10000000 +sink.retina.log.queue=false +sink.retina.trans.request.batch=true +sink.commit.method=async +sink.retina.trans.request.batch.size=100 +## batch or single or record, batch is recommend. 
record is faster, but doesn't have ACID feature +sink.trans.mode=batch +sink.monitor.report.enable=true +# trino for freshness query +trino.url=jdbc:trino://realtime-pixels-coordinator:8080/pixels/pixels_bench +# trino.url=jdbc:trino://realtime-pixels-coordinator:8080/pixels/pixels_bench_sf10x +trino.user=pixels +trino.password=password +trino.parallel=1 +# row or txn or embed +sink.monitor.freshness.level=embed +sink.monitor.freshness.verbose=true +sink.monitor.freshness.embed.warmup=10 +sink.monitor.freshness.embed.static=false +sink.monitor.freshness.embed.snapshot=true +sink.monitor.freshness.embed.tablelist=loantrans +sink.monitor.freshness.embed.delay=10 +# sink.monitor.freshness.embed.tablelist=savingaccount +sink.monitor.freshness.timestamp=true +sink.storage.loop=true +# Kafka Config +bootstrap.servers=realtime-kafka-2:29092 +group.id=3078 +auto.offset.reset=earliest +key.deserializer=org.apache.kafka.common.serialization.StringDeserializer +#value.deserializer=io.pixelsdb.pixels.writer.deserializer.RowChangeEventAvroDeserializer +value.deserializer=io.pixelsdb.pixels.sink.event.deserializer.RowChangeEventJsonDeserializer +# Topic & Database Config +topic.prefix=postgresql.oltp_server +consumer.capture_database=pixels_bench_sf1x +consumer.include_tables= +sink.csv.path=./data +sink.csv.enable_header=false +## Retina Config +sink.retina.embedded=false +# stub or stream +sink.retina.mode=stream +#writer.retina.mode=stub +sink.remote.host=localhost +sink.remote.port=29422 +sink.timeout.ms=5000 +sink.flush.interval.ms=5 +sink.flush.batch.size=100 +sink.max.retries=3 +## writer commit +# sync or async +sink.commit.batch.size=100 +sink.commit.batch.worker=4 +sink.commit.batch.delay=10 +## Proto Config +sink.proto.dir=file:///home/ubuntu/disk2/hybench/ +sink.proto.data=hybench1000_4 +# sink.proto.data=hybench100_3 +# sink.proto.data=hybench100_4 +# sink.proto.data=hybench10_10 +sink.proto.maxRecords=100000 +## Flink Config +sink.flink.server.port=9091 +## Schema 
Registry +sink.registry.url=http://localhost:8080/apis/registry/v2 +# Transaction Config +transaction.topic.suffix=transaction +#transaction.topic.value.deserializer=io.pixelsdb.pixels.writer.deserializer.TransactionAvroMessageDeserializer +transaction.topic.value.deserializer=io.pixelsdb.pixels.sink.event.deserializer.TransactionJsonMessageDeserializer +sink.trans.batch.size=100 + +# Sink Metrics +sink.monitor.enable=true +sink.monitor.port=9464 +sink.monitor.report.interval=1000 +sink.monitor.freshness.interval=1000 + +# Interact with other rpc +sink.rpc.enable=true +sink.rpc.mock.delay=20 +# debezium engine config +debezium.name=testEngine +debezium.connector.class=io.debezium.connector.postgresql.PostgresConnector +debezium.provide.transaction.metadata=true +debezium.offset.storage=org.apache.kafka.connect.storage.FileOffsetBackingStore +debezium.offset.storage.file.filename=/tmp/offsets.dat +debezium.offset.flush.interval.ms=60000 +debezium.schema.history.internal=io.debezium.storage.file.history.FileSchemaHistory +debezium.schema.history.internal.file.filename=/tmp/schemahistory.dat +debezium.database.hostname=realtime-pg-2 +debezium.database.port=5432 +debezium.database.user=pixels +debezium.database.password=pixels_realtime_crud +debezium.database.dbname=pixels_bench +debezium.plugin.name=pgoutput +debezium.database.server.id=1 +debezium.schema.include.list=public +debezium.snapshot.mode=never +debezium.key.converter=org.apache.kafka.connect.json.JsonConverter +debezium.value.converter=org.apache.kafka.connect.json.JsonConverter +debezium.topic.prefix=postgresql.oltp_server +debezium.transforms=topicRouting +debezium.transforms.topicRouting.type=org.apache.kafka.connect.transforms.RegexRouter +debezium.transforms.topicRouting.regex=postgresql\\.oltp_server\\.public\\.(.*) +debezium.transforms.topicRouting.replacement=postgresql.oltp_server.pixels_bench_sf10x.$1 diff --git a/conf/pixels-sink.ch.properties b/conf/pixels-sink.ch.properties new file mode 100644 
index 0000000..c9a4fbe --- /dev/null +++ b/conf/pixels-sink.ch.properties @@ -0,0 +1,115 @@ +# engine | kafka | storage +sink.datasource=storage +sink.mode=retina +#sink.datasource=engine +#sink.mode=proto +sink.proto.data=CH10K_2 +# -1 means no limit, Only implement in retina sink mode yet: +sink.datasource.rate.limit=80000 +sink.monitor.report.file=/home/ubuntu/pixels-sink/result_ch/rate_100K.csv +sink.monitor.freshness.file=/home/ubuntu/pixels-sink/result_ch/fresh_100K.csv +# rate limiter implement: guava or semaphore +sink.datasource.rate.limit.type=guava +sink.retina.client=1 +# in flight rpc +sink.retina.rpc.limit=1000000 +sink.retina.trans.limit=1000000 +sink.retina.log.queue=false +sink.retina.trans.request.batch=false +sink.commit.method=sync +sink.retina.trans.request.batch.size=1000 +## batch or single or record, batch is recommend. record is faster, but doesn't have ACID feature +sink.trans.mode=batch +# sink.trans.mode=record +sink.monitor.report.enable=true +# trino for freshness query +trino.url=jdbc:trino://realtime-pixels-coordinator:8080/pixels/pixels_bench +trino.user=pixels +trino.password=password +trino.parallel=1 +# row or txn or embed +sink.monitor.freshness.level=embed +sink.monitor.freshness.verbose=true +sink.monitor.freshness.embed.warmup=10 +sink.monitor.freshness.embed.static=false +sink.monitor.freshness.embed.snapshot=true +sink.monitor.freshness.embed.tablelist=stock +sink.monitor.freshness.embed.delay=10 +sink.monitor.freshness.timestamp=true +sink.storage.loop=true +# Kafka Config +bootstrap.servers=realtime-kafka-2:29092 +group.id=3078 +auto.offset.reset=earliest +key.deserializer=org.apache.kafka.common.serialization.StringDeserializer +#value.deserializer=io.pixelsdb.pixels.writer.deserializer.RowChangeEventAvroDeserializer +value.deserializer=io.pixelsdb.pixels.sink.event.deserializer.RowChangeEventJsonDeserializer +# Topic & Database Config +topic.prefix=postgresql.oltp_server +consumer.capture_database=pixels_bench_sf1x 
+consumer.include_tables= +sink.csv.path=./data +sink.csv.enable_header=false +## Retina Config +sink.retina.embedded=false +# stub or stream +sink.retina.mode=stream +#writer.retina.mode=stub +sink.remote.host=localhost +sink.remote.port=29422 +sink.timeout.ms=5000 +sink.flush.interval.ms=10 +sink.flush.batch.size=200 +sink.max.retries=3 +## writer commit +# sync or async +sink.commit.batch.size=1000 +sink.commit.batch.worker=8 +sink.commit.batch.delay=1000 +## Proto Config +sink.proto.dir=file:///home/ubuntu/disk2/chbench/ +sink.proto.maxRecords=100000 +## Flink Config +sink.flink.server.port=9091 +## Schema Registry +sink.registry.url=http://localhost:8080/apis/registry/v2 +# Transaction Config +transaction.topic.suffix=transaction +#transaction.topic.value.deserializer=io.pixelsdb.pixels.writer.deserializer.TransactionAvroMessageDeserializer +transaction.topic.value.deserializer=io.pixelsdb.pixels.sink.event.deserializer.TransactionJsonMessageDeserializer +sink.trans.batch.size=100 + +# Sink Metrics +sink.monitor.enable=true +sink.monitor.port=9464 +sink.monitor.report.interval=1000 +sink.monitor.freshness.interval=1000 + +# Interact with other rpc +sink.rpc.enable=true +sink.rpc.mock.delay=20 +# debezium engine config +debezium.name=testEngine +debezium.connector.class=io.debezium.connector.postgresql.PostgresConnector +debezium.provide.transaction.metadata=true +debezium.offset.storage=org.apache.kafka.connect.storage.FileOffsetBackingStore +debezium.offset.storage.file.filename=/tmp/offsets.dat +debezium.offset.flush.interval.ms=60000 +debezium.schema.history.internal=io.debezium.storage.file.history.FileSchemaHistory +debezium.schema.history.internal.file.filename=/tmp/schemahistory.dat +debezium.database.hostname=realtime-pg-2 +debezium.database.port=5432 +debezium.database.user=pixels +debezium.database.password=pixels_realtime_crud +debezium.database.dbname=pixels_bench +debezium.plugin.name=pgoutput +debezium.database.server.id=1 
+debezium.schema.include.list=tpcch +debezium.snapshot.mode=never +debezium.key.converter=org.apache.kafka.connect.json.JsonConverter +debezium.value.converter=org.apache.kafka.connect.json.JsonConverter +debezium.topic.prefix=postgresql.oltp_server +debezium.transforms=topicRouting +debezium.transforms.topicRouting.type=org.apache.kafka.connect.transforms.RegexRouter +debezium.transforms.topicRouting.regex=postgresql\\.oltp_server\\.public\\.(.*) +debezium.transforms.topicRouting.replacement=postgresql.oltp_server.pixels_bench_sf10x.$1 diff --git a/conf/pixels-sink.flink.properties b/conf/pixels-sink.flink.properties new file mode 100644 index 0000000..676957f --- /dev/null +++ b/conf/pixels-sink.flink.properties @@ -0,0 +1,45 @@ +# engine | kafka | storage +sink.datasource=storage +# -1 means no limit, Only implement in retina sink mode yet +sink.datasource.rate.limit=50000 +# Sink Config: retina | csv | proto | flink | none +sink.mode=flink +sink.commit.batch.size=20 +## batch or single or record, batch is recommend. 
record is faster, but doesn't provide ACID guarantees +sink.trans.mode=batch +sink.monitor.report.enable=true +sink.monitor.report.file=/home/ubuntu/pixels-sink/result_lance/rate_test.csv +sink.monitor.freshness.file=/home/ubuntu/pixels-sink/result_lance/fresh_test.csv +# trino for freshness query +trino.url=jdbc:trino://realtime-pixels-coordinator:8080/lance/default +trino.user=pixels +trino.password=password +trino.parallel=1 +# row or txn or embed +sink.monitor.freshness.level=embed +sink.monitor.freshness.embed.warmup=10 +sink.monitor.freshness.embed.static=false +sink.monitor.freshness.embed.snapshot=false +sink.monitor.freshness.embed.tablelist=stock +sink.monitor.freshness.verbose=true +sink.monitor.freshness.timestamp=true +sink.storage.loop=true + +sink.remote.host=localhost +sink.remote.port=29422 +sink.timeout.ms=5000 +sink.flush.interval.ms=50 +sink.flush.batch.size=10 +sink.max.retries=3 + +## Proto Config +# sink.proto.data=hybench1000_4 +sink.proto.data=CH10K_2 +## Flink Config +sink.flink.server.port=9091 + +# Sink Metrics +sink.monitor.enable=true +sink.monitor.port=9465 +sink.monitor.report.interval=10000 +sink.monitor.freshness.interval=1000 diff --git a/conf/pixels-sink.flink.properties.hybench b/conf/pixels-sink.flink.properties.hybench new file mode 100644 index 0000000..fa3ba46 --- /dev/null +++ b/conf/pixels-sink.flink.properties.hybench @@ -0,0 +1,45 @@ +# engine | kafka | storage +sink.datasource=storage +# -1 means no limit; only implemented in retina sink mode so far +sink.datasource.rate.limit=50000 +# Sink Config: retina | csv | proto | flink | none +sink.mode=flink +sink.commit.batch.size=200 +## batch or single or record, batch is recommended. 
record is faster, but doesn't provide ACID guarantees +sink.trans.mode=batch +sink.monitor.report.enable=true +sink.monitor.report.file=/home/ubuntu/pixels-sink/result_lance/rate_bucket4_batch1000.csv +sink.monitor.freshness.file=/home/ubuntu/pixels-sink/result_lance/fresh_bucket4_batch1000.csv +# trino for freshness query +trino.url=jdbc:trino://realtime-pixels-coordinator:8080/lance/default +trino.user=pixels +trino.password=password +trino.parallel=1 +# row or txn or embed +sink.monitor.freshness.level=embed_dis +sink.monitor.freshness.embed.warmup=10 +sink.monitor.freshness.embed.static=false +sink.monitor.freshness.embed.snapshot=false +sink.monitor.freshness.embed.tablelist=loantrans +sink.monitor.freshness.verbose=true +sink.monitor.freshness.timestamp=true +sink.storage.loop=true + +sink.remote.host=localhost +sink.remote.port=29422 +sink.timeout.ms=5000 +sink.flush.interval.ms=50 +sink.flush.batch.size=10 +sink.max.retries=3 + +## Proto Config +sink.proto.data=hybench1000_4 +# sink.proto.data=CH10K_2 +## Flink Config +sink.flink.server.port=9091 + +# Sink Metrics +sink.monitor.enable=true +sink.monitor.port=9465 +sink.monitor.report.interval=10000 +sink.monitor.freshness.interval=1000 diff --git a/conf/pixels-sink.pg.properties b/conf/pixels-sink.pg.properties new file mode 100644 index 0000000..effc171 --- /dev/null +++ b/conf/pixels-sink.pg.properties @@ -0,0 +1,106 @@ +# engine | kafka | storage +sink.datasource=engine +# -1 means no limit; only implemented in retina sink mode so far +sink.datasource.rate.limit=100000 +# Sink Config: retina | csv | proto | flink | none +sink.mode=retina +sink.retina.client=8 +sink.retina.log.queue=false +## batch or single or record, batch is recommended. 
record is faster, but doesn't have ACID feature +sink.trans.mode=batch +sink.monitor.report.enable=true +sink.monitor.report.file=/home/ubuntu/pixels-sink/resulti7i/100k_rate_2.csv +sink.monitor.freshness.file=/home/ubuntu/pixels-sink/resulti7i/100k_freshness_2.csv +# trino for freshness query +trino.url=jdbc:trino://realtime-kafka-2:8080/pixels/pixels_bench_sf10x +# trino.url=jdbc:trino://realtime-pixels-coordinator:8080/pixels/pixels_bench_sf10x +trino.user=pixels +trino.password=password +trino.parallel=8 +# row or txn or embed +sink.monitor.freshness.level=embed +sink.monitor.freshness.embed.warmup=10 +sink.monitor.freshness.embed.static=false +sink.monitor.freshness.embed.snapshot=true +sink.monitor.freshness.embed.tablelist=loanapps,loantrans +sink.monitor.freshness.verbose=true +sink.monitor.freshness.timestamp=true +sink.storage.loop=true +# Kafka Config +bootstrap.servers=realtime-kafka-2:29092 +group.id=3078 +auto.offset.reset=earliest +key.deserializer=org.apache.kafka.common.serialization.StringDeserializer +#value.deserializer=io.pixelsdb.pixels.writer.deserializer.RowChangeEventAvroDeserializer +value.deserializer=io.pixelsdb.pixels.sink.event.deserializer.RowChangeEventJsonDeserializer +# Topic & Database Config +topic.prefix=postgresql.oltp_server +consumer.capture_database=pixels_bench_sf1x +consumer.include_tables= +sink.csv.path=./data +sink.csv.enable_header=false +## Retina Config +sink.retina.embedded=false +# stub or stream +sink.retina.mode=stream +#writer.retina.mode=stub +sink.remote.host=localhost +sink.remote.port=29422 +sink.timeout.ms=5000 +sink.flush.interval.ms=50 +sink.flush.batch.size=10 +sink.max.retries=3 +## writer commit +# sync or async +sink.commit.method=sync +sink.commit.batch.size=10 +sink.commit.batch.worker=32 +sink.commit.batch.delay=3000 +## Proto Config +sink.proto.dir=file:///home/ubuntu/disk1/hybench/ +sink.proto.data=hybench10_10 +sink.proto.maxRecords=100000 +## Flink Config +sink.flink.server.port=9091 +## Schema 
Registry +sink.registry.url=http://localhost:8080/apis/registry/v2 +# Transaction Config +transaction.topic.suffix=transaction +#transaction.topic.value.deserializer=io.pixelsdb.pixels.writer.deserializer.TransactionAvroMessageDeserializer +transaction.topic.value.deserializer=io.pixelsdb.pixels.sink.event.deserializer.TransactionJsonMessageDeserializer +sink.trans.batch.size=100 + +# Sink Metrics +sink.monitor.enable=true +sink.monitor.port=9464 +sink.monitor.report.interval=10000 +sink.monitor.freshness.interval=1000 + +# Interact with other rpc +sink.rpc.enable=true +sink.rpc.mock.delay=20 +# debezium engine config +debezium.name=testEngine +debezium.connector.class=io.debezium.connector.postgresql.PostgresConnector +debezium.provide.transaction.metadata=true +debezium.offset.storage=org.apache.kafka.connect.storage.FileOffsetBackingStore +debezium.offset.storage.file.filename=/tmp/offsets.dat +debezium.offset.flush.interval.ms=60000 +debezium.schema.history.internal=io.debezium.storage.file.history.FileSchemaHistory +debezium.schema.history.internal.file.filename=/tmp/schemahistory.dat +debezium.database.hostname=realtime-pg-2 +debezium.database.port=5432 +debezium.database.user=pixels +debezium.database.password=pixels_realtime_crud +debezium.database.dbname=pixels_bench_sf10x +debezium.plugin.name=pgoutput +debezium.database.server.id=1 +debezium.schema.include.list=public +debezium.snapshot.mode=never +debezium.key.converter=org.apache.kafka.connect.json.JsonConverter +debezium.value.converter=org.apache.kafka.connect.json.JsonConverter +debezium.topic.prefix=postgresql.oltp_server +debezium.transforms=topicRouting +debezium.transforms.topicRouting.type=org.apache.kafka.connect.transforms.RegexRouter +debezium.transforms.topicRouting.regex=postgresql\\.oltp_server\\.public\\.(.*) +debezium.transforms.topicRouting.replacement=postgresql.oltp_server.pixels_bench_sf10x.$1 diff --git a/develop/README.md b/develop/README.md index c69e3a7..fb8ca11 100644 --- 
a/develop/README.md +++ b/develop/README.md @@ -1,30 +1,86 @@ -## Quick Start +## Overview +This folder contains the local Docker-based development environment for Pixels Sink. -Prerequisites: -- Install Docker Engine and Docker Compose -Ensure Docker Engine and Docker Compose are installed and configured. -- Ensure Pixels is installed and configured in your local or remote Maven repository. +## Quick Start +### Prerequisites +- Docker Engine +- Docker Compose +- Pixels installed and available in your local or remote Maven repository +### Install ```bash -./install # or your relative/full path +./install ``` -After execute install script, you can check [Kafdrop](http://localhost:9000) to see if installation was successful. +### Verify +After installation, open Kafdrop to verify Kafka is up: + +[http://localhost:9000](http://localhost:9000) -Use this script to clean up +Grafana is also exposed at: +[http://localhost:3000](http://localhost:3000) + +### Cleanup ```bash . scripts/common_func.sh shutdown_containers ``` -## Test +## Install Options + +The `./install` script supports the following flags (all are optional): +- `--need_build=on|off` build images and Pixels Sink +- `--need_init=on|off` start containers via `docker compose` +- `--generate_data=on|off` generate TPCH data +- `--data_scale=` TPCH scale factor, default `0.1` +- `--enable_mysql=on|off` enable MySQL connector and source DB +- `--load_mysql=on|off` load TPCH data into MySQL (requires `--enable_tpch=on`) +- `--enable_postgres=on|off` enable Postgres connector and source DB +- `--load_postgres=on|off` load TPCH data into Postgres (requires `--enable_tpch=on`) +- `--enable_tpch=on|off` enable TPCH dataset and loading logic +- `--enable_tpcc=on|off` run TPC-C benchmark after startup +- `--develop_debug=on` enable verbose shell tracing + +## Example Commands + +- Start containers only, no build, no data generation: +```bash +./install --need_build=off --generate_data=off --enable_mysql=off 
--load_postgres=off +``` + +- Build images and start, without TPCH/TPC-C: +```bash +./install --need_build=on --generate_data=off --enable_tpch=off --enable_tpcc=off +``` + +- MySQL only, no Postgres: +```bash +./install --need_build=on --generate_data=off --enable_mysql=on --enable_postgres=off --enable_tpch=off +``` + +- MySQL + TPCH data generation (scale 10): +```bash +./install --need_build=on --generate_data=on --data_scale=10 --enable_mysql=on --enable_postgres=off --enable_tpch=on --enable_tpcc=off +``` + +- MySQL + TPC-C benchmark: +```bash +./install --need_build=off --generate_data=off --enable_mysql=on --enable_postgres=off --enable_tpch=off --enable_tpcc=on +``` +## Test Databases +### MySQL ```bash -docker exec -it pixels_mysql_source_db mysql -upixels -ppixels_realtime_crud -D pixels_realtime_crud +docker exec -it pixels_mysql_source_db \ + mysql -upixels -ppixels_realtime_crud -D pixels_realtime_crud +``` -docker exec -it pixels_postgres_source_db psql -Upixels -d pixels_realtime_crud -``` \ No newline at end of file +### Postgres +```bash +docker exec -it pixels_postgres_source_db \ + psql -Upixels -d pixels_realtime_crud +``` diff --git a/develop/config/register-postgres.json.template b/develop/config/register-postgres.json.template index 283ed65..0b3884b 100644 --- a/develop/config/register-postgres.json.template +++ b/develop/config/register-postgres.json.template @@ -11,21 +11,15 @@ "database.dbname" : "pixels_realtime_crud", "schema.include.list": "public", "database.server.id": "1", - "topic.prefix": "oltp_server", + "topic.prefix": "postgresql.oltp_server", "transforms": "topicRouting", "transforms.topicRouting.type": "org.apache.kafka.connect.transforms.RegexRouter", - "transforms.topicRouting.regex": "oltp_server\\.public\\.(.*)", - "transforms.topicRouting.replacement": "oltp_server.pixels_realtime_crud.$1", + "transforms.topicRouting.regex": "postgresql.oltp_server\\.public\\.(.*)", + "transforms.topicRouting.replacement": 
"postgresql.oltp_server.pixels_realtime_crud.$1", - "key.converter": "io.apicurio.registry.utils.converter.AvroConverter", - "value.converter": "io.apicurio.registry.utils.converter.AvroConverter", - "key.converter.apicurio.registry.url": "http://apicurio:8080/apis/registry/v2", - "key.converter.apicurio.registry.auto-register": "true", - "key.converter.apicurio.registry.find-latest": "true", - "value.converter.apicurio.registry.url": "http://apicurio:8080/apis/registry/v2", - "value.converter.apicurio.registry.auto-register": "true", - "value.converter.apicurio.registry.find-latest": "true", - "schema.name.adjustment.mode": "avro" + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "plugin.name": "pgoutput" } } diff --git a/develop/docker-compose.yml b/develop/docker-compose.yml index 08621ff..d919b48 100644 --- a/develop/docker-compose.yml +++ b/develop/docker-compose.yml @@ -31,7 +31,7 @@ services: - pixels_network postgres: - image: quay.io/debezium/postgres:17 # This image install plugin: postgres-decoderbufs and configure wal_level = logical + image: quay.io/debezium/postgres:16 # This image install plugin: postgres-decoderbufs and configure wal_level = logical container_name: pixels_postgres_source_db environment: POSTGRES_DB: pixels_realtime_crud @@ -75,7 +75,8 @@ services: - pixels_network pixels-sink: - image: pixels-sink:0.2.0-SNAPSHOT + image: hello-world:latest + # image: pixels-sink:0.2.0-SNAPSHOT container_name: pixels-sink volumes: - ./data:/app/data @@ -101,7 +102,7 @@ services: - pixels_network pg_debezium: - image: debezium/connect:2.7.3.Final + image: debezium/connect:3.0.0.Final ports: - "8084:8083" depends_on: diff --git a/develop/docker-monitor-compose.yml b/develop/docker-monitor-compose.yml new file mode 100644 index 0000000..180be1b --- /dev/null +++ b/develop/docker-monitor-compose.yml @@ -0,0 +1,40 @@ +services: + # monitor + prometheus: + image: 
prom/prometheus:v3.2.1 + container_name: pixels-prometheus + ports: + - "9090:9090" + volumes: + - ./images/prometheus/prometheus_local.yml:/etc/prometheus/prometheus.yml + networks: + - pixels_monitor_network + extra_hosts: + - "host.docker.internal:host-gateway" + + grafana: + image: grafana/grafana:10.1.5 + container_name: pixels-grafana + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./images/grafana-provisioning:/etc/grafana/provisioning + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_DEFAULT_INSTANCE_THEME=light + networks: + - pixels_monitor_network + depends_on: + - prometheus +volumes: + grafana-data: + + +networks: + pixels_monitor_network: + name: pixels_crud_network + driver: bridge diff --git a/develop/example/sql/dss.ri b/develop/example/sql/dss.ri index fb4c002..61b4382 100644 --- a/develop/example/sql/dss.ri +++ b/develop/example/sql/dss.ri @@ -1,100 +1,97 @@ -- Sccsid: @(#)dss.ri 2.1.8.1 --- TPCD Benchmark Version 8.0 +-- TPCH Benchmark Version 8.0 -CONNECT TO TPCD; +--CONNECT TO TPCH; ---ALTER TABLE TPCD.REGION DROP PRIMARY KEY; ---ALTER TABLE TPCD.NATION DROP PRIMARY KEY; ---ALTER TABLE TPCD.PART DROP PRIMARY KEY; ---ALTER TABLE TPCD.SUPPLIER DROP PRIMARY KEY; ---ALTER TABLE TPCD.PARTSUPP DROP PRIMARY KEY; ---ALTER TABLE TPCD.ORDERS DROP PRIMARY KEY; ---ALTER TABLE TPCD.LINEITEM DROP PRIMARY KEY; ---ALTER TABLE TPCD.CUSTOMER DROP PRIMARY KEY; +--ALTER TABLE REGION DROP PRIMARY KEY; +--ALTER TABLE NATION DROP PRIMARY KEY; +--ALTER TABLE PART DROP PRIMARY KEY; +--ALTER TABLE SUPPLIER DROP PRIMARY KEY; +--ALTER TABLE PARTSUPP DROP PRIMARY KEY; +--ALTER TABLE ORDERS DROP PRIMARY KEY; +--ALTER TABLE LINEITEM DROP PRIMARY KEY; +--ALTER TABLE CUSTOMER DROP PRIMARY KEY; -- For table REGION -ALTER TABLE TPCD.REGION +ALTER TABLE REGION ADD PRIMARY KEY (R_REGIONKEY); -- For table NATION -ALTER TABLE TPCD.NATION +ALTER 
TABLE NATION ADD PRIMARY KEY (N_NATIONKEY); -ALTER TABLE TPCD.NATION -ADD FOREIGN KEY NATION_FK1 (N_REGIONKEY) references TPCD.REGION; +ALTER TABLE NATION +ADD FOREIGN KEY (N_REGIONKEY) references REGION; -COMMIT WORK; +--COMMIT WORK; -- For table PART -ALTER TABLE TPCD.PART +ALTER TABLE PART ADD PRIMARY KEY (P_PARTKEY); -COMMIT WORK; +--COMMIT WORK; -- For table SUPPLIER -ALTER TABLE TPCD.SUPPLIER +ALTER TABLE SUPPLIER ADD PRIMARY KEY (S_SUPPKEY); -ALTER TABLE TPCD.SUPPLIER -ADD FOREIGN KEY SUPPLIER_FK1 (S_NATIONKEY) references TPCD.NATION; +ALTER TABLE SUPPLIER +ADD FOREIGN KEY (S_NATIONKEY) references NATION; -COMMIT WORK; +--COMMIT WORK; -- For table PARTSUPP -ALTER TABLE TPCD.PARTSUPP +ALTER TABLE PARTSUPP ADD PRIMARY KEY (PS_PARTKEY,PS_SUPPKEY); -COMMIT WORK; +--COMMIT WORK; -- For table CUSTOMER -ALTER TABLE TPCD.CUSTOMER +ALTER TABLE CUSTOMER ADD PRIMARY KEY (C_CUSTKEY); -ALTER TABLE TPCD.CUSTOMER -ADD FOREIGN KEY CUSTOMER_FK1 (C_NATIONKEY) references TPCD.NATION; +ALTER TABLE CUSTOMER +ADD FOREIGN KEY (C_NATIONKEY) references NATION; -COMMIT WORK; +--COMMIT WORK; -- For table LINEITEM -ALTER TABLE TPCD.LINEITEM +ALTER TABLE LINEITEM ADD PRIMARY KEY (L_ORDERKEY,L_LINENUMBER); -COMMIT WORK; +--COMMIT WORK; -- For table ORDERS -ALTER TABLE TPCD.ORDERS +ALTER TABLE ORDERS ADD PRIMARY KEY (O_ORDERKEY); -COMMIT WORK; +--COMMIT WORK; -- For table PARTSUPP -ALTER TABLE TPCD.PARTSUPP -ADD FOREIGN KEY PARTSUPP_FK1 (PS_SUPPKEY) references TPCD.SUPPLIER; +ALTER TABLE PARTSUPP +ADD FOREIGN KEY (PS_SUPPKEY) references SUPPLIER; -COMMIT WORK; +--COMMIT WORK; -ALTER TABLE TPCD.PARTSUPP -ADD FOREIGN KEY PARTSUPP_FK2 (PS_PARTKEY) references TPCD.PART; +ALTER TABLE PARTSUPP +ADD FOREIGN KEY (PS_PARTKEY) references PART; -COMMIT WORK; +--COMMIT WORK; -- For table ORDERS -ALTER TABLE TPCD.ORDERS -ADD FOREIGN KEY ORDERS_FK1 (O_CUSTKEY) references TPCD.CUSTOMER; +ALTER TABLE ORDERS +ADD FOREIGN KEY (O_CUSTKEY) references CUSTOMER; -COMMIT WORK; +--COMMIT WORK; -- For table 
LINEITEM -ALTER TABLE TPCD.LINEITEM -ADD FOREIGN KEY LINEITEM_FK1 (L_ORDERKEY) references TPCD.ORDERS; +ALTER TABLE LINEITEM +ADD FOREIGN KEY (L_ORDERKEY) references ORDERS; -COMMIT WORK; - -ALTER TABLE TPCD.LINEITEM -ADD FOREIGN KEY LINEITEM_FK2 (L_PARTKEY,L_SUPPKEY) references - TPCD.PARTSUPP; - -COMMIT WORK; +--COMMIT WORK; +ALTER TABLE LINEITEM +ADD FOREIGN KEY (L_PARTKEY,L_SUPPKEY) references PARTSUPP; +--COMMIT WORK; \ No newline at end of file diff --git a/develop/example/sql/pg_replica.sql b/develop/example/sql/pg_replica.sql new file mode 100644 index 0000000..96eaf20 --- /dev/null +++ b/develop/example/sql/pg_replica.sql @@ -0,0 +1,15 @@ +ALTER TABLE customer REPLICA IDENTITY USING INDEX customer_pkey; + +ALTER TABLE lineitem REPLICA IDENTITY USING INDEX lineitem_pkey; + +ALTER TABLE nation REPLICA IDENTITY USING INDEX nation_pkey; + +ALTER TABLE orders REPLICA IDENTITY USING INDEX orders_pkey; + +ALTER TABLE part REPLICA IDENTITY USING INDEX part_pkey; + +ALTER TABLE partsupp REPLICA IDENTITY USING INDEX partsupp_pkey; + +ALTER TABLE region REPLICA IDENTITY USING INDEX region_pkey; + +ALTER TABLE supplier REPLICA IDENTITY USING INDEX supplier_pkey; diff --git a/develop/images/debezium-source-connector/docker-maven-download.sh b/develop/images/debezium-source-connector/docker-maven-download.sh index bd14be7..c5edd23 100644 --- a/develop/images/debezium-source-connector/docker-maven-download.sh +++ b/develop/images/debezium-source-connector/docker-maven-download.sh @@ -1,5 +1,24 @@ #!/bin/bash +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. 
+# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# <https://www.gnu.org/licenses/>. + + # # Download connector maven dependencies # 4 methods are available: @@ -111,4 +130,4 @@ case $1 in "apicurio" ) shift maven_apicurio_converter ${@} ;; -esac \ No newline at end of file +esac diff --git a/develop/images/debezium-source-connector/start.sh b/develop/images/debezium-source-connector/start.sh index caa6433..0a1173a 100644 --- a/develop/images/debezium-source-connector/start.sh +++ b/develop/images/debezium-source-connector/start.sh @@ -1,5 +1,24 @@ #!/bin/bash +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# <https://www.gnu.org/licenses/>. + + # Exit immediately if a *pipeline* returns a non-zero status. (Add -x for command tracing) set -e @@ -327,4 +346,4 @@ done # # Execute the Kafka Connect distributed service, replacing this shell process with the specified program ...
# -exec $KAFKA_HOME/bin/connect-standalone.sh $KAFKA_HOME/config/connect-standalone.properties \ No newline at end of file +exec $KAFKA_HOME/bin/connect-standalone.sh $KAFKA_HOME/config/connect-standalone.properties diff --git a/develop/images/grafana-provisioning/dashboards/sink-server.json b/develop/images/grafana-provisioning/dashboards/sink-server.json index 1e622ca..2d609bb 100644 --- a/develop/images/grafana-provisioning/dashboards/sink-server.json +++ b/develop/images/grafana-provisioning/dashboards/sink-server.json @@ -21,7 +21,6 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 1, "links": [], "liveNow": false, "panels": [ @@ -33,6 +32,323 @@ "x": 0, "y": 0 }, + "id": 26, + "panels": [], + "title": "JVM", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 0, + "y": 1 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "jvm_memory_bytes_used{area=\"heap\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "JVM Heap Memory", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 25, + "panels": [], + "title": "Debezium", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 5, + "x": 0, + "y": 10 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(debezium_event_total[1m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + 
"title": "Debezium Event Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 4, + "x": 5, + "y": 10 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(row_event_total[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Row Event Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, "id": 12, "panels": [], "title": "Basic", @@ -103,7 +419,7 @@ "h": 5, "w": 5, "x": 0, - "y": 1 + "y": 22 }, "id": 9, "options": { @@ -202,7 +518,7 @@ "h": 5, "w": 2, "x": 5, - "y": 1 + "y": 22 }, "id": 10, "options": { @@ -266,7 +582,7 @@ "h": 
5, "w": 3, "x": 7, - "y": 1 + "y": 22 }, "id": 11, "options": { @@ -365,7 +681,7 @@ "h": 9, "w": 14, "x": 10, - "y": 1 + "y": 22 }, "id": 4, "options": { @@ -428,7 +744,7 @@ "h": 4, "w": 6, "x": 0, - "y": 6 + "y": 27 }, "id": 8, "options": { @@ -531,7 +847,7 @@ "h": 4, "w": 4, "x": 6, - "y": 6 + "y": 27 }, "id": 2, "options": { @@ -633,7 +949,7 @@ "h": 7, "w": 2, "x": 0, - "y": 10 + "y": 31 }, "id": 18, "options": { @@ -701,7 +1017,7 @@ "h": 7, "w": 4, "x": 2, - "y": 10 + "y": 31 }, "id": 20, "options": { @@ -770,7 +1086,7 @@ "h": 7, "w": 13, "x": 6, - "y": 10 + "y": 31 }, "id": 21, "options": { @@ -837,7 +1153,7 @@ "h": 7, "w": 5, "x": 19, - "y": 10 + "y": 31 }, "id": 6, "options": { @@ -877,7 +1193,7 @@ "h": 1, "w": 24, "x": 0, - "y": 17 + "y": 38 }, "id": 13, "panels": [], @@ -946,7 +1262,7 @@ "h": 9, "w": 4, "x": 0, - "y": 18 + "y": 39 }, "id": 14, "options": { @@ -1044,7 +1360,7 @@ "h": 9, "w": 4, "x": 4, - "y": 18 + "y": 39 }, "id": 15, "options": { @@ -1142,7 +1458,7 @@ "h": 9, "w": 4, "x": 8, - "y": 18 + "y": 39 }, "id": 16, "options": { @@ -1240,7 +1556,7 @@ "h": 9, "w": 6, "x": 12, - "y": 18 + "y": 39 }, "id": 17, "options": { @@ -1372,7 +1688,7 @@ "h": 9, "w": 6, "x": 18, - "y": 18 + "y": 39 }, "id": 22, "options": { @@ -1448,7 +1764,7 @@ "h": 1, "w": 24, "x": 0, - "y": 27 + "y": 48 }, "id": 19, "panels": [], @@ -1456,7 +1772,7 @@ "type": "row" } ], - "refresh": "auto", + "refresh": false, "schemaVersion": 38, "style": "light", "tags": [], @@ -1488,8 +1804,8 @@ ] }, "time": { - "from": "now-5m", - "to": "now" + "from": "2025-10-07T04:53:37.436Z", + "to": "2025-10-07T04:55:21.643Z" }, "timepicker": {}, "timezone": "", diff --git a/develop/images/pixels-sink/Dockerfile b/develop/images/pixels-sink/Dockerfile index 2f49820..2652267 100644 --- a/develop/images/pixels-sink/Dockerfile +++ b/develop/images/pixels-sink/Dockerfile @@ -1,4 +1,4 @@ -# This docker file should be built in project source dir +# This Dockerfile should be built in project source dir 
FROM eclipse-temurin:17-jdk-jammy @@ -10,11 +10,9 @@ ARG IMAGE_PATH=develop/images/pixels-sink ENV JAR_FILE=${jarFile} COPY target/${jarFile} /app/ -ADD ${IMAGE_PATH}/pixels-sink.properties ${IMAGE_PATH}/start.sh /app/ +ADD ${IMAGE_PATH}/pixels-sink.properties ${IMAGE_PATH}/start.sh ${IMAGE_PATH}/jvm.conf /app/ RUN chmod +x /app/start.sh -#RUN apt-get update && apt-get install -y --no-install-recommends vim procps - CMD ["./start.sh"] LABEL authors="anti" diff --git a/develop/images/pixels-sink/jvm.conf b/develop/images/pixels-sink/jvm.conf new file mode 120000 index 0000000..171434e --- /dev/null +++ b/develop/images/pixels-sink/jvm.conf @@ -0,0 +1 @@ +../../conf/jvm.conf \ No newline at end of file diff --git a/develop/images/pixels-sink/start.sh b/develop/images/pixels-sink/start.sh index 8289585..7ba0454 100644 --- a/develop/images/pixels-sink/start.sh +++ b/develop/images/pixels-sink/start.sh @@ -1,7 +1,39 @@ #!/bin/sh + +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# . 
+ # image entrypoint -set -o verbose -o xtrace +set -eu # no pipefail: the shebang is /bin/sh, which is dash on the jammy base image and aborts on `set -o pipefail` + +JVM_CONFIG_FILE="${JVM_CONFIG_FILE:-/app/jvm.conf}" +PROPERTIES_FILE="${PROPERTIES_FILE:-pixels-sink.properties}" + +if [ -f "${JVM_CONFIG_FILE}" ]; then + JVM_OPTION=$(grep -v '^[[:space:]]*#' "${JVM_CONFIG_FILE}" | grep -v '^[[:space:]]*$' | xargs) +else + JVM_OPTION="${JVM_OPTION:--Xmx4096m -Xmn1024m}" +fi +echo "Starting Pixels Sink" +echo "JAR_FILE = ${JAR_FILE}" +echo "PROPERTIES_FILE = ${PROPERTIES_FILE}" +echo "JVM_CONFIG_FILE = ${JVM_CONFIG_FILE}" +echo "JVM_OPTION = ${JVM_OPTION}" -JVM_OPTION="-Xmx4096m -Xmn1024m " -java $JVM_OPTION -jar ${JAR_FILE} -c pixels-sink.properties \ No newline at end of file +exec java ${JVM_OPTION} -jar "${JAR_FILE}" -c "${PROPERTIES_FILE}" diff --git a/develop/images/prometheus/prometheus_local.yml b/develop/images/prometheus/prometheus_local.yml new file mode 100644 index 0000000..340e1a6 --- /dev/null +++ b/develop/images/prometheus/prometheus_local.yml @@ -0,0 +1,9 @@ +global: + scrape_interval: 1s + +scrape_configs: + - job_name: 'pixels-sink' + metrics_path: /metrics + static_configs: + - targets: [ 'host.docker.internal:9464' ] + diff --git a/develop/index.html b/develop/index.html new file mode 100644 index 0000000..7dedf06 --- /dev/null +++ b/develop/index.html @@ -0,0 +1,276 @@ + + + + + Pixels Sink Performance Dashboard + + + + + + +
+ +
+ +
+
+ + + + diff --git a/develop/scripts/build_func.sh b/develop/scripts/build_func.sh index 8047e09..2397e0f 100644 --- a/develop/scripts/build_func.sh +++ b/develop/scripts/build_func.sh @@ -1,3 +1,21 @@ +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# <https://www.gnu.org/licenses/>. + build_pixels_sink_image() { mvn clean package dockerfile:build -f ${PROJECT_DIR}/pom.xml check_fatal_exit "Fail to build pixels sink image" diff --git a/develop/scripts/common_func.sh b/develop/scripts/common_func.sh index aad8b5b..e220987 100755 --- a/develop/scripts/common_func.sh +++ b/develop/scripts/common_func.sh @@ -1,5 +1,24 @@ #!/bin/bash +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# <https://www.gnu.org/licenses/>.
+ + SOURCE_PATH=`readlink -f $BASH_SOURCE` 2>/dev/null @@ -21,4 +40,4 @@ source ${SCRIPT_DIR}/build_func.sh source ${SCRIPT_DIR}/log_func.sh source ${SCRIPT_DIR}/docker_func.sh source ${SCRIPT_DIR}/util_func.sh -source ${SCRIPT_DIR}/gen_data.sh \ No newline at end of file +source ${SCRIPT_DIR}/gen_data.sh diff --git a/develop/scripts/docker_func.sh b/develop/scripts/docker_func.sh index 7130d8e..be9852d 100644 --- a/develop/scripts/docker_func.sh +++ b/develop/scripts/docker_func.sh @@ -1,3 +1,21 @@ +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# <https://www.gnu.org/licenses/>. + shutdown_containers() { docker compose -f ${DEVELOP_DIR}/docker-compose.yml down -v } diff --git a/develop/scripts/gen_data.sh b/develop/scripts/gen_data.sh index bbee62d..47085db 100644 --- a/develop/scripts/gen_data.sh +++ b/develop/scripts/gen_data.sh @@ -1,3 +1,21 @@ +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# . + tpch_data=${DEVELOP_DIR}/example/tpch_data dbgen_path=${DEVELOP_DIR}/example/tpch-dbgen @@ -89,4 +107,4 @@ function start_tpcc_test() { ./runBenchmark.sh $config_path check_fatal_exit "Run TPC-C Benchmark for $db_type failed" cd ${WORK_PATH} -} \ No newline at end of file +} diff --git a/develop/scripts/install.sh b/develop/scripts/install.sh index 2906792..d372b86 100755 --- a/develop/scripts/install.sh +++ b/develop/scripts/install.sh @@ -1,5 +1,24 @@ #!/bin/bash +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# . + + usage() { cat <. + + log() { local level="$1" shift diff --git a/develop/scripts/util_func.sh b/develop/scripts/util_func.sh index 68a8299..57fff3a 100644 --- a/develop/scripts/util_func.sh +++ b/develop/scripts/util_func.sh @@ -1,3 +1,21 @@ +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. 
+# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# <https://www.gnu.org/licenses/>. + check_fatal_exit() { [[ $? -ne 0 ]] && { log_fatal_exit "$@";} return 0 @@ -89,4 +107,4 @@ try_command() { done log_fatal "Max retries ($MAX_RETRIES) reached. Command failed." return 1 -} \ No newline at end of file +} diff --git a/docs/assets/TransactionCoordinator.png b/docs/assets/TransactionCoordinator.png new file mode 100644 index 0000000..8a8ed4b Binary files /dev/null and b/docs/assets/TransactionCoordinator.png differ diff --git a/docs/assets/frame.png b/docs/assets/frame.png new file mode 100644 index 0000000..9d34356 Binary files /dev/null and b/docs/assets/frame.png differ diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 0000000..845af65 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,158 @@ +# Configuration Reference + +Pixels Sink is configured via a Java properties file. Pass the path with `-c`. + +- Examples: `conf/pixels-sink.pg.properties`, `conf/pixels-sink.aws.properties`, `conf/pixels-sink.flink.properties` + +Values are loaded by `PixelsSinkConfig` and mapped from keys in the properties file. + +## Core Keys + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.datasource` | `engine` | Source type: `engine`, `kafka`, or `storage`. | +| `sink.mode` | `retina` | Sink type: `retina`, `csv`, `proto`, `flink`, or `none`. | +| `sink.datasource.rate.limit` | `-1` | Rate limit for source ingestion. `-1` disables. | +| `sink.datasource.rate.limit.type` | `semaphore` | Rate limiter type used by `FlushRateLimiterFactory`: `guava` or `semaphore`. | + +### Notes on `sink.datasource` + +- `engine` reads CDC logs directly from Debezium Engine.
+- `storage` reads CDC logs from files dumped by `sink.proto` output; schema reference: [sink.proto](https://github.com/pixelsdb/pixels/blob/master/proto/sink.proto). +- `kafka` reads from a set of Kafka topics; this mode is deprecated and not actively tested. + +### Notes on `sink.mode` + +- `retina` connects to one or more Retina services via RPC and sends `UpdateRecord` or `StreamUpdateRecord` requests defined in [retina.proto](https://github.com/pixelsdb/pixels/blob/master/proto/retina.proto). +- `csv` is mainly for debugging. +- `proto` converts row change events and transaction metadata into `sink.proto` format, writes them in order to one or more files, and registers file paths in ETCD. These files can be read by `sink.datasource=storage`. This provides the highest CDC read efficiency and is used in paper experiments. +- `flink` starts a server for external programs to pull data via RPC and continue ingestion, for example [pixels-lance](https://github.com/AntiO2/pixels-lance) or [pixels-flink](https://github.com/AntiO2/pixels-flink). +- `none` writes no output and is useful for testing or observing source-side metrics. + +## Source and Sink + +### Transaction + +Only supported in **Retina** sink mode. + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.trans.batch.size` | `100` | Batch size for transaction processing. | +| `sink.trans.mode` | `batch` | Transaction mode: `single`, `record`, or `batch`. | +| `transaction.timeout` | `300` | Transaction timeout in seconds. | + +Notes on `sink.trans.mode`: +- `single` means each Retina request writes exactly one transaction. +- `batch` means a single Retina request may carry multiple transactions. +- `single` and `batch` both support cross-table transactions. `record` disables cross-table transactions and only processes single-table transactions. + +### Debezium Engine Source + +| Key | Default | Notes | +| --- | --- | --- | +| `debezium.name` | none | Engine name. 
| +| `debezium.connector.class` | none | Connector class, e.g. PostgreSQL connector. | +| `debezium.*` | none | Standard Debezium engine properties. | + +### Retina Sink + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.retina.mode` | `stub` | Write mode: `stub` or `stream`. | +| `sink.retina.client` | `1` | Number of Retina clients per table writer. | +| `sink.retina.log.queue` | `true` | Enable queue logging. | +| `sink.retina.rpc.limit` | `1000` | Max inflight RPC requests. | +| `sink.retina.trans.limit` | `1000` | Max inflight transaction requests. | +| `sink.retina.trans.request.batch` | `false` | Enable batched transaction requests. | +| `sink.retina.trans.request.batch.size` | `100` | Batch size for transaction requests. | +| `sink.timeout.ms` | `30000` | RPC timeout. | +| `sink.flush.interval.ms` | `1000` | Flush interval. | +| `sink.flush.batch.size` | `100` | Flush batch size. | +| `sink.max.retries` | `3` | Retry limit. | +| `sink.commit.method` | `async` | Commit method: `sync` or `async`. | +| `sink.commit.batch.size` | `500` | Commit batch size. | +| `sink.commit.batch.worker` | `16` | Commit worker threads. | +| `sink.commit.batch.delay` | `200` | Commit batch delay in ms. | + + +### CSV Sink + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.csv.path` | `./data` | Output directory. | +| `sink.csv.enable_header` | `false` | Write header row. | + +### Proto Sink and Storage Source + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.proto.dir` | required | Proto output or input directory. | +| `sink.proto.data` | `data` | Data set name. | +| `sink.proto.maxRecords` | `100000` | Max records per file. | +| `sink.storage.loop` | `false` | Whether to loop over stored files. | + +### Flink Sink + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.flink.server.port` | `9091` | Polling server port. | + + +### Kafka Source + +Kafka source is deprecated. 
+ +| Key | Default | Notes | +| --- | --- | --- | +| `bootstrap.servers` | required | Kafka bootstrap servers. | +| `group.id` | required | Consumer group id. | +| `auto.offset.reset` | none | Standard Kafka consumer property. | +| `key.deserializer` | `org.apache.kafka.common.serialization.StringDeserializer` | Kafka key deserializer. | +| `value.deserializer` | `io.pixelsdb.pixels.sink.event.deserializer.RowChangeEventJsonDeserializer` | Kafka value deserializer for row events. | +| `topic.prefix` | required | Topic prefix for table events. | +| `consumer.capture_database` | required | Database name used to build topic names. | +| `consumer.include_tables` | empty | Comma-separated table list, empty means all. | +| `transaction.topic.suffix` | `transaction` | Suffix appended to transaction topics. | +| `transaction.topic.value.deserializer` | `io.pixelsdb.pixels.sink.event.deserializer.RowChangeEventJsonDeserializer` | Deserializer for transaction messages. | +| `transaction.topic.group_id` | `transaction_consumer` | Consumer group for transaction topic. | +| `sink.registry.url` | required | Avro Schema registry endpoint. | + +**Reserved Configuration** + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.remote.host` | `localhost` | Sink server host. | +| `sink.remote.port` | `9090` | Sink server port. | +| `sink.rpc.enable` | `false` | Enable RPC simulation (for development). | +| `sink.rpc.mock.delay` | `0` | Artificial delay in ms. | + +**Monitoring and Metrics** + +| Key | Default | Notes | +| --- | --- | --- | +| `sink.monitor.enable` | `false` | Enable Prometheus metrics endpoint. | +| `sink.monitor.port` | `9464` | Metrics server port. | +| `sink.monitor.report.enable` | `true` | Enable report file output. | +| `sink.monitor.report.interval` | `5000` | Report interval in ms. | +| `sink.monitor.report.file` | `/tmp/sink.csv` | Report output file. | +| `sink.monitor.freshness.interval` | `1000` | Freshness report interval in ms. 
| +| `sink.monitor.freshness.file` | `/tmp/sinkFreshness.csv` | Freshness report output file. | +| `sink.monitor.freshness.level` | `row` | `row`, `txn`, or `embed`. | +| `sink.monitor.freshness.embed.warmup` | `10` | Warmup seconds for embedded freshness query. | +| `sink.monitor.freshness.embed.static` | `false` | Whether to keep a static snapshot. | +| `sink.monitor.freshness.embed.snapshot` | `false` | Whether to take a snapshot. | +| `sink.monitor.freshness.embed.tablelist` | empty | Tables to include for embedded mode. | +| `sink.monitor.freshness.embed.delay` | `0` | Delay seconds for embedded freshness query. | +| `sink.monitor.freshness.verbose` | `false` | Verbose freshness logging. | +| `sink.monitor.freshness.timestamp` | `false` | Include timestamps. | + +Note: In the Retina paper experiments, `sink.monitor.freshness.level=embed` is used to query freshness from Trino. This requires the last column of each table to be `freshness_ts`. + +**Freshness Trino Settings** + +| Key | Default | Notes | +| --- | --- | --- | +| `trino.url` | required for Trino-based freshness | JDBC URL. | +| `trino.user` | required for Trino-based freshness | Username. | +| `trino.password` | required for Trino-based freshness | Password. | +| `trino.parallel` | `1` | Parallel query count. | diff --git a/docs/overview.md b/docs/overview.md new file mode 100644 index 0000000..c321e20 --- /dev/null +++ b/docs/overview.md @@ -0,0 +1,135 @@ +# Pixels Sink Overview + +![](./assets/frame.png) + +Pixels Sink uses a multi-stage pipeline. Each stage communicates via producer/consumer queues. + +**Entry** + +**PixelsSinkApp** +- Main entry point for running as a standalone server. +- Configuration is loaded via `PixelsSinkConfigFactory` using the properties file passed by `-c`. + +**PixelsSinkProvider** +- Implements Pixels SPI so it can be started by Pixels Worker. +- Receives a `ConfigFactory` directly and builds the same sink pipeline. 
+ +**Source** +The source stage pulls events and forwards raw payloads to providers. + +**Source Inputs** +| Source Type | Description | Related Config | +| --- | --- | --- | +| `engine` | Debezium Engine reads WAL/binlog directly from a database | `debezium.*` | +| `kafka` | Kafka consumer reads change events from topics | `bootstrap.servers`, `group.id`, `topic.*` | +| `storage` | Reads from Pixels storage files containing serialized sink proto records | `sink.proto.*`, `sink.storage.loop` | + +**Source Outputs** +- The source does not parse events. It forwards raw records to providers. + +**Provider** +Providers convert source records into Pixels events. + +```mermaid +classDiagram + direction TB + + class EventProvider~SOURCE_RECORD_T, TARGET_RECORD_T~ { + +run() + +close() + +processLoop() + +convertToTargetRecord() + +recordSerdEvent() + +putRawEvent() + +getRawEvent() + +pollRawEvent() + +putTargetEvent() + +getTargetEvent() + } + + class TableEventProvider~SOURCE_RECORD_T~ { + } + class TableEventEngineProvider~T~ { + } + class TableEventKafkaProvider~T~ { + } + class TableEventStorageProvider~T~ { + } + + class TransactionEventProvider~SOURCE_RECORD_T~ { + } + class TransactionEventEngineProvider~T~ { + } + class TransactionEventKafkaProvider~T~ { + } + class TransactionEventStorageProvider~T~ { + } + + EventProvider <|-- TableEventProvider + EventProvider <|-- TransactionEventProvider + + TableEventProvider <|-- TableEventEngineProvider + TableEventProvider <|-- TableEventKafkaProvider + TableEventProvider <|-- TableEventStorageProvider + + TransactionEventProvider <|-- TransactionEventEngineProvider + TransactionEventProvider <|-- TransactionEventKafkaProvider + TransactionEventProvider <|-- TransactionEventStorageProvider + +``` + +Example mappings: + +| Provider | Source Type | Target Type | +| --- | --- | --- | +| `TableEventEngineProvider` | Debezium Struct | `RowChangeEvent` | +| `TableEventKafkaProvider` | Kafka topic | `RowChangeEvent` | +| 
`TableEventStorageProvider` | Proto bytes | `RowChangeEvent` | +| `TransactionEventEngineProvider` | Debezium Struct | `SinkProto.TransactionMetadata` | +| `TransactionEventKafkaProvider` | Kafka topic | `SinkProto.TransactionMetadata` | +| `TransactionEventStorageProvider` | Proto bytes | `SinkProto.TransactionMetadata` | + +**Processor** +Processors pull events from providers and write to the sink writers. + +- `TableProcessor` instances are created by `TableProviderAndProcessorPipelineManager`. +- There is typically one `TableProcessor` per table to maintain per-table ordering. +- `TransactionProcessor` is a singleton. + +**Writer** +Writers implement `PixelsSinkWriter`: + +| Method | Description | +| --- | --- | +| `writeRow(RowChangeEvent rowChangeEvent)` | Write a row change | +| `writeTrans(SinkProto.TransactionMetadata transactionMetadata)` | Handle transaction metadata | +| `flush()` | Flush buffered data | + +**Retina Writer** +`RetinaWriter` implements transactional replay into Retina. + +Key components: +- `RetinaServiceProxy` communicates with Retina. +- `SinkContextManager` holds transaction context and table writer proxies. + +Bucket routing for `RowChangeEvent`: +- Insert: derive bucket from the after-image key. +- Delete: derive bucket from the before-image key. +- Update: if primary key is unchanged, use any key. If primary key changes, split into delete and insert events and preserve delete-then-insert order. + +![img.png](./assets/TransactionCoordinator.png) + +Table writers: +- `SingleTxWriter` writes a single transaction per call. +- `CrossTxWriter` allows a batch to contain multiple transactions. + +Transactions are committed via `TransactionProxy` which supports synchronous or async batch commits. + +**Proto Writer** +Creates storage source files by serializing events to proto. Metadata (file paths, etc.) is stored in ETCD. + +**CSV Writer** +Writes events to CSV files. + +**Flink Writer** +Exposes events to Flink through a polling service. 
diff --git a/docs/transaction.md b/docs/transaction.md index f51ba7e..f8954f9 100644 --- a/docs/transaction.md +++ b/docs/transaction.md @@ -1,6 +1,6 @@ -# 事务处理机制 +# Transaction Handling -假设同一表上,保证按照事务顺序读取RowRecord +Assume row records for the same table are read in transaction order. ```mermaid sequenceDiagram @@ -8,23 +8,22 @@ sequenceDiagram participant CO as TransactionCoordinator participant T1 as TableConsumer-Nation participant T2 as TableConsumer-Region -%% 事务 TX1 生命周期 +%% Transaction TX1 lifecycle TC ->> CO: TX1: BEGIN T1 ->> CO: (TX1): update nation row1 TC ->> CO: TX1: END - Note over CO: 收到 END 后等待所有事件完成 -%% TX1 的延迟事件处理 + Note over CO: After END, wait for all events to finish +%% Delayed events for TX1 T2 ->> CO: (TX1): update region row2 - CO -->> CO: 验证 TX1 完整性 - CO -->> CO: 提交TX1 -%% 事务 TX2 生命周期(注意事件到达顺序问题) + CO -->> CO: Validate TX1 completeness + CO -->> CO: Commit TX1 +%% Transaction TX2 lifecycle (out-of-order arrival) T1 ->> CO: (TX2): update nation row2 - Note over CO: 发现 TX2 未注册,缓存事件 + Note over CO: TX2 not registered yet, cache event TC ->> CO: TX2: BEGIN - CO -->> CO: 处理缓存的 TX2 事件 + CO -->> CO: Process cached TX2 events T2 ->> CO: (TX2): update region row2 TC ->> CO: TX2: END - CO -->> CO: 验证 TX2 完整性 - CO -->> CO: 提交 TX2 + CO -->> CO: Validate TX2 completeness + CO -->> CO: Commit TX2 ``` - diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 0000000..5beaa54 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,40 @@ +# Usage + +This document describes how to run Pixels Sink and how the runtime is wired. 
+ +**Entry Point** +- Main class: `io.pixelsdb.pixels.sink.PixelsSinkApp` +- CLI options: `-c, --config` for the properties file path + +**Run with Maven** +```bash +mvn -q -DskipTests exec:java \ + -Dexec.mainClass=io.pixelsdb.pixels.sink.PixelsSinkApp \ + -Dexec.args="-c conf/pixels-sink.pg.properties" +``` + +**Run with scripts** + +```bash +./pixels-sink +``` + +**Run from an IDE** +- Set the main class to `io.pixelsdb.pixels.sink.PixelsSinkApp` +- Add program args: `-c conf/pixels-sink.pg.properties` + +**Pipeline Summary** +- A source reads change events from Debezium Engine, Kafka, or storage files. +- A provider converts source records into Pixels events. +- Processors enforce ordering by table and write to the configured sink. +- The sink writer persists the events to Retina, CSV, Proto, Flink, or a no-op sink. + +**Lifecycle Notes** +- Metrics and freshness reporting are configured in the properties file. +- If Prometheus metrics are enabled, the HTTP server is started on `sink.monitor.port`. +- If freshness checking is set to `embed`, the freshness client is started with the configured Trino settings. + +**Example Configs** +- `conf/pixels-sink.pg.properties` for Postgres + Kafka +- `conf/pixels-sink.aws.properties` for AWS environments, using retina +- `conf/pixels-sink.flink.properties` for Flink sink diff --git a/pixels-sink b/pixels-sink new file mode 100755 index 0000000..a3a3164 --- /dev/null +++ b/pixels-sink @@ -0,0 +1,71 @@ +#!/usr/bin/env bash + +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# . + +set -euo pipefail + +# Resolve the absolute path of this script +SOURCE_PATH=$(readlink -f "$BASH_SOURCE") 2>/dev/null +SINK_DIR=$(dirname "$SOURCE_PATH") + +# Environment variable +# export PIXELS_HOME="/home/ubuntu/opt/pixels" + +# Application properties file (default) +#PROPERTIES_FILE="${SINK_DIR}/conf/pixels-sink.pg.properties" +#PROPERTIES_FILE="${SINK_DIR}/conf/pixels-sink.ch.properties" +DEFAULT_PROPERTIES_FILE="${SINK_DIR}/conf/pixels-sink.aws.properties" +DEFAULT_PROPERTIES_FILE="${SINK_DIR}/conf/pixels-sink.flink.properties" +DEFAULT_PROPERTIES_FILE="${SINK_DIR}/conf/pixels-sink.flink.properties.hybench" + +# Optional first argument overrides the default config path +PROPERTIES_FILE="${1:-$DEFAULT_PROPERTIES_FILE}" + +# JVM config file +JVM_CONFIG_FILE="${SINK_DIR}/conf/jvm.conf" + +if [[ ! -f "$JVM_CONFIG_FILE" ]]; then + echo "JVM config file not found: $JVM_CONFIG_FILE" + exit 1 +fi + +# Read JVM options (ignore comments and empty lines) +JVM_OPTS=$(grep -v '^\s*#' "$JVM_CONFIG_FILE" | grep -v '^\s*$' | xargs) + +# Main class (for reference, though fat-jar specifies it in MANIFEST) +MAIN_CLASS="io.pixelsdb.pixels.sink.PixelsSinkApp" + +# Application arguments +APP_ARGS="-c $PROPERTIES_FILE" + +# Path to the fat jar +APP_JAR="$SINK_DIR/target/pixels-sink-0.2.0-SNAPSHOT-full.jar" + +if [[ ! -f "$APP_JAR" ]]; then + echo "Application jar not found: $APP_JAR" + exit 1 +fi + +echo "Starting PixelsSinkApp..." 
+echo "PIXELS_HOME = $PIXELS_HOME" +echo "JVM_OPTS = $JVM_OPTS" +echo "APP_JAR = $APP_JAR" +echo "APP_ARGS = $APP_ARGS" + +exec java $JVM_OPTS -jar "$APP_JAR" $APP_ARGS diff --git a/pom.xml b/pom.xml index e9896e7..a4919eb 100644 --- a/pom.xml +++ b/pom.xml @@ -36,9 +36,10 @@ 0.9.0 3.8.0 5.8 - 1.18.36 - 3.0.7.Final + 1.18.42 + 3.2.3.Final 1.4.13 + 440 @@ -52,13 +53,36 @@ pixels-core true + + io.pixelsdb + pixels-retina + true + test + + + io.etcd + jetcd-core + true + + + io.netty + netty-all + + + io.grpc + grpc-netty + + + io.trino + trino-jdbc + ${trino.version} + com.alibaba fastjson true - org.apache.logging.log4j log4j-core @@ -111,12 +135,22 @@ kafka-clients ${dep.kafka.version} + + org.apache.kafka + connect-api + ${dep.kafka.version} + com.opencsv opencsv ${dep.opencsv.version} + + com.google.guava + guava + 33.2.0-jre + org.projectlombok @@ -142,7 +176,11 @@ debezium-sink ${dep.debezium.version} - + + io.debezium + debezium-connector-postgres + ${dep.debezium.version} + junit @@ -182,13 +220,17 @@ 2.6.2.Final - io.prometheus simpleclient 0.16.0 + + io.prometheus + simpleclient_hotspot + 0.16.0 + io.prometheus @@ -196,6 +238,18 @@ 0.16.0 + + org.apache.commons + commons-math3 + 3.6.1 + + + + + io.pixelsdb + pixels-storage-localfs + + @@ -257,43 +311,24 @@ develop/images/pixels-sink/Dockerfile - - com.github.os72 - protoc-jar-maven-plugin - 3.3.0.1 - - - generate-sources - - run - - - - src/main/proto - - - - java - src/main/java - - - grpc-java - src/main/java - io.grpc:protoc-gen-grpc-java:1.53.0 - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - 16 - 16 - - + + org.apache.maven.plugins + maven-compiler-plugin + + 17 + 17 + + + org.projectlombok + lombok + ${dep.lombok.version} + + + + -parameters + + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8cb1b70 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +flask +pandas +matplotlib +numpy diff --git a/scripts/README.md 
b/scripts/README.md new file mode 100644 index 0000000..41b7432 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,32 @@ +# Scripts + +This folder contains performance and monitoring helpers for Pixels Sink. Most scripts are used to organize results, aggregate metrics, or prepare datasets for analysis. Some scripts also generate plots. + +## Scripts + +- `perf_flush.py`: + Merges the per-strategy flush-ablation freshness CSVs (first `MAX_SECONDS` seconds of each run) into one side-by-side CSV under `tmp/`. + +- `perf_freshness.py`: + Loads freshness CSVs, filters out warm-up and outlier samples, plots the inverted freshness CDF, and exports the filtered raw values as a CSV. + +- `perf_multi_rate.py`: + Aggregates per-node rate logs from `collected-logs/` into cluster-wide throughput, plots time series, CDF, and node-count scalability, and exports a scalability CSV. + +- `perf_query.py`: + Reads per-configuration result CSVs, filters them by time window and value cap, draws a comparison box plot with summary statistics, and exports the filtered values as a CSV. + +- `perf_rate.py`: + Runs single-rate throughput tests. + +- `perf_retina.py`: + Benchmarks Retina write performance. + +- `perf_retina_2.py`: + Alternative Retina benchmark with different batching or concurrency settings. + +- `perf_retina_3.py`: + Another Retina benchmark variant for comparative testing. + +- `perf_web_monitor.py`: + Pulls metrics from a local folder and displays the results. diff --git a/scripts/perf_flush.py b/scripts/perf_flush.py new file mode 100644 index 0000000..86ac78e --- /dev/null +++ b/scripts/perf_flush.py @@ -0,0 +1,72 @@ +# Copyright 2026 PixelsDB. +# +# This file is part of Pixels. +# +# Pixels is free software: you can redistribute it and/or modify +# it under the terms of the Affero GNU General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# Pixels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Affero GNU General Public License for more details. +# +# You should have received a copy of the Affero GNU General Public +# License along with Pixels. If not, see +# . 
import pandas as pd
import os

##########################################
# Config parameters
##########################################
MAX_SECONDS = 600  # Max window in seconds

# Default mapping: strategy label -> freshness CSV of that flush-ablation run.
# NOTE(review): "High" reads fresh_low.csv and "Low" reads fresh_large.csv;
# presumably the labels and the file names describe different quantities --
# confirm the mapping is intentional.
_DEFAULT_STRATEGIES = {
    "High": "result_ablation/flush/fresh_low.csv",
    "Mid": "result_ablation/flush/fresh_mid.csv",
    "Low": "result_ablation/flush/fresh_large.csv"
}
_DEFAULT_OUTPUT_PATH = "tmp/flush_ablation_combined.csv"


def preprocess_flush_data(strategies=None, max_seconds=None,
                          output_path=_DEFAULT_OUTPUT_PATH):
    """Merge per-strategy freshness CSVs into one side-by-side CSV.

    Each input CSV has no header and three columns: ts, freshness,
    query_time. For every strategy the rows whose time offset from the
    file's first row is at most ``max_seconds`` are kept, and their
    freshness / query_time values become the columns ``<label>_fresh`` and
    ``<label>_query`` of the output. Columns of different strategies are
    aligned by row position; shorter ones are padded with empty cells.

    Args:
        strategies: Mapping of label -> CSV path. Defaults to the three
            flush-ablation result files.
        max_seconds: Window length in seconds. Defaults to the module-level
            ``MAX_SECONDS`` (read at call time).
        output_path: Destination CSV; its parent directory is created.

    Returns:
        The combined DataFrame that was written (empty if no input was
        usable). Missing or empty input files are reported and skipped.
    """
    if strategies is None:
        strategies = _DEFAULT_STRATEGIES
    if max_seconds is None:
        max_seconds = MAX_SECONDS

    frames = []
    for label, path in strategies.items():
        if not os.path.exists(path):
            print(f"Warning: {path} not found.")
            continue

        # An empty file makes read_csv raise; treat it like an empty frame
        # instead of aborting the whole merge.
        try:
            df = pd.read_csv(path, header=None, names=["ts", "freshness", "query_time"])
        except pd.errors.EmptyDataError:
            df = pd.DataFrame()
        if df.empty:
            print(f"Warning: {path} is empty, skipping.")
            continue

        # Relative time in seconds; ts is assumed to be in milliseconds and
        # the first row of each file is T=0.
        start_ts = df["ts"].iloc[0]
        rel_sec = (df["ts"] - start_ts) / 1000.0
        window = df[rel_sec <= max_seconds]

        # reset_index makes every strategy start at row 0 so the columns
        # line up when concatenated horizontally.
        frames.append(pd.DataFrame({
            f"{label}_fresh": window["freshness"].reset_index(drop=True),
            f"{label}_query": window["query_time"].reset_index(drop=True)
        }))

    # One horizontal concat instead of growing the frame inside the loop.
    combined_df = pd.concat(frames, axis=1) if frames else pd.DataFrame()

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    combined_df.to_csv(output_path, index=False)
    print(f"✅ Combined data (First {max_seconds}s) saved to {output_path}")
    return combined_df


if __name__ == "__main__":
    preprocess_flush_data()

# --- Patch text for the next file in this diff, retained verbatim ---
# diff --git a/scripts/perf_freshness.py b/scripts/perf_freshness.py new file mode 100644 index 0000000..c21e8ca --- /dev/null +++ b/scripts/perf_freshness.py @@ -0,0 +1,174 @@
# Copyright 2026 PixelsDB.
#
# This file is part of Pixels.
#
# Pixels is free software: you can redistribute it and/or modify
# it under the terms of the Affero GNU General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# Pixels is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Affero GNU General Public License for more details.
#
# You should have received a copy of the Affero GNU General Public
# License along with Pixels. If not, see
# .

# Reads freshness CSVs (column 0 = epoch-ms timestamp, column 1 = freshness),
# filters warm-up / out-of-window / outlier samples, plots the inverted CDF of
# the raw filtered values and exports them side by side as a CSV.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

##########################################
# Configuration: CSV Files and Labels
##########################################
# Several experiment presets are kept below; only the LAST assignment to
# csv_files takes effect.
# csv_files = {
#     "10k": "result100/fresh_1n4_10k_2.csv",
#     "20k": "result100/fresh_1n4_20k_2.csv",
#     "40k": "result100/fresh_1n4_40k.csv",
#     "80k": "result100/fresh_1n4_80k.csv",
#     "120k": "result100/fresh_1n4_120k_2.csv",
#     # "150k": "result100/nouse_fresh_1n4_150k.csv",
# }

csv_files = {
    "10k": "result100_2/fresh_10k.csv",
    "20k": "result100_2/fresh_20k.csv",
    "40k": "result100_2/fresh_40k.csv",
    "80k": "result100_2/fresh_80k.csv",
    "100k": "result100_2/fresh_100k.csv",
    "120k": "result100_2/fresh_120k.csv",
    "160k": "result100_2/fresh_160k.csv",
    "200k": "result100_2/fresh_200k.csv",
}

csv_files = {
    "10k": "result_ch/fresh_10K.csv",
    "20k": "result_ch/fresh_20K.csv",
    "40k": "result_ch/fresh_40K.csv",
    "80k": "result_ch/fresh_80K.csv",
    "85k": "result_ch/fresh_85K.csv"
}
# csv_files = {
#     "Query Transaction": "tmp/i7i_2k_dec_freshness.csv",
#     "Query Record": "tmp/i7i_2k_record_dec_freshness.csv",
#     "Internal Transaction Context": "tmp/i7i_2k_txn_dec_freshness.csv",
#     "Query Selected Table, Trans Mode": "tmp/i7i_2k_batchtest_dec_freshness_2.csv"
# }


csv_files = {
    "200": "result_lance/fresh_bucket1_batch1000.csv",
    "300": "result_lance/fresh_bucket4_batch1000.csv"
}

MAX_SECONDS = 30000     # Capture data for the first N seconds
SKIP_SECONDS = 10       # Skip the first N seconds (adjustable)
BIN_SECONDS = 10        # Average window (seconds)
MAX_FRESHNESS = 500000  # Filter out useless data during initial warmup


def _load_filtered(path):
    """Load one freshness CSV and apply the time-window and freshness filters.

    Returns a DataFrame with columns ts (datetime), freshness and sec
    (seconds since the first retained sample). The frame is empty when the
    file has no rows or no row survives the filters.
    """
    df_full = pd.read_csv(path, header=None)

    df = pd.DataFrame()
    df["ts"] = pd.to_datetime(df_full.iloc[:, 0], unit="ms")
    # Always take the 2nd column; a 3rd column (query time), if any, is ignored
    df["freshness"] = df_full.iloc[:, 1]
    if df.empty:
        return df

    # Seconds relative to the first sample of the file
    t0 = df["ts"].iloc[0]
    df["sec"] = (df["ts"] - t0).dt.total_seconds()

    mask = (df["sec"] >= SKIP_SECONDS) & \
           (df["sec"] <= MAX_SECONDS) & \
           (df["freshness"] <= MAX_FRESHNESS)
    df = df[mask].copy()

    # Align time axis start to 0 after the skip window was removed
    if not df.empty:
        t_new0 = df["ts"].iloc[0]
        df["sec"] = (df["ts"] - t_new0).dt.total_seconds()
    return df


##########################################
# Data Loading and Processing
##########################################
data = {}               # label -> resampled frame (for the time-series plot)
data_raw_filtered = {}  # label -> filtered raw freshness (CDF plot and export)

for label, path in csv_files.items():
    df = _load_filtered(path)
    if df.empty:
        # Without this guard the resample step below fails on iloc[0]
        print(f"Warning: no usable samples in {path} after filtering; skipping {label}.")
        continue

    # Store filtered raw data for CDF plot
    data_raw_filtered[label] = df["freshness"].copy()

    # Resample only for Plot 1 (Time Series)
    df_bin = df.resample(f"{BIN_SECONDS}s", on="ts").mean().reset_index()
    df_bin["bin_sec"] = (df_bin["ts"] - df_bin["ts"].iloc[0]).dt.total_seconds()
    data[label] = df_bin

##########################################
# Plot 1: (unchanged) Smoothed Time Series
##########################################
# ... (Plot 1 code omitted; same logic as before) ...

##########################################
# Plot 2: Inverted CDF (use raw data_raw_filtered)
##########################################
plt.figure(figsize=(10, 5))

for label, raw_vals in data_raw_filtered.items():
    # Use raw points without resample
    vals = np.sort(raw_vals.dropna())
    prob = np.linspace(0, 1, len(vals))

    plt.plot(prob, vals, label=label)

plt.xticks(np.arange(0, 1.1, 0.1))
plt.xlim(0, 1)
plt.xlabel("CDF (Probability)")
plt.ylabel("Freshness (ms, Raw Data)")
plt.title(
    f"Inverted Freshness CDF\n(Raw Data Points, Skip {SKIP_SECONDS}s)"
)

plt.grid(True, which="both", ls="-", alpha=0.3)
plt.legend()
plt.tight_layout()
plt.savefig("freshness_cdf_raw_fixed_ticks.png")
plt.close()

##########################################
# Data Export: Export Raw Filtered Data
##########################################
# The filter is the same one used for the CDF, so the already-filtered series
# are reused instead of reading and filtering every CSV a second time.
raw_series_list = []

for label, raw_vals in data_raw_filtered.items():
    # Re-index from 0 and name the series after its label so the columns
    # line up in the horizontal merge
    filtered_series = raw_vals.reset_index(drop=True)
    filtered_series.name = label
    raw_series_list.append(filtered_series)

# Merge horizontally and export
if raw_series_list:
    df_export = pd.concat(raw_series_list, axis=1)
    export_filename = "freshness_raw_filtered.csv"
    df_export.to_csv(export_filename, index=False)
    print(f"Filtered raw data exported to: {export_filename}")

# --- Patch text for the next file in this diff, retained verbatim ---
# diff --git a/scripts/perf_multi_rate.py b/scripts/perf_multi_rate.py new file mode 100644 index 0000000..00c9952 --- /dev/null +++ b/scripts/perf_multi_rate.py @@ -0,0 +1,345 @@
# Copyright 2026 PixelsDB.
#
# This file is part of Pixels.
#
# Pixels is free software: you can redistribute it and/or modify
# it under the terms of the Affero GNU General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# Pixels is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Affero GNU General Public License for more details.
#
# You should have received a copy of the Affero GNU General Public
# License along with Pixels. If not, see
# .
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import os
import re
from datetime import datetime, date

##########################################
# Configuration: Labels and Base Directory
##########################################
# Several experiment presets are kept below; only the LAST assignment to
# csv_labels takes effect.
# csv_labels = {
#     "1 Node": "nodes_1_rate_2.csv",
#     "2 Nodes": "nodes_2_rate_2.csv",
#     "4 Nodes": "nodes_4_rate_2.csv",
#     "8 Nodes": "nodes_8_rate_2.csv",
#     # "16 Nodes 8WB": "nodes_16_rate_2.csv",
#     # "16 Nodes 16WB": "nodes_16_rate_3.csv",
#     # "16 Nodes": "nodes_16_rate_4.csv",
#     "16 Nodes": "nodes_16_rate_16c.csv",

# }

csv_labels = {
    # "4 Node 6 Client": "test_rate_4nodes_6c.csv",
    "100k": "test_rate_1n16_1c_batch50.csv",
    "120k": "rate_1n16_120k_batch500.csv",
    "160k 2Client": "rate_1n16c2_160k_batch500.csv",
    "180k 3Client": "rate_1n16c3_180k_batch500.csv",
    "200k 2Client": "rate_1n16c2_200k_batch500.csv",
    "240k 3Client": "rate_1n16c3_240k_batch500.csv",
}

csv_labels = {
    "1Node" : "rate_1n16_120k_batch500.csv",
    "2Nodes": "rate_2n162c_250k.csv",
    "4Nodes": "rate_4n162c_500k.csv",
    "8Nodes": "rate_8n164c.csv",
    # "16Nodes": "rate_16n168c_250k.csv",
    # "16Nodes_2": "rate_16n168c.csv",
    # "16Nodes_2": "rate_16n168c_180k.csv",
    "16Nodes": "rate_16n168c_500k.csv"
}

csv_labels = {
    "1Node" : "rate_200k.csv",
    "2Nodes": "0130_rate_2node.csv",
    "4Nodes": "0130_rate_4node.csv",
    "8Nodes": "0130_rate_8node.csv",
    # "16Nodes": "rate_16n168c_500k.csv"
}
LOG_BASE_DIR = "collected-logs"
# "interval_sec" holds the precise delta time reported by the Java logs
COL_NAMES_NEW = ["time", "rows", "txns", "debezium", "serdRows", "serdTxs", "interval_sec"]
NUMERIC_COLS = ["rows", "txns", "debezium", "serdRows", "serdTxs"]
PLOT_COL = "rows"  # This represents 'rows' ops

MAX_SECONDS = 1800
SKIP_SECONDS = 30
BIN_SECONDS = 5

data = {}  # label -> binned cluster-wide throughput frame


def _node_count(label):
    """Return the first integer found in *label*, or 999 if there is none."""
    match = re.search(r'\d+', label)
    return int(match.group()) if match else 999


def export_scalability_matrix(data_dict, output_file="scalability_results.csv"):
    """Write the PLOT_COL throughput of every experiment as one CSV column.

    Columns are ordered by the node count parsed from each label and named
    ``<count>node``. NaN samples are dropped first and every column is then
    truncated to the shortest remaining series, so all columns have the same
    length.

    Args:
        data_dict: Mapping of label -> DataFrame containing a PLOT_COL column.
        output_file: Destination CSV path.

    Returns:
        The exported DataFrame, or None when data_dict is empty.
    """
    if not data_dict:
        print("[Export] No data to export.")
        return None

    # 1. Sort experiments by node count (1, 2, 4, 8, 16)
    sorted_labels = sorted(data_dict.keys(), key=_node_count)

    # 2. Drop NaN BEFORE measuring lengths; measuring the raw column could
    #    give a min_len larger than a cleaned series and produce columns of
    #    unequal length.
    series_by_label = {
        label: data_dict[label][PLOT_COL].dropna().values for label in sorted_labels
    }
    min_len = min(len(values) for values in series_by_label.values())
    print(f"\n[Export] Aligned sample count: {min_len}")

    # 3. Build result matrix, truncated to min_len
    matrix_data = {
        label: values[:min_len].astype(int) for label, values in series_by_label.items()
    }

    # 4. Convert to DataFrame and rename columns (1node, 2node...)
    df_result = pd.DataFrame(matrix_data)
    rename_map = {lbl: f"{_node_count(lbl)}node" for lbl in sorted_labels}
    df_result = df_result.rename(columns=rename_map)

    # Export
    df_result.to_csv(output_file, index=False)

    print(f"--- Performance data matrix (first 5 rows) ---")
    print(df_result.head(5))
    print(f"---------------------------")
    print(f"CSV file generated: {output_file}")
    return df_result


# The logs carry only a time of day; anchor all of them to one calendar date.
# Evaluated once so a run that crosses midnight cannot mix two dates.
_TODAY = date.today()

for label, filename in csv_labels.items():
    print(f"Processing Experiment: {label}")
    all_node_data = []

    if not os.path.exists(LOG_BASE_DIR):
        print(f"Directory {LOG_BASE_DIR} not found!")
        continue

    nodes = [d for d in os.listdir(LOG_BASE_DIR) if os.path.isdir(os.path.join(LOG_BASE_DIR, d))]

    for node in nodes:
        path = os.path.join(LOG_BASE_DIR, node, filename)
        if not os.path.exists(path):
            continue
        print(f" Reading node {node} -> {filename}")

        # Peek at the first line to tell 6-column logs from 7-column logs
        with open(path, 'r') as f:
            first_line = f.readline()
        col_count = len(first_line.split(','))

        current_cols = COL_NAMES_NEW if col_count >= 7 else COL_NAMES_NEW[:6]
        df = pd.read_csv(path, header=None, names=current_cols, sep=',')

        # Standardize Time
        df["ts"] = pd.to_datetime(df["time"], format="%H:%M:%S", errors='coerce')
        df = df.dropna(subset=["ts"]).copy()
        if df.empty:
            # No parsable timestamps: nothing to aggregate for this node
            continue
        df["ts"] = df["ts"].dt.time.apply(lambda x: datetime.combine(_TODAY, x))

        for col in NUMERIC_COLS:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        if "interval_sec" not in df.columns:
            # Derive the interval from consecutive timestamps; sort first so
            # the differences are chronological
            df = df.sort_values("ts")

            # shift(-1) looks at the next row's timestamp
            df["interval_sec"] = (df["ts"].shift(-1) - df["ts"]).dt.total_seconds()

            # The last row has no successor: fall back to the mean interval
            mean_interval = df["interval_sec"].mean()
            if pd.isna(mean_interval) or mean_interval <= 0:
                mean_interval = 1.0  # Absolute fallback if only one row exists

            df["interval_sec"] = df["interval_sec"].fillna(mean_interval)
        else:
            # If the column exists, trust the Java-side measured interval
            df["interval_sec"] = pd.to_numeric(df["interval_sec"], errors='coerce').fillna(1.0)

        # Reconstruct absolute quantities: Ops/s * seconds = events in interval
        for col in NUMERIC_COLS:
            df[f"{col}_total_events"] = df[col] * df["interval_sec"]

        all_node_data.append(df)

    if not all_node_data:
        continue

    combined_raw = pd.concat(all_node_data).sort_values("ts")
    t0 = combined_raw["ts"].iloc[0]
    combined_raw["sec_from_start"] = (combined_raw["ts"] - t0).dt.total_seconds()

    # Filter time range
    filtered_df = combined_raw[(combined_raw["sec_from_start"] >= SKIP_SECONDS) &
                               (combined_raw["sec_from_start"] <= SKIP_SECONDS + MAX_SECONDS + BIN_SECONDS)].copy()

    if filtered_df.empty:
        continue

    # Aggregation via Resampling: sum the reconstructed events of all nodes
    # that fall into each bin
    df_bin = filtered_df.set_index("ts").resample(
        f"{BIN_SECONDS}s",
        origin='start'
    ).sum(numeric_only=True).reset_index()

    # --- Unit Conversion ---
    for col in NUMERIC_COLS:
        # Cluster Ops = (Sum of all events from all nodes) / (Bin Duration)
        df_bin[col] = df_bin[f"{col}_total_events"] / BIN_SECONDS

    # Final Alignment
    df_bin["bin_sec"] = (df_bin["ts"] - df_bin["ts"].iloc[0]).dt.total_seconds()
    df_bin = df_bin[df_bin["bin_sec"] <= MAX_SECONDS]

    data[label] = df_bin


##########################################
# Plotting
##########################################
if not data:
    print("No data available to plot.")
    # exit() is provided by the site module for interactive use; raise instead
    raise SystemExit(0)

# Plot 1: Time Series
plt.figure(figsize=(12, 6))
for label, df in data.items():
    plt.plot(df["bin_sec"], df[PLOT_COL], marker='o', markersize=4, label=label)

plt.xlim(0, MAX_SECONDS)
plt.xticks(np.arange(0, MAX_SECONDS + 1, 300))
plt.ylim(bottom=0)

# style='plain' disables scientific notation; axis='y' limits it to the y-axis
plt.ticklabel_format(style='plain', axis='y')

# For very large values, use ScalarFormatter to avoid offset
# plt.gca().yaxis.set_major_formatter(ticker.ScalarFormatter(useOffset=False))

plt.xlabel("Time (sec)")
plt.ylabel(f"Total Cluster {PLOT_COL} (Ops/s, {BIN_SECONDS}s bin)")
plt.title(f"Cluster Aggregate Throughput: {PLOT_COL} (Ops/s)")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(f"cluster_rate_{PLOT_COL}_time.png")
plt.close()

# Plot 2: CDF
plt.figure(figsize=(10, 5))
for label, df in data.items():
    vals = np.sort(df[PLOT_COL].dropna())
    if len(vals) > 0:
        y = np.linspace(0, 1, len(vals))
        plt.plot(vals, y, label=label)

plt.xlim(left=0)
plt.xlabel(f"Aggregate Cluster {PLOT_COL} (Ops/s)")
plt.ylabel("CDF")
plt.title(f"Throughput Distribution CDF: {PLOT_COL}")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(f"cluster_rate_{PLOT_COL}_cdf.png")
plt.close()

print(f"\nProcessing Complete. Metrics plotted as Total Cluster Ops/s.")

# Plot 1 again, with the Y axis in units of 100k Ops/s
plt.figure(figsize=(12, 6))
for label, df in data.items():
    # Plot uses raw data to keep trends accurate
    plt.plot(df["bin_sec"], df[PLOT_COL], marker='o', markersize=4, label=label)

# --- X-axis Scaling ---
plt.xlim(0, MAX_SECONDS)
plt.xticks(np.arange(0, MAX_SECONDS + 1, 300))

# --- Y-axis Scaling: 1, 2, 3, 4 (Units of 100k) ---
# Force Y axis to start at 0
plt.ylim(bottom=0)


def units_of_100k(x, pos):
    """Tick formatter: 200000 -> "2"."""
    return f'{int(x / 100000)}' if x != 0 else '0'


# Major ticks every 100,000, labelled in units of 100k
plt.gca().yaxis.set_major_locator(ticker.MultipleLocator(100000))
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(units_of_100k))

plt.ylabel(f"Total Cluster {PLOT_COL} (x 100k Ops/s)")
plt.xlabel("Time (sec)")
plt.title(f"Cluster Aggregate Throughput ({PLOT_COL})")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(f"cluster_rate_{PLOT_COL}_time_units.png")
plt.close()

##########################################
# Plot 3: Scalability (Using Filtered Data)
##########################################
plt.figure(figsize=(10, 6))

# 1. Pair every experiment's node count with its filtered, resampled series.
#    The same label parser as the CSV export is used so both agree.
node_results = []
for label, df in data.items():
    series = df[PLOT_COL].dropna()
    node_results.append((_node_count(label), series.values))

# 2. Sort to ensure correct line connections
node_results.sort(key=lambda x: x[0])
sorted_nodes = [x[0] for x in node_results]
sorted_values = [x[1] for x in node_results]
means = [np.mean(v) for v in sorted_values]

# 3. Plot: box widths grow with the position so they look even on a log2 axis
plt.boxplot(sorted_values, positions=sorted_nodes, widths=np.array(sorted_nodes) * 0.2)

# Plot mean trend line
plt.plot(sorted_nodes, means, marker='o', markersize=8, linestyle='-',
         linewidth=2, label='Mean Throughput (Filtered)', color='#1f77b4')

# 4. Axes and styling (use log2 to show scalability)
plt.xscale('log', base=2)
plt.xticks(sorted_nodes, labels=[f"{n}N" for n in sorted_nodes])
# plt.yscale('log')  # Throughput often uses log axis to see linear growth slope

# Format Y axis (disable sci notation or use 100k logic)
plt.gca().yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.ticklabel_format(style='plain', axis='y')

plt.xlabel("Cluster Size (Nodes, log2 scale)")
plt.ylabel(f"Filtered {PLOT_COL} (Ops/s, {BIN_SECONDS}s bin)")
plt.title(f"Throughput Scalability: {PLOT_COL} (Filtered Data)")
plt.grid(True, which="both", linestyle='--', alpha=0.5)
plt.legend()
plt.tight_layout()

plt.savefig(f"cluster_scalability_{PLOT_COL}_final.png")
plt.close()

export_scalability_matrix(data)

# --- Patch text for the next file in this diff, retained verbatim ---
# diff --git a/scripts/perf_query.py b/scripts/perf_query.py new file mode 100644 index 0000000..e16d92d --- /dev/null +++ b/scripts/perf_query.py @@ -0,0 +1,166 @@
# Copyright 2026 PixelsDB.
#
# This file is part of Pixels.
#
# Pixels is free software: you can redistribute it and/or modify
# it under the terms of the Affero GNU General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# Pixels is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Affero GNU General Public License for more details.
#
# You should have received a copy of the Affero GNU General Public
# License along with Pixels. If not, see
# .

# Reads one result CSV per configuration, filters the samples by time window
# and value cap, draws a comparison box plot, prints summary statistics and
# exports the filtered values side by side as a CSV.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os

##########################################
# 1. Config: file paths and parameters
##########################################
# Two presets are kept; only the LAST data_dir / file_groups pair takes effect.
data_dir = "result_ablation/gc"

# Strict display order
file_groups = {
    "Static": "static.csv",
    "10s": "fresh_10s_2.csv",
    "30s": "fresh_30s_size.csv",
    "60s": "fresh_60s_size.csv",
    "120s": "fresh_120s_size.csv",
    "240s": "fresh_240s.csv",
}


data_dir = "result1k2_feb"
file_groups = {
    "512": "fresh_512tile.csv",
    "4096": "fresh_4096tile.csv"
}

# Filter parameters
SKIP_SECONDS = 0
MAX_SECONDS = 1200
MAX_FRESHNESS = 50000

##########################################
# 2. Data load and processing
##########################################
plot_data = {}  # label -> filtered series of the third CSV column

for label, filename in file_groups.items():
    path = os.path.join(data_dir, filename)
    if not os.path.exists(path):
        print(f"❌ File not found: {path}")
        continue

    # Read CSV (3 columns: ts, dummy, freshness)
    # NOTE(review): the plot below labels this third column "Query Time";
    # confirm which metric the column actually holds.
    try:
        df = pd.read_csv(path, header=None, names=["ts", "dummy", "freshness"])
        print(f"📖 Read {label}: {len(df)} rows of raw data")

        if df.empty:
            print(f"⚠️ {label} file is empty")
            continue

        # Time axis transform (align to 0 seconds)
        df["ts_dt"] = pd.to_datetime(df["ts"], unit="ms")
        t0 = df["ts_dt"].iloc[0]
        df["sec"] = (df["ts_dt"] - t0).dt.total_seconds()

        # Data filtering
        mask = (df["sec"] >= SKIP_SECONDS) & \
               (df["sec"] <= MAX_SECONDS) & \
               (df["freshness"] <= MAX_FRESHNESS)

        clean_series = df.loc[mask, "freshness"].dropna()

        if clean_series.empty:
            print(f"⚠️ {label} No data left after filtering! (check SKIP_SECONDS)")
        else:
            plot_data[label] = clean_series
            print(f"✅ {label} usable data points: {len(clean_series)}")

    except Exception as e:
        # Best effort: report the failing file and go on with the others
        print(f"❌ Error processing {label}: {e}")

##########################################
# 3. Box plot
##########################################
if not plot_data:
    print("‼️ No data to plot; check CSV content and paths.")
else:
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(10, 6))

    labels = list(plot_data.keys())
    values = [plot_data[label] for label in labels]

    # Draw box plot. Tick labels are set through xticks below rather than the
    # boxplot `labels` keyword, which newer Matplotlib releases deprecate.
    box_plot = plt.boxplot(
        values,
        patch_artist=True,
        showmeans=True,
        meanprops={"marker": "D", "markerfacecolor": "white", "markeredgecolor": "black", "markersize": 5},
        flierprops={'marker': 'o', 'markersize': 2, 'markerfacecolor': 'gray', 'alpha': 0.1}
    )
    # Boxes are drawn at positions 1..N by default
    plt.xticks(range(1, len(labels) + 1), labels)

    # Color scheme
    colors = sns.color_palette("husl", len(labels))
    for patch, color in zip(box_plot['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

    plt.title("Freshness Ablation Study: Query Time Comparison", fontsize=14, pad=20)
    plt.ylabel("Query Time (ms)", fontsize=12, fontweight='bold')
    plt.xlabel("Configuration", fontsize=12, fontweight='bold')

    # Auto-set Y range from data; capping at 1.2 x P99 hides extreme outliers
    all_vals = pd.concat(list(plot_data.values()))
    plt.ylim(0, all_vals.quantile(0.99) * 1.2)

    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    save_path = "freshness_boxplot_final.png"
    plt.savefig(save_path, dpi=300)
    print(f"\n🚀 Plot saved to: {save_path}")

    # Print final stats comparison
    print("\n--- Final Statistics ---")
    for label in labels:
        s = plot_data[label]
        print(f"{label:10} | Mean: {s.mean():8.2f}ms | Median: {s.median():8.2f}ms | P95: {s.quantile(0.95):8.2f}ms")

    plt.show()

##########################################
# 4. Transpose export: each column is a config
##########################################
if plot_data:
    # Collect the series in the order defined by file_groups
    export_columns = []
    for label in file_groups.keys():
        if label in plot_data:
            # Re-index from 0 and use the label as the CSV header
            s = plot_data[label].reset_index(drop=True)
            s.name = label
            export_columns.append(s)

    # Merge horizontally (axis=1), one config per column
    df_export = pd.concat(export_columns, axis=1)

    # Export CSV without index
    export_filename = "gc_interval_query.csv"
    df_export.to_csv(export_filename, index=False)

    print(f"\n📊 Exported successfully: {export_filename}")
    print(f"Table shape: {df_export.shape} (rows x cols)")
    print(df_export.head())  # Print preview rows

# --- Patch text for the next file in this diff, retained verbatim ---
# diff --git a/scripts/perf_rate.py b/scripts/perf_rate.py new file mode 100644 index 0000000..b0d04f1 --- /dev/null +++ b/scripts/perf_rate.py @@ -0,0 +1,323 @@
# Copyright 2026 PixelsDB.
#
# This file is part of Pixels.
#
# Pixels is free software: you can redistribute it and/or modify
# it under the terms of the Affero GNU General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# Pixels is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Affero GNU General Public License for more details.
#
# You should have received a copy of the Affero GNU General Public
# License along with Pixels. If not, see
# .
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+from datetime import datetime, date
+
+##########################################
+# Configuration: CSV Files and Labels
+##########################################
+# Legacy flat layout (label -> rate CSV), kept for reference only. It is not
+# compatible with the loaders below, which expect one dict per label.
+# csv_files = {
+#     # "10k": "resulti7i/10k_rate_2.csv",
+#     # "20k": "resulti7i/20k_rate_2.csv",
+#     # "30k": "resulti7i/30k_rate_2.csv",
+#     # "40k": "resulti7i/40k_rate_2.csv",
+#     # "50k": "resulti7i/50k_rate.csv",
+#     # "60k": "resulti7i/60k_rate_2.csv",
+#     # "80k": "resulti7i/80k_rate_2.csv",
+#     # "100k": "resulti7i_100/100k_rate.csv",
+#     "100k": "result1k2/test_rate_1n16_1c_batch50.csv",
+# }
+
+# Named file sets. Each label maps to {"rate": path} plus an optional
+# freshness CSV under the key "fresh" or "freshness". Labels must be integer
+# strings because the box plots order them numerically.
+CSV_FILE_SETS = {
+    "result_ablation": {
+        "64": {
+            "rate": "result_ablation/rate_64size.csv",
+            # "fresh": "result_ablation/fresh_64size.csv",
+        },
+        "128": {
+            "rate": "result_ablation/rate_128size.csv",
+            # "fresh": "result_ablation/fresh_128size.csv",
+        },
+        "256": {
+            "rate": "result_ablation/rate_256size.csv",
+            # "fresh": "result_ablation/fresh_256size.csv",
+        },
+        "512": {
+            "rate": "result_ablation/rate_512size.csv",
+            # "fresh": "result_ablation/fresh_512size.csv",
+        },
+        "1024": {
+            "rate": "result_ablation/rate_1024_2size.csv",
+            "fresh": "result_ablation/fresh_1024_2size.csv",
+        },
+        "2048": {
+            "rate": "result_ablation/rate_2048size.csv",
+            # "fresh": "result_ablation/fresh_2048size.csv",
+        },
+        "4096": {
+            "rate": "result_ablation/rate_4096size.csv",
+            # "fresh": "result_ablation/fresh_4096size.csv",
+        },
+        "8192": {
+            "rate": "result_ablation/rate_8192size.csv",
+            # "fresh": "result_ablation/fresh_8192size.csv",
+        },
+        "16384": {
+            "rate": "result_ablation/rate_16384size.csv",
+            "fresh": "result_ablation/fresh_16384size.csv",
+        },
+    },
+    "result_res": {
+        "200": {
+            "rate": "result_res/rate_200k.csv",
+            "freshness": "result_res/fresh_200k.csv",
+        },
+    },
+    "result1k2_2_ablation": {
+        "64": {
+            "rate": "result1k2_2/ablation/rate_64size.csv",
+            "fresh": "result1k2_2/ablation/fresh_64size.csv",
+        },
+        "128": {
+            "rate": "result1k2_2/ablation/rate_128size.csv",
+            "fresh": "result1k2_2/ablation/fresh_128size.csv",
+        },
+        "256": {
+            "rate": "result1k2_2/ablation/rate_256size.csv",
+            "fresh": "result1k2_2/ablation/fresh_256size.csv",
+        },
+        "512": {
+            "rate": "result1k2_2/ablation/rate_512size.csv",
+            "fresh": "result1k2_2/ablation/fresh_512size.csv",
+        },
+        "1024": {
+            "rate": "result1k2_2/ablation/rate_1024size.csv",
+            "fresh": "result1k2_2/ablation/fresh_1024size.csv",
+        },
+        "2048": {
+            "rate": "result1k2_2/ablation/rate_2048size.csv",
+            "fresh": "result1k2_2/ablation/fresh_2048.size.csv",  # Note: ls output includes .size here
+        },
+        "4096": {
+            "rate": "result1k2_2/ablation/rate_4096size.csv",
+            "fresh": "result1k2_2/ablation/fresh_4096.size.csv",  # Note: ls output includes .size here
+        },
+        "8192": {
+            "rate": "result1k2_2/ablation/rate_8192size_2.csv",
+            "fresh": "result1k2_2/ablation/fresh_8192.size.csv",  # Note: ls output includes .size here
+        },
+        "16384": {
+            "rate": "result1k2_2/ablation/rate_16384size_2.csv",
+            "fresh": "result1k2_2/ablation/fresh_16384size.csv",
+        },
+    },
+}
+
+# Pick the file set to analyse by name instead of rebinding csv_files.
+ACTIVE_FILE_SET = "result1k2_2_ablation"
+csv_files = CSV_FILE_SETS[ACTIVE_FILE_SET]
+
+COL_NAMES = ["time", "rows", "txns", "debezium", "serdRows", "serdTxs", "lastTime"]
+PLOT_COL = "rows"
+
+MAX_SECONDS = 1800   # length of the analysed window, counted after the skip
+SKIP_SECONDS = 100   # warm-up period dropped from the start of every run
+BIN_SECONDS = 2      # width of the averaging bins
+
+##########################################
+# Load & process RATE data
+##########################################
+
+rate_ts_data = {}    # label -> binned DataFrame (with a "bin_sec" column)
+rate_box_data = {}   # label -> Series of binned PLOT_COL values
+
+# The rate log only records HH:MM:SS, so every sample is pinned to today's
+# date; only the differences between timestamps are used afterwards.
+day_start = datetime.combine(date.today(), datetime.min.time())
+
+for label, files in csv_files.items():
+    rate_path = files["rate"]
+    # Skip missing files the same way the freshness loader does, rather than
+    # aborting the whole run on the first absent CSV.
+    if not os.path.exists(rate_path):
+        print(f"Rate file not found, skipping {label}: {rate_path}")
+        continue
+
+    df = pd.read_csv(rate_path, header=None, names=COL_NAMES)
+
+    df["ts"] = pd.to_datetime(df["time"], format="%H:%M:%S", errors="coerce")
+    df = df.dropna(subset=["ts"]).copy()
+    if df.empty:
+        print(f"No parsable timestamps, skipping {label}: {rate_path}")
+        continue
+
+    # Offset of every sample from midnight.
+    time_of_day = df["ts"] - df["ts"].dt.normalize()
+    # A backwards jump of more than 12 hours between consecutive rows is
+    # treated as the clock passing midnight, so later rows move to the next
+    # day. Assumes the log is written roughly in chronological order.
+    day_wraps = (time_of_day.diff() < pd.Timedelta(hours=-12)).cumsum()
+    df["ts"] = time_of_day + pd.to_timedelta(day_wraps, unit="D") + day_start
+
+    df = df.sort_values("ts")
+    df["sec"] = (df["ts"] - df["ts"].iloc[0]).dt.total_seconds()
+
+    df = df[df["sec"] >= SKIP_SECONDS].copy()
+    if df.empty:
+        print(f"No samples after the first {SKIP_SECONDS}s, skipping {label}")
+        continue
+
+    # Re-base the clock on the first sample kept after the warm-up skip.
+    df["sec"] = (df["ts"] - df["ts"].iloc[0]).dt.total_seconds()
+    df = df[df["sec"] <= MAX_SECONDS].copy()
+
+    for col in ["rows", "txns", "debezium", "serdRows", "serdTxs"]:
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    df = df.set_index("ts")
+    df_bin = df.resample(f"{BIN_SECONDS}s").mean(numeric_only=True).reset_index()
+
+    if not df_bin.empty:
+        df_bin["bin_sec"] = (df_bin["ts"] - df_bin["ts"].iloc[0]).dt.total_seconds()
+        rate_ts_data[label] = df_bin
+        rate_box_data[label] = df_bin[PLOT_COL].dropna()
+
+# Without any rate data the plots would be empty and the export below would
+# fail inside pd.concat, so stop with a clear message instead.
+if not rate_box_data:
+    raise SystemExit("No rate data could be loaded; check the paths in csv_files.")
+
+##########################################
+# Load FRESHNESS data
+##########################################
+
+fresh_val_data = {}  # label -> Series of freshness values
+
+for label, files in csv_files.items():
+    # Support both freshness and fresh keys
+    fresh_path = files.get("freshness") or files.get("fresh")
+    if fresh_path is None or not os.path.exists(fresh_path):
+        continue
+
+    # As specified: col2 is freshness, col3 is query_time
+    df = pd.read_csv(
+        fresh_path,
+        header=None,
+        names=["ts", "freshness_val", "query_time"],
+    )
+
+    # Convert column 2
+    df["freshness_val"] = pd.to_numeric(df["freshness_val"], errors="coerce")
+    df = df.dropna(subset=["freshness_val"])
+
+    if not df.empty:
+        # Store freshness values from column 2
+        fresh_val_data[label] = df["freshness_val"]
+
+##########################################
+# Plot 1: Rate over time
+##########################################
+
+plt.figure(figsize=(10, 5))
+
+for label, df in rate_ts_data.items():
+    plt.plot(df["bin_sec"], df[PLOT_COL], label=label)
+
+plt.xlabel("Time (sec)")
+plt.ylabel(f"Throughput ({BIN_SECONDS}s average)")
+plt.title("Throughput Over Time")
+plt.legend(title="Rate")
+plt.grid(True, linestyle="--", alpha=0.6)
+
+plt.tight_layout()
+plt.savefig("rate_over_time.png")
+plt.close()
+
+##########################################
+# Plot 2: Throughput Boxplot (Standalone)
+##########################################
+
+# Numeric order ("64" before "1024"); reused by plot 3 and the export.
+sorted_labels = sorted(rate_box_data, key=int)
+x_indices = np.arange(len(sorted_labels))
+
+fig, ax1 = plt.subplots(figsize=(10, 6))
+
+rate_boxes = [rate_box_data[k] for k in sorted_labels]
+rate_means = [rate_box_data[k].mean() for k in sorted_labels]
+
+box_rate = ax1.boxplot(
+    rate_boxes,
+    positions=x_indices,
+    widths=0.4,
+    patch_artist=True,
+    showmeans=True,
+)
+for patch in box_rate["boxes"]:
+    patch.set_facecolor("skyblue")
+    patch.set_alpha(0.7)
+
+ax1.plot(x_indices, rate_means, color="blue", marker="o", linestyle="-", linewidth=2, label="Throughput Mean")
+
+ax1.set_xticks(x_indices)
+ax1.set_xticklabels(sorted_labels)
+ax1.set_xlabel("Block Size")
+ax1.set_ylabel("Throughput (rows / s)")
+ax1.set_title("Throughput Distribution vs Block Size")
+ax1.grid(True, axis="y", linestyle="--", alpha=0.5)
+ax1.legend(loc="upper left")
+
+plt.tight_layout()
+plt.savefig("boxplot_throughput.png")
+plt.close()
+
+##########################################
+# Plot 3: Freshness Boxplot (Standalone)
+##########################################
+
+# Plot only labels with freshness data
+fresh_labels = [label for label in sorted_labels if label in fresh_val_data]
+
+if fresh_labels:
+    fig, ax2 = plt.subplots(figsize=(10, 6))
+
+    f_x_indices = np.arange(len(fresh_labels))
+    # Extract freshness values from column 2
+    fresh_boxes = [fresh_val_data[k] for k in fresh_labels]
+    fresh_means = [fresh_val_data[k].mean() for k in fresh_labels]
+
+    box_fresh = ax2.boxplot(
+        fresh_boxes,
+        positions=f_x_indices,
+        widths=0.4,
+        patch_artist=True,
+        showmeans=True,
+    )
+    for patch in box_fresh["boxes"]:
+        patch.set_facecolor("salmon")  # Use different color for freshness
+        patch.set_alpha(0.7)
+
+    ax2.plot(f_x_indices, fresh_means, color="red", marker="D", linestyle="-", linewidth=2, label="Freshness Mean")
+
+    ax2.set_xticks(f_x_indices)
+    ax2.set_xticklabels(fresh_labels)
+    ax2.set_xlabel("Block Size")
+    ax2.set_ylabel("Freshness (Value)")
+    ax2.set_title("Freshness (Column 2) Distribution vs Block Size")
+    ax2.grid(True, axis="y", linestyle="--", alpha=0.5)
+    ax2.legend(loc="upper left")
+
+    plt.tight_layout()
+    plt.savefig("boxplot_freshness_value.png")
+    # The figure stays open; it is shown after the export below so an
+    # interactive window cannot hold up the CSV output.
+
+##########################################
+# Export: Preprocessed Rate Data (Transposed)
+##########################################
+
+# One column per label in numeric order. The index is reset so the columns
+# line up from row 0; shorter columns are padded with empty cells.
+export_series = []
+for label in sorted_labels:
+    s = rate_box_data[label].reset_index(drop=True)
+    s.name = label
+    export_series.append(s)
+
+df_rate_transposed = pd.concat(export_series, axis=1)
+
+export_filename = "rate_preprocessed_transposed.csv"
+df_rate_transposed.to_csv(export_filename, index=False)
+
+print("\nTask completed:")
+print("1. Throughput trend plot -> rate_over_time.png")
+print("2. Throughput boxplot -> boxplot_throughput.png")
+if fresh_labels:
+    print("3. Freshness (col2) boxplot -> boxplot_freshness_value.png")
+else:
+    print("3. Freshness (col2) boxplot skipped: no freshness data found")
+print(f"4. Preprocessed rate data -> {export_filename}")
+
+if fresh_labels:
+    plt.show()
+    plt.close()
diff --git a/scripts/perf_retina.py b/scripts/perf_retina.py
new file mode 100644
index 0000000..5f3cfdf
--- /dev/null
+++ b/scripts/perf_retina.py
@@ -0,0 +1,182 @@
+# Copyright 2026 PixelsDB.
+#
+# This file is part of Pixels.
+#
+# Pixels is free software: you can redistribute it and/or modify
+# it under the terms of the Affero GNU General Public License as
+# published by the Free Software Foundation, either version 3 of
+# the License, or (at your option) any later version.
+#
+# Pixels is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# Affero GNU General Public License for more details.
+#
+# You should have received a copy of the Affero GNU General Public
+# License along with Pixels. If not, see
+# <https://www.gnu.org/licenses/>.
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import re
+import os
+import numpy as np
+
+##########################################
+# 1. 
Config parameters +########################################## +SET_NAME = "retina_realtime-pixels-retina_8192tile_3" +LOG_FILE = "collected-retina-logs/" + SET_NAME + ".out" +OUTPUT_BASE = "collected-retina-logs/tile/" + SET_NAME +RESAMPLE_INTERVAL = '10s' +GI_FACTOR = 1024**3 +MAX_SECONDS = 2400 # Max window in seconds + +def parse_and_plot(log_path, output_name, interval='10s'): + # --- 2. Regex configuration --- + rdb_re = re.compile( + r"(?P