diff --git a/docs/antora.yml b/docs/antora.yml index a965ce82..2a73d0cc 100644 --- a/docs/antora.yml +++ b/docs/antora.yml @@ -9,16 +9,31 @@ nav: asciidoc: attributes: company: 'DataStax' - cdc_cass_first: 'DataStax CDC for Apache Cassandra(R)' - cdc_cass: 'CDC for Cassandra' - csc_pulsar_first: 'DataStax Cassandra Source Connector for Apache Pulsar(TM)' - csc_pulsar: 'CSC for Pulsar' + product: 'DataStax CDC for Apache Cassandra(R)' + product-short: 'CDC for Cassandra' + product-repo: 'https://github.com/datastax/cdc-apache-cassandra' cdc_agent_first: 'DataStax Change Agent for Apache Cassandra(R)' cdc_agent: 'Change Agent for Cassandra' luna_version: '3.1' version: '2.3.6' # cdc-apache-cassandra latest {version} dse: 'DataStax Enterprise (DSE)' dse-short: 'DSE' + metrics-collector: 'DSE Metrics Collector' + opscenter: 'DSE OpsCenter' + astra-db: 'Astra DB' + astra-stream: 'Astra Streaming' + astra-streaming-examples-repo: 'https://github.com/datastax/astra-streaming-examples' + scb: 'Secure Connect Bundle (SCB)' + dsbulk-repo: 'https://github.com/datastax/dsbulk' + dsbulk: 'DataStax Bulk Loader (DSBulk)' + dsbulk-short: 'DSBulk' cass-reg: 'Apache Cassandra(R)' cass: 'Apache Cassandra' cass-short: 'Cassandra' + pulsar-reg: 'Apache Pulsar(TM)' + pulsar: 'Apache Pulsar' + pulsar-short: 'Pulsar' + + # Attributes used in auto-generated content - Do not change the attribute name + csc_pulsar_first: 'DataStax Cassandra Source Connector (CSC) for Apache Pulsar(TM)' + csc_pulsar: 'CSC for Pulsar' \ No newline at end of file diff --git a/docs/local-preview-playbook.yml b/docs/local-preview-playbook.yml index 13953d9f..55c142db 100644 --- a/docs/local-preview-playbook.yml +++ b/docs/local-preview-playbook.yml @@ -55,23 +55,176 @@ asciidoc: xrefstyle: short # CUSTOM ATTRIBUTES company: 'DataStax' + trust-center: 'IBM Trust Center' + trust-center-url: 'https://www.ibm.com/trust' + trust-center-link: '{trust-center-url}[{trust-center}]' + support-url: 
'https://www.ibm.com/mysupport/s/' + dsbulk: 'DataStax Bulk Loader (DSBulk)' + dsbulk-short: 'DSBulk' + dsbulk-repo: 'https://github.com/datastax/dsbulk' + astra: 'Astra' + astra-db: 'Astra DB' + astra-ui: 'Astra Portal' + astra-url: 'https://astra.datastax.com' + astra-ui-link: '{astra-url}[{astra-ui}^]' + db-classic: 'Managed Cluster' + db-serverless: 'Serverless (non-vector)' + db-serverless-vector: 'Serverless (vector)' + scb: 'Secure Connect Bundle (SCB)' + scb-short: 'SCB' + scb-brief: 'Secure Connect Bundle' + devops-api: 'DevOps API' + devops-api-ref-url: 'xref:astra-api-docs:ROOT:attachment$devops-api/index.html' + astra-cli: 'Astra CLI' + astra-stream: 'Astra Streaming' + starlight-kafka: 'Starlight for Kafka' + starlight-rabbitmq: 'Starlight for RabbitMQ' + astra-streaming-examples-repo: 'https://github.com/datastax/astra-streaming-examples' + sstable-sideloader: '{astra-db} Sideloader' + zdm: 'Zero Downtime Migration' + zdm-short: 'ZDM' + zdm-proxy: 'ZDM Proxy' + cass-migrator: 'Cassandra Data Migrator (CDM)' + cass-migrator-short: 'CDM' + hcd: 'Hyper-Converged Database (HCD)' + hcd-short: 'HCD' + dse: 'DataStax Enterprise (DSE)' + dse-short: 'DSE' + metrics-collector: 'DSE Metrics Collector' + mc: 'Mission Control' + opscenter: 'DSE OpsCenter' + studio: 'DataStax Studio' + cass-reg: 'Apache Cassandra(R)' + cass: 'Apache Cassandra' + cass-short: 'Cassandra' + cql: 'Cassandra Query Language (CQL)' + cql-shell: 'CQL shell' + cql-console: 'CQL console' + cql-service: 'CQL Service' + pulsar-reg: 'Apache Pulsar(TM)' + pulsar: 'Apache Pulsar' + pulsar-short: 'Pulsar' + spark-reg: 'Apache Spark(TM)' + spark: 'Apache Spark' + spark-short: 'Spark' + spark-connect: 'Spark Connect' + spark-connector: 'Apache Cassandra Spark Connector' + spark-connector-short: 'Spark Connector' + kafka-reg: 'Apache Kafka(R)' + kafka: 'Apache Kafka' + kafka-short: 'Kafka' + kafka-connect: 'Kafka Connect' + kafka-connector: 'DataStax Apache Kafka Connector' + kafka-connector-short: 
'Kafka Connector' + solr-reg: 'Apache Solr(TM)' + solr: 'Apache Solr' + solr-short: 'Solr' + lucene-reg: 'Apache Lucene(TM)' + lucene: 'Apache Lucene' + lucene-short: 'Lucene' + hadoop-reg: 'Apache Hadoop(R)' + hadoop: 'Apache Hadoop' + hadoop-short: 'Hadoop' + airflow-reg: 'Apache Airflow(R)' + airflow: 'Apache Airflow' + airflow-short: 'Airflow' + maven-reg: 'Apache Maven(TM)' + maven: 'Apache Maven' + maven-short: 'Maven' + flink-reg: 'Apache Flink(R)' + flink: 'Apache Flink' + flink-short: 'Flink' + beam-reg: 'Apache Beam(R)' + beam: 'Apache Beam' + beam-short: 'Beam' + geode-reg: 'Apache Geode(TM)' + geode: 'Apache Geode' + geode-short: 'Geode' + hbase-reg: 'Apache HBase(R)' + hbase: 'Apache HBase' + hbase-short: 'HBase' + kudu-reg: 'Apache Kudu(TM)' + kudu: 'Apache Kudu' + kudu-short: 'Kudu' + phoenix-reg: 'Apache Phoenix(TM)' + phoenix: 'Apache Phoenix' + phoenix-short: 'Phoenix' + zookeeper-reg: 'Apache ZooKeeper(TM)' + zookeeper: 'Apache ZooKeeper' + zookeeper-short: 'ZooKeeper' + asf: 'Apache Software Foundation (ASF)' + asf-short: 'ASF' + tinkerpop-reg: 'Apache TinkerPop(TM)' + tinkerpop: 'Apache TinkerPop' + tinkerpop-short: 'TinkerPop' + cloudstack-reg: 'Apache CloudStack(R)' + cloudstack: 'Apache CloudStack' + cloudstack-short: 'CloudStack' + tomcat-reg: 'Apache Tomcat(R)' + tomcat: 'Apache Tomcat' + tomcat-short: 'Tomcat' + ajp: 'Apache JServ Protocol (AJP)' + ajp-short: 'AJP' + activemq-reg: 'Apache ActiveMQ(R)' + activemq: 'Apache ActiveMQ' + activemq-short: 'ActiveMQ' + tomee-reg: 'Apache TomEE(TM)' + tomee: 'Apache TomEE' + tomee-short: 'TomEE' + bookkeeper-reg: 'Apache BookKeeper(TM)' + bookkeeper: 'Apache BookKeeper' + bookkeeper-short: 'BookKeeper' + groovy-reg: 'Apache Groovy(TM)' + groovy: 'Apache Groovy' + groovy-short: 'Groovy' + cpp-driver-url: 'https://github.com/datastax/cpp-driver' + csharp-driver-url: 'https://github.com/datastax/csharp-driver' + gocql-astra-url: 'https://github.com/datastax/gocql-astra' + go-driver-url: 
'https://github.com/apache/cassandra-gocql-driver' + cql-proxy-url: 'https://github.com/datastax/cql-proxy' + java-driver-url: 'https://github.com/apache/cassandra-java-driver' + nodejs-driver-url: 'https://github.com/datastax/nodejs-driver' + python-driver-url: 'https://github.com/datastax/python-driver' + scala-driver-url: 'https://github.com/apache/cassandra-spark-connector' + cass-driver-cpp-shield: 'image:https://img.shields.io/github/v/tag/datastax/cpp-driver?label=latest[alt="Latest cpp-driver release on GitHub",link="{cpp-driver-url}/tags"]' + cass-driver-csharp-shield: 'image:https://img.shields.io/nuget/v/CassandraCSharpDriver?label=latest[alt="Latest CassandraCSharpDriver release on NuGet",link="https://www.nuget.org/packages/CassandraCSharpDriver"]' + cass-driver-go-shield: 'image:https://img.shields.io/github/v/tag/apache/cassandra-gocql-driver?label=latest%20gocql[alt="Latest gocql release on GitHub",link="{go-driver-url}/tags"]' + cass-driver-java-shield: 'image:https://img.shields.io/github/v/tag/apache/cassandra-java-driver?label=latest[alt="Latest cassandra-java-driver release on GitHub",link="{java-driver-url}/tags"]' + cass-driver-nodejs-shield: 'image:https://img.shields.io/github/v/tag/datastax/nodejs-driver?label=latest[alt="Latest nodejs-driver release on GitHub",link="{nodejs-driver-url}/tags"]' + cass-driver-python-shield: 'image:https://img.shields.io/github/v/tag/datastax/python-driver?label=latest[alt="Latest python-driver release on GitHub",link="{python-driver-url}/tags"]' + cass-driver-scala-shield: 'image:https://img.shields.io/github/v/tag/apache/cassandra-spark-connector?label=latest[alt="Latest cassandra-spark-connector release on GitHub",link="{scala-driver-url}/releases"]' + data-api: 'Data API' + csharp-client-api-ref-url: 'xref:astra-api-docs:ROOT:attachment$csharp-client' + py-client-api-ref-url-2x: 'xref:astra-api-docs:ROOT:attachment$python-client/astrapy' + ts-client-api-ref-url-2x: 
'xref:astra-api-docs:ROOT:attachment$typescript-client' + java-client-api-ref-url-2x: 'xref:astra-api-docs:ROOT:attachment$java-client' + python-client-repo-url: 'https://github.com/datastax/astrapy' + typescript-client-repo-url: 'https://github.com/datastax/astra-db-ts' + typescript-client-examples-url: '{typescript-client-repo-url}/blob/v2.x/examples' + java-client-repo-url: 'https://github.com/datastax/astra-db-java' + csharp-client-repo-url: 'https://github.com/datastax/astra-db-csharp' + python-client-python-version: '3.8' + dataapi-java-client-shield: 'image:https://img.shields.io/maven-central/v/com.datastax.astra/astra-db-java.svg?label=latest[alt="Latest astra-db-java release on Maven Central",link="https://search.maven.org/artifact/com.datastax.astra/astra-db-java"]' + dataapi-python-client-shield: 'image:https://img.shields.io/github/v/tag/datastax/astrapy?label=latest[alt="Latest astrapy release on GitHub",link="{python-client-repo-url}/releases"]' + dataapi-typescript-client-shield: 'image:https://img.shields.io/github/v/tag/datastax/astra-db-ts?label=latest[alt="Latest astra-db-ts release on GitHub",link="{typescript-client-repo-url}/releases"]' + dataapi-csharp-client-shield: 'image:https://img.shields.io/github/v/tag/datastax/astra-db-csharp?label=latest[alt="Latest astra-db-csharp release on GitHub",link="{csharp-client-repo-url}/releases"]' + agent: 'DataStax Agent' + repair-service: 'Repair Service' + backup-service: 'Backup Service' + performance-service: 'Performance Service' + monitoring-service: 'OpsCenter Monitoring' + nodesync-service: 'NodeSync Service' + bestpractice-service: 'Best Practice Service' + capacity-service: 'Capacity Service' + lcm: 'Lifecycle Manager (LCM)' + lcm-short: 'LCM' + cr: 'custom resource (CR)' + cr-short: 'CR' + crd: 'custom resource definition (CRD)' + crd-short: 'CRD' + # Custom attributes only used in ragstack-ai astra_db: 'Astra DB' - astra_stream: 'Astra Streaming' astra_ui: 'Astra Portal' - astra_cli: 'Astra 
CLI' - astra-streaming-examples-repo: 'https://raw.githubusercontent.com/datastax/astra-streaming-examples/master' - luna-streaming-examples-repo: 'https://raw.githubusercontent.com/datastaxdevs/luna-streaming-examples/main' - support_url: 'https://www.ibm.com/mysupport/s/' - glossary-url: 'https://docs.datastax.com/en/glossary/docs/index.html#' - emoji-tada: "🎉" - emoji-rocket: "🚀" - emoji-smile: "😀" - dse: 'DataStax Enterprise (DSE)' - cassandra: 'Apache Cassandra(R)' - classic: 'classic' - classic_cap: 'Classic' - serverless: 'serverless' - serverless_cap: 'Serverless' # Antora Atlas primary-site-url: https://docs.datastax.com/en primary-site-manifest-url: https://docs.datastax.com/en/site-manifest.json diff --git a/docs/modules/ROOT/pages/backfill-cli.adoc b/docs/modules/ROOT/pages/backfill-cli.adoc index d21056a5..24b35aed 100644 --- a/docs/modules/ROOT/pages/backfill-cli.adoc +++ b/docs/modules/ROOT/pages/backfill-cli.adoc @@ -3,15 +3,15 @@ When CDC is enabled on a table, the data topic doesn't contain any data from before CDC was enabled. The backfill CLI solves this problem by exporting the table's primary key to a CSV file, storing the CSV file on disk, and sending the primary key from the CSV file to the event topic. The {cass-short} source connector reads the primary key from the event topic and populates the data topic with historical data. -The backfill CLI is powered by the xref:dsbulk:overview:dsbulk-about.adoc[{company} Bulk Loader], a battle-tested data loader tool. This means the CLI takes full advantage of optimizations done in DSBulk when exporting data from table to disk. +The backfill CLI is powered by the xref:dsbulk:overview:dsbulk-about.adoc[{dsbulk}], a battle-tested data loader tool. This means the CLI takes full advantage of optimizations done in {dsbulk-short} when exporting data from table to disk. Developers can also use the backfill CLI to trigger change events for downstream applications without having to insert new data. 
[#install] == Installation -The CDC backfill CLI is distributed both as a JAR file and as a Pulsar-admin extension NAR file. -The Pulsar-admin extension is packaged with the IBM Elite Support for Apache Pulsar distribution in the `/cliextensions` folder, so you don't need to build from source unless you want to make changes to the code. +The CDC backfill CLI is distributed both as a JAR file and as a `pulsar-admin` extension NAR file. +The `pulsar-admin` extension is packaged with the IBM Elite Support for {pulsar} distribution in the `/cliextensions` folder, so you don't need to build from source unless you want to make changes to the code. Both artifacts are built with Gradle. @@ -36,17 +36,17 @@ BUILD SUCCESSFUL in 37s Gradle generates two main artifacts: -* An uber JAR file containing the CLI and all its dependencies: backfill-cli/build/libs/backfill-cli-{version}-all.jar -* A NAR archive that wraps the CLI as a Pulsar-admin Extension: backfill-cli/build/libs/pulsar-cassandra-admin-{version}-nar.nar +* An uber JAR file containing the CLI and all its dependencies: `backfill-cli/build/libs/backfill-cli-{version}-all.jar` +* A NAR archive that wraps the CLI as a `pulsar-admin` extension: `backfill-cli/build/libs/pulsar-cassandra-admin-{version}-nar.nar` -Once the artifacts are generated, you can run the backfill CLI tool as either a standalone Java application or as a Pulsar-admin extension. +Once the artifacts are generated, you can run the backfill CLI tool as either a standalone Java application or as a `pulsar-admin` extension. 
[tabs] ====== Java standalone:: + -- -[source,shell] +[source,shell,subs="+attributes"] ---- java -jar backfill-cli/build/libs/backfill-cli-{version}-all.jar --data-dir target/export --export-host 127.0.0.1:9042 \ --export-username cassandra --export-password cassandra --keyspace ks1 --table table1 @@ -54,14 +54,14 @@ java -jar backfill-cli/build/libs/backfill-cli-{version}-all.jar --data-dir targ -- -Pulsar-admin extension:: +`pulsar-admin` extension:: + -- -The Pulsar-admin extension is packaged with the IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) distribution in the /cliextensions folder, so you don't need to build from source unless you want to make changes to the code. +The `pulsar-admin` extension is packaged with the IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) distribution in the /cliextensions folder, so you don't need to build from source unless you want to make changes to the code. -. Move the generated NAR archive to the /cliextensions folder of your Pulsar installation (e.g. /pulsar/cliextensions). +. Move the generated NAR archive to the /cliextensions folder of your {pulsar-short} installation (e.g. /pulsar/cliextensions). -. Modify the client.conf file of your Pulsar installation to include: `customCommandFactories=cassandra-cdc`. +. Modify the client.conf file of your {pulsar-short} installation to include: `customCommandFactories=cassandra-cdc`. . Run the following command (this assumes the xref:6.9@dse:installing:tarball-dse.adoc[default tarball installation of {dse-short}]): + @@ -81,7 +81,7 @@ This test quickly confirms your CDC backfill is working correctly. . 
Make sure you have the following prerequisites: + * A running {dse-short} cluster -* A running Pulsar cluster (https://pulsar.apache.org/docs/getting-started-standalone/[standalone] is fine) +* A running {pulsar-short} cluster (https://pulsar.apache.org/docs/getting-started-standalone/[standalone] is fine) * Backfill CLI built with Gradle (see <>) . Start {dse-short} if it isn't already running: @@ -154,14 +154,14 @@ cqlsh> INSERT INTO ks1.table1 (id,val) VALUES ('2', 'val2'); cqlsh> INSERT INTO ks1.table1 (id,val) VALUES ('3', 'val3'); ---- -. Start Pulsar standalone: +. Start {pulsar-short} standalone: + [source,bash] ---- ./bin/pulsar standalone ---- -. Create a {cass-short} source connector in Pulsar: +. Create a {cass-short} source connector in {pulsar-short}: + [source,bash] ---- @@ -189,7 +189,7 @@ lunastreaming-2.10.3.1 ./bin/pulsar-admin sources create \ ./bin/pulsar-client consume -s subsc -n 0 -st auto_consume -p Earliest persistent://public/default/data-ks1.table1 ---- + -At this point, everything is set up for the test: A {cass-short} table with pre-existing data, a Pulsar topic with a CDC connector, and a Pulsar consumer subscription. +At this point, everything is set up for the test: A {cass-short} table with pre-existing data, a {pulsar-short} topic with a CDC connector, and a {pulsar-short} consumer subscription. . Run the backfill CLI to hydrate the data topic with the existing data in the {cass-short} table: + @@ -211,9 +211,9 @@ Make sure the last line of the output is similar to the following: + Your data topic is now populated with the existing data from the {cass-short} table. -. Check your Pulsar subscription to ensure Pulsar received the change events: +. 
Check your {pulsar-short} subscription to ensure {pulsar-short} received the change events: + -.Pulsar consumer subscription output +.{pulsar-short} consumer subscription output [%collapsible] ==== [source,console] @@ -272,15 +272,15 @@ Now that you have tested the Backfill CLI, you can run it manually when you want == Backfill CLI parameters -When running the backfill CLI as a Pulsar-admin extension, all `--pulsar-*` parameters are loaded from the `client.conf` file. +When running the backfill CLI as a `pulsar-admin` extension, all `--pulsar-*` parameters are loaded from the `client.conf` file. The `--dsbulk-log-dir` is only available when running the backfill CLI as a standalone Java application. -The `--export-dsbulk-option` parameter passes extra parameters to DSBulk. -The relevant DSBulk settings configure the CSV connector and can be found https://github.com/datastax/dsbulk/blob/1.10.x/manual/settings.md#connector.csv[here]. +The `--export-dsbulk-option` parameter passes extra parameters to {dsbulk-short}. +The relevant {dsbulk-short} settings configure the CSV connector and can be found in the {dsbulk-repo}/blob/1.10.x/manual/settings.md#connector.csv[{dsbulk-short} repository]. Shortened option names aren't supported. -In both the Pulsar-admin extension and the standalone Java application, {cass-short} related configurations are exclusively passed as command line arguments. +In both the `pulsar-admin` extension and the standalone Java application, {cass-short} related configurations are exclusively passed as command line arguments. .CDC backfill CLI parameters [cols=2*,options="header"] @@ -300,14 +300,14 @@ directory. |Displays this help message |--dsbulk-log-dir=PATH, -l -|The directory where DSBulk should store its logs. The default is a +|The directory where {dsbulk-short} should store its logs. The default is a 'logs' subdirectory in the current working directory. This -subdirectory is created if it doesn't exist. 
Each DSBulk +subdirectory is created if it doesn't exist. Each {dsbulk-short} operation creates a subdirectory inside the log directory -specified here. This command isn't available in the Pulsar-admin extension. +specified here. This command isn't available in the `pulsar-admin` extension. |--export-bundle=PATH -|The path to a Secure Connect Bundle (SCB) to connect to an Astra DB database. Options --export-host and --export-bundle are mutually exclusive. +|The path to a {scb} to connect to an {astra-db} database. Options --export-host and --export-bundle are mutually exclusive. |--export-consistency=CONSISTENCY |The consistency level to use when exporting data. The default is @@ -329,9 +329,9 @@ is an advanced setting; you should rarely need to modify the default value. |--export-dsbulk-option=OPT=VALUE -|An extra DSBulk option to use when exporting. Any valid DSBulk option -can be specified here, and it is passed as-is to the DSBulk -process. DSBulk options, including driver options, must be passed as +|An extra {dsbulk-short} option to use when exporting. Any valid {dsbulk-short} option +can be specified here, and it is passed as-is to the {dsbulk-short} +process. {dsbulk-short} options, including driver options, must be passed as '--long.option.name='. Short options aren't supported. |--export-host=HOST[:PORT] @@ -371,12 +371,12 @@ disables it. The default is -1. |Displays version info. |=== -== Pulsar connectivity parameters +== {pulsar-short} connectivity parameters -Pulsar connectivity parameters are auto-populated from the `client.conf` file available to the CLI when used as a Pulsar-admin extension. +{pulsar-short} connectivity parameters are auto-populated from the `client.conf` file available to the CLI when used as a `pulsar-admin` extension. These parameters should be passed as command line arguments in the standalone Java application. 
-.Pulsar connectivity parameters +.{pulsar-short} connectivity parameters [cols=2] |=== |Parameter |Description @@ -385,13 +385,13 @@ These parameters should be passed as command line arguments in the standalone Ja |The event topic name prefix. The `.` is appended to that prefix to build the topic name. |--pulsar-auth-params= -|The Pulsar authentication parameters. +|The {pulsar-short} authentication parameters. |--pulsar-auth-plugin-class-name= -|The Pulsar authentication plugin class name. +|The {pulsar-short} authentication plugin class name. |--pulsar-url= -|The Pulsar broker service URL. +|The {pulsar-short} broker service URL. |--pulsar-ssl-provider= |The SSL/TLS provider to use. diff --git a/docs/modules/ROOT/pages/cdc-cassandra-events.adoc b/docs/modules/ROOT/pages/cdc-cassandra-events.adoc index e375f683..9f7f3fc2 100644 --- a/docs/modules/ROOT/pages/cdc-cassandra-events.adoc +++ b/docs/modules/ROOT/pages/cdc-cassandra-events.adoc @@ -1,11 +1,11 @@ = CDC for {cass-short} Events -The {cdc_cass_first} agent pushes the mutation primary key for the CDC-enabled table into the Apache Pulsar events topic (also called the dirty topic). The messages in the data topic (or clean topic) are keyed messages where both the key and the payload are https://avro.apache.org/docs/current/spec.html#schema_record[AVRO records]: +The {product} agent pushes the mutation primary key for the CDC-enabled table into the {pulsar-short} events topic (also called the dirty topic). The messages in the data topic (or clean topic) are keyed messages where both the key and the payload are https://avro.apache.org/docs/current/spec.html#schema_record[AVRO records]: * The message key is an AVRO record including all the primary key columns of your {cass-short} table. * The message payload is an AVRO record including regular columns from your {cass-short} table. 
-In order to support https://pulsar.apache.org/docs/en/concepts-topic-compaction/[Pulsar Topic Compaction], the message key is encoded separately from the message payload, in the message metadata. +In order to support https://pulsar.apache.org/docs/en/concepts-topic-compaction/[{pulsar-short} Topic Compaction], the message key is encoded separately from the message payload, in the message metadata. Finally, the following CQL data types are encoded as AVRO logical types: @@ -78,9 +78,9 @@ And a JSON representation of the AVRO Record for the message payload: == `DELETE` Event -When deleting a row, the {cass-short} source connector publishes a tombstone message into the clean topic with a message key matching the {cass-short} primary key and a null payload. The null payload acts as a message tombstone, and the downstream Pulsar connectors like the Elasticsearch sink should delete the corresponding row (or document in the case of Elasticsearch). +When deleting a row, the {cass-short} source connector publishes a tombstone message into the clean topic with a message key matching the {cass-short} primary key and a null payload. The null payload acts as a message tombstone, and the downstream {pulsar-short} connectors like the Elasticsearch sink should delete the corresponding row (or document in the case of Elasticsearch). -If using the `json` output format, the Pulsar message passes an empty JSON instead of a null payload, because Pulsar doesn't support nulls in message values. The downstream connectors still delete the corresponding row. +If using the `json` output format, the {pulsar-short} message passes an empty JSON instead of a null payload, because {pulsar-short} doesn't support nulls in message values. The downstream connectors still delete the corresponding row. 
=== Check Source Connector status @@ -113,11 +113,11 @@ bin/pulsar-admin source status --name cassandra-source-ks1-table1 == Troubleshooting -If you're having issues consuming CDC events, check the source connector logs on your Pulsar function workers and the data topic schema. +If you're having issues consuming CDC events, check the source connector logs on your {pulsar-short} function workers and the data topic schema. === Check the source connector logs -Check the source connector logs on your Pulsar function workers. The name of the logs depends on the connectors' name. +Check the source connector logs on your {pulsar-short} function workers. The name of the logs depends on the connectors' name. [source,bash] ---- @@ -126,7 +126,7 @@ cat logs/functions/public/default/cassandra-source-ks1-table1/cassandra-source-k === Check the data topic schema -Check the https://pulsar.apache.org/docs/en/schema-manage/[Pulsar schema] to ensure the clean topic matches your CQL table: +Check the https://pulsar.apache.org/docs/en/schema-manage/[{pulsar-short} schema] to ensure the clean topic matches your CQL table: [source,bash] ---- diff --git a/docs/modules/ROOT/pages/cdc-concepts.adoc b/docs/modules/ROOT/pages/cdc-concepts.adoc index 9b3d0a66..17ece812 100644 --- a/docs/modules/ROOT/pages/cdc-concepts.adoc +++ b/docs/modules/ROOT/pages/cdc-concepts.adoc @@ -1,6 +1,6 @@ -= About Change Data Capture (CDC) workflows with {cass-reg} and Apache Pulsar(TM) += About Change Data Capture (CDC) workflows with {cass-reg} and {pulsar-reg} :navtitle: About CDC workflows and components -:description: Capture changes in a {cass-short} database and publish them to Apache Pulsar(TM) as events. +:description: Capture changes in a {cass-short} database and publish them to {pulsar-short} as events. :csharp: C# Change Data Capture (CDC) is a design pattern used in software development to capture and propagate changes made to data in a system. 
@@ -11,56 +11,56 @@ This allows applications to react quickly to changes in the data while not addin Before learning the specifics of CDC, you must first understand the components needed to complete a CDC workflow. -== Apache Pulsar(TM) source connectors +== {pulsar} source connectors -Source connectors in Apache Pulsar are responsible for ingesting data from external sources into the Pulsar system. +Source connectors in {pulsar} are responsible for ingesting data from external sources into the {pulsar-short} system. They can be used to collect data from a variety of sources including databases, message queues, and file systems. -When the source connector "sees" data, it streams the data to a Pulsar topic. -This enables users to easily integrate data from disparate sources into their Pulsar-based applications. -Source connectors make it easy to ingest, process, and analyze large volumes of data from a variety of sources into Pulsar. +When the source connector "sees" data, it streams the data to a {pulsar-short} topic. +This enables users to easily integrate data from disparate sources into their {pulsar-short}-based applications. +Source connectors make it easy to ingest, process, and analyze large volumes of data from a variety of sources into {pulsar-short}. -Pulsar offers extensible APIs where developers can use a defined interface to develop their own connector. +{pulsar-short} offers extensible APIs where developers can use a defined interface to develop their own connector. The interface takes much of the boilerplate burdens away from a developer and gets them right to the purpose of the connector. -Creating a connector means adding in the know-how to work with data from the source and adapt it to produce a compliant message with the Pulsar client. +Creating a connector means adding in the know-how to work with data from the source and adapt it to produce a compliant message with the {pulsar-short} client. 
-As explained later in this guide, among the processes needed to capture change data, the **{csc_pulsar_first}** ({csc_pulsar}) is critical. -This is one of many available source connectors for Pulsar, but is specifically designed to work with {cass-short} and its CDC features. +As explained later in this guide, among the processes needed to capture change data, the {csc_pulsar_first} is critical. +This is one of many available source connectors for {pulsar-short}, but is specifically designed to work with {cass-short} and its CDC features. To run a source connector, you provide configuration about what data is selected, how to connect with the upstream system, and the destination topic for the new message. The source connector takes care of producing the message. -Pulsar source connectors run as Pulsar functions within the cluster, so many of the features of functions apply, such as the number of instances to run and how to configure the function instance running environment. +{pulsar-short} source connectors run as {pulsar-short} functions within the cluster, so many of the features of functions apply, such as the number of instances to run and how to configure the function instance running environment. Metrics and logs for a source connector are automatically made a part of the cluster. === Monitor source connectors Monitoring a source connector includes two areas: health and performance. -Every connector in Pulsar emits basic metrics about its health, including stats like the number of records received from the source, and the number of messages written to the destination topic. +Every connector in {pulsar-short} emits basic metrics about its health, including stats like the number of records received from the source, and the number of messages written to the destination topic. Connectors also emit debugging metrics like the number of exceptions thrown by the source. Performance metrics include health metrics as well as specific knowledge about the source. 
-Refer to the https://pulsar.apache.org/docs/reference-metrics/#connectors[Pulsar connectors metrics documentation] for a complete list and explanation of metrics. +Refer to the https://pulsar.apache.org/docs/reference-metrics/#connectors[{pulsar-short} connectors metrics documentation] for a complete list and explanation of metrics. === Source connector logs -Most Pulsar source connectors emit logs that show lifecycle events as well as custom events specific to the connector type. +Most {pulsar-short} source connectors emit logs that show lifecycle events as well as custom events specific to the connector type. All logs are handled the same way core cluster logs are handled. By default, they are written to the console and collected by log4j destinations. If you are using function workers, you can access log files on their disk. -Refer to Pulsar's https://pulsar.apache.org/docs/io-debug/[connector debugging guide] for more information. +Refer to the https://pulsar.apache.org/docs/io-debug/[{pulsar} connector debugging guide] for more information. -== Pulsar schemas and the schema registry +== {pulsar-short} schemas and the schema registry -The Apache Pulsar schema registry is a feature of a Pulsar cluster that manages the schemas of messages sent and received on Pulsar topics. -In Pulsar, messages are stored as bytes. +The {pulsar-short} schema registry is a feature of a {pulsar-short} cluster that manages the schemas of messages sent and received on {pulsar-short} topics. +In {pulsar-short}, messages are stored as bytes. Schemas provide a way to serialize and deserialize messages with a particular structure or type, allowing for interoperability between different systems. -The schema registry in Pulsar stores and manages schema definitions for all message types sent and received in Pulsar. +The schema registry in {pulsar-short} stores and manages schema definitions for all message types sent and received in {pulsar-short}. 
The schema registry enforces schema compatibility rules, such as requiring a producer to send messages that conform to a certain schema, or rejecting messages that don't match the schema. Schemas follow a primitive or complex type. Primitive schemas are simple data types like bool, int, string, and float. -Because Pulsar is written in Java, that is where the primitives are based. +Because {pulsar-short} is written in Java, that is where the primitives are based. When a different client runtime is used, a conversion might need to occur. -Refer to the https://pulsar.apache.org/docs/schema-understand/#primitive-type[Pulsar primitive types table] for a full reference. +Refer to the https://pulsar.apache.org/docs/schema-understand/#primitive-type[{pulsar-short} primitive types table] for a full reference. Complex schemas introduce a more structured way of messaging. The two types of complex messages are KeyValue and Struct. @@ -79,13 +79,13 @@ In the context of CDC there are a few schema configurations of note. All of these are specific to the namespace where the event and data topics are logically located. schema-compatibility-strategy:: -This setting instructs the Pulsar broker how to handle new schemas introduced to existing topics by producers. +This setting instructs the {pulsar-short} broker how to handle new schemas introduced to existing topics by producers. This is relevant to CDC when a table's design is changed. For example, if a new column is added, the registered schema is changed to include that new value. The chosen schema-compatibility-strategy decides if the namespace allows this. If schema validations are enabled, this option decides what strategy is used. -Pulsar's default strategy is "FULL" which means existing optional table columns can be modified. -Learn more about the different types of strategies in the https://pulsar.apache.org/docs/next/schema-understand/#schema-compatibility-check-strategy[Pulsar docs]. 
+The {pulsar-short} default strategy is "FULL" which means existing optional table columns can be modified. +For more information, see the https://pulsar.apache.org/docs/next/schema-understand/#schema-compatibility-check-strategy[{pulsar} schema compatibility check strategies]. allow-auto-update-schema:: Given the compatibility strategy, this setting is a flag that determines if an update to the schema is generally allowed. @@ -99,7 +99,7 @@ CDC sets this to 'BACKWARDTRANSITIVE', which means if optional table columns hav schema-validation-enforce:: This flag limits how producers and consumers are allowed to be configured. When enabled (`true`) producer and consumer clients must have a schema set before sending the message. -When disabled (`false`) Pulsar allows producers and consumers without a set schema to send or receive messages. +When disabled (`false`) {pulsar-short} allows producers and consumers without a set schema to send or receive messages. CDC disables (`false`) this option, so producers and consumers don't have to know the message schema ahead of time. == The {cdc_agent_first} and {csc_pulsar_first} @@ -107,9 +107,9 @@ CDC disables (`false`) this option, so producers and consumers don't have to kno The {cdc_agent} is a process running on each node in a {cass-short} cluster that watches for data changes on tables that have enabled the CDC feature. Using {cass-short}'s https://cassandra.apache.org/doc/4.0/cassandra/configuration/cass_yaml_file.html#commitlog_sync[commitlog_sync option], the agent periodically syncs a separate log in a special "cdc_raw" directory. Each log entry is a CDC event. -The {cdc_agent} creates a new event message containing the row coordinates of the changed data and produces the message to a downstream Pulsar cluster. +The {cdc_agent} creates a new event message containing the row coordinates of the changed data and produces the message to a downstream {pulsar-short} cluster. 
-In Pulsar, each table that has CDC enabled also has a corresponding {company} {csc_pulsar}. +In {pulsar-short}, each table that has CDC enabled also has a corresponding {company} {csc_pulsar}. This is unlike the {cdc_agent} where the process runs on each {cass-short} node, keeping a log of all table changes. Each table-specific {cass-short} source connector subscribes to the events topic the agent is producing messages to. When the connector "sees" a message for its table, it uses the row coordinates within the message to retrieve the mutated data from {cass-short} and create a new message with the specifics. @@ -118,33 +118,33 @@ That new message is written to a data topic where others can subscribe and recei === Event deduplication A particular advantage in the {company} {csc_pulsar} is its deduplication feature. -https://pulsar.apache.org/docs/2.11.x/concepts-messaging/#message-deduplication[Pulsar's built-in deduplication capabilities] _aren't_ used in the message flow because CDC needs a finer grain control to detect duplicates. +The https://pulsar.apache.org/docs/2.11.x/concepts-messaging/#message-deduplication[{pulsar} built-in deduplication capabilities] _aren't_ used in the message flow because CDC needs a finer grain control to detect duplicates. As the {cdc_agent} discovers a new commit log, an authentic identifier is created using the MD5 hash algorithm. That key identifier is added to the event message. When message consumers, like the {company} {csc_pulsar}, connect to the event topic, they establish a subscription type. -Pulsar has four types of subscriptions: Exclusive, shared, failover, and key_shared. +{pulsar-short} has four types of subscriptions: Exclusive, shared, failover, and key_shared. In a typical CDC flow, the {cass-short} source connector has multiple instances running in parallel. 
-When multiple consumers are a part of a key_shared subscription, Pulsar delivers a duplicate hash key to the same consumer no matter how many times it's sent. +When multiple consumers are a part of a key_shared subscription, {pulsar-short} delivers a duplicate hash key to the same consumer no matter how many times it's sent. When a {cass-short} cluster has multiple hosts (with multiple commit logs), and they all use the same mutation to calculate the same hash key, then the same consumer always receives it. Each {cass-short} source connector keeps a cache of hashes it has seen and ensures duplicates are dropped before producing the data message. -Learn more about Pulsar's key_shared subscription type in the https://pulsar.apache.org/docs/2.11.x/concepts-messaging/#key_shared[Pulsar documentation]. +Learn more about the {pulsar-short} key_shared subscription type in the https://pulsar.apache.org/docs/2.11.x/concepts-messaging/#key_shared[{pulsar} documentation]. == Understand the CDC workflow Now that you understand the different components used in the CDC pattern, the following steps summarize a generic CDC workflow, including configuring the workflow and producing messages: -. Create a Pulsar tenant to hold CDC messages with the following namespace and topics: +. Create a {pulsar-short} tenant to hold CDC messages with the following namespace and topics: .. Create a namespace or use the default namespace. .. Create a topic for event messages. .. Create a topic for data messages. -. Start the {cass-short} source connector in Pulsar by setting the destination topic (the data messages topic), the event topic, and {cass-short} connection info, along with other settings. +. Start the {cass-short} source connector in {pulsar-short} by setting the destination topic (the data messages topic), the event topic, and {cass-short} connection info, along with other settings. -. 
Configure the {cass-short} change agent with a working directory, Pulsar service URL, and other settings in the {cass-short} node. +. Configure the {cass-short} change agent with a working directory, {pulsar-short} service URL, and other settings in the {cass-short} node. A restart is required. . Create a {cass-short} table and enable CDC. @@ -159,14 +159,14 @@ From here the following sequence occurs: == {cass-short} table schema evolution with CDC -This section describes how table schema changes are handled in the {cass-reg} Connector for Apache Pulsar(TM). +This section describes how table schema changes are handled in the {csc_pulsar_first}. === Message schema translation The message schema is of particular importance in completing the CDC pattern. -Initially, it is set to match the {cass-short} table's schema as closely as possible, but some data types aren't known in Pulsar (or more accurately, not known in Avro). -To overcome this, there are adaptations performed when the {company} {csc_pulsar} builds the Pulsar message. -Some types aren't compatible and cannot be adapted. In this case, those columns of data are dropped while creating the Pulsar message. +Initially, it is set to match the {cass-short} table's schema as closely as possible, but some data types aren't known in {pulsar-short} (or more accurately, not known in Avro). +To overcome this, there are adaptations performed when the {company} {csc_pulsar} builds the {pulsar-short} message. +Some types aren't compatible and cannot be adapted. In this case, those columns of data are dropped while creating the {pulsar-short} message. To better understand how exactly the {cdc_agent} constructs the event message, here is the pseudo code of how the schema is created: @@ -187,7 +187,7 @@ The byte array is an Avro-encoded record that documents the table's primary key( The MutationValue is an extended Avro record that has direction on what changed and how to get its specifics. 
CDC sets the initial topic schema on the first change it detects. -Once the initial topic schema has been set, an ideal path has been established to create change data events in Pulsar. +Once the initial topic schema has been set, an ideal path has been established to create change data events in {pulsar-short}. Inevitably, table designs change: Columns are added, updated, or removed. When these changes occur, the components that are part of the CDC flow must adapt to preserve the ideal path of event data. @@ -241,11 +241,11 @@ For example: ALTER TABLE [keyspace_name.] table_name DROP some-column; ---- -== Consume change data with Apache Pulsar(TM) +== Consume change data with {pulsar} -This section describes how to consume change data with Apache Pulsar(TM). +This section describes how to consume change data with {pulsar}. -=== Pulsar clients +=== {pulsar-short} clients Each client handles message consumption a little differently but there is one overall pattern to follow. As covered in the previous sections, a CDC message arrives as an Avro GenericRecord of type KeyValue. @@ -256,16 +256,16 @@ From there you'll want to deserialize the Avro record and extract the interestin The following example projects demonstrate implementations for each runtime consuming messages from a CDC data topic. -These examples are stored in the `astra-streaming-examples` repository, but they aren't specific to Astra Streaming. -You can use these examples to consume CDC data topics in your own {cass-short}/Pulsar clusters. +These examples are stored in the `astra-streaming-examples` repository, but they aren't specific to {astra-stream}. +You can use these examples to consume CDC data topics in your own {cass-short}/{pulsar-short} clusters. 
-* svg:common::icons/logos/csharp.svg[role="icon text-xl",name="C#"] https://github.com/datastax/astra-streaming-examples/blob/master/csharp/astra-cdc/Program.cs[{csharp} CDC project example] -* svg:common::icons/logos/go.svg[role="icon text-xl",name="Go"] https://github.com/datastax/astra-streaming-examples/blob/master/go/astra-cdc/main/main.go[Golang CDC project example] -* svg:common::icons/logos/java.svg[role="icon text-xl",name="Java"] https://github.com/datastax/astra-streaming-examples/blob/master/java/astra-cdc/javaexamples/consumers/CDCConsumer.java[Java CDC consumer example] -* svg:common::icons/logos/nodejs.svg[role="icon text-xl",name="Node.js"] https://github.com/datastax/astra-streaming-examples/blob/master/nodejs/astra-cdc/consumer.js[Node.js CDC consumer example] -* svg:common::icons/logos/python.svg[role="icon text-xl",name="Python"] https://github.com/datastax/astra-streaming-examples/blob/master/python/astra-cdc/cdc_consumer.py[Python CDC consumer example] +* svg:common::icons/logos/csharp.svg[role="icon text-xl",name="C#"] {astra-streaming-examples-repo}/blob/master/csharp/astra-cdc/Program.cs[{csharp} CDC project example] +* svg:common::icons/logos/go.svg[role="icon text-xl",name="Go"] {astra-streaming-examples-repo}/blob/master/go/astra-cdc/main/main.go[Golang CDC project example] +* svg:common::icons/logos/java.svg[role="icon text-xl",name="Java"] {astra-streaming-examples-repo}/blob/master/java/astra-cdc/javaexamples/consumers/CDCConsumer.java[Java CDC consumer example] +* svg:common::icons/logos/nodejs.svg[role="icon text-xl",name="Node.js"] {astra-streaming-examples-repo}/blob/master/nodejs/astra-cdc/consumer.js[Node.js CDC consumer example] +* svg:common::icons/logos/python.svg[role="icon text-xl",name="Python"] {astra-streaming-examples-repo}/blob/master/python/astra-cdc/cdc_consumer.py[Python CDC consumer example] -=== Pulsar functions +=== {pulsar-short} functions It is very common to have a function consuming the CDC data. 
Functions usually perform additional processing on the data and pass it to another topic. @@ -273,12 +273,12 @@ Similar to a client consumer, it needs to deserialize the message data. The following example functions consume messages from the CDC data topic. -These examples are stored in the `astra-streaming-examples` repository, but they aren't specific to Astra Streaming. -You can use these examples for functions for your own {cass-short}/Pulsar clusters. +These examples are stored in the `astra-streaming-examples` repository, but they aren't specific to {astra-stream}. +You can use these examples for functions for your own {cass-short}/{pulsar-short} clusters. -* svg:common::icons/logos/go.svg[role="icon text-xl",name="Go"] https://github.com/datastax/astra-streaming-examples/blob/master/go/astra-cdc/main/main.go[Golang CDC project example] -* svg:common::icons/logos/java.svg[role="icon text-xl",name="Java"] https://github.com/datastax/astra-streaming-examples/blob/master/java/astra-cdc/javaexamples/functions/CDCFunction.java[Java CDC function example] -* svg:common::icons/logos/python.svg[role="icon text-xl",name="Python"] https://github.com/datastax/astra-streaming-examples/blob/master/python/cdc-in-pulsar-function/deschemaer.py[Python CDC function example] +* svg:common::icons/logos/go.svg[role="icon text-xl",name="Go"] {astra-streaming-examples-repo}/blob/master/go/astra-cdc/main/main.go[Golang CDC project example] +* svg:common::icons/logos/java.svg[role="icon text-xl",name="Java"] {astra-streaming-examples-repo}/blob/master/java/astra-cdc/javaexamples/functions/CDCFunction.java[Java CDC function example] +* svg:common::icons/logos/python.svg[role="icon text-xl",name="Python"] {astra-streaming-examples-repo}/blob/master/python/cdc-in-pulsar-function/deschemaer.py[Python CDC function example] == See also diff --git a/docs/modules/ROOT/pages/cdcExample.adoc b/docs/modules/ROOT/pages/cdcExample.adoc index 0145ca34..088214f2 100644 --- 
a/docs/modules/ROOT/pages/cdcExample.adoc +++ b/docs/modules/ROOT/pages/cdcExample.adoc @@ -1,30 +1,30 @@ = Change Data Capture with {dse} :navtitle: Quickstart for CDC with {dse-short} -You can capture schema changes in your tables and pass them to Apache Pulsar(R) with {company} Change Data Capture (CDC). -This guide explains how to install, configure, and use CDC with {cass-reg} or {dse} in a VM-based deployment. +You can capture schema changes in your tables and pass them to {pulsar-reg} with {product}. +This guide explains how to install, configure, and use CDC with {cass} or {dse} in a VM-based deployment. == Prerequisites This installation requires the following: * {dse-short} 6.8.16 or later -* OSS {cass-reg} +* OSS {cass} * {cdc_agent_first} ** {dse-short} - use `agent-dse4-**VERSION**-all.jar` ** OSS {cass-short} - use `agent-c4-**VERSION**-all.jar` -* Pulsar -** IBM Elite Support for Apache Pulsar - use `agent-dse4-**VERSION**-all.jar` -* Pulsar {cass-short} source connector ({csc_pulsar_first}) -** Pulsar {cass-short} Source NAR - use `pulsar-cassandra-source-**VERSION**.nar` +* {pulsar} +** IBM Elite Support for {pulsar} - use `agent-dse4-**VERSION**-all.jar` +* {csc_pulsar_first} +** {pulsar-short} {cass-short} Source NAR - use `pulsar-cassandra-source-**VERSION**.nar` -The latest versions of the {cdc_agent} and {csc_pulsar} are available from the https://github.com/datastax/cdc-apache-cassandra/releases/latest[{company} {cdc_cass} repository]. +The latest versions of the {cdc_agent} and {csc_pulsar} are available from the {product-repo}/releases/latest[{company} {product-short} repository]. == Architecture overview -The CDC pipeline consists of three stages that work together to capture and process changes and publishes change events to the events topic in Pulsar. +The CDC pipeline consists of three stages that work together to capture and process changes and publish change events to the events topic in {pulsar-short}. 
-* The {cdc_agent} running in Pulsar creates events topics using the format: `events-**KEYSPACE_NAME**.**TABLE_NAME**`. +* The {cdc_agent} running in {pulsar-short} creates events topics using the format: `events-**KEYSPACE_NAME**.**TABLE_NAME**`. When configuring the source connector, the `events.topic` parameter must match exactly what the {cdc_agent} publishes to. For example, if your keyspace is `ks1` and table is `table1`, the `events.topic` parameter is `persistent://public/default/events-ks1.table1`. @@ -35,7 +35,7 @@ Consumers include sink connectors, analytics systems, or other applications. == Installing and configuring -. Download the https://pulsar.apache.org/download/[Pulsar tarball] and set up a Pulsar cluster. This example uses Pulsar https://pulsar.apache.org/docs/en/standalone/[standalone mode], but you can also use our helpful https://github.com/datastax/pulsar-ansible[Ansible Scripts]. +. Download the https://pulsar.apache.org/download/[{pulsar-short} tarball] and set up a {pulsar-short} cluster. This example uses {pulsar-short} https://pulsar.apache.org/docs/en/standalone/[standalone mode], but you can also use our helpful https://github.com/datastax/pulsar-ansible[Ansible Scripts]. + [source,bash] ---- @@ -50,31 +50,31 @@ bin/pulsar standalone . Install {cass-short} or {dse-short}. -. After installing {cass-short} or {dse-short}, but before starting the {cass-short} or {dse-short} service, set the `Cassandra-env.sh` configuration: +. After installing {cass-short} or {dse-short}, but before starting the {cass-short} or {dse-short} service, set the `cassandra-env.sh` configuration: + -[source,bash] +[source,bash,subs="+quotes"] ---- -export CDC_PULSAR_SERVICE_URL="" (e.g. pulsar://:6650) +export CDC_PULSAR_SERVICE_URL="**PULSAR_BROKER_SERVICE_URL**" (e.g. 
pulsar://:6650) # needed when Pulsar JWT authentication is enabled export CDC_PULSAR_AUTH_PLUGIN_CLASS_NAME="org.apache.pulsar.client.impl.auth.AuthenticationToken" -export CDC_PULSAR_AUTH_PARAMS="file://" +export CDC_PULSAR_AUTH_PARAMS="file://**PATH/TO/TOKEN/FILE**" # needed when Pulsar TLS encryption is enabled -export CDC_TLS_TRUST_CERTS_FILE_PATH="" +export CDC_TLS_TRUST_CERTS_FILE_PATH="**PATH/TO/TRUSTED/CERT/FILE**" # DSE CDC -JVM_OPTS="$JVM_OPTS -javaagent:/home/automaton/cdc104/agent-dse4--all.jar" +JVM_OPTS="$JVM_OPTS -javaagent:/home/automaton/cdc104/agent-dse4-**VERSION**-all.jar" ---- + -* **For {cdc_agent} versions after 1.0.3**: The {cdc_agent} Pulsar connection parameters are provided as system environment variables. +* **For {cdc_agent} versions after 1.0.3**: The {cdc_agent} {pulsar-short} connection parameters are provided as system environment variables. See `DSE CDC` in the preceding example. -* **For {cdc_agent} versions before 1.0.3**: The {cdc_agent} Pulsar connection parameters are also provided as extra JVM options. +* **For {cdc_agent} versions before 1.0.3**: The {cdc_agent} {pulsar-short} connection parameters are also provided as extra JVM options. For example: + -[source,bash] +[source,bash,subs="+quotes"] ---- -export JVM_EXTRA_OPTS="-javaagent:/path/to/agent-c4--all.jar=pulsarServiceUrl=pulsar://pulsar:6650" +export JVM_EXTRA_OPTS="-javaagent:/path/to/agent-c4-**VERSION**-all.jar=pulsarServiceUrl=pulsar://pulsar:6650" ---- . Set the `cassandra.yaml` configuration: @@ -96,15 +96,15 @@ INFO [main] 2022-04-11 18:47:06,331 AgentConfig.java:526 - maxInflightMessages INFO [main] 2022-04-11 18:47:06,433 Agent.java:92 - CDC agent started ---- -== Deploy Pulsar Connector +== Deploy {pulsar-short} Connector -. Deploy the Pulsar {cass-short} Source Connector (CSC) for each CDC-enabled {cass-short} table. 
The connector consumes from the **events topic** (where the {cdc_agent} publishes) and writes to the **data topic** (where downstream consumers read from). +. Deploy the {csc_pulsar_first} for each CDC-enabled {cass-short} table. The connector consumes from the **events topic** (where the {cdc_agent} publishes) and writes to the **data topic** (where downstream consumers read from). + [IMPORTANT] ==== The `events.topic` parameter in the source connector configuration must match exactly the topic name that the {cdc_agent} publishes to. - The agent creates topics using the format `events-**KEYSPACE**.**TABLE**` when using the default `events-` prefix. For example, for keyspace `ks1` and table `table1`, the events topic is `events-ks1.table1`. +The agent creates topics using the format `events-**KEYSPACE**.**TABLE**` when using the default `events-` prefix. For example, for keyspace `ks1` and table `table1`, the events topic is `events-ks1.table1`. ==== + The `outputFormat` source configuration controls the format of messages on the data topic. @@ -199,9 +199,9 @@ pulsar-admin source list ["**CSC_CONNECTOR_NAME**"] == Verify end-to-end operation -Now that Pulsar, your database service, CDC, and the CSC connector are installed and verified to be operational, you can verify the end-to-end message flow. +Now that {pulsar-short}, your database service, CDC, and the {csc_pulsar} connector are installed and verified to be operational, you can verify the end-to-end message flow. -. To verify that the {cdc_agent} in Pulsar is publishing to the events topic, confirm that the events topic exists and has messages: +. 
To verify that the {cdc_agent} in {pulsar-short} is publishing to the events topic, confirm that the events topic exists and has messages: + [source,bash,subs="+quotes"] ---- diff --git a/docs/modules/ROOT/pages/faqs.adoc b/docs/modules/ROOT/pages/faqs.adoc index ef0df8cd..1fe6ec2a 100644 --- a/docs/modules/ROOT/pages/faqs.adoc +++ b/docs/modules/ROOT/pages/faqs.adoc @@ -1,35 +1,35 @@ -= {cdc_cass} FAQs += {product-short} FAQs -The following are frequently asked questions about {cdc_cass_first} and its features. +The following are frequently asked questions about {product} and its features. -== What is {cdc_cass}? -== What is {product-short}? -The {cdc_cass} is a an open-source product from {company}. +The {product-short} is an open-source product from {company}. -With {cdc_cass}, updates to data in {cass} or {dse} are put into a Pulsar topic, which in turn can write the data to external targets such as Elasticsearch, Snowflake, and other platforms. -The {csc_pulsar_first} component has a one-to-one correspondence between a {cass-short} table and a single Pulsar topic. +With {product-short}, updates to data in {cass} or {dse} are put into a {pulsar-short} topic, which in turn can write the data to external targets such as Elasticsearch, Snowflake, and other platforms. +The {csc_pulsar_first} component has a one-to-one correspondence between a {cass-short} table and a single {pulsar-short} topic. -== Is {product-short} an open-source project? Where can I find the repository? +== Is {product-short} an open-source project? Where can I find the repository? -Yes, {product-short} is open source software under the Apache 2.0 license. -You can find the source code on the https://github.com/datastax/cdc-apache-cassandra[{company} {cdc_cass} GitHub repository]. +Yes, {product-short} is open source software under the Apache 2.0 license. +You can find the source code on the {product-repo}[{company} {product-short} GitHub repository]. 
-== What does {cdc_cass} provide that I cannot get with open-source Apache Pulsar? +== What does {product-short} provide that I cannot get with open-source {pulsar-reg}? -In effect, the {cdc_cass} implements the reverse of Apache Pulsar or {company} {cass-short} Sink Connector. -With those sink connectors, data is taken from a Pulsar topic and put into {cass-short}. -With {cdc_cass}, updates to a {cass-short} table are converted into events and put into a data topic. +In effect, the {product-short} implements the reverse of {pulsar} or {company} {cass-short} Sink Connector. +With those sink connectors, data is taken from a {pulsar-short} topic and put into {cass-short}. +With {product-short}, updates to a {cass-short} table are converted into events and put into a data topic. From there, the data can be published to external platforms like Elasticsearch, Snowflake, and other platforms. -== How do I install {cdc_cass}? +== How do I install {product-short}? Follow the xref:install.adoc[installation instructions]. -== What are the requirements for {cdc_cass}? +== What are the requirements for {product-short}? See the xref:ROOT:install.adoc[installation instructions]. -== I have multiple {cass-short} datacenters. How do I configure {cdc_cass}? +== I have multiple {cass-short} datacenters. How do I configure {product-short}? See xref:ROOT:index.adoc#multiple-cassandra-datacenters[Deploy multiple {cass-short} datacenters]. @@ -37,30 +37,30 @@ See xref:ROOT:index.adoc#multiple-cassandra-datacenters[Deploy multiple {cass-sh For each CDC-enabled {cass-short} table, {cass-short} needs extra processing cycles and storage to process the CDC commit logs. The impact for dealing with a single CDC-enabled table is small, but when there are a large number of {cass-short} tables with CDC enabled, the impact within {cass-short} increases. -The performance impact occurs within {cass-short} itself, not the {cass-short} CDC solution with Pulsar. 
+The performance impact occurs within {cass-short} itself, not the {cass-short} CDC solution with {pulsar-short}. The {cdc_agent} is started as a JVM agent of the {cass-short} process and it shares the same hardware resource of the same {cass-short} node. -However, the only job that the {cdc_agent} does is to scan the CDC commit log directory on a regular basis and send messages to the Pulsar cluster. +However, the only job that the {cdc_agent} does is to scan the CDC commit log directory on a regular basis and send messages to the {pulsar-short} cluster. This is a lightweight process when launched on a single thread, but the {cdc_agent} can be launched with multiple threads. As more threads are launched, more resources are consumed. -For each {cass-short} write operation (one detected change-event), the Pulsar CSC connector performs a primary key-based {cass-short} read to get the most complete, up-to-date information of that particular {cass-short} row. +For each {cass-short} write operation (one detected change-event), the {pulsar-short} CSC connector performs a primary key-based {cass-short} read to get the most complete, up-to-date information of that particular {cass-short} row. In a worst-case scenario, where a CDC-enabled {cass-short} has 100% write workload, the CDC solution would double the workload by adding the same amount of read workload to {cass-short} table. Since the {cass-short} read is primary key-based, it is efficient. -== What are the {cdc_cass} limitations? +== What are the {product-short} limitations? -See xref:ROOT:index.adoc#limitations[{cdc_cass} limitations]. +See xref:ROOT:index.adoc#limitations[{product-short} limitations]. -== What happens if the Apache Pulsar service is unavailable? +== What happens if the {pulsar} service is unavailable? 
-If the Pulsar cluster is down, the {cdc_agent} on each {cass-short} node attempts to send the mutations periodically, and it keeps the CDC commitlog segments on disk until the data sending is successful. +If the {pulsar-short} cluster is down, the {cdc_agent} on each {cass-short} node attempts to send the mutations periodically, and it keeps the CDC commitlog segments on disk until the data sending is successful. -The {cdc_agent} keeps track of the CDC commitlog segment offsets, so the {cdc_agent} knows where to resume sending the mutation messages when the Pulsar cluster is back online. +The {cdc_agent} keeps track of the CDC commitlog segment offsets, so the {cdc_agent} knows where to resume sending the mutation messages when the {pulsar-short} cluster is back online. {company} recommends active monitoring of the disk space of the {cass-short} nodes. -If the Pulsar cluster is down, the change agent continues trying to send messages, and the CDC commitlog files accumulate on the {cass-short} node. If the maximum CDC directory disk space is reached, future {cass-short} writes to the CDC-enabled table will fail. +If the {pulsar-short} cluster is down, the change agent continues trying to send messages, and the CDC commitlog files accumulate on the {cass-short} node. If the maximum CDC directory disk space is reached, future {cass-short} writes to the CDC-enabled table will fail. When the disk space of the `cdc_raw` directory reaches your `cdc_total_space_in_mb` {cass-short} setting (less than 4 GB by default), writes to CDC-enabled tables fail with a `CDCWriteException`. The following warning message is included in {cass-short} logs: @@ -71,12 +71,12 @@ WARN [CoreThread-5] 2021-10-29 09:12:52,790 NoSpamLogger.java:98 - Rejecting M ---- To avoid or recover from this situation, increase the `cdc_total_space_in_mb` and restart the node. 
-To prevent hitting this new limit, increase the write throughput to your Apache Pulsar cluster, or decrease the write throughput to your node. +To prevent hitting this new limit, increase the write throughput to your {pulsar-short} cluster, or decrease the write throughput to your node. Increasing the write throughput can involve tuning one or more of the following: * Change agent configuration: The number of allocated threads, the batching delay, the number of inflight messages -* Pulsar cluster configuration: The number of partitions of your topics +* {pulsar-short} cluster configuration: The number of partitions of your topics * {csc_pulsar} configuration: The query executors, batching and cache settings, connector parallelism As a last resort, if losing data is acceptable in your CDC pipeline, remove `commitlog` files from the `cdc_raw` directory. @@ -166,18 +166,18 @@ ALTER TABLE cdc.raw_cdc WITH cdc = {'enabled': true}; == What happens to unacknowledged event messages the {cdc_agent} cannot deliver? -Unacknowledged messages mean the {cdc_agent} couldn't produce the event message in Pulsar. +Unacknowledged messages mean the {cdc_agent} couldn't produce the event message in {pulsar-short}. In this case, the table row mutation fails. The {cass-short} client handles this as an exception. The data is committed to {cass-short} and no event is created. -Another scenario might be the Pulsar broker is too busy to process messages and a backlog has been created. -In this case, Pulsar's backlog policies take effect and event messages are handled accordingly. +Another scenario might be the {pulsar-short} broker is too busy to process messages and a backlog has been created. +In this case, the {pulsar-short} backlog policies take effect and event messages are handled accordingly. The data is committed to {cass-short} but there might be some additional latency to the event message creation. 
The design of CDC in {cass-short} assumed that when table changes are synchronized to the `raw_cdc` log, another process is draining that log. There is a max log size setting that disables writes to the table when the set threshold is reached. -If a connection to the Pulsar cluster is needed for the log to be drained, and it isn't responsive, then the log begins to fill, which can impact a table's write availability. +If a connection to the {pulsar-short} cluster is needed for the log to be drained, and it isn't responsive, then the log begins to fill, which can impact a table's write availability. For more, see the xref:cdc-for-cassandra:ROOT:install.adoc#scaling-up-your-configuration[Scaling up your CDC configuration]. @@ -204,12 +204,12 @@ If you are using {cass-short} in a serverless environment, then the JVM is alrea Number of {csc_pulsar} instances that are running:: This is initially set when the {cass-short} source connector is created, and it can be updated throughout the life of the running connector. -Depending on your Pulsar configuration, an instance can represent a process thread on the broker or a function worker. +Depending on your {pulsar-short} configuration, an instance can represent a process thread on the broker or a function worker. If using Kubernetes, this could be a pod. Each represents different scaling strategies like increasing compute, adding more workers, and more K8s nodes. Broker backlog size and throughput tolerances:: -There are potentially a large amount of messages being created, so you must ensure the Pulsar cluster is sized correctly. +There are potentially a large number of messages being created, so you must ensure the {pulsar-short} cluster is sized correctly. For more information, see xref:luna-streaming:install-upgrade:production-cluster-sizing.adoc[]. == How do I filter table data by column? @@ -221,17 +221,17 @@ Name the topic something memorable, like `filtered-data` topic. 
[#how-do-i-configure-multi-region-cdc-using-the-cassandra-sink] == How do I configure multi-region CDC using the {cass-short} sink? -One of the requirements for CDC is that both the {cass-short} and Pulsar clusters must be in the same cloud region or on-premise data center. +One of the requirements for CDC is that both the {cass-short} and {pulsar-short} clusters must be in the same cloud region or on-premise data center. If you are using geo-replication, you need the change data to be replicated across multiple clusters. -The most manageable way to handle this is to use Pulsar's {cass-short} sink to "watch" the CDC data topic and write the change to a different {cass-short} table in another organization. +The most manageable way to handle this is to use the {pulsar-short} {cass-short} sink to "watch" the CDC data topic and write the change to a different {cass-short} table in another organization. The {cass-short} sink requires the following provisions: - Use the CDC data topic as its source of messages - Provide a secure bundle (creds) to another {cass-short} cluster - Map message values to a specific table in the other cluster -- Use Pulsar's delivery guarantee to ensure success -- Use Pulsar's connector health metrics to monitor failures +- Use the {pulsar-short} delivery guarantee to ensure success +- Use the {pulsar-short} connector health metrics to monitor failures == How do I migrate table data using CDC? 
diff --git a/docs/modules/ROOT/pages/index.adoc b/docs/modules/ROOT/pages/index.adoc index c4fd6bd9..373c72d3 100644 --- a/docs/modules/ROOT/pages/index.adoc +++ b/docs/modules/ROOT/pages/index.adoc @@ -1,21 +1,21 @@ -= About {cdc_cass} += About {product-short} -{cdc_cass_first} is open-source software (OSS) that sends {cass-short} mutations for tables having Change Data Capture (CDC) enabled to https://www.ibm.com/docs/en/supportforpulsar[IBM Elite Support for Apache Pulsar] or your own self-managed https://pulsar.apache.org/[Apache Pulsar(TM)] deployment, which in turn can write the data to platforms such as Elasticsearch(R) or Snowflake(R). +{product} is open-source software (OSS) that sends {cass-short} mutations for tables having Change Data Capture (CDC) enabled to https://www.ibm.com/docs/en/supportforpulsar[IBM Elite Support for {pulsar}] or your own self-managed https://pulsar.apache.org/[{pulsar-reg}] deployment, which in turn can write the data to platforms such as Elasticsearch(R) or Snowflake(R). 
== Key Features -* Supports {cass} 3.11 or later, {cass} 4.0 or later, and {dse} 6.8.16 or later -* Supports IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) and Apache Pulsar 2.8.1 or later +* Supports {cass} version 3.11 or later, {cass} version 4.0 or later, and {dse} version 6.8.16 or later +* Supports IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) and self-managed {pulsar} version 2.8.1 or later * De-duplicates updates from multiple replicas -* Propagates {cass-short} schema change to the built-in Pulsar schema registry +* Propagates {cass-short} schema change to the built-in {pulsar-short} schema registry * Supports AVRO message format == Architecture -Other than the prerequisite {cass-short} and Pulsar clusters, {cdc_cass} has two components: +Other than the prerequisite {cass-short} and {pulsar-short} clusters, {product-short} has two components: * {cdc_agent_first}, which is an event producer deployed as a JVM agent on each {cass-short} data node -* {csc_pulsar_first}, which is a source connector deployed in your Pulsar cluster +* {csc_pulsar_first}, which is a source connector deployed in your {pulsar-short} cluster The following diagram describes the general architecture. @@ -48,10 +48,10 @@ The following table describes what is published to the data topic for each updat The {csc_pulsar} updates the schema registry to dynamically reflect the {cass-short} table schema. You can then deploy various sink connectors to replicate data into the backends of your choice. -For more, see https://pulsar.apache.org/docs/en/io-connectors/#sink-connector[Pulsar built-in sink connectors]. +For more, see https://pulsar.apache.org/docs/en/io-connectors/#sink-connector[{pulsar-short} built-in sink connectors]. Sink connectors processing messages from the data topic should interpret an event with a null value as a delete. -For example, with the Pulsar Elasticsearch connector, you need to set `nullValueAction` to `DELETE`. 
+For example, with the {pulsar-short} Elasticsearch connector, you need to set `nullValueAction` to `DELETE`. The change agent runs on all {cass-short} data nodes. This means that the agent processes the original write plus its replicas. @@ -62,23 +62,23 @@ For each update to the table, an MD5 digest is calculated to de-duplicate the up [cols="1,1"] |=== -| {cass-short} version | Apache Pulsar/IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) -| {cass-short} 3.x | https://github.com/datastax/cdc-apache-cassandra/tree/master/agent-c3[agent-c3] -| {cass-short} 4.x | https://github.com/datastax/cdc-apache-cassandra/tree/master/agent-c4[agent-c4] -| {dse-short} 6.8.16 or later | https://github.com/datastax/cdc-apache-cassandra/tree/master/agent-dse4[agent-dse4] +| {cass-short} version | Self-managed {pulsar} or IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) agent +| {cass-short} 3.x | {product-repo}/tree/master/agent-c3[agent-c3] +| {cass-short} 4.x | {product-repo}/tree/master/agent-c4[agent-c4] +| {dse-short} 6.8.16 or later | {product-repo}/tree/master/agent-dse4[agent-dse4] |=== == Supported streaming platforms -* IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) 2.8 and later (current version is {luna_version}) -* Apache Pulsar 2.8.1 and later +* IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) 2.8 and later (current version is {luna_version}) +* Self-managed {pulsar} version 2.8.1 and later === Connector deployment matrix [cols="1"] |=== -| Apache Pulsar/IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) -| https://github.com/datastax/cdc-apache-cassandra/tree/master/connector[connector] +| Self-managed {pulsar} or IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) +| {product-repo}/tree/master/connector[connector] |=== [#supported-databases] @@ -133,7 +133,7 @@ If a row update contains both supported and unsupported data types, the event in 
[#limitations] == Limitations -{cdc_cass} has the following limitations: +{product-short} has the following limitations: * Doesn't manage table truncates. Don't use the `TRUNCATE **TABLE_NAME**` command. @@ -146,9 +146,9 @@ Don't use the `TRUNCATE **TABLE_NAME**` command. * Doesn't support range deletes. -* CQL column names cannot match Pulsar primitive type names, such as the following: +* CQL column names cannot match {pulsar-short} primitive type names, such as the following: + -.Pulsar primitive types +.{pulsar-short} primitive types [cols=2] |=== |Primitive type |Description @@ -189,12 +189,12 @@ It stores the number of milliseconds since January 1, 1970, 00:00:00 GMT as an I == Manage schema updates on topics -Schema registry updates on a Pulsar topic are controlled by the `is-allow-auto-update-schema` option. +Schema registry updates on a {pulsar-short} topic are controlled by the `is-allow-auto-update-schema` option. * `true` allows the broker to register a new schema for a topic and connect the producer if the schema isn't registered. * `false` rejects the producer's connection to the broker if the schema isn't registered. -To ensure the {csc_pulsar} can automatically update the schema on the Pulsar topic, set the option to `true`. For more, see https://pulsar.apache.org/docs/en/schema-manage/[Schema Auto-Update]. +To ensure the {csc_pulsar} can automatically update the schema on the {pulsar-short} topic, set the option to `true`. For more, see https://pulsar.apache.org/docs/en/schema-manage/[Schema Auto-Update]. 
[#multiple-cassandra-datacenters] == Deploy on multiple {cass-short} datacenters diff --git a/docs/modules/ROOT/pages/install.adoc b/docs/modules/ROOT/pages/install.adoc index 705f9c90..69b3a8eb 100644 --- a/docs/modules/ROOT/pages/install.adoc +++ b/docs/modules/ROOT/pages/install.adoc @@ -1,6 +1,6 @@ -= Install and configure {cdc_cass} += Install and configure {product-short} -This guide explains how to install and configure {cdc_cass_first} for the first time on your {cass} or {dse-short} cluster and Pulsar cluster. +This guide explains how to install and configure {product} for the first time on your {cass-reg} or {dse} cluster and {pulsar-reg} cluster. [TIP] ==== @@ -11,9 +11,9 @@ For a shorter version of this guide, see xref:ROOT:cdcExample.adoc[]. The minimum requirements are as follows: -* Near real-time event streaming CDC requires {cass-short} version 3.11 or later, {cass-short} 4.0 or later, or {dse-short} 6.8.16 or later. +* Near real-time event streaming CDC requires {cass-short} version 3.11 or later, {cass-short} version 4.0 or later, or {dse-short} version 6.8.16 or later. * Batch CDC only requires {cass-short} version 3.0 to 3.10. -* IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) or Apache Pulsar 2.8.1 or later. +* IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) or self-managed {pulsar} version 2.8.1 or later. * Additional memory and CPU available on all {cass-short} nodes. [NOTE] @@ -24,9 +24,9 @@ However, for near real-time event streaming, you must run at least {cass-short} Depending on the workloads of the CDC enabled {cass-short} tables, you might need to increase the CPU and memory specification of the {cass-short} nodes. -== Install {cdc_cass} for VM deployment +== Install {product-short} for VM deployment -. Download the `cassandra-source-agents` tar file from the https://github.com/datastax/cdc-apache-cassandra/releases[{cdc_cass} GitHub repository]. +. 
Download the `cassandra-source-agents` tar file from the {product-repo}/releases[{product-short} GitHub repository]. The following files are available in the tar file: + [cols="1,1"] @@ -51,16 +51,16 @@ tar xvf cassandra-source-agents-**VERSION**.tar == Start {cass-short} with the {cdc_agent} All data nodes of your {cass-short} or {dse-short} datacenter must run the change agent as a JVM agent to send mutations into the events topic of your streaming software. -Start your {cass-short} or {dse-short} nodes with the appropriate producer binary matching your {cass-short} (3.11 or 4.0) or {dse-short} (6.8.16) version and your streaming platform: IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) 2.8+ or Apache Pulsar 2.8.1+. +Start your {cass-short} or {dse-short} nodes with the appropriate producer binary matching your {cass-short} (3.11 or 4.0) or {dse-short} (6.8.16) version and your streaming platform: IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) version 2.8 or later, or self-managed {pulsar} version 2.8.1 or later. -In {cdc_agent} versions *before 1.0.3*, the {cdc_agent} Pulsar connection parameters were provided as extra JVM options after the `jar` file name in the form of a comma-separated list of `paramName=paramValue`, as below: +In {cdc_agent} versions _before 1.0.3_, the {cdc_agent} {pulsar-short} connection parameters were provided as extra JVM options after the `jar` file name in the form of a comma-separated list of `paramName=paramValue`, as below: [source,bash,subs="+quotes"] ---- export JVM_EXTRA_OPTS="-javaagent:/path/to/agent-c4-**VERSION**-all.jar=pulsarServiceUrl=pulsar://pulsar:6650" ---- -In {cdc_agent} versions *after 1.0.3*, the {cdc_agent} Pulsar connection parameters are also provided as system environment parameters in `cassandra-env.sh`. 
The same JVM option above is now appended to `cassandra-env.sh` as below: +In {cdc_agent} versions _after 1.0.3_, the {cdc_agent} {pulsar-short} connection parameters are also provided as system environment parameters in `cassandra-env.sh`. The same JVM option above is now appended to `cassandra-env.sh` as below: [source,bash,subs="+quotes"] ---- @@ -108,9 +108,9 @@ include::ROOT:partial$agentParams.adoc[] == Download the {csc_pulsar_first} -Download the `cassandra-source-connectors` tar file from the https://github.com/datastax/cdc-apache-cassandra/releases[{cdc_cass} GitHub repository]. +Download the `cassandra-source-connectors` tar file from the {product-repo}/releases[{product-short} GitHub repository]. -For Apache Pulsar and IBM Elite Support for Apache Pulsar (formerly {company} Luna Streaming) 2.8, the `pulsar-cassandra-source-.nar` file is available. +For self-managed {pulsar} and IBM Elite Support for {pulsar} (formerly {company} Luna Streaming) 2.8, the `pulsar-cassandra-source-.nar` file is available. Extract the files from the tar, specifying the version that matches your streaming platform: @@ -121,7 +121,7 @@ tar xvf cassandra-source-connectors-**VERSION**.tar == Deploy the {csc_pulsar} -To deploy the {csc_pulsar} `NAR` file in your Pulsar cluster, upload it to your Pulsar cluster using the `pulsar-admin sources create` command. +To deploy the {csc_pulsar} `NAR` file in your {pulsar-short} cluster, upload it to your {pulsar-short} cluster using the `pulsar-admin sources create` command. You need to deploy {csc_pulsar} for each CDC-enabled table. For each CDC-enabled table, the change agent sends events to the events topic. @@ -132,7 +132,7 @@ You have to specify the following parameters: * Connector name. You have one connector per CDC-enabled {cass-short} table, make sure to use a unique name. * Previously downloaded {csc_pulsar} `NAR` file. -* Pulsar `tenant` and `namespace` where the connector will run. 
+* {pulsar-short} `tenant` and `namespace` where the connector will run. * Destination `topic` for {cass-short} data (`data` topic). * Number of instances (parallelism) of the connector. For high-volume tables, you might need to run multiple connector instances to prevent a growing backlog on the events topic. * Name of the `events` topic the connector will read from. @@ -217,7 +217,7 @@ If you don't provide either in your configuration, {csc_pulsar} defaults are app For information about the Java properties, refer to the https://docs.datastax.com/en/developer/java-driver/4.3/manual/core/configuration/reference/index.html[{company} Java driver documentation]. |=== -| {csc_pulsar_first} | Using datastax-java-driver prefix +| {csc_pulsar} | Using `datastax-java-driver` prefix | `contactPoints` | `datastax-java-driver.basic.contact-points` @@ -259,13 +259,13 @@ For more information, refer to the https://docs.datastax.com/en/developer/java-d == Scaling up your configuration If your connector isn't keeping up and the messages in the events topic are growing, increase the number of connector instances using the `parallelism` parameter. -Pulsar ensures in-order processing using Key_Shared subscriptions. +{pulsar-short} ensures in-order processing using Key_Shared subscriptions. -If the volume of data in the events topic is very high, https://pulsar.apache.org/docs/en/admin-api-topics/#manage-partitioned-topics[partition] the events topic to distribute the load across multiple Pulsar brokers. -Do this before starting the change agent because Pulsar auto-creates non-partitioned topics by default. +If the volume of data in the events topic is very high, https://pulsar.apache.org/docs/en/admin-api-topics/#manage-partitioned-topics[partition] the events topic to distribute the load across multiple {pulsar-short} brokers. +Do this before starting the change agent because {pulsar-short} auto-creates non-partitioned topics by default. 
If you are using partitioned topics, change `events.subscription.type` to `Failover` to ensure in-order delivery when running multiple connector instances. -To further improve the throughput, you can adjust the `pulsarBatchDelayInMs` in the change agent to batch messages in the change agent before sending them to Pulsar. +To further improve the throughput, you can adjust the `pulsarBatchDelayInMs` in the change agent to batch messages in the change agent before sending them to {pulsar-short}. To improve performance on individual connector instances as they read data from {cass-short}, you can adjust the `batch.size` and the `query.executors`. Increasing these values from their defaults increases parallelism within the connector instances. diff --git a/docs/modules/ROOT/pages/monitor.adoc b/docs/modules/ROOT/pages/monitor.adoc index 00fb69b3..61c0acc8 100644 --- a/docs/modules/ROOT/pages/monitor.adoc +++ b/docs/modules/ROOT/pages/monitor.adoc @@ -1,4 +1,4 @@ -= Monitoring {cdc_cass} += Monitoring {product-short} == Change Agent Metrics @@ -53,9 +53,9 @@ The change agent is a JVM agent running in {cass-reg} nodes and provides a dedic |The maximum number of uncleaned tasks. |=== -== {cdc_cass} stats +== {product-short} stats -The {cdc_cass} framework reports stats for each connector. You can view the stats for a connector like this: +The {product-short} framework reports stats for each connector. You can view the stats for a connector like this: [source,bash] ---- @@ -81,13 +81,13 @@ pulsar-admin source stats --name cassandra-source-1 } ---- -The stats `numReceivedFromSource` and `numWritten` indicate how many events have been processed by the {cdc_cass}. +The stats `numReceivedFromSource` and `numWritten` indicate how many events have been processed by the {product-short}. If the connector has errors, the counts are shown. A description of the last seen error is displayed in the `error` field. 
-== {cdc_cass} metrics +== {product-short} metrics -{cdc_cass} also publishes per message metrics: +{product-short} also publishes per message metrics: [cols="2,3"] |=== @@ -112,11 +112,11 @@ A description of the last seen error is displayed in the `error` field. |The number of threads available to execute the CQL queries. |replication_latency -|The replication latency in milliseconds (the {cdc_cass} processing time minus the {cass-short} mutation writetime). +|The replication latency in milliseconds (the {product-short} processing time minus the {cass-short} mutation writetime). |=== -Here an example of those user-defined metrics aggregated by Apache Pulsar(TM) when processing 2000 mutations: +Here is an example of those user-defined metrics aggregated by {pulsar-reg} when processing 2000 mutations: [source,bash] ---- @@ -147,6 +147,6 @@ pulsar_source_user_metric__sum{tenant="public",namespace="public/default",name=" == Monitoring and Alerting resources * The change agent exposes metrics with xref:planning:ROOT:metrics-alerts.adoc[JMX], a technology within Java that provides tools for managing and monitoring applications. -* xref:opscenter:overview:opscenter-about.adoc[OpsCenter] can collect these exposed metrics for visualization and alerts, and pass them on to xref:monitoring:ROOT:ops-use-metrics-collector.adoc[{dse-short} Metrics Collector] for additional integration with https://prometheus.io/docs/introduction/overview/[Prometheus] and https://grafana.com/[Grafana]. -* The https://github.com/datastax/metric-collector-for-apache-cassandra[Metrics Collector for {cass}] with Prometheus and Grafana dashboards provides the same functionality as {dse-short} Metrics Collector, built on the well-supported collectd agent. 
+* xref:opscenter:overview:opscenter-about.adoc[{opscenter}] can collect these exposed metrics for visualization and alerts, and pass them on to xref:monitoring:ROOT:ops-use-metrics-collector.adoc[{metrics-collector}] for additional integration with https://prometheus.io/docs/introduction/overview/[Prometheus] and https://grafana.com/[Grafana]. +* The https://github.com/datastax/metric-collector-for-apache-cassandra[Metrics Collector for {cass}] with Prometheus and Grafana dashboards provides the same functionality as {metrics-collector}, built on the well-supported collectd agent. * Other monitoring tools like https://github.com/prometheus/jmx_exporter[JMX Exporter] by Prometheus are available, but they might require additional tuning. \ No newline at end of file diff --git a/docs/modules/ROOT/pages/stringMappings.adoc b/docs/modules/ROOT/pages/stringMappings.adoc index f8f7f4f9..964a041e 100644 --- a/docs/modules/ROOT/pages/stringMappings.adoc +++ b/docs/modules/ROOT/pages/stringMappings.adoc @@ -1,17 +1,17 @@ = CDC Change Agent Parameter Mappings -In CDC versions *before 1.0.3*, the {cdc_agent} Pulsar connection parameters were provided as extra JVM options after the `.jar` file name in the form of a comma-separated list of `paramName=paramValue`, as below: +In CDC versions *before 1.0.3*, the {cdc_agent} {pulsar-short} connection parameters were provided as extra JVM options after the `.jar` file name in the form of a comma-separated list of `paramName=paramValue`, as below: [source,bash] ---- export JVM_EXTRA_OPTS="-javaagent:/path/to/agent-c4-luna--all.jar=pulsarServiceUrl=pulsar://pulsar:6650" ---- -In CDC versions *after 1.0.3*, the {cdc_agent} Pulsar connection parameters are also provided as system environment parameters in `cassandra-env.sh`. 
The JVM option above is now appended to `cassandra-env.sh` as below: +In CDC versions *after 1.0.3*, the {cdc_agent} {pulsar-short} connection parameters are also provided as system environment parameters in `cassandra-env.sh`. The JVM option above is now appended to `cassandra-env.sh` as below: -[source,bash] +[source,bash,subs="+quotes"] ---- -export CDC_PULSAR_SERVICE_URL="pulsar://:6650" +export CDC_PULSAR_SERVICE_URL="pulsar://**PULSAR_SERVER_IP**:6650" ---- This document lists the CDC Change Agent parameter mappings between the JVM option strings and {cass-short} strings. @@ -52,37 +52,37 @@ This document lists the CDC Change Agent parameter mappings between the JVM opti | *pulsarServiceUrl* -| The Pulsar broker service URL. +| The {pulsar-short} broker service URL. | PULSAR_SERVICE_URL | *pulsarBatchDelayInMs* -| Pulsar batching delay in milliseconds. Pulsar batching is enabled when this value is greater than zero. +| {pulsar-short} batching delay in milliseconds. {pulsar-short} batching is enabled when this value is greater than zero. | PULSAR_BATCH_DELAY_IN_MS | *pulsarKeyBasedBatcher* -| When true, use the Pulsar KEY_BASED BatchBuilder. +| When true, use the {pulsar-short} KEY_BASED BatchBuilder. | PULSAR_KEY_BASED_BATCHER | *pulsarMaxPendingMessages* -| The Pulsar maximum size of a queue holding pending messages. +| The {pulsar-short} maximum size of a queue holding pending messages. | PULSAR_MAX_PENDING_MESSAGES | *pulsarMaxPendingMessagesAcrossPartitions* -| The Pulsar maximum number of pending messages across partitions. +| The {pulsar-short} maximum number of pending messages across partitions. | PULSAR_MAX_PENDING_MESSAGES_ACROSS_PARTITIONS | *pulsarAuthPluginClassName* -| The Pulsar authentication plugin class name. +| The {pulsar-short} authentication plugin class name. | PULSAR_AUTH_PLUGIN_CLASS_NAME | *pulsarAuthParams* -| The Pulsar authentication parameters. +| The {pulsar-short} authentication parameters. | PULSAR_AUTH_PARAMS