diff --git a/.vscode/launch.json b/.vscode/launch.json index dd543f9..f4fab5a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -21,6 +21,15 @@ "console": "integratedTerminal", "args": ["--config-path", "./example/pipeline_parking_sensors.json", "--working-dir", "./tmp", "--show-result", "--cleanup-database"], "justMyCode": true + }, + { + "name": "Python: main.py JDBC", + "type": "python", + "request": "launch", + "program": "src/main.py", + "console": "integratedTerminal", + "args": ["--config-path", "./example/pipeline_fruit_batch_JDBC.json", "--working-dir", "./tmp", "--show-result", "--cleanup-database"], + "justMyCode": true }, { "name": "Python: main.py ADLS", diff --git a/doc/ADLS_ingestion.md b/doc/ADLS_ingestion.md new file mode 100644 index 0000000..250f34a --- /dev/null +++ b/doc/ADLS_ingestion.md @@ -0,0 +1,70 @@ +# ADLS/JDBC Data Ingestion + +## If you want to set up ADLS or JDBC as a pipeline data source, you need to set up a service principal, an Azure Key Vault and a Databricks secret scope + +### Create an Azure Data Lake Storage Account + +- The Data Lake Storage account that will use the service principal must be Gen2. Azure Data Lake Gen1 storage accounts are not supported. +- The Data Lake Storage account has hierarchical namespace enabled. +- Admin permissions for your Azure tenant, if you have to create a new service principal. + +### Create an Azure service principal for Customer Insights + +#### Look for an existing service principal + +1. Go to the Azure admin portal and sign in to your organization. +2. From Azure services, select Azure Active Directory. +3. 
Under Manage, select App registrations. +4. Select New Registration. +![new app](images/new_app.png) +5. Record the following information: + - Application (client) ID + - Directory (tenant) ID + - Client credentials +![new app](images/app_info.png) + +#### Grant permissions to the service principal to access the storage account + +1. Go to the ADLS in the Azure admin portal. +2. Select Access Control (IAM). +3. Select Add role assignment. +![new app](images/app_adls01.png) +4. Under the label Role, select Contributor => Next. +![new app](images/app_adls02.png) +5. Under the label Members, select User, group, service principal. +6. Search for and select the service principal name just created. +7. Click Review + assign. +![new app](images/app_adls03.png) + +#### Add ADLS and JDBC info into Azure Key Vault + +- Select Secrets => Generate/Import. +- Add the service principal and the JDBC user name and password. +![new app](images/key_vault.png) + +#### Add a Databricks secret scope to access Key Vault +The secret scope is stored in (backed by) an encrypted database owned and managed by Azure Databricks. The secret scope name: + +- Must be unique within a workspace. +- Must consist of alphanumeric characters, dashes, underscores, @, and periods, and may not exceed 128 characters. + +The names are considered non-sensitive and are readable by all users in the workspace. + +#### Create an Azure Key Vault-backed secret scope using the UI + +- Go to https://#secrets/createScope. This URL is case sensitive; scope in createScope must be uppercase. 
+![new app](images/dbs_secret_scope.png) + +Input the scope name, the Azure Key Vault DNS Name and the Resource ID. + +You can get the DNS Name and Resource ID values from the Azure portal: your key vault => Properties. +![new app](images/key_vault02.png) + +Finally, your account is ready and you can access ADLS and JDBC in Databricks. \ No newline at end of file diff --git a/doc/images/app_adls01.png b/doc/images/app_adls01.png new file mode 100644 index 0000000..ae35c68 Binary files /dev/null and b/doc/images/app_adls01.png differ diff --git a/doc/images/app_adls02.png b/doc/images/app_adls02.png new file mode 100644 index 0000000..d1eb042 Binary files /dev/null and b/doc/images/app_adls02.png differ diff --git a/doc/images/app_adls03.png b/doc/images/app_adls03.png new file mode 100644 index 0000000..a7a2885 Binary files /dev/null and b/doc/images/app_adls03.png differ diff --git a/doc/images/app_info.png b/doc/images/app_info.png new file mode 100644 index 0000000..5b664bc Binary files /dev/null and b/doc/images/app_info.png differ diff --git a/doc/images/dbs_secret_scope.png b/doc/images/dbs_secret_scope.png new file mode 100644 index 0000000..2367ef3 Binary files /dev/null and b/doc/images/dbs_secret_scope.png differ diff --git a/doc/images/key_vault.png b/doc/images/key_vault.png new file mode 100644 index 0000000..254979f Binary files /dev/null and b/doc/images/key_vault.png differ diff --git a/doc/images/key_vault02.png b/doc/images/key_vault02.png new file mode 100644 index 0000000..c6eabb7 Binary files /dev/null and b/doc/images/key_vault02.png differ diff --git a/doc/images/new_app.png b/doc/images/new_app.png new file mode 100644 index 0000000..bc071a5 Binary files /dev/null and b/doc/images/new_app.png differ diff --git a/example/pipeline_fruit_batch_ADLS.json b/example/pipeline_fruit_batch_ADLS.json index 7bb83f1..9825dd0 100644 --- a/example/pipeline_fruit_batch_ADLS.json +++ b/example/pipeline_fruit_batch_ADLS.json @@ -69,7 +69,44 @@ } ], "type": "struct" - } 
+ }, + "sampleData": [ + { + "ID": 1, + "Amount": 12, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 2, + "Amount": 13, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 3, + "Amount": 14, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 4, + "Amount": 15, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 5, + "Amount": 16, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 6, + "Amount": 17, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 7, + "Amount": 18, + "TS": "2022-01-10T00:00:00.000+08:00" + } + ] }, { "name": "price_ingestion", @@ -175,7 +212,65 @@ } ], "type": "struct" - } + }, + "sampleData": [ + { + "ID": 1, + "Fruit": "Red Grape", + "Color": "Red", + "Price": 2, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 2, + "Fruit": "Peach", + "Color": "Yellow", + "Price": 3, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 3, + "Fruit": "Orange", + "Color": "Orange", + "Price": 2, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 4, + "Fruit": "Green Apple", + "Color": "Green", + "Price": 3, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 5, + "Fruit": "Fiji Apple", + "Color": "Red", + "Price": 3.5, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 6, + "Fruit": "Banana", + "Color": "Yellow", + "Price": 1, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 7, + "Fruit": "Green Grape", + "Color": " Green", + "Price": 2, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + } + ] } ], "standard": [ diff --git a/example/pipeline_fruit_batch_JDBC.json b/example/pipeline_fruit_batch_JDBC.json new file mode 100644 index 0000000..6afbbbc 
--- /dev/null +++ b/example/pipeline_fruit_batch_JDBC.json @@ -0,0 +1,306 @@ +{ + "name": "fruit_batch_data_app", + "staging": [ + { + "name": "sales_ingestion", + "target": "stg_sales", + "input": { + "type": "jdbc", + "format": "csv", + "path": "FileStore/cddp_apps/fruit_batch_data_app/landing/sales_ingestion/", + "read-type": "batch", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "ID", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "Amount", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "TS", + "nullable": true, + "type": "timestamp" + } + ], + "type": "struct" + }, + "secret_scope": "ADLSTEST", + "table_name": "fruit_sales", + "jdbc_url": "jdbc:sqlserver://jdbctestbo.database.windows.net:1433;database=JDBCTEST", + "jdbc_password": "jdbc-password", + "jdbc_username": "jdbc-username" + }, + "output": { + "target": "stg_sales", + "type": [ + "file", + "view" + ] + }, + "schema": { + "fields": [ + { + "metadata": {}, + "name": "ID", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "Amount", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "TS", + "nullable": true, + "type": "timestamp" + } + ], + "type": "struct" + }, + "sampleData": [ + { + "ID": 1, + "Amount": 12, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 2, + "Amount": 13, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 3, + "Amount": 14, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 4, + "Amount": 15, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 5, + "Amount": 16, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 6, + "Amount": 17, + "TS": "2022-01-10T00:00:00.000+08:00" + }, + { + "ID": 7, + "Amount": 18, + "TS": "2022-01-10T00:00:00.000+08:00" + } + ] + }, + { + "name": "price_ingestion", + "target": "stg_price", + "input": { + "type": "jdbc", + "format": "csv", + "path": 
"FileStore/cddp_apps/fruit_batch_data_app/landing/price_ingestion/", + "read-type": "batch", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "ID", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "Fruit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "Color", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "Price", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "Start_TS", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "End_TS", + "nullable": true, + "type": "timestamp" + } + ], + "type": "struct" + }, + "secret_scope": "ADLSTEST", + "table_name": "fruit_price", + "jdbc_url": "jdbc:sqlserver://jdbctestbo.database.windows.net:1433;database=JDBCTEST", + "jdbc_password": "jdbc-password", + "jdbc_username": "jdbc-username" + }, + "output": { + "target": "stg_price", + "type": [ + "file", + "view" + ] + }, + "schema": { + "fields": [ + { + "metadata": {}, + "name": "ID", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "Fruit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "Color", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "Price", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "Start_TS", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "End_TS", + "nullable": true, + "type": "timestamp" + } + ], + "type": "struct" + }, + "sampleData": [ + { + "ID": 1, + "Fruit": "Red Grape", + "Color": "Red", + "Price": 2, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 2, + "Fruit": "Peach", + "Color": "Yellow", + "Price": 3, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 3, + "Fruit": "Orange", + "Color": "Orange", + "Price": 2, + "Start_TS": 
"2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 4, + "Fruit": "Green Apple", + "Color": "Green", + "Price": 3, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 5, + "Fruit": "Fiji Apple", + "Color": "Red", + "Price": 3.5, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 6, + "Fruit": "Banana", + "Color": "Yellow", + "Price": 1, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + }, + { + "ID": 7, + "Fruit": "Green Grape", + "Color": " Green", + "Price": 2, + "Start_TS": "2015-01-01T00:00:00.000+08:00", + "End_TS": "2099-12-31T23:59:59.000+08:00" + } + ] + } + ], + "standard": [ + { + "name": "fruit_sales_transform", + "type": "batch", + "code": { + "lang": "sql", + "sql": "select price.fruit, price.id, sales.amount, price.price, sales.ts from stg_sales sales left outer join stg_price price on sales.id = price.id and sales.ts >= price.start_ts and sales.ts < price.end_ts" + }, + "output": { + "target": "std_fruit_sales", + "type": [ + "file", + "view" + ] + } + } + ], + "serving": [ + { + "name": "fruit_sales_total_curation", + "type": "batch", + "code": { + "lang": "sql", + "sql": "select id, fruit, sum(amount*price) as total from std_fruit_sales group by id, fruit order by total desc" + }, + "output": { + "target": "srv_fruit_sales_total", + "type": [ + "table", + "file" + ] + } + } + ] +} \ No newline at end of file diff --git a/src/cddp/dbxapi.py b/src/cddp/dbxapi.py index 1cb2f8a..87c314d 100644 --- a/src/cddp/dbxapi.py +++ b/src/cddp/dbxapi.py @@ -90,6 +90,8 @@ def build_tasks(config, working_dir, config_path, dbx_cluster): type = task["input"]["type"] if type == "filestore": mode = task["input"]["read-type"] + elif type == "jdbc": + mode = task["input"]["read-type"] elif type == "azure_adls_gen2": mode = task["input"]["read-type"] name = task["name"] diff 
--git a/src/cddp/ingestion/__init__.py b/src/cddp/ingestion/__init__.py index 4ad8d38..52dda1f 100644 --- a/src/cddp/ingestion/__init__.py +++ b/src/cddp/ingestion/__init__.py @@ -1,6 +1,6 @@ import cddp.ingestion.autoloader import cddp.ingestion.azure_eventhub -import cddp.ingestion.azure_adls_gen2 +import cddp.ingestion.jdbc import cddp.ingestion.azure_adls_gen1 import cddp.ingestion.azure_adls_gen2 import cddp.ingestion.filestore diff --git a/src/cddp/ingestion/jdbc.py b/src/cddp/ingestion/jdbc.py index a5216a7..871cd66 100644 --- a/src/cddp/ingestion/jdbc.py +++ b/src/cddp/ingestion/jdbc.py @@ -1,19 +1,21 @@ from pyspark.sql.types import * +import IPython def start_ingestion_task(task, spark): - import dbutils + # import dbutils + dbutils = IPython.get_ipython().user_ns["dbutils"] schema = StructType.fromJson(task["schema"]) - username = dbutils.secrets.get(scope = task["secret_scope"], key = task["jdbc_username"]) - password = dbutils.secrets.get(scope = task["secret_scope"], key = task["jdbc_password"]) + username = dbutils.secrets.get(scope = task["input"]["secret_scope"], key = task["input"]["jdbc_username"]) + password = dbutils.secrets.get(scope = task["input"]["secret_scope"], key = task["input"]["jdbc_password"]) - spark.read \ - .format("jdbc") \ - .option("url", task["jdbc_url"]) \ - .option("dbtable", task["table_name"]) \ - .option("user", username) \ - .option("password", password) \ - .schema(schema) \ - .load() + df = spark.read \ + .format("jdbc") \ + .option("url", task["input"]["jdbc_url"]) \ + .option("dbtable", task["input"]["table_name"]) \ + .option("user", username) \ + .option("password", password) \ + .schema(schema) \ + .load() return df, False \ No newline at end of file diff --git a/web/index.html b/web/index.html index bb9c32e..3853180 100644 --- a/web/index.html +++ b/web/index.html @@ -285,6 +285,16 @@

placeholder="JDBC URL"> +
+ + +
+
- +
+ + +
+
+ + +