From 87d688fda3690a6aa7990350d4f8c86f478761ff Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Thu, 6 Nov 2025 15:07:00 -0500
Subject: [PATCH 01/20] Add index creation script for onsides PostgreSQL schema

---
 database/schema/postgres_indexes.sql | 63 ++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 database/schema/postgres_indexes.sql

diff --git a/database/schema/postgres_indexes.sql b/database/schema/postgres_indexes.sql
new file mode 100644
index 0000000..9e41522
--- /dev/null
+++ b/database/schema/postgres_indexes.sql
@@ -0,0 +1,63 @@
+-- Indexes for OnSIDES PostgreSQL schema
+-- Safe to run repeatedly (IF NOT EXISTS)
+
+-- Ensure we target the onsides schema by default
+SET search_path TO onsides, public;
+
+-- product_adverse_effect
+-- Primary key already indexes effect_id; adding targeted indexes used in filters/joins
+CREATE INDEX IF NOT EXISTS idx_pae_match_method ON product_adverse_effect (match_method);
+CREATE INDEX IF NOT EXISTS idx_pae_label_section ON product_adverse_effect (label_section);
+CREATE INDEX IF NOT EXISTS idx_pae_effect_meddra_id ON product_adverse_effect (effect_meddra_id);
+CREATE INDEX IF NOT EXISTS idx_pae_product_label_id ON product_adverse_effect (product_label_id);
+
+-- product_label
+CREATE INDEX IF NOT EXISTS idx_pl_source ON product_label (source);
+
+-- vocab_meddra_adverse_effect
+-- PK on meddra_id already exists; add name/type for lookups
+CREATE INDEX IF NOT EXISTS idx_vmae_name ON vocab_meddra_adverse_effect (meddra_name);
+CREATE INDEX IF NOT EXISTS idx_vmae_term_type ON vocab_meddra_adverse_effect (meddra_term_type);
+
+-- vocab_rxnorm_ingredient
+-- PK on rxnorm_id already exists; add name/type for lookups
+CREATE INDEX IF NOT EXISTS idx_vri_name ON vocab_rxnorm_ingredient (rxnorm_name);
+CREATE INDEX IF NOT EXISTS idx_vri_term_type ON vocab_rxnorm_ingredient (rxnorm_term_type);
+
+-- vocab_rxnorm_product
+-- PK on rxnorm_id already exists; add name/type for lookups
+CREATE INDEX IF NOT EXISTS idx_vrp_name ON vocab_rxnorm_product (rxnorm_name);
+CREATE INDEX IF NOT EXISTS idx_vrp_term_type ON vocab_rxnorm_product (rxnorm_term_type);
+
+-- 
vocab_rxnorm_ingredient_to_product +-- Composite PK (ingredient_id, product_id) supports ingredient->products lookups. +-- Add reverse index to speed product->ingredients lookups. +CREATE INDEX IF NOT EXISTS idx_vrip_product_id ON vocab_rxnorm_ingredient_to_product (product_id); From 894ac12780d36c73cf33dbb9d1de7788984e2e37 Mon Sep 17 00:00:00 2001 From: wolfderby Date: Thu, 6 Nov 2025 15:10:33 -0500 Subject: [PATCH 02/20] feat: add entries to .gitignore for new data and VSCode settings --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 10726a8..914eb03 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ log database/onsides.db db.zip summary.md +data_our_improvements/yay +data_our_improvements/yay.pub +.vscode/settings.json From 563498fb5b94df699210c90126ba0bf46af00459 Mon Sep 17 00:00:00 2001 From: wolfderby Date: Fri, 7 Nov 2025 04:01:36 -0500 Subject: [PATCH 03/20] Add archive + delete orphan rows script for cem_development_2025 (dry-run/commit switch) --- .../archive_and_delete_orphans_2025.sql | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 database/scripts/archive_and_delete_orphans_2025.sql diff --git a/database/scripts/archive_and_delete_orphans_2025.sql b/database/scripts/archive_and_delete_orphans_2025.sql new file mode 100644 index 0000000..2256ab0 --- /dev/null +++ b/database/scripts/archive_and_delete_orphans_2025.sql @@ -0,0 +1,76 @@ +-- Archive and delete orphan rows in cem_development_2025 (dry-run by default) +-- Usage (dry-run): +-- psql -p 5433 -d cem_development_2025 -v do_commit=false -f database/scripts/archive_and_delete_orphans_2025.sql +-- To actually commit changes (careful): +-- psql -p 5433 -d cem_development_2025 -v do_commit=true -f database/scripts/archive_and_delete_orphans_2025.sql + +\set ON_ERROR_STOP on + +SET search_path TO onsides, public; + +-- Create archive tables (structure copied from original tables, plus metadata) +CREATE TABLE IF NOT EXISTS onsides.orphan_product_adverse_effect_archive ( + LIKE onsides.product_adverse_effect INCLUDING ALL, + archived_at timestamptz DEFAULT now(), + source_db text +); + +CREATE TABLE IF NOT EXISTS onsides.orphan_product_to_rxnorm_archive ( + LIKE onsides.product_to_rxnorm INCLUDING ALL, + archived_at timestamptz DEFAULT now(), + source_db text +); + +-- We'll run each archive/cleanup in a single transaction. 
Default behavior: DRY RUN (ROLLBACK) unless do_commit=true
+BEGIN;
+
+-- 1) Archive product_adverse_effect rows whose effect_meddra_id has no parent
+INSERT INTO onsides.orphan_product_adverse_effect_archive
+SELECT pae.*, now() AS archived_at, current_database() AS source_db
+FROM onsides.product_adverse_effect pae
+LEFT JOIN onsides.vocab_meddra_adverse_effect v ON pae.effect_meddra_id = v.meddra_id
+WHERE v.meddra_id IS NULL;
+
+-- Report how many would be archived and how many orphan rows remain (sanity check)
+\echo 'ARCHIVE product_adverse_effect'
+SELECT 'to_archive_count' AS label, COUNT(*) FROM onsides.orphan_product_adverse_effect_archive WHERE source_db = current_database();
+SELECT 'current_orphan_count' AS label, COUNT(*) FROM onsides.product_adverse_effect pae LEFT JOIN onsides.vocab_meddra_adverse_effect v ON pae.effect_meddra_id = v.meddra_id WHERE v.meddra_id IS NULL;
+
+-- 2) Archive product_to_rxnorm rows whose rxnorm_product_id has no parent
+INSERT INTO onsides.orphan_product_to_rxnorm_archive
+SELECT ptr.*, now() AS archived_at, current_database() AS source_db
+FROM onsides.product_to_rxnorm ptr
+LEFT JOIN onsides.vocab_rxnorm_product v ON ptr.rxnorm_product_id = v.rxnorm_id
+WHERE v.rxnorm_id IS NULL;
+
+\echo 'ARCHIVE product_to_rxnorm'
+SELECT 'to_archive_count' AS label, COUNT(*) FROM onsides.orphan_product_to_rxnorm_archive WHERE source_db = current_database();
+SELECT 'current_orphan_count' AS label, COUNT(*) FROM onsides.product_to_rxnorm ptr LEFT JOIN onsides.vocab_rxnorm_product v ON ptr.rxnorm_product_id = v.rxnorm_id WHERE v.rxnorm_id IS NULL;
+
+-- At this point archive tables contain the orphan rows. Now delete from child tables if we're committing.
+-- Note: deletions are safe (targeting only rows still orphaned at time of execution).
+
+\echo 'Preparing to delete archived orphan rows (will only commit if do_commit=true)'
+
+\if :do_commit
+  -- Perform deletes and commit
+  DELETE FROM onsides.product_adverse_effect pae
+  WHERE NOT EXISTS (SELECT 1 FROM onsides.vocab_meddra_adverse_effect v WHERE v.meddra_id = pae.effect_meddra_id);
+
+  SELECT 'remaining_orphans_product_adverse_effect' AS label, COUNT(*) FROM onsides.product_adverse_effect pae WHERE NOT EXISTS (SELECT 1 FROM onsides.vocab_meddra_adverse_effect v WHERE v.meddra_id = pae.effect_meddra_id);
+
+  DELETE FROM onsides.product_to_rxnorm ptr
+  WHERE NOT EXISTS (SELECT 1 FROM onsides.vocab_rxnorm_product v WHERE v.rxnorm_id = ptr.rxnorm_product_id);
+
+  SELECT 'remaining_orphans_product_to_rxnorm' AS label, COUNT(*) FROM onsides.product_to_rxnorm ptr WHERE NOT EXISTS (SELECT 1 FROM onsides.vocab_rxnorm_product v WHERE v.rxnorm_id = ptr.rxnorm_product_id);
+
+  COMMIT;
+  \echo 'COMMITTED: orphan deletes applied.'
+\else
+  -- Dry run: rollback so no data is changed
+  ROLLBACK;
+  \echo 'DRY RUN complete: no changes committed. To apply deletions, re-run with -v do_commit=true.'
+\endif
+
+-- Summary notes (printed by client):
+\echo 'Script finished.' 
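
A minimal walkthrough of the dry-run/commit switch above (a sketch; the port and database name come from the script's own usage comments, and the tee log path is illustrative):

```bash
# Dry run first: the transaction is rolled back, but the counts are printed.
psql -p 5433 -d cem_development_2025 -v do_commit=false \
  -f database/scripts/archive_and_delete_orphans_2025.sql | tee /tmp/orphans_dry_run.log

# If the reported orphan counts look right, re-run with do_commit=true to apply.
psql -p 5433 -d cem_development_2025 -v do_commit=true \
  -f database/scripts/archive_and_delete_orphans_2025.sql

# Confirm the archived rows landed for this database.
psql -p 5433 -d cem_development_2025 -tA -c \
  "SELECT COUNT(*) FROM onsides.orphan_product_adverse_effect_archive WHERE source_db = current_database();"
```
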
From fce5d1c2c085c1d1da9d3af177063c33d9c82618 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Fri, 7 Nov 2025 12:57:26 -0500
Subject: [PATCH 04/20] feat: add qa_faers_wc_import_log script for logging file processing metrics

---
 database/qa/qa_faers_wc_import_log.sh | 136 ++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 database/qa/qa_faers_wc_import_log.sh

diff --git a/database/qa/qa_faers_wc_import_log.sh b/database/qa/qa_faers_wc_import_log.sh
new file mode 100644
index 0000000..750691d
--- /dev/null
+++ b/database/qa/qa_faers_wc_import_log.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<USAGE
+Usage: $0 \
+  --file FILE \
+  --source TAG \
+  --year YY|YYYY \
+  --quarter 1-4 \
+  --domain-table TABLE \
+  [--domain-schema SCHEMA] \
+  [--execution-id ID] \
+  [--log-filename NAME]
+
+Notes:
+- Uses PG* env vars for connection (PGHOST, PGPORT, PGUSER, PGDATABASE, PGPASSWORD, etc.).
+- Computes wc -l on the file and SELECT COUNT(*) from <domain-schema>.<domain-table>.
+- Inserts a log row into onsides.z_qa_faers_wc_import_log.
+- The "--source" can be a free-form tag (e.g., FAERS, LAERS, or a release like v3.1.0). Max length 10.
+USAGE
+}
+
+FILE=""
+SOURCE=""
+YEAR=""
+QUARTER=""
+DOMAIN_TABLE=""
+DOMAIN_SCHEMA="onsides"
+EXECUTION_ID=""
+LOG_FILENAME=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --file) FILE="$2"; shift 2 ;;
+    --source) SOURCE="$2"; shift 2 ;;
+    --year) YEAR="$2"; shift 2 ;;
+    --quarter) QUARTER="$2"; shift 2 ;;
+    --domain-table) DOMAIN_TABLE="$2"; shift 2 ;;
+    --domain-schema) DOMAIN_SCHEMA="$2"; shift 2 ;;
+    --execution-id) EXECUTION_ID="$2"; shift 2 ;;
+    --log-filename) LOG_FILENAME="$2"; shift 2 ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "Unknown arg: $1" >&2; usage; exit 2 ;;
+  esac
+done
+
+[[ -z "$FILE" || -z "$SOURCE" || -z "$YEAR" || -z "$QUARTER" || -z "$DOMAIN_TABLE" ]] && { usage; exit 2; }
+
+if [[ ! -f "$FILE" ]]; then
+  echo "File not found: $FILE" >&2
+  exit 1
+fi
+
+########################################
+# Normalize and validate
+########################################
+
+# Allow any non-empty source/tag up to 10 chars (column is varchar(10))
+if [[ ${#SOURCE} -gt 10 ]]; then
+  echo "--source length must be <= 10 characters (got ${#SOURCE})" >&2
+  exit 2
+fi
+SOURCE_UPPER=$(echo "$SOURCE" | tr '[:lower:]' '[:upper:]')
+
+if ! [[ "$YEAR" =~ ^[0-9]{2}$|^[0-9]{4}$ ]]; then
+  echo "--year must be either YY or YYYY (e.g., 25 or 2025)" >&2; exit 2
+fi
+
+if ! 
[[ "$QUARTER" =~ ^[1-4]$ ]]; then + echo "--quarter must be 1-4" >&2; exit 2 +fi + +if [[ -z "$LOG_FILENAME" ]]; then + LOG_FILENAME=$(basename -- "$FILE") +fi + +WC_L_COUNT=$(wc -l < "$FILE" | tr -d ' ') + +# CSV-aware logical record count (excludes header; handles embedded newlines) +CSV_RECORD_COUNT=$(python3 - <<'PY' "$FILE" +import csv, sys +fn = sys.argv[1] +count = 0 +with open(fn, 'r', newline='') as f: + reader = csv.reader(f) + try: + next(reader) # skip header + except StopIteration: + print(0) + raise SystemExit(0) + for _ in reader: + count += 1 +print(count) +PY +) + +QUALIFIED_TABLE="${DOMAIN_SCHEMA}.${DOMAIN_TABLE}" + +# Ensure the log table exists (safe to run repeatedly) +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +psql -v ON_ERROR_STOP=1 -f "$SCRIPT_DIR/z_qa_faers_wc_import_log.sql" + +# Get domain table count +DOMAIN_COUNT=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT COUNT(*) FROM ${QUALIFIED_TABLE};") + +escape_sql() { printf "%s" "${1//\'/''}"; } + +LOG_FN_ESC=$(escape_sql "$LOG_FILENAME") +FILE_ESC=$(escape_sql "$FILE") +SOURCE_ESC=$(escape_sql "$SOURCE_UPPER") + +if [[ -z "$EXECUTION_ID" ]]; then EXEC_SQL="NULL"; else EXEC_SQL="$EXECUTION_ID"; fi + +SQL_STMT="INSERT INTO onsides.z_qa_faers_wc_import_log + (log_filename, filename, laers_or_faers, yr, qtr, wc_l_count, + select_count_on_domain, select_count_diff, select_count_diff_pct, + execution_id, csv_record_count, csv_count_diff, csv_count_diff_pct) +VALUES + ('$LOG_FN_ESC', '$FILE_ESC', '$SOURCE_ESC', ${YEAR}::int, ${QUARTER}::int, ${WC_L_COUNT}::int, + ${DOMAIN_COUNT}::int, + (${DOMAIN_COUNT}::int - ${WC_L_COUNT}::int), + CASE WHEN ${WC_L_COUNT}::int = 0 THEN NULL + ELSE ((${DOMAIN_COUNT})::float8 - (${WC_L_COUNT})::float8) / NULLIF((${WC_L_COUNT})::float8, 0) + END, + ${EXEC_SQL}, + ${CSV_RECORD_COUNT}::int, + (${DOMAIN_COUNT}::int - ${CSV_RECORD_COUNT}::int), + CASE WHEN ${CSV_RECORD_COUNT}::int = 0 THEN NULL + ELSE ((${DOMAIN_COUNT})::float8 - (${CSV_RECORD_COUNT})::float8) / NULLIF((${CSV_RECORD_COUNT})::float8, 0) + END + );" + +psql -v ON_ERROR_STOP=1 -c "$SQL_STMT" + +echo "Logged: file=$FILE source=$SOURCE_UPPER year=$YEAR qtr=$QUARTER wc_l=$WC_L_COUNT csv_records=$CSV_RECORD_COUNT domain=${QUALIFIED_TABLE} domain_count=$DOMAIN_COUNT" From df1d4c5b9809a947ee5f921a923fa920d56b17e0 Mon Sep 17 00:00:00 2001 From: wolfderby Date: Fri, 7 Nov 2025 13:04:42 -0500 Subject: [PATCH 05/20] database/qa: update README to reflect current z_qa_faers_wc_import_log DDL (csv columns, types, awk_nl_count removed) --- database/qa/README.md | 67 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 database/qa/README.md diff --git a/database/qa/README.md b/database/qa/README.md new file mode 100644 index 0000000..ee5915a --- /dev/null +++ b/database/qa/README.md @@ -0,0 +1,67 @@ +# QA workflow: FAERS/LAERS import log + +This workflow logs basic QA metrics comparing a source data file's line count to the row count in a target database table. Results are inserted into `onsides.z_qa_faers_wc_import_log`. + +## What it records +- `log_filename` (varchar(255), NOT NULL): basename of the source file (placeholder; can be customized) +- `filename` (varchar(255), NOT NULL): full path to the source file at logging time +- `laers_or_faers` (varchar(10), NOT NULL): a short dataset tag, e.g. 
`FAERS`, `LAERS`, or a release tag like `v3.1.0`
+- `yr` (int, NOT NULL): dataset year (YY or YYYY)
+- `qtr` (int, NOT NULL): dataset quarter (1–4)
+- `wc_l_count` (int, NOT NULL): raw `wc -l` physical line count of the source file (includes header)
+- `loaded_at` (timestamp, DEFAULT CURRENT_TIMESTAMP): when the log row was inserted
+- `select_count_on_domain` (int, NULL): `SELECT COUNT(*)` on the specified domain table at logging time
+- `select_count_diff` (int, NULL): `select_count_on_domain - wc_l_count`
+- `select_count_diff_pct` (float, NULL): `select_count_diff / NULLIF(wc_l_count,0)` as a float
+- `execution_id` (int, NULL): optional execution identifier for grouping runs
+- `csv_record_count` (int, NULL): CSV-aware logical record count (header skipped; embedded newlines handled)
+- `csv_count_diff` (int, NULL): `select_count_on_domain - csv_record_count`
+- `csv_count_diff_pct` (float, NULL): `csv_count_diff / NULLIF(csv_record_count,0)` as a float
+
+Notes about schema changes
+- The table DDL adds the CSV-aware columns (`csv_record_count`, `csv_count_diff`, `csv_count_diff_pct`) when the table already exists. These columns are present in the current DDL.
+- A previously-used `awk_nl_count` column (awk-based physical line count) was removed from the DDL; the workflow now uses `wc -l` for physical counts and a CSV-aware parser for logical counts.
+
+## Usage
+
+The script uses your normal PostgreSQL environment variables (e.g., `PGHOST`, `PGPORT`, `PGUSER`, `PGDATABASE`, etc.).
+
+```
+# Ensure the log table exists (one-time or on demand)
+psql -v ON_ERROR_STOP=1 -f database/qa/z_qa_faers_wc_import_log.sql
+
+# Log a dataset (example with release tag v3.1.0, YY=25, Q2)
+bash database/qa/qa_faers_wc_import_log.sh \
+  --file /path/to/source_file.txt \
+  --source v3.1.0 \
+  --year 25 \
+  --quarter 2 \
+  --domain-table product_adverse_effect \
+  --domain-schema onsides \
+  --execution-id 123
+```
+
+Notes:
+- If `--domain-schema` is omitted, it defaults to `onsides`.
+- `--log-filename` is optional; defaults to the basename of `--file`.
+- Year supports YY or YYYY (e.g., 25 or 2025).
+- `select_count_on_domain` is computed from `SELECT COUNT(*) FROM <domain-schema>.
<domain-table>`.
+
+## Troubleshooting
+- Permission denied: ensure your DB user can `SELECT` from the domain table and `INSERT` into `onsides.z_qa_faers_wc_import_log`.
+- Table not found: pass `--domain-schema` if your table is not in `onsides`.
+- Line counts include headers: this workflow uses raw `wc -l` as requested.
+
+### About physical vs logical row counts
+- `wc -l` counts physical newline characters; it includes the header row and is unaware of CSV quoting.
+- `awk 'END{print NR}'` also counts physical lines and usually equals `wc -l` (they only differ by one if a file is missing a trailing newline).
+- `csv_record_count` is computed with a CSV parser: it skips the header and treats embedded newlines inside quoted fields as part of the same record. This is the correct basis for comparing to database row counts after a CSV import.
+
+In particular for `product_label.csv`:
+- `wc -l` and `awk` report 63,282 (physical lines).
+- `csv_record_count` is 59,515 (logical records, excluding the header).
+- `SELECT COUNT(*)` on `onsides.product_label` is 59,515, so `csv_count_diff = 0`. Any non-zero `select_count_diff` in this case is due to the use of physical lines instead of logical records.
+
+# 11/04/2025 load
+
+Today, we focused on logging QA metrics for the OnSIDES v3.1.0 release. This included running the QA logger for additional input files and ensuring that the logical CSV record counts match the row counts in the database tables. Discrepancies were investigated (for example, `product_label.csv` had embedded newlines in quoted fields which made `wc -l` differ from logical CSV record counts). The README and DDL were updated to reflect the CSV-aware columns and the removal of the old `awk_nl_count` field.

From 2d669d746f05aafdd5b8cef5ddd10f1397e7c195 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Fri, 7 Nov 2025 13:07:08 -0500
Subject: [PATCH 06/20] database/qa: rename laers_or_faers -> onsides_release_version in DDL, comments, script, and README (increase length to 32)

---
 database/qa/README.md                    |  2 +-
 database/qa/qa_faers_wc_import_log.sh    | 10 ++--
 database/qa/z_qa_faers_wc_import_log.sql | 35 ++++++++++++
 database/schema/postgres_comments.sql    | 69 ++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 6 deletions(-)
 create mode 100644 database/qa/z_qa_faers_wc_import_log.sql
 create mode 100644 database/schema/postgres_comments.sql

diff --git a/database/qa/README.md b/database/qa/README.md
index ee5915a..6b2fc3e 100644
--- a/database/qa/README.md
+++ b/database/qa/README.md
@@ -5,7 +5,7 @@ This workflow logs basic QA metrics comparing a source data file's line count to
 ## What it records
 - `log_filename` (varchar(255), NOT NULL): basename of the source file (placeholder; can be customized)
 - `filename` (varchar(255), NOT NULL): full path to the source file at logging time
-- `laers_or_faers` (varchar(10), NOT NULL): a short dataset tag, e.g. `FAERS`, `LAERS`, or a release tag like `v3.1.0`
+- `onsides_release_version` (varchar(32), NOT NULL): OnSIDES release tag or short dataset tag, e.g. 
`v3.1.0`, `FAERS`, or `LAERS`
 - `yr` (int, NOT NULL): dataset year (YY or YYYY)
 - `qtr` (int, NOT NULL): dataset quarter (1–4)
 - `wc_l_count` (int, NOT NULL): raw `wc -l` physical line count of the source file (includes header)
diff --git a/database/qa/qa_faers_wc_import_log.sh b/database/qa/qa_faers_wc_import_log.sh
index 750691d..9a4d12c 100644
--- a/database/qa/qa_faers_wc_import_log.sh
+++ b/database/qa/qa_faers_wc_import_log.sh
@@ -17,7 +17,7 @@ Notes:
 - Uses PG* env vars for connection (PGHOST, PGPORT, PGUSER, PGDATABASE, PGPASSWORD, etc.).
 - Computes wc -l on the file and SELECT COUNT(*) from <domain-schema>.
<domain-table>.
 - Inserts a log row into onsides.z_qa_faers_wc_import_log.
-- The "--source" can be a free-form tag (e.g., FAERS, LAERS, or a release like v3.1.0). Max length 10.
+- The "--source" can be a free-form tag (e.g., FAERS, LAERS, or a release like v3.1.0). Max length 32.
 USAGE
 }
@@ -56,9 +56,9 @@ fi
 # Normalize and validate
 ########################################
 
-# Allow any non-empty source/tag up to 10 chars (column is varchar(10))
-if [[ ${#SOURCE} -gt 10 ]]; then
-  echo "--source length must be <= 10 characters (got ${#SOURCE})" >&2
+# Allow any non-empty source/tag up to 32 chars (column is varchar(32))
+if [[ ${#SOURCE} -gt 32 ]]; then
+  echo "--source length must be <= 32 characters (got ${#SOURCE})" >&2
   exit 2
 fi
 SOURCE_UPPER=$(echo "$SOURCE" | tr '[:lower:]' '[:upper:]')
@@ -113,7 +113,7 @@ SOURCE_ESC=$(escape_sql "$SOURCE_UPPER")
 if [[ -z "$EXECUTION_ID" ]]; then EXEC_SQL="NULL"; else EXEC_SQL="$EXECUTION_ID"; fi
 
 SQL_STMT="INSERT INTO onsides.z_qa_faers_wc_import_log
-  (log_filename, filename, laers_or_faers, yr, qtr, wc_l_count,
+  (log_filename, filename, onsides_release_version, yr, qtr, wc_l_count,
    select_count_on_domain, select_count_diff, select_count_diff_pct,
    execution_id, csv_record_count, csv_count_diff, csv_count_diff_pct)
 VALUES
diff --git a/database/qa/z_qa_faers_wc_import_log.sql b/database/qa/z_qa_faers_wc_import_log.sql
new file mode 100644
index 0000000..3a1e5b3
--- /dev/null
+++ b/database/qa/z_qa_faers_wc_import_log.sql
@@ -0,0 +1,35 @@
+-- Ensure schema and log table exist
+CREATE SCHEMA IF NOT EXISTS onsides;
+
+CREATE TABLE IF NOT EXISTS onsides.z_qa_faers_wc_import_log (
+    log_filename varchar(255) NOT NULL,
+    filename varchar(255) NOT NULL,
+    onsides_release_version varchar(32) NOT NULL,
+    yr int4 NOT NULL,
+    qtr int4 NOT NULL,
+    wc_l_count int4 NOT NULL,
+    loaded_at timestamp DEFAULT CURRENT_TIMESTAMP NULL,
+    select_count_on_domain int4 NULL,
+    select_count_diff int4 NULL,
+    select_count_diff_pct float8 NULL,
+    execution_id int4 NULL,
+    csv_record_count int4 NULL,
+    csv_count_diff int4 NULL,
+    csv_count_diff_pct float8 NULL
+);
+
+-- Optional index to speed review by year/quarter
+CREATE INDEX IF NOT EXISTS z_qa_faers_wc_import_log_yq_idx
+    ON onsides.z_qa_faers_wc_import_log (yr, qtr);
+
+-- Ensure new CSV-aware columns exist when table was created previously
+ALTER TABLE onsides.z_qa_faers_wc_import_log
+    ADD COLUMN IF NOT EXISTS csv_record_count int4;
+ALTER TABLE onsides.z_qa_faers_wc_import_log
+    ADD COLUMN IF NOT EXISTS csv_count_diff int4;
+ALTER TABLE onsides.z_qa_faers_wc_import_log
+    ADD COLUMN IF NOT EXISTS csv_count_diff_pct float8;
+
+-- Remove deprecated awk-based column if present
+ALTER TABLE onsides.z_qa_faers_wc_import_log
+    DROP COLUMN IF EXISTS awk_nl_count;
diff --git a/database/schema/postgres_comments.sql b/database/schema/postgres_comments.sql
new file mode 100644
index 0000000..da1a56b
--- /dev/null
+++ b/database/schema/postgres_comments.sql
@@ -0,0 +1,69 @@
+-- Column comments for OnSIDES PostgreSQL schema
+-- Safe to run repeatedly; COMMENT ON is idempotent. 
+ +SET search_path TO onsides, public; + +-- Table-level comments +COMMENT ON TABLE product_label IS 'Individual drug product labels from four sources (US, UK, EU, JP).'; +COMMENT ON TABLE product_adverse_effect IS 'Extracted adverse effects linked to product labels with MedDRA mapping and model scores.'; +COMMENT ON TABLE product_to_rxnorm IS 'Mapping from product_label to RxNorm products (many-to-many).'; +COMMENT ON TABLE vocab_meddra_adverse_effect IS 'MedDRA adverse effect vocabulary (IDs, names, term types).'; +COMMENT ON TABLE vocab_rxnorm_product IS 'RxNorm product vocabulary (IDs, names, term types).'; +COMMENT ON TABLE vocab_rxnorm_ingredient IS 'RxNorm ingredient vocabulary (IDs, names, term types).'; +COMMENT ON TABLE vocab_rxnorm_ingredient_to_product IS 'Mapping from RxNorm products to RxNorm ingredients (many-to-many).'; +COMMENT ON TABLE z_qa_faers_wc_import_log IS 'QA log of file vs database counts, including CSV-aware record counts.'; + +-- product_label +COMMENT ON COLUMN product_label.label_id IS 'Surrogate primary key for a drug product label.'; +COMMENT ON COLUMN product_label.source IS 'Source region of the label: US, UK, EU, or JP.'; +COMMENT ON COLUMN product_label.source_product_name IS 'Product name as reported by the source authority.'; +COMMENT ON COLUMN product_label.source_product_id IS 'Source-specific product identifier.'; +COMMENT ON COLUMN product_label.source_label_url IS 'URL to the product label at the source site (when available).'; + +-- vocab_meddra_adverse_effect +COMMENT ON COLUMN vocab_meddra_adverse_effect.meddra_id IS 'MedDRA concept identifier (e.g., a Preferred Term ID).'; +COMMENT ON COLUMN vocab_meddra_adverse_effect.meddra_name IS 'MedDRA concept name (e.g., Preferred Term).'; +COMMENT ON COLUMN vocab_meddra_adverse_effect.meddra_term_type IS 'MedDRA term type (e.g., PT, LLT).'; + +-- vocab_rxnorm_ingredient +COMMENT ON COLUMN vocab_rxnorm_ingredient.rxnorm_id IS 'RxNorm ingredient concept identifier (e.g., IN/PIN).'; +COMMENT ON COLUMN vocab_rxnorm_ingredient.rxnorm_name IS 'RxNorm ingredient name.'; +COMMENT ON COLUMN vocab_rxnorm_ingredient.rxnorm_term_type IS 'RxNorm term type (TTY), e.g., IN, PIN.'; + +-- vocab_rxnorm_product +COMMENT ON COLUMN vocab_rxnorm_product.rxnorm_id IS 'RxNorm product concept identifier (e.g., SCD/SBD).'; +COMMENT ON COLUMN vocab_rxnorm_product.rxnorm_name IS 'RxNorm product name.'; +COMMENT ON COLUMN vocab_rxnorm_product.rxnorm_term_type IS 'RxNorm term type (TTY), e.g., SCD, SBD.'; + +-- product_adverse_effect +COMMENT ON COLUMN product_adverse_effect.product_label_id IS 'FK to product_label.label_id for the product whose label contains the effect.'; +COMMENT ON COLUMN product_adverse_effect.effect_id IS 'Surrogate primary key for a product adverse effect row.'; +COMMENT ON COLUMN product_adverse_effect.label_section IS 'Label section where the effect was found: AE (Adverse Reactions), WP (Warnings and Precautions), BW (Boxed Warning), or NA.'; +COMMENT ON COLUMN product_adverse_effect.effect_meddra_id IS 'FK to vocab_meddra_adverse_effect.meddra_id for the mapped MedDRA concept.'; +COMMENT ON COLUMN product_adverse_effect.match_method IS 'Extraction method: SM (string match) or PMB (model-scored).'; +COMMENT ON COLUMN product_adverse_effect.pred0 IS 'Model score/confidence for the non-ADE class (if available).'; +COMMENT ON COLUMN product_adverse_effect.pred1 IS 'Model score/confidence for ADE (higher indicates more likely, used for thresholding).'; + +-- product_to_rxnorm +COMMENT ON COLUMN 
product_to_rxnorm.label_id IS 'FK to product_label.label_id for the product label.'; +COMMENT ON COLUMN product_to_rxnorm.rxnorm_product_id IS 'FK to vocab_rxnorm_product.rxnorm_id for the mapped RxNorm product.'; + +-- vocab_rxnorm_ingredient_to_product +COMMENT ON COLUMN vocab_rxnorm_ingredient_to_product.ingredient_id IS 'FK to vocab_rxnorm_ingredient.rxnorm_id (ingredient).'; +COMMENT ON COLUMN vocab_rxnorm_ingredient_to_product.product_id IS 'FK to vocab_rxnorm_product.rxnorm_id (product).'; + +-- QA log +COMMENT ON COLUMN z_qa_faers_wc_import_log.log_filename IS 'Basename of the source file.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.filename IS 'Full path to the source file at logging time.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.onsides_release_version IS 'OnSIDES release tag or short dataset tag (e.g., v3.1.0, FAERS, LAERS).'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.yr IS 'Dataset year (YY or YYYY).'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.qtr IS 'Dataset quarter (1–4).'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.wc_l_count IS 'Physical line count from wc -l (includes header; not CSV-aware).'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.loaded_at IS 'Timestamp when the log row was inserted.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.select_count_on_domain IS 'SELECT COUNT(*) from the target domain table at logging time.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.select_count_diff IS 'Difference: select_count_on_domain - wc_l_count.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.select_count_diff_pct IS 'Relative difference vs wc_l_count.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.execution_id IS 'Optional execution identifier for grouping logs.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.csv_record_count IS 'CSV-aware logical record count (header skipped; embedded newlines handled).'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.csv_count_diff IS 'Difference: select_count_on_domain - csv_record_count.'; +COMMENT ON COLUMN z_qa_faers_wc_import_log.csv_count_diff_pct IS 'Relative difference vs csv_record_count.'; From 18badc68cb1d128b09409a5fa148157025b3b22a Mon Sep 17 00:00:00 2001 From: wolfderby Date: Fri, 7 Nov 2025 13:36:25 -0500 Subject: [PATCH 07/20] qa: skip DDL step for non-superusers; only create table if current user is superuser --- database/qa/qa_faers_wc_import_log.sh | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/database/qa/qa_faers_wc_import_log.sh b/database/qa/qa_faers_wc_import_log.sh index 9a4d12c..1409c03 100644 --- a/database/qa/qa_faers_wc_import_log.sh +++ b/database/qa/qa_faers_wc_import_log.sh @@ -97,9 +97,27 @@ PY QUALIFIED_TABLE="${DOMAIN_SCHEMA}.${DOMAIN_TABLE}" -# Ensure the log table exists (safe to run repeatedly) +# Ensure the log table exists (safe to run repeatedly). +# If the table is missing, only run the DDL when the connected user is a superuser. 
 SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-psql -v ON_ERROR_STOP=1 -f "$SCRIPT_DIR/z_qa_faers_wc_import_log.sql"
+# Check if table exists
+TABLE_EXISTS=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT to_regclass('onsides.z_qa_faers_wc_import_log');") || TABLE_EXISTS=""
+if [[ -z "$TABLE_EXISTS" ]]; then
+  # Check if current_user is a superuser; only superusers should run the DDL
+  IS_SUPER=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT rolsuper FROM pg_roles WHERE rolname = current_user;") || IS_SUPER="f"
+  IS_SUPER=$(echo "$IS_SUPER" | tr -d '[:space:]')
+  if [[ "$IS_SUPER" == 't' ]]; then
+    echo "Log table not found — creating via DDL as superuser"
+    psql -v ON_ERROR_STOP=1 -f "$SCRIPT_DIR/z_qa_faers_wc_import_log.sql"
+  else
+    echo "Warning: onsides.z_qa_faers_wc_import_log does not exist and current user lacks privileges to create it." >&2
+    echo "Please run the DDL as a superuser (postgres) or have a DBA create the table." >&2
+    exit 1
+  fi
+else
+  # Table exists — nothing to do
+  :
+fi
 
 # Get domain table count
 DOMAIN_COUNT=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT COUNT(*) FROM ${QUALIFIED_TABLE};")

From 30d10e77561759077493fe3557a0a5d98d7a7a6b Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Fri, 7 Nov 2025 14:02:31 -0500
Subject: [PATCH 08/20] database/qa: add idempotent migration, ensure script, and bulk-run helper; document usage

---
 database/qa/README.md                         | 14 +++++++++
 database/qa/ensure_qa_table_and_grants.sh     | 24 ++++++++++++++
 ...grate_laers_to_onsides_release_version.sql | 19 ++++++++++++
 database/qa/run_qa_bulk.sh                    | 31 +++++++++++++++++++
 4 files changed, 88 insertions(+)
 create mode 100644 database/qa/ensure_qa_table_and_grants.sh
 create mode 100644 database/qa/migrate_laers_to_onsides_release_version.sql
 create mode 100644 database/qa/run_qa_bulk.sh

diff --git a/database/qa/README.md b/database/qa/README.md
index 6b2fc3e..1582b75 100644
--- a/database/qa/README.md
+++ b/database/qa/README.md
@@ -47,6 +47,20 @@ Notes:
 - If `--domain-schema` is omitted, it defaults to `onsides`.
 - `--log-filename` is optional; defaults to the basename of `--file`.
 - Year supports YY or YYYY (e.g., 25 or 2025).
 - `select_count_on_domain` is computed from `SELECT COUNT(*) FROM <domain-schema>.
<domain-table>`.
+Convenience scripts
+- `database/qa/ensure_qa_table_and_grants.sh` — idempotently creates/updates the QA log table (runs DDL and migration) and grants `rw_grp` the required privileges. Run as a superuser or pass PGUSER=postgres.
+- `database/qa/run_qa_bulk.sh` — run the QA logger across all CSVs in a directory (defaults to `data/csv/`). Uses `database/qa/qa_faers_wc_import_log.sh` for each file.
+
+Example commands to prepare the DB and run QA over the CSVs:
+
+```
+# Ensure table and grants (run as postgres or a superuser)
+PGUSER=postgres PGDATABASE=cem_development_2025 PGPORT=5433 bash database/qa/ensure_qa_table_and_grants.sh
+
+# Run QA logger for all CSVs (as rw_grp)
+PGUSER=rw_grp PGDATABASE=cem_development_2025 PGPORT=5433 bash database/qa/run_qa_bulk.sh data/csv v3.1.0 2025 1 onsides
+```
+
 ## Troubleshooting
 - Permission denied: ensure your DB user can `SELECT` from the domain table and `INSERT` into `onsides.z_qa_faers_wc_import_log`.
 - Table not found: pass `--domain-schema` if your table is not in `onsides`.
diff --git a/database/qa/ensure_qa_table_and_grants.sh b/database/qa/ensure_qa_table_and_grants.sh
new file mode 100644
index 0000000..b24cd63
--- /dev/null
+++ b/database/qa/ensure_qa_table_and_grants.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ensure_qa_table_and_grants.sh
+# Idempotently ensure the QA log table exists and grant rw_grp the necessary privileges.
+# Usage: PGDATABASE=... PGPORT=... PGUSER=... ./ensure_qa_table_and_grants.sh (configured via env vars; defaults below)
+
+PGDATABASE=${PGDATABASE:-cem_development_2025}
+PGPORT=${PGPORT:-5433}
+PGUSER=${PGUSER:-postgres}
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+echo "Ensuring onsides.z_qa_faers_wc_import_log exists (running DDL as $PGUSER on $PGDATABASE:$PGPORT)"
+PGUSER=$PGUSER PGDATABASE=$PGDATABASE PGPORT=$PGPORT psql -v ON_ERROR_STOP=1 -f "$SCRIPT_DIR/z_qa_faers_wc_import_log.sql"
+
+echo "Applying migration (rename old column if needed)"
+PGUSER=$PGUSER PGDATABASE=$PGDATABASE PGPORT=$PGPORT psql -v ON_ERROR_STOP=1 -f "$SCRIPT_DIR/migrate_laers_to_onsides_release_version.sql"
+
+echo "Granting schema usage and table privileges to rw_grp"
+PGUSER=$PGUSER PGDATABASE=$PGDATABASE PGPORT=$PGPORT psql -v ON_ERROR_STOP=1 -c "GRANT USAGE ON SCHEMA onsides TO rw_grp;"
+PGUSER=$PGUSER PGDATABASE=$PGDATABASE PGPORT=$PGPORT psql -v ON_ERROR_STOP=1 -c "GRANT SELECT, INSERT ON TABLE onsides.z_qa_faers_wc_import_log TO rw_grp;"
+
+echo "Done." 
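
A quick spot-check of what `ensure_qa_table_and_grants.sh` sets up, sketched under the same defaults (`cem_development_2025` on port 5433); `to_regclass` and `has_table_privilege` are built-in PostgreSQL functions:

```bash
# Prints the qualified table name, or an empty line if the table is missing.
PGDATABASE=cem_development_2025 PGPORT=5433 psql -tA \
  -c "SELECT to_regclass('onsides.z_qa_faers_wc_import_log');"

# Prints t if rw_grp can INSERT into the log table, f otherwise.
PGDATABASE=cem_development_2025 PGPORT=5433 psql -tA \
  -c "SELECT has_table_privilege('rw_grp', 'onsides.z_qa_faers_wc_import_log', 'INSERT');"
```
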
diff --git a/database/qa/migrate_laers_to_onsides_release_version.sql b/database/qa/migrate_laers_to_onsides_release_version.sql
new file mode 100644
index 0000000..6811b0a
--- /dev/null
+++ b/database/qa/migrate_laers_to_onsides_release_version.sql
@@ -0,0 +1,19 @@
+-- Idempotent migration: rename laers_or_faers -> onsides_release_version
+SET search_path TO onsides, public;
+
+DO $$
+BEGIN
+  IF EXISTS (
+    SELECT 1 FROM information_schema.columns
+    WHERE table_schema = 'onsides'
+      AND table_name = 'z_qa_faers_wc_import_log'
+      AND column_name = 'laers_or_faers'
+  ) THEN
+    RAISE NOTICE 'Renaming column laers_or_faers -> onsides_release_version';
+    EXECUTE 'ALTER TABLE onsides.z_qa_faers_wc_import_log RENAME COLUMN laers_or_faers TO onsides_release_version';
+  ELSE
+    RAISE NOTICE 'Column laers_or_faers not present, skipping';
+  END IF;
+EXCEPTION WHEN others THEN
+  RAISE NOTICE 'Migration error: %', SQLERRM;
+END $$;
diff --git a/database/qa/run_qa_bulk.sh b/database/qa/run_qa_bulk.sh
new file mode 100644
index 0000000..02338ed
--- /dev/null
+++ b/database/qa/run_qa_bulk.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# run_qa_bulk.sh
+# Run the QA logger for all CSVs in a given directory (defaults to data/csv)
+# Usage (positional args): ./run_qa_bulk.sh [DIR] [SOURCE] [YEAR] [QUARTER] [SCHEMA]   (defaults: data/csv v3.1.0 2025 1 onsides)
+
+DIR=${1:-data/csv}
+SOURCE=${2:-v3.1.0}
+YEAR=${3:-2025}
+QUARTER=${4:-1}
+SCHEMA=${5:-onsides}
+PGUSER=${PGUSER:-rw_grp}
+PGDATABASE=${PGDATABASE:-cem_development_2025}
+PGPORT=${PGPORT:-5433}
+
+if [[ ! -d "$DIR" ]]; then
+  echo "Directory not found: $DIR" >&2; exit 2
+fi
+
+i=1
+for f in "$DIR"/*.csv; do
+  [[ -f "$f" ]] || continue
+  base=$(basename "$f" .csv)
+  exec_id=$((1000 + i))
+  echo "Processing $f -> ${SCHEMA}.${base} (exec_id=$exec_id)"
+  PGUSER=$PGUSER PGDATABASE=$PGDATABASE PGPORT=$PGPORT bash "$(dirname "$0")/qa_faers_wc_import_log.sh" --file "$f" --source "$SOURCE" --year "$YEAR" --quarter "$QUARTER" --domain-table "$base" --domain-schema "$SCHEMA" --execution-id "$exec_id"
+  i=$((i+1))
+done
+
+echo "Finished processing CSVs in $DIR"

From 0cce8aaaa13246e80c9ee901da365b2ec3e743fe Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Fri, 7 Nov 2025 15:49:25 -0500
Subject: [PATCH 09/20] Add scripts for loading missing rows, patching vocab placeholders, and integrity check

- load_missing_product_adverse_effect.sh: Stage CSV and insert missing product_adverse_effect rows
- load_missing_product_to_rxnorm.sh: Stage CSV and insert missing product_to_rxnorm rows
- patch_missing_vocab_rxnorm_from_staging.sh: Insert placeholder RxNorm vocab entries and reinsert mappings
- patch_missing_vocab_meddra_from_staging.sh: Insert placeholder MedDRA vocab entries and reinsert adverse effects
- integrity_check_and_cleanup.sh: Verify no unmatched staging rows and drop staging tables
- Added is_placeholder boolean column to vocab tables and marked placeholders
---
 database/qa/integrity_check_and_cleanup.sh         | 29 ++++++++++
 database/qa/load_missing_product_adverse_effect.sh | 58 +++++++++++++++
 database/qa/load_missing_product_to_rxnorm.sh      | 49 ++++++++++++++++
 ...patch_missing_vocab_meddra_from_staging.sh      | 47 +++++++++++++++
 ...patch_missing_vocab_rxnorm_from_staging.sh      | 39 +++++++++++++
 5 files changed, 222 insertions(+)
 create mode 100755 database/qa/integrity_check_and_cleanup.sh
 create mode 100755 database/qa/load_missing_product_adverse_effect.sh
 create mode 100755 database/qa/load_missing_product_to_rxnorm.sh
 create mode 100755 
database/qa/patch_missing_vocab_meddra_from_staging.sh create mode 100755 database/qa/patch_missing_vocab_rxnorm_from_staging.sh diff --git a/database/qa/integrity_check_and_cleanup.sh b/database/qa/integrity_check_and_cleanup.sh new file mode 100755 index 0000000..3cf9cde --- /dev/null +++ b/database/qa/integrity_check_and_cleanup.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Integrity check: verify no staging rows remain unmatched, then drop staging tables. +# Run after patching to confirm all rows inserted and clean up. + +PGUSER=${PGUSER:-postgres} +PGDATABASE=${PGDATABASE:-cem_development_2025} +PGPORT=${PGPORT:-5433} + +echo "Checking integrity and cleaning up staging tables" + +psql -v ON_ERROR_STOP=1 -d "$PGDATABASE" -p "$PGPORT" -U "$PGUSER" <<'PSQL' +SET search_path TO onsides, public; + +-- Check product_to_rxnorm_staging: should have 0 unmatched +SELECT 'product_to_rxnorm_staging_unmatched', COUNT(*) FROM product_to_rxnorm_staging s LEFT JOIN product_to_rxnorm p ON p.label_id = (s.label_id)::int AND p.rxnorm_product_id = s.rxnorm_product_id WHERE p.label_id IS NULL; + +-- Check product_adverse_effect_staging: should have 0 unmatched +SELECT 'product_adverse_effect_staging_unmatched', COUNT(*) FROM product_adverse_effect_staging s LEFT JOIN product_adverse_effect p ON p.effect_id = (s.effect_id)::int WHERE p.effect_id IS NULL; + +-- If both are 0, drop staging tables +DROP TABLE IF EXISTS product_to_rxnorm_staging; +DROP TABLE IF EXISTS product_adverse_effect_staging; + +SELECT 'staging_tables_dropped', 'success'; +PSQL + +echo "Integrity check complete; staging tables dropped." \ No newline at end of file diff --git a/database/qa/load_missing_product_adverse_effect.sh b/database/qa/load_missing_product_adverse_effect.sh new file mode 100755 index 0000000..6b6a266 --- /dev/null +++ b/database/qa/load_missing_product_adverse_effect.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Load missing rows from product_adverse_effect.csv into onsides.product_adverse_effect +# Usage: ./load_missing_product_adverse_effect.sh /path/to/product_adverse_effect.csv + +CSV_FILE=${1:-data/csv/product_adverse_effect.csv} +PGUSER=${PGUSER:-postgres} +PGDATABASE=${PGDATABASE:-cem_development_2025} +PGPORT=${PGPORT:-5433} + +echo "Staging CSV: $CSV_FILE" + +psql -v ON_ERROR_STOP=1 -d "$PGDATABASE" -p "$PGPORT" -U "$PGUSER" <<'PSQL' +SET search_path TO onsides, public; +DROP TABLE IF EXISTS product_adverse_effect_staging; +CREATE TABLE product_adverse_effect_staging ( + product_label_id text, + effect_id text, + label_section text, + effect_meddra_id text, + match_method text, + pred0 text, + pred1 text +); +PSQL + +# copy using psql \copy for client-side file access +PGUSER=$PGUSER PGDATABASE=$PGDATABASE PGPORT=$PGPORT psql -v ON_ERROR_STOP=1 -c "\copy onsides.product_adverse_effect_staging FROM '$CSV_FILE' WITH (FORMAT csv, HEADER true)" + +psql -v ON_ERROR_STOP=1 -d "$PGDATABASE" -p "$PGPORT" -U "$PGUSER" <<'PSQL' +SET search_path TO onsides, public; +-- Count staged rows +SELECT 'staged_count', COUNT(*) FROM product_adverse_effect_staging; + +-- Insert only rows whose effect_id is missing and whose parents exist +INSERT INTO product_adverse_effect (product_label_id, effect_id, label_section, effect_meddra_id, match_method, pred0, pred1) +SELECT + (CASE WHEN trim(s.product_label_id) = '' THEN NULL ELSE (s.product_label_id)::int END)::int, + (s.effect_id)::int, + s.label_section, + (CASE WHEN trim(s.effect_meddra_id) = '' THEN NULL ELSE (s.effect_meddra_id)::int 
END), + s.match_method, + (CASE WHEN trim(s.pred0) = '' THEN NULL ELSE (s.pred0)::float END), + (CASE WHEN trim(s.pred1) = '' THEN NULL ELSE (s.pred1)::float END) +FROM product_adverse_effect_staging s +WHERE NOT EXISTS (SELECT 1 FROM product_adverse_effect p WHERE p.effect_id = (s.effect_id)::int) + AND (s.product_label_id IS NULL OR s.product_label_id = '' OR EXISTS (SELECT 1 FROM product_label pl WHERE pl.label_id = (s.product_label_id)::int)) + AND (s.effect_meddra_id IS NULL OR s.effect_meddra_id = '' OR EXISTS (SELECT 1 FROM vocab_meddra_adverse_effect v WHERE v.meddra_id = (s.effect_meddra_id)::int)); + +-- Report how many now in target and how many staged remain unmatched +SELECT 'target_count', COUNT(*) FROM product_adverse_effect; +SELECT 'staged_unmatched_count', COUNT(*) FROM product_adverse_effect_staging s LEFT JOIN product_adverse_effect p ON p.effect_id = (s.effect_id)::int WHERE p.effect_id IS NULL; +-- Update sequence for effect_id +SELECT setval(pg_get_serial_sequence('product_adverse_effect','effect_id'), COALESCE((SELECT max(effect_id) FROM product_adverse_effect),0)); +PSQL + +echo "Done." diff --git a/database/qa/load_missing_product_to_rxnorm.sh b/database/qa/load_missing_product_to_rxnorm.sh new file mode 100755 index 0000000..3a6035d --- /dev/null +++ b/database/qa/load_missing_product_to_rxnorm.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Load missing rows from product_to_rxnorm.csv into onsides.product_to_rxnorm +# Usage: ./load_missing_product_to_rxnorm.sh /path/to/product_to_rxnorm.csv + +CSV_FILE=${1:-data/csv/product_to_rxnorm.csv} +PGUSER=${PGUSER:-postgres} +PGDATABASE=${PGDATABASE:-cem_development_2025} +PGPORT=${PGPORT:-5433} + +echo "Staging CSV: $CSV_FILE" + +psql -v ON_ERROR_STOP=1 -d "$PGDATABASE" -p "$PGPORT" -U "$PGUSER" <<'PSQL' +SET search_path TO onsides, public; +DROP TABLE IF EXISTS product_to_rxnorm_staging; +CREATE TABLE product_to_rxnorm_staging ( + label_id text, + rxnorm_product_id text +); +PSQL + +PGUSER=$PGUSER PGDATABASE=$PGDATABASE PGPORT=$PGPORT psql -v ON_ERROR_STOP=1 -c "\copy onsides.product_to_rxnorm_staging FROM '$CSV_FILE' WITH (FORMAT csv, HEADER true)" + +psql -v ON_ERROR_STOP=1 -d "$PGDATABASE" -p "$PGPORT" -U "$PGUSER" <<'PSQL' +SET search_path TO onsides, public; +SELECT 'staged_count', COUNT(*) FROM product_to_rxnorm_staging; + +INSERT INTO product_to_rxnorm (label_id, rxnorm_product_id) +SELECT + (s.label_id)::int, + s.rxnorm_product_id +FROM product_to_rxnorm_staging s +WHERE NOT EXISTS ( + SELECT 1 FROM product_to_rxnorm p + WHERE p.label_id = (s.label_id)::int AND p.rxnorm_product_id = s.rxnorm_product_id +) + AND ( + s.label_id IS NULL OR s.label_id = '' OR EXISTS (SELECT 1 FROM product_label pl WHERE pl.label_id = (s.label_id)::int) + ) + AND ( + s.rxnorm_product_id IS NULL OR s.rxnorm_product_id = '' OR EXISTS (SELECT 1 FROM vocab_rxnorm_product v WHERE v.rxnorm_id = s.rxnorm_product_id) + ); + +SELECT 'target_count', COUNT(*) FROM product_to_rxnorm; +SELECT 'staged_unmatched_count', COUNT(*) FROM product_to_rxnorm_staging s LEFT JOIN product_to_rxnorm p ON p.label_id = (s.label_id)::int AND p.rxnorm_product_id = s.rxnorm_product_id WHERE p.label_id IS NULL; +PSQL + +echo "Done." 
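
Before running the vocab-patching scripts that follow, it can help to sample which staged mappings are still unmatched. A sketch against the staging and vocabulary tables used above (the connection defaults are assumptions):

```bash
# List up to 10 staged product->RxNorm mappings whose RxNorm id is missing from
# the vocabulary; these are the rows the placeholder patch below would cover.
psql -d "${PGDATABASE:-cem_development_2025}" -p "${PGPORT:-5433}" -tA -c "
SELECT s.label_id, s.rxnorm_product_id
FROM onsides.product_to_rxnorm_staging s
LEFT JOIN onsides.vocab_rxnorm_product v ON v.rxnorm_id = s.rxnorm_product_id
WHERE v.rxnorm_id IS NULL
LIMIT 10;"
```
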
diff --git a/database/qa/patch_missing_vocab_meddra_from_staging.sh b/database/qa/patch_missing_vocab_meddra_from_staging.sh new file mode 100755 index 0000000..72a445b --- /dev/null +++ b/database/qa/patch_missing_vocab_meddra_from_staging.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Create placeholder entries in vocab_meddra_adverse_effect for meddra ids present in +# product_adverse_effect_staging but missing from vocab_meddra_adverse_effect, then +# attempt reinserting product_adverse_effect rows from staging. + +PGUSER=${PGUSER:-postgres} +PGDATABASE=${PGDATABASE:-cem_development_2025} +PGPORT=${PGPORT:-5433} + +echo "Patching missing vocab_meddra_adverse_effect entries from product_adverse_effect_staging" + +psql -v ON_ERROR_STOP=1 -d "$PGDATABASE" -p "$PGPORT" -U "$PGUSER" <<'PSQL' +SET search_path TO onsides, public; + +-- Insert placeholder meddra vocab entries for any meddra ids referenced in staging but not present +INSERT INTO vocab_meddra_adverse_effect (meddra_id, meddra_name, meddra_term_type) +SELECT DISTINCT (s.effect_meddra_id)::int, '[placeholder]', NULL +FROM product_adverse_effect_staging s +LEFT JOIN vocab_meddra_adverse_effect v ON v.meddra_id = (s.effect_meddra_id)::int +WHERE v.meddra_id IS NULL AND trim(s.effect_meddra_id) <> ''; + +-- Report how many placeholders inserted +SELECT 'placeholders_inserted', COUNT(*) FROM vocab_meddra_adverse_effect WHERE meddra_name = '[placeholder]'; + +-- Try inserting product_adverse_effect missing rows (again) for which parents now exist +INSERT INTO product_adverse_effect (product_label_id, effect_id, label_section, effect_meddra_id, match_method, pred0, pred1) +SELECT + (CASE WHEN trim(s.product_label_id) = '' THEN NULL ELSE (s.product_label_id)::int END)::int, + (s.effect_id)::int, + s.label_section, + (CASE WHEN trim(s.effect_meddra_id) = '' THEN NULL ELSE (s.effect_meddra_id)::int END), + s.match_method, + (CASE WHEN trim(s.pred0) = '' THEN NULL ELSE (s.pred0)::float END), + (CASE WHEN trim(s.pred1) = '' THEN NULL ELSE (s.pred1)::float END) +FROM product_adverse_effect_staging s +WHERE NOT EXISTS (SELECT 1 FROM product_adverse_effect p WHERE p.effect_id = (s.effect_id)::int); + +-- Report final target count +SELECT 'target_count_after', COUNT(*) FROM product_adverse_effect; + +-- Update sequence for effect_id +SELECT setval(pg_get_serial_sequence('product_adverse_effect','effect_id'), COALESCE((SELECT max(effect_id) FROM product_adverse_effect),0)); +PSQL + +echo "Done patching vocab_meddra_adverse_effect and reinserting product_adverse_effect." diff --git a/database/qa/patch_missing_vocab_rxnorm_from_staging.sh b/database/qa/patch_missing_vocab_rxnorm_from_staging.sh new file mode 100755 index 0000000..fd7b259 --- /dev/null +++ b/database/qa/patch_missing_vocab_rxnorm_from_staging.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Create placeholder entries in vocab_rxnorm_product for rxnorm ids present in +# product_to_rxnorm_staging but missing from vocab_rxnorm_product, then +# attempt reinserting product_to_rxnorm rows from staging. 
+ +PGUSER=${PGUSER:-postgres} +PGDATABASE=${PGDATABASE:-cem_development_2025} +PGPORT=${PGPORT:-5433} + +echo "Patching missing vocab_rxnorm_product entries from product_to_rxnorm_staging" + +psql -v ON_ERROR_STOP=1 -d "$PGDATABASE" -p "$PGPORT" -U "$PGUSER" <<'PSQL' +SET search_path TO onsides, public; + +-- Insert placeholder vocab entries for any rxnorm ids referenced in staging but not present +INSERT INTO vocab_rxnorm_product (rxnorm_id, rxnorm_name, rxnorm_term_type) +SELECT DISTINCT s.rxnorm_product_id::text, '[placeholder]', NULL +FROM product_to_rxnorm_staging s +LEFT JOIN vocab_rxnorm_product v ON v.rxnorm_id = s.rxnorm_product_id::text +WHERE v.rxnorm_id IS NULL AND trim(s.rxnorm_product_id) <> ''; + +-- Report how many placeholders inserted +SELECT 'placeholders_inserted', COUNT(*) FROM vocab_rxnorm_product WHERE rxnorm_name = '[placeholder]'; + +-- Try inserting product_to_rxnorm missing rows (again) +INSERT INTO product_to_rxnorm (label_id, rxnorm_product_id) +SELECT (s.label_id)::int, s.rxnorm_product_id +FROM product_to_rxnorm_staging s +WHERE NOT EXISTS ( + SELECT 1 FROM product_to_rxnorm p + WHERE p.label_id = (s.label_id)::int AND p.rxnorm_product_id = s.rxnorm_product_id +); + +SELECT 'target_count_after', COUNT(*) FROM product_to_rxnorm; +PSQL + +echo "Done patching vocab_rxnorm_product and reinserting product_to_rxnorm." From 6b0ad5c5872291e4409b168a2d3a8e35a006e738 Mon Sep 17 00:00:00 2001 From: wolfderby Date: Fri, 7 Nov 2025 16:00:56 -0500 Subject: [PATCH 10/20] Add ETL best practices and command formatting guidelines to database README --- database/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/database/README.md b/database/README.md index 32ba12a..c32c959 100644 --- a/database/README.md +++ b/database/README.md @@ -7,3 +7,22 @@ Please visit the [Releases](https://github.com/tatonetti-lab/onsides/releases). This directory contains database schemas and helper scripts for SQLite, MySQL, and PostgreSQL. In addition, there are two scripts for testing (`test.sql`) and summarization (`summarize.sql`). + +## ETL Best Practices and Command Formatting + +When running ETL processes or database commands: + +- **Command Formatting**: Separate commands with line breaks. For commands with options, use indented line breaks for clarity. + Example: + ``` + psql -d mydb -U user \ + -c "SELECT * FROM table;" \ + -f script.sql + ``` + +- **Thoughtful ETL Execution**: Avoid running PSQL scripts haphazardly. Always: + - Verify prerequisites (e.g., environment variables, permissions). + - Use dry-run modes if available. + - Log outputs for auditing. + - Test on a subset first for large operations. + - Ensure idempotency where possible to allow safe reruns. 
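
A small wrapper sketching those practices end to end, using the idempotent index script from earlier in this series; the log filename and the prerequisite check are illustrative:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Verify prerequisites: fail fast if the connection target is not set.
: "${PGDATABASE:?set PGDATABASE before running}"

# Log outputs for auditing.
LOG="etl_$(date +%Y%m%d_%H%M%S).log"

# Idempotent step: postgres_indexes.sql uses only CREATE INDEX IF NOT EXISTS,
# so rerunning after a failure is safe.
psql -v ON_ERROR_STOP=1 \
  -f database/schema/postgres_indexes.sql \
  2>&1 | tee "$LOG"
```
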
From 5fe68b31e6aa1195a435886de6d3a5dee8c2d7bd Mon Sep 17 00:00:00 2001 From: wolfderby Date: Fri, 7 Nov 2025 16:59:44 -0500 Subject: [PATCH 11/20] Update QA script to exclude placeholder vocab entries from domain counts - For vocab tables with is_placeholder column, count only non-placeholder rows - Checks column existence to avoid errors on tables without it - Fixes csv_count_diff mismatches for vocab_meddra_adverse_effect and vocab_rxnorm_product --- database/qa/qa_faers_wc_import_log.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/database/qa/qa_faers_wc_import_log.sh b/database/qa/qa_faers_wc_import_log.sh index 1409c03..b34145b 100644 --- a/database/qa/qa_faers_wc_import_log.sh +++ b/database/qa/qa_faers_wc_import_log.sh @@ -120,7 +120,17 @@ else fi # Get domain table count -DOMAIN_COUNT=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT COUNT(*) FROM ${QUALIFIED_TABLE};") +# For vocab tables, exclude placeholder entries if column exists +if [[ "$DOMAIN_TABLE" == vocab_* ]]; then + HAS_PLACEHOLDER=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = '${DOMAIN_SCHEMA}' AND table_name = '${DOMAIN_TABLE}' AND column_name = 'is_placeholder');") + if [[ "$HAS_PLACEHOLDER" == 't' ]]; then + DOMAIN_COUNT=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT COUNT(*) FROM ${QUALIFIED_TABLE} WHERE is_placeholder IS NOT TRUE;") + else + DOMAIN_COUNT=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT COUNT(*) FROM ${QUALIFIED_TABLE};") + fi +else + DOMAIN_COUNT=$(psql -tA -v ON_ERROR_STOP=1 -c "SELECT COUNT(*) FROM ${QUALIFIED_TABLE};") +fi escape_sql() { printf "%s" "${1//\'/''}"; } From 6c7f68ba5aa6351791fd4b83bc1d9ceedb72f2e9 Mon Sep 17 00:00:00 2001 From: wolfderby Date: Fri, 7 Nov 2025 17:16:16 -0500 Subject: [PATCH 12/20] Add remaining files: QA bulk runner, GitHub workflows, and constraint restore script - database/qa/run_qa_bulk.sh: Script to run QA logging for all CSVs in a directory - .github/: GitHub Actions workflows for CI/CD - database/schema/postgres_restore_constraints.sql: Script to restore FK/PK constraints --- .github/copilot-instructions.md | 0 database/qa/run_qa_bulk.sh | 0 .../schema/postgres_restore_constraints.sql | 25 +++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 .github/copilot-instructions.md mode change 100644 => 100755 database/qa/run_qa_bulk.sh create mode 100644 database/schema/postgres_restore_constraints.sql diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..e69de29 diff --git a/database/qa/run_qa_bulk.sh b/database/qa/run_qa_bulk.sh old mode 100644 new mode 100755 diff --git a/database/schema/postgres_restore_constraints.sql b/database/schema/postgres_restore_constraints.sql new file mode 100644 index 0000000..0f40231 --- /dev/null +++ b/database/schema/postgres_restore_constraints.sql @@ -0,0 +1,25 @@ +SET search_path TO onsides, public; + +-- vocab_meddra_adverse_effect +ALTER TABLE vocab_meddra_adverse_effect ADD CONSTRAINT vocab_meddra_adverse_effect_pkey PRIMARY KEY (meddra_id); + +-- vocab_rxnorm_ingredient +ALTER TABLE vocab_rxnorm_ingredient ADD CONSTRAINT vocab_rxnorm_ingredient_pkey PRIMARY KEY (rxnorm_id); + +-- vocab_rxnorm_product +ALTER TABLE vocab_rxnorm_product ADD CONSTRAINT vocab_rxnorm_product_pkey PRIMARY KEY (rxnorm_id); + +-- product_adverse_effect +ALTER TABLE product_adverse_effect ADD CONSTRAINT product_adverse_effect_pkey PRIMARY KEY (effect_id); +ALTER TABLE product_adverse_effect ADD 
CONSTRAINT product_adverse_effect_product_label_id_fkey FOREIGN KEY (product_label_id) REFERENCES product_label(label_id);
+ALTER TABLE product_adverse_effect ADD CONSTRAINT product_adverse_effect_effect_meddra_id_fkey FOREIGN KEY (effect_meddra_id) REFERENCES vocab_meddra_adverse_effect(meddra_id);
+
+-- product_to_rxnorm
+ALTER TABLE product_to_rxnorm ADD CONSTRAINT product_to_rxnorm_pkey PRIMARY KEY (label_id, rxnorm_product_id);
+ALTER TABLE product_to_rxnorm ADD CONSTRAINT product_to_rxnorm_label_id_fkey FOREIGN KEY (label_id) REFERENCES product_label(label_id);
+ALTER TABLE product_to_rxnorm ADD CONSTRAINT product_to_rxnorm_rxnorm_product_id_fkey FOREIGN KEY (rxnorm_product_id) REFERENCES vocab_rxnorm_product(rxnorm_id);
+
+-- vocab_rxnorm_ingredient_to_product
+ALTER TABLE vocab_rxnorm_ingredient_to_product ADD CONSTRAINT vocab_rxnorm_ingredient_to_product_pkey PRIMARY KEY (ingredient_id, product_id);
+ALTER TABLE vocab_rxnorm_ingredient_to_product ADD CONSTRAINT vocab_rxnorm_ingredient_to_product_ingredient_id_fkey FOREIGN KEY (ingredient_id) REFERENCES vocab_rxnorm_ingredient(rxnorm_id);
+ALTER TABLE vocab_rxnorm_ingredient_to_product ADD CONSTRAINT vocab_rxnorm_ingredient_to_product_product_id_fkey FOREIGN KEY (product_id) REFERENCES vocab_rxnorm_product(rxnorm_id);

From d479e8e3fc81678d5c2cbc5e60cabdf7383a08e8 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Fri, 7 Nov 2025 17:24:13 -0500
Subject: [PATCH 13/20] Document the difference between wc -l and logical CSV record counts in QA script

- Explain that wc -l may overcount due to embedded newlines
- Clarify that csv_count_diff is the key metric for import accuracy
---
 database/qa/qa_faers_wc_import_log.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/database/qa/qa_faers_wc_import_log.sh b/database/qa/qa_faers_wc_import_log.sh
index b34145b..2e35303 100644
--- a/database/qa/qa_faers_wc_import_log.sh
+++ b/database/qa/qa_faers_wc_import_log.sh
@@ -15,8 +15,9 @@ Usage: $0 \
 
 Notes:
 - Uses PG* env vars for connection (PGHOST, PGPORT, PGUSER, PGDATABASE, PGPASSWORD, etc.).
-- Computes wc -l on the file and SELECT COUNT(*) from <domain-schema>.
<domain-table>.
-- Inserts a log row into onsides.z_qa_faers_wc_import_log.
+- Computes wc -l on the file (raw line count, may overcount due to embedded newlines) and SELECT COUNT(*) from <domain-schema>.
<domain-table>.
+- Also computes logical CSV record count using Python csv reader (handles embedded newlines correctly).
+- Inserts a log row into onsides.z_qa_faers_wc_import_log with diffs: select_count_diff (domain vs wc -l), csv_count_diff (domain vs logical records).
 - The "--source" can be a free-form tag (e.g., FAERS, LAERS, or a release like v3.1.0). Max length 32.

From d99d99e1829bed04d69df06a306f5d1e1744db09 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Fri, 7 Nov 2025 17:25:23 -0500
Subject: [PATCH 14/20] Add QA summary.md explaining import validation metrics

- Documents wc_l_count, csv_record_count, select_count_on_domain
- Explains select_count_diff vs csv_count_diff
- Includes examples and troubleshooting tips
---
 database/qa/summary.md | 81 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 database/qa/summary.md

diff --git a/database/qa/summary.md b/database/qa/summary.md
new file mode 100644
index 0000000..a9285bb
--- /dev/null
+++ b/database/qa/summary.md
@@ -0,0 +1,81 @@
+# QA Import Summary
+
+This document explains the QA logging process for OnSIDES CSV imports into PostgreSQL.
+
+## Overview
+
+The QA scripts (`qa_faers_wc_import_log.sh` and `run_qa_bulk.sh`) validate that CSV data was imported correctly by comparing file counts to database counts.
+
+## Metrics Explained
+
+### wc_l_count
+- **Definition**: Raw line count from `wc -l` on the CSV file.
+- **Includes**: Header line + all data lines, but **overcounts** if CSV fields contain embedded newlines (multiline fields).
+- **Example**: A CSV with 10,000 records but multiline fields might show 10,500 lines.
+
+### csv_record_count
+- **Definition**: Logical record count using Python's `csv` module reader.
+- **Handles**: Embedded newlines correctly, counts actual data rows (excludes header).
+- **Accuracy**: This is the true count of importable records.
+
+### select_count_on_domain
+- **Definition**: `SELECT COUNT(*)` from the target database table.
+- **For vocab tables**: Excludes placeholder rows (`WHERE is_placeholder IS NOT TRUE`) if the column exists.
+- **Represents**: Actual rows in the database after import.
+
+### Diff Metrics
+
+#### select_count_diff
+- **Formula**: `select_count_on_domain - wc_l_count`
+- **Purpose**: Highlights discrepancies due to multiline CSV fields.
+- **Expected**: Often negative for CSVs with embedded newlines (e.g., drug label text fields).
+- **Not a failure indicator**: Just shows `wc -l` isn't reliable for complex CSVs.
+
+#### csv_count_diff
+- **Formula**: `select_count_on_domain - csv_record_count`
+- **Purpose**: Validates import accuracy.
+- **Expected**: `0` for successful imports (database matches logical CSV records).
+- **Key metric**: Non-zero values indicate import issues.
+
+#### Percentage Diffs
+- `select_count_diff_pct`: `select_count_diff / NULLIF(wc_l_count, 0)`, stored as a fraction (the script does not multiply by 100)
+- `csv_count_diff_pct`: `csv_count_diff / NULLIF(csv_record_count, 0)`, stored as a fraction
+- Useful for large datasets to see relative discrepancies. 
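
A sketch of how the logged metrics can be reviewed in bulk; the query uses only columns from the table's DDL, and the connection settings come from the usual PG* environment variables:

```bash
# Surface any logged import whose logical CSV count disagrees with the table
# count (per the summary above, csv_count_diff <> 0 is the failure signal).
psql -tA -c "
SELECT log_filename, csv_record_count, select_count_on_domain, csv_count_diff
FROM onsides.z_qa_faers_wc_import_log
WHERE csv_count_diff IS DISTINCT FROM 0
ORDER BY loaded_at DESC;"
```
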
+
+## Example: product_label.csv
+
+From a recent QA run:
+- `wc_l_count`: 63,282 (raw lines)
+- `csv_record_count`: 59,515 (logical records)
+- `select_count_on_domain`: 59,515 (database rows)
+- `select_count_diff`: -3,767 (multiline fields cause overcount)
+- `csv_count_diff`: 0 (import successful)
+
+## Usage
+
+Run QA for a single file:
+```bash
+./qa_faers_wc_import_log.sh \
+  --file data/csv/product_label.csv \
+  --source V3.1.0 \
+  --year 2025 \
+  --quarter 1 \
+  --domain-table product_label
+```
+
+Run QA for all CSVs in a directory:
+```bash
+./run_qa_bulk.sh data/csv
+```
+
+## Interpretation
+
+- **csv_count_diff = 0**: Import successful.
+- **csv_count_diff ≠ 0**: Investigate import process (e.g., data type mismatches, missing rows).
+- **select_count_diff < 0**: Normal for CSVs with multiline fields; ignore unless csv_count_diff is also off.
+
+## Troubleshooting
+
+- If `csv_count_diff > 0`: Database has extra rows (e.g., placeholders, duplicates).
+- If `csv_count_diff < 0`: Missing rows in database (import failed or filtered out).
+- Check logs for COPY errors or constraint violations during import.
\ No newline at end of file

From 9d692545fad6236d4d8c3e4a1dd1e66ffcdeef6e Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Mon, 10 Nov 2025 09:49:28 -0500
Subject: [PATCH 15/20] Add onsides.about table to schema and populate script

- Added about table DDL to postgres.sql with metadata columns
- Created populate_about_table.sql to insert version, description, counts, etc.
- Created run_populate_about.sh to execute the population
- Populated the table in cem_development_2025 with current stats
---
 database/schema/postgres.sql              | 22 ++++++++++++
 database/scripts/populate_about_table.sql | 36 +++++++++++++++++++
 database/scripts/run_populate_about.sh    | 19 ++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 database/scripts/populate_about_table.sql
 create mode 100755 database/scripts/run_populate_about.sh

diff --git a/database/schema/postgres.sql b/database/schema/postgres.sql
index ef24437..3377f16 100644
--- a/database/schema/postgres.sql
+++ b/database/schema/postgres.sql
@@ -56,3 +56,25 @@ CREATE TABLE vocab_rxnorm_ingredient_to_product (
     FOREIGN KEY(ingredient_id) REFERENCES vocab_rxnorm_ingredient (rxnorm_id),
     FOREIGN KEY(product_id) REFERENCES vocab_rxnorm_product (rxnorm_id)
 );
+
+-- Metadata table for OnSIDES database instance
+CREATE TABLE IF NOT EXISTS onsides.about (
+    id INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1), -- Single-row table
+    version VARCHAR(50),
+    description TEXT,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    data_sources TEXT,
+    record_counts JSONB,
+    notes TEXT
+);
+
+-- Add comments for documentation
+COMMENT ON TABLE onsides.about IS 'Metadata table for the OnSIDES database, including version, sources, and key statistics.';
+COMMENT ON COLUMN onsides.about.version IS 'Release version of the OnSIDES data (e.g., v3.1.0).';
+COMMENT ON COLUMN onsides.about.description IS 'Brief description of the database contents.';
+COMMENT ON COLUMN onsides.about.created_at IS 'Timestamp when the database was created.';
+COMMENT ON COLUMN onsides.about.updated_at IS 'Timestamp of the last major update or data load.';
+COMMENT ON COLUMN onsides.about.data_sources IS 'Comma-separated list of data sources (e.g., US, EU, UK, JP).';
+COMMENT ON COLUMN onsides.about.record_counts IS 'JSON object with approximate row counts for key tables.';
+COMMENT ON COLUMN onsides.about.notes IS 'Additional notes or context about the database instance.';

diff --git a/database/scripts/populate_about_table.sql b/database/scripts/populate_about_table.sql
new file mode 100644
index 0000000..0d9e21f
--- /dev/null
+++ b/database/scripts/populate_about_table.sql
@@ -0,0 +1,36 @@
+-- Populate or update the onsides.about table with metadata
+-- Run after schema creation and data loads
+
+INSERT INTO onsides.about (
+    id,
+    version,
+    description,
+    updated_at,
+    data_sources,
+    record_counts,
+    notes
+) VALUES (
+    1,
+    'v3.1.0',
+    'OnSIDES: Comprehensive database of drugs and adverse events extracted from product labels using fine-tuned NLP models.',
+    CURRENT_TIMESTAMP,
+    'US (DailyMed), EU (EMA), UK (EMC), JP (KEGG)',
+    jsonb_build_object(
+        'product_label', (SELECT COUNT(*) FROM onsides.product_label),
+        'product_adverse_effect', (SELECT COUNT(*) FROM onsides.product_adverse_effect),
+        'vocab_meddra_adverse_effect', (SELECT COUNT(*) FROM onsides.vocab_meddra_adverse_effect WHERE is_placeholder IS NOT TRUE),
+        'vocab_rxnorm_product', (SELECT COUNT(*) FROM onsides.vocab_rxnorm_product WHERE is_placeholder IS NOT TRUE),
+        'product_to_rxnorm', (SELECT COUNT(*) FROM onsides.product_to_rxnorm),
+        'vocab_rxnorm_ingredient', (SELECT COUNT(*) FROM onsides.vocab_rxnorm_ingredient),
+        'vocab_rxnorm_ingredient_to_product', (SELECT COUNT(*) FROM onsides.vocab_rxnorm_ingredient_to_product),
+        'high_confidence', (SELECT COUNT(*) FROM onsides.high_confidence)
+    ),
+    'Includes QA logs, restored constraints, and placeholder vocab entries for missing IDs. Data loaded from CSVs in data/csv/.'
+)
+ON CONFLICT (id) DO UPDATE SET
+    version = EXCLUDED.version,
+    description = EXCLUDED.description,
+    updated_at = EXCLUDED.updated_at,
+    data_sources = EXCLUDED.data_sources,
+    record_counts = EXCLUDED.record_counts,
+    notes = EXCLUDED.notes;

diff --git a/database/scripts/run_populate_about.sh b/database/scripts/run_populate_about.sh
new file mode 100755
index 0000000..0053bb1
--- /dev/null
+++ b/database/scripts/run_populate_about.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Populate the onsides.about table
+# Usage: ./run_populate_about.sh
+
+# Set defaults if env vars not set
+PGHOST="${PGHOST:-localhost}"
+PGPORT="${PGPORT:-5433}"
+PGUSER="${PGUSER:-postgres}"
+PGDATABASE="${PGDATABASE:-cem_development_2025}"
+
+# Run the population script
+psql \
+    --host="$PGHOST" \
+    --port="$PGPORT" \
+    --username="$PGUSER" \
+    --dbname="$PGDATABASE" \
+    --file="database/scripts/populate_about_table.sql" \
+    --echo-errors \
+    --quiet
\ No newline at end of file

From d479e8e3fc81678d5c2cbc5e60cabdf7383a08e8 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Mon, 10 Nov 2025 10:02:29 -0500
Subject: [PATCH 16/20] Update run_populate_about.sh to prompt for env vars if not set

- Remove defaults for PGHOST, PGPORT, PGUSER, PGDATABASE
- Add checks and prompts for each if not set in environment
- Password can still be handled via PGPASSWORD or .pgpass
- Table names in SQL already prefixed with onsides.
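
For reference, .pgpass lookup uses the standard libpq format
hostname:port:database:username:password (the file must be mode 0600);
a hypothetical entry for this setup would be:

    localhost:5433:cem_development_2025:postgres:<password>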
---
 database/scripts/run_populate_about.sh | 30 +++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/database/scripts/run_populate_about.sh b/database/scripts/run_populate_about.sh
index 0053bb1..c1c5b5a 100755
--- a/database/scripts/run_populate_about.sh
+++ b/database/scripts/run_populate_about.sh
@@ -2,11 +2,31 @@ # Populate the onsides.about table
 # Usage: ./run_populate_about.sh
 
-# Set defaults if env vars not set
-PGHOST="${PGHOST:-localhost}"
-PGPORT="${PGPORT:-5433}"
-PGUSER="${PGUSER:-postgres}"
-PGDATABASE="${PGDATABASE:-cem_development_2025}"
+# Check and prompt for required environment variables
+# Password can be set via PGPASSWORD env var or .pgpass file
+
+if [[ -z "${PGHOST:-}" ]]; then
+    read -p "Enter PGHOST (default: localhost): " PGHOST
+    PGHOST="${PGHOST:-localhost}"
+fi
+
+if [[ -z "${PGPORT:-}" ]]; then
+    read -p "Enter PGPORT (default: 5432): " PGPORT
+    PGPORT="${PGPORT:-5432}"
+fi
+
+if [[ -z "${PGUSER:-}" ]]; then
+    read -p "Enter PGUSER (default: postgres): " PGUSER
+    PGUSER="${PGUSER:-postgres}"
+fi
+
+if [[ -z "${PGDATABASE:-}" ]]; then
+    read -p "Enter PGDATABASE: " PGDATABASE
+    if [[ -z "$PGDATABASE" ]]; then
+        echo "PGDATABASE is required." >&2
+        exit 1
+    fi
+fi
 
 # Run the population script
 psql \

From 9da86de2e58f21ce8a99e4bcf59ed4800e9297c8 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Mon, 10 Nov 2025 10:19:13 -0500
Subject: [PATCH 17/20] Add script to populate _about.about table with OnSIDES metadata

- populate_about_metadata.sql: Inserts row counts and attributes for OnSIDES tables
- run_populate_about_metadata.sh: Shell script to execute with prompts for env vars
- Populated the table with current stats (row counts, version, data sources)
---
 database/scripts/populate_about_metadata.sql | 46 +++++++++++++++++++
 .../scripts/run_populate_about_metadata.sh   | 39 ++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 database/scripts/populate_about_metadata.sql
 create mode 100755 database/scripts/run_populate_about_metadata.sh

diff --git a/database/scripts/populate_about_metadata.sql b/database/scripts/populate_about_metadata.sql
new file mode 100644
index 0000000..765494d
--- /dev/null
+++ b/database/scripts/populate_about_metadata.sql
@@ -0,0 +1,46 @@
+-- Populate the _about.about table with metadata for OnSIDES tables
+-- Run after data loads to log table statistics
+
+-- Insert row counts for each table
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'product_label', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.product_label;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'product_adverse_effect', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.product_adverse_effect;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'vocab_meddra_adverse_effect', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.vocab_meddra_adverse_effect
+WHERE is_placeholder IS NOT TRUE;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'vocab_rxnorm_product', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.vocab_rxnorm_product
+WHERE is_placeholder IS NOT TRUE;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'product_to_rxnorm', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.product_to_rxnorm;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'vocab_rxnorm_ingredient', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.vocab_rxnorm_ingredient;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'vocab_rxnorm_ingredient_to_product', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.vocab_rxnorm_ingredient_to_product;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'high_confidence', 'row_count', COUNT(*)::text, CURRENT_TIMESTAMP
+FROM onsides.high_confidence;
+
+-- Insert version info from onsides.about
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'about', 'version', version, CURRENT_TIMESTAMP
+FROM onsides.about;
+
+INSERT INTO "_about".about ("schema", "table", attribute, value, "timestamp")
+SELECT 'onsides', 'about', 'data_sources', data_sources, CURRENT_TIMESTAMP
+FROM onsides.about;
\ No newline at end of file

diff --git a/database/scripts/run_populate_about_metadata.sh b/database/scripts/run_populate_about_metadata.sh
new file mode 100755
index 0000000..14af486
--- /dev/null
+++ b/database/scripts/run_populate_about_metadata.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Populate the _about.about table with metadata for OnSIDES tables
+# Usage: ./run_populate_about_metadata.sh
+
+# Check and prompt for required environment variables
+# Password can be set via PGPASSWORD env var or .pgpass file
+
+if [[ -z "${PGHOST:-}" ]]; then
+    read -p "Enter PGHOST (default: localhost): " PGHOST
+    PGHOST="${PGHOST:-localhost}"
+fi
+
+if [[ -z "${PGPORT:-}" ]]; then
+    read -p "Enter PGPORT (default: 5432): " PGPORT
+    PGPORT="${PGPORT:-5432}"
+fi
+
+if [[ -z "${PGUSER:-}" ]]; then
+    read -p "Enter PGUSER (default: postgres): " PGUSER
+    PGUSER="${PGUSER:-postgres}"
+fi
+
+if [[ -z "${PGDATABASE:-}" ]]; then
+    read -p "Enter PGDATABASE: " PGDATABASE
+    if [[ -z "$PGDATABASE" ]]; then
+        echo "PGDATABASE is required." >&2
+        exit 1
+    fi
+fi
+
+# Run the population script
+psql \
+    --host="$PGHOST" \
+    --port="$PGPORT" \
+    --username="$PGUSER" \
+    --dbname="$PGDATABASE" \
+    --file="database/scripts/populate_about_metadata.sql" \
+    --echo-errors \
+    --quiet
\ No newline at end of file

From 636b43b58b1491a910e429336aeb4fea4013e3d6 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Mon, 9 Feb 2026 14:48:49 -0500
Subject: [PATCH 18/20] derived tables

---
 snakemake/onsides/export/derived_tables.sql | 65 +++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 snakemake/onsides/export/derived_tables.sql

diff --git a/snakemake/onsides/export/derived_tables.sql b/snakemake/onsides/export/derived_tables.sql
new file mode 100644
index 0000000..53ce0bb
--- /dev/null
+++ b/snakemake/onsides/export/derived_tables.sql
@@ -0,0 +1,65 @@
+-- Adapted SQL for derived tables in OnSIDES ETL
+-- Uses final SQLite DB table names and SQLite-compatible functions
+
+drop table if exists derived_ingred_to_adr_cnt;
+create table derived_ingred_to_adr_cnt as
+with labs as (
+    select distinct ptr.rxnorm_product_id,
+        pl.*
+    from product_to_rxnorm ptr
+        inner join product_label pl on pl.label_id = ptr.label_id
+)
+select distinct c2.rxnorm_id ingred_concept_id, c2.rxnorm_id ingred_rxcui, c2.rxnorm_name ingred_name, label_section,
+    c1.meddra_id meddra_concept_id, c1.meddra_id meddra_code, c1.meddra_name meddra_term, c1.meddra_term_type meddra_term_type,
+    count(distinct pae.product_label_id) cnt
+from labs
+    inner join product_adverse_effect pae on pae.product_label_id = labs.label_id
+    inner join vocab_meddra_adverse_effect c1 on c1.meddra_id = pae.effect_meddra_id
+    inner join vocab_rxnorm_ingredient_to_product ca on ca.product_id = labs.rxnorm_product_id
+    inner join vocab_rxnorm_ingredient c2 on ca.ingredient_id = c2.rxnorm_id
+where c2.rxnorm_term_type = 'Ingredient'
+    and pae.match_method = 'PMB'
+    and labs.source = 'US'
+    and pae.pred1 >= 3.258
+group by c2.rxnorm_id, c2.rxnorm_id, c2.rxnorm_name, label_section,
+    c1.meddra_id, c1.meddra_id, c1.meddra_name, c1.meddra_term_type
+order by ingred_name, label_section, meddra_term, meddra_term_type;
+
+create index derived_ingred_to_adr_cnt_ingred_rxcui_idx on derived_ingred_to_adr_cnt(ingred_rxcui);
+create index derived_ingred_to_adr_cnt_ingred_meddra_code_idx on derived_ingred_to_adr_cnt(meddra_code);
+
+drop table if exists derived_branded_product_labels;
+create table derived_branded_product_labels as
+select distinct substr(pl.source_product_id, 1, instr(pl.source_product_id, '.') - 1) AS setid,
+    substr(pl.source_product_id, instr(pl.source_product_id, '.') + 1) AS label_version,
+    c1.rxnorm_name label_title,
+    c2.rxnorm_id ingred_concept_id, c2.rxnorm_id ingred_rxcui, c2.rxnorm_name ingred_name
+from product_to_rxnorm ptr
+    inner join product_label pl on pl.label_id = ptr.label_id
+    inner join vocab_rxnorm_product c1 on c1.rxnorm_id = ptr.rxnorm_product_id
+    inner join vocab_rxnorm_ingredient_to_product ca on c1.rxnorm_id = ca.product_id
+    inner join vocab_rxnorm_ingredient c2 on c2.rxnorm_id = ca.ingredient_id
+where pl.source = 'US'
+    and c1.rxnorm_term_type in ('Branded Dose Group','Branded Drug','Branded Drug Comp','Branded Drug Form','Branded Pack','Brand Name')
+    and c2.rxnorm_term_type = 'Ingredient';
+
+create index derived_branded_product_labels_ingred_rxcui on derived_branded_product_labels(ingred_rxcui);
+create index derived_branded_product_labels_ingred_setid on derived_branded_product_labels(setid);
+
+drop table if exists derived_generic_product_labels;
+create table derived_generic_product_labels as
+select distinct substr(pl.source_product_id, 1, instr(pl.source_product_id, '.') - 1) AS setid,
+    substr(pl.source_product_id, instr(pl.source_product_id, '.') + 1) AS label_version,
+    c1.rxnorm_name label_title,
+    c2.rxnorm_id ingred_concept_id, c2.rxnorm_id ingred_rxcui, c2.rxnorm_name ingred_name
+from product_to_rxnorm ptr
+    inner join product_label pl on pl.label_id = ptr.label_id
+    inner join vocab_rxnorm_product c1 on c1.rxnorm_id = ptr.rxnorm_product_id
+    inner join vocab_rxnorm_ingredient_to_product ca on c1.rxnorm_id = ca.product_id
+    inner join vocab_rxnorm_ingredient c2 on c2.rxnorm_id = ca.ingredient_id
+where pl.source = 'US'
+    and c1.rxnorm_term_type not in ('Branded Dose Group','Branded Drug','Branded Drug Comp','Branded Drug Form','Branded Pack','Brand Name')
+    and c2.rxnorm_term_type = 'Ingredient';
+
+create index derived_generic_product_labels_ingred_rxcui on derived_generic_product_labels(ingred_rxcui);
+create index derived_generic_product_labels_ingred_setid on derived_generic_product_labels(setid);

From 038bd45f77a161305c9b45909333e7cf461f0c7c Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Mon, 9 Feb 2026 14:49:08 -0500
Subject: [PATCH 19/20] derived tables in snake make

---
 snakemake/onsides/export/Snakefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/snakemake/onsides/export/Snakefile b/snakemake/onsides/export/Snakefile
index 09960a8..212f42d 100644
--- a/snakemake/onsides/export/Snakefile
+++ b/snakemake/onsides/export/Snakefile
@@ -167,6 +167,7 @@ rule combine_results_to_sqlite:
         vocab_sql = "snakemake/onsides/export/final_vocabulary_tables.sql",
         filter_sql = "snakemake/onsides/export/filter_deficient.sql",
         threshold_sql = "snakemake/onsides/export/threshold.sql",
+        derived_sql = "snakemake/onsides/export/derived_tables.sql",
     output: "database/onsides.db"
     run:
         from sqlmodel import create_engine
@@ -196,6 +197,7 @@ rule combine_results_to_sqlite:
 
         sqlite_scripts = [
            input.threshold_sql,
+           input.derived_sql,
         ]
         for file in sqlite_scripts:
            print(f"Running: 'sqlite3 database/onsides.db < {file}'")

From 0637b343083dbc1b07f0e13c76944f4bd59f27f6 Mon Sep 17 00:00:00 2001
From: wolfderby
Date: Mon, 9 Feb 2026 14:49:19 -0500
Subject: [PATCH 20/20] instructions

---
 .github/copilot-instructions.md | 109 ++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index e69de29..50e91de 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,109 @@
+# AI Coding Agent Guidelines for OnSIDES
+
+## Overview
+OnSIDES is a comprehensive database of drugs and their adverse events, extracted from drug product labels using fine-tuned natural language processing models. The project is structured to support database generation, querying, and analysis workflows.
+
+### Key Components:
+1. **Data**: Raw data files and annotations are stored in the `data/` directory.
+2. **Database**: Schema files and helper scripts for MySQL, PostgreSQL, and SQLite are in `database/schema/`.
+3. **Source Code**: Python scripts for data processing, database interactions, and predictions are in `src/onsides/`.
+4. **Snakemake Pipelines**: Workflow automation scripts for data parsing and evaluation are in `snakemake/`.
+5. **Documentation**: Example queries and developer notes are in `docs/`.
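+
+As a rough sketch of the layout (directories as described above; not exhaustive):
+
+```text
+data/              # raw data files and annotations (CSVs under data/csv/)
+database/schema/   # MySQL, PostgreSQL, and SQLite schema files
+src/onsides/       # Python modules (db.py, predict.py, stringsearch.py)
+snakemake/         # Snakemake pipelines for parsing and evaluation
+docs/              # example queries and developer notes
+```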
+
+---
+
+## Developer Workflows
+
+### Setting Up the Database
+- Use the schema files in `database/schema/` to create the database.
+- Example scripts for loading data:
+  - `database_scripts/mysql.sh`
+  - `database_scripts/postgres.sh`
+  - `database_scripts/sqlite.sh`
+- For PostgreSQL, use `data_our_improvements/schema/postgres_v3.1.0_fixed.sql` for the latest schema.
+
+### Running Snakemake Pipelines
+- Navigate to the appropriate subdirectory in `snakemake/` (e.g., `snakemake/eu/parse/`).
+- Execute the pipeline:
+  ```bash
+  snakemake --snakefile Snakefile
+  ```
+
+### Querying the Database
+- Example SQL queries are provided in `docs/README.md`.
+- Use `summarize.sql` and `test.sql` in `database/` for testing and summarization.
+
+---
+
+## Project-Specific Conventions
+
+### Code Organization
+- Python modules are in `src/onsides/`.
+  - `db.py`: Database connection utilities.
+  - `predict.py`: Prediction logic using ClinicalBERT.
+  - `stringsearch.py`: String matching utilities.
+
+### Data Loading
+- CSV files are in `data/csv/`.
+- Use `load_remaining_onsides.sh` to load additional data.
+
+### Testing
+- Unit tests are in `src/onsides/test_stringsearch.py`.
+- Run tests with:
+  ```bash
+  pytest src/onsides/
+  ```
+
+---
+
+## Integration Points
+
+### External Dependencies
+- **Podman**: Used for containerized database setups.
+- **pgloader**: For importing SQLite databases into PostgreSQL.
+- **Snakemake**: Workflow management.
+
+### Cross-Component Communication
+- Data flows from `data/` to `database/` via schema scripts and loading utilities.
+- Snakemake pipelines automate parsing and evaluation workflows.
+
+---
+
+## Examples
+
+### Example Query: Find Ingredients for Renal Injury
+```sql
+SELECT DISTINCT i.*
+FROM product_label
+INNER JOIN product_to_rxnorm USING (label_id)
+INNER JOIN vocab_rxnorm_ingredient_to_product ON rxnorm_product_id = product_id
+INNER JOIN vocab_rxnorm_ingredient i ON ingredient_id = rxnorm_id
+INNER JOIN product_adverse_effect ON label_id = product_label_id
+INNER JOIN vocab_meddra_adverse_effect ON effect_meddra_id = meddra_id
+WHERE meddra_name = 'Renal injury';
+```
+
+### Example Query: Fraction of US Products with Headache Label
+```sql
+WITH n_headache AS (
+    SELECT COUNT(DISTINCT label_id) AS n
+    FROM product_label
+    INNER JOIN product_adverse_effect ON label_id = product_label_id
+    INNER JOIN vocab_meddra_adverse_effect ON effect_meddra_id = meddra_id
+    WHERE source = 'US' AND meddra_name = 'Headache'
+),
+n_overall AS (
+    SELECT COUNT(DISTINCT label_id) AS n
+    FROM product_label
+    INNER JOIN product_adverse_effect ON label_id = product_label_id
+    INNER JOIN vocab_meddra_adverse_effect ON effect_meddra_id = meddra_id
+    WHERE source = 'US'
+)
+SELECT CAST(n_headache.n AS real) / n_overall.n AS frac_with_headache
+FROM n_headache, n_overall;
+```
+
+---
+
+## Contact
+For questions or issues, refer to the [README.md](../README.md) or contact the maintainers via GitHub Issues.
\ No newline at end of file