forked from cdcai/premier_analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_preprocessing.sh
More file actions
41 lines (31 loc) · 974 Bytes
/
run_preprocessing.sh
File metadata and controls
41 lines (31 loc) · 974 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env bash
# == Automate running data pre-processing steps
#
# Setup stage: creates the output directories under the current working
# directory, activates a local virtualenv when one is present, and prints
# the location of the `python` executable found on PATH.

# Construct output paths. out_path keeps its trailing slash, so the
# sub-directories are appended without an extra separator.
out_path="${PWD}/output/"
pkl_path="${out_path}pkl"
parquet_path="${out_path}parquet"

# Paths are quoted so a working directory containing spaces is not split.
if ! mkdir -p -- "${pkl_path}" "${parquet_path}"; then
  echo "Could not create output directories under ${out_path}" >&2
  exit 1
fi

# Check if virtualenv present, and if so activate it.
# A venv created on Windows has Scripts/activate; one created on
# Linux/macOS has bin/activate. Prefer the Windows layout (the original
# behaviour) and fall back to the POSIX layout.
venv_path="${PWD}/venv/Scripts/activate"
if [[ ! -f "${venv_path}" && -f "${PWD}/venv/bin/activate" ]]; then
  venv_path="${PWD}/venv/bin/activate"
fi

# Activate virtualenv
if [[ -f "${venv_path}" ]]; then
  echo "Virualenv found, activating"
  # shellcheck disable=SC1090  # path is only known at run time
  source "${venv_path}"
fi

# Echo the python location. `command -v` is a shell builtin, so it works
# the same way in Git Bash, Linux and macOS and reports the executable
# that a plain `python` call will resolve to.
if py_location=$(command -v python); then
  echo "${py_location}"
else
  echo "WARNING: no 'python' executable found on PATH" >&2
fi
# Feature-extraction stage: process EHR data to summarized parquet and
# produce feature dicts by running python/feature_extraction.py (resolved
# against the current working directory) with the `python` on PATH.
# Arguments: none
# Returns:   the exit status of the python process
run_feature_extraction() {
  local extraction_script="${PWD}/python/feature_extraction.py"
  python "${extraction_script}"
}
# Encoding stage: convert text features to their integer representation
# by running python/features_tokenization.py (resolved against the current
# working directory) with the `python` on PATH.
# NOTE(review): the previous inline comment here read "Run DAN", which does
# not match the script being invoked - confirm whether it was stale.
# Arguments: none
# Returns:   the exit status of the python process
run_encoding() {
  local tokenization_script="${PWD}/python/features_tokenization.py"
  python "${tokenization_script}"
}
# Run the two stages in order. Encoding is only attempted when feature
# extraction succeeded; otherwise the script stops and exits with the
# failing stage's status instead of carrying on after an error.
echo "Running feature extraction >>"
run_feature_extraction || {
  status=$?
  echo "Feature extraction failed (exit ${status}); skipping encoding" >&2
  exit "${status}"
}

echo "Converting text features to integer >>"
# Last command: its exit status becomes the script's exit status.
run_encoding