Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .github/workflows/cml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ on: pull_request
jobs:
run:
runs-on: [ubuntu-latest]
container: docker://dvcorg/cml-py3:latest
steps:
- uses: iterative/setup-cml@v2
- uses: iterative/setup-dvc@v1
- uses: actions/checkout@v3
# - uses: iterative/setup-cml@v2
# - uses: iterative/setup-dvc@v1
- uses: actions/checkout@v2
- name: CML run
with:
fetch-depth: 2
# Needed for https://github.com/iterative/example-repos-dev/issues/225
Expand All @@ -24,7 +26,7 @@ jobs:
git fetch origin main:main
fi

dvc pull eval
dvc repro
dvc plots diff $PREVIOUS_REF workspace \
--show-vega --targets ROC | json5 > vega.json
vl2svg vega.json roc.svg
Expand Down
38 changes: 19 additions & 19 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,27 @@ stages:
nfiles: 2
- path: src/prepare/collate.py
hash: md5
md5: 196f211d4aa609e4623bfbaa78048d6d
md5: 075f9c5c4e5e0a24a7baece2c1dffe62
size: 1328
params:
params.yaml:
prepare.seed: 42
outs:
- path: data/collated/data.csv
hash: md5
md5: 6c011536ae995e2558840b7064599104
md5: a8ee15642e77d9eb17c55b58cf0d9dbc
size: 92520291
split:
cmd: python src/prepare/split.py data/data.csv
cmd: python src/prepare/split.py data/collated/data.csv
deps:
- path: data/data.csv
- path: data/collated/data.csv
hash: md5
md5: a8ee15642e77d9eb17c55b58cf0d9dbc
size: 92520291
- path: src/prepare/split.py
hash: md5
md5: 7b17f69ba203ff9290e23cccba4330dd
size: 1794
md5: a543b0bfb12d98b8b92faf6c01b3102f
size: 1607
params:
params.yaml:
prepare.seed: 42
Expand All @@ -73,21 +73,21 @@ stages:
size: 4065
params:
params.yaml:
featurize.max_features: 200
featurize.max_features: 10
featurize.ngrams: 2
outs:
- path: data/features
hash: md5
md5: c0fde86f821aa3994f8699dc6dd1923b.dir
size: 15293681
md5: 145b99ce99000a8fc8be0e08c3517578.dir
size: 2882605
nfiles: 2
train:
cmd: python src/train.py data/features model.pkl
deps:
- path: data/features
hash: md5
md5: c0fde86f821aa3994f8699dc6dd1923b.dir
size: 15293681
md5: 145b99ce99000a8fc8be0e08c3517578.dir
size: 2882605
nfiles: 2
- path: src/train.py
hash: md5
Expand All @@ -101,29 +101,29 @@ stages:
outs:
- path: model.pkl
hash: md5
md5: cb60759246d8999b8d457ba56607d5d6
size: 2278913
md5: 8a6cd2f4570d1170fbdb4883b82e92dc
size: 1322189
evaluate:
cmd: python src/evaluate.py model.pkl data/features
deps:
- path: data/features
hash: md5
md5: c0fde86f821aa3994f8699dc6dd1923b.dir
size: 15293681
md5: 145b99ce99000a8fc8be0e08c3517578.dir
size: 2882605
nfiles: 2
- path: model.pkl
hash: md5
md5: cb60759246d8999b8d457ba56607d5d6
size: 2278913
md5: 8a6cd2f4570d1170fbdb4883b82e92dc
size: 1322189
- path: src/evaluate.py
hash: md5
md5: a1a59f55636170fb56e0c6afd3e28fa4
size: 3315
outs:
- path: eval
hash: md5
md5: 1332ef2ad9a41a6d73681be5ea93c15b.dir
size: 6196803
md5: cbbbc5e755b9bc6ac69c65213c4e4a51.dir
size: 6116699
nfiles: 8
redact:
cmd: python src/prepare/redact.py data/cleaned
Expand Down
47 changes: 34 additions & 13 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,56 @@ artifacts:
- classification
- news
stages:

# sources not redacted

# clean:
# cmd: python src/prepare/clean.py data/raw
# deps:
# - data/raw
# - src/prepare/clean.py
# outs:
# - data/cleaned
# redact:
# cmd: python src/prepare/redact.py data/cleaned
# deps:
# - data/cleaned
# - src/prepare/redact.py
# outs:
# - data/redacted
# collate:
# cmd: python src/prepare/collate.py data/redacted
# cmd: python src/prepare/collate.py data/cleaned
# deps:
# - data/redacted
# - data/cleaned
# - src/prepare/collate.py
# params:
# - prepare.seed
# outs:
# - data/collated/data.csv

# sources redacted

clean:
cmd: python src/prepare/clean.py data/raw
deps:
- data/raw
- src/prepare/clean.py
outs:
- data/cleaned
redact:
cmd: python src/prepare/redact.py data/cleaned
deps:
- data/cleaned
- src/prepare/redact.py
outs:
- data/redacted
collate:
cmd: python src/prepare/collate.py data/redacted
deps:
- data/redacted
- src/prepare/collate.py
params:
- prepare.seed
outs:
- data/collated/data.csv

split:
# cmd: python src/prepare/split.py data/collated/data.csv
cmd: python src/prepare/split.py data/data.csv
cmd: python src/prepare/split.py data/collated/data.csv
deps:
# - data/collated/data.csv
- data/data.csv
- data/collated/data.csv
- src/prepare/split.py
params:
- prepare.seed
Expand Down
2 changes: 1 addition & 1 deletion params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ prepare:
seed: 42

featurize:
max_features: 200
max_features: 10
ngrams: 2

train:
Expand Down
7 changes: 2 additions & 5 deletions src/prepare/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,8 @@ def split(dataset, train_out, test_out, split, seed):
X, y, test_size=split, stratify=y, random_state=seed
)

# train = pd.concat([X_train, y_train], axis=0)
# test = pd.concat([X_test, y_test], axis=0)

train = pd.DataFrame({"id": X_train['id'], "text": X_train['text'], "label": y_train})
test = pd.DataFrame({"id": X_test['id'], "text": X_test['text'], "label": y_test})
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# save train and test data
os.makedirs(os.path.join("data", "prepared"), exist_ok=True)
Expand Down