diff --git a/analysis.sh b/analysis.sh old mode 100644 new mode 100755 index 8cc2da0..c49f7f3 --- a/analysis.sh +++ b/analysis.sh @@ -8,8 +8,8 @@ bash pretraining.sh "$DIR"/pretraining # Then generate negative decoys # This step is very CPU and RAM intensive -bash negative_decoys.sh "$DIR"/negative_decoys "$CPU" +# bash negative_decoys.sh "$DIR"/negative_decoys "$CPU" # Finally perform supervised training and evaluate the model # This step is faster with a GPU -bash train_and_evaluate.sh "$DIR"/negative_decoys/datasets "$DIR"/pretraining/model "$DIR"/training \ No newline at end of file +# bash train_and_evaluate.sh "$DIR"/negative_decoys/datasets "$DIR"/pretraining/model "$DIR"/training diff --git a/bertrand-job.sh b/bertrand-job.sh new file mode 100644 index 0000000..44f6e43 --- /dev/null +++ b/bertrand-job.sh @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH --mail-type=ALL # Powiadomienia mailowe. Opcje: NONE, BEGIN, END, FAIL, ALL +#SBATCH --mail-user=jacenko.vlad@gmail.com # adres e-mail +#SBATCH --ntasks=4 # Uruchomienie na jednym procesorze +#SBATCH --mem=32gb +#SBATCH --gpus=a100:1 +#SBATCH --time=72:00:00 # maksymalny limit czasu DD-HH:MM:SS +#SBATCH --partition=long + +pwd; hostname; date + +source /home2/sfglab/yvladyslav/anaconda3/etc/profile.d/conda.sh +cd bertrand +conda activate bertrand +./analysis.sh "/home2/sfglab/yvladyslav/pretrain-mlm/bertrand_results" 4 + +date diff --git a/bertrand/pretraining/peptide_tcr_repertoire.py b/bertrand/pretraining/peptide_tcr_repertoire.py index 0ea5894..b792b11 100644 --- a/bertrand/pretraining/peptide_tcr_repertoire.py +++ b/bertrand/pretraining/peptide_tcr_repertoire.py @@ -80,11 +80,10 @@ def read_peptides(fn: str) -> pd.DataFrame: logging.info(f"{len(presented_peptides)} peptides read") presented_unique = ( presented_peptides.reset_index() - .groupby("Peptide2") + .groupby("peptide_seq") .agg( { "HLA_type": lambda x: "|".join(sorted(x)), - "index": lambda x: "|".join(sorted(x)), } ) .reset_index() @@ -114,7 +113,7 @@ def sample_peptide_tcr_repertoire( ) peptides_sampled.loc[:, "CDR3b"] = synthetic_tcrs.values - peptide_tcr_repertoire = peptides_sampled.rename(columns={"Peptide2": "Peptide"}) + peptide_tcr_repertoire = peptides_sampled.rename(columns={"peptide_seq": "Peptide"}) return peptide_tcr_repertoire diff --git a/bertrand/training/evaluate.py b/bertrand/training/evaluate.py index 48c3791..d1b8b0f 100644 --- a/bertrand/training/evaluate.py +++ b/bertrand/training/evaluate.py @@ -1,7 +1,6 @@ import os import shutil from copy import deepcopy -from functools import partial from glob import glob from typing import Union, List, Tuple, Dict diff --git a/env.yml b/env.yml index 52be727..d15a02a 100644 --- a/env.yml +++ b/env.yml @@ -1,27 +1,135 @@ name: bertrand channels: + - nvidia - pytorch - defaults dependencies: - - biopython=1.78=py38h7b6447c_0 - - h5py=2.10.0=py38hd6299e0_1 - - hdf5=1.10.6=hb1b8bf9_0 - - joblib=1.1.0=pyhd3eb1b0_0 - - matplotlib=3.3.4=py38h06a4308_0 - - numpy=1.21.2=py38h20f2e39_0 - - pandas=1.4.1=py38h295c915_0 - - pip=21.2.4=py38h06a4308_0 + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - biopython=1.78=py38h7f8727e_0 + - blas=1.0=mkl + - bottleneck=1.3.5=py38h7deecbd_0 + - brotli=1.0.9=h5eee18b_7 + - brotli-bin=1.0.9=h5eee18b_7 + - ca-certificates=2023.01.10=h06a4308_0 + - contourpy=1.0.5=py38hdb19cb5_0 + - cudatoolkit=11.5.1=hcf5317a_9 + - cycler=0.11.0=pyhd3eb1b0_0 + - dbus=1.13.18=hb2f20db_0 + - expat=2.4.9=h6a678d5_0 + - fftw=3.3.9=h27cfd23_1 + - fontconfig=2.14.1=h52c9d5c_1 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.12.1=h4a9f257_0 + - giflib=5.2.1=h5eee18b_3 + - glib=2.63.1=h5a9c865_0 + - gst-plugins-base=1.14.0=hbbd80ab_1 + - gstreamer=1.14.0=hb453b48_1 + - h5py=3.7.0=py38h737f45e_0 + - hdf5=1.10.6=h3ffc7dd_1 + - icu=58.2=he6710b0_3 + - importlib_resources=5.2.0=pyhd3eb1b0_1 + - intel-openmp=2021.4.0=h06a4308_3561 + - joblib=1.2.0=py38h06a4308_0 + - jpeg=9e=h5eee18b_1 + - kiwisolver=1.4.4=py38h6a678d5_0 + - lcms2=2.12=h3be6417_0 + - lerc=3.0=h295c915_0 + - libbrotlicommon=1.0.9=h5eee18b_7 + - libbrotlidec=1.0.9=h5eee18b_7 + - libbrotlienc=1.0.9=h5eee18b_7 + - libdeflate=1.17=h5eee18b_0 + - libedit=3.1.20221030=h5eee18b_0 + - libffi=3.2.1=hf484d3e_1007 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=11.2.0=h00389a5_1 + - libgfortran5=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libpng=1.6.39=h5eee18b_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtiff=4.5.0=h6a678d5_2 + - libuuid=1.41.5=h5eee18b_0 + - libuv=1.44.2=h5eee18b_0 + - libwebp=1.2.4=h11a3e52_1 + - libwebp-base=1.2.4=h5eee18b_1 + - libxcb=1.15=h7f8727e_0 + - libxml2=2.9.14=h74e7548_0 + - lz4-c=1.9.4=h6a678d5_0 + - matplotlib=3.7.1=py38h06a4308_1 + - matplotlib-base=3.7.1=py38h417a72b_1 + - mkl=2021.4.0=h06a4308_640 + - mkl-service=2.4.0=py38h7f8727e_0 + - mkl_fft=1.3.1=py38hd3c417c_0 + - mkl_random=1.2.2=py38h51133e4_0 + - munkres=1.1.4=py_0 + - ncurses=6.4=h6a678d5_0 + - numexpr=2.8.4=py38he184ba9_0 + - numpy=1.22.3=py38he7a7128_0 + - numpy-base=1.22.3=py38hf524024_0 + - openssl=1.1.1t=h7f8727e_0 + - packaging=23.0=py38h06a4308_0 + - pandas=1.5.3=py38h417a72b_0 + - pcre=8.45=h295c915_0 + - pillow=9.4.0=py38h6a678d5_0 + - pip=23.0.1=py38h06a4308_0 + - pyparsing=3.0.9=py38h06a4308_0 + - pyqt=5.9.2=py38h05f1152_4 - python=3.8.0=h0371630_2 - - pytorch=1.11.0=py3.8_cuda10.2_cudnn7.6.5_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - pytorch=1.11.0=py3.8_cuda11.5_cudnn8.3.2_0 + - pytorch-mutex=1.0=cuda + - pytz=2022.7=py38h06a4308_0 + - qt=5.9.7=h5867ecd_1 + - readline=7.0=h7b6447c_5 - scikit-learn=0.24.2=py38ha9443f7_0 - - scipy=1.7.3=py38hc147768_0 - - seaborn=0.11.1=pyhd3eb1b0_0 - - tokenizers=0.10.3=py38hb317417_1 - - tqdm=4.62.3=pyhd3eb1b0_1 + - scipy=1.7.3=py38h6c91a56_2 + - seaborn=0.12.2=py38h06a4308_0 + - setuptools=66.0.0=py38h06a4308_0 + - sip=4.19.13=py38h295c915_0 + - six=1.16.0=pyhd3eb1b0_1 + - sqlite=3.33.0=h62c20be_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - tk=8.6.12=h1ccaba5_0 + - tokenizers=0.11.4=py38h3dcd8bd_1 + - tornado=6.2=py38h5eee18b_0 + - tqdm=4.65.0=py38hb070fc8_0 + - typing_extensions=4.5.0=py38h06a4308_0 + - wheel=0.38.4=py38h06a4308_0 + - xz=5.4.2=h5eee18b_0 + - zipp=3.11.0=py38h06a4308_0 + - zlib=1.2.13=h5eee18b_0 + - zstd=1.5.5=hc292b87_0 - pip: - - datasets==1.18.3 - - fastcluster==1.2.4 - - leven==1.0.4 - - pytorch-lightning==0.7.1 - - transformers==4.16.2 -prefix: /home/ardigen/miniconda3/envs/bertrand + - aiohttp==3.8.4 + - aiosignal==1.3.1 + - async-timeout==4.0.2 + - attrs==23.1.0 + - certifi==2023.5.7 + - charset-normalizer==3.1.0 + - click==8.1.3 + - datasets==2.12.0 + - dill==0.3.6 + - fastcluster==1.2.6 + - filelock==3.12.0 + - frozenlist==1.3.3 + - fsspec==2023.5.0 + - huggingface-hub==0.14.1 + - idna==3.4 + - leven==1.0.4 + - lightning-utilities==0.8.0 + - multidict==6.0.4 + - multiprocess==0.70.14 + - nose==1.3.7 + - pyarrow==12.0.0 + - pytorch-lightning==2.0.2 + - pyyaml==6.0 + - regex==2023.5.5 + - requests==2.31.0 + - responses==0.18.0 + - sacremoses==0.0.53 + - torchmetrics==0.11.4 + - transformers==4.16.2 + - urllib3==2.0.2 + - xxhash==3.2.0 + - yarl==1.9.2 +prefix: /home2/sfglab/yvladyslav/anaconda3/envs/bertrand diff --git a/train_and_evaluate.sh b/train_and_evaluate.sh index a350cd5..4df5aee 100644 --- a/train_and_evaluate.sh +++ b/train_and_evaluate.sh @@ -1,4 +1,4 @@ -set -x +set -ex DATA_DIR=$1 MODEL_DIR=$2 OUT_DIR=$3 @@ -9,9 +9,9 @@ python -m bertrand.training.train \ --input-dir=$DATA_DIR \ --model-ckpt=$MODEL_DIR \ --output-dir=$OUT_DIR \ - --n-splits=21 + --n-splits=1 python -m bertrand.training.evaluate \ --datasets-dir=$DATA_DIR \ --results-dir=$OUT_DIR \ - --out=$OUT_DIR/results.csv \ No newline at end of file + --out=$OUT_DIR/results.csv