From fa3db99da497af9dcefa65b3ded62f35f150ff1b Mon Sep 17 00:00:00 2001 From: GP Saggese Date: Thu, 24 Jul 2025 07:59:37 -0400 Subject: [PATCH 1/3] Update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- CLAUDE.md | 6 + config_root/config/test/test_config.py | 10 +- config_root/config/test/test_config_utils.py | 8 +- .../documentation/render_images.py | 6 +- .../Test_notes_to_pdf1.test3/output/test.txt | 4 +- .../output/test.txt | 24 - .../input/input1.txt | 0 .../output/test.txt | 0 .../input/input1.txt | 0 .../output/test.txt | 30 + .../input/input1.txt | 1817 +++++++++++++++++ .../output/test.txt | 1775 ++++++++++++++++ .../documentation/test/test_lint_notes.py | 10 +- .../test/test_preprocess_notes.py | 306 +-- .../documentation/test/test_render_images.py | 5 +- dev_scripts_helpers/llms/ai_review.py | 3 +- dev_scripts_helpers/llms/llm_transform.py | 1 + .../all.coding_style_guidelines.reference.md | 3 - docs/tools/all.ai_review.how_to_guide.md | 60 +- helpers/hcache_simple.py | 2 +- helpers/hllm.py | 3 +- helpers/hmarkdown.py | 1 + helpers/hmarkdown_coloring.py | 122 +- helpers/hmarkdown_rules.py | 16 +- helpers/hmarkdown_slides.py | 2 +- helpers/hmarkdown_tables.py | 120 ++ helpers/hmarkdown_toc.py | 25 + helpers/hmkdocs.py | 22 +- helpers/hplayback.py | 8 +- helpers/hunit_test.py | 4 +- helpers/hunit_test_purification.py | 48 +- helpers/test/test_hgit.py | 6 +- helpers/test/test_hmarkdown_bullets.py | 8 +- helpers/test/test_hmarkdown_coloring.py | 176 +- helpers/test/test_hmarkdown_headers.py | 4 +- helpers/test/test_hmarkdown_rules.py | 29 +- helpers/test/test_hmarkdown_tables.py | 196 ++ helpers/test/test_hmarkdown_toc.py | 101 + helpers/test/test_hmkdocs.py | 96 - helpers/test/test_hparquet.py | 4 +- helpers/test/test_hunit_test.py | 8 +- helpers/test/test_hunit_test_purification.py | 55 +- helpers/test/test_lib_tasks_docker_release.py | 6 + 
helpers/test/test_repo_config_amp.py | 8 +- helpers/unit_test_template.py | 21 - linters/test/test_amp_check_import.py | 8 +- linters/test/test_amp_check_shebang.py | 41 +- linters/test/test_amp_class_method_order.py | 268 +-- linters/test/test_amp_fix_comment.py | 3 +- unit_test_template.py | 28 +- 50 files changed, 4830 insertions(+), 677 deletions(-) delete mode 100644 dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test2/output/test.txt rename dev_scripts_helpers/documentation/test/outcomes/{Test_preprocess_notes1.test1 => Test_preprocess_notes_executable1.test1}/input/input1.txt (100%) rename dev_scripts_helpers/documentation/test/outcomes/{Test_preprocess_notes1.test1 => Test_preprocess_notes_executable1.test1}/output/test.txt (100%) rename dev_scripts_helpers/documentation/test/outcomes/{Test_preprocess_notes1.test2 => Test_preprocess_notes_executable1.test2}/input/input1.txt (100%) create mode 100644 dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test2/output/test.txt create mode 100644 dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/input/input1.txt create mode 100644 dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/output/test.txt create mode 100644 helpers/hmarkdown_tables.py create mode 100644 helpers/hmarkdown_toc.py create mode 100644 helpers/test/test_hmarkdown_tables.py create mode 100644 helpers/test/test_hmarkdown_toc.py delete mode 100644 helpers/unit_test_template.py diff --git a/CLAUDE.md b/CLAUDE.md index 256b163a7..70a766646 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -104,3 +104,9 @@ import config_root.config.config_ as crococon ### Code Conventions - Coding guidelines are in `docs/code_guidelines/all.coding_style_guidelines.reference.md` + +- Templates for code are: + - `code_template.py`: template for code + - `unit_test_template.py`: template for unit test + - 
`dev_scripts_helpers/coding_tools/script_template.py`: template for + self-standing Python script diff --git a/config_root/config/test/test_config.py b/config_root/config/test/test_config.py index f75c0acef..cd4b35547 100644 --- a/config_root/config/test/test_config.py +++ b/config_root/config/test/test_config.py @@ -2300,13 +2300,19 @@ def execute_stmt( raise ValueError(f"Invalid mode={mode}") _LOG.debug("config=\n%s", actual) if expected is not None: - self.assert_equal(actual, expected, purify_text=True, fuzzy_match=True) + self.assert_equal( + actual, expected, purify_text=True, fuzzy_match=True + ) # Package the output. actual = hprint.frame(stmt) + "\n" + actual return actual def raise_stmt( - self, stmt: str, assertion_type: Any, expected: Optional[str], globals_: Dict + self, + stmt: str, + assertion_type: Any, + expected: Optional[str], + globals_: Dict, ) -> None: _LOG.debug("\n" + hprint.frame(stmt)) with self.assertRaises(assertion_type) as cm: diff --git a/config_root/config/test/test_config_utils.py b/config_root/config/test/test_config_utils.py index 65eae3c74..15fbea709 100644 --- a/config_root/config/test/test_config_utils.py +++ b/config_root/config/test/test_config_utils.py @@ -416,7 +416,9 @@ def test1(self) -> None: config1 = _get_test_config1() config2 = _get_test_config2() # - actual = cconfig.build_config_diff_dataframe({"1": config1, "2": config2}) + actual = cconfig.build_config_diff_dataframe( + {"1": config1, "2": config2} + ) actual = hpandas.df_to_str(actual, num_rows=None) # expected = pd.DataFrame( @@ -433,7 +435,9 @@ def test2(self) -> None: """ config1 = _get_test_config1() # - actual = cconfig.build_config_diff_dataframe({"1": config1, "2": config1}) + actual = cconfig.build_config_diff_dataframe( + {"1": config1, "2": config1} + ) actual = hpandas.df_to_str(actual, num_rows=None) # expected = """ diff --git a/dev_scripts_helpers/documentation/render_images.py b/dev_scripts_helpers/documentation/render_images.py index 
bdc4c33e0..9db0f1a5d 100755 --- a/dev_scripts_helpers/documentation/render_images.py +++ b/dev_scripts_helpers/documentation/render_images.py @@ -101,8 +101,10 @@ def _get_rendered_file_paths( # ############################################################################# -# Save cache to disk for persistence. -@hcacsimp.simple_cache(write_through=True) +@hcacsimp.simple_cache( + # Save cache to disk for persistence. + write_through=True +) def _render_image_code( image_code_txt: str, image_code_idx: int, diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/output/test.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/output/test.txt index 80fcc1126..2eff818f4 100644 --- a/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/output/test.txt +++ b/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/output/test.txt @@ -7,10 +7,10 @@ $GIT_ROOT/dev_scripts_helpers/documentation/preprocess_notes.py --input $GIT_ROO # render_images $GIT_ROOT/dev_scripts_helpers/documentation/render_images.py --in_file_name $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.preprocess_notes.txt --out_file_name $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.render_image.txt # run_pandoc -docker run --rm --user $(id -u):$(id -g) -e AM_GDRIVE_PATH -e AM_TELEGRAM_TOKEN -e CSFY_AWS_PROFILE -e CSFY_AWS_S3_BUCKET -e CSFY_ECR_BASE_PATH -e CSFY_HOST_NAME -e CSFY_HOST_OS_NAME -e CSFY_HOST_OS_VERSION -e CSFY_HOST_USER_NAME -e OPENAI_API_KEY -e OPENROUTER_API_KEY -e QUANDL_API_KEY --workdir /app --mount type=bind,source=$GIT_ROOT,target=/app tmp.pandoc_texlive.arm64.8689d816.xxxxxxxx /helpers_root/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.render_image2.txt --output 
/helpers_root/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.tex --template /helpers_root/dev_scripts_helpers/documentation/pandoc.latex -V geometry:margin=1in -f markdown --number-sections --highlight-style=tango -s -t latex +docker run --rm --user $(id -u):$(id -g) -e AM_CONTAINER_VERSION -e CSFY_AWS_ACCESS_KEY_ID -e CSFY_AWS_DEFAULT_REGION -e CSFY_AWS_PROFILE -e CSFY_AWS_S3_BUCKET -e CSFY_AWS_SECRET_ACCESS_KEY -e CSFY_AWS_SESSION_TOKEN -e CSFY_CI -e CSFY_ECR_BASE_PATH -e CSFY_ENABLE_DIND -e CSFY_FORCE_TEST_FAIL -e CSFY_GIT_ROOT_PATH -e CSFY_HELPERS_ROOT_PATH -e CSFY_HOST_GIT_ROOT_PATH -e CSFY_HOST_NAME -e CSFY_HOST_OS_NAME -e CSFY_HOST_OS_VERSION -e CSFY_HOST_USER_NAME -e CSFY_REPO_CONFIG_CHECK -e CSFY_REPO_CONFIG_PATH -e CSFY_TELEGRAM_TOKEN -e CSFY_USE_HELPERS_AS_NESTED_MODULE -e OPENAI_API_KEY --workdir $GIT_ROOT --mount type=bind,source=$CSFY_HOST_GIT_ROOT_PATH,target=$GIT_ROOT tmp.pandoc_texlive.aarch64.xxxxxxxx $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.render_image2.txt --output $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.tex --template $GIT_ROOT/dev_scripts_helpers/documentation/pandoc.latex -V geometry:margin=1in -f markdown --number-sections --highlight-style=tango -s -t latex # latex cp -f $GIT_ROOT/dev_scripts_helpers/documentation/latex_abbrevs.sty $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch -docker run --rm --user $(id -u):$(id -g) -e AM_GDRIVE_PATH -e AM_TELEGRAM_TOKEN -e CSFY_AWS_PROFILE -e CSFY_AWS_S3_BUCKET -e CSFY_ECR_BASE_PATH -e CSFY_HOST_NAME -e CSFY_HOST_OS_NAME -e CSFY_HOST_OS_VERSION -e CSFY_HOST_USER_NAME -e OPENAI_API_KEY -e OPENROUTER_API_KEY -e QUANDL_API_KEY --workdir /app --mount type=bind,source=$GIT_ROOT,target=/app tmp.latex.arm64.2f590c86.xxxxxxxx pdflatex -output-directory 
/helpers_root/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch --interaction=nonstopmode --halt-on-error --shell-escape /helpers_root/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.tex +docker run --rm --user $(id -u):$(id -g) -e AM_CONTAINER_VERSION -e CSFY_AWS_ACCESS_KEY_ID -e CSFY_AWS_DEFAULT_REGION -e CSFY_AWS_PROFILE -e CSFY_AWS_S3_BUCKET -e CSFY_AWS_SECRET_ACCESS_KEY -e CSFY_AWS_SESSION_TOKEN -e CSFY_CI -e CSFY_ECR_BASE_PATH -e CSFY_ENABLE_DIND -e CSFY_FORCE_TEST_FAIL -e CSFY_GIT_ROOT_PATH -e CSFY_HELPERS_ROOT_PATH -e CSFY_HOST_GIT_ROOT_PATH -e CSFY_HOST_NAME -e CSFY_HOST_OS_NAME -e CSFY_HOST_OS_VERSION -e CSFY_HOST_USER_NAME -e CSFY_REPO_CONFIG_CHECK -e CSFY_REPO_CONFIG_PATH -e CSFY_TELEGRAM_TOKEN -e CSFY_USE_HELPERS_AS_NESTED_MODULE -e OPENAI_API_KEY --workdir $GIT_ROOT --mount type=bind,source=$CSFY_HOST_GIT_ROOT_PATH,target=$GIT_ROOT tmp.latex.aarch64.xxxxxxxx pdflatex -output-directory $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch --interaction=nonstopmode --halt-on-error --shell-escape $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.tex # latex again \cp -af $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/tmp.notes_to_pdf.pdf $GIT_ROOT/dev_scripts_helpers/documentation/test/outcomes/Test_notes_to_pdf1.test3/tmp.scratch/output.pdf # copy_to_gdrive diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test2/output/test.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test2/output/test.txt deleted file mode 100644 index e67dbcfad..000000000 --- a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test2/output/test.txt +++ /dev/null @@ -1,24 +0,0 @@ -\let\emph\textit -\let\uline\underline -\let\ul\underline -# 14, Topics in Demand and 
Supply Analysis (p. 6, 843) - -## Introduction - -- **Economics** - - Economics is the study of: - - production - - distribution - - consumption - - Macroeconomics deals with aggregated economic quantities - - E.g., national output and national income - - Microeconomics deals with markets and decision making of individual economic - units - - E.g., consumers, businesses - - Microeconomics classifies private economic units into: - - consumers (aka households) - - consumption (= demand for goods and services) - - utility maximizing individuals (i.e., maximizing satisfaction from - present and future consumption) - - businesses (aka companies, firms) - - supply of goods and services by profit maximizing firms \ No newline at end of file diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test1/input/input1.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test1/input/input1.txt similarity index 100% rename from dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test1/input/input1.txt rename to dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test1/input/input1.txt diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test1/output/test.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test1/output/test.txt similarity index 100% rename from dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test1/output/test.txt rename to dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test1/output/test.txt diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test2/input/input1.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test2/input/input1.txt similarity index 100% rename from dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes1.test2/input/input1.txt 
rename to dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test2/input/input1.txt diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test2/output/test.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test2/output/test.txt new file mode 100644 index 000000000..fae3ebf94 --- /dev/null +++ b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test2/output/test.txt @@ -0,0 +1,30 @@ +--- +fontsize: 10pt +--- +\let\emph\textit +\let\uline\underline +\let\ul\underline +# 14, Topics in Demand and Supply Analysis (p. 6, 843) + +## Introduction + +- **Economics** +- Economics is the study of: + - production + - distribution + - consumption + +- Macroeconomics deals with aggregated economic quantities + - E.g., national output and national income + +- Microeconomics deals with markets and decision making of individual economic + units + - E.g., consumers, businesses + +- Microeconomics classifies private economic units into: + - consumers (aka households) + - consumption (= demand for goods and services) + - utility maximizing individuals (i.e., maximizing satisfaction from + present and future consumption) + - businesses (aka companies, firms) + - supply of goods and services by profit maximizing firms diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/input/input1.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/input/input1.txt new file mode 100644 index 000000000..ef2e285d6 --- /dev/null +++ b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/input/input1.txt @@ -0,0 +1,1817 @@ +::: columns +:::: {.column width=15%} +![](lectures_source/UMD_Logo.png) +:::: +:::: {.column width=75%} + +\vspace{0.4cm} +\begingroup \large +MSML610: Advanced Machine Learning +\endgroup +:::: +::: + +\vspace{1cm} + 
+\begingroup \Large +**$$\text{\blue{Machine Learning Techniques}}$$** +\endgroup +\vspace{1cm} + +**References**: + +- AIMA: ? + +- Hastie: ? + +// Model assessment and selection +// Hastie 7 (p. 238) + +# ############################################################################## +# Paradigms +# ############################################################################## + +* Machine Learning Paradigms with Examples (1/3) + +- **Supervised Learning** + - Learn from labeled data to predict labels for new inputs + - E.g., image classification using ResNet on ImageNet + +- **Unsupervised Learning** + - Discover hidden patterns or structure in unlabeled data + - E.g., K-means clustering for customer segmentation + +- **Reinforcement Learning** + - Learn through interaction with an environment, receiving rewards/punishments + - E.g., deep Q-Learning for playing Atari games + +- **Self-Supervised Learning** + - Generate pseudo-labels from unlabeled data to pre-train models + - E.g., BERT (Masked Language Modeling) + +- **Semi-Supervised Learning** + - Combine small labeled data with large unlabeled data to improve performance + - E.g., named entity recognition (NER) using annotated sentences with entity + tags combined with many raw text documents + +* Machine Learning Paradigms with Examples (2/3) + +- **Online Learning** + - Learn incrementally from a stream of data in real time + - E.g., online logistic regression for click-through rate prediction + +- **Multi-Task Learning** + - Train simultaneously a model to perform multiple related tasks + - E.g., learn sentiment analysis and question answering + +- **Meta-Learning** + - "Learning to learn": adapt quickly to new tasks using prior experience + - E.g., a model can be fine-tuned quickly on a new task using just a few + gradient steps + +- **Zero-Shot / Few-Shot Learning** + - Generalize to new tasks with no or few labeled examples + - E.g., GPT-4 solving tasks with zero-shot prompting + +- **Active Learning** 
+ - The model selects the most informative samples to be labeled by an oracle + (e.g., a human) + - E.g., pick samples where the model is least confident to get more examples + +* Machine Learning Paradigms with Examples (3/3) + +- **Federated Learning** + - Train models across decentralized devices without sharing raw data + - E.g., fraud detection or credit scoring across banks + +- **Evolutionary Learning** + - Optimize model structures or parameters using evolutionary algorithms + inspired by natural selection and genetics + - Gradient free, global search, discrete structures, variable length inputs + - E.g., genetic algorithms + +- **Curriculum Learning** + - Train models on easier tasks first, gradually increasing difficulty + - E.g., curriculum-based training in robotic control simulations + +- **Multi-Agent Learning** + - Multiple agents learn and interact in shared environments, often in + game-theoretic settings (e.g., competition, collaboration) + - E.g., AlphaStar to play StarCraft II + +* Supervised Learning +- Learn a function $f: X \to Y$ that maps inputs to correct outputs using + training examples $(\vx, y)$ where inputs and correct output pairs are known + - Requires labeled data for training + - Measure performance with error on a separate test set + +- **Classification**: output is a discrete label, e.g., + - `Spam` vs `Not Spam` + - Digit recognition `0`, `1`, ... + - Sentiment analysis `Pos`, `Neg`, `Neutral` + +- **Regression**: output is a continuous value, e.g., + - House prices given features like size and location + - House demand + - Stock prices + +- **Common algorithms**: + - Linear Regression + - Decision Trees + - K-nearest neighbors + - Neural Networks + - ... 
+ +* Unsupervised Learning +- Learn from data **without** labeled outputs + - Goal: discover patterns, groupings, or structure in the data + - No explicit feedback signal + - Evaluation can be qualitative + +- **Main techniques**: + - **Clustering**: Group similar examples, e.g., + - Customer segmentation + - Grouping news articles by topic without knowing the topics + - **Dimensionality Reduction**: Reduce number of variables with PCA while + preserving structure + - E.g., visualize high-dimensional data in 2D + - **Density Estimation**: Estimate probability distribution of data + - E.g., anomaly detection in server logs + - **Association Rule Learning**: Discover interesting relations between + variables + - E.g., market basket analysis (e.g., "people who buy X also buy Y") + +- **Common algorithms:** + - K-means + - PCA + - Autoencoders + +* Reinforcement Learning +- Learn by **interacting with an environment** to **maximize cumulative reward** + - Learn policy $\pi(s) \to a$ that maximizes expected reward + - Trade-off between exploration (trying new actions) and exploitation (using + known good actions) + - Environments provide clear rules and feedback (win/loss/reward) + - Often involve physical simulation or real-world interaction + +::: columns +:::: {.column width=60%} +- **Core elements:** + - Agent: Learner and decision maker + - Environment: Everything the agent interacts with + - State $s$ + - Action $a$ + - Reward $r$ + +- **Algorithms:** + - Q-learning + - Policy Gradient methods +:::: +:::: {.column width=35%} +```graphviz +digraph BayesianFlow { + splines=true; + nodesep=1.0; + ranksep=0.75; + + node [shape=box, style="rounded,filled", fontname="Helvetica", fontsize=12, penwidth=1.4]; + + // Node styles + Agent [label="Agent", shape=box, fillcolor="#F4A6A6"]; + Env [label="Environment", shape=box, fillcolor="#B2E2B2"]; + + // Force ranks + //{ rank=same; Agent; Env; } + + // Edges + Agent -> Agent [label=" State", fontcolor=black, 
labeldistance=2.0]; + Agent -> Env [label=" Action", fontcolor=black, labeldistance=2.0]; + Env -> Agent [label=" Reward", fontcolor=black, labeldistance=2.0]; +} +``` +:::: +::: + +* Reinforcement Learning: Examples +- In game playing, learn strategies through trial and error + - E.g., AlphaGo mastering the game of Go +- In robotics, learn control policies for movement and manipulation +- In autonomous driving, learn safe and efficient driving behaviors +- In resource management, optimize allocation of limited resources over time + - E.g., data center cooling or CPU job scheduling +- In personalized recommendations, adapt suggestions based on user interaction + - E.g., newsfeed ranking adjusting based on user clicks +- In healthcare, optimize treatment plans over time + +# ############################################################################## +# Techniques +# ############################################################################## + +## ############################################################################# +## Machine Learning in Practice +## ############################################################################# + +* Machine Learning Flow +- **Question** + - E.g., "How can we predict house prices?" 
+- **Input data** + - E.g., historical data of house sales +- **Features** + - E.g., number of bedrooms, location, square footage +- **Algorithm** + - E.g., linear regression, decision trees +- **Parameters** + - E.g., learning rate, number of trees in a random forest +- **Evaluation** + - E.g., accuracy, precision, recall + +* Machine Learning Flow + +```graphviz[height=80%] +digraph BayesianFlow { + rankdir=LR; + splines=true; + nodesep=1.0; + ranksep=0.75; + node [shape=box, style="rounded,filled", fontname="Helvetica", fontsize=12, penwidth=1.4]; + // Node styles + "Question" [fillcolor="#F4A6A6"]; + "Input data" [fillcolor="#FFD1A6"]; + "Features" [fillcolor="#B2E2B2"]; + "Algorithm" [fillcolor="#A0D6D1"]; + "Parameters" [fillcolor="#A6E7F4"]; + "Evaluation" [fillcolor="#A6C8F4"]; + // Force ranks + // Edges + "Question" -> "Input data"; + "Input data" -> "Features"; + "Features" -> "Algorithm"; + "Algorithm" -> "Parameters"; + "Parameters" -> "Evaluation"; +} +``` + +- **Not all phases are equally important!** + - Question $>$ Data $>$ Features $>$ Algorithm + +- Clarity of the question impacts project success +- Quality and relevance of data are crucial for performance +- Proper feature selection simplifies the model and improves accuracy +- Algorithm is often less important (contrary to popular belief!) + +* Question +- **Make the question concrete and precise** + - Define the problem clearly + - Specify inputs and expected outputs + - Align question with business or research objectives + - E.g.,: + - **Bad**: _"How can we improve sales?"_ + - **Good**: _"What factors most significantly impact sales of product X in + region Y during season Z?"_ + +- Formulating question is **the most important part** of the machine learning +problem + - Misunderstanding leads to: + - Solving the wrong problem + - Collecting wrong data + - ... 
+ +- _"If I were given one hour to save the planet, I would spend 59 minutes + defining the problem and one minute resolving it"_ (Albert Einstein) + +* Input Data +- Ensure **data is specific to prediction** goal + - E.g., use known movie ratings to predict unseen movie ratings from the same + population + - Training set $\approx$ test set + +- Relationship between data and prediction goal is **not always direct** + - E.g., interested in prices but predict supply and demand instead + +- Poor-quality data leads to inaccurate predictions + - _"Garbage in - garbage out"_ + +- Recognize **when data is insufficient** for valid answers + - _"Combination of data and desire for an answer does not ensure a reasonable + answer can be extracted"_ (John Tukey) + +- **More data vs better models** + - Meta-studies show difference between generic and best model is like + 5\% + - _"It's not who has the best algorithm that wins. It's who has the most + data"_ (Google researcher) + - _"Every time I fire a linguist, the performance of the speech recognizer + goes up"_ (IBM researcher in speech recognition) + +* Features +- **Features** provide high-level information about inputs + - E.g., use intensity and symmetry for scanned numbers instead of raw bit maps + +- **Characteristics of good features**: + 1. Enable data compression + 2. Retain relevant information + 3. Often created with expert knowledge + +- **Common mistakes in feature building**: + 1. Automating feature selection may lead to overfitting + - Black box predictions can be accurate but stop working anytime + - E.g., Google Flu's unclear feature-model link + 2. Ignoring data-specific quirks + - E.g., mislabeling outliers + 3. 
Unnecessarily discarding information + +* Models +- Best models are: + - **Interpretable** + - Allow users to understand and trust the model's decisions + - E.g., decision trees are appropriate in medical studies since they produce + a "reasoning" + - **Simple** + - Easier to implement and maintain + - Reduces the risk of overfitting + - **Accurate** + - Often accuracy is traded off for remaining characteristics + - E.g., accuracy vs interpretability, accuracy vs speed + - **Fast** + - To train and test + - Essential for real-time applications + - Reduces computational costs + - **Scalable** + - Can handle large datasets efficiently + - Important for growing data and user bases + - E.g., in the Netflix prize, Netflix didn't end up implementing the best + algorithm since it wasn't scalable enough + +## ############################################################################# +## How to Do Research +## ############################################################################# + +### ############################################################################ +### Simple Is Better +### ############################################################################ + +* Occam'S Razor +- _The **simplest** model that fits the data is also the **most plausible**_ + (Occam) + - Trim the model to the bare minimum necessary to explain the data + - _"An explanation of the data should be as simple as possible, but not + simpler"_ (Einstein?) 
+ - **Simple** means: + - Less likely to fit a given data by coincidence + - An unlikely event is more significant if it happens (formalized in terms + entropy) + - **Better** means better out of sample performance + +- An object is **simple** when it is one of few possible objects + - Polynomial of order 2 is simpler than a polynomial of order 17 + - There are many more polynomials of order 17 compared to order 2, although + both are infinite sets + - SVM (Support Vector Machine) characteristics: + - The separating hyperplane appears wiggly, but it is defined by a few + support vectors + - Complexity of a hypothesis $h$ + - E.g., polynomial order, MDL (describe the hypothesis in terms of bits), + Kolmogorov complexity + - Complexity of a hypothesis set $\calH$ + - E.g., VC dimension of the model + - Complexity of $h$ and $\calH$ are related by counting: if we need $l$ bits + to specify $h$, then $h$ is one of $2^l$ elements of a set $\calH$ + +* Model Soundness +- We cannot blindly accept the result of modeling + - A model should tell a story + - Always ask yourself: _"what criticisms would you give to the model if it was + presented to us for the first time?"_ + +- Benchmark models: what are the performance if the model outputs: + - Outputs always 0 or 1 + - E.g., long-only model for stock predictions + - Random results + - I.e., bootstrap of null hypothesis "there is no prediction power" + +- A perfect fit can mean nothing, e.g., + - Get 2 data points on a plane + - Fit data with a linear relationship + - It is a perfect fit + - This means nothing since: + - There is always a line between 2 points + - The data cannot falsify the hypothesis + - The model (line) is too complex for the data set (only 2 points) + +* Sampling Bias +- A model, when learning, sees the world in terms of the training data + - If data is sampled in a biased way, learning will produce a biased outcome + +- Formally: one of the few hypothesis of Hoeffding in learning theory is that + 
training and testing distributions are the same + +- Addressing sampling bias + - Weight or re-sample data to match testing distribution + - If data points have zero probability ($\Pr = 0$) in the data set, no + remedies are possible + +* Data Snooping +- **Data snooping** is the improper use of data that biases ML model results + - Common trap for practitioners + +- **Sources** of data snooping + 1. Contamination of training and test sets + 2. Multiple testing issue + 3. If data affects any learning step (e.g., feature engineering, model + selection, hyperparameter tuning), its assessment becomes optimistic + +- **Effects** of data snooping + - Models show inflated performance metrics which do not translate out of + sample + - Snooping leads to seemingly better performance: + - It is a "happy minefield" + +* "Burning the Test Set" +- Repeatedly using the same data eventually leads to "success" + - The model starts fitting to specific data quirks + - The test set should not be used for training; this leads to over-optimism + - _"If you torture the data long enough, it will confess whatever you want"_ + +- Solutions: + - Use the test set _exactly once_ + - The VC dimension applies to the overall learning model, including all + attempted models + - MDL accounts for the number of fitting attempts in overfitting measurement + - Adjust p-values for multiple experiments + +### ############################################################################ +### Research Methodology +### ############################################################################ + +* How to Achieve Out-Of-Sample Fit +- Goal: choose an hypothesis $g$ approximates the unknown target hypothesis $f$ + + $$ + g \approx f \iff E_{out}(g) \approx 0 + $$ + +- Solution: + - Achieve + 1. Good in-sample performance $E_{in}(g) \approx 0$ + 2. Good generalization $E_{out}(g) \approx E_{in}(g)$ + - Then 1. + 2. 
$\implies$ good out-of-sample performance + $E_{out}(g) \approx 0$ + +* What to Do If Out-Of-Sample Fit Is Poor? +- The model performs well in sample ($E_{in} \approx 0$) but poorly out of + sample ($E_{out} \gg E_{in}$) + +- What does it mean? + - The in-sample performance are optimistic + - The model is overfitted and fails to generalize + +- What do we do? + - Run diagnostics before embarking in long term projects + - Gain insight on what works / does not work to understand how to improve + performance + - E.g., bias-variance curves and learning curves + +- How to fix? + - It depends on the diagnostics! + 1. Training data + - Get more training data (it can take long time) $\iff$ fixes high variance + 2. Features + - Remove features $\iff$ fixes high variance + - Add more features (it can take long time) $\iff$ fixes high bias + - Add derived features (e.g., polynomial features) $\iff$ fixes high bias + 3. Regularization + - Decrease regularization amount $\lambda$ $\iff$ fixes high bias + - Increase regularization amount $\lambda$ $\iff$ fixes high variance + +* Why Using a Lot of Data? +- Several studies show that: + - Different algorithms/models have remarkably similar performance + - Increasing training set improves performance + +- Thus it holds that: + + $$ + \text{High capacity model + massive training set = good performance} + $$ + +- Using a high capacity model with many parameters (e.g., neural network) + $$ + E_{in} \approx 0 + $$ + due to low bias (and high variance) +- A massive data set helps avoid overfitting + $$ + E_{out} \approx E_{in} + $$ +- These two conditions together + + $$ + E_{out} \approx E_{in} \approx 0 \implies E_{out} \approx 0 + $$ + +* What to Do When One Has Lots of Data? +- You have $m$ = 100M examples in data set, what do you do? + +- Training on a lot of data might yield scalability issue: + - Slow + - Lots of compute + - Require work on infrastructure + - ... 
Plot the learning curves as a function of increasing
+ - Like thinking $n$ moves ahead in chess + - E.g., _"Next, we will conduct a detailed analysis on the demographics + contributing most to sales growth"_ + - Outline potential experiments or analyses to validate findings further + +* Example of Spam Filter Classification +- We use $N = 4$ words in an email to distinguish spam from non-spam emails + using logistic regression + - Words can be: `buy`, `now`, `deal`, `discount`, `` + +- How to improve the performance of this classifier? + 1. Collect more data + - E.g., honeypot project: set up fake email account and collect spam + 2. Use better features + - E.g., email routing information: spammers use unusual accounts and mask + emails as legitimate + 3. Use better features from message body + 4. Detect intentional misspellings + - Spammers use misspelled words (e.g., `w4tch` for `watch`) to confuse the + classifier + - Use stemming software + +* Right and Wrong Approach to Research + +- **Bad** + 1. It is not clear how to prioritize the different possible tasks + 2. Use gut feeling and pick one task + 3. Complete the task + 4. Re-evaluate performance + +- **Good** + 1. Build a simple algorithm + - Within 1 day + 2. Set up the performance evaluation framework + - A single number and bounds to evaluate + - Aim to improve that number + - Evaluate with cross-validation + 3. Set up diagnostic tools + - Compute learning and bias-variance curves + - Avoid premature optimization by understanding the issue before fixing it + 4. Manually review misclassified emails in the cross-validated set + - What features might help to improve performance? + - E.g., what types of emails are misclassified? 
+ +- Sometimes an approach must be tried to see if it works + - E.g., stemming software to consider certain words equivalent + +## ############################################################################# +## Pipeline Organization +## ############################################################################# + +* How Are Machine Learning Systems Organized? +- Machine learning systems are typically organized in a pipeline + 1. Break down the problem into sub-problems + 2. Solve problems one at the time + 3. Combine the solutions to the sub-problems into a solution to the initial + problem + +- The performance $p$ of the entire ML pipeline are given by: + + $$ + p_{system} = \sum_i p_i \cdot \alpha_i + $$ + + where: + - $p_i$ is the performance of each stage $p_i$ + - $\alpha_i$ is the importance of each stage + +* ML Pipeline: Example of Photo OCR System +- Goal: build systems to read text in a picture + - OCR = "Optical Character Recognition" + +- Stages of ML pipeline for OCR: + - Text detection: find areas of the picture with text + - Character segmentation: split text into boxes, one per letter + - E.g., `h e l l o` + - Character classification: classify characters, one at a time + - Spelling correction: fix errors in text using context + - E.g., `hell0` corrected to `hello` + +- Issues with text detection: + - Unknown text location and size + +- Solution + - Use a sliding window classifier + - Works as evaluating a classifier is often cheap compared to training + - Sliding window classifiers can be used for text detection and character + segmentation + +- **Text detection** + - Train a classifier to recognize letters vs non-letters + - Scan image in two directions, different sizes looking for text + - Create a map of text likelihood (e.g., heatmap) using classifier + probabilities + - Enclose text areas in boxes + - Discard boxes not fitting aspect ratio (valid text width > height) + +- **Character segmentation** + - Use sliding window classifiers to 
find "breaks" between characters + - Use a 1D sliding window for character segmentation + +* The Ideal Recipe for ML +- The ideal recipe for ML is: + + $$ + \text{low-bias algorithm + massive amount of data to train} + $$ + - Use learning curves to make sure we are taking advantage of more data + +- Always ask yourself: _"how much work is to get 10x more data than we currently + have?"_ +- Often it is not that difficult: + 1. Artificial data + - E.g., synthesize or amplify data set + 2. Collect and label by hand + - E.g., crowd sourcing like Amazon Mechanical Turk + +* OCR Pipeline: Example of Artificial Data Synthesis +- How can we increase data set size? + 1. Synthesize data set + - Use font libraries to generate large training sets + - Paste characters against random backgrounds + - Apply scaling, distortion, adding noise, etc + 2. Amplify a data set + - Start from a training set and add examples by warping/distorting existing + examples + +- Transformations and noise should be specific to the application domain + - E.g., Gaussian noise is not always appropriate + +* Ceiling Analysis for ML Pipeline +- The most valuable resource is time + - Sometimes one works on an optimization for months + - The optimization doesn't make much difference + +- **Problem**: On which part of the pipeline should time/resource be spent? + +- **Solution**: Ceiling analysis + - Technique to analyze performance of pipelines + - Have a single number representing the performance of the entire system + - E.g., accuracy for an OCR system + - For each component: + - Mock the component with a box that always gives the correct output + (=oracle) + - Leave the remaining components untouched + - Compute performance of the entire pipeline + - Understand which component is critical to performance by estimating an upper + bound for overall performance when that component improves 10\% + - Don't trust your gut feeling but measure! 
+ +## ############################################################################# +## Input Processing +## ############################################################################# + +// TODO(gp): Add something + +- Data cleaning +- Dimensionality reduction +- Feature engineering + +## ############################################################################# +## Learning Algorithms +## ############################################################################# + +### ############################################################################ +### Gradient Descent +### ############################################################################ + +* The Problem of Minimizing a Function + +::: columns +:::: {.column width=80%} +- **Goal**: minimize a function $J(\vw)$ + - E.g., in-sample error $E_{in}(\vw)$ + +- **Solutions**: + 1. Analytical solution + - Impose the gradient of $J(\vw)$ to equal 0 + - Find a closed-form solution for $\vw^*$ + 2. Numerical solution: + - Use an iterative method to update $\vw$ to reach the minimum value of + $J(\vw)$ + - E.g., gradient descent + - It works even if there is an analytical solution +:::: +:::: {.column width=20%} + ![](lectures_source/figures/Lesson6_Gradient_descent_2.png) +// TODO: Convert in Tikz or improve +:::: +::: + +* Gradient Descent: Intuition + +::: columns +:::: {.column width=80%} +- **Problem**: + - We are on a hilly surface and we want to walk down to the bottom of the hill + +- **Solution**: + - At each point: + - We look around + - We move a step in the direction where the surface is steepest + - We keep doing until we reach the bottom + +- Gradient descent + - Is a general technique for minimizing a twice-differentiable function + - Converges to + - A local minimum in general + - The global minimum if $J(\vw)$ is convex (e.g., logistic regression and + linear models) +:::: +:::: {.column width=20%} +![](lectures_source/figures/Lesson6_Gradient_descent_1.png) +// TODO: Convert in Tikz or 
improve +:::: +::: + +* Gradient descent with fixed learning rate (1/3) +// TODO: Add images from tutorial +- Consider the contour plot of a function + +- Start from a point $\vw(0)$ (random, the origin, ...) + +- At each step, move a fixed amount $\eta$ in the weight space (fixed learning + rate): + + $$ + \vw(t + 1) = \vw(t) + \eta \hat{\vvv} + $$ + + where $\hat{\vvv}$ is a unit vector + +- Pick $\hat{\vvv}$ to move to a value of $E_{in}(\vw)$ as negative as possible + + - The change for $E_{in}$ is: + \begingroup \footnotesize + \begin{align*} + \Delta E_{in} + & = E_{in}(\vw(t + 1)) - E_{in}(\vw(t)) + & \\ + & = E_{in}(\vw(t) + \eta \hat{\vvv}) - E_{in}(\vw(t)) + & \text{ (replacing the expression of $\vw(t + 1)$)} \\ + & = \eta \nabla E_{in}(\vw(t))^T \hat{\vvv} + O(\eta ^ 2) + & \text{ (using Taylor expansion)} \\ + \end{align*} + \endgroup + - Gradient descent keeps only $O(\eta)$ the term and ignores the rest + - Conjugate gradient considers up to $O(\eta^2)$ and ignores higher + infinitesimals + +* Gradient Descent with Fixed Learning Rate (2/3) + +- The minimal value of the scalar product + - Is $- \eta \|\nabla E_{in}(\vw(t))\|$, + - Happens when + $\hat{\vvv} = - \frac{\nabla E_{in}(\vw(t))}{\|\nabla E_{in}(\vw(t))\|}$ +- The change in weights is: + $$ + \Delta \vw = - \eta \frac{\nabla}{\|\nabla\|} + $$ +- It is called "gradient descent" since we descend along the gradient of the + function to optimize + +* Gradient Descent with Fixed Learning Rate (3/3) +- Each component of the weight $\vw$ is updated with the partial derivative with + respect to that coordinate: + \begin{align*} + \vw(t + 1) + &= \vw(t) - \eta \hat{v} \\ + \vw(t + 1) + &= \vw(t) - \eta \frac{\nabla E_{in}(\vw(t))}{\|\nabla E_{in}(\vw(t))\|} \\ + w_j(t + 1) + &= w_j(t) - \eta + \frac{1}{\|\nabla E_{in}(\vw(t))\|} + \frac{\partial E_{in}(\vw)}{\partial w_j} + \end{align*} + +- The update of all components should be simultaneous, i.e., computed at once + +- A step of the optimization 
when we update the solution (weights) is called + epoch + +* Gradient Descent: Stopping Criteria +- In theory, stop when $\Delta E_{in} = \vv{0}$ + - Numerically this might not occur +- In practice, stop when + - The variation of $E_{in}$ is smaller than a given threshold + $\Delta E_{in} < \theta$ + - We have reached a certain number of iterations + +- Monitoring gradient descent + - In theory, only need to compute the derivatives of the function $J(\vw)$ to + optimize + - In practice, need to monitor the algorithm progress by recomputing the cost + function $J(\vw)$ periodically to make sure it is decreasing + +* Setting $\eta$ in Gradient Descent with Fixed Learning Rate +- Consider a 1D convex function + - If $\eta$ is small: + - The linear approximation of $E_{in}$ is effective + - Many steps are needed to converge to the minimum + - If $\eta$ is large: + - The linear approximation fails (higher terms affect values) + - It "bounces around" + +![](lectures_source/figures/Lesson6_Gradient_descent_3.png) + +- Idea: vary learning rate $\eta$ during gradient descent + - Smaller learning rates may find a better minimum + - Reduce $\eta$ as a function of iterations + - Cons: this introduces an additional parameter to tune + +* Gradient Descent with Variable Learning Rate +- In gradient descent with fixed learning rate (i.e., constant change in weight + space), we use: + + $$ + \Delta \vw = - \eta \frac{\nabla J}{\|\nabla J\|} + $$ + +- To converge quickly, we want to: + - Move fast in weight space (large $\eta$) when the surface is steep (large + gradient) + - Move slow in weight space (small $\eta$) near the minimum to avoid bouncing + around (small gradient) + +- Ideally, $\eta$ should increase with the slope: $\eta \propto \|\nabla J\|$ + +- This is called gradient descent with variable learning rate: + + $$ + \Delta \vw = - \eta \nabla J + $$ + +* Feature Scaling in Gradient Descent +- Gradient descent converges faster if features are scaled to the same range + - 
Feature scaling techniques include min-max scaling and standardization + - E.g., applying standardization to a dataset can transform feature values to + have a mean of 0 and a standard deviation of 1 + +- Otherwise, different gradient components have different errors due to + numerical approximation, causing the gradient to bounce around + - Unscaled features can lead to slow and unstable convergence due to varying + magnitudes + - E.g., if one feature ranges from 1 to 1000 and another ranges from 0.01 to + 1, the large disparity can cause inefficient updates + +* Issues with Batch Gradient Descent +- Consider the case of squared error with $n$ samples + $$ + E_{in}(\vw) + = \frac{1}{n} \sum_i e(h_{\vw}(\vx_i) - y_i) + = \frac{1}{n} \sum_i (h_{\vw}(\vx_i) - y_i) ^ 2 + $$ +- The Batch Gradient Descent (BSD) requires to update each component of the + weight vector with an expression like: + $$ + \vw(t + 1) = \vw(t) - \eta \frac{\nabla E_{in}}{\|\nabla E_{in}\|} + $$ +- In terms of coordinates for squared error: + $$ + w_j(t + 1) = w_j(t) - + \eta \frac{2}{n} + \sum_{i=0}^n (h_{\vw}(\vx_i) - y_i) + \frac{\partial h_{\vw}(\vx_i)}{\partial w_j} + $$ +- With a large number of training examples (e.g., $N = 10^6$), gradient descent: + - Is computationally expensive as it requires evaluating the gradient from all + examples for a single update + - Requires storing all the data in memory + +### ############################################################################ +### Stochastic Gradient Descent +### ############################################################################ + +* Stochastic Gradient Descent +- **Idea** of Stochastic Gradient Descent (SGD) + - Update the weights only for one training example picked at random + +- **Algorithm** + - Pick one $(\vx_n, y_n)$ at a time from the available examples + - Compute $\nabla e(h(\vx_n), y_n)$ to update the weights: + $$ + \Delta \vw = -\eta \nabla e + $$ + - Update the weight considering only one random example: + 
$$ + w_j(t + 1) = w_j(t) - + \eta \frac{2}{n} + (h_{\vw}(\vx_t) - y_t) + \frac{\partial h_{\vw}(\vx_t)}{\partial w_j} + $$ + +- $\nabla e$ is a function of a random var $\vx_n$ + - The average direction of SGD is the same direction as batch version + $$ + \EE[\nabla e] + = \frac{1}{N} \sum \nabla e(h(\vx_n), y_n) + = \nabla \frac{1}{N} \sum e(h(\vx_n), y_n) + = \nabla E_{in} + $$ + +- In Stochastic Gradient Descent (SGD): + - The path in the weight space is more random + - It does not even converge but rather oscillates around the local minimum + +* Mini-Batch Gradient Descent +- Bring together characteristics of both Batch and Stochastic Gradient Descent + +- Use $b$ examples to make an update to the current weight + - $b$ represents the batch size, which is a hyperparameter you can choose + - A common choice for $b$ might be $b = 32$ or $b = 64$ + +- Mini-batch GD offers a balance between SGD noisiness and full-batch + approaches, using small, random data samples for updates + +* On-Line Learning and Gradient Descent +- Continuous stream of training examples requires updating the model + - In real-time systems, new data points arrive and the model adapts without + fully retraining + - E.g., in stock market prediction models, each transaction can dynamically + adjust model weights + - Handle variation in the dynamics of the underlying process + +- Stochastic gradient (SGD) and mini-batch descent are suitable for online + learning, updating the model one example at a time + +- Discard examples for a "compressed" model representation + - Useful for large data streams where storing every data point is impractical + - E.g., in training a language model on live chat data, older conversations + might be discarded after updates to maintain relevant patterns in the model + +* SGD vs BGD vs Mini-Batch +- To update the weights: + - BGD (batch gradient descent) uses all the training examples + - SGD (stochastic gradient descent) uses a single (random) training example + - 
Mini-batch GD uses only a subset of training examples + +\begingroup \scriptsize + +| **Aspect** | **Batch Gradient Descent** | **Stochastic Gradient Descent** | +| --------------- | ---------------------------------------- | ------------------------------------ | +| Computation | Uses all examples | One example at a time | +| Memory | Requires all examples in memory | Require less memory | +| Randomization | More likely to terminate in flat regions | Avoid local minima due to randomness | +| Regularization | No implicit regularization | Oscillations act as regularization | +| Parallelization | Can be parallelized | Less parallel-friendly | +| Online Learning | Not suitable | Suitable for online learning | + +\endgroup + +* Map-Reduce for Batch Gradient Descent +- In map-reduce we use $k$ machines to parallelize the summation (map step) and + then we send the $k$ partial sums to a single node to accumulate the result + (reduce step) +- Batch GD (and many learning algorithms) can be expressed in this map-reduce + form + +// TODO: Add a tikz picture + +* Coordinate Descend +- Minimize $J(x_0, ..., x_n)$ by optimizing along one direction $x_i$ at a time + - Instead of computing all derivatives + +- **Algorithm** + - Pick a random starting point $\vw(0)$ + - Pick a random order for the coordinates $\{ x_i \}$ + - Find the minimum along the current coordinate (1D optimization problem) + - Move to the next coordinate $x_{i+1}$ + - The sequence of $\vw(t)$ is decreasing + - A minimum is found if there is no improvement after one cycle of scanning + all coordinates + - The minimum is local + +* Gradient Descent vs Pseudo-Inverse for Linear Models +- For linear models we can use either pseudo-inverse or gradient descent to find + optimal $\vw^*$ + +- **Gradient descent** + - Choose learning rate $\eta$ + - Requires many iterations to converge + - Monitor stopping criteria, oscillations, etc + - Effective for many features $P$ + +- **Pseudo-inverse** + - No parameter 
selection needed + - Converges in one iteration (with nested loops) + - Computes $(\mX^T \mX)^{-1}$, a $P \times P$ matrix + - Inverse complexity $O(P^3)$ + - E.g., for $P \approx 10,000$, gradient descent is preferable + +## ############################################################################# +## Performance Metrics +## ############################################################################# + +* How to Make Progress in ML Research +- There are many possible directions for research + - Different features + - Different data preprocessing methods + - Different models + - Different training algorithms + - Different evaluation techniques + - Explore optimization strategies + +- What to do? + +- Approach + - Evaluate models systematically using a single number + - Implement metrics (E.g., accuracy, F1 score) for insight + - Use cross-validation for model validation + - Statistical tests to ensure differences are not random + - Utilize hypothesis testing for genuine improvements + - Conduct A/B testing for real-world validation + +* How to Measure Classifier'S Performance? 
+- Success / hit / win rate (or error / miss rate) + - Measures the proportion of correct predictions by the model + - Important for understanding overall accuracy + - E.g., in binary classification, 80 correct predictions out of 100 result in + an 80% success rate + +- Log probability / cross-entropy error + - Evaluates classification model with probabilities between 0 and 1 + - E.g., lower cross-entropy loss indicates better performance + +- **Precision / recall / F-score** + - Useful for evaluating models in imbalanced data scenarios + - Precision: ratio of correctly predicted positive observations to total + predicted positives + - E.g., a precision of 0.75 means 75% of identified positives are true + positives + - Recall: ratio of correctly predicted positive observations to actual + positives + - E.g., a recall of 0.60 means 60% of actual positives were correctly + identified + - F-score: weighted harmonic mean of precision and recall + +- **Utility function** + - Customizes the evaluation metric to prioritize types of errors and success + - E.g., true / false positives / negatives + - E.g., in medical diagnosis, a utility function might give higher weight to + minimizing false negatives to prevent missed diagnoses + +* Training vs Test Set +- Performance on train set $E_{in}$ is an optimistic estimate of $E_{out}$ + - One can have: + - 0\% error rate on training data (e.g., memorizing responses for training + set) + - 50\% error rate on test set (e.g., by answering randomly) + +- To evaluate model performance, use a test set that played no role in training + +- Training and test sets should be representative samples of the problem + - E.g., credit risk problem + - One cannot use data from a bank branch in Florida to assess a model built + with data from a bank branch in New York + - Characteristics of the populations are very different + +* Lots of Data Scenario vs Scarce Data Scenario +- **Lots of data scenario** + - Ideal to have lots of data (ideally 
infinite) + - Learn on lots of data + - Fit all degrees of freedom of a complex model + - Predict on lots of data + - Assess precise out-of-sample performance + +- **Scarce data scenario** + - Often data (especially data of high quality) is scarce + - E.g., facial recognition datasets with limited annotated data needing + careful management + - Cannot use all data as a training set + - Need to hold out data to estimate performance metrics and bounds + - Split the data 70-30 or 80-20 in train and test sets + - Consider cross-validation techniques to maximize data usage + - Other approaches: + - Augment data artificially, like data augmentation in image processing + - Utilize transfer learning with pre-trained models on related tasks + +// TODO: Add plot for the splitting of data + +* Splitting Data Into Training, Validation, Test Sets +- Training, validation, and test sets must be: + - Distinct + - Representative of the problem + - E.g., each class in all sets must be represented according to the original + data + - Sized based on available data and problem needs + +- To ensure sets have the same distribution: + - Stratified sampling + - E.g., each class label is proportionally represented in each set + - Shuffle and then sample + - Achieves randomization, maintaining distribution + - Sample and check statistics of variables (e.g., mean, std dev, PDF) + - Compare these statistics to ensure each set mirrors the broader dataset + +* Rule of Thumbs for Data Set Splits +- If $n$ is **large** $\to$ use a 60-20-20 split + - Training: 60% + - Validation: 20% + - Test: 20% + +- If $n$ is **medium** $\to$ use a 60-40 split + - Training: 60% + - Test: 40% + - Not possible to learn hyperparameters, so no validation set + +- If $n$ is **small** $\to$ use cross-validation and report "small data size" + - Use K-fold cross-validation + - Be cautious of the increased chance of high accuracy by chance + - Is machine learning for the given sample size even suitable? 
+ +* Can We Ever Use Test Set as Training Set? +- Once the model is selected and validated, reuse all available data (including + the test set) to generate the model for deployment + - This ensures the model benefits from all available information + +- Generally, more data is better, though returns diminish after exceeding a + certain volume + - Initially, increasing data size can significantly improve model performance + - Eventually, adding more data results in smaller accuracy gains and may not + justify the increased computational cost + +* In-Sample vs Out-Of-Sample Error Expressions +- We want to find a function $h$ that approximates the unknown function $f$, + $h \approx f$ over the space of inputs $\vx \in \calX$ ("script X") + +- The error is usually defined point-wise: + + $$ + e(h(\vx_i), f(\vx_i)) + $$ + - E.g., + - Squared error: $e(\vx) = (h(\vx) - f(\vx))^2$ + - 0-1 binary error: $e(\vx) = I[h(\vx) == f(\vx)]$ + - Log probability: $e(\vx) = - \log( \Pr(h(\vx) == f(\vx)) )$ + +- In-sample error is computed using all points in the training set: + + $$ + E_{in}{(h)} = \frac{1}{N} \sum_{i=1}^N e(h(\vx_i), f(\vx_i)) + $$ + +- Out-of-sample error is computed on the entire space of inputs $\calX$ + + $$ + E_{out}(h) = \EE_{\vx \in \calX}[e(h(\vx), f(\vx))] + $$ + +* Mean Squared Error (MSE) +- MSE is the average difference of squared error: + + $$ + \text{MSE} + \defeq \frac{1}{N} \sum_{i=1}^N (h(\vx_i) - f(\vx_i))^2 + $$ + - MSE measures the estimator quality, quantifying the difference between + estimated and actual values + - E.g., in a house price prediction model, MSE determines how close predicted + prices are to actual prices + +- **Cons:** + - It doesn't share the unit of measure with the output + - Distorts error interpretation; predicted and actual values are usually in + different units + - Sensitive to outliers + - A single large error can disproportionately affect the MSE + - Use median absolute deviation (MAD), median of squared error for + 
robustness against outliers + +* Root Mean Squared Error (RMSE) +- RMSE is the standard deviation of the Mean Squared Error (MSE): + + $$ + \text{RMSE} \defeq \sqrt{\text{MSE}} + = \sqrt{\frac{1}{N} \sum_{i=1}^N (h(\vx_i) - f(\vx_i))^2} + $$ + +- **Pros:** + - Same units as the output, allowing intuition of its magnitude compared to + the mean + - Facilitates comparison between different data sets or models since the + metric is normalized to the output's scale + +- **Cons:** + - Sensitive to outliers (like MSE) which can excessively affect the metric + - May not be suitable for ranking models when outliers or skewed distributions + are present + +* Median-Based Metrics +- We can use metric based on median (i.e., the 0.5 quantile of absolute error): + +- Median absolute deviation: + + $$ + \text{MAD} \defeq \text{median}_i(|h(\vx_i) - f(\vx_i)|) + $$ + +- Median squared error: + + $$ + \defeq \text{median}_i(|h(\vx_i) - f(\vx_i)|^2) + $$ + +* How to Choose an Error Measure? + +- Error measure depends on the **application** and should be **specified by the + "customer"**: + - The customer needs to define what constitutes an acceptable level of error + for their specific use case + - E.g., medical applications might have a low tolerance for errors, while a + recommendation system might have a higher tolerance + +- Otherwise, we can pick: + - A **plausible error measure**: + - E.g., squared error is commonly used when assuming Gaussian noise in the + data + - A **"friendly error" measure**: + - E.g., measures that allow for closed-form solutions simplify calculations + significantly + - Convex optimization-friendly measures ensure optimization algorithms find + the global minimum easily + +* Error Measures: Fingerprint Verification Example +- In fingerprint verification: + - Recognizing a valid fingerprint has no error + - Otherwise, it is a false positive or a false negative + +- Error weight depends on the application + - For the same problem in two set-ups, the 
error measure is the opposite + - For supermarket applications: + - False positives are minor (e.g., one more discount) + - False negatives are costly (e.g., annoyed customer, slow line) + - For CIA building access: + - False negatives are acceptable (triggers further security) + - False positives are disastrous + +### ############################################################################ +### Precision and Recall +### ############################################################################ + +* Error Metrics for Skewed Classes +- When classes are skewed (i.e., one class is very rare), accuracy can be + misleading + - Use metrics like confusion matrix, precision, and recall + +- Example: + - Train a classifier to distinguish tumors as: + - $y = 1$: malignant + - $y = 0$: benign + - Classifier's error rate is 1% (i.e., guess correctly 99% of the time) seems + good + - But only 0.5% of patients have cancer + - A trivial classifier that always outputs $y = 0$ has a 0.5% error rate! 
+ - Now a 1% error rate does not look good anymore + +* Decision Matrix ::: Columns :::: {.Column Width=60%} +- Aka confusion matrix + +- Typically $y = 1$ encodes the rare class to predict + +- Assuming actual and predicted class $\in \{0, 1\}$, we have 4 possible cases: + - $act = 1$, $pred = 1$: true positive (TP) + - $act = 0$, $pred = 0$: true negative (TN) + - $act = 1$, $pred = 0$: false negative (FN) (output $pred = 0$, but it is + wrong) + - $act = 0$, $pred = 1$: false positive (FP) (output $pred = 1$, but it is + wrong) + +- Aggregate decision matrix in precision and recall +:::: +:::: {.column width=35%} + +```tikz +% Draw matrix +\draw[thick] (0,0) rectangle (4,4); +\draw[thick] (0,2) -- (4,2); % horizontal middle +\draw[thick] (2,0) -- (2,4); % vertical middle + +% Labels for actual class +\node[rotate=90] at (-0.8,3) {act = 1}; +\node[rotate=90] at (-0.8,1) {act = 0}; + +% Labels for predicted class +\node at (1,4.3) {pred = 1}; +\node at (3,4.3) {pred = 0}; + +% Cell labels +\node at (1,3) {\textbf{TP}}; +\node at (3,3) {\textbf{FN}}; +\node at (1,1) {\textbf{FP}}; +\node at (3,1) {\textbf{TN}}; +``` +:::: +::: + +* Precision vs recall +- Assume that $y = 1$ encodes the rare event we want to detect + +- **Precision** measures how often there is a true positive _given that pred = + 1_ + + \begingroup \small + + $$ + \text{precision} + \defeq \Pr(\text{TP} | \text{pred == 1}) + = \frac{|\text{pred == 1} \land \text{act == 1}|}{|\text{pred == 1}|} + = \frac{\text{TP}}{\text{TP} + \text{FP}} + $$ + + \endgroup + +- **Recall** measures how often there is a true positive _given that act = 1_ + \begingroup \small + + $$ + \text{recall} + \defeq \Pr(\text{TP} | \text{act == 1}) + = \frac{\text{TP}}{|\text{act == 1}|} + = \frac{\text{TP}}{\text{TP} + \text{FN}} + $$ + + \endgroup + +- Both are conditional probability measuring the fraction of TP under different + circumstances: + - (Pre)cision: pred = 1 + - Rec(a)ll: act = 1 + +- Precision/recall are widely 
used in information retrieval + - E.g., a search engine: + - Returns 30 pages; only 20 are relevant $\implies$ precision = 20 / 30 = 2 + / 3 + - Fails to return another 40 relevant pages $\implies$ recall = 20 / + (40 + 20) = 20 / 60 = 1 / 3 + +* Precision / Recall in Terms of Quality / Quantity +- **Precision** + - Increasing precision means when we predict 1, we are more likely to be right + - E.g., in a spam email detection system, "precision is 90%" means 90% of + the emails marked as spam are actually spam + - A higher precision indicates fewer false positives + - Measures "quality" of prediction + +- **Recall** + - Increasing recall means we predict more instances when the outcome is 1 + - E.g., in a spam email detection system, "recall is 80%" indicates 80% of + all actual spam emails were correctly identified as spam + - A higher recall means fewer false negatives + - Measures "quantity" of prediction (coverage) + +* Precision / recall for trivial classifiers +- A classifier that outputs always the most common class 0 has: + \begin{alignat*}{3} + \text{precision} + & = 0 + & \text{(since TP = 0)} + \\ + \text{recall} + & = 0 + & \text{(since TP = 0)} + \\ + \end{alignat*} +- A classifier that outputs always the rare class 1 has: + \begin{alignat*}{3} + \text{recall} + & = 1 + & \text{(since FN = 0)} + \\ + \text{precision} + & \defeq \Pr(\text{TP} | \text{pred == 1}) + & \text{(by definition)} + \\ + &= \frac{\text{TP}}{\text{TP + FP}} + & \text{($TP + FP = n$ because} + \\ + &= \frac{\#(y = 1)}{n} + & \text{classifier always emits 1)} + \\ + &= \Pr(\text{pos}) \approx 0 + & \text{(the positive class is very rare)} + \\ + \end{alignat*} +- A trivial classifier has precision or recall close to 0 + +* Trading Off Precision and Recall +- In theory, we want to increase both precision and recall + +- In practice, modify the threshold of a probabilistic classifier to trade off + precision and recall in practice + +- E.g., use logistic regression to predict 
cancer: + - With a threshold = 0.5, the classifier has: + - Precision = $\frac{\text{TP}}{|\text{pred == 1}|}$ + - Recall = $\frac{\text{TP}}{|\text{act == 1}|}$ + - Increase the threshold $\implies$ output 1 only if more confident, i.e., + increase precision + - Decrease the threshold $\implies$ output 1 more often, decreasing the + chances of missing a possible case of cancer, i.e., increase recall + +// TODO: Pic from notebook + +* Precision-Recall: Pros / Cons +- Pros: + - Give insight on the behavior of a classifier (e.g., confusion matrix) + - Avoid mistaking a trivial classifier for a good classifier + +- Cons: + - We have two different numbers, thus it is difficult to compare classifiers + to each other + - Solutions: F-score, AUC + +* Precision-Recall Curves +- Related to (but distinct from) **ROC curves**, which plot true positive rate + vs false positive rate + +- Plot the curve on a precision-recall plane: ($y =$ precision, $1 - x =$ + recall) to show the precision vs recall trade-off for a classifier + - E.g., changing the threshold of logistic regression + +- A curve higher than another means a better classifier, since for the same + recall we can get a higher precision + - The best classifier (precision = recall = 1) is in the top-right corner + +- The precision-recall plot can have different shapes, e.g., + - Diagonal (pure luck) + - Convex up (better than luck) + - Convex down (worse than luck) + +// TODO: Pic from notebook + +* Area Under the Curve +- **AUC** is the area under the precision-recall curve + - Provides a robust metric by integrating over all thresholds + - Higher AUC indicates better performance in differentiating between classes + - AUC = 0.5 suggests no discriminative power, similar to random guessing + - AUC closer to 1.0 indicates high performance + +- **Pros**: + - Single number summarizing classifier behavior, useful for comparing + different models + - Does not require selecting a threshold for performance calculation + - Can handle imbalanced datasets effectively + +- E.g., consider a classifier for 
medical diagnosis + - The AUC helps understand how well the model distinguishes between patients + with and without a disease across all thresholds + +* F-Score +- The F-score is the harmonic mean of precision and recall: + + $$ + \text{F-score} + \defeq \frac{2}{\frac{1}{P} + \frac{1}{R}} + = 2 \frac{P \cdot R}{P + R} + $$ + +- **Interpretation:** + - Trivial classifiers: $P = 0$ or $R = 0$ => F-score = 0 + - Perfect classifiers: $P = R = 1$ => F-score = 1 + - For F-score to be large, both $P$ and $R$ must be high + +- Why not just averaging $P, R$? + - A classifier that always outputs 1 has $R = 1$ and $P = 0$ + - $\frac{P + R}{2} = \frac{1}{2}$, while we prefer a low value (ideally 0) + +## ############################################################################# +## Model Selection +## ############################################################################# + +* Model Selection Problem +- Model selection chooses the best model from a set of candidates based on + performance + - Needed when multiple hypotheses can explain the data + +- Certain parameters are fixed, while others need to be picked, e.g., + - Set of features + - E.g., selecting a subset of features from a dataset with 100 variables + - Learning algorithms + - E.g., deciding how to train a neural network + - Model types + - E.g., linear regression model vs. Support Vector Machine (SVM) + - Model complexity + - E.g., models with polynomials of degree $d < 10$ + - Values of the regularization parameter + - E.g., trying different values like 0.01, 0.1, and 1.0 + +- Evaluate model accuracy, precision, and recall +- Perform cross-validation to assess model performance +- Consider computational cost + - E.g., a simple logistic regression is faster than a complex neural network + +* Model Selection Process + +1. Split data into $D_{train}, D_{val}, D_{test}$ + - Commonly: 60\% training, 20\% validation, 20\% test + - Like splitting 80\% training between two learning phases + +2. 
Given $N$ hypotheses, learn on $D_{train}$ to get $g_1, ..., g_N$ + +3. Evaluate hypotheses on $D_{val}$ estimating errors + $E_{val}^{(1)}, ..., E_{val}^{(N)}$ + +4. Pick model $g_m$ with minimum $E_{val}^{(m)}$ + +5. Use test set $D_{test}$ to estimate fair performance of model $g_m$, i.e., + $E_{test} \approx E_{out}$ + +6. Retrain model with entire $D = D_{train} \cup D_{val} \cup D_{test}$ to get + final $g_m^{*}$ + +// TODO: Add a tikz plot + +* Model Selection as Learning +- "Picking the model with smallest $E_{val}$" is a form of learning: + - Hypothesis set: $\{g_1, ... , g_N\}$ + - Training set: $D_{val}$ + - Pick the best model $g_m$ + +- After model selection + - Experimentally $E_{val}(g_m) < E_{out}(g_m)$, i.e., $E_{val}(g_m)$ is an + (optimistically) biased estimate of $E_{out}(g_m)$ + - Theoretically: + - The penalty for model complexity with a finite set of hypotheses is + $$ + E_{out}(g_m) \le E_{val}(g_m) + O(\sqrt{\log(N) / K}) + $$ + - Use VC dimension for an infinite number of hypotheses (e.g., choice of + $\lambda$ for regularization) + +// TODO: Add a tikz plot + +## ############################################################################# +## Aggregation +## ############################################################################# + +* Ensemble Learning: Intuition +- Ensemble learning combines multiple models to improve prediction accuracy + - **Idea**: a group of weak learners can form a strong learner + +- Combine outputs of models $X_i$ to build a model $X^*$ better than any $X_i$, + with the wisdom of all + - Utilizes diversity in model predictions to improve accuracy + - Each model contributes its unique perspective, reducing overfitting + - E.g., like a panel of voting experts + +- Example: in computer vision detecting a face is a difficult task (at least + circa 2010) + - Look for different features: + - Are there eyes? + - Is there a nose? + - Are eyes and nose in the correct position? + - ... 
+ - Each feature is weak per-se, but together they become reliable + +* Ensemble Learning: Different Techniques +- **Bagging** (bootstrap + aggregation) + - Reduces variance by averaging predictions from different models + - E.g., decision trees $\to$ bagging $\to$ random forest + - Bagging creates multiple versions of a decision tree (each trained on a + random sample of data) + - Average their predictions to improve accuracy + +- **Boosting** + - Reduces bias by focusing on errors made by previous models + - Sequentially adds models, each correcting its predecessor + - E.g., `adaBoost` increases weights of incorrectly classified data points to + learn the next model + +- **Stacking** + - Uses a meta-model to combine separate models using weights + - E.g., a stacking ensemble + - Uses a logistic regression as a meta-model + - Combines the predictions of other models (e.g., decision trees, support + vector machines, and neural networks) + +* Ensemble Learning: Relation with Statistics +- **Bagging** + - Improves performance by adding randomized variants (mimicking multiple + training sets) + - Reduce variance without affecting bias + +- **Boosting** + - Use another model to learn residuals, i.e., difference between predicted and + true values + - Related to the statistical technique of "forward stagewise additive models" + +- **Stacking** + - If we have 3 independent classifiers, each with $\Pr(\text{correct}) = 0.7$ + \begin{alignat*}{2} + \Pr(\text{majority correct}) + &= \Pr(\text{at least 2 classifiers correct}) \\ + &= {3 \choose 2} 0.7^2 0.3 + 0.7^3 \\ + &= 3 \times 0.7^2 \times 0.3 + 0.7^3 \\ + &\approx 0.78 > 0.7 + \end{alignat*} + +* Ensemble learning: pros and cons +- **Pros** + - Hypothesis set $\calH$ is increased by combining hypotheses from different + models + +- **Cons** + - More computationally intensive to train and evaluate + - Loss of interpretability + - Risk of overfitting (model complexity is increased) + - Ensemble learning contradicts 
Occam's razor, which advocates simplicity + +* When Ensemble Learning Works +- Combining multiple models with ensemble learning works when models: + - Are very different from each other + - Treat a reasonable percentage of the data correctly + - E.g., one cannot do much if all classifiers have 50% accuracy + - Complement each other: they are specialists in a part of the domain where + the others don't perform well + +* How to Combine Outputs in Ensemble Learning +- **Regression** + - Weighted average of prediction + - E.g., by accuracy of each model or by a prior + +- **Classification** + - Weighted vote of predicted classes + - It needs an odd number of models to break ties + +- **Probabilistic classification** + - Weighted average of class probabilities + +- We can also learn a meta-learner (stacking) to combine multiple models + +### ############################################################################ +### Bagging +### ############################################################################ + +* Bagging +- Bagging stands for "Bootstrap AGGregation" + +- **Learning procedure** + - Several training datasets are extracted randomly by sampling with + replacement from the original dataset (i.e., bootstrap) + - Learn multiple models, one for each training set + - Combine outputs using various methods + - Result is a better model than a single model + +- **Why bagging works?** + - From the bias-variance decomposition view, combining multiple models: + - Reduces the variance component + - Without compromising the bias (bagged models are typically unbiased) + - Bagging mimics extracting more training sets (though not independent) from + the unknown distribution + +* Bagging and Instability in Learning Algorithms +- Bagging works best with different models, especially non-linear models + +- Introduce randomization in the learning algorithm intentionally + +- **Decision Trees** + - Disable pruning + - Break ties randomly when selecting the best attribute to 
split + - E.g., bagging trees results in random forests + +- **Multilayer Perceptrons** + - Use different initial weights in backpropagation to reach different local + minima + +- **Nearest Neighbor Classifier** + - Use a random subset of features + - Resampling the training set has limited impact, as it is equivalent to + changing example weights + +### ############################################################################ +### Boosting +### ############################################################################ + +* Boosting +- Boosting builds models that complement each other + - Typically use homogeneous models, i.e., parametrized models from $\calH$ + +- Strong classifiers can be built from weak classifiers + - E.g., decision stumps = decision trees with one level + +- Statistical meaning of boosting: + - Boosting implements forward stagewise additive modeling + - Use another model to learn residuals (difference between predicted and true + values) + +- Boosting does not work for linear regression: + - Combination of linear models is still a linear model + - OLS finds optimal weights in one step + - Combining linear regressions from different attributes is equivalent to a + single multiple linear regression + +* Adaboost.M1 +- Widely used for classification +- Assume examples can be weighted in the cost function used to learn + - Otherwise use resampling + +- **Learning procedure** + - Start with equal weights for all examples + - Iterate: + - Learn a classifier based on current weights for examples + - Weight the answer of each model by overall score (e.g., accuracy) or + probability + - Evaluate the ensemble + - Adjust weights for examples classified correctly/incorrectly + +### ############################################################################ +### Stacking +### ############################################################################ + +* Stacking +- Stacking learns how to combine models (not necessarily of the same type) + +- The 
problem is that with voting / averaging we don't know which model to trust +- Instead of voting or weighting we can use a meta-learner (level 1) to learn + how to pick / mix models (level 0) + +- **Learning procedure** + - Learn "level 0" models + - Learn "level 1" model using hold-out data from learning of level 0 models + (like in model selection) + - Build training data with predicted values from level 0 models + - Then learn level 1 + - Use a simple model for level 1 (e.g., linear models or trees) to avoid + overfitting + - Use probabilities from level 0, so level 1 can assess the confidence of + each model + +* Boosting vs Bagging vs Stacking + +\begingroup \scriptsize + +| **Aspect** | **Bagging** | **Boosting** | **Stacking** | +| ------------------------ | ----------------------------------- | ---------------------------------- | ---------------------------------------- | +| **Combines** | Models of the same type | Models of the same type | Models of different types | +| | | | +| **Learning** | Models trained independently | Iterative training | Models trained independently | +| **Predicting** | Uses uniform or data-driven weights | Uses learned weights from training | Uses learned weights or confidence | +| **Main Objective** | Reduce variance | Reduce bias | Improve generalization through diversity | +| **Base Learners** | Often strong learners | Often weak learners | Any model type (heterogeneous ensemble) | +| **Sensitivity to Noise** | Low | High | Medium | +| **Parallelizable** | Yes | No (sequential dependency) | Partially (base models parallelized) | +| **Meta-model** | Not used | Not used | Required | +| | | | +| **Examples** | Random Forest | AdaBoost, Gradient Boosting | Stacked Generalization, Blending | + +\endgroup diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/output/test.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/output/test.txt new file 
mode 100644 index 000000000..c9faea410 --- /dev/null +++ b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_executable1.test3/output/test.txt @@ -0,0 +1,1775 @@ +--- +fontsize: 10pt +--- +\let\emph\textit +\let\uline\underline +\let\ul\underline +::: columns +:::: {.column width=15%} +![](lectures_source/UMD_Logo.png) +:::: +:::: {.column width=75%} + +\vspace{0.4cm} +\begingroup \large +MSML610: Advanced Machine Learning +\endgroup +:::: +::: + +\vspace{1cm} + +\begingroup \Large +**$$\text{\textcolor{blue}{\text{Machine Learning Techniques}}}$$** +\endgroup +\vspace{1cm} + +**References**: + +- AIMA: ? + +- Hastie: ? + + +# Paradigms + +- **Machine Learning Paradigms with Examples (1/3)** + +- **Supervised Learning** + - Learn from labeled data to predict labels for new inputs + - E.g., image classification using ResNet on ImageNet + +- **Unsupervised Learning** + - Discover hidden patterns or structure in unlabeled data + - E.g., K-means clustering for customer segmentation + +- **Reinforcement Learning** + - Learn through interaction with an environment, receiving rewards/punishments + - E.g., deep Q-Learning for playing Atari games + +- **Self-Supervised Learning** + - Generate pseudo-labels from unlabeled data to pre-train models + - E.g., BERT (Masked Language Modeling) + +- **Semi-Supervised Learning** + - Combine small labeled data with large unlabeled data to improve performance + - E.g., named entity recognition (NER) using annotated sentences with entity + tags combined with many raw text documents + +- **Machine Learning Paradigms with Examples (2/3)** + +- **Online Learning** + - Learn incrementally from a stream of data in real time + - E.g., online logistic regression for click-through rate prediction + +- **Multi-Task Learning** + - Train simultaneously a model to perform multiple related tasks + - E.g., learn sentiment analysis and question answering + +- **Meta-Learning** + - "Learning to learn": adapt quickly to new tasks 
using prior experience + - E.g., a model can be fine-tuned quickly on a new task using just a few + gradient steps + +- **Zero-Shot / Few-Shot Learning** + - Generalize to new tasks with no or few labeled examples + - E.g., GPT-4 solving tasks with zero-shot prompting + +- **Active Learning** + - The model selects the most informative samples to be labeled by an oracle + (e.g., a human) + - E.g., pick samples where the model is least confident to get more examples + +- **Machine Learning Paradigms with Examples (3/3)** + +- **Federated Learning** + - Train models across decentralized devices without sharing raw data + - E.g., fraud detection or credit scoring across banks + +- **Evolutionary Learning** + - Optimize model structures or parameters using evolutionary algorithms + inspired by natural selection and genetics + - Gradient free, global search, discrete structures, variable length inputs + - E.g., genetic algorithms + +- **Curriculum Learning** + - Train models on easier tasks first, gradually increasing difficulty + - E.g., curriculum-based training in robotic control simulations + +- **Multi-Agent Learning** + - Multiple agents learn and interact in shared environments, often in + game-theoretic settings (e.g., competition, collaboration) + - E.g., AlphaStar to play StarCraft II + +- **Supervised Learning** +- Learn a function $f: X \to Y$ that maps inputs to correct outputs using + training examples $(\vx, y)$ where inputs and correct output pairs are known + - Requires labeled data for training + - Measure performance with error on a separate test set + +- **Classification**: output is a discrete label, e.g., + - `Spam` vs `Not Spam` + - Digit recognition `0`, `1`, ... 
+ - Sentiment analysis `Pos`, `Neg`, `Neutral` + +- **Regression**: output is a continuous value, e.g., + - House prices given features like size and location + - House demand + - Stock prices + +- **Common algorithms**: + - Linear Regression + - Decision Trees + - K-nearest neighbors + - Neural Networks + - ... + +- **Unsupervised Learning** +- Learn from data **without** labeled outputs + - Goal: discover patterns, groupings, or structure in the data + - No explicit feedback signal + - Evaluation can be qualitative + +- **Main techniques**: + - **Clustering**: Group similar examples, e.g., + - Customer segmentation + - Grouping news articles by topic without knowing the topics + - **Dimensionality Reduction**: Reduce number of variables with PCA while + preserving structure + - E.g., visualize high-dimensional data in 2D + - **Density Estimation**: Estimate probability distribution of data + - E.g., anomaly detection in server logs + - **Association Rule Learning**: Discover interesting relations between + variables + - E.g., market basket analysis (e.g., "people who buy X also buy Y") + +- **Common algorithms:** + - K-means + - PCA + - Autoencoders + +- **Reinforcement Learning** +- Learn by **interacting with an environment** to **maximize cumulative reward** + - Learn policy $\pi(s) \to a$ that maximizes expected reward + - Trade-off between exploration (trying new actions) and exploitation (using + known good actions) + - Environments provide clear rules and feedback (win/loss/reward) + - Often involve physical simulation or real-world interaction + +::: columns +:::: {.column width=60%} +- **Core elements:** + - Agent: Learner and decision maker + - Environment: Everything the agent interacts with + - State $s$ + - Action $a$ + - Reward $r$ + +- **Algorithms:** + - Q-learning + - Policy Gradient methods +:::: +:::: {.column width=35%} +```graphviz +digraph BayesianFlow { + splines=true; + nodesep=1.0; + ranksep=0.75; + + node [shape=box, 
style="rounded,filled", fontname="Helvetica", fontsize=12, penwidth=1.4]; + + // Node styles + Agent [label="Agent", shape=box, fillcolor="#F4A6A6"]; + Env [label="Environment", shape=box, fillcolor="#B2E2B2"]; + + // Force ranks + //{ rank=same; Agent; Env; } + + // Edges + Agent -> Agent [label=" State", fontcolor=black, labeldistance=2.0]; + Agent -> Env [label=" Action", fontcolor=black, labeldistance=2.0]; + Env -> Agent [label=" Reward", fontcolor=black, labeldistance=2.0]; +} +``` +:::: +::: + +- **Reinforcement Learning: Examples** +- In game playing, learn strategies through trial and error + - E.g., AlphaGo mastering the game of Go +- In robotics, learn control policies for movement and manipulation +- In autonomous driving, learn safe and efficient driving behaviors +- In resource management, optimize allocation of limited resources over time + - E.g., data center cooling or CPU job scheduling +- In personalized recommendations, adapt suggestions based on user interaction + - E.g., newsfeed ranking adjusting based on user clicks +- In healthcare, optimize treatment plans over time + +# Techniques + +## Machine Learning in Practice + +- **Machine Learning Flow** +- **Question** + - E.g., "How can we predict house prices?" 
+- **Input data** + - E.g., historical data of house sales +- **Features** + - E.g., number of bedrooms, location, square footage +- **Algorithm** + - E.g., linear regression, decision trees +- **Parameters** + - E.g., learning rate, number of trees in a random forest +- **Evaluation** + - E.g., accuracy, precision, recall + +- **Machine Learning Flow** + +```graphviz[height=80%] +digraph BayesianFlow { + rankdir=LR; + splines=true; + nodesep=1.0; + ranksep=0.75; + node [shape=box, style="rounded,filled", fontname="Helvetica", fontsize=12, penwidth=1.4]; + // Node styles + "Question" [fillcolor="#F4A6A6"]; + "Input data" [fillcolor="#FFD1A6"]; + "Features" [fillcolor="#B2E2B2"]; + "Algorithm" [fillcolor="#A0D6D1"]; + "Parameters" [fillcolor="#A6E7F4"]; + "Evaluation" [fillcolor="#A6C8F4"]; + // Force ranks + // Edges + "Question" -> "Input data"; + "Input data" -> "Features"; + "Features" -> "Algorithm"; + "Algorithm" -> "Parameters"; + "Parameters" -> "Evaluation"; +} +``` + +- **Not all phases are equally important!** + - Question $>$ Data $>$ Features $>$ Algorithm + +- Clarity of the question impacts project success +- Quality and relevance of data are crucial for performance +- Proper feature selection simplifies the model and improves accuracy +- Algorithm is often less important (contrary to popular belief!) + +- **Question** +- **Make the question concrete and precise** + - Define the problem clearly + - Specify inputs and expected outputs + - Align question with business or research objectives + - E.g.,: + - **Bad**: _"How can we improve sales?"_ + - **Good**: _"What factors most significantly impact sales of product X in + region Y during season Z?"_ + +- Formulating question is **the most important part** of the machine learning +problem + - Misunderstanding leads to: + - Solving the wrong problem + - Collecting wrong data + - ... 
+ +- _"If I were given one hour to save the planet, I would spend 59 minutes + defining the problem and one minute resolving it"_ (Albert Einstein) + +- **Input Data** +- Ensure **data is specific to prediction** goal + - E.g., use known movie ratings to predict unseen movie ratings from the same + population + - Training set $\approx$ test set + +- Relationship between data and prediction goal is **not always direct** + - E.g., interested in prices but predict supply and demand instead + +- Poor-quality data leads to inaccurate predictions + - _"Garbage in - garbage out"_ + +- Recognize **when data is insufficient** for valid answers + - _"Combination of data and desire for an answer does not ensure a reasonable + answer can be extracted"_ (John Tukey) + +- **More data vs better models** + - Meta-studies show difference between generic and best model is like + 5\% + - _"It's not who has the best algorithm that wins. It's who has the most + data"_ (Google researcher) + - _"Every time I fire a linguist, the performance of the speech recognizer + goes up"_ (IBM researcher in speech recognition) + +- **Features** +- **Features** provide high-level information about inputs + - E.g., use intensity and symmetry for scanned numbers instead of raw bit maps + +- **Characteristics of good features**: + 1. Enable data compression + 2. Retain relevant information + 3. Often created with expert knowledge + +- **Common mistakes in feature building**: + 1. Automating feature selection may lead to overfitting + - Black box predictions can be accurate but stop working anytime + - E.g., Google Flu's unclear feature-model link + 2. Ignoring data-specific quirks + - E.g., mislabeling outliers + 3. 
Unnecessarily discarding information + +- **Models** +- Best models are: + - **Interpretable** + - Allow users to understand and trust the model's decisions + - E.g., decision trees are appropriate in medical studies since they produce + a "reasoning" + - **Simple** + - Easier to implement and maintain + - Reduces the risk of overfitting + - **Accurate** + - Often accuracy is traded off for remaining characteristics + - E.g., accuracy vs interpretability, accuracy vs speed + - **Fast** + - To train and test + - Essential for real-time applications + - Reduces computational costs + - **Scalable** + - Can handle large datasets efficiently + - Important for growing data and user bases + - E.g., in the Netflix prize, Netflix didn't end up implementing the best + algorithm since it wasn't scalable enough + +## How to Do Research + +### Simple Is Better + +- **Occam'S Razor** +- _The **simplest** model that fits the data is also the **most plausible**_ + (Occam) + - Trim the model to the bare minimum necessary to explain the data + - _"An explanation of the data should be as simple as possible, but not + simpler"_ (Einstein?) 
+ - **Simple** means: + - Less likely to fit a given data by coincidence + - An unlikely event is more significant if it happens (formalized in terms + entropy) + - **Better** means better out of sample performance + +- An object is **simple** when it is one of few possible objects + - Polynomial of order 2 is simpler than a polynomial of order 17 + - There are many more polynomials of order 17 compared to order 2, although + both are infinite sets + - SVM (Support Vector Machine) characteristics: + - The separating hyperplane appears wiggly, but it is defined by a few + support vectors + - Complexity of a hypothesis $h$ + - E.g., polynomial order, MDL (describe the hypothesis in terms of bits), + Kolmogorov complexity + - Complexity of a hypothesis set $\calH$ + - E.g., VC dimension of the model + - Complexity of $h$ and $\calH$ are related by counting: if we need $l$ bits + to specify $h$, then $h$ is one of $2^l$ elements of a set $\calH$ + +- **Model Soundness** +- We cannot blindly accept the result of modeling + - A model should tell a story + - Always ask yourself: _"what criticisms would you give to the model if it was + presented to us for the first time?"_ + +- Benchmark models: what are the performance if the model outputs: + - Outputs always 0 or 1 + - E.g., long-only model for stock predictions + - Random results + - I.e., bootstrap of null hypothesis "there is no prediction power" + +- A perfect fit can mean nothing, e.g., + - Get 2 data points on a plane + - Fit data with a linear relationship + - It is a perfect fit + - This means nothing since: + - There is always a line between 2 points + - The data cannot falsify the hypothesis + - The model (line) is too complex for the data set (only 2 points) + +- **Sampling Bias** +- A model, when learning, sees the world in terms of the training data + - If data is sampled in a biased way, learning will produce a biased outcome + +- Formally: one of the few hypothesis of Hoeffding in learning theory is that 
+ training and testing distributions are the same + +- Addressing sampling bias + - Weight or re-sample data to match testing distribution + - If data points have zero probability ($\Pr = 0$) in the data set, no + remedies are possible + +- **Data Snooping** +- **Data snooping** is the improper use of data that biases ML model results + - Common trap for practitioners + +- **Sources** of data snooping + 1. Contamination of training and test sets + 2. Multiple testing issue + 3. If data affects any learning step (e.g., feature engineering, model + selection, hyperparameter tuning), its assessment becomes optimistic + +- **Effects** of data snooping + - Models show inflated performance metrics which do not translate out of + sample + - Snooping leads to seemingly better performance: + - It is a "happy minefield" + +- **"Burning the Test Set"** +- Repeatedly using the same data eventually leads to "success" + - The model starts fitting to specific data quirks + - The test set should not be used for training; this leads to over-optimism + - _"If you torture the data long enough, it will confess whatever you want"_ + +- Solutions: + - Use the test set _exactly once_ + - The VC dimension applies to the overall learning model, including all + attempted models + - MDL accounts for the number of fitting attempts in overfitting measurement + - Adjust p-values for multiple experiments + +### Research Methodology + +- **How to Achieve Out-Of-Sample Fit** +- Goal: choose an hypothesis $g$ approximates the unknown target hypothesis $f$ + + $$ + g \approx f \iff E_{out}(g) \approx 0 + $$ + +- Solution: + - Achieve + 1. Good in-sample performance $E_{in}(g) \approx 0$ + 2. Good generalization $E_{out}(g) \approx E_{in}(g)$ + - Then 1. + 2. 
$\implies$ good out-of-sample performance + $E_{out}(g) \approx 0$ + +- **What to Do If Out-Of-Sample Fit Is Poor?** +- The model performs well in sample ($E_{in} \approx 0$) but poorly out of + sample ($E_{out} \gg E_{in}$) + +- What does it mean? + - The in-sample performance are optimistic + - The model is overfitted and fails to generalize + +- What do we do? + - Run diagnostics before embarking in long term projects + - Gain insight on what works / does not work to understand how to improve + performance + - E.g., bias-variance curves and learning curves + +- How to fix? + - It depends on the diagnostics! + 1. Training data + - Get more training data (it can take long time) $\iff$ fixes high variance + 2. Features + - Remove features $\iff$ fixes high variance + - Add more features (it can take long time) $\iff$ fixes high bias + - Add derived features (e.g., polynomial features) $\iff$ fixes high bias + 3. Regularization + - Decrease regularization amount $\lambda$ $\iff$ fixes high bias + - Increase regularization amount $\lambda$ $\iff$ fixes high variance + +- **Why Using a Lot of Data?** +- Several studies show that: + - Different algorithms/models have remarkably similar performance + - Increasing training set improves performance + +- Thus it holds that: + + $$ + \text{High capacity model + massive training set = good performance} + $$ + +- Using a high capacity model with many parameters (e.g., neural network) + $$ + E_{in} \approx 0 + $$ + due to low bias (and high variance) +- A massive data set helps avoid overfitting + $$ + E_{out} \approx E_{in} + $$ +- These two conditions together + + $$ + E_{out} \approx E_{in} \approx 0 \implies E_{out} \approx 0 + $$ + +- **What to Do When One Has Lots of Data?** +- You have $m$ = 100M examples in data set, what do you do? + +- Training on a lot of data might yield scalability issue: + - Slow + - Lots of compute + - Require work on infrastructure + - ... 
+ +- Plot the learning curves as function of increasing + $m = 1k, 10k, 100k, 1M, ...$ + - If the algorithm has large bias, it converges (training and validation + performance are similar) at $m = 1000$ + - Add more features and complicate the model rather than training on 100M + instances + - If the variance is large, use all instances to train the model + +- **Why We Do Things?** +- Always + - Ask: _"Why are we doing something?"_ + - To understand the purpose of the task + - Ask: _"What do we hope to determine by performing the task?"_ + - To clarify goals and outcomes of the task + - Encourage thinking about actions with the bigger picture in mind + - Avoid merely going through motions + - Promote critical thinking and awareness + - Prioritize tasks by importance and impact + +- E.g., when conducting a customer survey, ask: + - _"Why is feedback being collected?"_ + - To improve product features and customer service + - _"What is the desired outcome?"_ + - To identify areas for improvement and innovation + +- E.g., before starting a marketing campaign, ask: + - _"Why is this campaign run?"_ + - To increase brand awareness or drive sales + - _"What are the specific goals?"_ + - Set target number of new leads or click-through rates + +- **Summary of the Results, Next Steps, Follow Ups** +- Always have a summary of the results + - It's like a high-level map of what we have done and what we have discovered + - E.g., "smoothing model coefficients helps" + - Highlight major findings + - Interpret the results + - E.g., _"The increase in sales is likely due to the new marketing + strategy."_ + - Conclusions + - Summarize what the data suggests or confirms + - E.g., _"Our hypothesis that user engagement increases retention is + supported"_ + +- Always have a reference to more detailed results + - Provide quick insights before diving into details + +- Always have next steps / follow-ups + - What do you expect that will happen? + - What results do you expect? 
+ - Like thinking $n$ moves ahead in chess + - E.g., _"Next, we will conduct a detailed analysis on the demographics + contributing most to sales growth"_ + - Outline potential experiments or analyses to validate findings further + +- **Example of Spam Filter Classification** +- We use $N = 4$ words in an email to distinguish spam from non-spam emails + using logistic regression + - Words can be: `buy`, `now`, `deal`, `discount`, `` + +- How to improve the performance of this classifier? + 1. Collect more data + - E.g., honeypot project: set up fake email account and collect spam + 2. Use better features + - E.g., email routing information: spammers use unusual accounts and mask + emails as legitimate + 3. Use better features from message body + 4. Detect intentional misspellings + - Spammers use misspelled words (e.g., `w4tch` for `watch`) to confuse the + classifier + - Use stemming software + +- **Right and Wrong Approach to Research** + +- **Bad** + 1. It is not clear how to prioritize the different possible tasks + 2. Use gut feeling and pick one task + 3. Complete the task + 4. Re-evaluate performance + +- **Good** + 1. Build a simple algorithm + - Within 1 day + 2. Set up the performance evaluation framework + - A single number and bounds to evaluate + - Aim to improve that number + - Evaluate with cross-validation + 3. Set up diagnostic tools + - Compute learning and bias-variance curves + - Avoid premature optimization by understanding the issue before fixing it + 4. Manually review misclassified emails in the cross-validated set + - What features might help to improve performance? + - E.g., what types of emails are misclassified? + +- Sometimes an approach must be tried to see if it works + - E.g., stemming software to consider certain words equivalent + +## Pipeline Organization + +- **How Are Machine Learning Systems Organized?** +- Machine learning systems are typically organized in a pipeline + 1. Break down the problem into sub-problems + 2. 
Solve problems one at a time
Artificial data + - E.g., synthesize or amplify data set + 2. Collect and label by hand + - E.g., crowd sourcing like Amazon Mechanical Turk + +- **OCR Pipeline: Example of Artificial Data Synthesis** +- How can we increase data set size? + 1. Synthesize data set + - Use font libraries to generate large training sets + - Paste characters against random backgrounds + - Apply scaling, distortion, adding noise, etc + 2. Amplify a data set + - Start from a training set and add examples by warping/distorting existing + examples + +- Transformations and noise should be specific to the application domain + - E.g., Gaussian noise is not always appropriate + +- **Ceiling Analysis for ML Pipeline** +- The most valuable resource is time + - Sometimes one works on an optimization for months + - The optimization doesn't make much difference + +- **Problem**: On which part of the pipeline should time/resource be spent? + +- **Solution**: Ceiling analysis + - Technique to analyze performance of pipelines + - Have a single number representing the performance of the entire system + - E.g., accuracy for an OCR system + - For each component: + - Mock the component with a box that always gives the correct output + (=oracle) + - Leave the remaining components untouched + - Compute performance of the entire pipeline + - Understand which component is critical to performance by estimating an upper + bound for overall performance when that component improves 10\% + - Don't trust your gut feeling but measure! + +## Input Processing + + +- Data cleaning +- Dimensionality reduction +- Feature engineering + +## Learning Algorithms + +### Gradient Descent + +- **The Problem of Minimizing a Function** + +::: columns +:::: {.column width=80%} +- **Goal**: minimize a function $J(\vw)$ + - E.g., in-sample error $E_{in}(\vw)$ + +- **Solutions**: + 1. Analytical solution + - Impose the gradient of $J(\vw)$ to equal 0 + - Find a closed-form solution for $\vw^*$ + 2. 
Numerical solution: + - Use an iterative method to update $\vw$ to reach the minimum value of + $J(\vw)$ + - E.g., gradient descent + - It works even if there is an analytical solution +:::: +:::: {.column width=20%} + ![](lectures_source/figures/Lesson6_Gradient_descent_2.png) +:::: +::: + +- **Gradient Descent: Intuition** + +::: columns +:::: {.column width=80%} +- **Problem**: + - We are on a hilly surface and we want to walk down to the bottom of the hill + +- **Solution**: + - At each point: + - We look around + - We move a step in the direction where the surface is steepest + - We keep doing until we reach the bottom + +- Gradient descent + - Is a general technique for minimizing a twice-differentiable function + - Converges to + - A local minimum in general + - The global minimum if $J(\vw)$ is convex (e.g., logistic regression and + linear models) +:::: +:::: {.column width=20%} +![](lectures_source/figures/Lesson6_Gradient_descent_1.png) +:::: +::: + +- **Gradient descent with fixed learning rate (1/3)** +- Consider the contour plot of a function + +- Start from a point $\vw(0)$ (random, the origin, ...) 
+ +- At each step, move a fixed amount $\eta$ in the weight space (fixed learning + rate): + + $$ + \vw(t + 1) = \vw(t) + \eta \hat{\vvv} + $$ + + where $\hat{\vvv}$ is a unit vector + +- Pick $\hat{\vvv}$ to move to a value of $E_{in}(\vw)$ as negative as possible + + - The change for $E_{in}$ is: + \begingroup \footnotesize + \begin{align*} + \Delta E_{in} + & = E_{in}(\vw(t + 1)) - E_{in}(\vw(t)) + & \\ + & = E_{in}(\vw(t) + \eta \hat{\vvv}) - E_{in}(\vw(t)) + & \text{ (replacing the expression of $\vw(t + 1)$)} \\ + & = \eta \nabla E_{in}(\vw(t))^T \hat{\vvv} + O(\eta ^ 2) + & \text{ (using Taylor expansion)} \\ + \end{align*} + \endgroup + - Gradient descent keeps only $O(\eta)$ the term and ignores the rest + - Conjugate gradient considers up to $O(\eta^2)$ and ignores higher + infinitesimals + +- **Gradient Descent with Fixed Learning Rate (2/3)** + +- The minimal value of the scalar product + - Is $- \eta \|\nabla E_{in}(\vw(t))\|$, + - Happens when + $\hat{\vvv} = - \frac{\nabla E_{in}(\vw(t))}{\|\nabla E_{in}(\vw(t))\|}$ +- The change in weights is: + $$ + \Delta \vw = - \eta \frac{\nabla}{\|\nabla\|} + $$ +- It is called "gradient descent" since we descend along the gradient of the + function to optimize + +- **Gradient Descent with Fixed Learning Rate (3/3)** +- Each component of the weight $\vw$ is updated with the partial derivative with + respect to that coordinate: + \begin{align*} + \vw(t + 1) + &= \vw(t) - \eta \hat{v} \\ + \vw(t + 1) + &= \vw(t) - \eta \frac{\nabla E_{in}(\vw(t))}{\|\nabla E_{in}(\vw(t))\|} \\ + w_j(t + 1) + &= w_j(t) - \eta + \frac{1}{\|\nabla E_{in}(\vw(t))\|} + \frac{\partial E_{in}(\vw)}{\partial w_j} + \end{align*} + +- The update of all components should be simultaneous, i.e., computed at once + +- A step of the optimization when we update the solution (weights) is called + epoch + +- **Gradient Descent: Stopping Criteria** +- In theory, stop when $\Delta E_{in} = \vv{0}$ + - Numerically this might not occur +- In practice, 
stop when + - The variation of $E_{in}$ is smaller than a given threshold + $\Delta E_{in} < \theta$ + - We have reached a certain number of iterations + +- Monitoring gradient descent + - In theory, only need to compute the derivatives of the function $J(\vw)$ to + optimize + - In practice, need to monitor the algorithm progress by recomputing the cost + function $J(\vw)$ periodically to make sure it is decreasing + +- **Setting $\eta$ in Gradient Descent with Fixed Learning Rate** +- Consider a 1D convex function + - If $\eta$ is small: + - The linear approximation of $E_{in}$ is effective + - Many steps are needed to converge to the minimum + - If $\eta$ is large: + - The linear approximation fails (higher terms affect values) + - It "bounces around" + +![](lectures_source/figures/Lesson6_Gradient_descent_3.png) + +- Idea: vary learning rate $\eta$ during gradient descent + - Smaller learning rates may find a better minimum + - Reduce $\eta$ as a function of iterations + - Cons: this introduces an additional parameter to tune + +- **Gradient Descent with Variable Learning Rate** +- In gradient descent with fixed learning rate (i.e., constant change in weight + space), we use: + + $$ + \Delta \vw = - \eta \frac{\nabla J}{\|\nabla J\|} + $$ + +- To converge quickly, we want to: + - Move fast in weight space (large $\eta$) when the surface is steep (large + gradient) + - Move slow in weight space (small $\eta$) near the minimum to avoid bouncing + around (small gradient) + +- Ideally, $\eta$ should increase with the slope: $\eta \propto \|\nabla J\|$ + +- This is called gradient descent with variable learning rate: + + $$ + \Delta \vw = - \eta \nabla J + $$ + +- **Feature Scaling in Gradient Descent** +- Gradient descent converges faster if features are scaled to the same range + - Feature scaling techniques include min-max scaling and standardization + - E.g., applying standardization to a dataset can transform feature values to + have a mean of 0 and a standard 
deviation of 1 + +- Otherwise, different gradient components have different errors due to + numerical approximation, causing the gradient to bounce around + - Unscaled features can lead to slow and unstable convergence due to varying + magnitudes + - E.g., if one feature ranges from 1 to 1000 and another ranges from 0.01 to + 1, the large disparity can cause inefficient updates + +- **Issues with Batch Gradient Descent** +- Consider the case of squared error with $n$ samples + $$ + E_{in}(\vw) + = \frac{1}{n} \sum_i e(h_{\vw}(\vx_i) - y_i) + = \frac{1}{n} \sum_i (h_{\vw}(\vx_i) - y_i) ^ 2 + $$ +- The Batch Gradient Descent (BSD) requires to update each component of the + weight vector with an expression like: + $$ + \vw(t + 1) = \vw(t) - \eta \frac{\nabla E_{in}}{\|\nabla E_{in}\|} + $$ +- In terms of coordinates for squared error: + $$ + w_j(t + 1) = w_j(t) - + \eta \frac{2}{n} + \sum_{i=0}^n (h_{\vw}(\vx_i) - y_i) + \frac{\partial h_{\vw}(\vx_i)}{\partial w_j} + $$ +- With a large number of training examples (e.g., $N = 10^6$), gradient descent: + - Is computationally expensive as it requires evaluating the gradient from all + examples for a single update + - Requires storing all the data in memory + +### Stochastic Gradient Descent + +- **Stochastic Gradient Descent** +- **Idea** of Stochastic Gradient Descent (SGD) + - Update the weights only for one training example picked at random + +- **Algorithm** + - Pick one $(\vx_n, y_n)$ at a time from the available examples + - Compute $\nabla e(h(\vx_n), y_n)$ to update the weights: + $$ + \Delta \vw = -\eta \nabla e + $$ + - Update the weight considering only one random example: + $$ + w_j(t + 1) = w_j(t) - + \eta \frac{2}{n} + (h_{\vw}(\vx_t) - y_t) + \frac{\partial h_{\vw}(\vx_t)}{\partial w_j} + $$ + +- $\nabla e$ is a function of a random var $\vx_n$ + - The average direction of SGD is the same direction as batch version + $$ + \EE[\nabla e] + = \frac{1}{N} \sum \nabla e(h(\vx_n), y_n) + = \nabla \frac{1}{N} 
\sum e(h(\vx_n), y_n) + = \nabla E_{in} + $$ + +- In Stochastic Gradient Descent (SGD): + - The path in the weight space is more random + - It does not even converge but rather oscillates around the local minimum + +- **Mini-Batch Gradient Descent** +- Bring together characteristics of both Batch and Stochastic Gradient Descent + +- Use $b$ examples to make an update to the current weight + - $b$ represents the batch size, which is a hyperparameter you can choose + - A common choice for $b$ might be $b = 32$ or $b = 64$ + +- Mini-batch GD offers a balance between SGD noisiness and full-batch + approaches, using small, random data samples for updates + +- **On-Line Learning and Gradient Descent** +- Continuous stream of training examples requires updating the model + - In real-time systems, new data points arrive and the model adapts without + fully retraining + - E.g., in stock market prediction models, each transaction can dynamically + adjust model weights + - Handle variation in the dynamics of the underlying process + +- Stochastic gradient (SGD) and mini-batch descent are suitable for online + learning, updating the model one example at a time + +- Discard examples for a "compressed" model representation + - Useful for large data streams where storing every data point is impractical + - E.g., in training a language model on live chat data, older conversations + might be discarded after updates to maintain relevant patterns in the model + +- **SGD vs BGD vs Mini-Batch** +- To update the weights: + - BGD (batch gradient descent) uses all the training examples + - SGD (stochastic gradient descent) uses a single (random) training example + - Mini-batch GD uses only a subset of training examples + +\begingroup \scriptsize + +| **Aspect** | **Batch Gradient Descent** | **Stochastic Gradient Descent** | +| --------------- | ---------------------------------------- | ------------------------------------ | +| Computation | Uses all examples | One example at a time | +| 
Memory | Requires all examples in memory | Require less memory | +| Randomization | More likely to terminate in flat regions | Avoid local minima due to randomness | +| Regularization | No implicit regularization | Oscillations act as regularization | +| Parallelization | Can be parallelized | Less parallel-friendly | +| Online Learning | Not suitable | Suitable for online learning | + +\endgroup + +- **Map-Reduce for Batch Gradient Descent** +- In map-reduce we use $k$ machines to parallelize the summation (map step) and + then we send the $k$ partial sums to a single node to accumulate the result + (reduce step) +- Batch GD (and many learning algorithms) can be expressed in this map-reduce + form + + +- **Coordinate Descend** +- Minimize $J(x_0, ..., x_n)$ by optimizing along one direction $x_i$ at a time + - Instead of computing all derivatives + +- **Algorithm** + - Pick a random starting point $\vw(0)$ + - Pick a random order for the coordinates $\{ x_i \}$ + - Find the minimum along the current coordinate (1D optimization problem) + - Move to the next coordinate $x_{i+1}$ + - The sequence of $\vw(t)$ is decreasing + - A minimum is found if there is no improvement after one cycle of scanning + all coordinates + - The minimum is local + +- **Gradient Descent vs Pseudo-Inverse for Linear Models** +- For linear models we can use either pseudo-inverse or gradient descent to find + optimal $\vw^*$ + +- **Gradient descent** + - Choose learning rate $\eta$ + - Requires many iterations to converge + - Monitor stopping criteria, oscillations, etc + - Effective for many features $P$ + +- **Pseudo-inverse** + - No parameter selection needed + - Converges in one iteration (with nested loops) + - Computes $(\mX^T \mX)^{-1}$, a $P \times P$ matrix + - Inverse complexity $O(P^3)$ + - E.g., for $P \approx 10,000$, gradient descent is preferable + +## Performance Metrics + +- **How to Make Progress in ML Research** +- There are many possible directions for research + - 
Different features + - Different data preprocessing methods + - Different models + - Different training algorithms + - Different evaluation techniques + - Explore optimization strategies + +- What to do? + +- Approach + - Evaluate models systematically using a single number + - Implement metrics (E.g., accuracy, F1 score) for insight + - Use cross-validation for model validation + - Statistical tests to ensure differences are not random + - Utilize hypothesis testing for genuine improvements + - Conduct A/B testing for real-world validation + +- **How to Measure Classifier'S Performance?** +- Success / hit / win rate (or error / miss rate) + - Measures the proportion of correct predictions by the model + - Important for understanding overall accuracy + - E.g., in binary classification, 80 correct predictions out of 100 result in + an 80% success rate + +- Log probability / cross-entropy error + - Evaluates classification model with probabilities between 0 and 1 + - E.g., lower cross-entropy loss indicates better performance + +- **Precision / recall / F-score** + - Useful for evaluating models in imbalanced data scenarios + - Precision: ratio of correctly predicted positive observations to total + predicted positives + - E.g., a precision of 0.75 means 75% of identified positives are true + positives + - Recall: ratio of correctly predicted positive observations to actual + positives + - E.g., a recall of 0.60 means 60% of actual positives were correctly + identified + - F-score: weighted harmonic mean of precision and recall + +- **Utility function** + - Customizes the evaluation metric to prioritize types of errors and success + - E.g., true / false positives / negatives + - E.g., in medical diagnosis, a utility function might give higher weight to + minimizing false negatives to prevent missed diagnoses + +- **Training vs Test Set** +- Performance on train set $E_{in}$ is an optimistic estimate of $E_{out}$ + - One can have: + - 0\% error rate on training data 
(e.g., memorizing responses for training + set) + - 50\% error rate on test set (e.g., by answering randomly) + +- To evaluate model performance, use a test set that played no role in training + +- Training and test sets should be representative samples of the problem + - E.g., credit risk problem + - One cannot use data from a bank branch in Florida to assess a model built + with data from a bank branch in New York + - Characteristics of the populations are very different + +- **Lots of Data Scenario vs Scarce Data Scenario** +- **Lots of data scenario** + - Ideal to have lots of data (ideally infinite) + - Learn on lots of data + - Fit all degrees of freedom of a complex model + - Predict on lots of data + - Assess precise out-of-sample performance + +- **Scarce data scenario** + - Often data (especially data of high quality) is scarce + - E.g., facial recognition datasets with limited annotated data needing + careful management + - Cannot use all data as a training set + - Need to hold out data to estimate performance metrics and bounds + - Split the data 70-30 or 80-20 in train and test sets + - Consider cross-validation techniques to maximize data usage + - Other approaches: + - Augment data artificially, like data augmentation in image processing + - Utilize transfer learning with pre-trained models on related tasks + + +- **Splitting Data Into Training, Validation, Test Sets** +- Training, validation, and test sets must be: + - Distinct + - Representative of the problem + - E.g., each class in all sets must be represented according to the original + data + - Sized based on available data and problem needs + +- To ensure sets have the same distribution: + - Stratified sampling + - E.g., each class label is proportionally represented in each set + - Shuffle and then sample + - Achieves randomization, maintaining distribution + - Sample and check statistics of variables (e.g., mean, std dev, PDF) + - Compare these statistics to ensure each set mirrors the 
broader dataset + +- **Rule of Thumbs for Data Set Splits** +- If $n$ is **large** $\to$ use a 60-20-20 split + - Training: 60% + - Validation: 20% + - Test: 20% + +- If $n$ is **medium** $\to$ use a 60-40 split + - Training: 60% + - Test: 40% + - Not possible to learn hyperparameters, so no validation set + +- If $n$ is **small** $\to$ use cross-validation and report "small data size" + - Use K-fold cross-validation + - Be cautious of the increased chance of high accuracy by chance + - Is machine learning for the given sample size even suitable? + +- **Can We Ever Use Test Set as Training Set?** +- Once the model is selected and validated, reuse all available data (including + the test set) to generate the model for deployment + - This ensures the model benefits from all available information + +- Generally, more data is better, though returns diminish after exceeding a + certain volume + - Initially, increasing data size can significantly improve model performance + - Eventually, adding more data results in smaller accuracy gains and may not + justify the increased computational cost + +- **In-Sample vs Out-Of-Sample Error Expressions** +- We want to find a function $h$ that approximates the unknown function $f$, + $h \approx f$ over the space of inputs $\vx \in \calX$ ("script X") + +- The error is usually defined point-wise: + + $$ + e(h(\vx_i), f(\vx_i)) + $$ + - E.g., + - Squared error: $e(\vx) = (h(\vx) - f(\vx))^2$ + - 0-1 binary error: $e(\vx) = I[h(\vx) == f(\vx)]$ + - Log probability: $e(\vx) = - \log( \Pr(h(\vx) == f(\vx)) )$ + +- In-sample error is computed using all points in the training set: + + $$ + E_{in}{(h)} = \frac{1}{N} \sum_{i=1}^N e(h(\vx_i), f(\vx_i)) + $$ + +- Out-of-sample error is computed on the entire space of inputs $\calX$ + + $$ + E_{out}(h) = \EE_{\vx \in \calX}[e(h(\vx), f(\vx))] + $$ + +- **Mean Squared Error (MSE)** +- MSE is the average difference of squared error: + + $$ + \text{MSE} + \defeq \frac{1}{N} \sum_{i=1}^N (h(\vx_i) 
- f(\vx_i))^2 + $$ + - MSE measures the estimator quality, quantifying the difference between + estimated and actual values + - E.g., in a house price prediction model, MSE determines how close predicted + prices are to actual prices + +- **Cons:** + - It doesn't share the unit of measure with the output + - Distorts error interpretation; predicted and actual values are usually in + different units + - Sensitive to outliers + - A single large error can disproportionately affect the MSE + - Use median absolute deviation (MAD), median of squared error for + robustness against outliers + +- **Root Mean Squared Error (RMSE)** +- RMSE is the standard deviation of the Mean Squared Error (MSE): + + $$ + \text{RMSE} \defeq \sqrt{\text{MSE}} + = \sqrt{\frac{1}{N} \sum_{i=1}^N (h(\vx_i) - f(\vx_i))^2} + $$ + +- **Pros:** + - Same units as the output, allowing intuition of its magnitude compared to + the mean + - Facilitates comparison between different data sets or models since the + metric is normalized to the output's scale + +- **Cons:** + - Sensitive to outliers (like MSE) which can excessively affect the metric + - May not be suitable for ranking models when outliers or skewed distributions + are present + +- **Median-Based Metrics** +- We can use metric based on median (i.e., the 0.5 quantile of absolute error): + +- Median absolute deviation: + + $$ + \text{MAD} \defeq \text{median}_i(|h(\vx_i) - f(\vx_i)|) + $$ + +- Median squared error: + + $$ + \defeq \text{median}_i(|h(\vx_i) - f(\vx_i)|^2) + $$ + +- **How to Choose an Error Measure?** + +- Error measure depends on the **application** and should be **specified by the + "customer"**: + - The customer needs to define what constitutes an acceptable level of error + for their specific use case + - E.g., medical applications might have a low tolerance for errors, while a + recommendation system might have a higher tolerance + +- Otherwise, we can pick: + - A **plausible error measure**: + - E.g., squared error is 
commonly used when assuming Gaussian noise in the + data + - A **"friendly error" measure**: + - E.g., measures that allow for closed-form solutions simplify calculations + significantly + - Convex optimization-friendly measures ensure optimization algorithms find + the global minimum easily + +- **Error Measures: Fingerprint Verification Example** +- In fingerprint verification: + - Recognizing a valid fingerprint has no error + - Otherwise, it is a false positive or a false negative + +- Error weight depends on the application + - For the same problem in two set-ups, the error measure is the opposite + - For supermarket applications: + - False positives are minor (e.g., one more discount) + - False negatives are costly (e.g., annoyed customer, slow line) + - For CIA building access: + - False negatives are acceptable (triggers further security) + - False positives are disastrous + +### Precision and Recall + +- **Error Metrics for Skewed Classes** +- When classes are skewed (i.e., one class is very rare), accuracy can be + misleading + - Use metrics like confusion matrix, precision, and recall + +- Example: + - Train a classifier to distinguish tumors as: + - $y = 1$: malignant + - $y = 0$: benign + - Classifier's error rate is 1% (i.e., guess correctly 99% of the time) seems + good + - But only 0.5% of patients have cancer + - A trivial classifier that always outputs $y = 0$ has a 0.5% error rate! 
+ - Now a 1% error rate does not look good anymore + +- **Decision Matrix ::: Columns :::: {.Column Width=60%}** +- Aka confusion matrix + +- Typically $y = 1$ encodes the rare class to predict + +- Assuming actual and predicted class $\in \{0, 1\}$, we have 4 possible cases: + - $act = 1$, $pred = 1$: true positive (TP) + - $act = 0$, $pred = 0$: true negative (TN) + - $act = 1$, $pred = 0$: false negative (FN) (output $pred = 0$, but it is + wrong) + - $act = 0$, $pred = 1$: false positive (FP) (output $pred = 1$, but it is + wrong) + +- Aggregate decision matrix in precision and recall +:::: +:::: {.column width=35%} + +```tikz +% Draw matrix +\draw[thick] (0,0) rectangle (4,4); +\draw[thick] (0,2) -- (4,2); % horizontal middle +\draw[thick] (2,0) -- (2,4); % vertical middle + +% Labels for actual class +\node[rotate=90] at (-0.8,3) {act = 1}; +\node[rotate=90] at (-0.8,1) {act = 0}; + +% Labels for predicted class +\node at (1,4.3) {pred = 1}; +\node at (3,4.3) {pred = 0}; + +% Cell labels +\node at (1,3) {\textbf{TP}}; +\node at (3,3) {\textbf{FN}}; +\node at (1,1) {\textbf{FP}}; +\node at (3,1) {\textbf{TN}}; +``` +:::: +::: + +- **Precision vs recall** +- Assume that $y = 1$ encodes the rare event we want to detect + +- **Precision** measures how often there is a true positive _given that pred = + 1_ + + \begingroup \small + + $$ + \text{precision} + \defeq \Pr(\text{TP} | \text{pred == 1}) + = \frac{|\text{pred == 1} \land \text{act == 1}|}{|\text{pred == 1}|} + = \frac{\text{TP}}{\text{TP} + \text{FP}} + $$ + + \endgroup + +- **Recall** measures how often there is a true positive _given that act = 1_ + \begingroup \small + + $$ + \text{recall} + \defeq \Pr(\text{TP} | \text{act == 1}) + = \frac{\text{TP}}{|\text{act == 1}|} + = \frac{\text{TP}}{\text{TP} + \text{FN}} + $$ + + \endgroup + +- Both are conditional probability measuring the fraction of TP under different + circumstances: + - (Pre)cision: pred = 1 + - Rec(a)ll: act = 1 + +- Precision/recall are 
widely used in information retrieval + - E.g., a search engine: + - Returns 30 pages; only 20 are relevant $\implies$ precision = 20 / 30 = 2 + / 3 + - Fails to return another 40 relevant pages $\implies$ recall = 20 / + (40 + 20) = 20 / 60 = 1 / 3 + +- **Precision / Recall in Terms of Quality / Quantity** +- **Precision** + - Increasing precision means when we predict 1, we are more likely to be right + - E.g., in a spam email detection system, "precision is 90%" means 90% of + the emails marked as spam are actually spam + - A higher precision indicates fewer false positives + - Measures "quality" of prediction + +- **Recall** + - Increasing recall means we predict more instances when the outcome is 1 + - E.g., in a spam email detection system, "recall is 80%" indicates 80% of + all actual spam emails were correctly identified as spam + - A higher recall means fewer false negatives + - Measures "quantity" of prediction (coverage) + +- **Precision / recall for trivial classifiers** +- A classifier that outputs always the most common class 0 has: + \begin{alignat*}{3} + \text{precision} + & = 0 + & \text{(since TP = 0)} + \\ + \text{recall} + & = 0 + & \text{(since TP = 0)} + \\ + \end{alignat*} +- A classifier that outputs always the rare class 1 has: + \begin{alignat*}{3} + \text{recall} + & = 1 + & \text{(since FN = 0)} + \\ + \text{precision} + & \defeq \Pr(\text{TP} | \text{pred == 1}) + & \text{(by definition)} + \\ + &= \frac{\text{TP}}{\text{TP + FP}} + & \text{($TP + FP = n$ because} + \\ + &= \frac{\#(y = 1)}{n} + & \text{classifier always emits 1)} + \\ + &= \Pr(\text{pos}) \approx 0 + & \text{(the positive class is very rare)} + \\ + \end{alignat*} +- A trivial classifier has precision or recall close to 0 + +- **Trading Off Precision and Recall** +- In theory, we want to increase both precision and recall + +- In practice, modify the threshold of a probabilistic classifier to trade off + precision and recall in practice + +- E.g., use logistic 
regression to predict cancer: + - With a threshold = 0.5, the classifier has: + - Precision = $\frac{\text{TP}}{|\text{pred == 1}|}$ + - Recall = $\frac{\text{TP}}{|\text{act == 1}|}$ + - Increase the threshold $\implies$ output 1 only if more confident, i.e., + increase precision + - Decrease the threshold $\implies$ output 1 more often, decreasing the + chances of missing a possible case of cancer, i.e., increase recall + + +- **Precision-Recall: Pros / Cons** +- Pros: + - Give insight on the behavior of a classifier (e.g., confusion matrix) + - Avoid mistaking a trivial classifier for a good classifier + +- Cons: + - We have two different numbers, thus it is difficult to compare classifiers + to each other + - Solutions: F-score, AUC + +- **Precision-Recall Curves** +- **Aka ROC curves** + +- Plot the curve on a precision-recall plane: ($y =$ precision, $1 - x =$ + recall) to show the precision vs recall trade-off for a classifier + - E.g., changing the threshold of logistic regression + +- A curve higher than another means a better classifier, since for the same + recall we can get a higher precision + - The best classifier (precision = recall = 1) is in the top-right corner + +- The precision-recall plot can have different shapes, e.g., + - Diagonal (pure luck) + - Convex up (better than luck) + - Convex down (worse than luck) + + +- **Area Under the Curve** +- **AUC** is the area under the precision-recall curve + - Provides a robust metric by integrating over all thresholds + - Higher AUC indicates better performance in differentiating between classes + - AUC = 0.5 suggests no discriminative power, similar to random guessing, + - AUC closer to 1.0 indicates high performance + +- **Pros**: + - Single number summarizing classifier behavior, useful for comparing + different models + - Does not require selecting a threshold for performance calculation + - Can handle imbalanced datasets effectively + +- E.g., consider a classifier for medical diagnosis + - The 
AUC helps understand how well the model distinguishes between patients + with and without a disease across all thresholds + +- **F-Score** +- The F-score is the harmonic mean of precision and recall: + + $$ + \text{F-score} + \defeq \frac{2}{\frac{1}{P} + \frac{1}{R}} + = 2 \frac{P \cdot R}{P + R} + $$ + +- **Interpretation:** + - Trivial classifiers: $P = 0$ or $R = 0$ $\implies$ F-score = 0 + - Perfect classifiers: $P = R = 1$ $\implies$ F-score = 1 + - For F-score to be large, both $P$ and $R$ must be high + +- Why not just averaging $P, R$? + - A classifier that always outputs 1 has $R = 1$ and $P = 0$ + - $\frac{P + R}{2} = \frac{1}{2}$, while we prefer a low value (ideally 0) + +## Model Selection + +- **Model Selection Problem** +- Model selection chooses the best model from a set of candidates based on + performance + - Needed when multiple hypotheses can explain the data + +- Certain parameters are fixed, while others need to be picked, e.g., + - Set of features + - E.g., selecting a subset of features from a dataset with 100 variables + - Learning algorithms + - E.g., deciding how to train a neural network + - Model types + - E.g., linear regression model vs. Support Vector Machine (SVM) + - Model complexity + - E.g., models with polynomials of degree $d < 10$ + - Values of the regularization parameter + - E.g., trying different values like 0.01, 0.1, and 1.0 + +- Evaluate model accuracy, precision, and recall +- Perform cross-validation to assess model performance +- Consider computational cost + - E.g., a simple logistic regression is faster than a complex neural network + +- **Model Selection Process** + +1. Split data into $D_{train}, D_{val}, D_{test}$ + - Commonly: 60\% training, 20\% validation, 20\% test + - Like splitting 80\% training between two learning phases + +2. Given $N$ hypotheses, learn on $D_{train}$ to get $g_1, ..., g_N$ + +3. Evaluate hypotheses on $D_{val}$ estimating errors + $E_{val}^{(1)}, ..., E_{val}^{(N)}$ + +4. 
Pick model $g_m$ with minimum $E_{val}^{(m)}$
+
+5. Use test set $D_{test}$ to estimate fair performance of model $g_m$, i.e.,
+   $E_{test} \approx E_{out}$
+
+6. Retrain model with entire $D = D_{train} \cup D_{val} \cup D_{test}$ to get
+   final $g_m^{*}$
+
+
+- **Model Selection as Learning**
+- "Picking the model with smallest $E_{val}$" is a form of learning:
+  - Hypothesis set: $\{g_1, ... , g_N\}$
+  - Training set: $D_{val}$
+  - Pick the best model $g_m$
+
+- After model selection
+  - Experimentally $E_{val}(g_m) < E_{out}(g_m)$, i.e., $E_{val}(g_m)$ is an
+    (optimistically) biased estimate of $E_{out}(g_m)$
+  - Theoretically:
+    - The penalty for model complexity with a finite set of hypotheses is
+      $$
+      E_{out}(g_m) \le E_{val}(g_m) + O(\sqrt{\log(N / K)})
+      $$
+    - Use VC dimension for an infinite number of hypotheses (e.g., choice of
+      $\lambda$ for regularization)
+
+
+## Aggregation
+
+- **Ensemble Learning: Intuition**
+- Ensemble learning combines multiple models to improve prediction accuracy
+  - **Idea**: a group of weak learners can form a strong learner
+
+- Combine outputs of models $X_i$ to build a model $X^*$ better than any $X_i$,
+  with the wisdom of all
+  - Utilizes diversity in model predictions to improve accuracy
+  - Each model contributes its unique perspective, reducing overfitting
+  - E.g., like a panel of voting experts
+
+- Example: in computer vision detecting a face is a difficult task (at least
+  circa 2010)
+  - Look for different features:
+    - Are there eyes?
+    - Is there a nose?
+    - Are eyes and nose in the correct position?
+    - ... 
+ - Each feature is weak per-se, but together they become reliable + +- **Ensemble Learning: Different Techniques** +- **Bagging** (bootstrap + aggregation) + - Reduces variance by averaging predictions from different models + - E.g., decision trees $\to$ bagging $\to$ random forest + - Bagging creates multiple versions of a decision tree (each trained on a + random sample of data) + - Average their predictions to improve accuracy + +- **Boosting** + - Reduces bias by focusing on errors made by previous models + - Sequentially adds models, each correcting its predecessor + - E.g., `adaBoost` increases weights of incorrectly classified data points to + learn the next model + +- **Stacking** + - Uses a meta-model to combine separate models using weights + - E.g., a stacking ensemble + - Uses a logistic regression as a meta-model + - Combines the predictions of other models (e.g., decision trees, support + vector machines, and neural networks) + +- **Ensemble Learning: Relation with Statistics** +- **Bagging** + - Improves performance by adding randomized variants (mimicking multiple + training sets) + - Reduce variance without affecting bias + +- **Boosting** + - Use another model to learn residuals, i.e., difference between predicted and + true values + - Related to the statistical technique of "forward stagewise additive models" + +- **Stacking** + - If we have 3 independent classifiers, each with $\Pr(\text{correct}) = 0.7$ + \begin{alignat*}{2} + \Pr(\text{majority correct}) + &= \Pr(\text{at least 2 classifiers correct}) \\ + &= {3 \choose 2} 0.7^2 0.3 + 0.7^3 \\ + &= 3 \times 0.7^2 \times 0.3 + 0.7^3 \\ + &\approx 0.78 > 0.7 + \end{alignat*} + +- **Ensemble learning: pros and cons** +- **Pros** + - Hypothesis set $\calH$ is increased by combining hypotheses from different + models + +- **Cons** + - More computationally intensive to train and evaluate + - Loss of interpretability + - Risk of overfitting (model complexity is increased) + - Ensemble learning 
contradicts Occam's razor, which advocates simplicity + +- **When Ensemble Learning Works** +- Combining multiple models with ensemble learning works when models: + - Are very different from each other + - Treat a reasonable percentage of the data correctly + - E.g., one cannot do much if all classifiers have 50% accuracy + - Complement each other: they are specialists in a part of the domain where + the others don't perform well + +- **How to Combine Outputs in Ensemble Learning** +- **Regression** + - Weighted average of prediction + - E.g., by accuracy of each model or by a prior + +- **Classification** + - Weighted vote of predicted classes + - It needs an odd number of models to break ties + +- **Probabilistic classification** + - Weighted average of class probabilities + +- We can also learn a meta-learner (stacking) to combine multiple models + +### Bagging + +- **Bagging** +- Bagging stands for "Bootstrap AGGregation" + +- **Learning procedure** + - Several training datasets are extracted randomly by sampling with + replacement from the original dataset (i.e., bootstrap) + - Learn multiple models, one for each training set + - Combine outputs using various methods + - Result is a better model than a single model + +- **Why bagging works?** + - From the bias-variance decomposition view, combining multiple models: + - Reduces the variance component + - Without compromising the bias (bagged models are typically unbiased) + - Bagging mimics extracting more training sets (though not independent) from + the unknown distribution + +- **Bagging and Instability in Learning Algorithms** +- Bagging works best with different models, especially non-linear models + +- Introduce randomization in the learning algorithm intentionally + +- **Decision Trees** + - Disable pruning + - Break ties randomly when selecting the best attribute to split + - E.g., bagging trees results in random forests + +- **Multilayer Perceptrons** + - Use different initial weights in 
backpropagation to reach different local + minima + +- **Nearest Neighbor Classifier** + - Use a random subset of features + - Resampling the training set has limited impact, as it is equivalent to + changing example weights + +### Boosting + +- **Boosting** +- Boosting builds models that complement each other + - Typically use homogeneous models, i.e., parametrized models from $\calH$ + +- Strong classifiers can be built from weak classifiers + - E.g., decision stumps = decision trees with one level + +- Statistical meaning of boosting: + - Boosting implements forward stagewise additive modeling + - Use another model to learn residuals (difference between predicted and true + values) + +- Boosting does not work for linear regression: + - Combination of linear models is still a linear model + - OLS finds optimal weights in one step + - Combining linear regressions from different attributes is equivalent to a + single multiple linear regression + +- **Adaboost.M1** +- Widely used for classification +- Assume examples can be weighted in the cost function used to learn + - Otherwise use resampling + +- **Learning procedure** + - Start with equal weights for all examples + - Iterate: + - Learn a classifier based on current weights for examples + - Weight the answer of each model by overall score (e.g., accuracy) or + probability + - Evaluate the ensemble + - Adjust weights for examples classified correctly/incorrectly + +### Stacking + +- **Stacking** +- Stacking learns how to combine models (not necessarily of the same type) + +- The problem is that with voting / averaging we don't know which model to trust +- Instead of voting or weighting we can use a meta-learner (level 1) to learn + how to pick / mix models (level 0) + +- **Learning procedure** + - Learn "level 0" models + - Learn "level 1" model using hold-out data from learning of level 0 models + (like in model selection) + - Build training data with predicted values from level 0 models + - Then learn level 1 + 
- Use a simple model for level 1 (e.g., linear models or trees) to avoid + overfitting + - Use probabilities from level 0, so level 1 can assess the confidence of + each model + +- **Boosting vs Bagging vs Stacking** + +\begingroup \scriptsize + +| **Aspect** | **Bagging** | **Boosting** | **Stacking** | +| ------------------------ | ----------------------------------- | ---------------------------------- | ---------------------------------------- | +| **Combines** | Models of the same type | Models of the same type | Models of different types | +| | | | +| **Learning** | Models trained independently | Iterative training | Models trained independently | +| **Predicting** | Uses uniform or data-driven weights | Uses learned weights from training | Uses learned weights or confidence | +| **Main Objective** | Reduce variance | Reduce bias | Improve generalization through diversity | +| **Base Learners** | Often strong learners | Often weak learners | Any model type (heterogeneous ensemble) | +| **Sensitivity to Noise** | Low | High | Medium | +| **Parallelizable** | Yes | No (sequential dependency) | Partially (base models parallelized) | +| **Meta-model** | Not used | Not used | Required | +| | | | +| **Examples** | Random Forest | AdaBoost, Gradient Boosting | Stacked Generalization, Blending | + +\endgroup diff --git a/dev_scripts_helpers/documentation/test/test_lint_notes.py b/dev_scripts_helpers/documentation/test/test_lint_notes.py index 3f769fd9f..d28d2a5de 100644 --- a/dev_scripts_helpers/documentation/test/test_lint_notes.py +++ b/dev_scripts_helpers/documentation/test/test_lint_notes.py @@ -190,7 +190,7 @@ def test_process3(self) -> None: # Good - - Good Time Management + - Good time management 1. 
Choose the right tasks - Avoid non-essential tasks @@ -245,13 +245,13 @@ def test_process_prettier_bug1(self) -> None: txt = self._get_text_problematic_for_prettier1() actual = hdocexec.prettier_on_str(txt, file_type="txt") expected = r""" - - Python Formatting + - Python formatting * Python has several built-in ways of formatting strings 1. `%` format operator 2. `format` and `str.format` - - `%` Format Operator + - `%` format operator * Text template as a format string - Values to insert are provided as a value or a `tuple` @@ -336,7 +336,9 @@ def _helper_process( file_name = os.path.join(self.get_scratch_space(), file_name) actual = dshdlino._process(txt, file_name) if expected: - expected = hprint.dedent(expected, remove_lead_trail_empty_lines_=True) + expected = hprint.dedent( + expected, remove_lead_trail_empty_lines_=True + ) self.assert_equal(actual, expected) return actual diff --git a/dev_scripts_helpers/documentation/test/test_preprocess_notes.py b/dev_scripts_helpers/documentation/test/test_preprocess_notes.py index 1d6646211..cdfd4d8e6 100644 --- a/dev_scripts_helpers/documentation/test/test_preprocess_notes.py +++ b/dev_scripts_helpers/documentation/test/test_preprocess_notes.py @@ -16,182 +16,15 @@ _LOG = logging.getLogger(__name__) -# TODO(gp): Pass through the function and not only executable. -def _run_preprocess_notes(in_file: str, out_file: str) -> str: - """ - Execute the end-to-end flow for `preprocess_notes.py` returning the output - as string. - """ - exec_path = hgit.find_file_in_git_tree("preprocess_notes.py") - hdbg.dassert_path_exists(exec_path) - # - hdbg.dassert_path_exists(in_file) - # - cmd = [] - cmd.append(exec_path) - cmd.append(f"--input {in_file}") - cmd.append(f"--output {out_file}") - cmd.append("--type pdf") - cmd_as_str = " ".join(cmd) - hsystem.system(cmd_as_str) - # Check. 
- actual = hio.from_file(out_file) - return actual # type: ignore - - -# ############################################################################# -# Test_process_color_commands1 -# ############################################################################# - - -class Test_process_color_commands1(hunitest.TestCase): - def test_text_content1(self) -> None: - """ - Test with plain text content. - """ - txt_in = r"\red{Hello world}" - expected = r"\textcolor{red}{\text{Hello world}}" - actual = hmarkdo.process_color_commands(txt_in) - self.assert_equal(actual, expected) - - def test_math_content1(self) -> None: - """ - Test color command with mathematical content. - """ - txt_in = r"\blue{x + y = z}" - expected = r"\textcolor{blue}{x + y = z}" - actual = hmarkdo.process_color_commands(txt_in) - self.assert_equal(actual, expected) - - def test_multiple_colors1(self) -> None: - """ - Test multiple color commands in the same line. - """ - txt_in = r"The \red{quick} \blue{fox} \green{jumps}" - expected = r"The \textcolor{red}{\text{quick}} \textcolor{blue}{\text{fox}} \textcolor{darkgreen}{\text{jumps}}" - actual = hmarkdo.process_color_commands(txt_in) - self.assert_equal(actual, expected) - - def test_mixed_content1(self) -> None: - """ - Test color commands with both text and math content. - """ - txt_in = r"\red{Result: x^2 + y^2}" - expected = r"\textcolor{red}{Result: x^2 + y^2}" - actual = hmarkdo.process_color_commands(txt_in) - self.assert_equal(actual, expected) - - def test_nested_braces1(self) -> None: - """ - Test color command with nested braces. 
- """ - txt_in = r"\blue{f(x) = {x + 1}}" - expected = r"\textcolor{blue}{f(x) = {x + 1}}" - actual = hmarkdo.process_color_commands(txt_in) - self.assert_equal(actual, expected) - - -# ############################################################################# -# Test_colorize_bullet_points1 -# ############################################################################# - - -@pytest.mark.skip(reason="Broken for now") -class Test_colorize_bullet_points1(hunitest.TestCase): - def helper(self, txt_in: str, expected: str) -> None: - """ - Test colorize bullet points. - """ - txt_in = hprint.dedent(txt_in) - actual = hmarkdo.colorize_bullet_points(txt_in) - expected = hprint.dedent(expected) - self.assert_equal(actual, expected) - - def test1(self) -> None: - """ - Test colorize bullet points. - """ - txt_in = r""" - - **VC Theory** - - Measures model - - - **Bias-Variance Decomposition** - - Prediction error - - **Bias** - - **Variance** - - - **Computation Complexity** - - Balances model - - Related to - - E.g., Minimum - - - **Bayesian Approach** - - Treats ML as probability - - Combines prior knowledge with observed data to update belief about a model - - - **Problem in ML Theory:** - - Assumptions may not align with practical problems - """ - expected = r""" - - **\red{VC Theory}** - - Measures model - - - **\orange{Bias-Variance Decomposition}** - - Prediction error - - **\yellow{Bias}** - - **\lime{Variance}** - - - **\green{Computation Complexity}** - - Balances model - - Related to - - E.g., Minimum - - - **\teal{Bayesian Approach}** - - Treats ML as probability - - Combines prior knowledge with observed data to update belief about a model - - - **\cyan{Problem in ML Theory:}** - - Assumptions may not align with practical problems - """ - self.helper(txt_in, expected) - - -# ############################################################################# -# Test_preprocess_notes1 -# ############################################################################# 
- - -@pytest.mark.skipif( - hserver.is_inside_ci() or hserver.is_dev_csfy(), - reason="Disabled because of CmampTask10710", -) -class Test_preprocess_notes1(hunitest.TestCase): - """ - Test `preprocess_notes.py` using the executable and checked in files. - """ - - def test1(self) -> None: - self._helper() - - def _helper(self) -> None: - # Set up. - in_file = os.path.join(self.get_input_dir(), "input1.txt") - out_file = os.path.join(self.get_scratch_space(), "output.txt") - # Run. - actual = _run_preprocess_notes(in_file, out_file) - # Check. - self.check_string(actual) - - # ############################################################################# # Test_process_question1 # ############################################################################# -@pytest.mark.skipif( - hserver.is_inside_ci() or hserver.is_dev_csfy(), - reason="Disabled because of CmampTask10710", -) +# @pytest.mark.skipif( +# hserver.is_inside_ci() or hserver.is_dev_csfy(), +# reason="Disabled because of CmampTask10710", +# ) class Test_process_question1(hunitest.TestCase): """ Check that the output of `preprocess_notes.py` is the expected one calling @@ -201,45 +34,43 @@ class Test_process_question1(hunitest.TestCase): def test_process_question1(self) -> None: txt_in = "* Hope is not a strategy" do_continue_exp = True - expected = "- **Hope is not a strategy**" - self._helper_process_question(txt_in, do_continue_exp, expected) + exp = "- **Hope is not a strategy**" + self.helper(txt_in, do_continue_exp, exp) def test_process_question2(self) -> None: txt_in = "** Hope is not a strategy" do_continue_exp = True - expected = "- **Hope is not a strategy**" - self._helper_process_question(txt_in, do_continue_exp, expected) + exp = "- **Hope is not a strategy**" + self.helper(txt_in, do_continue_exp, exp) def test_process_question3(self) -> None: txt_in = "*: Hope is not a strategy" do_continue_exp = True - expected = "- **Hope is not a strategy**" - self._helper_process_question(txt_in, 
do_continue_exp, expected)
+        exp = "- **Hope is not a strategy**"
+        self.helper(txt_in, do_continue_exp, exp)
 
     def test_process_question4(self) -> None:
         txt_in = "- Systems don't run themselves, they need to be run"
         do_continue_exp = False
-        expected = txt_in
-        self._helper_process_question(txt_in, do_continue_exp, expected)
+        exp = txt_in
+        self.helper(txt_in, do_continue_exp, exp)
 
     def test_process_question5(self) -> None:
         space = " "
         txt_in = "*" + space + "Hope is not a strategy"
         do_continue_exp = True
-        expected = "-" + space + "**Hope is not a strategy**"
-        self._helper_process_question(txt_in, do_continue_exp, expected)
+        exp = "-" + space + "**Hope is not a strategy**"
+        self.helper(txt_in, do_continue_exp, exp)
 
     def test_process_question6(self) -> None:
         space = " "
         txt_in = "**" + space + "Hope is not a strategy"
         do_continue_exp = True
-        expected = "-" + " " * len(space) + "**Hope is not a strategy**"
-        self._helper_process_question(txt_in, do_continue_exp, expected)
+        exp = "-" + " " * len(space) + "**Hope is not a strategy**"
+        self.helper(txt_in, do_continue_exp, exp)
 
-    def _helper_process_question(
-        self, txt_in: str, do_continue_exp: bool, expected: str
-    ) -> None:
-        do_continue, actual = dshdprno._process_question_to_markdown(txt_in)
+    def helper(self, txt_in: str, do_continue_exp: bool, exp: str) -> None:
+        do_continue, act = dshdprno._process_question_to_markdown(txt_in)
         self.assertEqual(do_continue, do_continue_exp)
-        self.assert_equal(actual, expected)
+        self.assert_equal(act, exp)
 @@ -249,17 +80,19 @@ def _helper_process_question(
 # #############################################################################
 
 
-@pytest.mark.skipif(
-    hserver.is_inside_ci() or hserver.is_dev_csfy(),
-    reason="Disabled because of CmampTask10710",
-)
-class Test_preprocess_notes3(hunitest.TestCase):
+# @pytest.mark.skipif(
+#     hserver.is_inside_ci() or hserver.is_dev_csfy(),
+#     reason="Disabled because of CmampTask10710",
+# )
+class Test_preprocess_notes_end_to_end1(hunitest.TestCase):
     """
-    Check 
that the output of `preprocess_notes.py` is the expected one calling
-    the library function directly.
+    Test `preprocess_notes.py` by calling the library function directly.
     """
 
     def test_run_all1(self) -> None:
+        """
+        Test type_="pdf".
+        """
         # Prepare inputs.
         txt_in = r"""
         # #############################################################################
@@ -310,5 +143,86 @@ def _is_integer(value):
             print(v)
         ```
         """
-        expected = hprint.dedent(expected, remove_lead_trail_empty_lines_=True)
-        self.assert_equal(actual, expected)
+        exp = hprint.dedent(exp, remove_lead_trail_empty_lines_=True)
+        self.assert_equal(act, exp)
+
+    def test_run_all2(self) -> None:
+        """
+        Test type_="slides".
+        """
+        # Prepare inputs.
+        in_file = os.path.join(self.get_input_dir(), "input.txt")
+        # Read the notes from disk: `_transform_lines()` expects the file
+        # content, not the path (dedenting the path string was a no-op bug).
+        txt_in = hio.from_file(in_file)
+        # Run function.
+        type_ = "slides"
+        act = dshdprno._transform_lines(txt_in, type_, is_qa=False)
+        # Check.
+        self.check_string(act)
+
+
+# #############################################################################
+# Test_preprocess_notes_executable1
+# #############################################################################
+
+
+@pytest.mark.skipif(
+    hserver.is_inside_ci() or hserver.is_dev_csfy(),
+    reason="Disabled because of CmampTask10710",
+)
+class Test_preprocess_notes_executable1(hunitest.TestCase):
+    """
+    Test `preprocess_notes.py` using the executable and checked in files.
+    """
+
+    @staticmethod
+    def helper(in_file: str, out_file: str, type_: str) -> str:
+        """
+        Execute the end-to-end flow for `preprocess_notes.py` returning the output
+        as string.
+        """
+        hdbg.dassert_path_exists(in_file)
+        # Find executable.
+        exec_path = hgit.find_file_in_git_tree("preprocess_notes.py")
+        hdbg.dassert_path_exists(exec_path)
+        # Prepare command.
+        cmd = []
+        cmd.append(exec_path)
+        cmd.append(f"--input {in_file}")
+        cmd.append(f"--output {out_file}")
+        cmd.append(f"--type {type_}")
+        cmd_as_str = " ".join(cmd)
+        # Run. 
+ hsystem.system(cmd_as_str) + # Check. + act = hio.from_file(out_file) + return act # type: ignore + + def test1(self) -> None: + # Prepare inputs. + in_file = os.path.join(self.get_input_dir(), "input1.txt") + out_file = os.path.join(self.get_scratch_space(), "output.txt") + type_ = "pdf" + # Run. + act = self.helper(in_file, out_file, type_) + # Check. + self.check_string(act) + + def test2(self) -> None: + # Prepare inputs. + in_file = os.path.join(self.get_input_dir(), "input1.txt") + out_file = os.path.join(self.get_scratch_space(), "output.txt") + type_ = "pdf" + # Run. + act = self.helper(in_file, out_file, type_) + # Check. + self.check_string(act) + + def test3(self) -> None: + # Prepare inputs. + in_file = os.path.join(self.get_input_dir(), "input1.txt") + out_file = os.path.join(self.get_scratch_space(), "output.txt") + type_ = "pdf" + # Run. + act = self.helper(in_file, out_file, type_) + # Check. + self.check_string(act) diff --git a/dev_scripts_helpers/documentation/test/test_render_images.py b/dev_scripts_helpers/documentation/test/test_render_images.py index e60a1486b..ff791189d 100644 --- a/dev_scripts_helpers/documentation/test/test_render_images.py +++ b/dev_scripts_helpers/documentation/test/test_render_images.py @@ -103,7 +103,10 @@ def test2(self) -> None: image code type. """ # Prepare inputs. - image_code = "digraph { B -> A }" + image_code = """ + graph TD + B --> A + """ image_code_idx = 1 image_code_type = "mermaid" template_out_file = os.path.join(self.get_scratch_space(), "test.md") diff --git a/dev_scripts_helpers/llms/ai_review.py b/dev_scripts_helpers/llms/ai_review.py index f55e0c7a5..e9250ba7f 100755 --- a/dev_scripts_helpers/llms/ai_review.py +++ b/dev_scripts_helpers/llms/ai_review.py @@ -80,9 +80,10 @@ def _main(parser: argparse.ArgumentParser) -> None: ) # Run post-transforms outside the container. 
if not args.skip_post_transforms: + compare = False out_txt = dshlllut.run_post_transforms( args.prompt, - args.compare, + compare, in_file_name, tmp_in_file_name, tmp_out_file_name, diff --git a/dev_scripts_helpers/llms/llm_transform.py b/dev_scripts_helpers/llms/llm_transform.py index 7f0ae299c..46d0e5dc1 100755 --- a/dev_scripts_helpers/llms/llm_transform.py +++ b/dev_scripts_helpers/llms/llm_transform.py @@ -83,6 +83,7 @@ def _parse() -> argparse.ArgumentParser: return parser +# TODO(gp): Make it public and move it to `hdockerized_executables.py`. def _run_dockerized_llm_transform( in_file_path: str, cmd_opts: List[str], diff --git a/docs/code_guidelines/all.coding_style_guidelines.reference.md b/docs/code_guidelines/all.coding_style_guidelines.reference.md index 263ff527e..7ddf92eda 100644 --- a/docs/code_guidelines/all.coding_style_guidelines.reference.md +++ b/docs/code_guidelines/all.coding_style_guidelines.reference.md @@ -414,9 +414,6 @@ - Use `isinstance()` instead of `type()` to check the type of an object - Good: `if isinstance(obj, str):` - Bad: `if type(obj) == str:` -- Do not use `import *` - - Good: `from math import sqrt, pi` - - Bad: `from math import *` - Do not use `from ... 
import ...`, unless it is the `typing` package, e.g., `from typing import Iterable, List` - Good: `from typing import Dict, Tuple` diff --git a/docs/tools/all.ai_review.how_to_guide.md b/docs/tools/all.ai_review.how_to_guide.md index 16fa10013..989ed704d 100644 --- a/docs/tools/all.ai_review.how_to_guide.md +++ b/docs/tools/all.ai_review.how_to_guide.md @@ -27,26 +27,26 @@ - Apply modifications from a `cfile` to a set of files - E.g., from linter and AI review - Add TODOs from a `cfile` to Python or markdown files - - Apply a set of transformations to an entire file + - Apply a set of transformations to an entire Python file - E.g., styling / formatting code - Rewrite an entire markdown to fix English mistakes without changing its structure - - Reformat an entire markdown or Python using LLMs or code + - E.g., styling / formatting a markdown -- You should always commit your code before applying automatic transforms (e.g., - linting) +- You should always commit your code before applying automatic transforms, in the + same way that we run the `linter` on a clean tree - In this way, modifying a file is a separate commit and it's easy to review # Use templates -- We use templates for code and documentation to show and describe how a - document or code should look like, e.g., +- We use templates for code and documentation to show and describe how a document + or code should look like, e.g., - `code_template.py` shows our coding style - `unit_test_template.py` shows how our unit tests look like - - `all.how_to_guide_template_doc.md` shows how a Diataxis how to guide should be - structured and look like (same for `explanation`, `tutorial`, `reference`) + - `all.how_to_guide_template_doc.md` shows how a Diataxis how to guide should + be structured and look like (same for `explanation`, `tutorial`, `reference`) -- The same template can have multiple applications for: +- The same templates can have multiple applications for: - Humans: - Understand how to write documentation 
and code - As boilerplate @@ -108,7 +108,45 @@ ## `ai_review.py` -./docs/code_guidelines/all.coding_style_guidelines.reference.md +- The rules for AI are saved in the file + ./docs/code_guidelines/all.coding_style_guidelines.reference.md +- This file has a special structure: + ```bash + > extract_headers_from_markdown.py -i ./docs/code_guidelines/all.coding_style_guidelines.reference.md --max_level 2 + - All Style Guide + - Summary + - General + - Spelling + - Python + - Naming + - Docstrings + - Comments + - Code Implementation + - Code Design + - Imports + - Type Annotations + - Functions + - Scripts + - Logging + - Misc + - Unit Tests + - Rules + - Notebooks + - General + - Plotting + - Jupytext + - Markdown + - General + - Headers + - Text + ``` + - The first level represents the target language (e.g. `General`, `Python`) + - The second level represents a rule topic (e.g., `Imports`, `Functions`) + - The third level represents instructions for an LLM vs Linter, since some + instructions: + - Are easier to enforce by an LLM + - While others should be enforced by the `linter` (even if they are temporary not + enforced by the `linter` but by LLM or by humans) ## `inject_todos.py` @@ -137,10 +175,10 @@ ## A reviewer workflow - This workflow can be used by the author of the code directly or by a reviewer + - Initially, reviewers use these tools as part of dogfooding of the workflows - The goal is to make these tools robust enough so that they can be used directly by the author and potentially integrated in the `linter` flow itself - - Initially, reviewers use these tools as part of dogfooding of the workflows - Go to the Git branch with the code to review diff --git a/helpers/hcache_simple.py b/helpers/hcache_simple.py index 9931b3a2a..c45645886 100644 --- a/helpers/hcache_simple.py +++ b/helpers/hcache_simple.py @@ -272,7 +272,7 @@ def _save_cache_dict_to_disk(func_name: str, data: Dict) -> None: pickle.dump(data, file) elif cache_type == "json": with 
open(file_name, "w", encoding="utf-8") as file: - json.dump(data, file) + json.dump(data, file, indent=4, sort_keys=True, ensure_ascii=False) else: raise ValueError(f"Invalid cache type '{cache_type}'") diff --git a/helpers/hllm.py b/helpers/hllm.py index 37cb2285f..06c77521d 100644 --- a/helpers/hllm.py +++ b/helpers/hllm.py @@ -489,7 +489,7 @@ def get_completion( :param model: model to use or empty string to use the default model :param report_progress: whether to report progress running the API call - :param cache_mode : "DISABLE_CACHE","REFRESH_CACHE", "HIT_CACHE_OR_ABORT", "NORMAL" + :param cache_mode: - "DISABLE_CACHE": No caching - "REFRESH_CACHE": Make API calls and save responses to cache - "HIT_CACHE_OR_ABORT": Use cached responses, fail if not in cache @@ -508,7 +508,6 @@ def get_completion( update_llm_cache = get_update_llm_cache() if update_llm_cache: cache_mode = "REFRESH_CACHE" - llm_client = LLMClient(model=model) llm_client.create_client() # Construct messages in OpenAI API request format. 
diff --git a/helpers/hmarkdown.py b/helpers/hmarkdown.py index 98ff2fbf3..c7babad0f 100644 --- a/helpers/hmarkdown.py +++ b/helpers/hmarkdown.py @@ -13,3 +13,4 @@ from helpers.hmarkdown_headers import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import from helpers.hmarkdown_rules import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import from helpers.hmarkdown_slides import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import +from helpers.hmarkdown_toc import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import diff --git a/helpers/hmarkdown_coloring.py b/helpers/hmarkdown_coloring.py index ec989ecf9..4cfe8cdaf 100644 --- a/helpers/hmarkdown_coloring.py +++ b/helpers/hmarkdown_coloring.py @@ -6,6 +6,7 @@ import logging import re +from typing import List, Optional import helpers.hdbg as hdbg from helpers.hmarkdown_fenced_blocks import ( @@ -15,35 +16,55 @@ _LOG = logging.getLogger(__name__) -# TODO(gp): Add a decorator like in hprint to process both strings and lists -# of strings. # ############################################################################# # Colorize # ############################################################################# # Define colors and their LaTeX equivalents. 
-_COLORS = { +_MD_COLORS_LATEX_MAPPING = { "red": "red", "orange": "orange", - # "yellow": "yellow", - # "lime": "lime", - # + "yellow": "yellow", + "lime": "lime", "green": "darkgreen", "teal": "teal", "cyan": "cyan", "blue": "blue", - # "purple": "purple", + "purple": "purple", "violet": "violet", "magenta": "magenta", - # "pink": "pink", + "pink": "pink", "brown": "brown", "olive": "olive", "gray": "gray", "darkgray": "darkgray", - # "lightgray": "lightgray", - # "black": "black", - # "white": "white", + "lightgray": "lightgray", + "black": "black", + "white": "white", +} + + +_MD_COLORS = { + "red", + "orange", + # "yellow", + # "lime", + "green", + "teal", + "cyan", + "blue", + # "purple", + "violet", + "magenta", + # "pink", + "brown", + "olive", + "gray", + "darkgray", + # "lightgray", + # "black", + # "white", } @@ -60,11 +81,12 @@ def process_color_commands(in_line: str) -> str: :param in_line: input line to process :return: line with color commands transformed """ - for color, value in _COLORS.items(): - # This regex matches LaTeX color commands like \red{content}, \blue{content}, etc. + for md_color, latex_color in _MD_COLORS_LATEX_MAPPING.items(): + # This regex matches color commands like \red{content}, \blue{content}, + # etc. pattern = re.compile( rf""" - \\{color} # Match the color command (e.g., \red, \blue, etc.). + \\{md_color} # Match the color command (e.g., \red, \blue, etc.). \{{ # Match the opening curly brace. ([^}}]*) # Capture everything inside the curly braces. \}} # Match the closing curly brace. @@ -72,32 +94,36 @@ def process_color_commands(in_line: str) -> str: re.VERBOSE, ) - def _replacement(match: re.Match, value: str) -> str: + def _replacement(match: re.Match, latex_color: str) -> str: content = match.group(1) - # Check if content appears to be math expression. 
- is_math = any(c in content for c in "+-*/=<>{}[]()^_") - if is_math: - ret = rf"\textcolor{{{value}}}{{{content}}}" + # Check if content appears to be a math expression, otherwise wrap + # it in `\text{}`. + is_math_expr = any(c in content for c in "+-*/=<>{}[]()^_") + if is_math_expr: + ret = rf"\textcolor{{{latex_color}}}{{{content}}}" else: - ret = rf"\textcolor{{{value}}}{{\text{{{content}}}}}" + ret = rf"\textcolor{{{latex_color}}}{{\text{{{content}}}}}" return ret # Replace the color command with the LaTeX color command. - in_line = re.sub(pattern, lambda m: _replacement(m, value), in_line) + in_line = re.sub( + pattern, lambda m: _replacement(m, latex_color), in_line + ) return in_line -def has_color_command(line: str) -> bool: +def has_color_command(text: str) -> bool: """ Check if line contains any color commands. :param line: line to check :return: whether the line contains color commands """ - hdbg.dassert_isinstance(line, str) + hdbg.dassert_isinstance(text, str) # hdbg.dassert_not_in("\n", line) - for color in _COLORS.keys(): - # This regex matches LaTeX color commands like \red{content}, \blue{content}, etc. + for color in _MD_COLORS_LATEX_MAPPING.keys(): + # This regex matches LaTeX color commands like \red{content}, + # \blue{content}, etc. pattern = re.compile( rf""" \\{color} # Match the color command (e.g., \red, \blue, etc.). @@ -107,7 +133,7 @@ def has_color_command(line: str) -> bool: """, re.VERBOSE, ) - if re.search(pattern, line): + if re.search(pattern, text): return True return False @@ -115,17 +141,26 @@ def has_color_command(line: str) -> bool: # TODO(gp): -> List[str] # TODO(gp): Use hmarkdown.process_lines() and test it. def colorize_bullet_points_in_slide( - txt: str, *, use_abbreviations: bool = True + txt: str, + *, + use_abbreviations: bool = True, + interpolate_colors: bool = False, + all_md_colors: Optional[List[str]] = None, ) -> str: - """ + r""" Colorize bold text in a given string. 
     :param txt: text to colorize
     :param use_abbreviations: use abbreviations for the colors like
         `\red{foo}` instead of `\textcolor{red}{foo}`
+    :param interpolate_colors: interpolate the colors to use for the
+        bold items instead of using a fixed set of colors
+    :param all_md_colors: list of colors to use for the bold items
     :return: colored text
     """
     hdbg.dassert_isinstance(txt, str)
+    if all_md_colors is None:
+        all_md_colors = list(_MD_COLORS)
     # Replace fenced code blocks with tags.
     lines = txt.split("\n")
     lines, fence_map = replace_fenced_blocks_with_tags(lines)
@@ -144,12 +179,31 @@ def colorize_bullet_points_in_slide(
     # want to count `**bold**` as 1.
     hdbg.dassert_eq(tot_bold % 2, 0, "tot_bold=%s needs to be even", tot_bold)
     num_bolds = tot_bold // 2
+    # Use the colors in the order of the list of colors.
-    hdbg.dassert_lte(num_bolds, len(_COLORS))
-    # Sample num_bolds colors evenly spaced from the available colors
-    step = len(_COLORS) // num_bolds
-    colors = list(_COLORS.keys())[::step][:num_bolds]
+    def _interpolate_colors(num_bolds: int) -> List[str]:
+        """
+        Sample `num_bolds` colors evenly spaced from the available colors.
+        """
+        step = len(all_md_colors) // num_bolds
+        colors = list(all_md_colors)[::step][:num_bolds]
+        return colors
+
+    if interpolate_colors:
+        colors = _interpolate_colors(num_bolds)
+    else:
+        if num_bolds == 1:
+            colors = ["red"]
+        elif num_bolds == 2:
+            colors = ["red", "blue"]
+        elif num_bolds == 3:
+            colors = ["red", "green", "blue"]
+        elif num_bolds == 4:
+            colors = ["red", "green", "blue", "violet"]
+        else:
+            colors = _interpolate_colors(num_bolds)
     _LOG.debug("colors=%s", colors)
+    hdbg.dassert_lte(num_bolds, len(colors))
     # Colorize the bold items.
color_idx = 0 txt_out = [] @@ -163,11 +217,13 @@ def color_replacer(match: re.Match[str]) -> str: text = match.group(1) hdbg.dassert_lte(color_idx, len(colors)) color_to_use = colors[color_idx] + hdbg.dassert_in(color_to_use, _MD_COLORS_LATEX_MAPPING) + latex_color = _MD_COLORS_LATEX_MAPPING[color_to_use] color_idx += 1 if use_abbreviations: ret = f"**\\{color_to_use}{{{text}}}**" else: - ret = f"**\\textcolor{{{color_to_use}}}{{{text}}}**" + ret = f"**\\textcolor{{{latex_color}}}{{{text}}}**" return ret line = re.sub(r"\*\*([^*]+)\*\*", color_replacer, line) diff --git a/helpers/hmarkdown_rules.py b/helpers/hmarkdown_rules.py index 7117af985..1ecc1ed1e 100644 --- a/helpers/hmarkdown_rules.py +++ b/helpers/hmarkdown_rules.py @@ -286,6 +286,7 @@ def extract_rules( return rule_sections +# TODO(gp): This seems private? def parse_rules_from_txt(txt: str) -> List[str]: """ Parse rules from a chunk of markdown text. @@ -333,25 +334,26 @@ def parse_rules_from_txt(txt: str) -> List[str]: return bullet_points -def extract_rules_from_section(txt: str, line_number: int) -> List[str]: +def extract_rules_from_section(txt: str, start_line_number: int) -> List[str]: """ Extract rules from a section of a markdown file. :param txt: markdown text to extract the rules from - :param line_number: line number of the section to start extracting + :param start_line_number: line number of the section to start extracting the rules from :return: extracted rules """ # Find the line number of the next header. - i = line_number + end_line_number = start_line_number while True: - hdbg.dassert_lt(i, len(txt)) - line = txt[i] + hdbg.dassert_lt(end_line_number, len(txt)) + line = txt[end_line_number] if line.startswith("#"): break - i += 1 + end_line_number += 1 + _LOG.debug("end_line_number=%s", end_line_number) # Parse the markdown text into a list of bullet points. 
- bullet_points = parse_rules_from_txt(txt) + bullet_points = parse_rules_from_txt(txt[start_line_number:end_line_number]) # Extract the rules from the bullet points. rules = [] for bullet_point in bullet_points: diff --git a/helpers/hmarkdown_slides.py b/helpers/hmarkdown_slides.py index 2121ff736..df3e8074d 100644 --- a/helpers/hmarkdown_slides.py +++ b/helpers/hmarkdown_slides.py @@ -57,7 +57,7 @@ def process_slides(txt: str, transform: Callable[[List[str]], List[str]]) -> str # 2) Process slide. if _TRACE: _LOG.debug(" -> %s", hprint.to_str("in_slide")) - if line.startswith("* "): + if line.startswith("* ") or line.startswith("#### "): _LOG.debug("### Found slide") # Found a slide or the end of the file. if slide_txt: diff --git a/helpers/hmarkdown_tables.py b/helpers/hmarkdown_tables.py new file mode 100644 index 000000000..c73a0716f --- /dev/null +++ b/helpers/hmarkdown_tables.py @@ -0,0 +1,120 @@ +""" +Import as: + +import helpers.hmarkdown_tables as hmarktab +""" + +import logging +from typing import Dict, List, Tuple + +import helpers.hdbg as hdbg + +_LOG = logging.getLogger(__name__) + + +def replace_tables_with_tags( + lines: List[str], +) -> Tuple[List[str], Dict[str, str]]: + """ + Replace markdown tables with tag and return mapping from tags to the table. + + E.g., + ``` + Some text before + | Column 1 | Column 2 | + |----------|----------| + | Value 1 | Value 2 | + | Value 3 | Value 4 | + More text after + ``` + is replaced with: + ``` + Some text before + + More text after + ``` + + :param lines: list of lines to process + :return: tuple containing: + - list of lines with the tables replaced by tags + - mapping from tags to the table text + """ + hdbg.dassert_isinstance(lines, list) + result = [] + table_map = {} + table_count = 0 + i = 0 + while i < len(lines): + line = lines[i].strip() + # Check if this line starts a table (contains |). + if "|" in line and line.strip(): + # Look ahead to see if next line is a separator. 
+            if i + 1 < len(lines):
+                next_line = lines[i + 1].strip()
+                # Check if next line is a table separator (contains --- and |).
+                if "|" in next_line and "-" in next_line:
+                    # Found a table, collect all table lines.
+                    table_lines = []
+                    # Add header line.
+                    table_lines.append(lines[i])
+                    i += 1
+                    # Add separator line.
+                    table_lines.append(lines[i])
+                    i += 1
+                    # Add data rows (continue while lines contain |).
+                    while (
+                        i < len(lines)
+                        and "|" in lines[i].strip()
+                        and lines[i].strip()
+                    ):
+                        table_lines.append(lines[i])
+                        i += 1
+                    # Store the table.
+                    table_count += 1
+                    table_text = "\n".join(table_lines)
+                    table_map[str(table_count)] = table_text
+                    result.append(f"<table{table_count}>")
+                    continue
+        # Not a table line, add as-is.
+        result.append(lines[i])
+        i += 1
+    return result, table_map
+
+
+def replace_tags_with_tables(
+    lines: List[str], table_map: Dict[str, str]
+) -> List[str]:
+    """
+    Replace <table> tags with markdown tables.
+
+    :param lines: list of lines to process
+    :param table_map: mapping from tags to table text
+    :return: list of lines with tags replaced by tables
+    """
+    hdbg.dassert_isinstance(lines, list)
+    hdbg.dassert_isinstance(table_map, dict)
+    result = []
+    table_map_copy = table_map.copy()
+
+    for line in lines:
+        if line.startswith("<table"):
+            # Extract table number from tag like <table1>.
+            tag_match = line[6:-1]  # Remove '<table' and '>'
+            hdbg.dassert_in(
+                tag_match, table_map_copy, f"Found unmatched tag {tag_match}"
+            )
+            # Split table text into lines and add them.
+            table_text = table_map_copy[tag_match]
+            table_lines = table_text.split("\n")
+            result.extend(table_lines)
+            # Remove used tag from map.
+            del table_map_copy[tag_match]
+        else:
+            result.append(line)
+    # Ensure all tags were used.
+    hdbg.dassert_eq(
+        len(table_map_copy),
+        0,
+        f"Found {len(table_map_copy)} unmatched tags: {list(table_map_copy.keys())}",
+    )
+    return result
diff --git a/helpers/hmarkdown_toc.py b/helpers/hmarkdown_toc.py
new file mode 100644
index 000000000..a7600aceb
--- /dev/null
+++ b/helpers/hmarkdown_toc.py
@@ -0,0 +1,25 @@
+"""
+Import as:
+
+import helpers.hmarkdown_toc as hmarkdo
+"""
+
+import re
+
+
+def remove_table_of_contents(txt: str) -> str:
+    """
+    Remove the table of contents from the text of a markdown file.
+
+    The table of contents is stored between
+    ```
+    <!-- toc -->
+    ...
+    <!-- tocstop -->
+    ```
+
+    :param txt: Input markdown text
+    :return: Text with table of contents removed
+    """
+    txt = re.sub(r"<!-- toc -->.*?<!-- tocstop -->", "", txt, flags=re.DOTALL)
+    return txt
diff --git a/helpers/hmkdocs.py b/helpers/hmkdocs.py
index 4a7a2decf..fa90e67eb 100644
--- a/helpers/hmkdocs.py
+++ b/helpers/hmkdocs.py
@@ -7,25 +7,7 @@
 import re
 
 import helpers.hdbg as hdbg
-
-
-# TODO(gp): -> hamrkdown_toc.py
-def remove_table_of_contents(txt: str) -> str:
-    """
-    Remove the table of contents from the text of a markdown file.
-
-    The table of contents is stored between
-    ```
-    <!-- toc -->
-    ...
-    <!-- tocstop -->
-    ```
-
-    :param txt: Input markdown text
-    :return: Text with table of contents removed
-    """
-    txt = re.sub(r"<!-- toc -->.*?<!-- tocstop -->", "", txt, flags=re.DOTALL)
-    return txt
+import helpers.hmarkdown as hmarkdo
 
 
 # TODO(gp): -> hmarkdown_?.py
@@ -119,7 +101,7 @@ def preprocess_mkdocs_markdown(txt: str) -> str:
     :return: Preprocessed markdown text
     """
     # Apply all preprocessing steps.
- txt = remove_table_of_contents(txt) + txt = hmarkdo.remove_table_of_contents(txt) txt = dedent_python_code_blocks(txt) txt = replace_indentation_with_four_spaces(txt) return txt diff --git a/helpers/hplayback.py b/helpers/hplayback.py index 859c5ec71..9d5a9e7d6 100644 --- a/helpers/hplayback.py +++ b/helpers/hplayback.py @@ -264,8 +264,12 @@ def _check_code(self, func_output: Any) -> None: self._append("expected = jsonpickle.decode(expected)", 2) if isinstance(func_output, (pd.DataFrame, pd.Series)): - self._append("actual = hpandas.df_to_str(actual, num_rows=None)", 2) - self._append("expected = hpandas.df_to_str(expected, num_rows=None)", 2) + self._append( + "actual = hpandas.df_to_str(actual, num_rows=None)", 2 + ) + self._append( + "expected = hpandas.df_to_str(expected, num_rows=None)", 2 + ) self._append("# Compare actual and expected output.", 2) self._append("self.assertEqual(actual, expected)", 2) else: diff --git a/helpers/hunit_test.py b/helpers/hunit_test.py index d57cf3068..4b8534ded 100644 --- a/helpers/hunit_test.py +++ b/helpers/hunit_test.py @@ -674,7 +674,9 @@ def assert_equal( values: Dict[str, str] = collections.OrderedDict() def _append(tag: str, actual: str, expected: str) -> None: - _LOG.debug("tag=%s\n actual='\n%s'\n expected='\n%s'", tag, actual, expected) + _LOG.debug( + "tag=%s\n actual='\n%s'\n expected='\n%s'", tag, actual, expected + ) hdbg.dassert_not_in(tag, values) values[tag] = (actual, expected) diff --git a/helpers/hunit_test_purification.py b/helpers/hunit_test_purification.py index 61014d8e6..fedaa8e12 100644 --- a/helpers/hunit_test_purification.py +++ b/helpers/hunit_test_purification.py @@ -300,7 +300,7 @@ def purify_line_number(self, txt: str) -> str: def purify_parquet_file_names(self, txt: str) -> str: """ - Replace UUIDs file names to `data.parquet` in the goldens. + Replace UUIDs file names to `data.parquet` in the golden outcomes. 
:param txt: input text containing parquet file names :return: text with standardized parquet file names @@ -342,18 +342,56 @@ def purify_helpers(self, txt: str) -> str: def purify_docker_image_name(self, txt: str) -> str: """ - Remove temporary docker image name that are function of their content. + Remove temporary docker image name. :param txt: input text containing docker image names :return: text with standardized docker image names """ - # In a command like: + # Purify command like: # > docker run --rm ... tmp.latex.edb567be .. + # > ... tmp.latex.aarch64.2f590c86.2f590c86 + pattern = r""" + ^ # Start of line + ( # Start capture group 1 + .*docker.* # Any text containing "docker" + \s+ # One or more whitespace + tmp\.\S+\. # tmp.something. + ) # End capture group 1 + [a-z0-9]{8} # 8 character hex hash + ( # Start capture group 2 + \s+ # One or more whitespace + .* # Rest of the line + ) # End capture group 2 + $ # End of line + """ txt = re.sub( - r"^(.*docker.*\s+tmp\.\S+\.)[a-z0-9]{8}(\s+.*)$", + pattern, r"\1xxxxxxxx\2", txt, - flags=re.MULTILINE, + flags=re.MULTILINE | re.VERBOSE, + ) + # Handle patterns like `tmp.latex.aarch64.2f590c86.2f590c86`. + pattern = r""" + ^ # Start of line + ( # Start capture group 1 + .*docker.* # Any text containing "docker" + \s+ # One or more whitespace + tmp\.\S+\.\S+\. # tmp.something.something. + ) # End capture group 1 + [a-z0-9]{8} # 8 character hex hash + \. 
# Literal dot + [a-z0-9]{8} # Another 8 character hex hash + ( # Start capture group 2 + \s+ # One or more whitespace + .* # Rest of the line + ) # End capture group 2 + $ # End of line + """ + txt = re.sub( + pattern, + r"\1xxxxxxxx\2", + txt, + flags=re.MULTILINE | re.VERBOSE, ) return txt diff --git a/helpers/test/test_hgit.py b/helpers/test/test_hgit.py index dbf83ad55..c328e629b 100644 --- a/helpers/test/test_hgit.py +++ b/helpers/test/test_hgit.py @@ -112,7 +112,11 @@ def test_group_hashes3(self) -> None: self._helper_group_hashes(head_hash, remh_hash, subm_hash, expected) def _helper_group_hashes( - self, head_hash: str, remh_hash: str, subm_hash: Optional[str], expected: str + self, + head_hash: str, + remh_hash: str, + subm_hash: Optional[str], + expected: str, ) -> None: actual = hgit._group_hashes(head_hash, remh_hash, subm_hash) self.assert_equal(actual, expected, fuzzy_match=True) diff --git a/helpers/test/test_hmarkdown_bullets.py b/helpers/test/test_hmarkdown_bullets.py index 730a7ec98..0164d5980 100644 --- a/helpers/test/test_hmarkdown_bullets.py +++ b/helpers/test/test_hmarkdown_bullets.py @@ -534,7 +534,9 @@ def test1(self) -> None: _LOG.debug(hprint.to_str("line")) out.append(f"{i}:{line}") actual = "\n".join(out) - self.check_string(actual, dedent=True, remove_lead_trail_empty_lines=True) + self.check_string( + actual, dedent=True, remove_lead_trail_empty_lines=True + ) # ############################################################################# @@ -566,4 +568,6 @@ def test1(self) -> None: txt_in = hio.from_file(input_file_path) txt_in = hprint.dedent(txt_in, remove_lead_trail_empty_lines_=True) actual = self.helper_process_code_block(txt_in) - self.check_string(actual, dedent=True, remove_lead_trail_empty_lines=True) + self.check_string( + actual, dedent=True, remove_lead_trail_empty_lines=True + ) diff --git a/helpers/test/test_hmarkdown_coloring.py b/helpers/test/test_hmarkdown_coloring.py index 22a5987ac..e2d6b75e8 100644 --- 
a/helpers/test/test_hmarkdown_coloring.py +++ b/helpers/test/test_hmarkdown_coloring.py @@ -1,9 +1,57 @@ -import logging - import helpers.hmarkdown as hmarkdo import helpers.hunit_test as hunitest -_LOG = logging.getLogger(__name__) + +# ############################################################################# +# Test_process_color_commands1 +# ############################################################################# + + +class Test_process_color_commands1(hunitest.TestCase): + def test_text_content1(self) -> None: + """ + Test with plain text content. + """ + txt_in = r"\red{Hello world}" + expected = r"\textcolor{red}{\text{Hello world}}" + actual = hmarkdo.process_color_commands(txt_in) + self.assert_equal(actual, expected) + + def test_math_content1(self) -> None: + """ + Test color command with mathematical content. + """ + txt_in = r"\blue{x + y = z}" + expected = r"\textcolor{blue}{x + y = z}" + actual = hmarkdo.process_color_commands(txt_in) + self.assert_equal(actual, expected) + + def test_multiple_colors1(self) -> None: + """ + Test multiple color commands in the same line. + """ + txt_in = r"The \red{quick} \blue{fox} \green{jumps}" + expected = r"The \textcolor{red}{\text{quick}} \textcolor{blue}{\text{fox}} \textcolor{darkgreen}{\text{jumps}}" + actual = hmarkdo.process_color_commands(txt_in) + self.assert_equal(actual, expected) + + def test_mixed_content1(self) -> None: + """ + Test color commands with both text and math content. + """ + txt_in = r"\red{Result: x^2 + y^2}" + expected = r"\textcolor{red}{Result: x^2 + y^2}" + actual = hmarkdo.process_color_commands(txt_in) + self.assert_equal(actual, expected) + + def test_nested_braces1(self) -> None: + """ + Test color command with nested braces. 
+ """ + txt_in = r"\blue{f(x) = {x + 1}}" + expected = r"\textcolor{blue}{f(x) = {x + 1}}" + actual = hmarkdo.process_color_commands(txt_in) + self.assert_equal(actual, expected) # ############################################################################# @@ -13,39 +61,81 @@ class Test_colorize_bullet_points_in_slide1(hunitest.TestCase): def test1(self) -> None: - text = """ - * Machine Learning Flow + # Prepare inputs. + text = r""" + - **VC Theory** + - Measures model - ::: columns - :::: {.column width=90%} - - Question - - E.g., "How can we predict house prices?" - - Input data - - E.g., historical data of house sales + - **Bias-Variance Decomposition** + - Prediction error + - **Bias** + - **Variance** - - _"If I were given one hour to save the planet, I would spend 59 minutes - defining the problem and one minute resolving it"_ (Albert Einstein) + - **Computation Complexity** + - Balances model + - Related to + - E.g., Minimum - - **Not all phases are equally important!** - - Question $>$ Data $>$ Features $>$ Algorithm - - Clarity of the question impacts project success - - Quality and relevance of data are crucial for performance - - Proper feature selection simplifies the model and improves accuracy - - Algorithm is often less important (contrary to popular belief!) - :::: - :::: {.column width=5%} + - **Bayesian Approach** + - Treats ML as probability + - Combines prior knowledge with observed data to update belief about a model - ```graphviz[height=90%] - digraph BayesianFlow { - rankdir=TD; - splines=true; - ... - } - ``` - :::: - ::: + - **Problem in ML Theory:** + - Assumptions may not align with practical problems + """ + # Run function. 
+ all_md_colors = [ + "red", + "orange", + "yellow", + "lime", + "green", + "teal", + "cyan", + "blue", + "purple", + "violet", + "magenta", + "pink", + "brown", + "olive", + "gray", + "darkgray", + "lightgray", + "black", + "white", + ] + + actual = hmarkdo.colorize_bullet_points_in_slide( + text, all_md_colors=all_md_colors + ) + # Check output. + expected = r""" + - **\red{VC Theory}** + - Measures model + + - **\orange{Bias-Variance Decomposition}** + - Prediction error + - **\yellow{Bias}** + - **\lime{Variance}** + + - **\green{Computation Complexity}** + - Balances model + - Related to + - E.g., Minimum + + - **\teal{Bayesian Approach}** + - Treats ML as probability + - Combines prior knowledge with observed data to update belief about a model + + - **\cyan{Problem in ML Theory:}** + - Assumptions may not align with practical problems """ - expected = """ + self.assert_equal(actual, expected) + + def test2(self) -> None: + # Prepare inputs. + text = r""" * Machine Learning Flow ::: columns @@ -58,7 +148,7 @@ def test1(self) -> None: - _"If I were given one hour to save the planet, I would spend 59 minutes defining the problem and one minute resolving it"_ (Albert Einstein) - - **\\red{Not all phases are equally important!}** + - **Not all phases are equally important!** - Question $>$ Data $>$ Features $>$ Algorithm - Clarity of the question impacts project success - Quality and relevance of data are crucial for performance @@ -77,6 +167,28 @@ def test1(self) -> None: :::: ::: """ + # Run function. actual = hmarkdo.colorize_bullet_points_in_slide(text) # Check output. 
+ expected = r""" + - **\red{VC Theory}** + - Measures model + + - **\yellow{Bias-Variance Decomposition}** + - Prediction error + - **\green{Bias}** + - **\cyan{Variance}** + + - **\purple{Computation Complexity}** + - Balances model + - Related to + - E.g., Minimum + + - **\magenta{Bayesian Approach}** + - Treats ML as probability + - Combines prior knowledge with observed data to update belief about a model + + - **\brown{Problem in ML Theory:}** + - Assumptions may not align with practical problems + """ self.assert_equal(actual, expected) diff --git a/helpers/test/test_hmarkdown_headers.py b/helpers/test/test_hmarkdown_headers.py index 0882f6508..79b7e0bc8 100644 --- a/helpers/test/test_hmarkdown_headers.py +++ b/helpers/test/test_hmarkdown_headers.py @@ -596,7 +596,9 @@ def test_single_header(self) -> None: # Call function. actual = hmarkdo.extract_headers_from_markdown(content, max_level=3) # Check output. - expected = r"""[HeaderInfo(1, 'Header1', 1), HeaderInfo(2, 'Header2', 3)]""" + expected = ( + r"""[HeaderInfo(1, 'Header1', 1), HeaderInfo(2, 'Header2', 3)]""" + ) self.assert_equal(str(actual), expected) def test_no_headers(self) -> None: diff --git a/helpers/test/test_hmarkdown_rules.py b/helpers/test/test_hmarkdown_rules.py index ea77183b0..c14b7cf34 100644 --- a/helpers/test/test_hmarkdown_rules.py +++ b/helpers/test/test_hmarkdown_rules.py @@ -206,13 +206,14 @@ def test4(self) -> None: class Test_parse_rules_from_txt1(hunitest.TestCase): - def helper(self, text: str, expected: str) -> None: + def helper(self, text: str, expected: List[str]) -> None: # Prepare inputs. text = hprint.dedent(text) # Call function. actual = hmarkdo.parse_rules_from_txt(text) # Check output. 
- actual = "\n".join(actual) + actual = str(actual) + expected = str(expected) self.assert_equal(actual, expected, dedent=True) def test_basic_list1(self) -> None: @@ -224,11 +225,7 @@ def test_basic_list1(self) -> None: - Item 2 - Item 3 """ - expected = """ - - Item 1 - - Item 2 - - Item 3 - """ + expected = ["- Item 1", "- Item 2", "- Item 3"] self.helper(text, expected) def test_nested_list1(self) -> None: @@ -242,13 +239,11 @@ def test_nested_list1(self) -> None: - Sub-item 2.2 - Item 3 """ - expected = """ - - Item 1 - - Item 2 - - Sub-item 2.1 - - Sub-item 2.2 - - Item 3 - """ + expected = [ + "- Item 1", + "- Item 2\n - Sub-item 2.1\n - Sub-item 2.2", + "- Item 3", + ] self.helper(text, expected) def test_empty_list1(self) -> None: @@ -256,7 +251,7 @@ def test_empty_list1(self) -> None: Test handling empty input. """ text = "" - expected = "" + expected = [] self.helper(text, expected) @@ -304,7 +299,9 @@ def test_get_header_list1(self) -> None: """ self.assert_equal(actual, expected, dedent=True) - def helper_extract_rules(self, selection_rules: List[str], expected: str) -> None: + def helper_extract_rules( + self, selection_rules: List[str], expected: str + ) -> None: """ Helper function to test extracting rules from a markdown file. 
""" diff --git a/helpers/test/test_hmarkdown_tables.py b/helpers/test/test_hmarkdown_tables.py new file mode 100644 index 000000000..f651aa3bf --- /dev/null +++ b/helpers/test/test_hmarkdown_tables.py @@ -0,0 +1,196 @@ +import logging +import pprint +from typing import Dict, List + +import helpers.hmarkdown_tables as hmartabl +import helpers.hprint as hprint +import helpers.hunit_test as hunitest + +_LOG = logging.getLogger(__name__) + + +# ############################################################################# +# Test_replace_tables_with_tags1 +# ############################################################################# + + +class Test_replace_tables_with_tags1(hunitest.TestCase): + def helper( + self, text: str, expected_lines: List[str], expected_map: Dict[str, str] + ) -> None: + """ + Test replacing markdown tables with tags. + """ + lines = hprint.dedent(text, remove_lead_trail_empty_lines_=True) + lines = lines.split("\n") + # Call function. + actual_lines, table_map = hmartabl.replace_tables_with_tags(lines) + # Check output. + table_map_as_str = pprint.pformat(table_map) + expected_map_as_str = pprint.pformat(expected_map) + self.assert_equal(table_map_as_str, expected_map_as_str) + # + actual_lines = "\n".join(actual_lines) + expected_lines = hprint.dedent( + expected_lines, remove_lead_trail_empty_lines_=True + ) + self.assert_equal(actual_lines, expected_lines) + + def helper_round_trip(self, text: str) -> None: + """ + Test the round trip. + """ + # Do the round trip. + lines = text.split("\n") + actual_lines, table_map = hmartabl.replace_tables_with_tags(lines) + act_text = hmartabl.replace_tags_with_tables(actual_lines, table_map) + # Check output. + act_text = "\n".join(act_text) + self.assert_equal(act_text, text) + + def test1(self) -> None: + """ + Test replacing simple markdown table with tags. + """ + # Prepare inputs. 
+ text = """ + Some text before + | Column 1 | Column 2 | + |----------|----------| + | Value 1 | Value 2 | + | Value 3 | Value 4 | + Text between tables + | Name | Age | City | + |------|-----|------| + | John | 25 | NYC | + Some text after + """ + # Prepare outputs. + expected_lines = """ + Some text before + + Text between tables + + Some text after + """ + # Check table map. + expected_map = { + "1": "| Column 1 | Column 2 |\n|----------|----------|\n| Value 1 | Value 2 |\n| Value 3 | Value 4 |", + "2": "| Name | Age | City |\n|------|-----|------|\n| John | 25 | NYC |", + } + self.helper(text, expected_lines, expected_map) + + def test2(self) -> None: + """ + Test table with alignment indicators. + """ + text = """ + | Left | Center | Right | + |:-----|:------:|------:| + | L1 | C1 | R1 | + | L2 | C2 | R2 | + """ + expected_lines = """ + + """ + expected_map = { + "1": "| Left | Center | Right |\n|:-----|:------:|------:|\n| L1 | C1 | R1 |\n| L2 | C2 | R2 |" + } + self.helper(text, expected_lines, expected_map) + # + self.helper_round_trip(text) + + def test3(self) -> None: + """ + Test table with minimal structure. + """ + text = """ + Before + | A | B | + |---|---| + | 1 | 2 | + After + """ + expected_lines = """ + Before + + After + """ + expected_map = {"1": "| A | B |\n|---|---|\n| 1 | 2 |"} + self.helper(text, expected_lines, expected_map) + # + self.helper_round_trip(text) + + def test4(self) -> None: + """ + Test table with empty cells. + """ + text = """ + | Col1 | Col2 | Col3 | + |------|------|------| + | A | | C | + | | B | | + """ + expected_lines = """ + + """ + expected_map = { + "1": "| Col1 | Col2 | Col3 |\n|------|------|------|\n| A | | C |\n| | B | |" + } + self.helper(text, expected_lines, expected_map) + # + self.helper_round_trip(text) + + def test5(self) -> None: + """ + Test multiple tables with different column counts. 
+ """ + text = """ + First table: + | A | B | + |---|---| + | 1 | 2 | + + Second table: + | X | Y | Z | W | + |---|---|---|---| + | a | b | c | d | + | e | f | g | h | + """ + expected_lines = """ + First table: + + + Second table: + + """ + expected_map = { + "1": "| A | B |\n|---|---|\n| 1 | 2 |", + "2": "| X | Y | Z | W |\n|---|---|---|---|\n| a | b | c | d |\n| e | f | g | h |", + } + self.helper(text, expected_lines, expected_map) + # + self.helper_round_trip(text) + + def test6(self) -> None: + """ + Test table with indentation. + """ + text = """ + Outside + | Col1 | Col2 | + |------|------| + | Val1 | Val2 | + End + """ + expected_lines = """ + Outside + + End + """ + expected_map = { + "1": " | Col1 | Col2 |\n |------|------|\n | Val1 | Val2 |" + } + self.helper(text, expected_lines, expected_map) + # + self.helper_round_trip(text) diff --git a/helpers/test/test_hmarkdown_toc.py b/helpers/test/test_hmarkdown_toc.py new file mode 100644 index 000000000..6a2f4a54e --- /dev/null +++ b/helpers/test/test_hmarkdown_toc.py @@ -0,0 +1,101 @@ +import logging + +import helpers.hmarkdown as hmarkdo +import helpers.hprint as hprint +import helpers.hunit_test as hunitest + +_LOG = logging.getLogger(__name__) +# ############################################################################# +# Test_remove_table_of_contents1 +# ############################################################################# + + +class Test_remove_table_of_contents1(hunitest.TestCase): + def test_with_toc(self) -> None: + """ + Test removing table of contents from markdown text. + """ + # Prepare inputs. + text = """ + # Introduction + + This is an introduction. + + + - [Section 1](#section-1) + - [Section 2](#section-2) + + + ## Section 1 + + Content of section 1. + """ + expected = """ + # Introduction + + This is an introduction. + + + + ## Section 1 + + Content of section 1. + """ + text = hprint.dedent(text) + # Run test. + actual = hmarkdo.remove_table_of_contents(text) + # Check output. 
+ expected = hprint.dedent(expected) + self.assert_equal(actual, expected) + + def test_without_toc(self) -> None: + """ + Test text without table of contents remains unchanged. + """ + # Prepare inputs. + text = """ + # Introduction + + This is an introduction. + + ## Section 1 + + Content of section 1. + """ + text = hprint.dedent(text) + # Run test. + actual = hmarkdo.remove_table_of_contents(text) + # Check output. + self.assert_equal(actual, text) + + def test_multiline_toc(self) -> None: + """ + Test removing multi-line table of contents. + """ + # Prepare inputs. + text = """ + # Introduction + + + - [Section 1](#section-1) + - [Subsection 1.1](#subsection-11) + - [Section 2](#section-2) + - [Subsection 2.1](#subsection-21) + - [Subsection 2.2](#subsection-22) + + + ## Section 1 + """ + expected = """ + # Introduction + + + + ## Section 1 + """ + text = hprint.dedent(text) + # Run test. + actual = hmarkdo.remove_table_of_contents(text) + # Check output. + expected = hprint.dedent(expected) + self.assert_equal(actual, expected) diff --git a/helpers/test/test_hmkdocs.py b/helpers/test/test_hmkdocs.py index 4390f8f7a..16f0f097a 100644 --- a/helpers/test/test_hmkdocs.py +++ b/helpers/test/test_hmkdocs.py @@ -7,102 +7,6 @@ _LOG = logging.getLogger(__name__) -# ############################################################################# -# Test_remove_table_of_contents1 -# ############################################################################# - - -class Test_remove_table_of_contents1(hunitest.TestCase): - def test_with_toc(self) -> None: - """ - Test removing table of contents from markdown text. - """ - # Prepare inputs. - text = """ - # Introduction - - This is an introduction. - - - - [Section 1](#section-1) - - [Section 2](#section-2) - - - ## Section 1 - - Content of section 1. - """ - expected = """ - # Introduction - - This is an introduction. - - - - ## Section 1 - - Content of section 1. - """ - text = hprint.dedent(text) - # Run test. 
- actual = hmkdocs.remove_table_of_contents(text) - # Check output. - expected = hprint.dedent(expected) - self.assert_equal(actual, expected) - - def test_without_toc(self) -> None: - """ - Test text without table of contents remains unchanged. - """ - # Prepare inputs. - text = """ - # Introduction - - This is an introduction. - - ## Section 1 - - Content of section 1. - """ - text = hprint.dedent(text) - # Run test. - actual = hmkdocs.remove_table_of_contents(text) - # Check output. - self.assert_equal(actual, text) - - def test_multiline_toc(self) -> None: - """ - Test removing multi-line table of contents. - """ - # Prepare inputs. - text = """ - # Introduction - - - - [Section 1](#section-1) - - [Subsection 1.1](#subsection-11) - - [Section 2](#section-2) - - [Subsection 2.1](#subsection-21) - - [Subsection 2.2](#subsection-22) - - - ## Section 1 - """ - expected = """ - # Introduction - - - - ## Section 1 - """ - text = hprint.dedent(text) - # Run test. - actual = hmkdocs.remove_table_of_contents(text) - # Check output. 
- expected = hprint.dedent(expected) - self.assert_equal(actual, expected) - - # ############################################################################# # Test_dedent_python_code_blocks1 # ############################################################################# diff --git a/helpers/test/test_hparquet.py b/helpers/test/test_hparquet.py index 49e8e14af..8fe34e678 100644 --- a/helpers/test/test_hparquet.py +++ b/helpers/test/test_hparquet.py @@ -986,7 +986,9 @@ def test_to_partitioned_dataset(self) -> None: dummy_value_1=3 dummy_value_1=3/dummy_value_2=C dummy_value_1=3/dummy_value_2=C/data.parquet""" - self.assert_equal(dir_signature, expected, purify_text=True, fuzzy_match=True) + self.assert_equal( + dir_signature, expected, purify_text=True, fuzzy_match=True + ) # include_file_content = True dir_signature = hunitest.get_dir_signature( diff --git a/helpers/test/test_hunit_test.py b/helpers/test/test_hunit_test.py index ed4a43537..2da2686e6 100644 --- a/helpers/test/test_hunit_test.py +++ b/helpers/test/test_hunit_test.py @@ -873,7 +873,9 @@ def _check_df_helper( golden_outcomes.to_csv(file_name) try: outcome_updated, file_exists, is_equal = self.check_dataframe( - actual, abort_on_error=abort_on_error, err_threshold=err_threshold + actual, + abort_on_error=abort_on_error, + err_threshold=err_threshold, ) finally: # Clean up. 
@@ -892,7 +894,9 @@ def test1(self) -> None: actual = "hello" # action_on_missing_golden = "assert" action_on_missing_golden = "update" - self.check_string(actual, action_on_missing_golden=action_on_missing_golden) + self.check_string( + actual, action_on_missing_golden=action_on_missing_golden + ) def test2(self) -> None: actual = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns="a b c".split()) diff --git a/helpers/test/test_hunit_test_purification.py b/helpers/test/test_hunit_test_purification.py index 53520cd9a..41efb42b5 100644 --- a/helpers/test/test_hunit_test_purification.py +++ b/helpers/test/test_hunit_test_purification.py @@ -8,7 +8,7 @@ import logging import os import unittest.mock as umock -from typing import List +from typing import Any, List import pytest @@ -116,28 +116,29 @@ def test9(self) -> None: # TODO(gp): We should remove the current path. # pylint: disable=line-too-long txt = r""" -************* Module input [pylint] -$SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py: Your code has been rated at -10.00/10 (previous run: -10.00/10, +0.00) [pylint] -$SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:20: W605 invalid escape sequence '\s' [flake8] -$SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:9: F821 undefined name 're' [flake8] -cmd line='$SUPER_MODULE/dev_scripts/linter.py -f $SUPER_MODULE/amp/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py --linter_log $SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/linter.log' -dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [E0602(undefined-variable), ] Undefined variable 're' [pylint] -dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [W1401(anomalous-backslash-in-string), ] Anomalous backslash in string: '\s'. String constant might be missing an r prefix. 
[pylint] -dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: error: Name 're' is not defined [mypy] -""" + ************* Module input [pylint] + $SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py: Your code has been rated at -10.00/10 (previous run: -10.00/10, +0.00) [pylint] + $SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:20: W605 invalid escape sequence '\s' [flake8] + $SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:9: F821 undefined name 're' [flake8] + cmd line='$SUPER_MODULE/dev_scripts/linter.py -f $SUPER_MODULE/amp/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py --linter_log $SUPER_MODULE/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/linter.log' + dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [E0602(undefined-variable), ] Undefined variable 're' [pylint] + dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [W1401(anomalous-backslash-in-string), ] Anomalous backslash in string: '\s'. String constant might be missing an r prefix. 
[pylint] + dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: error: Name 're' is not defined [mypy] + """ + txt = hprint.dedent(txt) txt = txt.replace("$SUPER_MODULE", super_module_path) expected = r""" -************* Module input [pylint] -$GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py: Your code has been rated at -10.00/10 (previous run: -10.00/10, +0.00) [pylint] -$GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:20: W605 invalid escape sequence '\s' [flake8] -$GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:9: F821 undefined name 're' [flake8] -cmd line='$GIT_ROOT/dev_scripts/linter.py -f $GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py --linter_log $GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/linter.log' -dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [E0602(undefined-variable), ] Undefined variable 're' [pylint] -dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [W1401(anomalous-backslash-in-string), ] Anomalous backslash in string: '\s'. String constant might be missing an r prefix. 
[pylint] -dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: error: Name 're' is not defined [mypy] -""" + ************* Module input [pylint] + $GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py: Your code has been rated at -10.00/10 (previous run: -10.00/10, +0.00) [pylint] + $GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:20: W605 invalid escape sequence '\s' [flake8] + $GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3:9: F821 undefined name 're' [flake8] + cmd line='$GIT_ROOT/dev_scripts/linter.py -f $GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py --linter_log $GIT_ROOT/dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/linter.log' + dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [E0602(undefined-variable), ] Undefined variable 're' [pylint] + dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: [W1401(anomalous-backslash-in-string), ] Anomalous backslash in string: '\s'. String constant might be missing an r prefix. [pylint] + dev_scripts/test/Test_linter_py1.test_linter1/tmp.scratch/input.py:3: error: Name 're' is not defined [mypy] + """ # pylint: enable=line-too-long - self.check_helper(txt, expected) + self.check_helper(txt, expected, dedent=True) def test10(self) -> None: """ @@ -947,6 +948,20 @@ def test1(self) -> None: actual = text_purifier.purify_docker_image_name(txt) self.assert_equal(actual, expected, fuzzy_match=True) + def test2(self) -> None: + """ + Test patterns like `tmp.latex.aarch64.2f590c86.2f590c86`. 
+ """ + txt = r""" + docker run --rm --user $(id -u):$(id -g) --workdir $GIT_ROOT --mount type=bind,source=/Users/saggese/src/helpers1,target=$GIT_ROOT tmp.latex.aarch64.2f590c86.2f590c86 pdflatex -output-directory + """ + expected = r""" + docker run --rm --user $(id -u):$(id -g) --workdir $GIT_ROOT --mount type=bind,source=/Users/saggese/src/helpers1,target=$GIT_ROOT tmp.latex.aarch64.xxxxxxxx pdflatex -output-directory + """ + text_purifier = huntepur.TextPurifier() + actual = text_purifier.purify_docker_image_name(txt) + self.assert_equal(actual, expected, fuzzy_match=True) + # ############################################################################# # Test_purify_line_number1 diff --git a/helpers/test/test_lib_tasks_docker_release.py b/helpers/test/test_lib_tasks_docker_release.py index cddb2e7d7..58c8a5d9d 100644 --- a/helpers/test/test_lib_tasks_docker_release.py +++ b/helpers/test/test_lib_tasks_docker_release.py @@ -7,6 +7,7 @@ import moto import pytest +import helpers.hgit as hgit import helpers.hunit_test as hunitest import helpers.lib_tasks_docker as hlitadoc import helpers.lib_tasks_docker_release as hltadore @@ -319,6 +320,11 @@ def test_multi_arch_prod_image1(self) -> None: """ self._check_docker_command_output(expected, self.mock_run.call_args_list) + @pytest.mark.skipif( + not hgit.is_in_helpers_as_supermodule(), + # TODO(gp): Is the assertion too strict? + reason="Needs to run insde a super module", + ) def test_candidate_tag1(self) -> None: """ Test building with candidate mode using tag. diff --git a/helpers/test/test_repo_config_amp.py b/helpers/test/test_repo_config_amp.py index 17bedef39..ced80844b 100644 --- a/helpers/test/test_repo_config_amp.py +++ b/helpers/test/test_repo_config_amp.py @@ -225,7 +225,9 @@ def test_amp_ci(self) -> None: # We ignore the AWS vars, since GH Actions does some replacement to mask # the env vars coming from secrets. 
skip_secrets_vars = True - hunteuti.check_env_to_str(self, expected, skip_secrets_vars=skip_secrets_vars) + hunteuti.check_env_to_str( + self, expected, skip_secrets_vars=skip_secrets_vars + ) @pytest.mark.skipif( not hrecouti.get_repo_config().get_name() == "//cmamp", @@ -277,4 +279,6 @@ def test_cmamp_ci(self) -> None: # We ignore the AWS vars, since GH Actions does some replacement to mask # the env vars coming from secrets. skip_secrets_vars = True - hunteuti.check_env_to_str(self, expected, skip_secrets_vars=skip_secrets_vars) + hunteuti.check_env_to_str( + self, expected, skip_secrets_vars=skip_secrets_vars + ) diff --git a/helpers/unit_test_template.py b/helpers/unit_test_template.py deleted file mode 100644 index 1b769e1b4..000000000 --- a/helpers/unit_test_template.py +++ /dev/null @@ -1,21 +0,0 @@ -""" -Import as: - -import helpers.unit_test_template as hunteske -""" - -import logging - -import helpers.hunit_test as hunitest - -_LOG = logging.getLogger(__name__) - - -# ############################################################################# -# Test_Example -# ############################################################################# - - -class Test_Example(hunitest.TestCase): - def test_example1(self) -> None: - pass diff --git a/linters/test/test_amp_check_import.py b/linters/test/test_amp_check_import.py index b1eb0f6e9..1cdd6d599 100644 --- a/linters/test/test_amp_check_import.py +++ b/linters/test/test_amp_check_import.py @@ -44,9 +44,13 @@ def test5(self) -> None: expected = "" self._helper_check_import(line, expected, file_name="test.py") - def _helper_check_import(self, line: str, expected: str, file_name: str) -> None: + def _helper_check_import( + self, line: str, expected: str, file_name: str + ) -> None: file_name = file_name or "test.py" line_num = 1 - expected = f"{file_name}:{line_num}: {expected}" if expected else expected + expected = ( + f"{file_name}:{line_num}: {expected}" if expected else expected + ) msg = 
lamchimp._check_import(file_name, line_num, line) self.assertEqual(expected, msg) diff --git a/linters/test/test_amp_check_shebang.py b/linters/test/test_amp_check_shebang.py index 181a5e3e6..4ea695a18 100644 --- a/linters/test/test_amp_check_shebang.py +++ b/linters/test/test_amp_check_shebang.py @@ -1,6 +1,7 @@ import pytest import helpers.hunit_test as hunitest +import helpers.hprint as hprint import linters.amp_check_shebang as lamchshe @@ -11,10 +12,12 @@ def test1(self) -> None: Executable with wrong shebang: error. """ file_name = "exec.py" - txt = """#!/bin/bash -hello -world -""" + txt = """ + #!/bin/bash + hello + world + """ + txt = hprint.dedent(txt) is_executable = True expected = "exec.py:1: any executable needs to start with a shebang '#!/usr/bin/env python'" self._helper_check_shebang(file_name, txt, is_executable, expected) @@ -24,10 +27,12 @@ def test2(self) -> None: Executable with the correct shebang: correct. """ file_name = "exec.py" - txt = """#!/usr/bin/env python -hello -world -""" + txt = """ + #!/usr/bin/env python + hello + world + """ + txt = hprint.dedent(txt) is_executable = True expected = "" self._helper_check_shebang(file_name, txt, is_executable, expected) @@ -37,10 +42,12 @@ def test3(self) -> None: Non executable with a shebang: error. """ file_name = "exec.py" - txt = """#!/usr/bin/env python -hello -world -""" + txt = """ + #!/usr/bin/env python + hello + world + """ + txt = hprint.dedent(txt) is_executable = False expected = "exec.py:1: a non-executable can't start with a shebang." self._helper_check_shebang(file_name, txt, is_executable, expected) @@ -50,11 +57,13 @@ def test4(self) -> None: Library without a shebang: correct. 
""" file_name = "lib.py" - txt = '''""" -Import as: + txt = ''' + """ + Import as: -import _setenv_lib as selib -''' + import _setenv_lib as selib + ''' + txt = hprint.dedent(txt) is_executable = False expected = "" self._helper_check_shebang(file_name, txt, is_executable, expected) diff --git a/linters/test/test_amp_class_method_order.py b/linters/test/test_amp_class_method_order.py index e65f7dc14..75f4ae13e 100644 --- a/linters/test/test_amp_class_method_order.py +++ b/linters/test/test_amp_class_method_order.py @@ -1,3 +1,4 @@ +import helpers.hprint as hprint import helpers.hunit_test as hunitest import linters.amp_class_method_order as laclmeor @@ -8,42 +9,42 @@ def test_1(self) -> None: Test methods in incorrect order are re-ordered. """ original = """ -class Test: - def test1(): - pass + class Test: + def test1(): + pass - def __init__(): - pass + def __init__(): + pass - def _test2(): - pass + def _test2(): + pass - def test3(): - pass + def test3(): + pass - def __magic_test__(): - pass + def __magic_test__(): + pass -""" + """ expected = """ -class Test: + class Test: - def __init__(): - pass + def __init__(): + pass - def __magic_test__(): - pass + def __magic_test__(): + pass - def test1(): - pass + def test1(): + pass - def test3(): - pass + def test3(): + pass - def _test2(): - pass + def _test2(): + pass -""" + """ self._helper(original, expected) def test_2(self) -> None: @@ -51,22 +52,22 @@ def test_2(self) -> None: Test methods in correct order aren't re-ordered. 
""" original = expected = """ -class Test: - def __init__(): - pass + class Test: + def __init__(): + pass - def __magic_test__(): - pass + def __magic_test__(): + pass - def test1(): - pass + def test1(): + pass - def test3(): - pass + def test3(): + pass - def _test2(): - pass -""" + def _test2(): + pass + """ self._helper(original, expected) def test_3(self) -> None: @@ -76,25 +77,25 @@ def test_3(self) -> None: """ # pylint: disable=line-too-long original = """ -class Test: - def test1(): - # This is a test comment - pass - - def __init__(): - # Another comment - pass -""" + class Test: + def test1(): + # This is a test comment + pass + + def __init__(): + # Another comment + pass + """ expected = """ -class Test: - def __init__(): - # Another comment - pass - - def test1(): - # This is a test comment - pass -""" + class Test: + def __init__(): + # Another comment + pass + + def test1(): + # This is a test comment + pass + """ self._helper(original, expected) def test_4(self) -> None: @@ -103,25 +104,25 @@ def test_4(self) -> None: losing information. """ original = ''' -class Test: - def test1(): - """This is a test docstring""" - pass - - def __init__(): - """Another docstring""" - pass -''' + class Test: + def test1(): + """This is a test docstring""" + pass + + def __init__(): + """Another docstring""" + pass + ''' expected = ''' -class Test: - def __init__(): - """Another docstring""" - pass - - def test1(): - """This is a test docstring""" - pass -''' + class Test: + def __init__(): + """Another docstring""" + pass + + def test1(): + """This is a test docstring""" + pass + ''' self._helper(original, expected) def test_5(self) -> None: @@ -129,52 +130,52 @@ def test_5(self) -> None: Test that static and regular methods are re-ordered correctly. 
""" original = """ -class Test: - @staticmethod - def test1(): - pass + class Test: + @staticmethod + def test1(): + pass - def __init__(): - pass + def __init__(): + pass - @staticmethod - def _test2(): - pass + @staticmethod + def _test2(): + pass - def test3(): - pass + def test3(): + pass - def _test4(): - pass + def _test4(): + pass - def __magic_test__(): - pass + def __magic_test__(): + pass -""" + """ expected = """ -class Test: + class Test: - def __init__(): - pass + def __init__(): + pass - def __magic_test__(): - pass + def __magic_test__(): + pass - @staticmethod - def test1(): - pass + @staticmethod + def test1(): + pass - def test3(): - pass + def test3(): + pass - @staticmethod - def _test2(): - pass + @staticmethod + def _test2(): + pass - def _test4(): - pass + def _test4(): + pass -""" + """ self._helper(original, expected) def test_6(self) -> None: @@ -182,50 +183,51 @@ def test_6(self) -> None: Test re-ordering with different decorators. """ original = """ -@pytest.mark.skip("ABC") -class Test: + @pytest.mark.skip("ABC") + class Test: - def __init__(): - pass + def __init__(): + pass - @pytest.mark.skip("DEF") - def test1(): - pass + @pytest.mark.skip("DEF") + def test1(): + pass - @pytest.mark.slow() - @umock.patch.object(imvcdeexcl.hdateti, "get_current_time") - def _test2(): - pass + @pytest.mark.slow() + @umock.patch.object(imvcdeexcl.hdateti, "get_current_time") + def _test2(): + pass - def __magic_test__(): - pass + def __magic_test__(): + pass -""" + """ expected = """ -@pytest.mark.skip("ABC") -class Test: + @pytest.mark.skip("ABC") + class Test: - def __init__(): - pass + def __init__(): + pass - def __magic_test__(): - pass + def __magic_test__(): + pass - @pytest.mark.skip("DEF") - def test1(): - pass + @pytest.mark.skip("DEF") + def test1(): + pass - @pytest.mark.slow() - @umock.patch.object(imvcdeexcl.hdateti, "get_current_time") - def _test2(): - pass + @pytest.mark.slow() + @umock.patch.object(imvcdeexcl.hdateti, 
"get_current_time") + def _test2(): + pass -""" + """ self._helper(original, expected) def _helper(self, txt: str, expected: str) -> None: + txt = hprint.dedent(txt) actual = laclmeor.order_methods(txt) # Remove empty lines since they can create issues. actual = hunitest.filter_text(r"^\s*$", actual) expected = hunitest.filter_text(r"^\s*$", expected) - self.assert_equal(actual, expected) + self.assert_equal(actual, expected, dedent=True) diff --git a/linters/test/test_amp_fix_comment.py b/linters/test/test_amp_fix_comment.py index 5dc1b52e1..22c884d00 100644 --- a/linters/test/test_amp_fix_comment.py +++ b/linters/test/test_amp_fix_comment.py @@ -44,8 +44,7 @@ def test3(self) -> None: self.assertEqual(expected, actual) @pytest.mark.skip( - reason="""Inline comments are not allowed, as they are hard to maintain - """ + reason="Inline comments are not allowed, as they are hard to maintain" ) def test4(self) -> None: """ diff --git a/unit_test_template.py b/unit_test_template.py index 4fc6ae256..922d642c2 100644 --- a/unit_test_template.py +++ b/unit_test_template.py @@ -12,6 +12,16 @@ class Test_format_compressed_markdown1(hunitest.TestCase): + def helper(self, actual: str, expected: str) -> None: + # Prepare inputs. + actual = hprint.dedent(actual) + actual = [line for line in actual.split("\n") if line != ""] + actual = "\n".join(actual) + # Prepare outputs. + expected = hprint.dedent(expected) + # Check output. + self.assert_equal(actual, expected) + def test1(self) -> None: # Prepare inputs. # ... @@ -25,35 +35,33 @@ def test2(self) -> None: """ Test basic case with single first level bullet. """ + # Prepare inputs. text = """ Some text - First bullet More text""" + # Prepare outputs. expected = """ Some text - First bullet More text""" - self._format_and_compare_markdown(text, expected) + # Check. + self.helper(text, expected) def test3(self) -> None: """ Test multiple first level bullets. """ + # Prepare inputs. 
text = """ - First bullet - Second bullet - Third bullet""" + # Prepare outputs. expected = """ - First bullet - Second bullet - Third bullet""" - self._format_and_compare_markdown(text, expected) - - def _format_and_compare_markdown(self, actual: str, expected: str) -> None: - actual = hprint.dedent(actual) - actual = [line for line in actual.split("\n") if line != ""] - actual = "\n".join(actual) - expected = hprint.dedent(expected) - # - self.assert_equal(actual, expected) + # Check. + self.helper(text, expected) From 2e3ace7d6b6486e5102d5078866a9fd47b53614e Mon Sep 17 00:00:00 2001 From: GP Saggese Date: Thu, 24 Jul 2025 08:08:03 -0400 Subject: [PATCH 2/3] Update --- .../output/test.txt | 7 +++ .../test/test_preprocess_notes.py | 58 +++++++++---------- 2 files changed, 36 insertions(+), 29 deletions(-) create mode 100644 dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_end_to_end1.test_run_all2/output/test.txt diff --git a/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_end_to_end1.test_run_all2/output/test.txt b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_end_to_end1.test_run_all2/output/test.txt new file mode 100644 index 000000000..0985eb324 --- /dev/null +++ b/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_end_to_end1.test_run_all2/output/test.txt @@ -0,0 +1,7 @@ +--- +fontsize: 10pt +--- +\let\emph\textit +\let\uline\underline +\let\ul\underline +/app/helpers_root/dev_scripts_helpers/documentation/test/outcomes/Test_preprocess_notes_end_to_end1.test_run_all2/input/input.txt \ No newline at end of file diff --git a/dev_scripts_helpers/documentation/test/test_preprocess_notes.py b/dev_scripts_helpers/documentation/test/test_preprocess_notes.py index cdfd4d8e6..f6b370e66 100644 --- a/dev_scripts_helpers/documentation/test/test_preprocess_notes.py +++ b/dev_scripts_helpers/documentation/test/test_preprocess_notes.py @@ -31,48 +31,48 @@ class 
Test_process_question1(hunitest.TestCase): the library function directly. """ + def helper(self, txt_in: str, do_continue_exp: bool, expected: str) -> None: + do_continue, actual = dshdprno._process_question_to_markdown(txt_in) + self.assertEqual(do_continue, do_continue_exp) + self.assert_equal(actual, expected) + def test_process_question1(self) -> None: txt_in = "* Hope is not a strategy" do_continue_exp = True - exp = "- **Hope is not a strategy**" - self.helper(txt_in, do_continue_exp, exp) + expected = "- **Hope is not a strategy**" + self.helper(txt_in, do_continue_exp, expected) def test_process_question2(self) -> None: txt_in = "** Hope is not a strategy" do_continue_exp = True - exp = "- **Hope is not a strategy**" - self.helper(txt_in, do_continue_exp, exp) + expected = "- **Hope is not a strategy**" + self.helper(txt_in, do_continue_exp, expected) def test_process_question3(self) -> None: txt_in = "*: Hope is not a strategy" do_continue_exp = True - exp = "- **Hope is not a strategy**" - self.helper(txt_in, do_continue_exp, exp) + expected = "- **Hope is not a strategy**" + self.helper(txt_in, do_continue_exp, expected) def test_process_question4(self) -> None: txt_in = "- Systems don't run themselves, they need to be run" do_continue_exp = False - exp = txt_in - self.helper(txt_in, do_continue_exp, exp) + expected = txt_in + self.helper(txt_in, do_continue_exp, expected) def test_process_question5(self) -> None: space = " " txt_in = "*" + space + "Hope is not a strategy" do_continue_exp = True - exp = "-" + space + "**Hope is not a strategy**" - self.helper(txt_in, do_continue_exp, exp) + expected = "-" + space + "**Hope is not a strategy**" + self.helper(txt_in, do_continue_exp, expected) def test_process_question6(self) -> None: space = " " txt_in = "**" + space + "Hope is not a strategy" do_continue_exp = True - exp = "-" + " " * len(space) + "**Hope is not a strategy**" - self.helper(txt_in, do_continue_exp, exp) - - def helper(self, txt_in: str, 
do_continue_exp: bool, exp: str) -> None: - do_continue, act = dshdprno._process_question_to_markdown(txt_in) - self.assertEqual(do_continue, do_continue_exp) - self.assert_equal(actual, expected) + expected = "-" + " " * len(space) + "**Hope is not a strategy**" + self.helper(txt_in, do_continue_exp, expected) # ############################################################################# @@ -143,8 +143,8 @@ def _is_integer(value): print(v) ``` """ - exp = hprint.dedent(exp, remove_lead_trail_empty_lines_=True) - self.assert_equal(act, exp) + expected = hprint.dedent(expected, remove_lead_trail_empty_lines_=True) + self.assert_equal(actual, expected) def test_run_all2(self) -> None: """ @@ -155,9 +155,9 @@ def test_run_all2(self) -> None: txt_in = hprint.dedent(txt_in, remove_lead_trail_empty_lines_=True) # Run function. type_ = "slides" - act = dshdprno._transform_lines(txt_in, type_, is_qa=False) + actual = dshdprno._transform_lines(txt_in, type_, is_qa=False) # Check. - self.check_string(act) + self.check_string(actual) # ############################################################################# @@ -194,8 +194,8 @@ def helper(in_file: str, out_file: str, type_: str) -> str: # Run. hsystem.system(cmd_as_str) # Check. - act = hio.from_file(out_file) - return act # type: ignore + actual = hio.from_file(out_file) + return actual # type: ignore def test1(self) -> None: # Prepare inputs. @@ -203,9 +203,9 @@ def test1(self) -> None: out_file = os.path.join(self.get_scratch_space(), "output.txt") type_ = "pdf" # Run. - act = self.helper(in_file, out_file, type_) + actual = self.helper(in_file, out_file, type_) # Check. - self.check_string(act) + self.check_string(actual) def test2(self) -> None: # Prepare inputs. @@ -213,9 +213,9 @@ def test2(self) -> None: out_file = os.path.join(self.get_scratch_space(), "output.txt") type_ = "pdf" # Run. - act = self.helper(in_file, out_file, type_) + actual = self.helper(in_file, out_file, type_) # Check. 
- self.check_string(act) + self.check_string(actual) def test3(self) -> None: # Prepare inputs. @@ -223,6 +223,6 @@ def test3(self) -> None: out_file = os.path.join(self.get_scratch_space(), "output.txt") type_ = "pdf" # Run. - act = self.helper(in_file, out_file, type_) + actual = self.helper(in_file, out_file, type_) # Check. - self.check_string(act) + self.check_string(actual) From 712c382145ac89d9836424246e9be87d1ae75303 Mon Sep 17 00:00:00 2001 From: GP Saggese Date: Thu, 24 Jul 2025 08:20:38 -0400 Subject: [PATCH 3/3] Update --- helpers/hmarkdown.py | 1 + helpers/test/test_hmarkdown_coloring.py | 13 ++++++------- helpers/test/test_hunit_test_purification.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/helpers/hmarkdown.py b/helpers/hmarkdown.py index c7babad0f..0d7f88441 100644 --- a/helpers/hmarkdown.py +++ b/helpers/hmarkdown.py @@ -13,4 +13,5 @@ from helpers.hmarkdown_headers import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import from helpers.hmarkdown_rules import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import from helpers.hmarkdown_slides import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import +from helpers.hmarkdown_tables import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import from helpers.hmarkdown_toc import * # isort:skip # noqa: F401,F403 # pylint: disable=unused-import,unused-wildcard-import,wildcard-import diff --git a/helpers/test/test_hmarkdown_coloring.py b/helpers/test/test_hmarkdown_coloring.py index e2d6b75e8..9b2681adb 100644 --- a/helpers/test/test_hmarkdown_coloring.py +++ b/helpers/test/test_hmarkdown_coloring.py @@ -105,7 +105,6 @@ def test1(self) -> None: "black", "white", ] - actual = hmarkdo.colorize_bullet_points_in_slide( text, all_md_colors=all_md_colors ) @@ -114,21 +113,21 @@ def 
test1(self) -> None: - **\red{VC Theory}** - Measures model - - **\orange{Bias-Variance Decomposition}** + - **\yellow{Bias-Variance Decomposition}** - Prediction error - - **\yellow{Bias}** - - **\lime{Variance}** + - **\green{Bias}** + - **\cyan{Variance}** - - **\green{Computation Complexity}** + - **\purple{Computation Complexity}** - Balances model - Related to - E.g., Minimum - - **\teal{Bayesian Approach}** + - **\magenta{Bayesian Approach}** - Treats ML as probability - Combines prior knowledge with observed data to update belief about a model - - **\cyan{Problem in ML Theory:}** + - **\brown{Problem in ML Theory:}** - Assumptions may not align with practical problems """ self.assert_equal(actual, expected) diff --git a/helpers/test/test_hunit_test_purification.py b/helpers/test/test_hunit_test_purification.py index 41efb42b5..6488621a1 100644 --- a/helpers/test/test_hunit_test_purification.py +++ b/helpers/test/test_hunit_test_purification.py @@ -28,10 +28,10 @@ class Test_purify_text1(hunitest.TestCase): - def check_helper(self, txt: str, expected: str) -> None: + def check_helper(self, txt: str, expected: str, **kwargs: Any) -> None: text_purifier = huntepur.TextPurifier() actual = text_purifier.purify_txt_from_client(txt) - self.assert_equal(actual, expected) + self.assert_equal(actual, expected, **kwargs) def test1(self) -> None: txt = "amp/helpers/test/test_system_interaction.py"