From 3593338c754f1903ca04a3e3a4d8bae9e9f5355f Mon Sep 17 00:00:00 2001 From: Ahmad Date: Wed, 3 May 2023 01:00:49 +0430 Subject: [PATCH 1/4] using first paper codes --- Manifest.toml | 253 +++++++++++++++++-------- Project.toml | 7 +- src/Backtests/MultipleTestReporting.jl | 32 ++++ src/Backtests/TestSetOverfitting.jl | 67 ++++--- src/Features/Clustering.jl | 96 +++++----- 5 files changed, 297 insertions(+), 158 deletions(-) create mode 100644 src/Backtests/MultipleTestReporting.jl diff --git a/Manifest.toml b/Manifest.toml index 5f3f964..4986c41 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -3,8 +3,11 @@ julia_version = "1.7.2" manifest_format = "2.0" -[[deps.ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +[[deps.ArrayLayouts]] +deps = ["FillArrays", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "4aff5fa660eb95c2e0deb6bcdabe4d9a96bc4667" +uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" +version = "0.8.18" [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -12,11 +15,41 @@ uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +[[deps.BlockArrays]] +deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra"] +git-tree-sha1 = "3b15c61bcece7c426ea641d143c808ace3661973" +uuid = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +version = "0.16.25" + +[[deps.BlockDiagonals]] +deps = ["ChainRulesCore", "FillArrays", "FiniteDifferences", "LinearAlgebra"] +git-tree-sha1 = "ffd635c19b56f50d1d4278d876219644299b5711" +uuid = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" +version = "0.1.41" + +[[deps.ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "c6d890a52d2c4d55d326439580c3b8d0875a77d9" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "1.15.7" + +[[deps.ChangesOfVariables]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "f84967c4497e0e1955f9a582c232b02847c5f589" +uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" +version = "0.1.7" + +[[deps.Clustering]] +deps = ["Distances", "LinearAlgebra", "NearestNeighbors", "Printf", "Random", "SparseArrays", "Statistics", "StatsBase"] +git-tree-sha1 = "a3213fa9d35edf589d0c6303f95850f7641fe2dc" +uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +version = "0.15.1" + [[deps.Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "78bee250c6826e1cf805a88b7f1e86025275d208" +deps = ["Dates", "LinearAlgebra", "UUIDs"] +git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.46.0" +version = "4.6.1" [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -28,15 +61,15 @@ uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" version = "4.1.1" [[deps.DataAPI]] -git-tree-sha1 = "fb5f5316dd3fd4c5e7c30a24d50643b73e37cd40" +git-tree-sha1 = "e8119c1a33d267e16108be441a287a6981ba1630" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.10.0" +version = "1.14.0" [[deps.DataFrames]] -deps = ["Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "daa21eb85147f72e41f6352a57fccea377e310a9" +deps = ["Compat", "DataAPI", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SnoopPrecompile", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "aa51303df86f8626a962fccb878430cdb0a97eee" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.3.4" +version = "1.5.0" [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] @@ -57,19 +90,29 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" deps = ["Mmap"] uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" -[[deps.Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" +[[deps.Distances]] +deps = ["LinearAlgebra", "SparseArrays", "Statistics", "StatsAPI"] +git-tree-sha1 = "49eba9ad9f7ead780bfb7ee319f962c811c6d3b2" +uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.10.8" [[deps.DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" + +[[deps.FillArrays]] +deps = ["LinearAlgebra", "Random", "SparseArrays", "Statistics"] +git-tree-sha1 = "7072f1e3e5a8be51d525d64f63d3ec1287ff2790" +uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" +version = "0.13.11" -[[deps.Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +[[deps.FiniteDifferences]] +deps = ["ChainRulesCore", "LinearAlgebra", "Printf", "Random", "Richardson", "SparseArrays", "StaticArrays"] +git-tree-sha1 = "3f605dd6db5640c5278f2551afc9427656439f42" +uuid = "26cc04aa-876d-5657-8c51-4c34ba976000" +version = "0.12.26" [[deps.Formatting]] deps = ["Printf"] @@ -81,36 +124,46 @@ version = "0.4.2" deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" +[[deps.InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" + [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[deps.InverseFunctions]] +deps = ["Test"] +git-tree-sha1 = "6667aadd1cdee2c6cd068128b3d226ebc4fb0c67" +uuid = "3587e190-3f89-42d0-90ee-14403ec27112" +version = "0.1.9" + [[deps.InvertedIndices]] -git-tree-sha1 = "bee5f1ef5bf65df56bdd2e40447590b272a5471f" +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" -version = "1.1.0" +version = "1.3.0" + +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.2.2" [[deps.IteratorInterfaceExtensions]] git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" uuid = "82899510-4779-5014-852e-03e436cf321d" version = "1.0.0" -[[deps.LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" - -[[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +[[deps.LaTeXStrings]] +git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" +uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +version = "1.3.0" [[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" - [[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -118,6 +171,12 @@ uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" deps = ["Libdl", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +[[deps.LogExpFunctions]] +deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "0a1b7c2863e44523180fdb3146534e265a91870b" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.23" + [[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -125,21 +184,20 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - [[deps.Missings]] deps = ["DataAPI"] -git-tree-sha1 = "bf210ce90b6c9eed32d25dbcae1ebc565df2687f" +git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.0.2" +version = "1.1.0" [[deps.Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" -[[deps.MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +[[deps.NearestNeighbors]] +deps = ["Distances", "StaticArrays"] +git-tree-sha1 = "2c3726ceb3388917602169bed973dbc97f1b51a8" +uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" +version = "0.4.13" [[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" @@ -149,13 +207,15 @@ deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" [[deps.OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.0" -[[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +[[deps.Parsers]] +deps = ["Dates", "SnoopPrecompile"] +git-tree-sha1 = "478ac6c952fddd4399e71d4779797c538d0ff2bf" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.5.8" [[deps.PooledArrays]] deps = ["DataAPI", "Future"] @@ -163,11 +223,23 @@ git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7" uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" version = "1.4.2" +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "2e47054ffe7d0a8872e977c0d09eb4b3d162ebde" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.0.2" + +[[deps.Preferences]] +deps = ["TOML"] +git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.3.0" + [[deps.PrettyTables]] -deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"] -git-tree-sha1 = "dfb54c4e414caa595a1f2ed759b160f5a3ddcba5" +deps = ["Crayons", "Formatting", "LaTeXStrings", "Markdown", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "548793c7859e28ef026dba514752275ee871169f" uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" -version = "1.3.1" +version = "2.2.3" [[deps.Printf]] deps = ["Unicode"] @@ -182,42 +254,91 @@ deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[deps.RecipesBase]] -git-tree-sha1 = "6bf3f380ff52ce0832ddd3a2a7b9538ed1bcca7d" +deps = ["PrecompileTools"] +git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" -version = "1.2.1" +version = "1.3.4" [[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" +[[deps.Richardson]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "e03ca566bec93f8a3aeb059c8ef102f268a38949" +uuid = "708f8203-808e-40c0-ba2d-98a6953ed40d" +version = "1.4.0" + [[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "77d3c4726515dca71f6d80fbb5e251088defe305" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.3.18" + [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[deps.SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" +[[deps.Shuffle]] +deps = ["Random"] +git-tree-sha1 = "b812fb30d6d8b295b71dd5a4102d1ae7b60698e3" +uuid = "bf21e494-c40e-4daa-abfb-de5ec0aad010" +version = "0.1.1" + +[[deps.SnoopPrecompile]] +deps = ["Preferences"] +git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c" +uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c" +version = "1.0.3" [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" [[deps.SortingAlgorithms]] deps = ["DataStructures"] -git-tree-sha1 = "b3363d7460f7d098ca0912c69b082f75625d7508" +git-tree-sha1 = "a4ada03f999bd01b3a25dcaa30b2d929fe537e00" uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.0.1" +version = "1.1.0" [[deps.SparseArrays]] deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] +git-tree-sha1 = "c262c8e978048c2b095be1672c9bee55b4619521" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.5.24" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.0" + [[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[[deps.StatsAPI]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "45a7769a04a3cf80da1c1c7c60caf932e6f4c9f7" +uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" +version = "1.6.0" + +[[deps.StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] +git-tree-sha1 = "d1bf48bfcc554a3761a133fe3a9bb01488e06916" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.33.21" + +[[deps.StringManipulation]] +git-tree-sha1 = "46da2434b41f41ac3594ee9816ce5541c6096123" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.0" + [[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" @@ -230,13 +351,9 @@ version = "1.0.1" [[deps.Tables]] deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] -git-tree-sha1 = "5ce79ce186cc678bbb5c5681ca3379d1ddae11a1" +git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.7.0" - -[[deps.Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.1" [[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] @@ -244,9 +361,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[deps.TimeSeries]] deps = ["Dates", "DelimitedFiles", "DocStringExtensions", "RecipesBase", "Reexport", "Statistics", "Tables"] -git-tree-sha1 = "3c91141a9f2276c37c3b6bc2bd83e652d50fecbc" +git-tree-sha1 = "3dd965ee9ce5e1857172cffa6d8985cd8b299585" uuid = "9e3dc215-6440-5c97-bce1-76c03772f85e" -version = "0.23.0" +version = "0.23.1" [[deps.UUIDs]] deps = ["Random", "SHA"] @@ -255,18 +372,6 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[deps.Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl", "OpenBLAS_jll"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" - -[[deps.nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[deps.p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/Project.toml b/Project.toml index dc73a1c..09a7cd8 100644 --- a/Project.toml +++ b/Project.toml @@ -4,14 +4,17 @@ authors = ["RiskLab AI "] version = "0.0.1" [deps] +BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Shuffle = "bf21e494-c40e-4daa-abfb-de5ec0aad010" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimeSeries = "9e3dc215-6440-5c97-bce1-76c03772f85e" [compat] -julia = ">= 1.3" DataFrames = ">= 1.0" TimeSeries = ">= 0.13.0" - +julia = ">= 1.3" diff --git a/src/Backtests/MultipleTestReporting.jl b/src/Backtests/MultipleTestReporting.jl new file mode 100644 index 0000000..db552f9 --- /dev/null +++ b/src/Backtests/MultipleTestReporting.jl @@ -0,0 +1,32 @@ +""" +List of Collaborators, Developers, and Research Assistants (in alphabetical order) +Ahmad Zaferani +""" + +using DataFrames +include("../features/Clustering.jl") +include("TestSetOverfitting.jl") + +#--------------------------------------------------- +""" +function: a Template for Reporting Results of Backtest on Financial Strategies +refernce: Fabozzi, F, De Prado, M (2018) Being Honest in Backtest Reporting: A Template for Disclosing Multiple Tests +doi: https://doi.org/10.3905/jpm.2018.45.1.141 +""" +function BacktestResultTemplate(nTrials::Int64, # number of trials + familySize::Union{UInt64,Nothing}, # number of significantly different experiments + correlation::Union{DataFrame,Nothing}, # corr dataframe + familyWiseErrorRate::Union{Float64,Nothing}, # family-wise error rate + powerOfTest::Union{Float64,Nothing}, # power of the test +)::Nothing + if familySize === nothing + println("familySize not provided; using clustering on correlation matrix of backtest returns...") + @assert correlation !== nothing "correlation must be provided" + correlationNew, clusters, silh = clusterKMeansTop(correlation) + familySize = length(clusters) + println("calculated familySize: ", familySize) + else + @assert nTrials >= familySize "familySize must be less equal to nTrials" + end + +end diff --git a/src/Backtests/TestSetOverfitting.jl b/src/Backtests/TestSetOverfitting.jl index a0f8499..3e7300c 100644 --- a/src/Backtests/TestSetOverfitting.jl +++ b/src/Backtests/TestSetOverfitting.jl @@ -15,22 +15,22 @@ refernce: De Prado, M (2020) Machine Learning for Asset Managers methodology: page 110, snippet 8.1 """ function expectedMaxSharpeRatio(nTrials, # number of trials - meanSharpeRatio, # mean Sharpe Ratio - stdSharpeRatio) # standard deviation of Sharpe Ratios + meanSharpeRatio, # mean Sharpe Ratio + stdSharpeRatio) # standard deviation of Sharpe Ratios emc = MathConstants.eulergamma # euler gamma constant - sharpeRatio = (1 - emc) * quantile(Normal(0, 1), 1 - 1 / nTrials) + emc * quantile(Normal(0, 1) , 1 - 1 / (nTrials * MathConstants.e)) # get expected value of sharpe ratio by using false strategy theorem + sharpeRatio = (1 - emc) * quantile(Normal(0, 1), 1 - 1 / nTrials) + emc * quantile(Normal(0, 1), 1 - 1 / (nTrials * MathConstants.e)) # get expected value of sharpe ratio by using false strategy theorem sharpeRatio = meanSharpeRatio + stdSharpeRatio * sharpeRatio # get max Sharpe Ratio, controlling for SBuMT - + return sharpeRatio end #--------------------------------------------------- function generatedMaxSharpeRatio(nSims, # number of simulations - nTrials, # number of trials - stdSharpeRatio, # mean Sharpe Ratio - meanSharpeRatio) # standard deviation of Sharpe Ratios + nTrials, # number of trials + stdSharpeRatio, # mean Sharpe Ratio + meanSharpeRatio) # standard deviation of Sharpe Ratios rng = MersenneTwister(1234) # create random number generator out = DataFrame() # initialize output @@ -39,11 +39,11 @@ function generatedMaxSharpeRatio(nSims, # number of simulations for nTrials_ in nTrials #1) Simulated Sharpe ratios sharpeRatio = randn(rng, (Int64(nSims), Int64(nTrials_))) # generate random numbers for Sharpe Ratios - sharpeRatio = (sharpeRatio .- mean(sharpeRatio, dims = 2)) ./std(sharpeRatio, dims = 2) # standardize Sharpe Ratios + sharpeRatio = (sharpeRatio .- mean(sharpeRatio, dims=2)) ./ std(sharpeRatio, dims=2) # standardize Sharpe Ratios sharpeRatio = meanSharpeRatio .+ sharpeRatio .* stdSharpeRatio # set the mean and standard deviation - + #2) Store output - output = DataFrame(maxSharpeRatio = vec(maximum(sharpeRatio, dims = 2)), nTrials = nTrials_) # generate output + output = DataFrame(maxSharpeRatio=vec(maximum(sharpeRatio, dims=2)), nTrials=nTrials_) # generate output append!(out, output) # append output end return out @@ -56,13 +56,13 @@ refernce: De Prado, M (2020) Machine Learning for Asset Managers methodology: page 112, snippet 8.2 """ function meanAndStdError(nSims0, # number of max{SR} used to estimate E[max{SR}] - nSims1, # number of errors on which std is computed - nTrials, # array of numbers of SR used to derive max{SR} - stdSharpeRatio, # mean Sharpe Ratio - meanSharpeRatio) # standard deviation of Sharpe Ratios + nSims1, # number of errors on which std is computed + nTrials, # array of numbers of SR used to derive max{SR} + stdSharpeRatio, # mean Sharpe Ratio + meanSharpeRatio) # standard deviation of Sharpe Ratios # Compute standard deviation of errors per nTrial - sharpeRatio0 = DataFrame(nT = nTrials, ExpectedMaxSR = [expectedMaxSharpeRatio(i, meanSharpeRatio, stdSharpeRatio) for i in nTrials]) # compute expected max Sharpe Ratios + sharpeRatio0 = DataFrame(nT=nTrials, ExpectedMaxSR=[expectedMaxSharpeRatio(i, meanSharpeRatio, stdSharpeRatio) for i in nTrials]) # compute expected max Sharpe Ratios error = DataFrame() # initialize errors out = DataFrame() # initialize output @@ -73,7 +73,7 @@ function meanAndStdError(nSims0, # number of max{SR} used to estimate E[max{SR}] error_[!, :ExpectedMaxSR] = sharpeRatio0.ExpectedMaxSR # add expected max Sharpe Ratios error_[!, :err] = error_.maxSharpeRatio ./ error_.ExpectedMaxSR .- 1 # calculate errors append!(error, error_) # append errors - end + end out[!, :meanErr] = combine(groupby(error, :nTrials), :err => mean; renamecols=false).err # calculate mean errors out[!, :nTrials] = combine(groupby(error, :nTrials), :err => mean; renamecols=false).nTrials # add number of trials @@ -89,23 +89,23 @@ refernce: De Prado, M (2020) Machine Learning for Asset Managers methodology: page 119, snippet 8.3 """ function estimatedSharpeRatioZStatistics(sharpeRatio, # estimated Sharpe Ratio - t, # number of observations - sharpeRatio_ = 0, # true Sharpe Ratio - skew = 0, # skewness of returns - kurt = 3) # kurtosis of returns + t, # number of observations + sharpeRatio_=0, # true Sharpe Ratio + skew=0, # skewness of returns + kurt=3) # kurtosis of returns - z = (sharpeRatio - sharpeRatio_)*(t - 1)^0.5 # calculate first part of z statistic - z /= (1 - skew*sr + (kurt - 1) / 4*sr^2)^0.5 # calculate z statistic + z = (sharpeRatio - sharpeRatio_) * (t - 1)^0.5 # calculate first part of z statistic + z /= (1 - skew * sr + (kurt - 1) / 4 * sr^2)^0.5 # calculate z statistic return z end #--------------------------------------------------- function strategyType1ErrorProbability(z, # z statistic for the estimated Sharpe Ratios - k = 1) # number of tests + k=1) # number of tests α = cdf(Normal(0, 1), -z) # find false positive rate - α_k = 1 - (1 - α) ^ k # correct for multi-testing + α_k = 1 - (1 - α)^k # correct for multi-testing return α_k end @@ -118,25 +118,24 @@ refernce: De Prado, M (2020) Machine Learning for Asset Managers methodology: page 121, snippet 8.4 """ function thetaForType2Error(sharpeRatio, # estimated Sharpe Ratio - t, # number of observations - sharpeRatio_ = 0, # true Sharpe Ratio - skew = 0, # skewness of returns - kurt = 3) # kurtosis of returns + t, # number of observations + sharpeRatio_=0, # true Sharpe Ratio + skew=0, # skewness of returns + kurt=3) # kurtosis of returns - θ = sharpeRatio_*(t - 1)^0.5 # calculate first part of theta - θ /= (1 - skew*sharpeRatio + (kurt - 1) / 4*sharpeRatio^2)^0.5 # calculate theta + θ = sharpeRatio_ * (t - 1)^0.5 # calculate first part of theta + θ /= (1 - skew * sharpeRatio + (kurt - 1) / 4 * sharpeRatio^2)^0.5 # calculate theta return θ end #--------------------------------------------------- function strategyType2ErrorProbability(α, # type I error - k, # number of tests - θ) # calculated theta parameter + k, # number of tests + θ) # calculated theta parameter - z = quantile(Normal(0, 1),(1 - α) ^ (1 / k)) # perform Sidak’s correction + z = quantile(Normal(0, 1), (1 - α)^(1 / k)) # perform Sidak’s correction β = cdf(Normal(0, 1), z - θ) # calculate false negative rate return β end - diff --git a/src/Features/Clustering.jl b/src/Features/Clustering.jl index 682442a..c9b8242 100644 --- a/src/Features/Clustering.jl +++ b/src/Features/Clustering.jl @@ -1,4 +1,3 @@ -#using KernelEstimator using Distributions using LinearAlgebra using DataFrames @@ -12,6 +11,7 @@ using MarketData using CSV using StatsBase using BlockDiagonals +using PyCall @pyimport sklearn.metrics as Metrics """---------------------------------------------------------------------- @@ -23,10 +23,10 @@ function percentChange(prices::DataFrames.DataFrame) returns = DataFrames.DataFrame() # empty dataframe of returns for sym in names(prices)[2:end] data = prices[!, Symbol(sym)] # prices of each name - ret = Array{Float64}(undef,length(data)) # returns of each name + ret = Array{Float64}(undef, length(data)) # returns of each name ret[1] = NaN for i in 2:length(data) - ret[i] = (data[i]/data[i-1]) - 1 # calculate returns of each name + ret[i] = (data[i] / data[i-1]) - 1 # calculate returns of each name end returns[!, Symbol(sym)] = ret # append returns into dataframe end @@ -38,17 +38,17 @@ function: Clustering reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. methodology: Snipet 4.1, Page 56 ----------------------------------------------------------------------""" -function clusterKMeansBase(correlation; - numberClusters = 10, - iterations = 10) - distance = sqrt.((1 .- correlation)/2) # distance matrix +function clusterKMeansBase(correlation; + numberClusters=10, + iterations=10) + distance = sqrt.((1 .- correlation) / 2) # distance matrix silh, kmeansOut = [NaN], [NaN] # initial value for silh, kmeans for init ∈ 1:iterations for i ∈ 2:numberClusters kmeans_ = kmeans(distance, i) # clustering distance with maximum cluster i silh_ = Metrics.silhouette_samples(distance, assignments(kmeans_)) # silh score of clustering - statistic = (mean(silh_)/std(silh_), mean(silh)/std(silh)) # calculate t-statistic - if isnan(statistic[2]) || statistic[1]>statistic[2] + statistic = (mean(silh_) / std(silh_), mean(silh) / std(silh)) # calculate t-statistic + if isnan(statistic[2]) || statistic[1] > statistic[2] silh, kmeansOut = silh_, kmeans_ # replace better clustering end end @@ -56,8 +56,8 @@ function clusterKMeansBase(correlation; indexSorted = sortperm(assignments(kmeansOut)) # sort arguments based on clustering correlationSorted = correlation[indexSorted, indexSorted] # new corr matrix based on clustering # dictionary of clustering - clusters = Dict("$i"=> filter(p->assignments(kmeansOut)[p] == i, indexSorted) for i in unique(assignments(kmeansOut))) - silh = DataFrames.DataFrame(silh = silh) # dataframe of silh scores + clusters = Dict("$i" => filter(p -> assignments(kmeansOut)[p] == i, indexSorted) for i in unique(assignments(kmeansOut))) + silh = DataFrames.DataFrame(silh=silh) # dataframe of silh scores return correlationSorted, clusters, silh, indexSorted end @@ -67,27 +67,27 @@ reference: De Prado, M. (2020) Advances in financial machine learning. John Wile methodology: Snipet 4.2, Page 58 ----------------------------------------------------------------------""" function makeNewOutputs(correlation, # corr dataframe - clusters, # cluster 1 - clusters2) # cluster 2 + clusters, # cluster 1 + clusters2) # cluster 2 assets = names(correlation)# name of the columns of corr dataframe # merge two clusters clustersNew = Dict() for i in keys(clusters) - clustersNew[length(keys(clustersNew)) + 1] = clusters[i] + clustersNew[length(keys(clustersNew))+1] = clusters[i] end for i in keys(clusters2) - clustersNew[length(keys(clustersNew)) + 1] = clusters2[i] + clustersNew[length(keys(clustersNew))+1] = clusters2[i] end indexNew = [j for i in keys(clustersNew) for j in clustersNew[i]] # sorted index of assets correlationNew = correlation[indexin(indexNew, assets), indexin(indexNew, assets)] # new corr matrix - distance = sqrt.((1 .- Matrix(correlation))/2) # distance matrix + distance = sqrt.((1 .- Matrix(correlation)) / 2) # distance matrix labelsKmeans = zeros(size(distance)[2]) # initial labels for i in keys(clustersNew) - index = indexin(clustersNew[i], assets) + index = indexin(clustersNew[i], assets) labelsKmeans[index] .= i # label for clusters end - silhNew = DataFrames.DataFrame(index = assets, silh = Metrics.silhouette_samples(distance, labelsKmeans)) # silh series - return correlationNew,clustersNew,silhNew + silhNew = DataFrames.DataFrame(index=assets, silh=Metrics.silhouette_samples(distance, labelsKmeans)) # silh series + return correlationNew, clustersNew, silhNew end """---------------------------------------------------------------------- @@ -95,22 +95,22 @@ function: clustering (ONC) reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. methodology: Snipet 4.2, Page 58 ----------------------------------------------------------------------""" -function clusterKMeansTop(correlation; # corr dataframe - numberClusters = nothing, # number of clusters - iterations = 10) # number of iterations +function clusterKMeansTop(correlation, # corr dataframe + numberClusters=nothing, # number of clusters + iterations=10) # number of iterations if isnothing(numberClusters) numberClusters = size(correlation)[2] - 1 # set number of cluster end assets = names(correlation) # names of columns # clustering - correlationSorted, clusters, silh, indexSorted = clusterKMeansBase(Matrix(correlation), numberClusters = min(numberClusters, size(correlation)[2] - 1), iterations = 10) + correlationSorted, clusters, silh, indexSorted = clusterKMeansBase(Matrix(correlation), numberClusters=min(numberClusters, size(correlation)[2] - 1), iterations=10) correlationSorted = DataFrames.DataFrame(correlationSorted, :auto) # dataframe of correlationSorted DataFrames.rename!(correlationSorted, Symbol.(names(correlationSorted)) .=> assets[indexSorted]) # rename columns of the dataframe of correlationSorted clusters = Dict("$i" => assets[clusters[i]] for i in keys(clusters)) # dictionary of clusters # calcultae t-statistic of each cluster - clusterTstats = Dict("$i" => mean(silh[indexin(clusters[i], assets), :silh])/std(silh[indexin(clusters[i], assets), :silh]) for i in keys(clusters)) - tStatMean = sum(values(clusterTstats))/length(clusterTstats) # mean of t-statistics - redoClusters = [i for i in keys(clusterTstats) if clusterTstats[i] mean(silh[indexin(clusters[i], assets), :silh]) / std(silh[indexin(clusters[i], assets), :silh]) for i in keys(clusters)) + tStatMean = sum(values(clusterTstats)) / length(clusterTstats) # mean of t-statistics + redoClusters = [i for i in keys(clusterTstats) if clusterTstats[i] < tStatMean] # select clusters which have t-stat lower than mean if length(redoClusters) <= 1 return correlationSorted, clusters, silh else @@ -119,12 +119,12 @@ function clusterKMeansTop(correlation; # corr dataframe assets_ = names(correlationTemp) # names of dataframe tStatMean = mean([clusterTstats[i] for i in redoClusters]) # mean of t-stats redoclusters # call again clusterKMeansTop - correlationSorted2, clusters2, silh2 = clusterKMeansTop(correlationTemp, numberClusters = min(numberClusters, size(correlationTemp)[2] - 1), iterations = iterations) + correlationSorted2, clusters2, silh2 = clusterKMeansTop(correlationTemp, numberClusters=min(numberClusters, size(correlationTemp)[2] - 1), iterations=iterations) # Make new outputs, if necessary correlationNew, clustersNew, silhNew = makeNewOutputs(correlation, Dict("$i" => clusters[i] for i in keys(clusters) if i ∉ redoClusters), clusters2) # mean of t-stats new output - newTstatMean = mean([mean(silhNew[indexin(clustersNew[i], silhNew.index), :silh])/ - std(silhNew[indexin(clustersNew[i], silhNew.index), :silh]) + newTstatMean = mean([mean(silhNew[indexin(clustersNew[i], silhNew.index), :silh]) / + std(silhNew[indexin(clustersNew[i], silhNew.index), :silh]) for i in keys(clustersNew)]) if newTstatMean <= tStatMean return correlationSorted, clusters, silh @@ -140,9 +140,9 @@ reference: De Prado, M. (2020) Advances in financial machine learning. John Wile methodology: Snipet 4.3, Page 61 ----------------------------------------------------------------------""" function randomCovarianceSub(numberObservations, # number of observations - numberColumns, # number of cols - σ, # sigma for normal distribution - domain) # range for rand + numberColumns, # number of cols + σ, # sigma for normal distribution + domain) # range for rand # Sub correlation matrix if numberColumns == 1 return ones(1, 1) @@ -160,17 +160,17 @@ reference: De Prado, M. (2020) Advances in financial machine learning. John Wile methodology: Snipet 4.3, Page 61 ----------------------------------------------------------------------""" function randomBlockCovariance(numberColumns, # number of cols - numberBlocks; # number of blocks - blockSizeMin = 1, # minimum size of block - σ = 1., # sigma for normal distribution - domain = nothing) # range for rand + numberBlocks; # number of blocks + blockSizeMin=1, # minimum size of block + σ=1.0, # sigma for normal distribution + domain=nothing) # range for rand # Generate a block random correlation matrix - parts = sort(StatsBase.sample(domain, 1:numberColumns - (blockSizeMin - 1)*numberBlocks - 1, numberBlocks - 1, replace = false)) - append!(parts, numberColumns - (blockSizeMin - 1)*numberBlocks) - parts = append!([parts[1]], diff(parts)) .-1 .+ blockSizeMin + parts = sort(StatsBase.sample(domain, 1:numberColumns-(blockSizeMin-1)*numberBlocks-1, numberBlocks - 1, replace=false)) + append!(parts, numberColumns - (blockSizeMin - 1) * numberBlocks) + parts = append!([parts[1]], diff(parts)) .- 1 .+ blockSizeMin covariance = nothing for column in parts - thisCovariance = randomCovarianceSub(Int(max(column*(column + 1)/2., 100)), column, σ, domain) # sub covariance + thisCovariance = randomCovarianceSub(Int(max(column * (column + 1) / 2.0, 100)), column, σ, domain) # sub covariance if isnothing(covariance) covariance = copy(thisCovariance) #copy covariance else @@ -186,14 +186,14 @@ reference: De Prado, M. (2020) Advances in financial machine learning. John Wile methodology: Snipet 4.3, Page 61 ----------------------------------------------------------------------""" function randomBlockCorrelation(numberColumns, # number of cols - numberBlocks; # number of blocks - randomState = nothing, # for rand data - blockSizeMin = 1) # minimum size of block + numberBlocks; # number of blocks + randomState=nothing, # for rand data + blockSizeMin=1) # minimum size of block # set seed domain = MersenneTwister(randomState) # generate 2 random block diagram cov matrix - covariance1 = randomBlockCovariance(numberColumns, numberBlocks, blockSizeMin = blockSizeMin, σ = .5, domain = domain) - covariance2 = randomBlockCovariance(numberColumns, 1, blockSizeMin = blockSizeMin, σ = 1., domain = domain) # add noise + covariance1 = randomBlockCovariance(numberColumns, numberBlocks, blockSizeMin=blockSizeMin, σ=0.5, domain=domain) + covariance2 = randomBlockCovariance(numberColumns, 1, blockSizeMin=blockSizeMin, σ=1.0, domain=domain) # add noise covariance1 += covariance2 # add 2 cov matrix correlation = covToCorr(covariance1) # corr matrix correlation = DataFrames.DataFrame(correlation, :auto) # dataframe of corr matrix @@ -207,8 +207,8 @@ end ----------------------------------------------------------------------""" function covToCorr(covariance) # covariance matrix std = sqrt.((diag(covariance))) # standard deviations - correlation = covariance./(std.*std') # create correlation matrix - correlation[correlation .< -1] .= -1 # numerical error - correlation[correlation .> 1] .= 1 # numerical error + correlation = covariance ./ (std .* std') # create correlation matrix + correlation[correlation.<-1] .= -1 # numerical error + correlation[correlation.>1] .= 1 # numerical error return correlation end From 9a37d3ad2c7128fe8e38c75bed6ef91239fc5aa8 Mon Sep 17 00:00:00 2001 From: Ahmad Date: Fri, 5 May 2023 11:28:31 +0430 Subject: [PATCH 2/4] second reference paper --- src/Backtests/MultipleTestReporting.jl | 28 ++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/Backtests/MultipleTestReporting.jl b/src/Backtests/MultipleTestReporting.jl index db552f9..d55b517 100644 --- a/src/Backtests/MultipleTestReporting.jl +++ b/src/Backtests/MultipleTestReporting.jl @@ -15,18 +15,38 @@ doi: https://doi.org/10.3905/jpm.2018.45.1.141 """ function BacktestResultTemplate(nTrials::Int64, # number of trials familySize::Union{UInt64,Nothing}, # number of significantly different experiments - correlation::Union{DataFrame,Nothing}, # corr dataframe - familyWiseErrorRate::Union{Float64,Nothing}, # family-wise error rate + clusteringArgs::Union{Array{DataFrame,UInt64,UInt64},Nothing}, # corr dataframe, number of clusters, number of iterations + familyWiseErrorRate::Union{UFloat64,Nothing}, # family-wise error rate + typeOneErrorArgs::Union{Array{Float64,UInt64},Nothing}, # z statistic for the estimated Sharpe Ratios, number of tests + CalculatePowerOfTest::Bool, # should calculate power of the test powerOfTest::Union{Float64,Nothing}, # power of the test + typeTwoErrorArgs::Union{Array{UInt64,Float64},Nothing}, # number of tests, calculated theta parameter )::Nothing if familySize === nothing println("familySize not provided; using clustering on correlation matrix of backtest returns...") - @assert correlation !== nothing "correlation must be provided" - correlationNew, clusters, silh = clusterKMeansTop(correlation) + @assert clusteringArgs !== nothing "clustering function arguments must be provided" + correlationNew, clusters, silh = clusterKMeansTop(clusteringArgs...) familySize = length(clusters) println("calculated familySize: ", familySize) else @assert nTrials >= familySize "familySize must be less equal to nTrials" end + if familyWiseErrorRate === nothing + println("familyWiseErrorRate not provided; using Sharpe Ratio type 1 error under multiple testing...") + @assert typeOneErrorArgs !== nothing "Type 1 Error Probability function arguments must be provided" + familyWiseErrorRate = strategyType1ErrorProbability(typeOneErrorArgs...) + println("calculated familyWiseErrorRate: ", familyWiseErrorRate) + end + + if CalculatePowerOfTest == false + println("skipping calculating optional parameter: powerOfTest ...") + else + @assert powerOfTest === nothing "powerOfTest must not be provided" + println("powerOfTest not provided, using Sharpe Ratio type 2 error under multiple testing...") + @assert typeTwoErrorArgs !== nothing "Type 2 Error Probability function arguments must be provided" + insert!(typeTwoErrorArgs, 1, familyWiseErrorRate) + powerOfTest = strategyType2ErrorProbability(typeTwoErrorArgs...) + println("calculated powerOfTest: ", powerOfTest) + end end From a4a678b74a0ea696da9cc2b838944465b810cfd1 Mon Sep 17 00:00:00 2001 From: Ahmad Date: Fri, 5 May 2023 17:09:33 +0430 Subject: [PATCH 3/4] first test set created --- src/Backtests/MultipleTestReporting.jl | 17 +++++++++------- test/Backtests/MultipleTestReportingTest.jl | 22 +++++++++++++++++++++ test/runtests.jl | 3 +-- 3 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 test/Backtests/MultipleTestReportingTest.jl diff --git a/src/Backtests/MultipleTestReporting.jl b/src/Backtests/MultipleTestReporting.jl index d55b517..04ec8a4 100644 --- a/src/Backtests/MultipleTestReporting.jl +++ b/src/Backtests/MultipleTestReporting.jl @@ -14,14 +14,16 @@ refernce: Fabozzi, F, De Prado, M (2018) Being Honest in Backtest Reporting: A T doi: https://doi.org/10.3905/jpm.2018.45.1.141 """ function BacktestResultTemplate(nTrials::Int64, # number of trials - familySize::Union{UInt64,Nothing}, # number of significantly different experiments - clusteringArgs::Union{Array{DataFrame,UInt64,UInt64},Nothing}, # corr dataframe, number of clusters, number of iterations - familyWiseErrorRate::Union{UFloat64,Nothing}, # family-wise error rate - typeOneErrorArgs::Union{Array{Float64,UInt64},Nothing}, # z statistic for the estimated Sharpe Ratios, number of tests - CalculatePowerOfTest::Bool, # should calculate power of the test - powerOfTest::Union{Float64,Nothing}, # power of the test - typeTwoErrorArgs::Union{Array{UInt64,Float64},Nothing}, # number of tests, calculated theta parameter + familySize::Union{Int64,Nothing}=nothing, # number of significantly different experiments + clusteringArgs::Union{Tuple{DataFrame,Int,Int},Nothing}=nothing, # correlation dataframe, number of clusters, number of iterations + familyWiseErrorRate::Union{Float64,Nothing}=nothing, # family-wise error rate + typeOneErrorArgs::Union{Tuple{Float64,Int64},Nothing}=nothing, # z statistic for the estimated Sharpe Ratios, number of tests + CalculatePowerOfTest::Bool=false, # should calculate power of the test + powerOfTest::Union{Float64,Nothing}=nothing, # power of the test + typeTwoErrorArgs::Union{Tuple{Int64,Float64},Nothing}=nothing # number of tests, calculated theta parameter )::Nothing + @assert nTrials > 0 "nTrials must be a positive integer" + if familySize === nothing println("familySize not provided; using clustering on correlation matrix of backtest returns...") @assert clusteringArgs !== nothing "clustering function arguments must be provided" @@ -29,6 +31,7 @@ function BacktestResultTemplate(nTrials::Int64, # number of trials familySize = length(clusters) println("calculated familySize: ", familySize) else + @assert familySize > 0 "familySize must be a positive integer" @assert nTrials >= familySize "familySize must be less equal to nTrials" end diff --git a/test/Backtests/MultipleTestReportingTest.jl b/test/Backtests/MultipleTestReportingTest.jl new file mode 100644 index 0000000..3e5ef37 --- /dev/null +++ b/test/Backtests/MultipleTestReportingTest.jl @@ -0,0 +1,22 @@ +""" +List of Collaborators, Developers, and Research Assistants (in alphabetical order) +Ahmad Zaferani +""" + +using Test +include("../../src/Backtests/MultipleTestReporting.jl") + + +@testset "BacktestResultTemplate" begin + @testset "test arguments" begin + @test_throws AssertionError BacktestResultTemplate(-1) + @test_throws AssertionError BacktestResultTemplate(1, -1) + @test_throws AssertionError BacktestResultTemplate(1, 5) + @test_throws AssertionError BacktestResultTemplate(1, 1, nothing) + @test BacktestResultTemplate(1, 1, nothing, 1.0) === nothing + end + + @testset "call dependencies" begin + @test BacktestResultTemplate(1, 1, nothing, 1.0) === nothing + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 5f1abd5..18f3018 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,2 +1 @@ -using Test -@test true +include("Backtests/MultipleTestReportingTest.jl") From 9027cef502acad437fc613a8ecd90f6759c6bb4a Mon Sep 17 00:00:00 2001 From: Ahmad Date: Fri, 5 May 2023 22:42:06 +0430 Subject: [PATCH 4/4] add mocking package - unit test dependencies --- Manifest.toml | 11 ++++ Project.toml | 1 + src/Backtests/MultipleTestReporting.jl | 31 +++++---- test/Backtests/MultipleTestReportingTest.jl | 69 ++++++++++++++++++--- 4 files changed, 93 insertions(+), 19 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 4986c41..ceb1171 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -102,6 +102,11 @@ git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" version = "0.9.3" +[[deps.ExprTools]] +git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.9" + [[deps.FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays", "Statistics"] git-tree-sha1 = "7072f1e3e5a8be51d525d64f63d3ec1287ff2790" @@ -193,6 +198,12 @@ version = "1.1.0" [[deps.Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" +[[deps.Mocking]] +deps = ["Compat", "ExprTools"] +git-tree-sha1 = "782e258e80d68a73d8c916e55f8ced1de00c2cea" +uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" +version = "0.7.6" + [[deps.NearestNeighbors]] deps = ["Distances", "StaticArrays"] git-tree-sha1 = "2c3726ceb3388917602169bed973dbc97f1b51a8" diff --git a/Project.toml b/Project.toml index 09a7cd8..a20f9c8 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Mocking = "78c3b35d-d492-501b-9361-3d52fe80e533" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Shuffle = "bf21e494-c40e-4daa-abfb-de5ec0aad010" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/Backtests/MultipleTestReporting.jl b/src/Backtests/MultipleTestReporting.jl index 04ec8a4..aa1e59b 100644 --- a/src/Backtests/MultipleTestReporting.jl +++ b/src/Backtests/MultipleTestReporting.jl @@ -4,6 +4,7 @@ Ahmad Zaferani """ using DataFrames +using Mocking include("../features/Clustering.jl") include("TestSetOverfitting.jl") @@ -13,7 +14,7 @@ function: a Template for Reporting Results of Backtest on Financial Strategies refernce: Fabozzi, F, De Prado, M (2018) Being Honest in Backtest Reporting: A Template for Disclosing Multiple Tests doi: https://doi.org/10.3905/jpm.2018.45.1.141 """ -function BacktestResultTemplate(nTrials::Int64, # number of trials +function backtestResultTemplate(nTrials::Int64, # number of trials familySize::Union{Int64,Nothing}=nothing, # number of significantly different experiments clusteringArgs::Union{Tuple{DataFrame,Int,Int},Nothing}=nothing, # correlation dataframe, number of clusters, number of iterations familyWiseErrorRate::Union{Float64,Nothing}=nothing, # family-wise error rate @@ -21,35 +22,41 @@ function BacktestResultTemplate(nTrials::Int64, # number of trials CalculatePowerOfTest::Bool=false, # should calculate power of the test powerOfTest::Union{Float64,Nothing}=nothing, # power of the test typeTwoErrorArgs::Union{Tuple{Int64,Float64},Nothing}=nothing # number of tests, calculated theta parameter -)::Nothing +)::String + buffer = IOBuffer() + @assert nTrials > 0 "nTrials must be a positive integer" if familySize === nothing - println("familySize not provided; using clustering on correlation matrix of backtest returns...") + println(buffer, "familySize not provided; using clustering on correlation matrix of backtest returns...") @assert clusteringArgs !== nothing "clustering function arguments must be provided" - correlationNew, clusters, silh = clusterKMeansTop(clusteringArgs...) + correlationNew, clusters, silh = @mock clusterKMeansTop(clusteringArgs...) familySize = length(clusters) - println("calculated familySize: ", familySize) + println(buffer, "calculated familySize: ", familySize) else @assert familySize > 0 "familySize must be a positive integer" @assert nTrials >= familySize "familySize must be less equal to nTrials" end if familyWiseErrorRate === nothing - println("familyWiseErrorRate not provided; using Sharpe Ratio type 1 error under multiple testing...") + println(buffer, "familyWiseErrorRate not provided; using Sharpe Ratio type 1 error under multiple testing...") @assert typeOneErrorArgs !== nothing "Type 1 Error Probability function arguments must be provided" - familyWiseErrorRate = strategyType1ErrorProbability(typeOneErrorArgs...) - println("calculated familyWiseErrorRate: ", familyWiseErrorRate) + familyWiseErrorRate = @mock strategyType1ErrorProbability(typeOneErrorArgs...) + println(buffer, "calculated familyWiseErrorRate: ", familyWiseErrorRate) end if CalculatePowerOfTest == false - println("skipping calculating optional parameter: powerOfTest ...") + println(buffer, "skipping calculating optional parameter: powerOfTest ...") else @assert powerOfTest === nothing "powerOfTest must not be provided" - println("powerOfTest not provided, using Sharpe Ratio type 2 error under multiple testing...") + println(buffer, "powerOfTest not provided, using Sharpe Ratio type 2 error under multiple testing...") @assert typeTwoErrorArgs !== nothing "Type 2 Error Probability function arguments must be provided" + typeTwoErrorArgs = [typeTwoErrorArgs...] insert!(typeTwoErrorArgs, 1, familyWiseErrorRate) - powerOfTest = strategyType2ErrorProbability(typeTwoErrorArgs...) - println("calculated powerOfTest: ", powerOfTest) + powerOfTest = @mock strategyType2ErrorProbability(typeTwoErrorArgs...) + println(buffer, "calculated powerOfTest: ", powerOfTest) end + + println(buffer, "Strategy Results are as follows:\n1. number of trials: $nTrials\n2. number of significantly different experiments: $familySize\n3. family-wise error rate: $familyWiseErrorRate\n4. power of the test: $powerOfTest") + return String(take!(buffer)) end diff --git a/test/Backtests/MultipleTestReportingTest.jl b/test/Backtests/MultipleTestReportingTest.jl index 3e5ef37..b2e1c3d 100644 --- a/test/Backtests/MultipleTestReportingTest.jl +++ b/test/Backtests/MultipleTestReportingTest.jl @@ -4,19 +4,74 @@ Ahmad Zaferani """ using Test +using Mocking +using DataFrames include("../../src/Backtests/MultipleTestReporting.jl") +Mocking.activate() -@testset "BacktestResultTemplate" begin +function generateSuccessOutput(nTrials, familySize, familyWiseErrorRate, powerOfTest, + familySizeNotProvided=false, calculatedFamilySize=0, familyWiseErrorRateNotProvided=false, + calculatedFamilyWizeErrorRate=0, skipPowerOfTest=true, calculatedPowerOfTest=0) + familySizeNotProvidedMsg = "familySize not provided; using clustering on correlation matrix of backtest returns..." + calculatedFamilySizeMsg = "calculated familySize: $calculatedFamilySize" + familyWiseErrorRateNotProvidedMsg = "familyWiseErrorRate not provided; using Sharpe Ratio type 1 error under multiple testing..." + calculatedFamilyWizeErrorRateMsg = "calculated familyWiseErrorRate: $calculatedFamilyWizeErrorRate" + skipPowerOfTestMsg = "skipping calculating optional parameter: powerOfTest ..." + powerOfTestNotProvidedMsg = "powerOfTest not provided, using Sharpe Ratio type 2 error under multiple testing..." + calculatedPowerOfTestMsg = "calculated powerOfTest: $calculatedPowerOfTest" + + buffer = IOBuffer() + if familySizeNotProvided == true + println(buffer, familySizeNotProvidedMsg) + println(buffer, calculatedFamilySizeMsg) + familySize = calculatedFamilySize + end + if familyWiseErrorRateNotProvided == true + println(buffer, familyWiseErrorRateNotProvidedMsg) + println(buffer, calculatedFamilyWizeErrorRateMsg) + familyWiseErrorRate = calculatedFamilyWizeErrorRate + end + if skipPowerOfTest == true + println(buffer, skipPowerOfTestMsg) + else + println(buffer, powerOfTestNotProvidedMsg) + println(buffer, calculatedPowerOfTestMsg) + powerOfTest = calculatedPowerOfTest + end + println(buffer, "Strategy Results are as follows:\n1. number of trials: $nTrials\n2. number of significantly different experiments: $familySize\n3. family-wise error rate: $familyWiseErrorRate\n4. power of the test: $powerOfTest") + return String(take!(buffer)) +end + +@testset "backtestResultTemplate" begin @testset "test arguments" begin - @test_throws AssertionError BacktestResultTemplate(-1) - @test_throws AssertionError BacktestResultTemplate(1, -1) - @test_throws AssertionError BacktestResultTemplate(1, 5) - @test_throws AssertionError BacktestResultTemplate(1, 1, nothing) - @test BacktestResultTemplate(1, 1, nothing, 1.0) === nothing + @test_throws AssertionError backtestResultTemplate(-1) + @test_throws AssertionError backtestResultTemplate(1, -1) + @test_throws AssertionError backtestResultTemplate(1, 5) + @test_throws AssertionError backtestResultTemplate(1, 1, nothing) + @test backtestResultTemplate(1, 1, nothing, 1.0) === generateSuccessOutput(1, 1, 1.0, nothing) end @testset "call dependencies" begin - @test BacktestResultTemplate(1, 1, nothing, 1.0) === nothing + twentyCluster = @patch clusterKMeansTop(correlation, numberClusters=nothing, itetations=10) = nothing, 1:20, nothing + apply(twentyCluster) do + @test backtestResultTemplate(1, nothing, (DataFrame(), 1, 1), 1.0) == generateSuccessOutput(1, 10, 1.0, nothing, true, 20) + end + + zeroPointSixError = @patch strategyType1ErrorProbability(z, k=1) = 0.6 + apply(zeroPointSixError) do + @test backtestResultTemplate(20, 10, nothing, nothing, (0.1, 1)) == generateSuccessOutput(20, 10, 1.0, nothing, false, 0, true, 0.6) + end + end + + @testset "optional parameters" begin + @test backtestResultTemplate(1, 1, nothing, 1.0, nothing, false, 0.44) == generateSuccessOutput(1, 1, 1.0, 0.44) + @test_throws AssertionError backtestResultTemplate(1, 1, nothing, 1.0, nothing, true) + @test_throws AssertionError backtestResultTemplate(1, 1, nothing, 1.0, nothing, true, 1.0) + + zeroPointFourtyFourError = @patch strategyType2ErrorProbability(α, k, θ) = 0.44 + apply(zeroPointFourtyFourError) do + @test backtestResultTemplate(1, 1, nothing, 1.0, nothing, true, nothing, (1, 1.0)) == generateSuccessOutput(1, 1, 1.0, nothing, false, 0, false, 0, false, 0.44) + end end end