diff --git a/README.md b/README.md index fbff8b342..a31037910 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ framework?** Start [here](https://JuliaAI.github.io/MLJ.jl/dev/quick_start_guide MLJ was initially created as a Tools, Practices and Systems project at the [Alan Turing Institute](https://www.turing.ac.uk/) -in 2019. Current funding is provided by a [New Zealand Strategic +in 2019. Funding has also been provided by a [New Zealand Strategic Science Investment Fund](https://www.mbie.govt.nz/science-and-technology/science-and-innovation/funding-information-and-opportunities/investment-funds/strategic-science-investment-fund/ssif-funded-programmes/university-of-auckland/) awarded to the University of Auckland. @@ -50,6 +50,7 @@ awarded to the University of Auckland. MLJ has been developed with the support of the following organizations:
+ diff --git a/examples/telco/Manifest.toml b/examples/telco/Manifest.toml deleted file mode 100644 index a9d63382d..000000000 --- a/examples/telco/Manifest.toml +++ /dev/null @@ -1,1337 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.6.5" -manifest_format = "2.0" - -[[deps.ARFFFiles]] -deps = ["CategoricalArrays", "Dates", "Parsers", "Tables"] -git-tree-sha1 = "e8c8e0a2be6eb4f56b1672e46004463033daa409" -uuid = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" -version = "1.4.1" - -[[deps.AbstractFFTs]] -deps = ["ChainRulesCore", "LinearAlgebra"] -git-tree-sha1 = "6f1d9bc1c08f9f4a8fa92e3ea3cb50153a1b40d4" -uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.1.0" - -[[deps.Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "af92965fb30777147966f58acb05da51c5616b5f" -uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.3" - -[[deps.ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" - -[[deps.Arpack]] -deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] -git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" -uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -version = "0.4.0" - -[[deps.Arpack_jll]] -deps = ["Libdl", "OpenBLAS_jll", "Pkg"] -git-tree-sha1 = "e214a9b9bd1b4e1b4f15b22c0994862b66af7ff7" -uuid = "68821587-b530-5797-8361-c406ea357684" -version = "3.5.0+3" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.BFloat16s]] -deps = ["LinearAlgebra", "Printf", "Random", "Test"] -git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.2.0" - -[[deps.BSON]] -git-tree-sha1 = "306bb5574b0c1c56d7e1207581516c557d105cad" -uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -version = "0.3.5" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.Bzip2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" -uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" -version = "1.0.8+0" - -[[deps.CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" - -[[deps.CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "c60152d5401c14b770b045933a255828f1786bd3" -uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.8.3" - -[[deps.Cairo_jll]] -deps = ["Artifacts", "Bzip2_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] -git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2" -uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a" -version = "1.16.1+1" - -[[deps.Calculus]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "f641eb0a4f00c343bbc32346e1217b86f3ce9dad" -uuid = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9" -version = "0.5.1" - -[[deps.CategoricalArrays]] -deps = ["DataAPI", "Future", "Missings", "Printf", "Requires", "Statistics", "Unicode"] -git-tree-sha1 = "3b60064cb48efe986179359e08ffb568a6d510a2" -uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.10.3" - -[[deps.CategoricalDistributions]] -deps = ["CategoricalArrays", "Distributions", "Missings", "OrderedCollections", "Random", 
"ScientificTypesBase", "UnicodePlots"] -git-tree-sha1 = "8c340dc71d2dc9177b1f701726d08d2255d2d811" -uuid = "af321ab8-2d2e-40a6-b165-3d674595d28e" -version = "0.1.5" - -[[deps.ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "c9a6160317d1abe9c44b3beb367fd448117679ca" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.13.0" - -[[deps.ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] -git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" -version = "0.1.2" - -[[deps.CodecZlib]] -deps = ["TranscodingStreams", "Zlib_jll"] -git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" -uuid = "944b1d66-785c-5afd-91f1-9de20f533193" -version = "0.7.0" - -[[deps.ColorSchemes]] -deps = ["ColorTypes", "Colors", "FixedPointNumbers", "Random"] -git-tree-sha1 = "12fc73e5e0af68ad3137b886e3f7c1eacfca2640" -uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4" -version = "3.17.1" - -[[deps.ColorTypes]] -deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "024fe24d83e4a5bf5fc80501a314ce0d1aa35597" -uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.11.0" - -[[deps.Colors]] -deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] -git-tree-sha1 = "417b0ed7b8b838aa6ca0a87aadf1bb9eb111ce40" -uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.12.8" - -[[deps.Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "96b0bc6c52df76506efc8a441c6cf1adcb1babc4" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.42.0" - -[[deps.CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" - -[[deps.ComputationalResources]] -git-tree-sha1 = "52cb3ec90e8a8bea0e62e275ba577ad0f74821f7" -uuid = "ed09eef8-17a6-5b46-8889-db040fac31e3" -version = "0.3.2" - -[[deps.ConstructionBase]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" -uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.3.0" - -[[deps.Contour]] -deps = ["StaticArrays"] -git-tree-sha1 = "9f02045d934dc030edad45944ea80dbd1f0ebea7" -uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" -version = "0.5.7" - -[[deps.Crayons]] -git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" -uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.1.1" - -[[deps.DataAPI]] -git-tree-sha1 = "cc70b17275652eb47bc9e5f81635981f13cea5c8" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.9.0" - -[[deps.DataFrames]] -deps = ["Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "ae02104e835f219b8930c7664b8012c93475c340" -uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.3.2" - -[[deps.DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "3daef5523dd2e769dad2365274f760ff5f282c7d" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.11" - -[[deps.DataValueInterfaces]] -git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" -uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" -version = "1.0.0" - -[[deps.Dates]] -deps = ["Printf"] 
-uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.DecisionTree]] -deps = ["DelimitedFiles", "Distributed", "LinearAlgebra", "Random", "ScikitLearnBase", "Statistics", "Test"] -git-tree-sha1 = "123adca1e427dc8abc5eec5040644e7842d53c92" -uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" -version = "0.10.11" - -[[deps.DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[deps.DensityInterface]] -deps = ["InverseFunctions", "Test"] -git-tree-sha1 = "80c3e8639e3353e5d2912fb3a1916b8455e2494b" -uuid = "b429d917-457f-4dbc-8f4c-0cc954292b1d" -version = "0.4.0" - -[[deps.Distances]] -deps = ["LinearAlgebra", "SparseArrays", "Statistics", "StatsAPI"] -git-tree-sha1 = "3258d0659f812acde79e8a74b11f17ac06d0ca04" -uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -version = "0.10.7" - -[[deps.Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[deps.Distributions]] -deps = ["ChainRulesCore", "DensityInterface", "FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns", "Test"] -git-tree-sha1 = "9d3c0c762d4666db9187f363a76b47f7346e673b" -uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.25.49" - -[[deps.DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" - -[[deps.Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" - -[[deps.DualNumbers]] -deps = ["Calculus", "NaNMath", "SpecialFunctions"] -git-tree-sha1 = "90b158083179a6ccbce2c7eb1446d5bf9d7ae571" -uuid = "fa6b7ba4-c1ee-5f82-b5fc-ecf0adba8f74" -version = "0.6.7" - -[[deps.EarCut_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "3f3a2501fa7236e9b911e0f7a588c657e822bb6d" -uuid = "5ae413db-bbd1-5e63-b57d-d24a61df00f5" -version = "2.2.3+0" - -[[deps.EarlyStopping]] -deps = ["Dates", "Statistics"] -git-tree-sha1 = "98fdf08b707aaf69f524a6cd0a67858cefe0cfb6" -uuid = "792122b4-ca99-40de-a6bc-6742525f08b6" -version = "0.3.0" - -[[deps.EvoTrees]] -deps = ["BSON", "CUDA", "CategoricalArrays", "Distributions", "MLJModelInterface", "NetworkLayout", "Random", "RecipesBase", "SpecialFunctions", "StaticArrays", "Statistics", "StatsBase"] -git-tree-sha1 = "4ee5e68551afec3ae2af30931e333b5d35cc52a8" -uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" -version = "0.9.4" - -[[deps.Expat_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "ae13fcbc7ab8f16b0856729b050ef0c446aa3492" -uuid = "2e619515-83b5-522b-bb60-26c02a35a201" -version = "2.4.4+0" - -[[deps.ExprTools]] -git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.8" - -[[deps.FFMPEG]] -deps = ["FFMPEG_jll"] -git-tree-sha1 = "b57e3acbe22f8484b4b5ff66a7499717fe1a9cc8" -uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" -version = "0.4.1" - -[[deps.FFMPEG_jll]] -deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "LAME_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "Pkg", "Zlib_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"] -git-tree-sha1 = "d8a578692e3077ac998b50c0217dfd67f21d1e5f" -uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5" -version = "4.4.0+0" - -[[deps.FilePathsBase]] -deps = ["Compat", "Dates", "Mmap", "Printf", "Test", "UUIDs"] -git-tree-sha1 = 
"04d13bfa8ef11720c24e4d840c0033d145537df7" -uuid = "48062228-2e41-5def-b9a4-89aafe57970f" -version = "0.9.17" - -[[deps.FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays", "Statistics"] -git-tree-sha1 = "0dbc5b9683245f905993b51d2814202d75b34f1a" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.13.1" - -[[deps.FixedPointNumbers]] -deps = ["Statistics"] -git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" -uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.8.4" - -[[deps.Fontconfig_jll]] -deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "21efd19106a55620a188615da6d3d06cd7f6ee03" -uuid = "a3f928ae-7b40-5064-980b-68af3947d34b" -version = "2.13.93+0" - -[[deps.Formatting]] -deps = ["Printf"] -git-tree-sha1 = "8339d61043228fdd3eb658d86c926cb282ae72a8" -uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" -version = "0.4.2" - -[[deps.FreeType2_jll]] -deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "87eb71354d8ec1a96d4a7636bd57a7347dde3ef9" -uuid = "d7e528f0-a631-5988-bf34-fe36492bcfd7" -version = "2.10.4+0" - -[[deps.FriBidi_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "aa31987c2ba8704e23c6c8ba8a4f769d5d7e4f91" -uuid = "559328eb-81f9-559d-9380-de523a88c83c" -version = "1.0.10+0" - -[[deps.Future]] -deps = ["Random"] -uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" - -[[deps.GLFW_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"] -git-tree-sha1 = "51d2dfe8e590fbd74e7a842cf6d13d8a2f45dc01" -uuid = "0656b61e-2033-5cc2-a64a-77c0f6c09b89" -version = "3.3.6+0" - -[[deps.GPUArrays]] -deps = ["Adapt", "LLVM", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "cf91e6e9213b9190dc0511d6fff862a86652a94a" -uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.2.1" - -[[deps.GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "647a54f196b5ffb7c3bc2fec5c9a57fa273354cc" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.13.14" - -[[deps.GR]] -deps = ["Base64", "DelimitedFiles", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Pkg", "Printf", "Random", "RelocatableFolders", "Serialization", "Sockets", "Test", "UUIDs"] -git-tree-sha1 = "9f836fb62492f4b0f0d3b06f55983f2704ed0883" -uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" -version = "0.64.0" - -[[deps.GR_jll]] -deps = ["Artifacts", "Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "GLFW_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Pkg", "Qt5Base_jll", "Zlib_jll", "libpng_jll"] -git-tree-sha1 = "a6c850d77ad5118ad3be4bd188919ce97fffac47" -uuid = "d2c73de3-f751-5644-a686-071e5b155ba9" -version = "0.64.0+0" - -[[deps.GeometryBasics]] -deps = ["EarCut_jll", "IterTools", "LinearAlgebra", "StaticArrays", "StructArrays", "Tables"] -git-tree-sha1 = "83ea630384a13fc4f002b77690bc0afeb4255ac9" -uuid = "5c1252a2-5f33-56bf-86c9-59e7332b4326" -version = "0.4.2" - -[[deps.Gettext_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "XML2_jll"] -git-tree-sha1 = "9b02998aba7bf074d14de89f9d37ca24a1a0b046" -uuid = "78b55507-aeef-58d4-861c-77aaff3498b1" -version = "0.21.0+0" - -[[deps.Glib_jll]] -deps = ["Artifacts", "Gettext_jll", "JLLWrappers", "Libdl", 
"Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "a32d672ac2c967f3deb8a81d828afc739c838a06" -uuid = "7746bdde-850d-59dc-9ae8-88ece973131d" -version = "2.68.3+2" - -[[deps.Graphite2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "344bf40dcab1073aca04aa0df4fb092f920e4011" -uuid = "3b182d85-2403-5c21-9c21-1e1f0cc25472" -version = "1.3.14+0" - -[[deps.Grisu]] -git-tree-sha1 = "53bb909d1151e57e2484c3d1b53e19552b887fb2" -uuid = "42e2da0e-8278-4e71-bc24-59509adca0fe" -version = "1.0.2" - -[[deps.HTTP]] -deps = ["Base64", "Dates", "IniFile", "Logging", "MbedTLS", "NetworkOptions", "Sockets", "URIs"] -git-tree-sha1 = "0fa77022fe4b511826b39c894c90daf5fce3334a" -uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "0.9.17" - -[[deps.HarfBuzz_jll]] -deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg"] -git-tree-sha1 = "129acf094d168394e80ee1dc4bc06ec835e510a3" -uuid = "2e76f6c2-a576-52d4-95c1-20adfe4de566" -version = "2.8.1+1" - -[[deps.HypergeometricFunctions]] -deps = ["DualNumbers", "LinearAlgebra", "SpecialFunctions", "Test"] -git-tree-sha1 = "65e4589030ef3c44d3b90bdc5aac462b4bb05567" -uuid = "34004b35-14d8-5ef3-9330-4cdb6864b03a" -version = "0.3.8" - -[[deps.IniFile]] -git-tree-sha1 = "f550e6e32074c939295eb5ea6de31849ac2c9625" -uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" -version = "0.5.1" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "91b5dcf362c5add98049e6c29ee756910b03051d" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.3" - -[[deps.InvertedIndices]] -git-tree-sha1 = "bee5f1ef5bf65df56bdd2e40447590b272a5471f" -uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" -version = "1.1.0" - -[[deps.IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" -uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" - -[[deps.IterTools]] -git-tree-sha1 = "fa6287a4469f5e048d763df38279ee729fbd44e5" -uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" -version = "1.4.0" - -[[deps.IterationControl]] -deps = ["EarlyStopping", "InteractiveUtils"] -git-tree-sha1 = "83c84b7b87d3063e48a909a86c3c5bf4c3521962" -uuid = "b3c1a2ee-3fec-4384-bf48-272ea71de57c" -version = "0.5.2" - -[[deps.IteratorInterfaceExtensions]] -git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" -uuid = "82899510-4779-5014-852e-03e436cf321d" -version = "1.0.0" - -[[deps.JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.4.1" - -[[deps.JLSO]] -deps = ["BSON", "CodecZlib", "FilePathsBase", "Memento", "Pkg", "Serialization"] -git-tree-sha1 = "e00feb9d56e9e8518e0d60eef4d1040b282771e2" -uuid = "9da8a3cd-07a3-59c0-a743-3fdc52c30d11" -version = "2.6.0" - -[[deps.JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.3" - -[[deps.JpegTurbo_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "b53380851c6e6664204efb2e62cd24fa5c47e4ba" -uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" -version = "2.1.2+0" - -[[deps.LAME_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "f6250b16881adf048549549fba48b1161acdac8c" -uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d" 
-version = "3.100.1+0" - -[[deps.LERC_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "bf36f528eec6634efc60d7ec062008f171071434" -uuid = "88015f11-f218-50d7-93a8-a6af411a945d" -version = "3.0.0+1" - -[[deps.LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "c9b86064be5ae0f63e50816a5a90b08c474507ae" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.9.1" - -[[deps.LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] -git-tree-sha1 = "5558ad3c8972d602451efe9d81c78ec14ef4f5ef" -uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.14+2" - -[[deps.LZO_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "e5b909bcf985c5e2605737d2ce278ed791b89be6" -uuid = "dd4b983a-f0e5-5f8d-a1b7-129d4a5fb1ac" -version = "2.10.1+0" - -[[deps.LaTeXStrings]] -git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" -uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" -version = "1.3.0" - -[[deps.Latexify]] -deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"] -git-tree-sha1 = "4f00cc36fede3c04b8acf9b2e2763decfdcecfa6" -uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316" -version = "0.15.13" - -[[deps.LatinHypercubeSampling]] -deps = ["Random", "StableRNGs", "StatsBase", "Test"] -git-tree-sha1 = "42938ab65e9ed3c3029a8d2c58382ca75bdab243" -uuid = "a5e1c1ea-c99a-51d3-a14d-a9a37257b02d" -version = "1.8.0" - -[[deps.LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - -[[deps.LearnBase]] -git-tree-sha1 = "a0d90569edd490b82fdc4dc078ea54a5a800d30a" -uuid = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6" -version = "0.4.1" - -[[deps.LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" - -[[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" - -[[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.Libffi_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0b4a5d71f3e5200a7dff793393e09dfc2d874290" -uuid = "e9f186c6-92d2-5b65-8a66-fee21dc1b490" -version = "3.2.2+1" - -[[deps.Libgcrypt_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] -git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" -uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" -version = "1.8.7+0" - -[[deps.Libglvnd_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" -uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.3.0+3" - -[[deps.Libgpg_error_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" -uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" -version = "1.42.0+0" - -[[deps.Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" -uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.16.1+1" - -[[deps.Libmount_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "9c30530bf0effd46e15e0fdcf2b8636e78cbbd73" -uuid = 
"4b2f31a3-9ecc-558c-b454-b3730dcb73e9" -version = "2.35.0+0" - -[[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "c9551dd26e31ab17b86cbd00c2ede019c08758eb" -uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.3.0+1" - -[[deps.Libuuid_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "7f3efec06033682db852f8b3bc3c1d2b0a0ab066" -uuid = "38a345b3-de98-5d2b-a5d3-14cd9215e700" -version = "2.36.0+0" - -[[deps.LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[deps.LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "3f7cb7157ef860c637f3f4929c8ed5d9716933c6" -uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.7" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.LossFunctions]] -deps = ["InteractiveUtils", "LearnBase", "Markdown", "RecipesBase", "StatsBase"] -git-tree-sha1 = "0f057f6ea90a84e73a8ef6eebb4dc7b5c330020f" -uuid = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" -version = "0.7.2" - -[[deps.MLJ]] -deps = ["CategoricalArrays", "ComputationalResources", "Distributed", "Distributions", "LinearAlgebra", "MLJBase", "MLJEnsembles", "MLJIteration", "MLJModels", "MLJSerialization", "MLJTuning", "OpenML", "Pkg", "ProgressMeter", "Random", "ScientificTypes", "Statistics", "StatsBase", "Tables"] -git-tree-sha1 = "ecd156a5494894ea125548ee58226541ee368329" -uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" -version = "0.17.3" - -[[deps.MLJBase]] -deps = ["CategoricalArrays", "CategoricalDistributions", "ComputationalResources", "Dates", "DelimitedFiles", "Distributed", "Distributions", "InteractiveUtils", "InvertedIndices", "LinearAlgebra", "LossFunctions", "MLJModelInterface", "Missings", "OrderedCollections", "Parameters", "PrettyTables", "ProgressMeter", "Random", "ScientificTypes", "StatisticalTraits", "Statistics", "StatsBase", "Tables"] -git-tree-sha1 = "193521a6cdb0334ede6654508aa9d3acc05b633b" -uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" -version = "0.19.7" - -[[deps.MLJDecisionTreeInterface]] -deps = ["DecisionTree", "MLJModelInterface", "Random", "Tables"] -git-tree-sha1 = "8cf2bb326b2d7d8a81c2b14e709dc2aeeb25bbc6" -uuid = "c6f25543-311c-4c74-83dc-3ea6d1015661" -version = "0.2.1" - -[[deps.MLJEnsembles]] -deps = ["CategoricalArrays", "CategoricalDistributions", "ComputationalResources", "Distributed", "Distributions", "MLJBase", "MLJModelInterface", "ProgressMeter", "Random", "ScientificTypesBase", "StatsBase"] -git-tree-sha1 = "4279437ccc8ece8f478ded5139334b888dcce631" -uuid = "50ed68f4-41fd-4504-931a-ed422449fee0" -version = "0.2.0" - -[[deps.MLJIteration]] -deps = ["IterationControl", "MLJBase", "Random"] -git-tree-sha1 = "9ea78184700a54ce45abea4c99478aa5261ed74f" -uuid = "614be32b-d00c-4edb-bd02-1eb411ab5e55" -version = "0.4.5" - -[[deps.MLJModelInterface]] -deps = ["Random", "ScientificTypesBase", "StatisticalTraits"] -git-tree-sha1 = "74d7fb54c306af241c5f9d4816b735cb4051e125" -uuid = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" -version = "1.4.2" - -[[deps.MLJModels]] -deps = ["CategoricalArrays", "CategoricalDistributions", "Dates", "Distances", "Distributions", "InteractiveUtils", "LinearAlgebra", "MLJModelInterface", "Markdown", "OrderedCollections", "Parameters", "Pkg", "PrettyPrinting", "REPL", "Random", "ScientificTypes", "StatisticalTraits", "Statistics", "StatsBase", "Tables"] 
-git-tree-sha1 = "5ccd6e467431b6043fdc1b3f79020ef4ada24fe8" -uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7" -version = "0.15.5" - -[[deps.MLJMultivariateStatsInterface]] -deps = ["Distances", "LinearAlgebra", "MLJModelInterface", "MultivariateStats", "StatsBase"] -git-tree-sha1 = "0cfc81ff677ea13ed72894992ee9e5f8ae4dbb9d" -uuid = "1b6a4a23-ba22-4f51-9698-8599985d3728" -version = "0.2.2" - -[[deps.MLJSerialization]] -deps = ["IterationControl", "JLSO", "MLJBase", "MLJModelInterface"] -git-tree-sha1 = "cc5877ad02ef02e273d2622f0d259d628fa61cd0" -uuid = "17bed46d-0ab5-4cd4-b792-a5c4b8547c6d" -version = "1.1.3" - -[[deps.MLJTuning]] -deps = ["ComputationalResources", "Distributed", "Distributions", "LatinHypercubeSampling", "MLJBase", "ProgressMeter", "Random", "RecipesBase"] -git-tree-sha1 = "a443cc088158b949876d7038a1aa37cfc8c5509b" -uuid = "03970b2e-30c4-11ea-3135-d1576263f10f" -version = "0.6.16" - -[[deps.MacroTools]] -deps = ["Markdown", "Random"] -git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" -uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.9" - -[[deps.MarchingCubes]] -deps = ["StaticArrays"] -git-tree-sha1 = "5f768e0a0c3875df386be4c036f78c8bd4b1a9b6" -uuid = "299715c1-40a9-479a-aaf9-4a633d36f717" -version = "0.1.2" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.MbedTLS]] -deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] -git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe" -uuid = "739be429-bea8-5141-9913-cc70e7f3736d" -version = "1.0.3" - -[[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - -[[deps.Measurements]] -deps = ["Calculus", "LinearAlgebra", "Printf", "RecipesBase", "Requires"] -git-tree-sha1 = "88cd033eb781c698e75ae0b680e5cef1553f0856" -uuid = "eff96d63-e80a-5855-80a2-b1b0885c5ab7" -version = "2.7.1" - -[[deps.Measures]] -git-tree-sha1 = "e498ddeee6f9fdb4551ce855a46f54dbd900245f" -uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" -version = "0.3.1" - -[[deps.Memento]] -deps = ["Dates", "Distributed", "Requires", "Serialization", "Sockets", "Test", "UUIDs"] -git-tree-sha1 = "9b0b0dbf419fbda7b383dc12d108621d26eeb89f" -uuid = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9" -version = "1.3.0" - -[[deps.Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "bf210ce90b6c9eed32d25dbcae1ebc565df2687f" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.0.2" - -[[deps.Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[deps.MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" - -[[deps.MultivariateStats]] -deps = ["Arpack", "LinearAlgebra", "SparseArrays", "Statistics", "StatsBase"] -git-tree-sha1 = "8d958ff1854b166003238fe191ec34b9d592860a" -uuid = "6f286f6a-111f-5878-ab1e-185364afe411" -version = "0.8.0" - -[[deps.NaNMath]] -git-tree-sha1 = "737a5957f387b17e74d4ad2f440eb330b39a62c5" -uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "1.0.0" - -[[deps.NetworkLayout]] -deps = ["GeometryBasics", "LinearAlgebra", "Random", "Requires", "SparseArrays"] -git-tree-sha1 = "cac8fc7ba64b699c678094fa630f49b80618f625" -uuid = "46757867-2c16-5918-afeb-47bfcb05e46a" -version = "0.4.4" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" - -[[deps.Ogg_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "887579a3eb005446d514ab7aeac5d1d027658b8f" -uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" -version = "1.3.5+1" - -[[deps.OpenBLAS_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] 
-uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" - -[[deps.OpenLibm_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "05823500-19ac-5b8b-9628-191a04bc5112" - -[[deps.OpenML]] -deps = ["ARFFFiles", "HTTP", "JSON", "Markdown", "Pkg"] -git-tree-sha1 = "06080992e86a93957bfe2e12d3181443cedf2400" -uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66" -version = "0.2.0" - -[[deps.OpenSSL_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "648107615c15d4e09f7eca16307bc821c1f718d8" -uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" -version = "1.1.13+0" - -[[deps.OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.5+0" - -[[deps.Opus_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "51a08fb14ec28da2ec7a927c4337e4332c2a4720" -uuid = "91d4177d-7536-5919-b921-800302f37372" -version = "1.3.2+0" - -[[deps.OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" - -[[deps.PCRE_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "b2a7af664e098055a7529ad1a900ded962bca488" -uuid = "2f80f16e-611a-54ab-bc61-aa92de5b98fc" -version = "8.44.0+0" - -[[deps.PDMats]] -deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "7e2166042d1698b6072352c74cfd1fca2a968253" -uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.11.6" - -[[deps.Parameters]] -deps = ["OrderedCollections", "UnPack"] -git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" -uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" -version = "0.12.3" - -[[deps.Parsers]] -deps = ["Dates"] -git-tree-sha1 = "85b5da0fa43588c75bb1ff986493443f821c70b7" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.2.3" - -[[deps.Pixman_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "b4f5d02549a10e20780a24fce72bea96b6329e29" -uuid = "30392449-352a-5448-841d-b1acce4e97dc" -version = "0.40.1+0" - -[[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[deps.PlotThemes]] -deps = ["PlotUtils", "Requires", "Statistics"] -git-tree-sha1 = "a3a964ce9dc7898193536002a6dd892b1b5a6f1d" -uuid = "ccf2f8ad-2431-5c83-bf29-c5338b663b6a" -version = "2.0.1" - -[[deps.PlotUtils]] -deps = ["ColorSchemes", "Colors", "Dates", "Printf", "Random", "Reexport", "Statistics"] -git-tree-sha1 = "6f1b25e8ea06279b5689263cc538f51331d7ca17" -uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.1.3" - -[[deps.Plots]] -deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "GeometryBasics", "JSON", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs", "UnicodeFun", "Unzip"] -git-tree-sha1 = "23d109aad5d225e945c813c6ebef79104beda955" -uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -version = "1.26.0" - -[[deps.PooledArrays]] -deps = ["DataAPI", "Future"] -git-tree-sha1 = "db3a23166af8aebf4db5ef87ac5b00d36eb771e2" -uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "1.4.0" - -[[deps.Preferences]] -deps = ["TOML"] 
-git-tree-sha1 = "de893592a221142f3db370f48290e3a2ef39998f" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.4" - -[[deps.PrettyPrinting]] -git-tree-sha1 = "4be53d093e9e37772cc89e1009e8f6ad10c4681b" -uuid = "54e16d92-306c-5ea0-a30b-337be88ac337" -version = "0.4.0" - -[[deps.PrettyTables]] -deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"] -git-tree-sha1 = "dfb54c4e414caa595a1f2ed759b160f5a3ddcba5" -uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" -version = "1.3.1" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.ProgressMeter]] -deps = ["Distributed", "Printf"] -git-tree-sha1 = "afadeba63d90ff223a6a48d2009434ecee2ec9e8" -uuid = "92933f4c-e287-5a05-a399-4b506db050ca" -version = "1.7.1" - -[[deps.Qt5Base_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Fontconfig_jll", "Glib_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "OpenSSL_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libxcb_jll", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_keysyms_jll", "Xorg_xcb_util_renderutil_jll", "Xorg_xcb_util_wm_jll", "Zlib_jll", "xkbcommon_jll"] -git-tree-sha1 = "ad368663a5e20dbb8d6dc2fddeefe4dae0781ae8" -uuid = "ea2cea3b-5b76-57ae-a6ef-0a8af62496e1" -version = "5.15.3+0" - -[[deps.QuadGK]] -deps = ["DataStructures", "LinearAlgebra"] -git-tree-sha1 = "78aadffb3efd2155af139781b8a8df1ef279ea39" -uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -version = "2.4.2" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.Random123]] -deps = ["Random", "RandomNumbers"] -git-tree-sha1 = "afeacaecf4ed1649555a19cb2cad3c141bbc9474" -uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.5.0" - -[[deps.RandomNumbers]] -deps = ["Random", "Requires"] -git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" -uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" -version = "1.5.3" - -[[deps.RecipesBase]] -git-tree-sha1 = "6bf3f380ff52ce0832ddd3a2a7b9538ed1bcca7d" -uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" -version = "1.2.1" - -[[deps.RecipesPipeline]] -deps = ["Dates", "NaNMath", "PlotUtils", "RecipesBase"] -git-tree-sha1 = "995a812c6f7edea7527bb570f0ac39d0fb15663c" -uuid = "01d81517-befc-4cb6-b9ec-a95719d0359c" -version = "0.5.1" - -[[deps.Reexport]] -git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.2.2" - -[[deps.RelocatableFolders]] -deps = ["SHA", "Scratch"] -git-tree-sha1 = "cdbd3b1338c72ce29d9584fdbe9e9b70eeb5adca" -uuid = "05181044-ff0b-4ac5-8273-598c1e38db00" -version = "0.1.3" - -[[deps.Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" -uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.3.0" - -[[deps.Rmath]] -deps = ["Random", "Rmath_jll"] -git-tree-sha1 = "bf3188feca147ce108c76ad82c2792c57abe7b1f" -uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.7.0" - -[[deps.Rmath_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "68db32dff12bb6127bac73c209881191bf0efbb7" -uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" -version = "0.3.0+0" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[deps.ScientificTypes]] -deps = ["CategoricalArrays", "ColorTypes", "Dates", "Distributions", "PrettyTables", "Reexport", "ScientificTypesBase", "StatisticalTraits", "Tables"] -git-tree-sha1 = 
"ba70c9a6e4c81cc3634e3e80bb8163ab5ef57eb8" -uuid = "321657f4-b219-11e9-178b-2701a2544e81" -version = "3.0.0" - -[[deps.ScientificTypesBase]] -git-tree-sha1 = "a8e18eb383b5ecf1b5e6fc237eb39255044fd92b" -uuid = "30f210dd-8aff-4c5f-94ba-8e64358c1161" -version = "3.0.0" - -[[deps.ScikitLearnBase]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "7877e55c1523a4b336b433da39c8e8c08d2f221f" -uuid = "6e75b9c4-186b-50bd-896f-2d2496a4843e" -version = "0.5.0" - -[[deps.Scratch]] -deps = ["Dates"] -git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.1.0" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[deps.Showoff]] -deps = ["Dates", "Grisu"] -git-tree-sha1 = "91eddf657aca81df9ae6ceb20b959ae5653ad1de" -uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" -version = "1.0.3" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.SortingAlgorithms]] -deps = ["DataStructures"] -git-tree-sha1 = "b3363d7460f7d098ca0912c69b082f75625d7508" -uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.0.1" - -[[deps.SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[deps.SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "5ba658aeecaaf96923dce0da9e703bd1fe7666f9" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.1.4" - -[[deps.StableRNGs]] -deps = ["Random", "Test"] -git-tree-sha1 = "3be7d49667040add7ee151fefaf1f8c04c8c8276" -uuid = "860ef19b-820b-49d6-a774-d7a799459cd3" -version = "1.0.0" - -[[deps.StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "74fb527333e72ada2dd9ef77d98e4991fb185f04" -uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.4.1" - -[[deps.StatisticalTraits]] -deps = ["ScientificTypesBase"] -git-tree-sha1 = "271a7fea12d319f23d55b785c51f6876aadb9ac0" -uuid = "64bff920-2084-43da-a3e6-9bb72801c0c9" -version = "3.0.0" - -[[deps.Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[[deps.StatsAPI]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "c3d8ba7f3fa0625b062b82853a7d5229cb728b6b" -uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" -version = "1.2.1" - -[[deps.StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] -git-tree-sha1 = "8977b17906b0a1cc74ab2e3a05faa16cf08a8291" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.16" - -[[deps.StatsFuns]] -deps = ["ChainRulesCore", "HypergeometricFunctions", "InverseFunctions", "IrrationalConstants", "LogExpFunctions", "Reexport", "Rmath", "SpecialFunctions"] -git-tree-sha1 = "25405d7016a47cf2bd6cd91e66f4de437fd54a07" -uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "0.9.16" - -[[deps.StructArrays]] -deps = ["Adapt", "DataAPI", "StaticArrays", "Tables"] -git-tree-sha1 = "57617b34fa34f91d536eb265df67c2d4519b8b98" -uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" -version = "0.6.5" - -[[deps.SuiteSparse]] -deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] -uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" - -[[deps.TableTraits]] 
-deps = ["IteratorInterfaceExtensions"] -git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" -uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" -version = "1.0.1" - -[[deps.Tables]] -deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] -git-tree-sha1 = "5ce79ce186cc678bbb5c5681ca3379d1ddae11a1" -uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.7.0" - -[[deps.Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" - -[[deps.Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.TimerOutputs]] -deps = ["ExprTools", "Printf"] -git-tree-sha1 = "d60b0c96a16aaa42138d5d38ad386df672cb8bd8" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.16" - -[[deps.TranscodingStreams]] -deps = ["Random", "Test"] -git-tree-sha1 = "216b95ea110b5972db65aa90f88d8d89dcb8851c" -uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.9.6" - -[[deps.URIs]] -git-tree-sha1 = "97bbe755a53fe859669cd907f2d96aee8d2c1355" -uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" -version = "1.3.0" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.UnPack]] -git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" -uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" -version = "1.0.2" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.UnicodeFun]] -deps = ["REPL"] -git-tree-sha1 = "53915e50200959667e78a92a418594b428dffddf" -uuid = "1cfade01-22cf-5700-b092-accc4b62d6e1" -version = "0.4.1" - -[[deps.UnicodePlots]] -deps = ["Contour", "Crayons", "Dates", "LinearAlgebra", "MarchingCubes", "NaNMath", "SparseArrays", "StaticArrays", "StatsBase", "Unitful"] -git-tree-sha1 = "1785494cb9484f9ab05bbc9d81a2d4de4341eb39" -uuid = "b8865327-cd53-5732-bb35-84acbb429228" -version = "2.9.0" - -[[deps.Unitful]] -deps = ["ConstructionBase", "Dates", "LinearAlgebra", "Random"] -git-tree-sha1 = "b649200e887a487468b71821e2644382699f1b0f" -uuid = "1986cc42-f94f-5a68-af5c-568840ba703d" -version = "1.11.0" - -[[deps.Unzip]] -git-tree-sha1 = "34db80951901073501137bdbc3d5a8e7bbd06670" -uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d" -version = "0.1.2" - -[[deps.Wayland_jll]] -deps = ["Artifacts", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg", "XML2_jll"] -git-tree-sha1 = "3e61f0b86f90dacb0bc0e73a0c5a83f6a8636e23" -uuid = "a2964d1f-97da-50d4-b82a-358c7fce9d89" -version = "1.19.0+0" - -[[deps.Wayland_protocols_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4528479aa01ee1b3b4cd0e6faef0e04cf16466da" -uuid = "2381bf8a-dfd0-557d-9999-79630e7b1b91" -version = "1.25.0+0" - -[[deps.XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" -uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.9.12+0" - -[[deps.XSLT_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] -git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" -uuid = "aed1982a-8fda-507f-9586-7b0439959a61" -version = "1.1.34+0" - -[[deps.Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" -uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.6.9+4" - -[[deps.Xorg_libXau_jll]] -deps = 
["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" -uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.9+4" - -[[deps.Xorg_libXcursor_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXfixes_jll", "Xorg_libXrender_jll"] -git-tree-sha1 = "12e0eb3bc634fa2080c1c37fccf56f7c22989afd" -uuid = "935fb764-8cf2-53bf-bb30-45bb1f8bf724" -version = "1.2.0+4" - -[[deps.Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" -uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.3+4" - -[[deps.Xorg_libXext_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" -uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" -version = "1.3.4+4" - -[[deps.Xorg_libXfixes_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "0e0dc7431e7a0587559f9294aeec269471c991a4" -uuid = "d091e8ba-531a-589c-9de9-94069b037ed8" -version = "5.0.3+4" - -[[deps.Xorg_libXi_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXfixes_jll"] -git-tree-sha1 = "89b52bc2160aadc84d707093930ef0bffa641246" -uuid = "a51aa0fd-4e3c-5386-b890-e753decda492" -version = "1.7.10+4" - -[[deps.Xorg_libXinerama_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll"] -git-tree-sha1 = "26be8b1c342929259317d8b9f7b53bf2bb73b123" -uuid = "d1454406-59df-5ea1-beac-c340f2130bc3" -version = "1.1.4+4" - -[[deps.Xorg_libXrandr_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll"] -git-tree-sha1 = "34cea83cb726fb58f325887bf0612c6b3fb17631" -uuid = "ec84b674-ba8e-5d96-8ba1-2a689ba10484" -version = "1.5.2+4" - -[[deps.Xorg_libXrender_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "19560f30fd49f4d4efbe7002a1037f8c43d43b96" -uuid = "ea2f1a96-1ddc-540d-b46f-429655e07cfa" -version = "0.9.10+4" - -[[deps.Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" -uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.0+3" - -[[deps.Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" -uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.13.0+3" - -[[deps.Xorg_libxkbfile_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "926af861744212db0eb001d9e40b5d16292080b2" -uuid = "cc61e674-0454-545c-8b26-ed2c68acab7a" -version = "1.1.0+4" - -[[deps.Xorg_xcb_util_image_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "0fab0a40349ba1cba2c1da699243396ff8e94b97" -uuid = "12413925-8142-5f55-bb0e-6d7ca50bb09b" -version = "0.4.0+1" - -[[deps.Xorg_xcb_util_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll"] -git-tree-sha1 = "e7fd7b2881fa2eaa72717420894d3938177862d1" -uuid = "2def613f-5ad1-5310-b15b-b15d46f528f5" -version = "0.4.0+1" - -[[deps.Xorg_xcb_util_keysyms_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "d1151e2c45a544f32441a567d1690e701ec89b00" -uuid = "975044d2-76e6-5fbe-bf08-97ce7c6574c7" -version = "0.4.0+1" - 
-[[deps.Xorg_xcb_util_renderutil_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "dfd7a8f38d4613b6a575253b3174dd991ca6183e" -uuid = "0d47668e-0667-5a69-a72c-f761630bfb7e" -version = "0.3.9+1" - -[[deps.Xorg_xcb_util_wm_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "e78d10aab01a4a154142c5006ed44fd9e8e31b67" -uuid = "c22f9ab0-d5fe-5066-847c-f4bb1cd4e361" -version = "0.4.1+1" - -[[deps.Xorg_xkbcomp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxkbfile_jll"] -git-tree-sha1 = "4bcbf660f6c2e714f87e960a171b119d06ee163b" -uuid = "35661453-b289-5fab-8a00-3d9160c6a3a4" -version = "1.4.2+4" - -[[deps.Xorg_xkeyboard_config_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xkbcomp_jll"] -git-tree-sha1 = "5c8424f8a67c3f2209646d4425f3d415fee5931d" -uuid = "33bec58e-1273-512f-9401-5d533626f822" -version = "2.27.0+4" - -[[deps.Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" -uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.4.0+3" - -[[deps.Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - -[[deps.Zstd_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "e45044cd873ded54b6a5bac0eb5c971392cf1927" -uuid = "3161d3a3-bdf6-5164-811a-617609db77b4" -version = "1.5.2+0" - -[[deps.libass_jll]] -deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "5982a94fcba20f02f42ace44b9894ee2b140fe47" -uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" -version = "0.15.1+0" - -[[deps.libfdk_aac_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "daacc84a041563f965be61859a36e17c4e4fcd55" -uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" -version = "2.0.2+0" - -[[deps.libpng_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "94d180a6d2b5e55e447e2d27a29ed04fe79eb30c" -uuid = "b53b4c65-9356-5827-b1ea-8c7a1a84506f" -version = "1.6.38+0" - -[[deps.libvorbis_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll", "Pkg"] -git-tree-sha1 = "b910cb81ef3fe6e78bf6acee440bda86fd6ae00c" -uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" -version = "1.3.7+1" - -[[deps.nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[deps.p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" - -[[deps.x264_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fea590b89e6ec504593146bf8b988b2c00922b2" -uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" -version = "2021.5.5+0" - -[[deps.x265_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "ee567a171cce03570d77ad3a43e90218e38937a9" -uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" -version = "3.5.0+0" - -[[deps.xkbcommon_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Wayland_jll", "Wayland_protocols_jll", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"] -git-tree-sha1 = "ece2350174195bb31de1a63bea3a41ae1aa593b6" -uuid = "d8fb68d0-12a3-5cfd-a85a-d49703b185fd" -version = "0.9.1+5" diff --git a/examples/telco/Project.toml b/examples/telco/Project.toml deleted file mode 100644 index 86ba12f37..000000000 --- a/examples/telco/Project.toml +++ /dev/null @@ -1,10 +0,0 @@ -[deps] -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -DecisionTree = 
"7806a523-6efd-50cb-b5f6-3fa6f1930dbb" -Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" -EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" -MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" -MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" -MLJMultivariateStatsInterface = "1b6a4a23-ba22-4f51-9698-8599985d3728" -Measurements = "eff96d63-e80a-5855-80a2-b1b0885c5ab7" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/examples/telco/assets/scitypes.png b/examples/telco/assets/scitypes.png deleted file mode 100644 index 8e04fadf1..000000000 Binary files a/examples/telco/assets/scitypes.png and /dev/null differ diff --git a/examples/telco/generate.jl b/examples/telco/generate.jl deleted file mode 100644 index 5c56cdf3c..000000000 --- a/examples/telco/generate.jl +++ /dev/null @@ -1,11 +0,0 @@ -# Execute this julia file to generate the notebooks from ../notebook.jl - -joinpath(@__DIR__, "..", "generate.jl") |> include -generate(@__DIR__, pluto=true, execute=false) - -# Execution has been failing with a an issue with deserializing the -# final model. -# Executing the notebook in Juptyer is fine however. - -#https://discourse.julialang.org/t/execution-of-notebook-in-literate-jl-not-working-but-notebook-executes-fine-in-jupyter-serialization-issue/76387/4 - diff --git a/examples/telco/notebook.ipynb b/examples/telco/notebook.ipynb deleted file mode 100644 index 77fc58651..000000000 --- a/examples/telco/notebook.ipynb +++ /dev/null @@ -1,3817 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# MLJ for Data Scientists in Two Hours" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An application of the [MLJ\n", - "toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the\n", - "Telco Customer Churn dataset, aimed at practicing data scientists\n", - "new to MLJ (Machine Learning in Julia). This tutorial does not\n", - "cover exploratory data analysis." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "MLJ is a *multi-paradigm* machine learning toolbox (i.e., not just\n", - "deep-learning)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For other MLJ learning resources see the [Learning\n", - "MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/)\n", - "section of the\n", - "[manual](https://juliaai.github.io/MLJ.jl/dev/)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Topics covered**: Grabbing and preparing a dataset, basic\n", - "fit/predict workflow, constructing a pipeline to include data\n", - "pre-processing, estimating performance metrics, ROC curves, confusion\n", - "matrices, feature importance, basic feature selection, controlling iterative\n", - "models, hyper-parameter optimization (tuning)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Prerequisites for this tutorial.** Previous experience building,\n", - "evaluating, and optimizing machine learning models using\n", - "scikit-learn, caret, MLR, weka, or similar tool. No previous\n", - "experience with MLJ. Only fairly basic familiarity with Julia is\n", - "required. Uses\n", - "[DataFrames.jl](https://dataframes.juliadata.org/stable/) but in a\n", - "minimal way ([this\n", - "cheatsheet](https://ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/index.html)\n", - "may help)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Time.** Between two and three hours, first time through." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary of methods and types introduced" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "|code | purpose|\n", - "|:-------|:-------------------------------------------------------|\n", - "| `OpenML.load(id)` | grab a dataset from [OpenML.org](https://www.openml.org)|\n", - "| `scitype(X)` | inspect the scientific type (scitype) of object `X`|\n", - "| `schema(X)` | inspect the column scitypes (scientific types) of a table `X`|\n", - "| `coerce(X, ...)` | fix column encodings to get appropriate scitypes|\n", - "| `partition(data, frac1, frac2, ...; rng=...)` | vertically split `data`, which can be a table, vector or matrix|\n", - "| `unpack(table, f1, f2, ...)` | horizontally split `table` based on conditions `f1`, `f2`, ..., applied to column names|\n", - "| `@load ModelType pkg=...` | load code defining a model type|\n", - "| `input_scitype(model)` | inspect the scitype that a model requires for features (inputs)|\n", - "| `target_scitype(model)`| inspect the scitype that a model requires for the target (labels)|\n", - "| `ContinuousEncoder` | built-in model type for re-encoding all features as `Continuous`|\n", - "| `model1 ∣> model2 ∣> ...` | combine multiple models into a pipeline|\n", - "| `measures(\"under curve\")` | list all measures (metrics) with string \"under curve\" in documentation|\n", - "| `accuracy(yhat, y)` | compute accuracy of predictions `yhat` against ground truth observations `y`|\n", - "| `auc(yhat, y)`, `brier_loss(yhat, y)` | evaluate two probabilistic measures (`yhat` a vector of probability distributions)|\n", - "| `machine(model, X, y)` | bind `model` to training data `X` (features) and `y` (target)|\n", - "| `fit!(mach, rows=...)` | train machine using specified rows (observation indices)|\n", - "| `predict(mach, rows=...)`, | make in-sample model predictions given specified rows|\n", - "| `predict(mach, Xnew)` | make predictions given new features `Xnew`|\n", - "| `fitted_params(mach)` | inspect learned parameters|\n", - "| `report(mach)` | inspect other outcomes of training|\n", - "| `confmat(yhat, y)` | confusion matrix for predictions `yhat` and ground truth `y`|\n", - "| `roc(yhat, y)` | compute points on the receiver-operator Characteristic|\n", - "| `StratifiedCV(nfolds=6)` | 6-fold stratified cross-validation resampling strategy|\n", - "| `Holdout(fraction_train=0.7)` | holdout resampling strategy|\n", - "| `evaluate(model, X, y; resampling=..., options...)` | estimate performance metrics `model` using the data `X`, `y`|\n", - "| `FeatureSelector()` | transformer for selecting features|\n", - "| `Step(3)` | iteration control for stepping 3 iterations|\n", - "| `NumberSinceBest(6)`, `TimeLimit(60/5), InvalidValue()` | iteration control stopping criteria|\n", - "| `IteratedModel(model=..., controls=..., options...)` | wrap an iterative `model` in control strategies|\n", - "| `range(model, :some_hyperparam, lower=..., upper=...)` | define a numeric range|\n", - "| `RandomSearch()` | random search tuning strategy|\n", - "| `TunedModel(model=..., tuning=..., options...)` | wrap the supervised `model` in specified `tuning` strategy|" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instantiate a Julia environment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following code replicates precisely the set of Julia packages\n", - "used to develop this tutorial. 
If this is your first time running\n", - "the notebook, package instantiation and pre-compilation may take a\n", - "minute or so to complete. **This step will fail** if the [correct\n", - "Manifest.toml and Project.toml\n", - "files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco)\n", - "are not in the same directory as this notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m\u001b[1m Activating\u001b[22m\u001b[39m environment at `~/GoogleDrive/Julia/MLJ/MLJ/examples/telco/Project.toml`\n", - "\u001b[32m\u001b[1mPrecompiling\u001b[22m\u001b[39m project...\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mLogExpFunctions\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mStatsBase\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mLatinHypercubeSampling\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mMultivariateStats\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mLossFunctions\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mSpecialFunctions\u001b[39m\n", - "\u001b[32m ✓ \u001b[39mMLJMultivariateStatsInterface\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mDualNumbers\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mHypergeometricFunctions\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mStatsFuns\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mUnicodePlots\u001b[39m\n", - "\u001b[32m ✓ \u001b[39mDistributions\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mScientificTypes\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mCategoricalDistributions\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mMLJModels\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mMLJBase\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mMLJEnsembles\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mMLJTuning\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mMLJSerialization\u001b[39m\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mMLJIteration\u001b[39m\n", - "\u001b[32m ✓ \u001b[39mMLJ\n", - "\u001b[32m ✓ \u001b[39m\u001b[90mCUDA\u001b[39m\n", - "\u001b[32m ✓ \u001b[39mPlots\n", - "\u001b[32m ✓ \u001b[39mEvoTrees\n", - " 24 dependencies successfully precompiled in 110 seconds (181 already precompiled)\n" - ] - } - ], - "source": [ - "using Pkg\n", - "Pkg.activate(@__DIR__) # get env from TOML files in same directory as this notebook\n", - "Pkg.instantiate()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Warm up: Building a model for the iris dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before turning to the Telco Customer Churn dataset, we very quickly\n", - "build a predictive model for Fisher's well-known iris data set, as way of\n", - "introducing the main actors in any MLJ workflow. Details that you\n", - "don't fully grasp should become clearer in the Telco study." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This section is a condensed adaption of the [Getting Started\n", - "example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict)\n", - "in the MLJ documentation." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, using the built-in iris dataset, we load and inspect the features\n", - "`X_iris` (a table) and target variable `y_iris` (a vector):" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Precompiling MLJ [add582a8-e3ab-11e8-2d5e-e98b27df1bc7]\n", - "└ @ Base loading.jl:1342\n" - ] - } - ], - "source": [ - "using MLJ" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "┌──────────────┬────────────┬─────────┐\n", - "│\u001b[22m names \u001b[0m│\u001b[22m scitypes \u001b[0m│\u001b[22m types \u001b[0m│\n", - "├──────────────┼────────────┼─────────┤\n", - "│ sepal_length │ Continuous │ Float64 │\n", - "│ sepal_width │ Continuous │ Float64 │\n", - "│ petal_length │ Continuous │ Float64 │\n", - "│ petal_width │ Continuous │ Float64 │\n", - "└──────────────┴────────────┴─────────┘\n" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "const X_iris, y_iris = @load_iris;\n", - "schema(X_iris)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4-element CategoricalArrays.CategoricalArray{String,1,UInt32}:\n", - " \"setosa\"\n", - " \"setosa\"\n", - " \"setosa\"\n", - " \"setosa\"" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_iris[1:4]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3-element Vector{String}:\n", - " \"setosa\"\n", - " \"versicolor\"\n", - " \"virginica\"" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "levels(y_iris)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We load a decision tree model, from the package DecisionTree.jl:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: For silent loading, specify `verbosity=0`. \n", - "└ @ Main /Users/anthony/.julia/packages/MLJModels/38NmP/src/loading.jl:168\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "import MLJDecisionTreeInterface ✔\n" - ] - }, - { - "data": { - "text/plain": [ - "DecisionTreeClassifier(\n", - " max_depth = -1,\n", - " min_samples_leaf = 1,\n", - " min_samples_split = 5,\n", - " min_purity_increase = 0.0,\n", - " n_subfeatures = 0,\n", - " post_prune = false,\n", - " merge_purity_threshold = 1.0,\n", - " display_depth = 5,\n", - " rng = Random._GLOBAL_RNG())" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "DecisionTree = @load DecisionTreeClassifier pkg=DecisionTree # model type\n", - "model = DecisionTree(min_samples_split=5) # model instance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In MLJ, a *model* is just a container for hyper-parameters of\n", - "some learning algorithm. It does not store learned parameters." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we bind the model together with the available data in what's\n", - "called a *machine*:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Machine{DecisionTreeClassifier,…} trained 0 times; caches data\n", - " model: MLJDecisionTreeInterface.DecisionTreeClassifier\n", - " args: \n", - " 1:\tSource @066 ⏎ `Table{AbstractVector{Continuous}}`\n", - " 2:\tSource @594 ⏎ `AbstractVector{Multiclass{3}}`\n" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mach = machine(model, X_iris, y_iris)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A machine is essentially just a model (ie, hyper-parameters) plus data, but\n", - "it additionally stores *learned parameters* (the tree) once it is\n", - "trained on some view of the data:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Training Machine{DecisionTreeClassifier,…}.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/machines.jl:464\n" - ] - }, - { - "data": { - "text/plain": [ - "(tree = Decision Tree\n", - "Leaves: 5\n", - "Depth: 3,\n", - " encoding = Dict{CategoricalArrays.CategoricalValue{String, UInt32}, UInt32}(\"virginica\" => 0x00000003, \"setosa\" => 0x00000001, \"versicolor\" => 0x00000002),\n", - " features = [:sepal_length, :sepal_width, :petal_length, :petal_width],)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_rows = vcat(1:60, 91:150); # some row indices (observations are rows not columns)\n", - "fit!(mach, rows=train_rows)\n", - "fitted_params(mach)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A machine stores some other information enabling [warm\n", - "restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts)\n", - "for some models, but we won't go into that here. 
You are allowed to\n", - "access and mutate the `model` parameter:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Updating Machine{DecisionTreeClassifier,…}.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/machines.jl:465\n" - ] - }, - { - "data": { - "text/plain": [ - "Machine{DecisionTreeClassifier,…} trained 2 times; caches data\n", - " model: MLJDecisionTreeInterface.DecisionTreeClassifier\n", - " args: \n", - " 1:\tSource @066 ⏎ `Table{AbstractVector{Continuous}}`\n", - " 2:\tSource @594 ⏎ `AbstractVector{Multiclass{3}}`\n" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mach.model.min_samples_split = 10\n", - "fit!(mach, rows=train_rows) # re-train with new hyper-parameter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can make predictions on some other view of the data, as in" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, String, UInt32, Float64}:\n", - " UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>0.0, virginica=>1.0)\n", - " UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>1.0, virginica=>0.0)\n", - " UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>0.25, virginica=>0.75)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predict(mach, rows=71:73)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "or on completely new data, as in" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, String, UInt32, Float64}:\n", - " UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)\n", - " UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>0.25, virginica=>0.75)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Xnew = (sepal_length = [5.1, 6.3],\n", - " sepal_width = [3.0, 2.5],\n", - " petal_length = [1.4, 4.9],\n", - " petal_width = [0.3, 1.5])\n", - "yhat = predict(mach, Xnew)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are probabilistic predictions which can be manipulated using a\n", - "widely adopted interface defined in the Distributions.jl\n", - "package. For example, we can get raw probabilities like this:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2-element Vector{Float64}:\n", - " 0.0\n", - " 0.75" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pdf.(yhat, \"virginica\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now turn to the Telco dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Getting the Telco data" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "import DataFrames" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|ccccccccc}\n", - "\t& customerID & gender & SeniorCitizen & Partner & Dependents & tenure & PhoneService & MultipleLines & \\\\\n", - "\t\\hline\n", - "\t& String & String & Float64 & String & String & Float64 & String & String & \\\\\n", - "\t\\hline\n", - "\t1 & 7590-VHVEG & Female & 0.0 & Yes & No & 1.0 & No & No phone service & $\\dots$ \\\\\n", - "\t2 & 5575-GNVDE & Male & 0.0 & No & No & 34.0 & Yes & No & $\\dots$ \\\\\n", - "\t3 & 3668-QPYBK & Male & 0.0 & No & No & 2.0 & Yes & No & $\\dots$ \\\\\n", - "\t4 & 7795-CFOCW & Male & 0.0 & No & No & 45.0 & No & No phone service & $\\dots$ \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m4×21 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m customerID \u001b[0m\u001b[1m gender \u001b[0m\u001b[1m SeniorCitizen \u001b[0m\u001b[1m Partner \u001b[0m\u001b[1m Dependents \u001b[0m\u001b[1m tenure \u001b[0m\u001b[1m PhoneS\u001b[0m ⋯\n", - "\u001b[1m \u001b[0m│\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m String\u001b[0m ⋯\n", - "─────┼──────────────────────────────────────────────────────────────────────────\n", - " 1 │ 7590-VHVEG Female 0.0 Yes No 1.0 No ⋯\n", - " 2 │ 5575-GNVDE Male 0.0 No No 34.0 Yes\n", - " 3 │ 3668-QPYBK Male 0.0 No No 2.0 Yes\n", - " 4 │ 7795-CFOCW Male 0.0 No No 45.0 No\n", - "\u001b[36m 15 columns omitted\u001b[0m" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = OpenML.load(42178) # data set from OpenML.org\n", - "df0 = DataFrames.DataFrame(data)\n", - "first(df0, 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The object of this tutorial is to build and evaluate supervised\n", - "learning models to predict the `:Churn` variable, a binary variable\n", - "measuring customer retention, based on other variables that are\n", - "relevant." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the table, observations correspond to rows, and features to\n", - "columns, which is the convention for representing all\n", - "two-dimensional data in MLJ." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Type coercion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `scitype`, `schema`, `coerce`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A [\"scientific\n", - "type\"](https://juliaai.github.io/ScientificTypes.jl/dev/) or\n", - "*scitype* indicates how MLJ will *interpret* data. For example,\n", - "`typeof(3.14) == Float64`, while `scitype(3.14) == Continuous` and\n", - "also `scitype(3.14f0) == Continuous`. In MLJ, model data\n", - "requirements are articulated using scitypes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are common \"scalar\" scitypes:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](assets/scitypes.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are also container scitypes. 
For example, the scitype of any\n", - "`N`-dimensional array is `AbstractArray{S, N}`, where `S` is the scitype of the\n", - "elements:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AbstractVector{Textual} (alias for AbstractArray{Textual, 1})" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scitype([\"cat\", \"mouse\", \"dog\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `schema` operator summarizes the column scitypes of a table:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|ccc}\n", - "\t& names & scitypes & types\\\\\n", - "\t\\hline\n", - "\t& Symbol & DataType & DataType\\\\\n", - "\t\\hline\n", - "\t1 & customerID & Textual & String \\\\\n", - "\t2 & gender & Textual & String \\\\\n", - "\t3 & SeniorCitizen & Continuous & Float64 \\\\\n", - "\t4 & Partner & Textual & String \\\\\n", - "\t5 & Dependents & Textual & String \\\\\n", - "\t6 & tenure & Continuous & Float64 \\\\\n", - "\t7 & PhoneService & Textual & String \\\\\n", - "\t8 & MultipleLines & Textual & String \\\\\n", - "\t9 & InternetService & Textual & String \\\\\n", - "\t10 & OnlineSecurity & Textual & String \\\\\n", - "\t11 & OnlineBackup & Textual & String \\\\\n", - "\t12 & DeviceProtection & Textual & String \\\\\n", - "\t13 & TechSupport & Textual & String \\\\\n", - "\t14 & StreamingTV & Textual & String \\\\\n", - "\t15 & StreamingMovies & Textual & String \\\\\n", - "\t16 & Contract & Textual & String \\\\\n", - "\t17 & PaperlessBilling & Textual & String \\\\\n", - "\t18 & PaymentMethod & Textual & String \\\\\n", - "\t19 & MonthlyCharges & Continuous & Float64 \\\\\n", - "\t20 & TotalCharges & Textual & String \\\\\n", - "\t21 & Churn & Textual & String \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m21×3 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m names \u001b[0m\u001b[1m scitypes \u001b[0m\u001b[1m types \u001b[0m\n", - "\u001b[1m \u001b[0m│\u001b[90m Symbol \u001b[0m\u001b[90m DataType \u001b[0m\u001b[90m DataType \u001b[0m\n", - "─────┼────────────────────────────────────────\n", - " 1 │ customerID Textual String\n", - " 2 │ gender Textual String\n", - " 3 │ SeniorCitizen Continuous Float64\n", - " 4 │ Partner Textual String\n", - " 5 │ Dependents Textual String\n", - " 6 │ tenure Continuous Float64\n", - " 7 │ PhoneService Textual String\n", - " 8 │ MultipleLines Textual String\n", - " 9 │ InternetService Textual String\n", - " 10 │ OnlineSecurity Textual String\n", - " 11 │ OnlineBackup Textual String\n", - " 12 │ DeviceProtection Textual String\n", - " 13 │ TechSupport Textual String\n", - " 14 │ StreamingTV Textual String\n", - " 15 │ StreamingMovies Textual String\n", - " 16 │ Contract Textual String\n", - " 17 │ PaperlessBilling Textual String\n", - " 18 │ PaymentMethod Textual String\n", - " 19 │ MonthlyCharges Continuous Float64\n", - " 20 │ TotalCharges Textual String\n", - " 21 │ Churn Textual String" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "schema(df0) |> DataFrames.DataFrame # converted to DataFrame for better display" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All of the fields being interpreted as `Textual` are really\n", - "something else, either `Multiclass` or, in the case of\n", - "`:TotalCharges`, `Continuous`. In fact, `:TotalCharges` is\n", - "mostly floats wrapped as strings. However, it needs special\n", - "treatment because some elements consist of a single space, \" \",\n", - "which we'll treat as \"0.0\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "fix_blanks(v) = map(v) do x\n", - " if x == \" \"\n", - " return \"0.0\"\n", - " else\n", - " return x\n", - " end\n", - "end\n", - "\n", - "df0.TotalCharges = fix_blanks(df0.TotalCharges);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Coercing the `:TotalCharges` type to ensure a `Continuous` scitype:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "coerce!(df0, :TotalCharges => Continuous);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Coercing all remaining `Textual` data to `Multiclass`:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "coerce!(df0, Textual => Multiclass);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we'll coerce our target variable `:Churn` to be\n", - "`OrderedFactor`, rather than `Multiclass`, to enable a reliable\n", - "interpretation of metrics like \"true positive rate\". By convention,\n", - "the first class is the negative one:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2-element Vector{String}:\n", - " \"No\"\n", - " \"Yes\"" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "coerce!(df0, :Churn => OrderedFactor)\n", - "levels(df0.Churn) # to check order" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Re-inspecting the scitypes:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|ccc}\n", - "\t& names & scitypes & types\\\\\n", - "\t\\hline\n", - "\t& Symbol & DataType & DataType\\\\\n", - "\t\\hline\n", - "\t1 & customerID & Multiclass\\{7043\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t2 & gender & Multiclass\\{2\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t3 & SeniorCitizen & Continuous & Float64 \\\\\n", - "\t4 & Partner & Multiclass\\{2\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t5 & Dependents & Multiclass\\{2\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t6 & tenure & Continuous & Float64 \\\\\n", - "\t7 & PhoneService & Multiclass\\{2\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t8 & MultipleLines & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t9 & InternetService & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t10 & OnlineSecurity & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t11 & OnlineBackup & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t12 & DeviceProtection & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t13 & TechSupport & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t14 & StreamingTV & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t15 & StreamingMovies & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t16 & Contract & Multiclass\\{3\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t17 & PaperlessBilling & Multiclass\\{2\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t18 & PaymentMethod & Multiclass\\{4\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\t19 & MonthlyCharges & Continuous & Float64 \\\\\n", - "\t20 & TotalCharges & Continuous & Float64 \\\\\n", - "\t21 & Churn & OrderedFactor\\{2\\} & CategoricalValue\\{String, UInt32\\} \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m21×3 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m names \u001b[0m\u001b[1m scitypes \u001b[0m\u001b[1m types \u001b[0m\n", - "\u001b[1m \u001b[0m│\u001b[90m Symbol \u001b[0m\u001b[90m DataType \u001b[0m\u001b[90m DataType \u001b[0m\n", - "─────┼──────────────────────────────────────────────────────────────────────\n", - " 1 │ customerID Multiclass{7043} CategoricalValue{String, UInt32}\n", - " 2 │ gender Multiclass{2} CategoricalValue{String, UInt32}\n", - " 3 │ SeniorCitizen Continuous Float64\n", - " 4 │ Partner Multiclass{2} CategoricalValue{String, UInt32}\n", - " 5 │ Dependents Multiclass{2} CategoricalValue{String, UInt32}\n", - " 6 │ tenure Continuous Float64\n", - " 7 │ PhoneService Multiclass{2} CategoricalValue{String, UInt32}\n", - " 8 │ MultipleLines Multiclass{3} CategoricalValue{String, UInt32}\n", - " 9 │ InternetService Multiclass{3} CategoricalValue{String, UInt32}\n", - " 10 │ OnlineSecurity Multiclass{3} CategoricalValue{String, UInt32}\n", - " 11 │ OnlineBackup Multiclass{3} CategoricalValue{String, UInt32}\n", - " 12 │ DeviceProtection Multiclass{3} CategoricalValue{String, UInt32}\n", - " 13 │ TechSupport Multiclass{3} CategoricalValue{String, UInt32}\n", - " 14 │ StreamingTV Multiclass{3} CategoricalValue{String, UInt32}\n", - " 15 │ StreamingMovies Multiclass{3} CategoricalValue{String, UInt32}\n", - " 16 │ Contract Multiclass{3} CategoricalValue{String, UInt32}\n", - " 17 │ PaperlessBilling Multiclass{2} CategoricalValue{String, UInt32}\n", - " 18 │ PaymentMethod Multiclass{4} CategoricalValue{String, 
UInt32}\n", - " 19 │ MonthlyCharges Continuous Float64\n", - " 20 │ TotalCharges Continuous Float64\n", - " 21 │ Churn OrderedFactor{2} CategoricalValue{String, UInt32}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "schema(df0) |> DataFrames.DataFrame" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preparing a holdout set for final testing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `partition`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To reduce training times for the purposes of this tutorial, we're\n", - "going to dump 90% of observations (after shuffling) and split off\n", - "30% of the remainder for use as a lock-and-throw-away-the-key\n", - "holdout set:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "df, df_test, df_dumped = partition(df0, 0.07, 0.03, # in ratios 7:3:90\n", - " stratify=df0.Churn,\n", - " rng=123);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The reader interested in including all data can instead do\n", - "`df, df_test = partition(df0, 0.7, stratify=df0.Churn, rng=123)`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Splitting data into target and features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `unpack`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the following call, the column with name `:Churn` is copied over\n", - "to a vector `y`, and every remaining column, except `:customerID`\n", - "(which contains no useful information) goes into a table `X`. Here\n", - "`:Churn` is the target variable for which we seek predictions, given\n", - "new versions of the features `X`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(:gender, :SeniorCitizen, :Partner, :Dependents, :tenure, :PhoneService, :MultipleLines, :InternetService, :OnlineSecurity, :OnlineBackup, :DeviceProtection, :TechSupport, :StreamingTV, :StreamingMovies, :Contract, :PaperlessBilling, :PaymentMethod, :MonthlyCharges, :TotalCharges)" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "const y, X = unpack(df, ==(:Churn), !=(:customerID));\n", - "schema(X).names" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Symbol[]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "intersect([:Churn, :customerID], schema(X).names)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll do the same for the holdout data:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "const ytest, Xtest = unpack(df_test, ==(:Churn), !=(:customerID));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading a model and checking type requirements" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `@load`, `input_scitype`, `target_scitype`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For tools helping us to identify suitable models, see the [Model\n", - "Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search)\n", - "section of the manual. We will build a gradient tree-boosting model,\n", - "a popular first choice for structured data like we have here. Model\n", - "code is contained in a third-party package called\n", - "[EvoTrees.jl](https://github.com/Evovest/EvoTrees.jl) which is\n", - "loaded as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "import EvoTrees" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: For silent loading, specify `verbosity=0`. \n", - "└ @ Main /Users/anthony/.julia/packages/MLJModels/38NmP/src/loading.jl:168\n", - "┌ Info: Precompiling EvoTrees [f6006082-12f8-11e9-0c9c-0d5d367ab1e5]\n", - "└ @ Base loading.jl:1342\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ✔\n" - ] - }, - { - "data": { - "text/plain": [ - "EvoTrees.EvoTreeClassifier" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Booster = @load EvoTreeClassifier pkg=EvoTrees" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recall that a *model* is just a container for some algorithm's\n", - "hyper-parameters. 
Let's create a `Booster` with default values for\n", - "the hyper-parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "EvoTreeClassifier(\n", - " loss = EvoTrees.Softmax(),\n", - " nrounds = 10,\n", - " λ = 0.0,\n", - " γ = 0.0,\n", - " η = 0.1,\n", - " max_depth = 5,\n", - " min_weight = 1.0,\n", - " rowsample = 1.0,\n", - " colsample = 1.0,\n", - " nbins = 64,\n", - " α = 0.5,\n", - " metric = :mlogloss,\n", - " rng = MersenneTwister(123),\n", - " device = \"cpu\")" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "booster = Booster()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This model is appropriate for the kind of target variable we have because of\n", - "the following passing test:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "true" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scitype(y) <: target_scitype(booster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, our features `X` cannot be directly used with `booster`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "false" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scitype(X) <: input_scitype(booster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As it turns out, this is because `booster`, like the majority of MLJ\n", - "supervised models, expects the features to be `Continuous`. (With\n", - "some experience, this can be gleaned from `input_scitype(booster)`.)\n", - "So we need categorical feature encoding, discussed next." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building a model pipeline to incorporate feature encoding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `ContinuousEncoder`, pipeline operator `|>`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The built-in `ContinuousEncoder` model transforms an arbitrary table\n", - "to a table whose features are all `Continuous` (dropping any fields\n", - "it does not know how to encode). In particular, all `Multiclass`\n", - "features are one-hot encoded." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A *pipeline* is a stand-alone model that internally combines one or\n", - "more models in a linear (non-branching) pipeline. 
Here's a pipeline\n", - "that adds the `ContinuousEncoder` as a pre-processor to the\n", - "gradient tree-boosting model above:" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ProbabilisticPipeline(\n", - " continuous_encoder = ContinuousEncoder(\n", - " drop_last = false,\n", - " one_hot_ordered_factors = false),\n", - " evo_tree_classifier = EvoTreeClassifier(\n", - " loss = EvoTrees.Softmax(),\n", - " nrounds = 10,\n", - " λ = 0.0,\n", - " γ = 0.0,\n", - " η = 0.1,\n", - " max_depth = 5,\n", - " min_weight = 1.0,\n", - " rowsample = 1.0,\n", - " colsample = 1.0,\n", - " nbins = 64,\n", - " α = 0.5,\n", - " metric = :mlogloss,\n", - " rng = MersenneTwister(123),\n", - " device = \"cpu\"),\n", - " cache = true)" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipe = ContinuousEncoder() |> booster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the component models appear as hyper-parameters of\n", - "`pipe`. Pipelines are an implementation of a more general [model\n", - "composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models)\n", - "interface provided by MLJ that advanced users may want to learn about." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the above display, we see that component model hyper-parameters\n", - "are now *nested*, but they are still accessible (important in hyper-parameter\n", - "optimization):" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipe.evo_tree_classifier.max_depth" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluating the pipeline model's performance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `measures` (function), **measures:** `brier_loss`, `auc`, `accuracy`;\n", - "> `machine`, `fit!`, `predict`, `fitted_params`, `report`, `roc`, **resampling strategy** `StratifiedCV`, `evaluate`, `FeatureSelector`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Without touching our test set `Xtest`, `ytest`, we will estimate the\n", - "performance of our pipeline model, with default hyper-parameters, in\n", - "two different ways:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Evaluating by hand.** First, we'll do this \"by hand\" using the `fit!` and `predict`\n", - "workflow illustrated for the iris data set above, using a\n", - "holdout resampling strategy. At the same time we'll see how to\n", - "generate a **confusion matrix**, **ROC curve**, and inspect\n", - "**feature importances**." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Automated performance evaluation.** Next we'll apply the more\n", - "typical and convenient `evaluate` workflow, but using `StratifiedCV`\n", - "(stratified cross-validation) which is more informative." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In any case, we need to choose some measures (metrics) to quantify\n", - "the performance of our model. For a complete list of measures, one\n", - "does `measures()`. 
Or we also can do:" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2-element Vector{NamedTuple{(:name, :instances, :human_name, :target_scitype, :supports_weights, :supports_class_weights, :prediction_type, :orientation, :reports_each_observation, :aggregation, :is_feature_dependent, :docstring, :distribution_type), T} where T<:Tuple}:\n", - " (name = BrierLoss, instances = [brier_loss], ...)\n", - " (name = BrierScore, instances = [brier_score], ...)" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "measures(\"Brier\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will be primarily using `brier_loss`, but also `auc` (area under\n", - "the ROC curve) and `accuracy`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluating by hand (with a holdout set)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our pipeline model can be trained just like the decision tree model\n", - "we built for the iris data set. Binding all non-test data to the\n", - "pipeline model:" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Machine{ProbabilisticPipeline{NamedTuple{,…},…},…} trained 0 times; caches data\n", - " model: MLJBase.ProbabilisticPipeline{NamedTuple{(:continuous_encoder, :evo_tree_classifier), Tuple{Unsupervised, Probabilistic}}, MLJModelInterface.predict}\n", - " args: \n", - " 1:\tSource @731 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{2}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}`\n", - " 2:\tSource @959 ⏎ `AbstractVector{OrderedFactor{2}}`\n" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mach_pipe = machine(pipe, X, y)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We already encountered the `partition` method above. Here we apply\n", - "it to row indices, instead of data containers, as `fit!` and\n", - "`predict` only need a *view* of the data to work." 
- ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Training Machine{ProbabilisticPipeline{NamedTuple{,…},…},…}.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/machines.jl:464\n", - "┌ Info: Training Machine{ContinuousEncoder,…}.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/machines.jl:464\n", - "┌ Info: Training Machine{EvoTreeClassifier{Float64,…},…}.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/machines.jl:464\n" - ] - }, - { - "data": { - "text/plain": [ - "Machine{ProbabilisticPipeline{NamedTuple{,…},…},…} trained 1 time; caches data\n", - " model: MLJBase.ProbabilisticPipeline{NamedTuple{(:continuous_encoder, :evo_tree_classifier), Tuple{Unsupervised, Probabilistic}}, MLJModelInterface.predict}\n", - " args: \n", - " 1:\tSource @731 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{2}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}`\n", - " 2:\tSource @959 ⏎ `AbstractVector{OrderedFactor{2}}`\n" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train, validation = partition(1:length(y), 0.7)\n", - "fit!(mach_pipe, rows=train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We note in passing that we can access two kinds of information from a trained machine:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- The **learned parameters** (eg, coefficients of a linear model): We use `fitted_params(mach_pipe)`\n", - "- Other **by-products of training** (eg, feature importances): We use `report(mach_pipe)`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(:evo_tree_classifier, :continuous_encoder, :machines, :fitted_params_given_machine)" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fp = fitted_params(mach_pipe);\n", - "keys(fp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For example, we can check that the encoder did not actually drop any features:" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "true" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Set(fp.continuous_encoder.features_to_keep) == Set(schema(X).names)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And, from the report, extract feature importances:" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(:feature_importances,)" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rpt = report(mach_pipe)\n", - "keys(rpt.evo_tree_classifier)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|cc}\n", - "\t& feature & importance\\\\\n", - "\t\\hline\n", - "\t& Symbol & Float64\\\\\n", - "\t\\hline\n", - "\t1 & tenure & 0.339006 \\\\\n", - "\t2 & MonthlyCharges & 0.179065 \\\\\n", - "\t3 & Contract\\_\\_Month-to-month & 0.143517 \\\\\n", - "\t4 & TotalCharges & 0.0808519 \\\\\n", - "\t5 & PaymentMethod\\_\\_Bank transfer (automatic) & 0.0332562 \\\\\n", - "\t6 & SeniorCitizen & 0.0327449 \\\\\n", - "\t7 & TechSupport\\_\\_No & 0.0301126 \\\\\n", - "\t8 & PaperlessBilling\\_\\_No & 0.0271853 \\\\\n", - "\t9 & StreamingMovies\\_\\_Yes & 0.0182164 \\\\\n", - "\t10 & Dependents\\_\\_No & 0.017089 \\\\\n", - "\t11 & PaperlessBilling\\_\\_Yes & 0.0135949 \\\\\n", - "\t12 & gender\\_\\_Female & 0.0122897 \\\\\n", - "\t13 & DeviceProtection\\_\\_No & 0.00952897 \\\\\n", - "\t14 & Dependents\\_\\_Yes & 0.00872813 \\\\\n", - "\t15 & OnlineBackup\\_\\_Yes & 0.0077387 \\\\\n", - "\t16 & StreamingMovies\\_\\_No & 0.00751297 \\\\\n", - "\t17 & TechSupport\\_\\_Yes & 0.00745764 \\\\\n", - "\t18 & OnlineSecurity\\_\\_No & 0.00729202 \\\\\n", - "\t19 & DeviceProtection\\_\\_Yes & 0.00673942 \\\\\n", - "\t20 & Partner\\_\\_Yes & 0.00659092 \\\\\n", - "\t21 & OnlineBackup\\_\\_No & 0.00436309 \\\\\n", - "\t22 & Partner\\_\\_No & 0.00271873 \\\\\n", - "\t23 & Contract\\_\\_Two year & 0.0014937 \\\\\n", - "\t24 & Contract\\_\\_One year & 0.00143396 \\\\\n", - "\t25 & gender\\_\\_Male & 0.00139943 \\\\\n", - "\t26 & InternetService\\_\\_DSL & 7.35606e-5 \\\\\n", - "\t27 & OnlineSecurity\\_\\_No internet service & 0.0 \\\\\n", - "\t28 & PaymentMethod\\_\\_Mailed check & 0.0 \\\\\n", - "\t29 & PaymentMethod\\_\\_Electronic check & 0.0 \\\\\n", - "\t30 & OnlineSecurity\\_\\_Yes & 0.0 \\\\\n", - "\t$\\dots$ & $\\dots$ & $\\dots$ \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m45×2 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m feature \u001b[0m\u001b[1m importance \u001b[0m\n", - "\u001b[1m \u001b[0m│\u001b[90m Symbol \u001b[0m\u001b[90m Float64 \u001b[0m\n", - "─────┼───────────────────────────────────────────────\n", - " 1 │ tenure 0.339006\n", - " 2 │ MonthlyCharges 0.179065\n", - " 3 │ Contract__Month-to-month 0.143517\n", - " 4 │ TotalCharges 0.0808519\n", - " 5 │ PaymentMethod__Bank transfer (au… 0.0332562\n", - " 6 │ SeniorCitizen 0.0327449\n", - " 7 │ TechSupport__No 0.0301126\n", - " 8 │ PaperlessBilling__No 0.0271853\n", - " 9 │ StreamingMovies__Yes 0.0182164\n", - " 10 │ Dependents__No 0.017089\n", - " 11 │ PaperlessBilling__Yes 0.0135949\n", - " ⋮ │ ⋮ ⋮\n", - " 36 │ PhoneService__Yes 0.0\n", - " 37 │ DeviceProtection__No internet se… 0.0\n", - " 38 │ StreamingTV__No internet service 0.0\n", - " 39 │ StreamingMovies__No internet ser… 0.0\n", - " 40 │ TechSupport__No internet service 0.0\n", - " 41 │ StreamingTV__No 0.0\n", - " 42 │ MultipleLines__No 0.0\n", - " 43 │ PaymentMethod__Credit card (auto… 0.0\n", - " 44 │ MultipleLines__Yes 0.0\n", - " 45 │ MultipleLines__No phone service 0.0\n", - "\u001b[36m 24 rows omitted\u001b[0m" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fi = rpt.evo_tree_classifier.feature_importances\n", - "feature_importance_table =\n", - " (feature=Symbol.(first.(fi)), importance=last.(fi)) |> DataFrames.DataFrame" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For models not reporting feature importances, we recommend the\n", - "[Shapley.jl](https://expandingman.gitlab.io/Shapley.jl/) package." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Returning to predictions and evaluations of our measures:" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Measurements\n", - "│ brier_loss(ŷ, y[validation]) |> mean = 0.2700436048904324\n", - "│ auc(ŷ, y[validation]) = 0.8350461133069829\n", - "│ accuracy(mode.(ŷ), y[validation]) = 0.8108108108108107\n", - "└ @ Main In[39]:2\n" - ] - } - ], - "source": [ - "ŷ = predict(mach_pipe, rows=validation);\n", - "@info(\"Measurements\",\n", - " brier_loss(ŷ, y[validation]) |> mean,\n", - " auc(ŷ, y[validation]),\n", - " accuracy(mode.(ŷ), y[validation])\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we need `mode` in the last case because `accuracy` expects\n", - "point predictions, not probabilistic ones. (One can alternatively\n", - "use `predict_mode` to generate the predictions.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While we're here, lets also generate a **confusion matrix** and\n", - "[receiver-operator\n", - "characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)\n", - "(ROC):" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " ┌───────────────────────────┐\n", - " │ Ground Truth │\n", - "┌─────────────┼─────────────┬─────────────┤\n", - "│ Predicted │ No │ Yes │\n", - "├─────────────┼─────────────┼─────────────┤\n", - "│ No │ 101 │ 14 │\n", - "├─────────────┼─────────────┼─────────────┤\n", - "│ Yes │ 14 │ 19 │\n", - "└─────────────┴─────────────┴─────────────┘\n" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "confmat(mode.(ŷ), y[validation])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: Importing the plotting package and calling the plotting\n", - "functions for the first time can take a minute or so." 
- ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Precompiling Plots [91a5bcdd-55d7-5caf-9e0b-520d859cae80]\n", - "└ @ Base loading.jl:1342\n" - ] - } - ], - "source": [ - "using Plots" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Precompiling GR_jll [d2c73de3-f751-5644-a686-071e5b155ba9]\n", - "└ @ Base loading.jl:1342\n" - ] - }, - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roc_curve = roc(ŷ, y[validation])\n", - "plt = scatter(roc_curve, legend=false)\n", - "plot!(plt, xlab=\"false positive rate\", ylab=\"true positive rate\")\n", - "plot!([0, 1], [0, 1], linewidth=2, linestyle=:dash, color=:black)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Automated performance evaluation (more typical workflow)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also get performance estimates with a single call to the\n", - "`evaluate` function, which also allows for more complicated\n", - "resampling - in this case stratified cross-validation. To make this\n", - "more comprehensive, we set `repeats=3` below to make our\n", - "cross-validation \"Monte Carlo\" (3 random size-6 partitions of the\n", - "observation space, for a total of 18 folds) and set\n", - "`acceleration=CPUThreads()` to parallelize the computation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We choose a `StratifiedCV` resampling strategy; the complete list of options is\n", - "[here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Performing evaluations using 5 threads.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/resampling.jl:1177\n", - "\u001b[33mEvaluating over 18 folds: 100%[=========================] Time: 0:00:05\u001b[39m\n" - ] - }, - { - "data": { - "text/plain": [ - "PerformanceEvaluation object with these fields:\n", - " measure, measurement, operation, per_fold,\n", - " per_observation, fitted_params_per_fold,\n", - " report_per_fold, train_test_pairs\n", - "Extract:\n", - "┌──────────────────┬─────────────┬──────────────┬───────────────────────────────\n", - "│\u001b[22m measure \u001b[0m│\u001b[22m measurement \u001b[0m│\u001b[22m operation \u001b[0m│\u001b[22m per_fold \u001b[0m ⋯\n", - "├──────────────────┼─────────────┼──────────────┼───────────────────────────────\n", - "│ BrierLoss() │ 0.313 │ predict │ [0.296, 0.346, 0.289, 0.337, ⋯\n", - "│ AreaUnderCurve() │ 0.789 │ predict │ [0.791, 0.732, 0.8, 0.778, 0 ⋯\n", - "│ Accuracy() │ 0.781 │ predict_mode │ [0.795, 0.78, 0.768, 0.744, ⋯\n", - "└──────────────────┴─────────────┴──────────────┴───────────────────────────────\n", - "\u001b[36m 1 column omitted\u001b[0m\n" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "e_pipe = evaluate(pipe, X, y,\n", - " resampling=StratifiedCV(nfolds=6, rng=123),\n", - " measures=[brier_loss, auc, accuracy],\n", - " repeats=3,\n", - " acceleration=CPUThreads())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(There is also a version of `evaluate` for machines. Query the\n", - "`evaluate` and `evaluate!` doc-strings to learn more about these\n", - "functions and what the `PerformanceEvaluation` object `e_pipe` records.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While [less than ideal](https://arxiv.org/abs/2104.00673), let's\n", - "adopt the common practice of using the standard error of a\n", - "cross-validation score as an estimate of the uncertainty of a\n", - "performance measure's expected value. Here's a utility function to\n", - "calculate 95% confidence intervals for our performance estimates based\n", - "on this practice, and it's application to the current evaluation:" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "using Measurements" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|cc}\n", - "\t& measure & measurement\\\\\n", - "\t\\hline\n", - "\t& Measure & Measurem…\\\\\n", - "\t\\hline\n", - "\t1 & BrierLoss() & $0.313 \\pm 0.014$ \\\\\n", - "\t2 & AreaUnderCurve() & $0.789 \\pm 0.023$ \\\\\n", - "\t3 & Accuracy() & $0.781 \\pm 0.014$ \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m3×2 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m measure \u001b[0m\u001b[1m measurement \u001b[0m\n", - "\u001b[1m \u001b[0m│\u001b[90m Measure \u001b[0m\u001b[90m Measuremen… \u001b[0m\n", - "─────┼───────────────────────────────\n", - " 1 │ BrierLoss() 0.313±0.014\n", - " 2 │ AreaUnderCurve() 0.789±0.023\n", - " 3 │ Accuracy() 0.781±0.014" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "function confidence_intervals(e)\n", - " factor = 2.0 # to get level of 95%\n", - " measure = e.measure\n", - " nfolds = length(e.per_fold[1])\n", - " measurement = [e.measurement[j] ± factor*std(e.per_fold[j])/sqrt(nfolds - 1)\n", - " for j in eachindex(measure)]\n", - " table = (measure=measure, measurement=measurement)\n", - " return DataFrames.DataFrame(table)\n", - "end\n", - "\n", - "const confidence_intervals_basic_model = confidence_intervals(e_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filtering out unimportant features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `FeatureSelector`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before continuing, we'll modify our pipeline to drop those features\n", - "with low feature importance, to speed up later optimization:" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ProbabilisticPipeline(\n", - " continuous_encoder = ContinuousEncoder(\n", - " drop_last = false,\n", - " one_hot_ordered_factors = false),\n", - " feature_selector = FeatureSelector(\n", - " features = [:OnlineBackup__No, :Partner__No, Symbol(\"Contract__Two year\"), Symbol(\"Contract__One year\"), :gender__Male, :InternetService__DSL, Symbol(\"OnlineSecurity__No internet service\"), Symbol(\"PaymentMethod__Mailed check\"), Symbol(\"PaymentMethod__Electronic check\"), :OnlineSecurity__Yes … :PhoneService__Yes, Symbol(\"DeviceProtection__No internet service\"), Symbol(\"StreamingTV__No internet service\"), Symbol(\"StreamingMovies__No internet service\"), Symbol(\"TechSupport__No internet service\"), :StreamingTV__No, :MultipleLines__No, Symbol(\"PaymentMethod__Credit card (automatic)\"), :MultipleLines__Yes, Symbol(\"MultipleLines__No phone service\")],\n", - " ignore = true),\n", - " evo_tree_classifier = EvoTreeClassifier(\n", - " loss = EvoTrees.Softmax(),\n", - " nrounds = 10,\n", - " λ = 0.0,\n", - " γ = 0.0,\n", - " η = 0.1,\n", - " max_depth = 5,\n", - " min_weight = 1.0,\n", - " rowsample = 1.0,\n", - " colsample = 1.0,\n", - " nbins = 64,\n", - " α = 0.5,\n", - " metric = :mlogloss,\n", - " rng = MersenneTwister(123, (0, 86172, 85170, 780)),\n", - " device = \"cpu\"),\n", - " cache = true)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "unimportant_features = filter(:importance => <(0.005), feature_importance_table).feature\n", - "\n", - "pipe2 = ContinuousEncoder() |>\n", - " FeatureSelector(features=unimportant_features, ignore=true) |> booster" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "## Wrapping our iterative model in control strategies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: **control strategies:** `Step`, `NumberSinceBest`, `TimeLimit`, `InvalidValue`, **model wrapper** `IteratedModel`, **resampling strategy:** `Holdout`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to optimize the hyper-parameters of our model. Since our\n", - "model is iterative, these parameters include the (nested) iteration\n", - "parameter `pipe.evo_tree_classifier.nrounds`. Sometimes this\n", - "parameter is optimized first, fixed, and then maybe optimized again\n", - "after the other parameters. Here we take a more principled approach,\n", - "**wrapping our model in a control strategy** that makes it\n", - "\"self-iterating\". The strategy applies a stopping criterion to\n", - "*out-of-sample* estimates of the model performance, constructed\n", - "using an internally constructed holdout set. In this way, we avoid\n", - "some data hygiene issues, and, when we subsequently optimize other\n", - "parameters, we will always being using an optimal number of\n", - "iterations." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that this approach can be applied to any iterative MLJ model,\n", - "eg, the neural network models provided by\n", - "[MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we select appropriate controls from [this\n", - "list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided):" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4-element Vector{Any}:\n", - " Step(1)\n", - " NumberSinceBest(4)\n", - " TimeLimit(Dates.Millisecond(2000))\n", - " InvalidValue()" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "controls = [\n", - " Step(1), # to increment iteration parameter (`pipe.nrounds`)\n", - " NumberSinceBest(4), # main stopping criterion\n", - " TimeLimit(2/3600), # never train more than 2 sec\n", - " InvalidValue() # stop if NaN or ±Inf encountered\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we wrap our pipeline model using the `IteratedModel` wrapper,\n", - "being sure to specify the `measure` on which internal estimates of\n", - "the out-of-sample performance will be based:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ProbabilisticIteratedModel(\n", - " model = ProbabilisticPipeline(\n", - " continuous_encoder = ContinuousEncoder,\n", - " feature_selector = FeatureSelector,\n", - " evo_tree_classifier = EvoTreeClassifier{Float64,…},\n", - " cache = true),\n", - " controls = Any[Step(1), NumberSinceBest(4), TimeLimit(Dates.Millisecond(2000)), InvalidValue()],\n", - " resampling = Holdout(\n", - " fraction_train = 0.7,\n", - " shuffle = false,\n", - " rng = Random._GLOBAL_RNG()),\n", - " measure = BrierLoss(),\n", - " weights = nothing,\n", - " class_weights = nothing,\n", - " operation = MLJModelInterface.predict,\n", - " retrain = false,\n", - " check_measure = true,\n", - " iteration_parameter = nothing,\n", - " cache = true)" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - 
"source": [ - "iterated_pipe = IteratedModel(model=pipe2,\n", - " controls=controls,\n", - " measure=brier_loss,\n", - " resampling=Holdout(fraction_train=0.7))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've set `resampling=Holdout(fraction_train=0.7)` to arrange that\n", - "data attached to our model should be internally split into a train\n", - "set (70%) and a holdout set (30%) for determining the out-of-sample\n", - "estimate of the Brier loss." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For demonstration purposes, let's bind `iterated_model` to all data\n", - "not in our don't-touch holdout set, and train on all of that data:" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Training Machine{ProbabilisticIteratedModel{ProbabilisticPipeline{NamedTuple{,…},…}},…}.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/machines.jl:464\n", - "┌ Info: No iteration parameter specified. Using `iteration_parameter=:(evo_tree_classifier.nrounds)`. \n", - "└ @ MLJIteration /Users/anthony/.julia/packages/MLJIteration/LXjxi/src/core.jl:62\n", - "┌ Info: final loss: 0.2630163172395482\n", - "└ @ IterationControl /Users/anthony/.julia/packages/IterationControl/lO4bA/src/train.jl:44\n", - "┌ Info: Stop triggered by NumberSinceBest(4) stopping criterion. \n", - "└ @ IterationControl /Users/anthony/.julia/packages/IterationControl/lO4bA/src/stopping_controls.jl:54\n", - "┌ Info: Total of 24 iterations. \n", - "└ @ MLJIteration /Users/anthony/.julia/packages/MLJIteration/LXjxi/src/core.jl:35\n" - ] - } - ], - "source": [ - "mach_iterated_pipe = machine(iterated_pipe, X, y)\n", - "fit!(mach_iterated_pipe);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To recap, internally this training is split into two separate steps:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- A controlled iteration step, training on the holdout set, with the total number of iterations determined by the specified stopping criteria (based on the out-of-sample performance estimates)\n", - "- A final step that trains the atomic model on *all* available\n", - " data using the number of iterations determined in the first step. Calling `predict` on `mach_iterated_pipe` means using the learned parameters of the second step." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hyper-parameter optimization (model tuning)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `range`, **model wrapper** `TunedModel`, `RandomSearch`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now turn to hyper-parameter optimization. A tool not discussed\n", - "here is the `learning_curve` function, which can be useful when\n", - "wanting to visualize the effect of changes to a *single*\n", - "hyper-parameter (which could be an iteration parameter). See, for\n", - "example, [this section of the\n", - "manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/)\n", - "or [this\n", - "tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fine tuning the hyper-parameters of a gradient booster can be\n", - "somewhat involved. 
Here we settle for simultaneously optimizing two\n", - "key parameters: `max_depth` and `η` (learning_rate)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like iteration control, **model optimization in MLJ is implemented as\n", - "a model wrapper**, called `TunedModel`. After wrapping a model in a\n", - "tuning strategy and binding the wrapped model to data in a machine\n", - "called `mach`, calling `fit!(mach)` instigates a search for optimal\n", - "model hyperparameters, within a specified range, and then uses all\n", - "supplied data to train the best model. To predict using that model,\n", - "one then calls `predict(mach, Xnew)`. In this way the wrapped model\n", - "may be viewed as a \"self-tuning\" version of the unwrapped\n", - "model. That is, wrapping the model simply transforms certain\n", - "hyper-parameters into learned parameters (just as `IteratedModel`\n", - "does for an iteration parameter)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To start with, we define ranges for the parameters of\n", - "interest. Since these parameters are nested, let's force a\n", - "display of our model to a larger depth:" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ProbabilisticIteratedModel(\n", - " model = ProbabilisticPipeline(\n", - " continuous_encoder = ContinuousEncoder(\n", - " drop_last = false,\n", - " one_hot_ordered_factors = false),\n", - " feature_selector = FeatureSelector(\n", - " features = [:OnlineBackup__No, :Partner__No, Symbol(\"Contract__Two year\"), Symbol(\"Contract__One year\"), :gender__Male, :InternetService__DSL, Symbol(\"OnlineSecurity__No internet service\"), Symbol(\"PaymentMethod__Mailed check\"), Symbol(\"PaymentMethod__Electronic check\"), :OnlineSecurity__Yes, Symbol(\"InternetService__Fiber optic\"), Symbol(\"OnlineBackup__No internet service\"), :InternetService__No, :StreamingTV__Yes, :PhoneService__No, :PhoneService__Yes, Symbol(\"DeviceProtection__No internet service\"), Symbol(\"StreamingTV__No internet service\"), Symbol(\"StreamingMovies__No internet service\"), Symbol(\"TechSupport__No internet service\"), :StreamingTV__No, :MultipleLines__No, Symbol(\"PaymentMethod__Credit card (automatic)\"), :MultipleLines__Yes, Symbol(\"MultipleLines__No phone service\")],\n", - " ignore = true),\n", - " evo_tree_classifier = EvoTreeClassifier(\n", - " loss = EvoTrees.Softmax(),\n", - " nrounds = 10,\n", - " λ = 0.0,\n", - " γ = 0.0,\n", - " η = 0.1,\n", - " max_depth = 5,\n", - " min_weight = 1.0,\n", - " rowsample = 1.0,\n", - " colsample = 1.0,\n", - " nbins = 64,\n", - " α = 0.5,\n", - " metric = :mlogloss,\n", - " rng = MersenneTwister(123, (0, 86172, 85170, 780)),\n", - " device = \"cpu\"),\n", - " cache = true),\n", - " controls = Any[Step(1), NumberSinceBest(4), TimeLimit(Dates.Millisecond(2000)), InvalidValue()],\n", - " resampling = Holdout(\n", - " fraction_train = 0.7,\n", - " shuffle = false,\n", - " rng = Random._GLOBAL_RNG()),\n", - " measure = BrierLoss(),\n", - " weights = nothing,\n", - " class_weights = nothing,\n", - " operation = MLJModelInterface.predict,\n", - " retrain = false,\n", - " check_measure = true,\n", - " iteration_parameter = nothing,\n", - " cache = true)" - ] - } - ], - "source": [ - "show(iterated_pipe, 2)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "NumericRange(2 ≤ 
model.evo_tree_classifier.max_depth ≤ 6; origin=4.0, unit=2.0)" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p1 = :(model.evo_tree_classifier.η)\n", - "p2 = :(model.evo_tree_classifier.max_depth)\n", - "\n", - "r1 = range(iterated_pipe, p1, lower=-2, upper=-0.5, scale=x->10^x)\n", - "r2 = range(iterated_pipe, p2, lower=2, upper=6)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nominal ranges are defined by specifying `values` instead of `lower`\n", - "and `upper`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we choose an optimization strategy from [this\n", - "list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models):" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomSearch(\n", - " bounded = Distributions.Uniform,\n", - " positive_unbounded = Distributions.Gamma,\n", - " other = Distributions.Normal,\n", - " rng = MersenneTwister(123))" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tuning = RandomSearch(rng=123)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we wrap the model, specifying a `resampling` strategy and a\n", - "`measure`, as we did for `IteratedModel`. In fact, we can include a\n", - "battery of `measures`; by default, optimization is with respect to\n", - "performance estimates based on the first measure, but estimates for\n", - "all measures can be accessed from the model's `report`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The keyword `n` specifies the total number of models (sets of\n", - "hyper-parameters) to evaluate." 
- ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ProbabilisticTunedModel(\n", - " model = ProbabilisticIteratedModel(\n", - " model = ProbabilisticPipeline{NamedTuple{,…},…},\n", - " controls = Any[Step(1), NumberSinceBest(4), TimeLimit(Dates.Millisecond(2000)), InvalidValue()],\n", - " resampling = Holdout,\n", - " measure = BrierLoss(),\n", - " weights = nothing,\n", - " class_weights = nothing,\n", - " operation = MLJModelInterface.predict,\n", - " retrain = false,\n", - " check_measure = true,\n", - " iteration_parameter = nothing,\n", - " cache = true),\n", - " tuning = RandomSearch(\n", - " bounded = Distributions.Uniform,\n", - " positive_unbounded = Distributions.Gamma,\n", - " other = Distributions.Normal,\n", - " rng = MersenneTwister(123)),\n", - " resampling = StratifiedCV(\n", - " nfolds = 6,\n", - " shuffle = true,\n", - " rng = MersenneTwister(123)),\n", - " measure = MLJBase.Measure[BrierLoss(), AreaUnderCurve(), Accuracy()],\n", - " weights = nothing,\n", - " operation = nothing,\n", - " range = MLJBase.NumericRange{T, MLJBase.Bounded, D} where {T, D}[transformed NumericRange(-2.0 ≤ model.evo_tree_classifier.η ≤ -0.5; origin=-1.25, unit=0.75), NumericRange(2 ≤ model.evo_tree_classifier.max_depth ≤ 6; origin=4.0, unit=2.0)],\n", - " selection_heuristic = MLJTuning.NaiveSelection(nothing),\n", - " train_best = true,\n", - " repeats = 1,\n", - " n = 40,\n", - " acceleration = CPUThreads{Int64}(5),\n", - " acceleration_resampling = CPU1{Nothing}(nothing),\n", - " check_measure = true,\n", - " cache = true)" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tuned_iterated_pipe = TunedModel(model=iterated_pipe,\n", - " range=[r1, r2],\n", - " tuning=tuning,\n", - " measures=[brier_loss, auc, accuracy],\n", - " resampling=StratifiedCV(nfolds=6, rng=123),\n", - " acceleration=CPUThreads(),\n", - " n=40)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To save time, we skip the `repeats` here." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Binding our final model to data and training:" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Training Machine{ProbabilisticTunedModel{RandomSearch,…},…}.\n", - "└ @ MLJBase /Users/anthony/.julia/packages/MLJBase/hHa7b/src/machines.jl:464\n", - "┌ Info: Attempting to evaluate 40 models.\n", - "└ @ MLJTuning /Users/anthony/.julia/packages/MLJTuning/Al9yX/src/tuned_models.jl:680\n", - "\u001b[33mEvaluating over 40 metamodels: 100%[=========================] Time: 0:00:57\u001b[39m\n" - ] - }, - { - "data": { - "text/plain": [ - "Machine{ProbabilisticTunedModel{RandomSearch,…},…} trained 1 time; caches data\n", - " model: MLJTuning.ProbabilisticTunedModel{RandomSearch, MLJIteration.ProbabilisticIteratedModel{MLJBase.ProbabilisticPipeline{NamedTuple{(:continuous_encoder, :feature_selector, :evo_tree_classifier), Tuple{Unsupervised, Unsupervised, Probabilistic}}, MLJModelInterface.predict}}}\n", - " args: \n", - " 1:\tSource @866 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{2}}, AbstractVector{Multiclass{4}}, AbstractVector{Multiclass{3}}}}`\n", - " 2:\tSource @663 ⏎ `AbstractVector{OrderedFactor{2}}`\n" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mach_tuned_iterated_pipe = machine(tuned_iterated_pipe, X, y)\n", - "fit!(mach_tuned_iterated_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As explained above, the training we have just performed was split\n", - "internally into two separate steps:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- A step to determine the parameter values that optimize the aggregated cross-validation scores\n", - "- A final step that trains the optimal model on *all* available data. Future predictions `predict(mach_tuned_iterated_pipe, ...)` are based on this final training step." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From `report(mach_tuned_iterated_pipe)` we can extract details about\n", - "the optimization procedure. 
For example:" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "EvoTreeClassifier(\n", - " loss = EvoTrees.Softmax(),\n", - " nrounds = 10,\n", - " λ = 0.0,\n", - " γ = 0.0,\n", - " η = 0.06612359168674212,\n", - " max_depth = 2,\n", - " min_weight = 1.0,\n", - " rowsample = 1.0,\n", - " colsample = 1.0,\n", - " nbins = 64,\n", - " α = 0.5,\n", - " metric = :mlogloss,\n", - " rng = MersenneTwister(123, (0, 86172, 85170, 780)),\n", - " device = \"cpu\")" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rpt2 = report(mach_tuned_iterated_pipe);\n", - "best_booster = rpt2.best_model.model.evo_tree_classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Optimal hyper-parameters:\n", - "│ best_booster.max_depth = 2\n", - "│ best_booster.η = 0.06612359168674212\n", - "└ @ Main In[56]:1\n" - ] - } - ], - "source": [ - "@info \"Optimal hyper-parameters:\" best_booster.max_depth best_booster.η;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the `confidence_intervals` function we defined earlier:" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|cc}\n", - "\t& measure & measurement\\\\\n", - "\t\\hline\n", - "\t& Measure & Measurem…\\\\\n", - "\t\\hline\n", - "\t1 & BrierLoss() & $0.29 \\pm 0.035$ \\\\\n", - "\t2 & AreaUnderCurve() & $0.824 \\pm 0.043$ \\\\\n", - "\t3 & Accuracy() & $0.797 \\pm 0.047$ \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m3×2 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m measure \u001b[0m\u001b[1m measurement \u001b[0m\n", - "\u001b[1m \u001b[0m│\u001b[90m Measure \u001b[0m\u001b[90m Measuremen… \u001b[0m\n", - "─────┼───────────────────────────────\n", - " 1 │ BrierLoss() 0.29±0.035\n", - " 2 │ AreaUnderCurve() 0.824±0.043\n", - " 3 │ Accuracy() 0.797±0.047" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "e_best = rpt2.best_history_entry\n", - "confidence_intervals(e_best)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Digging a little deeper, we can learn what stopping criterion was\n", - "applied in the case of the optimal model, and how many iterations\n", - "were required:" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4-element Vector{Tuple{Any, NamedTuple}}:\n", - " (Step(1), (new_iterations = 66,))\n", - " (NumberSinceBest(4), (done = true, log = \"Stop triggered by NumberSinceBest(4) stopping criterion. \"))\n", - " (TimeLimit(Dates.Millisecond(2000)), (done = false, log = \"\"))\n", - " (InvalidValue(), (done = false, log = \"\"))" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rpt2.best_report.controls |> collect" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we can visualize the optimization results:" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - 
"\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "plot(mach_tuned_iterated_pipe, size=(600,450))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Saving our model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `MLJ.save`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's how to serialize our final, trained self-iterating,\n", - "self-tuning pipeline machine:" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "MLJ.save(\"tuned_iterated_pipe.jlso\", mach_tuned_iterated_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll deserialize this in \"Testing the final model\" below." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final performance estimate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, to get an even more accurate estimate of performance, we\n", - "can evaluate our model using stratified cross-validation and all the\n", - "data attached to our machine. Because this evaluation implies\n", - "[nested\n", - "resampling](https://mlr.mlr-org.com/articles/tutorial/nested_resampling.html),\n", - "this computation takes quite a bit longer than the previous one\n", - "(which is being repeated six times, using 5/6th of the data each\n", - "time):" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[33mEvaluating over 6 folds: 100%[=========================] Time: 0:05:10\u001b[39m\n" - ] - }, - { - "data": { - "text/plain": [ - "PerformanceEvaluation object with these fields:\n", - " measure, measurement, operation, per_fold,\n", - " per_observation, fitted_params_per_fold,\n", - " report_per_fold, train_test_pairs\n", - "Extract:\n", - "┌──────────────────┬─────────────┬──────────────┬───────────────────────────────\n", - "│\u001b[22m measure \u001b[0m│\u001b[22m measurement \u001b[0m│\u001b[22m operation \u001b[0m│\u001b[22m per_fold \u001b[0m ⋯\n", - "├──────────────────┼─────────────┼──────────────┼───────────────────────────────\n", - "│ BrierLoss() │ 0.291 │ predict │ [0.294, 0.341, 0.256, 0.288, ⋯\n", - "│ AreaUnderCurve() │ 0.814 │ predict │ [0.806, 0.735, 0.846, 0.851, ⋯\n", - "│ Accuracy() │ 0.797 │ predict_mode │ [0.795, 0.78, 0.854, 0.78, 0 ⋯\n", - "└──────────────────┴─────────────┴──────────────┴───────────────────────────────\n", - "\u001b[36m 1 column omitted\u001b[0m\n" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "e_tuned_iterated_pipe = evaluate(tuned_iterated_pipe, X, y,\n", - " resampling=StratifiedCV(nfolds=6, rng=123),\n", - " measures=[brier_loss, auc, accuracy])" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|cc}\n", - "\t& measure & measurement\\\\\n", - "\t\\hline\n", - "\t& Measure & Measurem…\\\\\n", - "\t\\hline\n", - "\t1 & BrierLoss() & $0.291 \\pm 0.034$ \\\\\n", - "\t2 & AreaUnderCurve() & $0.814 \\pm 0.056$ \\\\\n", - "\t3 & Accuracy() & $0.797 \\pm 0.035$ \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m3×2 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m measure \u001b[0m\u001b[1m measurement \u001b[0m\n", - "\u001b[1m \u001b[0m│\u001b[90m Measure \u001b[0m\u001b[90m Measuremen… \u001b[0m\n", - "─────┼───────────────────────────────\n", - " 1 │ BrierLoss() 0.291±0.034\n", - " 2 │ AreaUnderCurve() 0.814±0.056\n", - " 3 │ Accuracy() 0.797±0.035" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "confidence_intervals(e_tuned_iterated_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For comparison, here are the confidence intervals for the basic\n", - "pipeline model (no feature selection and default hyperparameters):" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

" - ], - "text/latex": [ - "\\begin{tabular}{r|cc}\n", - "\t& measure & measurement\\\\\n", - "\t\\hline\n", - "\t& Measure & Measurem…\\\\\n", - "\t\\hline\n", - "\t1 & BrierLoss() & $0.313 \\pm 0.014$ \\\\\n", - "\t2 & AreaUnderCurve() & $0.789 \\pm 0.023$ \\\\\n", - "\t3 & Accuracy() & $0.781 \\pm 0.014$ \\\\\n", - "\\end{tabular}\n" - ], - "text/plain": [ - "\u001b[1m3×2 DataFrame\u001b[0m\n", - "\u001b[1m Row \u001b[0m│\u001b[1m measure \u001b[0m\u001b[1m measurement \u001b[0m\n", - "\u001b[1m \u001b[0m│\u001b[90m Measure \u001b[0m\u001b[90m Measuremen… \u001b[0m\n", - "─────┼───────────────────────────────\n", - " 1 │ BrierLoss() 0.313±0.014\n", - " 2 │ AreaUnderCurve() 0.789±0.023\n", - " 3 │ Accuracy() 0.781±0.014" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "confidence_intervals_basic_model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As each pair of intervals overlap, it's doubtful the small changes\n", - "here can be assigned statistical significance. Default `booster`\n", - "hyper-parameters do a pretty good job." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Testing the final model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now determine the performance of our model on our\n", - "lock-and-throw-away-the-key holdout set. To demonstrate\n", - "deserialization, we'll pretend we're in a new Julia session (but\n", - "have called `import`/`using` on the same packages). Then the\n", - "following should suffice to recover our model trained under\n", - "\"Hyper-parameter optimization\" above:" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Machine{ProbabilisticTunedModel{RandomSearch,…},…} trained 1 time; caches data\n", - " model: MLJTuning.ProbabilisticTunedModel{RandomSearch, MLJIteration.ProbabilisticIteratedModel{MLJBase.ProbabilisticPipeline{NamedTuple{(:continuous_encoder, :feature_selector, :evo_tree_classifier), Tuple{Unsupervised, Unsupervised, Probabilistic}}, MLJModelInterface.predict}}}\n", - " args: \n" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mach_restored = machine(\"tuned_iterated_pipe.jlso\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We compute predictions on the holdout set:" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " \u001b[1mUnivariateFinite{Multiclass{2}}\u001b[22m \n", - " \u001b[90m┌ ┐\u001b[39m \n", - " \u001b[0mNo \u001b[90m┤\u001b[39m\u001b[38;5;2m■■■■■■■■■■■■■■■■■\u001b[39m\u001b[0m 0.44551543781072306 \u001b[90m \u001b[39m \n", - " \u001b[0mYes \u001b[90m┤\u001b[39m\u001b[38;5;2m■■■■■■■■■■■■■■■■■■■■■\u001b[39m\u001b[0m 0.554484562189277 \u001b[90m \u001b[39m \n", - " \u001b[90m└ ┘\u001b[39m " - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ŷ_tuned = predict(mach_restored, Xtest);\n", - "ŷ_tuned[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And can compute the final performance measures:" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Tuned model measurements on test:\n", - "│ brier_loss(ŷ_tuned, ytest) |> 
mean = 0.26784766342462435\n", - "│ auc(ŷ_tuned, ytest) = 0.8467741935483871\n", - "│ accuracy(mode.(ŷ_tuned), ytest) = 0.8104265402843602\n", - "└ @ Main In[66]:1\n" - ] - } - ], - "source": [ - "@info(\"Tuned model measurements on test:\",\n", - " brier_loss(ŷ_tuned, ytest) |> mean,\n", - " auc(ŷ_tuned, ytest),\n", - " accuracy(mode.(ŷ_tuned), ytest)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For comparison, here's the performance for the basic pipeline model" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Info: Basic model measurements on test set:\n", - "│ brier_loss(ŷ_basic, ytest) |> mean = 0.2815220182102922\n", - "│ auc(ŷ_basic, ytest) = 0.8496543778801844\n", - "│ accuracy(mode.(ŷ_basic), ytest) = 0.8009478672985781\n", - "└ @ Main In[67]:6\n" - ] - } - ], - "source": [ - "mach_basic = machine(pipe, X, y)\n", - "fit!(mach_basic, verbosity=0)\n", - "\n", - "ŷ_basic = predict(mach_basic, Xtest);\n", - "\n", - "@info(\"Basic model measurements on test set:\",\n", - " brier_loss(ŷ_basic, ytest) |> mean,\n", - " auc(ŷ_basic, ytest),\n", - " accuracy(mode.(ŷ_basic), ytest)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Julia 1.6.5", - "language": "julia", - "name": "julia-1.6" - }, - "language_info": { - "file_extension": ".jl", - "mimetype": "application/julia", - "name": "julia", - "version": "1.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 3 -} diff --git a/examples/telco/notebook.jl b/examples/telco/notebook.jl deleted file mode 100644 index d42a46923..000000000 --- a/examples/telco/notebook.jl +++ /dev/null @@ -1,757 +0,0 @@ -# # MLJ for Data Scientists in Two Hours - -# An application of the [MLJ -# toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the -# Telco Customer Churn dataset, aimed at practicing data scientists -# new to MLJ (Machine Learning in Julia). This tutorial does not -# cover exploratory data analysis. - -# MLJ is a *multi-paradigm* machine learning toolbox (i.e., not just -# deep-learning). - -# For other MLJ learning resources see the [Learning -# MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/) -# section of the -# [manual](https://juliaai.github.io/MLJ.jl/dev/). - -# **Topics covered**: Grabbing and preparing a dataset, basic -# fit/predict workflow, constructing a pipeline to include data -# pre-processing, estimating performance metrics, ROC curves, confusion -# matrices, feature importance, basic feature selection, controlling iterative -# models, hyper-parameter optimization (tuning). - -# **Prerequisites for this tutorial.** Previous experience building, -# evaluating, and optimizing machine learning models using -# scikit-learn, caret, MLR, weka, or similar tool. No previous -# experience with MLJ. Only fairly basic familiarity with Julia is -# required. Uses -# [DataFrames.jl](https://dataframes.juliadata.org/stable/) but in a -# minimal way ([this -# cheatsheet](https://ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/index.html) -# may help). - -# **Time.** Between two and three hours, first time through. 
- - -# ## Summary of methods and types introduced - -# |code | purpose| -# |:-------|:-------------------------------------------------------| -# | `OpenML.load(id)` | grab a dataset from [OpenML.org](https://www.openml.org)| -# | `scitype(X)` | inspect the scientific type (scitype) of object `X`| -# | `schema(X)` | inspect the column scitypes (scientific types) of a table `X`| -# | `coerce(X, ...)` | fix column encodings to get appropriate scitypes| -# | `partition(data, frac1, frac2, ...; rng=...)` | vertically split `data`, which can be a table, vector or matrix| -# | `unpack(table, f1, f2, ...)` | horizontally split `table` based on conditions `f1`, `f2`, ..., applied to column names| -# | `@load ModelType pkg=...` | load code defining a model type| -# | `input_scitype(model)` | inspect the scitype that a model requires for features (inputs)| -# | `target_scitype(model)`| inspect the scitype that a model requires for the target (labels)| -# | `ContinuousEncoder` | built-in model type for re-encoding all features as `Continuous`| -# | `model1 ∣> model2 ∣> ...` | combine multiple models into a pipeline| -# | `measures("under curve")` | list all measures (metrics) with string "under curve" in documentation| -# | `accuracy(yhat, y)` | compute accuracy of predictions `yhat` against ground truth observations `y`| -# | `auc(yhat, y)`, `brier_loss(yhat, y)` | evaluate two probabilistic measures (`yhat` a vector of probability distributions)| -# | `machine(model, X, y)` | bind `model` to training data `X` (features) and `y` (target)| -# | `fit!(mach, rows=...)` | train machine using specified rows (observation indices)| -# | `predict(mach, rows=...)`, | make in-sample model predictions given specified rows| -# | `predict(mach, Xnew)` | make predictions given new features `Xnew`| -# | `fitted_params(mach)` | inspect learned parameters| -# | `report(mach)` | inspect other outcomes of training| -# | `confmat(yhat, y)` | confusion matrix for predictions `yhat` and ground truth `y`| -# | `roc(yhat, y)` | compute points on the receiver-operator Characteristic| -# | `StratifiedCV(nfolds=6)` | 6-fold stratified cross-validation resampling strategy| -# | `Holdout(fraction_train=0.7)` | holdout resampling strategy| -# | `evaluate(model, X, y; resampling=..., options...)` | estimate performance metrics `model` using the data `X`, `y`| -# | `FeatureSelector()` | transformer for selecting features| -# | `Step(3)` | iteration control for stepping 3 iterations| -# | `NumberSinceBest(6)`, `TimeLimit(60/5), InvalidValue()` | iteration control stopping criteria| -# | `IteratedModel(model=..., controls=..., options...)` | wrap an iterative `model` in control strategies| -# | `range(model, :some_hyperparam, lower=..., upper=...)` | define a numeric range| -# | `RandomSearch()` | random search tuning strategy| -# | `TunedModel(model=..., tuning=..., options...)` | wrap the supervised `model` in specified `tuning` strategy| - - -# ## Instantiate a Julia environment - -# The following code replicates precisely the set of Julia packages -# used to develop this tutorial. If this is your first time running -# the notebook, package instantiation and pre-compilation may take a -# minute or so to complete. **This step will fail** if the [correct -# Manifest.toml and Project.toml -# files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco) -# are not in the same directory as this notebook. 
- -using Pkg -Pkg.activate(@__DIR__) # get env from TOML files in same directory as this notebook -Pkg.instantiate() - - -# ## Warm up: Building a model for the iris dataset - -# Before turning to the Telco Customer Churn dataset, we very quickly -# build a predictive model for Fisher's well-known iris data set, as way of -# introducing the main actors in any MLJ workflow. Details that you -# don't fully grasp should become clearer in the Telco study. - -# This section is a condensed adaption of the [Getting Started -# example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict) -# in the MLJ documentation. - -# First, using the built-in iris dataset, we load and inspect the features -# `X_iris` (a table) and target variable `y_iris` (a vector): - -using MLJ - -#- - -const X_iris, y_iris = @load_iris; -schema(X_iris) - -#- - -y_iris[1:4] - -# - -levels(y_iris) - -# We load a decision tree model, from the package DecisionTree.jl: - -DecisionTree = @load DecisionTreeClassifier pkg=DecisionTree # model type -model = DecisionTree(min_samples_split=5) # model instance - -# In MLJ, a *model* is just a container for hyperparameters of -# some learning algorithm. It does not store learned parameters. - -# Next, we bind the model together with the available data in what's -# called a *machine*: - -mach = machine(model, X_iris, y_iris) - -# A machine is essentially just a model (ie, hyperparameters) plus data, but -# it additionally stores *learned parameters* (the tree) once it is -# trained on some view of the data: - -train_rows = vcat(1:60, 91:150); # some row indices (observations are rows not columns) -fit!(mach, rows=train_rows) -fitted_params(mach) - -# A machine stores some other information enabling [warm -# restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts) -# for some models, but we won't go into that here. You are allowed to -# access and mutate the `model` parameter: - -mach.model.min_samples_split = 10 -fit!(mach, rows=train_rows) # re-train with new hyperparameter - -# Now we can make predictions on some other view of the data, as in - -predict(mach, rows=71:73) - -# or on completely new data, as in - -Xnew = (sepal_length = [5.1, 6.3], - sepal_width = [3.0, 2.5], - petal_length = [1.4, 4.9], - petal_width = [0.3, 1.5]) -yhat = predict(mach, Xnew) - -# These are probabilistic predictions which can be manipulated using a -# widely adopted interface defined in the Distributions.jl -# package. For example, we can get raw probabilities like this: - -pdf.(yhat, "virginica") - - -# We now turn to the Telco dataset. - - -# ## Getting the Telco data - -import DataFrames - -#- - -data = OpenML.load(42178) # data set from OpenML.org -df0 = DataFrames.DataFrame(data) -first(df0, 4) - -# The object of this tutorial is to build and evaluate supervised -# learning models to predict the `:Churn` variable, a binary variable -# measuring customer retention, based on other variables that are -# relevant. - -# In the table, observations correspond to rows, and features to -# columns, which is the convention for representing all -# two-dimensional data in MLJ. - - -# ## Type coercion - -# > Introduces: `scitype`, `schema`, `coerce` - -# A ["scientific -# type"](https://juliaai.github.io/ScientificTypes.jl/dev/) or -# *scitype* indicates how MLJ will *interpret* data. For example, -# `typeof(3.14) == Float64`, while `scitype(3.14) == Continuous` and -# also `scitype(3.14f0) == Continuous`. In MLJ, model data -# requirements are articulated using scitypes. 
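# A couple more quick checks one might run at the REPL (the literal values
# below are arbitrary illustrations, not part of the Telco data):

scitype(42)          # Count
scitype("a string")  # Textual
scitype(missing)     # Missing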
- -# Here are common "scalar" scitypes: - -# ![](assets/scitypes.png) - -# There are also container scitypes. For example, the scitype of any -# `N`-dimensional array is `AbstractArray{S, N}`, where `S` is the scitype of the -# elements: - -scitype(["cat", "mouse", "dog"]) - -# The `schema` operator summarizes the column scitypes of a table: - -schema(df0) |> DataFrames.DataFrame # converted to DataFrame for better display - -# All of the fields being interpreted as `Textual` are really -# something else, either `Multiclass` or, in the case of -# `:TotalCharges`, `Continuous`. In fact, `:TotalCharges` is -# mostly floats wrapped as strings. However, it needs special -# treatment because some elements consist of a single space, " ", -# which we'll treat as "0.0". - -fix_blanks(v) = map(v) do x - if x == " " - return "0.0" - else - return x - end -end - -df0.TotalCharges = fix_blanks(df0.TotalCharges); - -# Coercing the `:TotalCharges` type to ensure a `Continuous` scitype: - -coerce!(df0, :TotalCharges => Continuous); - -# Coercing all remaining `Textual` data to `Multiclass`: - -coerce!(df0, Textual => Multiclass); - -# Finally, we'll coerce our target variable `:Churn` to be -# `OrderedFactor`, rather than `Multiclass`, to enable a reliable -# interpretation of metrics like "true positive rate". By convention, -# the first class is the negative one: - -coerce!(df0, :Churn => OrderedFactor) -levels(df0.Churn) # to check order - -# Re-inspecting the scitypes: - -schema(df0) |> DataFrames.DataFrame - - -# ## Preparing a holdout set for final testing - -# > Introduces: `partition` - -# To reduce training times for the purposes of this tutorial, we're -# going to dump 90% of observations (after shuffling) and split off -# 30% of the remainder for use as a lock-and-throw-away-the-key -# holdout set: - -df, df_test, df_dumped = partition(df0, 0.07, 0.03, # in ratios 7:3:90 - stratify=df0.Churn, - rng=123); - -# The reader interested in including all data can instead do -# `df, df_test = partition(df0, 0.7, stratify=df0.Churn, rng=123)`. - - -# ## Splitting data into target and features - -# > Introduces: `unpack` - -# In the following call, the column with name `:Churn` is copied over -# to a vector `y`, and every remaining column, except `:customerID` -# (which contains no useful information) goes into a table `X`. Here -# `:Churn` is the target variable for which we seek predictions, given -# new versions of the features `X`. - -const y, X = unpack(df, ==(:Churn), !=(:customerID)); -schema(X).names - -#- - -intersect([:Churn, :customerID], schema(X).names) - -# We'll do the same for the holdout data: - -const ytest, Xtest = unpack(df_test, ==(:Churn), !=(:customerID)); - -# ## Loading a model and checking type requirements - -# > Introduces: `@load`, `input_scitype`, `target_scitype` - -# For tools helping us to identify suitable models, see the [Model -# Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search) -# section of the manual. We will build a gradient tree-boosting model, -# a popular first choice for structured data like we have here. Model -# code is contained in a third-party package called -# [EvoTrees.jl](https://github.com/Evovest/EvoTrees.jl) which is -# loaded as follows: - -Booster = @load EvoTreeClassifier pkg=EvoTrees - -# Recall that a *model* is just a container for some algorithm's -# hyperparameters. 
Let's create a `Booster` with default values for -# the hyperparameters: - -booster = Booster() - -# This model is appropriate for the kind of target variable we have because of -# the following passing test: - -scitype(y) <: target_scitype(booster) - -# However, our features `X` cannot be directly used with `booster`: - -scitype(X) <: input_scitype(booster) - -# As it turns out, this is because `booster`, like the majority of MLJ -# supervised models, expects the features to be `Continuous`. (With -# some experience, this can be gleaned from `input_scitype(booster)`.) -# So we need categorical feature encoding, discussed next. - - -# ## Building a model pipeline to incorporate feature encoding - -# > Introduces: `ContinuousEncoder`, pipeline operator `|>` - -# The built-in `ContinuousEncoder` model transforms an arbitrary table -# to a table whose features are all `Continuous` (dropping any fields -# it does not know how to encode). In particular, all `Multiclass` -# features are one-hot encoded. - -# A *pipeline* is a stand-alone model that internally combines one or -# more models in a linear (non-branching) pipeline. Here's a pipeline -# that adds the `ContinuousEncoder` as a pre-processor to the -# gradient tree-boosting model above: - -pipe = ContinuousEncoder() |> booster - -# Note that the component models appear as hyperparameters of -# `pipe`. Pipelines are an implementation of a more general [model -# composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models) -# interface provided by MLJ that advanced users may want to learn about. - -# From the above display, we see that component model hyperparameters -# are now *nested*, but they are still accessible (important in hyperparameter -# optimization): - -pipe.evo_tree_classifier.max_depth - - -# ## Evaluating the pipeline model's performance - -# > Introduces: `measures` (function), **measures:** `brier_loss`, `auc`, `accuracy`; -# > `machine`, `fit!`, `predict`, `fitted_params`, `report`, `roc`, **resampling strategy** `StratifiedCV`, `evaluate`, `FeatureSelector` - -# Without touching our test set `Xtest`, `ytest`, we will estimate the -# performance of our pipeline model, with default hyperparameters, in -# two different ways: - -# **Evaluating by hand.** First, we'll do this "by hand" using the `fit!` and `predict` -# workflow illustrated for the iris data set above, using a -# holdout resampling strategy. At the same time we'll see how to -# generate a **confusion matrix**, **ROC curve**, and inspect -# **feature importances**. - -# **Automated performance evaluation.** Next we'll apply the more -# typical and convenient `evaluate` workflow, but using `StratifiedCV` -# (stratified cross-validation) which is more informative. - -# In any case, we need to choose some measures (metrics) to quantify -# the performance of our model. For a complete list of measures, one -# does `measures()`. Or we also can do: - -measures("Brier") - -# We will be primarily using `brier_loss`, but also `auc` (area under -# the ROC curve) and `accuracy`. - - -# ### Evaluating by hand (with a holdout set) - -# Our pipeline model can be trained just like the decision tree model -# we built for the iris data set. Binding all non-test data to the -# pipeline model: - -mach_pipe = machine(pipe, X, y) - -# We already encountered the `partition` method above. Here we apply -# it to row indices, instead of data containers, as `fit!` and -# `predict` only need a *view* of the data to work. 
- -train, validation = partition(1:length(y), 0.7) -fit!(mach_pipe, rows=train) - -# We note in passing that we can access two kinds of information from a trained machine: - -# - The **learned parameters** (eg, coefficients of a linear model): We use `fitted_params(mach_pipe)` -# - Other **by-products of training** (eg, feature importances): We use `report(mach_pipe)` - -fp = fitted_params(mach_pipe); -keys(fp) - -# For example, we can check that the encoder did not actually drop any features: - -Set(fp.continuous_encoder.features_to_keep) == Set(schema(X).names) - -# And, from the report, extract feature importances: - -rpt = report(mach_pipe) -keys(rpt.evo_tree_classifier) - -#- - -fi = rpt.evo_tree_classifier.feature_importances -feature_importance_table = - (feature=Symbol.(first.(fi)), importance=last.(fi)) |> DataFrames.DataFrame - -# For models not reporting feature importances, we recommend the -# [Shapley.jl](https://expandingman.gitlab.io/Shapley.jl/) package. - -# Returning to predictions and evaluations of our measures: - -ŷ = predict(mach_pipe, rows=validation); -@info("Measurements", - brier_loss(ŷ, y[validation]) |> mean, - auc(ŷ, y[validation]), - accuracy(mode.(ŷ), y[validation]) - ) - -# Note that we need `mode` in the last case because `accuracy` expects -# point predictions, not probabilistic ones. (One can alternatively -# use `predict_mode` to generate the predictions.) - -# While we're here, lets also generate a **confusion matrix** and -# [receiver-operator -# characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) -# (ROC): - -confmat(mode.(ŷ), y[validation]) - -# Note: Importing the plotting package and calling the plotting -# functions for the first time can take a minute or so. - -using Plots - -#- - -roc_curve = roc(ŷ, y[validation]) -plt = scatter(roc_curve, legend=false) -plot!(plt, xlab="false positive rate", ylab="true positive rate") -plot!([0, 1], [0, 1], linewidth=2, linestyle=:dash, color=:black) - - -# ### Automated performance evaluation (more typical workflow) - -# We can also get performance estimates with a single call to the -# `evaluate` function, which also allows for more complicated -# resampling - in this case stratified cross-validation. To make this -# more comprehensive, we set `repeats=3` below to make our -# cross-validation "Monte Carlo" (3 random size-6 partitions of the -# observation space, for a total of 18 folds) and set -# `acceleration=CPUThreads()` to parallelize the computation. - -# We choose a `StratifiedCV` resampling strategy; the complete list of options is -# [here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies). - -e_pipe = evaluate(pipe, X, y, - resampling=StratifiedCV(nfolds=6, rng=123), - measures=[brier_loss, auc, accuracy], - repeats=3, - acceleration=CPUThreads()) - -# (There is also a version of `evaluate` for machines. Query the -# `evaluate` and `evaluate!` doc-strings to learn more about these -# functions and what the `PerformanceEvaluation` object `e_pipe` records.) - -# While [less than ideal](https://arxiv.org/abs/2104.00673), let's -# adopt the common practice of using the standard error of a -# cross-validation score as an estimate of the uncertainty of a -# performance measure's expected value. 
Here's a utility function to -# calculate 95% confidence intervals for our performance estimates based -# on this practice, and it's application to the current evaluation: - -using Measurements - -#- - -function confidence_intervals(e) - factor = 2.0 # to get level of 95% - measure = e.measure - nfolds = length(e.per_fold[1]) - measurement = [e.measurement[j] ± factor*std(e.per_fold[j])/sqrt(nfolds - 1) - for j in eachindex(measure)] - table = (measure=measure, measurement=measurement) - return DataFrames.DataFrame(table) -end - -const confidence_intervals_basic_model = confidence_intervals(e_pipe) - - -# ## Filtering out unimportant features - -# > Introduces: `FeatureSelector` - -# Before continuing, we'll modify our pipeline to drop those features -# with low feature importance, to speed up later optimization: - -unimportant_features = filter(:importance => <(0.005), feature_importance_table).feature - -pipe2 = ContinuousEncoder() |> - FeatureSelector(features=unimportant_features, ignore=true) |> booster - - -# ## Wrapping our iterative model in control strategies - -# > Introduces: **control strategies:** `Step`, `NumberSinceBest`, `TimeLimit`, `InvalidValue`, **model wrapper** `IteratedModel`, **resampling strategy:** `Holdout` - -# We want to optimize the hyperparameters of our model. Since our -# model is iterative, these parameters include the (nested) iteration -# parameter `pipe.evo_tree_classifier.nrounds`. Sometimes this -# parameter is optimized first, fixed, and then maybe optimized again -# after the other parameters. Here we take a more principled approach, -# **wrapping our model in a control strategy** that makes it -# "self-iterating". The strategy applies a stopping criterion to -# *out-of-sample* estimates of the model performance, constructed -# using an internally constructed holdout set. In this way, we avoid -# some data hygiene issues, and, when we subsequently optimize other -# parameters, we will always being using an optimal number of -# iterations. - -# Note that this approach can be applied to any iterative MLJ model, -# eg, the neural network models provided by -# [MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl). - -# First, we select appropriate controls from [this -# list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided): - -controls = [ - Step(1), # to increment iteration parameter (`pipe.nrounds`) - NumberSinceBest(4), # main stopping criterion - TimeLimit(2/3600), # never train more than 2 sec - InvalidValue() # stop if NaN or ±Inf encountered -] - -# Now we wrap our pipeline model using the `IteratedModel` wrapper, -# being sure to specify the `measure` on which internal estimates of -# the out-of-sample performance will be based: - -iterated_pipe = IteratedModel(model=pipe2, - controls=controls, - measure=brier_loss, - resampling=Holdout(fraction_train=0.7)) - -# We've set `resampling=Holdout(fraction_train=0.7)` to arrange that -# data attached to our model should be internally split into a train -# set (70%) and a holdout set (30%) for determining the out-of-sample -# estimate of the Brier loss. 
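# As an aside, here is a rough, hand-rolled sketch of the same idea, included
# purely for illustration. It is *not* how `IteratedModel` is implemented, and
# the names `train_rows`, `holdout_rows` and `losses` are ours, not MLJ API:

train_rows, holdout_rows = partition(1:length(y), 0.7)
mach_manual = machine(pipe2, X, y)
losses = Float64[]
for n in 1:30
    mach_manual.model.evo_tree_classifier.nrounds = n   # grow the booster by hand
    fit!(mach_manual, rows=train_rows, verbosity=0)     # train on the 70% split
    ŷ_n = predict(mach_manual, rows=holdout_rows)        # out-of-sample predictions
    push!(losses, brier_loss(ŷ_n, y[holdout_rows]) |> mean)
end
argmin(losses)  # a crude guess at a good `nrounds`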
- -# For demonstration purposes, let's bind `iterated_model` to all data -# not in our don't-touch holdout set, and train on all of that data: - -mach_iterated_pipe = machine(iterated_pipe, X, y) -fit!(mach_iterated_pipe); - -# To recap, internally this training is split into two separate steps: - -# - A controlled iteration step, training on the holdout set, with the total number of iterations determined by the specified stopping criteria (based on the out-of-sample performance estimates) -# - A final step that trains the atomic model on *all* available -# data using the number of iterations determined in the first step. Calling `predict` on `mach_iterated_pipe` means using the learned parameters of the second step. - - -# ## Hyper-parameter optimization (model tuning) - -# > Introduces: `range`, **model wrapper** `TunedModel`, `RandomSearch` - -# We now turn to hyperparameter optimization. A tool not discussed -# here is the `learning_curve` function, which can be useful when -# wanting to visualize the effect of changes to a *single* -# hyperparameter (which could be an iteration parameter). See, for -# example, [this section of the -# manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/) -# or [this -# tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb). - -# Fine tuning the hyperparameters of a gradient booster can be -# somewhat involved. Here we settle for simultaneously optimizing two -# key parameters: `max_depth` and `η` (learning_rate). - -# Like iteration control, **model optimization in MLJ is implemented as -# a model wrapper**, called `TunedModel`. After wrapping a model in a -# tuning strategy and binding the wrapped model to data in a machine -# called `mach`, calling `fit!(mach)` instigates a search for optimal -# model hyperparameters, within a specified range, and then uses all -# supplied data to train the best model. To predict using that model, -# one then calls `predict(mach, Xnew)`. In this way the wrapped model -# may be viewed as a "self-tuning" version of the unwrapped -# model. That is, wrapping the model simply transforms certain -# hyperparameters into learned parameters (just as `IteratedModel` -# does for an iteration parameter). - -# To start with, we define ranges for the parameters of -# interest. Since these parameters are nested, let's force a -# display of our model to a larger depth: - -show(iterated_pipe, 2) - -#- - -p1 = :(model.evo_tree_classifier.η) -p2 = :(model.evo_tree_classifier.max_depth) - -r1 = range(iterated_pipe, p1, lower=-2, upper=-0.5, scale=x->10^x) -r2 = range(iterated_pipe, p2, lower=2, upper=6) - -# Nominal ranges are defined by specifying `values` instead of `lower` -# and `upper`. - -# Next, we choose an optimization strategy from [this -# list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models): - -tuning = RandomSearch(rng=123) - -# Then we wrap the model, specifying a `resampling` strategy and a -# `measure`, as we did for `IteratedModel`. In fact, we can include a -# battery of `measures`; by default, optimization is with respect to -# performance estimates based on the first measure, but estimates for -# all measures can be accessed from the model's `report`. - -# The keyword `n` specifies the total number of models (sets of -# hyperparameters) to evaluate. 
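# For completeness, here is what a *nominal* range (mentioned earlier) looks
# like. This is purely to show the syntax; `drop_last` only takes the values
# `true` and `false`, and we do not include this range in the search below:

r3 = range(iterated_pipe, :(model.continuous_encoder.drop_last), values=[true, false])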
- -tuned_iterated_pipe = TunedModel(model=iterated_pipe, - range=[r1, r2], - tuning=tuning, - measures=[brier_loss, auc, accuracy], - resampling=StratifiedCV(nfolds=6, rng=123), - acceleration=CPUThreads(), - n=40) - -# To save time, we skip the `repeats` here. - -# Binding our final model to data and training: - -mach_tuned_iterated_pipe = machine(tuned_iterated_pipe, X, y) -fit!(mach_tuned_iterated_pipe) - -# As explained above, the training we have just performed was split -# internally into two separate steps: - -# - A step to determine the parameter values that optimize the aggregated cross-validation scores -# - A final step that trains the optimal model on *all* available data. Future predictions `predict(mach_tuned_iterated_pipe, ...)` are based on this final training step. - -# From `report(mach_tuned_iterated_pipe)` we can extract details about -# the optimization procedure. For example: - -rpt2 = report(mach_tuned_iterated_pipe); -best_booster = rpt2.best_model.model.evo_tree_classifier - -#- - -@info "Optimal hyperparameters:" best_booster.max_depth best_booster.η; - -# Using the `confidence_intervals` function we defined earlier: - -e_best = rpt2.best_history_entry -confidence_intervals(e_best) - -# Digging a little deeper, we can learn what stopping criterion was -# applied in the case of the optimal model, and how many iterations -# were required: - -rpt2.best_report.controls |> collect - -# Finally, we can visualize the optimization results: - -plot(mach_tuned_iterated_pipe, size=(600,450)) - - -# ## Saving our model - -# > Introduces: `MLJ.save` - -# Here's how to serialize our final, trained self-iterating, -# self-tuning pipeline machine: - -MLJ.save("tuned_iterated_pipe.jlso", mach_tuned_iterated_pipe) - - -# We'll deserialize this in "Testing the final model" below. - -# ## Final performance estimate - -# Finally, to get an even more accurate estimate of performance, we -# can evaluate our model using stratified cross-validation and all the -# data attached to our machine. Because this evaluation implies -# [nested -# resampling](https://mlr.mlr-org.com/articles/tutorial/nested_resampling.html), -# this computation takes quite a bit longer than the previous one -# (which is being repeated six times, using 5/6th of the data each -# time): - -e_tuned_iterated_pipe = evaluate(tuned_iterated_pipe, X, y, - resampling=StratifiedCV(nfolds=6, rng=123), - measures=[brier_loss, auc, accuracy]) - -#- - -confidence_intervals(e_tuned_iterated_pipe) - -# For comparison, here are the confidence intervals for the basic -# pipeline model (no feature selection and default hyperparameters): - -confidence_intervals_basic_model - -# As each pair of intervals overlap, it's doubtful the small changes -# here can be assigned statistical significance. Default `booster` -# hyperparameters do a pretty good job. - - -# ## Testing the final model - -# We now determine the performance of our model on our -# lock-and-throw-away-the-key holdout set. To demonstrate -# deserialization, we'll pretend we're in a new Julia session (but -# have called `import`/`using` on the same packages). 
Then the -# following should suffice to recover our model trained under -# "Hyper-parameter optimization" above: - -mach_restored = machine("tuned_iterated_pipe.jlso") - -# We compute predictions on the holdout set: - -ŷ_tuned = predict(mach_restored, Xtest); -ŷ_tuned[1] - -# And can compute the final performance measures: - -@info("Tuned model measurements on test:", - brier_loss(ŷ_tuned, ytest) |> mean, - auc(ŷ_tuned, ytest), - accuracy(mode.(ŷ_tuned), ytest) - ) - -# For comparison, here's the performance for the basic pipeline model - -mach_basic = machine(pipe, X, y) -fit!(mach_basic, verbosity=0) - -ŷ_basic = predict(mach_basic, Xtest); - -@info("Basic model measurements on test set:", - brier_loss(ŷ_basic, ytest) |> mean, - auc(ŷ_basic, ytest), - accuracy(mode.(ŷ_basic), ytest) - ) diff --git a/examples/telco/notebook.pluto.jl b/examples/telco/notebook.pluto.jl deleted file mode 100644 index 345837476..000000000 --- a/examples/telco/notebook.pluto.jl +++ /dev/null @@ -1,1321 +0,0 @@ -### A Pluto.jl notebook ### -# v0.16.0 - -using Markdown -using InteractiveUtils - -# ╔═╡ f0cc864c-8b26-441f-9bca-7c69b794f8ce -md"# MLJ for Data Scientists in Two Hours" - -# ╔═╡ 8a6670b8-96a8-4a5d-b795-033f6f2a0674 -md""" -An application of the [MLJ -toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the -Telco Customer Churn dataset, aimed at practicing data scientists -new to MLJ (Machine Learning in Julia). This tutorial does not -cover exploratory data analysis. -""" - -# ╔═╡ aa49e638-95dc-4249-935f-ddf6a6bfbbdd -md""" -MLJ is a *multi-paradigm* machine learning toolbox (i.e., not just -deep-learning). -""" - -# ╔═╡ b04c4790-59e0-42a3-af2a-25235e544a31 -md""" -For other MLJ learning resources see the [Learning -MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/) -section of the -[manual](https://juliaai.github.io/MLJ.jl/dev/). -""" - -# ╔═╡ 4eb8dff4-c23a-4b41-8af5-148d95ea2900 -md""" -**Topics covered**: Grabbing and preparing a dataset, basic -fit/predict workflow, constructing a pipeline to include data -pre-processing, estimating performance metrics, ROC curves, confusion -matrices, feature importance, basic feature selection, controlling iterative -models, hyperparameter optimization (tuning). -""" - -# ╔═╡ a583d175-d623-4888-a6bf-47194d7e8e12 -md""" -**Prerequisites for this tutorial.** Previous experience building, -evaluating, and optimizing machine learning models using -scikit-learn, caret, MLR, weka, or similar tool. No previous -experience with MLJ. Only fairly basic familiarity with Julia is -required. Uses -[DataFrames.jl](https://dataframes.juliadata.org/stable/) but in a -minimal way ([this -cheatsheet](https://ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/index.html) -may help). -""" - -# ╔═╡ 4830fc64-3d70-4869-828a-4cc485149963 -md"**Time.** Between two and three hours, first time through." 
- -# ╔═╡ 28197138-d6b7-433c-9e54-3f7b3ca87ecb -md"## Summary of methods and types introduced" - -# ╔═╡ aad8dc13-c0f9-4090-ba1f-6363f43ec697 -md""" -|code | purpose| -|:-------|:-------------------------------------------------------| -| `OpenML.load(id)` | grab a dataset from [OpenML.org](https://www.openml.org)| -| `scitype(X)` | inspect the scientific type (scitype) of object `X`| -| `schema(X)` | inspect the column scitypes (scientific types) of a table `X`| -| `coerce(X, ...)` | fix column encodings to get appropriate scitypes| -| `partition(data, frac1, frac2, ...; rng=...)` | vertically split `data`, which can be a table, vector or matrix| -| `unpack(table, f1, f2, ...)` | horizontally split `table` based on conditions `f1`, `f2`, ..., applied to column names| -| `@load ModelType pkg=...` | load code defining a model type| -| `input_scitype(model)` | inspect the scitype that a model requires for features (inputs)| -| `target_scitype(model)`| inspect the scitype that a model requires for the target (labels)| -| `ContinuousEncoder` | built-in model type for re-encoding all features as `Continuous`| -| `model1 ∣> model2 ∣> ...` | combine multiple models into a pipeline| -| `measures("under curve")` | list all measures (metrics) with string "under curve" in documentation| -| `accuracy(yhat, y)` | compute accuracy of predictions `yhat` against ground truth observations `y`| -| `auc(yhat, y)`, `brier_loss(yhat, y)` | evaluate two probabilistic measures (`yhat` a vector of probability distributions)| -| `machine(model, X, y)` | bind `model` to training data `X` (features) and `y` (target)| -| `fit!(mach, rows=...)` | train machine using specified rows (observation indices)| -| `predict(mach, rows=...)`, | make in-sample model predictions given specified rows| -| `predict(mach, Xnew)` | make predictions given new features `Xnew`| -| `fitted_params(mach)` | inspect learned parameters| -| `report(mach)` | inspect other outcomes of training| -| `confmat(yhat, y)` | confusion matrix for predictions `yhat` and ground truth `y`| -| `roc(yhat, y)` | compute points on the receiver-operator Characteristic| -| `StratifiedCV(nfolds=6)` | 6-fold stratified cross-validation resampling strategy| -| `Holdout(fraction_train=0.7)` | holdout resampling strategy| -| `evaluate(model, X, y; resampling=..., options...)` | estimate performance metrics `model` using the data `X`, `y`| -| `FeatureSelector()` | transformer for selecting features| -| `Step(3)` | iteration control for stepping 3 iterations| -| `NumberSinceBest(6)`, `TimeLimit(60/5), InvalidValue()` | iteration control stopping criteria| -| `IteratedModel(model=..., controls=..., options...)` | wrap an iterative `model` in control strategies| -| `range(model, :some_hyperparam, lower=..., upper=...)` | define a numeric range| -| `RandomSearch()` | random search tuning strategy| -| `TunedModel(model=..., tuning=..., options...)` | wrap the supervised `model` in specified `tuning` strategy| -""" - -# ╔═╡ 7c0464a0-4114-46bf-95ea-2955abd45275 -md"## Instantiate a Julia environment" - -# ╔═╡ 256869d0-5e0c-42af-b1b3-dd47e367ba54 -md""" -The following code replicates precisely the set of Julia packages -used to develop this tutorial. If this is your first time running -the notebook, package instantiation and pre-compilation may take a -minute or so to complete. **This step will fail** if the [correct -Manifest.toml and Project.toml -files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco) -are not in the same directory as this notebook. 
-""" - -# ╔═╡ 60fe49c1-3434-4a77-8d7e-8e449afd1c48 -begin - using Pkg - Pkg.activate(@__DIR__) # get env from TOML files in same directory as this notebook - Pkg.instantiate() -end - -# ╔═╡ e8c13e9d-7910-4a0e-a949-7f3bd292fe31 -md"## Warm up: Building a model for the iris dataset" - -# ╔═╡ 6bf6ef98-302f-478c-8514-99938a2932db -md""" -Before turning to the Telco Customer Churn dataset, we very quickly -build a predictive model for Fisher's well-known iris data set, as way of -introducing the main actors in any MLJ workflow. Details that you -don't fully grasp should become clearer in the Telco study. -""" - -# ╔═╡ 33ca287e-8cba-47d1-a0de-1721c1bc2df2 -md""" -This section is a condensed adaption of the [Getting Started -example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict) -in the MLJ documentation. -""" - -# ╔═╡ d5b8bf1d-6c9a-46c1-bca8-7eec7950fd82 -md""" -First, using the built-in iris dataset, we load and inspect the features -`X_iris` (a table) and target variable `y_iris` (a vector): -""" - -# ╔═╡ 9b7b9ade-9318-4b95-9873-1b1430e635cc -using MLJ - -# ╔═╡ 4f3f061d-259d-4479-b43e-d0ebe87da176 -begin - const X_iris, y_iris = @load_iris; - schema(X_iris) -end - -# ╔═╡ a546f372-4ae0-48c2-9009-4cfb2012998f -y_iris[1:4] - -# ╔═╡ f8f79dd2-b6de-48e8-abd3-cb77d7a79683 -levels(y_iris) - -# ╔═╡ 617d63f6-8a62-40db-879e-dc128f3db7b3 -md"We load a decision tree model, from the package DecisionTree.jl:" - -# ╔═╡ ceda7ed0-3b98-44b3-a367-9aa3c6cf34d0 -begin - DecisionTree = @load DecisionTreeClassifier pkg=DecisionTree # model type - model = DecisionTree(min_samples_split=5) # model instance -end - -# ╔═╡ e69cd764-e4b8-4ff8-bf32-94657e65284e -md""" -In MLJ, a *model* is just a container for hyperparameters of -some learning algorithm. It does not store learned parameters. -""" - -# ╔═╡ 4a175e0f-4b87-4b53-9afd-65a5b5facac9 -md""" -Next, we bind the model together with the available data in what's -called a *machine*: -""" - -# ╔═╡ 3c23e9f8-49bd-4cde-b6c8-54f5ed90a964 -mach = machine(model, X_iris, y_iris) - -# ╔═╡ 6ad971c7-9fe5-45a9-9292-fe822525fc77 -md""" -A machine is essentially just a model (ie, hyperparameters) plus data, but -it additionally stores *learned parameters* (the tree) once it is -trained on some view of the data: -""" - -# ╔═╡ 63151d31-8428-4ebd-ae5e-4b83dcbc9675 -begin - train_rows = vcat(1:60, 91:150); # some row indices (observations are rows not columns) - fit!(mach, rows=train_rows) - fitted_params(mach) -end - -# ╔═╡ 0f978839-cc95-4c3a-8a29-32f11452654a -md""" -A machine stores some other information enabling [warm -restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts) -for some models, but we won't go into that here. 
You are allowed to -access and mutate the `model` parameter: -""" - -# ╔═╡ 5edc98cd-df7d-428a-a5f1-151fcbe229a2 -begin - mach.model.min_samples_split = 10 - fit!(mach, rows=train_rows) # re-train with new hyperparameter -end - -# ╔═╡ 1848109f-c94c-4cdd-81bc-2d5603785a09 -md"Now we can make predictions on some other view of the data, as in" - -# ╔═╡ eee948b7-2b42-439e-9d87-1d1fbb0e3997 -predict(mach, rows=71:73) - -# ╔═╡ 7c98e10b-1fa3-4ab6-b950-fddef2a1fb10 -md"or on completely new data, as in" - -# ╔═╡ 413d9c3c-a68e-4bf0-951b-64cb2a36c8e3 -begin - Xnew = (sepal_length = [5.1, 6.3], - sepal_width = [3.0, 2.5], - petal_length = [1.4, 4.9], - petal_width = [0.3, 1.5]) - yhat = predict(mach, Xnew) -end - -# ╔═╡ eb79663c-c671-4c4c-b0e6-441461cc8770 -md""" -These are probabilistic predictions which can be manipulated using a -widely adopted interface defined in the Distributions.jl -package. For example, we can get raw probabilities like this: -""" - -# ╔═╡ ae9e9377-552c-4186-8cb0-de601961bc02 -pdf.(yhat, "virginica") - -# ╔═╡ 5c70ee06-edb9-4789-a87d-950c50fb2955 -md"We now turn to the Telco dataset." - -# ╔═╡ bde7bc43-a4bb-4a42-8448-4bcb089096cd -md"## Getting the Telco data" - -# ╔═╡ d5a9e9b8-c67e-4bdd-a012-a84240254fb6 -import DataFrames - -# ╔═╡ 8de2c7b4-1950-4f05-bbdd-9799f7bb2e60 -begin - data = OpenML.load(42178) # data set from OpenML.org - df0 = DataFrames.DataFrame(data) - first(df0, 4) -end - -# ╔═╡ 768f369a-5f3d-4dcc-97a7-88e3af4f10ee -md""" -The object of this tutorial is to build and evaluate supervised -learning models to predict the `:Churn` variable, a binary variable -measuring customer retention, based on other variables that are -relevant. -""" - -# ╔═╡ bc4c0eb9-3b5f-49fc-b372-8c9866e51852 -md""" -In the table, observations correspond to rows, and features to -columns, which is the convention for representing all -two-dimensional data in MLJ. -""" - -# ╔═╡ f04becbb-42a6-409f-8f3d-b82f1e7b6f7a -md"## Type coercion" - -# ╔═╡ 1d08ed75-5377-4971-ab04-579e5608ae53 -md"> Introduces: `scitype`, `schema`, `coerce`" - -# ╔═╡ eea585d4-d55b-4ccc-86cf-35420d9e6995 -md""" -A ["scientific -type"](https://juliaai.github.io/ScientificTypes.jl/dev/) or -*scitype* indicates how MLJ will *interpret* data. For example, -`typeof(3.14) == Float64`, while `scitype(3.14) == Continuous` and -also `scitype(3.14f0) == Continuous`. In MLJ, model data -requirements are articulated using scitypes. -""" - -# ╔═╡ ca482134-299b-459a-a29a-8d0445351914 -md"Here are common \"scalar\" scitypes:" - -# ╔═╡ 5c7c97e4-fd8b-4ae5-be65-28d0fcca50a8 -md"![](assets/scitypes.png)" - -# ╔═╡ fd103be7-6fc9-43bc-9a2e-c468345d87d1 -md""" -There are also container scitypes. For example, the scitype of any -`N`-dimensional array is `AbstractArray{S, N}`, where `S` is the scitype of the -elements: -""" - -# ╔═╡ 9d2f0b19-2942-47ac-b5fa-cb79ebf595ef -scitype(["cat", "mouse", "dog"]) - -# ╔═╡ 1d18aca8-11f4-4104-91c5-524da38aa391 -md"The `schema` operator summarizes the column scitypes of a table:" - -# ╔═╡ accd3a09-5371-45ab-ad90-b4405b216771 -schema(df0) |> DataFrames.DataFrame # converted to DataFrame for better display - -# ╔═╡ bde0406f-3964-42b8-895b-997a92b731e0 -md""" -All of the fields being interpreted as `Textual` are really -something else, either `Multiclass` or, in the case of -`:TotalCharges`, `Continuous`. In fact, `:TotalCharges` is -mostly floats wrapped as strings. However, it needs special -treatment because some elements consist of a single space, " ", -which we'll treat as "0.0". 
-""" - -# ╔═╡ 25e60566-fc64-4f35-a525-e9b04a4bd246 -begin - fix_blanks(v) = map(v) do x - if x == " " - return "0.0" - else - return x - end - end - - df0.TotalCharges = fix_blanks(df0.TotalCharges); -end - -# ╔═╡ 441317a8-3b72-4bf1-80f0-ba7781e173cf -md"Coercing the `:TotalCharges` type to ensure a `Continuous` scitype:" - -# ╔═╡ d2f2b67e-d06e-4c0b-9cbc-9a2c39793333 -coerce!(df0, :TotalCharges => Continuous); - -# ╔═╡ 7c93b820-cbce-48be-b887-2b16710e5502 -md"Coercing all remaining `Textual` data to `Multiclass`:" - -# ╔═╡ 5ac4a13e-9983-41dd-9452-5298a8a4a401 -coerce!(df0, Textual => Multiclass); - -# ╔═╡ 9f28e6c3-5f18-4fb2-b017-96f1602f2cad -md""" -Finally, we'll coerce our target variable `:Churn` to be -`OrderedFactor`, rather than `Multiclass`, to enable a reliable -interpretation of metrics like "true positive rate". By convention, -the first class is the negative one: -""" - -# ╔═╡ c9506e5b-c111-442c-8be2-2a4017c45345 -begin - coerce!(df0, :Churn => OrderedFactor) - levels(df0.Churn) # to check order -end - -# ╔═╡ 8c865df5-ac95-438f-a7ad-80d84f5b0070 -md"Re-inspecting the scitypes:" - -# ╔═╡ 22690828-0992-4ed0-8378-5aa686f0b407 -schema(df0) |> DataFrames.DataFrame - -# ╔═╡ 3bd753fc-163a-4b50-9f43-49763e8691a1 -md"## Preparing a holdout set for final testing" - -# ╔═╡ 7b12673f-cc63-4cd9-bb0e-3a45761c733a -md"> Introduces: `partition`" - -# ╔═╡ 3c98ab4e-47cd-4247-9979-7044b2f2df33 -md""" -To reduce training times for the purposes of this tutorial, we're -going to dump 90% of observations (after shuffling) and split off -30% of the remainder for use as a lock-and-throw-away-the-key -holdout set: -""" - -# ╔═╡ ea010966-bb7d-4c79-b2a1-fbbde543f620 -df, df_test, df_dumped = partition(df0, 0.07, 0.03, # in ratios 7:3:90 - stratify=df0.Churn, - rng=123); - -# ╔═╡ ecb7c90a-4c93-483c-910b-f8a9a217eff2 -md""" -The reader interested in including all data can instead do -`df, df_test = partition(df0, 0.7, stratify=df0.Churn, rng=123)`. -""" - -# ╔═╡ 81aa1aa9-27d9-423f-aa36-d2e8546da46a -md"## Splitting data into target and features" - -# ╔═╡ fff47e76-d7ac-4377-88a1-a2bf91434413 -md"> Introduces: `unpack`" - -# ╔═╡ f1dbbc22-9844-4e1c-a1cb-54e34396a855 -md""" -In the following call, the column with name `:Churn` is copied over -to a vector `y`, and every remaining column, except `:customerID` -(which contains no useful information) goes into a table `X`. Here -`:Churn` is the target variable for which we seek predictions, given -new versions of the features `X`. -""" - -# ╔═╡ 20713fb3-3a3b-42e7-8037-0de5806e1a54 -begin - const y, X = unpack(df, ==(:Churn), !=(:customerID)); - schema(X).names -end - -# ╔═╡ 369cb359-8512-4f80-9961-a09632c33fb0 -intersect([:Churn, :customerID], schema(X).names) - -# ╔═╡ af484868-b29c-4808-b7cc-46f4ef988c68 -md"We'll do the same for the holdout data:" - -# ╔═╡ 8f3eda66-6183-4fe4-90fb-7f2721f6fcc7 -const ytest, Xtest = unpack(df_test, ==(:Churn), !=(:customerID)); - -# ╔═╡ 598142b5-0828-49ac-af65-ccd25ecb9818 -md"## Loading a model and checking type requirements" - -# ╔═╡ c5309d64-7409-4148-8890-979691212d9b -md"> Introduces: `@load`, `input_scitype`, `target_scitype`" - -# ╔═╡ f97969e2-c15c-42cf-a6fa-eaf14df5d44b -md""" -For tools helping us to identify suitable models, see the [Model -Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search) -section of the manual. We will build a gradient tree-boosting model, -a popular first choice for structured data like we have here. 
Model -code is contained in a third-party package called -[EvoTrees.jl](https://github.com/Evovest/EvoTrees.jl) which is -loaded as follows: -""" - -# ╔═╡ edf1d726-542a-4fc1-8025-5084804a9f6c -Booster = @load EvoTreeClassifier pkg=EvoTrees - -# ╔═╡ ee41a121-df0d-40ad-9e90-09023d201062 -md""" -Recall that a *model* is just a container for some algorithm's -hyperparameters. Let's create a `Booster` with default values for -the hyperparameters: -""" - -# ╔═╡ be6cef47-d585-45fe-b7bb-2f33ef765cc0 -booster = Booster() - -# ╔═╡ 452edd51-b878-46d0-9625-01172c4a0081 -md""" -This model is appropriate for the kind of target variable we have because of -the following passing test: -""" - -# ╔═╡ fdd20843-c981-41ad-af4f-11c7de9e21dd -scitype(y) <: target_scitype(booster) - -# ╔═╡ 2b25f9cb-d12d-4a6f-8dba-241d9b744683 -md"However, our features `X` cannot be directly used with `booster`:" - -# ╔═╡ b139c43b-382b-415a-a6e5-2f0b4dca5c59 -scitype(X) <: input_scitype(booster) - -# ╔═╡ 90a7fe9a-934f-445a-8550-20498aa03ed0 -md""" -As it turns out, this is because `booster`, like the majority of MLJ -supervised models, expects the features to be `Continuous`. (With -some experience, this can be gleaned from `input_scitype(booster)`.) -So we need categorical feature encoding, discussed next. -""" - -# ╔═╡ fc97e937-66ca-4434-9e7b-5b943cf6b560 -md"## Building a model pipeline to incorporate feature encoding" - -# ╔═╡ b1d95557-51fc-4f1e-bce6-0a1379cc1259 -md"> Introduces: `ContinuousEncoder`, pipeline operator `|>`" - -# ╔═╡ 38961daa-dd5f-4f97-9608-b5032c116833 -md""" -The built-in `ContinuousEncoder` model transforms an arbitrary table -to a table whose features are all `Continuous` (dropping any fields -it does not know how to encode). In particular, all `Multiclass` -features are one-hot encoded. -""" - -# ╔═╡ ccd3ffd0-ed41-497d-b473-4aa968e6937a -md""" -A *pipeline* is a stand-alone model that internally combines one or -more models in a linear (non-branching) pipeline. Here's a pipeline -that adds the `ContinuousEncoder` as a pre-processor to the -gradient tree-boosting model above: -""" - -# ╔═╡ 35cf2011-4fb6-4e7a-8d9e-644a1b3cc6b6 -pipe = ContinuousEncoder() |> booster - -# ╔═╡ 4b6e3239-fd47-495a-ac0a-23ded81445da -md""" -Note that the component models appear as hyperparameters of -`pipe`. Pipelines are an implementation of a more general [model -composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models) -interface provided by MLJ that advanced users may want to learn about. 
-""" - -# ╔═╡ ce9a3479-601a-472a-8535-1a088a6a3228 -md""" -From the above display, we see that component model hyperparameters -are now *nested*, but they are still accessible (important in hyperparameter -optimization): -""" - -# ╔═╡ 45c2d8f9-cbf9-4cda-a39f-7893c73eef39 -pipe.evo_tree_classifier.max_depth - -# ╔═╡ 232b7775-1f79-43f7-bcca-51a27994a151 -md"## Evaluating the pipeline model's performance" - -# ╔═╡ ac84c13f-391f-405a-9b33-09b4b6661170 -md""" -> Introduces: `measures` (function), **measures:** `brier_loss`, `auc`, `accuracy`; -> `machine`, `fit!`, `predict`, `fitted_params`, `report`, `roc`, **resampling strategy** `StratifiedCV`, `evaluate`, `FeatureSelector` -""" - -# ╔═╡ 442078e4-a695-471c-b45d-88d068bb0fa2 -md""" -Without touching our test set `Xtest`, `ytest`, we will estimate the -performance of our pipeline model, with default hyperparameters, in -two different ways: -""" - -# ╔═╡ 23fb37b0-08c1-4688-92c8-6db3a590d963 -md""" -**Evaluating by hand.** First, we'll do this "by hand" using the `fit!` and `predict` -workflow illustrated for the iris data set above, using a -holdout resampling strategy. At the same time we'll see how to -generate a **confusion matrix**, **ROC curve**, and inspect -**feature importances**. -""" - -# ╔═╡ 1def55f5-71fc-4257-abf5-96f457eb2bdf -md""" -**Automated performance evaluation.** Next we'll apply the more -typical and convenient `evaluate` workflow, but using `StratifiedCV` -(stratified cross-validation) which is more informative. -""" - -# ╔═╡ 2c4016c6-dc7e-44ec-8a60-2e0214c059f5 -md""" -In any case, we need to choose some measures (metrics) to quantify -the performance of our model. For a complete list of measures, one -does `measures()`. Or we also can do: -""" - -# ╔═╡ 236c098c-0189-4a96-a38a-beb5c7157b57 -measures("Brier") - -# ╔═╡ 9ffe1654-7e32-4c4b-81f5-507803ea9ed6 -md""" -We will be primarily using `brier_loss`, but also `auc` (area under -the ROC curve) and `accuracy`. -""" - -# ╔═╡ 48413a7b-d01b-4005-9b21-6d4eb642d87e -md"### Evaluating by hand (with a holdout set)" - -# ╔═╡ 84d7bcbc-df3c-4602-b98c-3df1f31879bf -md""" -Our pipeline model can be trained just like the decision tree model -we built for the iris data set. Binding all non-test data to the -pipeline model: -""" - -# ╔═╡ f8552404-f95f-4484-92b7-4ceba56e97ad -mach_pipe = machine(pipe, X, y) - -# ╔═╡ e0b1bffb-dbd5-4b0a-b122-214de244406c -md""" -We already encountered the `partition` method above. Here we apply -it to row indices, instead of data containers, as `fit!` and -`predict` only need a *view* of the data to work. 
-""" - -# ╔═╡ d8ad2c21-fd27-44fb-8a4b-c83694978e38 -begin - train, validation = partition(1:length(y), 0.7) - fit!(mach_pipe, rows=train) -end - -# ╔═╡ ceff13ba-a526-4ecb-a8b6-f7ff516dedc4 -md"We note in passing that we can access two kinds of information from a trained machine:" - -# ╔═╡ dcf0eb93-2368-4158-81e1-74ef03c2e79e -md""" -- The **learned parameters** (eg, coefficients of a linear model): We use `fitted_params(mach_pipe)` -- Other **by-products of training** (eg, feature importances): We use `report(mach_pipe)` -""" - -# ╔═╡ 442be3e3-b2e9-499d-a04e-0b76409c14a7 -begin - fp = fitted_params(mach_pipe); - keys(fp) -end - -# ╔═╡ f03fdd56-6c30-4d73-b979-3458f2f26667 -md"For example, we can check that the encoder did not actually drop any features:" - -# ╔═╡ 3afbe65c-18bf-4576-97e3-6677afc6ca9f -Set(fp.continuous_encoder.features_to_keep) == Set(schema(X).names) - -# ╔═╡ 527b193b-737d-4701-b10e-562ce21caa04 -md"And, from the report, extract feature importances:" - -# ╔═╡ b332842d-6397-45c6-8f78-ab549ef1544e -begin - rpt = report(mach_pipe) - keys(rpt.evo_tree_classifier) -end - -# ╔═╡ 3035e6d0-3801-424d-a8a4-a53f5149481e -begin - fi = rpt.evo_tree_classifier.feature_importances - feature_importance_table = - (feature=Symbol.(first.(fi)), importance=last.(fi)) |> DataFrames.DataFrame -end - -# ╔═╡ 8b67dcac-6a86-4502-8704-dbab8e09b4f1 -md""" -For models not reporting feature importances, we recommend the -[Shapley.jl](https://expandingman.gitlab.io/Shapley.jl/) package. -""" - -# ╔═╡ 9cd42f39-942e-4011-a02f-27b6c05e4d02 -md"Returning to predictions and evaluations of our measures:" - -# ╔═╡ 0fecce89-102f-416f-bea4-3d467d48781c -begin - ŷ = predict(mach_pipe, rows=validation); - @info("Measurements", - brier_loss(ŷ, y[validation]) |> mean, - auc(ŷ, y[validation]), - accuracy(mode.(ŷ), y[validation]) - ) -end - -# ╔═╡ 0b11a81e-42a7-4519-97c5-2979af8a507d -md""" -Note that we need `mode` in the last case because `accuracy` expects -point predictions, not probabilistic ones. (One can alternatively -use `predict_mode` to generate the predictions.) -""" - -# ╔═╡ e6f6fc92-f9e7-4472-b639-63c76c72c513 -md""" -While we're here, lets also generate a **confusion matrix** and -[receiver-operator -characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) -(ROC): -""" - -# ╔═╡ f3933688-1ea4-4239-8f5a-e1ee9eb5c15c -confmat(mode.(ŷ), y[validation]) - -# ╔═╡ ce90835f-6709-4c34-adc5-e671db8bca5d -md""" -Note: Importing the plotting package and calling the plotting -functions for the first time can take a minute or so. -""" - -# ╔═╡ 15996832-b736-45f4-86f1-0ea38de21abb -using Plots - -# ╔═╡ 3c4984d2-9e1a-450b-a55c-0cb44ab816d7 -begin - roc_curve = roc(ŷ, y[validation]) - plt = scatter(roc_curve, legend=false) - plot!(plt, xlab="false positive rate", ylab="true positive rate") - plot!([0, 1], [0, 1], linewidth=2, linestyle=:dash, color=:black) -end - -# ╔═╡ 27221b90-8506-4857-be86-92ecfd0d2343 -md"### Automated performance evaluation (more typical workflow)" - -# ╔═╡ 445146e5-e45e-450d-9cf1-facf39e3f302 -md""" -We can also get performance estimates with a single call to the -`evaluate` function, which also allows for more complicated -resampling - in this case stratified cross-validation. To make this -more comprehensive, we set `repeats=3` below to make our -cross-validation "Monte Carlo" (3 random size-6 partitions of the -observation space, for a total of 18 folds) and set -`acceleration=CPUThreads()` to parallelize the computation. 
-""" - -# ╔═╡ 562887bb-b7fb-430f-b61c-748aec38e674 -md""" -We choose a `StratifiedCV` resampling strategy; the complete list of options is -[here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies). -""" - -# ╔═╡ f9be989e-2604-44c2-9727-ed822e4fd85d -e_pipe = evaluate(pipe, X, y, - resampling=StratifiedCV(nfolds=6, rng=123), - measures=[brier_loss, auc, accuracy], - repeats=3, - acceleration=CPUThreads()) - -# ╔═╡ ff7cfc36-b9fc-4570-b2f2-e08965e5be66 -md""" -(There is also a version of `evaluate` for machines. Query the -`evaluate` and `evaluate!` doc-strings to learn more about these -functions and what the `PerformanceEvaluation` object `e_pipe` records.) -""" - -# ╔═╡ 45a4d300-a392-485c-897c-f79712f9ec7c -md""" -While [less than ideal](https://arxiv.org/abs/2104.00673), let's -adopt the common practice of using the standard error of a -cross-validation score as an estimate of the uncertainty of a -performance measure's expected value. Here's a utility function to -calculate 95% confidence intervals for our performance estimates based -on this practice, and it's application to the current evaluation: -""" - -# ╔═╡ 0f76a79f-8675-4ec1-a543-f9324a87efad -using Measurements - -# ╔═╡ fc641df4-693c-4007-8657-9fba0caf3cb7 -begin - function confidence_intervals(e) - factor = 2.0 # to get level of 95% - measure = e.measure - nfolds = length(e.per_fold[1]) - measurement = [e.measurement[j] ± factor*std(e.per_fold[j])/sqrt(nfolds - 1) - for j in eachindex(measure)] - table = (measure=measure, measurement=measurement) - return DataFrames.DataFrame(table) - end - - const confidence_intervals_basic_model = confidence_intervals(e_pipe) -end - -# ╔═╡ c3f71e42-8bbe-47fe-a217-e58f442fc85c -md"## Filtering out unimportant features" - -# ╔═╡ db354064-c2dd-4e6a-b8ad-0340f15a03ba -md"> Introduces: `FeatureSelector`" - -# ╔═╡ 3bbb26ed-7d1e-46ac-946d-b124a8db5f7c -md""" -Before continuing, we'll modify our pipeline to drop those features -with low feature importance, to speed up later optimization: -""" - -# ╔═╡ cdfe840d-4e87-467f-b582-dfcbeb05bcc5 -begin - unimportant_features = filter(:importance => <(0.005), feature_importance_table).feature - - pipe2 = ContinuousEncoder() |> - FeatureSelector(features=unimportant_features, ignore=true) |> booster -end - -# ╔═╡ de589473-4c37-4f70-9143-546b2286a5fe -md"## Wrapping our iterative model in control strategies" - -# ╔═╡ 0b80d69c-b60c-4cf4-a7d8-c3b94fb18495 -md"> Introduces: **control strategies:** `Step`, `NumberSinceBest`, `TimeLimit`, `InvalidValue`, **model wrapper** `IteratedModel`, **resampling strategy:** `Holdout`" - -# ╔═╡ 19e7e4c9-95c0-49d6-8396-93fa872d2512 -md""" -We want to optimize the hyperparameters of our model. Since our -model is iterative, these parameters include the (nested) iteration -parameter `pipe.evo_tree_classifier.nrounds`. Sometimes this -parameter is optimized first, fixed, and then maybe optimized again -after the other parameters. Here we take a more principled approach, -**wrapping our model in a control strategy** that makes it -"self-iterating". The strategy applies a stopping criterion to -*out-of-sample* estimates of the model performance, constructed -using an internally constructed holdout set. In this way, we avoid -some data hygiene issues, and, when we subsequently optimize other -parameters, we will always being using an optimal number of -iterations. 
-""" - -# ╔═╡ 6ff08b40-906f-4154-a4ac-2ddb495858ce -md""" -Note that this approach can be applied to any iterative MLJ model, -eg, the neural network models provided by -[MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl). -""" - -# ╔═╡ 8fc99d35-d8cc-455f-806e-1bc580dc349d -md""" -First, we select appropriate controls from [this -list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided): -""" - -# ╔═╡ 29f33708-4a82-4acc-9703-288eae064e2a -controls = [ - Step(1), # to increment iteration parameter (`pipe.nrounds`) - NumberSinceBest(4), # main stopping criterion - TimeLimit(2/3600), # never train more than 2 sec - InvalidValue() # stop if NaN or ±Inf encountered -] - -# ╔═╡ 9f80b5b5-95b9-4f01-b2c3-413ae5867f7d -md""" -Now we wrap our pipeline model using the `IteratedModel` wrapper, -being sure to specify the `measure` on which internal estimates of -the out-of-sample performance will be based: -""" - -# ╔═╡ fd2e2ee4-c256-4fde-93d8-53c527b0a48c -iterated_pipe = IteratedModel(model=pipe2, - controls=controls, - measure=brier_loss, - resampling=Holdout(fraction_train=0.7)) - -# ╔═╡ bb7a34eb-4bf1-41d9-af98-8dffdf3118fc -md""" -We've set `resampling=Holdout(fraction_train=0.7)` to arrange that -data attached to our model should be internally split into a train -set (70%) and a holdout set (30%) for determining the out-of-sample -estimate of the Brier loss. -""" - -# ╔═╡ 71bead68-2d38-468c-8620-127a8c4021ec -md""" -For demonstration purposes, let's bind `iterated_model` to all data -not in our don't-touch holdout set, and train on all of that data: -""" - -# ╔═╡ f245005b-ca1e-4c38-a1f6-fec4c3edfa7b -begin - mach_iterated_pipe = machine(iterated_pipe, X, y) - fit!(mach_iterated_pipe); -end - -# ╔═╡ 16d8cd4e-ca20-475a-82f7-021485ee0115 -md"To recap, internally this training is split into two separate steps:" - -# ╔═╡ ab59d744-51f8-4364-9ecb-94593d972599 -md""" -- A controlled iteration step, training on the holdout set, with the total number of iterations determined by the specified stopping criteria (based on the out-of-sample performance estimates) -- A final step that trains the atomic model on *all* available - data using the number of iterations determined in the first step. Calling `predict` on `mach_iterated_pipe` means using the learned parameters of the second step. -""" - -# ╔═╡ ed9f284e-1e45-46e7-b54b-e44bea97c579 -md"## Hyperparameter optimization (model tuning)" - -# ╔═╡ b8b0e5ee-8468-4e02-9121-4e392242994e -md"> Introduces: `range`, **model wrapper** `TunedModel`, `RandomSearch`" - -# ╔═╡ 50372dc8-bb10-4f9e-b221-79886442efe7 -md""" -We now turn to hyperparameter optimization. A tool not discussed -here is the `learning_curve` function, which can be useful when -wanting to visualize the effect of changes to a *single* -hyperparameter (which could be an iteration parameter). See, for -example, [this section of the -manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/) -or [this -tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb). -""" - -# ╔═╡ 3e385eb4-0a44-40f6-8df5-b7371beb6b3f -md""" -Fine tuning the hyperparameters of a gradient booster can be -somewhat involved. Here we settle for simultaneously optimizing two -key parameters: `max_depth` and `η` (learning_rate). -""" - -# ╔═╡ caa5153f-6633-41f7-a475-d52dc8eba727 -md""" -Like iteration control, **model optimization in MLJ is implemented as -a model wrapper**, called `TunedModel`. 
After wrapping a model in a -tuning strategy and binding the wrapped model to data in a machine -called `mach`, calling `fit!(mach)` instigates a search for optimal -model hyperparameters, within a specified range, and then uses all -supplied data to train the best model. To predict using that model, -one then calls `predict(mach, Xnew)`. In this way the wrapped model -may be viewed as a "self-tuning" version of the unwrapped -model. That is, wrapping the model simply transforms certain -hyperparameters into learned parameters (just as `IteratedModel` -does for an iteration parameter). -""" - -# ╔═╡ 0b74bfe8-bff6-469d-804b-89f5009710b0 -md""" -To start with, we define ranges for the parameters of -interest. Since these parameters are nested, let's force a -display of our model to a larger depth: -""" - -# ╔═╡ b17f6f89-2fd4-46de-a14a-b0adc2955e1c -show(iterated_pipe, 2) - -# ╔═╡ db7a8b4d-611b-4e5e-bd20-120a7a4020d0 -begin - p1 = :(model.evo_tree_classifier.η) - p2 = :(model.evo_tree_classifier.max_depth) - - r1 = range(iterated_pipe, p1, lower=-2, upper=-0.5, scale=x->10^x) - r2 = range(iterated_pipe, p2, lower=2, upper=6) -end - -# ╔═╡ f46af08e-ddb9-4aec-93a0-9d99274137e8 -md""" -Nominal ranges are defined by specifying `values` instead of `lower` -and `upper`. -""" - -# ╔═╡ af3023e6-920f-478d-af76-60dddeecbe6c -md""" -Next, we choose an optimization strategy from [this -list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models): -""" - -# ╔═╡ 93c17a9b-b49c-4780-9074-c069a0e97d7e -tuning = RandomSearch(rng=123) - -# ╔═╡ 21f14f18-cc5a-4ed9-ac4a-024c5894013e -md""" -Then we wrap the model, specifying a `resampling` strategy and a -`measure`, as we did for `IteratedModel`. In fact, we can include a -battery of `measures`; by default, optimization is with respect to -performance estimates based on the first measure, but estimates for -all measures can be accessed from the model's `report`. -""" - -# ╔═╡ 147cc2c5-f3c2-4aec-82ca-0687059409ae -md""" -The keyword `n` specifies the total number of models (sets of -hyperparameters) to evaluate. -""" - -# ╔═╡ 2996c4f0-567a-4c5a-9e9e-2379bd3c438e -tuned_iterated_pipe = TunedModel(model=iterated_pipe, - range=[r1, r2], - tuning=tuning, - measures=[brier_loss, auc, accuracy], - resampling=StratifiedCV(nfolds=6, rng=123), - acceleration=CPUThreads(), - n=40) - -# ╔═╡ fbf15fc2-347a-432a-bf9e-f9077f3deea4 -md"To save time, we skip the `repeats` here." - -# ╔═╡ 3922d490-8018-4b8c-9b74-5a67b6e8b15f -md"Binding our final model to data and training:" - -# ╔═╡ d59dc0af-8eb7-4ac4-b1f4-bf9663e97bb7 -begin - mach_tuned_iterated_pipe = machine(tuned_iterated_pipe, X, y) - fit!(mach_tuned_iterated_pipe) -end - -# ╔═╡ ed1d8575-1c50-4bc7-8dca-7ecb1b94fa1b -md""" -As explained above, the training we have just performed was split -internally into two separate steps: -""" - -# ╔═╡ c392b949-7e64-4f3e-aeca-6e3d5d94d8fa -md""" -- A step to determine the parameter values that optimize the aggregated cross-validation scores -- A final step that trains the optimal model on *all* available data. Future predictions `predict(mach_tuned_iterated_pipe, ...)` are based on this final training step. -""" - -# ╔═╡ 954a354f-0a1d-4a7f-8aa0-20af95403dd9 -md""" -From `report(mach_tuned_iterated_pipe)` we can extract details about -the optimization procedure. 
For example: -""" - -# ╔═╡ 0f9e72c1-fb6e-4ea7-a120-518242409f79 -begin - rpt2 = report(mach_tuned_iterated_pipe); - best_booster = rpt2.best_model.model.evo_tree_classifier -end - -# ╔═╡ 242b8047-0508-4ce2-bcf7-0be279ee1434 -@info "Optimal hyperparameters:" best_booster.max_depth best_booster.η; - -# ╔═╡ af6d8f76-f96e-4136-9df6-c78b3bed8b80 -md"Using the `confidence_intervals` function we defined earlier:" - -# ╔═╡ ec5e230c-397a-4e07-b9cc-1427739824b3 -begin - e_best = rpt2.best_history_entry - confidence_intervals(e_best) -end - -# ╔═╡ a0a591d4-ae53-4db4-9051-24fa20a24653 -md""" -Digging a little deeper, we can learn what stopping criterion was -applied in the case of the optimal model, and how many iterations -were required: -""" - -# ╔═╡ 8f18ff72-39db-43f2-ac11-6a06d822d067 -rpt2.best_report.controls |> collect - -# ╔═╡ d28bdf7e-2ba5-4a97-8d27-497b9a4e8f4b -md"Finally, we can visualize the optimization results:" - -# ╔═╡ e0dba28c-5f07-4305-a8e6-955351cd26f5 -plot(mach_tuned_iterated_pipe, size=(600,450)) - -# ╔═╡ d92c7d6c-6eda-4083-bf7b-99c47ef72fd2 -md"## Saving our model" - -# ╔═╡ 7626383d-5212-4ee5-9b3c-c87e36798d40 -md"> Introduces: `MLJ.save`" - -# ╔═╡ f38bca90-a19e-4faa-bc52-7a03f8a4f046 -md""" -Here's how to serialize our final, trained self-iterating, -self-tuning pipeline machine: -""" - -# ╔═╡ 36e8600e-f5ee-4e5f-9814-5dd23028b7dd -MLJ.save("tuned_iterated_pipe.jlso", mach_tuned_iterated_pipe) - -# ╔═╡ 4de39bbb-e421-4e1b-aea9-6b095d52d246 -md"We'll deserialize this in \"Testing the final model\" below." - -# ╔═╡ cf4e89cd-998c-44d1-8a6a-3fad94d47b89 -md"## Final performance estimate" - -# ╔═╡ cd043584-6881-45df-ab7f-23dfd6fe43e8 -md""" -Finally, to get an even more accurate estimate of performance, we -can evaluate our model using stratified cross-validation and all the -data attached to our machine. Because this evaluation implies -[nested -resampling](https://mlr.mlr-org.com/articles/tutorial/nested_resampling.html), -this computation takes quite a bit longer than the previous one -(which is being repeated six times, using 5/6th of the data each -time): -""" - -# ╔═╡ 9c90d7a7-1966-4d21-873e-e6fb8e7dca1a -e_tuned_iterated_pipe = evaluate(tuned_iterated_pipe, X, y, - resampling=StratifiedCV(nfolds=6, rng=123), - measures=[brier_loss, auc, accuracy]) - -# ╔═╡ f5058fe5-53de-43ad-9dd4-420f3ba8803c -confidence_intervals(e_tuned_iterated_pipe) - -# ╔═╡ 669b492b-69a7-4d88-b994-ae59732958cb -md""" -For comparison, here are the confidence intervals for the basic -pipeline model (no feature selection and default hyperparameters): -""" - -# ╔═╡ 5c1754b1-21c2-4173-9aa5-a206354b401f -confidence_intervals_basic_model - -# ╔═╡ 675f06bc-6488-45e2-b666-1307eccc221d -md""" -As each pair of intervals overlap, it's doubtful the small changes -here can be assigned statistical significance. Default `booster` -hyperparameters do a pretty good job. -""" - -# ╔═╡ 19bb6a91-8c9d-4ed0-8f9a-f7439f35ea8f -md"## Testing the final model" - -# ╔═╡ fe32c8ab-2f9e-4bb4-a8bb-11a6d1761f50 -md""" -We now determine the performance of our model on our -lock-and-throw-away-the-key holdout set. To demonstrate -deserialization, we'll pretend we're in a new Julia session (but -have called `import`/`using` on the same packages). 
Then the -following should suffice to recover our model trained under -"Hyperparameter optimization" above: -""" - -# ╔═╡ 695926dd-3faf-48a0-8c71-7d4e18e2f699 -mach_restored = machine("tuned_iterated_pipe.jlso") - -# ╔═╡ 24ca2761-e264-4cf9-a590-db6dcb21b2d3 -md"We compute predictions on the holdout set:" - -# ╔═╡ 9883bd87-d70f-4fea-bec6-3fa17d8c7b35 -begin - ŷ_tuned = predict(mach_restored, Xtest); - ŷ_tuned[1] -end - -# ╔═╡ 9b7a12e7-2b3c-445a-97eb-2de8afd657be -md"And can compute the final performance measures:" - -# ╔═╡ 7fa067b5-7e77-4dda-bba0-54e6f740a5b5 -@info("Tuned model measurements on test:", - brier_loss(ŷ_tuned, ytest) |> mean, - auc(ŷ_tuned, ytest), - accuracy(mode.(ŷ_tuned), ytest) - ) - -# ╔═╡ f4dc71cf-64c8-4857-94c0-45caa9808777 -md"For comparison, here's the performance for the basic pipeline model" - -# ╔═╡ 00257755-7145-45e6-adf5-76545beae885 -begin - mach_basic = machine(pipe, X, y) - fit!(mach_basic, verbosity=0) - - ŷ_basic = predict(mach_basic, Xtest); - - @info("Basic model measurements on test set:", - brier_loss(ŷ_basic, ytest) |> mean, - auc(ŷ_basic, ytest), - accuracy(mode.(ŷ_basic), ytest) - ) -end - -# ╔═╡ 135dac9b-0bd9-4e1d-8715-73a20e2ae31b -md""" ---- - -*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* -""" - -# ╔═╡ Cell order: -# ╟─f0cc864c-8b26-441f-9bca-7c69b794f8ce -# ╟─8a6670b8-96a8-4a5d-b795-033f6f2a0674 -# ╟─aa49e638-95dc-4249-935f-ddf6a6bfbbdd -# ╟─b04c4790-59e0-42a3-af2a-25235e544a31 -# ╟─4eb8dff4-c23a-4b41-8af5-148d95ea2900 -# ╟─a583d175-d623-4888-a6bf-47194d7e8e12 -# ╟─4830fc64-3d70-4869-828a-4cc485149963 -# ╟─28197138-d6b7-433c-9e54-3f7b3ca87ecb -# ╟─aad8dc13-c0f9-4090-ba1f-6363f43ec697 -# ╟─7c0464a0-4114-46bf-95ea-2955abd45275 -# ╟─256869d0-5e0c-42af-b1b3-dd47e367ba54 -# ╠═60fe49c1-3434-4a77-8d7e-8e449afd1c48 -# ╟─e8c13e9d-7910-4a0e-a949-7f3bd292fe31 -# ╟─6bf6ef98-302f-478c-8514-99938a2932db -# ╟─33ca287e-8cba-47d1-a0de-1721c1bc2df2 -# ╟─d5b8bf1d-6c9a-46c1-bca8-7eec7950fd82 -# ╠═9b7b9ade-9318-4b95-9873-1b1430e635cc -# ╠═4f3f061d-259d-4479-b43e-d0ebe87da176 -# ╠═a546f372-4ae0-48c2-9009-4cfb2012998f -# ╠═f8f79dd2-b6de-48e8-abd3-cb77d7a79683 -# ╟─617d63f6-8a62-40db-879e-dc128f3db7b3 -# ╠═ceda7ed0-3b98-44b3-a367-9aa3c6cf34d0 -# ╟─e69cd764-e4b8-4ff8-bf32-94657e65284e -# ╟─4a175e0f-4b87-4b53-9afd-65a5b5facac9 -# ╠═3c23e9f8-49bd-4cde-b6c8-54f5ed90a964 -# ╟─6ad971c7-9fe5-45a9-9292-fe822525fc77 -# ╠═63151d31-8428-4ebd-ae5e-4b83dcbc9675 -# ╟─0f978839-cc95-4c3a-8a29-32f11452654a -# ╠═5edc98cd-df7d-428a-a5f1-151fcbe229a2 -# ╟─1848109f-c94c-4cdd-81bc-2d5603785a09 -# ╠═eee948b7-2b42-439e-9d87-1d1fbb0e3997 -# ╟─7c98e10b-1fa3-4ab6-b950-fddef2a1fb10 -# ╠═413d9c3c-a68e-4bf0-951b-64cb2a36c8e3 -# ╟─eb79663c-c671-4c4c-b0e6-441461cc8770 -# ╠═ae9e9377-552c-4186-8cb0-de601961bc02 -# ╟─5c70ee06-edb9-4789-a87d-950c50fb2955 -# ╟─bde7bc43-a4bb-4a42-8448-4bcb089096cd -# ╠═d5a9e9b8-c67e-4bdd-a012-a84240254fb6 -# ╠═8de2c7b4-1950-4f05-bbdd-9799f7bb2e60 -# ╟─768f369a-5f3d-4dcc-97a7-88e3af4f10ee -# ╟─bc4c0eb9-3b5f-49fc-b372-8c9866e51852 -# ╟─f04becbb-42a6-409f-8f3d-b82f1e7b6f7a -# ╟─1d08ed75-5377-4971-ab04-579e5608ae53 -# ╟─eea585d4-d55b-4ccc-86cf-35420d9e6995 -# ╟─ca482134-299b-459a-a29a-8d0445351914 -# ╟─5c7c97e4-fd8b-4ae5-be65-28d0fcca50a8 -# ╟─fd103be7-6fc9-43bc-9a2e-c468345d87d1 -# ╠═9d2f0b19-2942-47ac-b5fa-cb79ebf595ef -# ╟─1d18aca8-11f4-4104-91c5-524da38aa391 -# ╠═accd3a09-5371-45ab-ad90-b4405b216771 -# ╟─bde0406f-3964-42b8-895b-997a92b731e0 -# ╠═25e60566-fc64-4f35-a525-e9b04a4bd246 -# ╟─441317a8-3b72-4bf1-80f0-ba7781e173cf -# 
╠═d2f2b67e-d06e-4c0b-9cbc-9a2c39793333 -# ╟─7c93b820-cbce-48be-b887-2b16710e5502 -# ╠═5ac4a13e-9983-41dd-9452-5298a8a4a401 -# ╟─9f28e6c3-5f18-4fb2-b017-96f1602f2cad -# ╠═c9506e5b-c111-442c-8be2-2a4017c45345 -# ╟─8c865df5-ac95-438f-a7ad-80d84f5b0070 -# ╠═22690828-0992-4ed0-8378-5aa686f0b407 -# ╟─3bd753fc-163a-4b50-9f43-49763e8691a1 -# ╟─7b12673f-cc63-4cd9-bb0e-3a45761c733a -# ╟─3c98ab4e-47cd-4247-9979-7044b2f2df33 -# ╠═ea010966-bb7d-4c79-b2a1-fbbde543f620 -# ╟─ecb7c90a-4c93-483c-910b-f8a9a217eff2 -# ╟─81aa1aa9-27d9-423f-aa36-d2e8546da46a -# ╟─fff47e76-d7ac-4377-88a1-a2bf91434413 -# ╟─f1dbbc22-9844-4e1c-a1cb-54e34396a855 -# ╠═20713fb3-3a3b-42e7-8037-0de5806e1a54 -# ╠═369cb359-8512-4f80-9961-a09632c33fb0 -# ╟─af484868-b29c-4808-b7cc-46f4ef988c68 -# ╠═8f3eda66-6183-4fe4-90fb-7f2721f6fcc7 -# ╟─598142b5-0828-49ac-af65-ccd25ecb9818 -# ╟─c5309d64-7409-4148-8890-979691212d9b -# ╟─f97969e2-c15c-42cf-a6fa-eaf14df5d44b -# ╠═edf1d726-542a-4fc1-8025-5084804a9f6c -# ╟─ee41a121-df0d-40ad-9e90-09023d201062 -# ╠═be6cef47-d585-45fe-b7bb-2f33ef765cc0 -# ╟─452edd51-b878-46d0-9625-01172c4a0081 -# ╠═fdd20843-c981-41ad-af4f-11c7de9e21dd -# ╟─2b25f9cb-d12d-4a6f-8dba-241d9b744683 -# ╠═b139c43b-382b-415a-a6e5-2f0b4dca5c59 -# ╟─90a7fe9a-934f-445a-8550-20498aa03ed0 -# ╟─fc97e937-66ca-4434-9e7b-5b943cf6b560 -# ╟─b1d95557-51fc-4f1e-bce6-0a1379cc1259 -# ╟─38961daa-dd5f-4f97-9608-b5032c116833 -# ╟─ccd3ffd0-ed41-497d-b473-4aa968e6937a -# ╠═35cf2011-4fb6-4e7a-8d9e-644a1b3cc6b6 -# ╟─4b6e3239-fd47-495a-ac0a-23ded81445da -# ╟─ce9a3479-601a-472a-8535-1a088a6a3228 -# ╠═45c2d8f9-cbf9-4cda-a39f-7893c73eef39 -# ╟─232b7775-1f79-43f7-bcca-51a27994a151 -# ╟─ac84c13f-391f-405a-9b33-09b4b6661170 -# ╟─442078e4-a695-471c-b45d-88d068bb0fa2 -# ╟─23fb37b0-08c1-4688-92c8-6db3a590d963 -# ╟─1def55f5-71fc-4257-abf5-96f457eb2bdf -# ╟─2c4016c6-dc7e-44ec-8a60-2e0214c059f5 -# ╠═236c098c-0189-4a96-a38a-beb5c7157b57 -# ╟─9ffe1654-7e32-4c4b-81f5-507803ea9ed6 -# ╟─48413a7b-d01b-4005-9b21-6d4eb642d87e -# ╟─84d7bcbc-df3c-4602-b98c-3df1f31879bf -# ╠═f8552404-f95f-4484-92b7-4ceba56e97ad -# ╟─e0b1bffb-dbd5-4b0a-b122-214de244406c -# ╠═d8ad2c21-fd27-44fb-8a4b-c83694978e38 -# ╟─ceff13ba-a526-4ecb-a8b6-f7ff516dedc4 -# ╟─dcf0eb93-2368-4158-81e1-74ef03c2e79e -# ╠═442be3e3-b2e9-499d-a04e-0b76409c14a7 -# ╟─f03fdd56-6c30-4d73-b979-3458f2f26667 -# ╠═3afbe65c-18bf-4576-97e3-6677afc6ca9f -# ╟─527b193b-737d-4701-b10e-562ce21caa04 -# ╠═b332842d-6397-45c6-8f78-ab549ef1544e -# ╠═3035e6d0-3801-424d-a8a4-a53f5149481e -# ╟─8b67dcac-6a86-4502-8704-dbab8e09b4f1 -# ╟─9cd42f39-942e-4011-a02f-27b6c05e4d02 -# ╠═0fecce89-102f-416f-bea4-3d467d48781c -# ╟─0b11a81e-42a7-4519-97c5-2979af8a507d -# ╟─e6f6fc92-f9e7-4472-b639-63c76c72c513 -# ╠═f3933688-1ea4-4239-8f5a-e1ee9eb5c15c -# ╟─ce90835f-6709-4c34-adc5-e671db8bca5d -# ╠═15996832-b736-45f4-86f1-0ea38de21abb -# ╠═3c4984d2-9e1a-450b-a55c-0cb44ab816d7 -# ╟─27221b90-8506-4857-be86-92ecfd0d2343 -# ╟─445146e5-e45e-450d-9cf1-facf39e3f302 -# ╟─562887bb-b7fb-430f-b61c-748aec38e674 -# ╠═f9be989e-2604-44c2-9727-ed822e4fd85d -# ╟─ff7cfc36-b9fc-4570-b2f2-e08965e5be66 -# ╟─45a4d300-a392-485c-897c-f79712f9ec7c -# ╠═0f76a79f-8675-4ec1-a543-f9324a87efad -# ╠═fc641df4-693c-4007-8657-9fba0caf3cb7 -# ╟─c3f71e42-8bbe-47fe-a217-e58f442fc85c -# ╟─db354064-c2dd-4e6a-b8ad-0340f15a03ba -# ╟─3bbb26ed-7d1e-46ac-946d-b124a8db5f7c -# ╠═cdfe840d-4e87-467f-b582-dfcbeb05bcc5 -# ╟─de589473-4c37-4f70-9143-546b2286a5fe -# ╟─0b80d69c-b60c-4cf4-a7d8-c3b94fb18495 -# ╟─19e7e4c9-95c0-49d6-8396-93fa872d2512 -# ╟─6ff08b40-906f-4154-a4ac-2ddb495858ce -# 
╟─8fc99d35-d8cc-455f-806e-1bc580dc349d -# ╠═29f33708-4a82-4acc-9703-288eae064e2a -# ╟─9f80b5b5-95b9-4f01-b2c3-413ae5867f7d -# ╠═fd2e2ee4-c256-4fde-93d8-53c527b0a48c -# ╟─bb7a34eb-4bf1-41d9-af98-8dffdf3118fc -# ╟─71bead68-2d38-468c-8620-127a8c4021ec -# ╠═f245005b-ca1e-4c38-a1f6-fec4c3edfa7b -# ╟─16d8cd4e-ca20-475a-82f7-021485ee0115 -# ╟─ab59d744-51f8-4364-9ecb-94593d972599 -# ╟─ed9f284e-1e45-46e7-b54b-e44bea97c579 -# ╟─b8b0e5ee-8468-4e02-9121-4e392242994e -# ╟─50372dc8-bb10-4f9e-b221-79886442efe7 -# ╟─3e385eb4-0a44-40f6-8df5-b7371beb6b3f -# ╟─caa5153f-6633-41f7-a475-d52dc8eba727 -# ╟─0b74bfe8-bff6-469d-804b-89f5009710b0 -# ╠═b17f6f89-2fd4-46de-a14a-b0adc2955e1c -# ╠═db7a8b4d-611b-4e5e-bd20-120a7a4020d0 -# ╟─f46af08e-ddb9-4aec-93a0-9d99274137e8 -# ╟─af3023e6-920f-478d-af76-60dddeecbe6c -# ╠═93c17a9b-b49c-4780-9074-c069a0e97d7e -# ╟─21f14f18-cc5a-4ed9-ac4a-024c5894013e -# ╟─147cc2c5-f3c2-4aec-82ca-0687059409ae -# ╠═2996c4f0-567a-4c5a-9e9e-2379bd3c438e -# ╟─fbf15fc2-347a-432a-bf9e-f9077f3deea4 -# ╟─3922d490-8018-4b8c-9b74-5a67b6e8b15f -# ╠═d59dc0af-8eb7-4ac4-b1f4-bf9663e97bb7 -# ╟─ed1d8575-1c50-4bc7-8dca-7ecb1b94fa1b -# ╟─c392b949-7e64-4f3e-aeca-6e3d5d94d8fa -# ╟─954a354f-0a1d-4a7f-8aa0-20af95403dd9 -# ╠═0f9e72c1-fb6e-4ea7-a120-518242409f79 -# ╠═242b8047-0508-4ce2-bcf7-0be279ee1434 -# ╟─af6d8f76-f96e-4136-9df6-c78b3bed8b80 -# ╠═ec5e230c-397a-4e07-b9cc-1427739824b3 -# ╟─a0a591d4-ae53-4db4-9051-24fa20a24653 -# ╠═8f18ff72-39db-43f2-ac11-6a06d822d067 -# ╟─d28bdf7e-2ba5-4a97-8d27-497b9a4e8f4b -# ╠═e0dba28c-5f07-4305-a8e6-955351cd26f5 -# ╟─d92c7d6c-6eda-4083-bf7b-99c47ef72fd2 -# ╟─7626383d-5212-4ee5-9b3c-c87e36798d40 -# ╟─f38bca90-a19e-4faa-bc52-7a03f8a4f046 -# ╠═36e8600e-f5ee-4e5f-9814-5dd23028b7dd -# ╟─4de39bbb-e421-4e1b-aea9-6b095d52d246 -# ╟─cf4e89cd-998c-44d1-8a6a-3fad94d47b89 -# ╟─cd043584-6881-45df-ab7f-23dfd6fe43e8 -# ╠═9c90d7a7-1966-4d21-873e-e6fb8e7dca1a -# ╠═f5058fe5-53de-43ad-9dd4-420f3ba8803c -# ╟─669b492b-69a7-4d88-b994-ae59732958cb -# ╠═5c1754b1-21c2-4173-9aa5-a206354b401f -# ╟─675f06bc-6488-45e2-b666-1307eccc221d -# ╟─19bb6a91-8c9d-4ed0-8f9a-f7439f35ea8f -# ╟─fe32c8ab-2f9e-4bb4-a8bb-11a6d1761f50 -# ╠═695926dd-3faf-48a0-8c71-7d4e18e2f699 -# ╟─24ca2761-e264-4cf9-a590-db6dcb21b2d3 -# ╠═9883bd87-d70f-4fea-bec6-3fa17d8c7b35 -# ╟─9b7a12e7-2b3c-445a-97eb-2de8afd657be -# ╠═7fa067b5-7e77-4dda-bba0-54e6f740a5b5 -# ╟─f4dc71cf-64c8-4857-94c0-45caa9808777 -# ╠═00257755-7145-45e6-adf5-76545beae885 -# ╟─135dac9b-0bd9-4e1d-8715-73a20e2ae31b diff --git a/examples/telco/notebook.pluto.jl.html b/examples/telco/notebook.pluto.jl.html deleted file mode 100644 index 2d39e7ed9..000000000 --- a/examples/telco/notebook.pluto.jl.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - - ⚡ Pluto.jl ⚡ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/telco/notebook.unexecuted.ipynb b/examples/telco/notebook.unexecuted.ipynb deleted file mode 100644 index 71d109350..000000000 --- a/examples/telco/notebook.unexecuted.ipynb +++ /dev/null @@ -1,1855 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# MLJ for Data Scientists in Two Hours" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An application of the [MLJ\n", - "toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the\n", - "Telco Customer Churn dataset, aimed at practicing data scientists\n", - "new to MLJ (Machine Learning in Julia). This tutorial does not\n", - "cover exploratory data analysis." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "MLJ is a *multi-paradigm* machine learning toolbox (i.e., not just\n", - "deep-learning)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For other MLJ learning resources see the [Learning\n", - "MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/)\n", - "section of the\n", - "[manual](https://juliaai.github.io/MLJ.jl/dev/)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Topics covered**: Grabbing and preparing a dataset, basic\n", - "fit/predict workflow, constructing a pipeline to include data\n", - "pre-processing, estimating performance metrics, ROC curves, confusion\n", - "matrices, feature importance, basic feature selection, controlling iterative\n", - "models, hyper-parameter optimization (tuning)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Prerequisites for this tutorial.** Previous experience building,\n", - "evaluating, and optimizing machine learning models using\n", - "scikit-learn, caret, MLR, weka, or similar tool. No previous\n", - "experience with MLJ. Only fairly basic familiarity with Julia is\n", - "required. Uses\n", - "[DataFrames.jl](https://dataframes.juliadata.org/stable/) but in a\n", - "minimal way ([this\n", - "cheatsheet](https://ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/index.html)\n", - "may help)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Time.** Between two and three hours, first time through." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary of methods and types introduced" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "|code | purpose|\n", - "|:-------|:-------------------------------------------------------|\n", - "| `OpenML.load(id)` | grab a dataset from [OpenML.org](https://www.openml.org)|\n", - "| `scitype(X)` | inspect the scientific type (scitype) of object `X`|\n", - "| `schema(X)` | inspect the column scitypes (scientific types) of a table `X`|\n", - "| `coerce(X, ...)` | fix column encodings to get appropriate scitypes|\n", - "| `partition(data, frac1, frac2, ...; rng=...)` | vertically split `data`, which can be a table, vector or matrix|\n", - "| `unpack(table, f1, f2, ...)` | horizontally split `table` based on conditions `f1`, `f2`, ..., applied to column names|\n", - "| `@load ModelType pkg=...` | load code defining a model type|\n", - "| `input_scitype(model)` | inspect the scitype that a model requires for features (inputs)|\n", - "| `target_scitype(model)`| inspect the scitype that a model requires for the target (labels)|\n", - "| `ContinuousEncoder` | built-in model type for re-encoding all features as `Continuous`|\n", - "| `model1 ∣> model2 ∣> ...` | combine multiple models into a pipeline|\n", - "| `measures(\"under curve\")` | list all measures (metrics) with string \"under curve\" in documentation|\n", - "| `accuracy(yhat, y)` | compute accuracy of predictions `yhat` against ground truth observations `y`|\n", - "| `auc(yhat, y)`, `brier_loss(yhat, y)` | evaluate two probabilistic measures (`yhat` a vector of probability distributions)|\n", - "| `machine(model, X, y)` | bind `model` to training data `X` (features) and `y` (target)|\n", - "| `fit!(mach, rows=...)` | train machine using specified rows (observation indices)|\n", - "| `predict(mach, rows=...)`, | make in-sample model predictions given specified rows|\n", - "| `predict(mach, 
Xnew)` | make predictions given new features `Xnew`|\n", - "| `fitted_params(mach)` | inspect learned parameters|\n", - "| `report(mach)` | inspect other outcomes of training|\n", - "| `confmat(yhat, y)` | confusion matrix for predictions `yhat` and ground truth `y`|\n", - "| `roc(yhat, y)` | compute points on the receiver-operator Characteristic|\n", - "| `StratifiedCV(nfolds=6)` | 6-fold stratified cross-validation resampling strategy|\n", - "| `Holdout(fraction_train=0.7)` | holdout resampling strategy|\n", - "| `evaluate(model, X, y; resampling=..., options...)` | estimate performance metrics `model` using the data `X`, `y`|\n", - "| `FeatureSelector()` | transformer for selecting features|\n", - "| `Step(3)` | iteration control for stepping 3 iterations|\n", - "| `NumberSinceBest(6)`, `TimeLimit(60/5), InvalidValue()` | iteration control stopping criteria|\n", - "| `IteratedModel(model=..., controls=..., options...)` | wrap an iterative `model` in control strategies|\n", - "| `range(model, :some_hyperparam, lower=..., upper=...)` | define a numeric range|\n", - "| `RandomSearch()` | random search tuning strategy|\n", - "| `TunedModel(model=..., tuning=..., options...)` | wrap the supervised `model` in specified `tuning` strategy|" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instantiate a Julia environment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following code replicates precisely the set of Julia packages\n", - "used to develop this tutorial. If this is your first time running\n", - "the notebook, package instantiation and pre-compilation may take a\n", - "minute or so to complete. **This step will fail** if the [correct\n", - "Manifest.toml and Project.toml\n", - "files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco)\n", - "are not in the same directory as this notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "using Pkg\n", - "Pkg.activate(@__DIR__) # get env from TOML files in same directory as this notebook\n", - "Pkg.instantiate()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Warm up: Building a model for the iris dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before turning to the Telco Customer Churn dataset, we very quickly\n", - "build a predictive model for Fisher's well-known iris data set, as way of\n", - "introducing the main actors in any MLJ workflow. Details that you\n", - "don't fully grasp should become clearer in the Telco study." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This section is a condensed adaption of the [Getting Started\n", - "example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict)\n", - "in the MLJ documentation." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, using the built-in iris dataset, we load and inspect the features\n", - "`X_iris` (a table) and target variable `y_iris` (a vector):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "using MLJ" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "const X_iris, y_iris = @load_iris;\n", - "schema(X_iris)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_iris[1:4]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "levels(y_iris)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We load a decision tree model, from the package DecisionTree.jl:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "DecisionTree = @load DecisionTreeClassifier pkg=DecisionTree # model type\n", - "model = DecisionTree(min_samples_split=5) # model instance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In MLJ, a *model* is just a container for hyper-parameters of\n", - "some learning algorithm. It does not store learned parameters." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we bind the model together with the available data in what's\n", - "called a *machine*:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mach = machine(model, X_iris, y_iris)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A machine is essentially just a model (ie, hyper-parameters) plus data, but\n", - "it additionally stores *learned parameters* (the tree) once it is\n", - "trained on some view of the data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_rows = vcat(1:60, 91:150); # some row indices (observations are rows not columns)\n", - "fit!(mach, rows=train_rows)\n", - "fitted_params(mach)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A machine stores some other information enabling [warm\n", - "restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts)\n", - "for some models, but we won't go into that here. 
You are allowed to\n", - "access and mutate the `model` parameter:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mach.model.min_samples_split = 10\n", - "fit!(mach, rows=train_rows) # re-train with new hyper-parameter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can make predictions on some other view of the data, as in" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predict(mach, rows=71:73)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "or on completely new data, as in" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Xnew = (sepal_length = [5.1, 6.3],\n", - " sepal_width = [3.0, 2.5],\n", - " petal_length = [1.4, 4.9],\n", - " petal_width = [0.3, 1.5])\n", - "yhat = predict(mach, Xnew)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are probabilistic predictions which can be manipulated using a\n", - "widely adopted interface defined in the Distributions.jl\n", - "package. For example, we can get raw probabilities like this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pdf.(yhat, \"virginica\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now turn to the Telco dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Getting the Telco data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import DataFrames" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = OpenML.load(42178) # data set from OpenML.org\n", - "df0 = DataFrames.DataFrame(data)\n", - "first(df0, 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The object of this tutorial is to build and evaluate supervised\n", - "learning models to predict the `:Churn` variable, a binary variable\n", - "measuring customer retention, based on other variables that are\n", - "relevant." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the table, observations correspond to rows, and features to\n", - "columns, which is the convention for representing all\n", - "two-dimensional data in MLJ." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Type coercion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `scitype`, `schema`, `coerce`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A [\"scientific\n", - "type\"](https://juliaai.github.io/ScientificTypes.jl/dev/) or\n", - "*scitype* indicates how MLJ will *interpret* data. For example,\n", - "`typeof(3.14) == Float64`, while `scitype(3.14) == Continuous` and\n", - "also `scitype(3.14f0) == Continuous`. In MLJ, model data\n", - "requirements are articulated using scitypes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are common \"scalar\" scitypes:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](assets/scitypes.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are also container scitypes. 
For example, the scitype of any\n", - "`N`-dimensional array is `AbstractArray{S, N}`, where `S` is the scitype of the\n", - "elements:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scitype([\"cat\", \"mouse\", \"dog\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `schema` operator summarizes the column scitypes of a table:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "schema(df0) |> DataFrames.DataFrame # converted to DataFrame for better display" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All of the fields being interpreted as `Textual` are really\n", - "something else, either `Multiclass` or, in the case of\n", - "`:TotalCharges`, `Continuous`. In fact, `:TotalCharges` is\n", - "mostly floats wrapped as strings. However, it needs special\n", - "treatment because some elements consist of a single space, \" \",\n", - "which we'll treat as \"0.0\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fix_blanks(v) = map(v) do x\n", - " if x == \" \"\n", - " return \"0.0\"\n", - " else\n", - " return x\n", - " end\n", - "end\n", - "\n", - "df0.TotalCharges = fix_blanks(df0.TotalCharges);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Coercing the `:TotalCharges` type to ensure a `Continuous` scitype:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "coerce!(df0, :TotalCharges => Continuous);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Coercing all remaining `Textual` data to `Multiclass`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "coerce!(df0, Textual => Multiclass);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we'll coerce our target variable `:Churn` to be\n", - "`OrderedFactor`, rather than `Multiclass`, to enable a reliable\n", - "interpretation of metrics like \"true positive rate\". 
By convention,\n", - "the first class is the negative one:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "coerce!(df0, :Churn => OrderedFactor)\n", - "levels(df0.Churn) # to check order" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Re-inspecting the scitypes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "schema(df0) |> DataFrames.DataFrame" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preparing a holdout set for final testing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `partition`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To reduce training times for the purposes of this tutorial, we're\n", - "going to dump 90% of observations (after shuffling) and split off\n", - "30% of the remainder for use as a lock-and-throw-away-the-key\n", - "holdout set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df, df_test, df_dumped = partition(df0, 0.07, 0.03, # in ratios 7:3:90\n", - " stratify=df0.Churn,\n", - " rng=123);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The reader interested in including all data can instead do\n", - "`df, df_test = partition(df0, 0.7, stratify=df0.Churn, rng=123)`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Splitting data into target and features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `unpack`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the following call, the column with name `:Churn` is copied over\n", - "to a vector `y`, and every remaining column, except `:customerID`\n", - "(which contains no useful information) goes into a table `X`. Here\n", - "`:Churn` is the target variable for which we seek predictions, given\n", - "new versions of the features `X`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "const y, X = unpack(df, ==(:Churn), !=(:customerID));\n", - "schema(X).names" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "intersect([:Churn, :customerID], schema(X).names)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll do the same for the holdout data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "const ytest, Xtest = unpack(df_test, ==(:Churn), !=(:customerID));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading a model and checking type requirements" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `@load`, `input_scitype`, `target_scitype`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For tools helping us to identify suitable models, see the [Model\n", - "Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search)\n", - "section of the manual. We will build a gradient tree-boosting model,\n", - "a popular first choice for structured data like we have here. 
Model\n", - "code is contained in a third-party package called\n", - "[EvoTrees.jl](https://github.com/Evovest/EvoTrees.jl) which is\n", - "loaded as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Booster = @load EvoTreeClassifier pkg=EvoTrees" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recall that a *model* is just a container for some algorithm's\n", - "hyper-parameters. Let's create a `Booster` with default values for\n", - "the hyper-parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "booster = Booster()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This model is appropriate for the kind of target variable we have because of\n", - "the following passing test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scitype(y) <: target_scitype(booster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, our features `X` cannot be directly used with `booster`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scitype(X) <: input_scitype(booster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As it turns out, this is because `booster`, like the majority of MLJ\n", - "supervised models, expects the features to be `Continuous`. (With\n", - "some experience, this can be gleaned from `input_scitype(booster)`.)\n", - "So we need categorical feature encoding, discussed next." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building a model pipeline to incorporate feature encoding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `ContinuousEncoder`, pipeline operator `|>`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The built-in `ContinuousEncoder` model transforms an arbitrary table\n", - "to a table whose features are all `Continuous` (dropping any fields\n", - "it does not know how to encode). In particular, all `Multiclass`\n", - "features are one-hot encoded." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A *pipeline* is a stand-alone model that internally combines one or\n", - "more models in a linear (non-branching) pipeline. Here's a pipeline\n", - "that adds the `ContinuousEncoder` as a pre-processor to the\n", - "gradient tree-boosting model above:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pipe = ContinuousEncoder() |> booster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the component models appear as hyper-parameters of\n", - "`pipe`. Pipelines are an implementation of a more general [model\n", - "composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models)\n", - "interface provided by MLJ that advanced users may want to learn about." 
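- , - "\n", - "\n", - "As an optional aside (added here for illustration), you can apply the encoder\n", - "on its own to see what it feeds to the booster; something like the following\n", - "should return a table whose columns are all `Continuous`:\n", - "\n", - "```julia\n", - "encoder_mach = machine(ContinuousEncoder(), X)\n", - "fit!(encoder_mach)\n", - "W = transform(encoder_mach, X)  # the one-hot encoded features\n", - "schema(W)\n", - "```"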
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the above display, we see that component model hyper-parameters\n", - "are now *nested*, but they are still accessible (important in hyper-parameter\n", - "optimization):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pipe.evo_tree_classifier.max_depth" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluating the pipeline model's performance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `measures` (function), **measures:** `brier_loss`, `auc`, `accuracy`;\n", - "> `machine`, `fit!`, `predict`, `fitted_params`, `report`, `roc`, **resampling strategy** `StratifiedCV`, `evaluate`, `FeatureSelector`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Without touching our test set `Xtest`, `ytest`, we will estimate the\n", - "performance of our pipeline model, with default hyper-parameters, in\n", - "two different ways:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Evaluating by hand.** First, we'll do this \"by hand\" using the `fit!` and `predict`\n", - "workflow illustrated for the iris data set above, using a\n", - "holdout resampling strategy. At the same time we'll see how to\n", - "generate a **confusion matrix**, **ROC curve**, and inspect\n", - "**feature importances**." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Automated performance evaluation.** Next we'll apply the more\n", - "typical and convenient `evaluate` workflow, but using `StratifiedCV`\n", - "(stratified cross-validation) which is more informative." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In any case, we need to choose some measures (metrics) to quantify\n", - "the performance of our model. For a complete list of measures, one\n", - "does `measures()`. Or we also can do:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "measures(\"Brier\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will be primarily using `brier_loss`, but also `auc` (area under\n", - "the ROC curve) and `accuracy`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluating by hand (with a holdout set)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our pipeline model can be trained just like the decision tree model\n", - "we built for the iris data set. Binding all non-test data to the\n", - "pipeline model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mach_pipe = machine(pipe, X, y)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We already encountered the `partition` method above. Here we apply\n", - "it to row indices, instead of data containers, as `fit!` and\n", - "`predict` only need a *view* of the data to work." 
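- , - "\n", - "\n", - "For instance, on a toy index range (a quick illustration added for clarity;\n", - "there is no shuffling unless `shuffle=true` or an `rng` is specified):\n", - "\n", - "```julia\n", - "partition(1:10, 0.7)  # two index vectors, in the ratio 70:30\n", - "```"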
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train, validation = partition(1:length(y), 0.7)\n", - "fit!(mach_pipe, rows=train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We note in passing that we can access two kinds of information from a trained machine:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- The **learned parameters** (eg, coefficients of a linear model): We use `fitted_params(mach_pipe)`\n", - "- Other **by-products of training** (eg, feature importances): We use `report(mach_pipe)`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fp = fitted_params(mach_pipe);\n", - "keys(fp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For example, we can check that the encoder did not actually drop any features:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Set(fp.continuous_encoder.features_to_keep) == Set(schema(X).names)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And, from the report, extract feature importances:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rpt = report(mach_pipe)\n", - "keys(rpt.evo_tree_classifier)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fi = rpt.evo_tree_classifier.feature_importances\n", - "feature_importance_table =\n", - " (feature=Symbol.(first.(fi)), importance=last.(fi)) |> DataFrames.DataFrame" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For models not reporting feature importances, we recommend the\n", - "[Shapley.jl](https://expandingman.gitlab.io/Shapley.jl/) package." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Returning to predictions and evaluations of our measures:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ŷ = predict(mach_pipe, rows=validation);\n", - "@info(\"Measurements\",\n", - " brier_loss(ŷ, y[validation]) |> mean,\n", - " auc(ŷ, y[validation]),\n", - " accuracy(mode.(ŷ), y[validation])\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we need `mode` in the last case because `accuracy` expects\n", - "point predictions, not probabilistic ones. (One can alternatively\n", - "use `predict_mode` to generate the predictions.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While we're here, lets also generate a **confusion matrix** and\n", - "[receiver-operator\n", - "characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)\n", - "(ROC):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "confmat(mode.(ŷ), y[validation])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: Importing the plotting package and calling the plotting\n", - "functions for the first time can take a minute or so." 
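- , - "\n", - "\n", - "Once the ROC points are computed below, a rough way to sanity-check the `auc`\n", - "value reported above is a trapezoidal estimate (an aside added here; it assumes\n", - "`roc` returns vectors of false positive rates, true positive rates and\n", - "thresholds, in that order):\n", - "\n", - "```julia\n", - "fprs, tprs, ts = roc(ŷ, y[validation])\n", - "# trapezoidal rule, assuming fprs are sorted in increasing order:\n", - "sum((fprs[i+1] - fprs[i])*(tprs[i+1] + tprs[i])/2 for i in 1:length(fprs)-1)\n", - "```"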
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "using Plots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "roc_curve = roc(ŷ, y[validation])\n", - "plt = scatter(roc_curve, legend=false)\n", - "plot!(plt, xlab=\"false positive rate\", ylab=\"true positive rate\")\n", - "plot!([0, 1], [0, 1], linewidth=2, linestyle=:dash, color=:black)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Automated performance evaluation (more typical workflow)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also get performance estimates with a single call to the\n", - "`evaluate` function, which also allows for more complicated\n", - "resampling - in this case stratified cross-validation. To make this\n", - "more comprehensive, we set `repeats=3` below to make our\n", - "cross-validation \"Monte Carlo\" (3 random size-6 partitions of the\n", - "observation space, for a total of 18 folds) and set\n", - "`acceleration=CPUThreads()` to parallelize the computation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We choose a `StratifiedCV` resampling strategy; the complete list of options is\n", - "[here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "e_pipe = evaluate(pipe, X, y,\n", - " resampling=StratifiedCV(nfolds=6, rng=123),\n", - " measures=[brier_loss, auc, accuracy],\n", - " repeats=3,\n", - " acceleration=CPUThreads())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(There is also a version of `evaluate` for machines. Query the\n", - "`evaluate` and `evaluate!` doc-strings to learn more about these\n", - "functions and what the `PerformanceEvaluation` object `e_pipe` records.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While [less than ideal](https://arxiv.org/abs/2104.00673), let's\n", - "adopt the common practice of using the standard error of a\n", - "cross-validation score as an estimate of the uncertainty of a\n", - "performance measure's expected value. 
Here's a utility function to\n", - "calculate 95% confidence intervals for our performance estimates based\n", - "on this practice, and it's application to the current evaluation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "using Measurements" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "function confidence_intervals(e)\n", - " factor = 2.0 # to get level of 95%\n", - " measure = e.measure\n", - " nfolds = length(e.per_fold[1])\n", - " measurement = [e.measurement[j] ± factor*std(e.per_fold[j])/sqrt(nfolds - 1)\n", - " for j in eachindex(measure)]\n", - " table = (measure=measure, measurement=measurement)\n", - " return DataFrames.DataFrame(table)\n", - "end\n", - "\n", - "const confidence_intervals_basic_model = confidence_intervals(e_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filtering out unimportant features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `FeatureSelector`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before continuing, we'll modify our pipeline to drop those features\n", - "with low feature importance, to speed up later optimization:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unimportant_features = filter(:importance => <(0.005), feature_importance_table).feature\n", - "\n", - "pipe2 = ContinuousEncoder() |>\n", - " FeatureSelector(features=unimportant_features, ignore=true) |> booster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Wrapping our iterative model in control strategies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: **control strategies:** `Step`, `NumberSinceBest`, `TimeLimit`, `InvalidValue`, **model wrapper** `IteratedModel`, **resampling strategy:** `Holdout`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to optimize the hyper-parameters of our model. Since our\n", - "model is iterative, these parameters include the (nested) iteration\n", - "parameter `pipe.evo_tree_classifier.nrounds`. Sometimes this\n", - "parameter is optimized first, fixed, and then maybe optimized again\n", - "after the other parameters. Here we take a more principled approach,\n", - "**wrapping our model in a control strategy** that makes it\n", - "\"self-iterating\". The strategy applies a stopping criterion to\n", - "*out-of-sample* estimates of the model performance, constructed\n", - "using an internally constructed holdout set. In this way, we avoid\n", - "some data hygiene issues, and, when we subsequently optimize other\n", - "parameters, we will always being using an optimal number of\n", - "iterations." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that this approach can be applied to any iterative MLJ model,\n", - "eg, the neural network models provided by\n", - "[MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl)." 
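- , - "\n", - "\n", - "For instance, a neural network classifier could hypothetically be wrapped in\n", - "the same way (a sketch only; MLJFlux is *not* part of this tutorial's\n", - "environment, and the controls shown are introduced just below):\n", - "\n", - "```julia\n", - "NNClassifier = @load NeuralNetworkClassifier pkg=MLJFlux\n", - "iterated_nn = IteratedModel(model=NNClassifier(),\n", - "                            controls=[Step(1), NumberSinceBest(4)],\n", - "                            measure=brier_loss,\n", - "                            resampling=Holdout(fraction_train=0.7))\n", - "```"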
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we select appropriate controls from [this\n", - "list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "controls = [\n", - " Step(1), # to increment iteration parameter (`pipe.nrounds`)\n", - " NumberSinceBest(4), # main stopping criterion\n", - " TimeLimit(2/3600), # never train more than 2 sec\n", - " InvalidValue() # stop if NaN or ±Inf encountered\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we wrap our pipeline model using the `IteratedModel` wrapper,\n", - "being sure to specify the `measure` on which internal estimates of\n", - "the out-of-sample performance will be based:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "iterated_pipe = IteratedModel(model=pipe2,\n", - " controls=controls,\n", - " measure=brier_loss,\n", - " resampling=Holdout(fraction_train=0.7))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've set `resampling=Holdout(fraction_train=0.7)` to arrange that\n", - "data attached to our model should be internally split into a train\n", - "set (70%) and a holdout set (30%) for determining the out-of-sample\n", - "estimate of the Brier loss." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For demonstration purposes, let's bind `iterated_model` to all data\n", - "not in our don't-touch holdout set, and train on all of that data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mach_iterated_pipe = machine(iterated_pipe, X, y)\n", - "fit!(mach_iterated_pipe);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To recap, internally this training is split into two separate steps:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- A controlled iteration step, training on the holdout set, with the total number of iterations determined by the specified stopping criteria (based on the out-of-sample performance estimates)\n", - "- A final step that trains the atomic model on *all* available\n", - " data using the number of iterations determined in the first step. Calling `predict` on `mach_iterated_pipe` means using the learned parameters of the second step." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hyper-parameter optimization (model tuning)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `range`, **model wrapper** `TunedModel`, `RandomSearch`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now turn to hyper-parameter optimization. A tool not discussed\n", - "here is the `learning_curve` function, which can be useful when\n", - "wanting to visualize the effect of changes to a *single*\n", - "hyper-parameter (which could be an iteration parameter). See, for\n", - "example, [this section of the\n", - "manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/)\n", - "or [this\n", - "tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fine tuning the hyper-parameters of a gradient booster can be\n", - "somewhat involved. 
Here we settle for simultaneously optimizing two\n", - "key parameters: `max_depth` and `η` (learning_rate)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like iteration control, **model optimization in MLJ is implemented as\n", - "a model wrapper**, called `TunedModel`. After wrapping a model in a\n", - "tuning strategy and binding the wrapped model to data in a machine\n", - "called `mach`, calling `fit!(mach)` instigates a search for optimal\n", - "model hyperparameters, within a specified range, and then uses all\n", - "supplied data to train the best model. To predict using that model,\n", - "one then calls `predict(mach, Xnew)`. In this way the wrapped model\n", - "may be viewed as a \"self-tuning\" version of the unwrapped\n", - "model. That is, wrapping the model simply transforms certain\n", - "hyper-parameters into learned parameters (just as `IteratedModel`\n", - "does for an iteration parameter)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To start with, we define ranges for the parameters of\n", - "interest. Since these parameters are nested, let's force a\n", - "display of our model to a larger depth:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "show(iterated_pipe, 2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "p1 = :(model.evo_tree_classifier.η)\n", - "p2 = :(model.evo_tree_classifier.max_depth)\n", - "\n", - "r1 = range(iterated_pipe, p1, lower=-2, upper=-0.5, scale=x->10^x)\n", - "r2 = range(iterated_pipe, p2, lower=2, upper=6)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nominal ranges are defined by specifying `values` instead of `lower`\n", - "and `upper`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we choose an optimization strategy from [this\n", - "list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tuning = RandomSearch(rng=123)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we wrap the model, specifying a `resampling` strategy and a\n", - "`measure`, as we did for `IteratedModel`. In fact, we can include a\n", - "battery of `measures`; by default, optimization is with respect to\n", - "performance estimates based on the first measure, but estimates for\n", - "all measures can be accessed from the model's `report`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The keyword `n` specifies the total number of models (sets of\n", - "hyper-parameters) to evaluate." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tuned_iterated_pipe = TunedModel(model=iterated_pipe,\n", - " range=[r1, r2],\n", - " tuning=tuning,\n", - " measures=[brier_loss, auc, accuracy],\n", - " resampling=StratifiedCV(nfolds=6, rng=123),\n", - " acceleration=CPUThreads(),\n", - " n=40)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To save time, we skip the `repeats` here." 
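- , - "\n", - "\n", - "If time were no object, adding Monte Carlo repetitions would look like this\n", - "(a sketch only, assuming `TunedModel` accepts `repeats` in the same way that\n", - "`evaluate` does):\n", - "\n", - "```julia\n", - "TunedModel(model=iterated_pipe,\n", - "           range=[r1, r2],\n", - "           tuning=tuning,\n", - "           measures=[brier_loss, auc, accuracy],\n", - "           resampling=StratifiedCV(nfolds=6, rng=123),\n", - "           repeats=3,\n", - "           acceleration=CPUThreads(),\n", - "           n=40)\n", - "```"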
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Binding our final model to data and training:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mach_tuned_iterated_pipe = machine(tuned_iterated_pipe, X, y)\n", - "fit!(mach_tuned_iterated_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As explained above, the training we have just performed was split\n", - "internally into two separate steps:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- A step to determine the parameter values that optimize the aggregated cross-validation scores\n", - "- A final step that trains the optimal model on *all* available data. Future predictions `predict(mach_tuned_iterated_pipe, ...)` are based on this final training step." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From `report(mach_tuned_iterated_pipe)` we can extract details about\n", - "the optimization procedure. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rpt2 = report(mach_tuned_iterated_pipe);\n", - "best_booster = rpt2.best_model.model.evo_tree_classifier" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@info \"Optimal hyper-parameters:\" best_booster.max_depth best_booster.η;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the `confidence_intervals` function we defined earlier:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "e_best = rpt2.best_history_entry\n", - "confidence_intervals(e_best)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Digging a little deeper, we can learn what stopping criterion was\n", - "applied in the case of the optimal model, and how many iterations\n", - "were required:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rpt2.best_report.controls |> collect" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we can visualize the optimization results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot(mach_tuned_iterated_pipe, size=(600,450))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Saving our model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Introduces: `MLJ.save`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's how to serialize our final, trained self-iterating,\n", - "self-tuning pipeline machine:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MLJ.save(\"tuned_iterated_pipe.jlso\", mach_tuned_iterated_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll deserialize this in \"Testing the final model\" below." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final performance estimate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, to get an even more accurate estimate of performance, we\n", - "can evaluate our model using stratified cross-validation and all the\n", - "data attached to our machine. 
Because this evaluation implies\n", - "[nested\n", - "resampling](https://mlr.mlr-org.com/articles/tutorial/nested_resampling.html),\n", - "this computation takes quite a bit longer than the previous one\n", - "(which is being repeated six times, using 5/6th of the data each\n", - "time):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "e_tuned_iterated_pipe = evaluate(tuned_iterated_pipe, X, y,\n", - " resampling=StratifiedCV(nfolds=6, rng=123),\n", - " measures=[brier_loss, auc, accuracy])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "confidence_intervals(e_tuned_iterated_pipe)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For comparison, here are the confidence intervals for the basic\n", - "pipeline model (no feature selection and default hyperparameters):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "confidence_intervals_basic_model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As each pair of intervals overlap, it's doubtful the small changes\n", - "here can be assigned statistical significance. Default `booster`\n", - "hyper-parameters do a pretty good job." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Testing the final model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now determine the performance of our model on our\n", - "lock-and-throw-away-the-key holdout set. To demonstrate\n", - "deserialization, we'll pretend we're in a new Julia session (but\n", - "have called `import`/`using` on the same packages). Then the\n", - "following should suffice to recover our model trained under\n", - "\"Hyper-parameter optimization\" above:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mach_restored = machine(\"tuned_iterated_pipe.jlso\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We compute predictions on the holdout set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ŷ_tuned = predict(mach_restored, Xtest);\n", - "ŷ_tuned[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And can compute the final performance measures:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@info(\"Tuned model measurements on test:\",\n", - " brier_loss(ŷ_tuned, ytest) |> mean,\n", - " auc(ŷ_tuned, ytest),\n", - " accuracy(mode.(ŷ_tuned), ytest)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For comparison, here's the performance for the basic pipeline model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mach_basic = machine(pipe, X, y)\n", - "fit!(mach_basic, verbosity=0)\n", - "\n", - "ŷ_basic = predict(mach_basic, Xtest);\n", - "\n", - "@info(\"Basic model measurements on test set:\",\n", - " brier_loss(ŷ_basic, ytest) |> mean,\n", - " auc(ŷ_basic, ytest),\n", - " accuracy(mode.(ŷ_basic), ytest)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" - ] - } - ], - 
"metadata": { - "kernelspec": { - "display_name": "Julia 1.6.5", - "language": "julia", - "name": "julia-1.6" - }, - "language_info": { - "file_extension": ".jl", - "mimetype": "application/julia", - "name": "julia", - "version": "1.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 3 -} diff --git a/examples/telco/scitypes.png b/examples/telco/scitypes.png deleted file mode 100644 index 8e04fadf1..000000000 Binary files a/examples/telco/scitypes.png and /dev/null differ diff --git a/examples/telco/tuned_iterated_pipe.jlso b/examples/telco/tuned_iterated_pipe.jlso deleted file mode 100644 index 0ce900a29..000000000 Binary files a/examples/telco/tuned_iterated_pipe.jlso and /dev/null differ diff --git a/material/DFKI.jpg b/material/DFKI.jpg new file mode 100644 index 000000000..f845efd9d Binary files /dev/null and b/material/DFKI.jpg differ diff --git a/material/DFKI.png b/material/DFKI.png new file mode 100644 index 000000000..8c359e352 Binary files /dev/null and b/material/DFKI.png differ