{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Fitting with scikit-learn - Part 2\n", "==========================\n", "
\n", "

Overview

\n", "

Questions

\n", " \n", "

Objectives:

\n", " \n", "

Keypoints:

\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Preparation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "plates = pd.read_csv(\"data/rxnpredict/data_table.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
platerowcolbasebase_cas_numberbase_smilesligandligand_cas_numberligand_smilesaryl_halide_numberaryl_halidearyl_halide_smilesadditive_numberadditiveadditive_smilesproduct_smilesyield
0111P2Et165535-45-5CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCCXPhos564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...1.01-chloro-4-(trifluoromethyl)benzeneFC(F)(F)c1ccc(Cl)cc1NaNNaNNaNCc1ccc(Nc2ccc(C(F)(F)F)cc2)cc126.888615
1112P2Et165535-45-5CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCCXPhos564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...2.01-bromo-4-(trifluoromethyl)benzeneFC(F)(F)c1ccc(Br)cc1NaNNaNNaNCc1ccc(Nc2ccc(C(F)(F)F)cc2)cc124.063224
2113P2Et165535-45-5CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCCXPhos564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...3.01-iodo-4-(trifluoromethyl)benzeneFC(F)(F)c1ccc(I)cc1NaNNaNNaNCc1ccc(Nc2ccc(C(F)(F)F)cc2)cc147.515821
3114P2Et165535-45-5CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCCXPhos564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...4.01-chloro-4-methoxybenzeneCOc1ccc(Cl)cc1NaNNaNNaNCOc1ccc(Nc2ccc(C)cc2)cc12.126831
4115P2Et165535-45-5CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCCXPhos564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...5.01-bromo-4-methoxybenzeneCOc1ccc(Br)cc1NaNNaNNaNCOc1ccc(Nc2ccc(C)cc2)cc147.586354
\n", "
" ], "text/plain": [ " plate row col base base_cas_number \\\n", "0 1 1 1 P2Et 165535-45-5 \n", "1 1 1 2 P2Et 165535-45-5 \n", "2 1 1 3 P2Et 165535-45-5 \n", "3 1 1 4 P2Et 165535-45-5 \n", "4 1 1 5 P2Et 165535-45-5 \n", "\n", " base_smiles ligand ligand_cas_number \\\n", "0 CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC XPhos 564483-18-7 \n", "1 CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC XPhos 564483-18-7 \n", "2 CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC XPhos 564483-18-7 \n", "3 CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC XPhos 564483-18-7 \n", "4 CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC XPhos 564483-18-7 \n", "\n", " ligand_smiles aryl_halide_number \\\n", "0 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 1.0 \n", "1 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 2.0 \n", "2 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 3.0 \n", "3 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 4.0 \n", "4 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 5.0 \n", "\n", " aryl_halide aryl_halide_smiles additive_number \\\n", "0 1-chloro-4-(trifluoromethyl)benzene FC(F)(F)c1ccc(Cl)cc1 NaN \n", "1 1-bromo-4-(trifluoromethyl)benzene FC(F)(F)c1ccc(Br)cc1 NaN \n", "2 1-iodo-4-(trifluoromethyl)benzene FC(F)(F)c1ccc(I)cc1 NaN \n", "3 1-chloro-4-methoxybenzene COc1ccc(Cl)cc1 NaN \n", "4 1-bromo-4-methoxybenzene COc1ccc(Br)cc1 NaN \n", "\n", " additive additive_smiles product_smiles yield \n", "0 NaN NaN Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1 26.888615 \n", "1 NaN NaN Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1 24.063224 \n", "2 NaN NaN Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1 47.515821 \n", "3 NaN NaN COc1ccc(Nc2ccc(C)cc2)cc1 2.126831 \n", "4 NaN NaN COc1ccc(Nc2ccc(C)cc2)cc1 47.586354 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "plates.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "descriptors = pd.read_csv(\"data/rxnpredict/output_table_modified.csv\")" ] }, { "cell_type": "code", "execution_count": 5, 
"metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
additive_*C3_NMR_shiftadditive_*C3_electrostatic_chargeadditive_*C4_NMR_shiftadditive_*C4_electrostatic_chargeadditive_*C5_NMR_shiftadditive_*C5_electrostatic_chargeadditive_*N1_electrostatic_chargeadditive_*O1_electrostatic_chargeadditive_E_HOMOadditive_E_LUMO...ligand_V6_intensityligand_V7_frequencyligand_V7_intensityligand_V8_frequencyligand_V8_intensityligand_V9_frequencyligand_V9_intensityligand_dipole_momentplaterow
0143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...4.4143026.56116.5773043.09718.1453064.34438.211.21292411
1143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...4.4143026.56116.5773043.09718.1453064.34438.211.21292411
2143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...4.4143026.56116.5773043.09718.1453064.34438.211.21292411
3143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...4.4143026.56116.5773043.09718.1453064.34438.211.21292411
4143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...4.4143026.56116.5773043.09718.1453064.34438.211.21292411
\n", "

5 rows × 123 columns

\n", "
" ], "text/plain": [ " additive_*C3_NMR_shift additive_*C3_electrostatic_charge \\\n", "0 143.12 0.223 \n", "1 143.12 0.223 \n", "2 143.12 0.223 \n", "3 143.12 0.223 \n", "4 143.12 0.223 \n", "\n", " additive_*C4_NMR_shift additive_*C4_electrostatic_charge \\\n", "0 93.06 -0.447 \n", "1 93.06 -0.447 \n", "2 93.06 -0.447 \n", "3 93.06 -0.447 \n", "4 93.06 -0.447 \n", "\n", " additive_*C5_NMR_shift additive_*C5_electrostatic_charge \\\n", "0 162.34 0.292 \n", "1 162.34 0.292 \n", "2 162.34 0.292 \n", "3 162.34 0.292 \n", "4 162.34 0.292 \n", "\n", " additive_*N1_electrostatic_charge additive_*O1_electrostatic_charge \\\n", "0 -0.334 -0.057 \n", "1 -0.334 -0.057 \n", "2 -0.334 -0.057 \n", "3 -0.334 -0.057 \n", "4 -0.334 -0.057 \n", "\n", " additive_E_HOMO additive_E_LUMO ... ligand_V6_intensity \\\n", "0 -0.2317 -0.0487 ... 4.414 \n", "1 -0.2317 -0.0487 ... 4.414 \n", "2 -0.2317 -0.0487 ... 4.414 \n", "3 -0.2317 -0.0487 ... 4.414 \n", "4 -0.2317 -0.0487 ... 4.414 \n", "\n", " ligand_V7_frequency ligand_V7_intensity ligand_V8_frequency \\\n", "0 3026.561 16.577 3043.097 \n", "1 3026.561 16.577 3043.097 \n", "2 3026.561 16.577 3043.097 \n", "3 3026.561 16.577 3043.097 \n", "4 3026.561 16.577 3043.097 \n", "\n", " ligand_V8_intensity ligand_V9_frequency ligand_V9_intensity \\\n", "0 18.145 3064.344 38.21 \n", "1 18.145 3064.344 38.21 \n", "2 18.145 3064.344 38.21 \n", "3 18.145 3064.344 38.21 \n", "4 18.145 3064.344 38.21 \n", "\n", " ligand_dipole_moment plate row \n", "0 1.212924 1 1 \n", "1 1.212924 1 1 \n", "2 1.212924 1 1 \n", "3 1.212924 1 1 \n", "4 1.212924 1 1 \n", "\n", "[5 rows x 123 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "descriptors.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Join on common columns\n", "\n", "# Pull out just what we need\n", "plates_join = plates[[\"plate\", \"row\", \"col\", \"yield\"]]" ] }, { "cell_type": "code", 
"execution_count": 7, "metadata": {}, "outputs": [], "source": [ "dataset = pd.merge(descriptors, plates, on=[\"plate\", \"row\", \"col\"])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
additive_*C3_NMR_shiftadditive_*C3_electrostatic_chargeadditive_*C4_NMR_shiftadditive_*C4_electrostatic_chargeadditive_*C5_NMR_shiftadditive_*C5_electrostatic_chargeadditive_*N1_electrostatic_chargeadditive_*O1_electrostatic_chargeadditive_E_HOMOadditive_E_LUMO...ligand_cas_numberligand_smilesaryl_halide_numberaryl_halidearyl_halide_smilesadditive_numberadditiveadditive_smilesproduct_smilesyield
0143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...1.01-chloro-4-(trifluoromethyl)benzeneFC(F)(F)c1ccc(Cl)cc1NaNNaNNaNCc1ccc(Nc2ccc(C(F)(F)F)cc2)cc126.888615
1143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...2.01-bromo-4-(trifluoromethyl)benzeneFC(F)(F)c1ccc(Br)cc1NaNNaNNaNCc1ccc(Nc2ccc(C(F)(F)F)cc2)cc124.063224
2143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...3.01-iodo-4-(trifluoromethyl)benzeneFC(F)(F)c1ccc(I)cc1NaNNaNNaNCc1ccc(Nc2ccc(C(F)(F)F)cc2)cc147.515821
3143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...4.01-chloro-4-methoxybenzeneCOc1ccc(Cl)cc1NaNNaNNaNCOc1ccc(Nc2ccc(C)cc2)cc12.126831
4143.120.22393.06-0.447162.340.292-0.334-0.057-0.2317-0.0487...564483-18-7CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...5.01-bromo-4-methoxybenzeneCOc1ccc(Br)cc1NaNNaNNaNCOc1ccc(Nc2ccc(C)cc2)cc147.586354
\n", "

5 rows × 137 columns

\n", "
" ], "text/plain": [ " additive_*C3_NMR_shift additive_*C3_electrostatic_charge \\\n", "0 143.12 0.223 \n", "1 143.12 0.223 \n", "2 143.12 0.223 \n", "3 143.12 0.223 \n", "4 143.12 0.223 \n", "\n", " additive_*C4_NMR_shift additive_*C4_electrostatic_charge \\\n", "0 93.06 -0.447 \n", "1 93.06 -0.447 \n", "2 93.06 -0.447 \n", "3 93.06 -0.447 \n", "4 93.06 -0.447 \n", "\n", " additive_*C5_NMR_shift additive_*C5_electrostatic_charge \\\n", "0 162.34 0.292 \n", "1 162.34 0.292 \n", "2 162.34 0.292 \n", "3 162.34 0.292 \n", "4 162.34 0.292 \n", "\n", " additive_*N1_electrostatic_charge additive_*O1_electrostatic_charge \\\n", "0 -0.334 -0.057 \n", "1 -0.334 -0.057 \n", "2 -0.334 -0.057 \n", "3 -0.334 -0.057 \n", "4 -0.334 -0.057 \n", "\n", " additive_E_HOMO additive_E_LUMO ... ligand_cas_number \\\n", "0 -0.2317 -0.0487 ... 564483-18-7 \n", "1 -0.2317 -0.0487 ... 564483-18-7 \n", "2 -0.2317 -0.0487 ... 564483-18-7 \n", "3 -0.2317 -0.0487 ... 564483-18-7 \n", "4 -0.2317 -0.0487 ... 564483-18-7 \n", "\n", " ligand_smiles aryl_halide_number \\\n", "0 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 1.0 \n", "1 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 2.0 \n", "2 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 3.0 \n", "3 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 4.0 \n", "4 CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)... 
5.0 \n", "\n", " aryl_halide aryl_halide_smiles additive_number \\\n", "0 1-chloro-4-(trifluoromethyl)benzene FC(F)(F)c1ccc(Cl)cc1 NaN \n", "1 1-bromo-4-(trifluoromethyl)benzene FC(F)(F)c1ccc(Br)cc1 NaN \n", "2 1-iodo-4-(trifluoromethyl)benzene FC(F)(F)c1ccc(I)cc1 NaN \n", "3 1-chloro-4-methoxybenzene COc1ccc(Cl)cc1 NaN \n", "4 1-bromo-4-methoxybenzene COc1ccc(Br)cc1 NaN \n", "\n", " additive additive_smiles product_smiles yield \n", "0 NaN NaN Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1 26.888615 \n", "1 NaN NaN Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1 24.063224 \n", "2 NaN NaN Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1 47.515821 \n", "3 NaN NaN COc1ccc(Nc2ccc(C)cc2)cc1 2.126831 \n", "4 NaN NaN COc1ccc(Nc2ccc(C)cc2)cc1 47.586354 \n", "\n", "[5 rows x 137 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using SciKitLearn to Fit" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "X = dataset[descriptors.columns].to_numpy()\n", "Y = dataset[\"yield\"].to_numpy()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "sc = StandardScaler()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "X_fit = sc.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on class RandomForestRegressor in module sklearn.ensemble._forest:\n", "\n", "class RandomForestRegressor(ForestRegressor)\n", " | RandomForestRegressor(n_estimators=100, *, criterion='mse', max_depth=None, 
min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)\n", " | \n", " | A random forest regressor.\n", " | \n", " | A random forest is a meta estimator that fits a number of classifying\n", " | decision trees on various sub-samples of the dataset and uses averaging\n", " | to improve the predictive accuracy and control over-fitting.\n", " | The sub-sample size is controlled with the `max_samples` parameter if\n", " | `bootstrap=True` (default), otherwise the whole dataset is used to build\n", " | each tree.\n", " | \n", " | Read more in the :ref:`User Guide `.\n", " | \n", " | Parameters\n", " | ----------\n", " | n_estimators : int, default=100\n", " | The number of trees in the forest.\n", " | \n", " | .. versionchanged:: 0.22\n", " | The default value of ``n_estimators`` changed from 10 to 100\n", " | in 0.22.\n", " | \n", " | criterion : {\"mse\", \"mae\"}, default=\"mse\"\n", " | The function to measure the quality of a split. Supported criteria\n", " | are \"mse\" for the mean squared error, which is equal to variance\n", " | reduction as feature selection criterion, and \"mae\" for the mean\n", " | absolute error.\n", " | \n", " | .. versionadded:: 0.18\n", " | Mean Absolute Error (MAE) criterion.\n", " | \n", " | max_depth : int, default=None\n", " | The maximum depth of the tree. 
If None, then nodes are expanded until\n", " | all leaves are pure or until all leaves contain less than\n", " | min_samples_split samples.\n", " | \n", " | min_samples_split : int or float, default=2\n", " | The minimum number of samples required to split an internal node:\n", " | \n", " | - If int, then consider `min_samples_split` as the minimum number.\n", " | - If float, then `min_samples_split` is a fraction and\n", " | `ceil(min_samples_split * n_samples)` are the minimum\n", " | number of samples for each split.\n", " | \n", " | .. versionchanged:: 0.18\n", " | Added float values for fractions.\n", " | \n", " | min_samples_leaf : int or float, default=1\n", " | The minimum number of samples required to be at a leaf node.\n", " | A split point at any depth will only be considered if it leaves at\n", " | least ``min_samples_leaf`` training samples in each of the left and\n", " | right branches. This may have the effect of smoothing the model,\n", " | especially in regression.\n", " | \n", " | - If int, then consider `min_samples_leaf` as the minimum number.\n", " | - If float, then `min_samples_leaf` is a fraction and\n", " | `ceil(min_samples_leaf * n_samples)` are the minimum\n", " | number of samples for each node.\n", " | \n", " | .. versionchanged:: 0.18\n", " | Added float values for fractions.\n", " | \n", " | min_weight_fraction_leaf : float, default=0.0\n", " | The minimum weighted fraction of the sum total of weights (of all\n", " | the input samples) required to be at a leaf node. 
Samples have\n", " | equal weight when sample_weight is not provided.\n", " | \n", " | max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n", " | The number of features to consider when looking for the best split:\n", " | \n", " | - If int, then consider `max_features` features at each split.\n", " | - If float, then `max_features` is a fraction and\n", " | `round(max_features * n_features)` features are considered at each\n", " | split.\n", " | - If \"auto\", then `max_features=n_features`.\n", " | - If \"sqrt\", then `max_features=sqrt(n_features)`.\n", " | - If \"log2\", then `max_features=log2(n_features)`.\n", " | - If None, then `max_features=n_features`.\n", " | \n", " | Note: the search for a split does not stop until at least one\n", " | valid partition of the node samples is found, even if it requires to\n", " | effectively inspect more than ``max_features`` features.\n", " | \n", " | max_leaf_nodes : int, default=None\n", " | Grow trees with ``max_leaf_nodes`` in best-first fashion.\n", " | Best nodes are defined as relative reduction in impurity.\n", " | If None then unlimited number of leaf nodes.\n", " | \n", " | min_impurity_decrease : float, default=0.0\n", " | A node will be split if this split induces a decrease of the impurity\n", " | greater than or equal to this value.\n", " | \n", " | The weighted impurity decrease equation is the following::\n", " | \n", " | N_t / N * (impurity - N_t_R / N_t * right_impurity\n", " | - N_t_L / N_t * left_impurity)\n", " | \n", " | where ``N`` is the total number of samples, ``N_t`` is the number of\n", " | samples at the current node, ``N_t_L`` is the number of samples in the\n", " | left child, and ``N_t_R`` is the number of samples in the right child.\n", " | \n", " | ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n", " | if ``sample_weight`` is passed.\n", " | \n", " | .. 
versionadded:: 0.19\n", " | \n", " | min_impurity_split : float, default=None\n", " | Threshold for early stopping in tree growth. A node will split\n", " | if its impurity is above the threshold, otherwise it is a leaf.\n", " | \n", " | .. deprecated:: 0.19\n", " | ``min_impurity_split`` has been deprecated in favor of\n", " | ``min_impurity_decrease`` in 0.19. The default value of\n", " | ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it\n", " | will be removed in 1.0 (renaming of 0.25).\n", " | Use ``min_impurity_decrease`` instead.\n", " | \n", " | bootstrap : bool, default=True\n", " | Whether bootstrap samples are used when building trees. If False, the\n", " | whole dataset is used to build each tree.\n", " | \n", " | oob_score : bool, default=False\n", " | Whether to use out-of-bag samples to estimate the generalization score.\n", " | Only available if bootstrap=True.\n", " | \n", " | n_jobs : int, default=None\n", " | The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n", " | :meth:`decision_path` and :meth:`apply` are all parallelized over the\n", " | trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n", " | context. ``-1`` means using all processors. See :term:`Glossary\n", " | ` for more details.\n", " | \n", " | random_state : int, RandomState instance or None, default=None\n", " | Controls both the randomness of the bootstrapping of the samples used\n", " | when building trees (if ``bootstrap=True``) and the sampling of the\n", " | features to consider when looking for the best split at each node\n", " | (if ``max_features < n_features``).\n", " | See :term:`Glossary ` for details.\n", " | \n", " | verbose : int, default=0\n", " | Controls the verbosity when fitting and predicting.\n", " | \n", " | warm_start : bool, default=False\n", " | When set to ``True``, reuse the solution of the previous call to fit\n", " | and add more estimators to the ensemble, otherwise, just fit a whole\n", " | new forest. 
See :term:`the Glossary `.\n", " | \n", " | ccp_alpha : non-negative float, default=0.0\n", " | Complexity parameter used for Minimal Cost-Complexity Pruning. The\n", " | subtree with the largest cost complexity that is smaller than\n", " | ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n", " | :ref:`minimal_cost_complexity_pruning` for details.\n", " | \n", " | .. versionadded:: 0.22\n", " | \n", " | max_samples : int or float, default=None\n", " | If bootstrap is True, the number of samples to draw from X\n", " | to train each base estimator.\n", " | \n", " | - If None (default), then draw `X.shape[0]` samples.\n", " | - If int, then draw `max_samples` samples.\n", " | - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n", " | `max_samples` should be in the interval `(0, 1)`.\n", " | \n", " | .. versionadded:: 0.22\n", " | \n", " | Attributes\n", " | ----------\n", " | base_estimator_ : DecisionTreeRegressor\n", " | The child estimator template used to create the collection of fitted\n", " | sub-estimators.\n", " | \n", " | estimators_ : list of DecisionTreeRegressor\n", " | The collection of fitted sub-estimators.\n", " | \n", " | feature_importances_ : ndarray of shape (n_features,)\n", " | The impurity-based feature importances.\n", " | The higher, the more important the feature.\n", " | The importance of a feature is computed as the (normalized)\n", " | total reduction of the criterion brought by that feature. It is also\n", " | known as the Gini importance.\n", " | \n", " | Warning: impurity-based feature importances can be misleading for\n", " | high cardinality features (many unique values). 
See\n", " | :func:`sklearn.inspection.permutation_importance` as an alternative.\n", " | \n", " | n_features_ : int\n", " | The number of features when ``fit`` is performed.\n", " | \n", " | n_outputs_ : int\n", " | The number of outputs when ``fit`` is performed.\n", " | \n", " | oob_score_ : float\n", " | Score of the training dataset obtained using an out-of-bag estimate.\n", " | This attribute exists only when ``oob_score`` is True.\n", " | \n", " | oob_prediction_ : ndarray of shape (n_samples,)\n", " | Prediction computed with out-of-bag estimate on the training set.\n", " | This attribute exists only when ``oob_score`` is True.\n", " | \n", " | See Also\n", " | --------\n", " | DecisionTreeRegressor, ExtraTreesRegressor\n", " | \n", " | Notes\n", " | -----\n", " | The default values for the parameters controlling the size of the trees\n", " | (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n", " | unpruned trees which can potentially be very large on some data sets. To\n", " | reduce memory consumption, the complexity and size of the trees should be\n", " | controlled by setting those parameter values.\n", " | \n", " | The features are always randomly permuted at each split. Therefore,\n", " | the best found split may vary, even with the same training data,\n", " | ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n", " | of the criterion is identical for several splits enumerated during the\n", " | search of the best split. To obtain a deterministic behaviour during\n", " | fitting, ``random_state`` has to be fixed.\n", " | \n", " | The default value ``max_features=\"auto\"`` uses ``n_features``\n", " | rather than ``n_features / 3``. The latter was originally suggested in\n", " | [1], whereas the former was more recently justified empirically in [2].\n", " | \n", " | References\n", " | ----------\n", " | .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n", " | \n", " | .. [2] P. 
Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized\n", " | trees\", Machine Learning, 63(1), 3-42, 2006.\n", " | \n", " | Examples\n", " | --------\n", " | >>> from sklearn.ensemble import RandomForestRegressor\n", " | >>> from sklearn.datasets import make_regression\n", " | >>> X, y = make_regression(n_features=4, n_informative=2,\n", " | ... random_state=0, shuffle=False)\n", " | >>> regr = RandomForestRegressor(max_depth=2, random_state=0)\n", " | >>> regr.fit(X, y)\n", " | RandomForestRegressor(...)\n", " | >>> print(regr.predict([[0, 0, 0, 0]]))\n", " | [-8.32987858]\n", " | \n", " | Method resolution order:\n", " | RandomForestRegressor\n", " | ForestRegressor\n", " | sklearn.base.RegressorMixin\n", " | BaseForest\n", " | sklearn.base.MultiOutputMixin\n", " | sklearn.ensemble._base.BaseEnsemble\n", " | sklearn.base.MetaEstimatorMixin\n", " | sklearn.base.BaseEstimator\n", " | builtins.object\n", " | \n", " | Methods defined here:\n", " | \n", " | __init__(self, n_estimators=100, *, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)\n", " | Initialize self. 
See help(type(self)) for accurate signature.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data and other attributes defined here:\n", " | \n", " | __abstractmethods__ = frozenset()\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from ForestRegressor:\n", " | \n", " | predict(self, X)\n", " | Predict regression target for X.\n", " | \n", " | The predicted regression target of an input sample is computed as the\n", " | mean predicted regression targets of the trees in the forest.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n", " | The predicted values.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from sklearn.base.RegressorMixin:\n", " | \n", " | score(self, X, y, sample_weight=None)\n", " | Return the coefficient of determination :math:`R^2` of the\n", " | prediction.\n", " | \n", " | The coefficient :math:`R^2` is defined as :math:`(1 - \\frac{u}{v})`,\n", " | where :math:`u` is the residual sum of squares ``((y_true - y_pred)\n", " | ** 2).sum()`` and :math:`v` is the total sum of squares ``((y_true -\n", " | y_true.mean()) ** 2).sum()``. The best possible score is 1.0 and it\n", " | can be negative (because the model can be arbitrarily worse). 
A\n", " | constant model that always predicts the expected value of `y`,\n", " | disregarding the input features, would get a :math:`R^2` score of\n", " | 0.0.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : array-like of shape (n_samples, n_features)\n", " | Test samples. For some estimators this may be a precomputed\n", " | kernel matrix or a list of generic objects instead with shape\n", " | ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``\n", " | is the number of samples used in the fitting for the estimator.\n", " | \n", " | y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n", " | True values for `X`.\n", " | \n", " | sample_weight : array-like of shape (n_samples,), default=None\n", " | Sample weights.\n", " | \n", " | Returns\n", " | -------\n", " | score : float\n", " | :math:`R^2` of ``self.predict(X)`` wrt. `y`.\n", " | \n", " | Notes\n", " | -----\n", " | The :math:`R^2` score used when calling ``score`` on a regressor uses\n", " | ``multioutput='uniform_average'`` from version 0.23 to keep consistent\n", " | with default value of :func:`~sklearn.metrics.r2_score`.\n", " | This influences the ``score`` method of all the multioutput\n", " | regressors (except for\n", " | :class:`~sklearn.multioutput.MultiOutputRegressor`).\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data descriptors inherited from sklearn.base.RegressorMixin:\n", " | \n", " | __dict__\n", " | dictionary for instance variables (if defined)\n", " | \n", " | __weakref__\n", " | list of weak references to the object (if defined)\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from BaseForest:\n", " | \n", " | apply(self, X)\n", " | Apply trees in the forest to X, return leaf indices.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. 
Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | X_leaves : ndarray of shape (n_samples, n_estimators)\n", " | For each datapoint x in X and for each tree in the forest,\n", " | return the index of the leaf x ends up in.\n", " | \n", " | decision_path(self, X)\n", " | Return the decision path in the forest.\n", " | \n", " | .. versionadded:: 0.18\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | indicator : sparse matrix of shape (n_samples, n_nodes)\n", " | Return a node indicator matrix where non zero elements indicates\n", " | that the samples goes through the nodes. The matrix is of CSR\n", " | format.\n", " | \n", " | n_nodes_ptr : ndarray of shape (n_estimators + 1,)\n", " | The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]\n", " | gives the indicator value for the i-th estimator.\n", " | \n", " | fit(self, X, y, sample_weight=None)\n", " | Build a forest of trees from the training set (X, y).\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The training input samples. Internally, its dtype will be converted\n", " | to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csc_matrix``.\n", " | \n", " | y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n", " | The target values (class labels in classification, real numbers in\n", " | regression).\n", " | \n", " | sample_weight : array-like of shape (n_samples,), default=None\n", " | Sample weights. 
If None, then samples are equally weighted. Splits\n", " | that would create child nodes with net zero or negative weight are\n", " | ignored while searching for a split in each node. In the case of\n", " | classification, splits are also ignored if they would result in any\n", " | single class carrying a negative weight in either child node.\n", " | \n", " | Returns\n", " | -------\n", " | self : object\n", " | \n", " | ----------------------------------------------------------------------\n", " | Readonly properties inherited from BaseForest:\n", " | \n", " | feature_importances_\n", " | The impurity-based feature importances.\n", " | \n", " | The higher, the more important the feature.\n", " | The importance of a feature is computed as the (normalized)\n", " | total reduction of the criterion brought by that feature. It is also\n", " | known as the Gini importance.\n", " | \n", " | Warning: impurity-based feature importances can be misleading for\n", " | high cardinality features (many unique values). 
See\n", " | :func:`sklearn.inspection.permutation_importance` as an alternative.\n", " | \n", " | Returns\n", " | -------\n", " | feature_importances_ : ndarray of shape (n_features,)\n", " | The values of this array sum to 1, unless all trees are single node\n", " | trees consisting of only the root node, in which case it will be an\n", " | array of zeros.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from sklearn.ensemble._base.BaseEnsemble:\n", " | \n", " | __getitem__(self, index)\n", " | Return the index'th estimator in the ensemble.\n", " | \n", " | __iter__(self)\n", " | Return iterator over estimators in the ensemble.\n", " | \n", " | __len__(self)\n", " | Return the number of estimators in the ensemble.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data and other attributes inherited from sklearn.ensemble._base.BaseEnsemble:\n", " | \n", " | __annotations__ = {'_required_parameters': typing.List[str]}\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from sklearn.base.BaseEstimator:\n", " | \n", " | __getstate__(self)\n", " | \n", " | __repr__(self, N_CHAR_MAX=700)\n", " | Return repr(self).\n", " | \n", " | __setstate__(self, state)\n", " | \n", " | get_params(self, deep=True)\n", " | Get parameters for this estimator.\n", " | \n", " | Parameters\n", " | ----------\n", " | deep : bool, default=True\n", " | If True, will return the parameters for this estimator and\n", " | contained subobjects that are estimators.\n", " | \n", " | Returns\n", " | -------\n", " | params : dict\n", " | Parameter names mapped to their values.\n", " | \n", " | set_params(self, **params)\n", " | Set the parameters of this estimator.\n", " | \n", " | The method works on simple estimators as well as on nested objects\n", " | (such as :class:`~sklearn.pipeline.Pipeline`). 
The latter have\n", " | parameters of the form ``<component>__<parameter>`` so that it's\n", " | possible to update each component of a nested object.\n", " | \n", " | Parameters\n", " | ----------\n", " | **params : dict\n", " | Estimator parameters.\n", " | \n", " | Returns\n", " | -------\n", " | self : estimator instance\n", " | Estimator instance.\n", "\n" ] } ], "source": [ "help(RandomForestRegressor)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor()" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regressor = RandomForestRegressor(random_state=0)\n", "regressor.fit(X_fit, Y)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "y_pred = regressor.predict(X_fit)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots()\n", "ax.set(xlabel=\"Observed\", ylabel=\"Predicted\", title=\"Predictions on the training data\")\n", "ax.scatter(Y, y_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training Testing Split\n", "\n", "The plot above scores the model on the same rows it was fitted to, which overstates how well it predicts. Hold out 30% of the rows as a test set, fit on the remaining 70%, and evaluate on the held-out rows only. `random_state` is fixed so the split and the forest are the same on every run." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "from sklearn.linear_model import LinearRegression" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, Y_train, Y_test = train_test_split(X_fit, Y, test_size=0.30, random_state=0)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor()" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regressor = RandomForestRegressor(random_state=0)\n", "regressor.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "y_pred = regressor.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots()\n", "ax.set(xlabel=\"Observed\", ylabel=\"Predicted\", title=\"Predictions on the held-out test set\")\n", "ax.scatter(Y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "from sklearn import metrics\n", "import math" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8545731030213027" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metrics.r2_score(Y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9.966163336163948" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "math.sqrt(metrics.mean_squared_error(Y_test, y_pred))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 4 }