-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel_utils.py
67 lines (46 loc) · 2.26 KB
/
model_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import numpy as np
import pandas as pd
from scipy.stats import boxcox
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
def split_train_test(df: pd.DataFrame, test_ids: list):
    """
    Partition a dataframe into train and test parts by index membership.

    Rows whose index label appears in ``test_ids`` go to the test part; every
    other row goes to the train part. The input dataframe is left untouched.

    :param df: the dataframe to split
    :param test_ids: index labels that select the test rows
    :return: a ``(train, test)`` tuple of dataframes
    """
    snapshot = df.copy()
    # Compute the membership mask once and reuse it for both halves.
    in_test = df.index.isin(test_ids)
    return snapshot[~in_test], snapshot[in_test]
def split_feature_target(train: pd.DataFrame, test: pd.DataFrame, target_col: str):
    """
    Separate the target column from the features for both train and test.

    The features are every column except ``target_col``; the target is kept as
    a single-column dataframe (not a Series). The inputs are not modified.

    :param train: the training dataframe, containing ``target_col``
    :param test: the test dataframe, containing ``target_col``
    :param target_col: name of the column holding the target
    :return: a ``(X_train, y_train, X_test, y_test)`` tuple of dataframes
    """
    def _features_and_target(frame):
        # Work on a copy so the caller's dataframe is never touched.
        snapshot = frame.copy()
        return snapshot.drop(columns=target_col), snapshot[[target_col]]

    X_train, y_train = _features_and_target(train)
    X_test, y_test = _features_and_target(test)
    return X_train, y_train, X_test, y_test
def mlp_fit_predict(mlp: MLPClassifier, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame):
    """
    Fit a multi-layer perceptron and score the test rows.

    The target dataframe is flattened to a 1-D array before fitting. The value
    returned for each test row is column 1 of ``predict_proba``'s output; for a
    binary problem this is presumably the positive-class probability (confirm
    against the classifier's class ordering).

    :param mlp: the classifier to fit (fitted in place)
    :param X_train: training features
    :param y_train: training target
    :param X_test: features to score
    :return: column 1 of the predicted probabilities, one value per test row
    """
    flat_target = np.ravel(y_train)
    mlp.fit(X_train, flat_target)
    class_probabilities = mlp.predict_proba(X_test)
    return class_probabilities[:, 1]
def preprocess_shots_data(data: pd.DataFrame):
    """
    Build a preprocessed copy of the shots dataframe.

    Steps, in order:
    - "duration" is replaced by its square root to reduce skewness.
    - "location_x" is replaced by its Box-Cox transform to reduce skewness; the
      Box-Cox lambda is estimated from the values passed in.
    - "minute" and "second" are dropped.
    - "outcome" becomes 1.0 where it equals "Goal" and 0.0 everywhere else.
    - "possession", "duration", "location_x" and "location_y" are rescaled with
      a MinMaxScaler fitted on this same dataframe.
    - The remaining listed feature columns are passed through
      ``pd.get_dummies(drop_first=True)`` and the resulting columns are
      appended after the untouched ones.

    The input dataframe is not modified.

    :param data: the df to be preprocessed
    :return: the dataframe already preprocessed
    """
    def _goal_flag(outcome):
        return 1.0 if outcome == "Goal" else 0.0

    shots = data.copy()

    # Skewness reduction; boxcox also returns the fitted lambda, unused here.
    shots["duration"] = np.sqrt(shots["duration"])
    transformed_x, _fitted_lambda = boxcox(shots["location_x"])
    shots["location_x"] = transformed_x

    shots = shots.drop(["minute", "second"], axis=1)
    shots["outcome"] = shots["outcome"].apply(_goal_flag)

    scaled_cols = ['possession', 'duration', 'location_x', 'location_y']
    shots[scaled_cols] = MinMaxScaler().fit_transform(shots[scaled_cols])

    encoded_cols = ['under_pressure', 'play', 'type', 'technique',
                    'body_part', 'first_time', 'one_on_one', 'aerial_won',
                    'pos', 'redirect', 'deflected', 'open_goal', 'follows_dribble']
    dummies = pd.get_dummies(shots[encoded_cols], drop_first=True)
    # Swap the raw columns for their dummy columns, keeping the latter last.
    return pd.concat([shots.drop(encoded_cols, axis=1), dummies], axis=1)