-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
77 lines (60 loc) · 2.6 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Load the transactions; the CSV path is relative to the working directory.
data = pd.read_csv('credit_card_fraud.csv')
print('---- data (head)')
print(data.head())

# Show the distribution of the target and of the categorical country column.
print('---- categorical features')
print(data.fraudulent.value_counts())
print(data.card_country.value_counts())

# One-hot encode the card country into indicator columns prefixed with "cc_".
print('---- encoded countries')
encoded_countries = pd.get_dummies(data.card_country, prefix='cc')
print(encoded_countries.head())
data = data.join(encoded_countries)

print('---- prepare data')
print(data.head())
y = data.fraudulent
# NOTE(review): the country indicators are hard-coded. A country missing from
# the CSV raises KeyError here, and any other country is silently left out of
# the features -- confirm AU/GB/US is the intended feature set.
X = data[['amount', 'card_use_24h', 'cc_AU', 'cc_GB', 'cc_US']]
# A fixed seed makes the reported scores and ROC curves reproducible between
# runs; stratifying on the target keeps the fraud ratio the same in the train
# and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
print('---- normalize data')
# Degree-2 polynomial expansion, fitted on the training features only.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
# Standardize the expanded training features; `scaler` keeps the fitted
# statistics so the same scaling can be applied to other data later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
print('---- model')
# Each entry is (display name, fitted estimator, flag). The flag is True when
# the estimator should be evaluated through predict_proba (ROC curve).
models = []

lr_model = LogisticRegression().fit(X_train_scaled, y_train)
print('logistic regression: coef: ' + str(lr_model.coef_))
print('logistic regression: intercept: ' + str(lr_model.intercept_))
models.append(('Logistic regression', lr_model, True))

# This is a single decision tree, so it is labelled as such (it was previously
# reported as "Random forest").
dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=5).fit(X_train_scaled, y_train)
models.append(('Decision tree', dt_model, True))

# SGD with hinge loss does not provide predict_proba, hence the False flag.
clf_model = SGDClassifier(loss="hinge", penalty="l2").fit(X_train_scaled, y_train)
models.append(('SGD', clf_model, False))
print('---- test')
figure = 0
# Apply the transformers that were fitted on the training data; the test set
# must only be transformed, never used to (re-)fit them.
X_test_poly = poly.transform(X_test)
X_test_scaled = scaler.transform(X_test_poly)
for name, estimator, has_proba in models:
    print('Score for %s: %s' % (name, estimator.score(X_test_scaled, y_test)))
    if not has_proba:
        # No probability estimates available -> no ROC curve for this model.
        continue

    figure += 1
    # Column 1 holds the probability of the second class in estimator.classes_
    # (assumed to be the "fraudulent" class -- TODO confirm label encoding).
    positive_scores = estimator.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_score = roc_auc_score(y_test, positive_scores)

    # One numbered figure per model so all curves stay open side by side.
    plt.figure(figure)
    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label='ROC curve (area = %0.2f)' % auc_score)
    # Diagonal reference line (FPR == TPR).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (%s)' % name)
    plt.legend(loc="lower right")

# Display every ROC figure created above.
plt.show()