-
Notifications
You must be signed in to change notification settings - Fork 2
/
lead-scorer.py
97 lines (75 loc) · 3.16 KB
/
lead-scorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
import numpy as np
# Load the leads/customers export; every later step works on this frame.
df = pd.read_csv("data/leads-and-customers.csv")
df.head()

# Blank out the job title on every row whose uniform(0, 1) draw exceeds 0.92
# (presumably to simulate missing data -- confirm intent), then count how
# many titles ended up null.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
df['job_title'] = np.where(np.random.uniform(0, 1, len(df)) > 0.92, np.nan, df.job_title)
df.job_title.isnull().value_counts()

# Replace the missing titles with a sentinel so the string match below
# always yields a boolean.
df.job_title = df.job_title.fillna("UNK")

# Flag leads whose title mentions a management role. case=False is the
# case-insensitive regex match; it replaces flags=re.IGNORECASE, which
# raised NameError because this file never imports `re`.
df['is_manager'] = df.job_title.str.contains("manager|director|supervisor", case=False)
df.head()
# One-hot encode each categorical column and attach the indicator columns
# to df. The "<column>=" prefix plus get_dummies' default "_" separator
# produces names such as "acquisition_channel=_Cold Call".
for column in ("acquisition_channel", "company_size", "industry"):
    dummies = pd.get_dummies(df[column], prefix=column + "=")
    df[dummies.columns] = dummies
    # Row-normalised conversion rate per category; the result is not stored.
    pd.crosstab(df[column], df.converted, normalize='index')
# Print the row-normalised conversion table for each non-dummy feature,
# separated by a line of asterisks.
# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the bare `print x` statement, which is a SyntaxError on Python 3.
for f in ['is_manager', 'days_since_signup', 'visited_pricing', 'registered_for_webinar', 'attended_webinar', 'completed_form']:
    print(f)
    print(pd.crosstab(df[f], df.converted, normalize='index'))
    print("*" * 80)
# Model input columns. For each one-hot encoded categorical variable one
# level is deliberately left out as the baseline, so the indicators are not
# perfectly collinear (the "dummy variable trap").

# Columns used directly, without one-hot encoding.
_direct_features = [
    "is_manager",
    "days_since_signup",
    "completed_form",
    "visited_pricing",
    "registered_for_webinar",
    "attended_webinar",
]

# Baseline (omitted): "acquisition_channel=_Paid Search"
_channel_features = [
    "acquisition_channel=_Cold Call",
    "acquisition_channel=_Cold Email",
    "acquisition_channel=_Organic Search",
    "acquisition_channel=_Paid Leads",
]

# Baseline (omitted): "company_size=_1-10"
_company_size_features = [
    "company_size=_1000-10000",
    "company_size=_10001+",
    "company_size=_101-250",
    "company_size=_11-50",
    "company_size=_251-1000",
    "company_size=_51-100",
]

# Baseline (omitted): "industry=_Transportation"
_industry_features = [
    "industry=_Financial Services",
    "industry=_Furniture",
    "industry=_Heavy Manufacturing",
    "industry=_Scandanavion Design",
    "industry=_Web & Internet",
]

features = (
    _direct_features
    + _channel_features
    + _company_size_features
    + _industry_features
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve

# Model inputs and target, selected once and reused for fitting and scoring.
inputs = df[features]
target = df.converted

# Fit a logistic regression and a random forest on the full data set.
lm = LogisticRegression()
lm.fit(inputs, target)
rf = RandomForestClassifier()
rf.fit(inputs, target)

# NOTE: both models are scored on the same rows they were fitted on, so the
# reports and ROC values below are in-sample figures.
# print(...) with a single argument works on both Python 2 and Python 3.
print(classification_report(target, lm.predict(inputs)))
fpr, tpr, thresholds = roc_curve(target, lm.predict_proba(inputs)[:,1], pos_label=1)

# NOTE: this reassigns fpr/tpr/thresholds, so after this point they hold the
# random-forest values and the logistic-regression ones are gone.
print(classification_report(target, rf.predict(inputs)))
fpr, tpr, thresholds = roc_curve(target, rf.predict_proba(inputs)[:,1], pos_label=1)
from ggplot import *

# Collect the current fpr / tpr / thresholds arrays into one frame and draw
# tpr against fpr as a line, with a reference abline and equal axis scaling.
data = pd.DataFrame({
    "fpr": fpr,
    "tpr": tpr,
    "thresholds": thresholds,
})
(
    ggplot(data, aes(x='fpr', y='tpr'))
    + geom_line()
    + geom_abline()
    + coord_equal()
)

# Quick plot of column 1 of the random forest's predicted probabilities.
qplot(rf.predict_proba(df[features])[:,1])
# Score every lead with the random forest and cut the probabilities into
# five equal-width bins, labelled from "F" (lowest bin) to "A" (highest).
probs = pd.Series(rf.predict_proba(df[features])[:,1])
df['grade'] = grade = pd.cut(probs, 5, labels=["F","D","C","B","A"])

# Count leads per grade. The frame is built with explicit "index" / "grade"
# column names because value_counts().reset_index() names its columns
# differently across pandas versions, and the aes(...) mapping below needs
# exactly these two names.
grade_counts = df['grade'].value_counts()
lead_quality = pd.DataFrame(
    {"index": grade_counts.index, "grade": grade_counts.values},
    columns=["index", "grade"],
)

# DataFrame.sort() was removed from pandas; sort_values() is the replacement.
lead_quality = lead_quality.sort_values("index", ascending=False)
ggplot(lead_quality, aes(x='index', weight='grade')) + geom_bar()