forked from wkdaniel3/Bayesian-Analysis-for-Wine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
FE.py
96 lines (71 loc) · 3.85 KB
/
FE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.feature_extraction import text
from sklearn.preprocessing import OneHotEncoder
from nltk.corpus import stopwords
import nltk.stem
import json
# Shared English Snowball stemmer; StemmedCountVectorizer's analyzer calls
# stemmer.stem() on every token it produces.
stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(text.CountVectorizer):
    """CountVectorizer subclass whose analyzer stems each token it yields.

    The stock analyzer built by the parent class is run first; every token
    it returns is then passed through the module-level ``stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Tokenise with the parent's analyzer, then stem each token.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
def text_process(wine, merged=True, one_hot=False):
    """Build a stemmed term-count matrix from the ``description`` column.

    The input frame is deep-copied, so the caller's DataFrame is not modified.

    Parameters
    ----------
    wine : pandas.DataFrame
        Must contain a ``description`` column, and a ``color`` column when
        ``one_hot`` is True.
    merged : bool, default True
        When exactly ``False`` (and ``one_hot`` is not True), return only the
        term-count columns instead of the original columns plus term counts.
    one_hot : bool, default False
        When exactly ``True``, drop ``color`` and append its one-hot encoding
        after the term-count columns. This takes precedence over ``merged``.

    Returns
    -------
    pandas.DataFrame or str
        The assembled frame. If the pieces do not have the same number of
        rows, the string ``'mismatched indices, could not properly
        concatenate'`` is returned instead (kept for backward compatibility).
    """
    mismatch_message = 'mismatched indices, could not properly concatenate'

    # reset_index() moves the current index into a regular column and gives the
    # copy a positional index, which lines up with the sparse frames below.
    wine = wine.copy(deep=True).reset_index()

    # Strip digit runs from the reviews. regex=True is explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    wine['description'] = wine['description'].str.replace(r'\d+', '', regex=True)

    # Creating TF-Matrix
    vectorizer = StemmedCountVectorizer(analyzer="word", stop_words='english')
    term_counts = vectorizer.fit_transform(wine['description'])
    processed_reviews = pd.DataFrame.sparse.from_spmatrix(term_counts)

    # vocabulary_ maps term -> column index, and its key order is not the
    # column order; sort by the stored index so each label names its own column.
    vocabulary = vectorizer.vocabulary_
    term_cols = sorted(vocabulary, key=vocabulary.get)

    n_rows = wine.shape[0]

    # Converting color to one-hot encoding
    if one_hot is True:
        encoder = OneHotEncoder()
        encoded = pd.DataFrame.sparse.from_spmatrix(encoder.fit_transform(wine[['color']]))
        wine = wine.drop(columns=['color'])

        if n_rows != processed_reviews.shape[0] or n_rows != encoded.shape[0]:
            return mismatch_message

        # Rebuild the legacy 'x0_<category>' names from categories_;
        # encoder.get_feature_names() no longer exists in scikit-learn >= 1.2.
        color_cols = [
            ('x0_' + str(category)).replace('-', '_').replace(' ', '_').lower()
            for category in encoder.categories_[0]
        ]
        wine_merged = pd.concat([wine, processed_reviews, encoded], axis=1, ignore_index=True)
        wine_merged.columns = list(wine.columns) + term_cols + color_cols
        return wine_merged

    if merged is False:
        processed_reviews.columns = term_cols
        return processed_reviews

    if n_rows != processed_reviews.shape[0]:
        return mismatch_message

    wine_merged = pd.concat([wine, processed_reviews], axis=1, ignore_index=True)
    wine_merged.columns = list(wine.columns) + term_cols
    return wine_merged
def FE(wine, country_cutoff=1000):
    """Add engineered feature columns to a wine-review DataFrame.

    ``wine`` is modified in place (its index becomes the ``id`` column and new
    columns are added) and the same object is returned.

    Reads ``country_dict.json`` (country -> continent) and ``wine_dict.json``
    (variety -> color) from the current working directory.

    Parameters
    ----------
    wine : pandas.DataFrame
        Must contain the columns ``id``, ``title``, ``country``, ``variety``,
        ``price`` and ``points``.
    country_cutoff : int, default 1000
        A country keeps its own name only if it has strictly more than this
        many rows with a non-null ``title``; every other country is relabelled
        ``'Other'``.

    Returns
    -------
    pandas.DataFrame
        ``wine`` with these columns added or rewritten:

        * ``year`` - first four-digit run in ``title`` not starting with 0,
          as a string (missing when the title has none).
        * ``continent`` - ``country`` mapped through ``country_dict.json``.
        * ``country`` - condensed as described under ``country_cutoff``;
          missing countries stay missing.
        * ``color`` - ``variety`` mapped through ``wine_dict.json``.
        * ``category`` - price bucket: 0 (<=15), 1 (<=30), 2 (<=50), 3 (rest).
        * ``score_descriptive`` - label derived from ``points``.

    Notes
    -----
    Comparisons against a missing value are False, so a missing ``price``
    falls through to category 3 and missing ``points`` to ``'Classic'``.
    """
    # Opening wine and country dictionaries
    with open('country_dict.json', 'r', encoding='utf-8') as fp:
        country_dict = json.load(fp)
    with open('wine_dict.json', 'r', encoding='utf-8') as fp:
        wine_dict = json.load(fp)

    wine.set_index('id', inplace=True)

    # creating 'year' feature (expand=False returns a Series, not a 1-col frame)
    wine['year'] = wine['title'].str.extract(r'([1-9][0-9][0-9][0-9])', expand=False)

    # creating 'continent' feature (must run before 'country' is condensed)
    wine['continent'] = wine['country'].map(country_dict)

    # condensing 'country' column values
    title_counts = wine.groupby('country')['title'].count()
    popular_countries = title_counts[title_counts > country_cutoff].index
    country = wine['country']
    # Exact membership rather than a regex/substring match: "Niger" must not
    # keep "Nigeria", and an empty popular list must not match every row.
    keep = country.isin(popular_countries) | country.isna()
    wine['country'] = np.where(keep, country, 'Other')

    # creating 'color' feature
    wine['color'] = wine['variety'].map(wine_dict)

    # creating price 'category' feature; np.select takes the first true condition
    price = wine['price']
    wine['category'] = np.select(
        [price <= 15, price <= 30, price <= 50],
        [0, 1, 2],
        default=3,
    )

    # creating score categorical feature
    points = wine['points']
    wine['score_descriptive'] = np.select(
        [points <= 80, points <= 82, points <= 86,
         points <= 89, points <= 93, points <= 97],
        ['bad', 'Acceptable', 'Good', 'Very Good', 'Excellent', 'Superb'],
        default='Classic',
    )
    return wine