Skip to content

Commit 7671f98

Browse files
edit: submission ready
1 parent 94f3b09 commit 7671f98

File tree

4 files changed

+73
-177
lines changed

4 files changed

+73
-177
lines changed

task1.ipynb

+71-24
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
},
2222
{
2323
"cell_type": "code",
24-
"execution_count": 3,
24+
"execution_count": 5,
2525
"metadata": {},
2626
"outputs": [
2727
{
@@ -75,42 +75,99 @@
7575
" <td>2</td>\n",
7676
" <td>carrot</td>\n",
7777
" </tr>\n",
78+
" <tr>\n",
79+
" <th>5</th>\n",
80+
" <td>2</td>\n",
81+
" <td>edamame</td>\n",
82+
" </tr>\n",
83+
" <tr>\n",
84+
" <th>6</th>\n",
85+
" <td>3</td>\n",
86+
" <td>apple</td>\n",
87+
" </tr>\n",
88+
" <tr>\n",
89+
" <th>7</th>\n",
90+
" <td>3</td>\n",
91+
" <td>banana</td>\n",
92+
" </tr>\n",
93+
" <tr>\n",
94+
" <th>8</th>\n",
95+
" <td>3</td>\n",
96+
" <td>carrot</td>\n",
97+
" </tr>\n",
98+
" <tr>\n",
99+
" <th>9</th>\n",
100+
" <td>3</td>\n",
101+
" <td>edamame</td>\n",
102+
" </tr>\n",
103+
" <tr>\n",
104+
" <th>10</th>\n",
105+
" <td>4</td>\n",
106+
" <td>banana</td>\n",
107+
" </tr>\n",
108+
" <tr>\n",
109+
" <th>11</th>\n",
110+
" <td>4</td>\n",
111+
" <td>edamame</td>\n",
112+
" </tr>\n",
78113
" </tbody>\n",
79114
"</table>\n",
80115
"</div>"
81116
],
82117
"text/plain": [
83-
" uid item\n",
84-
"0 1 apple\n",
85-
"1 1 carrot\n",
86-
"2 1 durian\n",
87-
"3 2 banana\n",
88-
"4 2 carrot"
118+
" uid item\n",
119+
"0 1 apple\n",
120+
"1 1 carrot\n",
121+
"2 1 durian\n",
122+
"3 2 banana\n",
123+
"4 2 carrot\n",
124+
"5 2 edamame\n",
125+
"6 3 apple\n",
126+
"7 3 banana\n",
127+
"8 3 carrot\n",
128+
"9 3 edamame\n",
129+
"10 4 banana\n",
130+
"11 4 edamame"
89131
]
90132
},
91-
"execution_count": 3,
133+
"execution_count": 5,
92134
"metadata": {},
93135
"output_type": "execute_result"
94136
}
95137
],
96138
"source": [
97139
"df = pd.read_csv(input_file_path, header=None, names=['uid', 'item'])\n",
98-
"df.head()"
140+
"df"
99141
]
100142
},
101143
{
102144
"cell_type": "code",
103145
"execution_count": 4,
104146
"metadata": {},
105-
"outputs": [],
147+
"outputs": [
148+
{
149+
"data": {
150+
"text/plain": [
151+
"[{'apple', 'carrot', 'durian'},\n",
152+
" {'banana', 'carrot', 'edamame'},\n",
153+
" {'apple', 'banana', 'carrot', 'edamame'},\n",
154+
" {'banana', 'edamame'}]"
155+
]
156+
},
157+
"execution_count": 4,
158+
"metadata": {},
159+
"output_type": "execute_result"
160+
}
161+
],
106162
"source": [
107163
"qualifiedUsers = df.groupby('uid')['item'].apply(set).reset_index()\n",
108-
"qualifiedUsers = list(qualifiedUsers['item'].to_dict().values())"
164+
"qualifiedUsers = list(qualifiedUsers['item'].to_dict().values())\n",
165+
"qualifiedUsers"
109166
]
110167
},
111168
{
112169
"cell_type": "code",
113-
"execution_count": 5,
170+
"execution_count": 6,
114171
"metadata": {},
115172
"outputs": [],
116173
"source": [
@@ -169,19 +226,9 @@
169226
"name": "stdout",
170227
"output_type": "stream",
171228
"text": [
172-
"CPU times: user 66 µs, sys: 2 µs, total: 68 µs\n",
173-
"Wall time: 71.8 µs\n"
229+
"CPU times: user 58 µs, sys: 24 µs, total: 82 µs\n",
230+
"Wall time: 264 µs\n"
174231
]
175-
},
176-
{
177-
"data": {
178-
"text/plain": [
179-
"[[('carrot',), ('banana',), ('edamame',)], [('banana', 'edamame')]]"
180-
]
181-
},
182-
"execution_count": 7,
183-
"metadata": {},
184-
"output_type": "execute_result"
185232
}
186233
],
187234
"source": [

task4.ipynb

+2-17
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@
2525
"from sklearn.cluster import AgglomerativeClustering\n",
2626
"from sklearn.cluster import SpectralClustering\n",
2727
"from kmodes.kmodes import KModes\n",
28-
"from sklearn.mixture import GaussianMixture\n",
29-
"from sklearn.cluster import KMeans, BisectingKMeans"
28+
"from sklearn.mixture import GaussianMixture"
3029
]
3130
},
3231
{
@@ -248,20 +247,6 @@
248247
"evaluate_clusters(votes, target, df_new)\n",
249248
"plot_clusters(estimator_names, ip, votes, df_new, target, plot_all=False)"
250249
]
251-
},
252-
{
253-
"cell_type": "code",
254-
"execution_count": null,
255-
"metadata": {},
256-
"outputs": [],
257-
"source": []
258-
},
259-
{
260-
"cell_type": "code",
261-
"execution_count": null,
262-
"metadata": {},
263-
"outputs": [],
264-
"source": []
265250
}
266251
],
267252
"metadata": {
@@ -280,7 +265,7 @@
280265
"name": "python",
281266
"nbconvert_exporter": "python",
282267
"pygments_lexer": "ipython3",
283-
"version": "3.11.4"
268+
"version": "3.12.0"
284269
}
285270
},
286271
"nbformat": 4,

utils/task2_utils.py

-57
Original file line numberDiff line numberDiff line change
@@ -64,63 +64,6 @@ def son2count_freq(basket, subsets):
6464
return frequency
6565

6666

67-
def generateFreqitem(basket, ck, support_threshold):
68-
"""
69-
Function to generate frequent itemsets from candidate itemsets
70-
71-
Args:
72-
basket (Iterable): basket
73-
ck (Iterable): candidate sets
74-
support_threshold (float): support threshold
75-
76-
Returns:
77-
list: frequent itemsets
78-
"""
79-
C1 = [tuple(x) for x in ck]
80-
cnt = {} # dict to store count for each candidate
81-
for i in basket:
82-
for c in C1:
83-
if (set(c).issubset(i)):
84-
if c in cnt:
85-
cnt[c] += 1
86-
else:
87-
cnt[c] = 1
88-
89-
freq_item = [] # frequent item to extract items count>=support_threshold
90-
for key in cnt:
91-
if cnt[key] >= support_threshold:
92-
freq_item.append(key)
93-
return freq_item # return frequent items from candidate set
94-
95-
96-
def son2count_freq(basket, subsets):
97-
"""
98-
Function to count frequency of candidate itemsets
99-
100-
Args:
101-
basket (Iterable): basket
102-
subsets (Iterable): candidate sets
103-
104-
Returns:
105-
list: list of (frequent items, count) pairs
106-
"""
107-
108-
# make sure to convert element inside list to a tuple before starting to create dict count
109-
C1 = [tuple(x) for x in subsets]
110-
cnt = {}
111-
for i in basket:
112-
for c in C1:
113-
if (set(c).issubset(i)):
114-
if c in cnt:
115-
cnt[c] += 1
116-
else:
117-
cnt[c] = 1
118-
frequency = []
119-
for key in cnt:
120-
# append(frequency to be (frequent items, count) pairs)
121-
frequency.append([key, cnt[key]])
122-
return frequency
123-
12467

12568
def apriori(basket, support, num_baskets):
12669
"""

utils/task3_utils.py

-79
Original file line numberDiff line numberDiff line change
@@ -54,86 +54,7 @@ def load_data(name, normalize=False, reduction='mean'):
5454
num_classes = 2
5555

5656
return df.drop('target', axis=1), df['target'].values, num_classes
57-
58-
if name == 'iris':
59-
data = load_iris()
60-
df = pd.DataFrame(data.data, columns=data.feature_names)
61-
df['target'] = data.target
62-
df = df.sample(frac=1)
63-
64-
if normalize:
65-
for col in data.feature_names:
66-
df[col] = (df[col] - df[col].mean()) / df[col].std()
67-
68-
for col in data.feature_names:
69-
if reduction == 'mean':
70-
df[col] = df[col] >= df[col].mean()
71-
elif reduction == 'median':
72-
df[col] = df[col] >= df[col].median()
73-
74-
num_classes = 3
75-
76-
return df.drop('target', axis=1), df['target'].values, num_classes
77-
78-
if name == 'wine':
79-
data = load_wine()
80-
df = pd.DataFrame(data.data, columns=data.feature_names)
81-
df['target'] = data.target
82-
df = df.sample(frac=1)
83-
84-
if normalize:
85-
for col in data.feature_names:
86-
df[col] = (df[col] - df[col].mean()) / df[col].std()
87-
88-
for col in data.feature_names:
89-
if reduction == 'mean':
90-
df[col] = df[col] >= df[col].mean()
91-
elif reduction == 'median':
92-
df[col] = df[col] >= df[col].median()
93-
94-
num_classes = 3
95-
96-
return df.drop('target', axis=1), df['target'].values, num_classes
97-
98-
if name == 'diabetes':
99-
df = pd.read_csv('./data/diabetes/diabetes.csv')
100-
df = df.sample(frac=1)
101-
target = df['Outcome']
102-
df = df.drop('Outcome', axis=1)
103-
104-
if normalize:
105-
for col in df.columns:
106-
df[col] = (df[col] - df[col].mean()) / df[col].std()
107-
108-
for col in data.feature_names:
109-
if reduction == 'mean':
110-
df[col] = df[col] >= df[col].mean()
111-
elif reduction == 'median':
112-
df[col] = df[col] >= df[col].median()
11357

114-
num_classes = 2
115-
116-
return df, target.values, num_classes
117-
118-
if name == 'glass':
119-
df = pd.read_csv('./data/glass/glass.csv')
120-
df = df.sample(frac=1)
121-
target = df['type_glass']
122-
df = df.drop('type_glass', axis=1)
123-
124-
if normalize:
125-
for col in df.columns:
126-
df[col] = (df[col] - df[col].mean()) / df[col].std()
127-
128-
for col in df.columns:
129-
if reduction == 'mean':
130-
df[col] = df[col] >= df[col].mean()
131-
elif reduction == 'median':
132-
df[col] = df[col] >= df[col].median()
133-
134-
num_classes = 6
135-
136-
return df, target.values, num_classes
13758

13859
if name == 'kc2':
13960
data = fetch_openml(name='kc2', parser='auto')

0 commit comments

Comments
 (0)