Skip to content

Commit 7671f98

Browse files
edit: submission ready
1 parent 94f3b09 commit 7671f98

File tree

4 files changed

+73
-177
lines changed

4 files changed

+73
-177
lines changed

task1.ipynb

+71-24
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
},
2222
{
2323
"cell_type": "code",
24-
"execution_count": 3,
24+
"execution_count": 5,
2525
"metadata": {},
2626
"outputs": [
2727
{
@@ -75,42 +75,99 @@
7575
" <td>2</td>\n",
7676
" <td>carrot</td>\n",
7777
" </tr>\n",
78+
" <tr>\n",
79+
" <th>5</th>\n",
80+
" <td>2</td>\n",
81+
" <td>edamame</td>\n",
82+
" </tr>\n",
83+
" <tr>\n",
84+
" <th>6</th>\n",
85+
" <td>3</td>\n",
86+
" <td>apple</td>\n",
87+
" </tr>\n",
88+
" <tr>\n",
89+
" <th>7</th>\n",
90+
" <td>3</td>\n",
91+
" <td>banana</td>\n",
92+
" </tr>\n",
93+
" <tr>\n",
94+
" <th>8</th>\n",
95+
" <td>3</td>\n",
96+
" <td>carrot</td>\n",
97+
" </tr>\n",
98+
" <tr>\n",
99+
" <th>9</th>\n",
100+
" <td>3</td>\n",
101+
" <td>edamame</td>\n",
102+
" </tr>\n",
103+
" <tr>\n",
104+
" <th>10</th>\n",
105+
" <td>4</td>\n",
106+
" <td>banana</td>\n",
107+
" </tr>\n",
108+
" <tr>\n",
109+
" <th>11</th>\n",
110+
" <td>4</td>\n",
111+
" <td>edamame</td>\n",
112+
" </tr>\n",
78113
" </tbody>\n",
79114
"</table>\n",
80115
"</div>"
81116
],
82117
"text/plain": [
83-
" uid item\n",
84-
"0 1 apple\n",
85-
"1 1 carrot\n",
86-
"2 1 durian\n",
87-
"3 2 banana\n",
88-
"4 2 carrot"
118+
" uid item\n",
119+
"0 1 apple\n",
120+
"1 1 carrot\n",
121+
"2 1 durian\n",
122+
"3 2 banana\n",
123+
"4 2 carrot\n",
124+
"5 2 edamame\n",
125+
"6 3 apple\n",
126+
"7 3 banana\n",
127+
"8 3 carrot\n",
128+
"9 3 edamame\n",
129+
"10 4 banana\n",
130+
"11 4 edamame"
89131
]
90132
},
91-
"execution_count": 3,
133+
"execution_count": 5,
92134
"metadata": {},
93135
"output_type": "execute_result"
94136
}
95137
],
96138
"source": [
97139
"df = pd.read_csv(input_file_path, header=None, names=['uid', 'item'])\n",
98-
"df.head()"
140+
"df"
99141
]
100142
},
101143
{
102144
"cell_type": "code",
103145
"execution_count": 4,
104146
"metadata": {},
105-
"outputs": [],
147+
"outputs": [
148+
{
149+
"data": {
150+
"text/plain": [
151+
"[{'apple', 'carrot', 'durian'},\n",
152+
" {'banana', 'carrot', 'edamame'},\n",
153+
" {'apple', 'banana', 'carrot', 'edamame'},\n",
154+
" {'banana', 'edamame'}]"
155+
]
156+
},
157+
"execution_count": 4,
158+
"metadata": {},
159+
"output_type": "execute_result"
160+
}
161+
],
106162
"source": [
107163
"qualifiedUsers = df.groupby('uid')['item'].apply(set).reset_index()\n",
108-
"qualifiedUsers = list(qualifiedUsers['item'].to_dict().values())"
164+
"qualifiedUsers = list(qualifiedUsers['item'].to_dict().values())\n",
165+
"qualifiedUsers"
109166
]
110167
},
111168
{
112169
"cell_type": "code",
113-
"execution_count": 5,
170+
"execution_count": 6,
114171
"metadata": {},
115172
"outputs": [],
116173
"source": [
@@ -169,19 +226,9 @@
169226
"name": "stdout",
170227
"output_type": "stream",
171228
"text": [
172-
"CPU times: user 66 µs, sys: 2 µs, total: 68 µs\n",
173-
"Wall time: 71.8 µs\n"
229+
"CPU times: user 58 µs, sys: 24 µs, total: 82 µs\n",
230+
"Wall time: 264 µs\n"
174231
]
175-
},
176-
{
177-
"data": {
178-
"text/plain": [
179-
"[[('carrot',), ('banana',), ('edamame',)], [('banana', 'edamame')]]"
180-
]
181-
},
182-
"execution_count": 7,
183-
"metadata": {},
184-
"output_type": "execute_result"
185232
}
186233
],
187234
"source": [

task4.ipynb

+2-17
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@
2525
"from sklearn.cluster import AgglomerativeClustering\n",
2626
"from sklearn.cluster import SpectralClustering\n",
2727
"from kmodes.kmodes import KModes\n",
28-
"from sklearn.mixture import GaussianMixture\n",
29-
"from sklearn.cluster import KMeans, BisectingKMeans"
28+
"from sklearn.mixture import GaussianMixture"
3029
]
3130
},
3231
{
@@ -248,20 +247,6 @@
248247
"evaluate_clusters(votes, target, df_new)\n",
249248
"plot_clusters(estimator_names, ip, votes, df_new, target, plot_all=False)"
250249
]
251-
},
252-
{
253-
"cell_type": "code",
254-
"execution_count": null,
255-
"metadata": {},
256-
"outputs": [],
257-
"source": []
258-
},
259-
{
260-
"cell_type": "code",
261-
"execution_count": null,
262-
"metadata": {},
263-
"outputs": [],
264-
"source": []
265250
}
266251
],
267252
"metadata": {
@@ -280,7 +265,7 @@
280265
"name": "python",
281266
"nbconvert_exporter": "python",
282267
"pygments_lexer": "ipython3",
283-
"version": "3.11.4"
268+
"version": "3.12.0"
284269
}
285270
},
286271
"nbformat": 4,

utils/task2_utils.py

-57
Original file line numberDiff line numberDiff line change
@@ -64,63 +64,6 @@ def son2count_freq(basket, subsets):
6464
return frequency
6565

6666

67-
def generateFreqitem(basket, ck, support_threshold):
68-
"""
69-
Function to generate frequent itemsets from candidate itemsets
70-
71-
Args:
72-
basket (Iterable): basket
73-
ck (Iterable): candidate sets
74-
support_threshold (float): support threshold
75-
76-
Returns:
77-
list: frequent itemsets
78-
"""
79-
C1 = [tuple(x) for x in ck]
80-
cnt = {} # dict to store count for each candidate
81-
for i in basket:
82-
for c in C1:
83-
if (set(c).issubset(i)):
84-
if c in cnt:
85-
cnt[c] += 1
86-
else:
87-
cnt[c] = 1
88-
89-
freq_item = [] # frequent item to extract items count>=support_threshold
90-
for key in cnt:
91-
if cnt[key] >= support_threshold:
92-
freq_item.append(key)
93-
return freq_item # return frequent items from candidate set
94-
95-
96-
def son2count_freq(basket, subsets):
97-
"""
98-
Function to count frequency of candidate itemsets
99-
100-
Args:
101-
basket (Iterable): basket
102-
subsets (Iterable): candidate sets
103-
104-
Returns:
105-
list: list of (frequent items, count) pairs
106-
"""
107-
108-
# make sure to convert element inside list to a tuple before starting to create dict count
109-
C1 = [tuple(x) for x in subsets]
110-
cnt = {}
111-
for i in basket:
112-
for c in C1:
113-
if (set(c).issubset(i)):
114-
if c in cnt:
115-
cnt[c] += 1
116-
else:
117-
cnt[c] = 1
118-
frequency = []
119-
for key in cnt:
120-
# append(frequency to be (frequent items, count) pairs)
121-
frequency.append([key, cnt[key]])
122-
return frequency
123-
12467

12568
def apriori(basket, support, num_baskets):
12669
"""

utils/task3_utils.py

-79
Original file line numberDiff line numberDiff line change
@@ -54,86 +54,7 @@ def load_data(name, normalize=False, reduction='mean'):
5454
num_classes = 2
5555

5656
return df.drop('target', axis=1), df['target'].values, num_classes
57-
58-
if name == 'iris':
59-
data = load_iris()
60-
df = pd.DataFrame(data.data, columns=data.feature_names)
61-
df['target'] = data.target
62-
df = df.sample(frac=1)
63-
64-
if normalize:
65-
for col in data.feature_names:
66-
df[col] = (df[col] - df[col].mean()) / df[col].std()
67-
68-
for col in data.feature_names:
69-
if reduction == 'mean':
70-
df[col] = df[col] >= df[col].mean()
71-
elif reduction == 'median':
72-
df[col] = df[col] >= df[col].median()
73-
74-
num_classes = 3
75-
76-
return df.drop('target', axis=1), df['target'].values, num_classes
77-
78-
if name == 'wine':
79-
data = load_wine()
80-
df = pd.DataFrame(data.data, columns=data.feature_names)
81-
df['target'] = data.target
82-
df = df.sample(frac=1)
83-
84-
if normalize:
85-
for col in data.feature_names:
86-
df[col] = (df[col] - df[col].mean()) / df[col].std()
87-
88-
for col in data.feature_names:
89-
if reduction == 'mean':
90-
df[col] = df[col] >= df[col].mean()
91-
elif reduction == 'median':
92-
df[col] = df[col] >= df[col].median()
93-
94-
num_classes = 3
95-
96-
return df.drop('target', axis=1), df['target'].values, num_classes
97-
98-
if name == 'diabetes':
99-
df = pd.read_csv('./data/diabetes/diabetes.csv')
100-
df = df.sample(frac=1)
101-
target = df['Outcome']
102-
df = df.drop('Outcome', axis=1)
103-
104-
if normalize:
105-
for col in df.columns:
106-
df[col] = (df[col] - df[col].mean()) / df[col].std()
107-
108-
for col in data.feature_names:
109-
if reduction == 'mean':
110-
df[col] = df[col] >= df[col].mean()
111-
elif reduction == 'median':
112-
df[col] = df[col] >= df[col].median()
11357

114-
num_classes = 2
115-
116-
return df, target.values, num_classes
117-
118-
if name == 'glass':
119-
df = pd.read_csv('./data/glass/glass.csv')
120-
df = df.sample(frac=1)
121-
target = df['type_glass']
122-
df = df.drop('type_glass', axis=1)
123-
124-
if normalize:
125-
for col in df.columns:
126-
df[col] = (df[col] - df[col].mean()) / df[col].std()
127-
128-
for col in df.columns:
129-
if reduction == 'mean':
130-
df[col] = df[col] >= df[col].mean()
131-
elif reduction == 'median':
132-
df[col] = df[col] >= df[col].median()
133-
134-
num_classes = 6
135-
136-
return df, target.values, num_classes
13758

13859
if name == 'kc2':
13960
data = fetch_openml(name='kc2', parser='auto')

0 commit comments

Comments
 (0)