From c6e4cb4f21f1cfd3b75b5328f136622512e238cc Mon Sep 17 00:00:00 2001
From: reder_martin <34924612+martinreder@users.noreply.github.com>
Date: Wed, 22 Nov 2023 10:56:09 +0100
Subject: [PATCH] Erstellt mit Colaboratory
---
Classification.ipynb | 455 ++++++++++++++++++++++++++++++++++++++++---
1 file changed, 426 insertions(+), 29 deletions(-)
diff --git a/Classification.ipynb b/Classification.ipynb
index 58cac6a..9f93cf1 100644
--- a/Classification.ipynb
+++ b/Classification.ipynb
@@ -14,7 +14,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "1c7efa9f-be80-4108-9387-6cd68255c89f",
"metadata": {
"id": "1c7efa9f-be80-4108-9387-6cd68255c89f"
@@ -55,12 +55,317 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "e2c52bfe-0d55-4e4b-b745-6805cdbe46a2",
"metadata": {
- "id": "e2c52bfe-0d55-4e4b-b745-6805cdbe46a2"
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 242
+ },
+ "id": "e2c52bfe-0d55-4e4b-b745-6805cdbe46a2",
+ "outputId": "923e877a-f1eb-4dc5-905e-475fb5ecabf2"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(65, 3)\n",
+ "(35, 3)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Label Title \\\n",
+ "92280 4 Curtain rises on Solaris 10 \n",
+ "59460 1 Nigerian Strike Set to Drag Into Fourth Day (R... \n",
+ "32048 2 Tejada Rips Up Twins \n",
+ "77859 2 Novak topples Henman \n",
+ "1518 2 Hall-of-Fame credentials \n",
+ "\n",
+ " Article \n",
+ "92280 Calling it quot;the biggest thing we #39;ve d... \n",
+ "59460 Reuters - A general strike in Nigeria which ha... \n",
+ "32048 Miguel Tejada homers twice and drives in five ... \n",
+ "77859 JIRI Novak made Tim Henman #39;s life as miser... \n",
+ "1518 HAVEN, Wis. -- An official from the World Golf... "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Label | \n",
+ " Title | \n",
+ " Article | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 92280 | \n",
+ " 4 | \n",
+ " Curtain rises on Solaris 10 | \n",
+ " Calling it quot;the biggest thing we #39;ve d... | \n",
+ "
\n",
+ " \n",
+ " 59460 | \n",
+ " 1 | \n",
+ " Nigerian Strike Set to Drag Into Fourth Day (R... | \n",
+ " Reuters - A general strike in Nigeria which ha... | \n",
+ "
\n",
+ " \n",
+ " 32048 | \n",
+ " 2 | \n",
+ " Tejada Rips Up Twins | \n",
+ " Miguel Tejada homers twice and drives in five ... | \n",
+ "
\n",
+ " \n",
+ " 77859 | \n",
+ " 2 | \n",
+ " Novak topples Henman | \n",
+ " JIRI Novak made Tim Henman #39;s life as miser... | \n",
+ "
\n",
+ " \n",
+ " 1518 | \n",
+ " 2 | \n",
+ " Hall-of-Fame credentials | \n",
+ " HAVEN, Wis. -- An official from the World Golf... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 2
+ }
+ ],
"source": [
"train, test = train_test_split(sample, test_size = 0.35, random_state = 1337)\n",
"print(train.shape)\n",
@@ -82,12 +387,34 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "e50b0510-2c09-4ec6-9e28-b2a8933197e5",
"metadata": {
- "id": "e50b0510-2c09-4ec6-9e28-b2a8933197e5"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "e50b0510-2c09-4ec6-9e28-b2a8933197e5",
+ "outputId": "68edce26-fd6e-4cf1-fa79-6cfe57656df0"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(65, 967)\n",
+ "(35, 967)\n",
+ "(967,)\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+ "[nltk_data] Unzipping corpora/stopwords.zip.\n"
+ ]
+ }
+ ],
"source": [
"labels_train = [classes_en[int(row[\"Label\"])] for i, row in train.iterrows()]\n",
"docs_train = [row[\"Article\"] for i, row in train.iterrows()]\n",
@@ -96,7 +423,8 @@
"\n",
"from nltk.corpus import stopwords as nltkStopwords\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
- "\n",
+ "import nltk\n",
+ "nltk.download('stopwords')\n",
"stopwords_en = list(nltkStopwords.words(\"english\"))\n",
"count_vectorizer = CountVectorizer(strip_accents = \"unicode\",\n",
" stop_words = stopwords_en)\n",
@@ -126,7 +454,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "e6210175-dbea-4ea8-aacb-b539473c7149",
"metadata": {
"id": "e6210175-dbea-4ea8-aacb-b539473c7149"
@@ -195,7 +523,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"id": "7a629cd5-7444-4b3e-b52a-afcc51657e4c",
"metadata": {
"id": "7a629cd5-7444-4b3e-b52a-afcc51657e4c"
@@ -221,7 +549,10 @@
" # nr. of rows with C / nr. of rows\n",
" # Hint: to get the number of rows of a pandas dataframe use df.shape[0]\n",
" def P(C):\n",
- " # your code here\n",
+ "     # Prior probability of class C: rows labelled C divided by all rows in data\n",
+ "     per_class_rows = data[target].value_counts()\n",
+ "     all_rows = data.shape[0]\n",
+ "     return per_class_rows[C] / all_rows\n",
"\n",
"\n",
" # Implement a function P_conditional(f, F, C) that returns\n",
@@ -230,7 +561,9 @@
" # (optionally + delta: (nr. of rows that have both F and C + delta)/(nr. of rows that have C + nr. of unique feature values in f * delta)\n",
" # ^ this is called Lidstone's Law Succession and adresses the problem of the probability going to 0 as soon as feature does not occur (very frequently)\n",
" def P_conditional(f, F, C):\n",
- " # your code here\n",
+ "     rows_with_C = data[data[target] == C]  # restrict to class C once; both counts below reuse this subset\n",
+ "     nr_of_rows_with_C_and_F = int((rows_with_C[f] == F).sum())  # f is the feature column, F the value looked for (the two were swapped before)\n",
+ "     return nr_of_rows_with_C_and_F / rows_with_C.shape[0]  # unsmoothed estimate of P(f = F | C); no Lidstone delta is applied here\n",
"\n",
"\n",
" # we can retrieve our features like this\n",
@@ -250,11 +583,11 @@
" # for class in classes\n",
" for C_i in data[target].unique().tolist():\n",
" # calculate the probability of class C\n",
- " r = # your code here\n",
+ " r = P(C_i)\n",
"\n",
" # multiply the above probability of class C with the Conditional Probability of Feature f having the value F, given class C for every feature\n",
" for f in features:\n",
- " r = # your code here\n",
+ " r *= P_conditional(f, x[f], C_i)\n",
"\n",
" res.append((r, C_i))\n",
"\n",
@@ -267,7 +600,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "2df90bb9-24d7-4590-a843-a286b6da0b2c",
"metadata": {
"id": "2df90bb9-24d7-4590-a843-a286b6da0b2c"
@@ -292,9 +625,21 @@
"execution_count": null,
"id": "876ef0d7-7837-4ba8-94ea-9bc890762e62",
"metadata": {
- "id": "876ef0d7-7837-4ba8-94ea-9bc890762e62"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "876ef0d7-7837-4ba8-94ea-9bc890762e62",
+ "outputId": "8a447741-c3f7-4c9c-da02-7a24ed039aba"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "progress bar: 43%|████▎ | 28/65 [02:54<04:56, 8.00s/it]"
+ ]
+ }
+ ],
"source": [
"tqdm.pandas(desc = \"progress bar\")\n",
"predictions_train = counts_train.progress_apply(lambda row: NB.predict(row, has_target = True), axis = 1)"
@@ -318,12 +663,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"id": "5d1357cd-54ed-4b0b-a514-6d783a286a1b",
"metadata": {
- "id": "5d1357cd-54ed-4b0b-a514-6d783a286a1b"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5d1357cd-54ed-4b0b-a514-6d783a286a1b",
+ "outputId": "4ace6283-3163-4db8-d4aa-63c39e821a5a"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Train Accuracy: 0.8615384615384616\n"
+ ]
+ }
+ ],
"source": [
"print(\"Train Accuracy: \", accuracy_score(counts_train[\"Label\"], predictions_train))"
]
@@ -340,12 +697,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"id": "0410697d-2f38-484c-a908-395d86eed230",
"metadata": {
- "id": "0410697d-2f38-484c-a908-395d86eed230"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0410697d-2f38-484c-a908-395d86eed230",
+ "outputId": "e39da8a7-deb3-4d34-eb5d-3acc2b91c9b0"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "progress bar: 100%|██████████| 35/35 [02:48<00:00, 4.81s/it]\n"
+ ]
+ }
+ ],
"source": [
"tqdm.pandas(desc = \"progress bar\")\n",
"predictions_test = counts_test.progress_apply(lambda row: NB.predict(row, has_target = True), axis = 1)"
@@ -353,12 +722,28 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"id": "4cd0ed61-856f-461d-abb0-6c8f1d9c9f9c",
"metadata": {
- "id": "4cd0ed61-856f-461d-abb0-6c8f1d9c9f9c"
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 506
+ },
+ "id": "4cd0ed61-856f-461d-abb0-6c8f1d9c9f9c",
+ "outputId": "966e4ead-7d8f-4901-c1bf-a2b2a6567e44"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {}
+ }
+ ],
"source": [
"conf = confusion_matrix(counts_test[\"Label\"], predictions_test)\n",
"\n",
@@ -369,12 +754,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"id": "3baadbb0-469b-4be2-88d8-559f17ae2b84",
"metadata": {
- "id": "3baadbb0-469b-4be2-88d8-559f17ae2b84"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3baadbb0-469b-4be2-88d8-559f17ae2b84",
+ "outputId": "3cfe920e-84b7-4b7c-d97e-a9d83f5bed8d"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test Accuracy: 0.2\n"
+ ]
+ }
+ ],
"source": [
"print(\"Test Accuracy: \", accuracy_score(counts_test[\"Label\"], predictions_test))"
]