diff --git a/models/similarity_matrix.pkl b/models/similarity_matrix.pkl
index d310511..addcefc 100644
Binary files a/models/similarity_matrix.pkl and b/models/similarity_matrix.pkl differ
diff --git a/notebooks/model_training.ipynb b/notebooks/model_training.ipynb
index 7860d40..56a7012 100644
--- a/notebooks/model_training.ipynb
+++ b/notebooks/model_training.ipynb
@@ -40,15 +40,12 @@
}
],
"source": [
- "# Import Dependencies\n",
- "\n",
"import numpy as np\n",
"import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from nltk.stem import WordNetLemmatizer\n",
+ "from sklearn.decomposition import TruncatedSVD\n",
"import pickle\n",
"import nltk\n",
"import re\n",
@@ -59,57 +56,10 @@
"except LookupError:\n",
" nltk.download('wordnet')\n",
"\n",
+ "\n",
"print('Dependencies Imported')"
]
},
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "9cfffeab",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/Users/bushra/data-science/upwork/education-projects/03/course-recommendation-system/notebooks\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "current_directory = os.getcwd()\n",
- "print(current_directory)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "8fafd9bc",
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/Users/bushra/data-science/upwork/education-projects/03/course-recommendation-system\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/bushra/data-science/upwork/education-projects/03/course-recommendation-system/venv/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n",
- " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
- ]
- }
- ],
- "source": [
- "%cd .."
- ]
- },
{
"cell_type": "markdown",
"id": "2b7a46c9",
@@ -120,29 +70,17 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 2,
"id": "046017bb",
"metadata": {},
"outputs": [],
"source": [
- "import pandas as pd\n",
- "\n",
- "# Function to load CSV data\n",
- "def load_data(file_path):\n",
- " try:\n",
- " data = pd.read_csv(file_path, encoding='utf-8')\n",
- " return data\n",
- " except FileNotFoundError:\n",
- " print(f\"File {file_path} not found.\")\n",
- " return None\n",
- "\n",
- "# Loading the data from a local file\n",
- "data = load_data(\"../course-recommendation-system/data/coursera.csv\")\n"
+ "data = pd.read_csv(\"coursera.csv\", encoding='utf-8')"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"id": "5fd1ffb0",
"metadata": {},
"outputs": [
@@ -268,7 +206,7 @@
"4 Data Analysis select (sql) database manageme... "
]
},
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -287,7 +225,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 4,
"id": "5cb80836",
"metadata": {
"scrolled": true
@@ -299,19 +237,18 @@
"(3522, 7)"
]
},
- "execution_count": 7,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# Display dataset shape (number of rows and columns)\n",
"data.shape"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 5,
"id": "95cde0c6",
"metadata": {
"scrolled": true
@@ -339,205 +276,110 @@
}
],
"source": [
- "# Show summary of the dataset (information on column types and memory usage)\n",
"data.info()"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 6,
"id": "f08d2540",
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Course Name 0\n",
- "University 0\n",
- "Difficulty Level 0\n",
- "Course Rating 0\n",
- "Course URL 0\n",
- "Course Description 0\n",
- "Skills 0\n",
- "dtype: int64\n"
- ]
+ "data": {
+ "text/plain": [
+ "Course Name 0\n",
+ "University 0\n",
+ "Difficulty Level 0\n",
+ "Course Rating 0\n",
+ "Course URL 0\n",
+ "Course Description 0\n",
+ "Skills 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "# Check for missing values in each column\n",
- "print(data.isnull().sum())"
+ "data.isnull().sum()"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 7,
"id": "01cab0bc",
"metadata": {
"scrolled": true
},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Course Name 3416\n",
- "University 184\n",
- "Difficulty Level 5\n",
- "Course Rating 31\n",
- "Course URL 3424\n",
- "Course Description 3397\n",
- "Skills 3424\n",
- "dtype: int64\n"
- ]
+ "data": {
+ "text/plain": [
+ "Course Name 3416\n",
+ "University 184\n",
+ "Difficulty Level 5\n",
+ "Course Rating 31\n",
+ "Course URL 3424\n",
+ "Course Description 3397\n",
+ "Skills 3424\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "# Display the number of unique values for each column (useful to identify categorical variables)\n",
- "print(data.nunique())"
+ "data.nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "6933f893",
+ "execution_count": 8,
+ "id": "3b12b05c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(3424, 7)"
+ "np.int64(98)"
]
},
- "execution_count": 11,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# Remove duplicates based on specific columns\n",
- "data = data.drop_duplicates(subset=['Course Name', 'University', 'Difficulty Level', 'Course Rating',\n",
- " 'Course URL', 'Course Description'])\n",
- "data.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "737a0b4d",
- "metadata": {},
- "source": [
- "
Check the Distribution of Key Columns
"
+ "data.duplicated().sum()"
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "4cb5938e",
- "metadata": {
- "scrolled": false
- },
+ "execution_count": 9,
+ "id": "6933f893",
+ "metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/fh/kyml8tjs61j6z54n3h59ptwh0000gn/T/ipykernel_11894/2219308396.py:14: FutureWarning: \n",
- "\n",
- "Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
- "\n",
- " sns.barplot(x=value_counts.index[:top_n], y=value_counts.values[:top_n], palette='bright')\n"
- ]
- },
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/fh/kyml8tjs61j6z54n3h59ptwh0000gn/T/ipykernel_11894/2219308396.py:14: FutureWarning: \n",
- "\n",
- "Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
- "\n",
- " sns.barplot(x=value_counts.index[:top_n], y=value_counts.values[:top_n], palette='bright')\n"
- ]
- },
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/fh/kyml8tjs61j6z54n3h59ptwh0000gn/T/ipykernel_11894/2219308396.py:14: FutureWarning: \n",
- "\n",
- "Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
- "\n",
- " sns.barplot(x=value_counts.index[:top_n], y=value_counts.values[:top_n], palette='bright')\n"
- ]
- },
{
"data": {
- "image/png": "",
"text/plain": [
- ""
+ "(3424, 7)"
]
},
+ "execution_count": 9,
"metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Total Unique Universities: 184\n"
- ]
+ "output_type": "execute_result"
}
],
"source": [
- "# Function to display and visualize distributions for categorical columns\n",
- "def display_distribution(column_name, data, top_n=10, sort_by_value=True, visualize=True):\n",
- " \n",
- " # Get the value counts\n",
- " value_counts = data[column_name].value_counts()\n",
- "\n",
- " # Sort values if necessary\n",
- " if sort_by_value:\n",
- " value_counts = value_counts.sort_values(ascending=False)\n",
- " \n",
- " # Optional: Plot the distribution\n",
- " if visualize:\n",
- " plt.figure(figsize=(10, 6))\n",
- " sns.barplot(x=value_counts.index[:top_n], y=value_counts.values[:top_n], palette='bright')\n",
- " plt.title(f\"Top {top_n} {column_name} Distribution\")\n",
- " plt.xlabel(column_name)\n",
- " plt.ylabel('Frequency')\n",
- " plt.xticks(rotation=45, ha=\"right\")\n",
- " plt.tight_layout()\n",
- " plt.show()\n",
- "\n",
- "# Displaying the distributions\n",
- "display_distribution('Difficulty Level', data, top_n=5) # Adjust top_n as needed\n",
- "display_distribution('Course Rating', data, top_n=5)\n",
- "display_distribution('University', data, top_n=10, sort_by_value=False) # For universities, you might not want to sort by count\n",
- "\n",
- "# Showing the count of unique universities\n",
- "print(f\"\\nTotal Unique Universities: {data['University'].nunique()}\")\n"
+ "# Remove duplicates based on specific columns, then renumber the rows so that index labels\n",
+ "# equal row positions: drop_duplicates keeps the original labels (leaving gaps), while the\n",
+ "# similarity matrix used for recommendations is addressed by row position.\n",
+ "data = data.drop_duplicates(subset=['Course Name', 'University', 'Difficulty Level', 'Course Rating',\n",
+ "                                    'Course URL', 'Course Description']).reset_index(drop=True)\n",
+ "data.shape"
]
},
{
@@ -545,40 +387,20 @@
"id": "b84c5ac7",
"metadata": {},
"source": [
- "Text Preprocessing
"
+ "Text Preprocessing on Training Data
"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 10,
"id": "e98bdce3",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " Course Name \\\n",
- "0 write a feature length screenplay for film or ... \n",
- "1 business strategy business model canvas analys... \n",
- "2 silicon thin film solar cell \n",
- "3 finance for manager \n",
- "4 retrieve data using singletable sql query \n",
- "\n",
- " tags \n",
- "0 write a feature length screenplay for film or ... \n",
- "1 business strategy business model canvas analys... \n",
- "2 silicon thin film solar cell this course consi... \n",
- "3 finance for manager when it come to number the... \n",
- "4 retrieve data using singletable sql query in t... \n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"lemmatizer = WordNetLemmatizer()\n",
"\n",
"# Function for text cleaning (removing special characters, stopwords, and lemmatization)\n",
- "def clean_text(text):\n",
+ "def clean_for_tags(text):\n",
" text = re.sub(r'��+', '', text) # This removes \"��\" or any repeated \"��\" characters\n",
" text = re.sub(r'[^\\x00-\\x7F]+', '', text) # Removes non-ASCII characters\n",
" text = re.sub(r'[^a-zA-Z\\s]', '', text) # Remove anything that is not a letter or space\n",
@@ -586,40 +408,103 @@
" text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) # Lemmatization\n",
" return text\n",
"\n",
- "# Apply text cleaning to necessary columns\n",
- "data['Course Name'] = data['Course Name'].apply(clean_text)\n",
- "data['Course Description'] = data['Course Description'].apply(clean_text)\n",
- "data['Skills'] = data['Skills'].apply(clean_text)\n",
+ "# Work on a copy so the cleaned text does not overwrite the original columns in data\n",
+ "training_data = data.copy()\n",
+ "\n",
+ "# Clean every text column that is later concatenated into the tags column\n",
+ "for text_column in ('Course Name', 'Course Description', 'Skills'):\n",
+ "    training_data[text_column] = training_data[text_column].apply(clean_for_tags)\n",
"\n",
"# Combine 'Course Name', 'Course Description', and 'Skills' into 'tags'\n",
- "data['tags'] = data['Course Name'] + ' ' + data['Course Description'] + ' ' + data['Skills']\n",
- "print(data[['Course Name', 'tags']].head())\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "f6fa97b4",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Memory efficient datatypes"
+ "# The combined text is stored on data (not on the cleaned copy), so data keeps its original,\n",
+ "# uncleaned 'Course Name' next to tags built from the cleaned training_data columns.\n",
+ "data['tags'] = training_data['Course Name'] + ' ' + training_data['Course Description'] + ' ' + training_data['Skills']\n",
+ "\n",
+ "# Rebind training_data to a two-column selection of data; the cleaned copy created above is discarded.\n",
+ "training_data = data[['Course Name', 'tags']]"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "6ce32db5",
+ "execution_count": 11,
+ "id": "d6885a74",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Course Name | \n",
+ " tags | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Write A Feature Length Screenplay For Film Or ... | \n",
+ " write a feature length screenplay for film or ... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Business Strategy: Business Model Canvas Analy... | \n",
+ " business strategy business model canvas analys... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Silicon Thin Film Solar Cells | \n",
+ " silicon thin film solar cell this course consi... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Finance for Managers | \n",
+ " finance for manager when it come to number the... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Retrieve Data using Single-Table SQL Queries | \n",
+ " retrieve data using singletable sql query in t... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Course Name \\\n",
+ "0 Write A Feature Length Screenplay For Film Or ... \n",
+ "1 Business Strategy: Business Model Canvas Analy... \n",
+ "2 Silicon Thin Film Solar Cells \n",
+ "3 Finance for Managers \n",
+ "4 Retrieve Data using Single-Table SQL Queries \n",
+ "\n",
+ " tags \n",
+ "0 write a feature length screenplay for film or ... \n",
+ "1 business strategy business model canvas analys... \n",
+ "2 silicon thin film solar cell this course consi... \n",
+ "3 finance for manager when it come to number the... \n",
+ "4 retrieve data using singletable sql query in t... "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Convert 'University' and 'Difficulty Level' to categorical to save memory\n",
- "data['University'] = data['University'].astype('category')\n",
- "data['Difficulty Level'] = data['Difficulty Level'].astype('category')\n",
- "\n",
- "# Optimize 'Course Rating' column to save memory\n",
- "data['Course Rating'] = data['Course Rating'].apply(lambda x: 0.0 if x == 'Not Calibrated' else pd.to_numeric(x, errors='coerce'))\n",
- "data['Course Rating'] = data['Course Rating'].astype('float32')\n"
+ "training_data.head()"
]
},
{
@@ -632,7 +517,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 12,
"id": "aab37f89",
"metadata": {},
"outputs": [
@@ -646,13 +531,21 @@
],
"source": [
"vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)\n",
- "tfidf_matrix = vectorizer.fit_transform(data['tags'])\n",
+ "tfidf_matrix = vectorizer.fit_transform(training_data['tags'])\n",
"print(\"TF-IDF matrix shape:\", tfidf_matrix.shape)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "42cdb85d",
+ "metadata": {},
+ "source": [
+ "Apply SVD on TF-IDF
"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 13,
"id": "493e2f7d",
"metadata": {},
"outputs": [
@@ -665,10 +558,7 @@
}
],
"source": [
- "from sklearn.decomposition import TruncatedSVD\n",
- "\n",
- "# Apply SVD to reduce dimensions of the TF-IDF matrix\n",
- "n_components = 100\n",
+ "n_components = 100 # Reduce to 100 dimensions\n",
"svd = TruncatedSVD(n_components=n_components, random_state=42)\n",
"tfidf_matrix = svd.fit_transform(tfidf_matrix)\n",
"\n",
@@ -685,7 +575,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 14,
"id": "ab0769d7",
"metadata": {
"scrolled": true
@@ -709,79 +599,99 @@
"id": "62948f78",
"metadata": {},
"source": [
- "Recommendation Function
"
+ "Functions for Recommendation
"
]
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "e79b0a64",
+ "execution_count": 15,
+ "id": "9333fc14",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Recommendations based on 'Finance for Managers':\n",
- "\n",
- "- introduction to finance the basic (Adjusted Similarity Score: 0.97, Rating: 4.599999904632568)\n",
- "- finance for nonfinancial professional (Adjusted Similarity Score: 0.97, Rating: 4.5)\n",
- "- the language and tool of financial analysis (Adjusted Similarity Score: 0.96, Rating: 4.599999904632568)\n",
- "- finance for nonfinancial manager (Adjusted Similarity Score: 0.95, Rating: 4.199999809265137)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "# Ensure 'Course Rating' is numeric, and handle errors gracefully\n",
- "data['Course Rating'] = pd.to_numeric(data['Course Rating'], errors='coerce')\n",
- "# Normalize ratings (if ratings are between 1 and 5)\n",
- "data['Normalized Rating'] = (data['Course Rating'] - data['Course Rating'].min()) / (data['Course Rating'].max() - data['Course Rating'].min())\n",
- "\n",
- "def recommend(course_name, top_n=5, threshold=70, min_rating=3):\n",
+ "def normalize_rating(rating_str):\n",
" \"\"\"\n",
- " Recommend courses based on course name, taking into account both cosine similarity and ratings.\n",
- "\n",
- " Args:\n",
- " - course_name (str): Name of the course to base recommendations on.\n",
- " - top_n (int): Number of top recommendations to display.\n",
- " - threshold (int): Minimum similarity score for a course to be recommended.\n",
- " - min_rating (int): Minimum rating a course must have to be recommended.\n",
+ "    Normalize the course rating to a 0-1 scale.\n",
+ "\n",
+ "    Args:\n",
+ "        rating_str: Rating on a 0-5 scale, given as a number or a numeric string.\n",
+ "\n",
+ "    Returns:\n",
+ "        float: rating / 5, or 0.0 when the value is missing, NaN or not numeric.\n",
" \"\"\"\n",
- " \n",
- " similarity_scores = list(enumerate(similarity_matrix[course_idx]))\n",
- " sorted_courses = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:]\n",
- " \n",
- " recommended_courses = []\n",
- " \n",
- " # Adjust recommendations using ratings\n",
- " for idx, sim_score in sorted_courses[:top_n]:\n",
- " course_rating = data.iloc[idx]['Course Rating']\n",
- " if pd.isna(course_rating):\n",
- " course_rating = data['Course Rating'].mean() # Fallback to mean rating\n",
- "\n",
- " # Normalize similarity score\n",
- " sim_score = float(sim_score)\n",
- " sim_min, sim_max = min(sim_score for _, sim_score in sorted_courses), max(sim_score for _, sim_score in sorted_courses)\n",
- " normalized_sim_score = (sim_score - sim_min) / (sim_max - sim_min)\n",
+ "    try:\n",
+ "        rating = float(rating_str)\n",
+ "    except (TypeError, ValueError):\n",
+ "        # float() raises ValueError for non-numeric text and TypeError for None and similar\n",
+ "        return 0.0\n",
+ "    if rating != rating:  # NaN is the only float that is not equal to itself\n",
+ "        return 0.0  # Keep NaN out of the weighted score, where it would break sorting\n",
+ "    return (rating - 0) / (5 - 0)  # Normalize to 0-1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "a990f12f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_recommendations(course_name, data, similarity_matrix, top_n=3, threshold=90, rating_weight=0.05, include_self=True):\n",
+ "    \"\"\"\n",
+ "    Get top N course recommendations based on similarity to the given course name.\n",
+ "\n",
+ "    Args:\n",
+ "        course_name (str): Exact 'Course Name' value of the course to base recommendations on.\n",
+ "        data (pd.DataFrame): Course table; row i must describe the same course as row/column i\n",
+ "            of similarity_matrix.\n",
+ "        similarity_matrix: Square matrix of pairwise course similarities, addressed by row position.\n",
+ "        top_n (int): Maximum number of recommendations to return.\n",
+ "        threshold (int): Not used by this function; accepted so existing callers keep working.\n",
+ "        rating_weight (float): Share of final_score taken from the normalized rating; the\n",
+ "            remaining share comes from the similarity score.\n",
+ "        include_self (bool): When True (the default, matching previous behaviour) the selected\n",
+ "            course itself can be returned. Pass False to leave it out.\n",
+ "\n",
+ "    Returns:\n",
+ "        list[dict]: One dict per recommendation, sorted by 'final_score' in descending order.\n",
+ "\n",
+ "    Raises:\n",
+ "        IndexError: If no row of data has the given course name.\n",
+ "    \"\"\"\n",
+ "    # similarity_matrix is addressed by row position, but data may carry non-contiguous index\n",
+ "    # labels (e.g. after drop_duplicates), so resolve the course to its position, not its label.\n",
+ "    matches = np.flatnonzero((data['Course Name'] == course_name).to_numpy())\n",
+ "    if matches.size == 0:\n",
+ "        raise IndexError(f\"Course '{course_name}' not found in data\")\n",
+ "    course_pos = int(matches[0])  # First match wins when several rows share the name\n",
+ "\n",
+ "    # (row position, similarity) pairs for every course, most similar first\n",
+ "    ranked = sorted(enumerate(similarity_matrix[course_pos]), key=lambda pair: pair[1], reverse=True)\n",
+ "    if not include_self:\n",
+ "        ranked = [pair for pair in ranked if pair[0] != course_pos]\n",
+ "\n",
+ "    recommendations = []\n",
+ "    for idx, similarity_score in ranked[:top_n]:\n",
+ "        course_data = data.iloc[idx]  # idx is positional, hence iloc\n",
+ "        normalized_rating = normalize_rating(course_data.get('Course Rating', '0'))  # Normalize rating\n",
+ "\n",
+ "        # Prepare recommendation dictionary with relevant course information\n",
+ "        recommendations.append({\n",
+ "            \"course_name\": course_data['Course Name'],\n",
+ "            \"course_url\": course_data.get('Course URL', ''),\n",
+ "            \"rating\": course_data['Course Rating'],\n",
+ "            \"institution\": course_data.get('University', 'Unknown'),\n",
+ "            \"difficulty_level\": course_data.get('Difficulty Level', 'Unknown'),\n",
+ "            \"similarity\": similarity_score,\n",
+ "            # Weighted blend: mostly similarity, nudged by the normalized rating\n",
+ "            \"final_score\": similarity_score * (1 - rating_weight) + normalized_rating * rating_weight\n",
+ "        })\n",
+ "\n",
+ "    return sorted(recommendations, key=lambda x: x['final_score'], reverse=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "201f8d31",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'course_name': 'Finance for Managers',\n",
+ " 'course_url': 'https://www.coursera.org/learn/operational-finance',\n",
+ " 'rating': '4.8',\n",
+ " 'institution': 'IESE Business School',\n",
+ " 'difficulty_level': 'Intermediate',\n",
+ " 'similarity': np.float64(1.0),\n",
+ " 'final_score': np.float64(0.998)},\n",
+ " {'course_name': 'Finance for Non-Financial Professionals',\n",
+ " 'course_url': 'https://www.coursera.org/learn/finance-for-non-finance-managers',\n",
+ " 'rating': '4.5',\n",
+ " 'institution': 'University of California, Irvine',\n",
+ " 'difficulty_level': 'Conversant',\n",
+ " 'similarity': np.float64(0.832940692838965),\n",
+ " 'final_score': np.float64(0.8362936581970167)},\n",
+ " {'course_name': 'Finance for Non-Financial Managers',\n",
+ " 'course_url': 'https://www.coursera.org/learn/finance-for-non-financial-managers',\n",
+ " 'rating': '4.2',\n",
+ " 'institution': 'Emory University',\n",
+ " 'difficulty_level': 'Beginner',\n",
+ " 'similarity': np.float64(0.8315387472926317),\n",
+ " 'final_score': np.float64(0.8319618099280001)}]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_recommendations('Finance for Managers', data, similarity_matrix)"
]
},
{
@@ -794,18 +704,12 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 19,
"id": "8c3a275a",
"metadata": {},
"outputs": [],
"source": [
- "# Saving the trained model\n",
- "pickle.dump(similarity_matrix, open('similarity_matrix.pkl', 'wb'))\n",
- "pickle.dump(vectorizer, open('tfidf_vectorizer.pkl', 'wb'))\n",
- "\n",
- "# In a web application or future use, you can load the model:\n",
- "# similarity_matrix = pickle.load(open('similarity_matrix.pkl', 'rb'))\n",
- "# vectorizer = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))\n"
+ "# Use a context manager so the file is flushed and closed even if pickling fails\n",
+ "with open('similarity_matrix.pkl', 'wb') as model_file:\n",
+ "    pickle.dump(similarity_matrix, model_file)"
]
}
],