{ "cells": [ { "cell_type": "markdown", "id": "f8888b32", "metadata": {}, "source": [ "\n", "# Recommendation Systems Introduction\n", "\n", "* Recommendation systems are one on the most valuable applicatins of Machine Learning.\n", "* Amazon attributes 20% their 221.60 billion dollars of their yearly revenues to recommendation.\n", "* Recommendation systems can use explicit and implict data.\n", " - A user rating for a product or movie is explicit.\n", " - Buying/watched history for a user is implicit.\n", "* Implicit data is already personalised.\n", "* Explicit data often needs to be renomalised across users -- For example, users may rate movies differently.\n", "* One simple approach to the problem is item-based collaborative filtering.\n", "* You can also go in the opposite direction and perform user-based collaborative filtering.\n", "\n", "![alt text](RecMatrix.png \"The Recommender Matrix\")\n", "\n" ] }, { "cell_type": "markdown", "id": "500e990f", "metadata": {}, "source": [ "# Exploring the dataset" ] }, { "cell_type": "code", "execution_count": 1, "id": "780b8fd5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Head of ratings csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01312.51260759144
1110293.01260759179
2110613.01260759182
3111292.01260759185
4111724.01260759205
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1 31 2.5 1260759144\n", "1 1 1029 3.0 1260759179\n", "2 1 1061 3.0 1260759182\n", "3 1 1129 2.0 1260759185\n", "4 1 1172 4.0 1260759205" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First lets just see an example algorithm run on movielens.\n", "\n", "import os\n", "import csv\n", "import sys\n", "import re\n", "from surprise import Dataset\n", "from surprise import Reader\n", "from collections import defaultdict\n", "import numpy as np\n", "from surprise import SVD\n", "import pandas as pd\n", "\n", "ratingsFile = 'ratings.csv'\n", "moviesFile = 'movies.csv'\n", "\n", "rdf = pd.read_csv(ratingsFile,header=0)\n", "print('Head of ratings csv')\n", "rdf.head()" ] }, { "cell_type": "code", "execution_count": 2, "id": "c763caf8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Head of movies csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
\n", "
" ], "text/plain": [ " movieId title \\\n", "0 1 Toy Story (1995) \n", "1 2 Jumanji (1995) \n", "2 3 Grumpier Old Men (1995) \n", "3 4 Waiting to Exhale (1995) \n", "4 5 Father of the Bride Part II (1995) \n", "\n", " genres \n", "0 Adventure|Animation|Children|Comedy|Fantasy \n", "1 Adventure|Children|Fantasy \n", "2 Comedy|Romance \n", "3 Comedy|Drama|Romance \n", "4 Comedy " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mdf = pd.read_csv(moviesFile,header=0)\n", "print('Head of movies csv')\n", "mdf.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "a3556fcd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdimdbIdtmdbId
01114709862.0
121134978844.0
2311322815602.0
3411488531357.0
4511304111862.0
\n", "
" ], "text/plain": [ " movieId imdbId tmdbId\n", "0 1 114709 862.0\n", "1 2 113497 8844.0\n", "2 3 113228 15602.0\n", "3 4 114885 31357.0\n", "4 5 113041 11862.0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dflinks = pd.read_csv(\"links.csv\")\n", "dftags = pd.read_csv(\"tags.csv\")\n", "#take a look at the training data\n", "dflinks.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "a0b0891a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdtagtimestamp
015339sandra 'boring' bullock1138537770
1151955dentist1193435061
2157478Cambodia1170560997
31532892Russian1170626366
41534162forgettable1141391765
\n", "
" ], "text/plain": [ " userId movieId tag timestamp\n", "0 15 339 sandra 'boring' bullock 1138537770\n", "1 15 1955 dentist 1193435061\n", "2 15 7478 Cambodia 1170560997\n", "3 15 32892 Russian 1170626366\n", "4 15 34162 forgettable 1141391765" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dftags.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "e6d8c696", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Movie : Index(['movieId', 'title', 'genres'], dtype='object')\n", "\n", "Rating : Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')\n", "\n", "Links : Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')\n", "\n", "Tags : Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')\n", "\n", "\n", "RangeIndex: 9125 entries, 0 to 9124\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 movieId 9125 non-null int64 \n", " 1 title 9125 non-null object\n", " 2 genres 9125 non-null object\n", "dtypes: int64(1), object(2)\n", "memory usage: 214.0+ KB\n", "\n", "RangeIndex: 100004 entries, 0 to 100003\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 userId 100004 non-null int64 \n", " 1 movieId 100004 non-null int64 \n", " 2 rating 100004 non-null float64\n", " 3 timestamp 100004 non-null int64 \n", "dtypes: float64(1), int64(3)\n", "memory usage: 3.1 MB\n", "\n", "RangeIndex: 1296 entries, 0 to 1295\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 userId 1296 non-null int64 \n", " 1 movieId 1296 non-null int64 \n", " 2 tag 1296 non-null object\n", " 3 timestamp 1296 non-null int64 \n", "dtypes: int64(3), object(1)\n", "memory usage: 40.6+ KB\n" ] } ], "source": [ "#get a list of the features within the dataset\n", "print(\"Movie : \", mdf.columns,end=\"\\n\\n\")\n", "print(\"Rating : \", rdf.columns,end=\"\\n\\n\")\n", "print(\"Links : \", dflinks.columns,end=\"\\n\\n\")\n", "print(\"Tags : \", dftags.columns,end=\"\\n\\n\")\n", "\n", "mdf.info()\n", "rdf.info()\n", "dftags.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "d37778ea", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Max No.of Movies Relesed = 275\n", "Year = 1996\n" ] }, { "data": { "text/plain": [ "count 105.000000\n", "mean 86.866667\n", "std 92.458522\n", "min 1.000000\n", "25% 20.000000\n", "50% 45.000000\n", "75% 147.000000\n", "max 275.000000\n", "Name: title, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#visualization libraries\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "#ignore warnings\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "#Extracting the year from the Title\n", "mdf['Year'] = mdf['title'].str.extract('.*\\((.*)\\).*',expand = False)\n", "\n", "#Ploting a Graph with No.of Movies each Year corresponding to its Year\n", "plt.plot(mdf.groupby('Year').title.count())\n", "plt.show()\n", "a=mdf.groupby('Year').title.count()\n", "print('Max No.of Movies Relesed =',a.max())\n", "for i in a.index:\n", " if a[i] == a.max():\n", " print('Year =',i)\n", "a.describe()" ] }, { "cell_type": "code", "execution_count": 7, "id": "b1ed3a23", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenresYearAdventureAnimationChildrenComedyFantasyRomance...HorrorMysterySci-FiDocumentaryIMAXWarMusicalWesternFilm-Noir(no genres listed)
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy1995111110...0000000000
12Jumanji (1995)Adventure|Children|Fantasy1995101010...0000000000
23Grumpier Old Men (1995)Comedy|Romance1995000101...0000000000
34Waiting to Exhale (1995)Comedy|Drama|Romance1995000101...0000000000
45Father of the Bride Part II (1995)Comedy1995000100...0000000000
..................................................................
9120162672Mohenjo Daro (2016)Adventure|Drama|Romance2016100001...0000000000
9121163056Shin Godzilla (2016)Action|Adventure|Fantasy|Sci-Fi2016100010...0010000000
9122163949The Beatles: Eight Days a Week - The Touring Y...Documentary2016000000...0001000000
9123164977The Gay Desperado (1936)Comedy1936000100...0000000000
9124164979Women of '69, UnboxedDocumentaryNaN000000...0001000000
\n", "

9125 rows × 24 columns

\n", "
" ], "text/plain": [ " movieId title \\\n", "0 1 Toy Story (1995) \n", "1 2 Jumanji (1995) \n", "2 3 Grumpier Old Men (1995) \n", "3 4 Waiting to Exhale (1995) \n", "4 5 Father of the Bride Part II (1995) \n", "... ... ... \n", "9120 162672 Mohenjo Daro (2016) \n", "9121 163056 Shin Godzilla (2016) \n", "9122 163949 The Beatles: Eight Days a Week - The Touring Y... \n", "9123 164977 The Gay Desperado (1936) \n", "9124 164979 Women of '69, Unboxed \n", "\n", " genres Year Adventure Animation \\\n", "0 Adventure|Animation|Children|Comedy|Fantasy 1995 1 1 \n", "1 Adventure|Children|Fantasy 1995 1 0 \n", "2 Comedy|Romance 1995 0 0 \n", "3 Comedy|Drama|Romance 1995 0 0 \n", "4 Comedy 1995 0 0 \n", "... ... ... ... ... \n", "9120 Adventure|Drama|Romance 2016 1 0 \n", "9121 Action|Adventure|Fantasy|Sci-Fi 2016 1 0 \n", "9122 Documentary 2016 0 0 \n", "9123 Comedy 1936 0 0 \n", "9124 Documentary NaN 0 0 \n", "\n", " Children Comedy Fantasy Romance ... Horror Mystery Sci-Fi \\\n", "0 1 1 1 0 ... 0 0 0 \n", "1 1 0 1 0 ... 0 0 0 \n", "2 0 1 0 1 ... 0 0 0 \n", "3 0 1 0 1 ... 0 0 0 \n", "4 0 1 0 0 ... 0 0 0 \n", "... ... ... ... ... ... ... ... ... \n", "9120 0 0 0 1 ... 0 0 0 \n", "9121 0 0 1 0 ... 0 0 1 \n", "9122 0 0 0 0 ... 0 0 0 \n", "9123 0 1 0 0 ... 0 0 0 \n", "9124 0 0 0 0 ... 0 0 0 \n", "\n", " Documentary IMAX War Musical Western Film-Noir (no genres listed) \n", "0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 \n", "... ... ... ... ... ... ... ... \n", "9120 0 0 0 0 0 0 0 \n", "9121 0 0 0 0 0 0 0 \n", "9122 1 0 0 0 0 0 0 \n", "9123 0 0 0 0 0 0 0 \n", "9124 1 0 0 0 0 0 0 \n", "\n", "[9125 rows x 24 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Seperate the Geners Column and Encoding them with One-Hot-Encoding Method.\n", "genres=[]\n", "for i in range(len(mdf.genres)):\n", " for x in mdf.genres[i].split('|'):\n", " if x not in genres:\n", " genres.append(x) \n", "\n", "len(genres)\n", "for x in genres:\n", " mdf[x] = 0\n", "for i in range(len(mdf.genres)):\n", " for x in mdf.genres[i].split('|'):\n", " mdf[x][i]=1\n", "mdf" ] }, { "cell_type": "code", "execution_count": 8, "id": "7974aaba", "metadata": {}, "outputs": [], "source": [ "mdf.drop(columns='genres',inplace=True)\n", "mdf.sort_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 9, "id": "51295269", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Animation \t\t\t\t447\n", "Children \t\t\t\t583\n", "Comedy \t\t\t\t3315\n", "Fantasy \t\t\t\t654\n", "Romance \t\t\t\t1545\n", "Drama \t\t\t\t4365\n", "Action \t\t\t\t1545\n", "Crime \t\t\t\t1100\n", "Thriller \t\t\t\t1729\n", "Horror \t\t\t\t877\n", "Mystery \t\t\t\t543\n", "Sci-Fi \t\t\t\t792\n", "Documentary \t\t\t\t495\n", "IMAX \t\t\t\t153\n", "War \t\t\t\t367\n", "Musical \t\t\t\t394\n", "Western \t\t\t\t168\n", "Film-Noir \t\t\t\t133\n", "(no genres listed) \t\t\t\t18\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "x={}\n", "for i in mdf.columns[4:23]:\n", " x[i]=mdf[i].value_counts()[1]\n", " print(\"{} \\t\\t\\t\\t{}\".format(i,x[i]))\n", "\n", "plt.bar(height=x.values(),x=x.keys())\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "id": "589b0b8d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitleratinguserId
321356Forrest Gump (1994)4.054252341.0
266296Pulp Fiction (1994)4.256173324.0
284318Shawshank Redemption, The (1994)4.487138311.0
525593Silence of the Lambs, The (1991)4.138158304.0
232260Star Wars: Episode IV - A New Hope (1977)4.221649291.0
...............
9112161336Author: The JT LeRoy Story (2016)0NaN
9113161582Hell or High Water (2016)0NaN
9121163056Shin Godzilla (2016)0NaN
9123164977The Gay Desperado (1936)0NaN
9124164979Women of '69, Unboxed0NaN
\n", "

9125 rows × 4 columns

\n", "
" ], "text/plain": [ " movieId title rating userId\n", "321 356 Forrest Gump (1994) 4.054252 341.0\n", "266 296 Pulp Fiction (1994) 4.256173 324.0\n", "284 318 Shawshank Redemption, The (1994) 4.487138 311.0\n", "525 593 Silence of the Lambs, The (1991) 4.138158 304.0\n", "232 260 Star Wars: Episode IV - A New Hope (1977) 4.221649 291.0\n", "... ... ... ... ...\n", "9112 161336 Author: The JT LeRoy Story (2016) 0 NaN\n", "9113 161582 Hell or High Water (2016) 0 NaN\n", "9121 163056 Shin Godzilla (2016) 0 NaN\n", "9123 164977 The Gay Desperado (1936) 0 NaN\n", "9124 164979 Women of '69, Unboxed 0 NaN\n", "\n", "[9125 rows x 4 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Add a Column `rating` in movie DF and assign them with the Mean Movie Rating for that Movie.\n", "x=rdf.groupby('movieId').rating.mean()\n", "mdf = pd.merge(mdf,x,how='outer',on='movieId')\n", "mdf['rating'].fillna('0',inplace=True)\n", "# Now Lets group all the ratings with respect to movieId and count the no of Users\n", "x = rdf.groupby('movieId',as_index=False).userId.count()\n", "x.sort_values('userId',ascending=False,inplace=True)\n", "y = pd.merge(mdf,x,how='outer',on='movieId')\n", "\n", "y.drop(columns=[i for i in mdf.columns[2:23]],inplace=True)\n", "\n", "y.sort_values(['userId','rating'],ascending=False)" ] }, { "cell_type": "code", "execution_count": 11, "id": "d1356824", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdrating
count671.000000671.000000671.000000
mean336.000000149.0372583.657587
std193.845299231.2269480.471339
min1.00000020.0000001.333333
25%168.50000037.0000003.396193
50%336.00000071.0000003.675000
75%503.500000161.0000003.984026
max671.0000002391.0000004.948718
\n", "
" ], "text/plain": [ " userId movieId rating\n", "count 671.000000 671.000000 671.000000\n", "mean 336.000000 149.037258 3.657587\n", "std 193.845299 231.226948 0.471339\n", "min 1.000000 20.000000 1.333333\n", "25% 168.500000 37.000000 3.396193\n", "50% 336.000000 71.000000 3.675000\n", "75% 503.500000 161.000000 3.984026\n", "max 671.000000 2391.000000 4.948718" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Find the user with highest no.of. movie ratings and that users mean rating. \n", "x = rdf.groupby('userId',as_index=False).movieId.count()\n", "y = rdf.groupby('userId',as_index=False).rating.mean()\n", "x = pd.merge(x,y,how='outer',on='userId')\n", "x.describe()" ] }, { "cell_type": "code", "execution_count": 12, "id": "fa8ebfee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdrating
54654723913.366792
56356418683.552463
62362417352.894236
141517002.621765
727316103.374224
............
295296203.975000
288289203.675000
248249203.600000
220221202.775000
01202.550000
\n", "

671 rows × 3 columns

\n", "
" ], "text/plain": [ " userId movieId rating\n", "546 547 2391 3.366792\n", "563 564 1868 3.552463\n", "623 624 1735 2.894236\n", "14 15 1700 2.621765\n", "72 73 1610 3.374224\n", ".. ... ... ...\n", "295 296 20 3.975000\n", "288 289 20 3.675000\n", "248 249 20 3.600000\n", "220 221 20 2.775000\n", "0 1 20 2.550000\n", "\n", "[671 rows x 3 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# From below we found that userId - 546 has given the Highest no.of Ratings with \n", "# 2391 and an Average of 3.37 Stars followed by userId-564 with 1868 Ratings and \n", "# an Average of 3.55 Stars and so on …\n", "x.sort_values('movieId',ascending=False)" ] }, { "cell_type": "markdown", "id": "9ff9e1b4", "metadata": {}, "source": [ "# Using the Surprise recommender library api" ] }, { "cell_type": "code", "execution_count": 13, "id": "11b04054", "metadata": {}, "outputs": [], "source": [ "# Load and Procress all of the data using Surprise library tools.\n", "\n", "reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)\n", "ratingsDataset = Dataset.load_from_file(ratingsFile, reader=reader)\n", "\n", "movieID_to_name = {}\n", "name_to_movieID = {}\n", "\n", "ratings = defaultdict(int)\n", "rankings = defaultdict(int)\n", "genres = defaultdict(list)\n", "\n", "genreIDs = {}\n", "maxGenreID = 0\n", "rank = 1\n", "\n", "with open(moviesFile, newline='') as csvfile:\n", " movieReader = csv.reader(csvfile)\n", " next(movieReader) #Skip header line\n", " for row in movieReader:\n", " movieID = int(row[0])\n", " movieName = row[1]\n", " movieID_to_name[movieID] = movieName\n", " name_to_movieID[movieName] = movieID\n", "\n", "with open(ratingsFile, newline='') as csvfile:\n", " ratingReader = csv.reader(csvfile)\n", " next(ratingReader)\n", " for row in ratingReader:\n", " movieID = int(row[1])\n", " ratings[movieID] += 1\n", " \n", "with open(moviesFile, newline='') as csvfile:\n", " movieReader = csv.reader(csvfile)\n", " next(movieReader) #Skip header line\n", " for row in movieReader:\n", " movieID = int(row[0])\n", " genreList = row[2].split('|')\n", " genreIDList = []\n", " for genre in genreList:\n", " if genre in genreIDs:\n", " genreID = genreIDs[genre]\n", " else:\n", " genreID = maxGenreID\n", " genreIDs[genre] = genreID\n", " maxGenreID += 1\n", " genreIDList.append(genreID)\n", " genres[movieID] = genreIDList\n", "\n", "# Build rankings dictionary.\n", "for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):\n", " rankings[movieID] = rank\n", " rank += 1\n", "\n", "# Convert integer-encoded genre lists to bitfields that we can treat as vectors\n", "for (movieID, genreIDList) in genres.items():\n", " bitfield = [0] * maxGenreID\n", " for genreID in genreIDList:\n", " bitfield[genreID] = 1\n", " genres[movieID] = bitfield\n", " \n", "def getUserRatings(user):\n", " userRatings = []\n", " hitUser = False\n", " with open(ratingsFile, newline='') as csvfile:\n", " ratingReader = csv.reader(csvfile)\n", " next(ratingReader)\n", " for row in ratingReader:\n", " userID = int(row[0])\n", " if (user == userID):\n", " movieID = int(row[1])\n", " rating = float(row[2])\n", " userRatings.append((movieID, rating))\n", " hitUser = True\n", " if (hitUser and (user != userID)):\n", " break\n", " return userRatings\n", " \n", "def BuildAntiTestSetForUser(testSubject, trainset):\n", " fill = trainset.global_mean\n", " anti_testset = []\n", " u = trainset.to_inner_uid(str(testSubject))\n", " user_items = set([j for (j, _) in trainset.ur[u]])\n", " anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for\n", " i in trainset.all_items() if\n", " i not in user_items]\n", " return anti_testset\n", "\n", "def getMovieName(movieID):\n", " if movieID in movieID_to_name:\n", " return movieID_to_name[movieID]\n", " else:\n", " return \"\"\n", " \n", "def getMovieID(movieName):\n", " if movieName in name_to_movieID:\n", " return name_to_movieID[movieName]\n", " else:\n", " return 0 \n" ] }, { "cell_type": "markdown", "id": "806336c7", "metadata": {}, "source": [ "# Simple Collaborative Filtering\n", "![alt text](collabf.png \"Simple Collaborative filtering\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "d75b4a7b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "James Dean Story, The (1957) 10.0\n", "Get Real (1998) 9.987241120712646\n", "Kiss of Death (1995) 9.966881877751941\n", "Set It Off (1996) 9.963732215657119\n", "How Green Was My Valley (1941) 9.943984081065269\n", "Amos & Andrew (1993) 9.93973694500253\n", "My Crazy Life (Mi vida loca) (1993) 9.938290487546041\n", "Grace of My Heart (1996) 9.926255896645218\n", "Fanny and Alexander (Fanny och Alexander) (1982) 9.925699671455906\n", "Wild Reeds (Les roseaux sauvages) (1994) 9.916226404418774\n", "Edge of Seventeen (1998) 9.913028764691676\n" ] } ], "source": [ "# Simple Item Based Collaborative Filtering -- the algorithm that Amazon made famous in 2003.\n", "\n", "from surprise import KNNBasic\n", "import heapq\n", "from collections import defaultdict\n", "from operator import itemgetter\n", "\n", "testSubject = '85'\n", "k = 10\n", "trainSet = ratingsDataset.build_full_trainset()\n", "sim_options = {'name': 'cosine','user_based': False}\n", "model = KNNBasic(sim_options=sim_options)\n", "model.fit(trainSet)\n", "simsMatrix = model.compute_similarities()\n", "testUserInnerID = trainSet.to_inner_uid(testSubject)\n", "# Get the top K items we rated\n", "testUserRatings = trainSet.ur[testUserInnerID]\n", "kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])\n", "# Get similar items to stuff we liked (weighted by rating)\n", "candidates = defaultdict(float)\n", "\n", "for itemID, rating in kNeighbors:\n", " similarityRow = simsMatrix[itemID]\n", " for innerID, score in enumerate(similarityRow):\n", " candidates[innerID] += score * (rating / 5.0)\n", "\n", "# Build a dictionary of stuff the user has already seen\n", "watched = {}\n", "\n", "for itemID, rating in trainSet.ur[testUserInnerID]:\n", " watched[itemID] = 1\n", "\n", "# Get top-rated items from similar users:\n", "pos = 0\n", "\n", "for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):\n", " if not itemID in watched:\n", " movieID = trainSet.to_raw_iid(itemID)\n", " print(getMovieName(int(movieID)), ratingSum)\n", " pos += 1\n", " if (pos > 10):\n", " break" ] }, { "cell_type": "code", "execution_count": 15, "id": "16f69db3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Inception (2010) 3.3\n", "Star Wars: Episode V - The Empire Strikes Back (1980) 2.4\n", "Bourne Identity, The (1988) 2.0\n", "Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000) 2.0\n", "Dark Knight, The (2008) 2.0\n", "Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966) 1.9\n", "Departed, The (2006) 1.9\n", "Dark Knight Rises, The (2012) 1.9\n", "Back to the Future (1985) 1.9\n", "Gravity (2013) 1.8\n", "Fight Club (1999) 1.8\n" ] } ], "source": [ "# Simple User-based collaborative filtering\n", "from surprise import KNNBasic\n", "import heapq\n", "from collections import defaultdict\n", "from operator import itemgetter\n", "\n", "testSubject = '85'\n", "k = 10\n", "trainSet = ratingsDataset.build_full_trainset()\n", "sim_options = {'name': 'cosine','user_based': True}\n", "model = KNNBasic(sim_options=sim_options)\n", "model.fit(trainSet)\n", "simsMatrix = model.compute_similarities()\n", "# Get top N similar users to our test subject\n", "# (Alternate approach would be to select users up to some similarity threshold - try it!)\n", "testUserInnerID = trainSet.to_inner_uid(testSubject)\n", "similarityRow = simsMatrix[testUserInnerID]\n", "similarUsers = []\n", "for innerID, score in enumerate(similarityRow):\n", " if (innerID != testUserInnerID):\n", " similarUsers.append( (innerID, score) )\n", "\n", "kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])\n", "# Get the stuff they rated, and add up ratings for each item, weighted by user similarity\n", "candidates = defaultdict(float)\n", "for similarUser in kNeighbors:\n", " innerID = similarUser[0]\n", " userSimilarityScore = similarUser[1]\n", " theirRatings = trainSet.ur[innerID]\n", " for rating in theirRatings:\n", " candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore\n", "\n", "# Build a dictionary of stuff the user has already seen\n", "watched = {}\n", "for itemID, rating in trainSet.ur[testUserInnerID]:\n", " watched[itemID] = 1\n", "\n", "# Get top-rated items from similar users:\n", "pos = 0\n", "for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):\n", " if not itemID in watched:\n", " movieID = trainSet.to_raw_iid(itemID)\n", " print(getMovieName(int(movieID)), ratingSum)\n", " pos += 1\n", " if (pos > 10):\n", " break\n" ] }, { "cell_type": "markdown", "id": "12638d74", "metadata": {}, "source": [ "# Evaluation of Recommendations" ] }, { "cell_type": "code", "execution_count": 16, "id": "807914e2", "metadata": {}, "outputs": [], "source": [ "# Helper class to evalute Recommendation Systems Quality by multiple poplular measures.\n", "# As you can see there is no shortage on ways that quality is measured in this field.\n", "import itertools\n", "from surprise import accuracy\n", "from collections import defaultdict\n", "\n", "class RecommenderMetrics:\n", "\n", " # Mean Average Error\n", " def MAE(predictions):\n", " return accuracy.mae(predictions, verbose=False)\n", " \n", " # Root Mean Squared error\n", " def RMSE(predictions):\n", " return accuracy.rmse(predictions, verbose=False)\n", "\n", " # Not a metric. It s the top-N predictions for an algorithm.\n", " def GetTopN(predictions, n=10, minimumRating=4.0):\n", " topN = defaultdict(list)\n", " for userID, movieID, actualRating, estimatedRating, _ in predictions:\n", " if (estimatedRating >= minimumRating):\n", " topN[int(userID)].append((int(movieID), estimatedRating))\n", " for userID, ratings in topN.items():\n", " ratings.sort(key=lambda x: x[1], reverse=True)\n", " topN[int(userID)] = ratings[:n]\n", " return topN\n", "\n", " # How good is this top-10 list?\n", " # - Find all items in this user’s history in the training data.\n", " # - Intentionally remove one of these items ( Leave-One-Out cross-validation).\n", " # - Use all other items to feed the recommender and ask for top 10 recommendations.\n", " # - If the removed item appear in the top 10 recommendations, it is a hit. If not, it’s not a hit.\n", " def HitRate(topNPredicted, leftOutPredictions):\n", " hits = 0\n", " total = 0\n", " # For each left-out rating\n", " for leftOut in leftOutPredictions:\n", " userID = leftOut[0]\n", " leftOutMovieID = leftOut[1]\n", " # Is it in the predicted top 10 for this user?\n", " hit = False\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " if (int(leftOutMovieID) == int(movieID)):\n", " hit = True\n", " break\n", " if (hit) :\n", " hits += 1\n", " total += 1\n", " # Compute overall precision\n", " return hits/total\n", " \n", " # Normally you set the rating cutoff. When you do the quantifies how good\n", " # the results are based on rating preference.\n", " def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):\n", " hits = 0\n", " total = 0\n", " # For each left-out rating\n", " for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n", " # Only look at ability to recommend things the users actually liked...\n", " if (actualRating >= ratingCutoff):\n", " # Is it in the predicted top 10 for this user?\n", " hit = False\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " if (int(leftOutMovieID) == movieID):\n", " hit = True\n", " break\n", " if (hit) :\n", " hits += 1\n", " total += 1\n", " # Compute overall precision\n", " return hits/total\n", " \n", " # Determine the hit rat by each rating grade.\n", " def RatingHitRate(topNPredicted, leftOutPredictions):\n", " hits = defaultdict(float)\n", " total = defaultdict(float)\n", " # For each left-out rating\n", " for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n", " # Is it in the predicted top N for this user?\n", " hit = False\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " if (int(leftOutMovieID) == movieID):\n", " hit = True\n", " break\n", " if (hit) :\n", " hits[actualRating] += 1\n", " total[actualRating] += 1\n", " # Compute overall precision\n", " for rating in sorted(hits.keys()):\n", " print (rating, hits[rating] / total[rating])\n", " \n", " # Some argue that this is one of the best ways to evaluate a recommender system.\n", " # It is similar to reciprical rank -- the average of the first relevant hit for each prediction.\n", " def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):\n", " summation = 0\n", " total = 0\n", " # For each left-out rating\n", " for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n", " # Is it in the predicted top N for this user?\n", " hitRank = 0\n", " rank = 0\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " rank = rank + 1\n", " if (int(leftOutMovieID) == movieID):\n", " hitRank = rank\n", " break\n", " if (hitRank > 0) :\n", " summation += 1.0 / hitRank\n", " total += 1\n", " return summation / total\n", "\n", " # What percentage of users have at least one \"good\" recommendation?\n", " def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):\n", " hits = 0\n", " for userID in topNPredicted.keys():\n", " hit = False\n", " for movieID, predictedRating in topNPredicted[userID]:\n", " if (predictedRating >= ratingThreshold):\n", " hit = True\n", " break\n", " if (hit):\n", " hits += 1\n", " return hits / numUsers\n", "\n", " # A somewhat infamous issue in recommendations is that over time, they do a poor\n", " # job of recommending \"different\" things than what you have already selected.\n", " def Diversity(topNPredicted, simsAlgo):\n", " n = 0\n", " total = 0\n", " simsMatrix = simsAlgo.compute_similarities()\n", " for userID in topNPredicted.keys():\n", " pairs = itertools.combinations(topNPredicted[userID], 2)\n", " for pair in pairs:\n", " movie1 = pair[0][0]\n", " movie2 = pair[1][0]\n", " innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))\n", " innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))\n", " similarity = simsMatrix[innerID1][innerID2]\n", " total += similarity\n", " n += 1\n", " S = total / n\n", " return (1-S)\n", "\n", " # Closely related to the diversity problem -- does the algorithm recommend really\n", " # \"differet\" things or not? For example if you watch a bunch of horror movies, will\n", " # you only see other horror movies in the list?\n", " def Novelty(topNPredicted, rankings):\n", " n = 0\n", " total = 0\n", " for userID in topNPredicted.keys():\n", " for rating in topNPredicted[userID]:\n", " movieID = rating[0]\n", " rank = rankings[movieID]\n", " total += rank\n", " n += 1\n", " return total / n" ] }, { "cell_type": "code", "execution_count": 17, "id": "2047b257", "metadata": {}, "outputs": [], "source": [ "# Helper Class for Evaluation runs\n", "from surprise.model_selection import train_test_split\n", "from surprise.model_selection import LeaveOneOut\n", "from surprise import KNNBaseline\n", "\n", "class EvaluationData:\n", " def __init__(self, data, popularityRankings):\n", " self.rankings = popularityRankings\n", " #Build a full training set for evaluating overall properties\n", " self.fullTrainSet = data.build_full_trainset()\n", " self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()\n", " #Build a 75/25 train/test split for measuring accuracy\n", " self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)\n", " #Build a \"leave one out\" train/test split for evaluating top-N recommenders\n", " #And build an anti-test-set for building predictions\n", " LOOCV = LeaveOneOut(n_splits=1, random_state=1)\n", " for train, test in LOOCV.split(data):\n", " self.LOOCVTrain = train\n", " self.LOOCVTest = test\n", " self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()\n", " #Compute similarty matrix between items so we can measure diversity\n", " sim_options = {'name': 'cosine', 'user_based': False}\n", " self.simsAlgo = KNNBaseline(sim_options=sim_options)\n", " self.simsAlgo.fit(self.fullTrainSet)\n", "\n", " def GetFullTrainSet(self):\n", " return self.fullTrainSet\n", "\n", " def GetFullAntiTestSet(self):\n", " return self.fullAntiTestSet\n", " \n", " def GetAntiTestSetForUser(self, testSubject):\n", " trainset = self.fullTrainSet\n", " fill = trainset.global_mean\n", " anti_testset = []\n", " u = trainset.to_inner_uid(str(testSubject))\n", " user_items = set([j for (j, _) in trainset.ur[u]])\n", " anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for\n", " i in trainset.all_items() if\n", " i not in user_items]\n", " return anti_testset\n", "\n", " def GetTrainSet(self):\n", " return self.trainSet\n", "\n", " def GetTestSet(self):\n", " return self.testSet\n", "\n", " def GetLOOCVTrainSet(self):\n", " return self.LOOCVTrain\n", "\n", " def GetLOOCVTestSet(self):\n", " return self.LOOCVTest\n", "\n", " def GetLOOCVAntiTestSet(self):\n", " return self.LOOCVAntiTestSet\n", "\n", " def GetSimilarities(self):\n", " return self.simsAlgo\n", " \n", " def GetPopularityRankings(self):\n", " return self.rankings" ] }, { "cell_type": "code", "execution_count": 18, "id": "1b30c736", "metadata": {}, "outputs": [], "source": [ "# Finally a helper class that combines the two above into a single comprehensive tool to measure qualityand\n", "# compare algortihm performance.\n", "class EvaluationAlgorithm:\n", "\n", " def __init__(self, algorithm, name):\n", " self.algorithm = algorithm\n", " self.name = name\n", "\n", " def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):\n", " metrics = {}\n", " # Compute accuracy\n", " if (verbose):\n", " print(\"Evaluating accuracy...\")\n", " self.algorithm.fit(evaluationData.GetTrainSet())\n", " predictions = self.algorithm.test(evaluationData.GetTestSet())\n", " metrics[\"RMSE\"] = RecommenderMetrics.RMSE(predictions)\n", " metrics[\"MAE\"] = RecommenderMetrics.MAE(predictions)\n", " if (doTopN):\n", " # Evaluate top-10 with Leave One Out testing\n", " if (verbose):\n", " print(\"Evaluating top-N with leave-one-out...\")\n", " self.algorithm.fit(evaluationData.GetLOOCVTrainSet())\n", " leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())\n", " # Build predictions for all ratings not in the training set\n", " allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())\n", " # Compute top 10 recs for each user\n", " topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)\n", " if (verbose):\n", " print(\"Computing hit-rate and rank metrics...\")\n", " # See how often we recommended a movie the user actually rated\n", " metrics[\"HR\"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)\n", " # See how often we recommended a movie the user actually liked\n", " metrics[\"cHR\"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)\n", " # Compute ARHR\n", " metrics[\"ARHR\"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)\n", " #Evaluate properties of recommendations on full training set\n", " if (verbose):\n", " print(\"Computing recommendations with full data set...\")\n", " self.algorithm.fit(evaluationData.GetFullTrainSet())\n", " allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())\n", " topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)\n", " if (verbose):\n", " print(\"Analyzing coverage, diversity, and novelty...\")\n", " # Print user coverage with a minimum predicted rating of 4.0:\n", " metrics[\"Coverage\"] = RecommenderMetrics.UserCoverage( topNPredicted,\n", " evaluationData.GetFullTrainSet().n_users,\n", " ratingThreshold=4.0)\n", " # Measure diversity of recommendations:\n", " metrics[\"Diversity\"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())\n", " # Measure novelty (average popularity rank of recommendations):\n", " metrics[\"Novelty\"] = RecommenderMetrics.Novelty(topNPredicted,\n", " evaluationData.GetPopularityRankings())\n", " if (verbose):\n", " print(\"Analysis complete.\")\n", " return metrics\n", " \n", " def GetName(self):\n", " return self.name\n", "\n", " def GetAlgorithm(self):\n", " return self.algorithm\n", "\n" ] }, { "cell_type": "code", "execution_count": 19, "id": "10ab14a8", "metadata": {}, "outputs": [], "source": [ "class Evaluator:\n", " algorithms = []\n", "\n", " def __init__(self, dataset, rankings):\n", " ed = EvaluationData(dataset, rankings)\n", " self.dataset = ed\n", "\n", " def AddAlgorithm(self, algorithm, name):\n", " alg = EvaluationAlgorithm(algorithm, name)\n", " self.algorithms.append(alg)\n", "\n", " def Evaluate(self, doTopN):\n", " results = {}\n", " for algorithm in self.algorithms:\n", " print(\"Evaluating \", algorithm.GetName(), \"...\")\n", " results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)\n", " # Print results\n", " print(\"\\n\")\n", " if (doTopN):\n", " print(\"{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}\".format(\n", " \"Algorithm\", \"RMSE\", \"MAE\", \"HR\", \"cHR\", \"ARHR\", \"Coverage\", \"Diversity\", \"Novelty\"))\n", " for (name, metrics) in results.items():\n", " print(\"{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}\".format(\n", " name, metrics[\"RMSE\"], metrics[\"MAE\"], metrics[\"HR\"], metrics[\"cHR\"], metrics[\"ARHR\"],\n", " metrics[\"Coverage\"], metrics[\"Diversity\"], metrics[\"Novelty\"]))\n", " else:\n", " print(\"{:<10} {:<10} {:<10}\".format(\"Algorithm\", \"RMSE\", \"MAE\"))\n", " for (name, metrics) in results.items():\n", " print(\"{:<10} {:<10.4f} {:<10.4f}\".format(name, metrics[\"RMSE\"], metrics[\"MAE\"]))\n", " print(\"\\nLegend:\\n\")\n", " print(\"RMSE: Root Mean Squared Error. Lower values mean better accuracy.\")\n", " print(\"MAE: Mean Absolute Error. Lower values mean better accuracy.\")\n", " if (doTopN):\n", " print(\"HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.\")\n", " print(\"cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.\")\n", " print(\"ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better.\" )\n", " print(\"Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.\")\n", " print(\"Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations\")\n", " print(\" for a given user. Higher means more diverse.\")\n", " print(\"Novelty: Average popularity rank of recommended items. Higher means more novel.\")\n", "\n", " def SampleTopNRecs(self, testSubject=85, k=10):\n", " for algo in self.algorithms:\n", " print(\"\\nUsing recommendation algorithm\", algo.GetName())\n", " print(\"\\nBuilding recommendation model...\")\n", " trainSet = self.dataset.GetFullTrainSet()\n", " algo.GetAlgorithm().fit(trainSet)\n", " print(\"Computing recommendations...\")\n", " testSet = self.dataset.GetAntiTestSetForUser(testSubject)\n", " predictions = algo.GetAlgorithm().test(testSet)\n", " recommendations = []\n", " print (\"\\nRecommend:\")\n", " for userID, movieID, actualRating, estimatedRating, _ in predictions:\n", " intMovieID = int(movieID)\n", " recommendations.append((intMovieID, estimatedRating))\n", " recommendations.sort(key=lambda x: x[1], reverse=True)\n", " for ratings in recommendations[:10]:\n", " print(getMovieName(ratings[0]), ratings[1])" ] }, { "cell_type": "code", "execution_count": 20, "id": "02da19d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Estimating biases using als...\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "HR 0.05514157973174367\n" ] } ], "source": [ "#Evaluating UserCF\n", "from surprise import KNNBasic\n", "import heapq\n", "from collections import defaultdict\n", "from operator import itemgetter\n", "from surprise.model_selection import LeaveOneOut\n", "\n", "k = 10\n", "evalData = EvaluationData(ratingsDataset, rankings)\n", "# Train on leave-One-Out train set\n", "trainSet = evalData.GetLOOCVTrainSet()\n", "sim_options = {'name': 'cosine','user_based': True}\n", "model = KNNBasic(sim_options=sim_options)\n", "model.fit(trainSet)\n", "simsMatrix = model.compute_similarities()\n", "leftOutTestSet = evalData.GetLOOCVTestSet()\n", "# Build up dict to lists of (int(movieID), predictedrating) pairs\n", "topN = defaultdict(list)\n", "for uiid in range(trainSet.n_users):\n", " # Get top N similar users to this one\n", " similarityRow = simsMatrix[uiid]\n", " similarUsers = []\n", " for innerID, score in enumerate(similarityRow):\n", " if (innerID != uiid):\n", " similarUsers.append( (innerID, score) )\n", " kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])\n", " # Get the stuff they rated, and add up ratings for each item, weighted by user similarity\n", " candidates = defaultdict(float)\n", " for similarUser in kNeighbors:\n", " innerID = similarUser[0]\n", " userSimilarityScore = similarUser[1]\n", " theirRatings = trainSet.ur[innerID]\n", " for rating in theirRatings:\n", " candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore\n", " # Build a dictionary of stuff the user has already seen\n", " watched = {}\n", " for itemID, rating in trainSet.ur[uiid]:\n", " watched[itemID] = 1\n", " # Get top-rated items from similar users:\n", " pos = 0\n", " for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):\n", " if not itemID in watched:\n", " movieID = trainSet.to_raw_iid(itemID)\n", " topN[int(trainSet.to_raw_uid(uiid))].append( (int(movieID), 0.0) )\n", " pos += 1\n", " if (pos > 40):\n", " break \n", "\n", "# Measure\n", "print(\"HR\", RecommenderMetrics.HitRate(topN, leftOutTestSet))" ] }, { "cell_type": "markdown", "id": "ac270abc", "metadata": {}, "source": [ "# A more advanced model -- Singular Vector Decomposition\n", "* As discussed above, the data is really a large matrix of Users and Items\n", "* But the data is very sparse. Many users only rate a few items. \n", "* This biases the datatset somewhat, and SVD is a matrix method that can be used to alleviate this bias.\n", "* Simon Funk was the first to use this method in the now famous netflix challenge. \n", "* See his blog if you are interested in how it really works -- https://sifter.org/~simon/journal/20061211.html\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "93ca6ba4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading movie ratings...\n", "\n", "User 85 loved these movies:\n", "Jumanji (1995)\n", "GoldenEye (1995)\n", "Braveheart (1995)\n", "Jerky Boys, The (1995)\n", "Léon: The Professional (a.k.a. The Professional) (Léon) (1994)\n", "Pulp Fiction (1994)\n", "Stargate (1994)\n", "Shawshank Redemption, The (1994)\n", "Star Trek: Generations (1994)\n", "Clear and Present Danger (1994)\n", "Speed (1994)\n", "True Lies (1994)\n", "Fugitive, The (1993)\n", "Jurassic Park (1993)\n", "Terminator 2: Judgment Day (1991)\n", "Mission: Impossible (1996)\n", "Rock, The (1996)\n", "\n", "...and didn't like these movies:\n", "Grumpier Old Men (1995)\n", "Mortal Kombat (1995)\n", "Postman, The (Postino, Il) (1994)\n", "Casper (1995)\n", "Lord of Illusions (1995)\n", "Mighty Morphin Power Rangers: The Movie (1995)\n", "Prophecy, The (1995)\n", "Dolores Claiborne (1995)\n", "Heavenly Creatures (1994)\n", "Little Women (1994)\n", "Miracle on 34th Street (1994)\n", "Nell (1994)\n", "Poison Ivy II (1996)\n", "Tank Girl (1995)\n", "While You Were Sleeping (1995)\n", "Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)\n", "Naked Gun 33 1/3: The Final Insult (1994)\n", "Richie Rich (1994)\n", "Beverly Hills Cop III (1994)\n", "Philadelphia (1993)\n", "Schindler's List (1993)\n", "Super Mario Bros. (1993)\n", "Nightmare Before Christmas, The (1993)\n", "Snow White and the Seven Dwarfs (1937)\n", "Operation Dumbo Drop (1995)\n", "Oliver & Company (1988)\n", "\n", "Building recommendation model...\n", "Computing recommendations...\n", "\n", "We recommend:\n", "Rear Window (1954)\n", "Hamlet (1996)\n", "In the Heat of the Night (1967)\n", "Harry Potter and the Half-Blood Prince (2009)\n", "Body Heat (1981)\n", "Best Years of Our Lives, The (1946)\n", "L.A. Confidential (1997)\n", "Casablanca (1942)\n", "Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)\n", "African Queen, The (1951)\n" ] } ], "source": [ "# Pick an arbitrary test subject\n", "testSubject = 85\n", "print(\"Loading movie ratings...\")\n", "userRatings = getUserRatings(testSubject)\n", "recommendations = []\n", "loved = []\n", "hated = []\n", "for ratings in userRatings:\n", " if (float(ratings[1]) > 4.0):\n", " loved.append(ratings)\n", " if (float(ratings[1]) < 3.0):\n", " hated.append(ratings)\n", "\n", "print(\"\\nUser \", testSubject, \" loved these movies:\")\n", "for ratings in loved:\n", " print(getMovieName(ratings[0]))\n", "\n", "print(\"\\n...and didn't like these movies:\")\n", "for ratings in hated:\n", " print(getMovieName(ratings[0]))\n", "\n", "print(\"\\nBuilding recommendation model...\")\n", "trainSet = ratingsDataset.build_full_trainset()\n", "\n", "algo = SVD()\n", "algo.fit(trainSet)\n", "\n", "print(\"Computing recommendations...\")\n", "testSet = BuildAntiTestSetForUser(testSubject, trainSet)\n", "predictions = algo.test(testSet)\n", "\n", "print (\"\\nWe recommend:\")\n", "for userID, movieID, actualRating, estimatedRating, _ in predictions:\n", " intMovieID = int(movieID)\n", " recommendations.append((intMovieID, estimatedRating))\n", "\n", "recommendations.sort(key=lambda x: x[1], reverse=True)\n", "\n", "for ratings in recommendations[:10]:\n", " print(getMovieName(ratings[0]))" ] }, { "cell_type": "markdown", "id": "ac127296", "metadata": {}, "source": [ "# The three algorithm shoot-out on the MovieLens dataset" ] }, { "cell_type": "code", "execution_count": 22, "id": "ce0a93e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Estimating biases using als...\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Evaluating User KNN ...\n", "Evaluating accuracy...\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Analysis complete.\n", "Evaluating Item KNN ...\n", "Evaluating accuracy...\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Analysis complete.\n", "Evaluating Random ...\n", "Evaluating accuracy...\n", "Analysis complete.\n", "Evaluating SVD ...\n", "Evaluating accuracy...\n", "Analysis complete.\n", "\n", "\n", "Algorithm RMSE MAE \n", "User KNN 0.9961 0.7711 \n", "Item KNN 0.9995 0.7798 \n", "Random 1.4385 1.1478 \n", "SVD 0.9043 0.6987 \n", "\n", "Legend:\n", "\n", "RMSE: Root Mean Squared Error. Lower values mean better accuracy.\n", "MAE: Mean Absolute Error. Lower values mean better accuracy.\n", "\n", "Using recommendation algorithm User KNN\n", "\n", "Building recommendation model...\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Computing recommendations...\n", "\n", "Recommend:\n", "One Magic Christmas (1985) 5\n", "Step Into Liquid (2002) 5\n", "Art of War, The (2000) 5\n", "Taste of Cherry (Ta'm e guilass) (1997) 5\n", "King Is Alive, The (2000) 5\n", "Innocence (2000) 5\n", "Maelström (2000) 5\n", "Faust (1926) 5\n", "Seconds (1966) 5\n", "Amazing Grace (2006) 5\n", "\n", "Using recommendation algorithm Item KNN\n", "\n", "Building recommendation model...\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "Computing recommendations...\n", "\n", "Recommend:\n", "Life in a Day (2011) 5\n", "Under Suspicion (2000) 5\n", "Asterix and the Gauls (Astérix le Gaulois) (1967) 5\n", "Find Me Guilty (2006) 5\n", "Elementary Particles, The (Elementarteilchen) (2006) 5\n", "Asterix and the Vikings (Astérix et les Vikings) (2006) 5\n", "From the Sky Down (2011) 5\n", "Vive L'Amour (Ai qing wan sui) (1994) 5\n", "Vagabond (Sans toit ni loi) (1985) 5\n", "Ariel (1988) 5\n", "\n", "Using recommendation algorithm Random\n", "\n", "Building recommendation model...\n", "Computing recommendations...\n", "\n", "Recommend:\n", "Dumbo (1941) 5\n", "Englishman Who Went Up a Hill But Came Down a Mountain, The (1995) 5\n", "Sleepless in Seattle (1993) 5\n", "Pet Sematary (1989) 5\n", "Eternal Sunshine of the Spotless Mind (2004) 5\n", "Birdcage, The (1996) 5\n", "Some Like It Hot (1959) 5\n", "Cinderella (1950) 5\n", "Fish Called Wanda, A (1988) 5\n", "Goodfellas (1990) 5\n", "\n", "Using recommendation algorithm SVD\n", "\n", "Building recommendation model...\n", "Computing recommendations...\n", "\n", "Recommend:\n", "Matrix, The (1999) 4.407616528756567\n", "Ran (1985) 4.317388798918363\n", "Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.308875555039512\n", "3:10 to Yuma (2007) 4.299052769082302\n", "Fight Club (1999) 4.295575347437914\n", "Godfather, The (1972) 4.273186543912806\n", "Raging Bull (1980) 4.266575754730711\n", "Lock, Stock & Two Smoking Barrels (1998) 4.256520579061788\n", "Paris, Texas (1984) 4.250402302580284\n", "Taxi Driver (1976) 4.233609033622251\n" ] } ], "source": [ "# Now a full comparison of simple algorithms.\n", "from surprise import KNNBasic\n", "from surprise import NormalPredictor\n", "import random\n", "import numpy as np\n", "\n", "np.random.seed(0)\n", "random.seed(0)\n", "#evalData = EvaluationData(ratingsDataset, rankings)\n", "# Construct an Evaluator to, you know, evaluate them\n", "evaluator = Evaluator(ratingsDataset, rankings)\n", "# User-based KNN\n", "UserKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': True})\n", "evaluator.AddAlgorithm(UserKNN, 'User KNN')\n", "# Item-based KNN\n", "ItemKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': False})\n", "evaluator.AddAlgorithm(ItemKNN, 'Item KNN')\n", "# Just make random recommendations\n", "Random = NormalPredictor()\n", "evaluator.AddAlgorithm(Random, 'Random')\n", "\n", "SVDalgo = SVD()\n", "evaluator.AddAlgorithm(SVDalgo,'SVD')\n", "\n", "# Fight!\n", "evaluator.Evaluate(False)\n", "evaluator.SampleTopNRecs()" ] }, { "cell_type": "code", "execution_count": null, "id": "40987dca", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }