{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "b341d05d-d472-4d6c-8ea1-acf83d9e80a2", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "2473b5cc-771f-4153-84ea-2543383ef739", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idratingtimestamp
01962423881250949
11863023891717742
2223771878887116
3244512880606923
41663461886397596
\n", "
" ], "text/plain": [ " user_id item_id rating timestamp\n", "0 196 242 3 881250949\n", "1 186 302 3 891717742\n", "2 22 377 1 878887116\n", "3 244 51 2 880606923\n", "4 166 346 1 886397596" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names = ['user_id', 'item_id', 'rating', 'timestamp']\n", "df = pd.read_csv('ml-100k/u.data', sep='\\t', names=names)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "712799d9-5019-4112-907c-500aac9cebfb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "943" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "n_user = df.user_id.unique().shape[0]\n", "n_user" ] }, { "cell_type": "code", "execution_count": 4, "id": "1b731c9f-7993-48bc-8899-c88536f20cad", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1682" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "n_item = df.item_id.unique().shape[0]\n", "n_item" ] }, { "cell_type": "code", "execution_count": 5, "id": "e72e57d0-b667-4b03-b0f6-264993bb36dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " ...,\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.]])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# binary Matrix\n", "ratingNum = np.zeros((n_user, n_item))\n", "ratingNum" ] }, { "cell_type": "code", "execution_count": 6, "id": "2adbb8e8-f54b-41d1-954a-d56e4d980941", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[1. 1. 1. ... 0. 0. 0.]\n", " [1. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " ...\n", " [1. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 1. 0. ... 0. 0. 
0.]]\n" ] } ], "source": [ "# Flag every (user, item) pair that has a rating; ids are shifted by -1 to get array indices.\n", "ratingNum[df.user_id.to_numpy() - 1, df.item_id.to_numpy() - 1] = 1\n", "print(ratingNum)" ] }, { "cell_type": "code", "execution_count": 7, "id": "db99c0e5-1176-4d04-9b1e-5c7e002c6e21", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[452. 131. 90. ... 1. 1. 1.]\n" ] } ], "source": [ "# Ratings per item: printed in item order, then sorted ascending in place for the plot below.\n", "itemrateNumCurrent = ratingNum.sum(axis=0)\n", "print(itemrateNumCurrent)\n", "itemrateNumCurrent.sort()" ] }, { "cell_type": "code", "execution_count": 8, "id": "466ad956-1376-45b6-81a4-bda201f4ba71", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'popularity')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Long-tail plot: rating counts from the most-rated item down to the least-rated one.\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "plt.plot(itemrateNumCurrent[::-1])\n", "plt.xlabel(\"sorted items\")\n", "plt.ylabel(\"popularity\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "53818f4d-53b4-4564-a2d1-64da69ddb4b8", "metadata": {}, "outputs": [], "source": [ "# Top Pop\n", "ratings = np.zeros((n_user, n_item))" ] }, { "cell_type": "code", "execution_count": 10, "id": "0cb39eff-def9-4a45-8a8a-3deec81e0ba2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[5. 3. 4. ... 0. 0. 0.]\n", " [4. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " ...\n", " [5. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 5. 0. ... 0. 0. 0.]]\n" ] } ], "source": [ "# Store each rating value at [user_id - 1, item_id - 1]; cells without a rating stay 0.\n", "ratings[df.user_id.to_numpy() - 1, df.item_id.to_numpy() - 1] = df.rating.to_numpy()\n", "print(ratings)" ] }, { "cell_type": "code", "execution_count": 11, "id": "538a15d5-1c36-44e9-932f-de7395e8db0a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([452., 131., 90., ..., 1., 1., 1.])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of users who rated each item. Counted from `ratings` (an entry > 0 means rated)\n", "# instead of re-summing `ratingNum` onto itself, so re-running this cell gives the same vector.\n", "ratingNum = (ratings > 0).sum(axis=0).astype(float)\n", "ratingNum" ] }, { "cell_type": "code", "execution_count": 12, "id": "5a4bcf6d-ae46-442d-8141-fc154bfb0d60", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1753., 420., 273., ..., 2., 3., 3.])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sum of all rating values received by each item.\n", "itemRateSum = ratings.sum(axis=0)\n", "itemRateSum" ] }, { "cell_type": "code", "execution_count": 13, "id": "3937fd71-53e4-47e3-b822-3ad04fc8f960", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"array([3.87831858, 3.20610687, 3.03333333, ..., 2. , 3. ,\n", " 3. ])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean rating per item. Items with no ratings get 0 rather than NaN, because NaN sorts last\n", "# in argsort and would therefore land at the top of a best-first ranking.\n", "itemRateAvg = np.divide(itemRateSum, ratingNum, out=np.zeros_like(itemRateSum), where=ratingNum > 0)\n", "itemRateAvg" ] }, { "cell_type": "code", "execution_count": 16, "id": "c15af163-9467-49a7-9336-c21fe1fa1d52", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie idmovie titlerelease datevideo release dateIMDb URLunknownActionAdventureAnimationChildren's...FantasyFilm-NoirHorrorMusicalMysteryRomanceSci-FiThrillerWarWestern
01Toy Story (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Toy%20Story%2...00011...0000000000
12GoldenEye (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?GoldenEye%20(...01100...0000000100
23Four Rooms (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Four%20Rooms%...00000...0000000100
34Get Shorty (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Get%20Shorty%...01000...0000000000
45Copycat (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Copycat%20(1995)00000...0000000100
..................................................................
16771678Mat' i syn (1997)06-Feb-1998NaNhttp://us.imdb.com/M/title-exact?Mat%27+i+syn+...00000...0000000000
16781679B. Monkey (1998)06-Feb-1998NaNhttp://us.imdb.com/M/title-exact?B%2E+Monkey+(...00000...0000010100
16791680Sliding Doors (1998)01-Jan-1998NaNhttp://us.imdb.com/Title?Sliding+Doors+(1998)00000...0000010000
16801681You So Crazy (1994)01-Jan-1994NaNhttp://us.imdb.com/M/title-exact?You%20So%20Cr...00000...0000000000
16811682Scream of Stone (Schrei aus Stein) (1991)08-Mar-1996NaNhttp://us.imdb.com/M/title-exact?Schrei%20aus%...00000...0000000000
\n", "

1682 rows × 24 columns

\n", "
" ], "text/plain": [ " movie id movie title release date \\\n", "0 1 Toy Story (1995) 01-Jan-1995 \n", "1 2 GoldenEye (1995) 01-Jan-1995 \n", "2 3 Four Rooms (1995) 01-Jan-1995 \n", "3 4 Get Shorty (1995) 01-Jan-1995 \n", "4 5 Copycat (1995) 01-Jan-1995 \n", "... ... ... ... \n", "1677 1678 Mat' i syn (1997) 06-Feb-1998 \n", "1678 1679 B. Monkey (1998) 06-Feb-1998 \n", "1679 1680 Sliding Doors (1998) 01-Jan-1998 \n", "1680 1681 You So Crazy (1994) 01-Jan-1994 \n", "1681 1682 Scream of Stone (Schrei aus Stein) (1991) 08-Mar-1996 \n", "\n", " video release date IMDb URL \\\n", "0 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... \n", "1 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(... \n", "2 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%... \n", "3 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%... \n", "4 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995) \n", "... ... ... \n", "1677 NaN http://us.imdb.com/M/title-exact?Mat%27+i+syn+... \n", "1678 NaN http://us.imdb.com/M/title-exact?B%2E+Monkey+(... \n", "1679 NaN http://us.imdb.com/Title?Sliding+Doors+(1998) \n", "1680 NaN http://us.imdb.com/M/title-exact?You%20So%20Cr... \n", "1681 NaN http://us.imdb.com/M/title-exact?Schrei%20aus%... \n", "\n", " unknown Action Adventure Animation Children's ... Fantasy \\\n", "0 0 0 0 1 1 ... 0 \n", "1 0 1 1 0 0 ... 0 \n", "2 0 0 0 0 0 ... 0 \n", "3 0 1 0 0 0 ... 0 \n", "4 0 0 0 0 0 ... 0 \n", "... ... ... ... ... ... ... ... \n", "1677 0 0 0 0 0 ... 0 \n", "1678 0 0 0 0 0 ... 0 \n", "1679 0 0 0 0 0 ... 0 \n", "1680 0 0 0 0 0 ... 0 \n", "1681 0 0 0 0 0 ... 0 \n", "\n", " Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War \\\n", "0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 1 0 \n", "2 0 0 0 0 0 0 1 0 \n", "3 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 1 0 \n", "... ... ... ... ... ... ... ... ... 
\n", "1677 0 0 0 0 0 0 0 0 \n", "1678 0 0 0 0 1 0 1 0 \n", "1679 0 0 0 0 1 0 0 0 \n", "1680 0 0 0 0 0 0 0 0 \n", "1681 0 0 0 0 0 0 0 0 \n", "\n", " Western \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "1677 0 \n", "1678 0 \n", "1679 0 \n", "1680 0 \n", "1681 0 \n", "\n", "[1682 rows x 24 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "i_cols = ['movie id', 'movie title' ,'release date','video release date',\n", " 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',\n", " 'Children\\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',\n", " 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',\n", " 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] \n", "items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')\n", "items" ] }, { "cell_type": "code", "execution_count": 19, "id": "2872ca93-0ae8-4177-98c0-9e490556a935", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ True True True ... False False False]\n", "[0. 0. 0. ... 1. 1. 
1.]\n" ] } ], "source": [ "top_n = 5\n", "activeUser = 0\n", "\n", "# Items the active user has already rated are excluded from the recommendations.\n", "maskActiveUser = ratings[activeUser, :] > 0\n", "print(maskActiveUser)\n", "\n", "# Popularity score = number of ratings per item, forced to 0 for already-rated items.\n", "itemRateNumCurrent = ratingNum.copy()\n", "itemRateNumCurrent[maskActiveUser] = 0\n", "print(itemRateNumCurrent)" ] }, { "cell_type": "code", "execution_count": 20, "id": "95df3468-eb4b-4924-a7c5-1b3a2214b505", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 172, 173, ..., 287, 285, 293])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ascending argsort: the highest-scoring items sit at the end of the index array.\n", "itemSortInd = itemRateNumCurrent.argsort()\n", "itemSortInd" ] }, { "cell_type": "code", "execution_count": 22, "id": "e3ef3388-4b1f-4428-905b-90dc0e952bce", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "row index\t movie title\n", "293 Liar Liar (1997)\n", "285 English Patient, The (1996)\n", "287 Scream (1996)\n", "299 Air Force One (1997)\n", "312 Titanic (1997)\n", "Name: movie title, dtype: object\n" ] } ], "source": [ "# The left column is the 0-based matrix column / `items` row position (item_id - 1), not the movie id.\n", "# Reverse the ascending order and keep the first top_n entries -> best first.\n", "print('row index' + '\\t movie title')\n", "print(items['movie title'].iloc[itemSortInd[::-1][:top_n]])" ] }, { "cell_type": "code", "execution_count": 21, "id": "d4a3c30c-24eb-49c4-825c-e0d8a02df1cc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([293, 285, 287, 299, 312])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "itemSortInd[::-1][:top_n]" ] }, { "cell_type": "code", "execution_count": 23, "id": "b768b57b-edf4-4c79-8d36-2a52942b5bfe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "row index\t movie title\n", "1535 Aiqing wansui (1994)\n", "1652 Entertaining Angels: The Dorothy Day Story (1996)\n", "1200 Marlene Dietrich: Shadow and Light (1996) \n", "1598 Someone Else's America (1995)\n", "1121 They Made Me a Criminal (1939)\n", "Name: movie title, dtype: object\n" ] } ], "source": [ "# Same ranking as above, scored by mean rating instead of rating count.\n", "itemRateAvgCurrent = itemRateAvg.copy()\n", 
"itemRateAvgCurrent[maskActiveUser] = 0\n", "\n", "itemSortIndAvg = itemRateAvgCurrent.argsort()\n", "print('row index' + '\\t movie title')\n", "print(items['movie title'].iloc[itemSortIndAvg[::-1][:top_n]])" ] }, { "cell_type": "code", "execution_count": 24, "id": "3d559a43-2d5a-4829-91e8-09e37b3ef1eb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1535, 1652, 1200, 1598, 1121])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "itemSortIndAvg[::-1][:top_n]" ] }, { "cell_type": "code", "execution_count": null, "id": "4e39bc31-547e-4fbf-8234-7f9632a858e5", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }