{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "j0FI6PdE1YoN" }, "source": [ "# Import Necessary Libs\n", "# (This is the demo code for the toy example: rating.csv. The main purpose of this demo code is to help students understand the KNN-based CF algorithms.)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "qwRX_2RCrWdg" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "markdown", "metadata": { "id": "UJRU5o4d1iCC" }, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SPlwg9AgrYgp", "outputId": "b8871ff9-6581-44a6-bc48-1b3caaa57ead" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5 users\n", "4 items\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idrating
0005
1013
2025
3031
4105
5114
6123
7131
8204
9214
10225
11231
12304
13325
14335
15313
16401
17412
18423
19435
\n", "
" ], "text/plain": [ " user_id item_id rating\n", "0 0 0 5\n", "1 0 1 3\n", "2 0 2 5\n", "3 0 3 1\n", "4 1 0 5\n", "5 1 1 4\n", "6 1 2 3\n", "7 1 3 1\n", "8 2 0 4\n", "9 2 1 4\n", "10 2 2 5\n", "11 2 3 1\n", "12 3 0 4\n", "13 3 2 5\n", "14 3 3 5\n", "15 3 1 3\n", "16 4 0 1\n", "17 4 1 2\n", "18 4 2 3\n", "19 4 3 5" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names = ['user_id', 'item_id', 'rating']\n", "df = pd.read_csv('rating.csv', sep='\\t', names=names)\n", "df.head()\n", "\n", "n_users = df.user_id.unique().shape[0]\n", "n_items = df.item_id.unique().shape[0]\n", "print(str(n_users) + ' users')\n", "print(str(n_items) + ' items')\n", "df" ] }, { "cell_type": "markdown", "metadata": { "id": "z90r54XGrmPj" }, "source": [ "# Split Dataset into Training Dataset and Testing Dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KlU-Lq16rlpN", "outputId": "259e8384-bd58-4dac-c856-b7a97db2580e" }, "outputs": [ { "data": { "text/plain": [ "( user_id item_id rating\n", " 1 0 1 3\n", " 18 4 2 3\n", " 16 4 0 1\n", " 17 4 1 2\n", " 19 4 3 5\n", " 4 1 0 5\n", " 15 3 1 3\n", " 6 1 2 3\n", " 9 2 1 4\n", " 5 1 1 4\n", " 12 3 0 4\n", " 7 1 3 1\n", " 11 2 3 1\n", " 14 3 3 5\n", " 2 0 2 5\n", " 13 3 2 5\n", " 3 0 3 1\n", " 8 2 0 4,\n", " user_id item_id rating\n", " 0 0 0 5\n", " 10 2 2 5)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df, test_df = train_test_split(df, test_size=0.1)\n", "train_df, test_df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gvaLspXUrg9L", "outputId": "c7477ddc-0691-438d-f7bd-c23ae2d4bf55" }, "outputs": [ { "data": { "text/plain": [ "( 0 1 2 3\n", " 0 0.0 3.0 5.0 1.0\n", " 1 5.0 4.0 3.0 1.0\n", " 2 4.0 4.0 0.0 1.0\n", " 3 4.0 3.0 5.0 5.0\n", " 4 1.0 2.0 3.0 5.0,\n", " 0 1 2 3\n", " 0 5.0 0.0 0.0 0.0\n", " 1 0.0 0.0 0.0 0.0\n", " 2 0.0 0.0 5.0 0.0\n", " 3 0.0 0.0 0.0 0.0\n", " 4 0.0 0.0 0.0 0.0)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Training Dataset\n", "train_ds = np.zeros((n_users, n_items))\n", "for row in train_df.itertuples():\n", " train_ds[row[1], row[2]] = row[3]\n", "train_ds = pd.DataFrame(train_ds)\n", "\n", "# Testing Dataset\n", "test_ds = np.zeros((n_users, n_items))\n", "for row in test_df.itertuples():\n", " test_ds[row[1], row[2]] = row[3]\n", "test_ds = pd.DataFrame(test_ds)\n", "\n", "train_ds, test_ds" ] }, { "cell_type": "markdown", "metadata": { "id": "E1ffZ6fCjYQ7" }, "source": [ "# Fitting the Algorithm" ] }, { "cell_type": "markdown", "metadata": { "id": "DYJZT1keazuc" }, "source": [ "## User-based" ] }, { "cell_type": "markdown", "metadata": { "id": "3bf9Ksc2OlL1" }, "source": [ "### Compute Pearson Correlation Coefficient for Each Pair of Users in Training Dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SH3O2sX3iBx1", "outputId": "f883bb40-6ac3-4abc-d432-c1790f2861ae" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 user: [0. 3. 5. 1.]; 0 user: [0. 3. 5. 1.]\n", "co-reated item index[1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1.00000008e-09 2.00000000e+00 -2.00000000e+00]\n", "len(corrated_index) is 3\n", "0 user: [0. 3. 5. 1.]; 1 user: [5. 4. 3. 1.]\n", "co-reated item index[1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1.00000008e-09 2.00000000e+00 -2.00000000e+00]\n", "len(corrated_index) is 3\n", "0 user: [0. 3. 5. 1.]; 2 user: [4. 4. 0. 1.]\n", "co-reated item index[1 3]\n", "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1.00000008e-09 -2.00000000e+00]\n", "len(corrated_index) is 2\n", "0 user: [0. 3. 5. 1.]; 3 user: [4. 3. 5. 5.]\n", "co-reated item index[1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1.00000008e-09 2.00000000e+00 -2.00000000e+00]\n", "len(corrated_index) is 3\n", "0 user: [0. 3. 5. 1.]; 4 user: [1. 2. 3. 5.]\n", "co-reated item index[1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1.00000008e-09 2.00000000e+00 -2.00000000e+00]\n", "len(corrated_index) is 3\n", "1 user: [5. 4. 3. 1.]; 0 user: [0. 3. 5. 1.]\n", "co-reated item index[1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 3.2499999991875\n", "user_i_sub_mean is [ 0.75 -0.25 -2.25]\n", "len(corrated_index) is 3\n", "1 user: [5. 4. 3. 1.]; 1 user: [5. 4. 3. 1.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 3.2499999991875\n", "user_i_sub_mean is [ 1.75 0.75 -0.25 -2.25]\n", "len(corrated_index) is 4\n", "1 user: [5. 4. 3. 1.]; 2 user: [4. 4. 0. 1.]\n", "co-reated item index[0 1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 3.2499999991875\n", "user_i_sub_mean is [ 1.75 0.75 -2.25]\n", "len(corrated_index) is 3\n", "1 user: [5. 4. 3. 1.]; 3 user: [4. 3. 5. 5.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 3.2499999991875\n", "user_i_sub_mean is [ 1.75 0.75 -0.25 -2.25]\n", "len(corrated_index) is 4\n", "1 user: [5. 4. 3. 1.]; 4 user: [1. 2. 3. 5.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 3.2499999991875\n", "user_i_sub_mean is [ 1.75 0.75 -0.25 -2.25]\n", "len(corrated_index) is 4\n", "2 user: [4. 4. 0. 1.]; 0 user: [0. 3. 5. 1.]\n", "co-reated item index[1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1. -2.]\n", "len(corrated_index) is 2\n", "2 user: [4. 4. 0. 1.]; 1 user: [5. 4. 3. 1.]\n", "co-reated item index[0 1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1. 1. -2.]\n", "len(corrated_index) is 3\n", "2 user: [4. 4. 0. 1.]; 2 user: [4. 4. 0. 1.]\n", "co-reated item index[0 1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1. 1. -2.]\n", "len(corrated_index) is 3\n", "2 user: [4. 4. 0. 1.]; 3 user: [4. 3. 5. 5.]\n", "co-reated item index[0 1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1. 1. -2.]\n", "len(corrated_index) is 3\n", "2 user: [4. 4. 0. 1.]; 4 user: [1. 2. 3. 5.]\n", "co-reated item index[0 1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n", "mean_user_i is 2.999999999\n", "user_i_sub_mean is [ 1. 1. -2.]\n", "len(corrated_index) is 3\n", "3 user: [4. 3. 5. 5.]; 0 user: [0. 3. 5. 1.]\n", "co-reated item index[1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 4.2499999989375\n", "user_i_sub_mean is [-1.25 0.75 0.75]\n", "len(corrated_index) is 3\n", "3 user: [4. 3. 5. 5.]; 1 user: [5. 4. 3. 1.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 4.2499999989375\n", "user_i_sub_mean is [-0.25 -1.25 0.75 0.75]\n", "len(corrated_index) is 4\n", "3 user: [4. 3. 5. 5.]; 2 user: [4. 4. 0. 1.]\n", "co-reated item index[0 1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 4.2499999989375\n", "user_i_sub_mean is [-0.25 -1.25 0.75]\n", "len(corrated_index) is 3\n", "3 user: [4. 3. 5. 5.]; 3 user: [4. 3. 5. 5.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 4.2499999989375\n", "user_i_sub_mean is [-0.25 -1.25 0.75 0.75]\n", "len(corrated_index) is 4\n", "3 user: [4. 3. 5. 5.]; 4 user: [1. 2. 3. 5.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 4.2499999989375\n", "user_i_sub_mean is [-0.25 -1.25 0.75 0.75]\n", "len(corrated_index) is 4\n", "4 user: [1. 2. 3. 5.]; 0 user: [0. 3. 5. 1.]\n", "co-reated item index[1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 2.7499999993125\n", "user_i_sub_mean is [-0.75 0.25 2.25]\n", "len(corrated_index) is 3\n", "4 user: [1. 2. 3. 5.]; 1 user: [5. 4. 3. 1.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 2.7499999993125\n", "user_i_sub_mean is [-1.75 -0.75 0.25 2.25]\n", "len(corrated_index) is 4\n", "4 user: [1. 2. 3. 5.]; 2 user: [4. 4. 0. 1.]\n", "co-reated item index[0 1 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 2.7499999993125\n", "user_i_sub_mean is [-1.75 -0.75 2.25]\n", "len(corrated_index) is 3\n", "4 user: [1. 2. 3. 5.]; 3 user: [4. 3. 5. 5.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 2.7499999993125\n", "user_i_sub_mean is [-1.75 -0.75 0.25 2.25]\n", "len(corrated_index) is 4\n", "4 user: [1. 2. 3. 5.]; 4 user: [1. 2. 3. 5.]\n", "co-reated item index[0 1 2 3]\n", "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n", "mean_user_i is 2.7499999993125\n", "user_i_sub_mean is [-1.75 -0.75 0.25 2.25]\n", "len(corrated_index) is 4\n" ] }, { "data": { "text/plain": [ "array([[ 1.00000000e+00, 5.92999453e-01, 8.94427191e-01,\n", " 5.39163910e-11, -5.92999453e-01],\n", " [ 5.92999453e-01, 1.00000000e+00, 9.69560705e-01,\n", " -6.62541349e-01, -1.00000000e+00],\n", " [ 8.94427191e-01, 9.69560705e-01, 1.00000000e+00,\n", " -8.28078671e-01, -9.69560705e-01],\n", " [ 5.39163910e-11, -6.62541349e-01, -8.28078671e-01,\n", " 1.00000000e+00, 6.62541349e-01],\n", " [-5.92999453e-01, -1.00000000e+00, -9.69560705e-01,\n", " 6.62541349e-01, 1.00000000e+00]])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "GAMMA = 0\n", "EPSILON = 1e-9\n", "\n", "np_user_pearson_corr = np.zeros((n_users, n_users))\n", "\n", "for i, user_i_vec in enumerate(train_ds.values):\n", " for j, user_j_vec in enumerate(train_ds.values):\n", " print(str(i) + \" user: \" + str(user_i_vec) + \"; \" + str(j) + \" user: \" + str(user_j_vec))\n", " \n", " # ratings corated by the current pair of users\n", " mask_i = user_i_vec > 0\n", " mask_j = user_j_vec > 0\n", "\n", " # corrated item index, skip if there are no corrated ratings\n", " corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))\n", " print(\"co-reated item index\" + str(corrated_index))\n", " if len(corrated_index) == 0:\n", " continue\n", "\n", " # average value of user_i_vec and user_j_vec\n", " mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)\n", " mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)\n", " \n", " print(\"np.clip(user_i_vec, 0, 1)\" + \" is \" + str(np.clip(user_i_vec, 0, 1)))\n", " \n", "\n", " # compute pearson corr\n", " user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i\n", " user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j\n", " \n", " print(\"mean_user_i \" + \" is \" + str(mean_user_i))\n", " print(\"user_i_sub_mean \" + \" is \" + str(user_i_sub_mean))\n", "\n", " r_ui_sub_r_i_sq = np.square(user_i_sub_mean)\n", " r_uj_sub_r_j_sq = np.square(user_j_sub_mean)\n", "\n", " r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))\n", " r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))\n", "\n", " sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)\n", "\n", " # significance weighting\n", " weighted_sim = sim\n", " #weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim\n", " print(\"len(corrated_index) \" + \" is \" + str(len(corrated_index)))\n", "\n", " np_user_pearson_corr[i][j] = weighted_sim\n", " \n", " ###\n", " #print(str(i) + \" user: \" + str(user_i_vec) + \"; \" + str(j) + \" user: \" + str(user_j_vec))\n", " #print(\"co-reated item index\" + str(corrated_index))\n", " #print(\"np.clip(user_i_vec, 0, 1)\" + \" is \" + str(np.clip(user_i_vec, 0, 1)))\n", " #print(\"np.clip(user_j_vec, 0, 1)\" + \" is \" + str(np.clip(user_j_vec, 0, 1)))\n", " #print('i' + \" user average: \" + str(mean_user_i))\n", " #print('j' + \" user average: \" + str(mean_user_j))\n", "np_user_pearson_corr" ] }, { "cell_type": "markdown", "metadata": { "id": "7OxHrFpIEHYm" }, "source": [ "### Predict Ratings" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "Z-REE2zqj5rt" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 0\n", "[4 3 1 2 0]\n", "[ 1.00000000e+00 5.92999453e-01 8.94427191e-01 5.39163910e-11\n", " -5.92999453e-01]\n", "[3 1 2]\n", "2 2\n", "[4 3 0 1 2]\n", "[ 0.89442719 0.96956071 1. -0.82807867 -0.96956071]\n", "[3 0 1]\n" ] }, { "data": { "text/plain": [ "array([[4.29900607, 0. , 0. , 0. ],\n", " [0. , 0. , 0. , 0. ],\n", " [0. , 0. , 3.89332654, 0. ],\n", " [0. , 0. , 0. , 0. ],\n", " [0. , 0. , 0. , 0. ]])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np_predictions = np.zeros((n_users, n_items))\n", "\n", "K = 3\n", "EPSILON = 1e-9\n", "\n", "for (i, j), rating in np.ndenumerate(test_ds.values):\n", " if rating > 0:\n", " print(str(i) + \" \" + str(j))\n", " # find top-k most similar users as the current user, remove itself\n", " sim_user_ids = np.argsort(np_user_pearson_corr[i])[-(K + 1):-1]\n", " ### useful link for how to access values in numpy arrays: https://numpy.org/doc/stable/reference/arrays.indexing.html\n", "\n", " print(np.argsort(np_user_pearson_corr[i]))\n", " print(np_user_pearson_corr[i][:])\n", " print(np.argsort(np_user_pearson_corr[i])[-(K + 1):-1])\n", " # the coefficient values of similar users\n", " sim_val = np_user_pearson_corr[i][sim_user_ids]\n", "\n", " # the average value of the current user's ratings\n", " sim_users = train_ds.values[sim_user_ids]\n", " user_mean = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)\n", " sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)\n", "\n", " #print(sim_users)\n", " # select the users who rated item j\n", " mask_rated_j = sim_users[:, j] > 0\n", " \n", " # sim(u, v) * (r_vj - mean_v)\n", " sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])\n", "\n", " # filter unrated items\n", " #w = np.clip(sim_users[mask_rated_j, j], 0, 1)\n", " #sim_r_sum_mean *= w\n", " #print(sim_users[:, j])\n", " \n", " np_predictions[i][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)\n", " np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)\n", "np_predictions" ] }, { "cell_type": "markdown", "metadata": { "id": "tiPKuFjQudyM" }, "source": [ "### Evaluation" ] }, { "cell_type": "markdown", "metadata": { "id": "gQZSW98ZGDJR" }, "source": [ "#### Mean Absolute Error (MAE)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "20UzmEf9uhF_", "outputId": "78ab7be3-55f8-416e-e578-b13136768b62" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0.70099393 0. 0. 0. ]\n", " [0. 0. 0. 0. ]\n", " [0. 0. 1.10667346 0. ]\n", " [0. 0. 0. 0. ]\n", " [0. 0. 0. 0. ]]\n", "[[1. 0. 0. 0.]\n", " [0. 0. 0. 0.]\n", " [0. 0. 1. 0.]\n", " [0. 0. 0. 0.]\n", " [0. 0. 0. 0.]]\n", "MAE on Tesing set (User-based): 0.9038336987360527\n" ] } ], "source": [ "#==================MAE on Testing set===================#\n", "labels = test_ds.values\n", "\n", "# absolute error on all ratings\n", "absolute_error = np.abs(np_predictions - labels)\n", "print(absolute_error)\n", "\n", "# weight\n", "weight = np.clip(labels, 0, 1)\n", "print(weight)\n", "\n", "# absoulte error on rated ratings\n", "abs_error = absolute_error * weight\n", "\n", "# MAE\n", "MAE = np.sum(abs_error) / np.sum(weight)\n", "\n", "print(\"MAE on Tesing set (User-based): \" + str(MAE));" ] }, { "cell_type": "markdown", "metadata": { "id": "j_L7yqJ4HXJa" }, "source": [ "#### Root Mean Squared Error (RMSE)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mV-BhWhQvtAV", "outputId": "e8801f4b-98b6-42ba-8c44-1a1f287558bd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE on Tesing set (User-based): 0.9263149166909437\n" ] } ], "source": [ "#==================RMSE on Testing set===================\n", "labels = test_ds.values\n", "\n", "# squared error on all ratings\n", "squared_error = np.square(np_predictions - labels)\n", "weight = np.clip(labels, 0, 1)\n", "\n", "# squared error on rated ratings\n", "squared_error = squared_error * weight\n", "\n", "# RMSE\n", "RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))\n", "\n", "print(\"RMSE on Tesing set (User-based): \" + str(RMSE));" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0. 5. 4. 4. 1.]\n", " [3. 4. 4. 3. 2.]\n", " [5. 3. 0. 5. 3.]\n", " [1. 1. 1. 5. 5.]]\n" ] } ], "source": [ "print(train_ds.T.values)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0. 3. 5. 1.]\n", " [5. 4. 3. 1.]\n", " [4. 4. 0. 1.]\n", " [4. 3. 5. 5.]\n", " [1. 2. 3. 5.]]\n" ] } ], "source": [ "print(train_ds.values)" ] }, { "cell_type": "markdown", "metadata": { "id": "ecoieIQTbQZV" }, "source": [ "## Item-based" ] }, { "cell_type": "markdown", "metadata": { "id": "4XHtJ0LpbQZW" }, "source": [ "### Compute Pearson Correlation Coefficient for Each Pair of Users in Training Dataset" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KgPMV2sobQZW", "outputId": "dad39c79-bb22-4810-ceba-88a6e213461e" }, "outputs": [ { "data": { "text/plain": [ "array([[ 1.60000000e-01, 1.44463024e-01, 3.51324026e-02,\n", " -1.04595272e-01],\n", " [ 1.44463024e-01, 2.00000000e-01, -4.35464879e-11,\n", " -1.52752523e-01],\n", " [ 3.51324026e-02, -4.35464879e-11, 1.60000000e-01,\n", " 3.13785842e-11],\n", " [-1.04595272e-01, -1.52752523e-01, 3.13785842e-11,\n", " 2.00000000e-01]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DELTA = 25\n", "EPSILON = 1e-9\n", "\n", "np_item_pearson_corr = np.zeros((n_items, n_items))\n", "\n", "for i, item_i_vec in enumerate(train_ds.T.values):\n", " for j, item_j_vec in enumerate(train_ds.T.values):\n", "\n", " # ratings corated by the current pair od items\n", " mask_i = item_i_vec > 0\n", " mask_j = item_j_vec > 0\n", "\n", " # corrated index, skip if there are no corrated ratings\n", " corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))\n", " if len(corrated_index) == 0:\n", " continue\n", "\n", " # average value of item_i_vec and item_j_vec\n", " mean_item_i = np.sum(item_i_vec) / (np.sum(np.clip(item_i_vec, 0, 1)) + EPSILON)\n", " mean_item_j = np.sum(item_j_vec) / (np.sum(np.clip(item_j_vec, 0, 1)) + EPSILON)\n", "\n", " # compute pearson corr\n", " item_i_sub_mean = item_i_vec[corrated_index] - mean_item_i\n", " item_j_sub_mean = item_j_vec[corrated_index] - mean_item_j\n", "\n", " r_ui_sub_ri_sq = np.square(item_i_sub_mean)\n", " r_uj_sub_rj_sq = np.square(item_j_sub_mean)\n", "\n", " r_ui_sub_ri_sq_sum_sqrt = np.sqrt(np.sum(r_ui_sub_ri_sq))\n", " r_uj_sub_rj_sq_sum_sqrt = np.sqrt(np.sum(r_uj_sub_rj_sq))\n", "\n", " sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sub_ri_sq_sum_sqrt * r_uj_sub_rj_sq_sum_sqrt + EPSILON)\n", "\n", " # significance weighting\n", " weighted_sim = (min(len(corrated_index), DELTA) / DELTA) * sim\n", "\n", " np_item_pearson_corr[i][j] = weighted_sim\n", "\n", "np_item_pearson_corr" ] }, { "cell_type": "markdown", "metadata": { "id": "2EANmxRhbQZX" }, "source": [ "### Predict Ratings" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "vyysZ6e7bQZX" }, "outputs": [], "source": [ "np_predictions = np.zeros((n_users, n_items))\n", "\n", "K = 10\n", "EPSILON = 1e-9\n", "\n", "for (i, j), rating in np.ndenumerate(test_ds.values):\n", " if rating > 0:\n", " # find top-k most similar items as the current item, remove itself\n", " sim_item_ids = np.argsort(np_item_pearson_corr[j])[-(K + 1):-1]\n", "\n", " # the coefficient values of similar items\n", " sim_val = np_item_pearson_corr[i][sim_item_ids]\n", "\n", " # the average value of the current item's ratings\n", " sim_items = train_ds.T.values[sim_item_ids]\n", " item_mean = np.sum(train_ds.T.values[j]) / (np.sum(np.clip(train_ds.T.values[j], 0, 1)) + EPSILON)\n", " sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)\n", "\n", " # sim(u, v) * (r_v - mean_v)\n", " sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean) \n", "\n", " # filter unrated items\n", " w = np.clip(sim_items[:, i], 0, 1)\n", " sim_r_sum_mean *= w\n", "\n", " if np.sum(sim_val) == 0:\n", " np_predictions[i][j] = item_mean\n", " else:\n", " np_predictions[i][j] = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val) + EPSILON)\n", " \n", " np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)\n", " " ] }, { "cell_type": "markdown", "metadata": { "id": "-rIh6rpJbQZY" }, "source": [ "### Evaluation" ] }, { "cell_type": "markdown", "metadata": { "id": "XbMQzZlibQZY" }, "source": [ "#### Mean Absolute Error (MAE)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bg2OzOBXbQZZ", "outputId": "5301a318-a035-4c9d-afbb-4e8d0c23e55e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MAE on Tesing set (Item-based): 0.2500000083021723\n" ] } ], "source": [ "#==================MAE on Testing set===================#\n", "labels = test_ds.values\n", "\n", "# absolute error on all ratings\n", "absolute_error = np.abs(np_predictions - labels)\n", "\n", "# weight\n", "weight = np.clip(labels, 0, 1)\n", "\n", "# absoulte error on rated ratings\n", "abs_error = absolute_error * weight\n", "\n", "# MAE\n", "MAE = np.sum(abs_error) / np.sum(weight)\n", "\n", "print(\"MAE on Tesing set (Item-based): \" + str(MAE));" ] }, { "cell_type": "markdown", "metadata": { "id": "ze_3YW5GbQZZ" }, "source": [ "#### Root Mean Squared Error (RMSE)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FEh_WNPXbQZa", "outputId": "57f49fa3-ca1b-4d0b-c534-4fb147ca5b2e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE on Tesing set (Item-based): 0.3535534023343184\n" ] } ], "source": [ "#==================RMSE on Testing set===================#\n", "labels = test_ds.values\n", "\n", "# squared error on all ratings\n", "squared_error = np.square(np_predictions - labels)\n", "weight = np.clip(labels, 0, 1)\n", "\n", "# squared error on rated ratings\n", "squared_error = squared_error * weight\n", "\n", "# RMSE\n", "RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))\n", "\n", "print(\"RMSE on Tesing set (Item-based): \" + str(RMSE));" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "KNN-based CF.ipynb", "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 1 }