{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "j0FI6PdE1YoN"
   },
   "source": [
    "# Import Necessary Libs\n",
    "# (This is the demo code for the toy example: rating.csv. The main purpose of this demo code is to help students understand the KNN-based CF algorithms.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "qwRX_2RCrWdg"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "UJRU5o4d1iCC"
   },
   "source": [
    "## Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "SPlwg9AgrYgp",
    "outputId": "b8871ff9-6581-44a6-bc48-1b3caaa57ead"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5 users\n",
      "4 items\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    user_id  item_id  rating\n",
       "0         0        0       5\n",
       "1         0        1       3\n",
       "2         0        2       5\n",
       "3         0        3       1\n",
       "4         1        0       5\n",
       "5         1        1       4\n",
       "6         1        2       3\n",
       "7         1        3       1\n",
       "8         2        0       4\n",
       "9         2        1       4\n",
       "10        2        2       5\n",
       "11        2        3       1\n",
       "12        3        0       4\n",
       "13        3        2       5\n",
       "14        3        3       5\n",
       "15        3        1       3\n",
       "16        4        0       1\n",
       "17        4        1       2\n",
       "18        4        2       3\n",
       "19        4        3       5"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names = ['user_id', 'item_id', 'rating']\n",
    "df = pd.read_csv('rating.csv', sep='\\t', names=names)\n",
    "df.head()\n",
    "\n",
    "n_users = df.user_id.unique().shape[0]\n",
    "n_items = df.item_id.unique().shape[0]\n",
    "print(str(n_users) + ' users')\n",
    "print(str(n_items) + ' items')\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "z90r54XGrmPj"
   },
   "source": [
    "# Split Dataset into Training Dataset and Testing Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "KlU-Lq16rlpN",
    "outputId": "259e8384-bd58-4dac-c856-b7a97db2580e"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(    user_id  item_id  rating\n",
       " 1         0        1       3\n",
       " 18        4        2       3\n",
       " 16        4        0       1\n",
       " 17        4        1       2\n",
       " 19        4        3       5\n",
       " 4         1        0       5\n",
       " 15        3        1       3\n",
       " 6         1        2       3\n",
       " 9         2        1       4\n",
       " 5         1        1       4\n",
       " 12        3        0       4\n",
       " 7         1        3       1\n",
       " 11        2        3       1\n",
       " 14        3        3       5\n",
       " 2         0        2       5\n",
       " 13        3        2       5\n",
       " 3         0        3       1\n",
       " 8         2        0       4,\n",
       "     user_id  item_id  rating\n",
       " 0         0        0       5\n",
       " 10        2        2       5)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df, test_df = train_test_split(df, test_size=0.1)\n",
    "train_df, test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "gvaLspXUrg9L",
    "outputId": "c7477ddc-0691-438d-f7bd-c23ae2d4bf55"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(     0    1    2    3\n",
       " 0  0.0  3.0  5.0  1.0\n",
       " 1  5.0  4.0  3.0  1.0\n",
       " 2  4.0  4.0  0.0  1.0\n",
       " 3  4.0  3.0  5.0  5.0\n",
       " 4  1.0  2.0  3.0  5.0,\n",
       "      0    1    2    3\n",
       " 0  5.0  0.0  0.0  0.0\n",
       " 1  0.0  0.0  0.0  0.0\n",
       " 2  0.0  0.0  5.0  0.0\n",
       " 3  0.0  0.0  0.0  0.0\n",
       " 4  0.0  0.0  0.0  0.0)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Training Dataset\n",
    "train_ds = np.zeros((n_users, n_items))\n",
    "for row in train_df.itertuples():\n",
    "    train_ds[row[1], row[2]] = row[3]\n",
    "train_ds = pd.DataFrame(train_ds)\n",
    "\n",
    "# Testing Dataset\n",
    "test_ds = np.zeros((n_users, n_items))\n",
    "for row in test_df.itertuples():\n",
    "    test_ds[row[1], row[2]] = row[3]\n",
    "test_ds = pd.DataFrame(test_ds)\n",
    "\n",
    "train_ds, test_ds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "E1ffZ6fCjYQ7"
   },
   "source": [
    "# Fitting the Algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "DYJZT1keazuc"
   },
   "source": [
    "## User-based"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "3bf9Ksc2OlL1"
   },
   "source": [
    "### Compute Pearson Correlation Coefficient for Each Pair of Users in Training Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "SH3O2sX3iBx1",
    "outputId": "f883bb40-6ac3-4abc-d432-c1790f2861ae"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 user: [0. 3. 5. 1.]; 0 user: [0. 3. 5. 1.]\n",
      "co-reated item index[1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.00000008e-09  2.00000000e+00 -2.00000000e+00]\n",
      "len(corrated_index)  is 3\n",
      "0 user: [0. 3. 5. 1.]; 1 user: [5. 4. 3. 1.]\n",
      "co-reated item index[1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.00000008e-09  2.00000000e+00 -2.00000000e+00]\n",
      "len(corrated_index)  is 3\n",
      "0 user: [0. 3. 5. 1.]; 2 user: [4. 4. 0. 1.]\n",
      "co-reated item index[1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.00000008e-09 -2.00000000e+00]\n",
      "len(corrated_index)  is 2\n",
      "0 user: [0. 3. 5. 1.]; 3 user: [4. 3. 5. 5.]\n",
      "co-reated item index[1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.00000008e-09  2.00000000e+00 -2.00000000e+00]\n",
      "len(corrated_index)  is 3\n",
      "0 user: [0. 3. 5. 1.]; 4 user: [1. 2. 3. 5.]\n",
      "co-reated item index[1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [0. 1. 1. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.00000008e-09  2.00000000e+00 -2.00000000e+00]\n",
      "len(corrated_index)  is 3\n",
      "1 user: [5. 4. 3. 1.]; 0 user: [0. 3. 5. 1.]\n",
      "co-reated item index[1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 3.2499999991875\n",
      "user_i_sub_mean  is [ 0.75 -0.25 -2.25]\n",
      "len(corrated_index)  is 3\n",
      "1 user: [5. 4. 3. 1.]; 1 user: [5. 4. 3. 1.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 3.2499999991875\n",
      "user_i_sub_mean  is [ 1.75  0.75 -0.25 -2.25]\n",
      "len(corrated_index)  is 4\n",
      "1 user: [5. 4. 3. 1.]; 2 user: [4. 4. 0. 1.]\n",
      "co-reated item index[0 1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 3.2499999991875\n",
      "user_i_sub_mean  is [ 1.75  0.75 -2.25]\n",
      "len(corrated_index)  is 3\n",
      "1 user: [5. 4. 3. 1.]; 3 user: [4. 3. 5. 5.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 3.2499999991875\n",
      "user_i_sub_mean  is [ 1.75  0.75 -0.25 -2.25]\n",
      "len(corrated_index)  is 4\n",
      "1 user: [5. 4. 3. 1.]; 4 user: [1. 2. 3. 5.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 3.2499999991875\n",
      "user_i_sub_mean  is [ 1.75  0.75 -0.25 -2.25]\n",
      "len(corrated_index)  is 4\n",
      "2 user: [4. 4. 0. 1.]; 0 user: [0. 3. 5. 1.]\n",
      "co-reated item index[1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1. -2.]\n",
      "len(corrated_index)  is 2\n",
      "2 user: [4. 4. 0. 1.]; 1 user: [5. 4. 3. 1.]\n",
      "co-reated item index[0 1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.  1. -2.]\n",
      "len(corrated_index)  is 3\n",
      "2 user: [4. 4. 0. 1.]; 2 user: [4. 4. 0. 1.]\n",
      "co-reated item index[0 1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.  1. -2.]\n",
      "len(corrated_index)  is 3\n",
      "2 user: [4. 4. 0. 1.]; 3 user: [4. 3. 5. 5.]\n",
      "co-reated item index[0 1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.  1. -2.]\n",
      "len(corrated_index)  is 3\n",
      "2 user: [4. 4. 0. 1.]; 4 user: [1. 2. 3. 5.]\n",
      "co-reated item index[0 1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 0. 1.]\n",
      "mean_user_i  is 2.999999999\n",
      "user_i_sub_mean  is [ 1.  1. -2.]\n",
      "len(corrated_index)  is 3\n",
      "3 user: [4. 3. 5. 5.]; 0 user: [0. 3. 5. 1.]\n",
      "co-reated item index[1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 4.2499999989375\n",
      "user_i_sub_mean  is [-1.25  0.75  0.75]\n",
      "len(corrated_index)  is 3\n",
      "3 user: [4. 3. 5. 5.]; 1 user: [5. 4. 3. 1.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 4.2499999989375\n",
      "user_i_sub_mean  is [-0.25 -1.25  0.75  0.75]\n",
      "len(corrated_index)  is 4\n",
      "3 user: [4. 3. 5. 5.]; 2 user: [4. 4. 0. 1.]\n",
      "co-reated item index[0 1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 4.2499999989375\n",
      "user_i_sub_mean  is [-0.25 -1.25  0.75]\n",
      "len(corrated_index)  is 3\n",
      "3 user: [4. 3. 5. 5.]; 3 user: [4. 3. 5. 5.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 4.2499999989375\n",
      "user_i_sub_mean  is [-0.25 -1.25  0.75  0.75]\n",
      "len(corrated_index)  is 4\n",
      "3 user: [4. 3. 5. 5.]; 4 user: [1. 2. 3. 5.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 4.2499999989375\n",
      "user_i_sub_mean  is [-0.25 -1.25  0.75  0.75]\n",
      "len(corrated_index)  is 4\n",
      "4 user: [1. 2. 3. 5.]; 0 user: [0. 3. 5. 1.]\n",
      "co-reated item index[1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 2.7499999993125\n",
      "user_i_sub_mean  is [-0.75  0.25  2.25]\n",
      "len(corrated_index)  is 3\n",
      "4 user: [1. 2. 3. 5.]; 1 user: [5. 4. 3. 1.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 2.7499999993125\n",
      "user_i_sub_mean  is [-1.75 -0.75  0.25  2.25]\n",
      "len(corrated_index)  is 4\n",
      "4 user: [1. 2. 3. 5.]; 2 user: [4. 4. 0. 1.]\n",
      "co-reated item index[0 1 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 2.7499999993125\n",
      "user_i_sub_mean  is [-1.75 -0.75  2.25]\n",
      "len(corrated_index)  is 3\n",
      "4 user: [1. 2. 3. 5.]; 3 user: [4. 3. 5. 5.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 2.7499999993125\n",
      "user_i_sub_mean  is [-1.75 -0.75  0.25  2.25]\n",
      "len(corrated_index)  is 4\n",
      "4 user: [1. 2. 3. 5.]; 4 user: [1. 2. 3. 5.]\n",
      "co-reated item index[0 1 2 3]\n",
      "np.clip(user_i_vec, 0, 1) is [1. 1. 1. 1.]\n",
      "mean_user_i  is 2.7499999993125\n",
      "user_i_sub_mean  is [-1.75 -0.75  0.25  2.25]\n",
      "len(corrated_index)  is 4\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[ 1.00000000e+00,  5.92999453e-01,  8.94427191e-01,\n",
       "         5.39163910e-11, -5.92999453e-01],\n",
       "       [ 5.92999453e-01,  1.00000000e+00,  9.69560705e-01,\n",
       "        -6.62541349e-01, -1.00000000e+00],\n",
       "       [ 8.94427191e-01,  9.69560705e-01,  1.00000000e+00,\n",
       "        -8.28078671e-01, -9.69560705e-01],\n",
       "       [ 5.39163910e-11, -6.62541349e-01, -8.28078671e-01,\n",
       "         1.00000000e+00,  6.62541349e-01],\n",
       "       [-5.92999453e-01, -1.00000000e+00, -9.69560705e-01,\n",
       "         6.62541349e-01,  1.00000000e+00]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "GAMMA = 0\n",
    "EPSILON = 1e-9\n",
    "\n",
    "np_user_pearson_corr = np.zeros((n_users, n_users))\n",
    "\n",
    "for i, user_i_vec in enumerate(train_ds.values):\n",
    "    for j, user_j_vec in enumerate(train_ds.values):\n",
    "        print(str(i) + \" user: \" + str(user_i_vec) + \"; \" + str(j) + \" user: \" + str(user_j_vec))\n",
    "        \n",
    "        # ratings corated by the current pair of users\n",
    "        mask_i = user_i_vec > 0\n",
    "        mask_j = user_j_vec > 0\n",
    "\n",
    "        # corrated item index, skip if there are no corrated ratings\n",
    "        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))\n",
    "        print(\"co-reated item index\" + str(corrated_index))\n",
    "        if len(corrated_index) == 0:\n",
    "            continue\n",
    "\n",
    "        # average value of user_i_vec and user_j_vec\n",
    "        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)\n",
    "        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)\n",
    "        \n",
    "        print(\"np.clip(user_i_vec, 0, 1)\" + \" is \" + str(np.clip(user_i_vec, 0, 1)))\n",
    "        \n",
    "\n",
    "        # compute pearson corr\n",
    "        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i\n",
    "        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j\n",
    "        \n",
    "        print(\"mean_user_i \" + \" is \" + str(mean_user_i))\n",
    "        print(\"user_i_sub_mean \" + \" is \" + str(user_i_sub_mean))\n",
    "\n",
    "        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)\n",
    "        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)\n",
    "\n",
    "        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))\n",
    "        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))\n",
    "\n",
    "        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)\n",
    "\n",
    "        # significance weighting\n",
    "        weighted_sim = sim\n",
    "        #weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim\n",
    "        print(\"len(corrated_index) \" + \" is \" + str(len(corrated_index)))\n",
    "\n",
    "        np_user_pearson_corr[i][j] = weighted_sim\n",
    "        \n",
    "        ###\n",
    "        #print(str(i) + \" user: \" + str(user_i_vec) + \"; \" + str(j) + \" user: \" + str(user_j_vec))\n",
    "        #print(\"co-reated item index\" + str(corrated_index))\n",
    "        #print(\"np.clip(user_i_vec, 0, 1)\" + \" is \" + str(np.clip(user_i_vec, 0, 1)))\n",
    "        #print(\"np.clip(user_j_vec, 0, 1)\" + \" is \" + str(np.clip(user_j_vec, 0, 1)))\n",
    "        #print('i' + \" user average: \" + str(mean_user_i))\n",
    "        #print('j' + \" user average: \" + str(mean_user_j))\n",
    "np_user_pearson_corr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "7OxHrFpIEHYm"
   },
   "source": [
    "### Predict Ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "id": "Z-REE2zqj5rt"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 0\n",
      "[4 3 1 2 0]\n",
      "[ 1.00000000e+00  5.92999453e-01  8.94427191e-01  5.39163910e-11\n",
      " -5.92999453e-01]\n",
      "[3 1 2]\n",
      "2 2\n",
      "[4 3 0 1 2]\n",
      "[ 0.89442719  0.96956071  1.         -0.82807867 -0.96956071]\n",
      "[3 0 1]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[4.29900607, 0.        , 0.        , 0.        ],\n",
       "       [0.        , 0.        , 0.        , 0.        ],\n",
       "       [0.        , 0.        , 3.89332654, 0.        ],\n",
       "       [0.        , 0.        , 0.        , 0.        ],\n",
       "       [0.        , 0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np_predictions = np.zeros((n_users, n_items))\n",
    "\n",
    "K = 3\n",
    "EPSILON = 1e-9\n",
    "\n",
    "for (i, j), rating in np.ndenumerate(test_ds.values):\n",
    "    if rating > 0:\n",
    "        print(str(i) + \" \" + str(j))\n",
    "        # find top-k most similar users as the current user, remove itself\n",
    "        sim_user_ids = np.argsort(np_user_pearson_corr[i])[-(K + 1):-1]\n",
    "        ### useful link for how to access values in numpy arrays: https://numpy.org/doc/stable/reference/arrays.indexing.html\n",
    "\n",
    "        print(np.argsort(np_user_pearson_corr[i]))\n",
    "        print(np_user_pearson_corr[i][:])\n",
    "        print(np.argsort(np_user_pearson_corr[i])[-(K + 1):-1])\n",
    "        # the coefficient values of similar users\n",
    "        sim_val = np_user_pearson_corr[i][sim_user_ids]\n",
    "\n",
    "        # the average value of the current user's ratings\n",
    "        sim_users = train_ds.values[sim_user_ids]\n",
    "        user_mean = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)\n",
    "        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)\n",
    "\n",
    "        #print(sim_users)\n",
    "        # select the users who rated item j\n",
    "        mask_rated_j = sim_users[:, j] > 0\n",
    "        \n",
    "        # sim(u, v) * (r_vj - mean_v)\n",
    "        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])\n",
    "\n",
    "        # filter unrated items\n",
    "        #w = np.clip(sim_users[mask_rated_j, j], 0, 1)\n",
    "        #sim_r_sum_mean *= w\n",
    "        #print(sim_users[:, j])\n",
    "        \n",
    "        np_predictions[i][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)\n",
    "        np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)\n",
    "np_predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "tiPKuFjQudyM"
   },
   "source": [
    "### Evaluation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gQZSW98ZGDJR"
   },
   "source": [
    "#### Mean Absolute Error (MAE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "20UzmEf9uhF_",
    "outputId": "78ab7be3-55f8-416e-e578-b13136768b62"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.70099393 0.         0.         0.        ]\n",
      " [0.         0.         0.         0.        ]\n",
      " [0.         0.         1.10667346 0.        ]\n",
      " [0.         0.         0.         0.        ]\n",
      " [0.         0.         0.         0.        ]]\n",
      "[[1. 0. 0. 0.]\n",
      " [0. 0. 0. 0.]\n",
      " [0. 0. 1. 0.]\n",
      " [0. 0. 0. 0.]\n",
      " [0. 0. 0. 0.]]\n",
      "MAE on Tesing set (User-based): 0.9038336987360527\n"
     ]
    }
   ],
   "source": [
    "#==================MAE on Testing set===================#\n",
    "labels = test_ds.values\n",
    "\n",
    "# absolute error on all ratings\n",
    "absolute_error = np.abs(np_predictions - labels)\n",
    "print(absolute_error)\n",
    "\n",
    "# weight\n",
    "weight = np.clip(labels, 0, 1)\n",
    "print(weight)\n",
    "\n",
    "# absoulte error on rated ratings\n",
    "abs_error = absolute_error * weight\n",
    "\n",
    "# MAE\n",
    "MAE = np.sum(abs_error) / np.sum(weight)\n",
    "\n",
    "print(\"MAE on Tesing set (User-based): \" + str(MAE));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "j_L7yqJ4HXJa"
   },
   "source": [
    "#### Root Mean Squared Error (RMSE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "mV-BhWhQvtAV",
    "outputId": "e8801f4b-98b6-42ba-8c44-1a1f287558bd"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE on Tesing set (User-based): 0.9263149166909437\n"
     ]
    }
   ],
   "source": [
    "#==================RMSE on Testing set===================\n",
    "labels = test_ds.values\n",
    "\n",
    "# squared error on all ratings\n",
    "squared_error = np.square(np_predictions - labels)\n",
    "weight = np.clip(labels, 0, 1)\n",
    "\n",
    "# squared error on rated ratings\n",
    "squared_error = squared_error * weight\n",
    "\n",
    "# RMSE\n",
    "RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))\n",
    "\n",
    "print(\"RMSE on Tesing set (User-based): \" + str(RMSE));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 5. 4. 4. 1.]\n",
      " [3. 4. 4. 3. 2.]\n",
      " [5. 3. 0. 5. 3.]\n",
      " [1. 1. 1. 5. 5.]]\n"
     ]
    }
   ],
   "source": [
    "print(train_ds.T.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 3. 5. 1.]\n",
      " [5. 4. 3. 1.]\n",
      " [4. 4. 0. 1.]\n",
      " [4. 3. 5. 5.]\n",
      " [1. 2. 3. 5.]]\n"
     ]
    }
   ],
   "source": [
    "print(train_ds.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ecoieIQTbQZV"
   },
   "source": [
    "## Item-based"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4XHtJ0LpbQZW"
   },
   "source": [
    "### Compute Pearson Correlation Coefficient for Each Pair of Users in Training Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "KgPMV2sobQZW",
    "outputId": "dad39c79-bb22-4810-ceba-88a6e213461e"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.60000000e-01,  1.44463024e-01,  3.51324026e-02,\n",
       "        -1.04595272e-01],\n",
       "       [ 1.44463024e-01,  2.00000000e-01, -4.35464879e-11,\n",
       "        -1.52752523e-01],\n",
       "       [ 3.51324026e-02, -4.35464879e-11,  1.60000000e-01,\n",
       "         3.13785842e-11],\n",
       "       [-1.04595272e-01, -1.52752523e-01,  3.13785842e-11,\n",
       "         2.00000000e-01]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DELTA = 25\n",
    "EPSILON = 1e-9\n",
    "\n",
    "np_item_pearson_corr = np.zeros((n_items, n_items))\n",
    "\n",
    "for i, item_i_vec in enumerate(train_ds.T.values):\n",
    "    for j, item_j_vec in enumerate(train_ds.T.values):\n",
    "\n",
    "        # ratings corated by the current pair od items\n",
    "        mask_i = item_i_vec > 0\n",
    "        mask_j = item_j_vec > 0\n",
    "\n",
    "        # corrated index, skip if there are no corrated ratings\n",
    "        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))\n",
    "        if len(corrated_index) == 0:\n",
    "            continue\n",
    "\n",
    "        # average value of item_i_vec and item_j_vec\n",
    "        mean_item_i = np.sum(item_i_vec) / (np.sum(np.clip(item_i_vec, 0, 1)) + EPSILON)\n",
    "        mean_item_j = np.sum(item_j_vec) / (np.sum(np.clip(item_j_vec, 0, 1)) + EPSILON)\n",
    "\n",
    "        # compute pearson corr\n",
    "        item_i_sub_mean = item_i_vec[corrated_index] - mean_item_i\n",
    "        item_j_sub_mean = item_j_vec[corrated_index] - mean_item_j\n",
    "\n",
    "        r_ui_sub_ri_sq = np.square(item_i_sub_mean)\n",
    "        r_uj_sub_rj_sq = np.square(item_j_sub_mean)\n",
    "\n",
    "        r_ui_sub_ri_sq_sum_sqrt = np.sqrt(np.sum(r_ui_sub_ri_sq))\n",
    "        r_uj_sub_rj_sq_sum_sqrt = np.sqrt(np.sum(r_uj_sub_rj_sq))\n",
    "\n",
    "        sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sub_ri_sq_sum_sqrt * r_uj_sub_rj_sq_sum_sqrt + EPSILON)\n",
    "\n",
    "        # significance weighting\n",
    "        weighted_sim = (min(len(corrated_index), DELTA) / DELTA) * sim\n",
    "\n",
    "        np_item_pearson_corr[i][j] = weighted_sim\n",
    "\n",
    "np_item_pearson_corr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "2EANmxRhbQZX"
   },
   "source": [
    "### Predict Ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "id": "vyysZ6e7bQZX"
   },
   "outputs": [],
   "source": [
    "np_predictions = np.zeros((n_users, n_items))\n",
    "\n",
    "K = 10\n",
    "EPSILON = 1e-9\n",
    "\n",
    "for (i, j), rating in np.ndenumerate(test_ds.values):\n",
    "    if rating > 0:\n",
    "        # find top-k most similar items as the current item, remove itself\n",
    "        sim_item_ids = np.argsort(np_item_pearson_corr[j])[-(K + 1):-1]\n",
    "\n",
    "        # the coefficient values of similar items\n",
    "        sim_val = np_item_pearson_corr[i][sim_item_ids]\n",
    "\n",
    "        # the average value of the current item's ratings\n",
    "        sim_items = train_ds.T.values[sim_item_ids]\n",
    "        item_mean = np.sum(train_ds.T.values[j]) / (np.sum(np.clip(train_ds.T.values[j], 0, 1)) + EPSILON)\n",
    "        sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)\n",
    "\n",
    "        # sim(u, v) * (r_v - mean_v)\n",
    "        sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean) \n",
    "\n",
    "        # filter unrated items\n",
    "        w = np.clip(sim_items[:, i], 0, 1)\n",
    "        sim_r_sum_mean *= w\n",
    "\n",
    "        if np.sum(sim_val) == 0:\n",
    "            np_predictions[i][j] = item_mean\n",
    "        else:\n",
    "            np_predictions[i][j] = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val) + EPSILON)\n",
    "            \n",
    "        np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-rIh6rpJbQZY"
   },
   "source": [
    "### Evaluation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XbMQzZlibQZY"
   },
   "source": [
    "#### Mean Absolute Error (MAE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bg2OzOBXbQZZ",
    "outputId": "5301a318-a035-4c9d-afbb-4e8d0c23e55e"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MAE on Tesing set (Item-based): 0.2500000083021723\n"
     ]
    }
   ],
   "source": [
    "#==================MAE on Testing set===================#\n",
    "labels = test_ds.values\n",
    "\n",
    "# absolute error on all ratings\n",
    "absolute_error = np.abs(np_predictions - labels)\n",
    "\n",
    "# weight\n",
    "weight = np.clip(labels, 0, 1)\n",
    "\n",
    "# absoulte error on rated ratings\n",
    "abs_error = absolute_error * weight\n",
    "\n",
    "# MAE\n",
    "MAE = np.sum(abs_error) / np.sum(weight)\n",
    "\n",
    "print(\"MAE on Tesing set (Item-based): \" + str(MAE));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ze_3YW5GbQZZ"
   },
   "source": [
    "#### Root Mean Squared Error (RMSE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "FEh_WNPXbQZa",
    "outputId": "57f49fa3-ca1b-4d0b-c534-4fb147ca5b2e"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE on Tesing set (Item-based): 0.3535534023343184\n"
     ]
    }
   ],
   "source": [
    "#==================RMSE on Testing set===================#\n",
    "labels = test_ds.values\n",
    "\n",
    "# squared error on all ratings\n",
    "squared_error = np.square(np_predictions - labels)\n",
    "weight = np.clip(labels, 0, 1)\n",
    "\n",
    "# squared error on rated ratings\n",
    "squared_error = squared_error * weight\n",
    "\n",
    "# RMSE\n",
    "RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))\n",
    "\n",
    "print(\"RMSE on Tesing set (Item-based): \" + str(RMSE));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "KNN-based CF.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}