Goal: To provide personalized recommendations based on the preferences of individual users.
Data: 100,000 movie ratings from 1000 users on 1700 movies, downloaded from GroupLens.org.
Methods:Findings: Collaborative filtering gives the highest accuracy among the models since it optimizes both the product features and user preferences, whereas the other two methods use only one set of parameters.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
The data consists of three tables: ratings, movies info, and users info.
To ensure statistical significance, users with less than 20 ratings, and movies with less than 3 ratings are excluded.
This dataset provides a list of ratings that users have given to movies. It includes 100,000 records and 3 fields: userID, movieID, and rating.
data = pd.read_table('u.data').drop('timestamp', axis=1)
N = len(data)
print data.shape
print list(data.columns)
data.head()
data.rating.value_counts(sort=False).plot(kind='bar')
plt.title('Rating Distribution\n')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()
Convert the table to a 2D matrix. The matrix will be sparse since most of the ratings are missing.
rating = data.pivot(index='userID', columns='movieID').rating
userID = rating.index
movieID = rating.columns
print rating.shape
rating.head()
This dataset provides movie details. It includes 1682 records and 21 fields: movie title, release date, and 19 movie genres in binary format.
movies_info = pd.read_table('u.item', sep='|').set_index('movieID').\
drop(['IMDb URL', 'video release date'], axis=1)
movies_info['release date'] = pd.to_datetime(movies_info['release date'])
Nm = len(data.movieID.unique())
print movies_info.shape
print list(movies_info.columns)
movies_info.head()
df = movies_info.iloc[:,2:].sum().order()
df.plot(kind='bar')
plt.title('Distribution of Movie Genres\n')
plt.ylabel('Count')
plt.show()
This dataset provides the user demographic information. It includes 943 records and 2 fields: age, and gender.
users_info = pd.read_table('u.user', sep='|').set_index('userID').\
drop(['occupation','zipcode'], axis=1)
Nu = len(data.userID.unique())
print users_info.shape
print list(users_info.columns)
users_info.head()
users_info.gender.value_counts().plot(kind='pie', autopct='%d%%',
labels=['Male', 'Female'], colors=['g', 'r'])
plt.title('Gender Distribution\n')
plt.axis('image')
plt.show()
users_info.age.hist()
plt.title('Age Distribution\n')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
Create age groups using the IMDb binning convention:
users_info['age_group'] = pd.cut(users_info.age, [0,18,29,45,np.inf])
users_info.drop('age', axis=1, inplace=True)
users_info.head()
joined = data.join(users_info, on='userID').\
join(movies_info.iloc[:,3:], on='movieID')
joined.sort(['userID', 'movieID'], inplace=True)
print joined.shape
joined.head()
avg = joined.groupby(['movieID', 'gender', 'age_group']).rating.mean()
avg.head()
.
Add the average rating to data, and drop gender and age group.
newdata = joined.join(avg, rsuffix='_avg', on=['movieID', 'gender', 'age_group'])
newdata.drop(['gender', 'age_group'], axis=1, inplace=True)
newdata.set_index(['userID', 'movieID'], inplace=True)
newdata.head()
Model the ratings based on movie genres, and extract the user preferences towards each genre.
Y = newdata.rating - newdata.rating_avg
X = newdata.drop(['rating', 'rating_avg'], axis=1)
Use a regularization parameter (alpha) to allow model optimization.
def LinReg(Xtrain, ytrain, Xtest=None, alpha=1):
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(Xtrain, ytrain)
pred = model.predict(Xtrain)
if Xtest is None:
return pred
else:
pred_test = model.predict(Xtest)
return pred, pred_test
def rmse(y, yhat):
err = (y - yhat) ** 2
return np.sqrt(err.mean())
Optimize the regularization parameter (alpha) by cross-validation. The purpose of regularization is to minimize overfitting/underfitting (bias/variance trade off).
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
error, CVerror = [], []
for a in alpha:
for user in userID:
x, y = X.ix[user], Y.ix[user]
err, CVerr = [], []
for train, test in KFold(len(y), n_folds=3, shuffle=True):
Xtrain, Xtest = x.ix[x.index[train]], x.ix[x.index[test]]
ytrain, ytest = y.ix[x.index[train]], y.ix[x.index[test]]
pred_train, pred_test = LinReg(Xtrain, ytrain, Xtest, alpha=a)
## training error
err.append(rmse(ytrain, pred_train))
## cross-validation error
CVerr.append(rmse(ytest, pred_test))
err_avg, CVerr_avg = sum(err)/len(err), sum(CVerr)/len(CVerr)
error.append(err_avg)
CVerror.append(CVerr_avg)
ax = plt.gca()
plt.plot(alpha, error, '-o')
plt.plot(alpha, CVerror, '-o', color='r')
ax.set_xscale('log')
plt.xlabel('Regularization Parameter')
plt.ylabel('Error (RMSE)')
plt.title('Model Optimization')
plt.text(0.004, 0.99, 'training', color='b')
plt.text(0.003, 1.13, 'cross-validation', color='r')
plt.axvline(1, color='k', linestyle='dashed')
plt.show()
At low alpha, the training error is small but the CV error is large => high variance (overfitting)
At high alpha, both the training and CV errors are high => high bias (underfitting)
alpha = 1 gives the optimum result.
alpha = 1.0
err = []
for user in userID:
x, y = X.ix[user], Y.ix[user]
pred = LinReg(x, y, alpha=alpha)
err.append(rmse(y, pred))
print 'Model RMSE =', sum(err) / len(err)
def jitter(y, yhat, title=''):
pred = np.round(yhat)
pred[pred > 5] = 5
pred[pred < 1] = 1
def noise():
return np.random.randn(len(y)) * 0.1
y_jitter = y + noise()
yhat_jitter = pred + noise()
plt.scatter(y_jitter, yhat_jitter)
plt.xlabel('Actual Rating')
plt.ylabel('Predicted Rating')
plt.title(title)
plt.axis('image')
plt.grid()
plt.show()
model = Ridge(fit_intercept=False, alpha=1)
user = 1
model.fit(X.ix[user], Y.ix[user])
yhat = model.predict(X.ix[user])
jitter(Y.ix[user] + newdata.rating_avg.ix[user], yhat + newdata.rating_avg.ix[user], 'Model Predictions')
beta = pd.Series(model.coef_)
beta.index = X.columns
beta.order().plot(kind='bar')
plt.title('User Preferences')
plt.show()
User 1 prefers Sci-Fi and documentary movies, and is not a fan of children's and adventure movies.
Use euclidean distance as a measure of similarity.
def dist(i,j):
d = (rating.ix[i] - rating.ix[j])**2 ## euclidean distance
## Only consider the users who have at least 3 movies in common
if d.count() >= 3:
return np.sqrt(d.mean())
## Distance matrix
D = np.empty((Nu,Nu))
D.fill(np.nan)
D = pd.DataFrame(D)
D.index, D.columns = userID, userID
for user1 in userID:
print user1,
for user2 in userID[userID > user1]:
D[user1].ix[user2] = dist(user1, user2)
D[user2].ix[user1] = D[user1].ix[user2]
D.columns = D.index
D.head()
Take the average rating of k nearest neighbors.
def kNN_predict(user, k=5):
## Sort users by distance
neighbors = D.ix[user].order().index
## Function for calculting the kNN average
def kNN_average(x):
return x.ix[neighbors].dropna().head(k).mean()
## Apply kNN_average to the movies
movies = rating.ix[user].dropna().index
pred = rating[movies].apply(kNN_average, axis=0)
return pred
user = 1
K = [1, 2, 3, 4, 5, 7, 8, 9, 10, 15, 20]
error = []
for k in K:
y = rating.ix[user]
yhat = kNN_predict(user, k=k)
err = rmse(y, yhat)
error.append(err)
plt.plot(K, error, '-o')
plt.xlabel('# Nearest Neighbors')
plt.ylabel('Error (RMSE)')
plt.title('Model Optimization')
plt.show()
k = 5 gives the lowest error.
errors = np.empty(Nu)
for user in userID:
pred = kNN_predict(user, k=5)
errors[user-1] = rmse(rating.ix[user], pred)
print 'Model RMSE =', errors.mean()
user = 1
pred = kNN_predict(user, k=5)
jitter(rating.ix[user].dropna(), pred, 'Model Predictions')
In this method, linear regression is applied to fit both the user parameters (U) and the movie features (M). U and M are first initialized to small random values, and then are fit consecutively while keeping the other one constant. The process is repeated until convergence.
## Shuffle the data
rowID = np.array(data.index)
np.random.shuffle(rowID)
shuffled = data.ix[rowID].sort('userID').set_index('userID')
## Split the train and test set for each user
def split(user):
df = shuffled.ix[user]
N = len(df)
Ntest = N/3
Ntrain = N - Ntest
train = df.head(Ntrain)
test = df.tail(Ntest)
return train, test
## Create the train and test sets for all users
for user in shuffled.index.unique():
if user == 1:
train, test = split(user)
else:
a, b = split(user)
train = train.append(a)
test = test.append(b)
train.reset_index(inplace=True)
test.reset_index(inplace=True)
## Create train and test matrices
rating_train = train.pivot(index='userID', columns='movieID').rating
rating_test = test.pivot(index='userID', columns='movieID').rating
userID, movieID = rating_train.index, rating_train.columns
Nu, Nm = len(userID), len(movieID)
print 'training matrix =', rating_train.shape
print 'test matrix =', rating_test.shape
## number of movie features
n = 20
## Matrix of user preferences
U = pd.DataFrame(np.random.rand(Nu, n), index=userID, columns=range(1, n+1))
## Matrix of movie features
M = pd.DataFrame(np.random.rand(Nm, n), index=movieID, columns=range(1, n+1))
U0, M0 = U.copy(), M.copy()
print U.shape, M.shape
## Fit the user preferences (U)
def fitU(alpha=1):
## Join ratings and movie features
Udata = train.set_index('movieID').join(M).sort('userID').set_index('userID')
## Function for fitting individual users
model = Ridge(fit_intercept=False, alpha=alpha)
def Ufit(i):
df = Udata.ix[i]
X = df.drop('rating', axis=1)
y = df.rating
model.fit(X, y)
return model.coef_
## Fit all users
for i in userID:
U.ix[i] = Ufit(i)
## Calculate the error
pred = predict(U, M)
error = rmse(rating_train, pred)
error_history.append(error)
delta = bestFit['error'] - error
if delta > 0:
bestFit['U'] = U
bestFit['M'] = M
bestFit['error'] = error
return delta
## Fit the movie features (M)
def fitM(alpha=1):
## Join ratings and user preferences
Mdata = train.set_index('userID').join(U).sort('movieID').set_index('movieID')
## Function for fitting individual movies
model = Ridge(fit_intercept=False, alpha=alpha)
def Mfit(j):
df = Mdata.ix[j]
X = df.drop('rating', axis=1)
y = df.rating
model.fit(X, y)
return model.coef_
## Fit all movies
for j in movieID:
M.ix[j] = Mfit(j)
## Calculate the error
pred = predict(U, M)
error = rmse(rating_train, pred)
error_history.append(error)
delta = bestFit['error'] - error
if delta > 0:
bestFit['U'] = U
bestFit['M'] = M
bestFit['error'] = error
return delta
## Predict the ratings (U dot M)
def predict(U, M):
pred = U.dot(M.T)
pred[pred > 5] = 5
pred[pred < 1] = 1
return pred
## Repeat the optimization until convergence
def fitUM(alpha=1, tol=0.05):
U, M = U0.copy(), M0.copy()
delta = fitU(alpha=alpha)
tolerance = tol
while delta > tolerance:
delta = fitM(alpha=alpha)
if delta > tolerance:
delta = fitU(alpha=alpha)
return error_history
def CVerror(alpha):
error_history = []
bestFit = {'U': U0, 'M': M0, 'error': np.inf}
fitUM(alpha)
U_opt, M_opt, train_error = bestFit['U'], bestFit['M'], bestFit['error']
pred = predict(U_opt, M_opt)
train_error = rmse(rating_train, pred)
CV_error = rmse(rating_test, pred)
return train_error, CV_error
alphas = [0.01, 0.1, 1, 3, 10, 30, 100]
train_errors, CV_errors = [], []
for alpha in alphas:
err1, err2 = CVerror(alpha)
train_errors.append(err1)
CV_errors.append(err2)
ax = plt.gca()
plt.plot(alphas, train_errors, '-o')
plt.plot(alphas, CV_errors, '-o', color='r')
ax.set_xscale('log')
plt.xlabel('Regularization Parameter')
plt.ylabel('Error (RMSE)')
plt.title('Model Optimization')
plt.text(0.03, 8, 'training', color='b')
plt.text(0.02, 14, 'cross-validation', color='r')
plt.axvline(10, color='k', linestyle='dashed')
plt.show()
alpha = 10 gives the lowest CV error.
alpha = 10.0
U = pd.DataFrame(np.random.rand(Nu, n), index=userID, columns=range(1, n+1))
M = pd.DataFrame(np.random.rand(Nm, n), index=movieID, columns=range(1, n+1))
U0, M0 = U.copy(), M.copy()
error_history = []
bestFit = {'U': U0, 'M': M0, 'error': np.inf}
fitUM(alpha)
U_opt, M_opt, error = bestFit['U'], bestFit['M'], bestFit['error']
pred = predict(U_opt, M_opt)
print 'Model RMSE =', error
user = 1
jitter(rating.ix[user], U_opt.ix[user].dot(M_opt.T), 'Model Predictions')
For each user predict the ratings of unseen movies and recommend those with ratings higher than 4.5 sorted by movie popularity.
def recommend(user):
unrated_movies = rating.ix[user][rating.ix[user].isnull()].index
pred = predict(U_opt.ix[user], M_opt.ix[unrated_movies])
movies = movies_info.ix[unrated_movies]
movies['predicted rating'] = pred
movies['popularity'] = data.movieID.value_counts()
top_movies = movies[movies['predicted rating'] >= 4.5]
top_movies_sorted = top_movies.sort('popularity',
ascending=False, inplace=False)
return top_movies_sorted[['title', 'popularity', 'predicted rating',
'release date']]
recommend(1).head()