In Cmd 2, the AWS_ACCESS_KEY and AWS_SECRET_KEY variables are set and kept hidden.

In [2]:
AWS_ACCESS_KEY = "AA"
AWS_SECRET_KEY = "BB"

In [3]:
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_KEY)

In [4]:
df = spark.read.csv("s3://databricks-recsys/u.data",header=True, sep="\t",inferSchema = True)
display(df)

uid,iid,rating,timestamp
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596
298,474,4,884182806
115,265,2,881171488
253,465,5,891628467
305,451,3,886324817
6,86,3,883603013


In [5]:
movies_sdf = spark.read.csv("s3://databricks-recsys/movies_raw.dat",header=False, sep="|",inferSchema = True)
display(movies_sdf)

_c0,_c1
1,Toy Story (1995)
2,GoldenEye (1995)
3,Four Rooms (1995)
4,Get Shorty (1995)
5,Copycat (1995)
6,Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
7,Twelve Monkeys (1995)
8,Babe (1995)
9,Dead Man Walking (1995)
10,Richard III (1995)


In [6]:
ratings = df.rdd

numRatings = ratings.count()
numUsers = ratings.map(lambda r: r[0]).distinct().count()
numMovies = ratings.map(lambda r: r[1]).distinct().count()

print("Got %d ratings from %d users on %d movies." % (numRatings, numUsers, numMovies))

In [7]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrameNaFunctions as DFna
from pyspark.sql.functions import udf, col, when

movies_counts = df.groupBy(col("iid")).agg(F.count(col("rating")).alias("counts"))
movies_counts.show()

In [8]:
training_df, validation_df, test_df = df.randomSplit([.6, .2, .2], seed=594)

In [9]:
df.randomSplit?

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.sql import Row
import numpy as np
import math

In [11]:
seed = 594
iterations = 10
regularization_parameter = 0.1
ranks = range(4, 12)
errors = []
err = 0
tolerance = 0.02

In [12]:
min_error = float('inf')
best_rank = -1
best_iteration = -1

for rank in ranks:
    als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=rank, userCol="uid", itemCol="iid", ratingCol="rating")
    model = als.fit(training_df)
    predictions = model.transform(validation_df)
    new_predictions = predictions.filter(col('prediction') != np.nan)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(new_predictions)
    errors.append(rmse)

    print('For rank %s the RMSE is %s' % (rank, rmse))
    if rmse < min_error:
        min_error = rmse
        best_rank = rank
print('The best model was trained with rank %s' % best_rank)

In [13]:
training_df.take(3)

In [14]:
validation_df.take(3)

In [15]:
all_except_test_df = training_df.union(validation_df)

In [16]:
final_als = ALS(maxIter=10, regParam=0.1, rank=6, userCol="uid", itemCol="iid", ratingCol="rating")
final_model = final_als.fit(all_except_test_df)
final_pred = final_model.transform(test_df)
final_pred = final_pred.filter(col('prediction') != np.nan)
rmse = evaluator.evaluate(final_pred)
print("the one time final rmse (this is an internal metric) for our model is: {}".format(rmse))

In [17]:
np.random.seed(594)
user_id = np.random.choice(numUsers)

In [18]:
new_user_ratings = df.filter(df.uid == user_id)
new_user_ratings.sort('rating', ascending=True).take(10) # top rated movies for this user

In [19]:
new_user_ratings.describe('rating').show()

In [20]:
display(new_user_ratings)

uid,iid,rating,timestamp
577,471,3,880471640
577,229,4,880475094
577,284,4,880470732
577,996,3,880475094
577,204,4,880474338
577,662,4,880474933
577,117,4,880471359
577,684,4,880474394
577,95,5,880474747
577,188,3,880474715


In [21]:
new_user_rated_iids = [i.iid for i in new_user_ratings.select('iid').distinct().collect()]
movies_of_interest = [i.iid for i in movies_counts.filter(movies_counts.counts > 25).select('iid').distinct().collect()]
new_user_unrated_iids = list(set(movies_of_interest) - set(new_user_rated_iids))

In [22]:
import time
cols = ('uid', 'iid', 'timestamp')
new_user_preds = sqlContext.createDataFrame(zip([user_id] * len(new_user_unrated_iids), new_user_unrated_iids, [int(time.time())] * len(new_user_unrated_iids)), cols)
new_user_preds = final_model.transform(new_user_preds).filter(col('prediction') != np.nan)

In [23]:
new_user_preds.join(movies_sdf,new_user_preds.iid ==  movies_sdf._c0,"left").sort('prediction', ascending=False).take(10)

In [24]:
display(new_user_preds)

uid,iid,timestamp,prediction
577,148,1601567162,3.7349138
577,137,1601567162,3.452837
577,53,1601567162,2.6230783
577,78,1601567162,2.6354327
577,108,1601567162,2.6505985
577,155,1601567162,3.7758074
577,101,1601567162,3.1562262
577,126,1601567162,4.2304254
577,81,1601567162,3.4848535
577,76,1601567162,3.232955
