In Cmd 2, the AWS_ACCESS_KEY and AWS_SECRET_KEY variables are set and kept hidden.

In [2]:
# Resolve the AWS credentials from the environment so real keys never have to
# be typed into (or saved with) the notebook. The literal fallbacks are the
# original placeholder values, kept so the cell still runs when the variables
# are not set.
import os

AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "notsecret")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "secret")

In [3]:
# Hand the S3 credentials to Spark's Hadoop configuration under the
# fs.s3n.* keys.
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY),
    ("fs.s3n.awsSecretAccessKey", AWS_SECRET_KEY),
):
    hadoop_conf.set(conf_key, conf_value)

In [4]:
# Read the tab-separated ratings file from S3; the first line is treated as
# the header and column types are inferred by Spark.
df = spark.read.csv(
    "s3://databricks-recsys/u.data",
    sep="\t",
    header=True,
    inferSchema=True,
)

# Collect the whole Spark DataFrame into a pandas DataFrame on the driver.
pdf = df.toPandas()

In [5]:
# Install scikit-surprise, the package that provides the `surprise` module
# imported in the next cell.
!pip install scikit-surprise

In [6]:
# https://github.com/NicolasHug/Surprise
#https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
from surprise import SVD, Dataset, Reader
from surprise.accuracy import rmse
from collections import defaultdict

In [7]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions: Iterable of 5-element prediction records unpacked as
            (user id, item id, true rating, estimated rating, details),
            e.g. the list returned by an algorithm's test method.
        n (int): How many recommendations to keep per user. Defaults to 10.

    Returns:
        A defaultdict(list) mapping each raw user id to a list of
        (raw item id, estimated rating) tuples, ordered from the highest
        estimate to the lowest and truncated to n entries.
    """
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it belongs to.
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's bucket by estimate, best first, and keep the head.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

In [8]:
# Loading from a DataFrame still needs a Reader, but only its rating_scale
# parameter is required.
reader = Reader(rating_scale=(1, 5))

# load_from_df expects the columns in (user id, item id, rating) order.
rating_columns = ['uid', 'iid', 'rating']
data = Dataset.load_from_df(pdf[rating_columns], reader)

In [9]:
# Build the training set from every rating in `data` (no hold-out split).
trainset = data.build_full_trainset()

# Use an example algorithm: SVD. Its factors are initialised randomly, so
# random_state is pinned to make the fitted model, and therefore the
# recommendations derived from it, repeatable across runs.
algo = SVD(random_state=42)
algo.fit(trainset)

In [10]:
# These are genuine predictions rather than an evaluation: the anti-testset
# holds every (user, item) pair that is NOT in the training set, i.e. items
# the user has not rated, so there is no ground truth to compare against.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Keep the 10 highest estimated ratings for each user.
top_n = get_top_n(predictions=predictions, n=10)

In [11]:
from pyspark.sql.types import StringType
import json
import pandas as pd

def recommend(row):
    """Build the JSON recommendation message for one incoming event.

    Args:
        row (str): JSON text of an object that has a 'uid' entry whose value
            int() accepts.

    Returns:
        str: JSON text of {'uid': <uid as received>, 'pred': [item ids]}.
        The item ids are the first element of each entry stored for that
        user in the module-level `top_n` mapping; a user with no entry gets
        an empty 'pred' list.

    Raises:
        Malformed input is not handled here: errors from json.loads, a
        missing 'uid' key (KeyError) or a uid that int() rejects propagate
        to the caller.
    """
    event = json.loads(row)
    uid = event['uid']
    # Use .get() instead of top_n[...]: top_n is a defaultdict, so
    # subscripting would insert a new empty list for every unknown user and
    # let the mapping grow for as long as the stream runs.
    ranked = top_n.get(int(uid), ())
    return json.dumps({'uid': uid, 'pred': [entry[0] for entry in ranked]})

In [12]:
# Import `udf` explicitly so this cell does not rely on the runtime having
# pre-loaded pyspark.sql.functions into the notebook namespace.
from pyspark.sql.functions import udf

# Subscribe to the incoming event topic, starting from the latest offsets.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "155.138.192.245:9092")
    .option("subscribe", "quickstart-events")
    .option("startingOffsets", "latest")
    .load()
)

# Cast the Kafka `value` column to a string so recommend() can parse it as JSON.
df = df.selectExpr("CAST(value AS STRING)")

# Run recommend() on every message; the result keeps the column name "value".
recommend_udf = udf(recommend, StringType())
df = df.select(recommend_udf("value").alias("value"))

In [13]:
# Start streaming the recommendations to the output topic. `query` is bound
# to the value returned by start(); chaining .awaitTermination() onto the
# assignment would bind its return value instead and lose that handle.
query = (
    df.writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "155.138.192.245:9092")
    .option("topic", "recommendation-events")
    .option("checkpointLocation", "/temp")
    .start()
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()