# Cmd 2: AWS credentials.
# SECURITY: the original cell hardcoded the access/secret key pair as plain
# string literals, so anyone with the notebook file (or its revision history)
# could read them. Read them from the environment instead; the notebook never
# contains the secrets, and downstream cells keep using the same variable names.
import os

AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", "")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY", "")
# Hand the AWS credentials to Hadoop's S3 connector so Spark can read from S3.
# NOTE(review): these are the legacy "s3n" filesystem keys, while the data is
# later read via an "s3://" URI — confirm the cluster maps s3:// to the s3n
# connector (Databricks-specific); otherwise the s3a keys may be needed.
for config_key, config_value in (
    ("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY),
    ("fs.s3n.awsSecretAccessKey", AWS_SECRET_KEY),
):
    sc._jsc.hadoopConfiguration().set(config_key, config_value)
# Load the MovieLens-style "u.data" ratings file (tab-separated, with header)
# from S3, then collect it to the driver as a pandas DataFrame so the
# single-node Surprise library can consume it.
df = spark.read.csv(
    "s3://databricks-recsys/u.data",
    header=True,
    sep="\t",
    inferSchema=True,
)
pdf = df.toPandas()
"]}}],"execution_count":4},{"cell_type":"code","source":["!pip install scikit-surprise"],"metadata":{},"outputs":[{"output_type":"display_data","metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":5},{"cell_type":"code","source":["# https://github.com/NicolasHug/Surprise\n#https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py\nfrom surprise import SVD, Dataset, Reader\nfrom surprise.accuracy import rmse\nfrom collections import defaultdict"],"metadata":{},"outputs":[{"output_type":"display_data","metadata":{},"output_type":"display_data","data":{"text/html":["\n
def get_top_n(predictions, n=10):
    """Return the top-N recommendations per user from a set of predictions.

    Args:
        predictions (list of Prediction tuples): (uid, iid, true_r, est, details)
            tuples as returned by an algorithm's ``test`` method.
        n (int): How many recommendations to keep per user. Defaults to 10.

    Returns:
        defaultdict mapping each raw user id to a list of up to ``n``
        (raw item id, estimated rating) tuples, best first.
    """
    # Group every predicted (item, estimate) pair under its user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Rank each user's candidates by estimated rating and keep the n best.
    for uid in top_n:
        ranked = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n
# A Reader is still needed, but only the rating_scale parameter is required
# here (these ratings run 1-5).
reader = Reader(rating_scale=(1, 5))

# load_from_df expects the columns in this exact order: user id, item id, rating.
rating_columns = ['uid', 'iid', 'rating']
data = Dataset.load_from_df(pdf[rating_columns], reader)
# Build the full training set from the in-memory ratings DataFrame
# (no hold-out split: every known rating is used for fitting).
trainset = data.build_full_trainset()

# Use an example algorithm: SVD (matrix factorization).
algo = SVD()
algo.fit(trainset) 
# These are genuine recommendations: we score every (user, item) pair that is
# NOT in the training set, i.e. items the user has never rated, so there is no
# ground truth to compare against.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
# top_n: {raw user id -> [(raw item id, estimated rating), ...]} best-first.
top_n = get_top_n(predictions, n=10)
from pyspark.sql.types import StringType
import json
import pandas as pd

def recommend(row):
    """Map one Kafka message to a JSON recommendation payload.

    Args:
        row (str): JSON string containing at least a 'uid' field.

    Returns:
        str: JSON string {'uid': <uid>, 'pred': [item ids]} with the
        precomputed top-N item ids for that user. Unknown uids produce an
        empty 'pred' list (top_n is a defaultdict).
    """
    d = json.loads(row)
    item_ids = [item_id for item_id, _est in top_n[int(d['uid'])]]
    # json.dumps already returns a str; the original wrapped it in a
    # redundant str() call.
    return json.dumps({'uid': d['uid'], 'pred': item_ids})
# `udf` was used without being imported anywhere in the notebook source;
# make the dependency explicit so the cell survives a fresh kernel.
from pyspark.sql.functions import udf

# Subscribe to the input topic as an unbounded streaming DataFrame.
df = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", "155.138.192.245:9092") \
    .option("subscribe", "quickstart-events") \
    .option("startingOffsets", "latest").load()

# Kafka delivers values as bytes; cast to string before JSON parsing.
df = df.selectExpr("CAST(value AS STRING)")

# Wrap the Python recommender so Spark can apply it per row.
recommend_udf = udf(recommend, StringType())
df = df.select(recommend_udf("value").alias("value"))
# Publish the recommendations back to Kafka.
# BUG FIX: the original assigned `query = ...start().awaitTermination()`.
# awaitTermination() returns None (and only once the stream ends), so `query`
# never held the StreamingQuery handle and the stream could not be inspected
# or stopped. Keep the handle from start(), then block on it.
query = df.writeStream.format("kafka") \
    .option("kafka.bootstrap.servers", "155.138.192.245:9092") \
    .option("topic", "recommendation-events") \
    .option("checkpointLocation", "/temp") \
    .start()
query.awaitTermination()