{"cells":[{"cell_type":"markdown","source":["In Cmd 2, the AWS_ACCESS_KEY and AWS_SECRET_KEY variables are set and kept hidden."],"metadata":{}},{"cell_type":"code","source":["AWS_ACCESS_KEY = \"AA\"\nAWS_SECRET_KEY = \"BB\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":2},{"cell_type":"code","source":["sc._jsc.hadoopConfiguration().set(\"fs.s3n.awsAccessKeyId\", AWS_ACCESS_KEY)\nsc._jsc.hadoopConfiguration().set(\"fs.s3n.awsSecretAccessKey\", AWS_SECRET_KEY)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":3},{"cell_type":"code","source":["df = spark.read.csv(\"s3://databricks-recsys/u.data\",header=True, sep=\"\\t\",inferSchema = True)\npdf = df.toPandas()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":4},{"cell_type":"code","source":["# https://github.com/NicolasHug/Surprise\n#https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py\nfrom surprise import SVD, Dataset, Reader\nfrom surprise.accuracy import rmse\nfrom collections import defaultdict"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":5},{"cell_type":"code","source":["def get_top_n(predictions, n=10):\n \"\"\"Return the top-N recommendation for each user from a set of predictions.\n Args:\n predictions(list of Prediction objects): The list of predictions, as\n returned by the test method of an algorithm.\n n(int): The number of recommendation to output for each user. Default\n is 10.\n Returns:\n A dict where keys are user (raw) ids and values are lists of tuples:\n [(raw item id, rating estimation), ...] of size n.\n \"\"\"\n\n # First map the predictions to each user.\n top_n = defaultdict(list)\n for uid, iid, true_r, est, _ in predictions:\n top_n[uid].append((iid, est))\n\n # Then sort the predictions for each user and retrieve the k highest ones.\n for uid, user_ratings in top_n.items():\n user_ratings.sort(key=lambda x: x[1], reverse=True)\n top_n[uid] = user_ratings[:n]\n\n return top_n"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":6},{"cell_type":"code","source":["# A reader is still needed but only the rating_scale param is requiered.\nreader = Reader(rating_scale=(1, 5))\n\n# The columns must correspond to user id, item id and ratings (in that order).\ndata = Dataset.load_from_df(pdf[['uid', 'iid', 'rating']], reader)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":7},{"cell_type":"code","source":["# Load the movielens-100k dataset (download it if needed).\ntrainset = data.build_full_trainset()\n\n# Use an example algorithm: SVD.\nalgo = SVD()\nalgo.fit(trainset) "],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n