{"cells":[{"cell_type":"markdown","source":["In Cmd 2, the AWS_ACCESS_KEY and AWS_SECRET_KEY variables are read from environment variables so that the secrets never appear in the notebook."],"metadata":{}},{"cell_type":"code","source":["import os\n\n# Read the AWS credentials from the standard AWS environment variables instead of\n# hardcoding them in the notebook; a missing variable raises a KeyError naming it.\nAWS_ACCESS_KEY = os.environ[\"AWS_ACCESS_KEY_ID\"]\nAWS_SECRET_KEY = os.environ[\"AWS_SECRET_ACCESS_KEY\"]"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":2},{"cell_type":"code","source":["# Register the AWS credentials on the Hadoop configuration of the Spark context.\nhadoop_conf = sc._jsc.hadoopConfiguration()\nhadoop_conf.set(\"fs.s3n.awsAccessKeyId\", AWS_ACCESS_KEY)\nhadoop_conf.set(\"fs.s3n.awsSecretAccessKey\", AWS_SECRET_KEY)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":3},{"cell_type":"code","source":["# Read the tab-separated ratings file (first row is the header) into a Spark\n# DataFrame, letting Spark infer the column types, then convert it to pandas.\nratings_reader = spark.read.options(header=True, sep=\"\\t\", inferSchema=True)\ndf = ratings_reader.csv(\"s3://databricks-recsys/u.data\")\npdf = df.toPandas()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":4},{"cell_type":"code","source":["# https://github.com/NicolasHug/Surprise\n#https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py\nfrom surprise import SVD, Dataset, Reader\nfrom surprise.accuracy import rmse\nfrom collections import defaultdict"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":5},{"cell_type":"code","source":["def get_top_n(predictions, n=10):\n    \"\"\"Build the top-N recommendation list of every user.\n\n    Args:\n        predictions(list of Prediction objects): The predictions, as returned\n            by the test method of an algorithm. Each one unpacks into\n            (user id, item id, true rating, estimated rating, details).\n        n(int): The number of recommendations to keep for each user. Default\n            is 10.\n\n    Returns:\n        A dict whose keys are user (raw) ids and whose values are lists of at\n        most n tuples (raw item id, rating estimation), sorted from the\n        highest estimation to the lowest.\n    \"\"\"\n\n    # Group the (item, estimation) pairs by user.\n    per_user = defaultdict(list)\n    for prediction in predictions:\n        user_id, item_id, _true_rating, estimate, _details = prediction\n        per_user[user_id].append((item_id, estimate))\n\n    # Rank the items of each user by estimation and keep the n best ones.\n    for user_id in per_user:\n        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)\n        per_user[user_id] = ranked[:n]\n\n    return per_user"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":6},{"cell_type":"code","source":["# The columns must correspond to user id, item id and ratings (in that order).\nratings_pdf = pdf[['uid', 'iid', 'rating']]\n\n# A reader is still needed, but only its rating_scale parameter is required.\nreader = Reader(rating_scale=(1, 5))\ndata = Dataset.load_from_df(ratings_pdf, reader)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":7},{"cell_type":"code","source":["# Build the training set from every rating of the dataset (no hold-out split).\ntrainset = data.build_full_trainset()\n\n# Use an example algorithm: SVD. Its training is randomised, so the seed is\n# fixed to make the RMSE and the recommendations below reproducible.\nSEED = 42\nalgo = SVD(random_state=SEED)\nalgo.fit(trainset)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
Out[7]: <surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f45e2b45490>
"]}}],"execution_count":8},{"cell_type":"code","source":["# Predict the ratings of all the (u, i) pairs that are in the training set and\n# report the RMSE. The model was fitted on these same ratings, so this is a\n# training error.\ntestset = trainset.build_testset()\npredictions = algo.test(testset)\nrmse(predictions, verbose=True)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
RMSE: 0.6762\nOut[8]: 0.6762309203564899
"]}}],"execution_count":9},{"cell_type":"code","source":["# We can now query the model for a specific (user, item) pair.\n# The raw ids must have the same type as in the DataFrame the dataset was built\n# from. The CSV was read with inferSchema=True, so the ids are integers here,\n# not strings: string ids would not match any known user or item.\nuid = 196  # raw user id (as in the ratings DataFrame)\niid = 302  # raw item id (as in the ratings DataFrame)\n\n# Get the prediction for that pair (r_ui=None: no true rating is supplied).\npred = algo.predict(uid, iid, r_ui=None, verbose=True)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
user: 196 item: 302 r_ui = None est = 3.53 {'was_impossible': False}\n
"]}}],"execution_count":10},{"cell_type":"code","source":["# Actual recommendations: predict a rating for every (u, i) pair that is NOT in\n# the training set. These items have not been seen by the users, so there is no\n# ground truth to compare against.\ntestset = trainset.build_anti_testset()\npredictions = algo.test(testset)\n\n# Keep the 10 highest estimations of each user.\ntop_n = get_top_n(predictions, 10)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":11},{"cell_type":"code","source":["# Print the recommended item ids of the first user only: the loop stops after\n# one iteration to keep the output short.\nfor uid, user_ratings in top_n.items():\n    recommended_items = [item_id for item_id, _ in user_ratings]\n    print(uid, recommended_items)\n    break"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
196 [50, 64, 483, 114, 12, 318, 98, 357, 480, 132]\n
"]}}],"execution_count":12}],"metadata":{"name":"recommendation_system_surpriselib","notebookId":2102833272923229},"nbformat":4,"nbformat_minor":0}