{"cells":[{"cell_type":"markdown","source":["In Cmd 2, the AWS_ACCESS_KEY and AWS_SECRET_KEY variables are set and kept hidden."],"metadata":{}},{"cell_type":"code","source":["AWS_ACCESS_KEY = \"AA\"\nAWS_SECRET_KEY = \"BB\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":2},{"cell_type":"code","source":["sc._jsc.hadoopConfiguration().set(\"fs.s3n.awsAccessKeyId\", AWS_ACCESS_KEY)\nsc._jsc.hadoopConfiguration().set(\"fs.s3n.awsSecretAccessKey\", AWS_SECRET_KEY)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":3},{"cell_type":"code","source":["df = spark.read.csv(\"s3://databricks-recsys/u.data\",header=True, sep=\"\\t\",inferSchema = True)\npdf = df.toPandas()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":4},{"cell_type":"code","source":["# https://github.com/NicolasHug/Surprise\n#https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py\nfrom surprise import SVD, Dataset, Reader\nfrom surprise.accuracy import rmse\nfrom collections import defaultdict"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":5},{"cell_type":"code","source":["def get_top_n(predictions, n=10):\n    \"\"\"Return the top-N recommendation for each user from a set of predictions.\n    Args:\n        predictions(list of Prediction objects): The list of predictions, as\n            returned by the test method of an algorithm.\n        n(int): The number of recommendation to output for each user. Default\n            is 10.\n    Returns:\n    A dict where keys are user (raw) ids and values are lists of tuples:\n        [(raw item id, rating estimation), ...] of size n.\n    \"\"\"\n\n    # First map the predictions to each user.\n    top_n = defaultdict(list)\n    for uid, iid, true_r, est, _ in predictions:\n        top_n[uid].append((iid, est))\n\n    # Then sort the predictions for each user and retrieve the k highest ones.\n    for uid, user_ratings in top_n.items():\n        user_ratings.sort(key=lambda x: x[1], reverse=True)\n        top_n[uid] = user_ratings[:n]\n\n    return top_n"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":6},{"cell_type":"code","source":["# A reader is still needed but only the rating_scale param is requiered.\nreader = Reader(rating_scale=(1, 5))\n\n# The columns must correspond to user id, item id and ratings (in that order).\ndata = Dataset.load_from_df(pdf[['uid', 'iid', 'rating']], reader)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":7},{"cell_type":"code","source":["# Load the movielens-100k dataset (download it if needed).\ntrainset = data.build_full_trainset()\n\n# Use an example algorithm: SVD.\nalgo = SVD()\nalgo.fit(trainset)                                                              "],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\">Out[7]: &lt;surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f45e2b45490&gt;</div>"]}}],"execution_count":8},{"cell_type":"code","source":["# predict ratings for all pairs (u, i) that are in the training set.\ntestset = trainset.build_testset()\npredictions = algo.test(testset)\nrmse(predictions)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\">RMSE: 0.6762\nOut[8]: 0.6762309203564899</div>"]}}],"execution_count":9},{"cell_type":"code","source":["# we can now query for specific predicions\nuid = str(196)  # raw user id (as in the ratings file). They are **strings**!\niid = str(302)  # raw item id (as in the ratings file). They are **strings**!\n\n# get a prediction for specific users and items.\npred = algo.predict(uid, iid, r_ui=None, verbose=True)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\">user: 196        item: 302        r_ui = None   est = 3.53   {&#39;was_impossible&#39;: False}\n</div>"]}}],"execution_count":10},{"cell_type":"code","source":["#actual predictions as thse items have not been seen by the users. there is no ground truth. \n# We predict ratings for all pairs (u, i) that are NOT in the training set.\ntestset = trainset.build_anti_testset()\npredictions = algo.test(testset)\ntop_n = get_top_n(predictions, n=10)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":11},{"cell_type":"code","source":["# Print the recommended items for each user\nfor uid, user_ratings in top_n.items():\n    print(uid, [iid for (iid, _) in user_ratings])\n    break"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\">196 [50, 64, 483, 114, 12, 318, 98, 357, 480, 132]\n</div>"]}}],"execution_count":12}],"metadata":{"name":"recommendation_system_surpriselib","notebookId":2102833272923229},"nbformat":4,"nbformat_minor":0}