From 3a37dbc3b8c036b7b76509689c3ff95b72dcd0bd Mon Sep 17 00:00:00 2001
From: Chris Hodapp
Date: Sat, 1 Feb 2020 09:39:22 -0500
Subject: [PATCH] Try to fix code style

---
 .gitmodules                                   |   6 +
 hugo_blag/config.toml                         |  29 ++-
 .../posts/2018-04-08-recommender-systems-1.md | 217 +++++++++---------
 .../layouts/partials/mathjax_support.html     |  29 +++
 hugo_blag/themes/nofancy                      |   1 +
 hugo_blag/themes/zen                          |   1 +
 6 files changed, 173 insertions(+), 110 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 hugo_blag/layouts/partials/mathjax_support.html
 create mode 160000 hugo_blag/themes/nofancy
 create mode 160000 hugo_blag/themes/zen

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..b761466
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "hugo_blag/themes/zen"]
+	path = hugo_blag/themes/zen
+	url = https://github.com/frjo/hugo-theme-zen.git
+[submodule "hugo_blag/themes/nofancy"]
+	path = hugo_blag/themes/nofancy
+	url = https://github.com/gizak/nofancy.git
diff --git a/hugo_blag/config.toml b/hugo_blag/config.toml
index e22a245..1c65b0b 100644
--- a/hugo_blag/config.toml
+++ b/hugo_blag/config.toml
@@ -1,4 +1,31 @@
 baseURL = "http://example.org/"
 languageCode = "en-us"
 title = "My New Hugo Site"
-theme = "indigo"
+# Want to use this but the default code theme is fucking hideous:
+#theme = "indigo"
+#theme = "zen"
+# This one *does* use 'highlight' below:
+theme = "nofancy"
+
+[params]
+  # See themes/nofancy/static/highlight/styles for available options
+  highlight="tomorrow"
+  # Controls what items are listed in the top nav menu:
+  # "none", or "categories"
+  # If you have too many categories to fit in the top nav menu, set this to "none"
+  topmenu="categories"
+
+# None of this is taking any effect, despite
+# https://gohugo.io/getting-started/configuration-markup#highlight:
+
+#[markup]
+#  [markup.highlight]
+#    codeFences = true
+#    guessSyntax = false
+#    hl_Lines = ""
+#    lineNoStart = 1
+#    lineNos = false
+#    lineNumbersInTable = true
+#    noClasses = true
+#    style = "monokai"
+#    tabWidth = 4
diff --git a/hugo_blag/content/posts/2018-04-08-recommender-systems-1.md b/hugo_blag/content/posts/2018-04-08-recommender-systems-1.md
index db1494c..e6aced1 100644
--- a/hugo_blag/content/posts/2018-04-08-recommender-systems-1.md
+++ b/hugo_blag/content/posts/2018-04-08-recommender-systems-1.md
@@ -81,35 +81,34 @@ Download [MovieLens 20M](https://grouplens.org/datasets/movielens/20m/) and unco

For Python dependencies, everything I need is imported below: pandas, numpy, matplotlib, and scikit-learn.

-
-```python
+{{< highlight python >}}
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn.model_selection
-```
+{{< / highlight >}}

# 3. Loading data

I don't explain this in detail. These are just standard calls in [Pandas](https://pandas.pydata.org/), plus little details that are boring but essential:

-```python
+{{< highlight python >}}
ml = pd.read_csv("ml-20m/ratings.csv", header=0,
                 dtype={"user_id": np.int32, "movie_id": np.int32, "rating": np.float32, "time": np.int64},
                 names=("user_id", "movie_id", "rating", "time"))
# Convert Unix seconds to a Pandas timestamp:
ml["time"] = pd.to_datetime(ml["time"], unit="s")
-```
+{{< / highlight >}}

Below is just to inspect that the data appears to be okay:

-```python
+{{< highlight python >}}
ml.info()
-```
+{{< / highlight >}}
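The explicit dtypes in `read_csv` above aren't just pedantry, since `ml.info()` also reports memory usage. A rough sketch of what they buy (my addition, not in the original post; `ml_default` is a hypothetical second load, and the exact numbers will vary with pandas version and platform):

{{< highlight python >}}
# Load again with pandas' default dtypes (int64/float64) and compare
# the in-memory size against the explicitly-typed frame.
ml_default = pd.read_csv("ml-20m/ratings.csv", header=0,
                         names=("user_id", "movie_id", "rating", "time"))
print("explicit dtypes: %.0f MB" % (ml.memory_usage(deep=True).sum() / 2**20))
print("default dtypes:  %.0f MB" % (ml_default.memory_usage(deep=True).sum() / 2**20))
{{< / highlight >}}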
@@ -127,9 +126,9 @@ ml.info()

-```python
+{{< highlight python >}}
ml.describe()
-```
+{{< / highlight >}}

@@ -152,9 +151,9 @@ max|1.384930e+05|1.312620e+05|5.000000e+00

-```python
+{{< highlight python >}}
ml[:10]
-```
+{{< / highlight >}}

@@ -179,11 +178,11 @@ ml[:10]

-```python
+{{< highlight python >}}
max_user = int(ml["user_id"].max() + 1)
max_movie = int(ml["movie_id"].max() + 1)
max_user, max_movie, max_user * max_movie
-```
+{{< / highlight >}}

@@ -199,9 +198,9 @@ Computing what percentage we have of all 'possible' ratings (i.e. every single user crossed with every single movie), this data is rather sparse:

-```python
+{{< highlight python >}}
print("%.2f%%" % (100 * ml.shape[0] / (max_user * max_movie)))
-```
+{{< / highlight >}}
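Once the sparse matrices exist (section 4 below builds them with `df2mat`), essentially the same figure falls out of the matrix's `nnz` attribute. A sketch, with the caveat that the training matrix alone holds only ~75% of the ratings after the split:

{{< highlight python >}}
# nnz counts stored entries, so this is the density of the training split.
density = 100 * ml_mat_train.nnz / (max_user * max_movie)
print("%.2f%%" % density)
{{< / highlight >}}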
@@ -217,27 +216,27 @@ This is partly just to explore the data a little, and partly because we need to

The dataset includes a lot of per-movie information too, but we only bother with the title so far:

-```python
+{{< highlight python >}}
names = pd.read_csv(
    "ml-20m/movies.csv", header=0, encoding = "ISO-8859-1",
    index_col=0, names=("movie_id", "movie_title"), usecols=[0,1])
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
movie_group = ml.groupby("movie_id")
movie_stats = names.\
    join(movie_group.size().rename("num_ratings")).\
    join(movie_group.mean()["rating"].rename("avg_rating"))
-```
+{{< / highlight >}}

Sorting by number of ratings and taking the top 25, this looks pretty sensible:

-```python
+{{< highlight python >}}
movie_stats.sort_values("num_ratings", ascending=False)[:25]
-```
+{{< / highlight >}}

@@ -279,9 +278,9 @@ movie_stats.sort_values("num_ratings", ascending=False)[:25]

Prior to anything else, split training/test data out with a specific random seed:

-```python
+{{< highlight python >}}
ml_train, ml_test = sklearn.model_selection.train_test_split(ml, test_size=0.25, random_state=12345678)
-```
+{{< / highlight >}}

# 4. Utility Matrix

@@ -314,15 +313,15 @@ later does this.

We'll convert to a utility matrix, for which the naive way is creating a dense matrix:

-```python
+{{< highlight python >}}
m = np.zeros((max_user, max_movie))
m[df["user_id"], df["movie_id"]] = df["rating"]
-```
+{{< / highlight >}}

...but we'd be dealing with an 18,179,137,922-element matrix that's a little bit unusable here (at least it is for me, since I only have 32 GB RAM), so we'll use [sparse matrices](https://docs.scipy.org/doc/scipy/reference/sparse.html).

-```python
+{{< highlight python >}}
def df2mat(df):
    m = scipy.sparse.coo_matrix(
        (df["rating"], (df["user_id"], df["movie_id"])),
@@ -332,14 +331,14 @@ def df2mat(df):

ml_mat_train, ml_mask_train = df2mat(ml_train)
ml_mat_test, ml_mask_test = df2mat(ml_test)
-```
+{{< / highlight >}}

We need a mask for some later steps, hence the `m > 0` step. Ratings go only from 1 to 5, so values of 0 are automatically unknown/missing data, which fits with how sparse matrices work.

-```python
+{{< highlight python >}}
ml_mat_train
-```
+{{< / highlight >}}

@@ -356,9 +355,9 @@

To demonstrate that the matrix and dataframe have the same data:

-```python
+{{< highlight python >}}
ml_train[:10]
-```
+{{< / highlight >}}

@@ -383,9 +382,9 @@ ml_train[:10]

-```python
+{{< highlight python >}}
list(ml_train.iloc[:10].rating)
-```
+{{< / highlight >}}

@@ -399,11 +398,11 @@ list(ml_train.iloc[:10].rating)

-```python
+{{< highlight python >}}
user_ids = list(ml_train.iloc[:10].user_id)
movie_ids = list(ml_train.iloc[:10].movie_id)
[ml_mat_train[u,i] for u,i in zip(user_ids, movie_ids)]
-```
+{{< / highlight >}}

@@ -472,15 +471,15 @@ will go through these steps with some real data.

I arbitrarily chose user 28812:

-```python
+{{< highlight python >}}
pd.set_option('display.max_rows', 10)
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
target_user = 28812
names.merge(ml_train[ml_train.user_id == target_user], right_on="movie_id", left_index=True)
-```
+{{< / highlight >}}

I picked *Home Alone*, movie ID 586, as the one we want to predict user 28812's rating on. This isn't completely arbitrary. I chose it because the testing data contains the actual rating, and we can compare against it later.
-```python
+{{< highlight python >}}
target_movie = 586
names[names.index == target_movie]
-```
+{{< / highlight >}}

@@ -529,14 +528,14 @@ names[names.index == target_movie]

Now, from step #1 and about half of step #2: Which users also rated one of the movies that 28812 rated, *and* rated *Home Alone*? What were those ratings?

-```python
+{{< highlight python >}}
users_df = ml_train[ml_train.user_id == target_user][["movie_id"]]. \
    merge(ml_train, on="movie_id")[["movie_id", "user_id", "rating"]]. \
    merge(ml_train[ml_train.movie_id == target_movie], on="user_id"). \
    drop(["movie_id_y", "time"], axis=1)
# time is irrelevant to us; movie_id_y is just always 586 (the target movie)
users_df
-```
+{{< / highlight >}}

@@ -564,10 +563,10 @@ users_df

Each row has one user's ratings of both *Home Alone* (it's the `rating_y` column) and some other movie that 28812 rated (`rating_x`), so we can easily find the deviation of each individual rating - how much higher they rated *Home Alone* than the respective movie for `movie_id_x`:

-```python
+{{< highlight python >}}
users_df = users_df.assign(rating_dev = users_df.rating_y - users_df.rating_x)
users_df
-```
+{{< / highlight >}}

@@ -595,11 +594,11 @@ users_df

...and for the rest of step 2, turn this into an average deviation by grouping by movie ID. For the sake of displaying it, inner join with the dataframe that has movie titles:

-```python
+{{< highlight python >}}
pd.set_option('display.max_rows', 20)
rating_dev = users_df.groupby("movie_id_x").mean()["rating_dev"]
names.join(rating_dev, how="inner").sort_values("rating_dev")
-```
+{{< / highlight >}}

@@ -639,12 +638,12 @@ The numbers above then tell us that, on average, users who watched both movies r

For step 3, we can produce a prediction from each deviation above by adding it to each of user 28812's ratings for the respective movies:

-```python
+{{< highlight python >}}
df = ml_train[ml_train.user_id == target_user]. \
    join(rating_dev, on="movie_id")
df = df.assign(rating_adj = df["rating"] + df["rating_dev"])[["user_id", "movie_id", "rating", "rating_adj"]]
df.join(names, on="movie_id").sort_values("movie_title")
-```
+{{< / highlight >}}

@@ -682,9 +681,9 @@ df.join(names, on="movie_id").sort_values("movie_title")

That is, every 'adjusted' rating above (the `rating_adj` column) is something like: based on just this movie, what rating would we expect user 28812 to give *Home Alone*? Produce the final prediction by averaging all these:

-```python
+{{< highlight python >}}
df["rating_adj"].mean()
-```
+{{< / highlight >}}

@@ -700,9 +699,9 @@

As mentioned above, we also happen to have the user's actual rating on *Home Alone* in the test set (i.e. we didn't train on it), so we can compare here:

-```python
+{{< highlight python >}}
ml_test[(ml_test.user_id == target_user) & (ml_test.movie_id == target_movie)]["rating"].iloc[0]
-```
+{{< / highlight >}}

@@ -722,10 +721,10 @@ That's quite close - though that may just be luck. It's hard to say from one poi

Take a look at the table below. This is a similar aggregation to what we just did to determine average deviation - but this one instead counts up the number of ratings that went into each average deviation.

-```python
+{{< highlight python >}}
num_ratings = users_df.groupby("movie_id_x").count()["rating_dev"].rename("num_ratings")
names.join(num_ratings, how="inner").sort_values("num_ratings")
-```
+{{< / highlight >}}

@@ -776,11 +775,11 @@ an estimate it is.
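The fix is to weight each per-movie adjusted rating by the number of co-ratings behind it. As a toy illustration of why that matters, with made-up numbers (not from the dataset): one prediction backed by 3 co-ratings, another backed by 300.

{{< highlight python >}}
preds = np.array([4.5, 2.5])   # hypothetical per-movie adjusted ratings
counts = np.array([3, 300])    # co-ratings behind each one
print(preds.mean())                           # unweighted mean: 3.5
print((preds * counts).sum() / counts.sum())  # weighted mean: ~2.52
{{< / highlight >}}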
This is easy to do, luckily:

-```python
+{{< highlight python >}}
df = df.join(num_ratings, on="movie_id")
df = df.assign(rating_weighted = df["rating_adj"] * df["num_ratings"])
df
-```
+{{< / highlight >}}

@@ -816,9 +815,9 @@

-```python
+{{< highlight python >}}
df["rating_weighted"].sum() / df["num_ratings"].sum()
-```
+{{< / highlight >}}

@@ -1020,7 +1019,7 @@ While $U$ and $M$ can be sparse matrices, $C$ and $D$ sort of must be dense matr

However, if we look at the $P(u)_j$ formula above, it refers only to row $j$ of $C$ and $D$, and the formulas for $C$ and $D$ make it easy to compute them by row if needed, or by blocks of rows according to what $u$ and $j$ we need. This is what I do below.

-```python
+{{< highlight python >}}
def slope_one(U, M, users, movies, approx=True):
    M_j = M[:,movies].T.multiply(1)
    U_j = U[:,movies].T
@@ -1036,15 +1035,15 @@ def slope_one(U, M, users, movies, approx=True):
    else:
        P_u_j = ((mask * (U_u + Dj)).sum(axis=1) - U_u[0,movies]) / np.maximum(mask.sum(axis=1), 1)
    return P_u_j
-```
+{{< / highlight >}}

To show that it actually gives the same result as above, and that the approximation produces seemingly no change here:

-```python
+{{< highlight python >}}
(slope_one(ml_mat_train, ml_mask_train, [target_user], [target_movie])[0],
 slope_one(ml_mat_train, ml_mask_train, [target_user], [target_movie], approx=False)[0])
-```
+{{< / highlight >}}

@@ -1060,7 +1059,7 @@

This computes training error on a small part (1%) of the data, since doing it over the entire thing would be horrendously slow:

-```python
+{{< highlight python >}}
def slope_one_err(U, M, users, movies, true_ratings):
    # Keep 'users' and 'movies' small (a couple hundred, maybe)
    p = slope_one(U, M, users, movies)
@@ -1068,10 +1067,10 @@ def slope_one_err(U, M, users, movies, true_ratings):
    err_abs = np.abs(d).sum()
    err_sq = np.square(d).sum()
    return err_abs, err_sq
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
import multiprocessing

count = int(len(ml_train) * 0.01)
@@ -1092,12 +1091,12 @@ with multiprocessing.Pool() as p:
    errs = p.map(err_part, idxs_split)
err_mae_train = sum([e[0] for e in errs]) / count
err_rms_train = np.sqrt(sum([e[1] for e in errs]) / count)
-```
+{{< / highlight >}}

and then likewise on 2% of the testing data (it's a smaller set to start with):

-```python
+{{< highlight python >}}
import multiprocessing

count = int(len(ml_test) * 0.02)
@@ -1117,19 +1116,19 @@ with multiprocessing.Pool() as p:
    errs = p.map(err_part, idxs_split)
err_mae_test = sum([e[0] for e in errs]) / count
err_rms_test = np.sqrt(sum([e[1] for e in errs]) / count)
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
# These are used later for comparison:
test_results = [("", "Slope One", err_mae_test, err_rms_test)]
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
print("Training error: MAE={:.3f}, RMSE={:.3f}".format(err_mae_train, err_rms_train))
print("Testing error:  MAE={:.3f}, RMSE={:.3f}".format(err_mae_test, err_rms_test))
-```
+{{< / highlight >}}
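For completeness, a minimal usage sketch of `slope_one` on a few hand-picked pairs. I'm assuming here, as `slope_one_err` above suggests, that `users` and `movies` pair up element-wise; the movie IDs are arbitrary picks with no special meaning:

{{< highlight python >}}
# Predict three (user, movie) pairs in one call.
users = [28812, 28812, 28812]
movies = [586, 296, 318]
print(slope_one(ml_mat_train, ml_mask_train, users, movies))
{{< / highlight >}}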
@@ -1253,16 +1252,16 @@ The code below is a direct implementation of this by simply iteratively applying

## 6.5. Implementation

-```python
+{{< highlight python >}}
# Hyperparameters
gamma = 0.002
lambda_ = 0.02
num_epochs = 20
num_factors = 40
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
class SVDModel(object):
    def __init__(self, num_items, num_users, mean,
                 num_factors = 100, init_variance = 0.1):
@@ -1354,12 +1353,12 @@ class SVDModel(object):
                i, u, r_ui = items[idx], users[idx], ratings[idx]
                self.update_by_gradient(i, u, r_ui, lambda_, gamma)
            if epoch_callback: epoch_callback(self, epoch, num_epochs)
-```
+{{< / highlight >}}

## 6.6. Running & Testing

-```python
+{{< highlight python >}}
movies_train = ml_train["movie_id"].values
users_train = ml_train["user_id"].values
ratings_train = ml_train["rating"].values
@@ -1373,13 +1372,13 @@ def at_epoch(self, epoch, num_epochs):
        (self.b_i, self.b_u, self.p, self.q))
    print()
    print("Epoch {:02d}/{}; Training: MAE={:.3f} RMSE={:.3f}, Testing: MAE={:.3f} RMSE={:.3f}".format(epoch + 1, num_epochs, train_mae, train_rmse, test_mae, test_rmse))
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
svd40 = SVDModel(max_movie, max_user, ml["rating"].mean(), num_factors=num_factors)
svd40.train(movies_train, users_train, ratings_train, epoch_callback=at_epoch)
-```
+{{< / highlight >}}
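To spell out what the trained model computes for a single (movie, user) pair, it's the usual biased dot product. A sketch only: I'm assuming `SVDModel` stores the constructor's `mean` as `self.mean`; the `b_i`, `b_u`, `q`, and `p` attributes are the ones the epoch callback above already reads.

{{< highlight python >}}
# r_hat(i, u) = mu + b_i[i] + b_u[u] + q_i . p_u
i, u = 586, 28812  # Home Alone and our target user from the Slope One sections
r_hat = (svd40.mean + svd40.b_i[i] + svd40.b_u[u]
         + svd40.q[:, i].dot(svd40.p[:, u]))
print(r_hat)
{{< / highlight >}}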
@@ -1428,10 +1427,10 @@

-```python
+{{< highlight python >}}
test_rmse, test_mae = svd40.error(movies_test, users_test, ratings_test)
test_results.append(("", "SVD", test_mae, test_rmse))
-```
+{{< / highlight >}}

## 6.7. Visualization of Latent Space

I mentioned somewhere in here that this is a latent-factor model. The latent spa
The 40-dimensional space above might be a bit unruly to work with, but we can easily train on something lower, like a 4-dimensional space. We can then pick a few dimensions, and visualize where movies fit in this space.

-```python
+{{< highlight python >}}
svd4 = SVDModel(max_movie, max_user, ml["rating"].mean(), 4)
svd4.train(ml_train["movie_id"].values, ml_train["user_id"].values, ml_train["rating"].values, epoch_callback=at_epoch)
-```
+{{< / highlight >}}
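Before plotting anything, note that a single movie's position in this space is just its column of `q` (that's how `svd4.q[:,ids_top]` is used below). A quick peek at *Home Alone* (ID 586) from the earlier sections:

{{< highlight python >}}
# Four latent coordinates for movie ID 586.
print(svd4.q[:, 586])
{{< / highlight >}}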
@@ -1494,29 +1493,29 @@

To limit the data, we can use just the top movies (by number of ratings):

-```python
+{{< highlight python >}}
top = movie_stats.sort_values("num_ratings", ascending=False)[:100]
ids_top = top.index.values
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
factors = svd4.q[:,ids_top].T
means, stds = factors.mean(axis=0), factors.std(axis=0)
factors[:] = (factors - means) / stds
-```
+{{< / highlight >}}

So, here are the first 50 of those top movies, plotted in the first two dimensions of the concept space:

-```python
+{{< highlight python >}}
plt.figure(figsize=(15,15))
markers = ["$ {} $".format("\ ".join(m.split(" ")[:-1])) for m in top["movie_title"][:50]]
for i,item in enumerate(factors[:50,:]):
    l = len(markers[i])
    plt.scatter(item[0], item[1], marker = markers[i], alpha=0.75, s = 50 * (l**2))
plt.show()
-```
+{{< / highlight >}}

![png](../images/2018-04-08-recommenders/output_94_0.png)

@@ -1525,14 +1524,14 @@

And here are the other 50, plotted in the other two dimensions:

-```python
+{{< highlight python >}}
plt.figure(figsize=(15,15))
markers = ["$ {} $".format("\ ".join(m.split(" ")[:-1])) for m in top["movie_title"][50:]]
for i,item in enumerate(factors[50:,:]):
    l = len(markers[i])
    plt.scatter(item[2], item[3], marker = markers[i], alpha=0.75, s = 50 * (l**2))
plt.show()
-```
+{{< / highlight >}}

![png](../images/2018-04-08-recommenders/output_96_0.png)

@@ -1541,7 +1540,7 @@

Below is another way of visualizing. Neither the code nor the result is very pretty, but it divides the entire latent space into a 2D grid, identifies the top few movies (ranked by number of ratings) in each grid square, and prints the resultant grid.

-```python
+{{< highlight python >}}
def clean_title(s):
    remove = [", The", ", A", ", An"]
    s1 = " ".join(s.split(" ")[:-1])
@@ -1586,13 +1585,13 @@ def latent_factor_grid(latent_space, count=2):
            else:
                first_idxs[i,j] = -1
    return pd.DataFrame(first_titles)
-```
+{{< / highlight >}}

-```python
+{{< highlight python >}}
pd.set_option('display.max_rows', 500)
latent_factor_grid(svd4.q[:2,:])
-```
+{{< / highlight >}}

@@ -1627,9 +1626,9 @@ Both axes seem to start more on the low-brow side along the top left. There is

Here is the same thing for the other two dimensions in this latent space:

-```python
+{{< highlight python >}}
latent_factor_grid(svd4.q[2:,:])
-```
+{{< / highlight >}}

@@ -1666,11 +1665,11 @@ Some sensible axes seem to form here too. Moving from left to right (i.e. increa

We can also look at the per-movie bias parameters in the model - loosely, how much higher or lower a movie's rating is, beyond what interactions with user preferences seem to explain. Here are the top 10 and bottom 10; interestingly, while it seems to correlate with the average rating, it doesn't seem to do so especially strongly.

-```python
+{{< highlight python >}}
#bias = movie_stats.assign(bias = svd40.b_i[:-1]).sort_values("bias", ascending=False)
bias = movie_stats.join(pd.Series(svd40.b_i[:-1]).rename("bias")).sort_values("bias", ascending=False).dropna()
bias.iloc[:10]
-```
+{{< / highlight >}}

@@ -1695,9 +1694,9 @@

-```python
+{{< highlight python >}}
bias.iloc[:-10:-1]
-```
+{{< / highlight >}}

@@ -1727,27 +1726,27 @@

Results below are cross-validated, while our results above aren't, so the comparison may have some noise in it (i.e. if you change the random seed, you may see our results perform much better or worse, while the Surprise results should be more consistent).
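If you wanted the hand-rolled models above on the same cross-validated footing, scikit-learn's `KFold` can generate comparable splits. A sketch only, which I don't run here; `df2mat` is the helper from section 4:

{{< highlight python >}}
kf = sklearn.model_selection.KFold(n_splits=5, shuffle=True,
                                   random_state=12345678)
for fold, (train_idx, test_idx) in enumerate(kf.split(ml)):
    fold_train, fold_test = ml.iloc[train_idx], ml.iloc[test_idx]
    # ...rebuild the matrices with df2mat(), evaluate as in the sections
    # above, then average the per-fold errors.
    print(fold, len(fold_train), len(fold_test))
{{< / highlight >}}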
-```python
+{{< highlight python >}}
import surprise
from surprise.dataset import Dataset
-```
+{{< / highlight >}}

Note the `.iloc[::10]` below, which is a quick way to decimate the data by a factor of 10. Surprise seems to be less memory-efficient than my code above (at least, without any tuning whatsoever), so in order to test it I don't pass in the entire dataset.

-```python
+{{< highlight python >}}
reader = surprise.Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ml[["user_id", "movie_id", "rating"]].iloc[::10], reader)
cv=5
cv_random = surprise.model_selection.cross_validate(surprise.NormalPredictor(), data, cv=cv)
cv_sl1 = surprise.model_selection.cross_validate(surprise.SlopeOne(), data, cv=cv)
cv_svd = surprise.model_selection.cross_validate(surprise.SVD(), data, cv=cv)
-```
+{{< / highlight >}}

# 8. Overall results

-```python
+{{< highlight python >}}
get_record = lambda name, df: \
    ("Surprise", name, df["test_mae"].sum() / cv, df["test_rmse"].sum() / cv)
cv_data_surprise = [
@@ -1757,7 +1756,7 @@
pd.DataFrame.from_records(
    data=test_results + cv_data_surprise,
    columns=("Library", "Algorithm", "MAE (test)", "RMSE (test)"),
)
-```
+{{< / highlight >}}

diff --git a/hugo_blag/layouts/partials/mathjax_support.html b/hugo_blag/layouts/partials/mathjax_support.html
new file mode 100644
index 0000000..adeec5b
--- /dev/null
+++ b/hugo_blag/layouts/partials/mathjax_support.html
@@ -0,0 +1,29 @@
+
+
+
+
+
diff --git a/hugo_blag/themes/nofancy b/hugo_blag/themes/nofancy
new file mode 160000
index 0000000..ae46702
--- /dev/null
+++ b/hugo_blag/themes/nofancy
@@ -0,0 +1 @@
+Subproject commit ae4670287c71f4c4aed91be9b3d3919846fd62c9
diff --git a/hugo_blag/themes/zen b/hugo_blag/themes/zen
new file mode 160000
index 0000000..b09452b
--- /dev/null
+++ b/hugo_blag/themes/zen
@@ -0,0 +1 @@
+Subproject commit b09452be937db32d659e2a255617256a4dca345b