Initial commit

henry0312 · henry0312 · commit eca4a5d81d70 · 2016-05-27T15:03:45.000+09:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,94 @@
+.idea
+
+# Created by https://www.gitignore.io/api/python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
diff --git a/CDAE.py b/CDAE.py
@@ -0,0 +1,37 @@
+from keras.layers import Input, Dense, Embedding, Flatten, Dropout, merge, Activation
+from keras.models import Model
+from keras.regularizers import l2
+
+def create(I, U, K, hidden_activation, output_activation, q=0.5, l=0.01):
+    '''
+    build the Collaborative Denoising Auto-Encoder (CDAE) as a two-input Keras model
+
+    The item vector is passed through dropout and a dense layer of K units, the
+    user ID is looked up in a K-dimensional embedding, and both results are summed.
+    The sum goes through the optional hidden activation and then through a dense
+    output layer of I units.
+
+    Reference:
+      Yao Wu, Christopher DuBois, Alice X. Zheng, Martin Ester.
+        Collaborative Denoising Auto-Encoders for Top-N Recommender Systems.
+          The 9th ACM International Conference on Web Search and Data Mining (WSDM'16), p153--162, 2016.
+
+    :param I: number of items (width of the 'x_item' input and of the output layer)
+    :param U: number of users (input_dim of the user embedding)
+    :param K: number of units in hidden layer (also the embedding size)
+    :param hidden_activation: activation function of hidden layer; no activation is applied when falsy
+    :param output_activation: activation function of output layer
+    :param q: drop probability of the dropout applied to the item input
+    :param l: regularization parameter of L2 regularization (hidden weights and bias, embedding weights)
+    :return: CDAE taking the inputs [x_item, x_user]
+    :rtype: keras.models.Model
+    '''
+    # item branch: corrupt the input with dropout, then project it to K units
+    item_input = Input((I,), name='x_item')
+    corrupted_items = Dropout(q)(item_input)
+    item_encoder = Dense(K, W_regularizer=l2(l), b_regularizer=l2(l))
+    item_code = item_encoder(corrupted_items)
+
+    # user branch: the Embedding layer needs an integer input, hence dtype='int32'
+    user_input = Input((1,), dtype='int32', name='x_user')
+    user_embedding = Embedding(input_dim=U, output_dim=K, input_length=1, W_regularizer=l2(l))
+    # the embedding output has a length-1 sequence axis; flatten it away before the sum
+    user_code = Flatten()(user_embedding(user_input))
+
+    # hidden representation = item code + user code, optionally followed by an activation
+    summed = merge([item_code, user_code], mode='sum')
+    hidden = Activation(hidden_activation)(summed) if hidden_activation else summed
+
+    reconstruction = Dense(I, activation=output_activation)(hidden)
+    return Model(input=[item_input, user_input], output=reconstruction)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Tsukasa OMOTO <henry0312@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,19 @@
+# CDAE
+
+Implementation of the [Collaborative Denoising Auto-Encoder (CDAE)](http://yaowu.co/ "CDAE") using [Keras](http://keras.io/ "Keras Documentation").
+
+## References
+
+* Yao Wu, Christopher DuBois, Alice X. Zheng, Martin Ester. Collaborative Denoising Auto-Encoders for Top-N Recommender Systems. The 9th ACM International Conference on Web Search and Data Mining (WSDM'16), p153--162, 2016.
+* F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
+
+## Usage
+
+```sh
+python train.py
+```
+
+## Licence
+
+MIT License
+Copyright (c) 2016 Tsukasa ŌMOTO
diff --git a/metrics.py b/metrics.py
@@ -0,0 +1,10 @@
+import numpy
+
+def success_rate(pred, true):
+    '''
+    compute the percentage of rows that contain at least one correct recommendation
+
+    :param pred: array of recommended item indices; row i holds the items recommended for row i of true
+    :param true: array aligned row-by-row with pred; an entry equal to 1 at column j marks item j as relevant
+    :return: percentage (0 to 100) of rows of pred sharing at least one item with the
+             relevant set of the same row of true, or 0.0 when pred has no rows
+    :rtype: float
+    '''
+    n_rows = pred.shape[0]
+    if n_rows == 0:
+        # nothing to evaluate; avoids a ZeroDivisionError below
+        return 0.0
+
+    hits = 0
+    for i, recommended in enumerate(pred):
+        relevant = numpy.flatnonzero(true[i] == 1) # true set, as a plain index array
+        if numpy.intersect1d(recommended, relevant).size > 0:
+            hits += 1
+    # 100.0 keeps the result a float even where `/` is integer division (Python 2)
+    return hits * 100.0 / n_rows
diff --git a/movie_lens.py b/movie_lens.py
@@ -0,0 +1,53 @@
+import numpy
+from keras.utils.data_utils import get_file
+from keras.utils.np_utils import to_categorical
+from zipfile import ZipFile
+
+def _read_history(ml_zip, name):
+    '''
+    read one tab-separated rating file of the archive
+
+    :param ml_zip: opened ZipFile of the dataset
+    :param name: path of the rating file inside the archive
+    :return: item IDs grouped by user ID, in the order they appear in the file
+    :rtype: dict of int to list of int
+    '''
+    history = {}
+    with ml_zip.open(name, 'r') as file:
+        for line in file:
+            # every record has four fields; only the first two are used
+            user_id, item_id, _rating, _timestamp = line.decode('utf-8').rstrip().split('\t')
+            history.setdefault(int(user_id), []).append(int(item_id))
+    return history
+
+
+def _to_matrix(history, users, n_items):
+    '''
+    build the user-by-item matrix for the given users
+
+    :param history: item IDs grouped by user ID
+    :param users: user IDs, one per row of the result, in row order
+    :param n_items: number of columns of the result
+    :return: int32 matrix whose entry (r, j) counts how often item j occurs in the history of users[r]
+    :rtype: numpy.array
+    '''
+    x = numpy.zeros((len(users), n_items), dtype=numpy.int32)
+    for row, user in enumerate(users):
+        # same values as summing one-hot rows, without building the one-hot matrix
+        x[row] = numpy.bincount(history[user], minlength=n_items)
+    return x
+
+
+def load_data():
+    '''
+    load data from MovieLens 100K Dataset
+    http://grouplens.org/datasets/movielens/
+
+    Note that this method uses ua.base and ua.test in the dataset.
+
+    The archive is fetched with keras' get_file. Row r of train_x (test_x) belongs
+    to train_users[r] (test_users[r]); users are listed in ascending ID order so the
+    row order does not depend on dict iteration order. Both matrices have the same
+    number of columns: the largest item ID found in either file, plus one.
+
+    :return: train_users, train_x, test_users, test_x
+    :rtype: list of int, numpy.array, list of int, numpy.array
+    '''
+    path = get_file('ml-100k.zip', origin='http://files.grouplens.org/datasets/movielens/ml-100k.zip')
+    with ZipFile(path, 'r') as ml_zip:
+        train_history = _read_history(ml_zip, 'ml-100k/ua.base')
+        test_history = _read_history(ml_zip, 'ml-100k/ua.test')
+
+    # take the maximum over both files so an item that only occurs in the
+    # test split still fits into the matrix
+    max_item_id = -1
+    for history in (train_history, test_history):
+        for items in history.values():
+            max_item_id = max(max_item_id, max(items))
+    n_items = max_item_id + 1 # item_id starts from 1
+
+    train_users = sorted(train_history)
+    test_users = sorted(test_history)
+    train_x = _to_matrix(train_history, train_users, n_items)
+    test_x = _to_matrix(test_history, test_users, n_items)
+
+    return train_users, train_x, test_users, test_x
diff --git a/train.py b/train.py
@@ -0,0 +1,43 @@
+import numpy
+numpy.random.seed(0)
+
+import CDAE
+import movie_lens
+import metrics
+
+# data
+train_users, train_x, test_users, test_x = movie_lens.load_data()
+# user IDs as (n, 1) int32 columns, matching the 'x_user' input of the model
+train_x_users = numpy.array(train_users, dtype=numpy.int32).reshape(-1, 1)
+test_x_users = numpy.array(test_users, dtype=numpy.int32).reshape(-1, 1)
+
+# model
+n_items = train_x.shape[1]
+# NOTE(review): the +1 presumably leaves room for user IDs that start at 1 - confirm
+n_users = len(train_users) + 1
+model = CDAE.create(I=n_items, U=n_users, K=50,
+                    hidden_activation='relu', output_activation='sigmoid',
+                    q=0.50, l=0.01)
+model.compile(loss='mean_absolute_error', optimizer='adam')
+model.summary()
+
+# train: the network learns to reconstruct each user's item vector
+history = model.fit(
+    x=[train_x, train_x_users],
+    y=train_x,
+    batch_size=128,
+    nb_epoch=1000,
+    verbose=1,
+    validation_data=[[test_x, test_x_users], test_x])
+
+# predict
+scores = model.predict(x=[train_x, train_x_users])
+unwatched = (train_x == 0)
+scores = scores * unwatched # remove watched items from predictions
+ranking = numpy.argsort(scores) # ascending, so the best-scored items come last
+
+# evaluate the top-n recommendations against the test split for n = 1..10
+for n in range(1, 11):
+    top_n = ranking[:, -n:]
+    print("Success Rate at {:d}: {:f}".format(n, metrics.success_rate(top_n, test_x)))
+'''
+Success Rate at 1: 27.783669
+Success Rate at 2: 39.236479
+Success Rate at 3: 45.281018
+Success Rate at 4: 49.310710
+Success Rate at 5: 51.219512
+Success Rate at 6: 53.234358
+Success Rate at 7: 54.188759
+Success Rate at 8: 55.673383
+Success Rate at 9: 56.733828
+Success Rate at 10: 57.688229
+'''