Skip to content

Commit eca4a5d

Browse files
committed
Initial commit
0 parents  commit eca4a5d

File tree

7 files changed

+277
-0
lines changed

7 files changed

+277
-0
lines changed

.gitignore

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
.idea
2+
3+
# Created by https://www.gitignore.io/api/python
4+
5+
### Python ###
6+
# Byte-compiled / optimized / DLL files
7+
__pycache__/
8+
*.py[cod]
9+
*$py.class
10+
11+
# C extensions
12+
*.so
13+
14+
# Distribution / packaging
15+
.Python
16+
env/
17+
build/
18+
develop-eggs/
19+
dist/
20+
downloads/
21+
eggs/
22+
.eggs/
23+
lib/
24+
lib64/
25+
parts/
26+
sdist/
27+
var/
28+
*.egg-info/
29+
.installed.cfg
30+
*.egg
31+
32+
# PyInstaller
33+
# Usually these files are written by a python script from a template
34+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
35+
*.manifest
36+
*.spec
37+
38+
# Installer logs
39+
pip-log.txt
40+
pip-delete-this-directory.txt
41+
42+
# Unit test / coverage reports
43+
htmlcov/
44+
.tox/
45+
.coverage
46+
.coverage.*
47+
.cache
48+
nosetests.xml
49+
coverage.xml
50+
*,cover
51+
.hypothesis/
52+
53+
# Translations
54+
*.mo
55+
*.pot
56+
57+
# Django stuff:
58+
*.log
59+
local_settings.py
60+
61+
# Flask stuff:
62+
instance/
63+
.webassets-cache
64+
65+
# Scrapy stuff:
66+
.scrapy
67+
68+
# Sphinx documentation
69+
docs/_build/
70+
71+
# PyBuilder
72+
target/
73+
74+
# IPython Notebook
75+
.ipynb_checkpoints
76+
77+
# pyenv
78+
.python-version
79+
80+
# celery beat schedule file
81+
celerybeat-schedule
82+
83+
# dotenv
84+
.env
85+
86+
# virtualenv
87+
venv/
88+
ENV/
89+
90+
# Spyder project settings
91+
.spyderproject
92+
93+
# Rope project settings
94+
.ropeproject

CDAE.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from keras.layers import Input, Dense, Embedding, Flatten, Dropout, merge, Activation
2+
from keras.models import Model
3+
from keras.regularizers import l2
4+
5+
def create(I, U, K, hidden_activation, output_activation, q=0.5, l=0.01):
    '''
    Build a Collaborative Denoising Auto-Encoder (CDAE) as a Keras model.

    The model has two inputs: ``x_item``, a vector of length ``I``, and
    ``x_user``, a single integer user id. The item vector goes through
    dropout and a dense layer of ``K`` units; the user id is looked up in
    an embedding of ``K`` dimensions. Both are summed, optionally passed
    through ``hidden_activation``, and projected back to ``I`` outputs.

    Reference:
    Yao Wu, Christopher DuBois, Alice X. Zheng, Martin Ester.
    Collaborative Denoising Auto-Encoders for Top-N Recommender Systems.
    The 9th ACM International Conference on Web Search and Data Mining (WSDM'16), p153--162, 2016.

    :param I: number of items
    :param U: number of users (size of the user embedding table)
    :param K: number of units in hidden layer
    :param hidden_activation: activation function of hidden layer;
        a falsy value skips the activation layer entirely
    :param output_activation: activation function of output layer
    :param q: drop probability applied to the item input
    :param l: regularization parameter of L2 regularization
    :return: CDAE
    :rtype: keras.models.Model
    '''
    weight_decay = l

    # Item branch: drop inputs with probability q, then encode to K units.
    item_input = Input((I,), name='x_item')
    item_dropped = Dropout(q)(item_input)
    item_encoder = Dense(K, W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay))
    item_hidden = item_encoder(item_dropped)

    # User branch: dtype should be int to connect to Embedding layer.
    user_input = Input((1,), dtype='int32', name='x_user')
    user_table = Embedding(input_dim=U, output_dim=K, input_length=1,
                           W_regularizer=l2(weight_decay))
    user_hidden = user_table(user_input)
    user_hidden = Flatten()(user_hidden)

    # Combine both branches by element-wise sum.
    hidden = merge([item_hidden, user_hidden], mode='sum')
    if hidden_activation:
        hidden = Activation(hidden_activation)(hidden)

    reconstruction = Dense(I, activation=output_activation)(hidden)
    return Model(input=[item_input, user_input], output=reconstruction)

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2016 Tsukasa OMOTO <[email protected]>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in
13+
all copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21+
THE SOFTWARE.

README.md

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# CDAE
2+
3+
An implementation of the [Collaborative Denoising Auto-Encoder (CDAE)](http://yaowu.co/ "CDAE") using [Keras](http://keras.io/ "Keras Documentation").
4+
5+
## References
6+
7+
* Yao Wu, Christopher DuBois, Alice X. Zheng, Martin Ester. Collaborative Denoising Auto-Encoders for Top-N Recommender Systems. The 9th ACM International Conference on Web Search and Data Mining (WSDM'16), p153--162, 2016.
8+
* F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
9+
10+
## Usage
11+
12+
```sh
13+
python train.py
14+
```
15+
16+
## License
17+
18+
MIT License
19+
Copyright (c) 2016 Tsukasa ŌMOTO

metrics.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import numpy
2+
3+
def success_rate(pred, true):
    '''
    Percentage of rows for which at least one predicted item is relevant.

    :param pred: 2-D array; row ``i`` holds the item indices recommended
        for row ``i`` of ``true``
    :param true: 2-D array; an entry equal to 1 marks a relevant item
    :return: success rate in percent (0.0 when ``pred`` has no rows)
    :rtype: float
    '''
    n_rows = pred.shape[0]
    if n_rows == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    hits = 0
    for i, recommended in enumerate(pred):
        relevant = numpy.flatnonzero(true[i] == 1)  # true set
        if numpy.intersect1d(recommended, relevant).size > 0:
            hits += 1

    # 100.0 forces true division regardless of the Python version.
    return hits * 100.0 / n_rows

movie_lens.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import numpy
2+
from keras.utils.data_utils import get_file
3+
from keras.utils.np_utils import to_categorical
4+
from zipfile import ZipFile
5+
6+
def load_data():
    '''
    load data from MovieLens 100K Dataset
    http://grouplens.org/datasets/movielens/

    Note that this method uses ua.base and ua.test in the dataset.

    Each returned matrix has one row per user (in the same order as the
    matching user list) and one column per item id. Column 0 is unused
    because item ids start from 1. The number of columns is derived from
    the largest item id seen in either file, so both matrices always have
    the same width.

    :return: train_users, train_x, test_users, test_x
    :rtype: list of int, numpy.array, list of int, numpy.array
    '''
    path = get_file('ml-100k.zip', origin='http://files.grouplens.org/datasets/movielens/ml-100k.zip')
    with ZipFile(path, 'r') as ml_zip:
        train_history = _read_history(ml_zip, 'ml-100k/ua.base')
        test_history = _read_history(ml_zip, 'ml-100k/ua.test')

    max_item_id = -1
    for history in (train_history, test_history):
        for items in history.values():
            max_item_id = max(max_item_id, max(items))
    n_columns = max_item_id + 1  # item_id starts from 1

    train_users = list(train_history.keys())
    train_x = _to_count_matrix(train_history, n_columns)

    test_users = list(test_history.keys())
    test_x = _to_count_matrix(test_history, n_columns)

    return train_users, train_x, test_users, test_x


def _read_history(ml_zip, member):
    '''Parse a tab-separated ratings file in the archive into {user_id: [item_id, ...]}.'''
    history = {}
    with ml_zip.open(member, 'r') as file:
        for line in file:
            user_id, item_id, _rating, _timestamp = line.decode('utf-8').rstrip().split('\t')
            history.setdefault(int(user_id), []).append(int(item_id))
    return history


def _to_count_matrix(history, n_columns):
    '''Turn {user_id: [item_id, ...]} into an int32 matrix of per-item counts, one row per user.'''
    matrix = numpy.zeros((len(history), n_columns), dtype=numpy.int32)
    for row, items in enumerate(history.values()):
        # Same values as summing one-hot rows, without building them.
        matrix[row] = numpy.bincount(items, minlength=n_columns)
    return matrix

train.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Train a CDAE on MovieLens 100K and report the success rate at N = 1..10.
import numpy
numpy.random.seed(0)  # seeded before the model modules below are imported

import CDAE
import movie_lens
import metrics

# data
train_users, train_x, test_users, test_x = movie_lens.load_data()
train_x_users = numpy.array(train_users, dtype=numpy.int32).reshape(len(train_users), 1)
test_x_users = numpy.array(test_users, dtype=numpy.int32).reshape(len(test_users), 1)

# model
# Raw user ids are fed to the embedding layer, so its size must exceed the
# largest id rather than the number of distinct users.
n_user_slots = max(max(train_users), max(test_users)) + 1
model = CDAE.create(I=train_x.shape[1], U=n_user_slots, K=50,
                    hidden_activation='relu', output_activation='sigmoid', q=0.50, l=0.01)
model.compile(loss='mean_absolute_error', optimizer='adam')
model.summary()

# train
history = model.fit(x=[train_x, train_x_users], y=train_x,
                    batch_size=128, nb_epoch=1000, verbose=1,
                    validation_data=([test_x, test_x_users], test_x))

# predict
pred = model.predict(x=[train_x, train_x_users])
pred = pred * (train_x == 0)  # remove watched items from predictions
pred = numpy.argsort(pred)  # ascending, so the best items are the last columns

# Prediction rows follow train_users while test_x rows follow test_users.
# Match them by user id so each ranking is scored against the same user's
# held-out items; test users without a training row are skipped.
row_of_user = {user: row for row, user in enumerate(train_users)}
test_rows = [row for row, user in enumerate(test_users) if user in row_of_user]
pred_rows = [row_of_user[test_users[row]] for row in test_rows]
pred = pred[pred_rows]
truth = test_x[test_rows]

for n in range(1, 11):
    sr = metrics.success_rate(pred[:, -n:], truth)
    print("Success Rate at {:d}: {:f}".format(n, sr))

# Output recorded with the original script:
# Success Rate at 1: 27.783669
# Success Rate at 2: 39.236479
# Success Rate at 3: 45.281018
# Success Rate at 4: 49.310710
# Success Rate at 5: 51.219512
# Success Rate at 6: 53.234358
# Success Rate at 7: 54.188759
# Success Rate at 8: 55.673383
# Success Rate at 9: 56.733828
# Success Rate at 10: 57.688229

0 commit comments

Comments
 (0)