Commit a2a92b5

Merge pull request #2 from adiIspas/lightfm-version

Lightfm version

2 parents f57fc95 + 734f4ec commit a2a92b5

28 files changed: +20773 −0 lines
@@ -0,0 +1,57 @@
import os
import pandas as pd
import csv
import shutil

dataset = '../../../king-rec-dataset/ml-latest-small/images/'
number_of_clusters = 7
model = 'vgg16'
clusters_dir = '../../../king-rec-dataset/ml-latest-small/results/clusters/sanity-check/' + model + '/' + str(number_of_clusters) + '/'


def collect_posters():
    data = pd.read_csv('sanity_check_movies_1_poster_clusters_' + model + '.csv')

    # create one directory per cluster
    for idx in range(1, number_of_clusters + 1):
        os.makedirs(clusters_dir + str(idx), exist_ok=True)

    # copy each poster into its cluster's directory
    # (column '0' is the movie id, '1' the poster index)
    for index, row in data.iterrows():
        src = dataset + str(int(row['0'])) + '/posters/' + str(int(row['1'])) + '.jpg'
        dest = clusters_dir + str(int(row['cluster_' + str(number_of_clusters)]) + 1) + '/' + str(int(row['0'])) + '_' + str(int(row['1'])) + '.jpg'

        if os.path.isfile(src):
            shutil.copy(src, dest)

    print('Done')


collect_posters()

dataset2 = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset2 + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def count_movies():
    movies = get_items_ids()

    idx = 1
    for item in movies:
        print(idx, item)
        idx = idx + 1


# count_movies()
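
Note: collect_posters() expects the cluster-assignment CSV produced by the clustering scripts below. A minimal sketch of that input, assuming the column names implied by the lookups above ('0' = movie id, '1' = poster index, 'cluster_7' = assigned cluster); the row values here are hypothetical:

import pandas as pd

# Hypothetical two-row example of sanity_check_movies_1_poster_clusters_vgg16.csv;
# pandas writes the unnamed index as the first column, which pd.read_csv above
# simply carries along as an extra 'Unnamed: 0' column.
sample = pd.DataFrame({'0': [1.0, 3.0], '1': [1.0, 1.0], 'cluster_7': [2, 5]})
sample.to_csv('sanity_check_movies_1_poster_clusters_vgg16.csv')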

dataset/utils/create_clusters.py

+88
@@ -0,0 +1,88 @@
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans


dataset = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def explore_clusters():
    clusters = range(2, 22, 2)
    models_results = dict()
    colors = ['r', 'y', 'b', 'g', 'c']

    models = ['vgg16', 'vgg19', 'inception_v3', 'resnet50', 'NASNet']

    for model in models:
        print('Reading data ...')
        feature_list = np.loadtxt('./posters_features/1000-movies/' + model + '1000-movies_1-posters.csv', delimiter=',')
        print('Finished reading data.')

        # first two columns are movie id and poster id; the rest are features
        movie_poster_clusters = pd.DataFrame(feature_list[:, :2])

        feature_list = feature_list[:, 2:]
        feature_list_np = np.array(feature_list)
        for n_clusters in clusters:
            k_means = KMeans(n_clusters=n_clusters).fit(feature_list_np)

            name = model
            result = metrics.silhouette_score(feature_list_np, k_means.labels_)

            if name not in models_results:
                results = []
            else:
                results = models_results.pop(name)

            cluster_name = 'cluster_' + str(n_clusters)
            movie_poster_clusters[cluster_name] = pd.Series(k_means.labels_)

            results.append(result)
            models_results.update({name: results})
            print('silhouette score on', name, 'with', n_clusters, 'clusters:', result)

        movie_poster_clusters.to_csv('movies_1_poster_clusters_' + name + '.csv')

    # grouped bar chart: one bar per model for each cluster count
    n_groups = len(list(clusters))
    index = np.arange(n_groups)
    bar_width = 0.15
    current_index = 0

    for key, values in models_results.items():
        plt.bar(index + bar_width * current_index, values, bar_width,
                color=colors[current_index],
                label=key)
        current_index += 1

    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette score by model')
    plt.xticks(index + bar_width, list(clusters))
    plt.legend()
    plt.tight_layout()
    plt.savefig('silhouette-score.jpg')
    plt.show()


def main():
    explore_clusters()


if __name__ == "__main__":
    main()
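
Note: the chart above compares silhouette scores across models for each cluster count. A toy illustration (not from this commit) of how silhouette_score behaves as k varies, using synthetic blobs from sklearn:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Silhouette ranges from -1 to 1; higher means tighter, better-separated
# clusters, so it peaks near the true number of blobs (4 here).
X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
for k in (2, 4, 8):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(X)
    print(k, round(silhouette_score(X, labels), 3))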
+80
@@ -0,0 +1,80 @@
import csv
import numpy as np
import pandas as pd

from sklearn.cluster import MiniBatchKMeans


dataset = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def explore_clusters():
    batch_size = 40

    # models = ['vgg16', 'vgg19', 'inception_v3', 'resnet50', 'NASNet']
    models = ['resnet50']

    for model in models:
        # csv_path = './' + model + '-1-posters.csv'
        csv_path = './posters_features/sanity-check/' + model + '-sanity-check.csv'

        movie_poster_clusters = pd.DataFrame()
        for n_clusters in [7]:
            final_clusters = pd.Series()
            print('Process cluster', n_clusters)

            k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, compute_labels=True)

            # first pass: incrementally fit on feature chunks
            reader_chunks = pd.read_csv(csv_path, delimiter=',', header=None, chunksize=batch_size)
            for chunk in reader_chunks:
                print('Processing chunk ...')

                feature_list = pd.DataFrame(data=chunk)

                # keep the movie id / poster id columns for the output CSV
                movie_poster_clusters = movie_poster_clusters.append(feature_list.iloc[:, :2])

                feature_list = feature_list.iloc[:, 2:]
                feature_list_np = np.array(feature_list)

                k_means.partial_fit(feature_list_np)

            # second pass: predict cluster labels chunk by chunk
            reader_chunks = pd.read_csv(csv_path, delimiter=',', header=None, chunksize=batch_size)
            for chunk in reader_chunks:
                print('Predicting chunk ...')

                feature_list = pd.DataFrame(data=chunk)

                feature_list = feature_list.iloc[:, 2:]
                feature_list_np = np.array(feature_list)

                labels = k_means.predict(feature_list_np)

                final_clusters = final_clusters.append(pd.Series(labels))

            name = model

            cluster_name = 'cluster_' + str(n_clusters)
            movie_poster_clusters[cluster_name] = pd.Series(final_clusters.values, index=movie_poster_clusters.index)

        movie_poster_clusters.to_csv('test-chunk-movies_1_poster_clusters_' + name + '.csv')


def main():
    explore_clusters()


if __name__ == "__main__":
    main()
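
Note: this variant streams features through MiniBatchKMeans in two passes (partial_fit, then predict) so the full feature matrix never has to fit in memory. Series.append / DataFrame.append, used above, were removed in pandas 2.0, where pd.concat is the replacement. A minimal sketch of the same two-pass pattern on synthetic data:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
features = rng.rand(200, 16)  # stand-in for the poster-feature rows
batch = 40

km = MiniBatchKMeans(n_clusters=7, batch_size=batch)
# pass 1: incremental fit, one batch at a time
for start in range(0, len(features), batch):
    km.partial_fit(features[start:start + batch])
# pass 2: assign labels batch by batch
labels = np.concatenate([km.predict(features[s:s + batch])
                         for s in range(0, len(features), batch)])
print(labels.shape)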

dataset/utils/downloader.py

+85
@@ -0,0 +1,85 @@
import os
import csv
import sys
import requests
import urllib.request

api_key = sys.argv[1]

dataset = '../../king-rec-dataset/ml-latest-small/'
tmdb_api = 'https://api.themoviedb.org/3/movie/$MOVIE_ID/images?include_image_language=en,null&api_key=$API_KEY'
tmdb_images_url = 'https://image.tmdb.org/t/p/original/'


def get_tmdb_posters(tmdb_api_key, max_movie_index=10):
    tmdb_movies_id = get_tmdb_ids()
    download_images(tmdb_api_key, tmdb_movies_id, max_movie_index)

    return tmdb_movies_id


def download_images(tmdb_api_key, tmdb_movies_id, max_movie_index=10):
    images = dataset + 'images/'

    movie_index = 1
    total_movies = len(tmdb_movies_id)

    for key, value in tmdb_movies_id.items():
        posters = images + str(key) + '/posters/'
        backdrops = images + str(key) + '/backdrops/'

        if not os.path.exists(posters):
            os.makedirs(posters)

        if not os.path.exists(backdrops):
            os.makedirs(backdrops)

        # only hit the API for movies with no downloaded images yet
        if len(os.listdir(posters)) == 0 or len(os.listdir(backdrops)) == 0:
            current_url = tmdb_api.replace('$MOVIE_ID', str(value)).replace('$API_KEY', tmdb_api_key)
            response = requests.get(current_url)

            if response.status_code == 200:
                json = response.json()

                if len(os.listdir(posters)) == 0:
                    image_idx = 1
                    for poster in json['posters']:
                        if poster['iso_639_1'] == 'en':
                            print(movie_index, '/', total_movies, '- Process movie', value, 'and poster', image_idx)
                            poster_url = poster['file_path']
                            urllib.request.urlretrieve(tmdb_images_url + poster_url, posters + str(image_idx) + '.jpg')
                            image_idx += 1

                if len(os.listdir(backdrops)) == 0:
                    image_idx = 1
                    for backdrop in json['backdrops']:
                        if backdrop['iso_639_1'] == 'xx' or backdrop['iso_639_1'] is None:
                            print(movie_index, '/', total_movies, '- Process movie', value, 'and backdrop', image_idx)
                            backdrop_url = backdrop['file_path']
                            urllib.request.urlretrieve(tmdb_images_url + backdrop_url,
                                                       backdrops + str(image_idx) + '.jpg')
                            image_idx += 1

            else:
                print('Status code:', response.status_code, 'on movie', key, '-', value)

        if movie_index == max_movie_index:
            break

        movie_index += 1


def get_tmdb_ids(tmdb_index=2):
    links = dataset + 'links.csv'
    with open(links, 'r') as links_file:
        reader = csv.reader(links_file, delimiter=',')
        next(reader)  # skip header

        tmdb_movies_id = dict()
        for row in reader:
            # map MovieLens id (column 0) to TMDB id (column 2)
            tmdb_movies_id.update({row[0]: row[tmdb_index]})

    return tmdb_movies_id


get_tmdb_posters(api_key, max_movie_index=20)
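
Note: the script runs get_tmdb_posters() at import time and takes the TMDB API key as its only command-line argument, so invocation is simply:

python downloader.py <YOUR_TMDB_API_KEY>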

dataset/utils/extract_features.py

+81
@@ -0,0 +1,81 @@
import os
import numpy as np
import csv
import pandas as pd

from keras.layers import Input
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.inception_v3 import InceptionV3
from keras.applications.resnet50 import ResNet50
from keras.applications.nasnet import NASNetLarge
from keras.applications.imagenet_utils import preprocess_input

dataset = '../../../king-rec-dataset/ml-latest-small/'
base_path = 'images/'
# base_path = 'clusters_sanity_check/'
max_posters_per_movie = 1


def get_int(filename):
    return int(filename.split('.')[0])


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def extract_images_features():
    movies = list(get_items_ids())
    # movies = [1, 3, 4, 5, 7, 19, 22, 23]
    subdir = [dataset + base_path + str(movie) + '/posters/' for movie in movies]
    models = [
        VGG16(weights='imagenet', include_top=False),
        VGG19(weights='imagenet', include_top=False),
        InceptionV3(weights='imagenet', include_top=False),
        ResNet50(weights='imagenet', include_top=False),
        NASNetLarge(weights='imagenet', include_top=False, input_tensor=Input(shape=(224, 224, 3)))
    ]
    total_movies = len(subdir)
    for current_movie, dirname in enumerate(subdir):
        movie_idx = int([s for s in dirname.split('/') if s.isdigit()][0])
        filenames = sorted(os.listdir(dirname), key=get_int)[0:max_posters_per_movie]

        for file_name in filenames:
            poster_idx = int(file_name.split('.')[0])

            img = image.load_img(dirname + '/' + file_name, target_size=(224, 224))
            img_data = image.img_to_array(img)
            img_data = np.expand_dims(img_data, axis=0)
            img_data = preprocess_input(img_data)

            for model in models:
                # flatten the convolutional feature map into one row per poster
                feature = model.predict(img_data)
                feature_np = np.array(feature)
                feature = feature_np.flatten()

                # prepend movie id and poster id, then append to the model's CSV
                data_to_save = np.append([movie_idx, poster_idx], feature)
                data = pd.DataFrame([data_to_save])
                data.to_csv(model.name + '-' + str(max_posters_per_movie) + '-posters' + '.csv',
                            mode='a', sep=',', index=False, header=False)

                print(str(current_movie + 1) + '/' + str(total_movies) + ':', 'movie id:', movie_idx,
                      ' poster id:', poster_idx, ' model name:', model.name, ' total features:', len(feature))


def main():
    extract_images_features()


if __name__ == "__main__":
    main()
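
Note: preprocess_input here comes from keras.applications.imagenet_utils and is applied to every model, but Keras also ships per-model preprocessing (the Inception and NASNet families scale pixels to [-1, 1] rather than subtracting the ImageNet channel means). A sketch of a per-model mapping, assuming the keys match model.name as used elsewhere in this commit:

from keras.applications import vgg16, vgg19, inception_v3, resnet50, nasnet

# Hypothetical replacement for the shared imagenet_utils.preprocess_input
# call: look up each model's own preprocessing function by its name.
preprocessors = {
    'vgg16': vgg16.preprocess_input,
    'vgg19': vgg19.preprocess_input,
    'inception_v3': inception_v3.preprocess_input,
    'resnet50': resnet50.preprocess_input,
    'NASNet': nasnet.preprocess_input,
}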
