|
import csv
import os

import numpy as np
import pandas as pd

from keras.applications.imagenet_utils import preprocess_input
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input as preprocess_inception
from keras.applications.nasnet import NASNetLarge
from keras.applications.nasnet import preprocess_input as preprocess_nasnet
from keras.applications.resnet50 import ResNet50
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.layers import Input
from keras.preprocessing import image
| 14 | + |
# Root of the MovieLens (ml-latest-small) dataset checkout, relative to this script.
dataset = '../../../king-rec-dataset/ml-latest-small/'
# Per-movie image sub-tree; each movie id has a '<id>/posters/' folder underneath.
base_path = 'images/'
# base_path = 'clusters_sanity_check/'
# How many posters per movie to run through the networks (lowest-numbered first).
max_posters_per_movie = 1
| 19 | + |
| 20 | + |
def get_int(filename):
    """Return the numeric stem of *filename* — the text before the first dot."""
    stem, _, _ = filename.partition('.')
    return int(stem)
| 23 | + |
| 24 | + |
def get_items_ids():
    """Return the set of movie ids listed in the dataset's movies.csv.

    Parses the first column of every data row (header skipped) as an int.

    Returns:
        set[int]: all movie ids found in ``<dataset>/movies.csv``.
    """
    item_ids = set()

    # newline='' is the documented requirement for files handed to the csv
    # module; an explicit encoding avoids platform-dependent defaults.
    with open(dataset + 'movies.csv', 'r', newline='', encoding='utf-8') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header row

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids
| 36 | + |
| 37 | + |
def extract_images_features():
    """Extract CNN features for each movie poster and append them to per-model CSVs.

    For every movie id in movies.csv, the first ``max_posters_per_movie``
    posters (sorted numerically by filename) are fed through each pretrained
    backbone; the flattened feature map is appended as one row
    ``movie_id, poster_id, f0, f1, ...`` to
    ``<model.name>-<max_posters_per_movie>-posters.csv``.
    """
    movies = list(get_items_ids())
    # movies = [1, 3, 4, 5, 7, 19, 22, 23]

    # Pair each backbone with ITS OWN preprocessing function. VGG16/VGG19/
    # ResNet50 use caffe-style BGR mean subtraction (the imagenet_utils
    # default), while InceptionV3 and NASNet expect inputs scaled to [-1, 1].
    # The previous code ran one preprocessor for all five networks, which
    # silently produced wrong activations for Inception and NASNet.
    models = [
        (VGG16(weights='imagenet', include_top=False), preprocess_input),
        (VGG19(weights='imagenet', include_top=False), preprocess_input),
        (InceptionV3(weights='imagenet', include_top=False), preprocess_inception),
        (ResNet50(weights='imagenet', include_top=False), preprocess_input),
        (NASNetLarge(weights='imagenet', include_top=False,
                     input_tensor=Input(shape=(224, 224, 3))), preprocess_nasnet),
    ]

    total_movies = len(movies)
    for current_movie, movie_idx in enumerate(movies, start=1):
        # Movie id is known directly — no need to re-parse it out of the path.
        posters_dir = os.path.join(dataset, base_path, str(movie_idx), 'posters')
        filenames = sorted(os.listdir(posters_dir), key=get_int)[:max_posters_per_movie]

        for file_name in filenames:
            poster_idx = get_int(file_name)

            img = image.load_img(os.path.join(posters_dir, file_name), target_size=(224, 224))
            img_data = np.expand_dims(image.img_to_array(img), axis=0)

            for model, preprocess in models:
                # Preprocess a copy: keras preprocess_input implementations
                # may modify the batch in place, and the same raw batch is
                # reused for every model.
                feature = model.predict(preprocess(np.copy(img_data))).flatten()

                row = np.append([movie_idx, poster_idx], feature)
                pd.DataFrame([row]).to_csv(
                    model.name + '-' + str(max_posters_per_movie) + '-posters.csv',
                    mode='a', sep=',', index=False, header=False)

                # Progress line per model (previously printed only the last
                # model, and raised NameError for movies with no posters).
                print(str(current_movie) + '/' + str(total_movies) + ':',
                      'movie id:', movie_idx, ' poster id:', poster_idx,
                      ' model name:', model.name, ' total features:', len(feature))
| 74 | + |
| 75 | + |
def main():
    """Script entry point: run the full poster feature-extraction pass."""
    extract_images_features()


if __name__ == '__main__':
    main()
0 commit comments