Merge pull request #553 from chenyushuo/master

Merge 0.1.x into master

2017pxy authored Dec 6, 2020
2 parents c8f1ca7 + 90d5c3f commit cedd7ea
Showing 29 changed files with 18,720 additions and 467 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,45 @@
name: RecBole tests

on:
- pull_request

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest
pip install dgl
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# Use "python -m pytest" instead of "pytest" to fix imports
- name: Test metrics
run: |
python -m pytest -v tests/metrics
- name: Test evaluation_setting
run: |
python -m pytest -v tests/evaluation_setting
- name: Test model
run: |
python -m pytest -v tests/model/test_model_auto.py
- name: Test config
run: |
python -m pytest -v tests/config/test_config.py
export PYTHONPATH=.
python tests/config/test_command_line.py --use_gpu=False --valid_metric=Recall@10 --split_ratio=[0.7,0.2,0.1] --metrics=['Recall@10'] --epochs=200 --eval_setting='LO_RS' --learning_rate=0.3
- name: Test evaluation_setting
run: |
python -m pytest -v tests/evaluation_setting
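
The comment inside this workflow is its key detail: invoking pytest through `python -m` prepends the working directory to `sys.path`, so the in-repo `recbole` package is importable without `pip install -e .`, which bare `pytest` does not guarantee. A minimal sketch of reproducing one CI step locally on a source checkout (an assumption for illustration, not part of the commit):

```python
# Hypothetical local reproduction of the "Test metrics" step.
# `python -m pytest` adds the current directory to sys.path; plain `pytest` does not.
import subprocess
import sys

subprocess.run([sys.executable, '-m', 'pytest', '-v', 'tests/metrics'], check=True)
```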
2 changes: 2 additions & 0 deletions recbole/data/dataloader/knowledge_dataloader.py
@@ -182,6 +182,8 @@ def _next_batch_data(self):
elif self.state == KGDataLoaderState.RS:
return self.general_dataloader._next_batch_data()
elif self.state == KGDataLoaderState.RSKG:
if self.kg_dataloader.pr >= self.kg_dataloader.pr_end:
self.kg_dataloader.pr = 0
kg_data = self.kg_dataloader._next_batch_data()
rec_data = self.general_dataloader._next_batch_data()
rec_data.update(kg_data)
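
The two added lines make the KG dataloader wrap around: in `RSKG` state every recommendation batch is paired with a KG batch, and since the KG side is usually shorter, its pointer `pr` is reset to 0 once exhausted. A minimal sketch of the same pairing behavior (toy data, not RecBole's classes):

```python
# The reset-to-zero fix is equivalent to cycling the shorter KG stream against
# the RS stream, which alone decides when the epoch ends.
from itertools import cycle

rec_batches = ['rec0', 'rec1', 'rec2', 'rec3', 'rec4']
kg_batches = cycle(['kg0', 'kg1'])  # wraps around, like `pr = 0`

for rec, kg in zip(rec_batches, kg_batches):
    print(rec, kg)  # rec0 kg0 / rec1 kg1 / rec2 kg0 / rec3 kg1 / rec4 kg0
```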
2 changes: 1 addition & 1 deletion recbole/data/dataloader/neg_sample_mixin.py
@@ -29,7 +29,7 @@ class NegSampleMixin(AbstractDataLoader):
batch_size (int, optional): The batch_size of dataloader. Defaults to ``1``.
dl_format (InputType, optional): The input type of dataloader. Defaults to
:obj:`~recbole.utils.InputType.POINTWISE`.
shuffle (bool, optional): Whether the dataloader will be shuffle after a round. Defaluts to ``False``.
shuffle (bool, optional): Whether the dataloader will be shuffle after a round. Defaults to ``False``.
"""
dl_type = DataLoaderType.NEGSAMPLE

2 changes: 1 addition & 1 deletion recbole/data/dataloader/sequential_dataloader.py
@@ -143,7 +143,7 @@ def augmentation(self, uid_list, item_list_index, target_index, item_list_length
new_dict = {
self.uid_field: uid_list,
self.item_list_field: np.zeros((new_length, self.max_item_list_len), dtype=np.int64),
self.time_list_field: np.zeros((new_length, self.max_item_list_len), dtype=np.int64),
self.time_list_field: np.zeros((new_length, self.max_item_list_len)),
self.target_iid_field: self.dataset.inter_feat[self.iid_field][target_index].values,
self.target_time_field: self.dataset.inter_feat[self.time_field][target_index].values,
self.item_list_length_field: item_list_length,
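
Dropping `dtype=np.int64` for the time-list matrix matters because timestamps need not be integers; `np.zeros` then defaults to `float64`, which stores fractional values instead of truncating them. A quick illustration (the motivation is inferred from the field's content):

```python
import numpy as np

ts = 1512345678.9                       # a fractional timestamp
int_buf = np.zeros((1, 3), dtype=np.int64)
float_buf = np.zeros((1, 3))            # default dtype: float64

int_buf[0, 0] = ts                      # silently truncated to 1512345678
float_buf[0, 0] = ts                    # stored exactly as 1512345678.9
```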
56 changes: 52 additions & 4 deletions recbole/data/dataset/dataset.py
@@ -55,11 +55,16 @@ class Dataset(object):
Specially, if feature is loaded from Arg ``additional_feat_suffix``, its source has type str,
which is the suffix of its local file (also the suffix written in Arg ``additional_feat_suffix``).
field2id_token (dict): Dict mapping feature name (str) to a list, which stores the original token of
this feature. For example, if ``test`` is token-like feature, ``token_a`` is remapped to 1, ``token_b``
field2id_token (dict): Dict mapping feature name (str) to a :class:`np.ndarray`, which stores the original token
of this feature. For example, if ``test`` is token-like feature, ``token_a`` is remapped to 1, ``token_b``
is remapped to 2. Then ``field2id_token['test'] = ['[PAD]', 'token_a', 'token_b']``. (Note that 0 is
always PADDING for token-like features.)
field2token_id (dict): Dict mapping feature name (str) to a dict, which stores the token remap table
of this feature. For example, if ``test`` is token-like feature, ``token_a`` is remapped to 1, ``token_b``
is remapped to 2. Then ``field2token_id['test'] = {'[PAD]': 0, 'token_a': 1, 'token_b': 2}``.
(Note that 0 is always PADDING for token-like features.)
field2seqlen (dict): Dict mapping feature name (str) to its sequence length (int).
For sequence features, their length can be either set in config,
or set to the max sequence length of this feature.
@@ -116,6 +121,7 @@ def _get_preset(self):
self.field2type = {}
self.field2source = {}
self.field2id_token = {}
self.field2token_id = {}
self.field2seqlen = self.config['seq_len'] or {}
self._preloaded_weight = {}
self.benchmark_filename_list = self.config['benchmark_filename']
@@ -897,11 +903,13 @@ def _remap(self, remap_list):
tokens, split_point = self._concat_remaped_tokens(remap_list)
new_ids_list, mp = pd.factorize(tokens)
new_ids_list = np.split(new_ids_list + 1, split_point)
mp = ['[PAD]'] + list(mp)
mp = np.array(['[PAD]'] + list(mp))
token_id = {t: i for i, t in enumerate(mp)}

for (feat, field, ftype), new_ids in zip(remap_list, new_ids_list):
if (field not in self.field2id_token):
if field not in self.field2id_token:
self.field2id_token[field] = mp
self.field2token_id[field] = token_id
if ftype == FeatureType.TOKEN:
feat[field] = new_ids
elif ftype == FeatureType.TOKEN_SEQ:
@@ -1010,6 +1018,46 @@ def copy_field_property(self, dest_field, source_field):
self.field2source[dest_field] = self.field2source[source_field]
self.field2seqlen[dest_field] = self.field2seqlen[source_field]

@dlapi.set()
def token2id(self, field, tokens):
"""Map external tokens to internal ids.
Args:
field (str): Field of external tokens.
tokens (str, list or np.ndarray): External tokens.
Returns:
int or np.ndarray: The internal ids of external tokens.
"""
if isinstance(tokens, str):
if tokens in self.field2token_id[field]:
return self.field2token_id[field][tokens]
else:
raise ValueError('token [{}] does not exist'.format(tokens))
elif isinstance(tokens, (list, np.ndarray)):
return np.array([self.token2id(field, token) for token in tokens])
else:
raise TypeError('The type of tokens [{}] is not supported'.format(type(tokens)))

@dlapi.set()
def id2token(self, field, ids):
"""Map internal ids to external tokens.
Args:
field (str): Field of internal ids.
ids (int, list, np.ndarray or torch.Tensor): Internal ids.
Returns:
str or np.ndarray: The external tokens of internal ids.
"""
try:
return self.field2id_token[field][ids]
except IndexError:
if isinstance(ids, list):
raise ValueError('[{}] is not a one-dimensional list'.format(ids))
else:
raise ValueError('[{}] is not a valid id'.format(ids))

@property
@dlapi.set()
def user_num(self):
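
The new `field2token_id` dict and the switch of `field2id_token` to an `np.ndarray` give the dataset fast lookups in both directions, and the ndarray supports the fancy indexing that `id2token` relies on for list/array/tensor inputs. A self-contained sketch of the paired structures (toy values, mirroring the docstring example):

```python
import numpy as np

# id -> token via array indexing; token -> id via the inverse dict.
field2id_token = {'test': np.array(['[PAD]', 'token_a', 'token_b'])}
field2token_id = {'test': {t: i for i, t in enumerate(field2id_token['test'])}}

assert field2token_id['test']['token_b'] == 2                           # token2id path
assert list(field2id_token['test'][[1, 2]]) == ['token_a', 'token_b']   # id2token path
# The second lookup works because an ndarray (unlike a plain list) accepts
# a whole list of indices at once.
```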
2 changes: 1 addition & 1 deletion recbole/data/dataset/kg_dataset.py
@@ -353,7 +353,7 @@ def _remap_ID_all(self):
item_tokens = self._get_rec_item_token()
super()._remap_ID_all()
self._sort_remaped_entities(item_tokens)
self.field2id_token[self.relation_field].append('[UI-Relation]')
self.field2id_token[self.relation_field] = np.append(self.field2id_token[self.relation_field], '[UI-Relation]')

@property
@dlapi.set()
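
This line had to change together with the ndarray switch in `dataset.py`: ndarrays are fixed-size, so there is no in-place `.append`, and `np.append` returns a new array that must be assigned back. A one-line illustration (toy relation list):

```python
import numpy as np

relations = np.array(['[PAD]', 'rel_a'])
relations = np.append(relations, '[UI-Relation]')  # new array: ['[PAD]', 'rel_a', '[UI-Relation]']
```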
3 changes: 1 addition & 2 deletions recbole/model/general_recommender/dmf.py
@@ -170,8 +170,7 @@ def get_item_embedding(self):
col = interaction_matrix.col
i = torch.LongTensor([row, col])
data = torch.FloatTensor(interaction_matrix.data)
item_matrix = torch.sparse.FloatTensor(i, data).to(self.device).transpose(0, 1)

item_matrix = torch.sparse.FloatTensor(i, data, torch.Size(interaction_matrix.shape)).to(self.device).transpose(0, 1)
item = torch.sparse.mm(item_matrix, self.item_linear.weight.t())

item = self.item_fc_layers(item)
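
Passing an explicit `torch.Size` fixes a shape bug: without it, the sparse constructor infers the shape from the largest index present, so users or items with no interactions at the tail of the matrix would be cut off. A minimal demonstration (the legacy `torch.sparse.FloatTensor` constructor is kept for parity with the diff; newer PyTorch prefers `torch.sparse_coo_tensor`):

```python
import torch

i = torch.LongTensor([[0, 1], [0, 1]])  # nonzeros at (0, 0) and (1, 1)
v = torch.FloatTensor([1.0, 1.0])

inferred = torch.sparse.FloatTensor(i, v)                      # shape (2, 2): tail dropped
explicit = torch.sparse.FloatTensor(i, v, torch.Size([4, 3]))  # true shape (4, 3) preserved
print(inferred.shape, explicit.shape)
```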
12 changes: 10 additions & 2 deletions recbole/model/general_recommender/fism.py
@@ -14,12 +14,13 @@
https://github.com/AaronHeee/Neural-Attentive-Item-Similarity-Model
"""

from logging import getLogger

import torch
import torch.nn as nn
from torch.nn.init import normal_

from recbole.model.abstract_recommender import GeneralRecommender
from recbole.utils import InputType
from torch.nn.init import normal_


class FISM(GeneralRecommender):
@@ -36,6 +37,8 @@ def __init__(self, config, dataset):

# load dataset info
self.LABEL = config['LABEL_FIELD']
self.logger = getLogger()

# get all users' history interaction information. The history item
# matrix is padded to the maximum number of a user's interactions
self.history_item_matrix, self.history_lens, self.mask_mat = self.get_history_info(dataset)
@@ -49,6 +52,11 @@ def __init__(self, config, dataset):
# split the too large dataset into the specified pieces
if self.split_to > 0:
self.group = torch.chunk(torch.arange(self.n_items).to(self.device), self.split_to)
else:
self.logger.warning('Pay attention!! `split_to` is set to 0. If you run into an OOM error in this case, ' + \
'you need to increase it until the error disappears. For example, ' + \
'you can set it on the command line such as `--split_to=5`')


# define layers and loss
# construct source and destination item embedding matrix
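
The `split_to` mechanism that the new warning refers to scores the item set in pieces rather than as one tensor, trading a few extra forward passes for a smaller peak memory footprint. A sketch of the chunking itself (toy sizes):

```python
import torch

n_items, split_to = 10, 3
for piece in torch.chunk(torch.arange(n_items), split_to):
    print(piece)  # tensor([0, 1, 2, 3]), tensor([4, 5, 6, 7]), tensor([8, 9])
```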
11 changes: 7 additions & 4 deletions recbole/model/general_recommender/gcmc.py
@@ -143,10 +143,13 @@ def get_norm_adj_mat(self):
# build adj matrix
A = sp.dok_matrix((self.n_users + self.n_items,
self.n_users + self.n_items), dtype=np.float32)
A = A.tolil()
A[:self.n_users, self.n_users:] = self.interaction_matrix
A[self.n_users:, :self.n_users] = self.interaction_matrix.transpose()
A = A.todok()
inter_M = self.interaction_matrix
inter_M_t = self.interaction_matrix.transpose()
data_dict = dict(zip(zip(inter_M.row, inter_M.col+self.n_users),
[1]*inter_M.nnz))
data_dict.update(dict(zip(zip(inter_M_t.row+self.n_users, inter_M_t.col),
[1]*inter_M_t.nnz)))
A._update(data_dict)
# norm adj matrix
sumArr = (A > 0).sum(axis=1)
# add epsilon to avoid divide-by-zero warning
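
This rewrite (repeated below in LightGCN, NGCF, and SpectralCF) replaces LIL block slicing with a direct write of COO coordinates into the DOK matrix's underlying dict, which is substantially faster for large graphs; it is also why NGCF's interaction matrix switches from CSR to COO below, since the construction reads `.row`/`.col`. A hedged sketch of the same pattern on a toy matrix (`A._update` is private scipy API, as in the diff; `dict.update(A, ...)` is the equivalent raw write, relying on `dok_matrix` subclassing `dict`):

```python
import numpy as np
import scipy.sparse as sp

# Build A = [[0, R], [R^T, 0]] by writing (row, col) -> 1 straight into the dict.
n_users, n_items = 3, 2
R = sp.coo_matrix(([1, 1], ([0, 2], [1, 0])), shape=(n_users, n_items), dtype=np.float32)

A = sp.dok_matrix((n_users + n_items, n_users + n_items), dtype=np.float32)
data = dict(zip(zip(R.row, R.col + n_users), [1] * R.nnz))        # top-right block
data.update(dict(zip(zip(R.col + n_users, R.row), [1] * R.nnz)))  # bottom-left block
dict.update(A, data)  # bypasses per-element __setitem__, like A._update(data)

print(A.toarray())
```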
11 changes: 7 additions & 4 deletions recbole/model/general_recommender/lightgcn.py
@@ -86,10 +86,13 @@ def get_norm_adj_mat(self):
# build adj matrix
A = sp.dok_matrix((self.n_users + self.n_items,
self.n_users + self.n_items), dtype=np.float32)
A = A.tolil()
A[:self.n_users, self.n_users:] = self.interaction_matrix
A[self.n_users:, :self.n_users] = self.interaction_matrix.transpose()
A = A.todok()
inter_M = self.interaction_matrix
inter_M_t = self.interaction_matrix.transpose()
data_dict = dict(zip(zip(inter_M.row, inter_M.col+self.n_users),
[1]*inter_M.nnz))
data_dict.update(dict(zip(zip(inter_M_t.row+self.n_users, inter_M_t.col),
[1]*inter_M_t.nnz)))
A._update(data_dict)
# norm adj matrix
sumArr = (A > 0).sum(axis=1)
# add epsilon to avoid divide-by-zero warning
4 changes: 4 additions & 0 deletions recbole/model/general_recommender/nais.py
@@ -65,6 +65,10 @@ def __init__(self, config, dataset):
if self.split_to > 0:
self.logger.info('split the n_items to {} pieces'.format(self.split_to))
self.group = torch.chunk(torch.arange(self.n_items).to(self.device), self.split_to)
else:
self.logger.warning('Pay attention!! `split_to` is set to 0. If you run into an OOM error in this case, ' + \
'you need to increase it until the error disappears. For example, ' + \
'you can set it on the command line such as `--split_to=5`')

# define layers and loss
# construct source and destination item embedding matrix
13 changes: 8 additions & 5 deletions recbole/model/general_recommender/ngcf.py
@@ -73,7 +73,7 @@ def __init__(self, config, dataset):
super(NGCF, self).__init__(config, dataset)

# load dataset info
self.interaction_matrix = dataset.inter_matrix(form='csr').astype(np.float32)
self.interaction_matrix = dataset.inter_matrix(form='coo').astype(np.float32)

# load parameters info
self.embedding_size = config['embedding_size']
@@ -117,10 +117,13 @@ def get_norm_adj_mat(self):
"""
# build adj matrix
A = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32)
A = A.tolil()
A[:self.n_users, self.n_users:] = self.interaction_matrix
A[self.n_users:, :self.n_users] = self.interaction_matrix.transpose()
A = A.todok()
inter_M = self.interaction_matrix
inter_M_t = self.interaction_matrix.transpose()
data_dict = dict(zip(zip(inter_M.row, inter_M.col + self.n_users),
[1] * inter_M.nnz))
data_dict.update(dict(zip(zip(inter_M_t.row + self.n_users, inter_M_t.col),
[1] * inter_M_t.nnz)))
A._update(data_dict)
# norm adj matrix
sumArr = (A > 0).sum(axis=1)
diag = np.array(sumArr.flatten())[0] + 1e-7 # add epsilon to avoid divide-by-zero warning
9 changes: 6 additions & 3 deletions recbole/model/general_recommender/pop.py
@@ -2,7 +2,10 @@
# @Time : 2020/8/11 9:57
# @Author : Zihan Lin
# @Email : [email protected]

# UPDATE
# @Time : 2020/11/9
# @Author : Zihan Lin
# @Email : [email protected]
r"""
Pop
################################################
@@ -44,8 +47,8 @@ def calculate_loss(self, interaction):
def predict(self, interaction):

item = interaction[self.ITEM_ID]
result = self.item_cnt[item, :] / self.max_cnt
return result
result = torch.true_divide(self.item_cnt[item, :], self.max_cnt)
return result.squeeze()

def full_sort_predict(self, interaction):
batch_user_num = interaction[self.USER_ID].shape[0]
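
`item_cnt` holds integer counts, so plain `/` risks integer (floor) division on older PyTorch versions; `torch.true_divide` guarantees a floating-point result regardless of dtype, and the added `.squeeze()` collapses the `(batch, 1)` slice to the `(batch,)` shape the evaluator expects. A tiny check (shapes assumed from the surrounding code):

```python
import torch

cnt = torch.tensor([[3], [2]])           # integer counts, shape (2, 1)
max_cnt = torch.tensor(4)

scores = torch.true_divide(cnt, max_cnt).squeeze()
print(scores)                            # tensor([0.7500, 0.5000]), shape (2,)
```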
11 changes: 7 additions & 4 deletions recbole/model/general_recommender/spectralcf.py
@@ -96,10 +96,13 @@ def get_laplacian_matrix(self):
# build adj matrix
A = sp.dok_matrix((self.n_users + self.n_items,
self.n_users + self.n_items), dtype=np.float32)
A = A.tolil()
A[:self.n_users, self.n_users:] = self.interaction_matrix
A[self.n_users:, :self.n_users] = self.interaction_matrix.transpose()
A = A.todok()
inter_M = self.interaction_matrix
inter_M_t = self.interaction_matrix.transpose()
data_dict = dict(zip(zip(inter_M.row, inter_M.col+self.n_users),
[1]*inter_M.nnz))
data_dict.update(dict(zip(zip(inter_M_t.row+self.n_users, inter_M_t.col),
[1]*inter_M_t.nnz)))
A._update(data_dict)

# norm adj matrix
sumArr = (A > 0).sum(axis=1)
2 changes: 1 addition & 1 deletion recbole/sampler/sampler.py
@@ -83,7 +83,7 @@ def sample_by_key_ids(self, key_ids, num, used_ids):
key_num = len(key_ids)
total_num = key_num * num
value_ids = np.zeros(total_num, dtype=np.int64)
used_id_list = np.repeat(used_ids, num)
used_id_list = np.tile(used_ids, num)
for i, used_ids in enumerate(used_id_list):
cur = self.random()
while cur in used_ids:
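
This one-word change fixes a real sampling bug. Judging from the fix, `value_ids` is filled round-robin over the keys (all keys once, then again, `num` times), so each slot's forbidden set must follow the same order: `np.tile` repeats the whole sequence, while `np.repeat` duplicates each element in place and paired slots with the wrong keys' used ids. Illustrated with toy used-id sets:

```python
import numpy as np

used = np.array(['u0_set', 'u1_set'])
print(np.repeat(used, 2))  # ['u0_set' 'u0_set' 'u1_set' 'u1_set'] -- old, wrong pairing
print(np.tile(used, 2))    # ['u0_set' 'u1_set' 'u0_set' 'u1_set'] -- matches the round-robin layout
```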