From 9f34bcac5eaaaf92869dfe55c2f7f9061e5572ae Mon Sep 17 00:00:00 2001 From: Weisu Date: Tue, 7 Dec 2021 13:06:23 -0800 Subject: [PATCH 01/10] auxilary data --- amlb/benchmark.py | 6 +- amlb/datasets/__init__.py | 8 +- amlb/datasets/file.py | 131 +++++++++++++++++++++++++++++++ frameworks/AutoGluon/__init__.py | 2 + 4 files changed, 145 insertions(+), 2 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index d38f8ad56..8c4453e75 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -19,7 +19,7 @@ from typing import List, Union from .job import Job, JobError, SimpleJobRunner, MultiThreadingJobRunner -from .datasets import DataLoader, DataSourceType +from .datasets import DataLoader, DataSourceType, DatasetWithAuxilaryData from .data import DatasetType from .datautils import read_csv from .resources import get as rget, config as rconfig, output_dirs as routput_dirs @@ -489,6 +489,10 @@ def load_data(self): else: raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].") + if hasattr(self._task_def, 'auxilary_data'): + auxilary_data = Benchmark.data_loader.load_auxilary_data(DataSourceType.file, auxilary_data=self._task_def.auxilary_data, fold=self.fold) + self._dataset = DatasetWithAuxilaryData(self._dataset, auxilary_data) + def as_job(self): job = Job(name=rconfig().token_separator.join([ 'local', diff --git a/amlb/datasets/__init__.py b/amlb/datasets/__init__.py index af60730f8..8e516607e 100644 --- a/amlb/datasets/__init__.py +++ b/amlb/datasets/__init__.py @@ -1,6 +1,6 @@ from enum import Enum, auto -from .file import FileLoader +from .file import FileLoader, DatasetWithAuxilaryData from .openml import OpenmlLoader @@ -24,5 +24,11 @@ def load(self, source: DataSourceType, *args, **kwargs): else: raise NotImplementedError(f"data source {source} is not supported yet") + def load_auxilary_data(self, source: DataSourceType, *args, **kwargs): + if source == DataSourceType.file: + return self.file_loader.load_auxilary_data(*args, **kwargs) + else: + raise NotImplementedError(f"data source {source} is not supported yet") + __all__ = ["DataLoader", "DataSourceType"] diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index 8e696f135..2044bbc1b 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -55,6 +55,84 @@ def load(self, dataset, fold=0): else: raise ValueError(f"Unsupported file type: {ext}") + @profile(logger=log) + def load_auxilary_data(self, auxilary_data, fold=0): + auxilary_data = auxilary_data if isinstance(auxilary_data, ns) else ns(path=auxilary_data) + log.debug("Loading auxilary data %s", auxilary_data) + paths = self._extract_auxilary_paths(auxilary_data.path if 'path' in auxilary_data else auxilary_data, fold=fold) + train_path = paths['train'][fold] + test_path = paths['test'][fold] + paths = dict(train=train_path, test=test_path) + return paths + + def _extract_auxilary_paths(self, auxilary_data, fold=None): + train_search_pat = re.compile(r"(?:(.*)[_-])train_auxilary(?:[_-](\d+))?\.\w+") + test_search_pat = re.compile(r"(?:(.*)[_-])test_auxilary(?:[_-](\d+))?\.\w+") + if isinstance(auxilary_data, (tuple, list)): + assert len(auxilary_data) % 2 == 0, "auxilary data list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." 
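+            # list entries alternate per fold: even indices are train files, odd indices the matching test files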
+ return self._extract_auxilary_paths(ns(train=[p for i, p in enumerate(auxilary_data) if i % 2 == 0], + test=[p for i, p in enumerate(auxilary_data) if i % 2 == 1]), + fold=fold) + elif isinstance(auxilary_data, ns): + return dict( + train=[self._extract_auxilary_paths(p)['train'][0] + if i == fold else None + for i, p in enumerate(as_list(auxilary_data.train))], + test=[self._extract_auxilary_paths(p)['train'][0] + if i == fold else None + for i, p in enumerate(as_list(auxilary_data.test))] if 'test' in auxilary_data else [] + ) + else: + assert isinstance(auxilary_data, str) + auxilary_data = os.path.expanduser(auxilary_data) + auxilary_data = auxilary_data.format(**rconfig().common_dirs) + + if os.path.exists(auxilary_data): + if os.path.isfile(auxilary_data): + # we leave the auxilary data handling to the user + return dict(train=[auxilary_data], test=[]) + elif os.path.isdir(auxilary_data): + files = list_all_files(auxilary_data) + log.debug("Files found in auxilary data folder %s: %s", auxilary_data, files) + assert len(files) > 0, f"Empty folder: {auxilary_data}" + if len(files) == 1: + return dict(train=files, test=[]) + + train_matches = [m for m in [train_search_pat.search(f) for f in files] if m] + test_matches = [m for m in [test_search_pat.search(f) for f in files] if m] + # verify they're for the same dataset (just based on name) + assert train_matches, f"Folder {auxilary_data} must contain at least one training auxilary data." + root_names = {m[1] for m in (train_matches+test_matches)} + assert len(root_names) == 1, f"All dataset files in {auxilary_data} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0." + + train_no_fold = next((m[0] for m in train_matches if m[2] is None), None) + test_no_fold = next((m[0] for m in test_matches if m[2] is None), None) + if train_no_fold and test_no_fold: + return dict(train=[train_no_fold], test=[test_no_fold]) + + paths = dict(train=[], test=[]) + fold = 0 + while fold >= 0: + train = next((m[0] for m in train_matches if m[2] == str(fold)), None) + test = next((m[0] for m in test_matches if m[2] == str(fold)), None) + if train and test: + paths['train'].append(train) + paths['test'].append(test) + fold += 1 + else: + fold = -1 + assert len(paths) > 0, f"No dataset file found in {auxilary_data}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..." + return paths + elif is_valid_url(auxilary_data): + cached_file = os.path.join(self._cache_dir, os.path.basename(auxilary_data)) + if not os.path.exists(cached_file): # don't download if previously done + handler = get_file_handler(auxilary_data) + assert handler.exists(auxilary_data), f"Invalid path/url: {auxilary_data}" + handler.download(auxilary_data, dest_path=cached_file) + return self._extract_auxilary_paths(cached_file) + else: + raise ValueError(f"Invalid dataset description: {auxilary_data}") + def _extract_train_test_paths(self, dataset, fold=None): if isinstance(dataset, (tuple, list)): assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." 
@@ -167,6 +245,59 @@ def _get_metadata(self, prop): return meta[prop] +class DatasetWithAuxilaryData: + + def __init__(self, dataset: FileDataset, auxilary_data_path): + self._dataset = dataset + self._train_auxilary_data = auxilary_data_path.get('train', None) + self._test_auxilary_data = auxilary_data_path.get('test', None) + + @property + def train_auxilary_data(self) -> str: + return self._train_auxilary_data + + @property + def test_auxilary_data(self) -> str: + return self._test_auxilary_data + + @property + def type(self) -> DatasetType: + assert self._dataset.target is not None + return (DatasetType[self._dataset._type] if self._dataset._type is not None + else DatasetType.regression if self._dataset.target.values is None + else DatasetType.binary if len(self._dataset.target.values) == 2 + else DatasetType.multiclass) + + @property + def train(self) -> Datasplit: + return self._dataset._train + + @property + def test(self) -> Datasplit: + return self._dataset._test + + @property + def features(self) -> List[Feature]: + return self._get_metadata('features') + + @property + def target(self) -> Feature: + return self._get_metadata('target') + + @memoize + def _get_metadata(self, prop): + meta = self._dataset._train.load_metadata() + return meta[prop] + + @profile(logger=log) + def release(self, properties=None): + """ + Call this to release cached properties and optimize memory once in-memory data are not needed anymore. + :param properties: + """ + self._dataset.release(properties) + + class FileDatasplit(Datasplit): def __init__(self, dataset: FileDataset, format: str, path: str): diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index be2c15147..77e5faa34 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -13,6 +13,8 @@ def run(dataset: Dataset, config: TaskConfig): data = dict( train=dict(path=dataset.train.data_path('parquet')), test=dict(path=dataset.test.data_path('parquet')), + train_aux=dict(path=dataset.train_auxilary_data), + test_aux=dict(path=dataset.test_auxilary_data), target=dict( name=dataset.target.name, classes=dataset.target.values From 399e692afdc2a5f52de7dcd17f6a52406bf928db Mon Sep 17 00:00:00 2001 From: Weisu Date: Tue, 7 Dec 2021 13:10:39 -0800 Subject: [PATCH 02/10] fix typo --- amlb/benchmark.py | 8 +-- amlb/datasets/__init__.py | 6 +-- amlb/datasets/file.py | 92 ++++++++++++++++---------------- frameworks/AutoGluon/__init__.py | 4 +- 4 files changed, 55 insertions(+), 55 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 8c4453e75..4c9cc7657 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -19,7 +19,7 @@ from typing import List, Union from .job import Job, JobError, SimpleJobRunner, MultiThreadingJobRunner -from .datasets import DataLoader, DataSourceType, DatasetWithAuxilaryData +from .datasets import DataLoader, DataSourceType, DatasetWithauxiliaryData from .data import DatasetType from .datautils import read_csv from .resources import get as rget, config as rconfig, output_dirs as routput_dirs @@ -489,9 +489,9 @@ def load_data(self): else: raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].") - if hasattr(self._task_def, 'auxilary_data'): - auxilary_data = Benchmark.data_loader.load_auxilary_data(DataSourceType.file, auxilary_data=self._task_def.auxilary_data, fold=self.fold) - self._dataset = DatasetWithAuxilaryData(self._dataset, auxilary_data) + if hasattr(self._task_def, 'auxiliary_data'): + auxiliary_data = 
Benchmark.data_loader.load_auxiliary_data(DataSourceType.file, auxiliary_data=self._task_def.auxiliary_data, fold=self.fold) + self._dataset = DatasetWithauxiliaryData(self._dataset, auxiliary_data) def as_job(self): job = Job(name=rconfig().token_separator.join([ diff --git a/amlb/datasets/__init__.py b/amlb/datasets/__init__.py index 8e516607e..b30da4ffe 100644 --- a/amlb/datasets/__init__.py +++ b/amlb/datasets/__init__.py @@ -1,6 +1,6 @@ from enum import Enum, auto -from .file import FileLoader, DatasetWithAuxilaryData +from .file import FileLoader, DatasetWithauxiliaryData from .openml import OpenmlLoader @@ -24,9 +24,9 @@ def load(self, source: DataSourceType, *args, **kwargs): else: raise NotImplementedError(f"data source {source} is not supported yet") - def load_auxilary_data(self, source: DataSourceType, *args, **kwargs): + def load_auxiliary_data(self, source: DataSourceType, *args, **kwargs): if source == DataSourceType.file: - return self.file_loader.load_auxilary_data(*args, **kwargs) + return self.file_loader.load_auxiliary_data(*args, **kwargs) else: raise NotImplementedError(f"data source {source} is not supported yet") diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index 2044bbc1b..f9e6bd2d5 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -56,54 +56,54 @@ def load(self, dataset, fold=0): raise ValueError(f"Unsupported file type: {ext}") @profile(logger=log) - def load_auxilary_data(self, auxilary_data, fold=0): - auxilary_data = auxilary_data if isinstance(auxilary_data, ns) else ns(path=auxilary_data) - log.debug("Loading auxilary data %s", auxilary_data) - paths = self._extract_auxilary_paths(auxilary_data.path if 'path' in auxilary_data else auxilary_data, fold=fold) + def load_auxiliary_data(self, auxiliary_data, fold=0): + auxiliary_data = auxiliary_data if isinstance(auxiliary_data, ns) else ns(path=auxiliary_data) + log.debug("Loading auxiliary data %s", auxiliary_data) + paths = self._extract_auxiliary_paths(auxiliary_data.path if 'path' in auxiliary_data else auxiliary_data, fold=fold) train_path = paths['train'][fold] test_path = paths['test'][fold] paths = dict(train=train_path, test=test_path) return paths - def _extract_auxilary_paths(self, auxilary_data, fold=None): - train_search_pat = re.compile(r"(?:(.*)[_-])train_auxilary(?:[_-](\d+))?\.\w+") - test_search_pat = re.compile(r"(?:(.*)[_-])test_auxilary(?:[_-](\d+))?\.\w+") - if isinstance(auxilary_data, (tuple, list)): - assert len(auxilary_data) % 2 == 0, "auxilary data list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." - return self._extract_auxilary_paths(ns(train=[p for i, p in enumerate(auxilary_data) if i % 2 == 0], - test=[p for i, p in enumerate(auxilary_data) if i % 2 == 1]), + def _extract_auxiliary_paths(self, auxiliary_data, fold=None): + train_search_pat = re.compile(r"(?:(.*)[_-])train_auxiliary(?:[_-](\d+))?\.\w+") + test_search_pat = re.compile(r"(?:(.*)[_-])test_auxiliary(?:[_-](\d+))?\.\w+") + if isinstance(auxiliary_data, (tuple, list)): + assert len(auxiliary_data) % 2 == 0, "auxiliary data list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." 
+ return self._extract_auxiliary_paths(ns(train=[p for i, p in enumerate(auxiliary_data) if i % 2 == 0], + test=[p for i, p in enumerate(auxiliary_data) if i % 2 == 1]), fold=fold) - elif isinstance(auxilary_data, ns): + elif isinstance(auxiliary_data, ns): return dict( - train=[self._extract_auxilary_paths(p)['train'][0] + train=[self._extract_auxiliary_paths(p)['train'][0] if i == fold else None - for i, p in enumerate(as_list(auxilary_data.train))], - test=[self._extract_auxilary_paths(p)['train'][0] + for i, p in enumerate(as_list(auxiliary_data.train))], + test=[self._extract_auxiliary_paths(p)['train'][0] if i == fold else None - for i, p in enumerate(as_list(auxilary_data.test))] if 'test' in auxilary_data else [] + for i, p in enumerate(as_list(auxiliary_data.test))] if 'test' in auxiliary_data else [] ) else: - assert isinstance(auxilary_data, str) - auxilary_data = os.path.expanduser(auxilary_data) - auxilary_data = auxilary_data.format(**rconfig().common_dirs) - - if os.path.exists(auxilary_data): - if os.path.isfile(auxilary_data): - # we leave the auxilary data handling to the user - return dict(train=[auxilary_data], test=[]) - elif os.path.isdir(auxilary_data): - files = list_all_files(auxilary_data) - log.debug("Files found in auxilary data folder %s: %s", auxilary_data, files) - assert len(files) > 0, f"Empty folder: {auxilary_data}" + assert isinstance(auxiliary_data, str) + auxiliary_data = os.path.expanduser(auxiliary_data) + auxiliary_data = auxiliary_data.format(**rconfig().common_dirs) + + if os.path.exists(auxiliary_data): + if os.path.isfile(auxiliary_data): + # we leave the auxiliary data handling to the user + return dict(train=[auxiliary_data], test=[]) + elif os.path.isdir(auxiliary_data): + files = list_all_files(auxiliary_data) + log.debug("Files found in auxiliary data folder %s: %s", auxiliary_data, files) + assert len(files) > 0, f"Empty folder: {auxiliary_data}" if len(files) == 1: return dict(train=files, test=[]) train_matches = [m for m in [train_search_pat.search(f) for f in files] if m] test_matches = [m for m in [test_search_pat.search(f) for f in files] if m] # verify they're for the same dataset (just based on name) - assert train_matches, f"Folder {auxilary_data} must contain at least one training auxilary data." + assert train_matches, f"Folder {auxiliary_data} must contain at least one training auxiliary data." root_names = {m[1] for m in (train_matches+test_matches)} - assert len(root_names) == 1, f"All dataset files in {auxilary_data} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0." + assert len(root_names) == 1, f"All dataset files in {auxiliary_data} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0." train_no_fold = next((m[0] for m in train_matches if m[2] is None), None) test_no_fold = next((m[0] for m in test_matches if m[2] is None), None) @@ -121,17 +121,17 @@ def _extract_auxilary_paths(self, auxilary_data, fold=None): fold += 1 else: fold = -1 - assert len(paths) > 0, f"No dataset file found in {auxilary_data}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..." + assert len(paths) > 0, f"No dataset file found in {auxiliary_data}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..." 
return paths - elif is_valid_url(auxilary_data): - cached_file = os.path.join(self._cache_dir, os.path.basename(auxilary_data)) + elif is_valid_url(auxiliary_data): + cached_file = os.path.join(self._cache_dir, os.path.basename(auxiliary_data)) if not os.path.exists(cached_file): # don't download if previously done - handler = get_file_handler(auxilary_data) - assert handler.exists(auxilary_data), f"Invalid path/url: {auxilary_data}" - handler.download(auxilary_data, dest_path=cached_file) - return self._extract_auxilary_paths(cached_file) + handler = get_file_handler(auxiliary_data) + assert handler.exists(auxiliary_data), f"Invalid path/url: {auxiliary_data}" + handler.download(auxiliary_data, dest_path=cached_file) + return self._extract_auxiliary_paths(cached_file) else: - raise ValueError(f"Invalid dataset description: {auxilary_data}") + raise ValueError(f"Invalid dataset description: {auxiliary_data}") def _extract_train_test_paths(self, dataset, fold=None): if isinstance(dataset, (tuple, list)): @@ -245,20 +245,20 @@ def _get_metadata(self, prop): return meta[prop] -class DatasetWithAuxilaryData: +class DatasetWithauxiliaryData: - def __init__(self, dataset: FileDataset, auxilary_data_path): + def __init__(self, dataset: FileDataset, auxiliary_data_path): self._dataset = dataset - self._train_auxilary_data = auxilary_data_path.get('train', None) - self._test_auxilary_data = auxilary_data_path.get('test', None) + self._train_auxiliary_data = auxiliary_data_path.get('train', None) + self._test_auxiliary_data = auxiliary_data_path.get('test', None) @property - def train_auxilary_data(self) -> str: - return self._train_auxilary_data + def train_auxiliary_data(self) -> str: + return self._train_auxiliary_data @property - def test_auxilary_data(self) -> str: - return self._test_auxilary_data + def test_auxiliary_data(self) -> str: + return self._test_auxiliary_data @property def type(self) -> DatasetType: diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index 77e5faa34..1dbc9a0ee 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -13,8 +13,8 @@ def run(dataset: Dataset, config: TaskConfig): data = dict( train=dict(path=dataset.train.data_path('parquet')), test=dict(path=dataset.test.data_path('parquet')), - train_aux=dict(path=dataset.train_auxilary_data), - test_aux=dict(path=dataset.test_auxilary_data), + train_aux=dict(path=dataset.train_auxiliary_data), + test_aux=dict(path=dataset.test_auxiliary_data), target=dict( name=dataset.target.name, classes=dataset.target.values From 55077a2320e799062570b95a3d8642ab981ddca6 Mon Sep 17 00:00:00 2001 From: Weisu Date: Tue, 7 Dec 2021 13:46:54 -0800 Subject: [PATCH 03/10] fix --- amlb/benchmark.py | 4 ++-- amlb/datasets/__init__.py | 2 +- amlb/datasets/file.py | 2 +- frameworks/AutoGluon/__init__.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 4c9cc7657..0d00bd28a 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -19,7 +19,7 @@ from typing import List, Union from .job import Job, JobError, SimpleJobRunner, MultiThreadingJobRunner -from .datasets import DataLoader, DataSourceType, DatasetWithauxiliaryData +from .datasets import DataLoader, DataSourceType, DatasetWithAuxiliaryData from .data import DatasetType from .datautils import read_csv from .resources import get as rget, config as rconfig, output_dirs as routput_dirs @@ -491,7 +491,7 @@ def load_data(self): if hasattr(self._task_def, 
'auxiliary_data'): auxiliary_data = Benchmark.data_loader.load_auxiliary_data(DataSourceType.file, auxiliary_data=self._task_def.auxiliary_data, fold=self.fold) - self._dataset = DatasetWithauxiliaryData(self._dataset, auxiliary_data) + self._dataset = DatasetWithAuxiliaryData(self._dataset, auxiliary_data) def as_job(self): job = Job(name=rconfig().token_separator.join([ diff --git a/amlb/datasets/__init__.py b/amlb/datasets/__init__.py index b30da4ffe..940d8c1f5 100644 --- a/amlb/datasets/__init__.py +++ b/amlb/datasets/__init__.py @@ -1,6 +1,6 @@ from enum import Enum, auto -from .file import FileLoader, DatasetWithauxiliaryData +from .file import FileLoader, DatasetWithAuxiliaryData from .openml import OpenmlLoader diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index f9e6bd2d5..5c9abb620 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -245,7 +245,7 @@ def _get_metadata(self, prop): return meta[prop] -class DatasetWithauxiliaryData: +class DatasetWithAuxiliaryData: def __init__(self, dataset: FileDataset, auxiliary_data_path): self._dataset = dataset diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index 1dbc9a0ee..026cc34f4 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -13,8 +13,8 @@ def run(dataset: Dataset, config: TaskConfig): data = dict( train=dict(path=dataset.train.data_path('parquet')), test=dict(path=dataset.test.data_path('parquet')), - train_aux=dict(path=dataset.train_auxiliary_data), - test_aux=dict(path=dataset.test_auxiliary_data), + train_aux=dict(path=dataset.train_auxiliary_data) if 'train_auxiliary_data' in dataset else None, + test_aux=dict(path=dataset.test_auxiliary_data) if 'test_auxiliary_data' in dataset else None, target=dict( name=dataset.target.name, classes=dataset.target.values From 53b86bec4bf586866470340ee02cbd5502fd4313 Mon Sep 17 00:00:00 2001 From: Weisu Date: Tue, 7 Dec 2021 14:11:28 -0800 Subject: [PATCH 04/10] fix --- frameworks/AutoGluon/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index 026cc34f4..387f2b7ad 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -13,14 +13,16 @@ def run(dataset: Dataset, config: TaskConfig): data = dict( train=dict(path=dataset.train.data_path('parquet')), test=dict(path=dataset.test.data_path('parquet')), - train_aux=dict(path=dataset.train_auxiliary_data) if 'train_auxiliary_data' in dataset else None, - test_aux=dict(path=dataset.test_auxiliary_data) if 'test_auxiliary_data' in dataset else None, target=dict( name=dataset.target.name, classes=dataset.target.values ), problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType ) + if hasattr(dataset, 'train_auxiliary_data'): + data['train_auxiliary_data'] = dict(path=dataset.train_auxiliary_data) + if hasattr(dataset, 'test_auxiliary_data'): + data['test_auxiliary_data'] = dict(path=dataset.test_auxiliary_data) return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) From 9165fc30d9f7a5c878ca8470d4f628912f7f65c8 Mon Sep 17 00:00:00 2001 From: Weisu Date: Tue, 7 Dec 2021 16:58:00 -0800 Subject: [PATCH 05/10] fix --- amlb/datasets/file.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index 5c9abb620..f400ec46f 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -69,7 
+69,7 @@ def _extract_auxiliary_paths(self, auxiliary_data, fold=None): train_search_pat = re.compile(r"(?:(.*)[_-])train_auxiliary(?:[_-](\d+))?\.\w+") test_search_pat = re.compile(r"(?:(.*)[_-])test_auxiliary(?:[_-](\d+))?\.\w+") if isinstance(auxiliary_data, (tuple, list)): - assert len(auxiliary_data) % 2 == 0, "auxiliary data list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." + assert len(auxiliary_data) % 2 == 0, "auxiliary data list must contain an even number of paths: [train_auxiliary_0, test_auxiliary_0, train_auxiliary_1, test_auxiliary_1, ...]." return self._extract_auxiliary_paths(ns(train=[p for i, p in enumerate(auxiliary_data) if i % 2 == 0], test=[p for i, p in enumerate(auxiliary_data) if i % 2 == 1]), fold=fold) @@ -103,7 +103,7 @@ def _extract_auxiliary_paths(self, auxiliary_data, fold=None): # verify they're for the same dataset (just based on name) assert train_matches, f"Folder {auxiliary_data} must contain at least one training auxiliary data." root_names = {m[1] for m in (train_matches+test_matches)} - assert len(root_names) == 1, f"All dataset files in {auxiliary_data} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0." + assert len(root_names) == 1, f"All dataset files in {auxiliary_data} should follow the same naming: xxxxx_train_auxiliary_N.ext or xxxxx_test_auxiliary_N.ext with N starting from 0." train_no_fold = next((m[0] for m in train_matches if m[2] is None), None) test_no_fold = next((m[0] for m in test_matches if m[2] is None), None) @@ -121,7 +121,7 @@ def _extract_auxiliary_paths(self, auxiliary_data, fold=None): fold += 1 else: fold = -1 - assert len(paths) > 0, f"No dataset file found in {auxiliary_data}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..." + assert len(paths) > 0, f"No dataset file found in {auxiliary_data}: they should follow the naming xxxx_train_auxiliary.ext, xxxx_test_auxiliary.ext or xxxx_train_auxiliary_0.ext, xxxx_test_auxiliary_0.ext, xxxx_train_auxiliary_1.ext, ..." 
return paths elif is_valid_url(auxiliary_data): cached_file = os.path.join(self._cache_dir, os.path.basename(auxiliary_data)) From 4fe07cb8772a955f58468b88693eb87fc84f8bf3 Mon Sep 17 00:00:00 2001 From: Weisu Date: Thu, 9 Dec 2021 14:53:34 -0800 Subject: [PATCH 06/10] extract_path and AuxData --- amlb/benchmark.py | 3 +- amlb/data.py | 22 ++++ amlb/datasets/file.py | 190 +++++++++++++++---------------- frameworks/AutoGluon/__init__.py | 4 +- 4 files changed, 117 insertions(+), 102 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 0d00bd28a..b73d5b883 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -490,8 +490,7 @@ def load_data(self): raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].") if hasattr(self._task_def, 'auxiliary_data'): - auxiliary_data = Benchmark.data_loader.load_auxiliary_data(DataSourceType.file, auxiliary_data=self._task_def.auxiliary_data, fold=self.fold) - self._dataset = DatasetWithAuxiliaryData(self._dataset, auxiliary_data) + self._dataset = Benchmark.data_loader.load_auxiliary_data(DataSourceType.file, dataset=self._dataset, auxiliary_data=self._task_def.auxiliary_data, fold=self.fold) def as_job(self): job = Job(name=rconfig().token_separator.join([ diff --git a/amlb/data.py b/amlb/data.py index b329df27b..5181a6386 100644 --- a/amlb/data.py +++ b/amlb/data.py @@ -84,6 +84,28 @@ def __repr__(self): return repr_def(self) +class AuxData(ABC): + + def __init__(self): + super().__init__() + + @property + def path(self) -> str: + pass + + @property + @abstractmethod + def data(self) -> DF: + """ + :return: the auxiliary data as a pandas DataFrame. + """ + pass + + @profile(logger=log) + def release(self, properties=None): + clear_cache(self, properties) + + class Datasplit(ABC): def __init__(self, dataset, format): diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index f400ec46f..fd895103f 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -10,7 +10,7 @@ import pandas as pd import pandas.api.types as pat -from ..data import Dataset, DatasetType, Datasplit, Feature +from ..data import AuxData, Dataset, DatasetType, Datasplit, Feature, DF from ..datautils import read_csv, to_data_frame from ..resources import config as rconfig from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, split_path @@ -56,124 +56,94 @@ def load(self, dataset, fold=0): raise ValueError(f"Unsupported file type: {ext}") @profile(logger=log) - def load_auxiliary_data(self, auxiliary_data, fold=0): + def load_auxiliary_data(self, dataset, auxiliary_data, fold=0): auxiliary_data = auxiliary_data if isinstance(auxiliary_data, ns) else ns(path=auxiliary_data) log.debug("Loading auxiliary data %s", auxiliary_data) paths = self._extract_auxiliary_paths(auxiliary_data.path if 'path' in auxiliary_data else auxiliary_data, fold=fold) - train_path = paths['train'][fold] - test_path = paths['test'][fold] - paths = dict(train=train_path, test=test_path) - return paths + train_data = None + test_data = None + if 'train' in paths: + train_path = paths['train'][fold] + train_data = FileAuxData(train_path) + if 'test' in paths: + test_path = paths['test'][fold] + test_data = FileAuxData(test_path) + return DatasetWithAuxiliaryData(dataset, train_data, test_data) def _extract_auxiliary_paths(self, auxiliary_data, fold=None): - train_search_pat = re.compile(r"(?:(.*)[_-])train_auxiliary(?:[_-](\d+))?\.\w+") - test_search_pat = 
re.compile(r"(?:(.*)[_-])test_auxiliary(?:[_-](\d+))?\.\w+") if isinstance(auxiliary_data, (tuple, list)): assert len(auxiliary_data) % 2 == 0, "auxiliary data list must contain an even number of paths: [train_auxiliary_0, test_auxiliary_0, train_auxiliary_1, test_auxiliary_1, ...]." - return self._extract_auxiliary_paths(ns(train=[p for i, p in enumerate(auxiliary_data) if i % 2 == 0], + return self._extract_paths(ns(train=[p for i, p in enumerate(auxiliary_data) if i % 2 == 0], test=[p for i, p in enumerate(auxiliary_data) if i % 2 == 1]), - fold=fold) + fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary') elif isinstance(auxiliary_data, ns): return dict( - train=[self._extract_auxiliary_paths(p)['train'][0] + train=[self._extract_paths(p, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary')['train'][0] if i == fold else None for i, p in enumerate(as_list(auxiliary_data.train))], - test=[self._extract_auxiliary_paths(p)['train'][0] + test=[self._extract_paths(p, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary')['train'][0] if i == fold else None for i, p in enumerate(as_list(auxiliary_data.test))] if 'test' in auxiliary_data else [] ) else: - assert isinstance(auxiliary_data, str) - auxiliary_data = os.path.expanduser(auxiliary_data) - auxiliary_data = auxiliary_data.format(**rconfig().common_dirs) - - if os.path.exists(auxiliary_data): - if os.path.isfile(auxiliary_data): - # we leave the auxiliary data handling to the user - return dict(train=[auxiliary_data], test=[]) - elif os.path.isdir(auxiliary_data): - files = list_all_files(auxiliary_data) - log.debug("Files found in auxiliary data folder %s: %s", auxiliary_data, files) - assert len(files) > 0, f"Empty folder: {auxiliary_data}" - if len(files) == 1: - return dict(train=files, test=[]) + self._extract_paths(auxiliary_data, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary') - train_matches = [m for m in [train_search_pat.search(f) for f in files] if m] - test_matches = [m for m in [test_search_pat.search(f) for f in files] if m] - # verify they're for the same dataset (just based on name) - assert train_matches, f"Folder {auxiliary_data} must contain at least one training auxiliary data." - root_names = {m[1] for m in (train_matches+test_matches)} - assert len(root_names) == 1, f"All dataset files in {auxiliary_data} should follow the same naming: xxxxx_train_auxiliary_N.ext or xxxxx_test_auxiliary_N.ext with N starting from 0." - - train_no_fold = next((m[0] for m in train_matches if m[2] is None), None) - test_no_fold = next((m[0] for m in test_matches if m[2] is None), None) - if train_no_fold and test_no_fold: - return dict(train=[train_no_fold], test=[test_no_fold]) - - paths = dict(train=[], test=[]) - fold = 0 - while fold >= 0: - train = next((m[0] for m in train_matches if m[2] == str(fold)), None) - test = next((m[0] for m in test_matches if m[2] == str(fold)), None) - if train and test: - paths['train'].append(train) - paths['test'].append(test) - fold += 1 - else: - fold = -1 - assert len(paths) > 0, f"No dataset file found in {auxiliary_data}: they should follow the naming xxxx_train_auxiliary.ext, xxxx_test_auxiliary.ext or xxxx_train_auxiliary_0.ext, xxxx_test_auxiliary_0.ext, xxxx_train_auxiliary_1.ext, ..." 
- return paths - elif is_valid_url(auxiliary_data): - cached_file = os.path.join(self._cache_dir, os.path.basename(auxiliary_data)) - if not os.path.exists(cached_file): # don't download if previously done - handler = get_file_handler(auxiliary_data) - assert handler.exists(auxiliary_data), f"Invalid path/url: {auxiliary_data}" - handler.download(auxiliary_data, dest_path=cached_file) - return self._extract_auxiliary_paths(cached_file) - else: - raise ValueError(f"Invalid dataset description: {auxiliary_data}") def _extract_train_test_paths(self, dataset, fold=None): if isinstance(dataset, (tuple, list)): assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." - return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0], + return self._extract_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0], test=[p for i, p in enumerate(dataset) if i % 2 == 1]), - fold=fold) + fold=fold, train_suffix='train', test_suffix='test') elif isinstance(dataset, ns): - return dict(train=[self._extract_train_test_paths(p)['train'][0] + return dict(train=[self._extract_paths(p, fold=fold, train_suffix='train', test_suffix='test')['train'][0] if i == fold else None for i, p in enumerate(as_list(dataset.train))], - test=[self._extract_train_test_paths(p)['train'][0] + test=[self._extract_paths(p, fold=fold, train_suffix='train', test_suffix='test')['train'][0] if i == fold else None for i, p in enumerate(as_list(dataset.test))]) else: - assert isinstance(dataset, str) - dataset = os.path.expanduser(dataset) - dataset = dataset.format(**rconfig().common_dirs) - - if os.path.exists(dataset): - if os.path.isfile(dataset): - if is_archive(dataset): - arch_name, _ = os.path.splitext(os.path.basename(dataset)) + self._extract_paths(dataset, fold=fold, train_suffix='train', test_suffix='test') + + + def _extract_paths(self, data, fold=None, train_suffix='train', test_suffix='test'): + train_search_pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+") + test_search_pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+") + is_aux_data = False + if train_suffix == 'train_auxiliary' and test_suffix == 'test_auxiliary': + is_aux_data = True + + assert isinstance(data, str) + data = os.path.expanduser(data) + data = data.format(**rconfig().common_dirs) + + if os.path.exists(data): + if os.path.isfile(data): + # we leave the auxiliary data handling to the user + if is_archive(data) and not is_aux_data: + arch_name, _ = os.path.splitext(os.path.basename(data)) dest_folder = os.path.join(self._cache_dir, arch_name) if not os.path.exists(dest_folder): # don't uncompress if previously done - dest_folder = unarchive_file(dataset, dest_folder) - return self._extract_train_test_paths(dest_folder) + dest_folder = unarchive_file(data, dest_folder) + return self._extract_paths(dest_folder, train_suffix=train_suffix, test_suffix=test_suffix) else: - return dict(train=[dataset], test=[]) - elif os.path.isdir(dataset): - files = list_all_files(dataset) - log.debug("Files found in dataset folder %s: %s", dataset, files) - assert len(files) > 0, f"Empty folder: {dataset}" + return dict(train=[data], test=[]) + elif os.path.isdir(data): + files = list_all_files(data) + log.debug("Files found in data folder %s: %s", data, files) + assert len(files) > 0, f"Empty folder: {data}" if len(files) == 1: return dict(train=files, test=[]) train_matches = [m for m in [train_search_pat.search(f) for f in files] if 
m]
                    test_matches = [m for m in [test_search_pat.search(f) for f in files] if m]
                    # verify they're for the same dataset (just based on name)
-                    assert train_matches and test_matches, f"Folder {dataset} must contain at least one training and one test dataset."
+                    if not is_aux_data:
+                        assert train_matches and test_matches, f"Folder {data} must contain at least one training and one test dataset."
+                    else:
+                        assert train_matches or test_matches, f"Folder {data} must contain at least one training or one test auxiliary data file."
                     root_names = {m[1] for m in (train_matches+test_matches)}
-                    assert len(root_names) == 1, f"All dataset files in {dataset} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0."
+                    assert len(root_names) == 1, f"All data files in {data} should follow the same naming: xxxxx_{train_suffix}_N.ext or xxxxx_{test_suffix}_N.ext with N starting from 0."
 
                     train_no_fold = next((m[0] for m in train_matches if m[2] is None), None)
                     test_no_fold = next((m[0] for m in test_matches if m[2] is None), None)
@@ -185,23 +155,47 @@ def _extract_train_test_paths(self, dataset, fold=None):
             while fold >= 0:
                 train = next((m[0] for m in train_matches if m[2] == str(fold)), None)
                 test = next((m[0] for m in test_matches if m[2] == str(fold)), None)
-                if train and test:
-                    paths['train'].append(train)
-                    paths['test'].append(test)
-                    fold += 1
+                if not is_aux_data:
+                    if train and test:
+                        paths['train'].append(train)
+                        paths['test'].append(test)
+                        fold += 1
+                    else:
+                        fold = -1
                 else:
-                    fold = -1
-            assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..."
+                    if train:
+                        paths['train'].append(train)
+                    if test:
+                        paths['test'].append(test)
+                    if not train and not test:
+                        break
+                    fold += 1
+            assert len(paths) > 0, f"No data file found in {data}: they should follow the naming xxxx_{train_suffix}.ext, xxxx_{test_suffix}.ext or xxxx_{train_suffix}_0.ext, xxxx_{test_suffix}_0.ext, xxxx_{train_suffix}_1.ext, ..."
return paths - elif is_valid_url(dataset): - cached_file = os.path.join(self._cache_dir, os.path.basename(dataset)) + elif is_valid_url(data): + cached_file = os.path.join(self._cache_dir, os.path.basename(data)) if not os.path.exists(cached_file): # don't download if previously done - handler = get_file_handler(dataset) - assert handler.exists(dataset), f"Invalid path/url: {dataset}" - handler.download(dataset, dest_path=cached_file) - return self._extract_train_test_paths(cached_file) + handler = get_file_handler(data) + assert handler.exists(data), f"Invalid path/url: {data}" + handler.download(data, dest_path=cached_file) + return self._extract_paths(cached_file, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix) else: - raise ValueError(f"Invalid dataset description: {dataset}") + raise ValueError(f"Invalid dataset description: {data}") + + +class FileAuxData(AuxData): + + def __init__(self, path): + super().__init__() + self._path = path + + @property + def path(self) -> str: + return self._path + + @property + def data(self) -> DF: + return NotImplementedError class FileDataset(Dataset): @@ -245,12 +239,12 @@ def _get_metadata(self, prop): return meta[prop] -class DatasetWithAuxiliaryData: +class DatasetWithAuxiliaryData(Dataset): - def __init__(self, dataset: FileDataset, auxiliary_data_path): + def __init__(self, dataset: FileDataset, train_auxiliary_data, test_auxiliary_data): self._dataset = dataset - self._train_auxiliary_data = auxiliary_data_path.get('train', None) - self._test_auxiliary_data = auxiliary_data_path.get('test', None) + self._train_auxiliary_data = train_auxiliary_data + self._test_auxiliary_data = test_auxiliary_data @property def train_auxiliary_data(self) -> str: diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index 387f2b7ad..bac7c8e92 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -20,9 +20,9 @@ def run(dataset: Dataset, config: TaskConfig): problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType ) if hasattr(dataset, 'train_auxiliary_data'): - data['train_auxiliary_data'] = dict(path=dataset.train_auxiliary_data) + data['train_auxiliary_data'] = dict(path=dataset.train_auxiliary_data.path) if hasattr(dataset, 'test_auxiliary_data'): - data['test_auxiliary_data'] = dict(path=dataset.test_auxiliary_data) + data['test_auxiliary_data'] = dict(path=dataset.test_auxiliary_data.path) return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) From 73f8f16c80b60e3c67cbd67460d2b42ec126544e Mon Sep 17 00:00:00 2001 From: Weisu Date: Thu, 9 Dec 2021 15:01:32 -0800 Subject: [PATCH 07/10] remove unused DatasetWithAuxiliaryData --- amlb/benchmark.py | 2 +- amlb/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index b73d5b883..5f33c53a2 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -19,7 +19,7 @@ from typing import List, Union from .job import Job, JobError, SimpleJobRunner, MultiThreadingJobRunner -from .datasets import DataLoader, DataSourceType, DatasetWithAuxiliaryData +from .datasets import DataLoader, DataSourceType from .data import DatasetType from .datautils import read_csv from .resources import get as rget, config as rconfig, output_dirs as routput_dirs diff --git a/amlb/datasets/__init__.py b/amlb/datasets/__init__.py index 940d8c1f5..d948a775d 100644 --- a/amlb/datasets/__init__.py +++ 
b/amlb/datasets/__init__.py @@ -1,6 +1,6 @@ from enum import Enum, auto -from .file import FileLoader, DatasetWithAuxiliaryData +from .file import FileLoader from .openml import OpenmlLoader From 95c824f8be74c93cd0dda6b8efb330595a87b189 Mon Sep 17 00:00:00 2001 From: Weisu Date: Thu, 9 Dec 2021 16:07:40 -0800 Subject: [PATCH 08/10] move aux data in data split --- amlb/data.py | 8 ++++ amlb/datasets/file.py | 75 +++++++++----------------------- frameworks/AutoGluon/__init__.py | 8 ++-- 3 files changed, 32 insertions(+), 59 deletions(-) diff --git a/amlb/data.py b/amlb/data.py index 5181a6386..21a3115c2 100644 --- a/amlb/data.py +++ b/amlb/data.py @@ -120,6 +120,14 @@ def __init__(self, dataset, format): def path(self) -> str: return self.data_path(self.format) + @property + def has_auxiliary_data(self) -> bool: + pass + + @property + def auxiliary_data(self) -> AuxData: + pass + @abstractmethod def data_path(self, format: str) -> str: """ diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index fd895103f..19cabca76 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -68,7 +68,8 @@ def load_auxiliary_data(self, dataset, auxiliary_data, fold=0): if 'test' in paths: test_path = paths['test'][fold] test_data = FileAuxData(test_path) - return DatasetWithAuxiliaryData(dataset, train_data, test_data) + dataset._attach_auxiliary_data(train_data, test_data) + return dataset def _extract_auxiliary_paths(self, auxiliary_data, fold=None): if isinstance(auxiliary_data, (tuple, list)): @@ -80,7 +81,7 @@ def _extract_auxiliary_paths(self, auxiliary_data, fold=None): return dict( train=[self._extract_paths(p, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary')['train'][0] if i == fold else None - for i, p in enumerate(as_list(auxiliary_data.train))], + for i, p in enumerate(as_list(auxiliary_data.train))] if 'train' in auxiliary_data else [], test=[self._extract_paths(p, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary')['train'][0] if i == fold else None for i, p in enumerate(as_list(auxiliary_data.test))] if 'test' in auxiliary_data else [] @@ -233,64 +234,15 @@ def features(self) -> List[Feature]: def target(self) -> Feature: return self._get_metadata('target') - @memoize - def _get_metadata(self, prop): - meta = self._train.load_metadata() - return meta[prop] - - -class DatasetWithAuxiliaryData(Dataset): - - def __init__(self, dataset: FileDataset, train_auxiliary_data, test_auxiliary_data): - self._dataset = dataset - self._train_auxiliary_data = train_auxiliary_data - self._test_auxiliary_data = test_auxiliary_data - - @property - def train_auxiliary_data(self) -> str: - return self._train_auxiliary_data - - @property - def test_auxiliary_data(self) -> str: - return self._test_auxiliary_data - - @property - def type(self) -> DatasetType: - assert self._dataset.target is not None - return (DatasetType[self._dataset._type] if self._dataset._type is not None - else DatasetType.regression if self._dataset.target.values is None - else DatasetType.binary if len(self._dataset.target.values) == 2 - else DatasetType.multiclass) - - @property - def train(self) -> Datasplit: - return self._dataset._train - - @property - def test(self) -> Datasplit: - return self._dataset._test - - @property - def features(self) -> List[Feature]: - return self._get_metadata('features') - - @property - def target(self) -> Feature: - return self._get_metadata('target') + def _attach_auxiliary_data(self, train_auxiliary_data, test_auxiliary_data): + 
self._train._attach_auxiliary_data(train_auxiliary_data) + self._test._attach_auxiliary_data(test_auxiliary_data) @memoize def _get_metadata(self, prop): - meta = self._dataset._train.load_metadata() + meta = self._train.load_metadata() return meta[prop] - @profile(logger=log) - def release(self, properties=None): - """ - Call this to release cached properties and optimize memory once in-memory data are not needed anymore. - :param properties: - """ - self._dataset.release(properties) - class FileDatasplit(Datasplit): @@ -298,6 +250,8 @@ def __init__(self, dataset: FileDataset, format: str, path: str): super().__init__(dataset, format) self._path = path self._data = {format: path} + self._auxiliary_data = None + def data_path(self, format): supported_formats = [cls.format for cls in __file_converters__] @@ -306,6 +260,14 @@ def data_path(self, format): raise ValueError(f"Dataset {name} is only available in one of {supported_formats} formats.") return self._get_data(format) + @property + def has_auxiliary_data(self) -> bool: + return self._auxiliary_data != None + + @property + def auxiliary_data(self) -> AuxData: + return self._auxiliary_data + @lazy_property def data(self): # use codecs for unicode support: path = codecs.load(self._path, 'rb', 'utf-8') @@ -342,6 +304,9 @@ def _set_feature_as_target(self, target: Feature): # target.data_type = 'category' target.is_target = True + def _attach_auxiliary_data(self, auxiliary_data): + self._auxiliary_data = auxiliary_data + class ArffDataset(FileDataset): diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index bac7c8e92..6ded93cf1 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -19,10 +19,10 @@ def run(dataset: Dataset, config: TaskConfig): ), problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType ) - if hasattr(dataset, 'train_auxiliary_data'): - data['train_auxiliary_data'] = dict(path=dataset.train_auxiliary_data.path) - if hasattr(dataset, 'test_auxiliary_data'): - data['test_auxiliary_data'] = dict(path=dataset.test_auxiliary_data.path) + if dataset.train.has_auxiliary_data: + data['train_auxiliary_data'] = dict(path=dataset.train.auxiliary_data.path) + if dataset.test.has_auxiliary_data: + data['test_auxiliary_data'] = dict(path=dataset.test.auxiliary_data.path) return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) From fd482b2b994f1cccaf7dc34879caa0a8c50c470a Mon Sep 17 00:00:00 2001 From: Weisu Date: Thu, 9 Dec 2021 17:08:56 -0800 Subject: [PATCH 09/10] fix --- amlb/datasets/file.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index 19cabca76..4a6f4349a 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -72,39 +72,42 @@ def load_auxiliary_data(self, dataset, auxiliary_data, fold=0): return dataset def _extract_auxiliary_paths(self, auxiliary_data, fold=None): + train_suffix = 'train_auxiliary' + test_suffix = 'test_auxiliary' if isinstance(auxiliary_data, (tuple, list)): - assert len(auxiliary_data) % 2 == 0, "auxiliary data list must contain an even number of paths: [train_auxiliary_0, test_auxiliary_0, train_auxiliary_1, test_auxiliary_1, ...]." 
- return self._extract_paths(ns(train=[p for i, p in enumerate(auxiliary_data) if i % 2 == 0], - test=[p for i, p in enumerate(auxiliary_data) if i % 2 == 1]), - fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary') + return self._extract_paths(ns(train=[p for p in auxiliary_data if train_suffix in p], + test=[p for p in auxiliary_data if test_suffix in p]), + fold=fold, train_suffix=train_suffix, test_suffix=test_suffix) elif isinstance(auxiliary_data, ns): return dict( - train=[self._extract_paths(p, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary')['train'][0] + train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0] if i == fold else None for i, p in enumerate(as_list(auxiliary_data.train))] if 'train' in auxiliary_data else [], - test=[self._extract_paths(p, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary')['train'][0] + test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0] if i == fold else None for i, p in enumerate(as_list(auxiliary_data.test))] if 'test' in auxiliary_data else [] ) else: - self._extract_paths(auxiliary_data, fold=fold, train_suffix='train_auxiliary', test_suffix='test_auxiliary') + self._extract_paths(auxiliary_data, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix) def _extract_train_test_paths(self, dataset, fold=None): + train_suffix = 'train' + test_suffix = 'test' if isinstance(dataset, (tuple, list)): assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." return self._extract_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0], test=[p for i, p in enumerate(dataset) if i % 2 == 1]), - fold=fold, train_suffix='train', test_suffix='test') + fold=fold, train_suffix=train_suffix, test_suffix=test_suffix) elif isinstance(dataset, ns): - return dict(train=[self._extract_paths(p, fold=fold, train_suffix='train', test_suffix='test')['train'][0] + return dict(train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0] if i == fold else None for i, p in enumerate(as_list(dataset.train))], - test=[self._extract_paths(p, fold=fold, train_suffix='train', test_suffix='test')['train'][0] + test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0] if i == fold else None for i, p in enumerate(as_list(dataset.test))]) else: - self._extract_paths(dataset, fold=fold, train_suffix='train', test_suffix='test') + self._extract_paths(dataset, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix) def _extract_paths(self, data, fold=None, train_suffix='train', test_suffix='test'): From d67aef6ea3c475d25e30128fe4c68f3208ca77a2 Mon Sep 17 00:00:00 2001 From: Weisu Date: Sun, 13 Feb 2022 16:40:28 -0800 Subject: [PATCH 10/10] unit test --- .../datasets/file/resources/image_test.zip | Bin 0 -> 7696 bytes .../datasets/file/resources/image_train.zip | Bin 0 -> 7696 bytes .../datasets/file/test_file_dataloader.py | 21 ++++++++++++++++++ 3 files changed, 21 insertions(+) create mode 100644 tests/unit/amlb/datasets/file/resources/image_test.zip create mode 100644 tests/unit/amlb/datasets/file/resources/image_train.zip diff --git a/tests/unit/amlb/datasets/file/resources/image_test.zip b/tests/unit/amlb/datasets/file/resources/image_test.zip new file mode 100644 index 
0000000000000000000000000000000000000000..8dd8a2cf518082278d24eb13feba5f2d2c99e529
GIT binary patch
literal 7696
[7696-byte base85 zip payload omitted]

literal 0
HcmV?d00001

diff --git a/tests/unit/amlb/datasets/file/resources/image_train.zip b/tests/unit/amlb/datasets/file/resources/image_train.zip
new file mode 100644
index 0000000000000000000000000000000000000000..8dd8a2cf518082278d24eb13feba5f2d2c99e529
GIT binary patch
literal 7696
[7696-byte base85 zip payload omitted; identical to image_test.zip]

literal 0
HcmV?d00001
literal 0
HcmV?d00001

diff --git a/tests/unit/amlb/datasets/file/test_file_dataloader.py b/tests/unit/amlb/datasets/file/test_file_dataloader.py
index b979a78cc..7e052b6b6 100644
--- a/tests/unit/amlb/datasets/file/test_file_dataloader.py
+++ b/tests/unit/amlb/datasets/file/test_file_dataloader.py
@@ -158,6 +158,27 @@ def test_load_regression_task_arff(file_loader):
     _assert_cholesterol_features(ds, ds_def, 'arff')
 
 
+@pytest.mark.use_disk
+def test_load_auxiliary_data(file_loader):
+    ds_def = ns(
+        train=os.path.join(res, "kc2_train.csv"),
+        test=os.path.join(res, "kc2_test.csv"),
+        target="problems"
+    )
+    ds = file_loader.load(ds_def)
+    aux_def = ns(
+        train=os.path.join(res, "image_train.zip"),
+        test=os.path.join(res, "image_test.zip")
+    )
+    ds = file_loader.load_auxiliary_data(ds, aux_def)
+    _assert_aux_data_path(ds)
+
+
+def _assert_aux_data_path(dataset):
+    assert dataset.train.auxiliary_data.path == os.path.join(res, "image_train.zip")
+    assert dataset.test.auxiliary_data.path == os.path.join(res, "image_test.zip")
+
+
 def _assert_cholesterol_features(dataset, definition, fmt):
     assert len(dataset.features) == 14
     assert len(dataset.predictors) == 13
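---

Note on the expected behaviour: the new test only pins down the shape of the
loaded dataset, i.e. that after load_auxiliary_data() each split carries an
`auxiliary_data` namespace exposing the resolved file path. Below is a minimal
sketch of that contract, with types.SimpleNamespace standing in for amlb's
namespace type so the snippet runs standalone; the helper name
attach_auxiliary_data is hypothetical, not the loader's actual API:

    from types import SimpleNamespace as ns

    def attach_auxiliary_data(dataset, aux_def):
        # Mirror the shape asserted by _assert_aux_data_path: each split
        # gains an `auxiliary_data` namespace carrying the resolved path.
        dataset.train.auxiliary_data = ns(path=aux_def.train)
        dataset.test.auxiliary_data = ns(path=aux_def.test)
        return dataset

    # Standalone usage, with plain namespaces standing in for a loaded dataset:
    ds = attach_auxiliary_data(ns(train=ns(), test=ns()),
                               ns(train="image_train.zip", test="image_test.zip"))
    assert ds.train.auxiliary_data.path == "image_train.zip"
    assert ds.test.auxiliary_data.path == "image_test.zip"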
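The two .zip resources added above are identical placeholder archives (both
index lines point at blob 8dd8a2cf51). If the binary payload is unavailable,
equivalent stand-ins can be generated locally; the member name and contents
below are assumptions, since the test asserts only on the archive paths:

    import os
    import zipfile

    res = "tests/unit/amlb/datasets/file/resources"
    os.makedirs(res, exist_ok=True)
    for name in ("image_train.zip", "image_test.zip"):
        # Any small archive works here; the contents are never inspected by
        # test_load_auxiliary_data, which checks paths only.
        with zipfile.ZipFile(os.path.join(res, name), "w") as zf:
            zf.writestr("placeholder.txt", "auxiliary data placeholder")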