diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index d38f8ad56..5f33c53a2 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -489,6 +489,9 @@ def load_data(self):
         else:
             raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")
 
+        if hasattr(self._task_def, 'auxiliary_data'):
+            self._dataset = Benchmark.data_loader.load_auxiliary_data(DataSourceType.file, dataset=self._dataset, auxiliary_data=self._task_def.auxiliary_data, fold=self.fold)
+
     def as_job(self):
         job = Job(name=rconfig().token_separator.join([
             'local',
diff --git a/amlb/data.py b/amlb/data.py
index b329df27b..21a3115c2 100644
--- a/amlb/data.py
+++ b/amlb/data.py
@@ -84,6 +84,28 @@ def __repr__(self):
         return repr_def(self)
 
 
+class AuxData(ABC):
+
+    def __init__(self):
+        super().__init__()
+
+    @property
+    def path(self) -> str:
+        pass
+
+    @property
+    @abstractmethod
+    def data(self) -> DF:
+        """
+        :return: the auxiliary data as a pandas DataFrame.
+        """
+        pass
+
+    @profile(logger=log)
+    def release(self, properties=None):
+        clear_cache(self, properties)
+
+
 class Datasplit(ABC):
 
     def __init__(self, dataset, format):
@@ -98,6 +120,14 @@ def __init__(self, dataset, format):
     def path(self) -> str:
         return self.data_path(self.format)
 
+    @property
+    def has_auxiliary_data(self) -> bool:
+        pass
+
+    @property
+    def auxiliary_data(self) -> AuxData:
+        pass
+
     @abstractmethod
     def data_path(self, format: str) -> str:
         """
diff --git a/amlb/datasets/__init__.py b/amlb/datasets/__init__.py
index af60730f8..d948a775d 100644
--- a/amlb/datasets/__init__.py
+++ b/amlb/datasets/__init__.py
@@ -24,5 +24,11 @@ def load(self, source: DataSourceType, *args, **kwargs):
         else:
             raise NotImplementedError(f"data source {source} is not supported yet")
 
+    def load_auxiliary_data(self, source: DataSourceType, *args, **kwargs):
+        if source == DataSourceType.file:
+            return self.file_loader.load_auxiliary_data(*args, **kwargs)
+        else:
+            raise NotImplementedError(f"data source {source} is not supported yet")
+
 
 __all__ = ["DataLoader", "DataSourceType"]
diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index 8e696f135..4a6f4349a 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -10,7 +10,7 @@
 import pandas as pd
 import pandas.api.types as pat
 
-from ..data import Dataset, DatasetType, Datasplit, Feature
+from ..data import AuxData, Dataset, DatasetType, Datasplit, Feature, DF
 from ..datautils import read_csv, to_data_frame
 from ..resources import config as rconfig
 from ..utils import Namespace as ns, as_list, lazy_property, list_all_files, memoize, path_from_split, profile, split_path
@@ -55,47 +55,99 @@ def load(self, dataset, fold=0):
         else:
             raise ValueError(f"Unsupported file type: {ext}")
 
+    @profile(logger=log)
+    def load_auxiliary_data(self, dataset, auxiliary_data, fold=0):
+        auxiliary_data = auxiliary_data if isinstance(auxiliary_data, ns) else ns(path=auxiliary_data)
+        log.debug("Loading auxiliary data %s", auxiliary_data)
+        paths = self._extract_auxiliary_paths(auxiliary_data.path if 'path' in auxiliary_data else auxiliary_data, fold=fold)
+        train_data = None
+        test_data = None
+        if 'train' in paths:
+            train_path = paths['train'][fold]
+            train_data = FileAuxData(train_path)
+        if 'test' in paths:
+            test_path = paths['test'][fold]
+            test_data = FileAuxData(test_path)
+        dataset._attach_auxiliary_data(train_data, test_data)
+        return dataset
+
+    def _extract_auxiliary_paths(self, auxiliary_data, fold=None):
+        train_suffix = 'train_auxiliary'
+        test_suffix = 'test_auxiliary'
+        if isinstance(auxiliary_data, (tuple, list)):
+            return self._extract_auxiliary_paths(ns(train=[p for p in auxiliary_data if train_suffix in p],
+                                                     test=[p for p in auxiliary_data if test_suffix in p]),
+                                                 fold=fold)
+        elif isinstance(auxiliary_data, ns):
+            return dict(
+                train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
+                       if i == fold else None
+                       for i, p in enumerate(as_list(auxiliary_data.train))] if 'train' in auxiliary_data else [],
+                test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
+                      if i == fold else None
+                      for i, p in enumerate(as_list(auxiliary_data.test))] if 'test' in auxiliary_data else []
+            )
+        else:
+            return self._extract_paths(auxiliary_data, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
+
+    def _extract_train_test_paths(self, dataset, fold=None):
+        train_suffix = 'train'
+        test_suffix = 'test'
         if isinstance(dataset, (tuple, list)):
             assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]."
             return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
                                                      test=[p for i, p in enumerate(dataset) if i % 2 == 1]),
                                                   fold=fold)
         elif isinstance(dataset, ns):
-            return dict(train=[self._extract_train_test_paths(p)['train'][0]
+            return dict(train=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
                                if i == fold else None
                                for i, p in enumerate(as_list(dataset.train))],
-                        test=[self._extract_train_test_paths(p)['train'][0]
+                        test=[self._extract_paths(p, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)['train'][0]
                               if i == fold else None
                               for i, p in enumerate(as_list(dataset.test))])
         else:
-            assert isinstance(dataset, str)
-            dataset = os.path.expanduser(dataset)
-            dataset = dataset.format(**rconfig().common_dirs)
-
-            if os.path.exists(dataset):
-                if os.path.isfile(dataset):
-                    if is_archive(dataset):
-                        arch_name, _ = os.path.splitext(os.path.basename(dataset))
+            return self._extract_paths(dataset, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
+
+
+    def _extract_paths(self, data, fold=None, train_suffix='train', test_suffix='test'):
+        train_search_pat = re.compile(rf"(?:(.*)[_-]){train_suffix}(?:[_-](\d+))?\.\w+")
+        test_search_pat = re.compile(rf"(?:(.*)[_-]){test_suffix}(?:[_-](\d+))?\.\w+")
+        is_aux_data = False
+        if train_suffix == 'train_auxiliary' and test_suffix == 'test_auxiliary':
+            is_aux_data = True
+
+        assert isinstance(data, str)
+        data = os.path.expanduser(data)
+        data = data.format(**rconfig().common_dirs)
+
+        if os.path.exists(data):
+            if os.path.isfile(data):
+                # we leave the auxiliary data handling to the user
+                if is_archive(data) and not is_aux_data:
+                    arch_name, _ = os.path.splitext(os.path.basename(data))
                     dest_folder = os.path.join(self._cache_dir, arch_name)
                     if not os.path.exists(dest_folder):  # don't uncompress if previously done
-                        dest_folder = unarchive_file(dataset, dest_folder)
-                    return self._extract_train_test_paths(dest_folder)
+                        dest_folder = unarchive_file(data, dest_folder)
+                    return self._extract_paths(dest_folder, train_suffix=train_suffix, test_suffix=test_suffix)
                 else:
-                    return dict(train=[dataset], test=[])
-            elif os.path.isdir(dataset):
-                files = list_all_files(dataset)
-                log.debug("Files found in dataset folder %s: %s", dataset, files)
log.debug("Files found in dataset folder %s: %s", dataset, files) - assert len(files) > 0, f"Empty folder: {dataset}" + return dict(train=[data], test=[]) + elif os.path.isdir(data): + files = list_all_files(data) + log.debug("Files found in data folder %s: %s", data, files) + assert len(files) > 0, f"Empty folder: {data}" if len(files) == 1: return dict(train=files, test=[]) train_matches = [m for m in [train_search_pat.search(f) for f in files] if m] test_matches = [m for m in [test_search_pat.search(f) for f in files] if m] # verify they're for the same dataset (just based on name) - assert train_matches and test_matches, f"Folder {dataset} must contain at least one training and one test dataset." + if not is_aux_data: + assert train_matches and test_matches, f"Folder {data} must contain at least one training and one test dataset." + else: + assert train_matches or test_matches, f"Folder {data} must contain at least one training auxiliary data or one test auxiliary data." root_names = {m[1] for m in (train_matches+test_matches)} - assert len(root_names) == 1, f"All dataset files in {dataset} should follow the same naming: xxxxx_train_N.ext or xxxxx_test_N.ext with N starting from 0." + assert len(root_names) == 1, f"All data files in {data} should follow the same naming: xxxxx_{train_suffix}_N.ext or xxxxx_{test_suffix}_N.ext with N starting from 0." train_no_fold = next((m[0] for m in train_matches if m[2] is None), None) test_no_fold = next((m[0] for m in test_matches if m[2] is None), None) @@ -107,23 +159,47 @@ def _extract_train_test_paths(self, dataset, fold=None): while fold >= 0: train = next((m[0] for m in train_matches if m[2] == str(fold)), None) test = next((m[0] for m in test_matches if m[2] == str(fold)), None) - if train and test: - paths['train'].append(train) - paths['test'].append(test) - fold += 1 + if not is_aux_data: + if train and test: + paths['train'].append(train) + paths['test'].append(test) + fold += 1 + else: + fold = -1 else: - fold = -1 - assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..." + if train: + paths['train'].append(train) + if test: + paths['test'].append(test) + if not train and not test: + fold = -1 + fold += 1 + assert len(paths) > 0, f"No data file found in {data}: they should follow the naming xxxx_{train_suffix}.ext, xxxx_{test_suffix}.ext or xxxx_{train_suffix}_0.ext, xxxx_{test_suffix}_0.ext, xxxx_{train_suffix}_1.ext, ..." 
                 return paths
-            elif is_valid_url(dataset):
-                cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
+        elif is_valid_url(data):
+            cached_file = os.path.join(self._cache_dir, os.path.basename(data))
             if not os.path.exists(cached_file):  # don't download if previously done
-                handler = get_file_handler(dataset)
-                assert handler.exists(dataset), f"Invalid path/url: {dataset}"
-                handler.download(dataset, dest_path=cached_file)
-            return self._extract_train_test_paths(cached_file)
+                handler = get_file_handler(data)
+                assert handler.exists(data), f"Invalid path/url: {data}"
+                handler.download(data, dest_path=cached_file)
+            return self._extract_paths(cached_file, fold=fold, train_suffix=train_suffix, test_suffix=test_suffix)
         else:
-            raise ValueError(f"Invalid dataset description: {dataset}")
+            raise ValueError(f"Invalid dataset description: {data}")
+
+
+class FileAuxData(AuxData):
+
+    def __init__(self, path):
+        super().__init__()
+        self._path = path
+
+    @property
+    def path(self) -> str:
+        return self._path
+
+    @property
+    def data(self) -> DF:
+        raise NotImplementedError
 
 
 class FileDataset(Dataset):
@@ -161,6 +237,10 @@ def features(self) -> List[Feature]:
     def target(self) -> Feature:
         return self._get_metadata('target')
 
+    def _attach_auxiliary_data(self, train_auxiliary_data, test_auxiliary_data):
+        self._train._attach_auxiliary_data(train_auxiliary_data)
+        self._test._attach_auxiliary_data(test_auxiliary_data)
+
     @memoize
     def _get_metadata(self, prop):
         meta = self._train.load_metadata()
@@ -173,6 +253,8 @@ def __init__(self, dataset: FileDataset, format: str, path: str):
         super().__init__(dataset, format)
         self._path = path
         self._data = {format: path}
+        self._auxiliary_data = None
+
 
     def data_path(self, format):
         supported_formats = [cls.format for cls in __file_converters__]
@@ -181,6 +263,14 @@ def data_path(self, format):
             raise ValueError(f"Dataset {name} is only available in one of {supported_formats} formats.")
         return self._get_data(format)
 
+    @property
+    def has_auxiliary_data(self) -> bool:
+        return self._auxiliary_data is not None
+
+    @property
+    def auxiliary_data(self) -> AuxData:
+        return self._auxiliary_data
+
     @lazy_property
     def data(self):
         # use codecs for unicode support: path = codecs.load(self._path, 'rb', 'utf-8')
@@ -217,6 +307,9 @@ def _set_feature_as_target(self, target: Feature):
         # target.data_type = 'category'
         target.is_target = True
 
+    def _attach_auxiliary_data(self, auxiliary_data):
+        self._auxiliary_data = auxiliary_data
+
 
 class ArffDataset(FileDataset):
 
diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py
index be2c15147..6ded93cf1 100644
--- a/frameworks/AutoGluon/__init__.py
+++ b/frameworks/AutoGluon/__init__.py
@@ -19,6 +19,10 @@ def run(dataset: Dataset, config: TaskConfig):
         ),
         problem_type=dataset.type.name  # AutoGluon problem_type is using same names as amlb.data.DatasetType
     )
+    if dataset.train.has_auxiliary_data:
+        data['train_auxiliary_data'] = dict(path=dataset.train.auxiliary_data.path)
+    if dataset.test.has_auxiliary_data:
+        data['test_auxiliary_data'] = dict(path=dataset.test.auxiliary_data.path)
 
     return run_in_venv(__file__, "exec.py",
                        input_data=data, dataset=dataset, config=config)
diff --git a/tests/unit/amlb/datasets/file/resources/image_test.zip b/tests/unit/amlb/datasets/file/resources/image_test.zip
new file mode 100644
index 000000000..8dd8a2cf5
Binary files /dev/null and b/tests/unit/amlb/datasets/file/resources/image_test.zip differ
diff --git a/tests/unit/amlb/datasets/file/resources/image_train.zip b/tests/unit/amlb/datasets/file/resources/image_train.zip
new file mode 100644
index 000000000..8dd8a2cf5
Binary files /dev/null and b/tests/unit/amlb/datasets/file/resources/image_train.zip differ
diff --git a/tests/unit/amlb/datasets/file/test_file_dataloader.py b/tests/unit/amlb/datasets/file/test_file_dataloader.py
index b979a78cc..7e052b6b6 100644
--- a/tests/unit/amlb/datasets/file/test_file_dataloader.py
+++ b/tests/unit/amlb/datasets/file/test_file_dataloader.py
@@ -158,6 +158,27 @@ def test_load_regression_task_arff(file_loader):
     _assert_cholesterol_features(ds, ds_def, 'arff')
 
 
+@pytest.mark.use_disk
+def test_load_auxiliary_data(file_loader):
+    ds_def = ns(
+        train=os.path.join(res, "kc2_train.csv"),
+        test=os.path.join(res, "kc2_test.csv"),
+        target="problems"
+    )
+    ds = file_loader.load(ds_def)
+    aux_def = ns(
+        train=os.path.join(res, "image_train.zip"),
+        test=os.path.join(res, "image_test.zip")
+    )
+    ds = file_loader.load_auxiliary_data(ds, aux_def)
+    _assert_aux_data_path(ds)
+
+
+def _assert_aux_data_path(dataset):
+    assert dataset.train.auxiliary_data.path == os.path.join(res, "image_train.zip")
+    assert dataset.test.auxiliary_data.path == os.path.join(res, "image_test.zip")
+
+
 def _assert_cholesterol_features(dataset, definition, fmt):
     assert len(dataset.features) == 14
     assert len(dataset.predictors) == 13
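Usage sketch (illustrative assumption, not part of the diff): the new loader API can be driven the same way the unit test above does; `file_loader` stands for the pytest fixture of the same name, and the file names are the test resources introduced in this change.

    # hypothetical driver code mirroring test_load_auxiliary_data
    from amlb.utils import Namespace as ns

    ds = file_loader.load(ns(train="kc2_train.csv", test="kc2_test.csv", target="problems"), fold=0)
    ds = file_loader.load_auxiliary_data(ds, ns(train="image_train.zip", test="image_test.zip"), fold=0)
    if ds.train.has_auxiliary_data:
        # the archive is deliberately not unpacked by the loader ("we leave the auxiliary
        # data handling to the user"); integrations such as frameworks/AutoGluon/__init__.py
        # simply forward auxiliary_data.path to the framework process
        print(ds.train.auxiliary_data.path)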